path: root/gcc/config/i386
author     Ian Lance Taylor <iant@golang.org>  2023-06-21 11:04:04 -0700
committer  Ian Lance Taylor <iant@golang.org>  2023-06-21 11:04:04 -0700
commit     97e31a0a2a2d2273687fcdb4e5416aab1a2186e1 (patch)
tree       d5c1cae4de436a0fe54a5f0a2a197d309f3d654c /gcc/config/i386
parent     6612f4f8cb9b0d5af18ec69ad04e56debc3e6ced (diff)
parent     577223aebc7acdd31e62b33c1682fe54a622ae27 (diff)
download   gcc-97e31a0a2a2d2273687fcdb4e5416aab1a2186e1.zip
           gcc-97e31a0a2a2d2273687fcdb4e5416aab1a2186e1.tar.gz
           gcc-97e31a0a2a2d2273687fcdb4e5416aab1a2186e1.tar.bz2
Merge from trunk revision 577223aebc7acdd31e62b33c1682fe54a622ae27.
Diffstat (limited to 'gcc/config/i386')
-rw-r--r--  gcc/config/i386/amxcomplexintrin.h        59
-rw-r--r--  gcc/config/i386/avx2intrin.h             347
-rw-r--r--  gcc/config/i386/avx512bitalgintrin.h      39
-rw-r--r--  gcc/config/i386/avx512bwintrin.h          47
-rw-r--r--  gcc/config/i386/avx512fintrin.h          122
-rw-r--r--  gcc/config/i386/avx512vbmi2intrin.h       18
-rw-r--r--  gcc/config/i386/avx512vbmi2vlintrin.h     21
-rw-r--r--  gcc/config/i386/avx512vlbwintrin.h       402
-rw-r--r--  gcc/config/i386/avx512vlintrin.h         328
-rw-r--r--  gcc/config/i386/constraints.md             2
-rw-r--r--  gcc/config/i386/cpuid.h                   62
-rwxr-xr-x  gcc/config/i386/gcc-auto-profile           9
-rw-r--r--  gcc/config/i386/i386-builtin-types.def     2
-rw-r--r--  gcc/config/i386/i386-builtin.def         120
-rw-r--r--  gcc/config/i386/i386-builtins.cc          21
-rw-r--r--  gcc/config/i386/i386-c.cc                  2
-rw-r--r--  gcc/config/i386/i386-expand.cc           592
-rw-r--r--  gcc/config/i386/i386-features.cc          52
-rw-r--r--  gcc/config/i386/i386-isa.def               1
-rw-r--r--  gcc/config/i386/i386-options.cc            4
-rw-r--r--  gcc/config/i386/i386-protos.h              3
-rw-r--r--  gcc/config/i386/i386.cc                  585
-rw-r--r--  gcc/config/i386/i386.h                    72
-rw-r--r--  gcc/config/i386/i386.md                 1117
-rw-r--r--  gcc/config/i386/i386.opt                   4
-rw-r--r--  gcc/config/i386/immintrin.h                2
-rw-r--r--  gcc/config/i386/mingw-w64.h               22
-rw-r--r--  gcc/config/i386/mingw.opt                  4
-rw-r--r--  gcc/config/i386/mingw32.h                 28
-rw-r--r--  gcc/config/i386/mmx.md                   251
-rw-r--r--  gcc/config/i386/predicates.md             57
-rw-r--r--  gcc/config/i386/sse.md                   831
-rw-r--r--  gcc/config/i386/vaesintrin.h               4
-rw-r--r--  gcc/config/i386/vpclmulqdqintrin.h         4
-rw-r--r--  gcc/config/i386/winnt-cxx.cc              12
-rw-r--r--  gcc/config/i386/winnt.cc                   6
-rw-r--r--  gcc/config/i386/wmmintrin.h               29
-rw-r--r--  gcc/config/i386/x-mingw32-utf8             6
-rw-r--r--  gcc/config/i386/x86-tune.def               7
39 files changed, 3951 insertions(+), 1343 deletions(-)
diff --git a/gcc/config/i386/amxcomplexintrin.h b/gcc/config/i386/amxcomplexintrin.h
new file mode 100644
index 0000000..6ea1eca
--- /dev/null
+++ b/gcc/config/i386/amxcomplexintrin.h
@@ -0,0 +1,59 @@
+/* Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <amxcomplexintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AMXCOMPLEXINTRIN_H_INCLUDED
+#define _AMXCOMPLEXINTRIN_H_INCLUDED
+
+#if !defined(__AMX_COMPLEX__)
+#pragma GCC push_options
+#pragma GCC target("amx-complex")
+#define __DISABLE_AMX_COMPLEX__
+#endif /* __AMX_COMPLEX__ */
+
+#if defined(__x86_64__)
+#define _tile_cmmimfp16ps_internal(src1_dst,src2,src3) \
+ __asm__ volatile\
+ ("{tcmmimfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|tcmmimfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::)
+
+#define _tile_cmmrlfp16ps_internal(src1_dst,src2,src3) \
+ __asm__ volatile\
+ ("{tcmmrlfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|tcmmrlfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::)
+
+#define _tile_cmmimfp16ps(src1_dst,src2,src3) \
+ _tile_cmmimfp16ps_internal (src1_dst, src2, src3)
+
+#define _tile_cmmrlfp16ps(src1_dst,src2,src3) \
+ _tile_cmmrlfp16ps_internal (src1_dst, src2, src3)
+
+#endif
+
+#ifdef __DISABLE_AMX_COMPLEX__
+#undef __DISABLE_AMX_COMPLEX__
+#pragma GCC pop_options
+#endif /* __DISABLE_AMX_COMPLEX__ */
+
+#endif /* _AMXCOMPLEXINTRIN_H_INCLUDED */
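For illustration, a minimal usage sketch of the new AMX-COMPLEX macros (not part of the patch; it assumes the tiles were already configured and loaded with the existing AMX-TILE intrinsics, and that AMX-COMPLEX is enabled on x86-64, e.g. via -mamx-complex). The tile numbers must be literal constants, since the macros paste them directly into the inline asm:

#include <immintrin.h>

void
accumulate_fp16_complex (void)
{
  /* dst = tmm0, sources = tmm1 and tmm2 (hypothetical assignment).  */
  _tile_cmmimfp16ps (0, 1, 2);   /* accumulate imaginary parts */
  _tile_cmmrlfp16ps (0, 1, 2);   /* accumulate real parts */
}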
diff --git a/gcc/config/i386/avx2intrin.h b/gcc/config/i386/avx2intrin.h
index 1b9c816..9b8c13b 100644
--- a/gcc/config/i386/avx2intrin.h
+++ b/gcc/config/i386/avx2intrin.h
@@ -1915,6 +1915,353 @@ _mm256_mask_i64gather_epi32 (__m128i __src, int const *__base,
(int) (SCALE))
#endif /* __OPTIMIZE__ */
+#define _MM_REDUCE_OPERATOR_BASIC_EPI16(op) \
+ __v8hi __T1 = (__v8hi)__W; \
+ __v8hi __T2 = __builtin_shufflevector (__T1, __T1, 4, 5, 6, 7, 4, 5, 6, 7); \
+ __v8hi __T3 = __T1 op __T2; \
+ __v8hi __T4 = __builtin_shufflevector (__T3, __T3, 2, 3, 2, 3, 4, 5, 6, 7); \
+ __v8hi __T5 = __T3 op __T4; \
+ __v8hi __T6 = __builtin_shufflevector (__T5, __T5, 1, 1, 2, 3, 4, 5, 6, 7); \
+ __v8hi __T7 = __T5 op __T6; \
+ return __T7[0]
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_add_epi16 (__m128i __W)
+{
+ _MM_REDUCE_OPERATOR_BASIC_EPI16 (+);
+}
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_mul_epi16 (__m128i __W)
+{
+ _MM_REDUCE_OPERATOR_BASIC_EPI16 (*);
+}
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_and_epi16 (__m128i __W)
+{
+ _MM_REDUCE_OPERATOR_BASIC_EPI16 (&);
+}
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_or_epi16 (__m128i __W)
+{
+ _MM_REDUCE_OPERATOR_BASIC_EPI16 (|);
+}
+
+#define _MM_REDUCE_OPERATOR_MAX_MIN_EP16(op) \
+ __m128i __T1 = (__m128i)__builtin_shufflevector ((__v8hi)__V, \
+ (__v8hi)__V, 4, 5, 6, 7, 4, 5, 6, 7); \
+ __m128i __T2 = _mm_##op (__V, __T1); \
+ __m128i __T3 = (__m128i)__builtin_shufflevector ((__v8hi)__T2, \
+ (__v8hi)__T2, 2, 3, 2, 3, 4, 5, 6, 7); \
+ __m128i __T4 = _mm_##op (__T2, __T3); \
+ __m128i __T5 = (__m128i)__builtin_shufflevector ((__v8hi)__T4, \
+ (__v8hi)__T4, 1, 1, 2, 3, 4, 5, 6, 7); \
+ __v8hi __T6 = (__v8hi)_mm_##op (__T4, __T5); \
+ return __T6[0]
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_max_epi16 (__m128i __V)
+{
+ _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epi16);
+}
+
+extern __inline unsigned short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_max_epu16 (__m128i __V)
+{
+ _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epu16);
+}
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_min_epi16 (__m128i __V)
+{
+ _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epi16);
+}
+
+extern __inline unsigned short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_min_epu16 (__m128i __V)
+{
+ _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epu16);
+}
+
+#define _MM256_REDUCE_OPERATOR_BASIC_EPI16(op) \
+ __v8hi __T1 = (__v8hi)_mm256_extracti128_si256 (__W, 0); \
+ __v8hi __T2 = (__v8hi)_mm256_extracti128_si256 (__W, 1); \
+ __v8hi __T3 = __T1 op __T2; \
+ __v8hi __T4 = __builtin_shufflevector (__T3, __T3, 4, 5, 6, 7, 4, 5, 6, 7); \
+ __v8hi __T5 = __T3 op __T4; \
+ __v8hi __T6 = __builtin_shufflevector (__T5, __T5, 2, 3, 2, 3, 4, 5, 6, 7); \
+ __v8hi __T7 = __T5 op __T6; \
+ __v8hi __T8 = __builtin_shufflevector (__T7, __T7, 1, 1, 2, 3, 4, 5, 6, 7); \
+ __v8hi __T9 = __T7 op __T8; \
+ return __T9[0]
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_add_epi16 (__m256i __W)
+{
+ _MM256_REDUCE_OPERATOR_BASIC_EPI16 (+);
+}
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_mul_epi16 (__m256i __W)
+{
+ _MM256_REDUCE_OPERATOR_BASIC_EPI16 (*);
+}
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_and_epi16 (__m256i __W)
+{
+ _MM256_REDUCE_OPERATOR_BASIC_EPI16 (&);
+}
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_or_epi16 (__m256i __W)
+{
+ _MM256_REDUCE_OPERATOR_BASIC_EPI16 (|);
+}
+
+#define _MM256_REDUCE_OPERATOR_MAX_MIN_EP16(op) \
+ __m128i __T1 = _mm256_extracti128_si256 (__V, 0); \
+ __m128i __T2 = _mm256_extracti128_si256 (__V, 1); \
+ __m128i __T3 = _mm_##op (__T1, __T2); \
+ __m128i __T4 = (__m128i)__builtin_shufflevector ((__v8hi)__T3, \
+ (__v8hi)__T3, 4, 5, 6, 7, 4, 5, 6, 7); \
+ __m128i __T5 = _mm_##op (__T3, __T4); \
+ __m128i __T6 = (__m128i)__builtin_shufflevector ((__v8hi)__T5, \
+ (__v8hi)__T5, 2, 3, 2, 3, 4, 5, 6, 7); \
+ __m128i __T7 = _mm_##op (__T5, __T6); \
+ __m128i __T8 = (__m128i)__builtin_shufflevector ((__v8hi)__T7, \
+ (__v8hi)__T7, 1, 1, 2, 3, 4, 5, 6, 7); \
+ __v8hi __T9 = (__v8hi)_mm_##op (__T7, __T8); \
+ return __T9[0]
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_max_epi16 (__m256i __V)
+{
+ _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epi16);
+}
+
+extern __inline unsigned short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_max_epu16 (__m256i __V)
+{
+ _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epu16);
+}
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_min_epi16 (__m256i __V)
+{
+ _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epi16);
+}
+
+extern __inline unsigned short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_min_epu16 (__m256i __V)
+{
+ _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epu16);
+}
+
+#define _MM_REDUCE_OPERATOR_BASIC_EPI8(op) \
+ __v16qi __T1 = (__v16qi)__W; \
+ __v16qi __T2 = __builtin_shufflevector (__T1, __T1, \
+ 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); \
+ __v16qi __T3 = __T1 op __T2; \
+ __v16qi __T4 = __builtin_shufflevector (__T3, __T3, \
+ 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
+ __v16qi __T5 = __T3 op __T4; \
+ __v16qi __T6 = __builtin_shufflevector (__T5, __T5, \
+ 2, 3, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
+ __v16qi __T7 = __T5 op __T6; \
+ __v16qi __T8 = __builtin_shufflevector (__T7, __T7, \
+ 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
+ __v16qi __T9 = __T7 op __T8; \
+ return __T9[0]
+
+extern __inline char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_add_epi8 (__m128i __W)
+{
+ _MM_REDUCE_OPERATOR_BASIC_EPI8 (+);
+}
+
+extern __inline char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_mul_epi8 (__m128i __W)
+{
+ _MM_REDUCE_OPERATOR_BASIC_EPI8 (*);
+}
+
+extern __inline char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_and_epi8 (__m128i __W)
+{
+ _MM_REDUCE_OPERATOR_BASIC_EPI8 (&);
+}
+
+extern __inline char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_or_epi8 (__m128i __W)
+{
+ _MM_REDUCE_OPERATOR_BASIC_EPI8 (|);
+}
+
+#define _MM_REDUCE_OPERATOR_MAX_MIN_EP8(op) \
+ __m128i __T1 = (__m128i)__builtin_shufflevector ((__v16qi)__V, (__v16qi)__V, \
+ 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); \
+ __m128i __T2 = _mm_##op (__V, __T1); \
+ __m128i __T3 = (__m128i)__builtin_shufflevector ((__v16qi)__T2, \
+ (__v16qi)__T2, \
+ 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
+ __m128i __T4 = _mm_##op (__T2, __T3); \
+ __m128i __T5 = (__m128i)__builtin_shufflevector ((__v16qi)__T4, \
+ (__v16qi)__T4, \
+ 2, 3, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
+ __m128i __T6 = _mm_##op (__T4, __T5); \
+ __m128i __T7 = (__m128i)__builtin_shufflevector ((__v16qi)__T6, \
+ (__v16qi)__T6, \
+ 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
+ __v16qi __T8 = (__v16qi)_mm_##op (__T6, __T7); \
+ return __T8[0]
+
+extern __inline signed char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_max_epi8 (__m128i __V)
+{
+ _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epi8);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_max_epu8 (__m128i __V)
+{
+ _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epu8);
+}
+
+extern __inline signed char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_min_epi8 (__m128i __V)
+{
+ _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epi8);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_min_epu8 (__m128i __V)
+{
+ _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epu8);
+}
+
+#define _MM256_REDUCE_OPERATOR_BASIC_EPI8(op) \
+ __v16qi __T1 = (__v16qi)_mm256_extracti128_si256 (__W, 0); \
+ __v16qi __T2 = (__v16qi)_mm256_extracti128_si256 (__W, 1); \
+ __v16qi __T3 = __T1 op __T2; \
+ __v16qi __T4 = __builtin_shufflevector (__T3, __T3, \
+ 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); \
+ __v16qi __T5 = __T3 op __T4; \
+ __v16qi __T6 = __builtin_shufflevector (__T5, __T5, \
+ 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
+ __v16qi __T7 = __T5 op __T6; \
+ __v16qi __T8 = __builtin_shufflevector (__T7, __T7, \
+ 2, 3, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
+ __v16qi __T9 = __T7 op __T8; \
+ __v16qi __T10 = __builtin_shufflevector (__T9, __T9, \
+ 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
+ __v16qi __T11 = __T9 op __T10; \
+ return __T11[0]
+
+extern __inline char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_add_epi8 (__m256i __W)
+{
+ _MM256_REDUCE_OPERATOR_BASIC_EPI8 (+);
+}
+
+extern __inline char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_mul_epi8 (__m256i __W)
+{
+ _MM256_REDUCE_OPERATOR_BASIC_EPI8 (*);
+}
+
+extern __inline char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_and_epi8 (__m256i __W)
+{
+ _MM256_REDUCE_OPERATOR_BASIC_EPI8 (&);
+}
+
+extern __inline char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_or_epi8 (__m256i __W)
+{
+ _MM256_REDUCE_OPERATOR_BASIC_EPI8 (|);
+}
+
+#define _MM256_REDUCE_OPERATOR_MAX_MIN_EP8(op) \
+ __m128i __T1 = _mm256_extracti128_si256 (__V, 0); \
+ __m128i __T2 = _mm256_extracti128_si256 (__V, 1); \
+ __m128i __T3 = _mm_##op (__T1, __T2); \
+ __m128i __T4 = (__m128i)__builtin_shufflevector ((__v16qi)__T3, \
+ (__v16qi)__T3, \
+ 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); \
+ __m128i __T5 = _mm_##op (__T3, __T4); \
+ __m128i __T6 = (__m128i)__builtin_shufflevector ((__v16qi)__T5, \
+ (__v16qi)__T5, \
+ 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
+ __m128i __T7 = _mm_##op (__T5, __T6); \
+ __m128i __T8 = (__m128i)__builtin_shufflevector ((__v16qi)__T7, \
+ (__v16qi)__T5, \
+ 2, 3, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
+ __m128i __T9 = _mm_##op (__T7, __T8); \
+ __m128i __T10 = (__m128i)__builtin_shufflevector ((__v16qi)__T9, \
+ (__v16qi)__T9, \
+ 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
+ __v16qi __T11 = (__v16qi)_mm_##op (__T9, __T10); \
+ return __T11[0]
+
+extern __inline signed char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_max_epi8 (__m256i __V)
+{
+ _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epi8);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_max_epu8 (__m256i __V)
+{
+ _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epu8);
+}
+
+extern __inline signed char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_min_epi8 (__m256i __V)
+{
+ _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epi8);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_min_epu8 (__m256i __V)
+{
+ _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epu8);
+}
+
#ifdef __DISABLE_AVX2__
#undef __DISABLE_AVX2__
#pragma GCC pop_options
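The hunk above adds 128-bit and 256-bit horizontal reductions for 8-bit and 16-bit elements. A small sketch of how they read at a call site (illustrative only, not from the patch; compile with -mavx2):

#include <immintrin.h>

short
lane_sum (__m128i v)
{
  return _mm_reduce_add_epi16 (v);      /* sum of all eight 16-bit lanes */
}

unsigned char
lane_max (__m256i v)
{
  return _mm256_reduce_max_epu8 (v);    /* largest of the 32 unsigned bytes */
}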
diff --git a/gcc/config/i386/avx512bitalgintrin.h b/gcc/config/i386/avx512bitalgintrin.h
index aa6d652..a1c7be1 100644
--- a/gcc/config/i386/avx512bitalgintrin.h
+++ b/gcc/config/i386/avx512bitalgintrin.h
@@ -48,17 +48,6 @@ _mm512_popcnt_epi16 (__m512i __A)
return (__m512i) __builtin_ia32_vpopcountw_v32hi ((__v32hi) __A);
}
-#ifdef __DISABLE_AVX512BITALG__
-#undef __DISABLE_AVX512BITALG__
-#pragma GCC pop_options
-#endif /* __DISABLE_AVX512BITALG__ */
-
-#if !defined(__AVX512BITALG__) || !defined(__AVX512BW__)
-#pragma GCC push_options
-#pragma GCC target("avx512bitalg,avx512bw")
-#define __DISABLE_AVX512BITALGBW__
-#endif /* __AVX512VLBW__ */
-
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_popcnt_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
@@ -114,16 +103,16 @@ _mm512_mask_bitshuffle_epi64_mask (__mmask64 __M, __m512i __A, __m512i __B)
(__mmask64) __M);
}
-#ifdef __DISABLE_AVX512BITALGBW__
-#undef __DISABLE_AVX512BITALGBW__
+#ifdef __DISABLE_AVX512BITALG__
+#undef __DISABLE_AVX512BITALG__
#pragma GCC pop_options
-#endif /* __DISABLE_AVX512BITALGBW__ */
+#endif /* __DISABLE_AVX512BITALG__ */
-#if !defined(__AVX512BITALG__) || !defined(__AVX512VL__) || !defined(__AVX512BW__)
+#if !defined(__AVX512BITALG__) || !defined(__AVX512VL__)
#pragma GCC push_options
-#pragma GCC target("avx512bitalg,avx512vl,avx512bw")
-#define __DISABLE_AVX512BITALGVLBW__
-#endif /* __AVX512VLBW__ */
+#pragma GCC target("avx512bitalg,avx512vl")
+#define __DISABLE_AVX512BITALGVL__
+#endif /* __AVX512BITALGVL__ */
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -162,18 +151,6 @@ _mm256_mask_bitshuffle_epi64_mask (__mmask32 __M, __m256i __A, __m256i __B)
(__mmask32) __M);
}
-#ifdef __DISABLE_AVX512BITALGVLBW__
-#undef __DISABLE_AVX512BITALGVLBW__
-#pragma GCC pop_options
-#endif /* __DISABLE_AVX512BITALGVLBW__ */
-
-
-#if !defined(__AVX512BITALG__) || !defined(__AVX512VL__)
-#pragma GCC push_options
-#pragma GCC target("avx512bitalg,avx512vl")
-#define __DISABLE_AVX512BITALGVL__
-#endif /* __AVX512VLBW__ */
-
extern __inline __mmask16
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bitshuffle_epi64_mask (__m128i __A, __m128i __B)
@@ -278,6 +255,6 @@ _mm_maskz_popcnt_epi16 (__mmask8 __U, __m128i __A)
#ifdef __DISABLE_AVX512BITALGVL__
#undef __DISABLE_AVX512BITALGVL__
#pragma GCC pop_options
-#endif /* __DISABLE_AVX512BITALGBW__ */
+#endif /* __DISABLE_AVX512BITALGVL__ */
#endif /* _AVX512BITALGINTRIN_H_INCLUDED */
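The hunks above fold the former __DISABLE_AVX512BITALGBW__ and __DISABLE_AVX512BITALGVLBW__ regions into the plain BITALG and BITALG/VL guards. Schematically, the guard idiom these headers rely on looks like this (placeholder ISA name, illustrative only):

#if !defined(__FOO__)
#pragma GCC push_options
#pragma GCC target("foo")
#define __DISABLE_FOO__
#endif

/* ... intrinsics that require the "foo" ISA ... */

#ifdef __DISABLE_FOO__
#undef __DISABLE_FOO__
#pragma GCC pop_options
#endif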
diff --git a/gcc/config/i386/avx512bwintrin.h b/gcc/config/i386/avx512bwintrin.h
index 89790f7..d1cd549 100644
--- a/gcc/config/i386/avx512bwintrin.h
+++ b/gcc/config/i386/avx512bwintrin.h
@@ -2880,7 +2880,7 @@ _mm512_maskz_dbsad_epu8 (__mmask32 __U, __m512i __A, __m512i __B,
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_srli_epi16 (__m512i __A, const int __imm)
+_mm512_srli_epi16 (__m512i __A, const unsigned int __imm)
{
return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm,
(__v32hi)
@@ -2891,7 +2891,7 @@ _mm512_srli_epi16 (__m512i __A, const int __imm)
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_srli_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
- const int __imm)
+ const unsigned int __imm)
{
return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm,
(__v32hi) __W,
@@ -2910,7 +2910,7 @@ _mm512_maskz_srli_epi16 (__mmask32 __U, __m512i __A, const int __imm)
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_slli_epi16 (__m512i __A, const int __B)
+_mm512_slli_epi16 (__m512i __A, const unsigned int __B)
{
return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B,
(__v32hi)
@@ -2921,7 +2921,7 @@ _mm512_slli_epi16 (__m512i __A, const int __B)
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_slli_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
- const int __B)
+ const unsigned int __B)
{
return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B,
(__v32hi) __W,
@@ -2930,7 +2930,7 @@ _mm512_mask_slli_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_maskz_slli_epi16 (__mmask32 __U, __m512i __A, const int __B)
+_mm512_maskz_slli_epi16 (__mmask32 __U, __m512i __A, const unsigned int __B)
{
return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B,
(__v32hi)
@@ -3008,7 +3008,7 @@ _mm512_maskz_shufflelo_epi16 (__mmask32 __U, __m512i __A,
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_srai_epi16 (__m512i __A, const int __imm)
+_mm512_srai_epi16 (__m512i __A, const unsigned int __imm)
{
return (__m512i) __builtin_ia32_psrawi512_mask ((__v32hi) __A, __imm,
(__v32hi)
@@ -3019,7 +3019,7 @@ _mm512_srai_epi16 (__m512i __A, const int __imm)
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_srai_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
- const int __imm)
+ const unsigned int __imm)
{
return (__m512i) __builtin_ia32_psrawi512_mask ((__v32hi) __A, __imm,
(__v32hi) __W,
@@ -3028,7 +3028,7 @@ _mm512_mask_srai_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_maskz_srai_epi16 (__mmask32 __U, __m512i __A, const int __imm)
+_mm512_maskz_srai_epi16 (__mmask32 __U, __m512i __A, const unsigned int __imm)
{
return (__m512i) __builtin_ia32_psrawi512_mask ((__v32hi) __A, __imm,
(__v32hi)
@@ -3196,29 +3196,32 @@ _mm512_bsrli_epi128 (__m512i __A, const int __N)
#define _mm512_srli_epi16(A, B) \
((__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi)(__m512i)(A), \
- (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)-1))
+ (unsigned int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)-1))
#define _mm512_mask_srli_epi16(W, U, A, B) \
((__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi)(__m512i)(A), \
- (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U)))
+ (unsigned int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U)))
#define _mm512_maskz_srli_epi16(U, A, B) \
((__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi)(__m512i)(A), \
(int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)(U)))
-#define _mm512_slli_epi16(X, C) \
- ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (int)(C),\
- (__v32hi)(__m512i)_mm512_setzero_si512 (), \
+#define _mm512_slli_epi16(X, C) \
+ ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), \
+ (unsigned int)(C), \
+ (__v32hi)(__m512i)_mm512_setzero_si512 (), \
(__mmask32)-1))
-#define _mm512_mask_slli_epi16(W, U, X, C) \
- ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (int)(C),\
- (__v32hi)(__m512i)(W),\
+#define _mm512_mask_slli_epi16(W, U, X, C) \
+ ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), \
+ (unsigned int)(C), \
+ (__v32hi)(__m512i)(W), \
(__mmask32)(U)))
-#define _mm512_maskz_slli_epi16(U, X, C) \
- ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (int)(C),\
- (__v32hi)(__m512i)_mm512_setzero_si512 (), \
+#define _mm512_maskz_slli_epi16(U, X, C) \
+ ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), \
+ (unsigned int)(C), \
+ (__v32hi)(__m512i)_mm512_setzero_si512 (), \
(__mmask32)(U)))
#define _mm512_shufflehi_epi16(A, B) \
@@ -3257,15 +3260,15 @@ _mm512_bsrli_epi128 (__m512i __A, const int __N)
#define _mm512_srai_epi16(A, B) \
((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A), \
- (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)-1))
+ (unsigned int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)-1))
#define _mm512_mask_srai_epi16(W, U, A, B) \
((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A), \
- (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U)))
+ (unsigned int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U)))
#define _mm512_maskz_srai_epi16(U, A, B) \
((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A), \
- (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)(U)))
+ (unsigned int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)(U)))
#define _mm512_mask_blend_epi16(__U, __A, __W) \
((__m512i) __builtin_ia32_blendmw_512_mask ((__v32hi) (__A), \
diff --git a/gcc/config/i386/avx512fintrin.h b/gcc/config/i386/avx512fintrin.h
index 89b3219..517e787 100644
--- a/gcc/config/i386/avx512fintrin.h
+++ b/gcc/config/i386/avx512fintrin.h
@@ -1037,19 +1037,22 @@ _mm512_maskz_slli_epi64 (__mmask8 __U, __m512i __A, unsigned int __B)
(__mmask8) __U);
}
#else
-#define _mm512_slli_epi64(X, C) \
- ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
- (__v8di)(__m512i)_mm512_undefined_epi32 (),\
+#define _mm512_slli_epi64(X, C) \
+ ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), \
+ (unsigned int)(C), \
+ (__v8di)(__m512i)_mm512_undefined_epi32 (), \
(__mmask8)-1))
-#define _mm512_mask_slli_epi64(W, U, X, C) \
- ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
- (__v8di)(__m512i)(W),\
+#define _mm512_mask_slli_epi64(W, U, X, C) \
+ ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), \
+ (unsigned int)(C), \
+ (__v8di)(__m512i)(W), \
(__mmask8)(U)))
-#define _mm512_maskz_slli_epi64(U, X, C) \
- ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
- (__v8di)(__m512i)_mm512_setzero_si512 (),\
+#define _mm512_maskz_slli_epi64(U, X, C) \
+ ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), \
+ (unsigned int)(C), \
+ (__v8di)(__m512i)_mm512_setzero_si512 (), \
(__mmask8)(U)))
#endif
@@ -1116,19 +1119,22 @@ _mm512_maskz_srli_epi64 (__mmask8 __U, __m512i __A, unsigned int __B)
(__mmask8) __U);
}
#else
-#define _mm512_srli_epi64(X, C) \
- ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
- (__v8di)(__m512i)_mm512_undefined_epi32 (),\
+#define _mm512_srli_epi64(X, C) \
+ ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), \
+ (unsigned int)(C), \
+ (__v8di)(__m512i)_mm512_undefined_epi32 (), \
(__mmask8)-1))
-#define _mm512_mask_srli_epi64(W, U, X, C) \
- ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
- (__v8di)(__m512i)(W),\
+#define _mm512_mask_srli_epi64(W, U, X, C) \
+ ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), \
+ (unsigned int)(C), \
+ (__v8di)(__m512i)(W), \
(__mmask8)(U)))
-#define _mm512_maskz_srli_epi64(U, X, C) \
- ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
- (__v8di)(__m512i)_mm512_setzero_si512 (),\
+#define _mm512_maskz_srli_epi64(U, X, C) \
+ ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), \
+ (unsigned int)(C), \
+ (__v8di)(__m512i)_mm512_setzero_si512 (), \
(__mmask8)(U)))
#endif
@@ -1195,19 +1201,22 @@ _mm512_maskz_srai_epi64 (__mmask8 __U, __m512i __A, unsigned int __B)
(__mmask8) __U);
}
#else
-#define _mm512_srai_epi64(X, C) \
- ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
- (__v8di)(__m512i)_mm512_undefined_epi32 (),\
+#define _mm512_srai_epi64(X, C) \
+ ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), \
+ (unsigned int)(C), \
+ (__v8di)(__m512i)_mm512_undefined_epi32 (), \
(__mmask8)-1))
-#define _mm512_mask_srai_epi64(W, U, X, C) \
- ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
- (__v8di)(__m512i)(W),\
+#define _mm512_mask_srai_epi64(W, U, X, C) \
+ ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), \
+ (unsigned int)(C), \
+ (__v8di)(__m512i)(W), \
(__mmask8)(U)))
-#define _mm512_maskz_srai_epi64(U, X, C) \
- ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
- (__v8di)(__m512i)_mm512_setzero_si512 (),\
+#define _mm512_maskz_srai_epi64(U, X, C) \
+ ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), \
+ (unsigned int)(C), \
+ (__v8di)(__m512i)_mm512_setzero_si512 (), \
(__mmask8)(U)))
#endif
@@ -1274,19 +1283,22 @@ _mm512_maskz_slli_epi32 (__mmask16 __U, __m512i __A, unsigned int __B)
(__mmask16) __U);
}
#else
-#define _mm512_slli_epi32(X, C) \
- ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
- (__v16si)(__m512i)_mm512_undefined_epi32 (),\
+#define _mm512_slli_epi32(X, C) \
+ ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), \
+ (unsigned int)(C), \
+ (__v16si)(__m512i)_mm512_undefined_epi32 (), \
(__mmask16)-1))
-#define _mm512_mask_slli_epi32(W, U, X, C) \
- ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
- (__v16si)(__m512i)(W),\
+#define _mm512_mask_slli_epi32(W, U, X, C) \
+ ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), \
+ (unsigned int)(C), \
+ (__v16si)(__m512i)(W), \
(__mmask16)(U)))
-#define _mm512_maskz_slli_epi32(U, X, C) \
- ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
- (__v16si)(__m512i)_mm512_setzero_si512 (),\
+#define _mm512_maskz_slli_epi32(U, X, C) \
+ ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), \
+ (unsigned int)(C), \
+ (__v16si)(__m512i)_mm512_setzero_si512 (), \
(__mmask16)(U)))
#endif
@@ -1353,19 +1365,22 @@ _mm512_maskz_srli_epi32 (__mmask16 __U, __m512i __A, unsigned int __B)
(__mmask16) __U);
}
#else
-#define _mm512_srli_epi32(X, C) \
- ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+#define _mm512_srli_epi32(X, C) \
+ ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), \
+ (unsigned int)(C), \
(__v16si)(__m512i)_mm512_undefined_epi32 (),\
(__mmask16)-1))
-#define _mm512_mask_srli_epi32(W, U, X, C) \
- ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
- (__v16si)(__m512i)(W),\
+#define _mm512_mask_srli_epi32(W, U, X, C) \
+ ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), \
+ (unsigned int)(C), \
+ (__v16si)(__m512i)(W), \
(__mmask16)(U)))
-#define _mm512_maskz_srli_epi32(U, X, C) \
- ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
- (__v16si)(__m512i)_mm512_setzero_si512 (),\
+#define _mm512_maskz_srli_epi32(U, X, C) \
+ ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), \
+ (unsigned int)(C), \
+ (__v16si)(__m512i)_mm512_setzero_si512 (), \
(__mmask16)(U)))
#endif
@@ -1432,19 +1447,22 @@ _mm512_maskz_srai_epi32 (__mmask16 __U, __m512i __A, unsigned int __B)
(__mmask16) __U);
}
#else
-#define _mm512_srai_epi32(X, C) \
- ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+#define _mm512_srai_epi32(X, C) \
+ ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), \
+ (unsigned int)(C), \
(__v16si)(__m512i)_mm512_undefined_epi32 (),\
(__mmask16)-1))
-#define _mm512_mask_srai_epi32(W, U, X, C) \
- ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C),\
- (__v16si)(__m512i)(W),\
+#define _mm512_mask_srai_epi32(W, U, X, C) \
+ ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), \
+ (unsigned int)(C), \
+ (__v16si)(__m512i)(W), \
(__mmask16)(U)))
-#define _mm512_maskz_srai_epi32(U, X, C) \
- ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C),\
- (__v16si)(__m512i)_mm512_setzero_si512 (),\
+#define _mm512_maskz_srai_epi32(U, X, C) \
+ ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), \
+ (unsigned int)(C), \
+ (__v16si)(__m512i)_mm512_setzero_si512 (), \
(__mmask16)(U)))
#endif
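Both this header and avx512bwintrin.h above switch the shift-count operand of the *_slli/_srli/_srai immediate forms from int to unsigned int, in the __OPTIMIZE__ inline-function variants and the fallback macros alike. A call-site sketch (illustrative only, not from the patch; compile with -mavx512f):

#include <immintrin.h>

__m512i
scale_dwords (__m512i v, __mmask16 keep)
{
  __m512i t = _mm512_slli_epi32 (v, 2);          /* shift every dword left by 2 */
  return _mm512_maskz_srli_epi32 (keep, t, 1);   /* masked logical right shift by 1 */
}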
diff --git a/gcc/config/i386/avx512vbmi2intrin.h b/gcc/config/i386/avx512vbmi2intrin.h
index 528d193..ca00f8a 100644
--- a/gcc/config/i386/avx512vbmi2intrin.h
+++ b/gcc/config/i386/avx512vbmi2intrin.h
@@ -326,18 +326,6 @@ _mm512_maskz_shldv_epi64 (__mmask8 __A, __m512i __B, __m512i __C, __m512i __D)
(__v8di) __D, (__mmask8)__A);
}
-#ifdef __DISABLE_AVX512VBMI2__
-#undef __DISABLE_AVX512VBMI2__
-
-#pragma GCC pop_options
-#endif /* __DISABLE_AVX512VBMI2__ */
-
-#if !defined(__AVX512VBMI2__) || !defined(__AVX512BW__)
-#pragma GCC push_options
-#pragma GCC target("avx512vbmi2,avx512bw")
-#define __DISABLE_AVX512VBMI2BW__
-#endif /* __AVX512VBMI2BW__ */
-
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_compress_epi8 (__m512i __A, __mmask64 __B, __m512i __C)
@@ -548,10 +536,10 @@ _mm512_maskz_shldv_epi16 (__mmask32 __A, __m512i __B, __m512i __C, __m512i __D)
(__v32hi) __C, (__v32hi) __D, (__mmask32)__A);
}
-#ifdef __DISABLE_AVX512VBMI2BW__
-#undef __DISABLE_AVX512VBMI2BW__
+#ifdef __DISABLE_AVX512VBMI2__
+#undef __DISABLE_AVX512VBMI2__
#pragma GCC pop_options
-#endif /* __DISABLE_AVX512VBMI2BW__ */
+#endif /* __DISABLE_AVX512VBMI2__ */
#endif /* __AVX512VBMI2INTRIN_H_INCLUDED */
diff --git a/gcc/config/i386/avx512vbmi2vlintrin.h b/gcc/config/i386/avx512vbmi2vlintrin.h
index 86efca2..92cae8c 100644
--- a/gcc/config/i386/avx512vbmi2vlintrin.h
+++ b/gcc/config/i386/avx512vbmi2vlintrin.h
@@ -957,21 +957,6 @@ _mm_maskz_shldv_epi64 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
(__v2di) __D, (__mmask8)__A);
}
-
-
-
-#ifdef __DISABLE_AVX512VBMI2VL__
-#undef __DISABLE_AVX512VBMI2VL__
-#pragma GCC pop_options
-#endif /* __DISABLE_AVX512VBMIVL__ */
-
-#if !defined(__AVX512VL__) || !defined(__AVX512VBMI2__) || \
- !defined(__AVX512BW__)
-#pragma GCC push_options
-#pragma GCC target("avx512vbmi2,avx512vl,avx512bw")
-#define __DISABLE_AVX512VBMI2VLBW__
-#endif /* __AVX512VBMIVLBW__ */
-
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_compress_epi8 (__m256i __A, __mmask32 __B, __m256i __C)
@@ -1029,9 +1014,9 @@ _mm256_maskz_expandloadu_epi8 (__mmask32 __A, const void * __B)
(__v32qi) _mm256_setzero_si256 (), (__mmask32) __A);
}
-#ifdef __DISABLE_AVX512VBMI2VLBW__
-#undef __DISABLE_AVX512VBMI2VLBW__
+#ifdef __DISABLE_AVX512VBMI2VL__
+#undef __DISABLE_AVX512VBMI2VL__
#pragma GCC pop_options
-#endif /* __DISABLE_AVX512VBMIVLBW__ */
+#endif /* __DISABLE_AVX512VBMIVL__ */
#endif /* _AVX512VBMIVLINTRIN_H_INCLUDED */
diff --git a/gcc/config/i386/avx512vlbwintrin.h b/gcc/config/i386/avx512vlbwintrin.h
index 0232783..bc58fa4 100644
--- a/gcc/config/i386/avx512vlbwintrin.h
+++ b/gcc/config/i386/avx512vlbwintrin.h
@@ -259,6 +259,42 @@ _mm_maskz_loadu_epi8 (__mmask16 __U, void const *__P)
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W)
+{
+ return (__m128i) __builtin_ia32_blendmw_128_mask ((__v8hi) __A,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W)
+{
+ return (__m128i) __builtin_ia32_blendmb_128_mask ((__v16qi) __A,
+ (__v16qi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W)
+{
+ return (__m256i) __builtin_ia32_blendmw_256_mask ((__v16hi) __A,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W)
+{
+ return (__m256i) __builtin_ia32_blendmb_256_mask ((__v32qi) __A,
+ (__v32qi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi16_epi8 (__m256i __A)
{
@@ -1442,42 +1478,6 @@ _mm_maskz_dbsad_epu8 (__mmask8 __U, __m128i __A, __m128i __B,
(__mmask8) __U);
}
-extern __inline __m128i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W)
-{
- return (__m128i) __builtin_ia32_blendmw_128_mask ((__v8hi) __A,
- (__v8hi) __W,
- (__mmask8) __U);
-}
-
-extern __inline __m128i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W)
-{
- return (__m128i) __builtin_ia32_blendmb_128_mask ((__v16qi) __A,
- (__v16qi) __W,
- (__mmask16) __U);
-}
-
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W)
-{
- return (__m256i) __builtin_ia32_blendmw_256_mask ((__v16hi) __A,
- (__v16hi) __W,
- (__mmask16) __U);
-}
-
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W)
-{
- return (__m256i) __builtin_ia32_blendmb_256_mask ((__v32qi) __A,
- (__v32qi) __W,
- (__mmask32) __U);
-}
-
extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmp_epi16_mask (__mmask8 __U, __m128i __X, __m128i __Y,
@@ -1759,7 +1759,7 @@ _mm_maskz_shufflelo_epi16 (__mmask8 __U, __m128i __A, const int __imm)
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_srai_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
- const int __imm)
+ const unsigned int __imm)
{
return (__m256i) __builtin_ia32_psrawi256_mask ((__v16hi) __A, __imm,
(__v16hi) __W,
@@ -1768,7 +1768,7 @@ _mm256_mask_srai_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_srai_epi16 (__mmask16 __U, __m256i __A, const int __imm)
+_mm256_maskz_srai_epi16 (__mmask16 __U, __m256i __A, const unsigned int __imm)
{
return (__m256i) __builtin_ia32_psrawi256_mask ((__v16hi) __A, __imm,
(__v16hi)
@@ -1779,7 +1779,7 @@ _mm256_maskz_srai_epi16 (__mmask16 __U, __m256i __A, const int __imm)
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_srai_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
- const int __imm)
+ const unsigned int __imm)
{
return (__m128i) __builtin_ia32_psrawi128_mask ((__v8hi) __A, __imm,
(__v8hi) __W,
@@ -1788,7 +1788,7 @@ _mm_mask_srai_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_srai_epi16 (__mmask8 __U, __m128i __A, const int __imm)
+_mm_maskz_srai_epi16 (__mmask8 __U, __m128i __A, const unsigned int __imm)
{
return (__m128i) __builtin_ia32_psrawi128_mask ((__v8hi) __A, __imm,
(__v8hi)
@@ -1799,7 +1799,7 @@ _mm_maskz_srai_epi16 (__mmask8 __U, __m128i __A, const int __imm)
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_slli_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
- int __B)
+ unsigned int __B)
{
return (__m256i) __builtin_ia32_psllwi256_mask ((__v16hi) __A, __B,
(__v16hi) __W,
@@ -1808,7 +1808,7 @@ _mm256_mask_slli_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_slli_epi16 (__mmask16 __U, __m256i __A, int __B)
+_mm256_maskz_slli_epi16 (__mmask16 __U, __m256i __A, unsigned int __B)
{
return (__m256i) __builtin_ia32_psllwi256_mask ((__v16hi) __A, __B,
(__v16hi)
@@ -1818,7 +1818,7 @@ _mm256_maskz_slli_epi16 (__mmask16 __U, __m256i __A, int __B)
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_slli_epi16 (__m128i __W, __mmask8 __U, __m128i __A, int __B)
+_mm_mask_slli_epi16 (__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
{
return (__m128i) __builtin_ia32_psllwi128_mask ((__v8hi) __A, __B,
(__v8hi) __W,
@@ -1827,7 +1827,7 @@ _mm_mask_slli_epi16 (__m128i __W, __mmask8 __U, __m128i __A, int __B)
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, int __B)
+_mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, unsigned int __B)
{
return (__m128i) __builtin_ia32_psllwi128_mask ((__v8hi) __A, __B,
(__v8hi)
@@ -1855,23 +1855,23 @@ _mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, int __B)
#define _mm_maskz_srli_epi16(U, A, B) \
((__m128i) __builtin_ia32_psrlwi128_mask ((__v8hi)(__m128i)(A), \
- (int)(B), (__v8hi)_mm_setzero_si128(), (__mmask8)(U)))
+ (int)(B), (__v8hi)_mm_setzero_si128 (), (__mmask8)(U)))
#define _mm256_mask_srai_epi16(W, U, A, B) \
((__m256i) __builtin_ia32_psrawi256_mask ((__v16hi)(__m256i)(A), \
- (int)(B), (__v16hi)(__m256i)(W), (__mmask16)(U)))
+ (unsigned int)(B), (__v16hi)(__m256i)(W), (__mmask16)(U)))
#define _mm256_maskz_srai_epi16(U, A, B) \
((__m256i) __builtin_ia32_psrawi256_mask ((__v16hi)(__m256i)(A), \
- (int)(B), (__v16hi)_mm256_setzero_si256 (), (__mmask16)(U)))
+ (unsigned int)(B), (__v16hi)_mm256_setzero_si256 (), (__mmask16)(U)))
#define _mm_mask_srai_epi16(W, U, A, B) \
((__m128i) __builtin_ia32_psrawi128_mask ((__v8hi)(__m128i)(A), \
- (int)(B), (__v8hi)(__m128i)(W), (__mmask8)(U)))
+ (unsigned int)(B), (__v8hi)(__m128i)(W), (__mmask8)(U)))
#define _mm_maskz_srai_epi16(U, A, B) \
((__m128i) __builtin_ia32_psrawi128_mask ((__v8hi)(__m128i)(A), \
- (int)(B), (__v8hi)_mm_setzero_si128(), (__mmask8)(U)))
+ (unsigned int)(B), (__v8hi)_mm_setzero_si128(), (__mmask8)(U)))
#define _mm256_mask_shufflehi_epi16(W, U, A, B) \
((__m256i) __builtin_ia32_pshufhw256_mask ((__v16hi)(__m256i)(A), (int)(B), \
@@ -1930,14 +1930,16 @@ _mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, int __B)
(__v2di)(__m128i)_mm_setzero_si128 (), \
(__mmask16)(U)))
-#define _mm_mask_slli_epi16(W, U, X, C) \
- ((__m128i)__builtin_ia32_psllwi128_mask ((__v8hi)(__m128i)(X), (int)(C),\
- (__v8hi)(__m128i)(W),\
+#define _mm_mask_slli_epi16(W, U, X, C) \
+ ((__m128i)__builtin_ia32_psllwi128_mask ((__v8hi)(__m128i)(X), \
+ (unsigned int)(C), \
+ (__v8hi)(__m128i)(W), \
(__mmask8)(U)))
-#define _mm_maskz_slli_epi16(U, X, C) \
- ((__m128i)__builtin_ia32_psllwi128_mask ((__v8hi)(__m128i)(X), (int)(C),\
- (__v8hi)(__m128i)_mm_setzero_si128 (),\
+#define _mm_maskz_slli_epi16(U, X, C) \
+ ((__m128i)__builtin_ia32_psllwi128_mask ((__v8hi)(__m128i)(X), \
+ (unsigned int)(C), \
+ (__v8hi)(__m128i)_mm_setzero_si128 (), \
(__mmask8)(U)))
#define _mm256_dbsad_epu8(X, Y, C) \
@@ -1946,14 +1948,16 @@ _mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, int __B)
(__v16hi)(__m256i)_mm256_setzero_si256(),\
(__mmask16)-1))
-#define _mm256_mask_slli_epi16(W, U, X, C) \
- ((__m256i)__builtin_ia32_psllwi256_mask ((__v16hi)(__m256i)(X), (int)(C),\
- (__v16hi)(__m256i)(W),\
+#define _mm256_mask_slli_epi16(W, U, X, C) \
+ ((__m256i)__builtin_ia32_psllwi256_mask ((__v16hi)(__m256i)(X), \
+ (unsigned int)(C), \
+ (__v16hi)(__m256i)(W), \
(__mmask16)(U)))
-#define _mm256_maskz_slli_epi16(U, X, C) \
- ((__m256i)__builtin_ia32_psllwi256_mask ((__v16hi)(__m256i)(X), (int)(C),\
- (__v16hi)(__m256i)_mm256_setzero_si256 (),\
+#define _mm256_maskz_slli_epi16(U, X, C) \
+ ((__m256i)__builtin_ia32_psllwi256_mask ((__v16hi)(__m256i)(X), \
+ (unsigned int)(C), \
+ (__v16hi)(__m256i)_mm256_setzero_si256 (), \
(__mmask16)(U)))
#define _mm256_mask_dbsad_epu8(W, U, X, Y, C) \
@@ -1986,26 +1990,6 @@ _mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, int __B)
(__v8hi)(__m128i)_mm_setzero_si128(), \
(__mmask8)(U)))
-#define _mm_mask_blend_epi16(__U, __A, __W) \
- ((__m128i) __builtin_ia32_blendmw_128_mask ((__v8hi) (__A), \
- (__v8hi) (__W), \
- (__mmask8) (__U)))
-
-#define _mm_mask_blend_epi8(__U, __A, __W) \
- ((__m128i) __builtin_ia32_blendmb_128_mask ((__v16qi) (__A), \
- (__v16qi) (__W), \
- (__mmask16) (__U)))
-
-#define _mm256_mask_blend_epi16(__U, __A, __W) \
- ((__m256i) __builtin_ia32_blendmw_256_mask ((__v16hi) (__A), \
- (__v16hi) (__W), \
- (__mmask16) (__U)))
-
-#define _mm256_mask_blend_epi8(__U, __A, __W) \
- ((__m256i) __builtin_ia32_blendmb_256_mask ((__v32qi) (__A), \
- (__v32qi) (__W), \
- (__mmask32) (__U)))
-
#define _mm_cmp_epi16_mask(X, Y, P) \
((__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi)(__m128i)(X), \
(__v8hi)(__m128i)(Y), (int)(P),\
@@ -4750,6 +4734,262 @@ _mm256_mask_cmple_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y)
(__mmask16) __M);
}
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_add_epi16 (__mmask8 __M, __m128i __W)
+{
+ __W = _mm_maskz_mov_epi16 (__M, __W);
+ _MM_REDUCE_OPERATOR_BASIC_EPI16 (+);
+}
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_mul_epi16 (__mmask8 __M, __m128i __W)
+{
+ __W = _mm_mask_mov_epi16 (_mm_set1_epi16 (1), __M, __W);
+ _MM_REDUCE_OPERATOR_BASIC_EPI16 (*);
+}
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_and_epi16 (__mmask8 __M, __m128i __W)
+{
+ __W = _mm_mask_mov_epi16 (_mm_set1_epi16 (-1), __M, __W);
+ _MM_REDUCE_OPERATOR_BASIC_EPI16 (&);
+}
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_or_epi16 (__mmask8 __M, __m128i __W)
+{
+ __W = _mm_maskz_mov_epi16 (__M, __W);
+ _MM_REDUCE_OPERATOR_BASIC_EPI16 (|);
+}
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_max_epi16 (__mmask16 __M, __m128i __V)
+{
+ __V = _mm_mask_mov_epi16 (_mm_set1_epi16 (-32767-1), __M, __V);
+ _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epi16);
+}
+
+extern __inline unsigned short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_max_epu16 (__mmask16 __M, __m128i __V)
+{
+ __V = _mm_maskz_mov_epi16 (__M, __V);
+ _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epu16);
+}
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_min_epi16 (__mmask16 __M, __m128i __V)
+{
+ __V = _mm_mask_mov_epi16 (_mm_set1_epi16 (32767), __M, __V);
+ _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epi16);
+}
+
+extern __inline unsigned short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_min_epu16 (__mmask16 __M, __m128i __V)
+{
+ __V = _mm_mask_mov_epi16 (_mm_set1_epi16 (-1), __M, __V);
+ _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epu16);
+}
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_add_epi16 (__mmask16 __M, __m256i __W)
+{
+ __W = _mm256_maskz_mov_epi16 (__M, __W);
+ _MM256_REDUCE_OPERATOR_BASIC_EPI16 (+);
+}
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_mul_epi16 (__mmask16 __M, __m256i __W)
+{
+ __W = _mm256_mask_mov_epi16 (_mm256_set1_epi16 (1), __M, __W);
+ _MM256_REDUCE_OPERATOR_BASIC_EPI16 (*);
+}
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_and_epi16 (__mmask16 __M, __m256i __W)
+{
+ __W = _mm256_mask_mov_epi16 (_mm256_set1_epi16 (-1), __M, __W);
+ _MM256_REDUCE_OPERATOR_BASIC_EPI16 (&);
+}
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_or_epi16 (__mmask16 __M, __m256i __W)
+{
+ __W = _mm256_maskz_mov_epi16 (__M, __W);
+ _MM256_REDUCE_OPERATOR_BASIC_EPI16 (|);
+}
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_max_epi16 (__mmask16 __M, __m256i __V)
+{
+ __V = _mm256_mask_mov_epi16 (_mm256_set1_epi16 (-32767-1), __M, __V);
+ _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epi16);
+}
+
+extern __inline unsigned short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_max_epu16 (__mmask16 __M, __m256i __V)
+{
+ __V = _mm256_maskz_mov_epi16 (__M, __V);
+ _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epu16);
+}
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_min_epi16 (__mmask16 __M, __m256i __V)
+{
+ __V = _mm256_mask_mov_epi16 (_mm256_set1_epi16 (32767), __M, __V);
+ _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epi16);
+}
+
+extern __inline unsigned short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_min_epu16 (__mmask16 __M, __m256i __V)
+{
+ __V = _mm256_mask_mov_epi16 (_mm256_set1_epi16 (-1), __M, __V);
+ _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epu16);
+}
+
+extern __inline char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_add_epi8 (__mmask16 __M, __m128i __W)
+{
+ __W = _mm_maskz_mov_epi8 (__M, __W);
+ _MM_REDUCE_OPERATOR_BASIC_EPI8 (+);
+}
+
+extern __inline char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_mul_epi8 (__mmask16 __M, __m128i __W)
+{
+ __W = _mm_mask_mov_epi8 (_mm_set1_epi8 (1), __M, __W);
+ _MM_REDUCE_OPERATOR_BASIC_EPI8 (*);
+}
+
+extern __inline char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_and_epi8 (__mmask16 __M, __m128i __W)
+{
+ __W = _mm_mask_mov_epi8 (_mm_set1_epi8 (-1), __M, __W);
+ _MM_REDUCE_OPERATOR_BASIC_EPI8 (&);
+}
+
+extern __inline char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_or_epi8 (__mmask16 __M, __m128i __W)
+{
+ __W = _mm_maskz_mov_epi8 (__M, __W);
+ _MM_REDUCE_OPERATOR_BASIC_EPI8 (|);
+}
+
+extern __inline signed char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_max_epi8 (__mmask16 __M, __m128i __V)
+{
+ __V = _mm_mask_mov_epi8 (_mm_set1_epi8 (-127-1), __M, __V);
+ _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epi8);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_max_epu8 (__mmask16 __M, __m128i __V)
+{
+ __V = _mm_maskz_mov_epi8 (__M, __V);
+ _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epu8);
+}
+
+extern __inline signed char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_min_epi8 (__mmask16 __M, __m128i __V)
+{
+ __V = _mm_mask_mov_epi8 (_mm_set1_epi8 (127), __M, __V);
+ _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epi8);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_min_epu8 (__mmask16 __M, __m128i __V)
+{
+ __V = _mm_mask_mov_epi8 (_mm_set1_epi8 (-1), __M, __V);
+ _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epu8);
+}
+
+extern __inline char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_add_epi8 (__mmask32 __M, __m256i __W)
+{
+ __W = _mm256_maskz_mov_epi8 (__M, __W);
+ _MM256_REDUCE_OPERATOR_BASIC_EPI8 (+);
+}
+
+extern __inline char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_mul_epi8 (__mmask32 __M, __m256i __W)
+{
+ __W = _mm256_mask_mov_epi8 (_mm256_set1_epi8 (1), __M, __W);
+ _MM256_REDUCE_OPERATOR_BASIC_EPI8 (*);
+}
+
+extern __inline char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_and_epi8 (__mmask32 __M, __m256i __W)
+{
+ __W = _mm256_mask_mov_epi8 (_mm256_set1_epi8 (-1), __M, __W);
+ _MM256_REDUCE_OPERATOR_BASIC_EPI8 (&);
+}
+
+extern __inline char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_or_epi8 (__mmask32 __M, __m256i __W)
+{
+ __W = _mm256_maskz_mov_epi8 (__M, __W);
+ _MM256_REDUCE_OPERATOR_BASIC_EPI8 (|);
+}
+
+extern __inline signed char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_max_epi8 (__mmask32 __M, __m256i __V)
+{
+ __V = _mm256_mask_mov_epi8 (_mm256_set1_epi8 (-127-1), __M, __V);
+ _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epi8);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_max_epu8 (__mmask32 __M, __m256i __V)
+{
+ __V = _mm256_maskz_mov_epi8 (__M, __V);
+ _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epu8);
+}
+
+extern __inline signed char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_min_epi8 (__mmask32 __M, __m256i __V)
+{
+ __V = _mm256_mask_mov_epi8 (_mm256_set1_epi8 (127), __M, __V);
+ _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epi8);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_min_epu8 (__mmask32 __M, __m256i __V)
+{
+ __V = _mm256_mask_mov_epi8 (_mm256_set1_epi8 (-1), __M, __V);
+ _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epu8);
+}
+
#ifdef __DISABLE_AVX512VLBW__
#undef __DISABLE_AVX512VLBW__
#pragma GCC pop_options
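The masked reductions added above first replace the lanes cleared by the mask with the operation's identity or limit value (0 for add/or, 1 for mul, -1 for and, the type's minimum/maximum for max/min) and then reduce. A call-site sketch (illustrative only, not from the patch; compile with -mavx512vl -mavx512bw):

#include <immintrin.h>

short
active_lane_sum (__m128i v, __mmask8 active)
{
  /* Lanes whose mask bit is 0 contribute 0 to the sum.  */
  return _mm_mask_reduce_add_epi16 (active, v);
}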
diff --git a/gcc/config/i386/avx512vlintrin.h b/gcc/config/i386/avx512vlintrin.h
index 758b71a..08e49e8 100644
--- a/gcc/config/i386/avx512vlintrin.h
+++ b/gcc/config/i386/avx512vlintrin.h
@@ -935,6 +935,78 @@ _mm_mask_storeu_epi32 (void *__P, __mmask8 __U, __m128i __A)
(__mmask8) __U);
}
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_pd (__mmask8 __U, __m256d __A, __m256d __W)
+{
+ return (__m256d) __builtin_ia32_blendmpd_256_mask ((__v4df) __A,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_ps (__mmask8 __U, __m256 __A, __m256 __W)
+{
+ return (__m256) __builtin_ia32_blendmps_256_mask ((__v8sf) __A,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_epi64 (__mmask8 __U, __m256i __A, __m256i __W)
+{
+ return (__m256i) __builtin_ia32_blendmq_256_mask ((__v4di) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_epi32 (__mmask8 __U, __m256i __A, __m256i __W)
+{
+ return (__m256i) __builtin_ia32_blendmd_256_mask ((__v8si) __A,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_pd (__mmask8 __U, __m128d __A, __m128d __W)
+{
+ return (__m128d) __builtin_ia32_blendmpd_128_mask ((__v2df) __A,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_ps (__mmask8 __U, __m128 __A, __m128 __W)
+{
+ return (__m128) __builtin_ia32_blendmps_128_mask ((__v4sf) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_epi64 (__mmask8 __U, __m128i __A, __m128i __W)
+{
+ return (__m128i) __builtin_ia32_blendmq_128_mask ((__v2di) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_epi32 (__mmask8 __U, __m128i __A, __m128i __W)
+{
+ return (__m128i) __builtin_ia32_blendmd_128_mask ((__v4si) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_abs_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
@@ -10493,7 +10565,7 @@ _mm_maskz_fixupimm_ps (__mmask8 __U, __m128 __A, __m128 __B,
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_srli_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
- const int __imm)
+ const unsigned int __imm)
{
return (__m256i) __builtin_ia32_psrldi256_mask ((__v8si) __A, __imm,
(__v8si) __W,
@@ -10502,7 +10574,7 @@ _mm256_mask_srli_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_srli_epi32 (__mmask8 __U, __m256i __A, const int __imm)
+_mm256_maskz_srli_epi32 (__mmask8 __U, __m256i __A, const unsigned int __imm)
{
return (__m256i) __builtin_ia32_psrldi256_mask ((__v8si) __A, __imm,
(__v8si)
@@ -10513,7 +10585,7 @@ _mm256_maskz_srli_epi32 (__mmask8 __U, __m256i __A, const int __imm)
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_srli_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
- const int __imm)
+ const unsigned int __imm)
{
return (__m128i) __builtin_ia32_psrldi128_mask ((__v4si) __A, __imm,
(__v4si) __W,
@@ -10522,7 +10594,7 @@ _mm_mask_srli_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_srli_epi32 (__mmask8 __U, __m128i __A, const int __imm)
+_mm_maskz_srli_epi32 (__mmask8 __U, __m128i __A, const unsigned int __imm)
{
return (__m128i) __builtin_ia32_psrldi128_mask ((__v4si) __A, __imm,
(__v4si)
@@ -10533,7 +10605,7 @@ _mm_maskz_srli_epi32 (__mmask8 __U, __m128i __A, const int __imm)
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_srli_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
- const int __imm)
+ const unsigned int __imm)
{
return (__m256i) __builtin_ia32_psrlqi256_mask ((__v4di) __A, __imm,
(__v4di) __W,
@@ -10542,7 +10614,7 @@ _mm256_mask_srli_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_srli_epi64 (__mmask8 __U, __m256i __A, const int __imm)
+_mm256_maskz_srli_epi64 (__mmask8 __U, __m256i __A, const unsigned int __imm)
{
return (__m256i) __builtin_ia32_psrlqi256_mask ((__v4di) __A, __imm,
(__v4di)
@@ -10553,7 +10625,7 @@ _mm256_maskz_srli_epi64 (__mmask8 __U, __m256i __A, const int __imm)
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_srli_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
- const int __imm)
+ const unsigned int __imm)
{
return (__m128i) __builtin_ia32_psrlqi128_mask ((__v2di) __A, __imm,
(__v2di) __W,
@@ -10562,7 +10634,7 @@ _mm_mask_srli_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_srli_epi64 (__mmask8 __U, __m128i __A, const int __imm)
+_mm_maskz_srli_epi64 (__mmask8 __U, __m128i __A, const unsigned int __imm)
{
return (__m128i) __builtin_ia32_psrlqi128_mask ((__v2di) __A, __imm,
(__v2di)
@@ -11987,7 +12059,7 @@ _mm256_maskz_cvtps_ph (__mmask8 __U, __m256 __A, const int __I)
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_srai_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
- const int __imm)
+ const unsigned int __imm)
{
return (__m256i) __builtin_ia32_psradi256_mask ((__v8si) __A, __imm,
(__v8si) __W,
@@ -11996,7 +12068,7 @@ _mm256_mask_srai_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_srai_epi32 (__mmask8 __U, __m256i __A, const int __imm)
+_mm256_maskz_srai_epi32 (__mmask8 __U, __m256i __A, const unsigned int __imm)
{
return (__m256i) __builtin_ia32_psradi256_mask ((__v8si) __A, __imm,
(__v8si)
@@ -12007,7 +12079,7 @@ _mm256_maskz_srai_epi32 (__mmask8 __U, __m256i __A, const int __imm)
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_srai_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
- const int __imm)
+ const unsigned int __imm)
{
return (__m128i) __builtin_ia32_psradi128_mask ((__v4si) __A, __imm,
(__v4si) __W,
@@ -12016,7 +12088,7 @@ _mm_mask_srai_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_srai_epi32 (__mmask8 __U, __m128i __A, const int __imm)
+_mm_maskz_srai_epi32 (__mmask8 __U, __m128i __A, const unsigned int __imm)
{
return (__m128i) __builtin_ia32_psradi128_mask ((__v4si) __A, __imm,
(__v4si)
@@ -12026,7 +12098,7 @@ _mm_maskz_srai_epi32 (__mmask8 __U, __m128i __A, const int __imm)
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_srai_epi64 (__m256i __A, const int __imm)
+_mm256_srai_epi64 (__m256i __A, const unsigned int __imm)
{
return (__m256i) __builtin_ia32_psraqi256_mask ((__v4di) __A, __imm,
(__v4di)
@@ -12037,7 +12109,7 @@ _mm256_srai_epi64 (__m256i __A, const int __imm)
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_srai_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
- const int __imm)
+ const unsigned int __imm)
{
return (__m256i) __builtin_ia32_psraqi256_mask ((__v4di) __A, __imm,
(__v4di) __W,
@@ -12046,7 +12118,7 @@ _mm256_mask_srai_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_srai_epi64 (__mmask8 __U, __m256i __A, const int __imm)
+_mm256_maskz_srai_epi64 (__mmask8 __U, __m256i __A, const unsigned int __imm)
{
return (__m256i) __builtin_ia32_psraqi256_mask ((__v4di) __A, __imm,
(__v4di)
@@ -12056,7 +12128,7 @@ _mm256_maskz_srai_epi64 (__mmask8 __U, __m256i __A, const int __imm)
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_srai_epi64 (__m128i __A, const int __imm)
+_mm_srai_epi64 (__m128i __A, const unsigned int __imm)
{
return (__m128i) __builtin_ia32_psraqi128_mask ((__v2di) __A, __imm,
(__v2di)
@@ -12067,7 +12139,7 @@ _mm_srai_epi64 (__m128i __A, const int __imm)
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_srai_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
- const int __imm)
+ const unsigned int __imm)
{
return (__m128i) __builtin_ia32_psraqi128_mask ((__v2di) __A, __imm,
(__v2di) __W,
@@ -12076,7 +12148,7 @@ _mm_mask_srai_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_srai_epi64 (__mmask8 __U, __m128i __A, const int __imm)
+_mm_maskz_srai_epi64 (__mmask8 __U, __m128i __A, const unsigned int __imm)
{
return (__m128i) __builtin_ia32_psraqi128_mask ((__v2di) __A, __imm,
(__v2di)
@@ -12086,7 +12158,7 @@ _mm_maskz_srai_epi64 (__mmask8 __U, __m128i __A, const int __imm)
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_slli_epi32 (__m128i __W, __mmask8 __U, __m128i __A, int __B)
+_mm_mask_slli_epi32 (__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
{
return (__m128i) __builtin_ia32_pslldi128_mask ((__v4si) __A, __B,
(__v4si) __W,
@@ -12095,7 +12167,7 @@ _mm_mask_slli_epi32 (__m128i __W, __mmask8 __U, __m128i __A, int __B)
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_slli_epi32 (__mmask8 __U, __m128i __A, int __B)
+_mm_maskz_slli_epi32 (__mmask8 __U, __m128i __A, unsigned int __B)
{
return (__m128i) __builtin_ia32_pslldi128_mask ((__v4si) __A, __B,
(__v4si)
@@ -12105,7 +12177,7 @@ _mm_maskz_slli_epi32 (__mmask8 __U, __m128i __A, int __B)
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_slli_epi64 (__m128i __W, __mmask8 __U, __m128i __A, int __B)
+_mm_mask_slli_epi64 (__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
{
return (__m128i) __builtin_ia32_psllqi128_mask ((__v2di) __A, __B,
(__v2di) __W,
@@ -12114,7 +12186,7 @@ _mm_mask_slli_epi64 (__m128i __W, __mmask8 __U, __m128i __A, int __B)
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_slli_epi64 (__mmask8 __U, __m128i __A, int __B)
+_mm_maskz_slli_epi64 (__mmask8 __U, __m128i __A, unsigned int __B)
{
return (__m128i) __builtin_ia32_psllqi128_mask ((__v2di) __A, __B,
(__v2di)
@@ -12125,7 +12197,7 @@ _mm_maskz_slli_epi64 (__mmask8 __U, __m128i __A, int __B)
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_slli_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
- int __B)
+ unsigned int __B)
{
return (__m256i) __builtin_ia32_pslldi256_mask ((__v8si) __A, __B,
(__v8si) __W,
@@ -12134,7 +12206,7 @@ _mm256_mask_slli_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_slli_epi32 (__mmask8 __U, __m256i __A, int __B)
+_mm256_maskz_slli_epi32 (__mmask8 __U, __m256i __A, unsigned int __B)
{
return (__m256i) __builtin_ia32_pslldi256_mask ((__v8si) __A, __B,
(__v8si)
@@ -12145,7 +12217,7 @@ _mm256_maskz_slli_epi32 (__mmask8 __U, __m256i __A, int __B)
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_slli_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
- int __B)
+ unsigned int __B)
{
return (__m256i) __builtin_ia32_psllqi256_mask ((__v4di) __A, __B,
(__v4di) __W,
@@ -12154,7 +12226,7 @@ _mm256_mask_slli_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_slli_epi64 (__mmask8 __U, __m256i __A, int __B)
+_mm256_maskz_slli_epi64 (__mmask8 __U, __m256i __A, unsigned int __B)
{
return (__m256i) __builtin_ia32_psllqi256_mask ((__v4di) __A, __B,
(__v4di)
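The hunks above (and the macro forms of the same intrinsics further down) widen the shift count of the masked shift-by-immediate intrinsics from const int to const unsigned int. A minimal caller sketch (not part of the patch; assumes -mavx512vl; shift_demo is a hypothetical name; behaviour for in-range counts is unchanged):

#include <immintrin.h>

__m256i
shift_demo (__m256i v, __mmask8 m)
{
  /* The count argument is now typed unsigned int; literal in-range
     counts work exactly as before.  */
  __m256i t = _mm256_maskz_srli_epi32 (m, v, 3);  /* unselected lanes zeroed */
  return _mm256_mask_slli_epi32 (t, m, t, 1);     /* unselected lanes keep t */
}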
@@ -12262,78 +12334,6 @@ _mm_maskz_permute_ps (__mmask8 __U, __m128 __X, const int __C)
(__mmask8) __U);
}
-extern __inline __m256d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_blend_pd (__mmask8 __U, __m256d __A, __m256d __W)
-{
- return (__m256d) __builtin_ia32_blendmpd_256_mask ((__v4df) __A,
- (__v4df) __W,
- (__mmask8) __U);
-}
-
-extern __inline __m256
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_blend_ps (__mmask8 __U, __m256 __A, __m256 __W)
-{
- return (__m256) __builtin_ia32_blendmps_256_mask ((__v8sf) __A,
- (__v8sf) __W,
- (__mmask8) __U);
-}
-
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_blend_epi64 (__mmask8 __U, __m256i __A, __m256i __W)
-{
- return (__m256i) __builtin_ia32_blendmq_256_mask ((__v4di) __A,
- (__v4di) __W,
- (__mmask8) __U);
-}
-
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_blend_epi32 (__mmask8 __U, __m256i __A, __m256i __W)
-{
- return (__m256i) __builtin_ia32_blendmd_256_mask ((__v8si) __A,
- (__v8si) __W,
- (__mmask8) __U);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_blend_pd (__mmask8 __U, __m128d __A, __m128d __W)
-{
- return (__m128d) __builtin_ia32_blendmpd_128_mask ((__v2df) __A,
- (__v2df) __W,
- (__mmask8) __U);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_blend_ps (__mmask8 __U, __m128 __A, __m128 __W)
-{
- return (__m128) __builtin_ia32_blendmps_128_mask ((__v4sf) __A,
- (__v4sf) __W,
- (__mmask8) __U);
-}
-
-extern __inline __m128i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_blend_epi64 (__mmask8 __U, __m128i __A, __m128i __W)
-{
- return (__m128i) __builtin_ia32_blendmq_128_mask ((__v2di) __A,
- (__v2di) __W,
- (__mmask8) __U);
-}
-
-extern __inline __m128i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_blend_epi32 (__mmask8 __U, __m128i __A, __m128i __W)
-{
- return (__m128i) __builtin_ia32_blendmd_128_mask ((__v4si) __A,
- (__v4si) __W,
- (__mmask8) __U);
-}
-
extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_epi64_mask (__m256i __X, __m256i __Y, const int __P)
@@ -12864,74 +12864,82 @@ _mm256_permutex_pd (__m256d __X, const int __M)
#define _mm256_mask_srli_epi32(W, U, A, B) \
((__m256i) __builtin_ia32_psrldi256_mask ((__v8si)(__m256i)(A), \
- (int)(B), (__v8si)(__m256i)(W), (__mmask8)(U)))
+ (unsigned int)(B), (__v8si)(__m256i)(W), (__mmask8)(U)))
#define _mm256_maskz_srli_epi32(U, A, B) \
((__m256i) __builtin_ia32_psrldi256_mask ((__v8si)(__m256i)(A), \
- (int)(B), (__v8si)_mm256_setzero_si256 (), (__mmask8)(U)))
+ (unsigned int)(B), (__v8si)_mm256_setzero_si256 (), (__mmask8)(U)))
#define _mm_mask_srli_epi32(W, U, A, B) \
((__m128i) __builtin_ia32_psrldi128_mask ((__v4si)(__m128i)(A), \
- (int)(B), (__v4si)(__m128i)(W), (__mmask8)(U)))
+ (unsigned int)(B), (__v4si)(__m128i)(W), (__mmask8)(U)))
#define _mm_maskz_srli_epi32(U, A, B) \
((__m128i) __builtin_ia32_psrldi128_mask ((__v4si)(__m128i)(A), \
- (int)(B), (__v4si)_mm_setzero_si128 (), (__mmask8)(U)))
+ (unsigned int)(B), (__v4si)_mm_setzero_si128 (), (__mmask8)(U)))
#define _mm256_mask_srli_epi64(W, U, A, B) \
((__m256i) __builtin_ia32_psrlqi256_mask ((__v4di)(__m256i)(A), \
- (int)(B), (__v4di)(__m256i)(W), (__mmask8)(U)))
+ (unsigned int)(B), (__v4di)(__m256i)(W), (__mmask8)(U)))
#define _mm256_maskz_srli_epi64(U, A, B) \
((__m256i) __builtin_ia32_psrlqi256_mask ((__v4di)(__m256i)(A), \
- (int)(B), (__v4di)_mm256_setzero_si256 (), (__mmask8)(U)))
+ (unsigned int)(B), (__v4di)_mm256_setzero_si256 (), (__mmask8)(U)))
#define _mm_mask_srli_epi64(W, U, A, B) \
((__m128i) __builtin_ia32_psrlqi128_mask ((__v2di)(__m128i)(A), \
- (int)(B), (__v2di)(__m128i)(W), (__mmask8)(U)))
+ (unsigned int)(B), (__v2di)(__m128i)(W), (__mmask8)(U)))
#define _mm_maskz_srli_epi64(U, A, B) \
((__m128i) __builtin_ia32_psrlqi128_mask ((__v2di)(__m128i)(A), \
- (int)(B), (__v2di)_mm_setzero_si128 (), (__mmask8)(U)))
+ (unsigned int)(B), (__v2di)_mm_setzero_si128 (), (__mmask8)(U)))
-#define _mm256_mask_slli_epi32(W, U, X, C) \
- ((__m256i)__builtin_ia32_pslldi256_mask ((__v8si)(__m256i)(X), (int)(C),\
- (__v8si)(__m256i)(W), \
+#define _mm256_mask_slli_epi32(W, U, X, C) \
+ ((__m256i)__builtin_ia32_pslldi256_mask ((__v8si)(__m256i)(X), \
+ (unsigned int)(C), \
+ (__v8si)(__m256i)(W), \
(__mmask8)(U)))
-#define _mm256_maskz_slli_epi32(U, X, C) \
- ((__m256i)__builtin_ia32_pslldi256_mask ((__v8si)(__m256i)(X), (int)(C),\
- (__v8si)(__m256i)_mm256_setzero_si256 (), \
+#define _mm256_maskz_slli_epi32(U, X, C) \
+ ((__m256i)__builtin_ia32_pslldi256_mask ((__v8si)(__m256i)(X), \
+ (unsigned int)(C), \
+ (__v8si)(__m256i)_mm256_setzero_si256 (), \
(__mmask8)(U)))
-#define _mm256_mask_slli_epi64(W, U, X, C) \
- ((__m256i)__builtin_ia32_psllqi256_mask ((__v4di)(__m256i)(X), (int)(C),\
- (__v4di)(__m256i)(W), \
+#define _mm256_mask_slli_epi64(W, U, X, C) \
+ ((__m256i)__builtin_ia32_psllqi256_mask ((__v4di)(__m256i)(X), \
+ (unsigned int)(C), \
+ (__v4di)(__m256i)(W), \
(__mmask8)(U)))
-#define _mm256_maskz_slli_epi64(U, X, C) \
- ((__m256i)__builtin_ia32_psllqi256_mask ((__v4di)(__m256i)(X), (int)(C),\
- (__v4di)(__m256i)_mm256_setzero_si256 (), \
+#define _mm256_maskz_slli_epi64(U, X, C) \
+ ((__m256i)__builtin_ia32_psllqi256_mask ((__v4di)(__m256i)(X), \
+ (unsigned int)(C), \
+ (__v4di)(__m256i)_mm256_setzero_si256 (), \
(__mmask8)(U)))
-#define _mm_mask_slli_epi32(W, U, X, C) \
- ((__m128i)__builtin_ia32_pslldi128_mask ((__v4si)(__m128i)(X), (int)(C),\
- (__v4si)(__m128i)(W),\
+#define _mm_mask_slli_epi32(W, U, X, C) \
+ ((__m128i)__builtin_ia32_pslldi128_mask ((__v4si)(__m128i)(X), \
+ (unsigned int)(C), \
+ (__v4si)(__m128i)(W), \
(__mmask8)(U)))
-#define _mm_maskz_slli_epi32(U, X, C) \
- ((__m128i)__builtin_ia32_pslldi128_mask ((__v4si)(__m128i)(X), (int)(C),\
- (__v4si)(__m128i)_mm_setzero_si128 (),\
+#define _mm_maskz_slli_epi32(U, X, C) \
+ ((__m128i)__builtin_ia32_pslldi128_mask ((__v4si)(__m128i)(X), \
+ (unsigned int)(C), \
+ (__v4si)(__m128i)_mm_setzero_si128 (), \
(__mmask8)(U)))
-#define _mm_mask_slli_epi64(W, U, X, C) \
- ((__m128i)__builtin_ia32_psllqi128_mask ((__v2di)(__m128i)(X), (int)(C),\
- (__v2di)(__m128i)(W),\
+#define _mm_mask_slli_epi64(W, U, X, C) \
+ ((__m128i)__builtin_ia32_psllqi128_mask ((__v2di)(__m128i)(X), \
+ (unsigned int)(C), \
+ (__v2di)(__m128i)(W), \
(__mmask8)(U)))
-#define _mm_maskz_slli_epi64(U, X, C) \
- ((__m128i)__builtin_ia32_psllqi128_mask ((__v2di)(__m128i)(X), (int)(C),\
- (__v2di)(__m128i)_mm_setzero_si128 (),\
+#define _mm_maskz_slli_epi64(U, X, C) \
+ ((__m128i)__builtin_ia32_psllqi128_mask ((__v2di)(__m128i)(X), \
+ (unsigned int)(C), \
+ (__v2di)(__m128i)_mm_setzero_si128 (), \
(__mmask8)(U)))
#define _mm256_ternarylogic_epi64(A, B, C, I) \
@@ -13634,43 +13642,43 @@ _mm256_permutex_pd (__m256d __X, const int __M)
#define _mm256_mask_srai_epi32(W, U, A, B) \
((__m256i) __builtin_ia32_psradi256_mask ((__v8si)(__m256i)(A), \
- (int)(B), (__v8si)(__m256i)(W), (__mmask8)(U)))
+ (unsigned int)(B), (__v8si)(__m256i)(W), (__mmask8)(U)))
#define _mm256_maskz_srai_epi32(U, A, B) \
((__m256i) __builtin_ia32_psradi256_mask ((__v8si)(__m256i)(A), \
- (int)(B), (__v8si)_mm256_setzero_si256 (), (__mmask8)(U)))
+ (unsigned int)(B), (__v8si)_mm256_setzero_si256 (), (__mmask8)(U)))
#define _mm_mask_srai_epi32(W, U, A, B) \
((__m128i) __builtin_ia32_psradi128_mask ((__v4si)(__m128i)(A), \
- (int)(B), (__v4si)(__m128i)(W), (__mmask8)(U)))
+ (unsigned int)(B), (__v4si)(__m128i)(W), (__mmask8)(U)))
#define _mm_maskz_srai_epi32(U, A, B) \
((__m128i) __builtin_ia32_psradi128_mask ((__v4si)(__m128i)(A), \
- (int)(B), (__v4si)_mm_setzero_si128 (), (__mmask8)(U)))
+ (unsigned int)(B), (__v4si)_mm_setzero_si128 (), (__mmask8)(U)))
#define _mm256_srai_epi64(A, B) \
((__m256i) __builtin_ia32_psraqi256_mask ((__v4di)(__m256i)(A), \
- (int)(B), (__v4di)_mm256_setzero_si256 (), (__mmask8)-1))
+ (unsigned int)(B), (__v4di)_mm256_setzero_si256 (), (__mmask8)-1))
#define _mm256_mask_srai_epi64(W, U, A, B) \
((__m256i) __builtin_ia32_psraqi256_mask ((__v4di)(__m256i)(A), \
- (int)(B), (__v4di)(__m256i)(W), (__mmask8)(U)))
+ (unsigned int)(B), (__v4di)(__m256i)(W), (__mmask8)(U)))
#define _mm256_maskz_srai_epi64(U, A, B) \
((__m256i) __builtin_ia32_psraqi256_mask ((__v4di)(__m256i)(A), \
- (int)(B), (__v4di)_mm256_setzero_si256 (), (__mmask8)(U)))
+ (unsigned int)(B), (__v4di)_mm256_setzero_si256 (), (__mmask8)(U)))
#define _mm_srai_epi64(A, B) \
((__m128i) __builtin_ia32_psraqi128_mask ((__v2di)(__m128i)(A), \
- (int)(B), (__v2di)_mm_setzero_si128 (), (__mmask8)-1))
+ (unsigned int)(B), (__v2di)_mm_setzero_si128 (), (__mmask8)-1))
#define _mm_mask_srai_epi64(W, U, A, B) \
((__m128i) __builtin_ia32_psraqi128_mask ((__v2di)(__m128i)(A), \
- (int)(B), (__v2di)(__m128i)(W), (__mmask8)(U)))
+ (unsigned int)(B), (__v2di)(__m128i)(W), (__mmask8)(U)))
#define _mm_maskz_srai_epi64(U, A, B) \
((__m128i) __builtin_ia32_psraqi128_mask ((__v2di)(__m128i)(A), \
- (int)(B), (__v2di)_mm_setzero_si128 (), (__mmask8)(U)))
+ (unsigned int)(B), (__v2di)_mm_setzero_si128 (), (__mmask8)(U)))
#define _mm256_mask_permutex_pd(W, U, A, B) \
((__m256d) __builtin_ia32_permdf256_mask ((__v4df)(__m256d)(A), \
@@ -13717,46 +13725,6 @@ _mm256_permutex_pd (__m256d __X, const int __M)
(__v4sf)(__m128)_mm_setzero_ps (), \
(__mmask8)(U)))
-#define _mm256_mask_blend_pd(__U, __A, __W) \
- ((__m256d) __builtin_ia32_blendmpd_256_mask ((__v4df) (__A), \
- (__v4df) (__W), \
- (__mmask8) (__U)))
-
-#define _mm256_mask_blend_ps(__U, __A, __W) \
- ((__m256) __builtin_ia32_blendmps_256_mask ((__v8sf) (__A), \
- (__v8sf) (__W), \
- (__mmask8) (__U)))
-
-#define _mm256_mask_blend_epi64(__U, __A, __W) \
- ((__m256i) __builtin_ia32_blendmq_256_mask ((__v4di) (__A), \
- (__v4di) (__W), \
- (__mmask8) (__U)))
-
-#define _mm256_mask_blend_epi32(__U, __A, __W) \
- ((__m256i) __builtin_ia32_blendmd_256_mask ((__v8si) (__A), \
- (__v8si) (__W), \
- (__mmask8) (__U)))
-
-#define _mm_mask_blend_pd(__U, __A, __W) \
- ((__m128d) __builtin_ia32_blendmpd_128_mask ((__v2df) (__A), \
- (__v2df) (__W), \
- (__mmask8) (__U)))
-
-#define _mm_mask_blend_ps(__U, __A, __W) \
- ((__m128) __builtin_ia32_blendmps_128_mask ((__v4sf) (__A), \
- (__v4sf) (__W), \
- (__mmask8) (__U)))
-
-#define _mm_mask_blend_epi64(__U, __A, __W) \
- ((__m128i) __builtin_ia32_blendmq_128_mask ((__v2di) (__A), \
- (__v2di) (__W), \
- (__mmask8) (__U)))
-
-#define _mm_mask_blend_epi32(__U, __A, __W) \
- ((__m128i) __builtin_ia32_blendmd_128_mask ((__v4si) (__A), \
- (__v4si) (__W), \
- (__mmask8) (__U)))
-
#define _mm256_cmp_epu32_mask(X, Y, P) \
((__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si)(__m256i)(X), \
(__v8si)(__m256i)(Y), (int)(P),\
diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md
index aeda107..fd490f3 100644
--- a/gcc/config/i386/constraints.md
+++ b/gcc/config/i386/constraints.md
@@ -162,7 +162,9 @@
;; g GOT memory operand.
;; m Vector memory operand
;; c Constant memory operand
+;; k TLS address that allows insn using non-integer registers
;; n Memory operand without REX prefix
+;; r Broadcast memory operand
;; s Sibcall memory operand, not valid for TARGET_X32
;; w Call memory operand, not valid for TARGET_X32
;; z Constant call address operand.
diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h
index be162dd..4cc4461 100644
--- a/gcc/config/i386/cpuid.h
+++ b/gcc/config/i386/cpuid.h
@@ -24,15 +24,6 @@
#ifndef _CPUID_H_INCLUDED
#define _CPUID_H_INCLUDED
-/* %eax */
-#define bit_RAOINT (1 << 3)
-#define bit_AVXVNNI (1 << 4)
-#define bit_AVX512BF16 (1 << 5)
-#define bit_CMPCCXADD (1 << 7)
-#define bit_AMX_FP16 (1 << 21)
-#define bit_HRESET (1 << 22)
-#define bit_AVXIFMA (1 << 23)
-
/* %ecx */
#define bit_SSE3 (1 << 0)
#define bit_PCLMUL (1 << 1)
@@ -52,10 +43,7 @@
#define bit_RDRND (1 << 30)
/* %edx */
-#define bit_AVXVNNIINT8 (1 << 4)
-#define bit_AVXNECONVERT (1 << 5)
#define bit_CMPXCHG8B (1 << 8)
-#define bit_PREFETCHI (1 << 14)
#define bit_CMOV (1 << 15)
#define bit_MMX (1 << 23)
#define bit_FXSAVE (1 << 24)
@@ -84,19 +72,19 @@
#define bit_CLZERO (1 << 0)
#define bit_WBNOINVD (1 << 9)
-/* Extended Features (%eax == 7) */
+/* Extended Features Leaf (%eax == 7, %ecx == 0) */
/* %ebx */
#define bit_FSGSBASE (1 << 0)
-#define bit_SGX (1 << 2)
-#define bit_BMI (1 << 3)
-#define bit_HLE (1 << 4)
+#define bit_SGX (1 << 2)
+#define bit_BMI (1 << 3)
+#define bit_HLE (1 << 4)
#define bit_AVX2 (1 << 5)
#define bit_BMI2 (1 << 8)
-#define bit_RTM (1 << 11)
+#define bit_RTM (1 << 11)
#define bit_AVX512F (1 << 16)
#define bit_AVX512DQ (1 << 17)
#define bit_RDSEED (1 << 18)
-#define bit_ADX (1 << 19)
+#define bit_ADX (1 << 19)
#define bit_AVX512IFMA (1 << 21)
#define bit_CLFLUSHOPT (1 << 23)
#define bit_CLWB (1 << 24)
@@ -108,40 +96,56 @@
#define bit_AVX512VL (1u << 31)
/* %ecx */
-#define bit_PREFETCHWT1 (1 << 0)
+#define bit_PREFETCHWT1 (1 << 0)
#define bit_AVX512VBMI (1 << 1)
-#define bit_PKU (1 << 3)
+#define bit_PKU (1 << 3)
#define bit_OSPKE (1 << 4)
#define bit_WAITPKG (1 << 5)
#define bit_AVX512VBMI2 (1 << 6)
#define bit_SHSTK (1 << 7)
#define bit_GFNI (1 << 8)
#define bit_VAES (1 << 9)
-#define bit_AVX512VNNI (1 << 11)
#define bit_VPCLMULQDQ (1 << 10)
+#define bit_AVX512VNNI (1 << 11)
#define bit_AVX512BITALG (1 << 12)
#define bit_AVX512VPOPCNTDQ (1 << 14)
#define bit_RDPID (1 << 22)
+#define bit_KL (1 << 23)
+#define bit_CLDEMOTE (1 << 25)
#define bit_MOVDIRI (1 << 27)
#define bit_MOVDIR64B (1 << 28)
#define bit_ENQCMD (1 << 29)
-#define bit_CLDEMOTE (1 << 25)
-#define bit_KL (1 << 23)
/* %edx */
-#define bit_AVX5124VNNIW (1 << 2)
-#define bit_AVX5124FMAPS (1 << 3)
+#define bit_AVX5124VNNIW (1 << 2)
+#define bit_AVX5124FMAPS (1 << 3)
+#define bit_UINTR (1 << 5)
#define bit_AVX512VP2INTERSECT (1 << 8)
-#define bit_AVX512FP16 (1 << 23)
-#define bit_IBT (1 << 20)
-#define bit_UINTR (1 << 5)
-#define bit_PCONFIG (1 << 18)
#define bit_SERIALIZE (1 << 14)
#define bit_TSXLDTRK (1 << 16)
+#define bit_PCONFIG (1 << 18)
+#define bit_IBT (1 << 20)
#define bit_AMX_BF16 (1 << 22)
+#define bit_AVX512FP16 (1 << 23)
#define bit_AMX_TILE (1 << 24)
#define bit_AMX_INT8 (1 << 25)
+/* Extended Features Sub-leaf (%eax == 7, %ecx == 1) */
+/* %eax */
+#define bit_RAOINT (1 << 3)
+#define bit_AVXVNNI (1 << 4)
+#define bit_AVX512BF16 (1 << 5)
+#define bit_CMPCCXADD (1 << 7)
+#define bit_AMX_COMPLEX (1 << 8)
+#define bit_AMX_FP16 (1 << 21)
+#define bit_HRESET (1 << 22)
+#define bit_AVXIFMA (1 << 23)
+
+/* %edx */
+#define bit_AVXVNNIINT8 (1 << 4)
+#define bit_AVXNECONVERT (1 << 5)
+#define bit_PREFETCHI (1 << 14)
+
/* Extended State Enumeration Sub-leaf (%eax == 0xd, %ecx == 1) */
#define bit_XSAVEOPT (1 << 0)
#define bit_XSAVEC (1 << 1)
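With this reorganisation, RAO-INT, AVX-VNNI, AVX512-BF16, CMPCCXADD, AMX-COMPLEX, AMX-FP16, HRESET, AVX-IFMA, AVX-VNNI-INT8, AVX-NE-CONVERT and PREFETCHI are grouped under the %eax == 7, %ecx == 1 sub-leaf they are reported in. A minimal probe sketch using the existing __get_cpuid_count helper from this header (not part of the patch; assumes an x86 host):

#include <cpuid.h>
#include <stdio.h>

int
main (void)
{
  unsigned int eax, ebx, ecx, edx;
  /* Leaf 7, sub-leaf 1: the bits above live in %eax and %edx here.  */
  if (__get_cpuid_count (7, 1, &eax, &ebx, &ecx, &edx))
    {
      printf ("AMX-COMPLEX : %d\n", (eax & bit_AMX_COMPLEX) != 0);
      printf ("AVX-IFMA    : %d\n", (eax & bit_AVXIFMA) != 0);
      printf ("PREFETCHI   : %d\n", (edx & bit_PREFETCHI) != 0);
    }
  return 0;
}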
diff --git a/gcc/config/i386/gcc-auto-profile b/gcc/config/i386/gcc-auto-profile
index 5ab224b..04f7d35 100755
--- a/gcc/config/i386/gcc-auto-profile
+++ b/gcc/config/i386/gcc-auto-profile
@@ -43,8 +43,10 @@ model*:\ 47|\
model*:\ 37|\
model*:\ 44) E="cpu/event=0x88,umask=0x40/$FLAGS" ;;
model*:\ 55|\
+model*:\ 74|\
model*:\ 77|\
model*:\ 76|\
+model*:\ 90|\
model*:\ 92|\
model*:\ 95|\
model*:\ 87|\
@@ -75,14 +77,19 @@ model*:\ 165|\
model*:\ 166|\
model*:\ 85|\
model*:\ 85) E="cpu/event=0xC4,umask=0x20/p$FLAGS" ;;
+model*:\ 125|\
model*:\ 126|\
+model*:\ 167|\
model*:\ 140|\
model*:\ 141|\
model*:\ 143|\
+model*:\ 207|\
model*:\ 106|\
model*:\ 108) E="cpu/event=0xc4,umask=0x20/p$FLAGS" ;;
model*:\ 134|\
-model*:\ 150) E="cpu/event=0xc4,umask=0xfe/p$FLAGS" ;;
+model*:\ 150|\
+model*:\ 156|\
+model*:\ 190) E="cpu/event=0xc4,umask=0xfe/p$FLAGS" ;;
*)
echo >&2 "Unknown CPU. Run contrib/gen_autofdo_event.py --all --script to update script."
exit 1 ;;
diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
index 65fe070..cb2d0cd 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -73,7 +73,7 @@ DEF_PRIMITIVE_TYPE (BFLOAT16, ix86_bf16_type_node)
DEF_PRIMITIVE_TYPE (FLOAT, float_type_node)
DEF_PRIMITIVE_TYPE (DOUBLE, double_type_node)
DEF_PRIMITIVE_TYPE (FLOAT80, float80_type_node)
-DEF_PRIMITIVE_TYPE (FLOAT128, float128_type_node)
+DEF_PRIMITIVE_TYPE (FLOAT128, float128t_type_node)
DEF_PRIMITIVE_TYPE (CONST_STRING, const_string_type_node)
# MMX vectors
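FLOAT128 now maps to float128t_type_node; assuming that node denotes the target-specific __float128 type (distinct from the ISO _Float128 node), the user-visible difference the builtin signatures have to respect looks like this (not part of the patch; assumes x86-64 GCC with quad-precision support):

/* Both compile on x86-64 GCC, but they are distinct types.  */
__float128 mul2q (__float128 x) { return x * 2; }  /* target-specific quad type */
_Float128  mul2f (_Float128 x)  { return x * 2; }  /* ISO interchange type */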
diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 17dfe40..7ba5b6a 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -430,20 +430,20 @@ BDESC (OPTION_MASK_ISA_PKU, 0, CODE_FOR_rdpkru, "__builtin_ia32_rdpkru", IX86_B
BDESC (OPTION_MASK_ISA_PKU, 0, CODE_FOR_wrpkru, "__builtin_ia32_wrpkru", IX86_BUILTIN_WRPKRU, UNKNOWN, (int) VOID_FTYPE_UNSIGNED)
/* VBMI2 */
-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_compressstorev64qi_mask, "__builtin_ia32_compressstoreuqi512_mask", IX86_BUILTIN_PCOMPRESSBSTORE512, UNKNOWN, (int) VOID_FTYPE_PV64QI_V64QI_UDI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_compressstorev32hi_mask, "__builtin_ia32_compressstoreuhi512_mask", IX86_BUILTIN_PCOMPRESSWSTORE512, UNKNOWN, (int) VOID_FTYPE_PV32HI_V32HI_USI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_compressstorev32qi_mask, "__builtin_ia32_compressstoreuqi256_mask", IX86_BUILTIN_PCOMPRESSBSTORE256, UNKNOWN, (int) VOID_FTYPE_PV32QI_V32QI_USI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_compressstorev64qi_mask, "__builtin_ia32_compressstoreuqi512_mask", IX86_BUILTIN_PCOMPRESSBSTORE512, UNKNOWN, (int) VOID_FTYPE_PV64QI_V64QI_UDI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_compressstorev32hi_mask, "__builtin_ia32_compressstoreuhi512_mask", IX86_BUILTIN_PCOMPRESSWSTORE512, UNKNOWN, (int) VOID_FTYPE_PV32HI_V32HI_USI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_compressstorev32qi_mask, "__builtin_ia32_compressstoreuqi256_mask", IX86_BUILTIN_PCOMPRESSBSTORE256, UNKNOWN, (int) VOID_FTYPE_PV32QI_V32QI_USI)
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_compressstorev16qi_mask, "__builtin_ia32_compressstoreuqi128_mask", IX86_BUILTIN_PCOMPRESSBSTORE128, UNKNOWN, (int) VOID_FTYPE_PV16QI_V16QI_UHI)
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_compressstorev16hi_mask, "__builtin_ia32_compressstoreuhi256_mask", IX86_BUILTIN_PCOMPRESSWSTORE256, UNKNOWN, (int) VOID_FTYPE_PV16HI_V16HI_UHI)
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_compressstorev8hi_mask, "__builtin_ia32_compressstoreuhi128_mask", IX86_BUILTIN_PCOMPRESSWSTORE128, UNKNOWN, (int) VOID_FTYPE_PV8HI_V8HI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_expandv64qi_mask, "__builtin_ia32_expandloadqi512_mask", IX86_BUILTIN_PEXPANDBLOAD512, UNKNOWN, (int) V64QI_FTYPE_PCV64QI_V64QI_UDI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_expandv64qi_maskz, "__builtin_ia32_expandloadqi512_maskz", IX86_BUILTIN_PEXPANDBLOAD512Z, UNKNOWN, (int) V64QI_FTYPE_PCV64QI_V64QI_UDI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_expandv32hi_mask, "__builtin_ia32_expandloadhi512_mask", IX86_BUILTIN_PEXPANDWLOAD512, UNKNOWN, (int) V32HI_FTYPE_PCV32HI_V32HI_USI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_expandv32hi_maskz, "__builtin_ia32_expandloadhi512_maskz", IX86_BUILTIN_PEXPANDWLOAD512Z, UNKNOWN, (int) V32HI_FTYPE_PCV32HI_V32HI_USI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_expandv64qi_mask, "__builtin_ia32_expandloadqi512_mask", IX86_BUILTIN_PEXPANDBLOAD512, UNKNOWN, (int) V64QI_FTYPE_PCV64QI_V64QI_UDI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_expandv64qi_maskz, "__builtin_ia32_expandloadqi512_maskz", IX86_BUILTIN_PEXPANDBLOAD512Z, UNKNOWN, (int) V64QI_FTYPE_PCV64QI_V64QI_UDI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_expandv32hi_mask, "__builtin_ia32_expandloadhi512_mask", IX86_BUILTIN_PEXPANDWLOAD512, UNKNOWN, (int) V32HI_FTYPE_PCV32HI_V32HI_USI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_expandv32hi_maskz, "__builtin_ia32_expandloadhi512_maskz", IX86_BUILTIN_PEXPANDWLOAD512Z, UNKNOWN, (int) V32HI_FTYPE_PCV32HI_V32HI_USI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_expandv32qi_mask, "__builtin_ia32_expandloadqi256_mask", IX86_BUILTIN_PEXPANDBLOAD256, UNKNOWN, (int) V32QI_FTYPE_PCV32QI_V32QI_USI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_expandv32qi_maskz, "__builtin_ia32_expandloadqi256_maskz", IX86_BUILTIN_PEXPANDBLOAD256Z, UNKNOWN, (int) V32QI_FTYPE_PCV32QI_V32QI_USI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv32qi_mask, "__builtin_ia32_expandloadqi256_mask", IX86_BUILTIN_PEXPANDBLOAD256, UNKNOWN, (int) V32QI_FTYPE_PCV32QI_V32QI_USI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv32qi_maskz, "__builtin_ia32_expandloadqi256_maskz", IX86_BUILTIN_PEXPANDBLOAD256Z, UNKNOWN, (int) V32QI_FTYPE_PCV32QI_V32QI_USI)
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv16hi_mask, "__builtin_ia32_expandloadhi256_mask", IX86_BUILTIN_PEXPANDWLOAD256, UNKNOWN, (int) V16HI_FTYPE_PCV16HI_V16HI_UHI)
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv16hi_maskz, "__builtin_ia32_expandloadhi256_maskz", IX86_BUILTIN_PEXPANDWLOAD256Z, UNKNOWN, (int) V16HI_FTYPE_PCV16HI_V16HI_UHI)
@@ -899,11 +899,11 @@ BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps"
BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF)
/* SSSE3 */
-BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI)
+BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_nothing, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI)
BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_ssse3_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI)
-BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI)
+BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_nothing, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI)
BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_ssse3_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI)
-BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI)
+BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_nothing, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI)
BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_ssse3_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI)
BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI)
@@ -942,7 +942,7 @@ BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blen
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_nothing, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF)
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT)
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT)
-BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT)
+BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_insertps_v4sf, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT)
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT)
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_nothing, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI)
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT)
@@ -1004,8 +1004,8 @@ BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF)
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF)
-BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_ptestv2di, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST)
-BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_ptestv2di, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST)
+BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_ptestzv2di, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST)
+BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_ptestcv2di, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST)
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_ptestv2di, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST)
/* SSE4.2 */
@@ -1164,8 +1164,8 @@ BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzc
BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST)
BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST)
BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST)
-BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_ptestv4di, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST)
-BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_ptestv4di, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST)
+BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_ptestzv4di, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST)
+BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_ptestcv4di, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST)
BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_ptestv4di, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST)
BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF )
@@ -1178,9 +1178,9 @@ BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_
/* AVX2 */
BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT)
-BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI)
-BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI)
-BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI)
+BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI)
+BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI)
+BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI)
BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI)
BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI)
BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI)
@@ -1384,7 +1384,7 @@ BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_compressv8df_mask, "__builti
BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_compressv16sf_mask, "__builtin_ia32_compresssf512_mask", IX86_BUILTIN_COMPRESSPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_UHI)
BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_floatv8siv8df2_mask, "__builtin_ia32_cvtdq2pd512_mask", IX86_BUILTIN_CVTDQ2PD512, UNKNOWN, (int) V8DF_FTYPE_V8SI_V8DF_UQI)
BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_vcvtps2ph512_mask_sae, "__builtin_ia32_vcvtps2ph512_mask", IX86_BUILTIN_CVTPS2PH512, UNKNOWN, (int) V16HI_FTYPE_V16SF_INT_V16HI_UHI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_ufloatv8siv8df2_mask, "__builtin_ia32_cvtudq2pd512_mask", IX86_BUILTIN_CVTUDQ2PD512, UNKNOWN, (int) V8DF_FTYPE_V8SI_V8DF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_floatunsv8siv8df2_mask, "__builtin_ia32_cvtudq2pd512_mask", IX86_BUILTIN_CVTUDQ2PD512, UNKNOWN, (int) V8DF_FTYPE_V8SI_V8DF_UQI)
BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_cvtusi2sd32, "__builtin_ia32_cvtusi2sd32", IX86_BUILTIN_CVTUSI2SD32, UNKNOWN, (int) V2DF_FTYPE_V2DF_UINT)
BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_expandv8df_mask, "__builtin_ia32_expanddf512_mask", IX86_BUILTIN_EXPANDPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_UQI)
BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv8df_maskz, "__builtin_ia32_expanddf512_maskz", IX86_BUILTIN_EXPANDPD512Z, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_UQI)
@@ -1719,32 +1719,32 @@ BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fixuns_t
BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fixuns_truncv2dfv2di2_mask, "__builtin_ia32_cvttpd2uqq128_mask", IX86_BUILTIN_CVTTPD2UQQ128, UNKNOWN, (int) V2DI_FTYPE_V2DF_V2DI_UQI)
BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fix_notruncv4dfv4di2_mask, "__builtin_ia32_cvtpd2qq256_mask", IX86_BUILTIN_CVTPD2QQ256, UNKNOWN, (int) V4DI_FTYPE_V4DF_V4DI_UQI)
BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fix_notruncv2dfv2di2_mask, "__builtin_ia32_cvtpd2qq128_mask", IX86_BUILTIN_CVTPD2QQ128, UNKNOWN, (int) V2DI_FTYPE_V2DF_V2DI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_ufix_notruncv4dfv4di2_mask, "__builtin_ia32_cvtpd2uqq256_mask", IX86_BUILTIN_CVTPD2UQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DF_V4DI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_ufix_notruncv2dfv2di2_mask, "__builtin_ia32_cvtpd2uqq128_mask", IX86_BUILTIN_CVTPD2UQQ128, UNKNOWN, (int) V2DI_FTYPE_V2DF_V2DI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_ufix_notruncv4dfv4si2_mask, "__builtin_ia32_cvtpd2udq256_mask", IX86_BUILTIN_CVTPD2UDQ256_MASK, UNKNOWN, (int) V4SI_FTYPE_V4DF_V4SI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_ufix_notruncv2dfv2si2_mask, "__builtin_ia32_cvtpd2udq128_mask", IX86_BUILTIN_CVTPD2UDQ128_MASK, UNKNOWN, (int) V4SI_FTYPE_V2DF_V4SI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fixuns_notruncv4dfv4di2_mask, "__builtin_ia32_cvtpd2uqq256_mask", IX86_BUILTIN_CVTPD2UQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DF_V4DI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fixuns_notruncv2dfv2di2_mask, "__builtin_ia32_cvtpd2uqq128_mask", IX86_BUILTIN_CVTPD2UQQ128, UNKNOWN, (int) V2DI_FTYPE_V2DF_V2DI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fixuns_notruncv4dfv4si2_mask, "__builtin_ia32_cvtpd2udq256_mask", IX86_BUILTIN_CVTPD2UDQ256_MASK, UNKNOWN, (int) V4SI_FTYPE_V4DF_V4SI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fixuns_notruncv2dfv2si2_mask, "__builtin_ia32_cvtpd2udq128_mask", IX86_BUILTIN_CVTPD2UDQ128_MASK, UNKNOWN, (int) V4SI_FTYPE_V2DF_V4SI_UQI)
BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fix_truncv4sfv4di2_mask, "__builtin_ia32_cvttps2qq256_mask", IX86_BUILTIN_CVTTPS2QQ256, UNKNOWN, (int) V4DI_FTYPE_V4SF_V4DI_UQI)
BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512dq_fix_truncv2sfv2di2_mask, "__builtin_ia32_cvttps2qq128_mask", IX86_BUILTIN_CVTTPS2QQ128, UNKNOWN, (int) V2DI_FTYPE_V4SF_V2DI_UQI)
BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fixuns_truncv4sfv4di2_mask, "__builtin_ia32_cvttps2uqq256_mask", IX86_BUILTIN_CVTTPS2UQQ256, UNKNOWN, (int) V4DI_FTYPE_V4SF_V4DI_UQI)
BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512dq_fixuns_truncv2sfv2di2_mask, "__builtin_ia32_cvttps2uqq128_mask", IX86_BUILTIN_CVTTPS2UQQ128, UNKNOWN, (int) V2DI_FTYPE_V4SF_V2DI_UQI)
BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fix_truncv8sfv8si2_mask, "__builtin_ia32_cvttps2dq256_mask", IX86_BUILTIN_CVTTPS2DQ256_MASK, UNKNOWN, (int) V8SI_FTYPE_V8SF_V8SI_UQI)
BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fix_truncv4sfv4si2_mask, "__builtin_ia32_cvttps2dq128_mask", IX86_BUILTIN_CVTTPS2DQ128_MASK, UNKNOWN, (int) V4SI_FTYPE_V4SF_V4SI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_ufix_truncv8sfv8si2_mask, "__builtin_ia32_cvttps2udq256_mask", IX86_BUILTIN_CVTTPS2UDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF_V8SI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_ufix_truncv4sfv4si2_mask, "__builtin_ia32_cvttps2udq128_mask", IX86_BUILTIN_CVTTPS2UDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SF_V4SI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fixuns_truncv8sfv8si2_mask, "__builtin_ia32_cvttps2udq256_mask", IX86_BUILTIN_CVTTPS2UDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF_V8SI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fixuns_truncv4sfv4si2_mask, "__builtin_ia32_cvttps2udq128_mask", IX86_BUILTIN_CVTTPS2UDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SF_V4SI_UQI)
BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fix_truncv4dfv4si2_mask, "__builtin_ia32_cvttpd2dq256_mask", IX86_BUILTIN_CVTTPD2DQ256_MASK, UNKNOWN, (int) V4SI_FTYPE_V4DF_V4SI_UQI)
BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_sse2_cvttpd2dq_mask, "__builtin_ia32_cvttpd2dq128_mask", IX86_BUILTIN_CVTTPD2DQ128_MASK, UNKNOWN, (int) V4SI_FTYPE_V2DF_V4SI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_ufix_truncv4dfv4si2_mask, "__builtin_ia32_cvttpd2udq256_mask", IX86_BUILTIN_CVTTPD2UDQ256_MASK, UNKNOWN, (int) V4SI_FTYPE_V4DF_V4SI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_ufix_truncv2dfv2si2_mask, "__builtin_ia32_cvttpd2udq128_mask", IX86_BUILTIN_CVTTPD2UDQ128_MASK, UNKNOWN, (int) V4SI_FTYPE_V2DF_V4SI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fixuns_truncv4dfv4si2_mask, "__builtin_ia32_cvttpd2udq256_mask", IX86_BUILTIN_CVTTPD2UDQ256_MASK, UNKNOWN, (int) V4SI_FTYPE_V4DF_V4SI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fixuns_truncv2dfv2si2_mask, "__builtin_ia32_cvttpd2udq128_mask", IX86_BUILTIN_CVTTPD2UDQ128_MASK, UNKNOWN, (int) V4SI_FTYPE_V2DF_V4SI_UQI)
BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx_cvtpd2dq256_mask, "__builtin_ia32_cvtpd2dq256_mask", IX86_BUILTIN_CVTPD2DQ256_MASK, UNKNOWN, (int) V4SI_FTYPE_V4DF_V4SI_UQI)
BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_sse2_cvtpd2dq_mask, "__builtin_ia32_cvtpd2dq128_mask", IX86_BUILTIN_CVTPD2DQ128_MASK, UNKNOWN, (int) V4SI_FTYPE_V2DF_V4SI_UQI)
BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_floatv4siv4df2_mask, "__builtin_ia32_cvtdq2pd256_mask", IX86_BUILTIN_CVTDQ2PD256_MASK, UNKNOWN, (int) V4DF_FTYPE_V4SI_V4DF_UQI)
BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_sse2_cvtdq2pd_mask, "__builtin_ia32_cvtdq2pd128_mask", IX86_BUILTIN_CVTDQ2PD128_MASK, UNKNOWN, (int) V2DF_FTYPE_V4SI_V2DF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_ufloatv4siv4df2_mask, "__builtin_ia32_cvtudq2pd256_mask", IX86_BUILTIN_CVTUDQ2PD256_MASK, UNKNOWN, (int) V4DF_FTYPE_V4SI_V4DF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_ufloatv2siv2df2_mask, "__builtin_ia32_cvtudq2pd128_mask", IX86_BUILTIN_CVTUDQ2PD128_MASK, UNKNOWN, (int) V2DF_FTYPE_V4SI_V2DF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_floatunsv4siv4df2_mask, "__builtin_ia32_cvtudq2pd256_mask", IX86_BUILTIN_CVTUDQ2PD256_MASK, UNKNOWN, (int) V4DF_FTYPE_V4SI_V4DF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_floatunsv2siv2df2_mask, "__builtin_ia32_cvtudq2pd128_mask", IX86_BUILTIN_CVTUDQ2PD128_MASK, UNKNOWN, (int) V2DF_FTYPE_V4SI_V2DF_UQI)
BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_floatv8siv8sf2_mask, "__builtin_ia32_cvtdq2ps256_mask", IX86_BUILTIN_CVTDQ2PS256_MASK, UNKNOWN, (int) V8SF_FTYPE_V8SI_V8SF_UQI)
BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_floatv4siv4sf2_mask, "__builtin_ia32_cvtdq2ps128_mask", IX86_BUILTIN_CVTDQ2PS128_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SI_V4SF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_ufloatv8siv8sf2_mask, "__builtin_ia32_cvtudq2ps256_mask", IX86_BUILTIN_CVTUDQ2PS256_MASK, UNKNOWN, (int) V8SF_FTYPE_V8SI_V8SF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_ufloatv4siv4sf2_mask, "__builtin_ia32_cvtudq2ps128_mask", IX86_BUILTIN_CVTUDQ2PS128_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SI_V4SF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_floatunsv8siv8sf2_mask, "__builtin_ia32_cvtudq2ps256_mask", IX86_BUILTIN_CVTUDQ2PS256_MASK, UNKNOWN, (int) V8SF_FTYPE_V8SI_V8SF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_floatunsv4siv4sf2_mask, "__builtin_ia32_cvtudq2ps128_mask", IX86_BUILTIN_CVTUDQ2PS128_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SI_V4SF_UQI)
BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx_cvtps2pd256_mask, "__builtin_ia32_cvtps2pd256_mask", IX86_BUILTIN_CVTPS2PD256_MASK, UNKNOWN, (int) V4DF_FTYPE_V4SF_V4DF_UQI)
BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_sse2_cvtps2pd_mask, "__builtin_ia32_cvtps2pd128_mask", IX86_BUILTIN_CVTPS2PD128_MASK, UNKNOWN, (int) V2DF_FTYPE_V4SF_V2DF_UQI)
BDESC (OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_vec_dupv32qi_mask, "__builtin_ia32_pbroadcastb256_mask", IX86_BUILTIN_PBROADCASTB256_MASK, UNKNOWN, (int) V32QI_FTYPE_V16QI_V32QI_USI)
@@ -2072,8 +2072,8 @@ BDESC (OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl
BDESC (OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_lshrvv8hi_mask, "__builtin_ia32_psrlv8hi_mask", IX86_BUILTIN_PSRLVV8HI, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI_UQI)
BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx_fix_notruncv8sfv8si_mask, "__builtin_ia32_cvtps2dq256_mask", IX86_BUILTIN_CVTPS2DQ256_MASK, UNKNOWN, (int) V8SI_FTYPE_V8SF_V8SI_UQI)
BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_sse2_fix_notruncv4sfv4si_mask, "__builtin_ia32_cvtps2dq128_mask", IX86_BUILTIN_CVTPS2DQ128_MASK, UNKNOWN, (int) V4SI_FTYPE_V4SF_V4SI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_ufix_notruncv8sfv8si_mask, "__builtin_ia32_cvtps2udq256_mask", IX86_BUILTIN_CVTPS2UDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF_V8SI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_ufix_notruncv4sfv4si_mask, "__builtin_ia32_cvtps2udq128_mask", IX86_BUILTIN_CVTPS2UDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SF_V4SI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_fixuns_notruncv8sfv8si_mask, "__builtin_ia32_cvtps2udq256_mask", IX86_BUILTIN_CVTPS2UDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF_V8SI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_fixuns_notruncv4sfv4si_mask, "__builtin_ia32_cvtps2udq128_mask", IX86_BUILTIN_CVTPS2UDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SF_V4SI_UQI)
BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512dq_cvtps2qqv4di_mask, "__builtin_ia32_cvtps2qq256_mask", IX86_BUILTIN_CVTPS2QQ256, UNKNOWN, (int) V4DI_FTYPE_V4SF_V4DI_UQI)
BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512dq_cvtps2qqv2di_mask, "__builtin_ia32_cvtps2qq128_mask", IX86_BUILTIN_CVTPS2QQ128, UNKNOWN, (int) V2DI_FTYPE_V4SF_V2DI_UQI)
BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512dq_cvtps2uqqv4di_mask, "__builtin_ia32_cvtps2uqq256_mask", IX86_BUILTIN_CVTPS2UQQ256, UNKNOWN, (int) V4DI_FTYPE_V4SF_V4DI_UQI)
@@ -2553,18 +2553,18 @@ BDESC (OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512
BDESC (OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_vpermi2varv16qi3_mask, "__builtin_ia32_vpermi2varqi128_mask", IX86_BUILTIN_VPERMI2VARQI128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI_UHI)
/* VBMI2 */
-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_compressv64qi_mask, "__builtin_ia32_compressqi512_mask", IX86_BUILTIN_PCOMPRESSB512, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI_UDI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_compressv32hi_mask, "__builtin_ia32_compresshi512_mask", IX86_BUILTIN_PCOMPRESSW512, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_USI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_compressv32qi_mask, "__builtin_ia32_compressqi256_mask", IX86_BUILTIN_PCOMPRESSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_USI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_compressv64qi_mask, "__builtin_ia32_compressqi512_mask", IX86_BUILTIN_PCOMPRESSB512, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI_UDI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_compressv32hi_mask, "__builtin_ia32_compresshi512_mask", IX86_BUILTIN_PCOMPRESSW512, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_USI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_compressv32qi_mask, "__builtin_ia32_compressqi256_mask", IX86_BUILTIN_PCOMPRESSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_USI)
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_compressv16qi_mask, "__builtin_ia32_compressqi128_mask", IX86_BUILTIN_PCOMPRESSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_UHI)
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_compressv16hi_mask, "__builtin_ia32_compresshi256_mask", IX86_BUILTIN_PCOMPRESSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_UHI)
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_compressv8hi_mask, "__builtin_ia32_compresshi128_mask", IX86_BUILTIN_PCOMPRESSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_expandv64qi_mask, "__builtin_ia32_expandqi512_mask", IX86_BUILTIN_PEXPANDB512, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI_UDI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_expandv64qi_maskz, "__builtin_ia32_expandqi512_maskz", IX86_BUILTIN_PEXPANDB512Z, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI_UDI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_expandv32hi_mask, "__builtin_ia32_expandhi512_mask", IX86_BUILTIN_PEXPANDW512, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_USI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_expandv32hi_maskz, "__builtin_ia32_expandhi512_maskz", IX86_BUILTIN_PEXPANDW512Z, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_USI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_expandv32qi_mask, "__builtin_ia32_expandqi256_mask", IX86_BUILTIN_PEXPANDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_USI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_expandv32qi_maskz, "__builtin_ia32_expandqi256_maskz", IX86_BUILTIN_PEXPANDB256Z, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_USI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_expandv64qi_mask, "__builtin_ia32_expandqi512_mask", IX86_BUILTIN_PEXPANDB512, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI_UDI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_expandv64qi_maskz, "__builtin_ia32_expandqi512_maskz", IX86_BUILTIN_PEXPANDB512Z, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI_UDI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_expandv32hi_mask, "__builtin_ia32_expandhi512_mask", IX86_BUILTIN_PEXPANDW512, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_USI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_expandv32hi_maskz, "__builtin_ia32_expandhi512_maskz", IX86_BUILTIN_PEXPANDW512Z, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_USI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv32qi_mask, "__builtin_ia32_expandqi256_mask", IX86_BUILTIN_PEXPANDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_USI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv32qi_maskz, "__builtin_ia32_expandqi256_maskz", IX86_BUILTIN_PEXPANDB256Z, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_USI)
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv16qi_mask, "__builtin_ia32_expandqi128_mask", IX86_BUILTIN_PEXPANDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_UHI)
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv16qi_maskz, "__builtin_ia32_expandqi128_maskz", IX86_BUILTIN_PEXPANDB128Z, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_UHI)
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv16hi_mask, "__builtin_ia32_expandhi256_mask", IX86_BUILTIN_PEXPANDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_UHI)
@@ -2572,7 +2572,7 @@ BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expan
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv8hi_mask, "__builtin_ia32_expandhi128_mask", IX86_BUILTIN_PEXPANDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_UQI)
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv8hi_maskz, "__builtin_ia32_expandhi128_maskz", IX86_BUILTIN_PEXPANDW128Z, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_UQI)
BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshrd_v32hi, "__builtin_ia32_vpshrd_v32hi", IX86_BUILTIN_VPSHRDV32HI, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_INT)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_vpshrd_v32hi_mask, "__builtin_ia32_vpshrd_v32hi_mask", IX86_BUILTIN_VPSHRDV32HI_MASK, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshrd_v32hi_mask, "__builtin_ia32_vpshrd_v32hi_mask", IX86_BUILTIN_VPSHRDV32HI_MASK, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT)
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrd_v16hi, "__builtin_ia32_vpshrd_v16hi", IX86_BUILTIN_VPSHRDV16HI, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT)
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrd_v16hi_mask, "__builtin_ia32_vpshrd_v16hi_mask", IX86_BUILTIN_VPSHRDV16HI_MASK, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT)
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrd_v8hi, "__builtin_ia32_vpshrd_v8hi", IX86_BUILTIN_VPSHRDV8HI, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT)
@@ -2590,7 +2590,7 @@ BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshr
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrd_v2di, "__builtin_ia32_vpshrd_v2di", IX86_BUILTIN_VPSHRDV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT)
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrd_v2di_mask, "__builtin_ia32_vpshrd_v2di_mask", IX86_BUILTIN_VPSHRDV2DI_MASK, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT)
BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshld_v32hi, "__builtin_ia32_vpshld_v32hi", IX86_BUILTIN_VPSHLDV32HI, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_INT)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_vpshld_v32hi_mask, "__builtin_ia32_vpshld_v32hi_mask", IX86_BUILTIN_VPSHLDV32HI_MASK, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshld_v32hi_mask, "__builtin_ia32_vpshld_v32hi_mask", IX86_BUILTIN_VPSHLDV32HI_MASK, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT)
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshld_v16hi, "__builtin_ia32_vpshld_v16hi", IX86_BUILTIN_VPSHLDV16HI, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT)
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshld_v16hi_mask, "__builtin_ia32_vpshld_v16hi_mask", IX86_BUILTIN_VPSHLDV16HI_MASK, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT)
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshld_v8hi, "__builtin_ia32_vpshld_v8hi", IX86_BUILTIN_VPSHLDV8HI, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT)
@@ -2609,8 +2609,8 @@ BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshl
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshld_v2di_mask, "__builtin_ia32_vpshld_v2di_mask", IX86_BUILTIN_VPSHLDV2DI_MASK, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT)
BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshrdv_v32hi, "__builtin_ia32_vpshrdv_v32hi", IX86_BUILTIN_VPSHRDVV32HI, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_V32HI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_vpshrdv_v32hi_mask, "__builtin_ia32_vpshrdv_v32hi_mask", IX86_BUILTIN_VPSHRDVV32HI_MASK, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_V32HI_USI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_vpshrdv_v32hi_maskz, "__builtin_ia32_vpshrdv_v32hi_maskz", IX86_BUILTIN_VPSHRDVV32HI_MASKZ, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_V32HI_USI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshrdv_v32hi_mask, "__builtin_ia32_vpshrdv_v32hi_mask", IX86_BUILTIN_VPSHRDVV32HI_MASK, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_V32HI_USI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshrdv_v32hi_maskz, "__builtin_ia32_vpshrdv_v32hi_maskz", IX86_BUILTIN_VPSHRDVV32HI_MASKZ, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_V32HI_USI)
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v16hi, "__builtin_ia32_vpshrdv_v16hi", IX86_BUILTIN_VPSHRDVV16HI, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI)
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v16hi_mask, "__builtin_ia32_vpshrdv_v16hi_mask", IX86_BUILTIN_VPSHRDVV16HI_MASK, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI_UHI)
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v16hi_maskz, "__builtin_ia32_vpshrdv_v16hi_maskz", IX86_BUILTIN_VPSHRDVV16HI_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI_UHI)
@@ -2637,8 +2637,8 @@ BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshr
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v2di_maskz, "__builtin_ia32_vpshrdv_v2di_maskz", IX86_BUILTIN_VPSHRDVV2DI_MASKZ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_V2DI_UQI)
BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshldv_v32hi, "__builtin_ia32_vpshldv_v32hi", IX86_BUILTIN_VPSHLDVV32HI, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_V32HI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_vpshldv_v32hi_mask, "__builtin_ia32_vpshldv_v32hi_mask", IX86_BUILTIN_VPSHLDVV32HI_MASK, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_V32HI_USI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_vpshldv_v32hi_maskz, "__builtin_ia32_vpshldv_v32hi_maskz", IX86_BUILTIN_VPSHLDVV32HI_MASKZ, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_V32HI_USI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshldv_v32hi_mask, "__builtin_ia32_vpshldv_v32hi_mask", IX86_BUILTIN_VPSHLDVV32HI_MASK, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_V32HI_USI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshldv_v32hi_maskz, "__builtin_ia32_vpshldv_v32hi_maskz", IX86_BUILTIN_VPSHLDVV32HI_MASKZ, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_V32HI_USI)
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v16hi, "__builtin_ia32_vpshldv_v16hi", IX86_BUILTIN_VPSHLDVV16HI, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI)
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v16hi_mask, "__builtin_ia32_vpshldv_v16hi_mask", IX86_BUILTIN_VPSHLDVV16HI_MASK, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI_UHI)
BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v16hi_maskz, "__builtin_ia32_vpshldv_v16hi_maskz", IX86_BUILTIN_VPSHLDVV16HI_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI_UHI)
@@ -2762,21 +2762,21 @@ BDESC (OPTION_MASK_ISA_AVX512VPOPCNTDQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_v
/* BITALG */
BDESC (OPTION_MASK_ISA_AVX512BITALG, 0, CODE_FOR_vpopcountv64qi, "__builtin_ia32_vpopcountb_v64qi", IX86_BUILTIN_VPOPCOUNTBV64QI, UNKNOWN, (int) V64QI_FTYPE_V64QI)
-BDESC (OPTION_MASK_ISA_AVX512BITALG | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_vpopcountv64qi_mask, "__builtin_ia32_vpopcountb_v64qi_mask", IX86_BUILTIN_VPOPCOUNTBV64QI_MASK, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI_UDI)
+BDESC (OPTION_MASK_ISA_AVX512BITALG, 0, CODE_FOR_vpopcountv64qi_mask, "__builtin_ia32_vpopcountb_v64qi_mask", IX86_BUILTIN_VPOPCOUNTBV64QI_MASK, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI_UDI)
BDESC (OPTION_MASK_ISA_AVX512BITALG | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpopcountv32qi, "__builtin_ia32_vpopcountb_v32qi", IX86_BUILTIN_VPOPCOUNTBV32QI, UNKNOWN, (int) V32QI_FTYPE_V32QI)
-BDESC (OPTION_MASK_ISA_AVX512BITALG | OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_vpopcountv32qi_mask, "__builtin_ia32_vpopcountb_v32qi_mask", IX86_BUILTIN_VPOPCOUNTBV32QI_MASK, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_USI)
+BDESC (OPTION_MASK_ISA_AVX512BITALG | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpopcountv32qi_mask, "__builtin_ia32_vpopcountb_v32qi_mask", IX86_BUILTIN_VPOPCOUNTBV32QI_MASK, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_USI)
BDESC (OPTION_MASK_ISA_AVX512BITALG | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpopcountv16qi, "__builtin_ia32_vpopcountb_v16qi", IX86_BUILTIN_VPOPCOUNTBV16QI, UNKNOWN, (int) V16QI_FTYPE_V16QI)
BDESC (OPTION_MASK_ISA_AVX512BITALG | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpopcountv16qi_mask, "__builtin_ia32_vpopcountb_v16qi_mask", IX86_BUILTIN_VPOPCOUNTBV16QI_MASK, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_UHI)
BDESC (OPTION_MASK_ISA_AVX512BITALG, 0, CODE_FOR_vpopcountv32hi, "__builtin_ia32_vpopcountw_v32hi", IX86_BUILTIN_VPOPCOUNTWV32HI, UNKNOWN, (int) V32HI_FTYPE_V32HI)
-BDESC (OPTION_MASK_ISA_AVX512BITALG | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_vpopcountv32hi_mask, "__builtin_ia32_vpopcountw_v32hi_mask", IX86_BUILTIN_VPOPCOUNTQV32HI_MASK, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_USI)
+BDESC (OPTION_MASK_ISA_AVX512BITALG, 0, CODE_FOR_vpopcountv32hi_mask, "__builtin_ia32_vpopcountw_v32hi_mask", IX86_BUILTIN_VPOPCOUNTQV32HI_MASK, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_USI)
BDESC (OPTION_MASK_ISA_AVX512BITALG | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpopcountv16hi, "__builtin_ia32_vpopcountw_v16hi", IX86_BUILTIN_VPOPCOUNTWV16HI, UNKNOWN, (int) V16HI_FTYPE_V16HI)
BDESC (OPTION_MASK_ISA_AVX512BITALG | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpopcountv16hi_mask, "__builtin_ia32_vpopcountw_v16hi_mask", IX86_BUILTIN_VPOPCOUNTQV16HI_MASK, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_UHI)
BDESC (OPTION_MASK_ISA_AVX512BITALG | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpopcountv8hi, "__builtin_ia32_vpopcountw_v8hi", IX86_BUILTIN_VPOPCOUNTWV8HI, UNKNOWN, (int) V8HI_FTYPE_V8HI)
BDESC (OPTION_MASK_ISA_AVX512BITALG | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpopcountv8hi_mask, "__builtin_ia32_vpopcountw_v8hi_mask", IX86_BUILTIN_VPOPCOUNTQV8HI_MASK, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512BITALG | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512vl_vpshufbitqmbv64qi_mask, "__builtin_ia32_vpshufbitqmb512_mask", IX86_BUILTIN_VPSHUFBITQMB512_MASK, UNKNOWN, (int) UDI_FTYPE_V64QI_V64QI_UDI)
-BDESC (OPTION_MASK_ISA_AVX512BITALG | OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512vl_vpshufbitqmbv32qi_mask, "__builtin_ia32_vpshufbitqmb256_mask", IX86_BUILTIN_VPSHUFBITQMB256_MASK, UNKNOWN, (int) USI_FTYPE_V32QI_V32QI_USI)
+BDESC (OPTION_MASK_ISA_AVX512BITALG, 0, CODE_FOR_avx512vl_vpshufbitqmbv64qi_mask, "__builtin_ia32_vpshufbitqmb512_mask", IX86_BUILTIN_VPSHUFBITQMB512_MASK, UNKNOWN, (int) UDI_FTYPE_V64QI_V64QI_UDI)
+BDESC (OPTION_MASK_ISA_AVX512BITALG | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_vpshufbitqmbv32qi_mask, "__builtin_ia32_vpshufbitqmb256_mask", IX86_BUILTIN_VPSHUFBITQMB256_MASK, UNKNOWN, (int) USI_FTYPE_V32QI_V32QI_USI)
BDESC (OPTION_MASK_ISA_AVX512BITALG | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_vpshufbitqmbv16qi_mask, "__builtin_ia32_vpshufbitqmb128_mask", IX86_BUILTIN_VPSHUFBITQMB128_MASK, UNKNOWN, (int) UHI_FTYPE_V16QI_V16QI_UHI)
/* AVX512_4FMAPS and AVX512_4VNNIW builtins with variable number of arguments. Defined in additional ix86_isa_flags2. */
@@ -3019,11 +3019,11 @@ BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_sse_comi_round, "__builtin_ia32_vcom
BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_floatv16siv16sf2_mask_round, "__builtin_ia32_cvtdq2ps512_mask", IX86_BUILTIN_CVTDQ2PS512, UNKNOWN, (int) V16SF_FTYPE_V16SI_V16SF_HI_INT)
BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_cvtpd2dq512_mask_round, "__builtin_ia32_cvtpd2dq512_mask", IX86_BUILTIN_CVTPD2DQ512, UNKNOWN, (int) V8SI_FTYPE_V8DF_V8SI_QI_INT)
BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_cvtpd2ps512_mask_round, "__builtin_ia32_cvtpd2ps512_mask", IX86_BUILTIN_CVTPD2PS512, UNKNOWN, (int) V8SF_FTYPE_V8DF_V8SF_QI_INT)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_ufix_notruncv8dfv8si2_mask_round, "__builtin_ia32_cvtpd2udq512_mask", IX86_BUILTIN_CVTPD2UDQ512, UNKNOWN, (int) V8SI_FTYPE_V8DF_V8SI_QI_INT)
+BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_fixuns_notruncv8dfv8si2_mask_round, "__builtin_ia32_cvtpd2udq512_mask", IX86_BUILTIN_CVTPD2UDQ512, UNKNOWN, (int) V8SI_FTYPE_V8DF_V8SI_QI_INT)
BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_vcvtph2ps512_mask_round, "__builtin_ia32_vcvtph2ps512_mask", IX86_BUILTIN_CVTPH2PS512, UNKNOWN, (int) V16SF_FTYPE_V16HI_V16SF_HI_INT)
BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_fix_notruncv16sfv16si_mask_round, "__builtin_ia32_cvtps2dq512_mask", IX86_BUILTIN_CVTPS2DQ512_MASK, UNKNOWN, (int) V16SI_FTYPE_V16SF_V16SI_HI_INT)
BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_cvtps2pd512_mask_round, "__builtin_ia32_cvtps2pd512_mask", IX86_BUILTIN_CVTPS2PD512, UNKNOWN, (int) V8DF_FTYPE_V8SF_V8DF_QI_INT)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_ufix_notruncv16sfv16si_mask_round, "__builtin_ia32_cvtps2udq512_mask", IX86_BUILTIN_CVTPS2UDQ512, UNKNOWN, (int) V16SI_FTYPE_V16SF_V16SI_HI_INT)
+BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_fixuns_notruncv16sfv16si_mask_round, "__builtin_ia32_cvtps2udq512_mask", IX86_BUILTIN_CVTPS2UDQ512, UNKNOWN, (int) V16SI_FTYPE_V16SF_V16SI_HI_INT)
BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_sse2_cvtsd2ss_round, "__builtin_ia32_cvtsd2ss_round", IX86_BUILTIN_CVTSD2SS_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF_INT)
BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_sse2_cvtsd2ss_mask_round, "__builtin_ia32_cvtsd2ss_mask_round", IX86_BUILTIN_CVTSD2SS_MASK_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT)
BDESC (OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_sse2_cvtsi2sdq_round, "__builtin_ia32_cvtsi2sd64", IX86_BUILTIN_CVTSI2SD64, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT64_INT)
@@ -3034,8 +3034,8 @@ BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_sse2_cvtss2sd_mask_round, "__builtin
BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_fix_truncv8dfv8si2_mask_round, "__builtin_ia32_cvttpd2dq512_mask", IX86_BUILTIN_CVTTPD2DQ512, UNKNOWN, (int) V8SI_FTYPE_V8DF_V8SI_QI_INT)
BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_fixuns_truncv8dfv8si2_mask_round, "__builtin_ia32_cvttpd2udq512_mask", IX86_BUILTIN_CVTTPD2UDQ512, UNKNOWN, (int) V8SI_FTYPE_V8DF_V8SI_QI_INT)
BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_fix_truncv16sfv16si2_mask_round, "__builtin_ia32_cvttps2dq512_mask", IX86_BUILTIN_CVTTPS2DQ512, UNKNOWN, (int) V16SI_FTYPE_V16SF_V16SI_HI_INT)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_ufix_truncv16sfv16si2_mask_round, "__builtin_ia32_cvttps2udq512_mask", IX86_BUILTIN_CVTTPS2UDQ512, UNKNOWN, (int) V16SI_FTYPE_V16SF_V16SI_HI_INT)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_ufloatv16siv16sf2_mask_round, "__builtin_ia32_cvtudq2ps512_mask", IX86_BUILTIN_CVTUDQ2PS512, UNKNOWN, (int) V16SF_FTYPE_V16SI_V16SF_HI_INT)
+BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_fixuns_truncv16sfv16si2_mask_round, "__builtin_ia32_cvttps2udq512_mask", IX86_BUILTIN_CVTTPS2UDQ512, UNKNOWN, (int) V16SI_FTYPE_V16SF_V16SI_HI_INT)
+BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_floatunsv16siv16sf2_mask_round, "__builtin_ia32_cvtudq2ps512_mask", IX86_BUILTIN_CVTUDQ2PS512, UNKNOWN, (int) V16SF_FTYPE_V16SI_V16SF_HI_INT)
BDESC (OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_cvtusi2sd64_round, "__builtin_ia32_cvtusi2sd64", IX86_BUILTIN_CVTUSI2SD64, UNKNOWN, (int) V2DF_FTYPE_V2DF_UINT64_INT)
BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_cvtusi2ss32_round, "__builtin_ia32_cvtusi2ss32", IX86_BUILTIN_CVTUSI2SS32, UNKNOWN, (int) V4SF_FTYPE_V4SF_UINT_INT)
BDESC (OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_cvtusi2ss64_round, "__builtin_ia32_cvtusi2ss64", IX86_BUILTIN_CVTUSI2SS64, UNKNOWN, (int) V4SF_FTYPE_V4SF_UINT64_INT)
@@ -3185,7 +3185,7 @@ BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_rangesv2df_mask_round, "__
BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_rangesv4sf_mask_round, "__builtin_ia32_rangess128_mask_round", IX86_BUILTIN_RANGESS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT)
BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_fix_notruncv8dfv8di2_mask_round, "__builtin_ia32_cvtpd2qq512_mask", IX86_BUILTIN_CVTPD2QQ512, UNKNOWN, (int) V8DI_FTYPE_V8DF_V8DI_QI_INT)
BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_cvtps2qqv8di_mask_round, "__builtin_ia32_cvtps2qq512_mask", IX86_BUILTIN_CVTPS2QQ512, UNKNOWN, (int) V8DI_FTYPE_V8SF_V8DI_QI_INT)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_ufix_notruncv8dfv8di2_mask_round, "__builtin_ia32_cvtpd2uqq512_mask", IX86_BUILTIN_CVTPD2UQQ512, UNKNOWN, (int) V8DI_FTYPE_V8DF_V8DI_QI_INT)
+BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_fixuns_notruncv8dfv8di2_mask_round, "__builtin_ia32_cvtpd2uqq512_mask", IX86_BUILTIN_CVTPD2UQQ512, UNKNOWN, (int) V8DI_FTYPE_V8DF_V8DI_QI_INT)
BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_cvtps2uqqv8di_mask_round, "__builtin_ia32_cvtps2uqq512_mask", IX86_BUILTIN_CVTPS2UQQ512, UNKNOWN, (int) V8DI_FTYPE_V8SF_V8DI_QI_INT)
BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_floatv8div8sf2_mask_round, "__builtin_ia32_cvtqq2ps512_mask", IX86_BUILTIN_CVTQQ2PS512, UNKNOWN, (int) V8SF_FTYPE_V8DI_V8SF_QI_INT)
BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_floatunsv8div8sf2_mask_round, "__builtin_ia32_cvtuqq2ps512_mask", IX86_BUILTIN_CVTUQQ2PS512, UNKNOWN, (int) V8SF_FTYPE_V8DI_V8SF_QI_INT)
diff --git a/gcc/config/i386/i386-builtins.cc b/gcc/config/i386/i386-builtins.cc
index fc0c82b..28f404d 100644
--- a/gcc/config/i386/i386-builtins.cc
+++ b/gcc/config/i386/i386-builtins.cc
@@ -279,14 +279,15 @@ def_builtin (HOST_WIDE_INT mask, HOST_WIDE_INT mask2,
if (((mask2 == 0 || (mask2 & ix86_isa_flags2) != 0)
&& (mask == 0 || (mask & ix86_isa_flags) != 0))
|| ((mask & OPTION_MASK_ISA_MMX) != 0 && TARGET_MMX_WITH_SSE)
- /* "Unified" builtin used by either AVXVNNI/AVXIFMA intrinsics
- or AVX512VNNIVL/AVX512IFMAVL non-mask intrinsics should be
- defined whenever avxvnni/avxifma or avx512vnni/avxifma &&
- avx512vl exist. */
+ /* "Unified" builtin used by either AVXVNNI/AVXIFMA/AES intrinsics
+ or AVX512VNNIVL/AVX512IFMAVL/VAESVL non-mask intrinsics should be
+ defined whenever avxvnni/avxifma/aes or avx512vnni/avx512ifma/vaes
+ && avx512vl exist. */
|| (mask2 == OPTION_MASK_ISA2_AVXVNNI)
|| (mask2 == OPTION_MASK_ISA2_AVXIFMA)
|| (mask2 == (OPTION_MASK_ISA2_AVXNECONVERT
| OPTION_MASK_ISA2_AVX512BF16))
+ || ((mask2 & OPTION_MASK_ISA2_VAES) != 0)
|| (lang_hooks.builtin_function
== lang_hooks.builtin_function_ext_scope))
{
@@ -661,16 +662,20 @@ ix86_init_mmx_sse_builtins (void)
VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
/* AES */
- def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0,
+ def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
+ OPTION_MASK_ISA2_VAES,
"__builtin_ia32_aesenc128",
V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
- def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0,
+ def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
+ OPTION_MASK_ISA2_VAES,
"__builtin_ia32_aesenclast128",
V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
- def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0,
+ def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
+ OPTION_MASK_ISA2_VAES,
"__builtin_ia32_aesdec128",
V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
- def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0,
+ def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
+ OPTION_MASK_ISA2_VAES,
"__builtin_ia32_aesdeclast128",
V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0,
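
With the extra OPTION_MASK_ISA2_VAES argument above, the legacy 128-bit AES builtins are also registered when VAES is enabled, so the corresponding intrinsics no longer require -maes by itself. A minimal sketch, assuming a compiler that contains this change; the intrinsic and header are the standard ones:

    #include <immintrin.h>

    /* One AES encryption round on a 128-bit state.  Usable with -maes, or,
       assuming this change, with VAES enabled (e.g. -mvaes -mavx512vl).  */
    static __m128i
    aes_round (__m128i state, __m128i round_key)
    {
      return _mm_aesenc_si128 (state, round_key);
    }
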
diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc
index 88689d1..e7bd7cc 100644
--- a/gcc/config/i386/i386-c.cc
+++ b/gcc/config/i386/i386-c.cc
@@ -675,6 +675,8 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
def_or_undef (parse_in, "__PREFETCHI__");
if (isa_flag2 & OPTION_MASK_ISA2_RAOINT)
def_or_undef (parse_in, "__RAOINT__");
+ if (isa_flag2 & OPTION_MASK_ISA2_AMX_COMPLEX)
+ def_or_undef (parse_in, "__AMX_COMPLEX__");
if (TARGET_IAMCU)
{
def_or_undef (parse_in, "__iamcu");
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 1e3ce4b..7bb4d39 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -521,7 +521,8 @@ ix86_expand_move (machine_mode mode, rtx operands[])
return;
}
}
- else if (GET_MODE_SIZE (mode) >= 16)
+ else if (CONST_WIDE_INT_P (op1)
+ && GET_MODE_SIZE (mode) >= 16)
{
rtx tmp = ix86_convert_const_wide_int_to_broadcast
(GET_MODE (op0), op1);
@@ -696,8 +697,9 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
return;
}
- /* Special case TImode to V1TImode conversions, via V2DI. */
- if (mode == V1TImode
+ /* Special case TImode to 128-bit vector conversions via V2DI. */
+ if (VECTOR_MODE_P (mode)
+ && GET_MODE_SIZE (mode) == 16
&& SUBREG_P (op1)
&& GET_MODE (SUBREG_REG (op1)) == TImode
&& TARGET_64BIT && TARGET_SSE
@@ -709,7 +711,7 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
emit_insn (gen_vec_concatv2di (tmp, lo, hi));
- emit_move_insn (op0, gen_lowpart (V1TImode, tmp));
+ emit_move_insn (op0, gen_lowpart (mode, tmp));
return;
}
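
The special case now covers any 128-bit vector destination, not just V1TImode: the TImode source is split into its low and high DImode halves, reassembled with vec_concatv2di, and the result is reinterpreted in the destination mode. A source-level model of the same idea (a sketch, assuming unsigned __int128 support, a GCC extension on 64-bit targets):

    #include <immintrin.h>

    /* Move a 128-bit integer into a vector register by concatenating its
       two 64-bit halves: element 0 = low half, element 1 = high half.  */
    static __m128i
    ti_to_v2di (unsigned __int128 x)
    {
      long long lo = (long long) x;
      long long hi = (long long) (x >> 64);
      return _mm_set_epi64x (hi, lo);
    }
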
@@ -1019,6 +1021,7 @@ ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
rtx op0 = operands[0];
rtx op1 = operands[1];
rtx op2 = operands[2];
+ rtx src;
machine_mode dmode = GET_MODE (op0);
machine_mode smode = GET_MODE (op1);
@@ -1042,11 +1045,20 @@ ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
- op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
- op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
- rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
- op1, op2));
- emit_insn (insn);
+ /* packusdw/packuswb does unsigned saturation of a signed source
+ which is different from generic us_truncate RTX. */
+ if (code == US_TRUNCATE)
+ src = gen_rtx_UNSPEC (sse_dmode,
+ gen_rtvec (2, op1, op2),
+ UNSPEC_US_TRUNCATE);
+ else
+ {
+ op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
+ op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
+ src = gen_rtx_VEC_CONCAT (sse_dmode, op1, op2);
+ }
+
+ emit_move_insn (dest, src);
ix86_move_vector_high_sse_to_mmx (op0);
}
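
The UNSPEC is needed because packusdw/packuswb clamp a signed source into an unsigned range, which is not what the generic us_truncate RTX (unsigned source, unsigned saturation) describes. Per element, the hardware behaviour is the following, shown as a scalar sketch:

    #include <stdint.h>

    /* packuswb semantics for one element: a *signed* 16-bit input is
       saturated into the unsigned 8-bit range [0, 255].  */
    static uint8_t
    packus_epi16 (int16_t x)
    {
      if (x < 0)
        return 0;
      if (x > 255)
        return 255;
      return (uint8_t) x;
    }
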
@@ -2033,7 +2045,7 @@ ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
}
/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
- pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
+ pattern can be used on it instead of fixuns_trunc*.
This is done by doing just signed conversion if < 0x1p31, and otherwise by
subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
@@ -2266,7 +2278,7 @@ ix86_expand_copysign (rtx operands[])
else
dest = NULL_RTX;
op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
- mask = ix86_build_signbit_mask (vmode, 0, 0);
+ mask = ix86_build_signbit_mask (vmode, TARGET_AVX512F && mode != HFmode, 0);
if (CONST_DOUBLE_P (operands[1]))
{
@@ -2370,8 +2382,8 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
tmp = gen_reg_rtx (mode);
emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
tmp = gen_lowpart (p_mode, tmp);
- emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
- gen_rtx_UNSPEC (CCmode,
+ emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
+ gen_rtx_UNSPEC (CCZmode,
gen_rtvec (2, tmp, tmp),
UNSPEC_PTEST)));
tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
@@ -12588,6 +12600,7 @@ ix86_check_builtin_isa_match (unsigned int fcode,
HOST_WIDE_INT isa2 = ix86_isa_flags2;
HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
+ HOST_WIDE_INT tmp_isa = isa, tmp_isa2 = isa2;
/* The general case is we require all the ISAs specified in bisa{,2}
to be enabled.
The exceptions are:
@@ -12596,60 +12609,36 @@ ix86_check_builtin_isa_match (unsigned int fcode,
OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
(OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
OPTION_MASK_ISA2_AVXVNNI
- (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512IFMA) or
+ (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL) or
OPTION_MASK_ISA2_AVXIFMA
- (OPTION_MASK_ISA_AVXNECONVERT | OPTION_MASK_ISA2_AVX512BF16) or
+ (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_AVX512BF16) or
OPTION_MASK_ISA2_AVXNECONVERT
where for each such pair it is sufficient if either of the ISAs is
enabled, plus if it is ored with other options also those others.
OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
- if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
- == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
- && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
- isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
-
- if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
- == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
- && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
- isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
- if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
- == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
- && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
- isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
-
- if ((((bisa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
- == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
- || (bisa2 & OPTION_MASK_ISA2_AVXVNNI) != 0)
- && (((isa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
- == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
- || (isa2 & OPTION_MASK_ISA2_AVXVNNI) != 0))
- {
- isa |= OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL;
- isa2 |= OPTION_MASK_ISA2_AVXVNNI;
- }
-
- if ((((bisa & (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL))
- == (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL))
- || (bisa2 & OPTION_MASK_ISA2_AVXIFMA) != 0)
- && (((isa & (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL))
- == (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL))
- || (isa2 & OPTION_MASK_ISA2_AVXIFMA) != 0))
- {
- isa |= OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL;
- isa2 |= OPTION_MASK_ISA2_AVXIFMA;
- }
-
- if ((((bisa & OPTION_MASK_ISA_AVX512VL) != 0
- && (bisa2 & OPTION_MASK_ISA2_AVX512BF16) != 0)
- && (bisa2 & OPTION_MASK_ISA2_AVXNECONVERT) != 0)
- && (((isa & OPTION_MASK_ISA_AVX512VL) != 0
- && (isa2 & OPTION_MASK_ISA2_AVX512BF16) != 0)
- || (isa2 & OPTION_MASK_ISA2_AVXNECONVERT) != 0))
- {
- isa |= OPTION_MASK_ISA_AVX512VL;
- isa2 |= OPTION_MASK_ISA2_AVXNECONVERT | OPTION_MASK_ISA2_AVX512BF16;
- }
+#define SHARE_BUILTIN(A1, A2, B1, B2) \
+ if ((((bisa & (A1)) == (A1) && (bisa2 & (A2)) == (A2)) \
+ && ((bisa & (B1)) == (B1) && (bisa2 & (B2)) == (B2))) \
+ && (((isa & (A1)) == (A1) && (isa2 & (A2)) == (A2)) \
+ || ((isa & (B1)) == (B1) && (isa2 & (B2)) == (B2)))) \
+ { \
+ tmp_isa |= (A1) | (B1); \
+ tmp_isa2 |= (A2) | (B2); \
+ }
+
+ SHARE_BUILTIN (OPTION_MASK_ISA_SSE, 0, OPTION_MASK_ISA_3DNOW_A, 0);
+ SHARE_BUILTIN (OPTION_MASK_ISA_SSE4_2, 0, OPTION_MASK_ISA_CRC32, 0);
+ SHARE_BUILTIN (OPTION_MASK_ISA_FMA, 0, OPTION_MASK_ISA_FMA4, 0);
+ SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, 0,
+ OPTION_MASK_ISA2_AVXVNNI);
+ SHARE_BUILTIN (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL, 0, 0,
+ OPTION_MASK_ISA2_AVXIFMA);
+ SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BF16, 0,
+ OPTION_MASK_ISA2_AVXNECONVERT);
+ SHARE_BUILTIN (OPTION_MASK_ISA_AES, 0, 0, OPTION_MASK_ISA2_VAES);
+ isa = tmp_isa;
+ isa2 = tmp_isa2;
if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
/* __builtin_ia32_maskmovq requires MMX registers. */
@@ -12667,6 +12656,21 @@ ix86_check_builtin_isa_match (unsigned int fcode,
return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
}
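
The SHARE_BUILTIN macro replaces the repeated open-coded checks above it: for a builtin whose requirement is "feature set A or feature set B", it is enough that either set is fully enabled, and the bits of both sets are then treated as satisfied. The core predicate, reduced to a standalone sketch with made-up flag names:

    #include <stdbool.h>
    #include <stdint.h>

    /* Hypothetical stand-ins for the ISA mask bits used above.  */
    enum { F_AES = 1u << 0, F_VAES = 1u << 1, F_AVX512VL = 1u << 2 };

    /* A builtin shared by feature sets A and B is usable when either
       set is fully enabled.  */
    static bool
    shared_builtin_ok (uint32_t enabled, uint32_t set_a, uint32_t set_b)
    {
      return (enabled & set_a) == set_a || (enabled & set_b) == set_b;
    }
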
+/* Emit instructions to set the carry flag from ARG. */
+
+void
+ix86_expand_carry (rtx arg)
+{
+ if (!CONST_INT_P (arg) || arg == const0_rtx)
+ {
+ arg = convert_to_mode (QImode, arg, 1);
+ arg = copy_to_mode_reg (QImode, arg);
+ emit_insn (gen_addqi3_cconly_overflow (arg, constm1_rtx));
+ }
+ else
+ emit_insn (gen_x86_stc ());
+}
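
ix86_expand_carry materializes CF from an incoming carry argument: a nonzero constant becomes a single stc, anything else is normalized to QImode and turned into CF via addqi3_cconly_overflow. This is the machinery behind the add-with-carry builtins; at the source level the chained form looks like the sketch below (assuming <immintrin.h> provides _addcarry_u64, as recent GCC does on x86-64):

    #include <immintrin.h>
    #include <stdint.h>

    /* 128-bit addition built from two 64-bit adds chained through CF.  */
    static void
    add128 (const uint64_t a[2], const uint64_t b[2], uint64_t out[2])
    {
      unsigned char c;
      c = _addcarry_u64 (0, a[0], b[0], (unsigned long long *) &out[0]);
      (void) _addcarry_u64 (c, a[1], b[1], (unsigned long long *) &out[1]);
    }
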
+
/* Expand an expression EXP that calls a built-in function,
with result going to TARGET if that's convenient
(and in mode MODE if that's convenient).
@@ -13971,8 +13975,6 @@ rdseed_step:
arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
op1 = expand_normal (arg0);
- if (!integer_zerop (arg0))
- op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
op2 = expand_normal (arg1);
if (!register_operand (op2, mode0))
@@ -13990,7 +13992,7 @@ rdseed_step:
}
op0 = gen_reg_rtx (mode0);
- if (integer_zerop (arg0))
+ if (op1 == const0_rtx)
{
/* If arg0 is 0, optimize right away into add or sub
instruction that sets CCCmode flags. */
@@ -14000,7 +14002,7 @@ rdseed_step:
else
{
/* Generate CF from input operand. */
- emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
+ ix86_expand_carry (op1);
/* Generate instruction that consumes CF. */
op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
@@ -14594,7 +14596,7 @@ rdseed_step:
op0 = pc_rtx;
}
else if (TREE_CODE (arg3) == SSA_NAME
- && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
+ && VECTOR_TYPE_P (TREE_TYPE (arg3)))
{
/* Recognize also when mask is like:
__v2df src = _mm_setzero_pd ();
@@ -16390,11 +16392,12 @@ quarter:
emit_move_insn (target, gen_lowpart (mode, words[0]));
else if (n_words == 2)
{
- rtx tmp = gen_reg_rtx (mode);
- emit_clobber (tmp);
- emit_move_insn (gen_lowpart (tmp_mode, tmp), words[0]);
- emit_move_insn (gen_highpart (tmp_mode, tmp), words[1]);
- emit_move_insn (target, tmp);
+ gcc_assert (tmp_mode == DImode || tmp_mode == SImode);
+ machine_mode concat_mode = tmp_mode == DImode ? V2DImode : V2SImode;
+ rtx tmp = gen_reg_rtx (concat_mode);
+ vals = gen_rtx_PARALLEL (concat_mode, gen_rtvec_v (2, words));
+ ix86_expand_vector_init_general (mmx_ok, concat_mode, tmp, vals);
+ emit_move_insn (target, gen_lowpart (mode, tmp));
}
else if (n_words == 4)
{
@@ -18986,6 +18989,78 @@ expand_vec_perm_movs (struct expand_vec_perm_d *d)
}
/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
+ using insertps. */
+static bool
+expand_vec_perm_insertps (struct expand_vec_perm_d *d)
+{
+ machine_mode vmode = d->vmode;
+ unsigned i, cnt_s, nelt = d->nelt;
+ int cnt_d = -1;
+ rtx src, dst;
+
+ if (d->one_operand_p)
+ return false;
+
+ if (!(TARGET_SSE4_1
+ && (vmode == V4SFmode || vmode == V4SImode
+ || (TARGET_MMX_WITH_SSE
+ && (vmode == V2SFmode || vmode == V2SImode)))))
+ return false;
+
+ for (i = 0; i < nelt; ++i)
+ {
+ if (d->perm[i] == i)
+ continue;
+ if (cnt_d != -1)
+ {
+ cnt_d = -1;
+ break;
+ }
+ cnt_d = i;
+ }
+
+ if (cnt_d == -1)
+ {
+ for (i = 0; i < nelt; ++i)
+ {
+ if (d->perm[i] == i + nelt)
+ continue;
+ if (cnt_d != -1)
+ return false;
+ cnt_d = i;
+ }
+
+ if (cnt_d == -1)
+ return false;
+ }
+
+ if (d->testing_p)
+ return true;
+
+ gcc_assert (cnt_d != -1);
+
+ cnt_s = d->perm[cnt_d];
+ if (cnt_s < nelt)
+ {
+ src = d->op0;
+ dst = d->op1;
+ }
+ else
+ {
+ cnt_s -= nelt;
+ src = d->op1;
+ dst = d->op0;
+ }
+ gcc_assert (cnt_s < nelt);
+
+ rtx x = gen_sse4_1_insertps (vmode, d->target, dst, src,
+ GEN_INT (cnt_s << 6 | cnt_d << 4));
+ emit_insn (x);
+
+ return true;
+}
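
expand_vec_perm_insertps recognizes permutations that change exactly one destination lane and maps them to a single insertps. The immediate built with GEN_INT (cnt_s << 6 | cnt_d << 4) follows the instruction's encoding: bits 7:6 select the source element, bits 5:4 the destination lane, and bits 3:0 (the zero mask) stay clear. At the intrinsic level the same encoding looks like this (a sketch; requires SSE4.1, e.g. -msse4.1):

    #include <smmintrin.h>

    /* Copy element 2 of B into lane 1 of A, leaving A's other lanes intact:
       imm8 = (src << 6) | (dst << 4), zero mask = 0.  */
    static __m128
    insert_lane (__m128 a, __m128 b)
    {
      return _mm_insert_ps (a, b, (2 << 6) | (1 << 4));
    }
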
+
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
static bool
@@ -19069,10 +19144,20 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
goto do_subreg;
case E_V4SImode:
- for (i = 0; i < 4; ++i)
- mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
- vmode = V8HImode;
- goto do_subreg;
+ if (TARGET_AVX2)
+ {
+ /* Use vpblendd instead of vpblendw. */
+ for (i = 0; i < nelt; ++i)
+ mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
+ break;
+ }
+ else
+ {
+ for (i = 0; i < 4; ++i)
+ mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
+ vmode = V8HImode;
+ goto do_subreg;
+ }
case E_V16QImode:
/* See if bytes move in pairs so we can use pblendw with
@@ -19908,6 +19993,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_movs (d))
return true;
+ /* Try the SSE4.1 insertps instruction. */
+ if (expand_vec_perm_insertps (d))
+ return true;
+
/* Try the fully general two operand permute. */
if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
d->testing_p))
@@ -23042,74 +23131,6 @@ ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
gcc_assert (ok);
}
-/* This function is similar as ix86_expand_vecop_qihi,
- but optimized under AVX512BW by using vpmovwb.
- For example, optimize vector MUL generation like
-
- vpmovzxbw ymm2, xmm0
- vpmovzxbw ymm3, xmm1
- vpmullw ymm4, ymm2, ymm3
- vpmovwb xmm0, ymm4
-
- it would take less instructions than ix86_expand_vecop_qihi.
- Return true if success. */
-
-static bool
-ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
-{
- machine_mode himode, qimode = GET_MODE (dest);
- rtx hop1, hop2, hdest;
- rtx (*gen_extend)(rtx, rtx);
- rtx (*gen_truncate)(rtx, rtx);
- bool uns_p = (code == ASHIFTRT) ? false : true;
-
- /* There's no V64HImode multiplication instruction. */
- if (qimode == E_V64QImode)
- return false;
-
- /* vpmovwb only available under AVX512BW. */
- if (!TARGET_AVX512BW)
- return false;
- if ((qimode == V8QImode || qimode == V16QImode)
- && !TARGET_AVX512VL)
- return false;
- /* Not generate zmm instruction when prefer 128/256 bit vector width. */
- if (qimode == V32QImode
- && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
- return false;
-
- switch (qimode)
- {
- case E_V8QImode:
- himode = V8HImode;
- gen_extend = uns_p ? gen_zero_extendv8qiv8hi2 : gen_extendv8qiv8hi2;
- gen_truncate = gen_truncv8hiv8qi2;
- break;
- case E_V16QImode:
- himode = V16HImode;
- gen_extend = uns_p ? gen_zero_extendv16qiv16hi2 : gen_extendv16qiv16hi2;
- gen_truncate = gen_truncv16hiv16qi2;
- break;
- case E_V32QImode:
- himode = V32HImode;
- gen_extend = uns_p ? gen_zero_extendv32qiv32hi2 : gen_extendv32qiv32hi2;
- gen_truncate = gen_truncv32hiv32qi2;
- break;
- default:
- gcc_unreachable ();
- }
-
- hop1 = gen_reg_rtx (himode);
- hop2 = gen_reg_rtx (himode);
- hdest = gen_reg_rtx (himode);
- emit_insn (gen_extend (hop1, op1));
- emit_insn (gen_extend (hop2, op2));
- emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode,
- hop1, hop2)));
- emit_insn (gen_truncate (dest, hdest));
- return true;
-}
-
/* Expand a vector operation shift by constant for a V*QImode in terms of the
same operation on V*HImode. Return true if success. */
static bool
@@ -23210,6 +23231,234 @@ ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
return true;
}
+void
+ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+{
+ machine_mode qimode = GET_MODE (dest);
+ rtx qop1, qop2, hop1, hop2, qdest, hdest;
+ bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
+ bool uns_p = code != ASHIFTRT;
+
+ switch (qimode)
+ {
+ case E_V4QImode:
+ case E_V8QImode:
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ qop1 = lowpart_subreg (V16QImode, force_reg (qimode, op1), qimode);
+
+ if (op2vec)
+ qop2 = lowpart_subreg (V16QImode, force_reg (qimode, op2), qimode);
+ else
+ qop2 = op2;
+
+ qdest = gen_reg_rtx (V16QImode);
+
+ if (CONST_INT_P (op2)
+ && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
+ && ix86_expand_vec_shift_qihi_constant (code, qdest, qop1, qop2))
+ {
+ emit_move_insn (dest, gen_lowpart (qimode, qdest));
+ return;
+ }
+
+ switch (code)
+ {
+ case MULT:
+ gcc_assert (op2vec);
+ if (!TARGET_SSE4_1)
+ {
+ /* Unpack data such that we've got a source byte in each low byte
+ of each word. We don't care what goes into the high byte of
+ each word. Rather than trying to get zero in there, most
+ convenient is to let it be a copy of the low byte. */
+ hop1 = copy_to_reg (qop1);
+ hop2 = copy_to_reg (qop2);
+ emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
+ emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
+ break;
+ }
+ /* FALLTHRU */
+ case ASHIFT:
+ case ASHIFTRT:
+ case LSHIFTRT:
+ hop1 = gen_reg_rtx (V8HImode);
+ ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
+ /* mult/vashr/vlshr/vashl */
+ if (op2vec)
+ {
+ hop2 = gen_reg_rtx (V8HImode);
+ ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
+ }
+ else
+ hop2 = qop2;
+
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ if (code != MULT && op2vec)
+ {
+ /* Expand vashr/vlshr/vashl. */
+ hdest = gen_reg_rtx (V8HImode);
+ emit_insn (gen_rtx_SET (hdest,
+ simplify_gen_binary (code, V8HImode,
+ hop1, hop2)));
+ }
+ else
+ /* Expand mult/ashr/lshr/ashl. */
+ hdest = expand_simple_binop (V8HImode, code, hop1, hop2,
+ NULL_RTX, 1, OPTAB_DIRECT);
+
+ if (TARGET_AVX512BW && TARGET_AVX512VL)
+ {
+ if (qimode == V8QImode)
+ qdest = dest;
+ else
+ qdest = gen_reg_rtx (V8QImode);
+
+ emit_insn (gen_truncv8hiv8qi2 (qdest, hdest));
+ }
+ else
+ {
+ struct expand_vec_perm_d d;
+ rtx qres = gen_lowpart (V16QImode, hdest);
+ bool ok;
+ int i;
+
+ /* Merge the data back into the right place. */
+ d.target = qdest;
+ d.op0 = d.op1 = qres;
+ d.vmode = V16QImode;
+ d.nelt = 16;
+ d.one_operand_p = false;
+ d.testing_p = false;
+
+ for (i = 0; i < d.nelt; ++i)
+ d.perm[i] = i * 2;
+
+ ok = ix86_expand_vec_perm_const_1 (&d);
+ gcc_assert (ok);
+ }
+
+ if (qdest != dest)
+ emit_move_insn (dest, gen_lowpart (qimode, qdest));
+}
+
+/* Emit instruction in 2x wider mode. For example, optimize
+ vector MUL generation like
+
+ vpmovzxbw ymm2, xmm0
+ vpmovzxbw ymm3, xmm1
+ vpmullw ymm4, ymm2, ymm3
+ vpmovwb xmm0, ymm4
+
+ it would take fewer instructions than ix86_expand_vecop_qihi.
+ Return true if success. */
+
+static bool
+ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+{
+ machine_mode himode, qimode = GET_MODE (dest);
+ machine_mode wqimode;
+ rtx qop1, qop2, hop1, hop2, hdest;
+ rtx (*gen_truncate)(rtx, rtx) = NULL;
+ bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
+ bool uns_p = code != ASHIFTRT;
+
+ if ((qimode == V16QImode && !TARGET_AVX2)
+ || (qimode == V32QImode && !TARGET_AVX512BW)
+ /* There are no V64HImode instructions. */
+ || qimode == V64QImode)
+ return false;
+
+ /* Do not generate ymm/zmm instructions when
+ target prefers 128/256 bit vector width. */
+ if ((qimode == V16QImode && TARGET_PREFER_AVX128)
+ || (qimode == V32QImode && TARGET_PREFER_AVX256))
+ return false;
+
+ switch (qimode)
+ {
+ case E_V16QImode:
+ himode = V16HImode;
+ if (TARGET_AVX512VL && TARGET_AVX512BW)
+ gen_truncate = gen_truncv16hiv16qi2;
+ break;
+ case E_V32QImode:
+ himode = V32HImode;
+ gen_truncate = gen_truncv32hiv32qi2;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ wqimode = GET_MODE_2XWIDER_MODE (qimode).require ();
+ qop1 = lowpart_subreg (wqimode, force_reg (qimode, op1), qimode);
+
+ if (op2vec)
+ qop2 = lowpart_subreg (wqimode, force_reg (qimode, op2), qimode);
+ else
+ qop2 = op2;
+
+ hop1 = gen_reg_rtx (himode);
+ ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
+
+ if (op2vec)
+ {
+ hop2 = gen_reg_rtx (himode);
+ ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
+ }
+ else
+ hop2 = qop2;
+
+ if (code != MULT && op2vec)
+ {
+ /* Expand vashr/vlshr/vashl. */
+ hdest = gen_reg_rtx (himode);
+ emit_insn (gen_rtx_SET (hdest,
+ simplify_gen_binary (code, himode,
+ hop1, hop2)));
+ }
+ else
+ /* Expand mult/ashr/lshr/ashl. */
+ hdest = expand_simple_binop (himode, code, hop1, hop2,
+ NULL_RTX, 1, OPTAB_DIRECT);
+
+ if (gen_truncate)
+ emit_insn (gen_truncate (dest, hdest));
+ else
+ {
+ struct expand_vec_perm_d d;
+ rtx wqdest = gen_reg_rtx (wqimode);
+ rtx wqres = gen_lowpart (wqimode, hdest);
+ bool ok;
+ int i;
+
+ /* Merge the data back into the right place. */
+ d.target = wqdest;
+ d.op0 = d.op1 = wqres;
+ d.vmode = wqimode;
+ d.nelt = GET_MODE_NUNITS (wqimode);
+ d.one_operand_p = false;
+ d.testing_p = false;
+
+ for (i = 0; i < d.nelt; ++i)
+ d.perm[i] = i * 2;
+
+ ok = ix86_expand_vec_perm_const_1 (&d);
+ gcc_assert (ok);
+
+ emit_move_insn (dest, gen_lowpart (qimode, wqdest));
+ }
+
+ return true;
+}
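
Both ix86_expand_vecop_qihi_partial and ix86_expand_vecop_qihi2 follow the same recipe: unpack the byte elements into a twice-as-wide HImode vector, perform the operation there (where the instructions exist), then narrow back, either with vpmovwb when AVX512BW/AVX512VL allow it, or with an even-element permutation otherwise. A scalar model of the multiply case:

    #include <stdint.h>

    /* Widen each byte to 16 bits, multiply, keep the low byte -- what
       vpmovzxbw + vpmullw + vpmovwb do for sixteen elements at once.  */
    static void
    mul_v16qi (const uint8_t *a, const uint8_t *b, uint8_t *dst)
    {
      for (int i = 0; i < 16; i++)
        {
          uint16_t wide = (uint16_t) a[i] * (uint16_t) b[i];
          dst[i] = (uint8_t) wide;
        }
    }
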
+
/* Expand a vector operation CODE for a V*QImode in terms of the
same operation on V*HImode. */
@@ -23221,9 +23470,11 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
rtx (*gen_il) (rtx, rtx, rtx);
rtx (*gen_ih) (rtx, rtx, rtx);
rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
+ bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
struct expand_vec_perm_d d;
- bool ok, full_interleave;
- bool uns_p = false;
+ bool full_interleave = true;
+ bool uns_p = code != ASHIFTRT;
+ bool ok;
int i;
if (CONST_INT_P (op2)
@@ -23231,27 +23482,19 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
&& ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
return;
- if (TARGET_AVX512BW
- && VECTOR_MODE_P (GET_MODE (op2))
- && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
+ if (ix86_expand_vecop_qihi2 (code, dest, op1, op2))
return;
switch (qimode)
{
case E_V16QImode:
himode = V8HImode;
- gen_il = gen_vec_interleave_lowv16qi;
- gen_ih = gen_vec_interleave_highv16qi;
break;
case E_V32QImode:
himode = V16HImode;
- gen_il = gen_avx2_interleave_lowv32qi;
- gen_ih = gen_avx2_interleave_highv32qi;
break;
case E_V64QImode:
himode = V32HImode;
- gen_il = gen_avx512bw_interleave_lowv64qi;
- gen_ih = gen_avx512bw_interleave_highv64qi;
break;
default:
gcc_unreachable ();
@@ -23260,10 +23503,31 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
switch (code)
{
case MULT:
+ gcc_assert (op2vec);
/* Unpack data such that we've got a source byte in each low byte of
each word. We don't care what goes into the high byte of each word.
Rather than trying to get zero in there, most convenient is to let
it be a copy of the low byte. */
+ switch (qimode)
+ {
+ case E_V16QImode:
+ gen_il = gen_vec_interleave_lowv16qi;
+ gen_ih = gen_vec_interleave_highv16qi;
+ break;
+ case E_V32QImode:
+ gen_il = gen_avx2_interleave_lowv32qi;
+ gen_ih = gen_avx2_interleave_highv32qi;
+ full_interleave = false;
+ break;
+ case E_V64QImode:
+ gen_il = gen_avx512bw_interleave_lowv64qi;
+ gen_ih = gen_avx512bw_interleave_highv64qi;
+ full_interleave = false;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
op2_l = gen_reg_rtx (qimode);
op2_h = gen_reg_rtx (qimode);
emit_insn (gen_il (op2_l, op2, op2));
@@ -23273,20 +23537,17 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
op1_h = gen_reg_rtx (qimode);
emit_insn (gen_il (op1_l, op1, op1));
emit_insn (gen_ih (op1_h, op1, op1));
- full_interleave = qimode == V16QImode;
break;
case ASHIFT:
- case LSHIFTRT:
- uns_p = true;
- /* FALLTHRU */
case ASHIFTRT:
+ case LSHIFTRT:
op1_l = gen_reg_rtx (himode);
op1_h = gen_reg_rtx (himode);
ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
/* vashr/vlshr/vashl */
- if (GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
+ if (op2vec)
{
rtx tmp = force_reg (qimode, op2);
op2_l = gen_reg_rtx (himode);
@@ -23297,16 +23558,14 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
else
op2_l = op2_h = op2;
- full_interleave = true;
break;
default:
gcc_unreachable ();
}
- /* Perform vashr/vlshr/vashl. */
- if (code != MULT
- && GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
+ if (code != MULT && op2vec)
{
+ /* Expand vashr/vlshr/vashl. */
res_l = gen_reg_rtx (himode);
res_h = gen_reg_rtx (himode);
emit_insn (gen_rtx_SET (res_l,
@@ -23316,9 +23575,9 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
simplify_gen_binary (code, himode,
op1_h, op2_h)));
}
- /* Performance mult/ashr/lshr/ashl. */
else
{
+ /* Expand mult/ashr/lshr/ashl. */
res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
1, OPTAB_DIRECT);
res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
@@ -23338,7 +23597,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
if (full_interleave)
{
- /* For SSE2, we used an full interleave, so the desired
+ /* We used the full interleave, so the desired
results are in the even elements. */
for (i = 0; i < d.nelt; ++i)
d.perm[i] = i * 2;
@@ -23362,9 +23621,6 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
ok = ix86_expand_vec_perm_const_1 (&d);
gcc_assert (ok);
-
- set_unique_reg_note (get_last_insn (), REG_EQUAL,
- gen_rtx_fmt_ee (code, qimode, op1, op2));
}
/* Helper function of ix86_expand_mul_widen_evenodd. Return true
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index c09abf8..4a3b07a 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -974,12 +974,45 @@ general_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
}
}
-/* Convert COMPARE to vector mode. */
+/* Convert CCZmode COMPARE to vector mode. */
rtx
scalar_chain::convert_compare (rtx op1, rtx op2, rtx_insn *insn)
{
rtx src, tmp;
+
+ /* Handle any REG_EQUAL notes. */
+ tmp = find_reg_equal_equiv_note (insn);
+ if (tmp)
+ {
+ if (GET_CODE (XEXP (tmp, 0)) == COMPARE
+ && GET_MODE (XEXP (tmp, 0)) == CCZmode
+ && REG_P (XEXP (XEXP (tmp, 0), 0)))
+ {
+ rtx *op = &XEXP (XEXP (tmp, 0), 1);
+ if (CONST_SCALAR_INT_P (*op))
+ {
+ if (constm1_operand (*op, GET_MODE (*op)))
+ *op = CONSTM1_RTX (vmode);
+ else
+ {
+ unsigned n = GET_MODE_NUNITS (vmode);
+ rtx *v = XALLOCAVEC (rtx, n);
+ v[0] = *op;
+ for (unsigned i = 1; i < n; ++i)
+ v[i] = const0_rtx;
+ *op = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (n, v));
+ }
+ tmp = NULL_RTX;
+ }
+ else if (REG_P (*op))
+ tmp = NULL_RTX;
+ }
+
+ if (tmp)
+ remove_note (insn, tmp);
+ }
+
/* Comparison against anything other than zero, requires an XOR. */
if (op2 != const0_rtx)
{
@@ -1023,7 +1056,7 @@ scalar_chain::convert_compare (rtx op1, rtx op2, rtx_insn *insn)
emit_insn_before (gen_rtx_SET (tmp, op11), insn);
op11 = tmp;
}
- return gen_rtx_UNSPEC (CCmode, gen_rtvec (2, op11, op12),
+ return gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, op11, op12),
UNSPEC_PTEST);
}
else
@@ -1052,7 +1085,7 @@ scalar_chain::convert_compare (rtx op1, rtx op2, rtx_insn *insn)
src = tmp;
}
- return gen_rtx_UNSPEC (CCmode, gen_rtvec (2, src, src), UNSPEC_PTEST);
+ return gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, src, src), UNSPEC_PTEST);
}
/* Helper function for converting INSN to vector mode. */
@@ -1219,7 +1252,7 @@ general_scalar_chain::convert_insn (rtx_insn *insn)
break;
case COMPARE:
- dst = gen_rtx_REG (CCmode, FLAGS_REG);
+ dst = gen_rtx_REG (CCZmode, FLAGS_REG);
src = convert_compare (XEXP (src, 0), XEXP (src, 1), insn);
break;
@@ -1635,10 +1668,11 @@ timode_scalar_chain::convert_insn (rtx_insn *insn)
switch (GET_CODE (src))
{
case REG:
- PUT_MODE (src, V1TImode);
- /* Call fix_debug_reg_uses only if SRC is never defined. */
- if (!DF_REG_DEF_CHAIN (REGNO (src)))
- fix_debug_reg_uses (src);
+ if (GET_MODE (src) == TImode)
+ {
+ PUT_MODE (src, V1TImode);
+ fix_debug_reg_uses (src);
+ }
break;
case MEM:
@@ -1725,7 +1759,7 @@ timode_scalar_chain::convert_insn (rtx_insn *insn)
break;
case COMPARE:
- dst = gen_rtx_REG (CCmode, FLAGS_REG);
+ dst = gen_rtx_REG (CCZmode, FLAGS_REG);
src = convert_compare (XEXP (src, 0), XEXP (src, 1), insn);
break;
diff --git a/gcc/config/i386/i386-isa.def b/gcc/config/i386/i386-isa.def
index 5181a0d..0634c6f 100644
--- a/gcc/config/i386/i386-isa.def
+++ b/gcc/config/i386/i386-isa.def
@@ -116,3 +116,4 @@ DEF_PTA(CMPCCXADD)
DEF_PTA(AMX_FP16)
DEF_PTA(PREFETCHI)
DEF_PTA(RAOINT)
+DEF_PTA(AMX_COMPLEX)
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 80fe655..2cb0bdd 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -238,7 +238,8 @@ static struct ix86_target_opts isa2_opts[] =
{ "-mcmpccxadd", OPTION_MASK_ISA2_CMPCCXADD },
{ "-mamx-fp16", OPTION_MASK_ISA2_AMX_FP16 },
{ "-mprefetchi", OPTION_MASK_ISA2_PREFETCHI },
- { "-mraoint", OPTION_MASK_ISA2_RAOINT }
+ { "-mraoint", OPTION_MASK_ISA2_RAOINT },
+ { "-mamx-complex", OPTION_MASK_ISA2_AMX_COMPLEX }
};
static struct ix86_target_opts isa_opts[] =
{
@@ -1089,6 +1090,7 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
IX86_ATTR_ISA ("amx-fp16", OPT_mamx_fp16),
IX86_ATTR_ISA ("prefetchi", OPT_mprefetchi),
IX86_ATTR_ISA ("raoint", OPT_mraoint),
+ IX86_ATTR_ISA ("amx-complex", OPT_mamx_complex),
/* enum options */
IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 71ae95f..27fe73c 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -140,6 +140,7 @@ extern void ix86_expand_copysign (rtx []);
extern void ix86_expand_xorsign (rtx []);
extern bool ix86_unary_operator_ok (enum rtx_code, machine_mode, rtx[2]);
extern bool ix86_match_ccmode (rtx, machine_mode);
+extern bool ix86_match_ptest_ccmode (rtx);
extern void ix86_expand_branch (enum rtx_code, rtx, rtx, rtx);
extern void ix86_expand_setcc (rtx, enum rtx_code, rtx, rtx);
extern bool ix86_expand_int_movcc (rtx[]);
@@ -154,6 +155,7 @@ extern void ix86_expand_sse_movcc (rtx, rtx, rtx, rtx);
extern void ix86_expand_sse_unpack (rtx, rtx, bool, bool);
extern void ix86_expand_fp_spaceship (rtx, rtx, rtx);
extern bool ix86_expand_int_addcc (rtx[]);
+extern void ix86_expand_carry (rtx arg);
extern rtx_insn *ix86_expand_call (rtx, rtx, rtx, rtx, rtx, bool);
extern bool ix86_call_use_plt_p (rtx);
extern void ix86_split_call_vzeroupper (rtx, rtx);
@@ -215,6 +217,7 @@ extern void ix86_expand_round (rtx, rtx);
extern void ix86_expand_rounddf_32 (rtx, rtx);
extern void ix86_expand_round_sse4 (rtx, rtx);
+extern void ix86_expand_vecop_qihi_partial (enum rtx_code, rtx, rtx, rtx);
extern void ix86_expand_vecop_qihi (enum rtx_code, rtx, rtx, rtx);
extern rtx ix86_split_stack_guard (void);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 2cc8e95..32851a5 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -96,6 +96,7 @@ along with GCC; see the file COPYING3. If not see
#include "i386-expand.h"
#include "i386-features.h"
#include "function-abi.h"
+#include "rtl-error.h"
/* This file should be included last. */
#include "target-def.h"
@@ -1886,7 +1887,7 @@ type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
{
machine_mode mode = TYPE_MODE (type);
- if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
+ if (VECTOR_TYPE_P (type) && !VECTOR_MODE_P (mode))
{
HOST_WIDE_INT size = int_size_in_bytes (type);
if ((size == 8 || size == 16 || size == 32 || size == 64)
@@ -1903,7 +1904,7 @@ type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
if (DECIMAL_FLOAT_MODE_P (innermode))
return mode;
- if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
+ if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (type)))
mode = MIN_MODE_VECTOR_FLOAT;
else
mode = MIN_MODE_VECTOR_INT;
@@ -3411,7 +3412,7 @@ ix86_function_arg (cumulative_args_t cum_v, const function_arg_info &arg)
/* To simplify the code below, represent vector types with a vector mode
even if MMX/SSE are not active. */
- if (arg.type && TREE_CODE (arg.type) == VECTOR_TYPE)
+ if (arg.type && VECTOR_TYPE_P (arg.type))
mode = type_natural_mode (arg.type, cum, false);
if (TARGET_64BIT)
@@ -5357,19 +5358,19 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
if (GET_MODE_SIZE (mode) == 64)
{
gcc_assert (TARGET_AVX512F);
- return "vpcmpeqd \t %t0, %t0, %t0";
+ return "vpcmpeqd\t%t0, %t0, %t0";
}
else if (GET_MODE_SIZE (mode) == 32)
{
gcc_assert (TARGET_AVX);
- return "vpcmpeqd \t %x0, %x0, %x0";
+ return "vpcmpeqd\t%x0, %x0, %x0";
}
gcc_unreachable ();
}
else if (vector_all_ones_zero_extend_quarter_operand (x, mode))
{
gcc_assert (TARGET_AVX512F);
- return "vpcmpeqd \t %x0, %x0, %x0";
+ return "vpcmpeqd\t%x0, %x0, %x0";
}
gcc_unreachable ();
@@ -11034,8 +11035,9 @@ ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
if (reg == NULL_RTX)
return false;
- if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
- || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
+ unsigned int regno = REGNO (reg);
+ if ((strict && !REGNO_OK_FOR_BASE_P (regno))
+ || (!strict && !REGNO_OK_FOR_BASE_NONSTRICT_P (regno)))
/* Base is not valid. */
return false;
}
@@ -11048,8 +11050,9 @@ ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
if (reg == NULL_RTX)
return false;
- if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
- || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
+ unsigned int regno = REGNO (reg);
+ if ((strict && !REGNO_OK_FOR_INDEX_P (regno))
+ || (!strict && !REGNO_OK_FOR_INDEX_NONSTRICT_P (regno)))
/* Index is not valid. */
return false;
}
@@ -13218,7 +13221,13 @@ ix86_print_operand (FILE *file, rtx x, int code)
}
if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
- warning (0, "non-integer operand used with operand code %<z%>");
+ {
+ if (this_is_asm_operands)
+ warning_for_asm (this_is_asm_operands,
+ "non-integer operand used with operand code %<z%>");
+ else
+ warning (0, "non-integer operand used with operand code %<z%>");
+ }
/* FALLTHRU */
case 'Z':
@@ -13281,11 +13290,12 @@ ix86_print_operand (FILE *file, rtx x, int code)
else
{
output_operand_lossage ("invalid operand type used with "
- "operand code 'Z'");
+ "operand code '%c'", code);
return;
}
- output_operand_lossage ("invalid operand size for operand code 'Z'");
+ output_operand_lossage ("invalid operand size for operand code '%c'",
+ code);
return;
case 'd':
@@ -15944,6 +15954,17 @@ ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
&& REGNO (XEXP (op1, 0)) == FLAGS_REG
&& XEXP (op1, 1) == const0_rtx)
return CCCmode;
+ /* Similarly for *x86_cmc pattern.
+ Match LTU of op0 (neg:QI (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+ and op1 (geu:QI (reg:CCC FLAGS_REG) (const_int 0)).
+ It is sufficient to test that the operand modes are CCCmode. */
+ else if (code == LTU
+ && GET_CODE (op0) == NEG
+ && GET_CODE (XEXP (op0, 0)) == LTU
+ && GET_MODE (XEXP (XEXP (op0, 0), 0)) == CCCmode
+ && GET_CODE (op1) == GEU
+ && GET_MODE (XEXP (op1, 0)) == CCCmode)
+ return CCCmode;
else
return CCmode;
case GTU: /* CF=0 & ZF=0 */
@@ -15977,6 +15998,29 @@ ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
}
}
+/* Return TRUE or FALSE depending on whether the ptest instruction
+ INSN has source and destination with suitable matching CC modes. */
+
+bool
+ix86_match_ptest_ccmode (rtx insn)
+{
+ rtx set, src;
+ machine_mode set_mode;
+
+ set = PATTERN (insn);
+ gcc_assert (GET_CODE (set) == SET);
+ src = SET_SRC (set);
+ gcc_assert (GET_CODE (src) == UNSPEC
+ && XINT (src, 1) == UNSPEC_PTEST);
+
+ set_mode = GET_MODE (src);
+ if (set_mode != CCZmode
+ && set_mode != CCCmode
+ && set_mode != CCmode)
+ return false;
+ return GET_MODE (SET_DEST (set)) == set_mode;
+}
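
ix86_match_ptest_ccmode accepts CCZmode, CCCmode and CCmode because ptest defines only ZF (from the AND of its operands) and CF (from the AND-NOT); the earlier switch of the UNSPEC_PTEST users to CCZmode records that only ZF is relied on there. At the intrinsic level the ZF use corresponds to _mm_testz_si128 (SSE4.1); a sketch:

    #include <smmintrin.h>

    /* ZF from ptest: returns 1 exactly when (v & v) == 0, i.e. when the
       whole vector is zero.  */
    static int
    vec_is_zero (__m128i v)
    {
      return _mm_testz_si128 (v, v);
    }
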
+
/* Return the fixed registers used for condition codes. */
static bool
@@ -17460,9 +17504,7 @@ ix86_data_alignment (tree type, unsigned int align, bool opt)
|| TYPE_MODE (type) == TCmode) && align < 128)
return 128;
}
- else if ((TREE_CODE (type) == RECORD_TYPE
- || TREE_CODE (type) == UNION_TYPE
- || TREE_CODE (type) == QUAL_UNION_TYPE)
+ else if (RECORD_OR_UNION_TYPE_P (type)
&& TYPE_FIELDS (type))
{
if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
@@ -17470,7 +17512,7 @@ ix86_data_alignment (tree type, unsigned int align, bool opt)
if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
return 128;
}
- else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
+ else if (SCALAR_FLOAT_TYPE_P (type) || VECTOR_TYPE_P (type)
|| TREE_CODE (type) == INTEGER_TYPE)
{
if (TYPE_MODE (type) == DFmode && align < 64)
@@ -17586,9 +17628,7 @@ ix86_local_alignment (tree exp, machine_mode mode,
|| TYPE_MODE (type) == TCmode) && align < 128)
return 128;
}
- else if ((TREE_CODE (type) == RECORD_TYPE
- || TREE_CODE (type) == UNION_TYPE
- || TREE_CODE (type) == QUAL_UNION_TYPE)
+ else if (RECORD_OR_UNION_TYPE_P (type)
&& TYPE_FIELDS (type))
{
if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
@@ -17596,7 +17636,7 @@ ix86_local_alignment (tree exp, machine_mode mode,
if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
return 128;
}
- else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
+ else if (SCALAR_FLOAT_TYPE_P (type) || VECTOR_TYPE_P (type)
|| TREE_CODE (type) == INTEGER_TYPE)
{
@@ -17922,6 +17962,8 @@ ix86_masked_all_ones (unsigned HOST_WIDE_INT elems, tree arg_mask)
return false;
unsigned HOST_WIDE_INT mask = TREE_INT_CST_LOW (arg_mask);
+ if (elems == HOST_BITS_PER_WIDE_INT)
+ return mask == HOST_WIDE_INT_M1U;
if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U)
return false;
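
The new early return in ix86_masked_all_ones handles the full-width case separately because shifting HOST_WIDE_INT_M1U by HOST_BITS_PER_WIDE_INT would be undefined behaviour in C. The check, restated as a standalone helper:

    #include <stdbool.h>
    #include <stdint.h>

    /* Are the low ELEMS bits of MASK all ones?  A shift by 64 is undefined,
       so the 64-element case gets its own test.  */
    static bool
    low_bits_all_ones (uint64_t mask, unsigned elems)
    {
      if (elems == 64)
        return mask == UINT64_MAX;
      return (mask | (UINT64_MAX << elems)) == UINT64_MAX;
    }
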
@@ -18401,7 +18443,8 @@ ix86_fold_builtin (tree fndecl, int n_args,
bool
ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
{
- gimple *stmt = gsi_stmt (*gsi);
+ gimple *stmt = gsi_stmt (*gsi), *g;
+ gimple_seq stmts = NULL;
tree fndecl = gimple_call_fndecl (stmt);
gcc_checking_assert (fndecl && fndecl_built_in_p (fndecl, BUILT_IN_MD));
int n_args = gimple_call_num_args (stmt);
@@ -18414,6 +18457,7 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
unsigned HOST_WIDE_INT count;
bool is_vshift;
unsigned HOST_WIDE_INT elems;
+ location_t loc;
/* Don't fold when there's isa mismatch. */
if (!ix86_check_builtin_isa_match (fn_code, NULL, NULL))
@@ -18449,8 +18493,8 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
if (!expr_not_equal_to (arg0, wi::zero (prec)))
return false;
- location_t loc = gimple_location (stmt);
- gimple *g = gimple_build_call (decl, 1, arg0);
+ loc = gimple_location (stmt);
+ g = gimple_build_call (decl, 1, arg0);
gimple_set_location (g, loc);
tree lhs = make_ssa_name (integer_type_node);
gimple_call_set_lhs (g, lhs);
@@ -18472,8 +18516,8 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
arg0 = gimple_call_arg (stmt, 0);
if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
break;
- location_t loc = gimple_location (stmt);
- gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
+ loc = gimple_location (stmt);
+ g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
gimple_set_location (g, loc);
gsi_replace (gsi, g, false);
return true;
@@ -18488,9 +18532,9 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
arg1 = gimple_call_arg (stmt, 1);
if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
{
- location_t loc = gimple_location (stmt);
+ loc = gimple_location (stmt);
arg0 = gimple_call_arg (stmt, 0);
- gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
+ g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
gimple_set_location (g, loc);
gsi_replace (gsi, g, false);
return true;
@@ -18521,23 +18565,24 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
arg2 = gimple_call_arg (stmt, 2);
if (gimple_call_lhs (stmt))
{
- location_t loc = gimple_location (stmt);
+ loc = gimple_location (stmt);
tree type = TREE_TYPE (arg2);
- gimple_seq stmts = NULL;
if (VECTOR_FLOAT_TYPE_P (type))
{
tree itype = GET_MODE_INNER (TYPE_MODE (type)) == E_SFmode
? intSI_type_node : intDI_type_node;
type = get_same_sized_vectype (itype, type);
- arg2 = gimple_build (&stmts, VIEW_CONVERT_EXPR, type, arg2);
}
+ else
+ type = signed_type_for (type);
+ arg2 = gimple_build (&stmts, VIEW_CONVERT_EXPR, type, arg2);
tree zero_vec = build_zero_cst (type);
tree cmp_type = truth_type_for (type);
tree cmp = gimple_build (&stmts, LT_EXPR, cmp_type, arg2, zero_vec);
gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
- gimple *g = gimple_build_assign (gimple_call_lhs (stmt),
- VEC_COND_EXPR, cmp,
- arg1, arg0);
+ g = gimple_build_assign (gimple_call_lhs (stmt),
+ VEC_COND_EXPR, cmp,
+ arg1, arg0);
gimple_set_location (g, loc);
gsi_replace (gsi, g, false);
}
@@ -18573,17 +18618,16 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
arg1 = gimple_call_arg (stmt, 1);
if (gimple_call_lhs (stmt))
{
- location_t loc = gimple_location (stmt);
+ loc = gimple_location (stmt);
tree type = TREE_TYPE (arg0);
tree zero_vec = build_zero_cst (type);
tree minus_one_vec = build_minus_one_cst (type);
tree cmp_type = truth_type_for (type);
- gimple_seq stmts = NULL;
tree cmp = gimple_build (&stmts, tcode, cmp_type, arg0, arg1);
gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
- gimple* g = gimple_build_assign (gimple_call_lhs (stmt),
- VEC_COND_EXPR, cmp,
- minus_one_vec, zero_vec);
+ g = gimple_build_assign (gimple_call_lhs (stmt),
+ VEC_COND_EXPR, cmp,
+ minus_one_vec, zero_vec);
gimple_set_location (g, loc);
gsi_replace (gsi, g, false);
}
@@ -18788,8 +18832,8 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
if (count == 0)
{
/* Just return the first argument for shift by 0. */
- location_t loc = gimple_location (stmt);
- gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
+ loc = gimple_location (stmt);
+ g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
gimple_set_location (g, loc);
gsi_replace (gsi, g, false);
return true;
@@ -18799,9 +18843,9 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
{
/* For shift counts equal or greater than precision, except for
arithmetic right shift the result is zero. */
- location_t loc = gimple_location (stmt);
- gimple *g = gimple_build_assign (gimple_call_lhs (stmt),
- build_zero_cst (TREE_TYPE (arg0)));
+ loc = gimple_location (stmt);
+ g = gimple_build_assign (gimple_call_lhs (stmt),
+ build_zero_cst (TREE_TYPE (arg0)));
gimple_set_location (g, loc);
gsi_replace (gsi, g, false);
return true;
@@ -18830,7 +18874,7 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
return false;
machine_mode imode = GET_MODE_INNER (TYPE_MODE (TREE_TYPE (arg0)));
- location_t loc = gimple_location (stmt);
+ loc = gimple_location (stmt);
tree itype = (imode == E_DFmode
? long_long_integer_type_node : integer_type_node);
tree vtype = build_vector_type (itype, elems);
@@ -18861,9 +18905,9 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
tree perm_mask = elts.build ();
arg1 = gimple_call_arg (stmt, 1);
- gimple *g = gimple_build_assign (gimple_call_lhs (stmt),
- VEC_PERM_EXPR,
- arg0, arg1, perm_mask);
+ g = gimple_build_assign (gimple_call_lhs (stmt),
+ VEC_PERM_EXPR,
+ arg0, arg1, perm_mask);
gimple_set_location (g, loc);
gsi_replace (gsi, g, false);
return true;
@@ -18871,6 +18915,55 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
// Do not error yet, the constant could be propagated later?
break;
+ case IX86_BUILTIN_PABSB:
+ case IX86_BUILTIN_PABSW:
+ case IX86_BUILTIN_PABSD:
+ /* 64-bit vector abs<mode>2 is only supported under TARGET_MMX_WITH_SSE. */
+ if (!TARGET_MMX_WITH_SSE)
+ break;
+ /* FALLTHRU. */
+ case IX86_BUILTIN_PABSB128:
+ case IX86_BUILTIN_PABSB256:
+ case IX86_BUILTIN_PABSB512:
+ case IX86_BUILTIN_PABSW128:
+ case IX86_BUILTIN_PABSW256:
+ case IX86_BUILTIN_PABSW512:
+ case IX86_BUILTIN_PABSD128:
+ case IX86_BUILTIN_PABSD256:
+ case IX86_BUILTIN_PABSD512:
+ case IX86_BUILTIN_PABSQ128:
+ case IX86_BUILTIN_PABSQ256:
+ case IX86_BUILTIN_PABSQ512:
+ case IX86_BUILTIN_PABSB128_MASK:
+ case IX86_BUILTIN_PABSB256_MASK:
+ case IX86_BUILTIN_PABSW128_MASK:
+ case IX86_BUILTIN_PABSW256_MASK:
+ case IX86_BUILTIN_PABSD128_MASK:
+ case IX86_BUILTIN_PABSD256_MASK:
+ gcc_assert (n_args >= 1);
+ if (!gimple_call_lhs (stmt))
+ break;
+ arg0 = gimple_call_arg (stmt, 0);
+ elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+ /* For masked ABS, only optimize if the mask is all ones. */
+ if (n_args > 1
+ && !ix86_masked_all_ones (elems, gimple_call_arg (stmt, n_args - 1)))
+ break;
+ {
+ tree utype, ures, vce;
+ utype = unsigned_type_for (TREE_TYPE (arg0));
+ /* PABSB/W/D/Q store the unsigned result in dst, use ABSU_EXPR
+ instead of ABS_EXPR to handle the overflow case (TYPE_MIN). */
+ ures = gimple_build (&stmts, ABSU_EXPR, utype, arg0);
+ gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+ loc = gimple_location (stmt);
+ vce = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (arg0), ures);
+ g = gimple_build_assign (gimple_call_lhs (stmt),
+ VIEW_CONVERT_EXPR, vce);
+ gsi_replace (gsi, g, false);
+ }
+ return true;
+
default:
break;
}
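
As a rough illustration (own sketch, not taken from the patch): folding to ABSU_EXPR computes the absolute value in the unsigned element type, so the most negative value of the signed type does not overflow.

#include <stdio.h>

/* Per-element semantics of the ABSU_EXPR folding: the result is
   produced in the unsigned type, so -128 maps to 128 instead of
   overflowing in the signed type.  */
static unsigned char
absu8 (signed char x)
{
  return x < 0 ? (unsigned char) -(unsigned int) x : (unsigned char) x;
}

int
main (void)
{
  printf ("%u\n", absu8 (-128));   /* prints 128 */
  return 0;
}
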
@@ -19847,9 +19940,12 @@ inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in)
index = 1;
break;
/* DImode loads and stores assumed to cost the same as SImode. */
- default:
+ case 4:
+ case 8:
index = 2;
break;
+ default:
+ return 100;
}
if (in == 2)
@@ -20426,7 +20522,8 @@ ix86_widen_mult_cost (const struct processor_costs *cost,
basic_cost = cost->mulss * 2 + cost->sse_op * 4;
break;
default:
- gcc_unreachable();
+ /* Not implemented. */
+ return 100;
}
return ix86_vec_cost (mode, basic_cost + extra_cost);
}
@@ -20450,34 +20547,107 @@ ix86_multiplication_cost (const struct processor_costs *cost,
inner_mode == DFmode ? cost->mulsd : cost->mulss);
else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
{
- /* vpmullq is used in this case. No emulation is needed. */
- if (TARGET_AVX512DQ)
- return ix86_vec_cost (mode, cost->mulss);
+ int nmults, nops;
+ /* Cost of reading the memory. */
+ int extra;
- /* V*QImode is emulated with 7-13 insns. */
- if (mode == V16QImode || mode == V32QImode)
- {
- int extra = 11;
- if (TARGET_XOP && mode == V16QImode)
- extra = 5;
- else if (TARGET_SSSE3)
- extra = 6;
- return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * extra);
- }
- /* V*DImode is emulated with 5-8 insns. */
- else if (mode == V2DImode || mode == V4DImode)
+ switch (mode)
{
- if (TARGET_XOP && mode == V2DImode)
- return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 3);
+ case V4QImode:
+ case V8QImode:
+ /* Partial V*QImode is emulated with 4-6 insns. */
+ nmults = 1;
+ nops = 3;
+ extra = 0;
+
+ if (TARGET_AVX512BW && TARGET_AVX512VL)
+ ;
+ else if (TARGET_AVX2)
+ nops += 2;
+ else if (TARGET_XOP)
+ extra += cost->sse_load[2];
+ else
+ {
+ nops += 1;
+ extra += cost->sse_load[2];
+ }
+ goto do_qimode;
+
+ case V16QImode:
+ /* V*QImode is emulated with 4-11 insns. */
+ nmults = 1;
+ nops = 3;
+ extra = 0;
+
+ if (TARGET_AVX2 && !TARGET_PREFER_AVX128)
+ {
+ if (!(TARGET_AVX512BW && TARGET_AVX512VL))
+ nops += 3;
+ }
+ else if (TARGET_XOP)
+ {
+ nmults += 1;
+ nops += 2;
+ extra += cost->sse_load[2];
+ }
+ else
+ {
+ nmults += 1;
+ nops += 4;
+ extra += cost->sse_load[2];
+ }
+ goto do_qimode;
+
+ case V32QImode:
+ nmults = 1;
+ nops = 3;
+ extra = 0;
+
+ if (!TARGET_AVX512BW || TARGET_PREFER_AVX256)
+ {
+ nmults += 1;
+ nops += 4;
+ extra += cost->sse_load[3] * 2;
+ }
+ goto do_qimode;
+
+ case V64QImode:
+ nmults = 2;
+ nops = 9;
+ extra = cost->sse_load[3] * 2 + cost->sse_load[4] * 2;
+
+ do_qimode:
+ return ix86_vec_cost (mode, cost->mulss * nmults
+ + cost->sse_op * nops) + extra;
+
+ case V4SImode:
+ /* pmulld is used in this case. No emulation is needed. */
+ if (TARGET_SSE4_1)
+ goto do_native;
+ /* V4SImode is emulated with 7 insns. */
+ else
+ return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5);
+
+ case V2DImode:
+ case V4DImode:
+ /* vpmullq is used in this case. No emulation is needed. */
+ if (TARGET_AVX512DQ && TARGET_AVX512VL)
+ goto do_native;
+ /* V*DImode is emulated with 6-8 insns. */
+ else if (TARGET_XOP && mode == V2DImode)
+ return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 4);
+ /* FALLTHRU */
+ case V8DImode:
+ /* vpmullq is used in this case. No emulation is needed. */
+ if (TARGET_AVX512DQ && mode == V8DImode)
+ goto do_native;
else
return ix86_vec_cost (mode, cost->mulss * 3 + cost->sse_op * 5);
+
+ default:
+ do_native:
+ return ix86_vec_cost (mode, cost->mulss);
}
- /* Without sse4.1, we don't have PMULLD; it's emulated with 7
- insns, including two PMULUDQ. */
- else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
- return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5);
- else
- return ix86_vec_cost (mode, cost->mulss);
}
else
return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7);
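
As a rough worked example (own arithmetic, not from the patch): for V16QImode with plain SSE2 (no AVX2, no XOP), the switch above picks nmults = 2, nops = 7 and extra = cost->sse_load[2], so the returned cost is ix86_vec_cost (V16QImode, cost->mulss * 2 + cost->sse_op * 7) + cost->sse_load[2].
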
@@ -20516,20 +20686,51 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
enum rtx_code code,
enum machine_mode mode, bool constant_op1,
HOST_WIDE_INT op1_val,
- bool speed,
bool and_in_op1,
bool shift_and_truncate,
bool *skip_op0, bool *skip_op1)
{
if (skip_op0)
*skip_op0 = *skip_op1 = false;
+
if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
{
- /* V*QImode is emulated with 1-11 insns. */
- if (mode == V16QImode || mode == V32QImode)
+ int count;
+ /* Cost of reading the memory. */
+ int extra;
+
+ switch (mode)
{
- int count = 11;
- if (TARGET_XOP && mode == V16QImode)
+ case V4QImode:
+ case V8QImode:
+ if (TARGET_AVX2)
+ /* Use vpbroadcast. */
+ extra = cost->sse_op;
+ else
+ extra = cost->sse_load[2];
+
+ if (constant_op1)
+ {
+ if (code == ASHIFTRT)
+ {
+ count = 4;
+ extra *= 2;
+ }
+ else
+ count = 2;
+ }
+ else if (TARGET_AVX512BW && TARGET_AVX512VL)
+ return ix86_vec_cost (mode, cost->sse_op * 4);
+ else if (TARGET_SSE4_1)
+ count = 5;
+ else if (code == ASHIFTRT)
+ count = 6;
+ else
+ count = 5;
+ return ix86_vec_cost (mode, cost->sse_op * count) + extra;
+
+ case V16QImode:
+ if (TARGET_XOP)
{
/* For XOP we use vpshab, which requires a broadcast of the
value to the variable shift insn. For constants this
@@ -20537,37 +20738,80 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
shift with one insn set the cost to prefer paddb. */
if (constant_op1)
{
- if (skip_op1)
- *skip_op1 = true;
- return ix86_vec_cost (mode,
- cost->sse_op
- + (speed
- ? 2
- : COSTS_N_BYTES
- (GET_MODE_UNIT_SIZE (mode))));
+ extra = cost->sse_load[2];
+ return ix86_vec_cost (mode, cost->sse_op) + extra;
+ }
+ else
+ {
+ count = (code == ASHIFT) ? 3 : 4;
+ return ix86_vec_cost (mode, cost->sse_op * count);
+ }
+ }
+ /* FALLTHRU */
+ case V32QImode:
+ if (TARGET_AVX2)
+ /* Use vpbroadcast. */
+ extra = cost->sse_op;
+ else
+ extra = (mode == V16QImode) ? cost->sse_load[2] : cost->sse_load[3];
+
+ if (constant_op1)
+ {
+ if (code == ASHIFTRT)
+ {
+ count = 4;
+ extra *= 2;
}
- count = 3;
+ else
+ count = 2;
}
- else if (TARGET_SSSE3)
- count = 7;
- return ix86_vec_cost (mode, cost->sse_op * count);
- }
- /* V*DImode arithmetic right shift is emulated. */
- else if (code == ASHIFTRT
- && (mode == V2DImode || mode == V4DImode)
- && !TARGET_XOP
- && !TARGET_AVX512VL)
- {
- int count = 4;
- if (constant_op1 && op1_val == 63 && TARGET_SSE4_2)
- count = 2;
- else if (constant_op1)
- count = 3;
- return ix86_vec_cost (mode, cost->sse_op * count);
+ else if (TARGET_AVX512BW
+ && ((mode == V32QImode && !TARGET_PREFER_AVX256)
+ || (mode == V16QImode && TARGET_AVX512VL
+ && !TARGET_PREFER_AVX128)))
+ return ix86_vec_cost (mode, cost->sse_op * 4);
+ else if (TARGET_AVX2
+ && mode == V16QImode && !TARGET_PREFER_AVX128)
+ count = 6;
+ else if (TARGET_SSE4_1)
+ count = 9;
+ else if (code == ASHIFTRT)
+ count = 10;
+ else
+ count = 9;
+ return ix86_vec_cost (mode, cost->sse_op * count) + extra;
+
+ case V2DImode:
+ case V4DImode:
+ /* V*DImode arithmetic right shift is emulated. */
+ if (code == ASHIFTRT && !TARGET_AVX512VL)
+ {
+ if (constant_op1)
+ {
+ if (op1_val == 63)
+ count = TARGET_SSE4_2 ? 1 : 2;
+ else if (TARGET_XOP)
+ count = 2;
+ else if (TARGET_SSE4_1)
+ count = 3;
+ else
+ count = 4;
+ }
+ else if (TARGET_XOP)
+ count = 3;
+ else if (TARGET_SSE4_2)
+ count = 4;
+ else
+ count = 5;
+
+ return ix86_vec_cost (mode, cost->sse_op * count);
+ }
+ /* FALLTHRU */
+ default:
+ return ix86_vec_cost (mode, cost->sse_op);
}
- else
- return ix86_vec_cost (mode, cost->sse_op);
}
+
if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
{
if (constant_op1)
@@ -20737,7 +20981,6 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
CONSTANT_P (XEXP (x, 1)),
CONST_INT_P (XEXP (x, 1))
? INTVAL (XEXP (x, 1)) : -1,
- speed,
GET_CODE (XEXP (x, 1)) == AND,
SUBREG_P (XEXP (x, 1))
&& GET_CODE (XEXP (XEXP (x, 1),
@@ -21088,6 +21331,31 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
*total = 0;
return true;
}
+ /* Match x
+ (compare:CCC (neg:QI (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+ (geu:QI (reg:CCC FLAGS_REG) (const_int 0))) */
+ if (mode == CCCmode
+ && GET_CODE (op0) == NEG
+ && GET_CODE (XEXP (op0, 0)) == LTU
+ && REG_P (XEXP (XEXP (op0, 0), 0))
+ && GET_MODE (XEXP (XEXP (op0, 0), 0)) == CCCmode
+ && REGNO (XEXP (XEXP (op0, 0), 0)) == FLAGS_REG
+ && XEXP (XEXP (op0, 0), 1) == const0_rtx
+ && GET_CODE (op1) == GEU
+ && REG_P (XEXP (op1, 0))
+ && GET_MODE (XEXP (op1, 0)) == CCCmode
+ && REGNO (XEXP (op1, 0)) == FLAGS_REG
+ && XEXP (op1, 1) == const0_rtx)
+ {
+ /* This is *x86_cmc. */
+ if (!speed)
+ *total = COSTS_N_BYTES (1);
+ else if (TARGET_SLOW_STC)
+ *total = COSTS_N_INSNS (2);
+ else
+ *total = COSTS_N_INSNS (1);
+ return true;
+ }
if (SCALAR_INT_MODE_P (GET_MODE (op0))
&& GET_MODE_SIZE (GET_MODE (op0)) > UNITS_PER_WORD)
@@ -21823,8 +22091,12 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
break;
case CM_SMALL_PIC:
case CM_MEDIUM_PIC:
- fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
- break;
+ if (!ix86_direct_extern_access)
+ {
+ fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
+ break;
+ }
+ /* fall through */
default:
x86_print_call_or_nop (file, mcount_name);
break;
@@ -23394,6 +23666,7 @@ class ix86_vector_costs : public vector_costs
stmt_vec_info stmt_info, slp_tree node,
tree vectype, int misalign,
vect_cost_model_location where) override;
+ void finish_cost (const vector_costs *) override;
};
/* Implement targetm.vectorize.create_costs. */
@@ -23505,7 +23778,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
TREE_CODE (op2) == INTEGER_CST,
cst_and_fits_in_hwi (op2)
? int_cst_value (op2) : -1,
- true, false, false, NULL, NULL);
+ false, false, NULL, NULL);
}
break;
case NOP_EXPR:
@@ -23563,13 +23836,15 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
&& stmt_info
&& (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type
|| STMT_VINFO_TYPE (stmt_info) == store_vec_info_type)
- && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
- && TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info))) != INTEGER_CST)
+ && ((STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
+ && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
+ != INTEGER_CST))
+ || STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER))
{
stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1);
}
- else if (kind == vec_construct
+ else if ((kind == vec_construct || kind == scalar_to_vec)
&& node
&& SLP_TREE_DEF_TYPE (node) == vect_external_def
&& INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
@@ -23602,7 +23877,9 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
Likewise with a BIT_FIELD_REF extracting from a vector
register we can hope to avoid using a GPR. */
if (!is_gimple_assign (def)
- || (!gimple_assign_load_p (def)
+ || ((!gimple_assign_load_p (def)
+ || (!TARGET_SSE4_1
+ && GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op))) == 1))
&& (gimple_assign_rhs_code (def) != BIT_FIELD_REF
|| !VECTOR_TYPE_P (TREE_TYPE
(TREE_OPERAND (gimple_assign_rhs1 (def), 0))))))
@@ -23642,6 +23919,31 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
return retval;
}
+void
+ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
+{
+ loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
+ if (loop_vinfo && !m_costing_for_scalar)
+ {
+ /* We are currently not asking the vectorizer to compare costs
+ between different vector mode sizes. When using predication
+ that will end up always choosing the prefered mode size even
+ if there's a smaller mode covering all lanes. Test for this
+ situation and artificially reject the larger mode attempt.
+ ??? We currently lack masked ops for sub-SSE sized modes,
+ so we could restrict this rejection to AVX and AVX512 modes
+ but err on the safe side for now. */
+ if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
+ && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+ && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && (exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ())
+ > ceil_log2 (LOOP_VINFO_INT_NITERS (loop_vinfo))))
+ m_costs[vect_body] = INT_MAX;
+ }
+
+ vector_costs::finish_cost (scalar_costs);
+}
+
/* Validate target specific memory model bits in VAL. */
static unsigned HOST_WIDE_INT
@@ -23816,7 +24118,7 @@ ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
for 64-bit code), accept that SIMDLEN, otherwise warn and don't
emit corresponding clone. */
tree ctype = ret_type;
- if (TREE_CODE (ret_type) == VOID_TYPE)
+ if (VOID_TYPE_P (ret_type))
ctype = base_type;
int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
@@ -25237,7 +25539,8 @@ ix86_libgcc_floating_mode_supported_p
#undef TARGET_MEMTAG_TAG_SIZE
#define TARGET_MEMTAG_TAG_SIZE ix86_memtag_tag_size
-static bool ix86_libc_has_fast_function (int fcode ATTRIBUTE_UNUSED)
+static bool
+ix86_libc_has_fast_function (int fcode ATTRIBUTE_UNUSED)
{
#ifdef OPTION_GLIBC
if (OPTION_GLIBC)
@@ -25252,6 +25555,58 @@ static bool ix86_libc_has_fast_function (int fcode ATTRIBUTE_UNUSED)
#undef TARGET_LIBC_HAS_FAST_FUNCTION
#define TARGET_LIBC_HAS_FAST_FUNCTION ix86_libc_has_fast_function
+static unsigned
+ix86_libm_function_max_error (unsigned cfn, machine_mode mode,
+ bool boundary_p)
+{
+#ifdef OPTION_GLIBC
+ bool glibc_p = OPTION_GLIBC;
+#else
+ bool glibc_p = false;
+#endif
+ if (glibc_p)
+ {
+ /* If __FAST_MATH__ is defined, glibc provides libmvec. */
+ unsigned int libmvec_ret = 0;
+ if (!flag_trapping_math
+ && flag_unsafe_math_optimizations
+ && flag_finite_math_only
+ && !flag_signed_zeros
+ && !flag_errno_math)
+ switch (cfn)
+ {
+ CASE_CFN_COS:
+ CASE_CFN_COS_FN:
+ CASE_CFN_SIN:
+ CASE_CFN_SIN_FN:
+ if (!boundary_p)
+ {
+ /* With non-default rounding modes, libmvec provides
+ complete garbage in results. E.g.
+ _ZGVcN8v_sinf for 1.40129846e-45f in FE_UPWARD
+ returns 0.00333309174f rather than 1.40129846e-45f. */
+ if (flag_rounding_math)
+ return ~0U;
+ /* https://www.gnu.org/software/libc/manual/html_node/Errors-in-Math-Functions.html
+ claims the libmvec maximum error is 4 ulps.
+ My own random testing indicates 2 ulps for SFmode and
+ 0.5 ulps for DFmode, but let's go with the 4 ulps. */
+ libmvec_ret = 4;
+ }
+ break;
+ default:
+ break;
+ }
+ unsigned int ret = glibc_linux_libm_function_max_error (cfn, mode,
+ boundary_p);
+ return MAX (ret, libmvec_ret);
+ }
+ return default_libm_function_max_error (cfn, mode, boundary_p);
+}
+
+#undef TARGET_LIBM_FUNCTION_MAX_ERROR
+#define TARGET_LIBM_FUNCTION_MAX_ERROR ix86_libm_function_max_error
+
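
As background (an illustrative sketch, not part of the patch): an error bound of N ulps means the result lies within N representable steps of the correctly rounded value, which can be checked roughly like this.

#include <math.h>
#include <stdbool.h>

/* Check that COMPUTED is within MAX_ERROR ulps of EXPECTED by
   stepping EXPECTED with nextafterf.  Illustrative only; assumes
   finite inputs.  */
static bool
within_ulps (float computed, float expected, unsigned max_error)
{
  float lo = expected, hi = expected;
  for (unsigned i = 0; i < max_error; i++)
    {
      lo = nextafterf (lo, -INFINITY);
      hi = nextafterf (hi, INFINITY);
    }
  return computed >= lo && computed <= hi;
}
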
#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index dd9391c..5ac9c78 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -448,6 +448,7 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD]
#define TARGET_DEST_FALSE_DEP_FOR_GLC \
ix86_tune_features[X86_TUNE_DEST_FALSE_DEP_FOR_GLC]
+#define TARGET_SLOW_STC ix86_tune_features[X86_TUNE_SLOW_STC]
/* Feature tests against the various architecture variations. */
enum ix86_arch_indices {
@@ -1166,6 +1167,9 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
#define FIRST_INT_REG AX_REG
#define LAST_INT_REG SP_REG
+#define FIRST_INDEX_REG AX_REG
+#define LAST_INDEX_REG BP_REG
+
#define FIRST_QI_REG AX_REG
#define LAST_QI_REG BX_REG
@@ -1404,7 +1408,11 @@ enum reg_class
#define QI_REGNO_P(N) IN_RANGE ((N), FIRST_QI_REG, LAST_QI_REG)
#define LEGACY_INT_REG_P(X) (REG_P (X) && LEGACY_INT_REGNO_P (REGNO (X)))
-#define LEGACY_INT_REGNO_P(N) (IN_RANGE ((N), FIRST_INT_REG, LAST_INT_REG))
+#define LEGACY_INT_REGNO_P(N) IN_RANGE ((N), FIRST_INT_REG, LAST_INT_REG)
+
+#define LEGACY_INDEX_REG_P(X) (REG_P (X) && LEGACY_INDEX_REGNO_P (REGNO (X)))
+#define LEGACY_INDEX_REGNO_P(N) \
+ IN_RANGE ((N), FIRST_INDEX_REG, LAST_INDEX_REG)
#define REX_INT_REG_P(X) (REG_P (X) && REX_INT_REGNO_P (REGNO (X)))
#define REX_INT_REGNO_P(N) \
@@ -1414,6 +1422,10 @@ enum reg_class
#define GENERAL_REGNO_P(N) \
(LEGACY_INT_REGNO_P (N) || REX_INT_REGNO_P (N))
+#define INDEX_REG_P(X) (REG_P (X) && INDEX_REGNO_P (REGNO (X)))
+#define INDEX_REGNO_P(N) \
+ (LEGACY_INDEX_REGNO_P (N) || REX_INT_REGNO_P (N))
+
#define ANY_QI_REG_P(X) (REG_P (X) && ANY_QI_REGNO_P (REGNO (X)))
#define ANY_QI_REGNO_P(N) \
(TARGET_64BIT ? GENERAL_REGNO_P (N) : QI_REGNO_P (N))
@@ -1678,56 +1690,26 @@ typedef struct ix86_args {
has been allocated, which happens in reginfo.cc during register
allocation. */
-#define REGNO_OK_FOR_INDEX_P(REGNO) \
- ((REGNO) < STACK_POINTER_REGNUM \
- || REX_INT_REGNO_P (REGNO) \
- || (unsigned) reg_renumber[(REGNO)] < STACK_POINTER_REGNUM \
- || REX_INT_REGNO_P ((unsigned) reg_renumber[(REGNO)]))
+#define REGNO_OK_FOR_INDEX_P(REGNO) \
+ (INDEX_REGNO_P (REGNO) \
+ || INDEX_REGNO_P (reg_renumber[(REGNO)]))
-#define REGNO_OK_FOR_BASE_P(REGNO) \
+#define REGNO_OK_FOR_BASE_P(REGNO) \
(GENERAL_REGNO_P (REGNO) \
|| (REGNO) == ARG_POINTER_REGNUM \
|| (REGNO) == FRAME_POINTER_REGNUM \
- || GENERAL_REGNO_P ((unsigned) reg_renumber[(REGNO)]))
-
-/* The macros REG_OK_FOR..._P assume that the arg is a REG rtx
- and check its validity for a certain class.
- We have two alternate definitions for each of them.
- The usual definition accepts all pseudo regs; the other rejects
- them unless they have been allocated suitable hard regs.
- The symbol REG_OK_STRICT causes the latter definition to be used.
-
- Most source files want to accept pseudo regs in the hope that
- they will get allocated to the class that the insn wants them to be in.
- Source files for reload pass need to be strict.
- After reload, it makes no difference, since pseudo regs have
- been eliminated by then. */
-
+ || GENERAL_REGNO_P (reg_renumber[(REGNO)]))
/* Non strict versions, pseudos are ok. */
-#define REG_OK_FOR_INDEX_NONSTRICT_P(X) \
- (REGNO (X) < STACK_POINTER_REGNUM \
- || REX_INT_REGNO_P (REGNO (X)) \
- || REGNO (X) >= FIRST_PSEUDO_REGISTER)
+#define REGNO_OK_FOR_INDEX_NONSTRICT_P(REGNO) \
+ (INDEX_REGNO_P (REGNO) \
+ || !HARD_REGISTER_NUM_P (REGNO))
-#define REG_OK_FOR_BASE_NONSTRICT_P(X) \
- (GENERAL_REGNO_P (REGNO (X)) \
- || REGNO (X) == ARG_POINTER_REGNUM \
- || REGNO (X) == FRAME_POINTER_REGNUM \
- || REGNO (X) >= FIRST_PSEUDO_REGISTER)
-
-/* Strict versions, hard registers only */
-#define REG_OK_FOR_INDEX_STRICT_P(X) REGNO_OK_FOR_INDEX_P (REGNO (X))
-#define REG_OK_FOR_BASE_STRICT_P(X) REGNO_OK_FOR_BASE_P (REGNO (X))
-
-#ifndef REG_OK_STRICT
-#define REG_OK_FOR_INDEX_P(X) REG_OK_FOR_INDEX_NONSTRICT_P (X)
-#define REG_OK_FOR_BASE_P(X) REG_OK_FOR_BASE_NONSTRICT_P (X)
-
-#else
-#define REG_OK_FOR_INDEX_P(X) REG_OK_FOR_INDEX_STRICT_P (X)
-#define REG_OK_FOR_BASE_P(X) REG_OK_FOR_BASE_STRICT_P (X)
-#endif
+#define REGNO_OK_FOR_BASE_NONSTRICT_P(REGNO) \
+ (GENERAL_REGNO_P (REGNO) \
+ || (REGNO) == ARG_POINTER_REGNUM \
+ || (REGNO) == FRAME_POINTER_REGNUM \
+ || !HARD_REGISTER_NUM_P (REGNO))
/* TARGET_LEGITIMATE_ADDRESS_P recognizes an RTL expression
that is a valid memory address for an instruction.
@@ -2361,7 +2343,7 @@ constexpr wide_int_bitmask PTA_ALDERLAKE = PTA_TREMONT | PTA_ADX | PTA_AVX
constexpr wide_int_bitmask PTA_SIERRAFOREST = PTA_ALDERLAKE | PTA_AVXIFMA
| PTA_AVXVNNIINT8 | PTA_AVXNECONVERT | PTA_CMPCCXADD;
constexpr wide_int_bitmask PTA_GRANITERAPIDS = PTA_SAPPHIRERAPIDS | PTA_AMX_FP16
- | PTA_PREFETCHI;
+ | PTA_PREFETCHI | PTA_AMX_COMPLEX;
constexpr wide_int_bitmask PTA_GRANDRIDGE = PTA_SIERRAFOREST | PTA_RAOINT;
constexpr wide_int_bitmask PTA_KNM = PTA_KNL | PTA_AVX5124VNNIW
| PTA_AVX5124FMAPS | PTA_AVX512VPOPCNTDQ;
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index ed689b0..95a6653c 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -114,12 +114,14 @@
UNSPEC_INSN_FALSE_DEP
UNSPEC_SBB
UNSPEC_CC_NE
+ UNSPEC_STC
;; For SSE/MMX support:
UNSPEC_FIX_NOTRUNC
UNSPEC_MASKMOV
UNSPEC_MOVCC_MASK
UNSPEC_MOVMSK
+ UNSPEC_INSERTPS
UNSPEC_BLENDV
UNSPEC_PSHUFB
UNSPEC_XOP_PERMUTE
@@ -127,6 +129,10 @@
UNSPEC_RSQRT
UNSPEC_PSADBW
+ ;; Different from generic us_truncate RTX
+ ;; as it does unsigned saturation of a signed source.
+ UNSPEC_US_TRUNCATE
+
;; For AVX/AVX512F support
UNSPEC_SCALEF
UNSPEC_PCMP
@@ -836,12 +842,12 @@
;; Used to control the "enabled" attribute on a per-instruction basis.
(define_attr "isa" "base,x64,nox64,x64_sse2,x64_sse4,x64_sse4_noavx,
- x64_avx,x64_avx512bw,x64_avx512dq,
+ x64_avx,x64_avx512bw,x64_avx512dq,aes,
sse_noavx,sse2,sse2_noavx,sse3,sse3_noavx,sse4,sse4_noavx,
avx,noavx,avx2,noavx2,bmi,bmi2,fma4,fma,avx512f,noavx512f,
avx512bw,noavx512bw,avx512dq,noavx512dq,fma_or_avx512vl,
avx512vl,noavx512vl,avxvnni,avx512vnnivl,avx512fp16,avxifma,
- avx512ifmavl,avxneconvert,avx512bf16vl"
+ avx512ifmavl,avxneconvert,avx512bf16vl,vpclmulqdqvl"
(const_string "base"))
;; Define instruction set of MMX instructions
@@ -863,6 +869,7 @@
(symbol_ref "TARGET_64BIT && TARGET_AVX512BW")
(eq_attr "isa" "x64_avx512dq")
(symbol_ref "TARGET_64BIT && TARGET_AVX512DQ")
+ (eq_attr "isa" "aes") (symbol_ref "TARGET_AES")
(eq_attr "isa" "sse_noavx")
(symbol_ref "TARGET_SSE && !TARGET_AVX")
(eq_attr "isa" "sse2") (symbol_ref "TARGET_SSE2")
@@ -903,6 +910,8 @@
(eq_attr "isa" "avxneconvert") (symbol_ref "TARGET_AVXNECONVERT")
(eq_attr "isa" "avx512bf16vl")
(symbol_ref "TARGET_AVX512BF16 && TARGET_AVX512VL")
+ (eq_attr "isa" "vpclmulqdqvl")
+ (symbol_ref "TARGET_VPCLMULQDQ && TARGET_AVX512VL")
(eq_attr "mmx_isa" "native")
(symbol_ref "!TARGET_MMX_WITH_SSE")
@@ -1453,29 +1462,66 @@
[(set_attr "type" "icmp")
(set_attr "mode" "<MODE>")])
+(define_insn "*cmpqi_ext<mode>_1_mem_rex64"
+ [(set (reg FLAGS_REG)
+ (compare
+ (match_operand:QI 0 "norex_memory_operand" "Bn")
+ (subreg:QI
+ (match_operator:SWI248 2 "extract_operator"
+ [(match_operand 1 "int248_register_operand" "Q")
+ (const_int 8)
+ (const_int 8)]) 0)))]
+ "TARGET_64BIT && reload_completed
+ && ix86_match_ccmode (insn, CCmode)"
+ "cmp{b}\t{%h1, %0|%0, %h1}"
+ [(set_attr "type" "icmp")
+ (set_attr "mode" "QI")])
+
(define_insn "*cmpqi_ext<mode>_1"
[(set (reg FLAGS_REG)
(compare
- (match_operand:QI 0 "nonimm_x64constmem_operand" "QBc,m")
+ (match_operand:QI 0 "nonimmediate_operand" "QBc,m")
(subreg:QI
- (zero_extract:SWI248
- (match_operand 1 "int248_register_operand" "Q,Q")
- (const_int 8)
- (const_int 8)) 0)))]
+ (match_operator:SWI248 2 "extract_operator"
+ [(match_operand 1 "int248_register_operand" "Q,Q")
+ (const_int 8)
+ (const_int 8)]) 0)))]
"ix86_match_ccmode (insn, CCmode)"
"cmp{b}\t{%h1, %0|%0, %h1}"
[(set_attr "isa" "*,nox64")
(set_attr "type" "icmp")
(set_attr "mode" "QI")])
+(define_peephole2
+ [(set (match_operand:QI 0 "register_operand")
+ (match_operand:QI 1 "norex_memory_operand"))
+ (set (match_operand 3 "flags_reg_operand")
+ (match_operator 4 "compare_operator"
+ [(match_dup 0)
+ (subreg:QI
+ (match_operator:SWI248 5 "extract_operator"
+ [(match_operand 2 "int248_register_operand")
+ (const_int 8)
+ (const_int 8)]) 0)]))]
+ "TARGET_64BIT
+ && peep2_reg_dead_p (2, operands[0])"
+ [(set (match_dup 3)
+ (match_op_dup 4
+ [(match_dup 1)
+ (subreg:QI
+ (match_op_dup 5
+ [(match_dup 2)
+ (const_int 8)
+ (const_int 8)]) 0)]))])
+
(define_insn "*cmpqi_ext<mode>_2"
[(set (reg FLAGS_REG)
(compare
(subreg:QI
- (zero_extract:SWI248
- (match_operand 0 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)) 0)
+ (match_operator:SWI248 2 "extract_operator"
+ [(match_operand 0 "int248_register_operand" "Q")
+ (const_int 8)
+ (const_int 8)]) 0)
(match_operand:QI 1 "const0_operand")))]
"ix86_match_ccmode (insn, CCNOmode)"
"test{b}\t%h0, %h0"
@@ -1493,34 +1539,71 @@
(const_int 8)) 0)
(match_operand:QI 1 "const_int_operand")))])
+(define_insn "*cmpqi_ext<mode>_3_mem_rex64"
+ [(set (reg FLAGS_REG)
+ (compare
+ (subreg:QI
+ (match_operator:SWI248 2 "extract_operator"
+ [(match_operand 0 "int248_register_operand" "Q")
+ (const_int 8)
+ (const_int 8)]) 0)
+ (match_operand:QI 1 "norex_memory_operand" "Bn")))]
+ "TARGET_64BIT && reload_completed
+ && ix86_match_ccmode (insn, CCmode)"
+ "cmp{b}\t{%1, %h0|%h0, %1}"
+ [(set_attr "type" "icmp")
+ (set_attr "mode" "QI")])
+
(define_insn "*cmpqi_ext<mode>_3"
[(set (reg FLAGS_REG)
(compare
(subreg:QI
- (zero_extract:SWI248
- (match_operand 0 "int248_register_operand" "Q,Q")
- (const_int 8)
- (const_int 8)) 0)
- (match_operand:QI 1 "general_x64constmem_operand" "QnBc,m")))]
+ (match_operator:SWI248 2 "extract_operator"
+ [(match_operand 0 "int248_register_operand" "Q,Q")
+ (const_int 8)
+ (const_int 8)]) 0)
+ (match_operand:QI 1 "general_operand" "QnBc,m")))]
"ix86_match_ccmode (insn, CCmode)"
"cmp{b}\t{%1, %h0|%h0, %1}"
[(set_attr "isa" "*,nox64")
(set_attr "type" "icmp")
(set_attr "mode" "QI")])
+(define_peephole2
+ [(set (match_operand:QI 0 "register_operand")
+ (match_operand:QI 1 "norex_memory_operand"))
+ (set (match_operand 3 "flags_reg_operand")
+ (match_operator 4 "compare_operator"
+ [(subreg:QI
+ (match_operator:SWI248 5 "extract_operator"
+ [(match_operand 2 "int248_register_operand")
+ (const_int 8)
+ (const_int 8)]) 0)
+ (match_dup 0)]))]
+ "TARGET_64BIT
+ && peep2_reg_dead_p (2, operands[0])"
+ [(set (match_dup 3)
+ (match_op_dup 4
+ [(subreg:QI
+ (match_op_dup 5
+ [(match_dup 2)
+ (const_int 8)
+ (const_int 8)]) 0)
+ (match_dup 1)]))])
+
(define_insn "*cmpqi_ext<mode>_4"
[(set (reg FLAGS_REG)
(compare
(subreg:QI
- (zero_extract:SWI248
- (match_operand 0 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)) 0)
+ (match_operator:SWI248 2 "extract_operator"
+ [(match_operand 0 "int248_register_operand" "Q")
+ (const_int 8)
+ (const_int 8)]) 0)
(subreg:QI
- (zero_extract:SWI248
- (match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)) 0)))]
+ (match_operator:SWI248 3 "extract_operator"
+ [(match_operand 1 "int248_register_operand" "Q")
+ (const_int 8)
+ (const_int 8)]) 0)))]
"ix86_match_ccmode (insn, CCmode)"
"cmp{b}\t{%h1, %h0|%h0, %h1}"
[(set_attr "type" "icmp")
@@ -1921,6 +2004,53 @@
[(set_attr "type" "ssecomi")
(set_attr "prefix" "evex")
(set_attr "mode" "HF")])
+
+;; Set carry flag.
+(define_insn "x86_stc"
+ [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_STC))]
+ ""
+ "stc"
+ [(set_attr "length" "1")
+ (set_attr "length_immediate" "0")
+ (set_attr "modrm" "0")])
+
+;; On Pentium 4, set the carry flag using mov $1,%al;addb $-1,%al.
+(define_peephole2
+ [(match_scratch:QI 0 "r")
+ (set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_STC))]
+ "TARGET_SLOW_STC && !optimize_insn_for_size_p ()"
+ [(set (match_dup 0) (const_int 1))
+ (parallel
+ [(set (reg:CCC FLAGS_REG)
+ (compare:CCC (plus:QI (match_dup 0) (const_int -1))
+ (match_dup 0)))
+ (set (match_dup 0) (plus:QI (match_dup 0) (const_int -1)))])])
+
+;; Complement carry flag.
+(define_insn "*x86_cmc"
+ [(set (reg:CCC FLAGS_REG)
+ (compare:CCC (neg:QI (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+ (geu:QI (reg:CCC FLAGS_REG) (const_int 0))))]
+ ""
+ "cmc"
+ [(set_attr "length" "1")
+ (set_attr "length_immediate" "0")
+ (set_attr "use_carry" "1")
+ (set_attr "modrm" "0")])
+
+;; On Pentium 4, cmc is replaced with setnc %al;addb $-1,%al.
+(define_peephole2
+ [(match_scratch:QI 0 "r")
+ (set (reg:CCC FLAGS_REG)
+ (compare:CCC (neg:QI (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+ (geu:QI (reg:CCC FLAGS_REG) (const_int 0))))]
+ "TARGET_SLOW_STC && !optimize_insn_for_size_p ()"
+ [(set (match_dup 0) (ne:QI (reg:CCC FLAGS_REG) (const_int 0)))
+ (parallel
+ [(set (reg:CCC FLAGS_REG)
+ (compare:CCC (plus:QI (match_dup 0) (const_int -1))
+ (match_dup 0)))
+ (set (match_dup 0) (plus:QI (match_dup 0) (const_int -1)))])])
;; Push/pop instructions.
@@ -3188,18 +3318,6 @@
operands[1] = copy_to_reg (operands[1]);
})
-(define_insn "*extzvqi_mem_rex64"
- [(set (match_operand:QI 0 "norex_memory_operand" "=Bn")
- (subreg:QI
- (zero_extract:SWI248
- (match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)) 0))]
- "TARGET_64BIT && reload_completed"
- "mov{b}\t{%h1, %0|%0, %h1}"
- [(set_attr "type" "imov")
- (set_attr "mode" "QI")])
-
(define_insn "*extzv<mode>"
[(set (match_operand:SWI248 0 "register_operand" "=R")
(zero_extract:SWI248 (match_operand 1 "int248_register_operand" "Q")
@@ -3210,13 +3328,25 @@
[(set_attr "type" "imovx")
(set_attr "mode" "SI")])
+(define_insn "*extzvqi_mem_rex64"
+ [(set (match_operand:QI 0 "norex_memory_operand" "=Bn")
+ (subreg:QI
+ (match_operator:SWI248 2 "extract_operator"
+ [(match_operand 1 "int248_register_operand" "Q")
+ (const_int 8)
+ (const_int 8)]) 0))]
+ "TARGET_64BIT && reload_completed"
+ "mov{b}\t{%h1, %0|%0, %h1}"
+ [(set_attr "type" "imov")
+ (set_attr "mode" "QI")])
+
(define_insn "*extzvqi"
[(set (match_operand:QI 0 "nonimmediate_operand" "=QBc,?R,m")
(subreg:QI
- (zero_extract:SWI248
- (match_operand 1 "int248_register_operand" "Q,Q,Q")
- (const_int 8)
- (const_int 8)) 0))]
+ (match_operator:SWI248 2 "extract_operator"
+ [(match_operand 1 "int248_register_operand" "Q,Q,Q")
+ (const_int 8)
+ (const_int 8)]) 0))]
""
{
switch (get_attr_type (insn))
@@ -3242,17 +3372,19 @@
(define_peephole2
[(set (match_operand:QI 0 "register_operand")
(subreg:QI
- (zero_extract:SWI248 (match_operand 1 "int248_register_operand")
- (const_int 8)
- (const_int 8)) 0))
+ (match_operator:SWI248 3 "extract_operator"
+ [(match_operand 1 "int248_register_operand")
+ (const_int 8)
+ (const_int 8)]) 0))
(set (match_operand:QI 2 "norex_memory_operand") (match_dup 0))]
"TARGET_64BIT
&& peep2_reg_dead_p (2, operands[0])"
[(set (match_dup 2)
(subreg:QI
- (zero_extract:SWI248 (match_dup 1)
- (const_int 8)
- (const_int 8)) 0))])
+ (match_op_dup 3
+ [(match_dup 1)
+ (const_int 8)
+ (const_int 8)]) 0))])
(define_expand "insv<mode>"
[(set (zero_extract:SWI248 (match_operand:SWI248 0 "register_operand")
@@ -3373,18 +3505,15 @@
operands[4] = gen_int_mode (tmp, <SWI48:MODE>mode);
})
-
-(define_code_iterator any_extract [sign_extract zero_extract])
-
(define_insn "*insvqi_2"
[(set (zero_extract:SWI248
(match_operand 0 "int248_register_operand" "+Q")
(const_int 8)
(const_int 8))
- (any_extract:SWI248
- (match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)))]
+ (match_operator:SWI248 2 "extract_operator"
+ [(match_operand 1 "int248_register_operand" "Q")
+ (const_int 8)
+ (const_int 8)]))]
""
"mov{b}\t{%h1, %h0|%h0, %h1}"
[(set_attr "type" "imov")
@@ -3402,6 +3531,32 @@
"mov{b}\t{%h1, %h0|%h0, %h1}"
[(set_attr "type" "imov")
(set_attr "mode" "QI")])
+
+(define_code_iterator any_or_plus [plus ior xor])
+
+(define_insn_and_split "*insvti_highpart_1"
+ [(set (match_operand:TI 0 "nonimmediate_operand" "=ro,r,r,&r")
+ (any_or_plus:TI
+ (and:TI
+ (match_operand:TI 1 "nonimmediate_operand" "r,m,r,m")
+ (match_operand:TI 3 "const_scalar_int_operand" "n,n,n,n"))
+ (ashift:TI
+ (zero_extend:TI
+ (match_operand:DI 2 "nonimmediate_operand" "r,r,m,m"))
+ (const_int 64))))]
+ "TARGET_64BIT
+ && CONST_WIDE_INT_P (operands[3])
+ && CONST_WIDE_INT_NUNITS (operands[3]) == 2
+ && CONST_WIDE_INT_ELT (operands[3], 0) == -1
+ && CONST_WIDE_INT_ELT (operands[3], 1) == 0"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ operands[4] = gen_lowpart (DImode, operands[1]);
+ split_double_concat (TImode, operands[0], operands[4], operands[2]);
+ DONE;
+})
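
For illustration (an assumed source-level equivalent, not taken from the patch): the pattern above matches code that replaces the high 64 bits of a 128-bit value.

unsigned __int128
set_highpart (unsigned __int128 x, unsigned long long hi)
{
  /* The mask keeps the low 64 bits; the shifted zero-extension
     supplies the new high half.  This is the *insvti_highpart_1
     shape.  */
  return (x & 0xffffffffffffffffULL) | ((unsigned __int128) hi << 64);
}
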
;; Floating point push instructions.
@@ -4782,10 +4937,10 @@
[(set (match_operand:SWI24 0 "register_operand" "=R")
(sign_extend:SWI24
(subreg:QI
- (zero_extract:SWI248
- (match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)) 0)))]
+ (match_operator:SWI248 2 "extract_operator"
+ [(match_operand 1 "int248_register_operand" "Q")
+ (const_int 8)
+ (const_int 8)]) 0)))]
""
"movs{b<SWI24:imodesuffix>|x}\t{%h1, %0|%0, %h1}"
[(set_attr "type" "imovx")
@@ -6021,6 +6176,36 @@
(clobber (reg:CC FLAGS_REG))])]
"split_double_mode (<DWI>mode, &operands[0], 2, &operands[0], &operands[3]);")
+(define_insn_and_split "*add<dwi>3_doubleword_concat"
+ [(set (match_operand:<DWI> 0 "register_operand" "=&r")
+ (plus:<DWI>
+ (any_or_plus:<DWI>
+ (ashift:<DWI>
+ (zero_extend:<DWI>
+ (match_operand:DWIH 2 "nonimmediate_operand" "rm"))
+ (match_operand:QI 3 "const_int_operand"))
+ (zero_extend:<DWI>
+ (match_operand:DWIH 4 "nonimmediate_operand" "rm")))
+ (match_operand:<DWI> 1 "register_operand" "0")))
+ (clobber (reg:CC FLAGS_REG))]
+ "INTVAL (operands[3]) == <MODE_SIZE> * BITS_PER_UNIT"
+ "#"
+ "&& reload_completed"
+ [(parallel [(set (reg:CCC FLAGS_REG)
+ (compare:CCC
+ (plus:DWIH (match_dup 1) (match_dup 4))
+ (match_dup 1)))
+ (set (match_dup 0)
+ (plus:DWIH (match_dup 1) (match_dup 4)))])
+ (parallel [(set (match_dup 5)
+ (plus:DWIH
+ (plus:DWIH
+ (ltu:DWIH (reg:CC FLAGS_REG) (const_int 0))
+ (match_dup 6))
+ (match_dup 2)))
+ (clobber (reg:CC FLAGS_REG))])]
+ "split_double_mode (<DWI>mode, &operands[0], 2, &operands[0], &operands[5]);")
+
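
A rough source-level example of what the doubleword-concat add pattern above targets (own sketch, assuming x86-64 and unsigned __int128):

unsigned __int128
add_concat (unsigned long long lo, unsigned long long hi,
            unsigned __int128 x)
{
  /* ((hi << 64) | lo) + x: the split emits an add of the low halves
     followed by an adc, without first materializing the
     concatenation in a register pair.  */
  return (((unsigned __int128) hi << 64) | lo) + x;
}
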
(define_insn "*add<mode>_1"
[(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r,r")
(plus:SWI48
@@ -6645,10 +6830,10 @@
[(set (match_operand:QI 0 "nonimm_x64constmem_operand" "=QBc,m")
(plus:QI
(subreg:QI
- (zero_extract:SWI248
- (match_operand 2 "int248_register_operand" "Q,Q")
- (const_int 8)
- (const_int 8)) 0)
+ (match_operator:SWI248 3 "extract_operator"
+ [(match_operand 2 "int248_register_operand" "Q,Q")
+ (const_int 8)
+ (const_int 8)]) 0)
(match_operand:QI 1 "nonimm_x64constmem_operand" "0,0")))
(clobber (reg:CC FLAGS_REG))]
""
@@ -6679,10 +6864,10 @@
(subreg:SWI248
(plus:QI
(subreg:QI
- (zero_extract:SWI248
- (match_operand 1 "int248_register_operand" "0,0")
- (const_int 8)
- (const_int 8)) 0)
+ (match_operator:SWI248 3 "extract_operator"
+ [(match_operand 1 "int248_register_operand" "0,0")
+ (const_int 8)
+ (const_int 8)]) 0)
(match_operand:QI 2 "general_x64constmem_operand" "QnBc,m")) 0))
(clobber (reg:CC FLAGS_REG))]
"/* FIXME: without this LRA can't reload this pattern, see PR82524. */
@@ -6718,15 +6903,15 @@
(subreg:SWI248
(plus:QI
(subreg:QI
- (zero_extract:SWI248
- (match_operand 1 "int248_register_operand" "%0")
- (const_int 8)
- (const_int 8)) 0)
+ (match_operator:SWI248 3 "extract_operator"
+ [(match_operand 1 "int248_register_operand" "%0")
+ (const_int 8)
+ (const_int 8)]) 0)
(subreg:QI
- (zero_extract:SWI248
- (match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)) 0)) 0))
+ (match_operator:SWI248 4 "extract_operator"
+ [(match_operand 2 "int248_register_operand" "Q")
+ (const_int 8)
+ (const_int 8)]) 0)) 0))
(clobber (reg:CC FLAGS_REG))]
"/* FIXME: without this LRA can't reload this pattern, see PR82524. */
rtx_equal_p (operands[0], operands[1])
@@ -6991,7 +7176,7 @@
(define_insn_and_split "*lea<mode>_general_1"
[(set (match_operand:SWI12 0 "register_operand" "=r")
(plus:SWI12
- (plus:SWI12 (match_operand:SWI12 1 "index_register_operand" "l")
+ (plus:SWI12 (match_operand:SWI12 1 "register_no_SP_operand" "l")
(match_operand:SWI12 2 "register_operand" "r"))
(match_operand:SWI12 3 "immediate_operand" "i")))]
"!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
@@ -7013,7 +7198,7 @@
(define_insn_and_split "*lea<mode>_general_2"
[(set (match_operand:SWI12 0 "register_operand" "=r")
(plus:SWI12
- (mult:SWI12 (match_operand:SWI12 1 "index_register_operand" "l")
+ (mult:SWI12 (match_operand:SWI12 1 "register_no_SP_operand" "l")
(match_operand 2 "const248_operand" "n"))
(match_operand:SWI12 3 "nonmemory_operand" "ri")))]
"!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
@@ -7034,7 +7219,7 @@
(define_insn_and_split "*lea<mode>_general_2b"
[(set (match_operand:SWI12 0 "register_operand" "=r")
(plus:SWI12
- (ashift:SWI12 (match_operand:SWI12 1 "index_register_operand" "l")
+ (ashift:SWI12 (match_operand:SWI12 1 "register_no_SP_operand" "l")
(match_operand 2 "const123_operand" "n"))
(match_operand:SWI12 3 "nonmemory_operand" "ri")))]
"!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
@@ -7056,7 +7241,7 @@
[(set (match_operand:SWI12 0 "register_operand" "=r")
(plus:SWI12
(plus:SWI12
- (mult:SWI12 (match_operand:SWI12 1 "index_register_operand" "l")
+ (mult:SWI12 (match_operand:SWI12 1 "register_no_SP_operand" "l")
(match_operand 2 "const248_operand" "n"))
(match_operand:SWI12 3 "register_operand" "r"))
(match_operand:SWI12 4 "immediate_operand" "i")))]
@@ -7082,7 +7267,7 @@
[(set (match_operand:SWI12 0 "register_operand" "=r")
(plus:SWI12
(plus:SWI12
- (ashift:SWI12 (match_operand:SWI12 1 "index_register_operand" "l")
+ (ashift:SWI12 (match_operand:SWI12 1 "register_no_SP_operand" "l")
(match_operand 2 "const123_operand" "n"))
(match_operand:SWI12 3 "register_operand" "r"))
(match_operand:SWI12 4 "immediate_operand" "i")))]
@@ -7108,7 +7293,7 @@
[(set (match_operand:SWI12 0 "register_operand" "=r")
(any_or:SWI12
(ashift:SWI12
- (match_operand:SWI12 1 "index_register_operand" "l")
+ (match_operand:SWI12 1 "register_no_SP_operand" "l")
(match_operand 2 "const_0_to_3_operand"))
(match_operand 3 "const_int_operand")))]
"(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
@@ -7132,7 +7317,7 @@
[(set (match_operand:SWI48 0 "register_operand" "=r")
(any_or:SWI48
(ashift:SWI48
- (match_operand:SWI48 1 "index_register_operand" "l")
+ (match_operand:SWI48 1 "register_no_SP_operand" "l")
(match_operand 2 "const_0_to_3_operand"))
(match_operand 3 "const_int_operand")))]
"(unsigned HOST_WIDE_INT) INTVAL (operands[3])
@@ -7286,10 +7471,10 @@
(minus:QI
(match_operand:QI 1 "nonimm_x64constmem_operand" "0,0")
(subreg:QI
- (zero_extract:SWI248
- (match_operand 2 "int248_register_operand" "Q,Q")
- (const_int 8)
- (const_int 8)) 0)))
+ (match_operator:SWI248 3 "extract_operator"
+ [(match_operand 2 "int248_register_operand" "Q,Q")
+ (const_int 8)
+ (const_int 8)]) 0)))
(clobber (reg:CC FLAGS_REG))]
""
"sub{b}\t{%h2, %0|%0, %h2}"
@@ -7305,15 +7490,15 @@
(subreg:SWI248
(minus:QI
(subreg:QI
- (zero_extract:SWI248
- (match_operand 1 "int248_register_operand" "0")
- (const_int 8)
- (const_int 8)) 0)
+ (match_operator:SWI248 3 "extract_operator"
+ [(match_operand 1 "int248_register_operand" "0")
+ (const_int 8)
+ (const_int 8)]) 0)
(subreg:QI
- (zero_extract:SWI248
- (match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)) 0)) 0))
+ (match_operator:SWI248 4 "extract_operator"
+ [(match_operand 2 "int248_register_operand" "Q")
+ (const_int 8)
+ (const_int 8)]) 0)) 0))
(clobber (reg:CC FLAGS_REG))]
"/* FIXME: without this LRA can't reload this pattern, see PR82524. */
rtx_equal_p (operands[0], operands[1])"
@@ -7582,6 +7767,25 @@
[(set (reg:CC FLAGS_REG)
(compare:CC (match_dup 0) (match_dup 1)))])
+(define_peephole2
+ [(set (match_operand:SWI 0 "general_reg_operand")
+ (match_operand:SWI 1 "memory_operand"))
+ (parallel [(set (reg:CC FLAGS_REG)
+ (compare:CC (match_dup 0)
+ (match_operand:SWI 2 "memory_operand")))
+ (set (match_dup 0)
+ (minus:SWI (match_dup 0) (match_dup 2)))])
+ (set (match_dup 1) (match_dup 0))]
+ "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+ && peep2_reg_dead_p (3, operands[0])
+ && !reg_overlap_mentioned_p (operands[0], operands[1])
+ && !reg_overlap_mentioned_p (operands[0], operands[2])"
+ [(set (match_dup 0) (match_dup 2))
+ (parallel [(set (reg:CC FLAGS_REG)
+ (compare:CC (match_dup 1) (match_dup 0)))
+ (set (match_dup 1)
+ (minus:SWI (match_dup 1) (match_dup 0)))])])
+
;; decl %eax; cmpl $-1, %eax; jne .Lxx; can be optimized into
;; subl $1, %eax; jnc .Lxx;
(define_peephole2
@@ -7667,6 +7871,59 @@
(set_attr "pent_pair" "pu")
(set_attr "mode" "<MODE>")])
+(define_peephole2
+ [(set (match_operand:SWI 0 "general_reg_operand")
+ (match_operand:SWI 1 "memory_operand"))
+ (parallel [(set (match_dup 0)
+ (plus:SWI
+ (plus:SWI
+ (match_operator:SWI 4 "ix86_carry_flag_operator"
+ [(match_operand 3 "flags_reg_operand")
+ (const_int 0)])
+ (match_dup 0))
+ (match_operand:SWI 2 "memory_operand")))
+ (clobber (reg:CC FLAGS_REG))])
+ (set (match_dup 1) (match_dup 0))]
+ "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+ && peep2_reg_dead_p (3, operands[0])
+ && !reg_overlap_mentioned_p (operands[0], operands[1])
+ && !reg_overlap_mentioned_p (operands[0], operands[2])"
+ [(set (match_dup 0) (match_dup 2))
+ (parallel [(set (match_dup 1)
+ (plus:SWI (plus:SWI (match_op_dup 4
+ [(match_dup 3) (const_int 0)])
+ (match_dup 1))
+ (match_dup 0)))
+ (clobber (reg:CC FLAGS_REG))])])
+
+(define_peephole2
+ [(set (match_operand:SWI 0 "general_reg_operand")
+ (match_operand:SWI 1 "memory_operand"))
+ (parallel [(set (match_dup 0)
+ (plus:SWI
+ (plus:SWI
+ (match_operator:SWI 4 "ix86_carry_flag_operator"
+ [(match_operand 3 "flags_reg_operand")
+ (const_int 0)])
+ (match_dup 0))
+ (match_operand:SWI 2 "memory_operand")))
+ (clobber (reg:CC FLAGS_REG))])
+ (set (match_operand:SWI 5 "general_reg_operand") (match_dup 0))
+ (set (match_dup 1) (match_dup 5))]
+ "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+ && peep2_reg_dead_p (3, operands[0])
+ && peep2_reg_dead_p (4, operands[5])
+ && !reg_overlap_mentioned_p (operands[0], operands[1])
+ && !reg_overlap_mentioned_p (operands[0], operands[2])
+ && !reg_overlap_mentioned_p (operands[5], operands[1])"
+ [(set (match_dup 0) (match_dup 2))
+ (parallel [(set (match_dup 1)
+ (plus:SWI (plus:SWI (match_op_dup 4
+ [(match_dup 3) (const_int 0)])
+ (match_dup 1))
+ (match_dup 0)))
+ (clobber (reg:CC FLAGS_REG))])])
+
(define_insn "*add<mode>3_carry_0"
[(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
(plus:SWI
@@ -7767,6 +8024,149 @@
(set_attr "pent_pair" "pu")
(set_attr "mode" "<MODE>")])
+(define_peephole2
+ [(parallel [(set (reg:CCC FLAGS_REG)
+ (compare:CCC
+ (zero_extend:<DWI>
+ (plus:SWI48
+ (plus:SWI48
+ (match_operator:SWI48 4 "ix86_carry_flag_operator"
+ [(match_operand 2 "flags_reg_operand")
+ (const_int 0)])
+ (match_operand:SWI48 0 "general_reg_operand"))
+ (match_operand:SWI48 1 "memory_operand")))
+ (plus:<DWI>
+ (zero_extend:<DWI> (match_dup 1))
+ (match_operator:<DWI> 3 "ix86_carry_flag_operator"
+ [(match_dup 2) (const_int 0)]))))
+ (set (match_dup 0)
+ (plus:SWI48 (plus:SWI48 (match_op_dup 4
+ [(match_dup 2) (const_int 0)])
+ (match_dup 0))
+ (match_dup 1)))])
+ (set (match_dup 1) (match_dup 0))]
+ "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+ && peep2_reg_dead_p (2, operands[0])
+ && !reg_overlap_mentioned_p (operands[0], operands[1])"
+ [(parallel [(set (reg:CCC FLAGS_REG)
+ (compare:CCC
+ (zero_extend:<DWI>
+ (plus:SWI48
+ (plus:SWI48
+ (match_op_dup 4
+ [(match_dup 2) (const_int 0)])
+ (match_dup 1))
+ (match_dup 0)))
+ (plus:<DWI>
+ (zero_extend:<DWI> (match_dup 0))
+ (match_op_dup 3
+ [(match_dup 2) (const_int 0)]))))
+ (set (match_dup 1)
+ (plus:SWI48 (plus:SWI48 (match_op_dup 4
+ [(match_dup 2) (const_int 0)])
+ (match_dup 1))
+ (match_dup 0)))])])
+
+(define_peephole2
+ [(set (match_operand:SWI48 0 "general_reg_operand")
+ (match_operand:SWI48 1 "memory_operand"))
+ (parallel [(set (reg:CCC FLAGS_REG)
+ (compare:CCC
+ (zero_extend:<DWI>
+ (plus:SWI48
+ (plus:SWI48
+ (match_operator:SWI48 5 "ix86_carry_flag_operator"
+ [(match_operand 3 "flags_reg_operand")
+ (const_int 0)])
+ (match_dup 0))
+ (match_operand:SWI48 2 "memory_operand")))
+ (plus:<DWI>
+ (zero_extend:<DWI> (match_dup 2))
+ (match_operator:<DWI> 4 "ix86_carry_flag_operator"
+ [(match_dup 3) (const_int 0)]))))
+ (set (match_dup 0)
+ (plus:SWI48 (plus:SWI48 (match_op_dup 5
+ [(match_dup 3) (const_int 0)])
+ (match_dup 0))
+ (match_dup 2)))])
+ (set (match_dup 1) (match_dup 0))]
+ "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+ && peep2_reg_dead_p (3, operands[0])
+ && !reg_overlap_mentioned_p (operands[0], operands[1])
+ && !reg_overlap_mentioned_p (operands[0], operands[2])"
+ [(set (match_dup 0) (match_dup 2))
+ (parallel [(set (reg:CCC FLAGS_REG)
+ (compare:CCC
+ (zero_extend:<DWI>
+ (plus:SWI48
+ (plus:SWI48
+ (match_op_dup 5
+ [(match_dup 3) (const_int 0)])
+ (match_dup 1))
+ (match_dup 0)))
+ (plus:<DWI>
+ (zero_extend:<DWI> (match_dup 0))
+ (match_op_dup 4
+ [(match_dup 3) (const_int 0)]))))
+ (set (match_dup 1)
+ (plus:SWI48 (plus:SWI48 (match_op_dup 5
+ [(match_dup 3) (const_int 0)])
+ (match_dup 1))
+ (match_dup 0)))])])
+
+(define_peephole2
+ [(parallel [(set (reg:CCC FLAGS_REG)
+ (compare:CCC
+ (zero_extend:<DWI>
+ (plus:SWI48
+ (plus:SWI48
+ (match_operator:SWI48 4 "ix86_carry_flag_operator"
+ [(match_operand 2 "flags_reg_operand")
+ (const_int 0)])
+ (match_operand:SWI48 0 "general_reg_operand"))
+ (match_operand:SWI48 1 "memory_operand")))
+ (plus:<DWI>
+ (zero_extend:<DWI> (match_dup 1))
+ (match_operator:<DWI> 3 "ix86_carry_flag_operator"
+ [(match_dup 2) (const_int 0)]))))
+ (set (match_dup 0)
+ (plus:SWI48 (plus:SWI48 (match_op_dup 4
+ [(match_dup 2) (const_int 0)])
+ (match_dup 0))
+ (match_dup 1)))])
+ (set (match_operand:QI 5 "general_reg_operand")
+ (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+ (set (match_operand:SWI48 6 "general_reg_operand")
+ (zero_extend:SWI48 (match_dup 5)))
+ (set (match_dup 1) (match_dup 0))]
+ "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+ && peep2_reg_dead_p (4, operands[0])
+ && !reg_overlap_mentioned_p (operands[0], operands[1])
+ && !reg_overlap_mentioned_p (operands[0], operands[5])
+ && !reg_overlap_mentioned_p (operands[5], operands[1])
+ && !reg_overlap_mentioned_p (operands[0], operands[6])
+ && !reg_overlap_mentioned_p (operands[6], operands[1])"
+ [(parallel [(set (reg:CCC FLAGS_REG)
+ (compare:CCC
+ (zero_extend:<DWI>
+ (plus:SWI48
+ (plus:SWI48
+ (match_op_dup 4
+ [(match_dup 2) (const_int 0)])
+ (match_dup 1))
+ (match_dup 0)))
+ (plus:<DWI>
+ (zero_extend:<DWI> (match_dup 0))
+ (match_op_dup 3
+ [(match_dup 2) (const_int 0)]))))
+ (set (match_dup 1)
+ (plus:SWI48 (plus:SWI48 (match_op_dup 4
+ [(match_dup 2) (const_int 0)])
+ (match_dup 1))
+ (match_dup 0)))])
+ (set (match_dup 5) (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+ (set (match_dup 6) (zero_extend:SWI48 (match_dup 5)))])
+
(define_expand "addcarry<mode>_0"
[(parallel
[(set (reg:CCC FLAGS_REG)
@@ -7837,6 +8237,59 @@
(set_attr "pent_pair" "pu")
(set_attr "mode" "<MODE>")])
+(define_peephole2
+ [(set (match_operand:SWI 0 "general_reg_operand")
+ (match_operand:SWI 1 "memory_operand"))
+ (parallel [(set (match_dup 0)
+ (minus:SWI
+ (minus:SWI
+ (match_dup 0)
+ (match_operator:SWI 4 "ix86_carry_flag_operator"
+ [(match_operand 3 "flags_reg_operand")
+ (const_int 0)]))
+ (match_operand:SWI 2 "memory_operand")))
+ (clobber (reg:CC FLAGS_REG))])
+ (set (match_dup 1) (match_dup 0))]
+ "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+ && peep2_reg_dead_p (3, operands[0])
+ && !reg_overlap_mentioned_p (operands[0], operands[1])
+ && !reg_overlap_mentioned_p (operands[0], operands[2])"
+ [(set (match_dup 0) (match_dup 2))
+ (parallel [(set (match_dup 1)
+ (minus:SWI (minus:SWI (match_dup 1)
+ (match_op_dup 4
+ [(match_dup 3) (const_int 0)]))
+ (match_dup 0)))
+ (clobber (reg:CC FLAGS_REG))])])
+
+(define_peephole2
+ [(set (match_operand:SWI 0 "general_reg_operand")
+ (match_operand:SWI 1 "memory_operand"))
+ (parallel [(set (match_dup 0)
+ (minus:SWI
+ (minus:SWI
+ (match_dup 0)
+ (match_operator:SWI 4 "ix86_carry_flag_operator"
+ [(match_operand 3 "flags_reg_operand")
+ (const_int 0)]))
+ (match_operand:SWI 2 "memory_operand")))
+ (clobber (reg:CC FLAGS_REG))])
+ (set (match_operand:SWI 5 "general_reg_operand") (match_dup 0))
+ (set (match_dup 1) (match_dup 5))]
+ "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+ && peep2_reg_dead_p (3, operands[0])
+ && peep2_reg_dead_p (4, operands[5])
+ && !reg_overlap_mentioned_p (operands[0], operands[1])
+ && !reg_overlap_mentioned_p (operands[0], operands[2])
+ && !reg_overlap_mentioned_p (operands[5], operands[1])"
+ [(set (match_dup 0) (match_dup 2))
+ (parallel [(set (match_dup 1)
+ (minus:SWI (minus:SWI (match_dup 1)
+ (match_op_dup 4
+ [(match_dup 3) (const_int 0)]))
+ (match_dup 0)))
+ (clobber (reg:CC FLAGS_REG))])])
+
(define_insn "*sub<mode>3_carry_0"
[(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
(minus:SWI
@@ -7962,13 +8415,13 @@
[(set (reg:CCC FLAGS_REG)
(compare:CCC
(zero_extend:<DWI>
- (match_operand:SWI48 1 "nonimmediate_operand" "0"))
+ (match_operand:SWI48 1 "nonimmediate_operand" "0,0"))
(plus:<DWI>
(match_operator:<DWI> 4 "ix86_carry_flag_operator"
[(match_operand 3 "flags_reg_operand") (const_int 0)])
(zero_extend:<DWI>
- (match_operand:SWI48 2 "nonimmediate_operand" "rm")))))
- (set (match_operand:SWI48 0 "register_operand" "=r")
+ (match_operand:SWI48 2 "nonimmediate_operand" "r,rm")))))
+ (set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r")
(minus:SWI48 (minus:SWI48
(match_dup 1)
(match_operator:SWI48 5 "ix86_carry_flag_operator"
@@ -7981,6 +8434,154 @@
(set_attr "pent_pair" "pu")
(set_attr "mode" "<MODE>")])
+(define_peephole2
+ [(set (match_operand:SWI48 0 "general_reg_operand")
+ (match_operand:SWI48 1 "memory_operand"))
+ (parallel [(set (reg:CCC FLAGS_REG)
+ (compare:CCC
+ (zero_extend:<DWI> (match_dup 0))
+ (plus:<DWI>
+ (match_operator:<DWI> 4 "ix86_carry_flag_operator"
+ [(match_operand 3 "flags_reg_operand") (const_int 0)])
+ (zero_extend:<DWI>
+ (match_operand:SWI48 2 "memory_operand")))))
+ (set (match_dup 0)
+ (minus:SWI48
+ (minus:SWI48
+ (match_dup 0)
+ (match_operator:SWI48 5 "ix86_carry_flag_operator"
+ [(match_dup 3) (const_int 0)]))
+ (match_dup 2)))])
+ (set (match_dup 1) (match_dup 0))]
+ "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+ && peep2_reg_dead_p (3, operands[0])
+ && !reg_overlap_mentioned_p (operands[0], operands[1])
+ && !reg_overlap_mentioned_p (operands[0], operands[2])"
+ [(set (match_dup 0) (match_dup 2))
+ (parallel [(set (reg:CCC FLAGS_REG)
+ (compare:CCC
+ (zero_extend:<DWI> (match_dup 1))
+ (plus:<DWI> (match_op_dup 4
+ [(match_dup 3) (const_int 0)])
+ (zero_extend:<DWI> (match_dup 0)))))
+ (set (match_dup 1)
+ (minus:SWI48 (minus:SWI48 (match_dup 1)
+ (match_op_dup 5
+ [(match_dup 3) (const_int 0)]))
+ (match_dup 0)))])])
+
+(define_peephole2
+ [(set (match_operand:SWI48 6 "general_reg_operand")
+ (match_operand:SWI48 7 "memory_operand"))
+ (set (match_operand:SWI48 8 "general_reg_operand")
+ (match_operand:SWI48 9 "memory_operand"))
+ (parallel [(set (reg:CCC FLAGS_REG)
+ (compare:CCC
+ (zero_extend:<DWI>
+ (match_operand:SWI48 0 "general_reg_operand"))
+ (plus:<DWI>
+ (match_operator:<DWI> 4 "ix86_carry_flag_operator"
+ [(match_operand 3 "flags_reg_operand") (const_int 0)])
+ (zero_extend:<DWI>
+ (match_operand:SWI48 2 "general_reg_operand")))))
+ (set (match_dup 0)
+ (minus:SWI48
+ (minus:SWI48
+ (match_dup 0)
+ (match_operator:SWI48 5 "ix86_carry_flag_operator"
+ [(match_dup 3) (const_int 0)]))
+ (match_dup 2)))])
+ (set (match_operand:SWI48 1 "memory_operand") (match_dup 0))]
+ "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+ && peep2_reg_dead_p (4, operands[0])
+ && peep2_reg_dead_p (3, operands[2])
+ && !reg_overlap_mentioned_p (operands[0], operands[1])
+ && !reg_overlap_mentioned_p (operands[2], operands[1])
+ && !reg_overlap_mentioned_p (operands[6], operands[9])
+ && (rtx_equal_p (operands[6], operands[0])
+ ? (rtx_equal_p (operands[7], operands[1])
+ && rtx_equal_p (operands[8], operands[2]))
+ : (rtx_equal_p (operands[8], operands[0])
+ && rtx_equal_p (operands[9], operands[1])
+ && rtx_equal_p (operands[6], operands[2])))"
+ [(set (match_dup 0) (match_dup 9))
+ (parallel [(set (reg:CCC FLAGS_REG)
+ (compare:CCC
+ (zero_extend:<DWI> (match_dup 1))
+ (plus:<DWI> (match_op_dup 4
+ [(match_dup 3) (const_int 0)])
+ (zero_extend:<DWI> (match_dup 0)))))
+ (set (match_dup 1)
+ (minus:SWI48 (minus:SWI48 (match_dup 1)
+ (match_op_dup 5
+ [(match_dup 3) (const_int 0)]))
+ (match_dup 0)))])]
+{
+ if (!rtx_equal_p (operands[6], operands[0]))
+ operands[9] = operands[7];
+})
+
+(define_peephole2
+ [(set (match_operand:SWI48 6 "general_reg_operand")
+ (match_operand:SWI48 7 "memory_operand"))
+ (set (match_operand:SWI48 8 "general_reg_operand")
+ (match_operand:SWI48 9 "memory_operand"))
+ (parallel [(set (reg:CCC FLAGS_REG)
+ (compare:CCC
+ (zero_extend:<DWI>
+ (match_operand:SWI48 0 "general_reg_operand"))
+ (plus:<DWI>
+ (match_operator:<DWI> 4 "ix86_carry_flag_operator"
+ [(match_operand 3 "flags_reg_operand") (const_int 0)])
+ (zero_extend:<DWI>
+ (match_operand:SWI48 2 "general_reg_operand")))))
+ (set (match_dup 0)
+ (minus:SWI48
+ (minus:SWI48
+ (match_dup 0)
+ (match_operator:SWI48 5 "ix86_carry_flag_operator"
+ [(match_dup 3) (const_int 0)]))
+ (match_dup 2)))])
+ (set (match_operand:QI 10 "general_reg_operand")
+ (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+ (set (match_operand:SWI48 11 "general_reg_operand")
+ (zero_extend:SWI48 (match_dup 10)))
+ (set (match_operand:SWI48 1 "memory_operand") (match_dup 0))]
+ "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+ && peep2_reg_dead_p (6, operands[0])
+ && peep2_reg_dead_p (3, operands[2])
+ && !reg_overlap_mentioned_p (operands[0], operands[1])
+ && !reg_overlap_mentioned_p (operands[2], operands[1])
+ && !reg_overlap_mentioned_p (operands[6], operands[9])
+ && !reg_overlap_mentioned_p (operands[0], operands[10])
+ && !reg_overlap_mentioned_p (operands[10], operands[1])
+ && !reg_overlap_mentioned_p (operands[0], operands[11])
+ && !reg_overlap_mentioned_p (operands[11], operands[1])
+ && (rtx_equal_p (operands[6], operands[0])
+ ? (rtx_equal_p (operands[7], operands[1])
+ && rtx_equal_p (operands[8], operands[2]))
+ : (rtx_equal_p (operands[8], operands[0])
+ && rtx_equal_p (operands[9], operands[1])
+ && rtx_equal_p (operands[6], operands[2])))"
+ [(set (match_dup 0) (match_dup 9))
+ (parallel [(set (reg:CCC FLAGS_REG)
+ (compare:CCC
+ (zero_extend:<DWI> (match_dup 1))
+ (plus:<DWI> (match_op_dup 4
+ [(match_dup 3) (const_int 0)])
+ (zero_extend:<DWI> (match_dup 0)))))
+ (set (match_dup 1)
+ (minus:SWI48 (minus:SWI48 (match_dup 1)
+ (match_op_dup 5
+ [(match_dup 3) (const_int 0)]))
+ (match_dup 0)))])
+ (set (match_dup 10) (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+ (set (match_dup 11) (zero_extend:SWI48 (match_dup 10)))]
+{
+ if (!rtx_equal_p (operands[6], operands[0]))
+ operands[9] = operands[7];
+})
+
(define_expand "subborrow<mode>_0"
[(parallel
[(set (reg:CC FLAGS_REG)
@@ -7991,6 +8592,63 @@
(minus:SWI48 (match_dup 1) (match_dup 2)))])]
"ix86_binary_operator_ok (MINUS, <MODE>mode, operands)")
+(define_expand "uaddc<mode>5"
+ [(match_operand:SWI48 0 "register_operand")
+ (match_operand:SWI48 1 "register_operand")
+ (match_operand:SWI48 2 "register_operand")
+ (match_operand:SWI48 3 "register_operand")
+ (match_operand:SWI48 4 "nonmemory_operand")]
+ ""
+{
+ rtx cf = gen_rtx_REG (CCCmode, FLAGS_REG), pat, pat2;
+ if (operands[4] == const0_rtx)
+ emit_insn (gen_addcarry<mode>_0 (operands[0], operands[2], operands[3]));
+ else
+ {
+ ix86_expand_carry (operands[4]);
+ pat = gen_rtx_LTU (<DWI>mode, cf, const0_rtx);
+ pat2 = gen_rtx_LTU (<MODE>mode, cf, const0_rtx);
+ emit_insn (gen_addcarry<mode> (operands[0], operands[2], operands[3],
+ cf, pat, pat2));
+ }
+ rtx cc = gen_reg_rtx (QImode);
+ pat = gen_rtx_LTU (QImode, cf, const0_rtx);
+ emit_insn (gen_rtx_SET (cc, pat));
+ emit_insn (gen_zero_extendqi<mode>2 (operands[1], cc));
+ DONE;
+})
+
+(define_expand "usubc<mode>5"
+ [(match_operand:SWI48 0 "register_operand")
+ (match_operand:SWI48 1 "register_operand")
+ (match_operand:SWI48 2 "register_operand")
+ (match_operand:SWI48 3 "register_operand")
+ (match_operand:SWI48 4 "nonmemory_operand")]
+ ""
+{
+ rtx cf, pat, pat2;
+ if (operands[4] == const0_rtx)
+ {
+ cf = gen_rtx_REG (CCmode, FLAGS_REG);
+ emit_insn (gen_subborrow<mode>_0 (operands[0], operands[2],
+ operands[3]));
+ }
+ else
+ {
+ cf = gen_rtx_REG (CCCmode, FLAGS_REG);
+ ix86_expand_carry (operands[4]);
+ pat = gen_rtx_LTU (<DWI>mode, cf, const0_rtx);
+ pat2 = gen_rtx_LTU (<MODE>mode, cf, const0_rtx);
+ emit_insn (gen_subborrow<mode> (operands[0], operands[2], operands[3],
+ cf, pat, pat2));
+ }
+ rtx cc = gen_reg_rtx (QImode);
+ pat = gen_rtx_LTU (QImode, cf, const0_rtx);
+ emit_insn (gen_rtx_SET (cc, pat));
+ emit_insn (gen_zero_extendqi<mode>2 (operands[1], cc));
+ DONE;
+})
+
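The uaddc<mode>5 and usubc<mode>5 expanders wire the x86 carry flag into the generic add-with-carry and subtract-with-borrow optabs. A hedged sketch of the source-level idiom they are meant to serve, assuming the middle end recognizes the chained __builtin_add_overflow form:

#include <stdbool.h>

/* Illustrative only: add with carry in and carry out; the expander above
   allows this to become an adc on x86-64 when the idiom is recognized.  */
unsigned long
addc (unsigned long a, unsigned long b, bool carry_in, bool *carry_out)
{
  unsigned long s;
  bool c1 = __builtin_add_overflow (a, b, &s);
  bool c2 = __builtin_add_overflow (s, (unsigned long) carry_in, &s);
  *carry_out = c1 | c2;
  return s;
}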
(define_mode_iterator CC_CCC [CC CCC])
;; Pre-reload splitter to optimize
@@ -8003,7 +8661,39 @@
"ix86_pre_reload_split ()"
"#"
"&& 1"
- [(const_int 0)])
+ [(const_int 0)]
+ "emit_note (NOTE_INSN_DELETED); DONE;")
+
+;; Set the carry flag from the carry flag.
+(define_insn_and_split "*setccc"
+ [(set (reg:CCC FLAGS_REG)
+ (reg:CCC FLAGS_REG))]
+ "ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+ "emit_note (NOTE_INSN_DELETED); DONE;")
+
+;; Set the carry flag from the carry flag.
+(define_insn_and_split "*setcc_qi_negqi_ccc_1_<mode>"
+ [(set (reg:CCC FLAGS_REG)
+ (ltu:CCC (reg:CC_CCC FLAGS_REG) (const_int 0)))]
+ "ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+ "emit_note (NOTE_INSN_DELETED); DONE;")
+
+;; Set the carry flag from the carry flag.
+(define_insn_and_split "*setcc_qi_negqi_ccc_2_<mode>"
+ [(set (reg:CCC FLAGS_REG)
+ (unspec:CCC [(ltu:QI (reg:CC_CCC FLAGS_REG) (const_int 0))
+ (const_int 0)] UNSPEC_CC_NE))]
+ "ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+ "emit_note (NOTE_INSN_DELETED); DONE;")
;; Overflow setting add instructions
@@ -8062,6 +8752,27 @@
(match_dup 1)))
(set (match_dup 1) (plus:SWI (match_dup 1) (match_dup 0)))])])
+(define_peephole2
+ [(set (match_operand:SWI 0 "general_reg_operand")
+ (match_operand:SWI 1 "memory_operand"))
+ (parallel [(set (reg:CCC FLAGS_REG)
+ (compare:CCC
+ (plus:SWI (match_dup 0)
+ (match_operand:SWI 2 "memory_operand"))
+ (match_dup 0)))
+ (set (match_dup 0) (plus:SWI (match_dup 0) (match_dup 2)))])
+ (set (match_dup 1) (match_dup 0))]
+ "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+ && peep2_reg_dead_p (3, operands[0])
+ && !reg_overlap_mentioned_p (operands[0], operands[1])
+ && !reg_overlap_mentioned_p (operands[0], operands[2])"
+ [(set (match_dup 0) (match_dup 2))
+ (parallel [(set (reg:CCC FLAGS_REG)
+ (compare:CCC
+ (plus:SWI (match_dup 1) (match_dup 0))
+ (match_dup 1)))
+ (set (match_dup 1) (plus:SWI (match_dup 1) (match_dup 0)))])])
+
(define_insn "*addsi3_zext_cc_overflow_1"
[(set (reg:CCC FLAGS_REG)
(compare:CCC
@@ -9897,10 +10608,10 @@
(compare
(and:QI
(subreg:QI
- (zero_extract:SWI248
- (match_operand 0 "int248_register_operand" "Q,Q")
- (const_int 8)
- (const_int 8)) 0)
+ (match_operator:SWI248 2 "extract_operator"
+ [(match_operand 0 "int248_register_operand" "Q,Q")
+ (const_int 8)
+ (const_int 8)]) 0)
(match_operand:QI 1 "general_x64constmem_operand" "QnBc,m"))
(const_int 0)))]
"ix86_match_ccmode (insn, CCNOmode)"
@@ -9914,15 +10625,15 @@
(compare
(and:QI
(subreg:QI
- (zero_extract:SWI248
- (match_operand 0 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)) 0)
+ (match_operator:SWI248 2 "extract_operator"
+ [(match_operand 0 "int248_register_operand" "Q")
+ (const_int 8)
+ (const_int 8)]) 0)
(subreg:QI
- (zero_extract:SWI248
- (match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)) 0))
+ (match_operator:SWI248 3 "extract_operator"
+ [(match_operand 1 "int248_register_operand" "Q")
+ (const_int 8)
+ (const_int 8)]) 0))
(const_int 0)))]
"ix86_match_ccmode (insn, CCNOmode)"
"test{b}\t{%h1, %h0|%h0, %h1}"
@@ -10564,10 +11275,10 @@
[(set (match_operand:QI 0 "nonimm_x64constmem_operand" "=QBc,m")
(and:QI
(subreg:QI
- (zero_extract:SWI248
- (match_operand 2 "int248_register_operand" "Q,Q")
- (const_int 8)
- (const_int 8)) 0)
+ (match_operator:SWI248 3 "extract_operator"
+ [(match_operand 2 "int248_register_operand" "Q,Q")
+ (const_int 8)
+ (const_int 8)]) 0)
(match_operand:QI 1 "nonimm_x64constmem_operand" "0,0")))
(clobber (reg:CC FLAGS_REG))]
""
@@ -10598,10 +11309,10 @@
(subreg:SWI248
(and:QI
(subreg:QI
- (zero_extract:SWI248
- (match_operand 1 "int248_register_operand" "0,0")
- (const_int 8)
- (const_int 8)) 0)
+ (match_operator:SWI248 3 "extract_operator"
+ [(match_operand 1 "int248_register_operand" "0,0")
+ (const_int 8)
+ (const_int 8)]) 0)
(match_operand:QI 2 "general_x64constmem_operand" "QnBc,m")) 0))
(clobber (reg:CC FLAGS_REG))]
"/* FIXME: without this LRA can't reload this pattern, see PR82524. */
@@ -10618,10 +11329,10 @@
(compare
(and:QI
(subreg:QI
- (zero_extract:SWI248
- (match_operand 1 "int248_register_operand" "0,0")
- (const_int 8)
- (const_int 8)) 0)
+ (match_operator:SWI248 3 "extract_operator"
+ [(match_operand 1 "int248_register_operand" "0,0")
+ (const_int 8)
+ (const_int 8)]) 0)
(match_operand:QI 2 "general_x64constmem_operand" "QnBc,m"))
(const_int 0)))
(set (zero_extract:SWI248
@@ -10631,10 +11342,10 @@
(subreg:SWI248
(and:QI
(subreg:QI
- (zero_extract:SWI248
- (match_dup 1)
- (const_int 8)
- (const_int 8)) 0)
+ (match_op_dup 3
+ [(match_dup 1)
+ (const_int 8)
+ (const_int 8)]) 0)
(match_dup 2)) 0))]
"ix86_match_ccmode (insn, CCNOmode)
/* FIXME: without this LRA can't reload this pattern, see PR82524. */
@@ -10652,15 +11363,15 @@
(subreg:SWI248
(and:QI
(subreg:QI
- (zero_extract:SWI248
- (match_operand 1 "int248_register_operand" "%0")
- (const_int 8)
- (const_int 8)) 0)
+ (match_operator:SWI248 3 "extract_operator"
+ [(match_operand 1 "int248_register_operand" "%0")
+ (const_int 8)
+ (const_int 8)]) 0)
(subreg:QI
- (zero_extract:SWI248
- (match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)) 0)) 0))
+ (match_operator:SWI248 4 "extract_operator"
+ [(match_operand 2 "int248_register_operand" "Q")
+ (const_int 8)
+ (const_int 8)]) 0)) 0))
(clobber (reg:CC FLAGS_REG))]
"/* FIXME: without this LRA can't reload this pattern, see PR82524. */
rtx_equal_p (operands[0], operands[1])
@@ -11321,10 +12032,10 @@
[(set (match_operand:QI 0 "nonimm_x64constmem_operand" "=QBc,m")
(any_or:QI
(subreg:QI
- (zero_extract:SWI248
- (match_operand 2 "int248_register_operand" "Q,Q")
- (const_int 8)
- (const_int 8)) 0)
+ (match_operator:SWI248 3 "extract_operator"
+ [(match_operand 2 "int248_register_operand" "Q,Q")
+ (const_int 8)
+ (const_int 8)]) 0)
(match_operand:QI 1 "nonimm_x64constmem_operand" "0,0")))
(clobber (reg:CC FLAGS_REG))]
""
@@ -11341,10 +12052,10 @@
(subreg:SWI248
(any_or:QI
(subreg:QI
- (zero_extract:SWI248
- (match_operand 1 "int248_register_operand" "0,0")
- (const_int 8)
- (const_int 8)) 0)
+ (match_operator:SWI248 3 "extract_operator"
+ [(match_operand 1 "int248_register_operand" "0,0")
+ (const_int 8)
+ (const_int 8)]) 0)
(match_operand:QI 2 "general_x64constmem_operand" "QnBc,m")) 0))
(clobber (reg:CC FLAGS_REG))]
"(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
@@ -11363,15 +12074,15 @@
(subreg:SWI248
(any_or:QI
(subreg:QI
- (zero_extract:SWI248
- (match_operand 1 "int248_register_operand" "%0")
- (const_int 8)
- (const_int 8)) 0)
+ (match_operator:SWI248 3 "extract_operator"
+ [(match_operand 1 "int248_register_operand" "%0")
+ (const_int 8)
+ (const_int 8)]) 0)
(subreg:QI
- (zero_extract:SWI248
- (match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)) 0)) 0))
+ (match_operator:SWI248 4 "extract_operator"
+ [(match_operand 2 "int248_register_operand" "Q")
+ (const_int 8)
+ (const_int 8)]) 0)) 0))
(clobber (reg:CC FLAGS_REG))]
"(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
/* FIXME: without this LRA can't reload this pattern, see PR82524. */
@@ -11469,10 +12180,10 @@
(compare
(xor:QI
(subreg:QI
- (zero_extract:SWI248
- (match_operand 1 "int248_register_operand" "0,0")
- (const_int 8)
- (const_int 8)) 0)
+ (match_operator:SWI248 3 "extract_operator"
+ [(match_operand 1 "int248_register_operand" "0,0")
+ (const_int 8)
+ (const_int 8)]) 0)
(match_operand:QI 2 "general_x64constmem_operand" "QnBc,m"))
(const_int 0)))
(set (zero_extract:SWI248
@@ -11482,10 +12193,10 @@
(subreg:SWI248
(xor:QI
(subreg:QI
- (zero_extract:SWI248
- (match_dup 1)
- (const_int 8)
- (const_int 8)) 0)
+ (match_op_dup 3
+ [(match_dup 1)
+ (const_int 8)
+ (const_int 8)]) 0)
(match_dup 2)) 0))]
"ix86_match_ccmode (insn, CCNOmode)
/* FIXME: without this LRA can't reload this pattern, see PR82524. */
@@ -11496,18 +12207,17 @@
(set_attr "mode" "QI")])
;; Split DST = (HI<<32)|LO early to minimize register usage.
-(define_code_iterator any_or_plus [plus ior xor])
(define_insn_and_split "*concat<mode><dwi>3_1"
[(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro,r")
(any_or_plus:<DWI>
(ashift:<DWI> (match_operand:<DWI> 1 "register_operand" "r,r")
- (match_operand:<DWI> 2 "const_int_operand"))
+ (match_operand:QI 2 "const_int_operand"))
(zero_extend:<DWI>
(match_operand:DWIH 3 "nonimmediate_operand" "r,m"))))]
"INTVAL (operands[2]) == <MODE_SIZE> * BITS_PER_UNIT"
"#"
"&& reload_completed"
- [(clobber (const_int 0))]
+ [(const_int 0)]
{
split_double_concat (<DWI>mode, operands[0], operands[3],
gen_lowpart (<MODE>mode, operands[1]));
@@ -11520,11 +12230,11 @@
(zero_extend:<DWI>
(match_operand:DWIH 1 "nonimmediate_operand" "r,m"))
(ashift:<DWI> (match_operand:<DWI> 2 "register_operand" "r,r")
- (match_operand:<DWI> 3 "const_int_operand"))))]
+ (match_operand:QI 3 "const_int_operand"))))]
"INTVAL (operands[3]) == <MODE_SIZE> * BITS_PER_UNIT"
"#"
"&& reload_completed"
- [(clobber (const_int 0))]
+ [(const_int 0)]
{
split_double_concat (<DWI>mode, operands[0], operands[1],
gen_lowpart (<MODE>mode, operands[2]));
@@ -11537,13 +12247,13 @@
(ashift:<DWI>
(zero_extend:<DWI>
(match_operand:DWIH 1 "nonimmediate_operand" "r,m,r,m"))
- (match_operand:<DWI> 2 "const_int_operand"))
+ (match_operand:QI 2 "const_int_operand"))
(zero_extend:<DWI>
(match_operand:DWIH 3 "nonimmediate_operand" "r,r,m,m"))))]
"INTVAL (operands[2]) == <MODE_SIZE> * BITS_PER_UNIT"
"#"
"&& reload_completed"
- [(clobber (const_int 0))]
+ [(const_int 0)]
{
split_double_concat (<DWI>mode, operands[0], operands[3], operands[1]);
DONE;
@@ -11557,11 +12267,11 @@
(ashift:<DWI>
(zero_extend:<DWI>
(match_operand:DWIH 2 "nonimmediate_operand" "r,r,m,m"))
- (match_operand:<DWI> 3 "const_int_operand"))))]
+ (match_operand:QI 3 "const_int_operand"))))]
"INTVAL (operands[3]) == <MODE_SIZE> * BITS_PER_UNIT"
"#"
"&& reload_completed"
- [(clobber (const_int 0))]
+ [(const_int 0)]
{
split_double_concat (<DWI>mode, operands[0], operands[1], operands[2]);
DONE;
@@ -11571,7 +12281,7 @@
[(set (match_operand:DWI 0 "nonimmediate_operand" "=r,o,o")
(any_or_plus:DWI
(ashift:DWI (match_operand:DWI 1 "register_operand" "r,r,r")
- (match_operand:DWI 2 "const_int_operand"))
+ (match_operand:QI 2 "const_int_operand"))
(match_operand:DWI 3 "const_scalar_int_operand" "n,n,Wd")))]
"INTVAL (operands[2]) == <MODE_SIZE> * BITS_PER_UNIT / 2
&& (<MODE>mode == DImode
@@ -11588,7 +12298,7 @@
VOIDmode))"
"#"
"&& reload_completed"
- [(clobber (const_int 0))]
+ [(const_int 0)]
{
rtx op3 = simplify_subreg (<HALF>mode, operands[3], <MODE>mode, 0);
split_double_concat (<MODE>mode, operands[0], op3,
@@ -11603,7 +12313,7 @@
(ashift:<DWI>
(zero_extend:<DWI>
(match_operand:DWIH 1 "nonimmediate_operand" "r,r,r,m"))
- (match_operand:<DWI> 2 "const_int_operand"))
+ (match_operand:QI 2 "const_int_operand"))
(match_operand:<DWI> 3 "const_scalar_int_operand" "n,n,Wd,n")))]
"INTVAL (operands[2]) == <MODE_SIZE> * BITS_PER_UNIT
&& (<DWI>mode == DImode
@@ -11620,7 +12330,7 @@
VOIDmode))"
"#"
"&& reload_completed"
- [(clobber (const_int 0))]
+ [(const_int 0)]
{
rtx op3 = simplify_subreg (<MODE>mode, operands[3], <DWI>mode, 0);
split_double_concat (<DWI>mode, operands[0], op3, operands[1]);
@@ -11646,7 +12356,7 @@
VOIDmode)"
"#"
"&& reload_completed"
- [(clobber (const_int 0))]
+ [(const_int 0)]
{
rtx op2;
if (<DWI>mode == DImode)
@@ -11879,10 +12589,10 @@
(subreg:SWI248
(neg:QI
(subreg:QI
- (zero_extract:SWI248
- (match_operand 1 "int248_register_operand" "0")
- (const_int 8)
- (const_int 8)) 0)) 0))
+ (match_operator:SWI248 2 "extract_operator"
+ [(match_operand 1 "int248_register_operand" "0")
+ (const_int 8)
+ (const_int 8)]) 0)) 0))
(clobber (reg:CC FLAGS_REG))]
"/* FIXME: without this LRA can't reload this pattern, see PR82524. */
rtx_equal_p (operands[0], operands[1])"
@@ -13254,8 +13964,8 @@
;; Convert ashift to the lea pattern to avoid flags dependency.
(define_split
- [(set (match_operand:SWI 0 "register_operand")
- (ashift:SWI (match_operand:SWI 1 "index_register_operand")
+ [(set (match_operand:SWI 0 "general_reg_operand")
+ (ashift:SWI (match_operand:SWI 1 "index_reg_operand")
(match_operand 2 "const_0_to_3_operand")))
(clobber (reg:CC FLAGS_REG))]
"reload_completed
@@ -13273,9 +13983,9 @@
;; Convert ashift to the lea pattern to avoid flags dependency.
(define_split
- [(set (match_operand:DI 0 "register_operand")
+ [(set (match_operand:DI 0 "general_reg_operand")
(zero_extend:DI
- (ashift:SI (match_operand:SI 1 "index_register_operand")
+ (ashift:SI (match_operand:SI 1 "index_reg_operand")
(match_operand 2 "const_0_to_3_operand"))))
(clobber (reg:CC FLAGS_REG))]
"TARGET_64BIT && reload_completed
@@ -13440,10 +14150,10 @@
(subreg:SWI248
(ashift:QI
(subreg:QI
- (zero_extract:SWI248
- (match_operand 1 "int248_register_operand" "0")
- (const_int 8)
- (const_int 8)) 0)
+ (match_operator:SWI248 3 "extract_operator"
+ [(match_operand 1 "int248_register_operand" "0")
+ (const_int 8)
+ (const_int 8)]) 0)
(match_operand:QI 2 "nonmemory_operand" "cI")) 0))
(clobber (reg:CC FLAGS_REG))]
"/* FIXME: without this LRA can't reload this pattern, see PR82524. */
@@ -14343,10 +15053,10 @@
(subreg:SWI248
(any_shiftrt:QI
(subreg:QI
- (zero_extract:SWI248
- (match_operand 1 "int248_register_operand" "0")
- (const_int 8)
- (const_int 8)) 0)
+ (match_operator:SWI248 3 "extract_operator"
+ [(match_operand 1 "int248_register_operand" "0")
+ (const_int 8)
+ (const_int 8)]) 0)
(match_operand:QI 2 "nonmemory_operand" "cI")) 0))
(clobber (reg:CC FLAGS_REG))]
"/* FIXME: without this LRA can't reload this pattern, see PR82524. */
@@ -21958,6 +22668,22 @@
[(set_attr "type" "icmov")
(set_attr "mode" "SI")])
+(define_insn "*movsicc_noc_zext_1"
+ [(set (match_operand:DI 0 "nonimmediate_operand" "=r,r")
+ (zero_extend:DI
+ (if_then_else:SI (match_operator 1 "ix86_comparison_operator"
+ [(reg FLAGS_REG) (const_int 0)])
+ (match_operand:SI 2 "nonimmediate_operand" "rm,0")
+ (match_operand:SI 3 "nonimmediate_operand" "0,rm"))))]
+ "TARGET_64BIT
+ && TARGET_CMOVE && !(MEM_P (operands[2]) && MEM_P (operands[3]))"
+ "@
+ cmov%O2%C1\t{%2, %k0|%k0, %2}
+ cmov%O2%c1\t{%3, %k0|%k0, %3}"
+ [(set_attr "type" "icmov")
+ (set_attr "mode" "SI")])
+
+
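The new *movsicc_noc_zext_1 pattern matches a 32-bit conditional move whose result is zero-extended to 64 bits, exploiting the fact that writing a 32-bit register already clears the upper half. A small sketch (illustrative; actual code generation depends on how combine forms the zero_extend):

unsigned long long
sel (int c, unsigned int a, unsigned int b)
{
  /* May become a single 32-bit cmov; the upper 32 bits are zeroed
     implicitly.  */
  return c ? a : b;
}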
;; Don't do conditional moves with memory inputs. This splitter helps
;; register starved x86_32 by forcing inputs into registers before reload.
(define_split
@@ -23197,9 +23923,10 @@
(match_operator 1 "compare_operator"
[(and:QI
(subreg:QI
- (zero_extract:SWI248 (match_operand 2 "int248_register_operand")
- (const_int 8)
- (const_int 8)) 0)
+ (match_operator:SWI248 4 "extract_operator"
+ [(match_operand 2 "int248_register_operand")
+ (const_int 8)
+ (const_int 8)]) 0)
(match_operand 3 "const_int_operand"))
(const_int 0)]))]
"! TARGET_PARTIAL_REG_STALL
@@ -23211,9 +23938,9 @@
(match_op_dup 1
[(and:QI
(subreg:QI
- (zero_extract:SWI248 (match_dup 2)
- (const_int 8)
- (const_int 8)) 0)
+ (match_op_dup 4 [(match_dup 2)
+ (const_int 8)
+ (const_int 8)]) 0)
(match_dup 3))
(const_int 0)]))
(set (zero_extract:SWI248 (match_dup 2)
@@ -23222,9 +23949,9 @@
(subreg:SWI248
(and:QI
(subreg:QI
- (zero_extract:SWI248 (match_dup 2)
- (const_int 8)
- (const_int 8)) 0)
+ (match_op_dup 4 [(match_dup 2)
+ (const_int 8)
+ (const_int 8)]) 0)
(match_dup 3)) 0))])])
;; Don't do logical operations with memory inputs.
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 94fdd63..d74f6b1 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1274,3 +1274,7 @@ Enum(lam_type) String(u48) Value(lam_u48)
EnumValue
Enum(lam_type) String(u57) Value(lam_u57)
+
+mamx-complex
+Target Mask(ISA2_AMX_COMPLEX) Var(ix86_isa_flags2) Save
+Support AMX-COMPLEX built-in functions and code generation.
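A usage sketch for the new option (illustrative only; it assumes GCC follows its usual __AMX_*__ convention and defines __AMX_COMPLEX__ when -mamx-complex is in effect):

#include <immintrin.h>

void
maybe_use_amx_complex (void)
{
#ifdef __AMX_COMPLEX__
  /* AMX-COMPLEX tile intrinsics from <amxcomplexintrin.h> could be used
     here, typically combined with a runtime CPUID check.  */
#endif
}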
diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h
index 740de63..b220d87 100644
--- a/gcc/config/i386/immintrin.h
+++ b/gcc/config/i386/immintrin.h
@@ -132,6 +132,8 @@
#include <amxbf16intrin.h>
+#include <amxcomplexintrin.h>
+
#include <prfchwintrin.h>
#include <keylockerintrin.h>
diff --git a/gcc/config/i386/mingw-w64.h b/gcc/config/i386/mingw-w64.h
index 3a21cec..0146ed4 100644
--- a/gcc/config/i386/mingw-w64.h
+++ b/gcc/config/i386/mingw-w64.h
@@ -25,7 +25,27 @@ along with GCC; see the file COPYING3. If not see
#define CPP_SPEC "%{posix:-D_POSIX_SOURCE} %{mthreads:-D_MT} " \
"%{municode:-DUNICODE} " \
"%{" SPEC_PTHREAD1 ":-D_REENTRANT} " \
- "%{" SPEC_PTHREAD2 ":-U_REENTRANT} "
+ "%{" SPEC_PTHREAD2 ":-U_REENTRANT} " \
+ "%{mcrtdll=crtdll*:-U__MSVCRT__ -D__CRTDLL__} " \
+ "%{mcrtdll=msvcrt10*:-D__MSVCRT_VERSION__=0x100} " \
+ "%{mcrtdll=msvcrt20*:-D__MSVCRT_VERSION__=0x200} " \
+ "%{mcrtdll=msvcrt40*:-D__MSVCRT_VERSION__=0x400} " \
+ "%{mcrtdll=msvcrt-os*:-D__MSVCRT_VERSION__=0x700} " \
+ "%{mcrtdll=msvcr70*:-D__MSVCRT_VERSION__=0x700} " \
+ "%{mcrtdll=msvcr71*:-D__MSVCRT_VERSION__=0x701} " \
+ "%{mcrtdll=msvcr80*:-D__MSVCRT_VERSION__=0x800} " \
+ "%{mcrtdll=msvcr90*:-D__MSVCRT_VERSION__=0x900} " \
+ "%{mcrtdll=msvcr100*:-D__MSVCRT_VERSION__=0xA00} " \
+ "%{mcrtdll=msvcr110*:-D__MSVCRT_VERSION__=0xB00} " \
+ "%{mcrtdll=msvcr120*:-D__MSVCRT_VERSION__=0xC00} " \
+ "%{mcrtdll=ucrt*:-D_UCRT} "
+
+#undef REAL_LIBGCC_SPEC
+#define REAL_LIBGCC_SPEC \
+ "%{mthreads:-lmingwthrd} -lmingw32 \
+ " SHARED_LIBGCC_SPEC " \
+ -lmingwex %{!mcrtdll=*:-lmsvcrt} %{mcrtdll=*:-l%*} \
+ -lkernel32 " MCFGTHREAD_SPEC
#undef STARTFILE_SPEC
#define STARTFILE_SPEC "%{shared|mdll:dllcrt2%O%s} \
diff --git a/gcc/config/i386/mingw.opt b/gcc/config/i386/mingw.opt
index 0ae026a..dd66a50 100644
--- a/gcc/config/i386/mingw.opt
+++ b/gcc/config/i386/mingw.opt
@@ -18,6 +18,10 @@
; along with GCC; see the file COPYING3. If not see
; <http://www.gnu.org/licenses/>.
+mcrtdll=
+Target RejectNegative Joined
+Preprocess, compile or link with specified C RunTime DLL library.
+
pthread
Driver
diff --git a/gcc/config/i386/mingw32.h b/gcc/config/i386/mingw32.h
index 6a55baa..a1ee001 100644
--- a/gcc/config/i386/mingw32.h
+++ b/gcc/config/i386/mingw32.h
@@ -89,7 +89,20 @@ along with GCC; see the file COPYING3. If not see
#undef CPP_SPEC
#define CPP_SPEC "%{posix:-D_POSIX_SOURCE} %{mthreads:-D_MT} " \
"%{" SPEC_PTHREAD1 ":-D_REENTRANT} " \
- "%{" SPEC_PTHREAD2 ": } "
+ "%{" SPEC_PTHREAD2 ": } " \
+ "%{mcrtdll=crtdll*:-U__MSVCRT__ -D__CRTDLL__} " \
+ "%{mcrtdll=msvcrt10*:-D__MSVCRT_VERSION__=0x100} " \
+ "%{mcrtdll=msvcrt20*:-D__MSVCRT_VERSION__=0x200} " \
+ "%{mcrtdll=msvcrt40*:-D__MSVCRT_VERSION__=0x400} " \
+ "%{mcrtdll=msvcrt-os*:-D__MSVCRT_VERSION__=0x700} " \
+ "%{mcrtdll=msvcr70*:-D__MSVCRT_VERSION__=0x700} " \
+ "%{mcrtdll=msvcr71*:-D__MSVCRT_VERSION__=0x701} " \
+ "%{mcrtdll=msvcr80*:-D__MSVCRT_VERSION__=0x800} " \
+ "%{mcrtdll=msvcr90*:-D__MSVCRT_VERSION__=0x900} " \
+ "%{mcrtdll=msvcr100*:-D__MSVCRT_VERSION__=0xA00} " \
+ "%{mcrtdll=msvcr110*:-D__MSVCRT_VERSION__=0xB00} " \
+ "%{mcrtdll=msvcr120*:-D__MSVCRT_VERSION__=0xC00} " \
+ "%{mcrtdll=ucrt*:-D_UCRT} "
/* For Windows applications, include more libraries, but always include
kernel32. */
@@ -184,11 +197,18 @@ along with GCC; see the file COPYING3. If not see
#define REAL_LIBGCC_SPEC \
"%{mthreads:-lmingwthrd} -lmingw32 \
" SHARED_LIBGCC_SPEC " \
- -lmoldname -lmingwex -lmsvcrt -lkernel32 " MCFGTHREAD_SPEC
+ %{mcrtdll=crtdll*:-lcoldname} %{!mcrtdll=crtdll*:-lmoldname} \
+ -lmingwex %{!mcrtdll=*:-lmsvcrt} %{mcrtdll=*:-l%*} \
+ -lkernel32 " MCFGTHREAD_SPEC
#undef STARTFILE_SPEC
-#define STARTFILE_SPEC "%{shared|mdll:dllcrt2%O%s} \
- %{!shared:%{!mdll:crt2%O%s}} %{pg:gcrt2%O%s} \
+#define STARTFILE_SPEC " \
+ %{shared|mdll:%{mcrtdll=crtdll*:dllcrt1%O%s}} \
+ %{shared|mdll:%{!mcrtdll=crtdll*:dllcrt2%O%s}} \
+ %{!shared:%{!mdll:%{mcrtdll=crtdll*:crt1%O%s}}} \
+ %{!shared:%{!mdll:%{!mcrtdll=crtdll*:crt2%O%s}}} \
+ %{pg:%{mcrtdll=crtdll*:gcrt1%O%s}} \
+ %{pg:%{!mcrtdll=crtdll*:gcrt2%O%s}} \
crtbegin.o%s \
%{fvtable-verify=none:%s; \
fvtable-verify=preinit:vtv_start.o%s; \
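The CPP_SPEC additions in mingw-w64.h and mingw32.h translate each -mcrtdll= value into a preprocessor macro, and the REAL_LIBGCC_SPEC changes link the matching import library instead of -lmsvcrt. A minimal sketch that keys off exactly those macros:

#include <stdio.h>

int
main (void)
{
#if defined (_UCRT)
  puts ("UCRT (-mcrtdll=ucrt*)");
#elif defined (__CRTDLL__)
  puts ("crtdll.dll (-mcrtdll=crtdll*)");
#elif defined (__MSVCRT_VERSION__)
  printf ("msvcrt-compatible CRT, __MSVCRT_VERSION__ = 0x%x\n",
          (unsigned) __MSVCRT_VERSION__);
#else
  puts ("default msvcrt");
#endif
  return 0;
}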
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 18dae03..12b103a 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -106,6 +106,10 @@
(define_mode_attr mmxintvecmodelower
[(V2SF "v2si") (V2SI "v2si") (V4HI "v4hi") (V8QI "v8qi")])
+;; Mapping of vector modes back to the scalar modes
+(define_mode_attr mmxscalarmode
+ [(V2SI "SI") (V2SF "SF")])
+
(define_mode_attr Yv_Yw
[(V8QI "Yw") (V4HI "Yw") (V2SI "Yv") (V1DI "Yv") (V2SF "Yv")])
@@ -1154,6 +1158,42 @@
DONE;
})
+(define_insn "@sse4_1_insertps_<mode>"
+ [(set (match_operand:V2FI 0 "register_operand" "=Yr,*x,v")
+ (unspec:V2FI
+ [(match_operand:V2FI 2 "nonimmediate_operand" "Yrm,*xm,vm")
+ (match_operand:V2FI 1 "register_operand" "0,0,v")
+ (match_operand:SI 3 "const_0_to_255_operand")]
+ UNSPEC_INSERTPS))]
+ "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+{
+ if (MEM_P (operands[2]))
+ {
+ unsigned count_s = INTVAL (operands[3]) >> 6;
+ if (count_s)
+ operands[3] = GEN_INT (INTVAL (operands[3]) & 0x3f);
+ operands[2] = adjust_address_nv (operands[2],
+ <mmxscalarmode>mode, count_s * 4);
+ }
+ switch (which_alternative)
+ {
+ case 0:
+ case 1:
+ return "insertps\t{%3, %2, %0|%0, %2, %3}";
+ case 2:
+ return "vinsertps\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+ default:
+ gcc_unreachable ();
+ }
+}
+ [(set_attr "isa" "noavx,noavx,avx")
+ (set_attr "type" "sselog")
+ (set_attr "prefix_data16" "1,1,*")
+ (set_attr "prefix_extra" "1")
+ (set_attr "length_immediate" "1")
+ (set_attr "prefix" "orig,orig,maybe_evex")
+ (set_attr "mode" "V4SF")])
+
(define_insn "*mmx_blendps"
[(set (match_operand:V2SF 0 "register_operand" "=Yr,*x,x")
(vec_merge:V2SF
@@ -2052,6 +2092,23 @@
(set_attr "type" "sseadd")
(set_attr "mode" "TI")])
+(define_insn "mulv2si3"
+ [(set (match_operand:V2SI 0 "register_operand" "=Yr,*x,v")
+ (mult:V2SI
+ (match_operand:V2SI 1 "register_operand" "%0,0,v")
+ (match_operand:V2SI 2 "register_operand" "Yr,*x,v")))]
+ "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+ "@
+ pmulld\t{%2, %0|%0, %2}
+ pmulld\t{%2, %0|%0, %2}
+ vpmulld\t{%2, %1, %0|%0, %1, %2}"
+ [(set_attr "isa" "noavx,noavx,avx")
+ (set_attr "type" "sseimul")
+ (set_attr "prefix_extra" "1")
+ (set_attr "prefix" "orig,orig,vex")
+ (set_attr "btver2_decode" "vector")
+ (set_attr "mode" "TI")])
+
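With the mulv2si3 pattern above, a two-element SImode vector multiply can be expanded directly to pmulld when SSE4.1 is enabled and MMX-sized values live in SSE registers. An illustrative sketch using the GNU vector extension:

typedef int v2si __attribute__ ((vector_size (8)));

/* Expected to use pmulld/vpmulld under -O2 -msse4.1 on x86-64, though the
   exact code depends on tuning.  */
v2si
mul2 (v2si a, v2si b)
{
  return a * b;
}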
(define_expand "mmx_mulv4hi3"
[(set (match_operand:V4HI 0 "register_operand")
(mult:V4HI (match_operand:V4HI 1 "register_mmxmem_operand")
@@ -2092,6 +2149,26 @@
(set_attr "type" "ssemul")
(set_attr "mode" "TI")])
+(define_expand "mulv8qi3"
+ [(set (match_operand:V8QI 0 "register_operand")
+ (mult:V8QI (match_operand:V8QI 1 "register_operand")
+ (match_operand:V8QI 2 "register_operand")))]
+ "TARGET_MMX_WITH_SSE"
+{
+ ix86_expand_vecop_qihi_partial (MULT, operands[0], operands[1], operands[2]);
+ DONE;
+})
+
+(define_expand "mulv4qi3"
+ [(set (match_operand:V4QI 0 "register_operand")
+ (mult:V4QI (match_operand:V4QI 1 "register_operand")
+ (match_operand:V4QI 2 "register_operand")))]
+ "TARGET_SSE2"
+{
+ ix86_expand_vecop_qihi_partial (MULT, operands[0], operands[1], operands[2]);
+ DONE;
+})
+
(define_expand "mmx_smulv4hi3_highpart"
[(set (match_operand:V4HI 0 "register_operand")
(truncate:V4HI
@@ -2603,6 +2680,28 @@
(const_string "0")))
(set_attr "mode" "TI")])
+(define_expand "<insn>v8qi3"
+ [(set (match_operand:V8QI 0 "register_operand")
+ (any_shift:V8QI (match_operand:V8QI 1 "register_operand")
+ (match_operand:DI 2 "nonmemory_operand")))]
+ "TARGET_MMX_WITH_SSE"
+{
+ ix86_expand_vecop_qihi_partial (<CODE>, operands[0],
+ operands[1], operands[2]);
+ DONE;
+})
+
+(define_expand "<insn>v4qi3"
+ [(set (match_operand:V4QI 0 "register_operand")
+ (any_shift:V4QI (match_operand:V4QI 1 "register_operand")
+ (match_operand:DI 2 "nonmemory_operand")))]
+ "TARGET_SSE2"
+{
+ ix86_expand_vecop_qihi_partial (<CODE>, operands[0],
+ operands[1], operands[2]);
+ DONE;
+})
+
(define_insn_and_split "<insn>v2qi3"
[(set (match_operand:V2QI 0 "register_operand" "=Q")
(any_shift:V2QI
@@ -2635,6 +2734,30 @@
[(set_attr "type" "multi")
(set_attr "mode" "QI")])
+(define_expand "v<insn>v8qi3"
+ [(set (match_operand:V8QI 0 "register_operand")
+ (any_shift:V8QI
+ (match_operand:V8QI 1 "register_operand")
+ (match_operand:V8QI 2 "register_operand")))]
+ "TARGET_AVX512BW && TARGET_AVX512VL && TARGET_MMX_WITH_SSE"
+{
+ ix86_expand_vecop_qihi_partial (<CODE>, operands[0],
+ operands[1], operands[2]);
+ DONE;
+})
+
+(define_expand "v<insn>v4qi3"
+ [(set (match_operand:V4QI 0 "register_operand")
+ (any_shift:V4QI
+ (match_operand:V4QI 1 "register_operand")
+ (match_operand:V4QI 2 "register_operand")))]
+ "TARGET_AVX512BW && TARGET_AVX512VL"
+{
+ ix86_expand_vecop_qihi_partial (<CODE>, operands[0],
+ operands[1], operands[2]);
+ DONE;
+})
+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; Parallel integral comparisons
@@ -3214,27 +3337,43 @@
;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; Used in signed and unsigned truncations with saturation.
-(define_code_iterator any_s_truncate [ss_truncate us_truncate])
-;; Instruction suffix for truncations with saturation.
-(define_code_attr s_trunsuffix [(ss_truncate "s") (us_truncate "u")])
-
-(define_insn_and_split "mmx_pack<s_trunsuffix>swb"
+(define_insn_and_split "mmx_packsswb"
[(set (match_operand:V8QI 0 "register_operand" "=y,x,Yw")
(vec_concat:V8QI
- (any_s_truncate:V4QI
+ (ss_truncate:V4QI
(match_operand:V4HI 1 "register_operand" "0,0,Yw"))
- (any_s_truncate:V4QI
+ (ss_truncate:V4QI
(match_operand:V4HI 2 "register_mmxmem_operand" "ym,x,Yw"))))]
"TARGET_MMX || TARGET_MMX_WITH_SSE"
"@
- pack<s_trunsuffix>swb\t{%2, %0|%0, %2}
+ packsswb\t{%2, %0|%0, %2}
+ #
+ #"
+ "&& reload_completed
+ && SSE_REGNO_P (REGNO (operands[0]))"
+ [(const_int 0)]
+ "ix86_split_mmx_pack (operands, SS_TRUNCATE); DONE;"
+ [(set_attr "mmx_isa" "native,sse_noavx,avx")
+ (set_attr "type" "mmxshft,sselog,sselog")
+ (set_attr "mode" "DI,TI,TI")])
+
+;; This instruction does unsigned saturation of a signed source
+;; and is different from the generic us_truncate RTX.
+(define_insn_and_split "mmx_packuswb"
+ [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yw")
+ (unspec:V8QI
+ [(match_operand:V4HI 1 "register_operand" "0,0,Yw")
+ (match_operand:V4HI 2 "register_mmxmem_operand" "ym,x,Yw")]
+ UNSPEC_US_TRUNCATE))]
+ "TARGET_MMX || TARGET_MMX_WITH_SSE"
+ "@
+ packuswb\t{%2, %0|%0, %2}
#
#"
"&& reload_completed
&& SSE_REGNO_P (REGNO (operands[0]))"
[(const_int 0)]
- "ix86_split_mmx_pack (operands, <any_s_truncate:CODE>); DONE;"
+ "ix86_split_mmx_pack (operands, US_TRUNCATE); DONE;"
[(set_attr "mmx_isa" "native,sse_noavx,avx")
(set_attr "type" "mmxshft,sselog,sselog")
(set_attr "mode" "DI,TI,TI")])
@@ -3261,11 +3400,10 @@
(define_insn_and_split "mmx_packusdw"
[(set (match_operand:V4HI 0 "register_operand" "=Yr,*x,Yw")
- (vec_concat:V4HI
- (us_truncate:V2HI
- (match_operand:V2SI 1 "register_operand" "0,0,Yw"))
- (us_truncate:V2HI
- (match_operand:V2SI 2 "register_operand" "Yr,*x,Yw"))))]
+ (unspec:V4HI
+ [(match_operand:V2SI 1 "register_operand" "0,0,Yw")
+ (match_operand:V2SI 2 "register_operand" "Yr,*x,Yw")]
+ UNSPEC_US_TRUNCATE))]
"TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
"#"
"&& reload_completed"
@@ -3454,6 +3592,18 @@
(set_attr "prefix" "orig,orig,maybe_evex")
(set_attr "mode" "TI")])
+(define_expand "<insn>v4qiv4hi2"
+ [(set (match_operand:V4HI 0 "register_operand")
+ (any_extend:V4HI
+ (match_operand:V4QI 1 "register_operand")))]
+ "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+{
+ rtx op1 = force_reg (V4QImode, operands[1]);
+ op1 = lowpart_subreg (V8QImode, op1, V4QImode);
+ emit_insn (gen_sse4_1_<code>v4qiv4hi2 (operands[0], op1));
+ DONE;
+})
+
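The <insn>v4qiv4hi2 expander gives the middle end a named pattern for widening a 4-byte QImode vector to 16-bit lanes. A hedged sketch; whether __builtin_convertvector actually reaches this expander depends on the target flags, but pmovzxbw-style code is the intent:

typedef unsigned char v4qi __attribute__ ((vector_size (4)));
typedef unsigned short v4hi __attribute__ ((vector_size (8)));

/* Zero-extend four bytes to four 16-bit lanes.  */
v4hi
widen (v4qi x)
{
  return __builtin_convertvector (x, v4hi);
}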
(define_insn "sse4_1_<code>v2hiv2si2"
[(set (match_operand:V2SI 0 "register_operand" "=Yr,*x,v")
(any_extend:V2SI
@@ -3468,6 +3618,44 @@
(set_attr "prefix" "orig,orig,maybe_evex")
(set_attr "mode" "TI")])
+(define_expand "<insn>v2hiv2si2"
+ [(set (match_operand:V2SI 0 "register_operand")
+ (any_extend:V2SI
+ (match_operand:V2HI 1 "register_operand")))]
+ "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+{
+ rtx op1 = force_reg (V2HImode, operands[1]);
+ op1 = lowpart_subreg (V4HImode, op1, V2HImode);
+ emit_insn (gen_sse4_1_<code>v2hiv2si2 (operands[0], op1));
+ DONE;
+})
+
+(define_insn "sse4_1_<code>v2qiv2si2"
+ [(set (match_operand:V2SI 0 "register_operand" "=Yr,*x,v")
+ (any_extend:V2SI
+ (vec_select:V2QI
+ (match_operand:V4QI 1 "register_operand" "Yr,*x,v")
+ (parallel [(const_int 0) (const_int 1)]))))]
+ "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+ "%vpmov<extsuffix>bd\t{%1, %0|%0, %1}"
+ [(set_attr "isa" "noavx,noavx,avx")
+ (set_attr "type" "ssemov")
+ (set_attr "prefix_extra" "1")
+ (set_attr "prefix" "orig,orig,maybe_evex")
+ (set_attr "mode" "TI")])
+
+(define_expand "<insn>v2qiv2si2"
+ [(set (match_operand:V2SI 0 "register_operand")
+ (any_extend:V2SI
+ (match_operand:V2QI 1 "register_operand")))]
+ "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+{
+ rtx op1 = force_reg (V2QImode, operands[1]);
+ op1 = lowpart_subreg (V4QImode, op1, V2QImode);
+ emit_insn (gen_sse4_1_<code>v2qiv2si2 (operands[0], op1));
+ DONE;
+})
+
(define_insn "sse4_1_<code>v2qiv2hi2"
[(set (match_operand:V2HI 0 "register_operand" "=Yr,*x,Yw")
(any_extend:V2HI
@@ -3482,6 +3670,39 @@
(set_attr "prefix" "orig,orig,maybe_evex")
(set_attr "mode" "TI")])
+(define_expand "<insn>v2qiv2hi2"
+ [(set (match_operand:V2HI 0 "register_operand")
+ (any_extend:V2HI
+ (match_operand:V2QI 1 "register_operand")))]
+ "TARGET_SSE4_1"
+{
+ rtx op1 = force_reg (V2QImode, operands[1]);
+ op1 = lowpart_subreg (V4QImode, op1, V2QImode);
+ emit_insn (gen_sse4_1_<code>v2qiv2hi2 (operands[0], op1));
+ DONE;
+})
+
+(define_insn "truncv2hiv2qi2"
+ [(set (match_operand:V2QI 0 "register_operand" "=v")
+ (truncate:V2QI
+ (match_operand:V2HI 1 "register_operand" "v")))]
+ "TARGET_AVX512VL && TARGET_AVX512BW"
+ "vpmovwb\t{%1, %0|%0, %1}"
+ [(set_attr "type" "ssemov")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "TI")])
+
+(define_mode_iterator V2QI_V2HI [V2QI V2HI])
+(define_insn "truncv2si<mode>2"
+ [(set (match_operand:V2QI_V2HI 0 "register_operand" "=v")
+ (truncate:V2QI_V2HI
+ (match_operand:V2SI 1 "register_operand" "v")))]
+ "TARGET_AVX512VL && TARGET_MMX_WITH_SSE"
+ "vpmovd<mmxvecsize>\t{%1, %0|%0, %1}"
+ [(set_attr "type" "ssemov")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "TI")])
+
;; Pack/unpack vector modes
(define_mode_attr mmxpackmode
[(V4HI "V8QI") (V2SI "V4HI")])
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index b4d9ab4..fb07707 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -32,6 +32,11 @@
(and (match_code "reg")
(match_test "GENERAL_REGNO_P (REGNO (op))")))
+;; True if the operand is an INDEX class register.
+(define_predicate "index_reg_operand"
+ (and (match_code "reg")
+ (match_test "INDEX_REGNO_P (REGNO (op))")))
+
;; True if the operand is a nonimmediate operand with GENERAL class register.
(define_predicate "nonimmediate_gr_operand"
(if_then_else (match_code "reg")
@@ -686,25 +691,27 @@
return true;
})
-;; P6 processors will jump to the address after the decrement when %esp
-;; is used as a call operand, so they will execute return address as a code.
-;; See Pentium Pro errata 70, Pentium 2 errata A33 and Pentium 3 errata E17.
-
-(define_predicate "call_register_no_elim_operand"
+;; True for any non-virtual and non-eliminable register. Used in places where
+;; instantiation of such a register may cause the pattern to not be recognized.
+(define_predicate "register_no_elim_operand"
(match_operand 0 "register_operand")
{
if (SUBREG_P (op))
op = SUBREG_REG (op);
- if (!TARGET_64BIT && op == stack_pointer_rtx)
- return false;
+ /* Before reload, we can allow (SUBREG (MEM...)) as a register operand
+ because it is guaranteed to be reloaded into one. */
+ if (MEM_P (op))
+ return true;
- return register_no_elim_operand (op, mode);
+ return !(op == arg_pointer_rtx
+ || op == frame_pointer_rtx
+ || VIRTUAL_REGISTER_P (op));
})
-;; True for any non-virtual or eliminable register. Used in places where
-;; instantiation of such a register may cause the pattern to not be recognized.
-(define_predicate "register_no_elim_operand"
+;; Similarly, but include the stack pointer. This is used
+;; to prevent esp from being used as an index reg.
+(define_predicate "register_no_SP_operand"
(match_operand 0 "register_operand")
{
if (SUBREG_P (op))
@@ -717,23 +724,18 @@
return !(op == arg_pointer_rtx
|| op == frame_pointer_rtx
- || IN_RANGE (REGNO (op),
- FIRST_PSEUDO_REGISTER, LAST_VIRTUAL_REGISTER));
+ || op == stack_pointer_rtx
+ || VIRTUAL_REGISTER_P (op));
})
-;; Similarly, but include the stack pointer. This is used to prevent esp
-;; from being used as an index reg.
-(define_predicate "index_register_operand"
- (match_operand 0 "register_operand")
-{
- if (SUBREG_P (op))
- op = SUBREG_REG (op);
+;; P6 processors will jump to the address after the decrement when %esp
+;; is used as a call operand, so they will execute return address as a code.
+;; See Pentium Pro errata 70, Pentium 2 errata A33 and Pentium 3 errata E17.
- if (reload_completed)
- return REG_OK_FOR_INDEX_STRICT_P (op);
- else
- return REG_OK_FOR_INDEX_NONSTRICT_P (op);
-})
+(define_predicate "call_register_operand"
+ (if_then_else (match_test "TARGET_64BIT")
+ (match_operand 0 "register_operand")
+ (match_operand 0 "register_no_SP_operand")))
;; Return false if this is any eliminable register. Otherwise general_operand.
(define_predicate "general_no_elim_operand"
@@ -790,7 +792,7 @@
(define_special_predicate "call_insn_operand"
(ior (match_test "constant_call_address_operand
(op, mode == VOIDmode ? mode : Pmode)")
- (match_operand 0 "call_register_no_elim_operand")
+ (match_operand 0 "call_register_operand")
(and (not (match_test "TARGET_INDIRECT_BRANCH_REGISTER"))
(ior (and (not (match_test "TARGET_X32"))
(match_operand 0 "memory_operand"))
@@ -1684,6 +1686,9 @@
(define_predicate "compare_operator"
(match_code "compare"))
+(define_predicate "extract_operator"
+ (match_code "zero_extract,sign_extract"))
+
;; Return true if OP is a memory operand, aligned to
;; less than its natural alignment.
(define_predicate "misaligned_operand"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 172ec3b..f793258 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -38,7 +38,6 @@
UNSPEC_INSERTQ
;; For SSE4.1 support
- UNSPEC_INSERTPS
UNSPEC_DP
UNSPEC_MOVNTDQA
UNSPEC_MPSADBW
@@ -275,12 +274,6 @@
V32HF (V16HF "TARGET_AVX512VL") (V8HF "TARGET_AVX512VL")
V32BF (V16BF "TARGET_AVX512VL") (V8BF "TARGET_AVX512VL")])
-;; Same iterator, but without supposed TARGET_AVX512BW
-(define_mode_iterator VI12_AVX512VLBW
- [(V64QI "TARGET_AVX512BW") (V16QI "TARGET_AVX512VL")
- (V32QI "TARGET_AVX512VL && TARGET_AVX512BW") (V32HI "TARGET_AVX512BW")
- (V16HI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL")])
-
(define_mode_iterator VI1_AVX512VL
[V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL")])
@@ -298,6 +291,9 @@
(define_mode_iterator V_128
[V16QI V8HI V4SI V2DI V4SF (V2DF "TARGET_SSE2")])
+(define_mode_iterator V_128H
+ [V16QI V8HI V8HF V8BF V4SI V2DI V4SF (V2DF "TARGET_SSE2")])
+
;; All 256bit vector modes
(define_mode_iterator V_256
[V32QI V16HI V8SI V4DI V8SF V4DF])
@@ -451,8 +447,9 @@
[(V32HF "TARGET_AVX512FP16") (V16HF "TARGET_AVX512FP16")
(V8HF "TARGET_AVX512FP16") V32BF V16BF V8BF])
-(define_mode_iterator VF_AVX512BWHFBF16
- [V32HF V16HF V8HF V32BF V16BF V8BF])
+(define_mode_iterator VF_AVX512HFBFVL
+ [V32HF (V16HF "TARGET_AVX512VL") (V8HF "TARGET_AVX512VL")
+ V32BF (V16BF "TARGET_AVX512VL") (V8BF "TARGET_AVX512VL")])
(define_mode_iterator VF_AVX512FP16VL
[V32HF (V16HF "TARGET_AVX512VL") (V8HF "TARGET_AVX512VL")])
@@ -686,10 +683,6 @@
(define_mode_iterator VF4_128_8_256
[V4DF V4SF])
-(define_mode_iterator VI1_AVX512VLBW
- [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX512VL")
- (V16QI "TARGET_AVX512VL")])
-
(define_mode_attr avx512
[(V16QI "avx512vl") (V32QI "avx512vl") (V64QI "avx512bw")
(V8HI "avx512vl") (V16HI "avx512vl") (V32HI "avx512bw")
@@ -867,16 +860,15 @@
(V4DI "TARGET_AVX512VL") (V4DF "TARGET_AVX512VL")
(V4SI "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")
(V2DI "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")])
-(define_mode_iterator VI12_VI48F_AVX512VLBW
+(define_mode_iterator VI12_VI48F_AVX512VL
[(V16SI "TARGET_AVX512F") (V16SF "TARGET_AVX512F")
(V8DI "TARGET_AVX512F") (V8DF "TARGET_AVX512F")
(V8SI "TARGET_AVX512VL") (V8SF "TARGET_AVX512VL")
(V4DI "TARGET_AVX512VL") (V4DF "TARGET_AVX512VL")
(V4SI "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")
(V2DI "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")
- (V64QI "TARGET_AVX512BW") (V16QI "TARGET_AVX512VL")
- (V32QI "TARGET_AVX512VL && TARGET_AVX512BW") (V32HI "TARGET_AVX512BW")
- (V16HI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL")])
+ V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL")
+ V32HI (V16HI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL")])
(define_mode_iterator VI48F_256 [V8SI V8SF V4DI V4DF])
@@ -1087,6 +1079,12 @@
(V8DI "v8hf") (V4DI "v4hf") (V2DI "v2hf")
(V8DF "v8hf") (V16SF "v16hf") (V8SF "v8hf")])
+
+;; Mapping of vector modes to packed vector hf modes of the same size.
+(define_mode_attr ssepackPHmode
+ [(V16SI "V32HF") (V8SI "V16HF") (V4SI "V8HF")
+ (V16SF "V32HF") (V8SF "V16HF") (V4SF "V8HF")])
+
;; Mapping of vector modes to packed single mode of the same size
(define_mode_attr ssePSmode
[(V16SI "V16SF") (V8DF "V16SF")
@@ -1335,7 +1333,7 @@
(define_insn "mov<mode>_internal"
[(set (match_operand:VMOVE 0 "nonimmediate_operand"
- "=v,v ,v,v ,m")
+ "=v,v ,x,v ,m")
(match_operand:VMOVE 1 "nonimmediate_or_sse_const_operand"
" C,<sseconstm1>,BH,vm,v"))]
"TARGET_SSE
@@ -1597,10 +1595,10 @@
(set_attr "mode" "<sseinsnmode>")])
(define_insn "<avx512>_blendm<mode>"
- [(set (match_operand:VF_AVX512BWHFBF16 0 "register_operand" "=v,v")
- (vec_merge:VF_AVX512BWHFBF16
- (match_operand:VF_AVX512BWHFBF16 2 "nonimmediate_operand" "vm,vm")
- (match_operand:VF_AVX512BWHFBF16 1 "nonimm_or_0_operand" "0C,v")
+ [(set (match_operand:VF_AVX512HFBFVL 0 "register_operand" "=v,v")
+ (vec_merge:VF_AVX512HFBFVL
+ (match_operand:VF_AVX512HFBFVL 2 "nonimmediate_operand" "vm,vm")
+ (match_operand:VF_AVX512HFBFVL 1 "nonimm_or_0_operand" "0C,v")
(match_operand:<avx512fmaskmode> 3 "register_operand" "Yk,Yk")))]
"TARGET_AVX512BW"
"@
@@ -4557,10 +4555,6 @@
DONE;
})
-(define_mode_iterator VF_AVX512HFBFVL
- [V32HF (V16HF "TARGET_AVX512VL") (V8HF "TARGET_AVX512VL")
- V32BF (V16BF "TARGET_AVX512VL") (V8BF "TARGET_AVX512VL")])
-
(define_expand "vcond<mode><sseintvecmodelower>"
[(set (match_operand:VF_AVX512HFBFVL 0 "register_operand")
(if_then_else:VF_AVX512HFBFVL
@@ -6933,6 +6927,61 @@
(V16SF "") (V8SF "{y}") (V4SF "{x}")
(V8DF "{z}") (V4DF "{y}") (V2DF "{x}")])
+(define_mode_attr vunpck_extract_mode
+ [(V32HF "v32hf") (V16HF "v16hf") (V8HF "v16hf")])
+
+(define_expand "vec_unpacks_lo_<mode>"
+ [(match_operand:<ssePSmode> 0 "register_operand")
+ (match_operand:VF_AVX512FP16VL 1 "register_operand")]
+ "TARGET_AVX512FP16"
+{
+ rtx tem = operands[1];
+ rtx (*gen) (rtx, rtx);
+ if (<MODE>mode != V8HFmode)
+ {
+ tem = gen_reg_rtx (<ssehalfvecmode>mode);
+ emit_insn (gen_vec_extract_lo_<vunpck_extract_mode> (tem,
+ operands[1]));
+ gen = gen_extend<ssehalfvecmodelower><ssePSmodelower>2;
+ }
+ else
+ gen = gen_avx512fp16_float_extend_phv4sf2;
+
+ emit_insn (gen (operands[0], tem));
+ DONE;
+})
+
+(define_expand "vec_unpacks_hi_<mode>"
+ [(match_operand:<ssePSmode> 0 "register_operand")
+ (match_operand:VF_AVX512FP16VL 1 "register_operand")]
+ "TARGET_AVX512FP16"
+{
+ rtx tem = operands[1];
+ rtx (*gen) (rtx, rtx);
+ if (<MODE>mode != V8HFmode)
+ {
+ tem = gen_reg_rtx (<ssehalfvecmode>mode);
+ emit_insn (gen_vec_extract_hi_<vunpck_extract_mode> (tem,
+ operands[1]));
+ gen = gen_extend<ssehalfvecmodelower><ssePSmodelower>2;
+ }
+ else
+ {
+ tem = gen_reg_rtx (V8HFmode);
+ rtvec tmp = rtvec_alloc (8);
+ for (int i = 0; i != 8; i++)
+ RTVEC_ELT (tmp, i) = GEN_INT ((i + 4) % 8);
+
+ rtx selector = gen_rtx_PARALLEL (VOIDmode, tmp);
+ emit_move_insn (tem,
+ gen_rtx_VEC_SELECT (V8HFmode, operands[1], selector));
+ gen = gen_avx512fp16_float_extend_phv4sf2;
+ }
+
+ emit_insn (gen (operands[0], tem));
+ DONE;
+})
+
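vec_unpacks_lo_<mode> and vec_unpacks_hi_<mode> are the standard names the vectorizer queries when widening _Float16 elements to float. A sketch of a loop they enable (illustrative; assumes something like -O3 -mavx512fp16 -mavx512vl):

void
widen_hf_to_sf (float *restrict dst, const _Float16 *restrict src, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = src[i];
}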
(define_insn "avx512fp16_vcvtph2<sseintconvertsignprefix><sseintconvert>_<mode><mask_name><round_name>"
[(set (match_operand:VI248_AVX512VL 0 "register_operand" "=v")
(unspec:VI248_AVX512VL
@@ -7850,7 +7899,7 @@
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "<sseinsnmode>")])
-(define_insn "ufloat<sseintvecmodelower><mode>2<mask_name><round_name>"
+(define_insn "<mask_codefor>floatuns<sseintvecmodelower><mode>2<mask_name><round_name>"
[(set (match_operand:VF1_AVX512VL 0 "register_operand" "=v")
(unsigned_float:VF1_AVX512VL
(match_operand:<sseintvecmode> 1 "nonimmediate_operand" "<round_constraint>")))]
@@ -7861,24 +7910,16 @@
(set_attr "mode" "<MODE>")])
(define_expand "floatuns<sseintvecmodelower><mode>2"
- [(match_operand:VF1 0 "register_operand")
- (match_operand:<sseintvecmode> 1 "register_operand")]
+ [(set (match_operand:VF1 0 "register_operand")
+ (unsigned_float:VF1
+ (match_operand:<sseintvecmode> 1 "register_operand")))]
"TARGET_SSE2 && (<MODE>mode == V4SFmode || TARGET_AVX2)"
{
- if (<MODE>mode == V16SFmode)
- emit_insn (gen_ufloatv16siv16sf2 (operands[0], operands[1]));
- else
- if (TARGET_AVX512VL)
- {
- if (<MODE>mode == V4SFmode)
- emit_insn (gen_ufloatv4siv4sf2 (operands[0], operands[1]));
- else
- emit_insn (gen_ufloatv8siv8sf2 (operands[0], operands[1]));
- }
- else
- ix86_expand_vector_convert_uns_vsivsf (operands[0], operands[1]);
-
- DONE;
+ if (<MODE>mode != V16SFmode && !TARGET_AVX512VL)
+ {
+ ix86_expand_vector_convert_uns_vsivsf (operands[0], operands[1]);
+ DONE;
+ }
})
@@ -7913,7 +7954,7 @@
(set_attr "prefix" "evex")
(set_attr "mode" "XI")])
-(define_insn "<mask_codefor><avx512>_ufix_notrunc<sf2simodelower><mode><mask_name><round_name>"
+(define_insn "<mask_codefor><avx512>_fixuns_notrunc<sf2simodelower><mode><mask_name><round_name>"
[(set (match_operand:VI4_AVX512VL 0 "register_operand" "=v")
(unspec:VI4_AVX512VL
[(match_operand:<ssePSmode> 1 "nonimmediate_operand" "<round_constraint>")]
@@ -7970,7 +8011,7 @@
(set_attr "prefix" "evex")
(set_attr "mode" "TI")])
-(define_insn "<fixsuffix>fix_truncv16sfv16si2<mask_name><round_saeonly_name>"
+(define_insn "fix<fixunssuffix>_truncv16sfv16si2<mask_name><round_saeonly_name>"
[(set (match_operand:V16SI 0 "register_operand" "=v")
(any_fix:V16SI
(match_operand:V16SF 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")))]
@@ -8010,22 +8051,21 @@
(set_attr "mode" "TI")])
(define_expand "fixuns_trunc<mode><sseintvecmodelower>2"
- [(match_operand:<sseintvecmode> 0 "register_operand")
- (match_operand:VF1 1 "register_operand")]
+ [(set (match_operand:<sseintvecmode> 0 "register_operand")
+ (unsigned_fix:<sseintvecmode>
+ (match_operand:VF1_128_256 1 "register_operand")))]
"TARGET_SSE2"
{
- if (<MODE>mode == V16SFmode)
- emit_insn (gen_ufix_truncv16sfv16si2 (operands[0],
- operands[1]));
- else
+  /* AVX512 supports vcvttps2udq for all 128/256/512-bit vectors. */
+ if (!TARGET_AVX512VL)
{
rtx tmp[3];
tmp[0] = ix86_expand_adjust_ufix_to_sfix_si (operands[1], &tmp[2]);
tmp[1] = gen_reg_rtx (<sseintvecmode>mode);
emit_insn (gen_fix_trunc<mode><sseintvecmodelower>2 (tmp[1], tmp[0]));
emit_insn (gen_xor<sseintvecmodelower>3 (operands[0], tmp[1], tmp[2]));
+ DONE;
}
- DONE;
})
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -8338,11 +8378,17 @@
})
(define_mode_attr vpckfloat_concat_mode
- [(V8DI "v16sf") (V4DI "v8sf") (V2DI "v8sf")])
+ [(V8DI "v16sf") (V4DI "v8sf") (V2DI "v8sf")
+ (V16SI "v32hf") (V8SI "v16hf") (V4SI "v16hf")
+ (V16SF "v32hf") (V8SF "v16hf") (V4SF "v16hf")])
(define_mode_attr vpckfloat_temp_mode
- [(V8DI "V8SF") (V4DI "V4SF") (V2DI "V4SF")])
+ [(V8DI "V8SF") (V4DI "V4SF") (V2DI "V4SF")
+ (V16SI "V16HF") (V8SI "V8HF") (V4SI "V8HF")
+ (V16SF "V16HF") (V8SF "V8HF") (V4SF "V8HF")])
(define_mode_attr vpckfloat_op_mode
- [(V8DI "v8sf") (V4DI "v4sf") (V2DI "v2sf")])
+ [(V8DI "v8sf") (V4DI "v4sf") (V2DI "v2sf")
+ (V16SI "v16hf") (V8SI "v8hf") (V4SI "v4hf")
+ (V16SF "v16hf") (V8SF "v8hf") (V4SF "v4hf")])
(define_expand "vec_pack<floatprefix>_float_<mode>"
[(match_operand:<ssePSmode> 0 "register_operand")
@@ -8369,6 +8415,31 @@
DONE;
})
+(define_expand "vec_pack<floatprefix>_float_<mode>"
+ [(match_operand:<ssepackPHmode> 0 "register_operand")
+ (any_float:<ssepackPHmode>
+ (match_operand:VI4_AVX512VL 1 "register_operand"))
+ (match_operand:VI4_AVX512VL 2 "register_operand")]
+ "TARGET_AVX512FP16"
+{
+ rtx r1 = gen_reg_rtx (<vpckfloat_temp_mode>mode);
+ rtx r2 = gen_reg_rtx (<vpckfloat_temp_mode>mode);
+ rtx (*gen) (rtx, rtx);
+
+ if (<MODE>mode == V4SImode)
+ gen = gen_avx512fp16_float<floatunssuffix>v4siv4hf2;
+ else
+ gen = gen_float<floatunssuffix><mode><vpckfloat_op_mode>2;
+ emit_insn (gen (r1, operands[1]));
+ emit_insn (gen (r2, operands[2]));
+ if (<MODE>mode == V4SImode)
+ emit_insn (gen_sse_movlhps_v8hf (operands[0], r1, r2));
+ else
+ emit_insn (gen_avx_vec_concat<vpckfloat_concat_mode> (operands[0],
+ r1, r2));
+ DONE;
+})
+
(define_expand "float<floatunssuffix>v2div2sf2_mask"
[(set (match_operand:V4SF 0 "register_operand" "=v")
(vec_concat:V4SF
@@ -8413,7 +8484,7 @@
(set_attr "prefix" "evex")
(set_attr "mode" "V4SF")])
-(define_insn "ufloat<si2dfmodelower><mode>2<mask_name>"
+(define_insn "floatuns<si2dfmodelower><mode>2<mask_name>"
[(set (match_operand:VF2_512_256VL 0 "register_operand" "=v")
(unsigned_float:VF2_512_256VL
(match_operand:<si2dfmode> 1 "nonimmediate_operand" "vm")))]
@@ -8423,7 +8494,7 @@
(set_attr "prefix" "evex")
(set_attr "mode" "<MODE>")])
-(define_insn "ufloatv2siv2df2<mask_name>"
+(define_insn "<mask_codefor>floatunsv2siv2df2<mask_name>"
[(set (match_operand:V2DF 0 "register_operand" "=v")
(unsigned_float:V2DF
(vec_select:V2SI
@@ -8572,11 +8643,11 @@
(set_attr "prefix" "evex")
(set_attr "mode" "TI")])
-;; For ufix_notrunc* insn patterns
+;; For fixuns_notrunc* insn patterns
(define_mode_attr pd2udqsuff
[(V8DF "") (V4DF "{y}")])
-(define_insn "ufix_notrunc<mode><si2dfmodelower>2<mask_name><round_name>"
+(define_insn "fixuns_notrunc<mode><si2dfmodelower>2<mask_name><round_name>"
[(set (match_operand:<si2dfmode> 0 "register_operand" "=v")
(unspec:<si2dfmode>
[(match_operand:VF2_512_256VL 1 "nonimmediate_operand" "<round_constraint>")]
@@ -8587,7 +8658,7 @@
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
-(define_insn "ufix_notruncv2dfv2si2"
+(define_insn "fixuns_notruncv2dfv2si2"
[(set (match_operand:V4SI 0 "register_operand" "=v")
(vec_concat:V4SI
(unspec:V2SI
@@ -8600,7 +8671,7 @@
(set_attr "prefix" "evex")
(set_attr "mode" "TI")])
-(define_insn "ufix_notruncv2dfv2si2_mask"
+(define_insn "fixuns_notruncv2dfv2si2_mask"
[(set (match_operand:V4SI 0 "register_operand" "=v")
(vec_concat:V4SI
(vec_merge:V2SI
@@ -8618,7 +8689,7 @@
(set_attr "prefix" "evex")
(set_attr "mode" "TI")])
-(define_insn "*ufix_notruncv2dfv2si2_mask_1"
+(define_insn "*fixuns_notruncv2dfv2si2_mask_1"
[(set (match_operand:V4SI 0 "register_operand" "=v")
(vec_concat:V4SI
(vec_merge:V2SI
@@ -8644,7 +8715,7 @@
(set_attr "prefix" "evex")
(set_attr "mode" "OI")])
-(define_insn "ufix_truncv2dfv2si2"
+(define_insn "*fixuns_truncv2dfv2si2"
[(set (match_operand:V4SI 0 "register_operand" "=v")
(vec_concat:V4SI
(unsigned_fix:V2SI (match_operand:V2DF 1 "nonimmediate_operand" "vm"))
@@ -8655,7 +8726,7 @@
(set_attr "prefix" "evex")
(set_attr "mode" "TI")])
-(define_insn "ufix_truncv2dfv2si2_mask"
+(define_insn "fixuns_truncv2dfv2si2_mask"
[(set (match_operand:V4SI 0 "register_operand" "=v")
(vec_concat:V4SI
(vec_merge:V2SI
@@ -8671,7 +8742,7 @@
(set_attr "prefix" "evex")
(set_attr "mode" "TI")])
-(define_insn "*ufix_truncv2dfv2si2_mask_1"
+(define_insn "*fixuns_truncv2dfv2si2_mask_1"
[(set (match_operand:V4SI 0 "register_operand" "=v")
(vec_concat:V4SI
(vec_merge:V2SI
@@ -8694,7 +8765,7 @@
(set_attr "prefix" "maybe_evex")
(set_attr "mode" "OI")])
-(define_insn "ufix_truncv4dfv4si2<mask_name>"
+(define_insn "fixuns_truncv4dfv4si2<mask_name>"
[(set (match_operand:V4SI 0 "register_operand" "=v")
(unsigned_fix:V4SI (match_operand:V4DF 1 "nonimmediate_operand" "vm")))]
"TARGET_AVX512VL && TARGET_AVX512F"
@@ -8724,7 +8795,7 @@
(set_attr "prefix" "evex")
(set_attr "mode" "<sseintvecmode2>")])
-(define_insn "ufix_notrunc<mode><sseintvecmodelower>2<mask_name><round_name>"
+(define_insn "fixuns_notrunc<mode><sseintvecmodelower>2<mask_name><round_name>"
[(set (match_operand:<sseintvecmode> 0 "register_operand" "=v")
(unspec:<sseintvecmode>
[(match_operand:VF2_AVX512VL 1 "nonimmediate_operand" "<round_constraint>")]
@@ -8771,11 +8842,14 @@
})
(define_mode_attr vunpckfixt_mode
- [(V16SF "V8DI") (V8SF "V4DI") (V4SF "V2DI")])
+ [(V16SF "V8DI") (V8SF "V4DI") (V4SF "V2DI")
+ (V32HF "V16SI") (V16HF "V8SI") (V8HF "V4SI")])
(define_mode_attr vunpckfixt_model
- [(V16SF "v8di") (V8SF "v4di") (V4SF "v2di")])
+ [(V16SF "v8di") (V8SF "v4di") (V4SF "v2di")
+ (V32HF "v16si") (V16HF "v8si") (V8HF "v4si")])
(define_mode_attr vunpckfixt_extract_mode
- [(V16SF "v16sf") (V8SF "v8sf") (V4SF "v8sf")])
+ [(V16SF "v16sf") (V8SF "v8sf") (V4SF "v8sf")
+ (V32HF "v32hf") (V16HF "v16hf") (V8HF "v16hf")])
(define_expand "vec_unpack_<fixprefix>fix_trunc_lo_<mode>"
[(match_operand:<vunpckfixt_mode> 0 "register_operand")
@@ -8827,7 +8901,61 @@
DONE;
})
-(define_insn "ufix_trunc<mode><sseintvecmodelower>2<mask_name>"
+(define_expand "vec_unpack_<fixprefix>fix_trunc_lo_<mode>"
+ [(match_operand:<vunpckfixt_mode> 0 "register_operand")
+ (any_fix:<vunpckfixt_mode>
+ (match_operand:VF_AVX512FP16VL 1 "register_operand"))]
+ "TARGET_AVX512FP16"
+{
+ rtx tem = operands[1];
+ rtx (*gen) (rtx, rtx);
+ if (<MODE>mode != V8HFmode)
+ {
+ tem = gen_reg_rtx (<ssehalfvecmode>mode);
+ emit_insn (gen_vec_extract_lo_<vunpckfixt_extract_mode> (tem,
+ operands[1]));
+ gen = gen_fix<fixunssuffix>_trunc<ssehalfvecmodelower><vunpckfixt_model>2;
+ }
+ else
+ gen = gen_avx512fp16_fix<fixunssuffix>_trunc<vunpckfixt_model>2;
+
+ emit_insn (gen (operands[0], tem));
+ DONE;
+})
+
+(define_expand "vec_unpack_<fixprefix>fix_trunc_hi_<mode>"
+ [(match_operand:<vunpckfixt_mode> 0 "register_operand")
+ (any_fix:<vunpckfixt_mode>
+ (match_operand:VF_AVX512FP16VL 1 "register_operand"))]
+ "TARGET_AVX512FP16"
+{
+ rtx tem = operands[1];
+ rtx (*gen) (rtx, rtx);
+ if (<MODE>mode != V8HFmode)
+ {
+ tem = gen_reg_rtx (<ssehalfvecmode>mode);
+ emit_insn (gen_vec_extract_hi_<vunpckfixt_extract_mode> (tem,
+ operands[1]));
+ gen = gen_fix<fixunssuffix>_trunc<ssehalfvecmodelower><vunpckfixt_model>2;
+ }
+ else
+ {
+ tem = gen_reg_rtx (V8HFmode);
+ rtvec tmp = rtvec_alloc (8);
+ for (int i = 0; i != 8; i++)
+ RTVEC_ELT (tmp, i) = GEN_INT ((i + 4) % 8);
+
+ rtx selector = gen_rtx_PARALLEL (VOIDmode, tmp);
+ emit_move_insn (tem,
+ gen_rtx_VEC_SELECT (V8HFmode, operands[1], selector));
+ gen = gen_avx512fp16_fix<fixunssuffix>_trunc<vunpckfixt_model>2;
+ }
+
+ emit_insn (gen (operands[0], tem));
+ DONE;
+})
+
+(define_insn "<mask_codefor>fixuns_trunc<mode><sseintvecmodelower>2<mask_name>"
[(set (match_operand:<sseintvecmode> 0 "register_operand" "=v")
(unsigned_fix:<sseintvecmode>
(match_operand:VF1_128_256VL 1 "nonimmediate_operand" "vm")))]
@@ -9640,6 +9768,31 @@
operands[4] = gen_reg_rtx (<sf2dfmode>mode);
})
+(define_expand "vec_pack_trunc_<mode>"
+ [(match_operand:<ssepackPHmode> 0 "register_operand")
+ (match_operand:VF1_AVX512VL 1 "register_operand")
+ (match_operand:VF1_AVX512VL 2 "register_operand")]
+ "TARGET_AVX512FP16"
+{
+ rtx r1 = gen_reg_rtx (<vpckfloat_temp_mode>mode);
+ rtx r2 = gen_reg_rtx (<vpckfloat_temp_mode>mode);
+ rtx (*gen) (rtx, rtx);
+
+ if (<MODE>mode == V4SFmode)
+ gen = gen_avx512fp16_truncv4sfv4hf2;
+ else
+ gen = gen_trunc<mode><vpckfloat_op_mode>2;
+ emit_insn (gen (r1, operands[1]));
+ emit_insn (gen (r2, operands[2]));
+ if (<MODE>mode == V4SFmode)
+ emit_insn (gen_sse_movlhps_v8hf (operands[0], r1, r2));
+ else
+ emit_insn (gen_avx_vec_concat<vpckfloat_concat_mode> (operands[0],
+ r1, r2));
+  DONE;
+})
+
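vec_pack_trunc_<mode> covers the opposite direction, narrowing float vectors to _Float16. A matching sketch under the same assumptions as the widening example above:

void
narrow_sf_to_hf (_Float16 *restrict dst, const float *restrict src, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = src[i];
}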
(define_expand "vec_pack_trunc_v2df"
[(match_operand:V4SF 0 "register_operand")
(match_operand:V2DF 1 "vector_operand")
@@ -9945,6 +10098,27 @@
(set_attr "prefix" "orig,maybe_evex,orig,maybe_evex,maybe_vex")
(set_attr "mode" "V4SF,V4SF,V2SF,V2SF,V2SF")])
+(define_insn "sse_movlhps_<mode>"
+ [(set (match_operand:V8_128 0 "nonimmediate_operand" "=x,v,x,o")
+ (vec_select:V8_128
+ (vec_concat:<ssedoublevecmode>
+ (match_operand:V8_128 1 "nonimmediate_operand" " 0,v,0,0")
+ (match_operand:V8_128 2 "nonimmediate_operand" " x,v,m,v"))
+ (parallel [(const_int 0) (const_int 1)
+ (const_int 2) (const_int 3)
+ (const_int 8) (const_int 9)
+ (const_int 10) (const_int 11)])))]
+ "TARGET_SSE && ix86_binary_operator_ok (UNKNOWN, <MODE>mode, operands)"
+ "@
+ movlhps\t{%2, %0|%0, %2}
+ vpunpcklqdq\t{%2, %1, %0|%0, %1, %2}
+ movhps\t{%2, %0|%0, %q2}
+ %vmovlps\t{%2, %H0|%H0, %2}"
+ [(set_attr "isa" "noavx,avx,noavx,*")
+ (set_attr "type" "ssemov")
+ (set_attr "prefix" "orig,maybe_evex,orig,maybe_vex")
+ (set_attr "mode" "V4SF,TI,V2SF,V2SF")])
+
(define_insn "<mask_codefor>avx512f_unpckhps512<mask_name>"
[(set (match_operand:V16SF 0 "register_operand" "=v")
(vec_select:V16SF
@@ -10968,12 +11142,13 @@
DONE;
})
-(define_insn "sse4_1_insertps"
- [(set (match_operand:V4SF 0 "register_operand" "=Yr,*x,v")
- (unspec:V4SF [(match_operand:V4SF 2 "nonimmediate_operand" "Yrm,*xm,vm")
- (match_operand:V4SF 1 "register_operand" "0,0,v")
- (match_operand:SI 3 "const_0_to_255_operand")]
- UNSPEC_INSERTPS))]
+(define_insn "@sse4_1_insertps_<mode>"
+ [(set (match_operand:VI4F_128 0 "register_operand" "=Yr,*x,v")
+ (unspec:VI4F_128
+ [(match_operand:VI4F_128 2 "nonimmediate_operand" "Yrm,*xm,vm")
+ (match_operand:VI4F_128 1 "register_operand" "0,0,v")
+ (match_operand:SI 3 "const_0_to_255_operand")]
+ UNSPEC_INSERTPS))]
"TARGET_SSE4_1"
{
if (MEM_P (operands[2]))
@@ -10981,7 +11156,8 @@
unsigned count_s = INTVAL (operands[3]) >> 6;
if (count_s)
operands[3] = GEN_INT (INTVAL (operands[3]) & 0x3f);
- operands[2] = adjust_address_nv (operands[2], SFmode, count_s * 4);
+ operands[2] = adjust_address_nv (operands[2],
+ <ssescalarmode>mode, count_s * 4);
}
switch (which_alternative)
{
@@ -12421,22 +12597,35 @@
(set_attr "mode" "<sseinsnmode>")])
(define_insn "*<avx512>_vternlog<mode>_all"
- [(set (match_operand:V 0 "register_operand" "=v")
+ [(set (match_operand:V 0 "register_operand" "=v,v")
(unspec:V
- [(match_operand:V 1 "register_operand" "0")
- (match_operand:V 2 "register_operand" "v")
- (match_operand:V 3 "bcst_vector_operand" "vmBr")
+ [(match_operand:V 1 "register_operand" "0,0")
+ (match_operand:V 2 "register_operand" "v,v")
+ (match_operand:V 3 "bcst_vector_operand" "vBr,m")
(match_operand:SI 4 "const_0_to_255_operand")]
UNSPEC_VTERNLOG))]
- "TARGET_AVX512F
+ "(<MODE_SIZE> == 64 || TARGET_AVX512VL
+ || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
/* Disallow embedded broadcast for vector HFmode since
it's not a real AVX512FP16 instruction. */
&& (GET_MODE_SIZE (GET_MODE_INNER (<MODE>mode)) >= 4
|| GET_CODE (operands[3]) != VEC_DUPLICATE)"
- "vpternlog<ternlogsuffix>\t{%4, %3, %2, %0|%0, %2, %3, %4}"
+{
+ if (TARGET_AVX512VL)
+ return "vpternlog<ternlogsuffix>\t{%4, %3, %2, %0|%0, %2, %3, %4}";
+ else
+ return "vpternlog<ternlogsuffix>\t{%4, %g3, %g2, %g0|%g0, %g2, %g3, %4}";
+}
[(set_attr "type" "sselog")
(set_attr "prefix" "evex")
- (set_attr "mode" "<sseinsnmode>")])
+ (set (attr "mode")
+ (if_then_else (match_test "TARGET_AVX512VL")
+ (const_string "<sseinsnmode>")
+ (const_string "XI")))
+ (set (attr "enabled")
+ (if_then_else (eq_attr "alternative" "1")
+ (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
+ (const_string "*")))])
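
A minimal sketch, not part of the patch: three-input bitwise operations like this select can be combined into a single vpternlog by the patterns above, and with the relaxed condition the 128/256-bit cases may now go through the 512-bit form (%g operands) when AVX512VL is unavailable. Function name hypothetical, assuming -mavx2 plus AVX512F.

#include <immintrin.h>

__m256i
bit_select (__m256i mask, __m256i a, __m256i b)
{
  /* (mask & a) | (~mask & b): a classic three-input logic pattern.  */
  return _mm256_or_si256 (_mm256_and_si256 (mask, a),
			  _mm256_andnot_si256 (mask, b));
}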
;; There must be lots of other combinations like
;;
@@ -12465,7 +12654,8 @@
(any_logic2:V
(match_operand:V 3 "regmem_or_bitnot_regmem_operand")
(match_operand:V 4 "regmem_or_bitnot_regmem_operand"))))]
- "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
+ "(<MODE_SIZE> == 64 || TARGET_AVX512VL
+ || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
&& ix86_pre_reload_split ()
&& (rtx_equal_p (STRIP_UNARY (operands[1]),
STRIP_UNARY (operands[4]))
@@ -12549,7 +12739,8 @@
(match_operand:V 2 "regmem_or_bitnot_regmem_operand"))
(match_operand:V 3 "regmem_or_bitnot_regmem_operand"))
(match_operand:V 4 "regmem_or_bitnot_regmem_operand")))]
- "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
+ "(<MODE_SIZE> == 64 || TARGET_AVX512VL
+ || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
&& ix86_pre_reload_split ()
&& (rtx_equal_p (STRIP_UNARY (operands[1]),
STRIP_UNARY (operands[4]))
@@ -12632,7 +12823,8 @@
(match_operand:V 1 "regmem_or_bitnot_regmem_operand")
(match_operand:V 2 "regmem_or_bitnot_regmem_operand"))
(match_operand:V 3 "regmem_or_bitnot_regmem_operand")))]
- "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
+ "(<MODE_SIZE> == 64 || TARGET_AVX512VL
+ || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
&& ix86_pre_reload_split ()"
"#"
"&& 1"
@@ -15009,16 +15201,6 @@
(eq:VI12_AVX2 (match_dup 4) (match_dup 1)))]
"operands[4] = gen_reg_rtx (<MODE>mode);")
-(define_expand "mulv8qi3"
- [(set (match_operand:V8QI 0 "register_operand")
- (mult:V8QI (match_operand:V8QI 1 "register_operand")
- (match_operand:V8QI 2 "register_operand")))]
- "TARGET_AVX512VL && TARGET_AVX512BW && TARGET_64BIT"
-{
- ix86_expand_vecop_qihi (MULT, operands[0], operands[1], operands[2]);
- DONE;
-})
-
(define_expand "mul<mode>3"
[(set (match_operand:VI1_AVX512 0 "register_operand")
(mult:VI1_AVX512 (match_operand:VI1_AVX512 1 "register_operand")
@@ -17148,17 +17330,17 @@
;; PR target/100711: Split notl; vpbroadcastd; vpand as vpbroadcastd; vpandn
(define_split
- [(set (match_operand:VI124_AVX2 0 "register_operand")
- (and:VI124_AVX2
- (vec_duplicate:VI124_AVX2
+ [(set (match_operand:VI_AVX2 0 "register_operand")
+ (and:VI_AVX2
+ (vec_duplicate:VI_AVX2
(not:<ssescalarmode>
(match_operand:<ssescalarmode> 1 "register_operand")))
- (match_operand:VI124_AVX2 2 "vector_operand")))]
+ (match_operand:VI_AVX2 2 "vector_operand")))]
"TARGET_AVX2"
[(set (match_dup 3)
- (vec_duplicate:VI124_AVX2 (match_dup 1)))
+ (vec_duplicate:VI_AVX2 (match_dup 1)))
(set (match_dup 0)
- (and:VI124_AVX2 (not:VI124_AVX2 (match_dup 3))
+ (and:VI_AVX2 (not:VI_AVX2 (match_dup 3))
(match_dup 2)))]
"operands[3] = gen_reg_rtx (<MODE>mode);")
@@ -17596,14 +17778,14 @@
DONE;
})
-(define_insn "<sse2_avx2>_packsswb<mask_name>"
- [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,<v_Yw>")
- (vec_concat:VI1_AVX512
- (ss_truncate:<ssehalfvecmode>
- (match_operand:<sseunpackmode> 1 "register_operand" "0,<v_Yw>"))
- (ss_truncate:<ssehalfvecmode>
- (match_operand:<sseunpackmode> 2 "vector_operand" "xBm,<v_Yw>m"))))]
- "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
+(define_insn "sse2_packsswb<mask_name>"
+ [(set (match_operand:V16QI 0 "register_operand" "=x,Yw")
+ (vec_concat:V16QI
+ (ss_truncate:V8QI
+ (match_operand:V8HI 1 "register_operand" "0,Yw"))
+ (ss_truncate:V8QI
+ (match_operand:V8HI 2 "vector_operand" "xBm,Ywm"))))]
+ "TARGET_SSE2 && <mask_avx512vl_condition> && <mask_avx512bw_condition>"
"@
packsswb\t{%2, %0|%0, %2}
vpacksswb\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
@@ -17611,16 +17793,93 @@
(set_attr "type" "sselog")
(set_attr "prefix_data16" "1,*")
(set_attr "prefix" "orig,<mask_prefix>")
- (set_attr "mode" "<sseinsnmode>")])
+ (set_attr "mode" "TI")])
-(define_insn "<sse2_avx2>_packssdw<mask_name>"
- [(set (match_operand:VI2_AVX2 0 "register_operand" "=x,<v_Yw>")
- (vec_concat:VI2_AVX2
- (ss_truncate:<ssehalfvecmode>
- (match_operand:<sseunpackmode> 1 "register_operand" "0,<v_Yw>"))
- (ss_truncate:<ssehalfvecmode>
- (match_operand:<sseunpackmode> 2 "vector_operand" "xBm,<v_Yw>m"))))]
- "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
+(define_insn "avx2_packsswb<mask_name>"
+ [(set (match_operand:V32QI 0 "register_operand" "=Yw")
+ (vec_select:V32QI
+ (vec_concat:V32QI
+ (ss_truncate:V16QI
+ (match_operand:V16HI 1 "register_operand" "Yw"))
+ (ss_truncate:V16QI
+ (match_operand:V16HI 2 "vector_operand" "Ywm")))
+ (parallel [(const_int 0) (const_int 1)
+ (const_int 2) (const_int 3)
+ (const_int 4) (const_int 5)
+ (const_int 6) (const_int 7)
+ (const_int 16) (const_int 17)
+ (const_int 18) (const_int 19)
+ (const_int 20) (const_int 21)
+ (const_int 22) (const_int 23)
+ (const_int 8) (const_int 9)
+ (const_int 10) (const_int 11)
+ (const_int 12) (const_int 13)
+ (const_int 14) (const_int 15)
+ (const_int 24) (const_int 25)
+ (const_int 26) (const_int 27)
+ (const_int 28) (const_int 29)
+ (const_int 30) (const_int 31)])))]
+ "TARGET_AVX2 && <mask_avx512vl_condition> && <mask_avx512bw_condition>"
+ "vpacksswb\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+ [(set_attr "type" "sselog")
+ (set_attr "prefix" "<mask_prefix>")
+ (set_attr "mode" "OI")])
+
+(define_insn "avx512bw_packsswb<mask_name>"
+ [(set (match_operand:V64QI 0 "register_operand" "=v")
+ (vec_select:V64QI
+ (vec_concat:V64QI
+ (ss_truncate:V32QI
+ (match_operand:V32HI 1 "register_operand" "v"))
+ (ss_truncate:V32QI
+ (match_operand:V32HI 2 "vector_operand" "vm")))
+ (parallel [(const_int 0) (const_int 1)
+ (const_int 2) (const_int 3)
+ (const_int 4) (const_int 5)
+ (const_int 6) (const_int 7)
+ (const_int 32) (const_int 33)
+ (const_int 34) (const_int 35)
+ (const_int 36) (const_int 37)
+ (const_int 38) (const_int 39)
+ (const_int 8) (const_int 9)
+ (const_int 10) (const_int 11)
+ (const_int 12) (const_int 13)
+ (const_int 14) (const_int 15)
+ (const_int 40) (const_int 41)
+ (const_int 42) (const_int 43)
+ (const_int 44) (const_int 45)
+ (const_int 46) (const_int 47)
+ (const_int 16) (const_int 17)
+ (const_int 18) (const_int 19)
+ (const_int 20) (const_int 21)
+ (const_int 22) (const_int 23)
+ (const_int 48) (const_int 49)
+ (const_int 50) (const_int 51)
+ (const_int 52) (const_int 53)
+ (const_int 54) (const_int 55)
+ (const_int 24) (const_int 25)
+ (const_int 26) (const_int 27)
+ (const_int 28) (const_int 29)
+ (const_int 30) (const_int 31)
+ (const_int 56) (const_int 57)
+ (const_int 58) (const_int 59)
+ (const_int 60) (const_int 61)
+ (const_int 62) (const_int 63)])))]
+ "TARGET_AVX512BW"
+ "vpacksswb\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+ [(set_attr "type" "sselog")
+ (set_attr "prefix" "<mask_prefix>")
+ (set_attr "mode" "XI")])
+
+(define_insn "sse2_packssdw<mask_name>"
+ [(set (match_operand:V8HI 0 "register_operand" "=x,Yw")
+ (vec_concat:V8HI
+ (ss_truncate:V4HI
+ (match_operand:V4SI 1 "register_operand" "0,Yw"))
+ (ss_truncate:V4HI
+ (match_operand:V4SI 2 "vector_operand" "xBm,Ywm"))))]
+ "TARGET_SSE2 && <mask_avx512vl_condition> && <mask_avx512bw_condition>"
"@
packssdw\t{%2, %0|%0, %2}
vpackssdw\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
@@ -17628,15 +17887,68 @@
(set_attr "type" "sselog")
(set_attr "prefix_data16" "1,*")
(set_attr "prefix" "orig,<mask_prefix>")
- (set_attr "mode" "<sseinsnmode>")])
+ (set_attr "mode" "TI")])
+
+(define_insn "avx2_packssdw<mask_name>"
+ [(set (match_operand:V16HI 0 "register_operand" "=Yw")
+ (vec_select:V16HI
+ (vec_concat:V16HI
+ (ss_truncate:V8HI
+ (match_operand:V8SI 1 "register_operand" "Yw"))
+ (ss_truncate:V8HI
+ (match_operand:V8SI 2 "vector_operand" "Ywm")))
+ (parallel [(const_int 0) (const_int 1)
+ (const_int 2) (const_int 3)
+ (const_int 8) (const_int 9)
+ (const_int 10) (const_int 11)
+ (const_int 4) (const_int 5)
+ (const_int 6) (const_int 7)
+ (const_int 12) (const_int 13)
+ (const_int 14) (const_int 15)])))]
+ "TARGET_AVX2 && <mask_avx512vl_condition> && <mask_avx512bw_condition>"
+ "vpackssdw\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+ [(set_attr "type" "sselog")
+ (set_attr "prefix" "<mask_prefix>")
+ (set_attr "mode" "OI")])
+(define_insn "avx512bw_packssdw<mask_name>"
+ [(set (match_operand:V32HI 0 "register_operand" "=v")
+ (vec_select:V32HI
+ (vec_concat:V32HI
+ (ss_truncate:V16HI
+ (match_operand:V16SI 1 "register_operand" "v"))
+ (ss_truncate:V16HI
+ (match_operand:V16SI 2 "vector_operand" "vm")))
+ (parallel [(const_int 0) (const_int 1)
+ (const_int 2) (const_int 3)
+ (const_int 16) (const_int 17)
+ (const_int 18) (const_int 19)
+ (const_int 4) (const_int 5)
+ (const_int 6) (const_int 7)
+ (const_int 20) (const_int 21)
+ (const_int 22) (const_int 23)
+ (const_int 8) (const_int 9)
+ (const_int 10) (const_int 11)
+ (const_int 24) (const_int 25)
+ (const_int 26) (const_int 27)
+ (const_int 12) (const_int 13)
+ (const_int 14) (const_int 15)
+ (const_int 28) (const_int 29)
+ (const_int 30) (const_int 31)])))]
+ "TARGET_AVX512BW"
+ "vpackssdw\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+ [(set_attr "type" "sselog")
+ (set_attr "prefix" "<mask_prefix>")
+ (set_attr "mode" "XI")])
+
+;; This instruction does unsigned saturation of a signed source
+;; and so differs from the generic us_truncate RTX.
(define_insn "<sse2_avx2>_packuswb<mask_name>"
[(set (match_operand:VI1_AVX512 0 "register_operand" "=x,<v_Yw>")
- (vec_concat:VI1_AVX512
- (us_truncate:<ssehalfvecmode>
- (match_operand:<sseunpackmode> 1 "register_operand" "0,<v_Yw>"))
- (us_truncate:<ssehalfvecmode>
- (match_operand:<sseunpackmode> 2 "vector_operand" "xBm,<v_Yw>m"))))]
+ (unspec:VI1_AVX512
+ [(match_operand:<sseunpackmode> 1 "register_operand" "0,<v_Yw>")
+ (match_operand:<sseunpackmode> 2 "vector_operand" "xBm,<v_Yw>m")]
+ UNSPEC_US_TRUNCATE))]
"TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
"@
packuswb\t{%2, %0|%0, %2}
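
A short example, not part of the patch: packuswb saturates *signed* 16-bit inputs into the unsigned 0..255 range (for instance -1 becomes 0 and 300 becomes 255), which is why the pattern above now uses an UNSPEC rather than the generic us_truncate of an already-unsigned source. Hypothetical function name, assuming SSE2.

#include <emmintrin.h>

__m128i
pack_to_bytes (__m128i lo_words, __m128i hi_words)
{
  /* Signed 16-bit lanes are clamped to 0..255 and narrowed to bytes.  */
  return _mm_packus_epi16 (lo_words, hi_words);
}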
@@ -18446,6 +18758,8 @@
mask = INTVAL (operands[3]) / 2;
mask |= (INTVAL (operands[5]) - 4) / 2 << 1;
operands[3] = GEN_INT (mask);
+ if (INTVAL (operands[3]) == 2 && !<mask_applied>)
+ return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
return "vshuf<shuffletype>64x2\t{%3, %2, %1, %0<mask_operand7>|%0<mask_operand7>, %1, %2, %3}";
}
[(set_attr "type" "sselog")
@@ -18604,6 +18918,9 @@
mask |= (INTVAL (operands[7]) - 8) / 4 << 1;
operands[3] = GEN_INT (mask);
+ if (INTVAL (operands[3]) == 2 && !<mask_applied>)
+ return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
+
return "vshuf<shuffletype>32x4\t{%3, %2, %1, %0<mask_operand11>|%0<mask_operand11>, %1, %2, %3}";
}
[(set_attr "type" "sselog")
@@ -20450,10 +20767,10 @@
UNSPEC_MOVMSK)
(match_operand 2 "const_int_operand")))]
"TARGET_SSE4_1 && (INTVAL (operands[2]) == (int) (<vi1avx2const>))"
- [(set (reg:CC FLAGS_REG)
- (unspec:CC [(match_dup 0)
- (match_dup 0)]
- UNSPEC_PTEST))])
+ [(set (reg:CCZ FLAGS_REG)
+ (unspec:CCZ [(match_dup 0)
+ (match_dup 0)]
+ UNSPEC_PTEST))])
(define_expand "sse2_maskmovdqu"
[(set (match_operand:V16QI 0 "memory_operand")
@@ -21718,11 +22035,10 @@
(define_insn "<sse4_1_avx2>_packusdw<mask_name>"
[(set (match_operand:VI2_AVX2 0 "register_operand" "=Yr,*x,<v_Yw>")
- (vec_concat:VI2_AVX2
- (us_truncate:<ssehalfvecmode>
- (match_operand:<sseunpackmode> 1 "register_operand" "0,0,<v_Yw>"))
- (us_truncate:<ssehalfvecmode>
- (match_operand:<sseunpackmode> 2 "vector_operand" "YrBm,*xBm,<v_Yw>m"))))]
+ (unspec:VI2_AVX2
+ [(match_operand:<sseunpackmode> 1 "register_operand" "0,0,<v_Yw>")
+ (match_operand:<sseunpackmode> 2 "vector_operand" "YrBm,*xBm,<v_Yw>m")]
+ UNSPEC_US_TRUNCATE))]
"TARGET_SSE4_1 && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
"@
packusdw\t{%2, %0|%0, %2}
@@ -23105,13 +23421,13 @@
(set_attr "mode" "<MODE>")])
;; ptest is very similar to comiss and ucomiss when setting FLAGS_REG.
-;; But it is not a really compare instruction.
-(define_insn "<sse4_1>_ptest<mode>"
- [(set (reg:CC FLAGS_REG)
- (unspec:CC [(match_operand:V_AVX 0 "register_operand" "Yr, *x, x")
- (match_operand:V_AVX 1 "vector_operand" "YrBm, *xBm, xm")]
- UNSPEC_PTEST))]
- "TARGET_SSE4_1"
+;; But it is not really a compare instruction.
+(define_insn "*<sse4_1>_ptest<mode>"
+ [(set (reg FLAGS_REG)
+ (unspec [(match_operand:V_AVX 0 "register_operand" "Yr, *x, x")
+ (match_operand:V_AVX 1 "vector_operand" "YrBm, *xBm, xm")]
+ UNSPEC_PTEST))]
+ "TARGET_SSE4_1 && ix86_match_ptest_ccmode (insn)"
"%vptest\t{%1, %0|%0, %1}"
[(set_attr "isa" "noavx,noavx,avx")
(set_attr "type" "ssecomi")
@@ -23124,6 +23440,30 @@
(const_string "*")))
(set_attr "mode" "<sseinsnmode>")])
+;; Expand a ptest to set the Z flag.
+(define_expand "<sse4_1>_ptestz<mode>"
+ [(set (reg:CCZ FLAGS_REG)
+ (unspec:CCZ [(match_operand:V_AVX 0 "register_operand")
+ (match_operand:V_AVX 1 "vector_operand")]
+ UNSPEC_PTEST))]
+ "TARGET_SSE4_1")
+
+;; Expand a ptest to set the C flag.
+(define_expand "<sse4_1>_ptestc<mode>"
+ [(set (reg:CCC FLAGS_REG)
+ (unspec:CCC [(match_operand:V_AVX 0 "register_operand")
+ (match_operand:V_AVX 1 "vector_operand")]
+ UNSPEC_PTEST))]
+ "TARGET_SSE4_1")
+
+;; Expand a ptest to set both the Z and C flags.
+(define_expand "<sse4_1>_ptest<mode>"
+ [(set (reg:CC FLAGS_REG)
+ (unspec:CC [(match_operand:V_AVX 0 "register_operand")
+ (match_operand:V_AVX 1 "vector_operand")]
+ UNSPEC_PTEST))]
+ "TARGET_SSE4_1")
+
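Illustrative only, not from the commit: _mm_testz_si128 reads only ZF and _mm_testc_si128 only CF, which is what the new CCZ and CCC expanders model; the plain CC expander remains for uses that need both flags. Hypothetical function names, assuming SSE4.1.

#include <smmintrin.h>

int
bits_all_clear (__m128i val, __m128i mask)
{
  return _mm_testz_si128 (val, mask);	/* ZF = ((val & mask) == 0)  */
}

int
bits_all_set (__m128i val, __m128i mask)
{
  return _mm_testc_si128 (val, mask);	/* CF = ((~val & mask) == 0)  */
}
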
(define_insn "ptesttf2"
[(set (reg:CC FLAGS_REG)
(unspec:CC [(match_operand:TF 0 "register_operand" "Yr, *x, x")
@@ -23138,17 +23478,17 @@
(set_attr "mode" "TI")])
(define_insn_and_split "*ptest<mode>_and"
- [(set (reg:CC FLAGS_REG)
- (unspec:CC [(and:V_AVX (match_operand:V_AVX 0 "register_operand")
- (match_operand:V_AVX 1 "vector_operand"))
- (and:V_AVX (match_dup 0) (match_dup 1))]
+ [(set (reg:CCZ FLAGS_REG)
+ (unspec:CCZ [(and:V_AVX (match_operand:V_AVX 0 "register_operand")
+ (match_operand:V_AVX 1 "vector_operand"))
+ (and:V_AVX (match_dup 0) (match_dup 1))]
UNSPEC_PTEST))]
"TARGET_SSE4_1
&& ix86_pre_reload_split ()"
"#"
"&& 1"
- [(set (reg:CC FLAGS_REG)
- (unspec:CC [(match_dup 0) (match_dup 1)] UNSPEC_PTEST))])
+ [(set (reg:CCZ FLAGS_REG)
+ (unspec:CCZ [(match_dup 0) (match_dup 1)] UNSPEC_PTEST))])
(define_expand "nearbyint<mode>2"
[(set (match_operand:VFH 0 "register_operand")
@@ -24591,17 +24931,6 @@
}
})
-(define_expand "v<insn>v8qi3"
- [(set (match_operand:V8QI 0 "register_operand")
- (any_shift:V8QI
- (match_operand:V8QI 1 "register_operand")
- (match_operand:V8QI 2 "nonimmediate_operand")))]
- "TARGET_AVX512BW && TARGET_AVX512VL && TARGET_64BIT"
-{
- ix86_expand_vecop_qihi (<CODE>, operands[0], operands[1], operands[2]);
- DONE;
-})
-
(define_expand "vlshr<mode>3"
[(set (match_operand:VI48_512 0 "register_operand")
(lshiftrt:VI48_512
@@ -25125,67 +25454,71 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define_insn "aesenc"
- [(set (match_operand:V2DI 0 "register_operand" "=x,x")
- (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0,x")
- (match_operand:V2DI 2 "vector_operand" "xBm,xm")]
+ [(set (match_operand:V2DI 0 "register_operand" "=x,x,v")
+ (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0,x,v")
+ (match_operand:V2DI 2 "vector_operand" "xBm,xm,vm")]
UNSPEC_AESENC))]
- "TARGET_AES"
+ "TARGET_AES || (TARGET_VAES && TARGET_AVX512VL)"
"@
aesenc\t{%2, %0|%0, %2}
+ vaesenc\t{%2, %1, %0|%0, %1, %2}
vaesenc\t{%2, %1, %0|%0, %1, %2}"
- [(set_attr "isa" "noavx,avx")
+ [(set_attr "isa" "noavx,aes,avx512vl")
(set_attr "type" "sselog1")
(set_attr "prefix_extra" "1")
- (set_attr "prefix" "orig,vex")
- (set_attr "btver2_decode" "double,double")
+ (set_attr "prefix" "orig,vex,evex")
+ (set_attr "btver2_decode" "double,double,double")
(set_attr "mode" "TI")])
(define_insn "aesenclast"
- [(set (match_operand:V2DI 0 "register_operand" "=x,x")
- (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0,x")
- (match_operand:V2DI 2 "vector_operand" "xBm,xm")]
+ [(set (match_operand:V2DI 0 "register_operand" "=x,x,v")
+ (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0,x,v")
+ (match_operand:V2DI 2 "vector_operand" "xBm,xm,vm")]
UNSPEC_AESENCLAST))]
- "TARGET_AES"
+ "TARGET_AES || (TARGET_VAES && TARGET_AVX512VL)"
"@
aesenclast\t{%2, %0|%0, %2}
+ vaesenclast\t{%2, %1, %0|%0, %1, %2}
vaesenclast\t{%2, %1, %0|%0, %1, %2}"
- [(set_attr "isa" "noavx,avx")
+ [(set_attr "isa" "noavx,aes,avx512vl")
(set_attr "type" "sselog1")
(set_attr "prefix_extra" "1")
- (set_attr "prefix" "orig,vex")
- (set_attr "btver2_decode" "double,double")
+ (set_attr "prefix" "orig,vex,evex")
+ (set_attr "btver2_decode" "double,double,double")
(set_attr "mode" "TI")])
(define_insn "aesdec"
- [(set (match_operand:V2DI 0 "register_operand" "=x,x")
- (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0,x")
- (match_operand:V2DI 2 "vector_operand" "xBm,xm")]
+ [(set (match_operand:V2DI 0 "register_operand" "=x,x,v")
+ (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0,x,v")
+ (match_operand:V2DI 2 "vector_operand" "xBm,xm,vm")]
UNSPEC_AESDEC))]
- "TARGET_AES"
+ "TARGET_AES || (TARGET_VAES && TARGET_AVX512VL)"
"@
aesdec\t{%2, %0|%0, %2}
+ vaesdec\t{%2, %1, %0|%0, %1, %2}
vaesdec\t{%2, %1, %0|%0, %1, %2}"
- [(set_attr "isa" "noavx,avx")
+ [(set_attr "isa" "noavx,aes,avx512vl")
(set_attr "type" "sselog1")
(set_attr "prefix_extra" "1")
- (set_attr "prefix" "orig,vex")
- (set_attr "btver2_decode" "double,double")
+ (set_attr "prefix" "orig,vex,evex")
+ (set_attr "btver2_decode" "double,double,double")
(set_attr "mode" "TI")])
(define_insn "aesdeclast"
- [(set (match_operand:V2DI 0 "register_operand" "=x,x")
- (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0,x")
- (match_operand:V2DI 2 "vector_operand" "xBm,xm")]
+ [(set (match_operand:V2DI 0 "register_operand" "=x,x,v")
+ (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0,x,v")
+ (match_operand:V2DI 2 "vector_operand" "xBm,xm,vm")]
UNSPEC_AESDECLAST))]
- "TARGET_AES"
+ "TARGET_AES || (TARGET_VAES && TARGET_AVX512VL)"
"@
aesdeclast\t{%2, %0|%0, %2}
+ vaesdeclast\t{%2, %1, %0|%0, %1, %2}
vaesdeclast\t{%2, %1, %0|%0, %1, %2}"
- [(set_attr "isa" "noavx,avx")
+ [(set_attr "isa" "noavx,aes,avx512vl")
(set_attr "type" "sselog1")
(set_attr "prefix_extra" "1")
- (set_attr "prefix" "orig,vex")
- (set_attr "btver2_decode" "double,double")
+ (set_attr "prefix" "orig,vex,evex")
+ (set_attr "btver2_decode" "double,double,double")
(set_attr "mode" "TI")])
(define_insn "aesimc"
@@ -25213,20 +25546,21 @@
(set_attr "mode" "TI")])
(define_insn "pclmulqdq"
- [(set (match_operand:V2DI 0 "register_operand" "=x,x")
- (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0,x")
- (match_operand:V2DI 2 "vector_operand" "xBm,xm")
+ [(set (match_operand:V2DI 0 "register_operand" "=x,x,v")
+ (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0,x,v")
+ (match_operand:V2DI 2 "vector_operand" "xBm,xm,vm")
(match_operand:SI 3 "const_0_to_255_operand")]
UNSPEC_PCLMUL))]
"TARGET_PCLMUL"
"@
pclmulqdq\t{%3, %2, %0|%0, %2, %3}
+ vpclmulqdq\t{%3, %2, %1, %0|%0, %1, %2, %3}
vpclmulqdq\t{%3, %2, %1, %0|%0, %1, %2, %3}"
- [(set_attr "isa" "noavx,avx")
+ [(set_attr "isa" "noavx,avx,vpclmulqdqvl")
(set_attr "type" "sselog1")
(set_attr "prefix_extra" "1")
(set_attr "length_immediate" "1")
- (set_attr "prefix" "orig,vex")
+ (set_attr "prefix" "orig,vex,evex")
(set_attr "mode" "TI")])
(define_expand "avx_vzeroall"
@@ -25672,7 +26006,28 @@
(match_operand:SI 3 "const_0_to_255_operand")]
UNSPEC_VPERMTI))]
"TARGET_AVX2"
- "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ {
+ int mask = INTVAL (operands[3]);
+ if ((mask & 0xbb) == 16)
+ {
+ if (rtx_equal_p (operands[0], operands[1]))
+ return "";
+ else
+ return "vmovaps\t{%1, %0|%0, %1}";
+ }
+ if ((mask & 0xbb) == 50)
+ {
+ if (rtx_equal_p (operands[0], operands[2]))
+ return "";
+ else
+ return "vmovaps\t{%2, %0|%0, %2}";
+ }
+ if ((mask & 0xbb) == 18)
+ return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
+ if ((mask & 0xbb) == 48)
+ return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
+ return "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+ }
[(set_attr "type" "sselog")
(set_attr "prefix" "vex")
(set_attr "mode" "OI")])
@@ -25712,9 +26067,9 @@
"TARGET_AVX512F"
{
/* There is no DF broadcast (in AVX-512*) to 128b register.
- Mimic it with integer variant. */
+ Mimic it with vmovddup, just like vec_dupv2df<mask_name> does. */
if (<MODE>mode == V2DFmode)
- return "vpbroadcastq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %q1}";
+ return "vmovddup\t{%1, %0<mask_operand2>|%0<mask_operand2>, %q1}";
return "v<sseintprefix>broadcast<bcstscalarsuff>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %<iptr>1}";
}
@@ -25823,19 +26178,35 @@
(symbol_ref "true")))])
(define_insn "*vec_dupv2di"
- [(set (match_operand:V2DI 0 "register_operand" "=x,v,v,x")
+ [(set (match_operand:V2DI 0 "register_operand" "=x,v,v,v,x")
(vec_duplicate:V2DI
- (match_operand:DI 1 "nonimmediate_operand" " 0,Yv,vm,0")))]
+ (match_operand:DI 1 "nonimmediate_operand" " 0,Yv,vm,Yvm,0")))]
"TARGET_SSE"
"@
punpcklqdq\t%0, %0
vpunpcklqdq\t{%d1, %0|%0, %d1}
+ * return TARGET_AVX512VL ? \"vpbroadcastq\t{%1, %0|%0, %1}\" : \"vpbroadcastq\t{%1, %g0|%g0, %1}\";
%vmovddup\t{%1, %0|%0, %1}
movlhps\t%0, %0"
- [(set_attr "isa" "sse2_noavx,avx,sse3,noavx")
- (set_attr "type" "sselog1,sselog1,sselog1,ssemov")
- (set_attr "prefix" "orig,maybe_evex,maybe_vex,orig")
- (set_attr "mode" "TI,TI,DF,V4SF")])
+ [(set_attr "isa" "sse2_noavx,avx,avx512f,sse3,noavx")
+ (set_attr "type" "sselog1,sselog1,ssemov,sselog1,ssemov")
+ (set_attr "prefix" "orig,maybe_evex,evex,maybe_vex,orig")
+ (set (attr "mode")
+ (cond [(and (eq_attr "alternative" "2")
+ (match_test "!TARGET_AVX512VL"))
+ (const_string "XI")
+ (eq_attr "alternative" "3")
+ (const_string "DF")
+ (eq_attr "alternative" "4")
+ (const_string "V4SF")
+ ]
+ (const_string "TI")))
+ (set (attr "enabled")
+ (if_then_else
+ (eq_attr "alternative" "2")
+ (symbol_ref "TARGET_AVX512VL
+ || (TARGET_AVX512F && !TARGET_PREFER_AVX256)")
+ (const_string "*")))])
(define_insn "avx2_vbroadcasti128_<mode>"
[(set (match_operand:VI_256 0 "register_operand" "=x,v,v")
@@ -26235,9 +26606,11 @@
&& avx_vperm2f128_parallel (operands[3], <MODE>mode)"
{
int mask = avx_vperm2f128_parallel (operands[3], <MODE>mode) - 1;
- if (mask == 0x12)
- return "vinsert<i128>\t{$0, %x2, %1, %0|%0, %1, %x2, 0}";
- if (mask == 0x20)
+ if ((mask & 0xbb) == 0x12)
+ return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
+ if ((mask & 0xbb) == 0x30)
+ return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
+ if ((mask & 0xbb) == 0x20)
return "vinsert<i128>\t{$1, %x2, %1, %0|%0, %1, %x2, 1}";
operands[3] = GEN_INT (mask);
return "vperm2<i128>\t{%3, %2, %1, %0|%0, %1, %2, %3}";
@@ -26249,9 +26622,9 @@
(set_attr "mode" "<sseinsnmode>")])
(define_insn "*ssse3_palignr<mode>_perm"
- [(set (match_operand:V_128 0 "register_operand" "=x,Yw")
- (vec_select:V_128
- (match_operand:V_128 1 "register_operand" "0,Yw")
+ [(set (match_operand:V_128H 0 "register_operand" "=x,Yw")
+ (vec_select:V_128H
+ (match_operand:V_128H 1 "register_operand" "0,Yw")
(match_parallel 2 "palignr_operand"
[(match_operand 3 "const_int_operand")])))]
"TARGET_SSSE3"
@@ -27438,10 +27811,10 @@
(set_attr "mode" "<sseinsnmode>")])
(define_insn "compress<mode>_mask"
- [(set (match_operand:VI12_AVX512VLBW 0 "register_operand" "=v")
- (unspec:VI12_AVX512VLBW
- [(match_operand:VI12_AVX512VLBW 1 "register_operand" "v")
- (match_operand:VI12_AVX512VLBW 2 "nonimm_or_0_operand" "0C")
+ [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v")
+ (unspec:VI12_AVX512VL
+ [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
+ (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand" "0C")
(match_operand:<avx512fmaskmode> 3 "register_operand" "Yk")]
UNSPEC_COMPRESS))]
"TARGET_AVX512VBMI2"
@@ -27465,9 +27838,9 @@
(set_attr "mode" "<sseinsnmode>")])
(define_insn "compressstore<mode>_mask"
- [(set (match_operand:VI12_AVX512VLBW 0 "memory_operand" "=m")
- (unspec:VI12_AVX512VLBW
- [(match_operand:VI12_AVX512VLBW 1 "register_operand" "x")
+ [(set (match_operand:VI12_AVX512VL 0 "memory_operand" "=m")
+ (unspec:VI12_AVX512VL
+ [(match_operand:VI12_AVX512VL 1 "register_operand" "x")
(match_dup 0)
(match_operand:<avx512fmaskmode> 2 "register_operand" "Yk")]
UNSPEC_COMPRESS_STORE))]
@@ -27503,10 +27876,10 @@
(set_attr "mode" "<sseinsnmode>")])
(define_insn "expand<mode>_mask"
- [(set (match_operand:VI12_AVX512VLBW 0 "register_operand" "=v,v")
- (unspec:VI12_AVX512VLBW
- [(match_operand:VI12_AVX512VLBW 1 "nonimmediate_operand" "v,m")
- (match_operand:VI12_AVX512VLBW 2 "nonimm_or_0_operand" "0C,0C")
+ [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v,v")
+ (unspec:VI12_AVX512VL
+ [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand" "v,m")
+ (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand" "0C,0C")
(match_operand:<avx512fmaskmode> 3 "register_operand" "Yk,Yk")]
UNSPEC_EXPAND))]
"TARGET_AVX512VBMI2"
@@ -27517,10 +27890,10 @@
(set_attr "mode" "<sseinsnmode>")])
(define_insn_and_split "*expand<mode>_mask"
- [(set (match_operand:VI12_VI48F_AVX512VLBW 0 "register_operand")
- (unspec:VI12_VI48F_AVX512VLBW
- [(match_operand:VI12_VI48F_AVX512VLBW 1 "nonimmediate_operand")
- (match_operand:VI12_VI48F_AVX512VLBW 2 "nonimm_or_0_operand")
+ [(set (match_operand:VI12_VI48F_AVX512VL 0 "register_operand")
+ (unspec:VI12_VI48F_AVX512VL
+ [(match_operand:VI12_VI48F_AVX512VL 1 "nonimmediate_operand")
+ (match_operand:VI12_VI48F_AVX512VL 2 "nonimm_or_0_operand")
(match_operand 3 "const_int_operand")]
UNSPEC_EXPAND))]
"ix86_pre_reload_split ()
@@ -27573,10 +27946,10 @@
})
(define_expand "expand<mode>_maskz"
- [(set (match_operand:VI12_AVX512VLBW 0 "register_operand")
- (unspec:VI12_AVX512VLBW
- [(match_operand:VI12_AVX512VLBW 1 "nonimmediate_operand")
- (match_operand:VI12_AVX512VLBW 2 "nonimm_or_0_operand")
+ [(set (match_operand:VI12_AVX512VL 0 "register_operand")
+ (unspec:VI12_AVX512VL
+ [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand")
+ (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand")
(match_operand:<avx512fmaskmode> 3 "register_operand")]
UNSPEC_EXPAND))]
"TARGET_AVX512VBMI2"
@@ -28834,8 +29207,8 @@
(define_insn "avx512vl_vpshufbitqmb<mode><mask_scalar_merge_name>"
[(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
(unspec:<avx512fmaskmode>
- [(match_operand:VI1_AVX512VLBW 1 "register_operand" "v")
- (match_operand:VI1_AVX512VLBW 2 "nonimmediate_operand" "vm")]
+ [(match_operand:VI1_AVX512VL 1 "register_operand" "v")
+ (match_operand:VI1_AVX512VL 2 "nonimmediate_operand" "vm")]
UNSPEC_VPSHUFBIT))]
"TARGET_AVX512BITALG"
"vpshufbitqmb\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}"
diff --git a/gcc/config/i386/vaesintrin.h b/gcc/config/i386/vaesintrin.h
index 0f1cffe..58fc19c 100644
--- a/gcc/config/i386/vaesintrin.h
+++ b/gcc/config/i386/vaesintrin.h
@@ -24,9 +24,9 @@
#ifndef __VAESINTRIN_H_INCLUDED
#define __VAESINTRIN_H_INCLUDED
-#if !defined(__VAES__) || !defined(__AVX__)
+#if !defined(__VAES__)
#pragma GCC push_options
-#pragma GCC target("vaes,avx")
+#pragma GCC target("vaes")
#define __DISABLE_VAES__
#endif /* __VAES__ */
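
For illustration only (not from the commit): one of the 256-bit VAES intrinsics this header guards, processing two AES blocks per call; the function name is hypothetical and the sketch assumes -mvaes with 256-bit vector support enabled.

#include <immintrin.h>

__m256i
aesenc_2blocks (__m256i state, __m256i roundkey)
{
  /* One AES encryption round applied to two 128-bit blocks at once.  */
  return _mm256_aesenc_epi128 (state, roundkey);
}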
diff --git a/gcc/config/i386/vpclmulqdqintrin.h b/gcc/config/i386/vpclmulqdqintrin.h
index ba93fc4..2c83b60 100644
--- a/gcc/config/i386/vpclmulqdqintrin.h
+++ b/gcc/config/i386/vpclmulqdqintrin.h
@@ -53,9 +53,9 @@ _mm512_clmulepi64_epi128 (__m512i __A, __m512i __B, const int __C)
#pragma GCC pop_options
#endif /* __DISABLE_VPCLMULQDQF__ */
-#if !defined(__VPCLMULQDQ__) || !defined(__AVX__)
+#if !defined(__VPCLMULQDQ__)
#pragma GCC push_options
-#pragma GCC target("vpclmulqdq,avx")
+#pragma GCC target("vpclmulqdq")
#define __DISABLE_VPCLMULQDQ__
#endif /* __VPCLMULQDQ__ */
diff --git a/gcc/config/i386/winnt-cxx.cc b/gcc/config/i386/winnt-cxx.cc
index 9982d98..6306d83 100644
--- a/gcc/config/i386/winnt-cxx.cc
+++ b/gcc/config/i386/winnt-cxx.cc
@@ -30,7 +30,7 @@ along with GCC; see the file COPYING3. If not see
bool
i386_pe_type_dllimport_p (tree decl)
{
- gcc_assert (TREE_CODE (decl) == VAR_DECL
+ gcc_assert (VAR_P (decl)
|| TREE_CODE (decl) == FUNCTION_DECL);
if (TARGET_NOP_FUN_DLLIMPORT && TREE_CODE (decl) == FUNCTION_DECL)
@@ -54,7 +54,7 @@ i386_pe_type_dllimport_p (tree decl)
bool
i386_pe_type_dllexport_p (tree decl)
{
- gcc_assert (TREE_CODE (decl) == VAR_DECL
+ gcc_assert (VAR_P (decl)
|| TREE_CODE (decl) == FUNCTION_DECL);
/* Avoid exporting compiler-generated default dtors and copy ctors.
@@ -118,7 +118,7 @@ i386_pe_adjust_class_at_definition (tree t)
/* Check FUNCTION_DECL's and static VAR_DECL's. */
for (member = TYPE_FIELDS (t); member; member = DECL_CHAIN (member))
- if (TREE_CODE (member) == VAR_DECL)
+ if (VAR_P (member))
maybe_add_dllexport (member);
else if (TREE_CODE (member) == FUNCTION_DECL)
{
@@ -134,7 +134,7 @@ i386_pe_adjust_class_at_definition (tree t)
/* Check vtables */
for (member = CLASSTYPE_VTABLES (t);
member; member = DECL_CHAIN (member))
- if (TREE_CODE (member) == VAR_DECL)
+ if (VAR_P (member))
maybe_add_dllexport (member);
}
@@ -150,7 +150,7 @@ i386_pe_adjust_class_at_definition (tree t)
/* Check FUNCTION_DECL's and static VAR_DECL's. */
for (member = TYPE_FIELDS (t); member; member = DECL_CHAIN (member))
- if (TREE_CODE (member) == VAR_DECL)
+ if (VAR_P (member))
maybe_add_dllimport (member);
else if (TREE_CODE (member) == FUNCTION_DECL)
{
@@ -166,7 +166,7 @@ i386_pe_adjust_class_at_definition (tree t)
/* Check vtables */
for (member = CLASSTYPE_VTABLES (t);
member; member = DECL_CHAIN (member))
- if (TREE_CODE (member) == VAR_DECL)
+ if (VAR_P (member))
maybe_add_dllimport (member);
/* We leave typeinfo tables alone. We can't mark TI objects as
diff --git a/gcc/config/i386/winnt.cc b/gcc/config/i386/winnt.cc
index 6b64eca..83a21c6 100644
--- a/gcc/config/i386/winnt.cc
+++ b/gcc/config/i386/winnt.cc
@@ -147,7 +147,7 @@ i386_pe_determine_dllimport_p (tree decl)
out-of-class definition of static data. */
assoc = associated_type (decl);
if (assoc && lookup_attribute ("dllimport", TYPE_ATTRIBUTES (assoc))
- && TREE_CODE (decl) == VAR_DECL
+ && VAR_P (decl)
&& TREE_STATIC (decl) && TREE_PUBLIC (decl)
&& !DECL_EXTERNAL (decl)
/* vtables are linkonce constants, so defining a vtable is not
@@ -335,7 +335,7 @@ i386_pe_encode_section_info (tree decl, rtx rtl, int first)
bool
i386_pe_binds_local_p (const_tree exp)
{
- if ((TREE_CODE (exp) == VAR_DECL || TREE_CODE (exp) == FUNCTION_DECL)
+ if ((VAR_P (exp) || TREE_CODE (exp) == FUNCTION_DECL)
&& DECL_DLLIMPORT_P (exp))
return false;
@@ -459,7 +459,7 @@ i386_pe_section_type_flags (tree decl, const char *, int reloc)
{
flags = SECTION_WRITE;
- if (decl && TREE_CODE (decl) == VAR_DECL
+ if (decl && VAR_P (decl)
&& lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
flags |= SECTION_PE_SHARED;
}
diff --git a/gcc/config/i386/wmmintrin.h b/gcc/config/i386/wmmintrin.h
index ae15cea..da314db 100644
--- a/gcc/config/i386/wmmintrin.h
+++ b/gcc/config/i386/wmmintrin.h
@@ -40,36 +40,23 @@
/* Performs 1 round of AES decryption of the first m128i using
the second m128i as a round key. */
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_aesdec_si128 (__m128i __X, __m128i __Y)
-{
- return (__m128i) __builtin_ia32_aesdec128 ((__v2di)__X, (__v2di)__Y);
-}
+#define _mm_aesdec_si128(X, Y) \
+ (__m128i) __builtin_ia32_aesdec128 ((__v2di) (X), (__v2di) (Y))
/* Performs the last round of AES decryption of the first m128i
using the second m128i as a round key. */
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_aesdeclast_si128 (__m128i __X, __m128i __Y)
-{
- return (__m128i) __builtin_ia32_aesdeclast128 ((__v2di)__X,
- (__v2di)__Y);
-}
+#define _mm_aesdeclast_si128(X, Y) \
+ (__m128i) __builtin_ia32_aesdeclast128 ((__v2di) (X), (__v2di) (Y))
/* Performs 1 round of AES encryption of the first m128i using
the second m128i as a round key. */
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_aesenc_si128 (__m128i __X, __m128i __Y)
-{
- return (__m128i) __builtin_ia32_aesenc128 ((__v2di)__X, (__v2di)__Y);
-}
+#define _mm_aesenc_si128(X, Y) \
+ (__m128i) __builtin_ia32_aesenc128 ((__v2di) (X), (__v2di) (Y))
/* Performs the last round of AES encryption of the first m128i
using the second m128i as a round key. */
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_aesenclast_si128 (__m128i __X, __m128i __Y)
-{
- return (__m128i) __builtin_ia32_aesenclast128 ((__v2di)__X, (__v2di)__Y);
-}
+#define _mm_aesenclast_si128(X, Y) \
+ (__m128i) __builtin_ia32_aesenclast128 ((__v2di) (X), (__v2di) (Y))
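
A small usage sketch, not part of the patch: a typical call sequence for the macros above, here the last two rounds of an AES-128 encryption. The helper name is hypothetical; these intrinsics normally come in via <immintrin.h> and need AES support (or, per this commit, VAES with AVX512VL).

#include <immintrin.h>

static inline __m128i
aes128_final_rounds (__m128i state, __m128i rk9, __m128i rk10)
{
  /* Apply round 9 and then the final round 10 of an AES-128 key schedule.  */
  state = _mm_aesenc_si128 (state, rk9);
  return _mm_aesenclast_si128 (state, rk10);
}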
/* Performs the InverseMixColumn operation on the source m128i
and stores the result into m128i destination. */
diff --git a/gcc/config/i386/x-mingw32-utf8 b/gcc/config/i386/x-mingw32-utf8
index 2783dd2..b5a6cfc 100644
--- a/gcc/config/i386/x-mingw32-utf8
+++ b/gcc/config/i386/x-mingw32-utf8
@@ -34,7 +34,7 @@ utf8rc-mingw32.o : $(srcdir)/config/i386/utf8-mingw32.rc \
# Create an object file that just exports the global symbol
# HOST_EXTRA_OBJS_SYMBOL
sym-mingw32.o : $(srcdir)/config/i386/sym-mingw32.cc
- $(COMPILER) -c $< $@
+ $(COMPILER) -c $<
# Combine the two object files into one which has both the
# compiled utf8 resource and the HOST_EXTRA_OBJS_SYMBOL symbol.
@@ -44,8 +44,10 @@ sym-mingw32.o : $(srcdir)/config/i386/sym-mingw32.cc
# If nothing in libbackend.a references it, it will not
# end up linked into the compiler proper.
# Therefore we need to request the symbol at compiler link time.
+# -nostdlib is required to support old gcc versions that
+# don't apply it automatically with -r.
utf8-mingw32.o : utf8rc-mingw32.o sym-mingw32.o
- $(COMPILER) -r utf8rc-mingw32.o sym-mingw32.o -o $@
+ $(COMPILER) -r -nostdlib utf8rc-mingw32.o sym-mingw32.o -o $@
# Force compilers to link against the utf8 resource by
# requiring the symbol to be defined.
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 9d603cc..c3229d2 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -335,7 +335,8 @@ DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
/* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency
for bit-manipulation instructions. */
DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi",
- m_SANDYBRIDGE | m_CORE_AVX2 | m_TREMONT | m_ALDERLAKE | m_CORE_ATOM
+ m_SANDYBRIDGE | m_HASWELL | m_SKYLAKE | m_SKYLAKE_AVX512
+ | m_CANNONLAKE | m_CASCADELAKE | m_COOPERLAKE
| m_LUJIAZUI | m_GENERIC)
/* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
@@ -697,3 +698,7 @@ DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", m_NONE)
/* X86_TUNE_EMIT_VZEROUPPER: This enables vzeroupper instruction insertion
before a transfer of control flow out of the function. */
DEF_TUNE (X86_TUNE_EMIT_VZEROUPPER, "emit_vzeroupper", ~m_KNL)
+
+/* X86_TUNE_SLOW_STC: This disables use of stc, clc and cmc carry flag
+ modifications on architectures where theses operations are slow. */
+DEF_TUNE (X86_TUNE_SLOW_STC, "slow_stc", m_PENT4)