-rw-r--r--   gcc/ChangeLog                            |  423
-rw-r--r--   gcc/config.gcc                           |    4
-rw-r--r--   gcc/config/i386/avx2intrin.h             | 1874
-rw-r--r--   gcc/config/i386/i386-builtin-types.def   |   78
-rw-r--r--   gcc/config/i386/i386.c                   |  562
-rw-r--r--   gcc/config/i386/i386.md                  |   11
-rw-r--r--   gcc/config/i386/immintrin.h              |    4
-rw-r--r--   gcc/config/i386/predicates.md            |    8
-rw-r--r--   gcc/config/i386/sse.md                   | 2532
-rw-r--r--   gcc/doc/extend.texi                      |  178
10 files changed, 5415 insertions, 259 deletions
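
For orientation (not part of the patch itself): the header added below defines the new _mm256_*/_mm_* AVX2 intrinsics, which become available by including <immintrin.h> and compiling with the new -mavx2 option. A minimal usage sketch follows, assuming a compiler with this patch applied and an AVX2-capable target, built with something like gcc -O2 -mavx2 test.c; the intrinsic names and signatures are the ones defined in the new avx2intrin.h further down.

#include <immintrin.h>
#include <stdio.h>

int main (void)
{
  int a[8]   = { 1, 2, 3, 4, 5, 6, 7, 8 };
  int b[8]   = { 10, 20, 30, 40, 50, 60, 70, 80 };
  int out[8] = { 0 };

  /* _mm256_add_epi32: eight 32-bit additions in one 256-bit operation.  */
  __m256i va  = _mm256_loadu_si256 ((__m256i const *) a);
  __m256i vb  = _mm256_loadu_si256 ((__m256i const *) b);
  __m256i sum = _mm256_add_epi32 (va, vb);
  _mm256_storeu_si256 ((__m256i *) out, sum);
  printf ("add:    %d ... %d\n", out[0], out[7]);   /* 11 ... 88 */

  /* _mm256_maskload_epi32: load only the elements whose mask element has
     the sign bit set; the remaining elements read as zero.  Here only the
     even lanes are loaded.  */
  __m256i mask   = _mm256_set_epi32 (0, -1, 0, -1, 0, -1, 0, -1);
  __m256i masked = _mm256_maskload_epi32 (b, mask);
  _mm256_storeu_si256 ((__m256i *) out, masked);
  printf ("mask:   %d %d\n", out[0], out[1]);       /* 10 0 */

  /* _mm_i32gather_epi32: gather four ints from base + index * scale,
     where scale must be 1, 2, 4 or 8.  */
  __m128i idx      = _mm_set_epi32 (6, 4, 2, 0);
  __m128i gathered = _mm_i32gather_epi32 (b, idx, 4);
  _mm_storeu_si128 ((__m128i *) out, gathered);
  printf ("gather: %d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 10 30 50 70 */

  return 0;
}
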
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 76154f2..15c4e3b 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,426 @@ +2011-08-22 Kirill Yukhin <kirill.yukhin@intel.com> + + * config/i386/avx2intrin.h: New file. + * config/i386/i386-builtin-types.def (PCINT, PCINT64, PV4SI, + PV8SI, V32QI_FTYPE_V32QI, V32QI_FTYPE_V16QI, V16HI_FTYPE_V16HI, + V16HI_FTYPE_V8HI, V8SI_FTYPE_V8SI, V16HI_FTYPE_V16QI, + V8SI_FTYPE_V16QI, V4DI_FTYPE_V16QI, V8SI_FTYPE_V8HI, + V4DI_FTYPE_V8HI, V4DI_FTYPE_V4SI, V4DI_FTYPE_PV4DI, + V4DI_FTYPE_V2DI, V2DI_FTYPE_PCV2DI_V2DI, V4SI_FTYPE_PCV4SI_V4SI, + V32QI_FTYPE_V16HI_V16HI, V16HI_FTYPE_V8SI_V8SI, + V32QI_FTYPE_V32QI_V32QI, V16HI_FTYPE_V32QI_V32QI, + V16HI_FTYPE_V16HI_V8HI, V16HI_FTYPE_V16HI_V16HI, + V16HI_FTYPE_V16HI_INT, V16HI_FTYPE_V16HI_SI, + V16HI_FTYPE_V16HI_V16HI_INT, V32QI_FTYPE_V32QI_V32QI_INT, + V8SI_FTYPE_V8SI_V4SI, V8SI_FTYPE_V8SI_V8SI, + V8SI_FTYPE_V16HI_V16HI, V8SI_FTYPE_V8SI_INT, V8SI_FTYPE_V8SI_SI, + V8SI_FTYPE_PCV8SI_V8SI, V4DI_FTYPE_V4DI_V4DI, + V4DI_FTYPE_V8SI_V8SI, V4DI_FTYPE_V4DI_V2DI, + V4DI_FTYPE_PCV4DI_V4DI, V4DI_FTYPE_V4DI_INT, + V2DI_FTYPE_V4DI_INT, V4DI_FTYPE_V4DI_V4DI_INT, + V4DI_FTYPE_V4DI_V2DI_INT, VOID_FTYPE_PV2DI_V2DI_V2DI, + VOID_FTYPE_PV4DI_V4DI_V4DI, VOID_FTYPE_PV4SI_V4SI_V4SI, + VOID_FTYPE_PV8SI_V8SI_V8SI, + V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT, + V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT, + V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT, + V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT, + V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT, + V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT, + V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT, + V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT, + V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT, + V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT, + V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT, + V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT, + V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT, + V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT, + V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT, + V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT, + V16HI_FTYPE_V16HI_SI_COUNT, V16HI_FTYPE_V16HI_V8HI_COUNT, + V8SI_FTYPE_V8SI_SI_COUNT, V8SI_FTYPE_V8SI_V4SI_COUNT, + V4DI_FTYPE_V4DI_INT_COUNT, V4DI_FTYPE_V4DI_V2DI_COUNT, + V4DI_FTYPE_V4DI_INT_CONVERT, + V4DI_FTYPE_V4DI_V4DI_INT_CONVERT): New. 
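
As a reading aid for the descriptor list above (an inferred naming convention, not stated in the patch): each descriptor gives the return type before FTYPE and the argument types after it, so V8SI_FTYPE_V8SI_V8SI is a built-in returning a vector of eight 32-bit integers and taking two such vectors. That is the shape of the 256-bit integer built-ins, for example:

#include <immintrin.h>   /* for the __m256i / __v8si vector types */

/* Illustration only: __builtin_ia32_paddd256 (added by this patch) has the
   V8SI_FTYPE_V8SI_V8SI signature; this mirrors how the new avx2intrin.h
   wraps it as _mm256_add_epi32.  */
__m256i
add_packed_dwords (__m256i a, __m256i b)
{
  return (__m256i) __builtin_ia32_paddd256 ((__v8si) a, (__v8si) b);
}
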
+ * config/i386/i386.c (ix86_builtins): Add IX86_BUILTIN_MPSADBW256, + IX86_BUILTIN_PABSB256, IX86_BUILTIN_PABSW256, + IX86_BUILTIN_PABSD256, IX86_BUILTIN_PACKSSDW256, + IX86_BUILTIN_PACKSSWB256, IX86_BUILTIN_PACKUSDW256, + IX86_BUILTIN_PACKUSWB256, IX86_BUILTIN_PADDB256, + IX86_BUILTIN_PADDW256, IX86_BUILTIN_PADDD256, + IX86_BUILTIN_PADDQ256, IX86_BUILTIN_PADDSB256, + IX86_BUILTIN_PADDSW256, IX86_BUILTIN_PADDUSB256, + IX86_BUILTIN_PADDUSW256, IX86_BUILTIN_PALIGNR256, + IX86_BUILTIN_AND256I, IX86_BUILTIN_ANDNOT256I, + IX86_BUILTIN_PAVGB256, IX86_BUILTIN_PAVGW256, + IX86_BUILTIN_PBLENDVB256, IX86_BUILTIN_PBLENDVW256, + IX86_BUILTIN_PCMPEQB256, IX86_BUILTIN_PCMPEQW256, + IX86_BUILTIN_PCMPEQD256, IX86_BUILTIN_PCMPEQQ256, + IX86_BUILTIN_PCMPGTB256, IX86_BUILTIN_PCMPGTW256, + IX86_BUILTIN_PCMPGTD256, IX86_BUILTIN_PCMPGTQ256, + IX86_BUILTIN_PHADDW256, IX86_BUILTIN_PHADDD256, + IX86_BUILTIN_PHADDSW256, IX86_BUILTIN_PHSUBW256, + IX86_BUILTIN_PHSUBD256, IX86_BUILTIN_PHSUBSW256, + IX86_BUILTIN_PMADDUBSW256, IX86_BUILTIN_PMADDWD256, + IX86_BUILTIN_PMAXSB256, IX86_BUILTIN_PMAXSW256, + IX86_BUILTIN_PMAXSD256, IX86_BUILTIN_PMAXUB256, + IX86_BUILTIN_PMAXUW256, IX86_BUILTIN_PMAXUD256, + IX86_BUILTIN_PMINSB256, IX86_BUILTIN_PMINSW256, + IX86_BUILTIN_PMINSD256, IX86_BUILTIN_PMINUB256, + IX86_BUILTIN_PMINUW256, IX86_BUILTIN_PMINUD256, + IX86_BUILTIN_PMOVMSKB256, IX86_BUILTIN_PMOVSXBW256, + IX86_BUILTIN_PMOVSXBD256, IX86_BUILTIN_PMOVSXBQ256, + IX86_BUILTIN_PMOVSXWD256, IX86_BUILTIN_PMOVSXWQ256, + IX86_BUILTIN_PMOVSXDQ256, IX86_BUILTIN_PMOVZXBW256, + IX86_BUILTIN_PMOVZXBD256, IX86_BUILTIN_PMOVZXBQ256, + IX86_BUILTIN_PMOVZXWD256, IX86_BUILTIN_PMOVZXWQ256, + IX86_BUILTIN_PMOVZXDQ256, IX86_BUILTIN_PMULDQ256, + IX86_BUILTIN_PMULHRSW256, IX86_BUILTIN_PMULHUW256, + IX86_BUILTIN_PMULHW256, IX86_BUILTIN_PMULLW256, + IX86_BUILTIN_PMULLD256, IX86_BUILTIN_PMULUDQ256, + IX86_BUILTIN_POR256, IX86_BUILTIN_PSADBW256, + IX86_BUILTIN_PSHUFB256, IX86_BUILTIN_PSHUFD256, + IX86_BUILTIN_PSHUFHW256, IX86_BUILTIN_PSHUFLW256, + IX86_BUILTIN_PSIGNB256, IX86_BUILTIN_PSIGNW256, + IX86_BUILTIN_PSIGND256, IX86_BUILTIN_PSLLDQI256, + IX86_BUILTIN_PSLLWI256, IX86_BUILTIN_PSLLW256, + IX86_BUILTIN_PSLLDI256, IX86_BUILTIN_PSLLD256, + IX86_BUILTIN_PSLLQI256, IX86_BUILTIN_PSLLQ256, + IX86_BUILTIN_PSRAWI256, IX86_BUILTIN_PSRAW256, + IX86_BUILTIN_PSRADI256, IX86_BUILTIN_PSRAD256, + IX86_BUILTIN_PSRLDQI256, IX86_BUILTIN_PSRLWI256, + IX86_BUILTIN_PSRLW256, IX86_BUILTIN_PSRLDI256, + IX86_BUILTIN_PSRLD256, IX86_BUILTIN_PSRLQI256, + IX86_BUILTIN_PSRLQ256, IX86_BUILTIN_PSUBB256, + IX86_BUILTIN_PSUBW256, IX86_BUILTIN_PSUBD256, + IX86_BUILTIN_PSUBQ256, IX86_BUILTIN_PSUBSB256, + IX86_BUILTIN_PSUBSW256, IX86_BUILTIN_PSUBUSB256, + IX86_BUILTIN_PSUBUSW256, IX86_BUILTIN_PUNPCKHBW256, + IX86_BUILTIN_PUNPCKHWD256, IX86_BUILTIN_PUNPCKHDQ256, + IX86_BUILTIN_PUNPCKHQDQ256, IX86_BUILTIN_PUNPCKLBW256, + IX86_BUILTIN_PUNPCKLWD256, IX86_BUILTIN_PUNPCKLDQ256, + IX86_BUILTIN_PUNPCKLQDQ256, IX86_BUILTIN_PXOR256, + IX86_BUILTIN_MOVNTDQA256, IX86_BUILTIN_VBROADCASTSS_PS, + IX86_BUILTIN_VBROADCASTSS_PS256, + IX86_BUILTIN_VBROADCASTSD_PD256, + IX86_BUILTIN_VBROADCASTSI256, IX86_BUILTIN_PBLENDD256, + IX86_BUILTIN_PBLENDD128, IX86_BUILTIN_PBROADCASTB256, + IX86_BUILTIN_PBROADCASTW256, IX86_BUILTIN_PBROADCASTD256, + IX86_BUILTIN_PBROADCASTQ256, IX86_BUILTIN_PBROADCASTB128, + IX86_BUILTIN_PBROADCASTW128, IX86_BUILTIN_PBROADCASTD128, + IX86_BUILTIN_PBROADCASTQ128, IX86_BUILTIN_VPERMVARSI256, + IX86_BUILTIN_VPERMDF256, IX86_BUILTIN_VPERMVARSF256, + IX86_BUILTIN_VPERMDI256, 
IX86_BUILTIN_VPERMTI256, + IX86_BUILTIN_VEXTRACT128I256, IX86_BUILTIN_VINSERT128I256, + IX86_BUILTIN_MASKLOADD, IX86_BUILTIN_MASKLOADQ, + IX86_BUILTIN_MASKLOADD256, IX86_BUILTIN_MASKLOADQ256, + IX86_BUILTIN_MASKSTORED, IX86_BUILTIN_MASKSTOREQ, + IX86_BUILTIN_MASKSTORED256, IX86_BUILTIN_MASKSTOREQ256, + IX86_BUILTIN_PSLLVV4DI, IX86_BUILTIN_PSLLVV2DI, + IX86_BUILTIN_PSLLVV8SI, IX86_BUILTIN_PSLLVV4SI, + IX86_BUILTIN_PSRAVV8SI, IX86_BUILTIN_PSRAVV4SI, + IX86_BUILTIN_PSRLVV4DI, IX86_BUILTIN_PSRLVV2DI, + IX86_BUILTIN_PSRLVV8SI, IX86_BUILTIN_PSRLVV4SI, + IX86_BUILTIN_GATHERSIV2DF, IX86_BUILTIN_GATHERSIV4DF, + IX86_BUILTIN_GATHERDIV2DF, IX86_BUILTIN_GATHERDIV4DF, + IX86_BUILTIN_GATHERSIV4SF, IX86_BUILTIN_GATHERSIV8SF, + IX86_BUILTIN_GATHERDIV4SF, IX86_BUILTIN_GATHERDIV8SF, + IX86_BUILTIN_GATHERSIV2DI, IX86_BUILTIN_GATHERSIV4DI, + IX86_BUILTIN_GATHERDIV2DI, IX86_BUILTIN_GATHERDIV4DI, + IX86_BUILTIN_GATHERSIV4SI, IX86_BUILTIN_GATHERSIV8SI, + IX86_BUILTIN_GATHERDIV4SI, IX86_BUILTIN_GATHERDIV8SI. + (bdesc_special_args): Add IX86_BUILTIN_MOVNTDQA256, + IX86_BUILTIN_MASKLOADD, IX86_BUILTIN_MASKLOADQ, + IX86_BUILTIN_MASKLOADD256, IX86_BUILTIN_MASKLOADQ256, + IX86_BUILTIN_MASKSTORED, IX86_BUILTIN_MASKSTOREQ, + IX86_BUILTIN_MASKSTORED256, IX86_BUILTIN_MASKSTOREQ256. + (bdesc_args): Add IX86_BUILTIN_MPSADBW256, + IX86_BUILTIN_PABSB256, IX86_BUILTIN_PABSW256, + IX86_BUILTIN_PABSD256, IX86_BUILTIN_PACKSSDW256, + IX86_BUILTIN_PACKSSWB256, IX86_BUILTIN_PACKUSDW256, + IX86_BUILTIN_PACKUSWB256, IX86_BUILTIN_PADDB256, + IX86_BUILTIN_PADDW256, IX86_BUILTIN_PADDD256, + IX86_BUILTIN_PADDQ256, IX86_BUILTIN_PADDSB256, + IX86_BUILTIN_PADDSW256, IX86_BUILTIN_PADDUSB256, + IX86_BUILTIN_PADDUSW256, IX86_BUILTIN_PALIGNR256, + IX86_BUILTIN_AND256I, IX86_BUILTIN_ANDNOT256I, + IX86_BUILTIN_PAVGB256, IX86_BUILTIN_PAVGW256, + IX86_BUILTIN_PBLENDVB256, IX86_BUILTIN_PBLENDVW256, + IX86_BUILTIN_PCMPEQB256, IX86_BUILTIN_PCMPEQW256, + IX86_BUILTIN_PCMPEQD256, IX86_BUILTIN_PCMPEQQ256, + IX86_BUILTIN_PCMPGTB256, IX86_BUILTIN_PCMPGTW256, + IX86_BUILTIN_PCMPGTD256, IX86_BUILTIN_PCMPGTQ256, + IX86_BUILTIN_PHADDW256, IX86_BUILTIN_PHADDD256, + IX86_BUILTIN_PHADDSW256, IX86_BUILTIN_PHSUBW256, + IX86_BUILTIN_PHSUBD256, IX86_BUILTIN_PHSUBSW256, + IX86_BUILTIN_PMADDUBSW256, IX86_BUILTIN_PMADDWD256, + IX86_BUILTIN_PMAXSB256, IX86_BUILTIN_PMAXSW256, + IX86_BUILTIN_PMAXSD256, IX86_BUILTIN_PMAXUB256, + IX86_BUILTIN_PMAXUW256, IX86_BUILTIN_PMAXUD256, + IX86_BUILTIN_PMINSB256, IX86_BUILTIN_PMINSW256, + IX86_BUILTIN_PMINSD256, IX86_BUILTIN_PMINUB256, + IX86_BUILTIN_PMINUW256, IX86_BUILTIN_PMINUD256, + IX86_BUILTIN_PMOVMSKB256, IX86_BUILTIN_PMOVSXBW256, + IX86_BUILTIN_PMOVSXBD256, IX86_BUILTIN_PMOVSXBQ256, + IX86_BUILTIN_PMOVSXWD256, IX86_BUILTIN_PMOVSXWQ256, + IX86_BUILTIN_PMOVSXDQ256, IX86_BUILTIN_PMOVZXBW256, + IX86_BUILTIN_PMOVZXBD256, IX86_BUILTIN_PMOVZXBQ256, + IX86_BUILTIN_PMOVZXWD256, IX86_BUILTIN_PMOVZXWQ256, + IX86_BUILTIN_PMOVZXDQ256, IX86_BUILTIN_PMULDQ256, + IX86_BUILTIN_PMULHRSW256, IX86_BUILTIN_PMULHUW256, + IX86_BUILTIN_PMULHW256, IX86_BUILTIN_PMULLW256, + IX86_BUILTIN_PMULLD256, IX86_BUILTIN_PMULUDQ256, + IX86_BUILTIN_POR256, IX86_BUILTIN_PSADBW256, + IX86_BUILTIN_PSHUFB256, IX86_BUILTIN_PSHUFD256, + IX86_BUILTIN_PSHUFHW256, IX86_BUILTIN_PSHUFLW256, + IX86_BUILTIN_PSIGNB256, IX86_BUILTIN_PSIGNW256, + IX86_BUILTIN_PSIGND256, IX86_BUILTIN_PSLLDQI256, + IX86_BUILTIN_PSLLWI256, IX86_BUILTIN_PSLLW256, + IX86_BUILTIN_PSLLDI256, IX86_BUILTIN_PSLLD256, + IX86_BUILTIN_PSLLQI256, IX86_BUILTIN_PSLLQ256, + IX86_BUILTIN_PSRAWI256, 
IX86_BUILTIN_PSRAW256, + IX86_BUILTIN_PSRADI256, IX86_BUILTIN_PSRAD256, + IX86_BUILTIN_PSRLDQI256, IX86_BUILTIN_PSRLWI256, + IX86_BUILTIN_PSRLW256, IX86_BUILTIN_PSRLDI256, + IX86_BUILTIN_PSRLD256, IX86_BUILTIN_PSRLQI256, + IX86_BUILTIN_PSRLQ256, IX86_BUILTIN_PSUBB256, + IX86_BUILTIN_PSUBW256, IX86_BUILTIN_PSUBD256, + IX86_BUILTIN_PSUBQ256, IX86_BUILTIN_PSUBSB256, + IX86_BUILTIN_PSUBSW256, IX86_BUILTIN_PSUBUSB256, + IX86_BUILTIN_PSUBUSW256, IX86_BUILTIN_PUNPCKHBW256, + IX86_BUILTIN_PUNPCKHWD256, IX86_BUILTIN_PUNPCKHDQ256, + IX86_BUILTIN_PUNPCKHQDQ256, IX86_BUILTIN_PUNPCKLBW256, + IX86_BUILTIN_PUNPCKLWD256, IX86_BUILTIN_PUNPCKLDQ256, + IX86_BUILTIN_PUNPCKLQDQ256, IX86_BUILTIN_PXOR256, + IX86_BUILTIN_VBROADCASTSS_PS, IX86_BUILTIN_VBROADCASTSS_PS256, + IX86_BUILTIN_VBROADCASTSD_PD256, + IX86_BUILTIN_VBROADCASTSI256, IX86_BUILTIN_PBLENDD256, + IX86_BUILTIN_PBLENDD128, IX86_BUILTIN_PBROADCASTB256, + IX86_BUILTIN_PBROADCASTW256, IX86_BUILTIN_PBROADCASTD256, + IX86_BUILTIN_PBROADCASTQ256, IX86_BUILTIN_PBROADCASTB128, + IX86_BUILTIN_PBROADCASTW128, IX86_BUILTIN_PBROADCASTD128, + IX86_BUILTIN_PBROADCASTQ128, IX86_BUILTIN_VPERMVARSI256, + IX86_BUILTIN_VPERMDF256, IX86_BUILTIN_VPERMVARSF256, + IX86_BUILTIN_VPERMDI256, IX86_BUILTIN_VPERMTI256, + IX86_BUILTIN_VEXTRACT128I256, IX86_BUILTIN_VINSERT128I256, + IX86_BUILTIN_PSLLVV4DI, IX86_BUILTIN_PSLLVV2DI, + IX86_BUILTIN_PSLLVV8SI, IX86_BUILTIN_PSLLVV4SI, + IX86_BUILTIN_PSRAVV8SI, IX86_BUILTIN_PSRAVV4SI, + IX86_BUILTIN_PSRLVV4DI, IX86_BUILTIN_PSRLVV2DI, + IX86_BUILTIN_PSRLVV8SI, IX86_BUILTIN_PSRLVV4SI. + (ix86_init_mmx_sse_builtins): Add IX86_BUILTIN_GATHERSIV2DF, + IX86_BUILTIN_GATHERSIV4DF, IX86_BUILTIN_GATHERDIV2DF, + IX86_BUILTIN_GATHERDIV4DF, IX86_BUILTIN_GATHERSIV4SF, + IX86_BUILTIN_GATHERSIV8SF, IX86_BUILTIN_GATHERDIV4SF, + IX86_BUILTIN_GATHERDIV8SF, IX86_BUILTIN_GATHERSIV2DI, + IX86_BUILTIN_GATHERSIV4DI, IX86_BUILTIN_GATHERDIV2DI, + IX86_BUILTIN_GATHERDIV4DI, IX86_BUILTIN_GATHERSIV4SI, + IX86_BUILTIN_GATHERSIV8SI, IX86_BUILTIN_GATHERDIV4SI, + IX86_BUILTIN_GATHERDIV8SI. + (ix86_preferred_simd_mode): Support AVX2 modes. + (ix86_expand_args_builtin): Support AVX2 built-ins. + (ix86_expand_special_args_builtin): Likewise. + (ix86_expand_builtin): Likewise. + * config/i386/i386.md (UNSPEC_VPERMSI): New. + (UNSPEC_VPERMDF): Likewise. + (UNSPEC_VPERMSF): Likewise. + (UNSPEC_VPERMDI): Likewise. + (UNSPEC_VPERMTI): Likewise. + (UNSPEC_GATHER): Likewise. + (ssemodesuffix): Extend. + * config/i386/immintrin.h: Include avx2intrin.h when __AVX2__ + is defined. + * config/i386/predicates.md (const1248_operand): New. + * config/i386/sse.md (VI_AVX2): New. + (VI1_AVX2): Likewise. + (VI2_AVX2): Likewise. + (VI4_AVX2): Likewise. + (VI8_AVX2): Likewise. + (VIMAX_AVX2): Likewise. + (SSESCALARMODE): Likewise. + (VI12_AVX2): Likewise. + (VI24_AVX2): Likewise. + (VI124_AVX2): Likewise. + (VI248_AVX2): Likewise. + (VI48_AVX2): Likewise. + (VI4SD_AVX2): Likewise. + (V48_AVX2): Likewise. + (avx2modesuffix): Likewise. + (sse_avx2): Likewise. + (sse2_avx2): Likewise. + (ssse3_avx2): Likewise. + (sse4_1_avx2): Likewise. + (avx_avx2): Likewise. + (lshift)<code_iterator>: Likewise. + (lshift_insn): Likewise. + (lshift)<code_attr>: Likewise. + (SSESHORTMODE): Likewise. + (SSELONGMODE): Likewise. + (SSEBYTEMODE): Likewise. + (AVXTOSSEMODE): Likewise. + (shortmode): Likewise. + (ssescalarmodesuffix): Update. + (sseunpackmode): Likewise. + (ssepackmode): Likewise. + (AVX256MODEI): New. + (AVX256MODE124): Likewise. + (AVX256MODE1248): Likewise. 
+ (AVX256MODE248): Likewise. + (AVXMODE48P_SI): Likewise. + (AVXMODE48P_SI): Likewise. + (AVXMODE48P_DI): Likewise. + (AVXMODE48P_DI): Likewise. + (gthrfirstp): Likewise. + (gthrlastp): Likewise. + (avx2): Likewise. + (ssevecsize): Likewise. + (ssedoublesizemode): Likewise. + (avxvecmode): Likewise. + (avxvecsize): Likewise. + (avxhalfvecmode): Likewise. + (avxscalarmode): Likewise. + (avxpermvecmode): Likewise. + (avxmodesuffixp): Likewise. + (avxmodesuffix): Likewise. + (avx2_vec_dupv4sf): New. + (avx2_vec_dupv8sf): Likewise. + (avx2_interleave_highv4di): Likewise. + (avx2_interleave_lowv4di): Likewise. + (<plusminus_insn><mode>3): Update. + (*<plusminus_insn><mode>3): Likewise. + (sse2_<plusminus_insn><mode>3): Rename to ... + (<sse2_avx2>_<plusminus_insn><mode>3): ... this. Update. + (*sse2_<plusminus_insn><mode>3): Likewise. + (*<sse2_avx2>_<plusminus_insn><mode>3): Likewise. + (mulv8hi3): Likewise. + (mul<mode>3): Likewise. + (*mulv8hi3): Likewise. + (*mul<mode>3): Likewise. + (<s>mulv8hi3_highpart): Likewise. + (<s>mul<mode>3_highpart): Likewise. + (*<s>mulv8hi3_highpart): Likewise. + (*<s>mul<mode>3_highpart): Likewise. + (avx2_umulv4siv4di3): Likewise. + (*avx_umulv4siv4di3): Likewise. + (sse4_1_mulv2siv2di3): Likewise. + (<sse4_1_avx2>_mul<shortmode><mode>3): Likewise. + (*sse4_1_mulv2siv2di3): Likewise. + (*<sse4_1_avx2>_mulv2siv2di3): Likewise. + (avx2_pmaddwd): New. + (*avx2_pmaddwd): Likewise. + (mulv4si3): Rename to ... + (mul<mode>3): ... this. Update. + (*sse4_1_mulv4si3): Likewise. + (*<sse4_1_avx2>_mul<mode>3): Likewise. + (ashr<mode>3): Update. + (avx2_lshrqv4di3): New. + (lshr<mode>3): Update. + (avx2_lshlqv4di3): New. + (avx2_lshl<mode>3): Likewise. + (sse2_ashlv1ti3): Rename to ... + (<sse2_avx2>_ashl<mode>3): ... this. Update. + (avx2_<code><mode>3)<umaxmin>: New. + (*avx2_<code><mode>3)<umaxmin>: Likewise. + (avx2_<code><mode>3)<smaxmin>: New. + (*avx2_<code><mode>3)<smaxmin>: Likewise. + (avx2_eq<mode>3): Likewise. + (*avx2_eq<mode>3): Likewise. + (avx2_gt<mode>3): Likewise. + (sse2_andnot<mode>3): Rename to ... + (<sse2_avx2>_andnot<mode>3): ... this. Update. + (*andnot<mode>3): Update. + (<code><mode>3)<any_logic>: Update. + (*<code><mode>3)<any_logic>: Likewise. + (sse2_packsswb): Rename to ... + (<sse2_avx2>_packsswb): ... this. Update. + (sse2_packssdw): Likewise. + (<sse2_avx2>_packssdw): Likewise. + (sse2_packuswb): Likewise. + (<sse2_avx2>_packuswb): Likewise. + (avx2_interleave_highv32qi): New. + (avx2_interleave_lowv32qi): Likewise. + (avx2_interleave_highv16hi): Likewise. + (avx2_interleave_lowv16hi): Likewise. + (avx2_interleave_highv8si): Likewise. + (avx2_interleave_lowv8si): Likewise. + (avx2_pshufd): New. + (avx2_pshufd_1): Likewise. + (avx2_pshuflwv3): Likewise. + (avx2_pshuflw_1): Likewise. + (avx2_pshufhwv3): Likewise. + (avx2_pshufhw_1): Likewise. + (avx2_uavgv32qi3): Likewise. + (*avx2_uavgv32qi3): Likewise. + (avx2_uavgv16hi3): Likewise. + (*avx2_uavgv16hi3): Likewise. + (sse2_psadbw): Rename to ... + (<sse2_avx2>_psadbw): ... this. Update. + (avx2_pmovmskb): New. + (avx2_phaddwv16hi3): Likewise. + (avx2_phadddv8si3): Likewise. + (avx2_phaddswv16hi3): Likewise. + (avx2_phsubwv16hi3): Likewise. + (avx2_phsubdv8si3): Likewise. + (avx2_phsubswv16hi3): Likewise. + (avx2_pmaddubsw256): Likewise. + (avx2_umulhrswv16hi3): Likewise. + (*avx2_umulhrswv16hi3): Likewise. + (ssse3_pshufbv16qi3): Rename to ... + (<ssse3_avx2>_pshufb<mode>3): ... this. Update. + (ssse3_psign<mode>3): Likewise. + (<ssse3_avx2>_psign<mode>3): Likewise. 
+ (ssse3_palignrti): Likewise. + (<ssse3_avx2>_palignr<mode>): Likewise. + (abs<mode>2): Likewise. + (sse4_1_movntdqa): Rename to ... + (<sse4_1_avx2>_movntdqa): ... this. Update. + (sse4_1_mpsadbw): Likewise. + (<sse4_1_avx2>_mpsadbw): Likewise. + (avx2_packusdw): New. + (sse4_1_pblendvb): Rename to ... + (<sse4_1_avx2>_pblendvb): ... this. Update. + (sse4_1_pblendw): Likewise. + (<sse4_1_avx2>_pblendw): Likewise. + (avx2_pblendd<mode>): New. + (avx2_<code>v16qiv16hi2): Likewise. + (avx2_<code>v8qiv8si2): Likewise. + (avx2_<code>v8hiv8si2): Likewise. + (avx2_<code>v4qiv4di2): Likewise. + (avx2_<code>v4hiv4di2): Likewise. + (avx2_<code>v4siv4di2): Likewise. + (avx2_pbroadcast<mode>): Likewise. + (avx2_permvarv8si): Likewise. + (avx2_permv4df): Likewise. + (avx2_permvarv8sf): Likewise. + (avx2_permv4di): Likewise. + (avx2_permv2ti): Likewise. + (avx2_vec_dupv4df): Likewise. + (avx2_vbroadcasti128_<mode>): Likewise. + (avx2_vec_set_lo_v4di): Likewise. + (avx2_vec_set_hi_v4di): Likewise. + (avx_maskload<ssemodesuffix><avxsizesuffix>): Rename to ... + (<avx_avx2>_maskload<avx2modesuffix><avxmodesuffix>): ... this. + Update. + (avx_maskstore<ssemodesuffix><avxsizesuffix>): Likewise. + (<avx_avx2>_maskstore<avx2modesuffix><avxmodesuffix>): Likewise. + (*avx2_maskmov<avx2modesuffix><avxmodesuffix>): New. + (avx2_extracti128): Likewise. + (avx2_inserti128): Likewise. + (avx2_ashrvv8si): Likewise. + (avx2_ashrvv4si): Likewise. + (avx2_<lshift>vv8si): Likewise. + (avx2_<lshift>v<mode>): Likewise. + (avx2_<lshift>vv2di): Likewise. + (avx2_gathersi<mode>): Likewise. + (*avx2_gathersi<mode>): Likewise. + (avx2_gatherdi<mode>): Likewise. + (*avx2_gatherdi<mode>): Likewise. + (avx2_gatherdi<mode>256): Likewise. + (*avx2_gatherdi<mode>256): Likewise. + * doc/extend.texi: Document AVX2 built-in functions. + * doc/invoke.texi: Document -mavx2. + 2011-08-22 Matthias Klose <doko@debian.org> Revert: diff --git a/gcc/config.gcc b/gcc/config.gcc index e8e0eeb..b8addaf 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -352,7 +352,7 @@ i[34567]86-*-*) nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h immintrin.h x86intrin.h avxintrin.h xopintrin.h ia32intrin.h cross-stdarg.h lwpintrin.h popcntintrin.h - lzcntintrin.h bmiintrin.h tbmintrin.h" + lzcntintrin.h bmiintrin.h tbmintrin.h avx2intrin.h" ;; x86_64-*-*) cpu_type=i386 @@ -364,7 +364,7 @@ x86_64-*-*) nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h immintrin.h x86intrin.h avxintrin.h xopintrin.h ia32intrin.h cross-stdarg.h lwpintrin.h popcntintrin.h - lzcntintrin.h bmiintrin.h tbmintrin.h" + lzcntintrin.h bmiintrin.h tbmintrin.h avx2intrin.h" need_64bit_hwint=yes ;; ia64-*-*) diff --git a/gcc/config/i386/avx2intrin.h b/gcc/config/i386/avx2intrin.h new file mode 100644 index 0000000..3c8f360 --- /dev/null +++ b/gcc/config/i386/avx2intrin.h @@ -0,0 +1,1874 @@ +/* Copyright (C) 2011 + Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _IMMINTRIN_H_INCLUDED +# error "Never use <avx2intrin.h> directly; include <immintrin.h> instead." +#endif + +/* Sum absolute 8-bit integer difference of adjacent groups of 4 + byte integers in the first 2 operands. Starting offsets within + operands are determined by the 3rd mask operand. */ +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M) +{ + return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X, + (__v32qi)__Y, __M); +} +#else +#define _mm256_mpsadbw_epu8(X, Y, M) \ + ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X), \ + (__v32qi)(__m256i)(Y), (int)(M))) +#endif + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_abs_epi8 (__m256i __A) +{ + return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_abs_epi16 (__m256i __A) +{ + return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_abs_epi32 (__m256i __A) +{ + return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_packs_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_packs_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_packus_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_packus_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_add_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_paddb256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_add_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_paddw256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_add_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_paddd256 ((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_add_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_paddq256 ((__v4di)__A, (__v4di)__B); +} + +extern __inline __m256i +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_adds_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_adds_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_adds_epu8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_adds_epu16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N) +{ + return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A, + (__v4di)__B, + __N * 8); +} +#else +/* In that case (__N*8) will be in vreg, and insn will not be matched. */ +/* Use define instead */ +#define _mm256_alignr_epi8(A, B, N) \ + ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), \ + (int)(N) * 8)) +#endif + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_and_si256 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_andsi256 ((__v4di)__A, (__v4di)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_andnot_si256 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_avg_epu8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_avg_epu16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M) +{ + return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X, + (__v32qi)__Y, + (__v32qi)__M); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M) +{ + return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X, + (__v16hi)__Y, + __M); +} +#else +#define _mm256_blend_epi16(X, Y, M) \ + ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X), \ + (__v16hi)(__m256i)(Y), (int)(M))) +#endif + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pcmpeqb256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pcmpeqw256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epi32 (__m256i __A, __m256i __B) +{ + return 
(__m256i)__builtin_ia32_pcmpeqd256 ((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pcmpeqq256 ((__v4di)__A, (__v4di)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pcmpgtb256 ((__v32qi)__A, + (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pcmpgtw256 ((__v16hi)__A, + (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pcmpgtd256 ((__v8si)__A, + (__v8si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pcmpgtq256 ((__v4di)__A, (__v4di)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hadd_epi16 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X, + (__v16hi)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hadd_epi32 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hadds_epi16 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X, + (__v16hi)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hsub_epi16 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X, + (__v16hi)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hsub_epi32 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hsubs_epi16 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X, + (__v16hi)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maddubs_epi16 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X, + (__v32qi)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_madd_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A, + (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_epi32 (__m256i __A, __m256i __B) 
+{ + return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_epu8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_epu16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_epu32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_epu8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_epu16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_epu32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movemask_epi8 (__m256i __A) +{ + return __builtin_ia32_pmovmskb256 ((__v32qi)__A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi8_epi16 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi8_epi32 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi8_epi64 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi16_epi32 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi16_epi64 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi32_epi64 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu8_epi16 
(__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu8_epi32 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu8_epi64 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu16_epi32 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu16_epi64 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu32_epi64 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mul_epi32 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mulhrs_epi16 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X, + (__v16hi)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mulhi_epu16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mulhi_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mullo_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmullw256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mullo_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmulld256 ((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mul_epu32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_or_si256 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_por256 ((__v4di)__A, (__v4di)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sad_epu8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_epi8 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X, + (__v32qi)__Y); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_epi32 (__m256i __A, const int __mask) +{ + return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask); +} + +extern 
__inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shufflehi_epi16 (__m256i __A, const int __mask) +{ + return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shufflelo_epi16 (__m256i __A, const int __mask) +{ + return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask); +} +#else +#define _mm256_shuffle_epi32(A, N) \ + ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N))) +#define _mm256_shufflehi_epi16(A, N) \ + ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N))) +#define _mm256_shufflelo_epi16(A, N) \ + ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N))) +#endif + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sign_epi8 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sign_epi16 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sign_epi32 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_slli_si256 (__m256i __A, const int __N) +{ + return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8); +} +#else +#define _mm256_slli_si256(A, N) \ + ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8)) +#endif + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_slli_epi16 (__m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sll_epi16 (__m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_slli_epi32 (__m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sll_epi32 (__m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_slli_epi64 (__m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sll_epi64 (__m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srai_epi16 (__m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sra_epi16 (__m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B); +} + +extern __inline __m256i +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srai_epi32 (__m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sra_epi32 (__m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srli_si256 (__m256i __A, const int __N) +{ + return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8); +} +#else +#define _mm256_srli_si256(A, N) \ + ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8)) +#endif + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srli_epi16 (__m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srl_epi16 (__m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srli_epi32 (__m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srl_epi32 (__m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srli_epi64 (__m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srl_epi64 (__m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sub_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_psubb256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sub_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_psubw256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sub_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_psubd256 ((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sub_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_psubq256 ((__v4di)__A, (__v4di)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_subs_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_subs_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_subs_epu8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline 
__m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_subs_epu16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpackhi_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpackhi_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpackhi_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpackhi_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpacklo_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpacklo_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpacklo_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpacklo_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_xor_si256 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pxor256 ((__v4di)__A, (__v4di)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_stream_load_si256 (__m256i const *__X) +{ + return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_broadcastss_ps (__m128 __X) +{ + return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcastss_ps (__m128 __X) +{ + return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcastsd_pd (__m128d __X) +{ + return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_broadcastsi128_si256 (__m128i __X) +{ + return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M) +{ + return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X, + (__v4si)__Y, + __M); +} +#else +#define 
_mm_blend_epi32(X, Y, M) \ + ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X), \ + (__v4si)(__m128i)(Y), (int)(M))) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M) +{ + return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X, + (__v8si)__Y, + __M); +} +#else +#define _mm256_blend_epi32(X, Y, M) \ + ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X), \ + (__v8si)(__m256i)(Y), (int)(M))) +#endif + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcastb_epi8 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcastw_epi16 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcastd_epi32 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcastq_epi64 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_broadcastb_epi8 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_broadcastw_epi16 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_broadcastd_epi32 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_broadcastq_epi64 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute4x64_pd (__m256d __X, const int __M) +{ + return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M); +} +#else +#define _mm256_permute4x64_pd(X, M) \ + ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M))) +#endif + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutevar8x32_ps (__m256 __X, __m256 __Y) +{ + return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X,(__v8sf)__Y); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute4x64_epi64 (__m256i __X, const int __M) +{ + return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M); +} +#else +#define _mm256_permute4x64_epi64(X, M) \ + ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M))) +#endif + + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const 
int __M) +{ + return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M); +} +#else +#define _mm256_permute2x128_si256(X, Y, M) \ + ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M))) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extracti128_si256 (__m256i __X, const int __M) +{ + return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M); +} +#else +#define _mm256_extracti128_si256(X, M) \ + ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M))) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M) +{ + return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M); +} +#else +#define _mm256_inserti128_si256(X, Y, M) \ + ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), \ + (__v2di)(__m128i)(Y), \ + (int)(M))) +#endif + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskload_epi32 (int const *__X, __m256i __M ) +{ + return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X, + (__v8si)__M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskload_epi64 (long long const *__X, __m256i __M ) +{ + return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X, + (__v4di)__M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskload_epi32 (int const *__X, __m128i __M ) +{ + return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X, + (__v4si)__M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskload_epi64 (long long const *__X, __m128i __M ) +{ + return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X, + (__v2di)__M); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y ) +{ + __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y ) +{ + __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y ) +{ + __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y ) +{ + __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sllv_epi32 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sllv_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sllv_epi64 (__m256i __X, __m256i __Y) +{ + 
return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sllv_epi64 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srav_epi32 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srav_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srlv_epi32 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srlv_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srlv_epi64 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srlv_epi64 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i32gather_pd (double const *base, __m128i index, const int scale) +{ + __v2df src = _mm_setzero_pd (); + __v2df mask = _mm_cmpeq_pd (src, src); + + return (__m128d) __builtin_ia32_gathersiv2df (src, + base, + (__v4si)index, + mask, + scale); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i32gather_pd (__m128d src, double const *base, __m128i index, + __m128d mask, const int scale) +{ + return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)src, + base, + (__v4si)index, + (__v2df)mask, + scale); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i32gather_pd (double const *base, __m128i index, const int scale) +{ + __v4df src = _mm256_setzero_pd (); + __v4df mask = _mm256_set1_pd((double)(long long int) -1); + + return (__m256d) __builtin_ia32_gathersiv4df (src, + base, + (__v4si)index, + mask, + scale); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i32gather_pd (__m256d src, double const *base, + __m128i index, __m256d mask, const int scale) +{ + return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)src, + base, + (__v4si)index, + (__v4df)mask, + scale); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i64gather_pd (double const *base, __m128i index, const int scale) +{ + __v2df src = _mm_setzero_pd (); + __v2df mask = _mm_cmpeq_pd (src, src); + + return (__m128d) __builtin_ia32_gatherdiv2df (src, + base, + (__v2di)index, + mask, + scale); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i64gather_pd (__m128d src, double const *base, __m128i index, + __m128d mask, const int scale) +{ + return (__m128d) 
__builtin_ia32_gatherdiv2df ((__v2df)src, + base, + (__v2di)index, + (__v2df)mask, + scale); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i64gather_pd (double const *base, __m256i index, const int scale) +{ + __v4df src = _mm256_setzero_pd (); + __v4df mask = _mm256_set1_pd((double)(long long int) -1); + + return (__m256d) __builtin_ia32_gatherdiv4df (src, + base, + (__v4di)index, + mask, + scale); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i64gather_pd (__m256d src, double const *base, + __m256i index, __m256d mask, const int scale) +{ + return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)src, + base, + (__v4di)index, + (__v4df)mask, + scale); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i32gather_ps (float const *base, __m128i index, const int scale) +{ + __v4sf src = _mm_setzero_ps (); + __v4sf mask = _mm_cmpeq_ps (src, src); + + return (__m128) __builtin_ia32_gathersiv4sf (src, + base, + (__v4si)index, + mask, + scale); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i32gather_ps (__m128 src, float const *base, __m128i index, + __m128 mask, const int scale) +{ + return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)src, + base, + (__v4si)index, + (__v4sf)mask, + scale); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i32gather_ps (float const *base, __m256i index, const int scale) +{ + __v8sf src = _mm256_setzero_ps (); + __v8sf mask = _mm256_set1_ps((float)(int) -1); + + return (__m256) __builtin_ia32_gathersiv8sf (src, + base, + (__v8si)index, + mask, + scale); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i32gather_ps (__m256 src, float const *base, + __m256i index, __m256 mask, const int scale) +{ + return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)src, + base, + (__v8si)index, + (__v8sf)mask, + scale); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i64gather_ps (float const *base, __m128i index, const int scale) +{ + __v4sf src = _mm_setzero_ps (); + __v4sf mask = _mm_cmpeq_ps (src, src); + + return (__m128) __builtin_ia32_gatherdiv4sf (src, + base, + (__v2di)index, + mask, + scale); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i64gather_ps (__m128 src, float const *base, __m128i index, + __m128 mask, const int scale) +{ + return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)src, + base, + (__v2di)index, + (__v4sf)mask, + scale); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i64gather_ps (float const *base, __m256i index, const int scale) +{ + __v4sf src = _mm_setzero_ps (); + __v4sf mask = _mm_cmpeq_ps (src, src); + + return (__m128) __builtin_ia32_gatherdiv4sf256 (src, + base, + (__v4di)index, + mask, + scale); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i64gather_ps (__m128 src, float const *base, + __m256i index, __m128 mask, const int scale) +{ + return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)src, + base, + (__v4di)index, + (__v4sf)mask, + scale); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
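/* Editor's note: an illustrative sketch (not part of the patch) of the
   floating-point gather intrinsics defined above.  The index vector selects
   elements at base + index * scale; the masked forms load only the lanes
   whose mask sign bit is set and take the remaining lanes from src.
   Assumes <immintrin.h> and -mavx2; names are hypothetical.

     static __m128d demo_gather (const double *table)
     {
       __m128i idx  = _mm_set_epi32 (0, 0, 12, 3);
       __m128d keep = _mm_set_pd (5.0, 6.0);
       __m128d m    = _mm_set_pd (0.0, -1.0);                      // sign bit set in lane 0 only
       __m128d g    = _mm_i32gather_pd (table, idx, 8);            // { table[3], table[12] }
       __m128d gm   = _mm_mask_i32gather_pd (keep, table, idx, m, 8);  // { table[3], 5.0 }
       return _mm_add_pd (g, gm);
     }
*/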
+_mm_i32gather_epi64 (long long int const *base, + __m128i index, const int scale) +{ + __v2di src = __extension__ (__v2di){ 0, 0 }; + __v2di mask = __extension__ (__v2di){ ~0, ~0 }; + + return (__m128i) __builtin_ia32_gathersiv2di (src, + base, + (__v4si)index, + mask, + scale); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i32gather_epi64 (__m128i src, long long int const *base, + __m128i index, __m128i mask, const int scale) +{ + return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)src, + base, + (__v4si)index, + (__v2di)mask, + scale); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i32gather_epi64 (long long int const *base, + __m128i index, const int scale) +{ + __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 }; + __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 }; + + return (__m256i) __builtin_ia32_gathersiv4di (src, + base, + (__v4si)index, + mask, + scale); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i32gather_epi64 (__m256i src, long long int const *base, + __m128i index, __m256i mask, const int scale) +{ + return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)src, + base, + (__v4si)index, + (__v4di)mask, + scale); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i64gather_epi64 (long long int const *base, + __m128i index, const int scale) +{ + __v2di src = __extension__ (__v2di){ 0, 0 }; + __v2di mask = __extension__ (__v2di){ ~0, ~0 }; + + return (__m128i) __builtin_ia32_gatherdiv2di (src, + base, + (__v2di)index, + mask, + scale); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i64gather_epi64 (__m128i src, long long int const *base, __m128i index, + __m128i mask, const int scale) +{ + return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)src, + base, + (__v2di)index, + (__v2di)mask, + scale); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i64gather_epi64 (long long int const *base, + __m256i index, const int scale) +{ + __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 }; + __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 }; + + return (__m256i) __builtin_ia32_gatherdiv4di (src, + base, + (__v4di)index, + mask, + scale); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i64gather_epi64 (__m256i src, long long int const *base, + __m256i index, __m256i mask, const int scale) +{ + return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)src, + base, + (__v4di)index, + (__v4di)mask, + scale); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i32gather_epi32 (int const *base, __m128i index, const int scale) +{ + __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 }; + __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 }; + + return (__m128i) __builtin_ia32_gathersiv4si (src, + base, + (__v4si)index, + mask, + scale); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i32gather_epi32 (__m128i src, int const *base, __m128i index, + __m128i mask, const int scale) +{ + return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)src, + base, + (__v4si)index, + (__v4si)mask, + scale); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, 
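/* Editor's note: the same gather pattern for 64-bit integer elements, as an
   illustrative sketch that is not part of the patch (assumes <immintrin.h>,
   -mavx2, hypothetical names).  Four long long values are fetched through
   the four 32-bit indices, scaled by the element size.

     static __m256i demo_gather64 (const long long *src)
     {
       __m128i idx = _mm_set_epi32 (9, 5, 2, 0);
       return _mm256_i32gather_epi64 (src, idx, 8);   // { src[0], src[2], src[5], src[9] }
     }
*/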
__always_inline__, __artificial__)) +_mm256_i32gather_epi32 (int const *base, __m256i index, const int scale) +{ + __v8si src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 }; + __v8si mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 }; + + return (__m256i) __builtin_ia32_gathersiv8si (src, + base, + (__v8si)index, + mask, + scale); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i32gather_epi32 (__m256i src, int const *base, + __m256i index, __m256i mask, const int scale) +{ + return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)src, + base, + (__v8si)index, + (__v8si)mask, + scale); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i64gather_epi32 (int const *base, __m128i index, const int scale) +{ + __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 }; + __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 }; + + return (__m128i) __builtin_ia32_gatherdiv4si (src, + base, + (__v2di)index, + mask, + scale); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i64gather_epi32 (__m128i src, int const *base, __m128i index, + __m128i mask, const int scale) +{ + return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)src, + base, + (__v2di)index, + (__v4si)mask, + scale); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i64gather_epi32 (int const *base, __m256i index, const int scale) +{ + __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 }; + __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 }; + + return (__m128i) __builtin_ia32_gatherdiv4si256 (src, + base, + (__v4di)index, + mask, + scale); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i64gather_epi32 (__m128i src, int const *base, + __m256i index, __m128i mask, const int scale) +{ + return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)src, + base, + (__v4di)index, + (__v4si)mask, + scale); +} +#else /* __OPTIMIZE__ */ +#define _mm_i32gather_pd(BASE, INDEX, SCALE) \ + (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (), \ + (double const *)BASE, \ + (__v4si)(__m128i)INDEX, \ + (__v2df)_mm_set1_pd( \ + (double)(long long int) -1), \ + (int)SCALE) + +#define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC, \ + (double const *)BASE, \ + (__v4si)(__m128i)INDEX, \ + (__v2df)(__m128d)MASK, \ + (int)SCALE) + +#define _mm256_i32gather_pd(BASE, INDEX, SCALE) \ + (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (), \ + (double const *)BASE, \ + (__v4si)(__m128i)INDEX, \ + (__v4df)_mm256_set1_pd( \ + (double)(long long int) -1), \ + (int)SCALE) + +#define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ + (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC, \ + (double const *)BASE, \ + (__v4si)(__m128i)INDEX, \ + (__v4df)(__m256d)MASK, \ + (int)SCALE) + +#define _mm_i64gather_pd(BASE, INDEX, SCALE) \ + (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (), \ + (double const *)BASE, \ + (__v2di)(__m128i)INDEX, \ + (__v2df)_mm_set1_pd( \ + (double)(long long int) -1), \ + (int)SCALE) + +#define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC, \ + (double const *)BASE, \ + (__v2di)(__m128i)INDEX, \ + (__v2df)(__m128d)MASK, \ + (int)SCALE) + +#define 
_mm256_i64gather_pd(BASE, INDEX, SCALE) \ + (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (), \ + (double const *)BASE, \ + (__v4di)(__m256i)INDEX, \ + (__v4df)_mm256_set1_pd( \ + (double)(long long int) -1), \ + (int)SCALE) + +#define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ + (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC, \ + (double const *)BASE, \ + (__v4di)(__m256i)INDEX, \ + (__v4df)(__m256d)MASK, \ + (int)SCALE) + +#define _mm_i32gather_ps(BASE, INDEX, SCALE) \ + (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (), \ + (float const *)BASE, \ + (__v4si)(__m128i)INDEX, \ + _mm_set1_ps ((float)(int) -1), \ + (int)SCALE) + +#define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128d)SRC, \ + (float const *)BASE, \ + (__v4si)(__m128i)INDEX, \ + (__v4sf)(__m128d)MASK, \ + (int)SCALE) + +#define _mm256_i32gather_ps(BASE, INDEX, SCALE) \ + (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \ + (float const *)BASE, \ + (__v8si)(__m256i)INDEX, \ + (__v8sf)_mm256_set1_ps ( \ + (float)(int) -1), \ + (int)SCALE) + +#define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ + (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC, \ + (float const *)BASE, \ + (__v8si)(__m256i)INDEX, \ + (__v8sf)(__m256d)MASK, \ + (int)SCALE) + +#define _mm_i64gather_ps(BASE, INDEX, SCALE) \ + (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_pd (), \ + (float const *)BASE, \ + (__v2di)(__m128i)INDEX, \ + (__v4sf)_mm_set1_ps ( \ + (float)(int) -1), \ + (int)SCALE) + +#define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC, \ + (float const *)BASE, \ + (__v2di)(__m128i)INDEX, \ + (__v4sf)(__m128d)MASK, \ + (int)SCALE) + +#define _mm256_i64gather_ps(BASE, INDEX, SCALE) \ + (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (), \ + (float const *)BASE, \ + (__v4di)(__m256i)INDEX, \ + (__v4sf)_mm_set1_ps( \ + (float)(int) -1), \ + (int)SCALE) + +#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC, \ + (float const *)BASE, \ + (__v4di)(__m256i)INDEX, \ + (__v4sf)(__m128)MASK, \ + (int)SCALE) + +#define _mm_i32gather_epi64(BASE, INDEX, SCALE) \ + (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \ + (long long const *)BASE, \ + (__v4si)(__m128i)INDEX, \ + (__v2di)_mm_set1_epi64x (-1), \ + (int)SCALE) + +#define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i)SRC, \ + (long long const *)BASE, \ + (__v4si)(__m128i)INDEX, \ + (__v2di)(__m128i)MASK, \ + (int)SCALE) + +#define _mm256_i32gather_epi64(BASE, INDEX, SCALE) \ + (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \ + (long long const *)BASE, \ + (__v4si)(__m128i)INDEX, \ + (__v4di)_mm256_set1_epi64x (-1), \ + (int)SCALE) + +#define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ + (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i)SRC, \ + (long long const *)BASE, \ + (__v4si)(__m128i)INDEX, \ + (__v4di)(__m256i)MASK, \ + (int)SCALE) + +#define _mm_i64gather_epi64(BASE, INDEX, SCALE) \ + (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \ + (long long const *)BASE, \ + (__v2di)(__m128i)INDEX, \ + (__v2di)_mm_set1_epi64x (-1), \ + (int)SCALE) + +#define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) 
\ + (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i)SRC, \ + (long long const *)BASE, \ + (__v2di)(__m128i)INDEX, \ + (__v2di)(__m128i)MASK, \ + (int)SCALE) + +#define _mm256_i64gather_epi64(BASE, INDEX, SCALE) \ + (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \ + (long long const *)BASE, \ + (__v4di)(__m256i)INDEX, \ + (__v4di)_mm256_set1_epi64x (-1), \ + (int)SCALE) + +#define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ + (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i)SRC, \ + (long long const *)BASE, \ + (__v4di)(__m256i)INDEX, \ + (__v4di)(__m256i)MASK, \ + (int)SCALE) + +#define _mm_i32gather_epi32(BASE, INDEX, SCALE) \ + (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (), \ + (int const *)BASE, \ + (__v4si)(__m128i)INDEX, \ + (__v4si)_mm_set1_epi32 (-1), \ + (int)SCALE) + +#define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i)SRC, \ + (int const *)BASE, \ + (__v4si)(__m128i)INDEX, \ + (__v4si)(__m128i)MASK, \ + (int)SCALE) + +#define _mm256_i32gather_epi32(BASE, INDEX, SCALE) \ + (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \ + (int const *)BASE, \ + (__v8si)(__m256i)INDEX, \ + (__v8si)_mm256_set1_epi32 (-1), \ + (int)SCALE) + +#define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ + (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i)SRC, \ + (int const *)BASE, \ + (__v8si)(__m256i)INDEX, \ + (__v8si)(__m256i)MASK, \ + (int)SCALE) + +#define _mm_i64gather_epi32(BASE, INDEX, SCALE) \ + (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (), \ + (int const *)BASE, \ + (__v2di)(__m128i)INDEX, \ + (__v4si)_mm_set1_epi32 (-1), \ + (int)SCALE) + +#define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i)SRC, \ + (int const *)BASE, \ + (__v2di)(__m128i)INDEX, \ + (__v4si)(__m128i)MASK, \ + (int)SCALE) + +#define _mm256_i64gather_epi32(BASE, INDEX, SCALE) \ + (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \ + (int const *)BASE, \ + (__v4di)(__m256i)INDEX, \ + (__v4si)_mm_set1_epi32(-1), \ + (int)SCALE) + +#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i)SRC, \ + (int const *)BASE, \ + (__v4di)(__m256i)INDEX, \ + (__v4si)(__m128i)MASK, \ + (int)SCALE) +#endif /* __OPTIMIZE__ */ diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def index d4b3e82..c4070e4 100644 --- a/gcc/config/i386/i386-builtin-types.def +++ b/gcc/config/i386/i386-builtin-types.def @@ -102,6 +102,8 @@ DEF_VECTOR_TYPE (V32QI, QI) DEF_POINTER_TYPE (PCCHAR, CHAR, CONST) DEF_POINTER_TYPE (PCDOUBLE, DOUBLE, CONST) DEF_POINTER_TYPE (PCFLOAT, FLOAT, CONST) +DEF_POINTER_TYPE (PCINT, INT, CONST) +DEF_POINTER_TYPE (PCINT64, INT64, CONST) DEF_POINTER_TYPE (PCHAR, CHAR) DEF_POINTER_TYPE (PCVOID, VOID, CONST) DEF_POINTER_TYPE (PVOID, VOID) @@ -119,6 +121,8 @@ DEF_POINTER_TYPE (PV4DF, V4DF) DEF_POINTER_TYPE (PV4DI, V4DI) DEF_POINTER_TYPE (PV4SF, V4SF) DEF_POINTER_TYPE (PV8SF, V8SF) +DEF_POINTER_TYPE (PV4SI, V4SI) +DEF_POINTER_TYPE (PV8SI, V8SI) DEF_POINTER_TYPE (PCV2DF, V2DF, CONST) DEF_POINTER_TYPE (PCV2SF, V2SF, CONST) @@ -126,6 +130,11 @@ DEF_POINTER_TYPE (PCV4DF, V4DF, CONST) DEF_POINTER_TYPE (PCV4SF, V4SF, CONST) DEF_POINTER_TYPE (PCV8SF, V8SF, CONST) +DEF_POINTER_TYPE (PCV2DI, V2DI, CONST) +DEF_POINTER_TYPE (PCV4SI, 
V4SI, CONST) +DEF_POINTER_TYPE (PCV4DI, V4DI, CONST) +DEF_POINTER_TYPE (PCV8SI, V8SI, CONST) + DEF_FUNCTION_TYPE (FLOAT128) DEF_FUNCTION_TYPE (UINT64) DEF_FUNCTION_TYPE (UNSIGNED) @@ -141,6 +150,7 @@ DEF_FUNCTION_TYPE (INT, V4DF) DEF_FUNCTION_TYPE (INT, V4SF) DEF_FUNCTION_TYPE (INT, V8QI) DEF_FUNCTION_TYPE (INT, V8SF) +DEF_FUNCTION_TYPE (INT, V32QI) DEF_FUNCTION_TYPE (INT64, INT64) DEF_FUNCTION_TYPE (INT64, V2DF) DEF_FUNCTION_TYPE (INT64, V4SF) @@ -199,6 +209,11 @@ DEF_FUNCTION_TYPE (V8SF, V8SI) DEF_FUNCTION_TYPE (V8SF, V8HI) DEF_FUNCTION_TYPE (V8SI, V4SI) DEF_FUNCTION_TYPE (V8SI, V8SF) +DEF_FUNCTION_TYPE (V32QI, V32QI) +DEF_FUNCTION_TYPE (V32QI, V16QI) +DEF_FUNCTION_TYPE (V16HI, V16HI) +DEF_FUNCTION_TYPE (V16HI, V8HI) +DEF_FUNCTION_TYPE (V8SI, V8SI) DEF_FUNCTION_TYPE (VOID, PCVOID) DEF_FUNCTION_TYPE (VOID, PVOID) DEF_FUNCTION_TYPE (VOID, UINT64) @@ -206,6 +221,14 @@ DEF_FUNCTION_TYPE (VOID, UNSIGNED) DEF_FUNCTION_TYPE (INT, PUSHORT) DEF_FUNCTION_TYPE (INT, PUNSIGNED) DEF_FUNCTION_TYPE (INT, PULONGLONG) +DEF_FUNCTION_TYPE (V16HI, V16QI) +DEF_FUNCTION_TYPE (V8SI, V16QI) +DEF_FUNCTION_TYPE (V4DI, V16QI) +DEF_FUNCTION_TYPE (V8SI, V8HI) +DEF_FUNCTION_TYPE (V4DI, V8HI) +DEF_FUNCTION_TYPE (V4DI, V4SI) +DEF_FUNCTION_TYPE (V4DI, PV4DI) +DEF_FUNCTION_TYPE (V4DI, V2DI) DEF_FUNCTION_TYPE (DI, V2DI, INT) DEF_FUNCTION_TYPE (DOUBLE, V2DF, INT) @@ -252,6 +275,7 @@ DEF_FUNCTION_TYPE (V2DI, V2DI, SI) DEF_FUNCTION_TYPE (V2DI, V2DI, V16QI) DEF_FUNCTION_TYPE (V2DI, V2DI, V2DI) DEF_FUNCTION_TYPE (V2DI, V4SI, V4SI) +DEF_FUNCTION_TYPE (V2DI, PCV2DI, V2DI) DEF_FUNCTION_TYPE (V2SF, V2SF, V2SF) DEF_FUNCTION_TYPE (V2SI, INT, INT) DEF_FUNCTION_TYPE (V2SI, V2SF, V2SF) @@ -284,6 +308,7 @@ DEF_FUNCTION_TYPE (V4SI, V4SI, SI) DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI) DEF_FUNCTION_TYPE (V4SI, V8HI, V8HI) DEF_FUNCTION_TYPE (V4SI, V8SI, INT) +DEF_FUNCTION_TYPE (V4SI, PCV4SI, V4SI) DEF_FUNCTION_TYPE (V8HI, V16QI, V16QI) DEF_FUNCTION_TYPE (V8HI, V4SI, V4SI) DEF_FUNCTION_TYPE (V8HI, V8HI, INT) @@ -297,6 +322,28 @@ DEF_FUNCTION_TYPE (V8SF, PCV8SF, V8SI) DEF_FUNCTION_TYPE (V8SF, V8SF, INT) DEF_FUNCTION_TYPE (V8SF, V8SF, V8SF) DEF_FUNCTION_TYPE (V8SF, V8SF, V8SI) +DEF_FUNCTION_TYPE (V32QI, V16HI, V16HI) +DEF_FUNCTION_TYPE (V16HI, V8SI, V8SI) +DEF_FUNCTION_TYPE (V32QI, V32QI, V32QI) +DEF_FUNCTION_TYPE (V16HI, V32QI, V32QI) +DEF_FUNCTION_TYPE (V16HI, V16HI, V8HI) +DEF_FUNCTION_TYPE (V16HI, V16HI, V16HI) +DEF_FUNCTION_TYPE (V16HI, V16HI, INT) +DEF_FUNCTION_TYPE (V16HI, V16HI, SI) +DEF_FUNCTION_TYPE (V16HI, V16HI, V16HI, INT) +DEF_FUNCTION_TYPE (V32QI, V32QI, V32QI, INT) +DEF_FUNCTION_TYPE (V8SI, V8SI, V4SI) +DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI) +DEF_FUNCTION_TYPE (V8SI, V16HI, V16HI) +DEF_FUNCTION_TYPE (V8SI, V8SI, INT) +DEF_FUNCTION_TYPE (V8SI, V8SI, SI) +DEF_FUNCTION_TYPE (V8SI, PCV8SI, V8SI) +DEF_FUNCTION_TYPE (V4DI, V4DI, V4DI) +DEF_FUNCTION_TYPE (V4DI, V8SI, V8SI) +DEF_FUNCTION_TYPE (V4DI, V4DI, V2DI) +DEF_FUNCTION_TYPE (V4DI, PCV4DI, V4DI) +DEF_FUNCTION_TYPE (V4DI, V4DI, INT) +DEF_FUNCTION_TYPE (V2DI, V4DI, INT) DEF_FUNCTION_TYPE (VOID, PCHAR, V16QI) DEF_FUNCTION_TYPE (VOID, PCHAR, V32QI) DEF_FUNCTION_TYPE (VOID, PDOUBLE, V2DF) @@ -351,11 +398,17 @@ DEF_FUNCTION_TYPE (V8SF, V8SF, V8SF, V8SI, INT) DEF_FUNCTION_TYPE (V8SI, V8SI, V4SI, INT) DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, INT) DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, V8SI) +DEF_FUNCTION_TYPE (V4DI, V4DI, V4DI, INT) +DEF_FUNCTION_TYPE (V4DI, V4DI, V2DI, INT) DEF_FUNCTION_TYPE (VOID, PCVOID, UNSIGNED, UNSIGNED) DEF_FUNCTION_TYPE (VOID, PV2DF, V2DI, V2DF) DEF_FUNCTION_TYPE (VOID, PV4DF, 
V4DI, V4DF) DEF_FUNCTION_TYPE (VOID, PV4SF, V4SI, V4SF) DEF_FUNCTION_TYPE (VOID, PV8SF, V8SI, V8SF) +DEF_FUNCTION_TYPE (VOID, PV2DI, V2DI, V2DI) +DEF_FUNCTION_TYPE (VOID, PV4DI, V4DI, V4DI) +DEF_FUNCTION_TYPE (VOID, PV4SI, V4SI, V4SI) +DEF_FUNCTION_TYPE (VOID, PV8SI, V8SI, V8SI) DEF_FUNCTION_TYPE (VOID, UINT, UINT, UINT) DEF_FUNCTION_TYPE (VOID, UINT64, UINT, UINT) DEF_FUNCTION_TYPE (VOID, V16QI, V16QI, PCHAR) @@ -377,6 +430,23 @@ DEF_FUNCTION_TYPE (V16QI, V16QI, INT, V16QI, INT, INT) DEF_FUNCTION_TYPE (V8QI, QI, QI, QI, QI, QI, QI, QI, QI) +DEF_FUNCTION_TYPE (V2DF, V2DF, PCDOUBLE, V4SI, V2DF, INT) +DEF_FUNCTION_TYPE (V4DF, V4DF, PCDOUBLE, V4SI, V4DF, INT) +DEF_FUNCTION_TYPE (V2DF, V2DF, PCDOUBLE, V2DI, V2DF, INT) +DEF_FUNCTION_TYPE (V4DF, V4DF, PCDOUBLE, V4DI, V4DF, INT) +DEF_FUNCTION_TYPE (V4SF, V4SF, PCFLOAT, V4SI, V4SF, INT) +DEF_FUNCTION_TYPE (V8SF, V8SF, PCFLOAT, V8SI, V8SF, INT) +DEF_FUNCTION_TYPE (V4SF, V4SF, PCFLOAT, V2DI, V4SF, INT) +DEF_FUNCTION_TYPE (V4SF, V4SF, PCFLOAT, V4DI, V4SF, INT) +DEF_FUNCTION_TYPE (V2DI, V2DI, PCINT64, V4SI, V2DI, INT) +DEF_FUNCTION_TYPE (V4DI, V4DI, PCINT64, V4SI, V4DI, INT) +DEF_FUNCTION_TYPE (V2DI, V2DI, PCINT64, V2DI, V2DI, INT) +DEF_FUNCTION_TYPE (V4DI, V4DI, PCINT64, V4DI, V4DI, INT) +DEF_FUNCTION_TYPE (V4SI, V4SI, PCINT, V4SI, V4SI, INT) +DEF_FUNCTION_TYPE (V8SI, V8SI, PCINT, V8SI, V8SI, INT) +DEF_FUNCTION_TYPE (V4SI, V4SI, PCINT, V2DI, V4SI, INT) +DEF_FUNCTION_TYPE (V4SI, V4SI, PCINT, V4DI, V4SI, INT) + DEF_FUNCTION_TYPE_ALIAS (V2DF_FTYPE_V2DF, ROUND) DEF_FUNCTION_TYPE_ALIAS (V4DF_FTYPE_V4DF, ROUND) DEF_FUNCTION_TYPE_ALIAS (V4SF_FTYPE_V4SF, ROUND) @@ -404,11 +474,19 @@ DEF_FUNCTION_TYPE_ALIAS (V2SI_FTYPE_V2SI_V2SI, COUNT) DEF_FUNCTION_TYPE_ALIAS (V4HI_FTYPE_V4HI_V4HI, COUNT) DEF_FUNCTION_TYPE_ALIAS (V4SI_FTYPE_V4SI_V4SI, COUNT) DEF_FUNCTION_TYPE_ALIAS (V8HI_FTYPE_V8HI_V8HI, COUNT) +DEF_FUNCTION_TYPE_ALIAS (V16HI_FTYPE_V16HI_SI, COUNT) +DEF_FUNCTION_TYPE_ALIAS (V16HI_FTYPE_V16HI_V8HI, COUNT) +DEF_FUNCTION_TYPE_ALIAS (V8SI_FTYPE_V8SI_SI, COUNT) +DEF_FUNCTION_TYPE_ALIAS (V8SI_FTYPE_V8SI_V4SI, COUNT) +DEF_FUNCTION_TYPE_ALIAS (V4DI_FTYPE_V4DI_INT, COUNT) +DEF_FUNCTION_TYPE_ALIAS (V4DI_FTYPE_V4DI_V2DI, COUNT) DEF_FUNCTION_TYPE_ALIAS (V2DF_FTYPE_V2DF_V2DF, SWAP) DEF_FUNCTION_TYPE_ALIAS (V4SF_FTYPE_V4SF_V4SF, SWAP) +DEF_FUNCTION_TYPE_ALIAS (V4DI_FTYPE_V4DI_INT, CONVERT) DEF_FUNCTION_TYPE_ALIAS (V2DI_FTYPE_V2DI_INT, CONVERT) +DEF_FUNCTION_TYPE_ALIAS (V4DI_FTYPE_V4DI_V4DI_INT, CONVERT) DEF_FUNCTION_TYPE_ALIAS (V2DI_FTYPE_V2DI_V2DI_INT, CONVERT) DEF_FUNCTION_TYPE_ALIAS (V1DI_FTYPE_V1DI_V1DI_INT, CONVERT) diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 7b7ac87..ef02673 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -23867,6 +23867,180 @@ enum ix86_builtins IX86_BUILTIN_MOVMSKPD256, IX86_BUILTIN_MOVMSKPS256, + /* AVX2 */ + IX86_BUILTIN_MPSADBW256, + IX86_BUILTIN_PABSB256, + IX86_BUILTIN_PABSW256, + IX86_BUILTIN_PABSD256, + IX86_BUILTIN_PACKSSDW256, + IX86_BUILTIN_PACKSSWB256, + IX86_BUILTIN_PACKUSDW256, + IX86_BUILTIN_PACKUSWB256, + IX86_BUILTIN_PADDB256, + IX86_BUILTIN_PADDW256, + IX86_BUILTIN_PADDD256, + IX86_BUILTIN_PADDQ256, + IX86_BUILTIN_PADDSB256, + IX86_BUILTIN_PADDSW256, + IX86_BUILTIN_PADDUSB256, + IX86_BUILTIN_PADDUSW256, + IX86_BUILTIN_PALIGNR256, + IX86_BUILTIN_AND256I, + IX86_BUILTIN_ANDNOT256I, + IX86_BUILTIN_PAVGB256, + IX86_BUILTIN_PAVGW256, + IX86_BUILTIN_PBLENDVB256, + IX86_BUILTIN_PBLENDVW256, + IX86_BUILTIN_PCMPEQB256, + IX86_BUILTIN_PCMPEQW256, + IX86_BUILTIN_PCMPEQD256, + 
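/* Editor's note (not part of the patch): the V*_FTYPE_* codes added to
   i386-builtin-types.def above name the prototypes under which these
   IX86_BUILTIN_* entries are registered.  For example V8SI_FTYPE_V8SI_V8SI,
   used below for __builtin_ia32_paddd256, corresponds to a builtin declared
   roughly as

     extern __v8si __builtin_ia32_paddd256 (__v8si, __v8si);

   which the 256-bit integer add intrinsic defined earlier in avx2intrin.h
   expands to. */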
IX86_BUILTIN_PCMPEQQ256, + IX86_BUILTIN_PCMPGTB256, + IX86_BUILTIN_PCMPGTW256, + IX86_BUILTIN_PCMPGTD256, + IX86_BUILTIN_PCMPGTQ256, + IX86_BUILTIN_PHADDW256, + IX86_BUILTIN_PHADDD256, + IX86_BUILTIN_PHADDSW256, + IX86_BUILTIN_PHSUBW256, + IX86_BUILTIN_PHSUBD256, + IX86_BUILTIN_PHSUBSW256, + IX86_BUILTIN_PMADDUBSW256, + IX86_BUILTIN_PMADDWD256, + IX86_BUILTIN_PMAXSB256, + IX86_BUILTIN_PMAXSW256, + IX86_BUILTIN_PMAXSD256, + IX86_BUILTIN_PMAXUB256, + IX86_BUILTIN_PMAXUW256, + IX86_BUILTIN_PMAXUD256, + IX86_BUILTIN_PMINSB256, + IX86_BUILTIN_PMINSW256, + IX86_BUILTIN_PMINSD256, + IX86_BUILTIN_PMINUB256, + IX86_BUILTIN_PMINUW256, + IX86_BUILTIN_PMINUD256, + IX86_BUILTIN_PMOVMSKB256, + IX86_BUILTIN_PMOVSXBW256, + IX86_BUILTIN_PMOVSXBD256, + IX86_BUILTIN_PMOVSXBQ256, + IX86_BUILTIN_PMOVSXWD256, + IX86_BUILTIN_PMOVSXWQ256, + IX86_BUILTIN_PMOVSXDQ256, + IX86_BUILTIN_PMOVZXBW256, + IX86_BUILTIN_PMOVZXBD256, + IX86_BUILTIN_PMOVZXBQ256, + IX86_BUILTIN_PMOVZXWD256, + IX86_BUILTIN_PMOVZXWQ256, + IX86_BUILTIN_PMOVZXDQ256, + IX86_BUILTIN_PMULDQ256, + IX86_BUILTIN_PMULHRSW256, + IX86_BUILTIN_PMULHUW256, + IX86_BUILTIN_PMULHW256, + IX86_BUILTIN_PMULLW256, + IX86_BUILTIN_PMULLD256, + IX86_BUILTIN_PMULUDQ256, + IX86_BUILTIN_POR256, + IX86_BUILTIN_PSADBW256, + IX86_BUILTIN_PSHUFB256, + IX86_BUILTIN_PSHUFD256, + IX86_BUILTIN_PSHUFHW256, + IX86_BUILTIN_PSHUFLW256, + IX86_BUILTIN_PSIGNB256, + IX86_BUILTIN_PSIGNW256, + IX86_BUILTIN_PSIGND256, + IX86_BUILTIN_PSLLDQI256, + IX86_BUILTIN_PSLLWI256, + IX86_BUILTIN_PSLLW256, + IX86_BUILTIN_PSLLDI256, + IX86_BUILTIN_PSLLD256, + IX86_BUILTIN_PSLLQI256, + IX86_BUILTIN_PSLLQ256, + IX86_BUILTIN_PSRAWI256, + IX86_BUILTIN_PSRAW256, + IX86_BUILTIN_PSRADI256, + IX86_BUILTIN_PSRAD256, + IX86_BUILTIN_PSRLDQI256, + IX86_BUILTIN_PSRLWI256, + IX86_BUILTIN_PSRLW256, + IX86_BUILTIN_PSRLDI256, + IX86_BUILTIN_PSRLD256, + IX86_BUILTIN_PSRLQI256, + IX86_BUILTIN_PSRLQ256, + IX86_BUILTIN_PSUBB256, + IX86_BUILTIN_PSUBW256, + IX86_BUILTIN_PSUBD256, + IX86_BUILTIN_PSUBQ256, + IX86_BUILTIN_PSUBSB256, + IX86_BUILTIN_PSUBSW256, + IX86_BUILTIN_PSUBUSB256, + IX86_BUILTIN_PSUBUSW256, + IX86_BUILTIN_PUNPCKHBW256, + IX86_BUILTIN_PUNPCKHWD256, + IX86_BUILTIN_PUNPCKHDQ256, + IX86_BUILTIN_PUNPCKHQDQ256, + IX86_BUILTIN_PUNPCKLBW256, + IX86_BUILTIN_PUNPCKLWD256, + IX86_BUILTIN_PUNPCKLDQ256, + IX86_BUILTIN_PUNPCKLQDQ256, + IX86_BUILTIN_PXOR256, + IX86_BUILTIN_MOVNTDQA256, + IX86_BUILTIN_VBROADCASTSS_PS, + IX86_BUILTIN_VBROADCASTSS_PS256, + IX86_BUILTIN_VBROADCASTSD_PD256, + IX86_BUILTIN_VBROADCASTSI256, + IX86_BUILTIN_PBLENDD256, + IX86_BUILTIN_PBLENDD128, + IX86_BUILTIN_PBROADCASTB256, + IX86_BUILTIN_PBROADCASTW256, + IX86_BUILTIN_PBROADCASTD256, + IX86_BUILTIN_PBROADCASTQ256, + IX86_BUILTIN_PBROADCASTB128, + IX86_BUILTIN_PBROADCASTW128, + IX86_BUILTIN_PBROADCASTD128, + IX86_BUILTIN_PBROADCASTQ128, + IX86_BUILTIN_VPERMVARSI256, + IX86_BUILTIN_VPERMDF256, + IX86_BUILTIN_VPERMVARSF256, + IX86_BUILTIN_VPERMDI256, + IX86_BUILTIN_VPERMTI256, + IX86_BUILTIN_VEXTRACT128I256, + IX86_BUILTIN_VINSERT128I256, + IX86_BUILTIN_MASKLOADD, + IX86_BUILTIN_MASKLOADQ, + IX86_BUILTIN_MASKLOADD256, + IX86_BUILTIN_MASKLOADQ256, + IX86_BUILTIN_MASKSTORED, + IX86_BUILTIN_MASKSTOREQ, + IX86_BUILTIN_MASKSTORED256, + IX86_BUILTIN_MASKSTOREQ256, + IX86_BUILTIN_PSLLVV4DI, + IX86_BUILTIN_PSLLVV2DI, + IX86_BUILTIN_PSLLVV8SI, + IX86_BUILTIN_PSLLVV4SI, + IX86_BUILTIN_PSRAVV8SI, + IX86_BUILTIN_PSRAVV4SI, + IX86_BUILTIN_PSRLVV4DI, + IX86_BUILTIN_PSRLVV2DI, + IX86_BUILTIN_PSRLVV8SI, + IX86_BUILTIN_PSRLVV4SI, + + IX86_BUILTIN_GATHERSIV2DF, + 
IX86_BUILTIN_GATHERSIV4DF, + IX86_BUILTIN_GATHERDIV2DF, + IX86_BUILTIN_GATHERDIV4DF, + IX86_BUILTIN_GATHERSIV4SF, + IX86_BUILTIN_GATHERSIV8SF, + IX86_BUILTIN_GATHERDIV4SF, + IX86_BUILTIN_GATHERDIV8SF, + IX86_BUILTIN_GATHERSIV2DI, + IX86_BUILTIN_GATHERSIV4DI, + IX86_BUILTIN_GATHERDIV2DI, + IX86_BUILTIN_GATHERDIV4DI, + IX86_BUILTIN_GATHERSIV4SI, + IX86_BUILTIN_GATHERSIV8SI, + IX86_BUILTIN_GATHERDIV4SI, + IX86_BUILTIN_GATHERDIV8SI, + /* TFmode support builtins. */ IX86_BUILTIN_INFQ, IX86_BUILTIN_HUGE_VALQ, @@ -24362,6 +24536,17 @@ static const struct builtin_description bdesc_special_args[] = { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF }, { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF }, + /* AVX2 */ + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI }, + { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID }, { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID }, { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT }, @@ -25026,6 +25211,154 @@ static const struct builtin_description bdesc_args[] = { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + /* AVX2 */ + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", 
IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv4di, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI }, + { 
OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, 
"__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mulv4siv4di3 , "__builtin_ia32_pmuldq256" , IX86_BUILTIN_PMULDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3 , "__builtin_ia32_pmuludq256" , IX86_BUILTIN_PMULUDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT }, + { OPTION_MASK_ISA_AVX2, 
CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlqv4di3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrqv4di3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) 
V8SI_FTYPE_V8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI }, + { 
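/* Editor's note (illustrative, not part of the patch): the vpbroadcast
   entries here splat element 0 of the source across the whole destination.
   The builtin registered just above with type V8SI_FTYPE_V4SI can be
   exercised directly, assuming -mavx2 and the vector typedefs from the
   intrinsics headers:

     __v4si a = { 42, 0, 0, 0 };
     __v8si r = __builtin_ia32_pbroadcastd256 (a);   // every lane of r becomes 42

   though user code would normally reach it through the corresponding
   broadcast intrinsic defined earlier in avx2intrin.h. */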
OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI }, + { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 }, /* BMI */ @@ -25415,6 +25748,71 @@ ix86_init_mmx_sse_builtins (void) "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDRAND64_STEP); + /* AVX2 */ + def_builtin (OPTION_MASK_ISA_AVX2, 
"__builtin_ia32_gathersiv2df", + V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT, + IX86_BUILTIN_GATHERSIV2DF); + + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df", + V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT, + IX86_BUILTIN_GATHERSIV4DF); + + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df", + V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT, + IX86_BUILTIN_GATHERDIV2DF); + + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df", + V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT, + IX86_BUILTIN_GATHERDIV4DF); + + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf", + V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT, + IX86_BUILTIN_GATHERSIV4SF); + + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf", + V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT, + IX86_BUILTIN_GATHERSIV8SF); + + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf", + V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT, + IX86_BUILTIN_GATHERDIV4SF); + + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256", + V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT, + IX86_BUILTIN_GATHERDIV8SF); + + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di", + V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT, + IX86_BUILTIN_GATHERSIV2DI); + + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di", + V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT, + IX86_BUILTIN_GATHERSIV4DI); + + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di", + V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT, + IX86_BUILTIN_GATHERDIV2DI); + + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di", + V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT, + IX86_BUILTIN_GATHERDIV4DI); + + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si", + V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT, + IX86_BUILTIN_GATHERSIV4SI); + + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si", + V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT, + IX86_BUILTIN_GATHERSIV8SI); + + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si", + V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT, + IX86_BUILTIN_GATHERDIV4SI); + + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256", + V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT, + IX86_BUILTIN_GATHERDIV8SI); + /* MMX access to the vec_init patterns. 
*/ def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si", V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI); @@ -26364,6 +26762,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case INT_FTYPE_V4DF: case INT_FTYPE_V4SF: case INT_FTYPE_V2DF: + case INT_FTYPE_V32QI: case V16QI_FTYPE_V16QI: case V8SI_FTYPE_V8SF: case V8SI_FTYPE_V4SI: @@ -26407,6 +26806,18 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V2SI_FTYPE_V2DF: case V2SF_FTYPE_V2SF: case V2SF_FTYPE_V2SI: + case V32QI_FTYPE_V32QI: + case V32QI_FTYPE_V16QI: + case V16HI_FTYPE_V16HI: + case V16HI_FTYPE_V8HI: + case V8SI_FTYPE_V8SI: + case V16HI_FTYPE_V16QI: + case V8SI_FTYPE_V16QI: + case V4DI_FTYPE_V16QI: + case V8SI_FTYPE_V8HI: + case V4DI_FTYPE_V8HI: + case V4DI_FTYPE_V4SI: + case V4DI_FTYPE_V2DI: nargs = 1; break; case V4SF_FTYPE_V4SF_VEC_MERGE: @@ -26454,6 +26865,15 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V1DI_FTYPE_V1DI_V1DI: case V1DI_FTYPE_V8QI_V8QI: case V1DI_FTYPE_V2SI_V2SI: + case V32QI_FTYPE_V16HI_V16HI: + case V16HI_FTYPE_V8SI_V8SI: + case V32QI_FTYPE_V32QI_V32QI: + case V16HI_FTYPE_V32QI_V32QI: + case V16HI_FTYPE_V16HI_V16HI: + case V8SI_FTYPE_V8SI_V8SI: + case V8SI_FTYPE_V16HI_V16HI: + case V4DI_FTYPE_V4DI_V4DI: + case V4DI_FTYPE_V8SI_V8SI: if (comparison == UNKNOWN) return ix86_expand_binop_builtin (icode, exp, target); nargs = 2; @@ -26464,6 +26884,12 @@ ix86_expand_args_builtin (const struct builtin_description *d, nargs = 2; swap = true; break; + case V16HI_FTYPE_V16HI_V8HI_COUNT: + case V16HI_FTYPE_V16HI_SI_COUNT: + case V8SI_FTYPE_V8SI_V4SI_COUNT: + case V8SI_FTYPE_V8SI_SI_COUNT: + case V4DI_FTYPE_V4DI_V2DI_COUNT: + case V4DI_FTYPE_V4DI_INT_COUNT: case V8HI_FTYPE_V8HI_V8HI_COUNT: case V8HI_FTYPE_V8HI_SI_COUNT: case V4SI_FTYPE_V4SI_V4SI_COUNT: @@ -26505,6 +26931,10 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V2DI_FTYPE_V2DI_INT: case V2DF_FTYPE_V2DF_INT: case V2DF_FTYPE_V4DF_INT: + case V16HI_FTYPE_V16HI_INT: + case V8SI_FTYPE_V8SI_INT: + case V4DI_FTYPE_V4DI_INT: + case V2DI_FTYPE_V4DI_INT: nargs = 2; nargs_constant = 1; break; @@ -26513,9 +26943,13 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V4DF_FTYPE_V4DF_V4DF_V4DF: case V4SF_FTYPE_V4SF_V4SF_V4SF: case V2DF_FTYPE_V2DF_V2DF_V2DF: + case V32QI_FTYPE_V32QI_V32QI_V32QI: nargs = 3; break; + case V32QI_FTYPE_V32QI_V32QI_INT: + case V16HI_FTYPE_V16HI_V16HI_INT: case V16QI_FTYPE_V16QI_V16QI_INT: + case V4DI_FTYPE_V4DI_V4DI_INT: case V8HI_FTYPE_V8HI_V8HI_INT: case V8SI_FTYPE_V8SI_V8SI_INT: case V8SI_FTYPE_V8SI_V4SI_INT: @@ -26526,10 +26960,16 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V4DF_FTYPE_V4DF_V2DF_INT: case V4SF_FTYPE_V4SF_V4SF_INT: case V2DI_FTYPE_V2DI_V2DI_INT: + case V4DI_FTYPE_V4DI_V2DI_INT: case V2DF_FTYPE_V2DF_V2DF_INT: nargs = 3; nargs_constant = 1; break; + case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT: + nargs = 3; + rmode = V4DImode; + nargs_constant = 1; + break; case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT: nargs = 3; rmode = V2DImode; @@ -26606,6 +27046,11 @@ ix86_expand_args_builtin (const struct builtin_description *d, if (!match) switch (icode) { + case CODE_FOR_avx2_inserti128: + case CODE_FOR_avx2_extracti128: + error ("the last argument must be an 1-bit immediate"); + return const0_rtx; + case CODE_FOR_sse4_1_roundpd: case CODE_FOR_sse4_1_roundps: case CODE_FOR_sse4_1_roundsd: @@ -26759,6 +27204,7 @@ ix86_expand_special_args_builtin (const struct builtin_description *d, break; case 
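/* Editor's note (sketch, not part of the patch): the error path added above
   for CODE_FOR_avx2_extracti128/CODE_FOR_avx2_inserti128 rejects selectors
   other than 0 or 1, since the immediate simply picks one of the two
   128-bit halves.  Given some __m256i v (with <immintrin.h>, -mavx2):

     __m128i lo = _mm256_extracti128_si256 (v, 0);   // low 128 bits
     __m128i hi = _mm256_extracti128_si256 (v, 1);   // high 128 bits; only 0 or 1 is valid
*/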
UINT64_FTYPE_PUNSIGNED: case V2DI_FTYPE_PV2DI: + case V4DI_FTYPE_PV4DI: case V32QI_FTYPE_PCCHAR: case V16QI_FTYPE_PCCHAR: case V8SF_FTYPE_PCV4SF: @@ -26798,6 +27244,10 @@ ix86_expand_special_args_builtin (const struct builtin_description *d, case V4DF_FTYPE_PCV4DF_V4DI: case V4SF_FTYPE_PCV4SF_V4SI: case V2DF_FTYPE_PCV2DF_V2DI: + case V8SI_FTYPE_PCV8SI_V8SI: + case V4DI_FTYPE_PCV4DI_V4DI: + case V4SI_FTYPE_PCV4SI_V4SI: + case V2DI_FTYPE_PCV2DI_V2DI: nargs = 2; klass = load; memory = 0; @@ -26806,6 +27256,10 @@ ix86_expand_special_args_builtin (const struct builtin_description *d, case VOID_FTYPE_PV4DF_V4DI_V4DF: case VOID_FTYPE_PV4SF_V4SI_V4SF: case VOID_FTYPE_PV2DF_V2DI_V2DF: + case VOID_FTYPE_PV8SI_V8SI_V8SI: + case VOID_FTYPE_PV4DI_V4DI_V4DI: + case VOID_FTYPE_PV4SI_V4SI_V4SI: + case VOID_FTYPE_PV2DI_V2DI_V2DI: nargs = 2; klass = store; /* Reserve memory operand for target. */ @@ -27062,9 +27516,9 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, size_t i; enum insn_code icode; tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); - tree arg0, arg1, arg2; - rtx op0, op1, op2, pat; - enum machine_mode mode0, mode1, mode2; + tree arg0, arg1, arg2, arg3, arg4; + rtx op0, op1, op2, op3, op4, pat; + enum machine_mode mode0, mode1, mode2, mode3, mode4; unsigned int fcode = DECL_FUNCTION_CODE (fndecl); /* Determine whether the builtin function is available under the current ISA. @@ -27333,6 +27787,100 @@ rdrand_step: gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1))); return target; + case IX86_BUILTIN_GATHERSIV2DF: + icode = CODE_FOR_avx2_gathersiv2df; + goto gather_gen; + case IX86_BUILTIN_GATHERSIV4DF: + icode = CODE_FOR_avx2_gathersiv4df; + goto gather_gen; + case IX86_BUILTIN_GATHERDIV2DF: + icode = CODE_FOR_avx2_gatherdiv2df; + goto gather_gen; + case IX86_BUILTIN_GATHERDIV4DF: + icode = CODE_FOR_avx2_gatherdiv4df; + goto gather_gen; + case IX86_BUILTIN_GATHERSIV4SF: + icode = CODE_FOR_avx2_gathersiv4sf; + goto gather_gen; + case IX86_BUILTIN_GATHERSIV8SF: + icode = CODE_FOR_avx2_gathersiv8sf; + goto gather_gen; + case IX86_BUILTIN_GATHERDIV4SF: + icode = CODE_FOR_avx2_gatherdiv4sf; + goto gather_gen; + case IX86_BUILTIN_GATHERDIV8SF: + icode = CODE_FOR_avx2_gatherdiv4sf256; + goto gather_gen; + case IX86_BUILTIN_GATHERSIV2DI: + icode = CODE_FOR_avx2_gathersiv2di; + goto gather_gen; + case IX86_BUILTIN_GATHERSIV4DI: + icode = CODE_FOR_avx2_gathersiv4di; + goto gather_gen; + case IX86_BUILTIN_GATHERDIV2DI: + icode = CODE_FOR_avx2_gatherdiv2di; + goto gather_gen; + case IX86_BUILTIN_GATHERDIV4DI: + icode = CODE_FOR_avx2_gatherdiv4di; + goto gather_gen; + case IX86_BUILTIN_GATHERSIV4SI: + icode = CODE_FOR_avx2_gathersiv4si; + goto gather_gen; + case IX86_BUILTIN_GATHERSIV8SI: + icode = CODE_FOR_avx2_gathersiv8si; + goto gather_gen; + case IX86_BUILTIN_GATHERDIV4SI: + icode = CODE_FOR_avx2_gatherdiv4si; + goto gather_gen; + case IX86_BUILTIN_GATHERDIV8SI: + icode = CODE_FOR_avx2_gatherdiv4si256; + + gather_gen: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + arg3 = CALL_EXPR_ARG (exp, 3); + arg4 = CALL_EXPR_ARG (exp, 4); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + op3 = expand_normal (arg3); + op4 = expand_normal (arg4); + /* Note the arg order is different from the operand order. 
*/ + mode0 = insn_data[icode].operand[1].mode; + mode1 = insn_data[icode].operand[2].mode; + mode2 = insn_data[icode].operand[3].mode; + mode3 = insn_data[icode].operand[4].mode; + mode4 = insn_data[icode].operand[5].mode; + + if (target == NULL_RTX) + target = gen_reg_rtx (insn_data[icode].operand[0].mode); + + /* Force memory operand only with base register here. But we + don't want to do it on memory operand for other builtin + functions. */ + op1 = force_reg (Pmode, op1); + op1 = gen_rtx_MEM (mode1, op1); + + if (!insn_data[icode].operand[1].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if (!insn_data[icode].operand[2].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + if (!insn_data[icode].operand[3].predicate (op2, mode2)) + op2 = copy_to_mode_reg (mode2, op2); + if (!insn_data[icode].operand[4].predicate (op3, mode3)) + op3 = copy_to_mode_reg (mode3, op3); + if (!insn_data[icode].operand[5].predicate (op4, mode4)) + { + error ("last argument must be scale 1, 2, 4, 8"); + return const0_rtx; + } + pat = GEN_FCN (icode) (target, op0, op1, op2, op3, op4); + if (! pat) + return const0_rtx; + emit_insn (pat); + return target; + default: break; } @@ -35044,13 +35592,13 @@ ix86_preferred_simd_mode (enum machine_mode mode) switch (mode) { case QImode: - return V16QImode; + return TARGET_AVX2 ? V32QImode : V16QImode; case HImode: - return V8HImode; + return TARGET_AVX2 ? V16HImode : V8HImode; case SImode: - return V4SImode; + return TARGET_AVX2 ? V8SImode : V4SImode; case DImode: - return V2DImode; + return TARGET_AVX2 ? V4DImode : V2DImode; case SFmode: if (TARGET_AVX && !TARGET_PREFER_AVX128) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index b73d46f..d343fc2 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -231,6 +231,14 @@ UNSPEC_VCVTPH2PS UNSPEC_VCVTPS2PH + ;; For AVX2 support + UNSPEC_VPERMSI + UNSPEC_VPERMDF + UNSPEC_VPERMSF + UNSPEC_VPERMDI + UNSPEC_VPERMTI + UNSPEC_GATHER + ;; For BMI support UNSPEC_BEXTR @@ -930,7 +938,8 @@ [(SF "ss") (DF "sd") (V8SF "ps") (V4DF "pd") (V4SF "ps") (V2DF "pd") - (V16QI "b") (V8HI "w") (V4SI "d") (V2DI "q")]) + (V16QI "b") (V8HI "w") (V4SI "d") (V2DI "q") + (V32QI "b") (V16HI "w") (V8SI "d") (V4DI "q")]) ;; SSE vector suffix for floating point modes (define_mode_attr ssevecmodesuffix [(SF "ps") (DF "pd")]) diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h index 11a1a4e..3704df7 100644 --- a/gcc/config/i386/immintrin.h +++ b/gcc/config/i386/immintrin.h @@ -56,6 +56,10 @@ #include <avxintrin.h> #endif +#ifdef __AVX2__ +#include <avx2intrin.h> +#endif + #ifdef __RDRND__ extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index bc0a357..b4fa04e 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -611,6 +611,14 @@ return i == 2 || i == 4 || i == 8; }) +;; Match 1, 2, 4, or 8 +(define_predicate "const1248_operand" + (match_code "const_int") +{ + HOST_WIDE_INT i = INTVAL (op); + return i == 1 || i == 2 || i == 4 || i == 8; +}) + ;; Match 3, 5, or 9. Used for leal multiplicands. 
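The gather builtins registered above take the source vector, a pointer to the base address, the index vector, a write mask and a scale; the expansion code rejects the scale unless it is 1, 2, 4 or 8 (the new const1248_operand predicate matches exactly those values). A minimal usage sketch, assuming a compiler built with this patch and -mavx2; the vector typedefs and the function name are local to the example, only the builtin itself comes from the patch:

    typedef double v4df __attribute__ ((vector_size (32)));
    typedef int    v4si __attribute__ ((vector_size (16)));

    /* Gather base[idx[0..3]] into a 256-bit vector of doubles.  */
    v4df
    gather4_pd (const double *base, v4si idx)
    {
      v4df src  = { 0.0, 0.0, 0.0, 0.0 };
      v4df mask = { -1.0, -1.0, -1.0, -1.0 };  /* sign bit set: gather every element.  */

      /* Last argument is the scale; 8 == sizeof (double).  */
      return __builtin_ia32_gathersiv4df (src, base, idx, mask, 8);
    }

In normal use these builtins are reached through the wrappers in the new avx2intrin.h, which immintrin.h now includes when __AVX2__ is defined. Note also the ix86_preferred_simd_mode change above: with TARGET_AVX2 the auto-vectorizer prefers 256-bit integer vectors (for example V8SImode for int loops), so plain C loops can pick up the new patterns without intrinsics.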
(define_predicate "const359_operand" (match_code "const_int") diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index e9f6c3d..5bc8586 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -81,9 +81,104 @@ (define_mode_iterator VI8 [(V4DI "TARGET_AVX") V2DI]) +(define_mode_iterator VI1_AVX2 + [(V32QI "TARGET_AVX2") V16QI]) + +(define_mode_iterator VI2_AVX2 + [(V16HI "TARGET_AVX2") V8HI]) + +(define_mode_iterator VI4_AVX2 + [(V8SI "TARGET_AVX2") V4SI]) + +(define_mode_iterator VI8_AVX2 + [(V4DI "TARGET_AVX2") V2DI]) + +(define_mode_iterator VIMAX_AVX2 + [(V4DI "TARGET_AVX2") V1TI]) + +(define_mode_iterator SSESCALARMODE + [(V4DI "TARGET_AVX2") TI]) + +(define_mode_iterator VI12_AVX2 + [(V32QI "TARGET_AVX2") V16QI + (V16HI "TARGET_AVX2") V8HI]) + +(define_mode_iterator VI24_AVX2 + [(V16HI "TARGET_AVX2") V8HI + (V8SI "TARGET_AVX2") V4SI]) + +(define_mode_iterator VI124_AVX2 + [(V32QI "TARGET_AVX2") V16QI + (V16HI "TARGET_AVX2") V8HI + (V8SI "TARGET_AVX2") V4SI]) + +(define_mode_iterator VI248_AVX2 + [(V16HI "TARGET_AVX2") V8HI + (V8SI "TARGET_AVX2") V4SI + (V4DI "TARGET_AVX2") V2DI]) + +(define_mode_iterator VI48_AVX2 + [V8SI V4SI V4DI V2DI]) + +(define_mode_iterator VI4SD_AVX2 + [V4SI V4DI]) + +(define_mode_iterator V48_AVX2 + [(V4SF "TARGET_SSE") (V2DF "TARGET_SSE2") + (V8SF "TARGET_AVX") (V4DF "TARGET_AVX") + (V4SI "TARGET_AVX2") (V2DI "TARGET_AVX2") + (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")]) + +(define_mode_attr sse2_avx2 + [(V16QI "sse2") (V32QI "avx2") + (V8HI "sse2") (V16HI "avx2") + (V4SI "sse2") (V8SI "avx2") + (V2DI "sse2") (V4DI "avx2") + (V1TI "sse2")]) + +(define_mode_attr ssse3_avx2 + [(V16QI "ssse3") (V32QI "avx2") + (V8HI "ssse3") (V16HI "avx2") + (V4SI "ssse3") (V8SI "avx2") + (V2DI "ssse3") (V4DI "avx2") + (TI "ssse3")]) + +(define_mode_attr sse4_1_avx2 + [(V16QI "sse4_1") (V32QI "avx2") + (V8HI "sse4_1") (V16HI "avx2") + (V4SI "sse4_1") (V8SI "avx2") + (V2DI "sse4_1") (V4DI "avx2")]) + +(define_mode_attr avx_avx2 + [(V4SF "avx") (V2DF "avx") + (V8SF "avx") (V4DF "avx") + (V4SI "avx2") (V2DI "avx2") + (V8SI "avx2") (V4DI "avx2")]) + +;; Mapping of logic-shift operators +(define_code_iterator lshift [lshiftrt ashift]) + +;; Base name for define_insn +(define_code_attr lshift_insn [(lshiftrt "srl") (ashift "sll")]) + +;; Base name for insn mnemonic +(define_code_attr lshift [(lshiftrt "lshr") (ashift "lshl")]) + +(define_mode_attr ssedoublemode + [(V16HI "V16SI") (V8HI "V8SI")]) + +(define_mode_attr ssebytemode + [(V4DI "V32QI") (V2DI "V16QI")]) + +(define_mode_attr shortmode + [(V4DI "v4si") (V2DI "v2si")]) + ;; All 128bit vector integer modes (define_mode_iterator VI_128 [V16QI V8HI V4SI V2DI]) +;; All 256bit vector integer modes +(define_mode_iterator VI_256 [V32QI V16HI V8SI V4DI]) + ;; Random 128bit vector integer mode combinations (define_mode_iterator VI12_128 [V16QI V8HI]) (define_mode_iterator VI14_128 [V16QI V4SI]) @@ -91,6 +186,11 @@ (define_mode_iterator VI24_128 [V8HI V4SI]) (define_mode_iterator VI248_128 [V8HI V4SI V2DI]) +;; Random 256bit vector integer mode combinations +(define_mode_iterator VI124_256 [V32QI V16HI V8SI]) +(define_mode_iterator VI1248_256 [V32QI V16HI V8SI V4DI]) +(define_mode_iterator VI248_256 [V16HI V8SI V4DI]) + ;; Int-float size matches (define_mode_iterator VI4F_128 [V4SI V4SF]) (define_mode_iterator VI8F_128 [V2DI V2DF]) @@ -125,12 +225,16 @@ [(V32QI "OI") (V16HI "OI") (V8SI "OI") (V4DI "OI") (V16QI "TI") (V8HI "TI") (V4SI "TI") (V2DI "TI") (V1TI "TI") (V8SF "V8SF") (V4DF "V4DF") - (V4SF "V4SF") (V2DF 
"V2DF")]) + (V4SF "V4SF") (V2DF "V2DF") + (TI "TI") (V32QI "OI") (V16HI "OI") (V8SI "OI") (V4DI "OI")]) ;; Mapping of vector float modes to an integer mode of the same size (define_mode_attr sseintvecmode [(V8SF "V8SI") (V4DF "V4DI") - (V4SF "V4SI") (V2DF "V2DI")]) + (V4SF "V4SI") (V2DF "V2DI") + (V4DF "V4DI") (V8SF "V8SI") + (V8SI "V8SI") (V4DI "V4DI") + (V4SI "V4SI") (V2DI "V2DI")]) ;; Mapping of vector modes to a vector mode of double size (define_mode_attr ssedoublevecmode @@ -162,17 +266,20 @@ ;; SSE scalar suffix for vector modes (define_mode_attr ssescalarmodesuffix - [(V8SF "ss") (V4DF "sd") + [(SF "ss") (DF "sd") + (V8SF "ss") (V4DF "sd") (V4SF "ss") (V2DF "sd") (V8SI "ss") (V4DI "sd") (V4SI "d")]) ;; Pack/unpack vector modes (define_mode_attr sseunpackmode - [(V16QI "V8HI") (V8HI "V4SI") (V4SI "V2DI")]) + [(V16QI "V8HI") (V8HI "V4SI") (V4SI "V2DI") + (V32QI "V16HI") (V16HI "V8SI") (V8SI "V4DI")]) (define_mode_attr ssepackmode - [(V8HI "V16QI") (V4SI "V8HI") (V2DI "V4SI")]) + [(V8HI "V16QI") (V4SI "V8HI") (V2DI "V4SI") + (V16HI "V32QI") (V8SI "V16HI") (V4DI "V8SI")]) ;; Mapping of the max integer size for xop rotate immediate constraint (define_mode_attr sserotatemax @@ -184,11 +291,27 @@ ;; Instruction suffix for sign and zero extensions. (define_code_attr extsuffix [(sign_extend "sx") (zero_extend "zx")]) - - ;; Mix-n-match (define_mode_iterator AVX256MODE2P [V8SI V8SF V4DF]) +(define_mode_iterator AVXMODE48P_DI + [V2DI V2DF V4DI V4DF V4SF V4SI]) +(define_mode_attr AVXMODE48P_DI + [(V2DI "V2DI") (V2DF "V2DI") + (V4DI "V4DI") (V4DF "V4DI") + (V4SI "V2DI") (V4SF "V2DI") + (V8SI "V4DI") (V8SF "V4DI")]) +(define_mode_attr gthrfirstp + [(V2DI "p") (V2DF "") + (V4DI "p") (V4DF "") + (V4SI "p") (V4SF "") + (V8SI "p") (V8SF "")]) +(define_mode_attr gthrlastp + [(V2DI "q") (V2DF "pd") + (V4DI "q") (V4DF "pd") + (V4SI "d") (V4SF "ps") + (V8SI "d") (V8SF "ps")]) + (define_mode_iterator FMAMODE [SF DF V4SF V2DF V8SF V4DF]) ;; Mapping of immediate bits for blend instructions @@ -229,7 +352,7 @@ case 1: case 2: switch (get_attr_mode (insn)) - { + { case MODE_V8SF: case MODE_V4SF: if (TARGET_AVX @@ -272,10 +395,10 @@ (set_attr "prefix" "maybe_vex") (set (attr "mode") (cond [(ne (symbol_ref "TARGET_AVX") (const_int 0)) - (const_string "<sseinsnmode>") + (const_string "<sseinsnmode>") (ior (ior - (ne (symbol_ref "optimize_function_for_size_p (cfun)") - (const_int 0)) + (ne (symbol_ref "optimize_function_for_size_p (cfun)") + (const_int 0)) (eq (symbol_ref "TARGET_SSE2") (const_int 0))) (and (eq_attr "alternative" "2") (ne (symbol_ref "TARGET_SSE_TYPELESS_STORES") @@ -325,15 +448,15 @@ /* The DImode arrived in a pair of integral registers (e.g. %edx:%eax). Assemble the 64-bit DImode value in an xmm register. 
*/ emit_insn (gen_sse2_loadld (operands[0], CONST0_RTX (V4SImode), - gen_rtx_SUBREG (SImode, operands[1], 0))); + gen_rtx_SUBREG (SImode, operands[1], 0))); emit_insn (gen_sse2_loadld (operands[2], CONST0_RTX (V4SImode), gen_rtx_SUBREG (SImode, operands[1], 4))); emit_insn (gen_vec_interleave_lowv4si (operands[0], operands[0], - operands[2])); + operands[2])); } else if (memory_operand (operands[1], DImode)) emit_insn (gen_vec_concatv2di (gen_lowpart (V2DImode, operands[0]), - operands[1], const0_rtx)); + operands[1], const0_rtx)); else gcc_unreachable (); }) @@ -1281,12 +1404,12 @@ (define_expand "vcond<mode>" [(set (match_operand:VF 0 "register_operand" "") - (if_then_else:VF - (match_operator 3 "" - [(match_operand:VF 4 "nonimmediate_operand" "") - (match_operand:VF 5 "nonimmediate_operand" "")]) - (match_operand:VF 1 "general_operand" "") - (match_operand:VF 2 "general_operand" "")))] + (if_then_else:VF + (match_operator 3 "" + [(match_operand:VF 4 "nonimmediate_operand" "") + (match_operand:VF 5 "nonimmediate_operand" "")]) + (match_operand:VF 1 "general_operand" "") + (match_operand:VF 2 "general_operand" "")))] "TARGET_SSE" { bool ok = ix86_expand_fp_vcond (operands); @@ -2579,7 +2702,7 @@ (parallel [(const_int 2) (const_int 3) (const_int 2) (const_int 3)]))) (set (match_operand:V2DF 0 "register_operand" "") - (float:V2DF + (float:V2DF (vec_select:V2SI (match_dup 2) (parallel [(const_int 0) (const_int 1)]))))] @@ -2601,7 +2724,7 @@ (parallel [(const_int 4) (const_int 5) (const_int 6) (const_int 7)]))) (set (match_operand:V4DF 0 "register_operand" "") - (float:V4DF + (float:V4DF (match_dup 2)))] "TARGET_AVX" "operands[2] = gen_reg_rtx (V4SImode);") @@ -2622,7 +2745,7 @@ (parallel [(const_int 2) (const_int 3) (const_int 2) (const_int 3)]))) (set (match_dup 6) - (float:V2DF + (float:V2DF (vec_select:V2SI (match_dup 5) (parallel [(const_int 0) (const_int 1)])))) @@ -2728,8 +2851,8 @@ emit_insn (gen_sse2_cvttpd2dq (r1, operands[1])); emit_insn (gen_sse2_cvttpd2dq (r2, operands[2])); emit_insn (gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]), - gen_lowpart (V2DImode, r1), - gen_lowpart (V2DImode, r2))); + gen_lowpart (V2DImode, r1), + gen_lowpart (V2DImode, r2))); DONE; }) @@ -2747,8 +2870,8 @@ emit_insn (gen_sse2_cvtpd2dq (r1, operands[1])); emit_insn (gen_sse2_cvtpd2dq (r2, operands[2])); emit_insn (gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]), - gen_lowpart (V2DImode, r1), - gen_lowpart (V2DImode, r2))); + gen_lowpart (V2DImode, r1), + gen_lowpart (V2DImode, r2))); DONE; }) @@ -3290,6 +3413,18 @@ operands[1] = force_reg (SFmode, operands[1]); }) +(define_insn "avx2_vec_dupv4sf" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_duplicate:V4SF + (vec_select:SF + (match_operand:V4SF 1 "register_operand" "x") + (parallel [(const_int 0)]))))] + "TARGET_AVX2" + "vbroadcastss\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF")]) + (define_insn "*vec_dupv4sf_avx" [(set (match_operand:V4SF 0 "register_operand" "=x,x") (vec_duplicate:V4SF @@ -3304,6 +3439,18 @@ (set_attr "prefix" "vex") (set_attr "mode" "V4SF")]) +(define_insn "avx2_vec_dupv8sf" + [(set (match_operand:V8SF 0 "register_operand" "=x") + (vec_duplicate:V8SF + (vec_select:SF + (match_operand:V4SF 1 "register_operand" "x") + (parallel [(const_int 0)]))))] + "TARGET_AVX2" + "vbroadcastss\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + (define_insn "*vec_dupv4sf" [(set 
(match_operand:V4SF 0 "register_operand" "=x") (vec_duplicate:V4SF @@ -3899,7 +4046,7 @@ (match_dup 3) (match_dup 4)) (parallel [(const_int 0) (const_int 1) - (const_int 4) (const_int 5)])))] + (const_int 4) (const_int 5)])))] "TARGET_AVX" { operands[3] = gen_reg_rtx (V4DFmode); @@ -4059,6 +4206,21 @@ }) ;; punpcklqdq and punpckhqdq are shorter than shufpd. +(define_insn "avx2_interleave_highv4di" + [(set (match_operand:V4DI 0 "register_operand" "=x") + (vec_select:V4DI + (vec_concat:V8DI + (match_operand:V4DI 1 "register_operand" "x") + (match_operand:V4DI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 1) + (const_int 5) + (const_int 3) + (const_int 7)])))] + "TARGET_AVX2" + "vpunpckhqdq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) (define_insn "vec_interleave_highv2di" [(set (match_operand:V2DI 0 "register_operand" "=x,x") @@ -4078,6 +4240,22 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "TI")]) +(define_insn "avx2_interleave_lowv4di" + [(set (match_operand:V4DI 0 "register_operand" "=x") + (vec_select:V4DI + (vec_concat:V8DI + (match_operand:V4DI 1 "register_operand" "x") + (match_operand:V4DI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) + (const_int 4) + (const_int 2) + (const_int 6)])))] + "TARGET_AVX2" + "vpunpcklqdq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_insn "vec_interleave_lowv2di" [(set (match_operand:V2DI 0 "register_operand" "=x,x") (vec_select:V2DI @@ -4463,18 +4641,18 @@ "operands[2] = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));") (define_expand "<plusminus_insn><mode>3" - [(set (match_operand:VI_128 0 "register_operand" "") - (plusminus:VI_128 - (match_operand:VI_128 1 "nonimmediate_operand" "") - (match_operand:VI_128 2 "nonimmediate_operand" "")))] + [(set (match_operand:VI 0 "register_operand" "") + (plusminus:VI + (match_operand:VI 1 "nonimmediate_operand" "") + (match_operand:VI 2 "nonimmediate_operand" "")))] "TARGET_SSE2" "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);") (define_insn "*<plusminus_insn><mode>3" - [(set (match_operand:VI_128 0 "register_operand" "=x,x") - (plusminus:VI_128 - (match_operand:VI_128 1 "nonimmediate_operand" "<comm>0,x") - (match_operand:VI_128 2 "nonimmediate_operand" "xm,xm")))] + [(set (match_operand:VI 0 "register_operand" "=x,x") + (plusminus:VI + (match_operand:VI 1 "nonimmediate_operand" "<comm>0,x") + (match_operand:VI 2 "nonimmediate_operand" "xm,xm")))] "TARGET_SSE2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)" "@ p<plusminus_mnemonic><ssemodesuffix>\t{%2, %0|%0, %2} @@ -4483,21 +4661,21 @@ (set_attr "type" "sseiadd") (set_attr "prefix_data16" "1,*") (set_attr "prefix" "orig,vex") - (set_attr "mode" "TI")]) + (set_attr "mode" "<sseinsnmode>")]) -(define_expand "sse2_<plusminus_insn><mode>3" - [(set (match_operand:VI12_128 0 "register_operand" "") - (sat_plusminus:VI12_128 - (match_operand:VI12_128 1 "nonimmediate_operand" "") - (match_operand:VI12_128 2 "nonimmediate_operand" "")))] +(define_expand "<sse2_avx2>_<plusminus_insn><mode>3" + [(set (match_operand:VI12_AVX2 0 "register_operand" "") + (sat_plusminus:VI12_AVX2 + (match_operand:VI12_AVX2 1 "nonimmediate_operand" "") + (match_operand:VI12_AVX2 2 "nonimmediate_operand" "")))] "TARGET_SSE2" "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);") -(define_insn "*sse2_<plusminus_insn><mode>3" - [(set (match_operand:VI12_128 0 "register_operand" "=x,x") - 
(sat_plusminus:VI12_128 - (match_operand:VI12_128 1 "nonimmediate_operand" "<comm>0,x") - (match_operand:VI12_128 2 "nonimmediate_operand" "xm,xm")))] +(define_insn "*<sse2_avx2>_<plusminus_insn><mode>3" + [(set (match_operand:VI12_AVX2 0 "register_operand" "=x,x") + (sat_plusminus:VI12_AVX2 + (match_operand:VI12_AVX2 1 "nonimmediate_operand" "<comm>0,x") + (match_operand:VI12_AVX2 2 "nonimmediate_operand" "xm,xm")))] "TARGET_SSE2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)" "@ p<plusminus_mnemonic><ssemodesuffix>\t{%2, %0|%0, %2} @@ -4548,18 +4726,18 @@ DONE; }) -(define_expand "mulv8hi3" - [(set (match_operand:V8HI 0 "register_operand" "") - (mult:V8HI (match_operand:V8HI 1 "nonimmediate_operand" "") - (match_operand:V8HI 2 "nonimmediate_operand" "")))] +(define_expand "mul<mode>3" + [(set (match_operand:VI2_AVX2 0 "register_operand" "") + (mult:VI2_AVX2 (match_operand:VI2_AVX2 1 "nonimmediate_operand" "") + (match_operand:VI2_AVX2 2 "nonimmediate_operand" "")))] "TARGET_SSE2" - "ix86_fixup_binary_operands_no_copy (MULT, V8HImode, operands);") + "ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);") -(define_insn "*mulv8hi3" - [(set (match_operand:V8HI 0 "register_operand" "=x,x") - (mult:V8HI (match_operand:V8HI 1 "nonimmediate_operand" "%0,x") - (match_operand:V8HI 2 "nonimmediate_operand" "xm,xm")))] - "TARGET_SSE2 && ix86_binary_operator_ok (MULT, V8HImode, operands)" +(define_insn "*mul<mode>3" + [(set (match_operand:VI2_AVX2 0 "register_operand" "=x,x") + (mult:VI2_AVX2 (match_operand:VI2_AVX2 1 "nonimmediate_operand" "%0,x") + (match_operand:VI2_AVX2 2 "nonimmediate_operand" "xm,xm")))] + "TARGET_SSE2 && ix86_binary_operator_ok (MULT, <MODE>mode, operands)" "@ pmullw\t{%2, %0|%0, %2} vpmullw\t{%2, %1, %0|%0, %1, %2}" @@ -4567,32 +4745,32 @@ (set_attr "type" "sseimul") (set_attr "prefix_data16" "1,*") (set_attr "prefix" "orig,vex") - (set_attr "mode" "TI")]) + (set_attr "mode" "<sseinsnmode>")]) -(define_expand "<s>mulv8hi3_highpart" - [(set (match_operand:V8HI 0 "register_operand" "") - (truncate:V8HI - (lshiftrt:V8SI - (mult:V8SI - (any_extend:V8SI - (match_operand:V8HI 1 "nonimmediate_operand" "")) - (any_extend:V8SI - (match_operand:V8HI 2 "nonimmediate_operand" ""))) - (const_int 16))))] +(define_expand "<s>mul<mode>3_highpart" + [(set (match_operand:VI2_AVX2 0 "register_operand" "") + (truncate:VI2_AVX2 + (lshiftrt:<ssedoublemode> + (mult:<ssedoublemode> + (any_extend:<ssedoublemode> + (match_operand:VI2_AVX2 1 "nonimmediate_operand" "")) + (any_extend:<ssedoublemode> + (match_operand:VI2_AVX2 2 "nonimmediate_operand" ""))) + (const_int 16))))] "TARGET_SSE2" "ix86_fixup_binary_operands_no_copy (MULT, V8HImode, operands);") -(define_insn "*<s>mulv8hi3_highpart" - [(set (match_operand:V8HI 0 "register_operand" "=x,x") - (truncate:V8HI - (lshiftrt:V8SI - (mult:V8SI - (any_extend:V8SI - (match_operand:V8HI 1 "nonimmediate_operand" "%0,x")) - (any_extend:V8SI - (match_operand:V8HI 2 "nonimmediate_operand" "xm,xm"))) +(define_insn "*<s>mul<mode>3_highpart" + [(set (match_operand:VI2_AVX2 0 "register_operand" "=x,x") + (truncate:VI2_AVX2 + (lshiftrt:<ssedoublemode> + (mult:<ssedoublemode> + (any_extend:<ssedoublemode> + (match_operand:VI2_AVX2 1 "nonimmediate_operand" "%0,x")) + (any_extend:<ssedoublemode> + (match_operand:VI2_AVX2 2 "nonimmediate_operand" "xm,xm"))) (const_int 16))))] - "TARGET_SSE2 && ix86_binary_operator_ok (MULT, V8HImode, operands)" + "TARGET_SSE2 && ix86_binary_operator_ok (MULT, <MODE>mode, operands)" "@ pmulh<u>w\t{%2, %0|%0, %2} 
vpmulh<u>w\t{%2, %1, %0|%0, %1, %2}" @@ -4600,7 +4778,42 @@ (set_attr "type" "sseimul") (set_attr "prefix_data16" "1,*") (set_attr "prefix" "orig,vex") - (set_attr "mode" "TI")]) + (set_attr "mode" "<sseinsnmode>")]) + +(define_expand "avx2_umulv4siv4di3" + [(set (match_operand:V4DI 0 "register_operand" "") + (mult:V4DI + (zero_extend:V4DI + (vec_select:V4SI + (match_operand:V8SI 1 "nonimmediate_operand" "") + (parallel [(const_int 0) (const_int 2) + (const_int 4) (const_int 6)]))) + (zero_extend:V4DI + (vec_select:V4SI + (match_operand:V8SI 2 "nonimmediate_operand" "") + (parallel [(const_int 0) (const_int 2) + (const_int 4) (const_int 6)])))))] + "TARGET_AVX2" + "ix86_fixup_binary_operands_no_copy (MULT, V8SImode, operands);") + +(define_insn "*avx_umulv4siv4di3" + [(set (match_operand:V4DI 0 "register_operand" "=x") + (mult:V4DI + (zero_extend:V4DI + (vec_select:V4SI + (match_operand:V8SI 1 "nonimmediate_operand" "%x") + (parallel [(const_int 0) (const_int 2) + (const_int 4) (const_int 6)]))) + (zero_extend:V4DI + (vec_select:V4SI + (match_operand:V8SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0) (const_int 2) + (const_int 4) (const_int 6)])))))] + "TARGET_AVX2 && ix86_binary_operator_ok (MULT, V8SImode, operands)" + "vpmuludq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseimul") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) (define_expand "sse2_umulv2siv2di3" [(set (match_operand:V2DI 0 "register_operand" "") @@ -4637,6 +4850,43 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "TI")]) +(define_expand "avx2_mulv4siv4di3" + [(set (match_operand:V4DI 0 "register_operand" "") + (mult:V4DI + (sign_extend:V4DI + (vec_select:V4SI + (match_operand:V8SI 1 "nonimmediate_operand" "") + (parallel [(const_int 0) (const_int 2) + (const_int 4) (const_int 6)]))) + (sign_extend:V4DI + (vec_select:V4SI + (match_operand:V8SI 2 "nonimmediate_operand" "") + (parallel [(const_int 0) (const_int 2) + (const_int 4) (const_int 6)])))))] + "TARGET_AVX2" + "ix86_fixup_binary_operands_no_copy (MULT, V8SImode, operands);") + +(define_insn "*avx2_mulv4siv4di3" + [(set (match_operand:V4DI 0 "register_operand" "=x") + (mult:V4DI + (sign_extend:V4DI + (vec_select:V4SI + (match_operand:V8SI 1 "nonimmediate_operand" "x") + (parallel [(const_int 0) (const_int 2) + (const_int 4) (const_int 6)]))) + (sign_extend:V4DI + (vec_select:V4SI + (match_operand:V8SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0) (const_int 2) + (const_int 4) (const_int 6)])))))] + "TARGET_AVX2 && ix86_binary_operator_ok (MULT, V8SImode, operands)" + "vpmuldq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "avx") + (set_attr "type" "sseimul") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_expand "sse4_1_mulv2siv2di3" [(set (match_operand:V2DI 0 "register_operand" "") (mult:V2DI @@ -4673,6 +4923,56 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "TI")]) +(define_expand "avx2_pmaddwd" + [(set (match_operand:V8SI 0 "register_operand" "") + (plus:V8SI + (mult:V8SI + (sign_extend:V8SI + (vec_select:V8HI + (match_operand:V16HI 1 "nonimmediate_operand" "") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6) + (const_int 8) + (const_int 10) + (const_int 12) + (const_int 14)]))) + (sign_extend:V8SI + (vec_select:V8HI + (match_operand:V16HI 2 "nonimmediate_operand" "") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6) + (const_int 8) + (const_int 10) + (const_int 12) + (const_int 14)])))) + (mult:V8SI + (sign_extend:V8SI + 
(vec_select:V8HI (match_dup 1) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7) + (const_int 9) + (const_int 11) + (const_int 13) + (const_int 15)]))) + (sign_extend:V8SI + (vec_select:V8HI (match_dup 2) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7) + (const_int 9) + (const_int 11) + (const_int 13) + (const_int 15)]))))))] + "TARGET_AVX2" + "ix86_fixup_binary_operands_no_copy (MULT, V16HImode, operands);") + (define_expand "sse2_pmaddwd" [(set (match_operand:V4SI 0 "register_operand" "") (plus:V4SI @@ -4707,6 +5007,59 @@ "TARGET_SSE2" "ix86_fixup_binary_operands_no_copy (MULT, V8HImode, operands);") +(define_insn "*avx2_pmaddwd" + [(set (match_operand:V8SI 0 "register_operand" "=x") + (plus:V8SI + (mult:V8SI + (sign_extend:V8SI + (vec_select:V8HI + (match_operand:V16HI 1 "nonimmediate_operand" "%x") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6) + (const_int 8) + (const_int 10) + (const_int 12) + (const_int 14)]))) + (sign_extend:V8SI + (vec_select:V8HI + (match_operand:V16HI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6) + (const_int 8) + (const_int 10) + (const_int 12) + (const_int 14)])))) + (mult:V8SI + (sign_extend:V8SI + (vec_select:V8HI (match_dup 1) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7) + (const_int 9) + (const_int 11) + (const_int 13) + (const_int 15)]))) + (sign_extend:V8SI + (vec_select:V8HI (match_dup 2) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7) + (const_int 9) + (const_int 11) + (const_int 13) + (const_int 15)]))))))] + "TARGET_AVX2 && ix86_binary_operator_ok (MULT, V16HImode, operands)" + "vpmaddwd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_insn "*sse2_pmaddwd" [(set (match_operand:V4SI 0 "register_operand" "=x,x") (plus:V4SI @@ -4749,21 +5102,21 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "TI")]) -(define_expand "mulv4si3" - [(set (match_operand:V4SI 0 "register_operand" "") - (mult:V4SI (match_operand:V4SI 1 "register_operand" "") - (match_operand:V4SI 2 "register_operand" "")))] +(define_expand "mul<mode>3" + [(set (match_operand:VI4_AVX2 0 "register_operand" "") + (mult:VI4_AVX2 (match_operand:VI4_AVX2 1 "register_operand" "") + (match_operand:VI4_AVX2 2 "register_operand" "")))] "TARGET_SSE2" { if (TARGET_SSE4_1 || TARGET_AVX) - ix86_fixup_binary_operands_no_copy (MULT, V4SImode, operands); + ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands); }) -(define_insn "*sse4_1_mulv4si3" - [(set (match_operand:V4SI 0 "register_operand" "=x,x") - (mult:V4SI (match_operand:V4SI 1 "nonimmediate_operand" "%0,x") - (match_operand:V4SI 2 "nonimmediate_operand" "xm,xm")))] - "TARGET_SSE4_1 && ix86_binary_operator_ok (MULT, V4SImode, operands)" +(define_insn "*<sse4_1_avx2>_mul<mode>3" + [(set (match_operand:VI4_AVX2 0 "register_operand" "=x,x") + (mult:VI4_AVX2 (match_operand:VI4_AVX2 1 "nonimmediate_operand" "%0,x") + (match_operand:VI4_AVX2 2 "nonimmediate_operand" "xm,xm")))] + "TARGET_SSE4_1 && ix86_binary_operator_ok (MULT, <MODE>mode, operands)" "@ pmulld\t{%2, %0|%0, %2} vpmulld\t{%2, %1, %0|%0, %1, %2}" @@ -4771,7 +5124,7 @@ (set_attr "type" "sseimul") (set_attr "prefix_extra" "1") (set_attr "prefix" "orig,vex") - (set_attr "mode" "TI")]) + (set_attr "mode" "<sseinsnmode>")]) (define_insn_and_split "*sse2_mulv4si3" [(set (match_operand:V4SI 0 "register_operand" "") @@ 
-4885,7 +5238,7 @@ /* Multiply low parts. */ emit_insn (gen_sse2_umulv2siv2di3 (t1, gen_lowpart (V4SImode, op1), - gen_lowpart (V4SImode, op2))); + gen_lowpart (V4SImode, op2))); /* Shift input vectors left 32 bits so we can multiply high parts. */ emit_insn (gen_lshrv2di3 (t2, op1, thirtytwo)); @@ -5119,9 +5472,9 @@ }) (define_insn "ashr<mode>3" - [(set (match_operand:VI24_128 0 "register_operand" "=x,x") - (ashiftrt:VI24_128 - (match_operand:VI24_128 1 "register_operand" "0,x") + [(set (match_operand:VI24_AVX2 0 "register_operand" "=x,x") + (ashiftrt:VI24_AVX2 + (match_operand:VI24_AVX2 1 "register_operand" "0,x") (match_operand:SI 2 "nonmemory_operand" "xN,xN")))] "TARGET_SSE2" "@ @@ -5135,12 +5488,27 @@ (const_string "0"))) (set_attr "prefix_data16" "1,*") (set_attr "prefix" "orig,vex") - (set_attr "mode" "TI")]) + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx2_lshrqv4di3" + [(set (match_operand:V4DI 0 "register_operand" "=x") + (lshiftrt:V4DI + (match_operand:V4DI 1 "register_operand" "x") + (match_operand:SI 2 "const_0_to_255_mul_8_operand" "n")))] + "TARGET_AVX2" +{ + operands[2] = GEN_INT (INTVAL (operands[2]) / 8); + return "vpsrldq\t{%2, %1, %0|%0, %1, %2}"; +} + [(set_attr "type" "sseishft") + (set_attr "prefix" "vex") + (set_attr "length_immediate" "1") + (set_attr "mode" "OI")]) (define_insn "lshr<mode>3" - [(set (match_operand:VI248_128 0 "register_operand" "=x,x") - (lshiftrt:VI248_128 - (match_operand:VI248_128 1 "register_operand" "0,x") + [(set (match_operand:VI248_AVX2 0 "register_operand" "=x,x") + (lshiftrt:VI248_AVX2 + (match_operand:VI248_AVX2 1 "register_operand" "0,x") (match_operand:SI 2 "nonmemory_operand" "xN,xN")))] "TARGET_SSE2" "@ @@ -5154,7 +5522,36 @@ (const_string "0"))) (set_attr "prefix_data16" "1,*") (set_attr "prefix" "orig,vex") - (set_attr "mode" "TI")]) + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx2_lshlqv4di3" + [(set (match_operand:V4DI 0 "register_operand" "=x") + (ashift:V4DI (match_operand:V4DI 1 "register_operand" "x") + (match_operand:SI 2 "const_0_to_255_mul_8_operand" "n")))] + "TARGET_AVX2" +{ + operands[2] = GEN_INT (INTVAL (operands[2]) / 8); + return "vpslldq\t{%2, %1, %0|%0, %1, %2}"; +} + [(set_attr "type" "sseishft") + (set_attr "prefix" "vex") + (set_attr "length_immediate" "1") + (set_attr "mode" "OI")]) + +(define_insn "avx2_lshl<mode>3" + [(set (match_operand:VI248_256 0 "register_operand" "=x") + (ashift:VI248_256 + (match_operand:VI248_256 1 "register_operand" "x") + (match_operand:SI 2 "nonmemory_operand" "xN")))] + "TARGET_AVX2" + "vpsll<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseishft") + (set_attr "prefix" "vex") + (set (attr "length_immediate") + (if_then_else (match_operand 2 "const_int_operand" "") + (const_string "1") + (const_string "0"))) + (set_attr "mode" "OI")]) (define_insn "ashl<mode>3" [(set (match_operand:VI248_128 0 "register_operand" "=x,x") @@ -5177,7 +5574,7 @@ (define_expand "vec_shl_<mode>" [(set (match_operand:VI_128 0 "register_operand" "") - (ashift:V1TI + (ashift:V1TI (match_operand:VI_128 1 "register_operand" "") (match_operand:SI 2 "const_0_to_255_mul_8_operand" "")))] "TARGET_SSE2" @@ -5186,10 +5583,10 @@ operands[1] = gen_lowpart (V1TImode, operands[1]); }) -(define_insn "sse2_ashlv1ti3" - [(set (match_operand:V1TI 0 "register_operand" "=x,x") - (ashift:V1TI - (match_operand:V1TI 1 "register_operand" "0,x") +(define_insn "<sse2_avx2>_ashl<mode>3" + [(set (match_operand:VIMAX_AVX2 0 "register_operand" "=x,x") + (ashift:VIMAX_AVX2 + 
(match_operand:VIMAX_AVX2 1 "register_operand" "0,x") (match_operand:SI 2 "const_0_to_255_mul_8_operand" "n,n")))] "TARGET_SSE2" { @@ -5210,11 +5607,11 @@ (set_attr "length_immediate" "1") (set_attr "prefix_data16" "1,*") (set_attr "prefix" "orig,vex") - (set_attr "mode" "TI")]) + (set_attr "mode" "<sseinsnmode>")]) (define_expand "vec_shr_<mode>" [(set (match_operand:VI_128 0 "register_operand" "") - (lshiftrt:V1TI + (lshiftrt:V1TI (match_operand:VI_128 1 "register_operand" "") (match_operand:SI 2 "const_0_to_255_mul_8_operand" "")))] "TARGET_SSE2" @@ -5223,9 +5620,29 @@ operands[1] = gen_lowpart (V1TImode, operands[1]); }) +(define_expand "avx2_<code><mode>3" + [(set (match_operand:VI124_256 0 "register_operand" "") + (umaxmin:VI124_256 + (match_operand:VI124_256 1 "nonimmediate_operand" "") + (match_operand:VI124_256 2 "nonimmediate_operand" "")))] + "TARGET_AVX2" + "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);") + +(define_insn "*avx2_<code><mode>3" + [(set (match_operand:VI124_256 0 "register_operand" "=x") + (umaxmin:VI124_256 + (match_operand:VI124_256 1 "nonimmediate_operand" "%x") + (match_operand:VI124_256 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)" + "vp<maxmin_int><ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_insn "sse2_lshrv1ti3" [(set (match_operand:V1TI 0 "register_operand" "=x,x") - (lshiftrt:V1TI + (lshiftrt:V1TI (match_operand:V1TI 1 "register_operand" "0,x") (match_operand:SI 2 "const_0_to_255_mul_8_operand" "n,n")))] "TARGET_SSE2" @@ -5250,6 +5667,26 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "TI")]) +(define_expand "avx2_<code><mode>3" + [(set (match_operand:VI124_256 0 "register_operand" "") + (smaxmin:VI124_256 + (match_operand:VI124_256 1 "nonimmediate_operand" "") + (match_operand:VI124_256 2 "nonimmediate_operand" "")))] + "TARGET_AVX2" + "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);") + +(define_insn "*avx2_<code><mode>3" + [(set (match_operand:VI124_256 0 "register_operand" "=x") + (smaxmin:VI124_256 + (match_operand:VI124_256 1 "nonimmediate_operand" "%x") + (match_operand:VI124_256 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)" + "vp<maxmin_int><ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_insn "*sse4_1_<code><mode>3" [(set (match_operand:VI14_128 0 "register_operand" "=x,x") (smaxmin:VI14_128 @@ -5533,6 +5970,26 @@ ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(define_expand "avx2_eq<mode>3" + [(set (match_operand:VI1248_256 0 "register_operand" "") + (eq:VI1248_256 + (match_operand:VI1248_256 1 "nonimmediate_operand" "") + (match_operand:VI1248_256 2 "nonimmediate_operand" "")))] + "TARGET_AVX2" + "ix86_fixup_binary_operands_no_copy (EQ, <MODE>mode, operands);") + +(define_insn "*avx2_eq<mode>3" + [(set (match_operand:VI1248_256 0 "register_operand" "=x") + (eq:VI1248_256 + (match_operand:VI1248_256 1 "nonimmediate_operand" "%x") + (match_operand:VI1248_256 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX2 && ix86_binary_operator_ok (EQ, <MODE>mode, operands)" + "vpcmpeq<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssecmp") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr 
"mode" "OI")]) + (define_insn "*sse4_1_eqv2di3" [(set (match_operand:V2DI 0 "register_operand" "=x,x") (eq:V2DI @@ -5595,6 +6052,18 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "TI")]) +(define_insn "avx2_gt<mode>3" + [(set (match_operand:VI1248_256 0 "register_operand" "=x") + (gt:VI1248_256 + (match_operand:VI1248_256 1 "register_operand" "x") + (match_operand:VI1248_256 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX2" + "vpcmpgt<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssecmp") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_insn "sse2_gt<mode>3" [(set (match_operand:VI124_128 0 "register_operand" "=x,x") (gt:VI124_128 @@ -5612,12 +6081,12 @@ (define_expand "vcond<mode>" [(set (match_operand:VI124_128 0 "register_operand" "") - (if_then_else:VI124_128 - (match_operator 3 "" - [(match_operand:VI124_128 4 "nonimmediate_operand" "") - (match_operand:VI124_128 5 "nonimmediate_operand" "")]) - (match_operand:VI124_128 1 "general_operand" "") - (match_operand:VI124_128 2 "general_operand" "")))] + (if_then_else:VI124_128 + (match_operator 3 "" + [(match_operand:VI124_128 4 "nonimmediate_operand" "") + (match_operand:VI124_128 5 "nonimmediate_operand" "")]) + (match_operand:VI124_128 1 "general_operand" "") + (match_operand:VI124_128 2 "general_operand" "")))] "TARGET_SSE2" { bool ok = ix86_expand_int_vcond (operands); @@ -5627,12 +6096,12 @@ (define_expand "vcondv2di" [(set (match_operand:V2DI 0 "register_operand" "") - (if_then_else:V2DI - (match_operator 3 "" - [(match_operand:V2DI 4 "nonimmediate_operand" "") - (match_operand:V2DI 5 "nonimmediate_operand" "")]) - (match_operand:V2DI 1 "general_operand" "") - (match_operand:V2DI 2 "general_operand" "")))] + (if_then_else:V2DI + (match_operator 3 "" + [(match_operand:V2DI 4 "nonimmediate_operand" "") + (match_operand:V2DI 5 "nonimmediate_operand" "")]) + (match_operand:V2DI 1 "general_operand" "") + (match_operand:V2DI 2 "general_operand" "")))] "TARGET_SSE4_2" { bool ok = ix86_expand_int_vcond (operands); @@ -5642,12 +6111,12 @@ (define_expand "vcondu<mode>" [(set (match_operand:VI124_128 0 "register_operand" "") - (if_then_else:VI124_128 - (match_operator 3 "" - [(match_operand:VI124_128 4 "nonimmediate_operand" "") - (match_operand:VI124_128 5 "nonimmediate_operand" "")]) - (match_operand:VI124_128 1 "general_operand" "") - (match_operand:VI124_128 2 "general_operand" "")))] + (if_then_else:VI124_128 + (match_operator 3 "" + [(match_operand:VI124_128 4 "nonimmediate_operand" "") + (match_operand:VI124_128 5 "nonimmediate_operand" "")]) + (match_operand:VI124_128 1 "general_operand" "") + (match_operand:VI124_128 2 "general_operand" "")))] "TARGET_SSE2" { bool ok = ix86_expand_int_vcond (operands); @@ -5657,12 +6126,12 @@ (define_expand "vconduv2di" [(set (match_operand:V2DI 0 "register_operand" "") - (if_then_else:V2DI - (match_operator 3 "" - [(match_operand:V2DI 4 "nonimmediate_operand" "") - (match_operand:V2DI 5 "nonimmediate_operand" "")]) - (match_operand:V2DI 1 "general_operand" "") - (match_operand:V2DI 2 "general_operand" "")))] + (if_then_else:V2DI + (match_operator 3 "" + [(match_operand:V2DI 4 "nonimmediate_operand" "") + (match_operand:V2DI 5 "nonimmediate_operand" "")]) + (match_operand:V2DI 1 "general_operand" "") + (match_operand:V2DI 2 "general_operand" "")))] "TARGET_SSE4_2" { bool ok = ix86_expand_int_vcond (operands); @@ -5691,11 +6160,11 @@ operands[2] = force_reg (<MODE>mode, gen_rtx_CONST_VECTOR (<MODE>mode, v)); }) -(define_expand 
"sse2_andnot<mode>3" - [(set (match_operand:VI_128 0 "register_operand" "") - (and:VI_128 - (not:VI_128 (match_operand:VI_128 1 "register_operand" "")) - (match_operand:VI_128 2 "nonimmediate_operand" "")))] +(define_expand "<sse2_avx2>_andnot<mode>3" + [(set (match_operand:VI 0 "register_operand" "") + (and:VI + (not:VI (match_operand:VI 1 "register_operand" "")) + (match_operand:VI 2 "nonimmediate_operand" "")))] "TARGET_SSE2") (define_insn "*andnot<mode>3" @@ -5708,7 +6177,8 @@ static char buf[32]; const char *ops; const char *tmp - = (get_attr_mode (insn) == MODE_TI) ? "pandn" : "andnps"; + = ((get_attr_mode (insn) == MODE_TI) || + (get_attr_mode (insn) == MODE_OI)) ? "pandn" : "andnps"; switch (which_alternative) { @@ -5739,6 +6209,8 @@ (const_string "V8SF") (ne (symbol_ref "TARGET_SSE2") (const_int 0)) (const_string "TI") + (ne (symbol_ref "TARGET_AVX2") (const_int 0)) + (const_string "OI") ] (const_string "V4SF")))]) @@ -5761,7 +6233,8 @@ static char buf[32]; const char *ops; const char *tmp - = (get_attr_mode (insn) == MODE_TI) ? "p<logic>" : "<logic>ps"; + = (get_attr_mode (insn) == MODE_TI)|| + (get_attr_mode (insn) == MODE_OI) ? "p<logic>" : "<logic>ps"; switch (which_alternative) { @@ -5792,6 +6265,8 @@ (const_string "V8SF") (ne (symbol_ref "TARGET_SSE2") (const_int 0)) (const_string "TI") + (ne (symbol_ref "TARGET_AVX2") (const_int 0)) + (const_string "OI") ] (const_string "V4SF")))]) @@ -5852,13 +6327,13 @@ DONE; }) -(define_insn "sse2_packsswb" - [(set (match_operand:V16QI 0 "register_operand" "=x,x") - (vec_concat:V16QI - (ss_truncate:V8QI - (match_operand:V8HI 1 "register_operand" "0,x")) - (ss_truncate:V8QI - (match_operand:V8HI 2 "nonimmediate_operand" "xm,xm"))))] +(define_insn "<sse2_avx2>_packsswb" + [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x") + (vec_concat:VI1_AVX2 + (ss_truncate:<ssehalfvecmode> + (match_operand:<sseunpackmode> 1 "register_operand" "0,x")) + (ss_truncate:<ssehalfvecmode> + (match_operand:<sseunpackmode> 2 "nonimmediate_operand" "xm,xm"))))] "TARGET_SSE2" "@ packsswb\t{%2, %0|%0, %2} @@ -5867,15 +6342,15 @@ (set_attr "type" "sselog") (set_attr "prefix_data16" "1,*") (set_attr "prefix" "orig,vex") - (set_attr "mode" "TI")]) + (set_attr "mode" "<sseinsnmode>")]) -(define_insn "sse2_packssdw" - [(set (match_operand:V8HI 0 "register_operand" "=x,x") - (vec_concat:V8HI - (ss_truncate:V4HI - (match_operand:V4SI 1 "register_operand" "0,x")) - (ss_truncate:V4HI - (match_operand:V4SI 2 "nonimmediate_operand" "xm,xm"))))] +(define_insn "<sse2_avx2>_packssdw" + [(set (match_operand:VI2_AVX2 0 "register_operand" "=x,x") + (vec_concat:VI2_AVX2 + (ss_truncate:<ssehalfvecmode> + (match_operand:<sseunpackmode> 1 "register_operand" "0,x")) + (ss_truncate:<ssehalfvecmode> + (match_operand:<sseunpackmode> 2 "nonimmediate_operand" "xm,xm"))))] "TARGET_SSE2" "@ packssdw\t{%2, %0|%0, %2} @@ -5884,15 +6359,15 @@ (set_attr "type" "sselog") (set_attr "prefix_data16" "1,*") (set_attr "prefix" "orig,vex") - (set_attr "mode" "TI")]) + (set_attr "mode" "<sseinsnmode>")]) -(define_insn "sse2_packuswb" - [(set (match_operand:V16QI 0 "register_operand" "=x,x") - (vec_concat:V16QI - (us_truncate:V8QI - (match_operand:V8HI 1 "register_operand" "0,x")) - (us_truncate:V8QI - (match_operand:V8HI 2 "nonimmediate_operand" "xm,xm"))))] +(define_insn "<sse2_avx2>_packuswb" + [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x") + (vec_concat:VI1_AVX2 + (us_truncate:<ssehalfvecmode> + (match_operand:<sseunpackmode> 1 "register_operand" "0,x")) + 
(us_truncate:<ssehalfvecmode> + (match_operand:<sseunpackmode> 2 "nonimmediate_operand" "xm,xm"))))] "TARGET_SSE2" "@ packuswb\t{%2, %0|%0, %2} @@ -5901,7 +6376,36 @@ (set_attr "type" "sselog") (set_attr "prefix_data16" "1,*") (set_attr "prefix" "orig,vex") - (set_attr "mode" "TI")]) + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx2_interleave_highv32qi" + [(set (match_operand:V32QI 0 "register_operand" "=x") + (vec_select:V32QI + (vec_concat:V64QI + (match_operand:V32QI 1 "register_operand" "x") + (match_operand:V32QI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 8) (const_int 40) + (const_int 9) (const_int 41) + (const_int 10) (const_int 42) + (const_int 11) (const_int 43) + (const_int 12) (const_int 44) + (const_int 13) (const_int 45) + (const_int 14) (const_int 46) + (const_int 15) (const_int 47) + (const_int 24) (const_int 56) + (const_int 25) (const_int 57) + (const_int 26) (const_int 58) + (const_int 27) (const_int 59) + (const_int 28) (const_int 60) + (const_int 29) (const_int 61) + (const_int 30) (const_int 62) + (const_int 31) (const_int 63) + (const_int 32) (const_int 64)])))] + "TARGET_AVX2" + "vpunpckhbw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) (define_insn "vec_interleave_highv16qi" [(set (match_operand:V16QI 0 "register_operand" "=x,x") @@ -5927,6 +6431,35 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "TI")]) +(define_insn "avx2_interleave_lowv32qi" + [(set (match_operand:V32QI 0 "register_operand" "=x") + (vec_select:V32QI + (vec_concat:V64QI + (match_operand:V32QI 1 "register_operand" "x") + (match_operand:V32QI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) (const_int 32) + (const_int 1) (const_int 33) + (const_int 2) (const_int 34) + (const_int 3) (const_int 35) + (const_int 4) (const_int 36) + (const_int 5) (const_int 37) + (const_int 6) (const_int 38) + (const_int 7) (const_int 39) + (const_int 15) (const_int 47) + (const_int 16) (const_int 48) + (const_int 17) (const_int 49) + (const_int 18) (const_int 50) + (const_int 19) (const_int 51) + (const_int 20) (const_int 52) + (const_int 21) (const_int 53) + (const_int 22) (const_int 54) + (const_int 23) (const_int 55)])))] + "TARGET_AVX2" + "vpunpcklbw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_insn "vec_interleave_lowv16qi" [(set (match_operand:V16QI 0 "register_operand" "=x,x") (vec_select:V16QI @@ -5951,6 +6484,26 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "TI")]) +(define_insn "avx2_interleave_highv16hi" + [(set (match_operand:V16HI 0 "register_operand" "=x") + (vec_select:V16HI + (vec_concat:V32HI + (match_operand:V16HI 1 "register_operand" "x") + (match_operand:V16HI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 4) (const_int 20) + (const_int 5) (const_int 21) + (const_int 6) (const_int 22) + (const_int 7) (const_int 23) + (const_int 12) (const_int 28) + (const_int 13) (const_int 29) + (const_int 14) (const_int 30) + (const_int 15) (const_int 31)])))] + "TARGET_AVX2" + "vpunpckhwd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_insn "vec_interleave_highv8hi" [(set (match_operand:V8HI 0 "register_operand" "=x,x") (vec_select:V8HI @@ -5971,6 +6524,26 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "TI")]) +(define_insn "avx2_interleave_lowv16hi" + [(set (match_operand:V16HI 0 "register_operand" "=x") + (vec_select:V16HI + (vec_concat:V32HI + 
(match_operand:V16HI 1 "register_operand" "x") + (match_operand:V16HI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) (const_int 16) + (const_int 1) (const_int 17) + (const_int 2) (const_int 18) + (const_int 3) (const_int 19) + (const_int 8) (const_int 24) + (const_int 9) (const_int 25) + (const_int 10) (const_int 26) + (const_int 11) (const_int 27)])))] + "TARGET_AVX2" + "vpunpcklwd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_insn "vec_interleave_lowv8hi" [(set (match_operand:V8HI 0 "register_operand" "=x,x") (vec_select:V8HI @@ -5991,6 +6564,22 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "TI")]) +(define_insn "avx2_interleave_highv8si" + [(set (match_operand:V8SI 0 "register_operand" "=x") + (vec_select:V8SI + (vec_concat:V16SI + (match_operand:V8SI 1 "register_operand" "x") + (match_operand:V8SI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 2) (const_int 10) + (const_int 3) (const_int 11) + (const_int 6) (const_int 14) + (const_int 7) (const_int 15)])))] + "TARGET_AVX2" + "vpunpckhdq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_insn "vec_interleave_highv4si" [(set (match_operand:V4SI 0 "register_operand" "=x,x") (vec_select:V4SI @@ -6009,6 +6598,22 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "TI")]) +(define_insn "avx2_interleave_lowv8si" + [(set (match_operand:V8SI 0 "register_operand" "=x") + (vec_select:V8SI + (vec_concat:V16SI + (match_operand:V8SI 1 "register_operand" "x") + (match_operand:V8SI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) (const_int 8) + (const_int 1) (const_int 9) + (const_int 4) (const_int 12) + (const_int 5) (const_int 13)])))] + "TARGET_AVX2" + "vpunpckldq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_insn "vec_interleave_lowv4si" [(set (match_operand:V4SI 0 "register_operand" "=x,x") (vec_select:V4SI @@ -6055,13 +6660,13 @@ { case 0: if (GET_MODE_SIZE (<ssescalarmode>mode) < GET_MODE_SIZE (SImode)) - return "pinsr<ssemodesuffix>\t{%3, %k2, %0|%0, %k2, %3}"; + return "pinsr<ssemodesuffix>\t{%3, %k2, %0|%0, %k2, %3}"; /* FALLTHRU */ case 1: return "pinsr<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}"; case 2: if (GET_MODE_SIZE (<ssescalarmode>mode) < GET_MODE_SIZE (SImode)) - return "vpinsr<ssemodesuffix>\t{%3, %k2, %1, %0|%0, %1, %k2, %3}"; + return "vpinsr<ssemodesuffix>\t{%3, %k2, %1, %0|%0, %1, %k2, %3}"; /* FALLTHRU */ case 3: return "vpinsr<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"; @@ -6189,6 +6794,49 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) +(define_expand "avx2_pshufdv3" + [(match_operand:V8SI 0 "register_operand" "") + (match_operand:V8SI 1 "nonimmediate_operand" "") + (match_operand:SI 2 "const_0_to_255_operand" "")] + "TARGET_AVX2" +{ + int mask = INTVAL (operands[2]); + emit_insn (gen_avx2_pshufd_1 (operands[0], operands[1], + GEN_INT ((mask >> 0) & 3), + GEN_INT ((mask >> 2) & 3), + GEN_INT ((mask >> 4) & 3), + GEN_INT ((mask >> 6) & 3))); + DONE; +}) + +(define_insn "avx2_pshufd_1" + [(set (match_operand:V8SI 0 "register_operand" "=x") + (vec_select:V8SI + (match_operand:V8SI 1 "nonimmediate_operand" "xm") + (parallel [(match_operand 2 "const_0_to_3_operand" "") + (match_operand 3 "const_0_to_3_operand" "") + (match_operand 4 "const_0_to_3_operand" "") + (match_operand 5 "const_0_to_3_operand" "") + (match_dup 2) + (match_dup 3) + (match_dup 4) + 
(match_dup 5)])))] + "TARGET_AVX2" +{ + int mask = 0; + mask |= INTVAL (operands[2]) << 0; + mask |= INTVAL (operands[3]) << 2; + mask |= INTVAL (operands[4]) << 4; + mask |= INTVAL (operands[5]) << 6; + operands[2] = GEN_INT (mask); + + return "vpshufd\t{%2, %1, %0|%0, %1, %2}"; +} + [(set_attr "type" "sselog1") + (set_attr "prefix" "vex") + (set_attr "length_immediate" "1") + (set_attr "mode" "OI")]) + (define_expand "sse2_pshufd" [(match_operand:V4SI 0 "register_operand" "") (match_operand:V4SI 1 "nonimmediate_operand" "") @@ -6229,6 +6877,57 @@ (set_attr "length_immediate" "1") (set_attr "mode" "TI")]) +(define_expand "avx2_pshuflwv3" + [(match_operand:V16HI 0 "register_operand" "") + (match_operand:V16HI 1 "nonimmediate_operand" "") + (match_operand:SI 2 "const_0_to_255_operand" "")] + "TARGET_AVX2" +{ + int mask = INTVAL (operands[2]); + emit_insn (gen_avx2_pshuflw_1 (operands[0], operands[1], + GEN_INT ((mask >> 0) & 3), + GEN_INT ((mask >> 2) & 3), + GEN_INT ((mask >> 4) & 3), + GEN_INT ((mask >> 6) & 3))); + DONE; +}) + +(define_insn "avx2_pshuflw_1" + [(set (match_operand:V16HI 0 "register_operand" "=x") + (vec_select:V16HI + (match_operand:V16HI 1 "nonimmediate_operand" "xm") + (parallel [(match_operand 2 "const_0_to_3_operand" "") + (match_operand 3 "const_0_to_3_operand" "") + (match_operand 4 "const_0_to_3_operand" "") + (match_operand 5 "const_0_to_3_operand" "") + (const_int 4) + (const_int 5) + (const_int 6) + (const_int 7) + (match_dup 2) + (match_dup 3) + (match_dup 4) + (match_dup 5) + (const_int 12) + (const_int 13) + (const_int 14) + (const_int 15)])))] + "TARGET_AVX2" +{ + int mask = 0; + mask |= INTVAL (operands[2]) << 0; + mask |= INTVAL (operands[3]) << 2; + mask |= INTVAL (operands[4]) << 4; + mask |= INTVAL (operands[5]) << 6; + operands[2] = GEN_INT (mask); + + return "vpshuflw\t{%2, %1, %0|%0, %1, %2}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "length_immediate" "1") + (set_attr "mode" "OI")]) + (define_expand "sse2_pshuflw" [(match_operand:V8HI 0 "register_operand" "") (match_operand:V8HI 1 "nonimmediate_operand" "") @@ -6274,6 +6973,57 @@ (set_attr "length_immediate" "1") (set_attr "mode" "TI")]) +(define_expand "avx2_pshufhwv3" + [(match_operand:V16HI 0 "register_operand" "") + (match_operand:V16HI 1 "nonimmediate_operand" "") + (match_operand:SI 2 "const_0_to_255_operand" "")] + "TARGET_AVX2" +{ + int mask = INTVAL (operands[2]); + emit_insn (gen_avx2_pshufhw_1 (operands[0], operands[1], + GEN_INT (((mask >> 0) & 3) + 4), + GEN_INT (((mask >> 2) & 3) + 4), + GEN_INT (((mask >> 4) & 3) + 4), + GEN_INT (((mask >> 6) & 3) + 4))); + DONE; +}) + +(define_insn "avx2_pshufhw_1" + [(set (match_operand:V16HI 0 "register_operand" "=x") + (vec_select:V16HI + (match_operand:V16HI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 1) + (const_int 2) + (const_int 3) + (match_operand 2 "const_4_to_7_operand" "") + (match_operand 3 "const_4_to_7_operand" "") + (match_operand 4 "const_4_to_7_operand" "") + (match_operand 5 "const_4_to_7_operand" "") + (const_int 8) + (const_int 9) + (const_int 10) + (const_int 11) + (match_dup 2) + (match_dup 3) + (match_dup 4) + (match_dup 5)])))] + "TARGET_AVX2" +{ + int mask = 0; + mask |= (INTVAL (operands[2]) - 4) << 0; + mask |= (INTVAL (operands[3]) - 4) << 2; + mask |= (INTVAL (operands[4]) - 4) << 4; + mask |= (INTVAL (operands[5]) - 4) << 6; + operands[2] = GEN_INT (mask); + + return "vpshufhw\t{%2, %1, %0|%0, %1, %2}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix" 
"vex") + (set_attr "length_immediate" "1") + (set_attr "mode" "OI")]) + (define_expand "sse2_pshufhw" [(match_operand:V8HI 0 "register_operand" "") (match_operand:V8HI 1 "nonimmediate_operand" "") @@ -6665,6 +7415,36 @@ ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(define_expand "avx2_uavgv32qi3" + [(set (match_operand:V32QI 0 "register_operand" "") + (truncate:V32QI + (lshiftrt:V32HI + (plus:V32HI + (plus:V32HI + (zero_extend:V32HI + (match_operand:V32QI 1 "nonimmediate_operand" "")) + (zero_extend:V32HI + (match_operand:V32QI 2 "nonimmediate_operand" ""))) + (const_vector:V32QI [(const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1)])) + (const_int 1))))] + "TARGET_AVX2" + "ix86_fixup_binary_operands_no_copy (PLUS, V32QImode, operands);") + (define_expand "sse2_uavgv16qi3" [(set (match_operand:V16QI 0 "register_operand" "") (truncate:V16QI @@ -6687,6 +7467,39 @@ "TARGET_SSE2" "ix86_fixup_binary_operands_no_copy (PLUS, V16QImode, operands);") +(define_insn "*avx2_uavgv32qi3" + [(set (match_operand:V32QI 0 "register_operand" "=x") + (truncate:V32QI + (lshiftrt:V32HI + (plus:V32HI + (plus:V32HI + (zero_extend:V32HI + (match_operand:V32QI 1 "nonimmediate_operand" "%x")) + (zero_extend:V32HI + (match_operand:V32QI 2 "nonimmediate_operand" "xm"))) + (const_vector:V32QI [(const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1)])) + (const_int 1))))] + "TARGET_AVX2 && ix86_binary_operator_ok (PLUS, V32QImode, operands)" + "vpavgb\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_insn "*sse2_uavgv16qi3" [(set (match_operand:V16QI 0 "register_operand" "=x,x") (truncate:V16QI @@ -6716,6 +7529,28 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "TI")]) +(define_expand "avx2_uavgv16hi3" + [(set (match_operand:V16HI 0 "register_operand" "") + (truncate:V16HI + (lshiftrt:V16SI + (plus:V16SI + (plus:V16SI + (zero_extend:V16SI + (match_operand:V16HI 1 "nonimmediate_operand" "")) + (zero_extend:V16SI + (match_operand:V16HI 2 "nonimmediate_operand" ""))) + (const_vector:V16HI [(const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1)])) + (const_int 1))))] + "TARGET_AVX2" + "ix86_fixup_binary_operands_no_copy (PLUS, V16HImode, operands);") + (define_expand "sse2_uavgv8hi3" [(set (match_operand:V8HI 0 "register_operand" "") (truncate:V8HI @@ -6734,6 +7569,31 @@ "TARGET_SSE2" "ix86_fixup_binary_operands_no_copy (PLUS, V8HImode, operands);") +(define_insn "*avx2_uavgv16hi3" + [(set (match_operand:V16HI 0 
"register_operand" "=x") + (truncate:V16HI + (lshiftrt:V16SI + (plus:V16SI + (plus:V16SI + (zero_extend:V16SI + (match_operand:V16HI 1 "nonimmediate_operand" "%x")) + (zero_extend:V16SI + (match_operand:V16HI 2 "nonimmediate_operand" "xm"))) + (const_vector:V16HI [(const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1)])) + (const_int 1))))] + "TARGET_AVX2 && ix86_binary_operator_ok (PLUS, V16HImode, operands)" + "vpavgw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_insn "*sse2_uavgv8hi3" [(set (match_operand:V8HI 0 "register_operand" "=x,x") (truncate:V8HI @@ -6761,11 +7621,11 @@ ;; The correct representation for this is absolutely enormous, and ;; surely not generally useful. -(define_insn "sse2_psadbw" - [(set (match_operand:V2DI 0 "register_operand" "=x,x") - (unspec:V2DI [(match_operand:V16QI 1 "register_operand" "0,x") - (match_operand:V16QI 2 "nonimmediate_operand" "xm,xm")] - UNSPEC_PSADBW))] +(define_insn "<sse2_avx2>_psadbw" + [(set (match_operand:VI8_AVX2 0 "register_operand" "=x,x") + (unspec:VI8_AVX2 [(match_operand:<ssebytemode> 1 "register_operand" "0,x") + (match_operand:<ssebytemode> 2 "nonimmediate_operand" "xm,xm")] + UNSPEC_PSADBW))] "TARGET_SSE2" "@ psadbw\t{%2, %0|%0, %2} @@ -6775,7 +7635,7 @@ (set_attr "atom_unit" "simul") (set_attr "prefix_data16" "1,*") (set_attr "prefix" "orig,vex") - (set_attr "mode" "TI")]) + (set_attr "mode" "<sseinsnmode>")]) (define_insn "<sse>_movmsk<ssemodesuffix><avxsizesuffix>" [(set (match_operand:SI 0 "register_operand" "=r") @@ -6788,6 +7648,16 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "<MODE>")]) +(define_insn "avx2_pmovmskb" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec:SI [(match_operand:V32QI 1 "register_operand" "x")] + UNSPEC_MOVMSK))] + "TARGET_AVX2" + "vpmovmskb\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "DI")]) + (define_insn "sse2_pmovmskb" [(set (match_operand:SI 0 "register_operand" "=r") (unspec:SI [(match_operand:V16QI 1 "register_operand" "x")] @@ -6947,6 +7817,82 @@ ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(define_insn "avx2_phaddwv16hi3" + [(set (match_operand:V16HI 0 "register_operand" "=x") + (vec_concat:V16HI + (vec_concat:V8HI + (vec_concat:V4HI + (vec_concat:V2HI + (plus:HI + (vec_select:HI + (match_operand:V16HI 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 1)]))) + (plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 5)]))) + (plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 7)]))))) + (vec_concat:V4HI + (vec_concat:V2HI + (plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 8)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 9)]))) + (plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 10)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 11)])))) + (vec_concat:V2HI + (plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 12)])) + (vec_select:HI (match_dup 1) 
(parallel [(const_int 13)]))) + (plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 14)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 15)])))))) + (vec_concat:V8HI + (vec_concat:V4HI + (vec_concat:V2HI + (plus:HI + (vec_select:HI + (match_operand:V16HI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 1)]))) + (plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 5)]))) + (plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 7)]))))) + (vec_concat:V4HI + (vec_concat:V2HI + (plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 8)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 9)]))) + (plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 10)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 11)])))) + (vec_concat:V2HI + (plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 12)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 13)]))) + (plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 14)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 15)]))))))))] + "TARGET_AVX2" + "vphaddw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_insn "ssse3_phaddwv8hi3" [(set (match_operand:V8HI 0 "register_operand" "=x,x") (vec_concat:V8HI @@ -7025,6 +7971,50 @@ (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) (set_attr "mode" "DI")]) +(define_insn "avx2_phadddv8si3" + [(set (match_operand:V8SI 0 "register_operand" "=x") + (vec_concat:V8SI + (vec_concat:V4SI + (vec_concat:V2SI + (plus:SI + (vec_select:SI + (match_operand:V8SI 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:SI (match_dup 1) (parallel [(const_int 1)]))) + (plus:SI + (vec_select:SI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:SI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2SI + (plus:SI + (vec_select:SI (match_dup 1) (parallel [(const_int 4)])) + (vec_select:SI (match_dup 1) (parallel [(const_int 5)]))) + (plus:SI + (vec_select:SI (match_dup 1) (parallel [(const_int 6)])) + (vec_select:SI (match_dup 1) (parallel [(const_int 7)]))))) + (vec_concat:V4SI + (vec_concat:V2SI + (plus:SI + (vec_select:SI + (match_operand:V8SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:SI (match_dup 2) (parallel [(const_int 1)]))) + (plus:SI + (vec_select:SI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:SI (match_dup 2) (parallel [(const_int 3)])))) + (vec_concat:V2SI + (plus:SI + (vec_select:SI (match_dup 2) (parallel [(const_int 4)])) + (vec_select:SI (match_dup 2) (parallel [(const_int 5)]))) + (plus:SI + (vec_select:SI (match_dup 2) (parallel [(const_int 6)])) + (vec_select:SI (match_dup 2) (parallel [(const_int 7)])))))))] + "TARGET_AVX2" + "vphaddd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_insn "ssse3_phadddv4si3" [(set (match_operand:V4SI 0 "register_operand" "=x,x") (vec_concat:V4SI @@ -7079,6 +8069,82 @@ (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) (set_attr "mode" "DI")]) 
+(define_insn "avx2_phaddswv16hi3" + [(set (match_operand:V16HI 0 "register_operand" "=x") + (vec_concat:V16HI + (vec_concat:V8HI + (vec_concat:V4HI + (vec_concat:V2HI + (ss_plus:HI + (vec_select:HI + (match_operand:V16HI 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 1)]))) + (ss_plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (ss_plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 5)]))) + (ss_plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 7)]))))) + (vec_concat:V4HI + (vec_concat:V2HI + (ss_plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 8)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 9)]))) + (ss_plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 10)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 11)])))) + (vec_concat:V2HI + (ss_plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 12)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 13)]))) + (ss_plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 14)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 15)])))))) + (vec_concat:V8HI + (vec_concat:V4HI + (vec_concat:V2HI + (ss_plus:HI + (vec_select:HI + (match_operand:V16HI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 1)]))) + (ss_plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (ss_plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 5)]))) + (ss_plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 7)]))))) + (vec_concat:V4HI + (vec_concat:V2HI + (ss_plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 8)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 9)]))) + (ss_plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 10)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 11)])))) + (vec_concat:V2HI + (ss_plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 12)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 13)]))) + (ss_plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 14)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 15)]))))))))] + "TARGET_AVX2" + "vphaddsw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_insn "ssse3_phaddswv8hi3" [(set (match_operand:V8HI 0 "register_operand" "=x,x") (vec_concat:V8HI @@ -7157,6 +8223,82 @@ (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) (set_attr "mode" "DI")]) +(define_insn "avx2_phsubwv16hi3" + [(set (match_operand:V16HI 0 "register_operand" "=x") + (vec_concat:V16HI + (vec_concat:V8HI + (vec_concat:V4HI + (vec_concat:V2HI + (minus:HI + (vec_select:HI + (match_operand:V16HI 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 1)]))) + (minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (minus:HI + (vec_select:HI (match_dup 1) 
(parallel [(const_int 4)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 5)]))) + (minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 7)]))))) + (vec_concat:V4HI + (vec_concat:V2HI + (minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 8)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 9)]))) + (minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 10)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 11)])))) + (vec_concat:V2HI + (minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 12)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 13)]))) + (minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 14)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 15)])))))) + (vec_concat:V8HI + (vec_concat:V4HI + (vec_concat:V2HI + (minus:HI + (vec_select:HI + (match_operand:V16HI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 1)]))) + (minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 5)]))) + (minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 7)]))))) + (vec_concat:V4HI + (vec_concat:V2HI + (minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 8)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 9)]))) + (minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 10)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 11)])))) + (vec_concat:V2HI + (minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 12)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 13)]))) + (minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 14)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 15)]))))))))] + "TARGET_AVX2" + "vphsubw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_insn "ssse3_phsubwv8hi3" [(set (match_operand:V8HI 0 "register_operand" "=x,x") (vec_concat:V8HI @@ -7235,6 +8377,50 @@ (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) (set_attr "mode" "DI")]) +(define_insn "avx2_phsubdv8si3" + [(set (match_operand:V8SI 0 "register_operand" "=x") + (vec_concat:V8SI + (vec_concat:V4SI + (vec_concat:V2SI + (minus:SI + (vec_select:SI + (match_operand:V8SI 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:SI (match_dup 1) (parallel [(const_int 1)]))) + (minus:SI + (vec_select:SI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:SI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2SI + (minus:SI + (vec_select:SI (match_dup 1) (parallel [(const_int 4)])) + (vec_select:SI (match_dup 1) (parallel [(const_int 5)]))) + (minus:SI + (vec_select:SI (match_dup 1) (parallel [(const_int 6)])) + (vec_select:SI (match_dup 1) (parallel [(const_int 7)]))))) + (vec_concat:V4SI + (vec_concat:V2SI + (minus:SI + (vec_select:SI + (match_operand:V8SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:SI (match_dup 2) (parallel [(const_int 1)]))) + (minus:SI + (vec_select:SI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:SI (match_dup 2) (parallel [(const_int 3)])))) + (vec_concat:V2SI + (minus:SI 
+ (vec_select:SI (match_dup 2) (parallel [(const_int 4)])) + (vec_select:SI (match_dup 2) (parallel [(const_int 5)]))) + (minus:SI + (vec_select:SI (match_dup 2) (parallel [(const_int 6)])) + (vec_select:SI (match_dup 2) (parallel [(const_int 7)])))))))] + "TARGET_AVX2" + "vphsubd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_insn "ssse3_phsubdv4si3" [(set (match_operand:V4SI 0 "register_operand" "=x,x") (vec_concat:V4SI @@ -7290,6 +8476,82 @@ (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) (set_attr "mode" "DI")]) +(define_insn "avx2_phsubswv16hi3" + [(set (match_operand:V16HI 0 "register_operand" "=x") + (vec_concat:V16HI + (vec_concat:V8HI + (vec_concat:V4HI + (vec_concat:V2HI + (ss_minus:HI + (vec_select:HI + (match_operand:V16HI 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 1)]))) + (ss_minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (ss_minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 5)]))) + (ss_minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 7)]))))) + (vec_concat:V4HI + (vec_concat:V2HI + (ss_minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 8)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 9)]))) + (ss_minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 10)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 11)])))) + (vec_concat:V2HI + (ss_minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 12)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 13)]))) + (ss_minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 14)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 15)])))))) + (vec_concat:V8HI + (vec_concat:V4HI + (vec_concat:V2HI + (ss_minus:HI + (vec_select:HI + (match_operand:V16HI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 1)]))) + (ss_minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (ss_minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 5)]))) + (ss_minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 7)]))))) + (vec_concat:V4HI + (vec_concat:V2HI + (ss_minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 8)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 9)]))) + (ss_minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 10)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 11)])))) + (vec_concat:V2HI + (ss_minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 12)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 13)]))) + (ss_minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 14)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 15)]))))))))] + "TARGET_AVX2" + "vphsubsw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_insn "ssse3_phsubswv8hi3" [(set (match_operand:V8HI 0 "register_operand" "=x,x") 
(vec_concat:V8HI @@ -7368,6 +8630,92 @@ (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) (set_attr "mode" "DI")]) +(define_insn "avx2_pmaddubsw256" + [(set (match_operand:V16HI 0 "register_operand" "=x") + (ss_plus:V16HI + (mult:V16HI + (zero_extend:V16HI + (vec_select:V16QI + (match_operand:V32QI 1 "register_operand" "x") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6) + (const_int 8) + (const_int 10) + (const_int 12) + (const_int 14) + (const_int 16) + (const_int 18) + (const_int 20) + (const_int 22) + (const_int 24) + (const_int 26) + (const_int 28) + (const_int 30)]))) + (sign_extend:V16HI + (vec_select:V16QI + (match_operand:V32QI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6) + (const_int 8) + (const_int 10) + (const_int 12) + (const_int 14) + (const_int 16) + (const_int 18) + (const_int 20) + (const_int 22) + (const_int 24) + (const_int 26) + (const_int 28) + (const_int 30)])))) + (mult:V16HI + (zero_extend:V16HI + (vec_select:V16QI (match_dup 1) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7) + (const_int 9) + (const_int 11) + (const_int 13) + (const_int 15) + (const_int 17) + (const_int 19) + (const_int 21) + (const_int 23) + (const_int 25) + (const_int 27) + (const_int 29) + (const_int 31)]))) + (sign_extend:V16HI + (vec_select:V16QI (match_dup 2) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7) + (const_int 9) + (const_int 11) + (const_int 13) + (const_int 15) + (const_int 17) + (const_int 19) + (const_int 21) + (const_int 23) + (const_int 25) + (const_int 27) + (const_int 29) + (const_int 31)]))))))] + "TARGET_AVX2" + "vpmaddubsw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_insn "ssse3_pmaddubsw128" [(set (match_operand:V8HI 0 "register_operand" "=x,x") (ss_plus:V8HI @@ -7466,6 +8814,58 @@ (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) (set_attr "mode" "DI")]) +(define_expand "avx2_umulhrswv16hi3" + [(set (match_operand:V16HI 0 "register_operand" "") + (truncate:V16HI + (lshiftrt:V16SI + (plus:V16SI + (lshiftrt:V16SI + (mult:V16SI + (sign_extend:V16SI + (match_operand:V16HI 1 "nonimmediate_operand" "")) + (sign_extend:V16SI + (match_operand:V16HI 2 "nonimmediate_operand" ""))) + (const_int 14)) + (const_vector:V16HI [(const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1)])) + (const_int 1))))] + "TARGET_AVX2" + "ix86_fixup_binary_operands_no_copy (MULT, V16HImode, operands);") + +(define_insn "*avx2_umulhrswv16hi3" + [(set (match_operand:V16HI 0 "register_operand" "=x") + (truncate:V16HI + (lshiftrt:V16SI + (plus:V16SI + (lshiftrt:V16SI + (mult:V16SI + (sign_extend:V16SI + (match_operand:V16HI 1 "nonimmediate_operand" "%x")) + (sign_extend:V16SI + (match_operand:V16HI 2 "nonimmediate_operand" "xm"))) + (const_int 14)) + (const_vector:V16HI [(const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1)])) + (const_int 1))))] + "TARGET_AVX2 && ix86_binary_operator_ok (MULT, V16HImode, operands)" + "vpmulhrsw\t{%2, %1, %0|%0, 
%1, %2}" + [(set_attr "type" "sseimul") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_expand "ssse3_pmulhrswv8hi3" [(set (match_operand:V8HI 0 "register_operand" "") (truncate:V8HI @@ -7554,11 +8954,11 @@ (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) (set_attr "mode" "DI")]) -(define_insn "ssse3_pshufbv16qi3" - [(set (match_operand:V16QI 0 "register_operand" "=x,x") - (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "0,x") - (match_operand:V16QI 2 "nonimmediate_operand" "xm,xm")] - UNSPEC_PSHUFB))] +(define_insn "<ssse3_avx2>_pshufb<mode>3" + [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x") + (unspec:VI1_AVX2 [(match_operand:VI1_AVX2 1 "register_operand" "0,x") + (match_operand:VI1_AVX2 2 "nonimmediate_operand" "xm,xm")] + UNSPEC_PSHUFB))] "TARGET_SSSE3" "@ pshufb\t{%2, %0|%0, %2} @@ -7568,7 +8968,7 @@ (set_attr "prefix_data16" "1,*") (set_attr "prefix_extra" "1") (set_attr "prefix" "orig,vex") - (set_attr "mode" "TI")]) + (set_attr "mode" "<sseinsnmode>")]) (define_insn "ssse3_pshufbv8qi3" [(set (match_operand:V8QI 0 "register_operand" "=y") @@ -7582,11 +8982,11 @@ (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) (set_attr "mode" "DI")]) -(define_insn "ssse3_psign<mode>3" - [(set (match_operand:VI124_128 0 "register_operand" "=x,x") - (unspec:VI124_128 - [(match_operand:VI124_128 1 "register_operand" "0,x") - (match_operand:VI124_128 2 "nonimmediate_operand" "xm,xm")] +(define_insn "<ssse3_avx2>_psign<mode>3" + [(set (match_operand:VI124_AVX2 0 "register_operand" "=x,x") + (unspec:VI124_AVX2 + [(match_operand:VI124_AVX2 1 "register_operand" "0,x") + (match_operand:VI124_AVX2 2 "nonimmediate_operand" "xm,xm")] UNSPEC_PSIGN))] "TARGET_SSSE3" "@ @@ -7597,7 +8997,7 @@ (set_attr "prefix_data16" "1,*") (set_attr "prefix_extra" "1") (set_attr "prefix" "orig,vex") - (set_attr "mode" "TI")]) + (set_attr "mode" "<sseinsnmode>")]) (define_insn "ssse3_psign<mode>3" [(set (match_operand:MMXMODEI 0 "register_operand" "=y") @@ -7612,12 +9012,12 @@ (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) (set_attr "mode" "DI")]) -(define_insn "ssse3_palignrti" - [(set (match_operand:TI 0 "register_operand" "=x,x") - (unspec:TI [(match_operand:TI 1 "register_operand" "0,x") - (match_operand:TI 2 "nonimmediate_operand" "xm,xm") - (match_operand:SI 3 "const_0_to_255_mul_8_operand" "n,n")] - UNSPEC_PALIGNR))] +(define_insn "<ssse3_avx2>_palignr<mode>" + [(set (match_operand:SSESCALARMODE 0 "register_operand" "=x,x") + (unspec:SSESCALARMODE [(match_operand:SSESCALARMODE 1 "register_operand" "0,x") + (match_operand:SSESCALARMODE 2 "nonimmediate_operand" "xm,xm") + (match_operand:SI 3 "const_0_to_255_mul_8_operand" "n,n")] + UNSPEC_PALIGNR))] "TARGET_SSSE3" { operands[3] = GEN_INT (INTVAL (operands[3]) / 8); @@ -7639,7 +9039,7 @@ (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "prefix" "orig,vex") - (set_attr "mode" "TI")]) + (set_attr "mode" "<sseinsnmode>")]) (define_insn "ssse3_palignrdi" [(set (match_operand:DI 0 "register_operand" "=y") @@ -7660,16 +9060,16 @@ (set_attr "mode" "DI")]) (define_insn "abs<mode>2" - [(set (match_operand:VI124_128 0 "register_operand" "=x") - (abs:VI124_128 - (match_operand:VI124_128 1 "nonimmediate_operand" "xm")))] + [(set (match_operand:VI124_AVX2 0 "register_operand" "=x") + (abs:VI124_AVX2 + (match_operand:VI124_AVX2 1 "nonimmediate_operand" "xm")))] "TARGET_SSSE3" "%vpabs<ssemodesuffix>\t{%1, %0|%0, 
%1}" [(set_attr "type" "sselog1") (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") (set_attr "prefix" "maybe_vex") - (set_attr "mode" "TI")]) + (set_attr "mode" "<sseinsnmode>")]) (define_insn "abs<mode>2" [(set (match_operand:MMXMODEI 0 "register_operand" "=y") @@ -7693,7 +9093,7 @@ [(set (match_operand:MODEF 0 "memory_operand" "=m") (unspec:MODEF [(match_operand:MODEF 1 "register_operand" "x")] - UNSPEC_MOVNT))] + UNSPEC_MOVNT))] "TARGET_SSE4A" "movnt<ssemodesuffix>\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") @@ -7713,10 +9113,10 @@ (define_insn "sse4a_extrqi" [(set (match_operand:V2DI 0 "register_operand" "=x") - (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") - (match_operand 2 "const_0_to_255_operand" "") - (match_operand 3 "const_0_to_255_operand" "")] - UNSPEC_EXTRQI))] + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand 2 "const_0_to_255_operand" "") + (match_operand 3 "const_0_to_255_operand" "")] + UNSPEC_EXTRQI))] "TARGET_SSE4A" "extrq\t{%3, %2, %0|%0, %2, %3}" [(set_attr "type" "sse") @@ -7726,9 +9126,9 @@ (define_insn "sse4a_extrq" [(set (match_operand:V2DI 0 "register_operand" "=x") - (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") - (match_operand:V16QI 2 "register_operand" "x")] - UNSPEC_EXTRQ))] + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand:V16QI 2 "register_operand" "x")] + UNSPEC_EXTRQ))] "TARGET_SSE4A" "extrq\t{%2, %0|%0, %2}" [(set_attr "type" "sse") @@ -7737,11 +9137,11 @@ (define_insn "sse4a_insertqi" [(set (match_operand:V2DI 0 "register_operand" "=x") - (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") - (match_operand:V2DI 2 "register_operand" "x") - (match_operand 3 "const_0_to_255_operand" "") - (match_operand 4 "const_0_to_255_operand" "")] - UNSPEC_INSERTQI))] + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand:V2DI 2 "register_operand" "x") + (match_operand 3 "const_0_to_255_operand" "") + (match_operand 4 "const_0_to_255_operand" "")] + UNSPEC_INSERTQI))] "TARGET_SSE4A" "insertq\t{%4, %3, %2, %0|%0, %2, %3, %4}" [(set_attr "type" "sseins") @@ -7752,9 +9152,9 @@ (define_insn "sse4a_insertq" [(set (match_operand:V2DI 0 "register_operand" "=x") - (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") - (match_operand:V2DI 2 "register_operand" "x")] - UNSPEC_INSERTQ))] + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand:V2DI 2 "register_operand" "x")] + UNSPEC_INSERTQ))] "TARGET_SSE4A" "insertq\t{%2, %0|%0, %2}" [(set_attr "type" "sseins") @@ -7824,23 +9224,23 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "<MODE>")]) -(define_insn "sse4_1_movntdqa" - [(set (match_operand:V2DI 0 "register_operand" "=x") - (unspec:V2DI [(match_operand:V2DI 1 "memory_operand" "m")] +(define_insn "<sse4_1_avx2>_movntdqa" + [(set (match_operand:VI8_AVX2 0 "register_operand" "=x") + (unspec:VI8_AVX2 [(match_operand:VI8_AVX2 1 "memory_operand" "m")] UNSPEC_MOVNTDQA))] "TARGET_SSE4_1" "%vmovntdqa\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") (set_attr "prefix" "maybe_vex") - (set_attr "mode" "TI")]) + (set_attr "mode" "<sseinsnmode>")]) -(define_insn "sse4_1_mpsadbw" - [(set (match_operand:V16QI 0 "register_operand" "=x,x") - (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "0,x") - (match_operand:V16QI 2 "nonimmediate_operand" "xm,xm") - (match_operand:SI 3 "const_0_to_255_operand" "n,n")] - UNSPEC_MPSADBW))] +(define_insn "<sse4_1_avx2>_mpsadbw" + [(set (match_operand:VI1_AVX2 0 
"register_operand" "=x,x") + (unspec:VI1_AVX2 [(match_operand:VI1_AVX2 1 "register_operand" "0,x") + (match_operand:VI1_AVX2 2 "nonimmediate_operand" "xm,xm") + (match_operand:SI 3 "const_0_to_255_operand" "n,n")] + UNSPEC_MPSADBW))] "TARGET_SSE4_1" "@ mpsadbw\t{%3, %2, %0|%0, %2, %3} @@ -7850,7 +9250,21 @@ (set_attr "length_immediate" "1") (set_attr "prefix_extra" "1") (set_attr "prefix" "orig,vex") - (set_attr "mode" "TI")]) + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx2_packusdw" + [(set (match_operand:V16HI 0 "register_operand" "=x") + (vec_concat:V16HI + (us_truncate:V8HI + (match_operand:V8SI 1 "register_operand" "x")) + (us_truncate:V8HI + (match_operand:V8SI 2 "nonimmediate_operand" "xm"))))] + "TARGET_AVX2" + "vpackusdw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) (define_insn "sse4_1_packusdw" [(set (match_operand:V8HI 0 "register_operand" "=x,x") @@ -7869,12 +9283,12 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "TI")]) -(define_insn "sse4_1_pblendvb" - [(set (match_operand:V16QI 0 "reg_not_xmm0_operand" "=x,x") - (unspec:V16QI - [(match_operand:V16QI 1 "reg_not_xmm0_operand_maybe_avx" "0,x") - (match_operand:V16QI 2 "nonimm_not_xmm0_operand_maybe_avx" "xm,xm") - (match_operand:V16QI 3 "register_operand" "Yz,x")] +(define_insn "<sse4_1_avx2>_pblendvb" + [(set (match_operand:VI1_AVX2 0 "reg_not_xmm0_operand" "=x,x") + (unspec:VI1_AVX2 + [(match_operand:VI1_AVX2 1 "reg_not_xmm0_operand_maybe_avx" "0,x") + (match_operand:VI1_AVX2 2 "nonimm_not_xmm0_operand_maybe_avx" "xm,xm") + (match_operand:VI1_AVX2 3 "register_operand" "Yz,x")] UNSPEC_BLENDV))] "TARGET_SSE4_1" "@ @@ -7885,13 +9299,13 @@ (set_attr "prefix_extra" "1") (set_attr "length_immediate" "*,1") (set_attr "prefix" "orig,vex") - (set_attr "mode" "TI")]) + (set_attr "mode" "<sseinsnmode>")]) -(define_insn "sse4_1_pblendw" - [(set (match_operand:V8HI 0 "register_operand" "=x,x") - (vec_merge:V8HI - (match_operand:V8HI 2 "nonimmediate_operand" "xm,xm") - (match_operand:V8HI 1 "register_operand" "0,x") +(define_insn "<sse4_1_avx2>_pblendw" + [(set (match_operand:VI2_AVX2 0 "register_operand" "=x,x") + (vec_merge:VI2_AVX2 + (match_operand:VI2_AVX2 2 "nonimmediate_operand" "xm,xm") + (match_operand:VI2_AVX2 1 "register_operand" "0,x") (match_operand:SI 3 "const_0_to_255_operand" "n,n")))] "TARGET_SSE4_1" "@ @@ -7902,7 +9316,21 @@ (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "prefix" "orig,vex") - (set_attr "mode" "TI")]) + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx2_pblendd<mode>" + [(set (match_operand:VI4_AVX2 0 "register_operand" "=x") + (vec_merge:VI4_AVX2 + (match_operand:VI4_AVX2 2 "nonimmediate_operand" "xm") + (match_operand:VI4_AVX2 1 "register_operand" "x") + (match_operand:SI 3 "const_0_to_255_operand" "n")))] + "TARGET_AVX2" + "vpblendd\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "<sseinsnmode>")]) (define_insn "sse4_1_phminposuw" [(set (match_operand:V8HI 0 "register_operand" "=x") @@ -7915,6 +9343,17 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) +(define_insn "avx2_<code>v16qiv16hi2" + [(set (match_operand:V16HI 0 "register_operand" "=x") + (any_extend:V16HI + (match_operand:V16QI 1 "nonimmediate_operand" "xm")))] + "TARGET_AVX2" + "vpmov<extsuffix>bw\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr 
"prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_insn "sse4_1_<code>v8qiv8hi2" [(set (match_operand:V8HI 0 "register_operand" "=x") (any_extend:V8HI @@ -7935,6 +9374,26 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) +(define_insn "avx2_<code>v8qiv8si2" + [(set (match_operand:V8SI 0 "register_operand" "=x") + (any_extend:V8SI + (vec_select:V8QI + (match_operand:V16QI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 1) + (const_int 2) + (const_int 3) + (const_int 4) + (const_int 5) + (const_int 6) + (const_int 7)]))))] + "TARGET_AVX2" + "vpmov<extsuffix>bd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_insn "sse4_1_<code>v4qiv4si2" [(set (match_operand:V4SI 0 "register_operand" "=x") (any_extend:V4SI @@ -7951,6 +9410,17 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) +(define_insn "avx2_<code>v8hiv8si2" + [(set (match_operand:V8SI 0 "register_operand" "=x") + (any_extend:V8SI + (match_operand:V8HI 1 "nonimmediate_operand" "xm")))] + "TARGET_AVX2" + "vpmov<extsuffix>wd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_insn "sse4_1_<code>v4hiv4si2" [(set (match_operand:V4SI 0 "register_operand" "=x") (any_extend:V4SI @@ -7967,6 +9437,22 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) +(define_insn "avx2_<code>v4qiv4di2" + [(set (match_operand:V4DI 0 "register_operand" "=x") + (any_extend:V4DI + (vec_select:V4QI + (match_operand:V16QI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 1) + (const_int 2) + (const_int 3)]))))] + "TARGET_AVX2" + "vpmov<extsuffix>bq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_insn "sse4_1_<code>v2qiv2di2" [(set (match_operand:V2DI 0 "register_operand" "=x") (any_extend:V2DI @@ -7981,6 +9467,22 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) +(define_insn "avx2_<code>v4hiv4di2" + [(set (match_operand:V4DI 0 "register_operand" "=x") + (any_extend:V4DI + (vec_select:V4HI + (match_operand:V8HI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 1) + (const_int 2) + (const_int 3)]))))] + "TARGET_AVX2" + "vpmov<extsuffix>wq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_insn "sse4_1_<code>v2hiv2di2" [(set (match_operand:V2DI 0 "register_operand" "=x") (any_extend:V2DI @@ -7995,6 +9497,16 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) +(define_insn "avx2_<code>v4siv4di2" + [(set (match_operand:V4DI 0 "register_operand" "=x") + (any_extend:V4DI + (match_operand:V4SI 1 "nonimmediate_operand" "xm")))] + "TARGET_AVX2" + "vpmov<extsuffix>dq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "mode" "OI")]) + (define_insn "sse4_1_<code>v2siv2di2" [(set (match_operand:V2DI 0 "register_operand" "=x") (any_extend:V2DI @@ -8360,7 +9872,7 @@ ;; do not allow the value being added to be a memory operation. 
(define_insn "xop_pmacsww" [(set (match_operand:V8HI 0 "register_operand" "=x") - (plus:V8HI + (plus:V8HI (mult:V8HI (match_operand:V8HI 1 "nonimmediate_operand" "%x") (match_operand:V8HI 2 "nonimmediate_operand" "xm")) @@ -8372,7 +9884,7 @@ (define_insn "xop_pmacssww" [(set (match_operand:V8HI 0 "register_operand" "=x") - (ss_plus:V8HI + (ss_plus:V8HI (mult:V8HI (match_operand:V8HI 1 "nonimmediate_operand" "%x") (match_operand:V8HI 2 "nonimmediate_operand" "xm")) (match_operand:V8HI 3 "nonimmediate_operand" "x")))] @@ -8383,7 +9895,7 @@ (define_insn "xop_pmacsdd" [(set (match_operand:V4SI 0 "register_operand" "=x") - (plus:V4SI + (plus:V4SI (mult:V4SI (match_operand:V4SI 1 "nonimmediate_operand" "%x") (match_operand:V4SI 2 "nonimmediate_operand" "xm")) @@ -8395,7 +9907,7 @@ (define_insn "xop_pmacssdd" [(set (match_operand:V4SI 0 "register_operand" "=x") - (ss_plus:V4SI + (ss_plus:V4SI (mult:V4SI (match_operand:V4SI 1 "nonimmediate_operand" "%x") (match_operand:V4SI 2 "nonimmediate_operand" "xm")) (match_operand:V4SI 3 "nonimmediate_operand" "x")))] @@ -9218,7 +10730,7 @@ int i; if (GET_MODE (op2) != <ssescalarmode>mode) - { + { op2 = gen_reg_rtx (<ssescalarmode>mode); convert_move (op2, operands[2], false); } @@ -9250,7 +10762,7 @@ int i; if (GET_MODE (op2) != <ssescalarmode>mode) - { + { op2 = gen_reg_rtx (<ssescalarmode>mode); convert_move (op2, operands[2], false); } @@ -9772,6 +11284,99 @@ (set_attr "prefix" "vex") (set_attr "mode" "OI")]) +(define_mode_attr AVXTOSSEMODE + [(V4DI "V2DI") (V2DI "V2DI") + (V8SI "V4SI") (V4SI "V4SI") + (V16HI "V8HI") (V8HI "V8HI") + (V32QI "V16QI") (V16QI "V16QI")]) + +(define_insn "avx2_pbroadcast<mode>" + [(set (match_operand:VI 0 "register_operand" "=x") + (vec_duplicate:VI + (vec_select:<ssescalarmode> + (match_operand:<AVXTOSSEMODE> 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0)]))))] + "TARGET_AVX2" + "vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx2_permvarv8si" + [(set (match_operand:V8SI 0 "register_operand" "=x") + (unspec:V8SI + [(match_operand:V8SI 1 "register_operand" "x") + (match_operand:V8SI 2 "nonimmediate_operand" "xm")] + UNSPEC_VPERMSI))] + "TARGET_AVX2" + "vpermd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + +(define_insn "avx2_permv4df" + [(set (match_operand:V4DF 0 "register_operand" "=x") + (unspec:V4DF + [(match_operand:V4DF 1 "register_operand" "xm") + (match_operand:SI 2 "const_0_to_255_operand" "n")] + UNSPEC_VPERMDF))] + "TARGET_AVX2" + "vpermpd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + +(define_insn "avx2_permvarv8sf" + [(set (match_operand:V8SF 0 "register_operand" "=x") + (unspec:V8SF + [(match_operand:V8SF 1 "register_operand" "x") + (match_operand:V8SF 2 "nonimmediate_operand" "xm")] + UNSPEC_VPERMSF))] + "TARGET_AVX2" + "vpermps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + +(define_insn "avx2_permv4di" + [(set (match_operand:V4DI 0 "register_operand" "=x") + (unspec:V4DI + [(match_operand:V4DI 1 "register_operand" "xm") + (match_operand:SI 2 "const_0_to_255_operand" "n")] + UNSPEC_VPERMDI))] + "TARGET_AVX2" + "vpermq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + 
+(define_insn "avx2_permv2ti" + [(set (match_operand:V4DI 0 "register_operand" "=x") + (unspec:V4DI + [(match_operand:V4DI 1 "register_operand" "x") + (match_operand:V4DI 2 "register_operand" "xm") + (match_operand:SI 3 "const_0_to_255_operand" "n")] + UNSPEC_VPERMTI))] + "TARGET_AVX2" + "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + +(define_insn "avx2_vec_dupv4df" + [(set (match_operand:V4DF 0 "register_operand" "=x") + (vec_duplicate:V4DF + (vec_select:DF + (match_operand:V2DF 1 "register_operand" "x") + (parallel [(const_int 0)]))))] + "TARGET_AVX2" + "vbroadcastsd\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "vex") + (set_attr "mode" "V4DF")]) + ;; Modes handled by AVX vec_dup patterns. (define_mode_iterator AVX_VEC_DUP_MODE [V8SI V8SF V4DI V4DF]) @@ -9789,6 +11394,18 @@ (set_attr "prefix" "vex") (set_attr "mode" "V8SF")]) +(define_insn "avx2_vbroadcasti128_<mode>" + [(set (match_operand:VI_256 0 "register_operand" "=x") + (vec_concat:VI_256 + (match_operand:<ssehalfvecmode> 1 "memory_operand" "m") + (match_dup 1)))] + "TARGET_AVX2" + "vbroadcasti128\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_split [(set (match_operand:AVX_VEC_DUP_MODE 0 "register_operand" "") (vec_duplicate:AVX_VEC_DUP_MODE @@ -9880,7 +11497,7 @@ } operands[1] = adjust_address_nv (op1, <ssescalarmode>mode, - elt * GET_MODE_SIZE (<ssescalarmode>mode)); + elt * GET_MODE_SIZE (<ssescalarmode>mode)); }) (define_expand "avx_vpermil<mode>" @@ -10061,6 +11678,36 @@ DONE; }) +(define_insn "avx2_vec_set_lo_v4di" + [(set (match_operand:V4DI 0 "register_operand" "=x") + (vec_concat:V4DI + (match_operand:V2DI 2 "nonimmediate_operand" "xm") + (vec_select:V2DI + (match_operand:V4DI 1 "register_operand" "x") + (parallel [(const_int 2) (const_int 3)]))))] + "TARGET_AVX2" + "vinserti128\t{$0x0, %2, %1, %0|%0, %1, %2, 0x0}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + +(define_insn "avx2_vec_set_hi_v4di" + [(set (match_operand:V4DI 0 "register_operand" "=x") + (vec_concat:V4DI + (vec_select:V2DI + (match_operand:V4DI 1 "register_operand" "x") + (parallel [(const_int 0) (const_int 1)])) + (match_operand:V2DI 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX2" + "vinserti128\t{$0x1, %2, %1, %0|%0, %1, %2, 0x1}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_insn "vec_set_lo_<mode>" [(set (match_operand:VI8F_256 0 "register_operand" "=x") (vec_concat:VI8F_256 @@ -10203,24 +11850,39 @@ (set_attr "prefix" "vex") (set_attr "mode" "V8SF")]) -(define_expand "avx_maskload<ssemodesuffix><avxsizesuffix>" - [(set (match_operand:VF 0 "register_operand" "") - (unspec:VF +(define_expand "<avx_avx2>_maskload<ssemodesuffix><avxsizesuffix>" + [(set (match_operand:V48_AVX2 0 "register_operand" "") + (unspec:V48_AVX2 [(match_operand:<sseintvecmode> 2 "register_operand" "") - (match_operand:VF 1 "memory_operand" "") + (match_operand:V48_AVX2 1 "memory_operand" "") (match_dup 0)] UNSPEC_MASKMOV))] "TARGET_AVX") -(define_expand "avx_maskstore<ssemodesuffix><avxsizesuffix>" - [(set (match_operand:VF 0 "memory_operand" "") - (unspec:VF +(define_expand "<avx_avx2>_maskstore<ssemodesuffix><avxsizesuffix>" + [(set 
(match_operand:V48_AVX2 0 "memory_operand" "") + (unspec:V48_AVX2 [(match_operand:<sseintvecmode> 1 "register_operand" "") - (match_operand:VF 2 "register_operand" "") + (match_operand:V48_AVX2 2 "register_operand" "") (match_dup 0)] UNSPEC_MASKMOV))] "TARGET_AVX") +(define_insn "*avx2_maskmov<ssemodesuffix><avxsizesuffix>" + [(set (match_operand:VI48_AVX2 0 "nonimmediate_operand" "=x,m") + (unspec:VI48_AVX2 + [(match_operand:<sseintvecmode> 1 "register_operand" "x,x") + (match_operand:VI48_AVX2 2 "nonimmediate_operand" "m,x") + (match_dup 0)] + UNSPEC_MASKMOV))] + "TARGET_AVX2 + && (REG_P (operands[0]) == MEM_P (operands[2]))" + "vpmaskmov<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog1") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "<sseinsnmode>")]) + (define_insn "*avx_maskmov<ssemodesuffix><avxsizesuffix>" [(set (match_operand:VF 0 "nonimmediate_operand" "=x,m") (unspec:VF @@ -10265,6 +11927,286 @@ DONE; }) +(define_insn "avx2_extracti128" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (vec_select:V2DI + (match_operand:V4DI 1 "nonimmediate_operand" "xm") + (parallel [(match_operand:SI 2 "const_0_to_1_operand" "n")])))] + "TARGET_AVX2" + "vextracti128\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + +(define_expand "avx2_inserti128" + [(match_operand:V4DI 0 "register_operand" "") + (match_operand:V4DI 1 "register_operand" "") + (match_operand:V2DI 2 "nonimmediate_operand" "") + (match_operand:SI 3 "const_0_to_1_operand" "")] + "TARGET_AVX2" +{ + rtx (*insn)(rtx, rtx, rtx); + + switch (INTVAL (operands[3])) + { + case 0: + insn = gen_avx2_vec_set_lo_v4di; + break; + case 1: + insn = gen_avx2_vec_set_hi_v4di; + break; + default: + gcc_unreachable (); + } + + emit_insn (insn (operands[0], operands[1], operands[2])); + DONE; +}) + +(define_insn "avx2_ashrvv8si" + [(set (match_operand:V8SI 0 "register_operand" "=x") + (vec_concat:V8SI + (vec_concat:V4SI + (vec_concat:V2SI + (ashiftrt:SI + (vec_select:SI + (match_operand:V8SI 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:SI + (match_operand:V8SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)]))) + (ashiftrt:SI + (vec_select:SI + (match_dup 1) + (parallel [(const_int 1)])) + (vec_select:SI + (match_dup 2) + (parallel [(const_int 1)])))) + (vec_concat:V2SI + (ashiftrt:SI + (vec_select:SI + (match_dup 1) + (parallel [(const_int 2)])) + (vec_select:SI + (match_dup 2) + (parallel [(const_int 2)]))) + (ashiftrt:SI + (vec_select:SI + (match_dup 1) + (parallel [(const_int 3)])) + (vec_select:SI + (match_dup 2) + (parallel [(const_int 3)]))))) + (vec_concat:V4SI + (vec_concat:V2SI + (ashiftrt:SI + (vec_select:SI + (match_dup 1) + (parallel [(const_int 0)])) + (vec_select:SI + (match_dup 2) + (parallel [(const_int 0)]))) + (ashiftrt:SI + (vec_select:SI + (match_dup 1) + (parallel [(const_int 1)])) + (vec_select:SI + (match_dup 2) + (parallel [(const_int 1)])))) + (vec_concat:V2SI + (ashiftrt:SI + (vec_select:SI + (match_dup 1) + (parallel [(const_int 2)])) + (vec_select:SI + (match_dup 2) + (parallel [(const_int 2)]))) + (ashiftrt:SI + (vec_select:SI + (match_dup 1) + (parallel [(const_int 3)])) + (vec_select:SI + (match_dup 2) + (parallel [(const_int 3)])))))))] + "TARGET_AVX2" + "vpsravd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseishft") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + +(define_insn "avx2_ashrvv4si" + [(set 
(match_operand:V4SI 0 "register_operand" "=x") + (vec_concat:V4SI + (vec_concat:V2SI + (ashiftrt:SI + (vec_select:SI + (match_operand:V4SI 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:SI + (match_operand:V4SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)]))) + (ashiftrt:SI + (vec_select:SI + (match_dup 1) + (parallel [(const_int 1)])) + (vec_select:SI + (match_dup 2) + (parallel [(const_int 1)])))) + (vec_concat:V2SI + (ashiftrt:SI + (vec_select:SI + (match_dup 1) + (parallel [(const_int 2)])) + (vec_select:SI + (match_dup 2) + (parallel [(const_int 2)]))) + (ashiftrt:SI + (vec_select:SI + (match_dup 1) + (parallel [(const_int 3)])) + (vec_select:SI + (match_dup 2) + (parallel [(const_int 3)]))))))] + "TARGET_AVX2" + "vpsravd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseishft") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "avx2_<lshift>vv8si" + [(set (match_operand:V8SI 0 "register_operand" "=x") + (vec_concat:V8SI + (vec_concat:V4SI + (vec_concat:V2SI + (lshift:SI + (vec_select:SI + (match_operand:V8SI 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:SI + (match_operand:V8SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)]))) + (lshift:SI + (vec_select:SI + (match_dup 1) + (parallel [(const_int 1)])) + (vec_select:SI + (match_dup 2) + (parallel [(const_int 1)])))) + (vec_concat:V2SI + (lshift:SI + (vec_select:SI + (match_dup 1) + (parallel [(const_int 2)])) + (vec_select:SI + (match_dup 2) + (parallel [(const_int 2)]))) + (lshift:SI + (vec_select:SI + (match_dup 1) + (parallel [(const_int 3)])) + (vec_select:SI + (match_dup 2) + (parallel [(const_int 3)]))))) + (vec_concat:V4SI + (vec_concat:V2SI + (lshift:SI + (vec_select:SI + (match_dup 1) + (parallel [(const_int 0)])) + (vec_select:SI + (match_dup 2) + (parallel [(const_int 0)]))) + (lshift:SI + (vec_select:SI + (match_dup 1) + (parallel [(const_int 1)])) + (vec_select:SI + (match_dup 2) + (parallel [(const_int 1)])))) + (vec_concat:V2SI + (lshift:SI + (vec_select:SI + (match_dup 1) + (parallel [(const_int 2)])) + (vec_select:SI + (match_dup 2) + (parallel [(const_int 2)]))) + (lshift:SI + (vec_select:SI + (match_dup 1) + (parallel [(const_int 3)])) + (vec_select:SI + (match_dup 2) + (parallel [(const_int 3)])))))))] + "TARGET_AVX2" + "vp<lshift_insn>vd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseishft") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + +(define_insn "avx2_<lshift>v<mode>" + [(set (match_operand:VI4SD_AVX2 0 "register_operand" "=x") + (vec_concat:VI4SD_AVX2 + (vec_concat:<ssehalfvecmode> + (lshift:<ssescalarmode> + (vec_select:<ssescalarmode> + (match_operand:VI4SD_AVX2 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:<ssescalarmode> + (match_operand:VI4SD_AVX2 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)]))) + (lshift:<ssescalarmode> + (vec_select:<ssescalarmode> + (match_dup 1) + (parallel [(const_int 1)])) + (vec_select:<ssescalarmode> + (match_dup 2) + (parallel [(const_int 1)])))) + (vec_concat:<ssehalfvecmode> + (lshift:<ssescalarmode> + (vec_select:<ssescalarmode> + (match_dup 1) + (parallel [(const_int 2)])) + (vec_select:<ssescalarmode> + (match_dup 2) + (parallel [(const_int 2)]))) + (lshift:<ssescalarmode> + (vec_select:<ssescalarmode> + (match_dup 1) + (parallel [(const_int 3)])) + (vec_select:<ssescalarmode> + (match_dup 2) + (parallel [(const_int 3)]))))))] + "TARGET_AVX2" + "vp<lshift_insn>v<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" 
"sseishft") + (set_attr "prefix" "vex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx2_<lshift>vv2di" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (vec_concat:V2DI + (lshift:DI + (vec_select:DI + (match_operand:V2DI 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:DI + (match_operand:V2DI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)]))) + (lshift:DI + (vec_select:DI + (match_dup 1) + (parallel [(const_int 1)])) + (vec_select:DI + (match_dup 2) + (parallel [(const_int 1)])))))] + "TARGET_AVX2" + "vp<lshift_insn>vq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseishft") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "*vec_concat<mode>_avx" [(set (match_operand:V_256 0 "register_operand" "=x,x") (vec_concat:V_256 @@ -10278,7 +12220,7 @@ return "vinsertf128\t{$0x1, %2, %t1, %0|%0, %t1, %2, 0x1}"; case 1: switch (get_attr_mode (insn)) - { + { case MODE_V8SF: return "vmovaps\t{%1, %x0|%x0, %1}"; case MODE_V4DF: @@ -10373,3 +12315,95 @@ [(set_attr "type" "ssecvt") (set_attr "prefix" "vex") (set_attr "mode" "V8SF")]) + +;; For gather* insn patterns +(define_mode_iterator VEC_GATHER_MODE + [V2DI V2DF V4DI V4DF V4SI V4SF V8SI V8SF]) +(define_mode_attr VEC_GATHER_MODE + [(V2DI "V4SI") (V2DF "V4SI") + (V4DI "V4SI") (V4DF "V4SI") + (V4SI "V4SI") (V4SF "V4SI") + (V8SI "V8SI") (V8SF "V8SI")]) + +(define_expand "avx2_gathersi<mode>" + [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "") + (unspec:VEC_GATHER_MODE + [(match_operand:VEC_GATHER_MODE 1 "register_operand" "") + (match_operand:<ssescalarmode> 2 "memory_operand" "") + (match_operand:<VEC_GATHER_MODE> 3 "register_operand" "") + (match_operand:VEC_GATHER_MODE 4 "register_operand" "") + (match_operand:SI 5 "const1248_operand " "")] + UNSPEC_GATHER))] + "TARGET_AVX2") + +(define_insn "*avx2_gathersi<mode>" + [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "=x") + (unspec:VEC_GATHER_MODE + [(match_operand:VEC_GATHER_MODE 1 "register_operand" "0") + (mem:<ssescalarmode> + (match_operand:P 2 "register_operand" "r")) + (match_operand:<VEC_GATHER_MODE> 3 "register_operand" "x") + (match_operand:VEC_GATHER_MODE 4 "register_operand" "x") + (match_operand:SI 5 "const1248_operand" "n")] + UNSPEC_GATHER))] + "TARGET_AVX2" + "v<gthrfirstp>gatherd<gthrlastp>\t{%4, (%2, %3, %c5), %0|%0, (%2, %3, %c5), %4}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_expand "avx2_gatherdi<mode>" + [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "") + (unspec:VEC_GATHER_MODE + [(match_operand:VEC_GATHER_MODE 1 "register_operand" "") + (match_operand:<ssescalarmode> 2 "memory_operand" "") + (match_operand:<AVXMODE48P_DI> 3 "register_operand" "") + (match_operand:VEC_GATHER_MODE 4 "register_operand" "") + (match_operand:SI 5 "const1248_operand " "")] + UNSPEC_GATHER))] + "TARGET_AVX2") + +(define_insn "*avx2_gatherdi<mode>" + [(set (match_operand:AVXMODE48P_DI 0 "register_operand" "=x") + (unspec:AVXMODE48P_DI + [(match_operand:AVXMODE48P_DI 1 "register_operand" "0") + (mem:<ssescalarmode> + (match_operand:P 2 "register_operand" "r")) + (match_operand:<AVXMODE48P_DI> 3 "register_operand" "x") + (match_operand:AVXMODE48P_DI 4 "register_operand" "x") + (match_operand:SI 5 "const1248_operand" "n")] + UNSPEC_GATHER))] + "TARGET_AVX2" + "v<gthrfirstp>gatherq<gthrlastp>\t{%4, (%2, %3, %c5), %0|%0, (%2, %3, %c5), %4}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "<sseinsnmode>")]) + +;; 
Special handling for VEX.256 with float arguments +;; since there're still xmms as operands +(define_expand "avx2_gatherdi<mode>256" + [(set (match_operand:VI4F_128 0 "register_operand" "") + (unspec:VI4F_128 + [(match_operand:VI4F_128 1 "register_operand" "") + (match_operand:<ssescalarmode> 2 "memory_operand" "") + (match_operand:V4DI 3 "register_operand" "") + (match_operand:VI4F_128 4 "register_operand" "") + (match_operand:SI 5 "const1248_operand " "")] + UNSPEC_GATHER))] + "TARGET_AVX2") + +(define_insn "*avx2_gatherdi<mode>256" + [(set (match_operand:VI4F_128 0 "register_operand" "=x") + (unspec:VI4F_128 + [(match_operand:VI4F_128 1 "register_operand" "0") + (mem:<ssescalarmode> + (match_operand:P 2 "register_operand" "r")) + (match_operand:V4DI 3 "register_operand" "x") + (match_operand:VI4F_128 4 "register_operand" "x") + (match_operand:SI 5 "const1248_operand" "n")] + UNSPEC_GATHER))] + "TARGET_AVX2" + "v<gthrfirstp>gatherq<gthrlastp>\t{%4, (%2, %3, %c5), %0|%0, (%2, %3, %c5), %4}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "<sseinsnmode>")]) diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index cf7fdbf..29c02b8 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -9455,6 +9455,184 @@ v4df __builtin_ia32_xorpd256 (v4df,v4df) v8sf __builtin_ia32_xorps256 (v8sf,v8sf) @end smallexample +The following built-in functions are available when @option{-mavx2} is +used. All of them generate the machine instruction that is part of the +name. + +@smallexample +v32qi __builtin_ia32_mpsadbw256 (v32qi,v32qi,int) +v32qi __builtin_ia32_pabsb256 (v32qi) +v16hi __builtin_ia32_pabsw256 (v16hi) +v8si __builtin_ia32_pabsd256 (v8si) +v16hi __builtin_ia32_packssdw256 (v8si,v8si) +v32qi __builtin_ia32_packsswb256 (v16hi,v16hi) +v16hi __builtin_ia32_packusdw256 (v8si,v8si) +v32qi __builtin_ia32_packuswb256 (v16hi,v16hi) +v32qi __builtin_ia32_paddb256 (v32qi,v32qi) +v16hi __builtin_ia32_paddw256 (v16hi,v16hi) +v8si __builtin_ia32_paddd256 (v8si,v8si) +v4di __builtin_ia32_paddq256 (v4di,v4di) +v32qi __builtin_ia32_paddsb256 (v32qi,v32qi) +v16hi __builtin_ia32_paddsw256 (v16hi,v16hi) +v32qi __builtin_ia32_paddusb256 (v32qi,v32qi) +v16hi __builtin_ia32_paddusw256 (v16hi,v16hi) +v4di __builtin_ia32_palignr256 (v4di,v4di,int) +v4di __builtin_ia32_andsi256 (v4di,v4di) +v4di __builtin_ia32_andnotsi256 (v4di,v4di) +v32qi __builtin_ia32_pavgb256 (v32qi,v32qi) +v16hi __builtin_ia32_pavgw256 (v16hi,v16hi) +v32qi __builtin_ia32_pblendvb256 (v32qi,v32qi,v32qi) +v16hi __builtin_ia32_pblendw256 (v16hi,v16hi,int) +v32qi __builtin_ia32_pcmpeqb256 (v32qi,v32qi) +v16hi __builtin_ia32_pcmpeqw256 (v16hi,v16hi) +v8si __builtin_ia32_pcmpeqd256 (v8si,v8si) +v4di __builtin_ia32_pcmpeqq256 (v4di,v4di) +v32qi __builtin_ia32_pcmpgtb256 (v32qi,v32qi) +v16hi __builtin_ia32_pcmpgtw256 (v16hi,v16hi) +v8si __builtin_ia32_pcmpgtd256 (v8si,v8si) +v4di __builtin_ia32_pcmpgtq256 (v4di,v4di) +v16hi __builtin_ia32_phaddw256 (v16hi,v16hi) +v8si __builtin_ia32_phaddd256 (v8si,v8si) +v16hi __builtin_ia32_phaddsw256 (v16hi,v16hi) +v16hi __builtin_ia32_phsubw256 (v16hi,v16hi) +v8si __builtin_ia32_phsubd256 (v8si,v8si) +v16hi __builtin_ia32_phsubsw256 (v16hi,v16hi) +v16hi __builtin_ia32_pmaddubsw256 (v32qi,v32qi) +v8si __builtin_ia32_pmaddwd256 (v16hi,v16hi) +v32qi __builtin_ia32_pmaxsb256 (v32qi,v32qi) +v16hi __builtin_ia32_pmaxsw256 (v16hi,v16hi) +v8si __builtin_ia32_pmaxsd256 (v8si,v8si) +v32qi __builtin_ia32_pmaxub256 (v32qi,v32qi) +v16hi __builtin_ia32_pmaxuw256 (v16hi,v16hi) +v8si
__builtin_ia32_pmaxud256 (v8si,v8si) +v32qi __builtin_ia32_pminsb256 (v32qi,v32qi) +v16hi __builtin_ia32_pminsw256 (v16hi,v16hi) +v8si __builtin_ia32_pminsd256 (v8si,v8si) +v32qi __builtin_ia32_pminub256 (v32qi,v32qi) +v16hi __builtin_ia32_pminuw256 (v16hi,v16hi) +v8si __builtin_ia32_pminud256 (v8si,v8si) +int __builtin_ia32_pmovmskb256 (v32qi) +v16hi __builtin_ia32_pmovsxbw256 (v16qi) +v8si __builtin_ia32_pmovsxbd256 (v16qi) +v4di __builtin_ia32_pmovsxbq256 (v16qi) +v8si __builtin_ia32_pmovsxwd256 (v8hi) +v4di __builtin_ia32_pmovsxwq256 (v8hi) +v4di __builtin_ia32_pmovsxdq256 (v4si) +v16hi __builtin_ia32_pmovzxbw256 (v16qi) +v8si __builtin_ia32_pmovzxbd256 (v16qi) +v4di __builtin_ia32_pmovzxbq256 (v16qi) +v8si __builtin_ia32_pmovzxwd256 (v8hi) +v4di __builtin_ia32_pmovzxwq256 (v8hi) +v4di __builtin_ia32_pmovzxdq256 (v4si) +v4di __builtin_ia32_pmuldq256 (v8si,v8si) +v16hi __builtin_ia32_pmulhrsw256 (v16hi,v16hi) +v16hi __builtin_ia32_pmulhuw256 (v16hi,v16hi) +v16hi __builtin_ia32_pmulhw256 (v16hi,v16hi) +v16hi __builtin_ia32_pmullw256 (v16hi,v16hi) +v8si __builtin_ia32_pmulld256 (v8si,v8si) +v4di __builtin_ia32_pmuludq256 (v8si,v8si) +v4di __builtin_ia32_por256 (v4di,v4di) +v4di __builtin_ia32_psadbw256 (v32qi,v32qi) +v32qi __builtin_ia32_pshufb256 (v32qi,v32qi) +v8si __builtin_ia32_pshufd256 (v8si,int) +v16hi __builtin_ia32_pshufhw256 (v16hi,int) +v16hi __builtin_ia32_pshuflw256 (v16hi,int) +v32qi __builtin_ia32_psignb256 (v32qi,v32qi) +v16hi __builtin_ia32_psignw256 (v16hi,v16hi) +v8si __builtin_ia32_psignd256 (v8si,v8si) +v4di __builtin_ia32_pslldqi256 (v4di,int) +v16hi __builtin_ia32_psllwi256 (v16hi,int) +v16hi __builtin_ia32_psllw256 (v16hi,v8hi) +v8si __builtin_ia32_pslldi256 (v8si,int) +v8si __builtin_ia32_pslld256 (v8si,v4si) +v4di __builtin_ia32_psllqi256 (v4di,int) +v4di __builtin_ia32_psllq256 (v4di,v2di) +v16hi __builtin_ia32_psrawi256 (v16hi,int) +v16hi __builtin_ia32_psraw256 (v16hi,v8hi) +v8si __builtin_ia32_psradi256 (v8si,int) +v8si __builtin_ia32_psrad256 (v8si,v4si) +v4di __builtin_ia32_psrldqi256 (v4di,int) +v16hi __builtin_ia32_psrlwi256 (v16hi,int) +v16hi __builtin_ia32_psrlw256 (v16hi,v8hi) +v8si __builtin_ia32_psrldi256 (v8si,int) +v8si __builtin_ia32_psrld256 (v8si,v4si) +v4di __builtin_ia32_psrlqi256 (v4di,int) +v4di __builtin_ia32_psrlq256 (v4di,v2di) +v32qi __builtin_ia32_psubb256 (v32qi,v32qi) +v16hi __builtin_ia32_psubw256 (v16hi,v16hi) +v8si __builtin_ia32_psubd256 (v8si,v8si) +v4di __builtin_ia32_psubq256 (v4di,v4di) +v32qi __builtin_ia32_psubsb256 (v32qi,v32qi) +v16hi __builtin_ia32_psubsw256 (v16hi,v16hi) +v32qi __builtin_ia32_psubusb256 (v32qi,v32qi) +v16hi __builtin_ia32_psubusw256 (v16hi,v16hi) +v32qi __builtin_ia32_punpckhbw256 (v32qi,v32qi) +v16hi __builtin_ia32_punpckhwd256 (v16hi,v16hi) +v8si __builtin_ia32_punpckhdq256 (v8si,v8si) +v4di __builtin_ia32_punpckhqdq256 (v4di,v4di) +v32qi __builtin_ia32_punpcklbw256 (v32qi,v32qi) +v16hi __builtin_ia32_punpcklwd256 (v16hi,v16hi) +v8si __builtin_ia32_punpckldq256 (v8si,v8si) +v4di __builtin_ia32_punpcklqdq256 (v4di,v4di) +v4di __builtin_ia32_pxor256 (v4di,v4di) +v4di __builtin_ia32_movntdqa256 (pv4di) +v4sf __builtin_ia32_vbroadcastss_ps (v4sf) +v8sf __builtin_ia32_vbroadcastss_ps256 (v4sf) +v4df __builtin_ia32_vbroadcastsd_pd256 (v2df) +v4di __builtin_ia32_vbroadcastsi256 (v2di) +v4si __builtin_ia32_pblendd128 (v4si,v4si,int) +v8si __builtin_ia32_pblendd256 (v8si,v8si,int) +v32qi __builtin_ia32_pbroadcastb256 (v16qi) +v16hi __builtin_ia32_pbroadcastw256 (v8hi) +v8si __builtin_ia32_pbroadcastd256 (v4si) +v4di
__builtin_ia32_pbroadcastq256 (v2di) +v16qi __builtin_ia32_pbroadcastb128 (v16qi) +v8hi __builtin_ia32_pbroadcastw128 (v8hi) +v4si __builtin_ia32_pbroadcastd128 (v4si) +v2di __builtin_ia32_pbroadcastq128 (v2di) +v8si __builtin_ia32_permvarsi256 (v8si,v8si) +v4df __builtin_ia32_permdf256 (v4df,int) +v8sf __builtin_ia32_permvarsf256 (v8sf,v8sf) +v4di __builtin_ia32_permdi256 (v4di,int) +v4di __builtin_ia32_permti256 (v4di,v4di,int) +v2di __builtin_ia32_extract128i256 (v4di,int) +v4di __builtin_ia32_insert128i256 (v4di,v2di,int) +v8si __builtin_ia32_maskloadd256 (pcv8si,v8si) +v4di __builtin_ia32_maskloadq256 (pcv4di,v4di) +v4si __builtin_ia32_maskloadd (pcv4si,v4si) +v2di __builtin_ia32_maskloadq (pcv2di,v2di) +void __builtin_ia32_maskstored256 (pv8si,v8si,v8si) +void __builtin_ia32_maskstoreq256 (pv4di,v4di,v4di) +void __builtin_ia32_maskstored (pv4si,v4si,v4si) +void __builtin_ia32_maskstoreq (pv2di,v2di,v2di) +v8si __builtin_ia32_psllv8si (v8si,v8si) +v4si __builtin_ia32_psllv4si (v4si,v4si) +v4di __builtin_ia32_psllv4di (v4di,v4di) +v2di __builtin_ia32_psllv2di (v2di,v2di) +v8si __builtin_ia32_psrav8si (v8si,v8si) +v4si __builtin_ia32_psrav4si (v4si,v4si) +v8si __builtin_ia32_psrlv8si (v8si,v8si) +v4si __builtin_ia32_psrlv4si (v4si,v4si) +v4di __builtin_ia32_psrlv4di (v4di,v4di) +v2di __builtin_ia32_psrlv2di (v2di,v2di) +v2df __builtin_ia32_gathersiv2df (v2df, pcdouble,v4si,v2df,int) +v4df __builtin_ia32_gathersiv4df (v4df, pcdouble,v4si,v4df,int) +v2df __builtin_ia32_gatherdiv2df (v2df, pcdouble,v2di,v2df,int) +v4df __builtin_ia32_gatherdiv4df (v4df, pcdouble,v4di,v4df,int) +v4sf __builtin_ia32_gathersiv4sf (v4sf, pcfloat,v4si,v4sf,int) +v8sf __builtin_ia32_gathersiv8sf (v8sf, pcfloat,v8si,v8sf,int) +v4sf __builtin_ia32_gatherdiv4sf (v4sf, pcfloat,v2di,v4sf,int) +v4sf __builtin_ia32_gatherdiv4sf256 (v4sf, pcfloat,v4di,v4sf,int) +v2di __builtin_ia32_gathersiv2di (v2di, pcint64,v4si,v2di,int) +v4di __builtin_ia32_gathersiv4di (v4di, pcint64,v4si,v4di,int) +v2di __builtin_ia32_gatherdiv2di (v2di, pcint64,v2di,v2di,int) +v4di __builtin_ia32_gatherdiv4di (v4di, pcint64,v4di,v4di,int) +v4si __builtin_ia32_gathersiv4si (v4si, pcint,v4si,v4si,int) +v8si __builtin_ia32_gathersiv8si (v8si, pcint,v8si,v8si,int) +v4si __builtin_ia32_gatherdiv4si (v4si, pcint,v2di,v4si,int) +v4si __builtin_ia32_gatherdiv4si256 (v4si, pcint,v4di,v4si,int) +@end smallexample + The following built-in functions are available when @option{-maes} is used. All of them generate the machine instruction that is part of the name.
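As a rough usage sketch rather than part of the patch itself: the @option{-mavx2} built-ins listed above operate on GCC's generic vector types, so one of them can be called directly as below. The v32qi typedef and the function name are illustrative assumptions; the built-in name, its v32qi arguments and its mapping to vpavgb are taken from the table and the avx2_uavgv32qi3 patterns above, and the code assumes compilation with @option{-mavx2}.

@smallexample
/* Rounded unsigned average of 32 byte elements, i.e. (a + b + 1) >> 1,
   matching the RTL of avx2_uavgv32qi3; expands to a single vpavgb.  */
typedef char v32qi __attribute__ ((vector_size (32)));

v32qi
average_bytes (v32qi a, v32qi b)
{
  return __builtin_ia32_pavgb256 (a, b);
}
@end smallexample

For the gather built-ins at the end of the list, the argument order follows operands 1-5 of the avx2_gathersi/avx2_gatherdi expanders: merge source, base pointer, index vector, mask, and a scale that must be a literal 1, 2, 4 or 8 (const1248_operand).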