56 files changed, 4119 insertions, 25 deletions
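This change adds AMD FMA4 (4-operand fused multiply/add) support: the -mfma4 option and __FMA4__ macro, the fma4intrin.h header reached through x86intrin.h, the __builtin_ia32_vfmadd* builtins, and the fma4/fma4i insn patterns in sse.md. As a quick illustration of the user-visible side (not part of the patch itself; the function names here are made up), code along these lines could use the new intrinsics once compiled with -mfma4:

/* Illustrative use of the FMA4 intrinsics added below.  Build with
   -mfma4 so that __FMA4__ is defined and <x86intrin.h> pulls in
   <fma4intrin.h>.  */
#include <x86intrin.h>

/* a*b + c on four packed floats, via __builtin_ia32_vfmaddps.  */
__m128 madd_ps (__m128 a, __m128 b, __m128 c)
{
  return _mm_macc_ps (a, b, c);
}

/* a*b - c on four packed doubles in the 256-bit form; -mfma4 also
   turns on AVX through OPTION_MASK_ISA_FMA4_SET.  */
__m256d msub_pd (__m256d a, __m256d b, __m256d c)
{
  return _mm256_msub_pd (a, b, c);
}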
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index b8c0905..2fce0d5 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,125 @@ +2009-09-29 Harsha Jagasia <harsha.jagasia@amd.com> + + * config.gcc (i[34567]86-*-*): Include fma4intrin.h. + (x86_64-*-*): Ditto. + + * config/i386/fma4intrin.h: New file, provide common x86 compiler + intrinisics for FMA4. + * config/i386/cpuid.h (bit_FMA4): Define FMA4 bit. + * config/i386/x86intrin.h: Fix typo to SSE4A instead of SSE4a. + Add FMA4 check and fma4intrin.h. + * config/i386/i386-c.c(ix86_target_macros_internal): Check + ISA_FLAG for FMA4. + * config/i386/i386.h(TARGET_FMA4): New macro for FMA4. + * config/i386/i386.md (UNSPEC_FMA4_INTRINSIC): Add new UNSPEC + constant for FMA4 support. + (UNSPEC_FMA4_FMADDSUB): Ditto. + (UNSPEC_FMA4_FMSUBADD): Ditto. + * config/i386/i386.opt (-mfma4): New switch for FMA4 support. + * config/i386/i386-protos.h (ix86_fma4_valid_op_p): Add + declaration. + (ix86_expand_fma4_multiple_memory): Ditto. + * config/i386/i386.c (OPTION_MASK_ISA_FMA4_SET): New. + (OPTION_MASK_ISA_FMA4_UNSET): New. + (OPTION_MASK_ISA_SSE4A_UNSET): Change definition to + depend on FMA4. + (OPTION_MASK_ISA_AVX_UNSET): Change definition to + depend on FMA4. + (ix86_handle_option): Handle -mfma4. + (isa_opts): Handle -mfma4. + (enum pta_flags): Add PTA_FMA4. + (override_options): Add FMA4 support. + (IX86_BUILTIN_VFMADDSS): New for FMA4 intrinsic. + (IX86_BUILTIN_VFMADDSD): Ditto. + (IX86_BUILTIN_VFMADDPS): Ditto. + (IX86_BUILTIN_VFMADDPD): Ditto. + (IX86_BUILTIN_VFMSUBSS): Ditto. + (IX86_BUILTIN_VFMSUBSD): Ditto. + (IX86_BUILTIN_VFMSUBPS): Ditto. + (IX86_BUILTIN_VFMSUBPD): Ditto. + (IX86_BUILTIN_VFMADDSUBPS): Ditto. + (IX86_BUILTIN_VFMADDSUBPD): Ditto. + (IX86_BUILTIN_VFMSUBADDPS): Ditto. + (IX86_BUILTIN_VFMSUBADDPD): Ditto. + (IX86_BUILTIN_VFNMADDSS): Ditto. + (IX86_BUILTIN_VFNMADDSD): Ditto. + (IX86_BUILTIN_VFNMADDPS): Ditto. + (IX86_BUILTIN_VFNMADDPD): Ditto. + (IX86_BUILTIN_VFNMSUBSS): Ditto. + (IX86_BUILTIN_VFNMSUBSD): Ditto. + (IX86_BUILTIN_VFNMSUBPS): Ditto. + (IX86_BUILTIN_VFNMSUBPD): Ditto. + (IX86_BUILTIN_VFMADDPS256): Ditto. + (IX86_BUILTIN_VFMADDPD256): Ditto. + (IX86_BUILTIN_VFMSUBPS256): Ditto. + (IX86_BUILTIN_VFMSUBPD256): Ditto. + (IX86_BUILTIN_VFMADDSUBPS256): Ditto. + (IX86_BUILTIN_VFMADDSUBPD256): Ditto. + (IX86_BUILTIN_VFMSUBADDPS256): Ditto. + (IX86_BUILTIN_VFMSUBADDPD256): Ditto. + (IX86_BUILTIN_VFNMADDPS256): Ditto. + (IX86_BUILTIN_VFNMADDPD256): Ditto. + (IX86_BUILTIN_VFNMSUBPS256): Ditto. + (IX86_BUILTIN_VFNMSUBPD256): Ditto. + (enum multi_arg_type): New enum for describing the various FMA4 + intrinsic argument types. + (bdesc_multi_arg): New table for FMA4 intrinsics. + (ix86_init_mmx_sse_builtins): Add FMA4 intrinsic support. + (ix86_expand_multi_arg_builtin): New function for creating FMA4 + intrinsics. + (ix86_expand_builtin): Add FMA4 intrinsic support. + (ix86_fma4_valid_op_p): New function to validate FMA4 3 and 4 + operand instructions. + (ix86_expand_fma4_multiple_memory): New function to split the + second memory reference from FMA4 instructions. + * config/i386/sse.md (ssemodesuffixf4): New mode attribute for FMA4. + (ssemodesuffixf2s): Ditto. + (fma4_fmadd<mode>4): Add FMA4 floating point multiply/add + instructions. + (fma4_fmsub<mode>4): Ditto. + (fma4_fnmadd<mode>4): Ditto. + (fma4_fnmsub<mode>4): Ditto. + (fma4_vmfmadd<mode>4): Ditto. + (fma4_vmfmsub<mode>4): Ditto. + (fma4_vmfnmadd<mode>4): Ditto. + (fma4_vmfnmsub<mode>4): Ditto. + (fma4_fmadd<mode>4256): Ditto. + (fma4_fmsub<mode>4256): Ditto. 
+ (fma4_fnmadd<mode>4256): Ditto. + (fma4_fnmsub<mode>4256): Ditto. + (fma4_fmaddsubv8sf4): Ditto. + (fma4_fmaddsubv4sf4): Ditto. + (fma4_fmaddsubv4df4): Ditto. + (fma4_fmaddsubv2df4): Ditto. + (fma4_fmsubaddv8sf4): Ditto. + (fma4_fmsubaddv4sf4): Ditto. + (fma4_fmsubaddv4df4): Ditto. + (fma4_fmsubaddv2df4): Ditto. + (fma4i_fmadd<mode>4): Add FMA4 floating point multiply/add + instructions for intrinsics. + (fma4i_fmsub<mode>4): Ditto. + (fma4i_fnmadd<mode>4): Ditto. + (fma4i_fnmsub<mode>4): Ditto. + (fma4i_vmfmadd<mode>4): Ditto. + (fma4i_vmfmsub<mode>4): Ditto. + (fma4i_vmfnmadd<mode>4): Ditto. + (fma4i_vmfnmsub<mode>4): Ditto. + (fma4i_fmadd<mode>4256): Ditto. + (fma4i_fmsub<mode>4256): Ditto. + (fma4i_fnmadd<mode>4256): Ditto. + (fma4i_fnmsub<mode>4256): Ditto. + (fma4i_fmaddsubv8sf4): Ditto. + (fma4i_fmaddsubv4sf4): Ditto. + (fma4i_fmaddsubv4df4): Ditto. + (fma4i_fmaddsubv2df4): Ditto. + (fma4i_fmsubaddv8sf4): Ditto. + (fma4i_fmsubaddv4sf4): Ditto. + (fma4i_fmsubaddv4df4): Ditto. + (fma4i_fmsubaddv2df4): Ditto. + + * doc/invoke.texi (-mfma4): Add documentation. + * doc/extend.texi (x86 intrinsics): Add FMA4 intrinsics. + 2009-09-29 Richard Henderson <rth@redhat.com> * tree-eh.c (unsplit_eh): Do not unsplit if there's already diff --git a/gcc/config.gcc b/gcc/config.gcc index 5ad5770..6351aa5 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -286,8 +286,9 @@ i[34567]86-*-*) cxx_target_objs="i386-c.o" extra_headers="cpuid.h mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h pmmintrin.h tmmintrin.h ammintrin.h smmintrin.h - nmmintrin.h bmmintrin.h wmmintrin.h immintrin.h - x86intrin.h avxintrin.h ia32intrin.h cross-stdarg.h" + nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h + immintrin.h x86intrin.h avxintrin.h + ia32intrin.h cross-stdarg.h" ;; x86_64-*-*) cpu_type=i386 @@ -295,8 +296,9 @@ x86_64-*-*) cxx_target_objs="i386-c.o" extra_headers="cpuid.h mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h pmmintrin.h tmmintrin.h ammintrin.h smmintrin.h - nmmintrin.h bmmintrin.h wmmintrin.h immintrin.h - x86intrin.h avxintrin.h ia32intrin.h cross-stdarg.h" + nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h + immintrin.h x86intrin.h avxintrin.h + ia32intrin.h cross-stdarg.h" need_64bit_hwint=yes ;; ia64-*-*) diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h index 61a3dca..49acfa7 100644 --- a/gcc/config/i386/cpuid.h +++ b/gcc/config/i386/cpuid.h @@ -48,6 +48,7 @@ /* %ecx */ #define bit_LAHF_LM (1 << 0) #define bit_SSE4a (1 << 6) +#define bit_FMA4 (1 << 16) /* %edx */ #define bit_LM (1 << 29) diff --git a/gcc/config/i386/fma4intrin.h b/gcc/config/i386/fma4intrin.h new file mode 100644 index 0000000..42782ad --- /dev/null +++ b/gcc/config/i386/fma4intrin.h @@ -0,0 +1,245 @@ +/* Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. 
+ + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _X86INTRIN_H_INCLUDED +# error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead." +#endif + +#ifndef _FMA4INTRIN_H_INCLUDED +#define _FMA4INTRIN_H_INCLUDED + +#ifndef __FMA4__ +# error "FMA4 instruction set not enabled" +#else + +/* We need definitions from the SSE4A, SSE3, SSE2 and SSE header files. */ +#include <ammintrin.h> + +/* Internal data types for implementing the intrinsics. */ +typedef float __v8sf __attribute__ ((__vector_size__ (32))); +typedef double __v4df __attribute__ ((__vector_size__ (32))); + +typedef float __m256 __attribute__ ((__vector_size__ (32), + __may_alias__)); +typedef double __m256d __attribute__ ((__vector_size__ (32), + __may_alias__)); + +/* 128b Floating point multiply/add type instructions. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_macc_ps (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_macc_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_macc_ss (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_macc_sd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsd ((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_msub_ps (__m128 __A, __m128 __B, __m128 __C) + +{ + return (__m128) __builtin_ia32_vfmsubps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_msub_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmsubpd ((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_msub_ss (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmsubss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_msub_sd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmsubsd ((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmacc_ps (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfnmaddps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmacc_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfnmaddpd ((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmacc_ss (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) 
__builtin_ia32_vfnmaddss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmacc_sd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfnmaddsd ((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmsub_ps (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfnmsubps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmsub_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfnmsubpd ((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmsub_ss (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfnmsubss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmsub_sd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfnmsubsd ((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maddsub_ps (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maddsub_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_msubadd_ps (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmsubaddps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_msubadd_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmsubaddpd ((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +/* 256b Floating point multiply/add type instructions. 
*/ +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_macc_ps (__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_macc_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_msub_ps (__m256 __A, __m256 __B, __m256 __C) + +{ + return (__m256) __builtin_ia32_vfmsubps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_msub_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmsubpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_nmacc_ps (__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_vfnmaddps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_nmacc_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_vfnmaddpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_nmsub_ps (__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_vfnmsubps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_nmsub_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_vfnmsubpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maddsub_ps (__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddsubps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maddsub_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddsubpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_msubadd_ps (__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_vfmsubaddps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_msubadd_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmsubaddpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C); +} + +#endif + +#endif diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c index 4c960e7..12a3f17 100644 --- a/gcc/config/i386/i386-c.c +++ b/gcc/config/i386/i386-c.c @@ -230,6 +230,8 @@ ix86_target_macros_internal (int isa_flag, def_or_undef (parse_in, "__FMA__"); if (isa_flag & OPTION_MASK_ISA_SSE4A) def_or_undef (parse_in, "__SSE4A__"); + if (isa_flag & OPTION_MASK_ISA_FMA4) + def_or_undef (parse_in, "__FMA4__"); if ((fpmath & FPMATH_SSE) && (isa_flag & OPTION_MASK_ISA_SSE)) def_or_undef (parse_in, "__SSE_MATH__"); if ((fpmath & FPMATH_SSE) && (isa_flag & OPTION_MASK_ISA_SSE2)) diff --git 
a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 389fc3c..58da131 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -214,6 +214,9 @@ extern void ix86_expand_vector_set (bool, rtx, rtx, int); extern void ix86_expand_vector_extract (bool, rtx, rtx, int); extern void ix86_expand_reduc_v4sf (rtx (*)(rtx, rtx, rtx), rtx, rtx); +extern bool ix86_fma4_valid_op_p (rtx [], rtx, int, bool, int, bool); +extern void ix86_expand_fma4_multiple_memory (rtx [], int, enum machine_mode); + /* In i386-c.c */ extern void ix86_target_macros (void); extern void ix86_register_pragmas (void); diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 3b11b91..9df01ba 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -1955,6 +1955,9 @@ static int ix86_isa_flags_explicit; #define OPTION_MASK_ISA_SSE4A_SET \ (OPTION_MASK_ISA_SSE4A | OPTION_MASK_ISA_SSE3_SET) +#define OPTION_MASK_ISA_FMA4_SET \ + (OPTION_MASK_ISA_FMA4 | OPTION_MASK_ISA_SSE4A_SET \ + | OPTION_MASK_ISA_AVX_SET) /* AES and PCLMUL need SSE2 because they use xmm registers */ #define OPTION_MASK_ISA_AES_SET \ @@ -1995,7 +1998,8 @@ static int ix86_isa_flags_explicit; #define OPTION_MASK_ISA_SSE4_2_UNSET \ (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_AVX_UNSET ) #define OPTION_MASK_ISA_AVX_UNSET \ - (OPTION_MASK_ISA_AVX | OPTION_MASK_ISA_FMA_UNSET) + (OPTION_MASK_ISA_AVX | OPTION_MASK_ISA_FMA_UNSET \ + | OPTION_MASK_ISA_FMA4_UNSET) #define OPTION_MASK_ISA_FMA_UNSET OPTION_MASK_ISA_FMA /* SSE4 includes both SSE4.1 and SSE4.2. -mno-sse4 should the same @@ -2003,7 +2007,10 @@ static int ix86_isa_flags_explicit; #define OPTION_MASK_ISA_SSE4_UNSET OPTION_MASK_ISA_SSE4_1_UNSET #define OPTION_MASK_ISA_SSE4A_UNSET \ - (OPTION_MASK_ISA_SSE4A) + (OPTION_MASK_ISA_SSE4A | OPTION_MASK_ISA_FMA4_UNSET) + +#define OPTION_MASK_ISA_FMA4_UNSET OPTION_MASK_ISA_FMA4 + #define OPTION_MASK_ISA_AES_UNSET OPTION_MASK_ISA_AES #define OPTION_MASK_ISA_PCLMUL_UNSET OPTION_MASK_ISA_PCLMUL #define OPTION_MASK_ISA_ABM_UNSET OPTION_MASK_ISA_ABM @@ -2237,6 +2244,19 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value) } return true; + case OPT_mfma4: + if (value) + { + ix86_isa_flags |= OPTION_MASK_ISA_FMA4_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_FMA4_SET; + } + else + { + ix86_isa_flags &= ~OPTION_MASK_ISA_FMA4_UNSET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_FMA4_UNSET; + } + return true; + case OPT_mabm: if (value) { @@ -2364,6 +2384,7 @@ ix86_target_string (int isa, int flags, const char *arch, const char *tune, static struct ix86_target_opts isa_opts[] = { { "-m64", OPTION_MASK_ISA_64BIT }, + { "-mfma4", OPTION_MASK_ISA_FMA4 }, { "-msse4a", OPTION_MASK_ISA_SSE4A }, { "-msse4.2", OPTION_MASK_ISA_SSE4_2 }, { "-msse4.1", OPTION_MASK_ISA_SSE4_1 }, @@ -2593,7 +2614,8 @@ override_options (bool main_args_p) PTA_PCLMUL = 1 << 17, PTA_AVX = 1 << 18, PTA_FMA = 1 << 19, - PTA_MOVBE = 1 << 20 + PTA_MOVBE = 1 << 20, + PTA_FMA4 = 1 << 21 }; static struct pta @@ -2936,6 +2958,9 @@ override_options (bool main_args_p) if (processor_alias_table[i].flags & PTA_SSE4A && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A)) ix86_isa_flags |= OPTION_MASK_ISA_SSE4A; + if (processor_alias_table[i].flags & PTA_FMA4 + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4)) + ix86_isa_flags |= OPTION_MASK_ISA_FMA4; if (processor_alias_table[i].flags & PTA_ABM && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM)) ix86_isa_flags |= OPTION_MASK_ISA_ABM; @@ -3619,6 +3644,7 @@ 
ix86_valid_target_attribute_inner_p (tree args, char *p_strings[]) IX86_ATTR_ISA ("sse4.2", OPT_msse4_2), IX86_ATTR_ISA ("sse4a", OPT_msse4a), IX86_ATTR_ISA ("ssse3", OPT_mssse3), + IX86_ATTR_ISA ("fma4", OPT_mfma4), /* string options */ IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH), @@ -20686,6 +20712,39 @@ enum ix86_builtins IX86_BUILTIN_CVTUDQ2PS, + /* FMA4 instructions. */ + IX86_BUILTIN_VFMADDSS, + IX86_BUILTIN_VFMADDSD, + IX86_BUILTIN_VFMADDPS, + IX86_BUILTIN_VFMADDPD, + IX86_BUILTIN_VFMSUBSS, + IX86_BUILTIN_VFMSUBSD, + IX86_BUILTIN_VFMSUBPS, + IX86_BUILTIN_VFMSUBPD, + IX86_BUILTIN_VFMADDSUBPS, + IX86_BUILTIN_VFMADDSUBPD, + IX86_BUILTIN_VFMSUBADDPS, + IX86_BUILTIN_VFMSUBADDPD, + IX86_BUILTIN_VFNMADDSS, + IX86_BUILTIN_VFNMADDSD, + IX86_BUILTIN_VFNMADDPS, + IX86_BUILTIN_VFNMADDPD, + IX86_BUILTIN_VFNMSUBSS, + IX86_BUILTIN_VFNMSUBSD, + IX86_BUILTIN_VFNMSUBPS, + IX86_BUILTIN_VFNMSUBPD, + IX86_BUILTIN_VFMADDPS256, + IX86_BUILTIN_VFMADDPD256, + IX86_BUILTIN_VFMSUBPS256, + IX86_BUILTIN_VFMSUBPD256, + IX86_BUILTIN_VFMADDSUBPS256, + IX86_BUILTIN_VFMADDSUBPD256, + IX86_BUILTIN_VFMSUBADDPS256, + IX86_BUILTIN_VFMSUBADDPD256, + IX86_BUILTIN_VFNMADDPS256, + IX86_BUILTIN_VFNMADDPD256, + IX86_BUILTIN_VFNMSUBPS256, + IX86_BUILTIN_VFNMSUBPD256, IX86_BUILTIN_MAX }; @@ -21759,6 +21818,56 @@ static const struct builtin_description bdesc_args[] = { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF }, }; +/* FMA4. */ +enum multi_arg_type { + MULTI_ARG_UNKNOWN, + MULTI_ARG_3_SF, + MULTI_ARG_3_DF, + MULTI_ARG_3_SF2, + MULTI_ARG_3_DF2 +}; + +static const struct builtin_description bdesc_multi_arg[] = +{ + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmaddv4sf4, "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS, UNKNOWN, (int)MULTI_ARG_3_SF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmaddv2df4, "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD, UNKNOWN, (int)MULTI_ARG_3_DF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddv4sf4, "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS, UNKNOWN, (int)MULTI_ARG_3_SF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddv2df4, "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD, UNKNOWN, (int)MULTI_ARG_3_DF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmsubv4sf4, "__builtin_ia32_vfmsubss", IX86_BUILTIN_VFMSUBSS, UNKNOWN, (int)MULTI_ARG_3_SF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmsubv2df4, "__builtin_ia32_vfmsubsd", IX86_BUILTIN_VFMSUBSD, UNKNOWN, (int)MULTI_ARG_3_DF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubv4sf4, "__builtin_ia32_vfmsubps", IX86_BUILTIN_VFMSUBPS, UNKNOWN, (int)MULTI_ARG_3_SF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubv2df4, "__builtin_ia32_vfmsubpd", IX86_BUILTIN_VFMSUBPD, UNKNOWN, (int)MULTI_ARG_3_DF }, + + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfnmaddv4sf4, "__builtin_ia32_vfnmaddss", IX86_BUILTIN_VFNMADDSS, UNKNOWN, (int)MULTI_ARG_3_SF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfnmaddv2df4, "__builtin_ia32_vfnmaddsd", IX86_BUILTIN_VFNMADDSD, UNKNOWN, (int)MULTI_ARG_3_DF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmaddv4sf4, "__builtin_ia32_vfnmaddps", IX86_BUILTIN_VFNMADDPS, UNKNOWN, (int)MULTI_ARG_3_SF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmaddv2df4, "__builtin_ia32_vfnmaddpd", IX86_BUILTIN_VFNMADDPD, UNKNOWN, (int)MULTI_ARG_3_DF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfnmsubv4sf4, "__builtin_ia32_vfnmsubss", IX86_BUILTIN_VFNMSUBSS, UNKNOWN, (int)MULTI_ARG_3_SF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfnmsubv2df4, 
"__builtin_ia32_vfnmsubsd", IX86_BUILTIN_VFNMSUBSD, UNKNOWN, (int)MULTI_ARG_3_DF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmsubv4sf4, "__builtin_ia32_vfnmsubps", IX86_BUILTIN_VFNMSUBPS, UNKNOWN, (int)MULTI_ARG_3_SF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmsubv2df4, "__builtin_ia32_vfnmsubpd", IX86_BUILTIN_VFNMSUBPD, UNKNOWN, (int)MULTI_ARG_3_DF }, + + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddsubv4sf4, "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS, UNKNOWN, (int)MULTI_ARG_3_SF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddsubv2df4, "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD, UNKNOWN, (int)MULTI_ARG_3_DF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubaddv4sf4, "__builtin_ia32_vfmsubaddps", IX86_BUILTIN_VFMSUBADDPS, UNKNOWN, (int)MULTI_ARG_3_SF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubaddv2df4, "__builtin_ia32_vfmsubaddpd", IX86_BUILTIN_VFMSUBADDPD, UNKNOWN, (int)MULTI_ARG_3_DF }, + + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddv8sf4256, "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256, UNKNOWN, (int)MULTI_ARG_3_SF2 }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddv4df4256, "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256, UNKNOWN, (int)MULTI_ARG_3_DF2 }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubv8sf4256, "__builtin_ia32_vfmsubps256", IX86_BUILTIN_VFMSUBPS256, UNKNOWN, (int)MULTI_ARG_3_SF2 }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubv4df4256, "__builtin_ia32_vfmsubpd256", IX86_BUILTIN_VFMSUBPD256, UNKNOWN, (int)MULTI_ARG_3_DF2 }, + + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmaddv8sf4256, "__builtin_ia32_vfnmaddps256", IX86_BUILTIN_VFNMADDPS256, UNKNOWN, (int)MULTI_ARG_3_SF2 }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmaddv4df4256, "__builtin_ia32_vfnmaddpd256", IX86_BUILTIN_VFNMADDPD256, UNKNOWN, (int)MULTI_ARG_3_DF2 }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmsubv8sf4256, "__builtin_ia32_vfnmsubps256", IX86_BUILTIN_VFNMSUBPS256, UNKNOWN, (int)MULTI_ARG_3_SF2 }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmsubv4df4256, "__builtin_ia32_vfnmsubpd256", IX86_BUILTIN_VFNMSUBPD256, UNKNOWN, (int)MULTI_ARG_3_DF2 }, + + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddsubv8sf4, "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256, UNKNOWN, (int)MULTI_ARG_3_SF2 }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddsubv4df4, "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256, UNKNOWN, (int)MULTI_ARG_3_DF2 }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubaddv8sf4, "__builtin_ia32_vfmsubaddps256", IX86_BUILTIN_VFMSUBADDPS256, UNKNOWN, (int)MULTI_ARG_3_SF2 }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubaddv4df4, "__builtin_ia32_vfmsubaddpd256", IX86_BUILTIN_VFMSUBADDPD256, UNKNOWN, (int)MULTI_ARG_3_DF2 } + +}; /* Set up all the MMX/SSE builtins, even builtins for instructions that are not in the current target ISA to allow the user to compile particular modules @@ -23192,6 +23301,29 @@ ix86_init_mmx_sse_builtins (void) intQI_type_node, integer_type_node, NULL_TREE); def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi", ftype, IX86_BUILTIN_VEC_SET_V16QI); + /* Add FMA4 multi-arg argument instructions */ + for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++) + { + tree mtype = NULL_TREE; + + if (d->name == 0) + continue; + + switch ((enum multi_arg_type)d->flag) + { + case MULTI_ARG_3_SF: mtype = v4sf_ftype_v4sf_v4sf_v4sf; break; + case MULTI_ARG_3_DF: mtype = v2df_ftype_v2df_v2df_v2df; break; + case MULTI_ARG_3_SF2: mtype = v8sf_ftype_v8sf_v8sf_v8sf; break; + case MULTI_ARG_3_DF2: 
mtype = v4df_ftype_v4df_v4df_v4df; break; + + case MULTI_ARG_UNKNOWN: + default: + gcc_unreachable (); + } + + if (mtype) + def_builtin_const (d->mask, d->name, mtype, d->code); + } } /* Internal method for ix86_init_builtins. */ @@ -23364,6 +23496,122 @@ ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target) return target; } +/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */ + +static rtx +ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target, + enum multi_arg_type m_type, + enum rtx_code sub_code) +{ + rtx pat; + int i; + int nargs; + bool comparison_p = false; + bool tf_p = false; + bool last_arg_constant = false; + int num_memory = 0; + struct { + rtx op; + enum machine_mode mode; + } args[4]; + + enum machine_mode tmode = insn_data[icode].operand[0].mode; + + switch (m_type) + { + case MULTI_ARG_3_SF: + case MULTI_ARG_3_DF: + case MULTI_ARG_3_SF2: + case MULTI_ARG_3_DF2: + nargs = 3; + break; + + case MULTI_ARG_UNKNOWN: + default: + gcc_unreachable (); + } + + if (optimize || !target + || GET_MODE (target) != tmode + || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) + target = gen_reg_rtx (tmode); + + gcc_assert (nargs <= 4); + + for (i = 0; i < nargs; i++) + { + tree arg = CALL_EXPR_ARG (exp, i); + rtx op = expand_normal (arg); + int adjust = (comparison_p) ? 1 : 0; + enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode; + + if (last_arg_constant && i == nargs-1) + { + if (!CONST_INT_P (op)) + { + error ("last argument must be an immediate"); + return gen_reg_rtx (tmode); + } + } + else + { + if (VECTOR_MODE_P (mode)) + op = safe_vector_operand (op, mode); + + /* If we aren't optimizing, only allow one memory operand to be + generated. */ + if (memory_operand (op, mode)) + num_memory++; + + gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode); + + if (optimize + || ! (*insn_data[icode].operand[i+adjust+1].predicate) (op, mode) + || num_memory > 1) + op = force_reg (mode, op); + } + + args[i].op = op; + args[i].mode = mode; + } + + switch (nargs) + { + case 1: + pat = GEN_FCN (icode) (target, args[0].op); + break; + + case 2: + if (tf_p) + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, + GEN_INT ((int)sub_code)); + else if (! comparison_p) + pat = GEN_FCN (icode) (target, args[0].op, args[1].op); + else + { + rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target), + args[0].op, + args[1].op); + + pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op); + } + break; + + case 3: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op); + break; + + default: + gcc_unreachable (); + } + + if (! pat) + return 0; + + emit_insn (pat); + return target; +} + /* Subroutine of ix86_expand_args_builtin to take care of scalar unop insns with vec_merge. */ @@ -24633,6 +24881,12 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, if (d->code == fcode) return ix86_expand_sse_pcmpistr (d, exp, target); + for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++) + if (d->code == fcode) + return ix86_expand_multi_arg_builtin (d->icode, exp, target, + (enum multi_arg_type)d->flag, + d->comparison); + gcc_unreachable (); } @@ -29015,6 +29269,200 @@ ix86_expand_round (rtx operand0, rtx operand1) emit_move_insn (operand0, res); } +/* Validate whether a FMA4 instruction is valid or not. + OPERANDS is the array of operands. + NUM is the number of operands. + USES_OC0 is true if the instruction uses OC0 and provides 4 variants. 
+ NUM_MEMORY is the maximum number of memory operands to accept. + NUM_MEMORY less than zero is a special case to allow an operand + of an instruction to be memory operation. + when COMMUTATIVE is set, operand 1 and 2 can be swapped. */ + +bool +ix86_fma4_valid_op_p (rtx operands[], rtx insn ATTRIBUTE_UNUSED, int num, + bool uses_oc0, int num_memory, bool commutative) +{ + int mem_mask; + int mem_count; + int i; + + /* Count the number of memory arguments */ + mem_mask = 0; + mem_count = 0; + for (i = 0; i < num; i++) + { + enum machine_mode mode = GET_MODE (operands[i]); + if (register_operand (operands[i], mode)) + ; + + else if (memory_operand (operands[i], mode)) + { + mem_mask |= (1 << i); + mem_count++; + } + + else + { + rtx pattern = PATTERN (insn); + + /* allow 0 for pcmov */ + if (GET_CODE (pattern) != SET + || GET_CODE (SET_SRC (pattern)) != IF_THEN_ELSE + || i < 2 + || operands[i] != CONST0_RTX (mode)) + return false; + } + } + + /* Special case pmacsdq{l,h} where we allow the 3rd argument to be + a memory operation. */ + if (num_memory < 0) + { + num_memory = -num_memory; + if ((mem_mask & (1 << (num-1))) != 0) + { + mem_mask &= ~(1 << (num-1)); + mem_count--; + } + } + + /* If there were no memory operations, allow the insn */ + if (mem_mask == 0) + return true; + + /* Do not allow the destination register to be a memory operand. */ + else if (mem_mask & (1 << 0)) + return false; + + /* If there are too many memory operations, disallow the instruction. While + the hardware only allows 1 memory reference, before register allocation + for some insns, we allow two memory operations sometimes in order to allow + code like the following to be optimized: + + float fmadd (float *a, float *b, float *c) { return (*a * *b) + *c; } + + or similar cases that are vectorized into using the vfmaddss + instruction. */ + else if (mem_count > num_memory) + return false; + + /* Don't allow more than one memory operation if not optimizing. */ + else if (mem_count > 1 && !optimize) + return false; + + else if (num == 4 && mem_count == 1) + { + /* formats (destination is the first argument), example vfmaddss: + xmm1, xmm1, xmm2, xmm3/mem + xmm1, xmm1, xmm2/mem, xmm3 + xmm1, xmm2, xmm3/mem, xmm1 + xmm1, xmm2/mem, xmm3, xmm1 */ + if (uses_oc0) + return ((mem_mask == (1 << 1)) + || (mem_mask == (1 << 2)) + || (mem_mask == (1 << 3))); + + /* format, example vpmacsdd: + xmm1, xmm2, xmm3/mem, xmm1 */ + if (commutative) + return (mem_mask == (1 << 2) || mem_mask == (1 << 1)); + else + return (mem_mask == (1 << 2)); + } + + else if (num == 4 && num_memory == 2) + { + /* If there are two memory operations, we can load one of the memory ops + into the destination register. This is for optimizing the + multiply/add ops, which the combiner has optimized both the multiply + and the add insns to have a memory operation. We have to be careful + that the destination doesn't overlap with the inputs. */ + rtx op0 = operands[0]; + + if (reg_mentioned_p (op0, operands[1]) + || reg_mentioned_p (op0, operands[2]) + || reg_mentioned_p (op0, operands[3])) + return false; + + /* formats (destination is the first argument), example vfmaddss: + xmm1, xmm1, xmm2, xmm3/mem + xmm1, xmm1, xmm2/mem, xmm3 + xmm1, xmm2, xmm3/mem, xmm1 + xmm1, xmm2/mem, xmm3, xmm1 + + For the oc0 case, we will load either operands[1] or operands[3] into + operands[0], so any combination of 2 memory operands is ok. 
*/ + if (uses_oc0) + return true; + + /* format, example vpmacsdd: + xmm1, xmm2, xmm3/mem, xmm1 + + For the integer multiply/add instructions be more restrictive and + require operands[2] and operands[3] to be the memory operands. */ + if (commutative) + return (mem_mask == ((1 << 1) | (1 << 3)) || ((1 << 2) | (1 << 3))); + else + return (mem_mask == ((1 << 2) | (1 << 3))); + } + + else if (num == 3 && num_memory == 1) + { + /* formats, example vprotb: + xmm1, xmm2, xmm3/mem + xmm1, xmm2/mem, xmm3 */ + if (uses_oc0) + return ((mem_mask == (1 << 1)) || (mem_mask == (1 << 2))); + + /* format, example vpcomeq: + xmm1, xmm2, xmm3/mem */ + else + return (mem_mask == (1 << 2)); + } + + else + gcc_unreachable (); + + return false; +} + + +/* Fixup an FMA4 instruction that has 2 memory input references into a form the + hardware will allow by using the destination register to load one of the + memory operations. Presently this is used by the multiply/add routines to + allow 2 memory references. */ + +void +ix86_expand_fma4_multiple_memory (rtx operands[], + int num, + enum machine_mode mode) +{ + rtx op0 = operands[0]; + if (num != 4 + || memory_operand (op0, mode) + || reg_mentioned_p (op0, operands[1]) + || reg_mentioned_p (op0, operands[2]) + || reg_mentioned_p (op0, operands[3])) + gcc_unreachable (); + + /* For 2 memory operands, pick either operands[1] or operands[3] to move into + the destination register. */ + if (memory_operand (operands[1], mode)) + { + emit_move_insn (op0, operands[1]); + operands[1] = op0; + } + else if (memory_operand (operands[3], mode)) + { + emit_move_insn (op0, operands[3]); + operands[3] = op0; + } + else + gcc_unreachable (); + + return; +} + /* Table of valid machine attributes. */ static const struct attribute_spec ix86_attribute_table[] = { diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index f8259480..8d52572 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -54,6 +54,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #define TARGET_AVX OPTION_ISA_AVX #define TARGET_FMA OPTION_ISA_FMA #define TARGET_SSE4A OPTION_ISA_SSE4A +#define TARGET_FMA4 OPTION_ISA_FMA4 #define TARGET_ROUND OPTION_ISA_ROUND #define TARGET_ABM OPTION_ISA_ABM #define TARGET_POPCNT OPTION_ISA_POPCNT @@ -65,8 +66,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. 
If not, see #define TARGET_CMPXCHG16B OPTION_ISA_CX16 -/* SSE4.1 define round instructions */ -#define OPTION_MASK_ISA_ROUND (OPTION_MASK_ISA_SSE4_1) +/* SSE4.1 defines round instructions */ +#define OPTION_MASK_ISA_ROUND OPTION_MASK_ISA_SSE4_1 #define OPTION_ISA_ROUND ((ix86_isa_flags & OPTION_MASK_ISA_ROUND) != 0) #include "config/vxworks-dummy.h" @@ -1351,6 +1352,10 @@ enum reg_class (TARGET_AVX && ((MODE) == V4SFmode || (MODE) == V2DFmode \ || (MODE) == V8SFmode || (MODE) == V4DFmode)) +#define FMA4_VEC_FLOAT_MODE_P(MODE) \ + (TARGET_FMA4 && ((MODE) == V4SFmode || (MODE) == V2DFmode \ + || (MODE) == V8SFmode || (MODE) == V4DFmode)) + #define MMX_REG_P(XOP) (REG_P (XOP) && MMX_REGNO_P (REGNO (XOP))) #define MMX_REGNO_P(N) IN_RANGE ((N), FIRST_MMX_REG, LAST_MMX_REG) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 26bbc9a..5c2564e 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -195,6 +195,10 @@ (UNSPEC_PCMPESTR 144) (UNSPEC_PCMPISTR 145) + ; For FMA4 support + (UNSPEC_FMA4_INTRINSIC 150) + (UNSPEC_FMA4_FMADDSUB 151) + (UNSPEC_FMA4_FMSUBADD 152) ; For AES support (UNSPEC_AESENC 159) (UNSPEC_AESENCLAST 160) diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index f23763b..9668ff6 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -310,6 +310,10 @@ msse4a Target Report Mask(ISA_SSE4A) Var(ix86_isa_flags) VarExists Save Support MMX, SSE, SSE2, SSE3 and SSE4A built-in functions and code generation +mfma4 +Target Report Mask(ISA_FMA4) Var(ix86_isa_flags) VarExists Save +Support FMA4 built-in functions and code generation + mabm Target Report Mask(ISA_ABM) Var(ix86_isa_flags) VarExists Save Support code generation of Advanced Bit Manipulation (ABM) instructions. diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 2ddbbf5..e902965 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -49,6 +49,7 @@ (define_mode_iterator SSEMODE248 [V8HI V4SI V2DI]) (define_mode_iterator SSEMODE1248 [V16QI V8HI V4SI V2DI]) (define_mode_iterator SSEMODEF4 [SF DF V4SF V2DF]) +(define_mode_iterator FMA4MODEF4 [V8SF V4DF]) (define_mode_iterator SSEMODEF2P [V4SF V2DF]) (define_mode_iterator AVX256MODEF2P [V8SF V4DF]) @@ -74,6 +75,11 @@ ;; Mapping from integer vector mode to mnemonic suffix (define_mode_attr ssevecsize [(V16QI "b") (V8HI "w") (V4SI "d") (V2DI "q")]) +;; Mapping of the fma4 suffix +(define_mode_attr fma4modesuffixf4 [(V8SF "ps") (V4DF "pd")]) +(define_mode_attr ssemodesuffixf2s [(SF "ss") (DF "sd") + (V4SF "ss") (V2DF "sd")]) + ;; Mapping of the avx suffix (define_mode_attr ssemodesuffixf4 [(SF "ss") (DF "sd") (V4SF "ps") (V2DF "pd")]) @@ -1661,6 +1667,936 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; +;; FMA4 floating point multiply/accumulate instructions This includes the +;; scalar version of the instructions as well as the vector +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; In order to match (*a * *b) + *c, particularly when vectorizing, allow +;; combine to generate a multiply/add with two memory references. We then +;; split this insn, into loading up the destination register with one of the +;; memory operations. If we don't manage to split the insn, reload will +;; generate the appropriate moves. The reason this is needed, is that combine +;; has already folded one of the memory references into both the multiply and +;; add insns, and it can't generate a new pseudo. 
I.e.: +;; (set (reg1) (mem (addr1))) +;; (set (reg2) (mult (reg1) (mem (addr2)))) +;; (set (reg3) (plus (reg2) (mem (addr3)))) + +(define_insn "fma4_fmadd<mode>4256" + [(set (match_operand:FMA4MODEF4 0 "register_operand" "=x,x,x") + (plus:FMA4MODEF4 + (mult:FMA4MODEF4 + (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "x,x,xm") + (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "x,xm,x")) + (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "xm,x,x")))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + "vfmadd<fma4modesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +;; Split fmadd with two memory operands into a load and the fmadd. +(define_split + [(set (match_operand:FMA4MODEF4 0 "register_operand" "") + (plus:FMA4MODEF4 + (mult:FMA4MODEF4 + (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "") + (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "")) + (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "")))] + "TARGET_FMA4 + && !ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true) + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true) + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2]) + && !reg_mentioned_p (operands[0], operands[3])" + [(const_int 0)] +{ + ix86_expand_fma4_multiple_memory (operands, 4, <MODE>mode); + emit_insn (gen_fma4_fmadd<mode>4256 (operands[0], operands[1], + operands[2], operands[3])); + DONE; +}) + +;; Floating multiply and subtract +;; Allow two memory operands the same as fmadd +(define_insn "fma4_fmsub<mode>4256" + [(set (match_operand:FMA4MODEF4 0 "register_operand" "=x,x,x") + (minus:FMA4MODEF4 + (mult:FMA4MODEF4 + (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "x,x,xm") + (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "x,xm,x")) + (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "xm,x,x")))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + "vfmsub<fma4modesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +;; Split fmsub with two memory operands into a load and the fmsub. +(define_split + [(set (match_operand:FMA4MODEF4 0 "register_operand" "") + (minus:FMA4MODEF4 + (mult:FMA4MODEF4 + (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "") + (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "")) + (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "")))] + "TARGET_FMA4 + && !ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true) + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true) + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2]) + && !reg_mentioned_p (operands[0], operands[3])" + [(const_int 0)] +{ + ix86_expand_fma4_multiple_memory (operands, 4, <MODE>mode); + emit_insn (gen_fma4_fmsub<mode>4256 (operands[0], operands[1], + operands[2], operands[3])); + DONE; +}) + +;; Floating point negative multiply and add +;; Rewrite (- (a * b) + c) into the canonical form: c - (a * b) +;; Note operands are out of order to simplify call to ix86_fma4_valid_p +;; Allow two memory operands to help in optimizing. 
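(Illustrative aside, not part of the patch: GCC canonicalizes -(a*b) + c as c - (a*b), which is the shape the fnmadd patterns here match.  Assuming -mfma4 -O2 and auto-vectorization, a loop of roughly this form is the kind of source these patterns are meant to cover; the function name is made up.)

/* c - a*b per element; after vectorization, combine can fold the
   multiply and subtract into a single vfnmaddps.  */
void
fnmadd_loop (float *__restrict d, const float *__restrict a,
             const float *__restrict b, const float *__restrict c, int n)
{
  int i;
  for (i = 0; i < n; i++)
    d[i] = c[i] - a[i] * b[i];
}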
+(define_insn "fma4_fnmadd<mode>4256" + [(set (match_operand:FMA4MODEF4 0 "register_operand" "=x,x,x") + (minus:FMA4MODEF4 + (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "xm,x,x") + (mult:FMA4MODEF4 + (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "x,x,xm") + (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "x,xm,x"))))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + "vfnmadd<fma4modesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +;; Split fnmadd with two memory operands into a load and the fnmadd. +(define_split + [(set (match_operand:FMA4MODEF4 0 "register_operand" "") + (minus:FMA4MODEF4 + (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "") + (mult:FMA4MODEF4 + (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "") + (match_operand:FMA4MODEF4 2 "nonimmediate_operand" ""))))] + "TARGET_FMA4 + && !ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true) + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true) + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2]) + && !reg_mentioned_p (operands[0], operands[3])" + [(const_int 0)] +{ + ix86_expand_fma4_multiple_memory (operands, 4, <MODE>mode); + emit_insn (gen_fma4_fnmadd<mode>4256 (operands[0], operands[1], + operands[2], operands[3])); + DONE; +}) + +;; Floating point negative multiply and subtract +;; Rewrite (- (a * b) - c) into the canonical form: ((-a) * b) - c +;; Allow 2 memory operands to help with optimization +(define_insn "fma4_fnmsub<mode>4256" + [(set (match_operand:FMA4MODEF4 0 "register_operand" "=x,x") + (minus:FMA4MODEF4 + (mult:FMA4MODEF4 + (neg:FMA4MODEF4 + (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "x,x")) + (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "x,xm")) + (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "xm,x")))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, false)" + "vfnmsub<fma4modesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +;; Split fnmsub with two memory operands into a load and the fmsub. 
+(define_split + [(set (match_operand:FMA4MODEF4 0 "register_operand" "") + (minus:FMA4MODEF4 + (mult:FMA4MODEF4 + (neg:FMA4MODEF4 + (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "")) + (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "")) + (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "")))] + "TARGET_FMA4 + && !ix86_fma4_valid_op_p (operands, insn, 4, true, 1, false) + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, false) + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2]) + && !reg_mentioned_p (operands[0], operands[3])" + [(const_int 0)] +{ + ix86_expand_fma4_multiple_memory (operands, 4, <MODE>mode); + emit_insn (gen_fma4_fnmsub<mode>4256 (operands[0], operands[1], + operands[2], operands[3])); + DONE; +}) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(define_insn "fma4_fmadd<mode>4" + [(set (match_operand:SSEMODEF4 0 "register_operand" "=x,x,x") + (plus:SSEMODEF4 + (mult:SSEMODEF4 + (match_operand:SSEMODEF4 1 "nonimmediate_operand" "x,x,xm") + (match_operand:SSEMODEF4 2 "nonimmediate_operand" "x,xm,x")) + (match_operand:SSEMODEF4 3 "nonimmediate_operand" "xm,x,x")))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + "vfmadd<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +;; Split fmadd with two memory operands into a load and the fmadd. +(define_split + [(set (match_operand:SSEMODEF4 0 "register_operand" "") + (plus:SSEMODEF4 + (mult:SSEMODEF4 + (match_operand:SSEMODEF4 1 "nonimmediate_operand" "") + (match_operand:SSEMODEF4 2 "nonimmediate_operand" "")) + (match_operand:SSEMODEF4 3 "nonimmediate_operand" "")))] + "TARGET_FMA4 + && !ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true) + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true) + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2]) + && !reg_mentioned_p (operands[0], operands[3])" + [(const_int 0)] +{ + ix86_expand_fma4_multiple_memory (operands, 4, <MODE>mode); + emit_insn (gen_fma4_fmadd<mode>4 (operands[0], operands[1], + operands[2], operands[3])); + DONE; +}) + +;; For the scalar operations, use operand1 for the upper words that aren't +;; modified, so restrict the forms that are generated. 
+;; Scalar version of fmadd +(define_insn "fma4_vmfmadd<mode>4" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x") + (vec_merge:SSEMODEF2P + (plus:SSEMODEF2P + (mult:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm")) + (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x")) + (match_dup 0) + (const_int 1)))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)" + "vfmadd<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +;; Floating multiply and subtract +;; Allow two memory operands the same as fmadd +(define_insn "fma4_fmsub<mode>4" + [(set (match_operand:SSEMODEF4 0 "register_operand" "=x,x,x") + (minus:SSEMODEF4 + (mult:SSEMODEF4 + (match_operand:SSEMODEF4 1 "nonimmediate_operand" "x,x,xm") + (match_operand:SSEMODEF4 2 "nonimmediate_operand" "x,xm,x")) + (match_operand:SSEMODEF4 3 "nonimmediate_operand" "xm,x,x")))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + "vfmsub<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +;; Split fmsub with two memory operands into a load and the fmsub. +(define_split + [(set (match_operand:SSEMODEF4 0 "register_operand" "") + (minus:SSEMODEF4 + (mult:SSEMODEF4 + (match_operand:SSEMODEF4 1 "nonimmediate_operand" "") + (match_operand:SSEMODEF4 2 "nonimmediate_operand" "")) + (match_operand:SSEMODEF4 3 "nonimmediate_operand" "")))] + "TARGET_FMA4 + && !ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true) + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true) + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2]) + && !reg_mentioned_p (operands[0], operands[3])" + [(const_int 0)] +{ + ix86_expand_fma4_multiple_memory (operands, 4, <MODE>mode); + emit_insn (gen_fma4_fmsub<mode>4 (operands[0], operands[1], + operands[2], operands[3])); + DONE; +}) + +;; For the scalar operations, use operand1 for the upper words that aren't +;; modified, so restrict the forms that are generated. +;; Scalar version of fmsub +(define_insn "fma4_vmfmsub<mode>4" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x") + (vec_merge:SSEMODEF2P + (minus:SSEMODEF2P + (mult:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm")) + (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x")) + (match_dup 0) + (const_int 1)))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, false)" + "vfmsub<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +;; Floating point negative multiply and add +;; Rewrite (- (a * b) + c) into the canonical form: c - (a * b) +;; Note operands are out of order to simplify call to ix86_fma4_valid_p +;; Allow two memory operands to help in optimizing. 
+(define_insn "fma4_fnmadd<mode>4" + [(set (match_operand:SSEMODEF4 0 "register_operand" "=x,x,x") + (minus:SSEMODEF4 + (match_operand:SSEMODEF4 3 "nonimmediate_operand" "xm,x,x") + (mult:SSEMODEF4 + (match_operand:SSEMODEF4 1 "nonimmediate_operand" "x,x,xm") + (match_operand:SSEMODEF4 2 "nonimmediate_operand" "x,xm,x"))))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + "vfnmadd<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +;; Split fnmadd with two memory operands into a load and the fnmadd. +(define_split + [(set (match_operand:SSEMODEF4 0 "register_operand" "") + (minus:SSEMODEF4 + (match_operand:SSEMODEF4 3 "nonimmediate_operand" "") + (mult:SSEMODEF4 + (match_operand:SSEMODEF4 1 "nonimmediate_operand" "") + (match_operand:SSEMODEF4 2 "nonimmediate_operand" ""))))] + "TARGET_FMA4 + && !ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true) + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true) + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2]) + && !reg_mentioned_p (operands[0], operands[3])" + [(const_int 0)] +{ + ix86_expand_fma4_multiple_memory (operands, 4, <MODE>mode); + emit_insn (gen_fma4_fnmadd<mode>4 (operands[0], operands[1], + operands[2], operands[3])); + DONE; +}) + +;; For the scalar operations, use operand1 for the upper words that aren't +;; modified, so restrict the forms that are generated. +;; Scalar version of fnmadd +(define_insn "fma4_vmfnmadd<mode>4" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x") + (vec_merge:SSEMODEF2P + (minus:SSEMODEF2P + (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x") + (mult:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm"))) + (match_dup 0) + (const_int 1)))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)" + "vfnmadd<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +;; Floating point negative multiply and subtract +;; Rewrite (- (a * b) - c) into the canonical form: ((-a) * b) - c +;; Allow 2 memory operands to help with optimization +(define_insn "fma4_fnmsub<mode>4" + [(set (match_operand:SSEMODEF4 0 "register_operand" "=x,x") + (minus:SSEMODEF4 + (mult:SSEMODEF4 + (neg:SSEMODEF4 + (match_operand:SSEMODEF4 1 "nonimmediate_operand" "x,x")) + (match_operand:SSEMODEF4 2 "nonimmediate_operand" "x,xm")) + (match_operand:SSEMODEF4 3 "nonimmediate_operand" "xm,x")))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, false)" + "vfnmsub<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +;; Split fnmsub with two memory operands into a load and the fmsub. 
+(define_split + [(set (match_operand:SSEMODEF4 0 "register_operand" "") + (minus:SSEMODEF4 + (mult:SSEMODEF4 + (neg:SSEMODEF4 + (match_operand:SSEMODEF4 1 "nonimmediate_operand" "")) + (match_operand:SSEMODEF4 2 "nonimmediate_operand" "")) + (match_operand:SSEMODEF4 3 "nonimmediate_operand" "")))] + "TARGET_FMA4 + && !ix86_fma4_valid_op_p (operands, insn, 4, true, 1, false) + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, false) + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2]) + && !reg_mentioned_p (operands[0], operands[3])" + [(const_int 0)] +{ + ix86_expand_fma4_multiple_memory (operands, 4, <MODE>mode); + emit_insn (gen_fma4_fnmsub<mode>4 (operands[0], operands[1], + operands[2], operands[3])); + DONE; +}) + +;; For the scalar operations, use operand1 for the upper words that aren't +;; modified, so restrict the forms that are generated. +;; Scalar version of fnmsub +(define_insn "fma4_vmfnmsub<mode>4" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x") + (vec_merge:SSEMODEF2P + (minus:SSEMODEF2P + (mult:SSEMODEF2P + (neg:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x")) + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm")) + (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x")) + (match_dup 0) + (const_int 1)))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, false)" + "vfnmsub<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn "fma4i_fmadd<mode>4256" + [(set (match_operand:FMA4MODEF4 0 "register_operand" "=x,x") + (unspec:FMA4MODEF4 + [(plus:FMA4MODEF4 + (mult:FMA4MODEF4 + (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "x,x") + (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "x,xm")) + (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "xm,x"))] + UNSPEC_FMA4_INTRINSIC))] + "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)" + "vfmadd<fma4modesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +(define_insn "fma4i_fmsub<mode>4256" + [(set (match_operand:FMA4MODEF4 0 "register_operand" "=x,x") + (unspec:FMA4MODEF4 + [(minus:FMA4MODEF4 + (mult:FMA4MODEF4 + (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "x,x") + (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "x,xm")) + (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "xm,x"))] + UNSPEC_FMA4_INTRINSIC))] + "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)" + "vfmsub<fma4modesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +(define_insn "fma4i_fnmadd<mode>4256" + [(set (match_operand:FMA4MODEF4 0 "register_operand" "=x,x") + (unspec:FMA4MODEF4 + [(minus:FMA4MODEF4 + (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "xm,x") + (mult:FMA4MODEF4 + (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "x,x") + (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "x,xm")))] + UNSPEC_FMA4_INTRINSIC))] + "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)" + "vfnmadd<fma4modesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +(define_insn "fma4i_fnmsub<mode>4256" + [(set (match_operand:FMA4MODEF4 0 "register_operand" "=x,x") + (unspec:FMA4MODEF4 + [(minus:FMA4MODEF4 + (mult:FMA4MODEF4 + (neg:FMA4MODEF4 + 
(match_operand:FMA4MODEF4 1 "nonimmediate_operand" "x,x")) + (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "x,xm")) + (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "xm,x"))] + UNSPEC_FMA4_INTRINSIC))] + "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, false)" + "vfnmsub<fma4modesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn "fma4i_fmadd<mode>4" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x") + (unspec:SSEMODEF2P + [(plus:SSEMODEF2P + (mult:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm")) + (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))] + UNSPEC_FMA4_INTRINSIC))] + "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)" + "vfmadd<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +(define_insn "fma4i_fmsub<mode>4" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x") + (unspec:SSEMODEF2P + [(minus:SSEMODEF2P + (mult:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm")) + (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))] + UNSPEC_FMA4_INTRINSIC))] + "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)" + "vfmsub<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +(define_insn "fma4i_fnmadd<mode>4" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x") + (unspec:SSEMODEF2P + [(minus:SSEMODEF2P + (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x") + (mult:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm")))] + UNSPEC_FMA4_INTRINSIC))] + "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)" + "vfnmadd<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +(define_insn "fma4i_fnmsub<mode>4" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x") + (unspec:SSEMODEF2P + [(minus:SSEMODEF2P + (mult:SSEMODEF2P + (neg:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x")) + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm")) + (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))] + UNSPEC_FMA4_INTRINSIC))] + "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, false)" + "vfnmsub<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +;; For the scalar operations, use operand1 for the upper words that aren't +;; modified, so restrict the forms that are accepted. 
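+;; For example, _mm_macc_ss (a, b, c) computes a[0] * b[0] + c[0]; the
+;; vec_merge with mask (const_int 1) in the patterns below takes only
+;; element 0 from the computation and merges the remaining elements back
+;; from the destination (match_dup 0).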
+(define_insn "fma4i_vmfmadd<mode>4" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x") + (unspec:SSEMODEF2P + [(vec_merge:SSEMODEF2P + (plus:SSEMODEF2P + (mult:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "register_operand" "x,x") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm")) + (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x")) + (match_dup 0) + (const_int 1))] + UNSPEC_FMA4_INTRINSIC))] + "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, false)" + "vfmadd<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<ssescalarmode>")]) + +(define_insn "fma4i_vmfmsub<mode>4" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x") + (unspec:SSEMODEF2P + [(vec_merge:SSEMODEF2P + (minus:SSEMODEF2P + (mult:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "register_operand" "x,x") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm")) + (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x")) + (match_dup 0) + (const_int 1))] + UNSPEC_FMA4_INTRINSIC))] + "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, false)" + "vfmsub<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<ssescalarmode>")]) + +(define_insn "fma4i_vmfnmadd<mode>4" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x") + (unspec:SSEMODEF2P + [(vec_merge:SSEMODEF2P + (minus:SSEMODEF2P + (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x") + (mult:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm"))) + (match_dup 0) + (const_int 1))] + UNSPEC_FMA4_INTRINSIC))] + "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)" + "vfnmadd<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<ssescalarmode>")]) + +(define_insn "fma4i_vmfnmsub<mode>4" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x") + (unspec:SSEMODEF2P + [(vec_merge:SSEMODEF2P + (minus:SSEMODEF2P + (mult:SSEMODEF2P + (neg:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "register_operand" "x,x")) + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm")) + (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x")) + (match_dup 0) + (const_int 1))] + UNSPEC_FMA4_INTRINSIC))] + "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, false)" + "vfnmsub<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<ssescalarmode>")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; FMA4 Parallel floating point multiply addsub and subadd operations +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn "fma4_fmaddsubv8sf4" + [(set (match_operand:V8SF 0 "register_operand" "=x,x") + (vec_merge:V8SF + (plus:V8SF + (mult:V8SF + (match_operand:V8SF 1 "nonimmediate_operand" "x,x") + (match_operand:V8SF 2 "nonimmediate_operand" "x,xm")) + (match_operand:V8SF 3 "nonimmediate_operand" "xm,x")) + (minus:V8SF + (mult:V8SF + (match_dup 1) + (match_dup 2)) + (match_dup 3)) + (const_int 170)))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + "vfmaddsubps\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "V8SF")]) + +(define_insn "fma4_fmaddsubv4df4" + [(set (match_operand:V4DF 0 "register_operand" "=x,x") + (vec_merge:V4DF + (plus:V4DF + (mult:V4DF + (match_operand:V4DF 1 
"nonimmediate_operand" "x,x") + (match_operand:V4DF 2 "nonimmediate_operand" "x,xm")) + (match_operand:V4DF 3 "nonimmediate_operand" "xm,x")) + (minus:V4DF + (mult:V4DF + (match_dup 1) + (match_dup 2)) + (match_dup 3)) + (const_int 10)))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + "vfmaddsubpd\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "V4DF")]) + +(define_insn "fma4_fmaddsubv4sf4" + [(set (match_operand:V4SF 0 "register_operand" "=x,x") + (vec_merge:V4SF + (plus:V4SF + (mult:V4SF + (match_operand:V4SF 1 "nonimmediate_operand" "x,x") + (match_operand:V4SF 2 "nonimmediate_operand" "x,xm")) + (match_operand:V4SF 3 "nonimmediate_operand" "xm,x")) + (minus:V4SF + (mult:V4SF + (match_dup 1) + (match_dup 2)) + (match_dup 3)) + (const_int 10)))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + "vfmaddsubps\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "V4SF")]) + +(define_insn "fma4_fmaddsubv2df4" + [(set (match_operand:V2DF 0 "register_operand" "=x,x") + (vec_merge:V2DF + (plus:V2DF + (mult:V2DF + (match_operand:V2DF 1 "nonimmediate_operand" "x,x") + (match_operand:V2DF 2 "nonimmediate_operand" "x,xm")) + (match_operand:V2DF 3 "nonimmediate_operand" "xm,x")) + (minus:V2DF + (mult:V2DF + (match_dup 1) + (match_dup 2)) + (match_dup 3)) + (const_int 2)))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + "vfmaddsubpd\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "V2DF")]) + +(define_insn "fma4_fmsubaddv8sf4" + [(set (match_operand:V8SF 0 "register_operand" "=x,x") + (vec_merge:V8SF + (plus:V8SF + (mult:V8SF + (match_operand:V8SF 1 "nonimmediate_operand" "x,x") + (match_operand:V8SF 2 "nonimmediate_operand" "x,xm")) + (match_operand:V8SF 3 "nonimmediate_operand" "xm,x")) + (minus:V8SF + (mult:V8SF + (match_dup 1) + (match_dup 2)) + (match_dup 3)) + (const_int 85)))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + "vfmsubaddps\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "V8SF")]) + +(define_insn "fma4_fmsubaddv4df4" + [(set (match_operand:V4DF 0 "register_operand" "=x,x") + (vec_merge:V4DF + (plus:V4DF + (mult:V4DF + (match_operand:V4DF 1 "nonimmediate_operand" "x,x") + (match_operand:V4DF 2 "nonimmediate_operand" "x,xm")) + (match_operand:V4DF 3 "nonimmediate_operand" "xm,x")) + (minus:V4DF + (mult:V4DF + (match_dup 1) + (match_dup 2)) + (match_dup 3)) + (const_int 5)))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + "vfmsubaddpd\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "V4DF")]) + +(define_insn "fma4_fmsubaddv4sf4" + [(set (match_operand:V4SF 0 "register_operand" "=x,x") + (vec_merge:V4SF + (plus:V4SF + (mult:V4SF + (match_operand:V4SF 1 "nonimmediate_operand" "x,x") + (match_operand:V4SF 2 "nonimmediate_operand" "x,xm")) + (match_operand:V4SF 3 "nonimmediate_operand" "xm,x")) + (minus:V4SF + (mult:V4SF + (match_dup 1) + (match_dup 2)) + (match_dup 3)) + (const_int 5)))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + "vfmsubaddps\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "V4SF")]) + +(define_insn "fma4_fmsubaddv2df4" + [(set (match_operand:V2DF 0 "register_operand" "=x,x") + (vec_merge:V2DF + (plus:V2DF + (mult:V2DF + (match_operand:V2DF 1 "nonimmediate_operand" 
"x,x") + (match_operand:V2DF 2 "nonimmediate_operand" "x,xm")) + (match_operand:V2DF 3 "nonimmediate_operand" "xm,x")) + (minus:V2DF + (mult:V2DF + (match_dup 1) + (match_dup 2)) + (match_dup 3)) + (const_int 1)))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + "vfmsubaddpd\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "V2DF")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn "fma4i_fmaddsubv8sf4" + [(set (match_operand:V8SF 0 "register_operand" "=x,x") + (unspec:V8SF + [(vec_merge:V8SF + (plus:V8SF + (mult:V8SF + (match_operand:V8SF 1 "nonimmediate_operand" "x,x") + (match_operand:V8SF 2 "nonimmediate_operand" "x,xm")) + (match_operand:V8SF 3 "nonimmediate_operand" "xm,x")) + (minus:V8SF + (mult:V8SF + (match_dup 1) + (match_dup 2)) + (match_dup 3)) + (const_int 170))] + UNSPEC_FMA4_INTRINSIC))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + "vfmaddsubps\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "V8SF")]) + +(define_insn "fma4i_fmaddsubv4df4" + [(set (match_operand:V4DF 0 "register_operand" "=x,x") + (unspec:V4DF + [(vec_merge:V4DF + (plus:V4DF + (mult:V4DF + (match_operand:V4DF 1 "nonimmediate_operand" "x,x") + (match_operand:V4DF 2 "nonimmediate_operand" "x,xm")) + (match_operand:V4DF 3 "nonimmediate_operand" "xm,x")) + (minus:V4DF + (mult:V4DF + (match_dup 1) + (match_dup 2)) + (match_dup 3)) + (const_int 10))] + UNSPEC_FMA4_INTRINSIC))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + "vfmaddsubpd\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "V4DF")]) + +(define_insn "fma4i_fmaddsubv4sf4" + [(set (match_operand:V4SF 0 "register_operand" "=x,x") + (unspec:V4SF + [(vec_merge:V4SF + (plus:V4SF + (mult:V4SF + (match_operand:V4SF 1 "nonimmediate_operand" "x,x") + (match_operand:V4SF 2 "nonimmediate_operand" "x,xm")) + (match_operand:V4SF 3 "nonimmediate_operand" "xm,x")) + (minus:V4SF + (mult:V4SF + (match_dup 1) + (match_dup 2)) + (match_dup 3)) + (const_int 10))] + UNSPEC_FMA4_INTRINSIC))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + "vfmaddsubps\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "V4SF")]) + +(define_insn "fma4i_fmaddsubv2df4" + [(set (match_operand:V2DF 0 "register_operand" "=x,x") + (unspec:V2DF + [(vec_merge:V2DF + (plus:V2DF + (mult:V2DF + (match_operand:V2DF 1 "nonimmediate_operand" "x,x") + (match_operand:V2DF 2 "nonimmediate_operand" "x,xm")) + (match_operand:V2DF 3 "nonimmediate_operand" "xm,x")) + (minus:V2DF + (mult:V2DF + (match_dup 1) + (match_dup 2)) + (match_dup 3)) + (const_int 2))] + UNSPEC_FMA4_INTRINSIC))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + "vfmaddsubpd\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "V2DF")]) + +(define_insn "fma4i_fmsubaddv8sf4" + [(set (match_operand:V8SF 0 "register_operand" "=x,x") + (unspec:V8SF + [(vec_merge:V8SF + (plus:V8SF + (mult:V8SF + (match_operand:V8SF 1 "nonimmediate_operand" "x,x") + (match_operand:V8SF 2 "nonimmediate_operand" "x,xm")) + (match_operand:V8SF 3 "nonimmediate_operand" "xm,x")) + (minus:V8SF + (mult:V8SF + (match_dup 1) + (match_dup 2)) + (match_dup 3)) + (const_int 85))] + UNSPEC_FMA4_INTRINSIC))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + "vfmsubaddps\t{%3, %2, %1, 
%0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "V8SF")]) + +(define_insn "fma4i_fmsubaddv4df4" + [(set (match_operand:V4DF 0 "register_operand" "=x,x") + (unspec:V4DF + [(vec_merge:V4DF + (plus:V4DF + (mult:V4DF + (match_operand:V4DF 1 "nonimmediate_operand" "x,x") + (match_operand:V4DF 2 "nonimmediate_operand" "x,xm")) + (match_operand:V4DF 3 "nonimmediate_operand" "xm,x")) + (minus:V4DF + (mult:V4DF + (match_dup 1) + (match_dup 2)) + (match_dup 3)) + (const_int 5))] + UNSPEC_FMA4_INTRINSIC))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + "vfmsubaddpd\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "V4DF")]) + +(define_insn "fma4i_fmsubaddv4sf4" + [(set (match_operand:V4SF 0 "register_operand" "=x,x") + (unspec:V4SF + [(vec_merge:V4SF + (plus:V4SF + (mult:V4SF + (match_operand:V4SF 1 "nonimmediate_operand" "x,x") + (match_operand:V4SF 2 "nonimmediate_operand" "x,xm")) + (match_operand:V4SF 3 "nonimmediate_operand" "xm,x")) + (minus:V4SF + (mult:V4SF + (match_dup 1) + (match_dup 2)) + (match_dup 3)) + (const_int 5))] + UNSPEC_FMA4_INTRINSIC))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + "vfmsubaddps\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "V4SF")]) + +(define_insn "fma4i_fmsubaddv2df4" + [(set (match_operand:V2DF 0 "register_operand" "=x,x") + (unspec:V2DF + [(vec_merge:V2DF + (plus:V2DF + (mult:V2DF + (match_operand:V2DF 1 "nonimmediate_operand" "x,x") + (match_operand:V2DF 2 "nonimmediate_operand" "x,xm")) + (match_operand:V2DF 3 "nonimmediate_operand" "xm,x")) + (minus:V2DF + (mult:V2DF + (match_dup 1) + (match_dup 2)) + (match_dup 3)) + (const_int 1))] + UNSPEC_FMA4_INTRINSIC))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + "vfmsubaddpd\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "V2DF")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ;; Parallel single-precision floating point conversion operations ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/gcc/config/i386/x86intrin.h b/gcc/config/i386/x86intrin.h index 939ea4f..7bc47f8 100644 --- a/gcc/config/i386/x86intrin.h +++ b/gcc/config/i386/x86intrin.h @@ -46,7 +46,7 @@ #include <tmmintrin.h> #endif -#ifdef __SSE4a__ +#ifdef __SSE4A__ #include <ammintrin.h> #endif @@ -54,6 +54,10 @@ #include <smmintrin.h> #endif +#ifdef __FMA4__ +#include <fma4intrin.h> +#endif + #if defined (__AES__) || defined (__PCLMUL__) #include <wmmintrin.h> #endif diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index 993863f..6f09555 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -3168,6 +3168,11 @@ Enable/disable the generation of the sse4.2 instructions. @cindex @code{target("sse4a")} attribute Enable/disable the generation of the SSE4A instructions. +@item fma4 +@itemx no-fma4 +@cindex @code{target("fma4")} attribute +Enable/disable the generation of the FMA4 instructions. + @item ssse3 @itemx no-ssse3 @cindex @code{target("ssse3")} attribute @@ -8888,6 +8893,46 @@ v2di __builtin_ia32_insertq (v2di, v2di) v2di __builtin_ia32_insertqi (v2di, v2di, const unsigned int, const unsigned int) @end smallexample +The following built-in functions are available when @option{-mfma4} is used. +All of them generate the machine instruction that is part of the name +with MMX registers. 
+ +@smallexample +v2df __builtin_ia32_fmaddpd (v2df, v2df, v2df) +v4sf __builtin_ia32_fmaddps (v4sf, v4sf, v4sf) +v2df __builtin_ia32_fmaddsd (v2df, v2df, v2df) +v4sf __builtin_ia32_fmaddss (v4sf, v4sf, v4sf) +v2df __builtin_ia32_fmsubpd (v2df, v2df, v2df) +v4sf __builtin_ia32_fmsubps (v4sf, v4sf, v4sf) +v2df __builtin_ia32_fmsubsd (v2df, v2df, v2df) +v4sf __builtin_ia32_fmsubss (v4sf, v4sf, v4sf) +v2df __builtin_ia32_fnmaddpd (v2df, v2df, v2df) +v4sf __builtin_ia32_fnmaddps (v4sf, v4sf, v4sf) +v2df __builtin_ia32_fnmaddsd (v2df, v2df, v2df) +v4sf __builtin_ia32_fnmaddss (v4sf, v4sf, v4sf) +v2df __builtin_ia32_fnmsubpd (v2df, v2df, v2df) +v4sf __builtin_ia32_fnmsubps (v4sf, v4sf, v4sf) +v2df __builtin_ia32_fnmsubsd (v2df, v2df, v2df) +v4sf __builtin_ia32_fnmsubss (v4sf, v4sf, v4sf) +v2df __builtin_ia32_fmaddsubpd (v2df, v2df, v2df) +v4sf __builtin_ia32_fmaddsubps (v4sf, v4sf, v4sf) +v2df __builtin_ia32_fmsubaddpd (v2df, v2df, v2df) +v4sf __builtin_ia32_fmsubaddps (v4sf, v4sf, v4sf) +v4df __builtin_ia32_fmaddpd256 (v4df, v4df, v4df) +v8sf __builtin_ia32_fmaddps256 (v8sf, v8sf, v8sf) +v4df __builtin_ia32_fmsubpd256 (v4df, v4df, v4df) +v8sf __builtin_ia32_fmsubps256 (v8sf, v8sf, v8sf) +v4df __builtin_ia32_fnmaddpd256 (v4df, v4df, v4df) +v8sf __builtin_ia32_fnmaddps256 (v8sf, v8sf, v8sf) +v4df __builtin_ia32_fnmsubpd256 (v4df, v4df, v4df) +v8sf __builtin_ia32_fnmsubps256 (v8sf, v8sf, v8sf) +v4df __builtin_ia32_fmaddsubpd256 (v4df, v4df, v4df) +v8sf __builtin_ia32_fmaddsubps256 (v8sf, v8sf, v8sf) +v4df __builtin_ia32_fmsubaddpd256 (v4df, v4df, v4df) +v8sf __builtin_ia32_fmsubaddps256 (v8sf, v8sf, v8sf) + +@end smallexample + The following built-in functions are available when @option{-m3dnow} is used. All of them generate the machine instruction that is part of the name. diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 4ae8a02..e12241c 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -592,7 +592,7 @@ Objective-C and Objective-C++ Dialects}. -mcld -mcx16 -msahf -mmovbe -mcrc32 -mrecip @gol -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -msse4 -mavx @gol -maes -mpclmul @gol --msse4a -m3dnow -mpopcnt -mabm @gol +-msse4a -m3dnow -mpopcnt -mabm -mfma4 @gol -mthreads -mno-align-stringops -minline-all-stringops @gol -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol -mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol @@ -11727,6 +11727,8 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}. @itemx -mno-pclmul @itemx -msse4a @itemx -mno-sse4a +@itemx -mfma4 +@itemx -mno-fma4 @itemx -m3dnow @itemx -mno-3dnow @itemx -mpopcnt @@ -11740,7 +11742,7 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}. @opindex m3dnow @opindex mno-3dnow These switches enable or disable the use of instructions in the MMX, -SSE, SSE2, SSE3, SSSE3, SSE4.1, AVX, AES, PCLMUL, SSE4A, ABM or +SSE, SSE2, SSE3, SSSE3, SSE4.1, AVX, AES, PCLMUL, SSE4A, FMA4, ABM or 3DNow!@: extended instruction sets. 
These extensions are also available as built-in functions: see @ref{X86 Built-in Functions}, for details of the functions enabled and diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index eec705f..8128a90 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,56 @@ +2009-09-29 Harsha Jagasia <harsha.jagasia@amd.com> + + * gcc.target/i386/fma4-check.h + * gcc.target/i386/fma4-fma.c + * gcc.target/i386/fma4-maccXX.c + * gcc.target/i386/fma4-msubXX.c + * gcc.target/i386/fma4-nmaccXX.c + * gcc.target/i386/fma4-nmsubXX.c + * gcc.target/i386/fma4-vector.c + * gcc.target/i386/fma4-256-maccXX.c + * gcc.target/i386/fma4-256-msubXX.c + * gcc.target/i386/fma4-256-nmaccXX.c + * gcc.target/i386/fma4-256-nmsubXX.c + * gcc.target/i386/fma4-256-vector.c + * gcc.target/i386/funcspec-2.c: New file. + + * gcc.target/i386/funcspec-4.c: Test error conditions + related to FMA4. + + * gcc.target/i386/funcspec-5.c + * gcc.target/i386/funcspec-6.c + * gcc.target/i386/funcspec-8.c: Add FMA4. + + * gcc.target/i386/funcspec-9.c: New file. + + * gcc.target/i386/i386.exp: Add check_effective_target_fma4. + + * gcc.target/i386/isa-10.c + * gcc.target/i386/isa-11.c + * gcc.target/i386/isa-12.c + * gcc.target/i386/isa-13.c + * gcc.target/i386/isa-2.c + * gcc.target/i386/isa-3.c + * gcc.target/i386/isa-4.c + * gcc.target/i386/isa-7.c + * gcc.target/i386/isa-8.c + * gcc.target/i386/isa-9.c: New file. + + * gcc.target/i386/isa-14.c + * gcc.target/i386/isa-1.c + * gcc.target/i386/isa-5.c + * gcc.target/i386/isa-6.c: Add FMA4. + + * gcc.target/i386/sse-12.c + * gcc.target/i386/sse-13.c + * gcc.target/i386/sse-14.c + * gcc.target/i386/sse-22.c: New file. + + * g++.dg/other/i386-2.C + * g++.dg/other/i386-3.C + * g++.dg/other/i386-5.C + * g++.dg/other/i386-6.C: Add -mfma4 in dg-options. + 2009-09-29 H.J. Lu <hongjiu.lu@intel.com> PR testsuite/41496 diff --git a/gcc/testsuite/g++.dg/other/i386-2.C b/gcc/testsuite/g++.dg/other/i386-2.C index 3b05101..4c9579d 100644 --- a/gcc/testsuite/g++.dg/other/i386-2.C +++ b/gcc/testsuite/g++.dg/other/i386-2.C @@ -1,7 +1,7 @@ -/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h and mm_malloc.h are - usable with -O -pedantic-errors. */ +/* Test that {,x,e,p,t,s,w,a,i}mmintrin.h, fma4intrin.h, mm3dnow.h and + mm_malloc.h are usable with -O -pedantic-errors. */ /* { dg-do compile { target i?86-*-* x86_64-*-* } } */ -/* { dg-options "-O -pedantic-errors -march=k8 -m3dnow -mavx -msse4a -maes -mpclmul" } */ +/* { dg-options "-O -pedantic-errors -march=k8 -m3dnow -mavx -msse4a -mfma4 -maes -mpclmul" } */ #include <x86intrin.h> diff --git a/gcc/testsuite/g++.dg/other/i386-3.C b/gcc/testsuite/g++.dg/other/i386-3.C index 377891d..b9e8916 100644 --- a/gcc/testsuite/g++.dg/other/i386-3.C +++ b/gcc/testsuite/g++.dg/other/i386-3.C @@ -1,6 +1,6 @@ -/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h and mm_malloc.h are - usable with -O -fkeep-inline-functions. */ +/* Test that {,x,e,p,t,s,w,a,i}mmintrin.h, fma4intrin.h, mm3dnow.h and + mm_malloc.h are usable with -O -fkeep-inline-functions. 
*/ /* { dg-do compile { target i?86-*-* x86_64-*-* } } */ -/* { dg-options "-O -fkeep-inline-functions -march=k8 -m3dnow -mavx -msse4a -maes -mpclmul" } */ +/* { dg-options "-O -fkeep-inline-functions -march=k8 -m3dnow -mavx -msse4a -mfma4 -maes -mpclmul" } */ #include <x86intrin.h> diff --git a/gcc/testsuite/g++.dg/other/i386-5.C b/gcc/testsuite/g++.dg/other/i386-5.C index 377891d..6dcb2d3 100644 --- a/gcc/testsuite/g++.dg/other/i386-5.C +++ b/gcc/testsuite/g++.dg/other/i386-5.C @@ -1,6 +1,6 @@ -/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h and mm_malloc.h are - usable with -O -fkeep-inline-functions. */ +/* Test that {,x,e,p,t,s,w,a,i}mmintrin.h, fma4intrin.h, mm3dnow.h and + mm_malloc.h are usable with -O -fkeep-inline-functions. */ /* { dg-do compile { target i?86-*-* x86_64-*-* } } */ -/* { dg-options "-O -fkeep-inline-functions -march=k8 -m3dnow -mavx -msse4a -maes -mpclmul" } */ +/* { dg-options "-O -fkeep-inline-functions -march=k8 -m3dnow -mavx -msse4a -mfma4 -maes -mpclmul" } */ #include <x86intrin.h> diff --git a/gcc/testsuite/g++.dg/other/i386-6.C b/gcc/testsuite/g++.dg/other/i386-6.C index 3b05101..4c9579d 100644 --- a/gcc/testsuite/g++.dg/other/i386-6.C +++ b/gcc/testsuite/g++.dg/other/i386-6.C @@ -1,7 +1,7 @@ -/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h and mm_malloc.h are - usable with -O -pedantic-errors. */ +/* Test that {,x,e,p,t,s,w,a,i}mmintrin.h, fma4intrin.h, mm3dnow.h and + mm_malloc.h are usable with -O -pedantic-errors. */ /* { dg-do compile { target i?86-*-* x86_64-*-* } } */ -/* { dg-options "-O -pedantic-errors -march=k8 -m3dnow -mavx -msse4a -maes -mpclmul" } */ +/* { dg-options "-O -pedantic-errors -march=k8 -m3dnow -mavx -msse4a -mfma4 -maes -mpclmul" } */ #include <x86intrin.h> diff --git a/gcc/testsuite/gcc.target/i386/fma4-256-maccXX.c b/gcc/testsuite/gcc.target/i386/fma4-256-maccXX.c new file mode 100644 index 0000000..134200a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/fma4-256-maccXX.c @@ -0,0 +1,96 @@ +/* { dg-do run } */ +/* { dg-require-effective-target fma4 } */ +/* { dg-options "-O2 -mfma4" } */ + +#include "fma4-check.h" + +#include <x86intrin.h> +#include <string.h> + +#define NUM 20 + +union +{ + __m256 x[NUM]; + float f[NUM * 8]; + __m256d y[NUM]; + double d[NUM * 4]; +} dst, res, src1, src2, src3; + + +/* Note that in macc*,msub*,mnmacc* and mnsub* instructions, the intermdediate + product is not rounded, only the addition is rounded. 
*/ + +static void +init_maccps () +{ + int i; + for (i = 0; i < NUM * 8; i++) + { + src1.f[i] = i; + src2.f[i] = i + 10; + src3.f[i] = i + 20; + } +} + +static void +init_maccpd () +{ + int i; + for (i = 0; i < NUM * 4; i++) + { + src1.d[i] = i; + src2.d[i] = i + 10; + src3.d[i] = i + 20; + } +} + +static int +check_maccps () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 8; i = i + 8) + for (j = 0; j < 8; j++) + { + res.f[i + j] = (src1.f[i + j] * src2.f[i + j]) + src3.f[i + j]; + if (dst.f[i + j] != res.f[i + j]) + check_fails++; + } + return check_fails++; +} + +static int +check_maccpd () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 4; i = i + 4) + for (j = 0; j < 4; j++) + { + res.d[i + j] = (src1.d[i + j] * src2.d[i + j]) + src3.d[i + j]; + if (dst.d[i + j] != res.d[i + j]) + check_fails++; + } + return check_fails++; +} + +static void +fma4_test (void) +{ + int i; + + init_maccps (); + + for (i = 0; i < NUM; i++) + dst.x[i] = _mm256_macc_ps (src1.x[i], src2.x[i], src3.x[i]); + + if (check_maccps ()) + abort (); + + init_maccpd (); + + for (i = 0; i < NUM; i++) + dst.y[i] = _mm256_macc_pd (src1.y[i], src2.y[i], src3.y[i]); + + if (check_maccpd ()) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/fma4-256-msubXX.c b/gcc/testsuite/gcc.target/i386/fma4-256-msubXX.c new file mode 100644 index 0000000..d6cafb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/fma4-256-msubXX.c @@ -0,0 +1,96 @@ +/* { dg-do run } */ +/* { dg-require-effective-target fma4 } */ +/* { dg-options "-O2 -mfma4" } */ + +#include "fma4-check.h" + +#include <x86intrin.h> +#include <string.h> + +#define NUM 20 + +union +{ + __m256 x[NUM]; + float f[NUM * 8]; + __m256d y[NUM]; + double d[NUM * 4]; +} dst, res, src1, src2, src3; + +/* Note that in macc*,msub*,mnmacc* and mnsub* instructions, the intermdediate + product is not rounded, only the addition is rounded. 
*/ + +static void +init_msubps () +{ + int i; + for (i = 0; i < NUM * 8; i++) + { + src1.f[i] = i; + src2.f[i] = i + 10; + src3.f[i] = i + 20; + } +} + +static void +init_msubpd () +{ + int i; + for (i = 0; i < NUM * 4; i++) + { + src1.d[i] = i; + src2.d[i] = i + 10; + src3.d[i] = i + 20; + } +} + +static int +check_msubps () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 8; i = i + 8) + for (j = 0; j < 8; j++) + { + res.f[i + j] = (src1.f[i + j] * src2.f[i + j]) - src3.f[i + j]; + if (dst.f[i + j] != res.f[i + j]) + check_fails++; + } + return check_fails++; +} + +static int +check_msubpd () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 4; i = i + 4) + for (j = 0; j < 4; j++) + { + res.d[i + j] = (src1.d[i + j] * src2.d[i + j]) - src3.d[i + j]; + if (dst.d[i + j] != res.d[i + j]) + check_fails++; + } + return check_fails++; +} + +static void +fma4_test (void) +{ + int i; + + init_msubps (); + + for (i = 0; i < NUM; i++) + dst.x[i] = _mm256_msub_ps (src1.x[i], src2.x[i], src3.x[i]); + + if (check_msubps ()) + abort (); + + init_msubpd (); + + for (i = 0; i < NUM; i++) + dst.y[i] = _mm256_msub_pd (src1.y[i], src2.y[i], src3.y[i]); + + if (check_msubpd ()) + abort (); + +} diff --git a/gcc/testsuite/gcc.target/i386/fma4-256-nmaccXX.c b/gcc/testsuite/gcc.target/i386/fma4-256-nmaccXX.c new file mode 100644 index 0000000..261f302 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/fma4-256-nmaccXX.c @@ -0,0 +1,96 @@ +/* { dg-do run } */ +/* { dg-require-effective-target fma4 } */ +/* { dg-options "-O2 -mfma4" } */ + +#include "fma4-check.h" + +#include <x86intrin.h> +#include <string.h> + +#define NUM 20 + +union +{ + __m256 x[NUM]; + float f[NUM * 8]; + __m256d y[NUM]; + double d[NUM * 4]; +} dst, res, src1, src2, src3; + +/* Note that in macc*,msub*,mnmacc* and mnsub* instructions, the intermdediate + product is not rounded, only the addition is rounded. 
*/ + +static void +init_nmaccps () +{ + int i; + for (i = 0; i < NUM * 8; i++) + { + src1.f[i] = i; + src2.f[i] = i + 10; + src3.f[i] = i + 20; + } +} + +static void +init_nmaccpd () +{ + int i; + for (i = 0; i < NUM * 4; i++) + { + src1.d[i] = i; + src2.d[i] = i + 10; + src3.d[i] = i + 20; + } +} + +static int +check_nmaccps () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 8; i = i + 8) + for (j = 0; j < 8; j++) + { + res.f[i + j] = - (src1.f[i + j] * src2.f[i + j]) + src3.f[i + j]; + if (dst.f[i + j] != res.f[i + j]) + check_fails++; + } + return check_fails++; +} + +static int +check_nmaccpd () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 4; i = i + 4) + for (j = 0; j < 4; j++) + { + res.d[i + j] = - (src1.d[i + j] * src2.d[i + j]) + src3.d[i + j]; + if (dst.d[i + j] != res.d[i + j]) + check_fails++; + } + return check_fails++; +} + +static void +fma4_test (void) +{ + int i; + + init_nmaccps (); + + for (i = 0; i < NUM; i++) + dst.x[i] = _mm256_nmacc_ps (src1.x[i], src2.x[i], src3.x[i]); + + if (check_nmaccps ()) + abort (); + + init_nmaccpd (); + + for (i = 0; i < NUM; i++) + dst.y[i] = _mm256_nmacc_pd (src1.y[i], src2.y[i], src3.y[i]); + + if (check_nmaccpd ()) + abort (); + +} diff --git a/gcc/testsuite/gcc.target/i386/fma4-256-nmsubXX.c b/gcc/testsuite/gcc.target/i386/fma4-256-nmsubXX.c new file mode 100644 index 0000000..3205715 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/fma4-256-nmsubXX.c @@ -0,0 +1,95 @@ +/* { dg-require-effective-target fma4 } */ +/* { dg-options "-O2 -mfma4" } */ + +#include "fma4-check.h" + +#include <x86intrin.h> +#include <string.h> + +#define NUM 20 + +union +{ + __m256 x[NUM]; + float f[NUM * 8]; + __m256d y[NUM]; + double d[NUM * 4]; +} dst, res, src1, src2, src3; + +/* Note that in macc*,msub*,mnmacc* and mnsub* instructions, the intermdediate + product is not rounded, only the addition is rounded. 
*/ + +static void +init_nmsubps () +{ + int i; + for (i = 0; i < NUM * 8; i++) + { + src1.f[i] = i; + src2.f[i] = i + 10; + src3.f[i] = i + 20; + } +} + +static void +init_nmsubpd () +{ + int i; + for (i = 0; i < NUM * 4; i++) + { + src1.d[i] = i; + src2.d[i] = i + 10; + src3.d[i] = i + 20; + } +} + +static int +check_nmsubps () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 8; i = i + 8) + for (j = 0; j < 8; j++) + { + res.f[i + j] = - (src1.f[i + j] * src2.f[i + j]) - src3.f[i + j]; + if (dst.f[i + j] != res.f[i + j]) + check_fails++; + } + return check_fails++; +} + +static int +check_nmsubpd () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 4; i = i + 4) + for (j = 0; j < 4; j++) + { + res.d[i + j] = - (src1.d[i + j] * src2.d[i + j]) - src3.d[i + j]; + if (dst.d[i + j] != res.d[i + j]) + check_fails++; + } + return check_fails++; +} + +static void +fma4_test (void) +{ + int i; + + init_nmsubps (); + + for (i = 0; i < NUM; i++) + dst.x[i] = _mm256_nmsub_ps (src1.x[i], src2.x[i], src3.x[i]); + + if (check_nmsubps (&dst.x[i], &src1.f[i * 4], &src2.f[i * 4], &src3.f[i * 4])) + abort (); + + init_nmsubpd (); + + for (i = 0; i < NUM; i++) + dst.y[i] = _mm256_nmsub_pd (src1.y[i], src2.y[i], src3.y[i]); + + if (check_nmsubpd (&dst.y[i], &src1.d[i * 2], &src2.d[i * 2], &src3.d[i * 2])) + abort (); + +} diff --git a/gcc/testsuite/gcc.target/i386/fma4-256-vector.c b/gcc/testsuite/gcc.target/i386/fma4-256-vector.c new file mode 100644 index 0000000..714b743 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/fma4-256-vector.c @@ -0,0 +1,93 @@ +/* Test that the compiler properly optimizes floating point multiply and add + instructions vector into vfmaddps on FMA4 systems. */ + +/* { dg-do compile } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-options "-O2 -mfma4 -ftree-vectorize" } */ + +extern void exit (int); + +typedef float __m256 __attribute__ ((__vector_size__ (32), __may_alias__)); +typedef double __m256d __attribute__ ((__vector_size__ (32), __may_alias__)); + +#define SIZE 10240 + +union { + __m256 f_align; + __m256d d_align; + float f[SIZE]; + double d[SIZE]; +} a, b, c, d; + +void +flt_mul_add (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.f[i] = (b.f[i] * c.f[i]) + d.f[i]; +} + +void +dbl_mul_add (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.d[i] = (b.d[i] * c.d[i]) + d.d[i]; +} + +void +flt_mul_sub (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.f[i] = (b.f[i] * c.f[i]) - d.f[i]; +} + +void +dbl_mul_sub (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.d[i] = (b.d[i] * c.d[i]) - d.d[i]; +} + +void +flt_neg_mul_add (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.f[i] = (-(b.f[i] * c.f[i])) + d.f[i]; +} + +void +dbl_neg_mul_add (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.d[i] = (-(b.d[i] * c.d[i])) + d.d[i]; +} + +int main () +{ + flt_mul_add (); + flt_mul_sub (); + flt_neg_mul_add (); + + dbl_mul_add (); + dbl_mul_sub (); + dbl_neg_mul_add (); + exit (0); +} + +/* { dg-final { scan-assembler "vfmaddps" } } */ +/* { dg-final { scan-assembler "vfmaddpd" } } */ +/* { dg-final { scan-assembler "vfmsubps" } } */ +/* { dg-final { scan-assembler "vfmsubpd" } } */ +/* { dg-final { scan-assembler "vfnmaddps" } } */ +/* { dg-final { scan-assembler "vfnmaddpd" } } */ diff --git a/gcc/testsuite/gcc.target/i386/fma4-check.h b/gcc/testsuite/gcc.target/i386/fma4-check.h new file mode 100644 index 0000000..76fcdef --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/fma4-check.h @@ -0,0 +1,20 @@ +#include <stdlib.h> + +#include "cpuid.h" 
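+
+/* Each fma4-*.c runtime test includes this header and defines fma4_test;
+   main runs it only when CPUID leaf 0x80000001 reports FMA4 support, so
+   the tests exit quietly on hardware without FMA4.  */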
+ +static void fma4_test (void); + +int +main () +{ + unsigned int eax, ebx, ecx, edx; + + if (!__get_cpuid (0x80000001, &eax, &ebx, &ecx, &edx)) + return 0; + + /* Run FMA4 test only if host has FMA4 support. */ + if (ecx & bit_FMA4) + fma4_test (); + + exit (0); +} diff --git a/gcc/testsuite/gcc.target/i386/fma4-fma.c b/gcc/testsuite/gcc.target/i386/fma4-fma.c new file mode 100644 index 0000000..cb90691 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/fma4-fma.c @@ -0,0 +1,83 @@ +/* Test that the compiler properly optimizes floating point multiply + and add instructions into vfmaddss, vfmsubss, vfnmaddss, + vfnmsubss on FMA4 systems. */ + +/* { dg-do compile } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-options "-O2 -mfma4" } */ + +extern void exit (int); + +float +flt_mul_add (float a, float b, float c) +{ + return (a * b) + c; +} + +double +dbl_mul_add (double a, double b, double c) +{ + return (a * b) + c; +} + +float +flt_mul_sub (float a, float b, float c) +{ + return (a * b) - c; +} + +double +dbl_mul_sub (double a, double b, double c) +{ + return (a * b) - c; +} + +float +flt_neg_mul_add (float a, float b, float c) +{ + return (-(a * b)) + c; +} + +double +dbl_neg_mul_add (double a, double b, double c) +{ + return (-(a * b)) + c; +} + +float +flt_neg_mul_sub (float a, float b, float c) +{ + return (-(a * b)) - c; +} + +double +dbl_neg_mul_sub (double a, double b, double c) +{ + return (-(a * b)) - c; +} + +float f[10] = { 2, 3, 4 }; +double d[10] = { 2, 3, 4 }; + +int main () +{ + f[3] = flt_mul_add (f[0], f[1], f[2]); + f[4] = flt_mul_sub (f[0], f[1], f[2]); + f[5] = flt_neg_mul_add (f[0], f[1], f[2]); + f[6] = flt_neg_mul_sub (f[0], f[1], f[2]); + + d[3] = dbl_mul_add (d[0], d[1], d[2]); + d[4] = dbl_mul_sub (d[0], d[1], d[2]); + d[5] = dbl_neg_mul_add (d[0], d[1], d[2]); + d[6] = dbl_neg_mul_sub (d[0], d[1], d[2]); + exit (0); +} + +/* { dg-final { scan-assembler "vfmaddss" } } */ +/* { dg-final { scan-assembler "vfmaddsd" } } */ +/* { dg-final { scan-assembler "vfmsubss" } } */ +/* { dg-final { scan-assembler "vfmsubsd" } } */ +/* { dg-final { scan-assembler "vfnmaddss" } } */ +/* { dg-final { scan-assembler "vfnmaddsd" } } */ +/* { dg-final { scan-assembler "vfnmsubss" } } */ +/* { dg-final { scan-assembler "vfnmsubsd" } } */ diff --git a/gcc/testsuite/gcc.target/i386/fma4-maccXX.c b/gcc/testsuite/gcc.target/i386/fma4-maccXX.c new file mode 100644 index 0000000..4b4c005 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/fma4-maccXX.c @@ -0,0 +1,136 @@ +/* { dg-do run } */ +/* { dg-require-effective-target fma4 } */ +/* { dg-options "-O0 -mfma4" } */ + +#include "fma4-check.h" + +#include <x86intrin.h> +#include <string.h> + +#define NUM 20 + +union +{ + __m128 x[NUM]; + float f[NUM * 4]; + __m128d y[NUM]; + double d[NUM * 2]; +} dst, res, src1, src2, src3; + + +/* Note that in macc*,msub*,mnmacc* and mnsub* instructions, the intermdediate + product is not rounded, only the addition is rounded. 
*/ + +static void +init_maccps () +{ + int i; + for (i = 0; i < NUM * 4; i++) + { + src1.f[i] = i; + src2.f[i] = i + 10; + src3.f[i] = i + 20; + } +} + +static void +init_maccpd () +{ + int i; + for (i = 0; i < NUM * 4; i++) + { + src1.d[i] = i; + src2.d[i] = i + 10; + src3.d[i] = i + 20; + } +} + +static int +check_maccps () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 4; i = i + 4) + for (j = 0; j < 4; j++) + { + res.f[i + j] = (src1.f[i + j] * src2.f[i + j]) + src3.f[i + j]; + if (dst.f[i + j] != res.f[i + j]) + check_fails++; + } + return check_fails++; +} + +static int +check_maccpd () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 2; i = i + 2) + for (j = 0; j < 2; j++) + { + res.d[i + j] = (src1.d[i + j] * src2.d[i + j]) + src3.d[i + j]; + if (dst.d[i + j] != res.d[i + j]) + check_fails++; + } + return check_fails++; +} + + +static int +check_maccss () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 4; i= i + 4) + { + res.f[i] = (src1.f[i] * src2.f[i]) + src3.f[i]; + if (dst.f[i] != res.f[i]) + check_fails++; + } + return check_fails++; +} + +static int +check_maccsd () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 2; i = i + 2) + { + res.d[i] = (src1.d[i] * src2.d[i]) + src3.d[i]; + if (dst.d[i] != res.d[i]) + check_fails++; + } + return check_fails++; +} + +static void +fma4_test (void) +{ + int i; + + init_maccps (); + + for (i = 0; i < NUM; i++) + dst.x[i] = _mm_macc_ps (src1.x[i], src2.x[i], src3.x[i]); + + if (check_maccps ()) + abort (); + + for (i = 0; i < NUM; i++) + dst.x[i] = _mm_macc_ss (src1.x[i], src2.x[i], src3.x[i]); + + if (check_maccss ()) + abort (); + + init_maccpd (); + + for (i = 0; i < NUM; i++) + dst.y[i] = _mm_macc_pd (src1.y[i], src2.y[i], src3.y[i]); + + if (check_maccpd ()) + abort (); + + for (i = 0; i < NUM; i++) + dst.y[i] = _mm_macc_sd (src1.y[i], src2.y[i], src3.y[i]); + + if (check_maccsd ()) + abort (); + +} diff --git a/gcc/testsuite/gcc.target/i386/fma4-msubXX.c b/gcc/testsuite/gcc.target/i386/fma4-msubXX.c new file mode 100644 index 0000000..eed7558 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/fma4-msubXX.c @@ -0,0 +1,134 @@ +/* { dg-do run } */ +/* { dg-require-effective-target fma4 } */ +/* { dg-options "-O0 -mfma4" } */ + +#include "fma4-check.h" + +#include <x86intrin.h> +#include <string.h> + +#define NUM 20 + +union +{ + __m128 x[NUM]; + float f[NUM * 4]; + __m128d y[NUM]; + double d[NUM * 2]; +} dst, res, src1, src2, src3; + +/* Note that in macc*,msub*,mnmacc* and mnsub* instructions, the intermdediate + product is not rounded, only the addition is rounded. 
*/ + +static void +init_msubps () +{ + int i; + for (i = 0; i < NUM * 4; i++) + { + src1.f[i] = i; + src2.f[i] = i + 10; + src3.f[i] = i + 20; + } +} + +static void +init_msubpd () +{ + int i; + for (i = 0; i < NUM * 4; i++) + { + src1.d[i] = i; + src2.d[i] = i + 10; + src3.d[i] = i + 20; + } +} + +static int +check_msubps () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 4; i = i + 4) + for (j = 0; j < 4; j++) + { + res.f[i + j] = (src1.f[i + j] * src2.f[i + j]) - src3.f[i + j]; + if (dst.f[i + j] != res.f[i + j]) + check_fails++; + } + return check_fails++; +} + +static int +check_msubpd () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 2; i = i + 2) + for (j = 0; j < 2; j++) + { + res.d[i + j] = (src1.d[i + j] * src2.d[i + j]) - src3.d[i + j]; + if (dst.d[i + j] != res.d[i + j]) + check_fails++; + } + return check_fails++; +} + + +static int +check_msubss () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 4; i = i + 4) + { + res.f[i] = (src1.f[i] * src2.f[i]) - src3.f[i]; + if (dst.f[i] != res.f[i]) + check_fails++; + } + return check_fails++; +} + +static int +check_msubsd () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 2; i = i + 2) + { + res.d[i] = (src1.d[i] * src2.d[i]) - src3.d[i]; + if (dst.d[i] != res.d[i]) + check_fails++; + } + return check_fails++; +} + +static void +fma4_test (void) +{ + int i; + + init_msubps (); + + for (i = 0; i < NUM; i++) + dst.x[i] = _mm_msub_ps (src1.x[i], src2.x[i], src3.x[i]); + + if (check_msubps ()) + abort (); + + for (i = 0; i < NUM; i++) + dst.x[i] = _mm_msub_ss (src1.x[i], src2.x[i], src3.x[i]); + + if (check_msubss ()) + abort (); + + init_msubpd (); + + for (i = 0; i < NUM; i++) + dst.y[i] = _mm_msub_pd (src1.y[i], src2.y[i], src3.y[i]); + + if (check_msubpd ()) + abort (); + + for (i = 0; i < NUM; i++) + dst.y[i] = _mm_msub_sd (src1.y[i], src2.y[i], src3.y[i]); + + if (check_msubsd ()) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/fma4-nmaccXX.c b/gcc/testsuite/gcc.target/i386/fma4-nmaccXX.c new file mode 100644 index 0000000..9abf746 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/fma4-nmaccXX.c @@ -0,0 +1,137 @@ +/* { dg-do run } */ +/* { dg-require-effective-target fma4 } */ +/* { dg-options "-O0 -mfma4" } */ + +#include "fma4-check.h" + +#include <x86intrin.h> +#include <string.h> + +#define NUM 20 + +union +{ + __m128 x[NUM]; + float f[NUM * 4]; + __m128d y[NUM]; + double d[NUM * 2]; +} dst, res, src1, src2, src3; + +/* Note that in macc*,msub*,mnmacc* and mnsub* instructions, the intermdediate + product is not rounded, only the addition is rounded. 
*/ + +static void +init_nmaccps () +{ + int i; + for (i = 0; i < NUM * 4; i++) + { + src1.f[i] = i; + src2.f[i] = i + 10; + src3.f[i] = i + 20; + } +} + +static void +init_nmaccpd () +{ + int i; + for (i = 0; i < NUM * 4; i++) + { + src1.d[i] = i; + src2.d[i] = i + 10; + src3.d[i] = i + 20; + } +} + +static int +check_nmaccps () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 4; i = i + 4) + for (j = 0; j < 4; j++) + { + res.f[i + j] = - (src1.f[i + j] * src2.f[i + j]) + src3.f[i + j]; + if (dst.f[i + j] != res.f[i + j]) + check_fails++; + } + return check_fails++; +} + +static int +check_nmaccpd () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 2; i = i + 2) + for (j = 0; j < 2; j++) + { + res.d[i + j] = - (src1.d[i + j] * src2.d[i + j]) + src3.d[i + j]; + if (dst.d[i + j] != res.d[i + j]) + check_fails++; + } + return check_fails++; +} + + +static int +check_nmaccss () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 4; i = i + 4) + { + res.f[i] = - (src1.f[i] * src2.f[i]) + src3.f[i]; + if (dst.f[i] != res.f[i]) + check_fails++; + } + return check_fails++; +} + +static int +check_nmaccsd () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 2; i = i + 2) + { + res.d[i] = - (src1.d[i] * src2.d[i]) + src3.d[i]; + if (dst.d[i] != res.d[i]) + check_fails++; + } + return check_fails++; +} + +static void +fma4_test (void) +{ + int i; + + init_nmaccps (); + + for (i = 0; i < NUM; i++) + dst.x[i] = _mm_nmacc_ps (src1.x[i], src2.x[i], src3.x[i]); + + if (check_nmaccps ()) + abort (); + + + for (i = 0; i < NUM; i++) + dst.x[i] = _mm_nmacc_ss (src1.x[i], src2.x[i], src3.x[i]); + + if (check_nmaccss ()) + abort (); + + init_nmaccpd (); + + for (i = 0; i < NUM; i++) + dst.y[i] = _mm_nmacc_pd (src1.y[i], src2.y[i], src3.y[i]); + + if (check_nmaccpd ()) + abort (); + + + for (i = 0; i < NUM; i++) + dst.y[i] = _mm_nmacc_sd (src1.y[i], src2.y[i], src3.y[i]); + + if (check_nmaccsd ()) + abort (); + +} diff --git a/gcc/testsuite/gcc.target/i386/fma4-nmsubXX.c b/gcc/testsuite/gcc.target/i386/fma4-nmsubXX.c new file mode 100644 index 0000000..85fbecd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/fma4-nmsubXX.c @@ -0,0 +1,137 @@ +/* { dg-do run } */ +/* { dg-require-effective-target fma4 } */ +/* { dg-options "-O0 -mfma4" } */ + +#include "fma4-check.h" + +#include <x86intrin.h> +#include <string.h> + +#define NUM 20 + +union +{ + __m128 x[NUM]; + float f[NUM * 4]; + __m128d y[NUM]; + double d[NUM * 2]; +} dst, res, src1, src2, src3; + +/* Note that in macc*,msub*,mnmacc* and mnsub* instructions, the intermdediate + product is not rounded, only the addition is rounded. 
*/ + +static void +init_nmsubps () +{ + int i; + for (i = 0; i < NUM * 4; i++) + { + src1.f[i] = i; + src2.f[i] = i + 10; + src3.f[i] = i + 20; + } +} + +static void +init_nmsubpd () +{ + int i; + for (i = 0; i < NUM * 4; i++) + { + src1.d[i] = i; + src2.d[i] = i + 10; + src3.d[i] = i + 20; + } +} + +static int +check_nmsubps () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 4; i = i + 4) + for (j = 0; j < 4; j++) + { + res.f[i + j] = - (src1.f[i + j] * src2.f[i + j]) - src3.f[i + j]; + if (dst.f[i + j] != res.f[i + j]) + check_fails++; + } + return check_fails++; +} + +static int +check_nmsubpd () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 2; i = i + 2) + for (j = 0; j < 2; j++) + { + res.d[i + j] = - (src1.d[i + j] * src2.d[i + j]) - src3.d[i + j]; + if (dst.d[i + j] != res.d[i + j]) + check_fails++; + } + return check_fails++; +} + + +static int +check_nmsubss () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 4; i = i + 4) + { + res.f[i] = - (src1.f[i] * src2.f[i]) - src3.f[i]; + if (dst.f[i] != res.f[i]) + check_fails++; + } + return check_fails++; +} + +static int +check_nmsubsd () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 2; i = i + 2) + { + res.d[i] = - (src1.d[i] * src2.d[i]) - src3.d[i]; + if (dst.d[i] != res.d[i]) + check_fails++; + } + return check_fails++; +} + +static void +fma4_test (void) +{ + int i; + + init_nmsubps (); + + for (i = 0; i < NUM; i++) + dst.x[i] = _mm_nmsub_ps (src1.x[i], src2.x[i], src3.x[i]); + + if (check_nmsubps (&dst.x[i], &src1.f[i * 4], &src2.f[i * 4], &src3.f[i * 4])) + abort (); + + + for (i = 0; i < NUM; i++) + dst.x[i] = _mm_nmsub_ss (src1.x[i], src2.x[i], src3.x[i]); + + if (check_nmsubss (&dst.x[i], &src1.f[i * 4], &src2.f[i * 4], &src3.f[i * 4])) + abort (); + + init_nmsubpd (); + + for (i = 0; i < NUM; i++) + dst.y[i] = _mm_nmsub_pd (src1.y[i], src2.y[i], src3.y[i]); + + if (check_nmsubpd (&dst.y[i], &src1.d[i * 2], &src2.d[i * 2], &src3.d[i * 2])) + abort (); + + + for (i = 0; i < NUM; i++) + dst.y[i] = _mm_nmsub_sd (src1.y[i], src2.y[i], src3.y[i]); + + if (check_nmsubsd (&dst.y[i], &src1.d[i * 2], &src2.d[i * 2], &src3.d[i * 2])) + abort (); + +} diff --git a/gcc/testsuite/gcc.target/i386/fma4-vector.c b/gcc/testsuite/gcc.target/i386/fma4-vector.c new file mode 100644 index 0000000..df8463e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/fma4-vector.c @@ -0,0 +1,93 @@ +/* Test that the compiler properly optimizes floating point multiply and add + instructions vector into vfmaddps on FMA4 systems. 
*/ + +/* { dg-do compile } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-options "-O2 -mfma4 -ftree-vectorize" } */ + +extern void exit (int); + +typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__)); +typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__)); + +#define SIZE 10240 + +union { + __m128 f_align; + __m128d d_align; + float f[SIZE]; + double d[SIZE]; +} a, b, c, d; + +void +flt_mul_add (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.f[i] = (b.f[i] * c.f[i]) + d.f[i]; +} + +void +dbl_mul_add (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.d[i] = (b.d[i] * c.d[i]) + d.d[i]; +} + +void +flt_mul_sub (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.f[i] = (b.f[i] * c.f[i]) - d.f[i]; +} + +void +dbl_mul_sub (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.d[i] = (b.d[i] * c.d[i]) - d.d[i]; +} + +void +flt_neg_mul_add (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.f[i] = (-(b.f[i] * c.f[i])) + d.f[i]; +} + +void +dbl_neg_mul_add (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.d[i] = (-(b.d[i] * c.d[i])) + d.d[i]; +} + +int main () +{ + flt_mul_add (); + flt_mul_sub (); + flt_neg_mul_add (); + + dbl_mul_add (); + dbl_mul_sub (); + dbl_neg_mul_add (); + exit (0); +} + +/* { dg-final { scan-assembler "vfmaddps" } } */ +/* { dg-final { scan-assembler "vfmaddpd" } } */ +/* { dg-final { scan-assembler "vfmsubps" } } */ +/* { dg-final { scan-assembler "vfmsubpd" } } */ +/* { dg-final { scan-assembler "vfnmaddps" } } */ +/* { dg-final { scan-assembler "vfnmaddpd" } } */ diff --git a/gcc/testsuite/gcc.target/i386/funcspec-2.c b/gcc/testsuite/gcc.target/i386/funcspec-2.c new file mode 100644 index 0000000..c132fc9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/funcspec-2.c @@ -0,0 +1,99 @@ +/* Test whether using target specific options, we can generate FMA4 code. 
*/ +/* { dg-do compile } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-options "-O2 -march=k8" } */ + +extern void exit (int); + +#define FMA4_ATTR __attribute__((__target__("fma4"))) +extern float flt_mul_add (float a, float b, float c) FMA4_ATTR; +extern float flt_mul_sub (float a, float b, float c) FMA4_ATTR; +extern float flt_neg_mul_add (float a, float b, float c) FMA4_ATTR; +extern float flt_neg_mul_sub (float a, float b, float c) FMA4_ATTR; + +extern double dbl_mul_add (double a, double b, double c) FMA4_ATTR; +extern double dbl_mul_sub (double a, double b, double c) FMA4_ATTR; +extern double dbl_neg_mul_add (double a, double b, double c) FMA4_ATTR; +extern double dbl_neg_mul_sub (double a, double b, double c) FMA4_ATTR; + +float +flt_mul_add (float a, float b, float c) +{ + return (a * b) + c; +} + +double +dbl_mul_add (double a, double b, double c) +{ + return (a * b) + c; +} + +float +flt_mul_sub (float a, float b, float c) +{ + return (a * b) - c; +} + +double +dbl_mul_sub (double a, double b, double c) +{ + return (a * b) - c; +} + +float +flt_neg_mul_add (float a, float b, float c) +{ + return (-(a * b)) + c; +} + +double +dbl_neg_mul_add (double a, double b, double c) +{ + return (-(a * b)) + c; +} + +float +flt_neg_mul_sub (float a, float b, float c) +{ + return (-(a * b)) - c; +} + +double +dbl_neg_mul_sub (double a, double b, double c) +{ + return (-(a * b)) - c; +} + +float f[10] = { 2, 3, 4 }; +double d[10] = { 2, 3, 4 }; + +int main () +{ + f[3] = flt_mul_add (f[0], f[1], f[2]); + f[4] = flt_mul_sub (f[0], f[1], f[2]); + f[5] = flt_neg_mul_add (f[0], f[1], f[2]); + f[6] = flt_neg_mul_sub (f[0], f[1], f[2]); + + d[3] = dbl_mul_add (d[0], d[1], d[2]); + d[4] = dbl_mul_sub (d[0], d[1], d[2]); + d[5] = dbl_neg_mul_add (d[0], d[1], d[2]); + d[6] = dbl_neg_mul_sub (d[0], d[1], d[2]); + exit (0); +} + +/* { dg-final { scan-assembler "vfmaddss" } } */ +/* { dg-final { scan-assembler "vfmaddsd" } } */ +/* { dg-final { scan-assembler "vfmsubss" } } */ +/* { dg-final { scan-assembler "vfmsubsd" } } */ +/* { dg-final { scan-assembler "vfnmaddss" } } */ +/* { dg-final { scan-assembler "vfnmaddsd" } } */ +/* { dg-final { scan-assembler "vfnmsubss" } } */ +/* { dg-final { scan-assembler "vfnmsubsd" } } */ +/* { dg-final { scan-assembler "call\t(.*)flt_mul_add" } } */ +/* { dg-final { scan-assembler "call\t(.*)flt_mul_sub" } } */ +/* { dg-final { scan-assembler "call\t(.*)flt_neg_mul_add" } } */ +/* { dg-final { scan-assembler "call\t(.*)flt_neg_mul_sub" } } */ +/* { dg-final { scan-assembler "call\t(.*)dbl_mul_add" } } */ +/* { dg-final { scan-assembler "call\t(.*)dbl_mul_sub" } } */ +/* { dg-final { scan-assembler "call\t(.*)dbl_neg_mul_add" } } */ +/* { dg-final { scan-assembler "call\t(.*)dbl_neg_mul_sub" } } */ diff --git a/gcc/testsuite/gcc.target/i386/funcspec-4.c b/gcc/testsuite/gcc.target/i386/funcspec-4.c index e2eef41..025b97d 100644 --- a/gcc/testsuite/gcc.target/i386/funcspec-4.c +++ b/gcc/testsuite/gcc.target/i386/funcspec-4.c @@ -1,6 +1,9 @@ /* Test some error conditions with function specific options. 
*/ /* { dg-do compile } */ +/* no fma400 switch */ +extern void error1 (void) __attribute__((__target__("fma400"))); /* { dg-error "unknown" } */ + /* Multiple arch switches */ extern void error2 (void) __attribute__((__target__("arch=core2,arch=k8"))); /* { dg-error "already specified" } */ diff --git a/gcc/testsuite/gcc.target/i386/funcspec-5.c b/gcc/testsuite/gcc.target/i386/funcspec-5.c index cbecfaa..34da51c 100644 --- a/gcc/testsuite/gcc.target/i386/funcspec-5.c +++ b/gcc/testsuite/gcc.target/i386/funcspec-5.c @@ -16,6 +16,7 @@ extern void test_sse4 (void) __attribute__((__target__("sse4"))); extern void test_sse4_1 (void) __attribute__((__target__("sse4.1"))); extern void test_sse4_2 (void) __attribute__((__target__("sse4.2"))); extern void test_sse4a (void) __attribute__((__target__("sse4a"))); +extern void test_fma4 (void) __attribute__((__target__("fma4"))); extern void test_ssse3 (void) __attribute__((__target__("ssse3"))); extern void test_no_abm (void) __attribute__((__target__("no-abm"))); @@ -31,6 +32,7 @@ extern void test_no_sse4 (void) __attribute__((__target__("no-sse4"))); extern void test_no_sse4_1 (void) __attribute__((__target__("no-sse4.1"))); extern void test_no_sse4_2 (void) __attribute__((__target__("no-sse4.2"))); extern void test_no_sse4a (void) __attribute__((__target__("no-sse4a"))); +extern void test_no_fma4 (void) __attribute__((__target__("no-fma4"))); extern void test_no_ssse3 (void) __attribute__((__target__("no-ssse3"))); extern void test_arch_i386 (void) __attribute__((__target__("arch=i386"))); diff --git a/gcc/testsuite/gcc.target/i386/funcspec-6.c b/gcc/testsuite/gcc.target/i386/funcspec-6.c index 7f46ad0..575be9b 100644 --- a/gcc/testsuite/gcc.target/i386/funcspec-6.c +++ b/gcc/testsuite/gcc.target/i386/funcspec-6.c @@ -16,6 +16,7 @@ extern void test_sse4 (void) __attribute__((__target__("sse4"))); extern void test_sse4_1 (void) __attribute__((__target__("sse4.1"))); extern void test_sse4_2 (void) __attribute__((__target__("sse4.2"))); extern void test_sse4a (void) __attribute__((__target__("sse4a"))); +extern void test_fma4 (void) __attribute__((__target__("fma4"))); extern void test_ssse3 (void) __attribute__((__target__("ssse3"))); extern void test_no_abm (void) __attribute__((__target__("no-abm"))); @@ -31,6 +32,7 @@ extern void test_no_sse4 (void) __attribute__((__target__("no-sse4"))); extern void test_no_sse4_1 (void) __attribute__((__target__("no-sse4.1"))); extern void test_no_sse4_2 (void) __attribute__((__target__("no-sse4.2"))); extern void test_no_sse4a (void) __attribute__((__target__("no-sse4a"))); +extern void test_no_fma4 (void) __attribute__((__target__("no-fma4"))); extern void test_no_ssse3 (void) __attribute__((__target__("no-ssse3"))); extern void test_arch_nocona (void) __attribute__((__target__("arch=nocona"))); diff --git a/gcc/testsuite/gcc.target/i386/funcspec-8.c b/gcc/testsuite/gcc.target/i386/funcspec-8.c index c370733..ba4b7f2 100644 --- a/gcc/testsuite/gcc.target/i386/funcspec-8.c +++ b/gcc/testsuite/gcc.target/i386/funcspec-8.c @@ -104,6 +104,25 @@ generic_insertq (__m128i a, __m128i b) return __builtin_ia32_insertq (a, b); /* { dg-error "needs isa option" } */ } +#ifdef __FMA4__ +#error "-mfma4 should not be set for this test" +#endif + +__m128d fma4_fmaddpd (__m128d a, __m128d b, __m128d c) __attribute__((__target__("fma4"))); +__m128d generic_fmaddpd (__m128d a, __m128d b, __m128d c); + +__m128d +fma4_fmaddpd (__m128d a, __m128d b, __m128d c) +{ + return __builtin_ia32_vfmaddpd (a, b, c); +} + +__m128d 
+generic_fmaddpd (__m128d a, __m128d b, __m128d c) +{ + return __builtin_ia32_vfmaddpd (a, b, c); /* { dg-error "needs isa option" } */ +} + #ifdef __AES__ #error "-maes should not be set for this test" #endif diff --git a/gcc/testsuite/gcc.target/i386/funcspec-9.c b/gcc/testsuite/gcc.target/i386/funcspec-9.c new file mode 100644 index 0000000..78714e1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/funcspec-9.c @@ -0,0 +1,36 @@ +/* Test whether using target specific options, we can generate FMA4 code. */ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=k8 -mfpmath=sse -msse2" } */ + +extern void exit (int); + +#ifdef __FMA4__ +#warning "__FMA4__ should not be defined before #pragma GCC target." +#endif + +#pragma GCC push_options +#pragma GCC target ("fma4") + +#ifndef __FMA4__ +#warning "__FMA4__ should have be defined after #pragma GCC target." +#endif + +float +flt_mul_add (float a, float b, float c) +{ + return (a * b) + c; +} + +#pragma GCC pop_options +#ifdef __FMA4__ +#warning "__FMA4__ should not be defined after #pragma GCC pop target." +#endif + +double +dbl_mul_add (double a, double b, double c) +{ + return (a * b) + c; +} + +/* { dg-final { scan-assembler "vfmaddss" } } */ +/* { dg-final { scan-assembler "addsd" } } */ diff --git a/gcc/testsuite/gcc.target/i386/i386.exp b/gcc/testsuite/gcc.target/i386/i386.exp index 242c40f..c7c6e12 100644 --- a/gcc/testsuite/gcc.target/i386/i386.exp +++ b/gcc/testsuite/gcc.target/i386/i386.exp @@ -120,6 +120,20 @@ proc check_effective_target_sse4a { } { } "-O2 -msse4a" ] } +# Return 1 if fma4 instructions can be compiled. +proc check_effective_target_fma4 { } { + return [check_no_compiler_messages fma4 object { + typedef float __m128 __attribute__ ((__vector_size__ (16))); + typedef float __v4sf __attribute__ ((__vector_size__ (16))); + __m128 _mm_macc_ps(__m128 __A, __m128 __B, __m128 __C) + { + return (__m128) __builtin_ia32_vfmaddps ((__v4sf)__A, + (__v4sf)__B, + (__v4sf)__C); + } + } "-O2 -mfma4" ] +} + # If a testcase doesn't have special options, use these. 
global DEFAULT_CFLAGS if ![info exists DEFAULT_CFLAGS] then { diff --git a/gcc/testsuite/gcc.target/i386/isa-1.c b/gcc/testsuite/gcc.target/i386/isa-1.c index b2040b3..d98c14f 100644 --- a/gcc/testsuite/gcc.target/i386/isa-1.c +++ b/gcc/testsuite/gcc.target/i386/isa-1.c @@ -27,5 +27,11 @@ main () #if defined __SSE4A__ abort (); #endif +#if defined __AVX__ + abort (); +#endif +#if defined __FMA4__ + abort (); +#endif return 0; } diff --git a/gcc/testsuite/gcc.target/i386/isa-10.c b/gcc/testsuite/gcc.target/i386/isa-10.c new file mode 100644 index 0000000..5f57be9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/isa-10.c @@ -0,0 +1,34 @@ +/* { dg-do run } */ +/* { dg-options "-march=x86-64 -mfma4 -mno-sse4" } */ + +extern void abort (void); + +int +main () +{ +#if !defined __SSE__ + abort (); +#endif +#if !defined __SSE2__ + abort (); +#endif +#if !defined __SSE3__ + abort (); +#endif +#if !defined __SSSE3__ + abort (); +#endif +#if defined __SSE4_1__ + abort (); +#endif +#if defined __SSE4_2__ + abort (); +#endif +#if !defined __SSE4A__ + abort (); +#endif +#if defined __FMA4__ + abort (); +#endif + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/isa-11.c b/gcc/testsuite/gcc.target/i386/isa-11.c new file mode 100644 index 0000000..64755b09 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/isa-11.c @@ -0,0 +1,34 @@ +/* { dg-do run } */ +/* { dg-options "-march=x86-64 -mfma4 -mno-ssse3" } */ + +extern void abort (void); + +int +main () +{ +#if !defined __SSE__ + abort (); +#endif +#if !defined __SSE2__ + abort (); +#endif +#if !defined __SSE3__ + abort (); +#endif +#if defined __SSSE3__ + abort (); +#endif +#if defined __SSE4_1__ + abort (); +#endif +#if defined __SSE4_2__ + abort (); +#endif +#if !defined __SSE4A__ + abort (); +#endif +#if defined __FMA4__ + abort (); +#endif + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/isa-12.c b/gcc/testsuite/gcc.target/i386/isa-12.c new file mode 100644 index 0000000..fde84a2 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/isa-12.c @@ -0,0 +1,34 @@ +/* { dg-do run } */ +/* { dg-options "-march=x86-64 -mfma4 -mno-sse3" } */ + +extern void abort (void); + +int +main () +{ +#if !defined __SSE__ + abort (); +#endif +#if !defined __SSE2__ + abort (); +#endif +#if defined __SSE3__ + abort (); +#endif +#if defined __SSSE3__ + abort (); +#endif +#if defined __SSE4_1__ + abort (); +#endif +#if defined __SSE4_2__ + abort (); +#endif +#if defined __SSE4A__ + abort (); +#endif +#if defined __FMA4__ + abort (); +#endif + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/isa-13.c b/gcc/testsuite/gcc.target/i386/isa-13.c new file mode 100644 index 0000000..74e37d9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/isa-13.c @@ -0,0 +1,34 @@ +/* { dg-do run } */ +/* { dg-options "-march=x86-64 -mfma4 -mno-sse2" } */ + +extern void abort (void); + +int +main () +{ +#if !defined __SSE__ + abort (); +#endif +#if defined __SSE2__ + abort (); +#endif +#if defined __SSE3__ + abort (); +#endif +#if defined __SSSE3__ + abort (); +#endif +#if defined __SSE4_1__ + abort (); +#endif +#if defined __SSE4_2__ + abort (); +#endif +#if defined __SSE4A__ + abort (); +#endif +#if defined __FMA4__ + abort (); +#endif + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/isa-14.c b/gcc/testsuite/gcc.target/i386/isa-14.c index 09dda6d..5d49e6e 100644 --- a/gcc/testsuite/gcc.target/i386/isa-14.c +++ b/gcc/testsuite/gcc.target/i386/isa-14.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-march=x86-64 -msse4a -mno-sse" } */ +/* { dg-options "-march=x86-64 
-msse4a -mfma4 -mno-sse" } */ extern void abort (void); @@ -27,5 +27,8 @@ main () #if defined __SSE4A__ abort (); #endif +#if defined __FMA4__ + abort (); +#endif return 0; } diff --git a/gcc/testsuite/gcc.target/i386/isa-2.c b/gcc/testsuite/gcc.target/i386/isa-2.c new file mode 100644 index 0000000..aa8958c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/isa-2.c @@ -0,0 +1,37 @@ +/* { dg-do run } */ +/* { dg-options "-march=x86-64 -msse4 -mfma4" } */ + +extern void abort (void); + +int +main () +{ +#if !defined __SSE__ + abort (); +#endif +#if !defined __SSE2__ + abort (); +#endif +#if !defined __SSE3__ + abort (); +#endif +#if !defined __SSSE3__ + abort (); +#endif +#if !defined __SSE4_1__ + abort (); +#endif +#if !defined __SSE4_2__ + abort (); +#endif +#if !defined __SSE4A__ + abort (); +#endif +#if !defined __AVX__ + abort (); +#endif +#if !defined __FMA4__ + abort (); +#endif + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/isa-3.c b/gcc/testsuite/gcc.target/i386/isa-3.c new file mode 100644 index 0000000..a4d93f4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/isa-3.c @@ -0,0 +1,37 @@ +/* { dg-do run } */ +/* { dg-options "-march=x86-64 -msse4 -mfma4 -msse4a" } */ + +extern void abort (void); + +int +main () +{ +#if !defined __SSE__ + abort (); +#endif +#if !defined __SSE2__ + abort (); +#endif +#if !defined __SSE3__ + abort (); +#endif +#if !defined __SSSE3__ + abort (); +#endif +#if !defined __SSE4_1__ + abort (); +#endif +#if !defined __SSE4_2__ + abort (); +#endif +#if !defined __SSE4A__ + abort (); +#endif +#if !defined __AVX__ + abort (); +#endif +#if !defined __FMA4__ + abort (); +#endif + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/isa-4.c b/gcc/testsuite/gcc.target/i386/isa-4.c new file mode 100644 index 0000000..0137257 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/isa-4.c @@ -0,0 +1,37 @@ +/* { dg-do run } */ +/* { dg-options "-march=core2 -mfma4 -mno-sse4" } */ + +extern void abort (void); + +int +main () +{ +#if !defined __SSE__ + abort (); +#endif +#if !defined __SSE2__ + abort (); +#endif +#if !defined __SSE3__ + abort (); +#endif +#if !defined __SSSE3__ + abort (); +#endif +#if defined __SSE4_1__ + abort (); +#endif +#if defined __SSE4_2__ + abort (); +#endif +#if !defined __SSE4A__ + abort (); +#endif +#if defined __AVX__ + abort (); +#endif +#if defined __FMA4__ + abort (); +#endif + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/isa-5.c b/gcc/testsuite/gcc.target/i386/isa-5.c index 2157499..39d065e 100644 --- a/gcc/testsuite/gcc.target/i386/isa-5.c +++ b/gcc/testsuite/gcc.target/i386/isa-5.c @@ -27,5 +27,11 @@ main () #if !defined __SSE4A__ abort (); #endif +#if defined __AVX__ + abort (); +#endif +#if defined __FMA4__ + abort (); +#endif return 0; } diff --git a/gcc/testsuite/gcc.target/i386/isa-6.c b/gcc/testsuite/gcc.target/i386/isa-6.c index 389621b..a9a0ddb 100644 --- a/gcc/testsuite/gcc.target/i386/isa-6.c +++ b/gcc/testsuite/gcc.target/i386/isa-6.c @@ -28,5 +28,11 @@ main () #if !defined __SSE4A__ abort (); #endif +#if defined __AVX__ + abort (); +#endif +#if defined __FMA4__ + abort (); +#endif return 0; } diff --git a/gcc/testsuite/gcc.target/i386/isa-7.c b/gcc/testsuite/gcc.target/i386/isa-7.c new file mode 100644 index 0000000..8dd628e9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/isa-7.c @@ -0,0 +1,37 @@ +/* { dg-do run } */ +/* { dg-options "-march=amdfam10 -mfma4 -mno-sse4" } */ + +extern void abort (void); + +int +main () +{ +#if !defined __SSE__ + abort (); +#endif +#if !defined __SSE2__ + abort (); 
+#endif +#if !defined __SSE3__ + abort (); +#endif +#if !defined __SSSE3__ + abort (); +#endif +#if defined __SSE4_1__ + abort (); +#endif +#if defined __SSE4_2__ + abort (); +#endif +#if !defined __SSE4A__ + abort (); +#endif +#if defined __AVX__ + abort (); +#endif +#if defined __FMA4__ + abort (); +#endif + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/isa-8.c b/gcc/testsuite/gcc.target/i386/isa-8.c new file mode 100644 index 0000000..2ffd80f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/isa-8.c @@ -0,0 +1,34 @@ +/* { dg-do run } */ +/* { dg-options "-march=amdfam10 -mfma4 -mno-sse4a" } */ + +extern void abort (void); + +int +main () +{ +#if !defined __SSE__ + abort (); +#endif +#if !defined __SSE2__ + abort (); +#endif +#if !defined __SSE3__ + abort (); +#endif +#if !defined __SSSE3__ + abort (); +#endif +#if !defined __SSE4_1__ + abort (); +#endif +#if !defined __SSE4_2__ + abort (); +#endif +#if defined __SSE4A__ + abort (); +#endif +#if defined __FMA4__ + abort (); +#endif + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/isa-9.c b/gcc/testsuite/gcc.target/i386/isa-9.c new file mode 100644 index 0000000..64cbdbd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/isa-9.c @@ -0,0 +1,34 @@ +/* { dg-do run } */ +/* { dg-options "-march=amdfam10 -mno-fma4" } */ + +extern void abort (void); + +int +main () +{ +#if !defined __SSE__ + abort (); +#endif +#if !defined __SSE2__ + abort (); +#endif +#if !defined __SSE3__ + abort (); +#endif +#if defined __SSSE3__ + abort (); +#endif +#if defined __SSE4_1__ + abort (); +#endif +#if defined __SSE4_2__ + abort (); +#endif +#if !defined __SSE4A__ + abort (); +#endif +#if defined __FMA4__ + abort (); +#endif + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/sse-12.c b/gcc/testsuite/gcc.target/i386/sse-12.c new file mode 100644 index 0000000..85c36c8 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse-12.c @@ -0,0 +1,8 @@ +/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h and mm_malloc.h are + usable with -O -std=c89 -pedantic-errors. */ +/* { dg-do compile } */ +/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -m3dnow -mavx -mfma4 -maes -mpclmul" } */ + +#include <x86intrin.h> + +int dummy; diff --git a/gcc/testsuite/gcc.target/i386/sse-13.c b/gcc/testsuite/gcc.target/i386/sse-13.c new file mode 100644 index 0000000..1ce9d96 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse-13.c @@ -0,0 +1,128 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -maes -mpclmul" } */ + +#include <mm_malloc.h> + +/* Test that the intrinsics compile with optimization. All of them are + defined as inline functions in {,x,e,p,t,s,w,a,b,i}mmintrin.h and mm3dnow.h + that reference the proper builtin functions. Defining away "extern" and + "__inline" results in all of them being compiled as proper functions. */ + +#define extern +#define __inline + +/* Following intrinsics require immediate arguments. 
*/ + +/* ammintrin.h */ +#define __builtin_ia32_extrqi(X, I, L) __builtin_ia32_extrqi(X, 1, 1) +#define __builtin_ia32_insertqi(X, Y, I, L) __builtin_ia32_insertqi(X, Y, 1, 1) + +/* immintrin.h */ +#define __builtin_ia32_blendpd256(X, Y, M) __builtin_ia32_blendpd256(X, Y, 1) +#define __builtin_ia32_blendps256(X, Y, M) __builtin_ia32_blendps256(X, Y, 1) +#define __builtin_ia32_dpps256(X, Y, M) __builtin_ia32_dpps256(X, Y, 1) +#define __builtin_ia32_shufpd256(X, Y, M) __builtin_ia32_shufpd256(X, Y, 1) +#define __builtin_ia32_shufps256(X, Y, M) __builtin_ia32_shufps256(X, Y, 1) +#define __builtin_ia32_cmpsd(X, Y, O) __builtin_ia32_cmpsd(X, Y, 1) +#define __builtin_ia32_cmpss(X, Y, O) __builtin_ia32_cmpss(X, Y, 1) +#define __builtin_ia32_cmppd(X, Y, O) __builtin_ia32_cmppd(X, Y, 1) +#define __builtin_ia32_cmpps(X, Y, O) __builtin_ia32_cmpps(X, Y, 1) +#define __builtin_ia32_cmppd256(X, Y, O) __builtin_ia32_cmppd256(X, Y, 1) +#define __builtin_ia32_cmpps256(X, Y, O) __builtin_ia32_cmpps256(X, Y, 1) +#define __builtin_ia32_vextractf128_pd256(X, N) __builtin_ia32_vextractf128_pd256(X, 1) +#define __builtin_ia32_vextractf128_ps256(X, N) __builtin_ia32_vextractf128_ps256(X, 1) +#define __builtin_ia32_vextractf128_si256(X, N) __builtin_ia32_vextractf128_si256(X, 1) +#define __builtin_ia32_vpermilpd(X, N) __builtin_ia32_vpermilpd(X, 1) +#define __builtin_ia32_vpermilpd256(X, N) __builtin_ia32_vpermilpd256(X, 1) +#define __builtin_ia32_vpermilps(X, N) __builtin_ia32_vpermilps(X, 1) +#define __builtin_ia32_vpermilps256(X, N) __builtin_ia32_vpermilps256(X, 1) +#define __builtin_ia32_vpermil2pd(X, Y, C, I) __builtin_ia32_vpermil2pd(X, Y, C, 1) +#define __builtin_ia32_vpermil2pd256(X, Y, C, I) __builtin_ia32_vpermil2pd256(X, Y, C, 1) +#define __builtin_ia32_vpermil2ps(X, Y, C, I) __builtin_ia32_vpermil2ps(X, Y, C, 1) +#define __builtin_ia32_vpermil2ps256(X, Y, C, I) __builtin_ia32_vpermil2ps256(X, Y, C, 1) +#define __builtin_ia32_vperm2f128_pd256(X, Y, C) __builtin_ia32_vperm2f128_pd256(X, Y, 1) +#define __builtin_ia32_vperm2f128_ps256(X, Y, C) __builtin_ia32_vperm2f128_ps256(X, Y, 1) +#define __builtin_ia32_vperm2f128_si256(X, Y, C) __builtin_ia32_vperm2f128_si256(X, Y, 1) +#define __builtin_ia32_vinsertf128_pd256(X, Y, C) __builtin_ia32_vinsertf128_pd256(X, Y, 1) +#define __builtin_ia32_vinsertf128_ps256(X, Y, C) __builtin_ia32_vinsertf128_ps256(X, Y, 1) +#define __builtin_ia32_vinsertf128_si256(X, Y, C) __builtin_ia32_vinsertf128_si256(X, Y, 1) +#define __builtin_ia32_roundpd256(V, M) __builtin_ia32_roundpd256(V, 1) +#define __builtin_ia32_roundps256(V, M) __builtin_ia32_roundps256(V, 1) + +/* wmmintrin.h */ +#define __builtin_ia32_aeskeygenassist128(X, C) __builtin_ia32_aeskeygenassist128(X, 1) +#define __builtin_ia32_pclmulqdq128(X, Y, I) __builtin_ia32_pclmulqdq128(X, Y, 1) + +/* smmintrin.h */ +#define __builtin_ia32_roundpd(V, M) __builtin_ia32_roundpd(V, 1) +#define __builtin_ia32_roundsd(D, V, M) __builtin_ia32_roundsd(D, V, 1) +#define __builtin_ia32_roundps(V, M) __builtin_ia32_roundps(V, 1) +#define __builtin_ia32_roundss(D, V, M) __builtin_ia32_roundss(D, V, 1) + +#define __builtin_ia32_pblendw128(X, Y, M) __builtin_ia32_pblendw128 (X, Y, 1) +#define __builtin_ia32_blendps(X, Y, M) __builtin_ia32_blendps(X, Y, 1) +#define __builtin_ia32_blendpd(X, Y, M) __builtin_ia32_blendpd(X, Y, 1) +#define __builtin_ia32_dpps(X, Y, M) __builtin_ia32_dpps(X, Y, 1) +#define __builtin_ia32_dppd(X, Y, M) __builtin_ia32_dppd(X, Y, 1) +#define __builtin_ia32_insertps128(D, S, N) __builtin_ia32_insertps128(D, S, 
1) +#define __builtin_ia32_vec_ext_v4sf(X, N) __builtin_ia32_vec_ext_v4sf(X, 1) +#define __builtin_ia32_vec_set_v16qi(D, S, N) __builtin_ia32_vec_set_v16qi(D, S, 1) +#define __builtin_ia32_vec_set_v4si(D, S, N) __builtin_ia32_vec_set_v4si(D, S, 1) +#define __builtin_ia32_vec_set_v2di(D, S, N) __builtin_ia32_vec_set_v2di(D, S, 1) +#define __builtin_ia32_vec_ext_v16qi(X, N) __builtin_ia32_vec_ext_v16qi(X, 1) +#define __builtin_ia32_vec_ext_v4si(X, N) __builtin_ia32_vec_ext_v4si(X, 1) +#define __builtin_ia32_vec_ext_v2di(X, N) __builtin_ia32_vec_ext_v2di(X, 1) +#define __builtin_ia32_mpsadbw128(X, Y, M) __builtin_ia32_mpsadbw128(X, Y, 1) +#define __builtin_ia32_pcmpistrm128(X, Y, M) \ + __builtin_ia32_pcmpistrm128(X, Y, 1) +#define __builtin_ia32_pcmpistri128(X, Y, M) \ + __builtin_ia32_pcmpistri128(X, Y, 1) +#define __builtin_ia32_pcmpestrm128(X, LX, Y, LY, M) \ + __builtin_ia32_pcmpestrm128(X, LX, Y, LY, 1) +#define __builtin_ia32_pcmpestri128(X, LX, Y, LY, M) \ + __builtin_ia32_pcmpestri128(X, LX, Y, LY, 1) +#define __builtin_ia32_pcmpistria128(X, Y, M) \ + __builtin_ia32_pcmpistria128(X, Y, 1) +#define __builtin_ia32_pcmpistric128(X, Y, M) \ + __builtin_ia32_pcmpistric128(X, Y, 1) +#define __builtin_ia32_pcmpistrio128(X, Y, M) \ + __builtin_ia32_pcmpistrio128(X, Y, 1) +#define __builtin_ia32_pcmpistris128(X, Y, M) \ + __builtin_ia32_pcmpistris128(X, Y, 1) +#define __builtin_ia32_pcmpistriz128(X, Y, M) \ + __builtin_ia32_pcmpistriz128(X, Y, 1) +#define __builtin_ia32_pcmpestria128(X, LX, Y, LY, M) \ + __builtin_ia32_pcmpestria128(X, LX, Y, LY, 1) +#define __builtin_ia32_pcmpestric128(X, LX, Y, LY, M) \ + __builtin_ia32_pcmpestric128(X, LX, Y, LY, 1) +#define __builtin_ia32_pcmpestrio128(X, LX, Y, LY, M) \ + __builtin_ia32_pcmpestrio128(X, LX, Y, LY, 1) +#define __builtin_ia32_pcmpestris128(X, LX, Y, LY, M) \ + __builtin_ia32_pcmpestris128(X, LX, Y, LY, 1) +#define __builtin_ia32_pcmpestriz128(X, LX, Y, LY, M) \ + __builtin_ia32_pcmpestriz128(X, LX, Y, LY, 1) + +/* tmmintrin.h */ +#define __builtin_ia32_palignr128(X, Y, N) __builtin_ia32_palignr128(X, Y, 8) +#define __builtin_ia32_palignr(X, Y, N) __builtin_ia32_palignr(X, Y, 8) + +/* emmintrin.h */ +#define __builtin_ia32_psrldqi128(A, B) __builtin_ia32_psrldqi128(A, 8) +#define __builtin_ia32_pslldqi128(A, B) __builtin_ia32_pslldqi128(A, 8) +#define __builtin_ia32_pshufhw(A, N) __builtin_ia32_pshufhw(A, 0) +#define __builtin_ia32_pshuflw(A, N) __builtin_ia32_pshuflw(A, 0) +#define __builtin_ia32_pshufd(A, N) __builtin_ia32_pshufd(A, 0) +#define __builtin_ia32_vec_set_v8hi(A, D, N) \ + __builtin_ia32_vec_set_v8hi(A, D, 0) +#define __builtin_ia32_vec_ext_v8hi(A, N) __builtin_ia32_vec_ext_v8hi(A, 0) +#define __builtin_ia32_shufpd(A, B, N) __builtin_ia32_shufpd(A, B, 0) + +/* xmmintrin.h */ +#define __builtin_prefetch(P, A, I) __builtin_prefetch(P, A, _MM_HINT_NTA) +#define __builtin_ia32_pshufw(A, N) __builtin_ia32_pshufw(A, 0) +#define __builtin_ia32_vec_set_v4hi(A, D, N) \ + __builtin_ia32_vec_set_v4hi(A, D, 0) +#define __builtin_ia32_vec_ext_v4hi(A, N) __builtin_ia32_vec_ext_v4hi(A, 0) +#define __builtin_ia32_shufps(A, B, N) __builtin_ia32_shufps(A, B, 0) + +#include <x86intrin.h> diff --git a/gcc/testsuite/gcc.target/i386/sse-14.c b/gcc/testsuite/gcc.target/i386/sse-14.c new file mode 100644 index 0000000..c1ddb96 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse-14.c @@ -0,0 +1,157 @@ +/* { dg-do compile } */ +/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -msse4a -maes -mpclmul" } */ + +#include 
<mm_malloc.h> + +/* Test that the intrinsics compile without optimization. All of them are + defined as inline functions in {,x,e,p,t,s,w,a}mmintrin.h and mm3dnow.h + that reference the proper builtin functions. Defining away "extern" and + "__inline" results in all of them being compiled as proper functions. */ + +#define extern +#define __inline + +#include <x86intrin.h> + +#define _CONCAT(x,y) x ## y + +#define test_1(func, type, op1_type, imm) \ + type _CONCAT(_,func) (op1_type A, int const I) \ + { return func (A, imm); } + +#define test_1x(func, type, op1_type, imm1, imm2) \ + type _CONCAT(_,func) (op1_type A, int const I, int const L) \ + { return func (A, imm1, imm2); } + +#define test_2(func, type, op1_type, op2_type, imm) \ + type _CONCAT(_,func) (op1_type A, op2_type B, int const I) \ + { return func (A, B, imm); } + +#define test_2x(func, type, op1_type, op2_type, imm1, imm2) \ + type _CONCAT(_,func) (op1_type A, op2_type B, int const I, int const L) \ + { return func (A, B, imm1, imm2); } + +#define test_3(func, type, op1_type, op2_type, op3_type, imm) \ + type _CONCAT(_,func) (op1_type A, op2_type B, \ + op3_type C, int const I) \ + { return func (A, B, C, imm); } + +#define test_4(func, type, op1_type, op2_type, op3_type, op4_type, imm) \ + type _CONCAT(_,func) (op1_type A, op2_type B, \ + op3_type C, op4_type D, int const I) \ + { return func (A, B, C, D, imm); } + + +/* Following intrinsics require immediate arguments. They + are defined as macros for non-optimized compilations. */ + +/* ammintrin.h */ +test_1x (_mm_extracti_si64, __m128i, __m128i, 1, 1) +test_2x (_mm_inserti_si64, __m128i, __m128i, __m128i, 1, 1) + +/* immintrin.h */ +test_2 (_mm256_blend_pd, __m256d, __m256d, __m256d, 1) +test_2 (_mm256_blend_ps, __m256, __m256, __m256, 1) +test_2 (_mm256_dp_ps, __m256, __m256, __m256, 1) +test_2 (_mm256_shuffle_pd, __m256d, __m256d, __m256d, 1) +test_2 (_mm256_shuffle_ps, __m256, __m256, __m256, 1) +test_2 (_mm_cmp_sd, __m128d, __m128d, __m128d, 1) +test_2 (_mm_cmp_ss, __m128, __m128, __m128, 1) +test_2 (_mm_cmp_pd, __m128d, __m128d, __m128d, 1) +test_2 (_mm_cmp_ps, __m128, __m128, __m128, 1) +test_2 (_mm256_cmp_pd, __m256d, __m256d, __m256d, 1) +test_2 (_mm256_cmp_ps, __m256, __m256, __m256, 1) +test_1 (_mm256_extractf128_pd, __m128d, __m256d, 1) +test_1 (_mm256_extractf128_ps, __m128, __m256, 1) +test_1 (_mm256_extractf128_si256, __m128i, __m256i, 1) +test_1 (_mm256_extract_epi8, int, __m256i, 20) +test_1 (_mm256_extract_epi16, int, __m256i, 13) +test_1 (_mm256_extract_epi32, int, __m256i, 6) +#ifdef __x86_64__ +test_1 (_mm256_extract_epi64, long long, __m256i, 2) +#endif +test_1 (_mm_permute_pd, __m128d, __m128d, 1) +test_1 (_mm256_permute_pd, __m256d, __m256d, 1) +test_1 (_mm_permute_ps, __m128, __m128, 1) +test_1 (_mm256_permute_ps, __m256, __m256, 1) +test_2 (_mm256_permute2f128_pd, __m256d, __m256d, __m256d, 1) +test_2 (_mm256_permute2f128_ps, __m256, __m256, __m256, 1) +test_2 (_mm256_permute2f128_si256, __m256i, __m256i, __m256i, 1) +test_2 (_mm256_insertf128_pd, __m256d, __m256d, __m128d, 1) +test_2 (_mm256_insertf128_ps, __m256, __m256, __m128, 1) +test_2 (_mm256_insertf128_si256, __m256i, __m256i, __m128i, 1) +test_2 (_mm256_insert_epi8, __m256i, __m256i, int, 30) +test_2 (_mm256_insert_epi16, __m256i, __m256i, int, 7) +test_2 (_mm256_insert_epi32, __m256i, __m256i, int, 3) +#ifdef __x86_64__ +test_2 (_mm256_insert_epi64, __m256i, __m256i, long long, 1) +#endif +test_1 (_mm256_round_pd, __m256d, __m256d, 1) +test_1 (_mm256_round_ps, __m256, __m256, 1) + +/* 
wmmintrin.h */ +test_1 (_mm_aeskeygenassist_si128, __m128i, __m128i, 1) +test_2 (_mm_clmulepi64_si128, __m128i, __m128i, __m128i, 1) + +/* smmintrin.h */ +test_1 (_mm_round_pd, __m128d, __m128d, 1) +test_1 (_mm_round_ps, __m128, __m128, 1) +test_2 (_mm_round_sd, __m128d, __m128d, __m128d, 1) +test_2 (_mm_round_ss, __m128, __m128, __m128, 1) + +test_2 (_mm_blend_epi16, __m128i, __m128i, __m128i, 1) +test_2 (_mm_blend_ps, __m128, __m128, __m128, 1) +test_2 (_mm_blend_pd, __m128d, __m128d, __m128d, 1) +test_2 (_mm_dp_ps, __m128, __m128, __m128, 1) +test_2 (_mm_dp_pd, __m128d, __m128d, __m128d, 1) +test_2 (_mm_insert_ps, __m128, __m128, __m128, 1) +test_1 (_mm_extract_ps, int, __m128, 1) +test_2 (_mm_insert_epi8, __m128i, __m128i, int, 1) +test_2 (_mm_insert_epi32, __m128i, __m128i, int, 1) +#ifdef __x86_64__ +test_2 (_mm_insert_epi64, __m128i, __m128i, long long, 1) +#endif +test_1 (_mm_extract_epi8, int, __m128i, 1) +test_1 (_mm_extract_epi32, int, __m128i, 1) +#ifdef __x86_64__ +test_1 (_mm_extract_epi64, long long, __m128i, 1) +#endif +test_2 (_mm_mpsadbw_epu8, __m128i, __m128i, __m128i, 1) +test_2 (_mm_cmpistrm, __m128i, __m128i, __m128i, 1) +test_2 (_mm_cmpistri, int, __m128i, __m128i, 1) +test_4 (_mm_cmpestrm, __m128i, __m128i, int, __m128i, int, 1) +test_4 (_mm_cmpestri, int, __m128i, int, __m128i, int, 1) +test_2 (_mm_cmpistra, int, __m128i, __m128i, 1) +test_2 (_mm_cmpistrc, int, __m128i, __m128i, 1) +test_2 (_mm_cmpistro, int, __m128i, __m128i, 1) +test_2 (_mm_cmpistrs, int, __m128i, __m128i, 1) +test_2 (_mm_cmpistrz, int, __m128i, __m128i, 1) +test_4 (_mm_cmpestra, int, __m128i, int, __m128i, int, 1) +test_4 (_mm_cmpestrc, int, __m128i, int, __m128i, int, 1) +test_4 (_mm_cmpestro, int, __m128i, int, __m128i, int, 1) +test_4 (_mm_cmpestrs, int, __m128i, int, __m128i, int, 1) +test_4 (_mm_cmpestrz, int, __m128i, int, __m128i, int, 1) + +/* tmmintrin.h */ +test_2 (_mm_alignr_epi8, __m128i, __m128i, __m128i, 1) +test_2 (_mm_alignr_pi8, __m64, __m64, __m64, 1) + +/* emmintrin.h */ +test_2 (_mm_shuffle_pd, __m128d, __m128d, __m128d, 1) +test_1 (_mm_srli_si128, __m128i, __m128i, 1) +test_1 (_mm_slli_si128, __m128i, __m128i, 1) +test_1 (_mm_extract_epi16, int, __m128i, 1) +test_2 (_mm_insert_epi16, __m128i, __m128i, int, 1) +test_1 (_mm_shufflehi_epi16, __m128i, __m128i, 1) +test_1 (_mm_shufflelo_epi16, __m128i, __m128i, 1) +test_1 (_mm_shuffle_epi32, __m128i, __m128i, 1) + +/* xmmintrin.h */ +test_2 (_mm_shuffle_ps, __m128, __m128, __m128, 1) +test_1 (_mm_extract_pi16, int, __m64, 1) +test_1 (_m_pextrw, int, __m64, 1) +test_2 (_mm_insert_pi16, __m64, __m64, int, 1) +test_2 (_m_pinsrw, __m64, __m64, int, 1) +test_1 (_mm_shuffle_pi16, __m64, __m64, 1) +test_1 (_m_pshufw, __m64, __m64, 1) +test_1 (_mm_prefetch, void, void *, _MM_HINT_NTA) diff --git a/gcc/testsuite/gcc.target/i386/sse-22.c b/gcc/testsuite/gcc.target/i386/sse-22.c new file mode 100644 index 0000000..eeae0fc --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse-22.c @@ -0,0 +1,161 @@ +/* Same as sse-14, except converted to use #pragma GCC option. */ +/* { dg-do compile } */ +/* { dg-options "-O0 -Werror-implicit-function-declaration" } */ + +#include <mm_malloc.h> + +/* Test that the intrinsics compile without optimization. All of them are + defined as inline functions in {,x,e,p,t,s,w,a}mmintrin.h and mm3dnow.h + that reference the proper builtin functions. Defining away "extern" and + "__inline" results in all of them being compiled as proper functions. 
*/ + +#define extern +#define __inline + +#define _CONCAT(x,y) x ## y + +#define test_1(func, type, op1_type, imm) \ + type _CONCAT(_,func) (op1_type A, int const I) \ + { return func (A, imm); } + +#define test_1x(func, type, op1_type, imm1, imm2) \ + type _CONCAT(_,func) (op1_type A, int const I, int const L) \ + { return func (A, imm1, imm2); } + +#define test_2(func, type, op1_type, op2_type, imm) \ + type _CONCAT(_,func) (op1_type A, op2_type B, int const I) \ + { return func (A, B, imm); } + +#define test_2x(func, type, op1_type, op2_type, imm1, imm2) \ + type _CONCAT(_,func) (op1_type A, op2_type B, int const I, int const L) \ + { return func (A, B, imm1, imm2); } + +#define test_4(func, type, op1_type, op2_type, op3_type, op4_type, imm) \ + type _CONCAT(_,func) (op1_type A, op2_type B, \ + op3_type C, op4_type D, int const I) \ + { return func (A, B, C, D, imm); } + + +#ifndef DIFFERENT_PRAGMAS +#pragma GCC target ("mmx,3dnow,sse,sse2,sse3,ssse3,sse4.1,sse4.2,sse4a,aes,pclmul") +#endif + +/* Following intrinsics require immediate arguments. They + are defined as macros for non-optimized compilations. */ + +/* mmintrin.h (MMX). */ +#ifdef DIFFERENT_PRAGMAS +#pragma GCC target ("mmx") +#endif +#include <mmintrin.h> + +/* mm3dnow.h (3DNOW). */ +#ifdef DIFFERENT_PRAGMAS +#pragma GCC target ("3dnow") +#endif +#include <mm3dnow.h> + +/* xmmintrin.h (SSE). */ +#ifdef DIFFERENT_PRAGMAS +#pragma GCC target ("sse") +#endif +#include <xmmintrin.h> +test_2 (_mm_shuffle_ps, __m128, __m128, __m128, 1) +test_1 (_mm_extract_pi16, int, __m64, 1) +test_1 (_m_pextrw, int, __m64, 1) +test_2 (_mm_insert_pi16, __m64, __m64, int, 1) +test_2 (_m_pinsrw, __m64, __m64, int, 1) +test_1 (_mm_shuffle_pi16, __m64, __m64, 1) +test_1 (_m_pshufw, __m64, __m64, 1) +test_1 (_mm_prefetch, void, void *, _MM_HINT_NTA) + +/* emmintrin.h (SSE2). */ +#ifdef DIFFERENT_PRAGMAS +#pragma GCC target ("sse2") +#endif +#include <emmintrin.h> +test_2 (_mm_shuffle_pd, __m128d, __m128d, __m128d, 1) +test_1 (_mm_srli_si128, __m128i, __m128i, 1) +test_1 (_mm_slli_si128, __m128i, __m128i, 1) +test_1 (_mm_extract_epi16, int, __m128i, 1) +test_2 (_mm_insert_epi16, __m128i, __m128i, int, 1) +test_1 (_mm_shufflehi_epi16, __m128i, __m128i, 1) +test_1 (_mm_shufflelo_epi16, __m128i, __m128i, 1) +test_1 (_mm_shuffle_epi32, __m128i, __m128i, 1) + +/* pmmintrin.h (SSE3). */ +#ifdef DIFFERENT_PRAGMAS +#pragma GCC target ("sse3") +#endif +#include <pmmintrin.h> + +/* tmmintrin.h (SSSE3). */ +#ifdef DIFFERENT_PRAGMAS +#pragma GCC target ("ssse3") +#endif +#include <tmmintrin.h> +test_2 (_mm_alignr_epi8, __m128i, __m128i, __m128i, 1) +test_2 (_mm_alignr_pi8, __m64, __m64, __m64, 1) + +/* ammintrin.h (SSE4A). */ +#ifdef DIFFERENT_PRAGMAS +#pragma GCC target ("sse4a") +#endif +#include <ammintrin.h> +test_1x (_mm_extracti_si64, __m128i, __m128i, 1, 1) +test_2x (_mm_inserti_si64, __m128i, __m128i, __m128i, 1, 1) + +/* smmintrin.h (SSE4.1). */ +/* nmmintrin.h (SSE4.2). */ +/* Note, nmmintrin.h includes smmintrin.h, and smmintrin.h checks for the + #ifdef. So just set the option to SSE4.2. 
*/ +#ifdef DIFFERENT_PRAGMAS +#pragma GCC target ("sse4.2") +#endif +#include <nmmintrin.h> +test_2 (_mm_blend_epi16, __m128i, __m128i, __m128i, 1) +test_2 (_mm_blend_ps, __m128, __m128, __m128, 1) +test_2 (_mm_blend_pd, __m128d, __m128d, __m128d, 1) +test_2 (_mm_dp_ps, __m128, __m128, __m128, 1) +test_2 (_mm_dp_pd, __m128d, __m128d, __m128d, 1) +test_2 (_mm_insert_ps, __m128, __m128, __m128, 1) +test_1 (_mm_extract_ps, int, __m128, 1) +test_2 (_mm_insert_epi8, __m128i, __m128i, int, 1) +test_2 (_mm_insert_epi32, __m128i, __m128i, int, 1) +#ifdef __x86_64__ +test_2 (_mm_insert_epi64, __m128i, __m128i, long long, 1) +#endif +test_1 (_mm_extract_epi8, int, __m128i, 1) +test_1 (_mm_extract_epi32, int, __m128i, 1) +#ifdef __x86_64__ +test_1 (_mm_extract_epi64, long long, __m128i, 1) +#endif +test_2 (_mm_mpsadbw_epu8, __m128i, __m128i, __m128i, 1) +test_2 (_mm_cmpistrm, __m128i, __m128i, __m128i, 1) +test_2 (_mm_cmpistri, int, __m128i, __m128i, 1) +test_4 (_mm_cmpestrm, __m128i, __m128i, int, __m128i, int, 1) +test_4 (_mm_cmpestri, int, __m128i, int, __m128i, int, 1) +test_2 (_mm_cmpistra, int, __m128i, __m128i, 1) +test_2 (_mm_cmpistrc, int, __m128i, __m128i, 1) +test_2 (_mm_cmpistro, int, __m128i, __m128i, 1) +test_2 (_mm_cmpistrs, int, __m128i, __m128i, 1) +test_2 (_mm_cmpistrz, int, __m128i, __m128i, 1) +test_4 (_mm_cmpestra, int, __m128i, int, __m128i, int, 1) +test_4 (_mm_cmpestrc, int, __m128i, int, __m128i, int, 1) +test_4 (_mm_cmpestro, int, __m128i, int, __m128i, int, 1) +test_4 (_mm_cmpestrs, int, __m128i, int, __m128i, int, 1) +test_4 (_mm_cmpestrz, int, __m128i, int, __m128i, int, 1) + +/* wmmintrin.h (AES/PCLMUL). */ +#ifdef DIFFERENT_PRAGMAS +#pragma GCC target ("aes,pclmul") +#endif +#include <wmmintrin.h> +test_1 (_mm_aeskeygenassist_si128, __m128i, __m128i, 1) +test_2 (_mm_clmulepi64_si128, __m128i, __m128i, __m128i, 1) + +/* smmintrin.h (SSE4.1). */ +test_1 (_mm_round_pd, __m128d, __m128d, 1) +test_1 (_mm_round_ps, __m128, __m128, 1) +test_2 (_mm_round_sd, __m128d, __m128d, __m128d, 1) +test_2 (_mm_round_ss, __m128, __m128, __m128, 1) |
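
For reference, a minimal sketch of how a testcase could consume the support added here; it is illustrative only, not part of this patch. It assumes the fma4 effective-target keyword registered in i386.exp above is usable via dg-require-effective-target, and that x86intrin.h exposes the FMA4 intrinsics (such as _mm_macc_ps, which wraps __builtin_ia32_vfmaddps as in the i386.exp probe) once __FMA4__ is defined.

/* Illustrative sketch, not part of this commit; modeled on funcspec-9.c
   and the check_effective_target_fma4 helper above.  */
/* { dg-do compile } */
/* { dg-require-effective-target fma4 } */
/* { dg-options "-O2 -mfpmath=sse -mfma4" } */

#include <x86intrin.h>

/* _mm_macc_ps maps to __builtin_ia32_vfmaddps, so a single vfmaddps
   instruction is expected for this function.  */
__m128
multiply_accumulate (__m128 a, __m128 b, __m128 c)
{
  return _mm_macc_ps (a, b, c);
}

/* { dg-final { scan-assembler "vfmaddps" } } */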