-rw-r--r--  gcc/ChangeLog | 122
-rw-r--r--  gcc/config.gcc | 10
-rw-r--r--  gcc/config/i386/cpuid.h | 1
-rw-r--r--  gcc/config/i386/fma4intrin.h | 245
-rw-r--r--  gcc/config/i386/i386-c.c | 2
-rw-r--r--  gcc/config/i386/i386-protos.h | 3
-rw-r--r--  gcc/config/i386/i386.c | 454
-rw-r--r--  gcc/config/i386/i386.h | 9
-rw-r--r--  gcc/config/i386/i386.md | 4
-rw-r--r--  gcc/config/i386/i386.opt | 4
-rw-r--r--  gcc/config/i386/sse.md | 936
-rw-r--r--  gcc/config/i386/x86intrin.h | 6
-rw-r--r--  gcc/doc/extend.texi | 45
-rw-r--r--  gcc/doc/invoke.texi | 6
-rw-r--r--  gcc/testsuite/ChangeLog | 53
-rw-r--r--  gcc/testsuite/g++.dg/other/i386-2.C | 6
-rw-r--r--  gcc/testsuite/g++.dg/other/i386-3.C | 6
-rw-r--r--  gcc/testsuite/g++.dg/other/i386-5.C | 6
-rw-r--r--  gcc/testsuite/g++.dg/other/i386-6.C | 6
-rw-r--r--  gcc/testsuite/gcc.target/i386/fma4-256-maccXX.c | 96
-rw-r--r--  gcc/testsuite/gcc.target/i386/fma4-256-msubXX.c | 96
-rw-r--r--  gcc/testsuite/gcc.target/i386/fma4-256-nmaccXX.c | 96
-rw-r--r--  gcc/testsuite/gcc.target/i386/fma4-256-nmsubXX.c | 95
-rw-r--r--  gcc/testsuite/gcc.target/i386/fma4-256-vector.c | 93
-rw-r--r--  gcc/testsuite/gcc.target/i386/fma4-check.h | 20
-rw-r--r--  gcc/testsuite/gcc.target/i386/fma4-fma.c | 83
-rw-r--r--  gcc/testsuite/gcc.target/i386/fma4-maccXX.c | 136
-rw-r--r--  gcc/testsuite/gcc.target/i386/fma4-msubXX.c | 134
-rw-r--r--  gcc/testsuite/gcc.target/i386/fma4-nmaccXX.c | 137
-rw-r--r--  gcc/testsuite/gcc.target/i386/fma4-nmsubXX.c | 137
-rw-r--r--  gcc/testsuite/gcc.target/i386/fma4-vector.c | 93
-rw-r--r--  gcc/testsuite/gcc.target/i386/funcspec-2.c | 99
-rw-r--r--  gcc/testsuite/gcc.target/i386/funcspec-4.c | 3
-rw-r--r--  gcc/testsuite/gcc.target/i386/funcspec-5.c | 2
-rw-r--r--  gcc/testsuite/gcc.target/i386/funcspec-6.c | 2
-rw-r--r--  gcc/testsuite/gcc.target/i386/funcspec-8.c | 19
-rw-r--r--  gcc/testsuite/gcc.target/i386/funcspec-9.c | 36
-rw-r--r--  gcc/testsuite/gcc.target/i386/i386.exp | 14
-rw-r--r--  gcc/testsuite/gcc.target/i386/isa-1.c | 6
-rw-r--r--  gcc/testsuite/gcc.target/i386/isa-10.c | 34
-rw-r--r--  gcc/testsuite/gcc.target/i386/isa-11.c | 34
-rw-r--r--  gcc/testsuite/gcc.target/i386/isa-12.c | 34
-rw-r--r--  gcc/testsuite/gcc.target/i386/isa-13.c | 34
-rw-r--r--  gcc/testsuite/gcc.target/i386/isa-14.c | 5
-rw-r--r--  gcc/testsuite/gcc.target/i386/isa-2.c | 37
-rw-r--r--  gcc/testsuite/gcc.target/i386/isa-3.c | 37
-rw-r--r--  gcc/testsuite/gcc.target/i386/isa-4.c | 37
-rw-r--r--  gcc/testsuite/gcc.target/i386/isa-5.c | 6
-rw-r--r--  gcc/testsuite/gcc.target/i386/isa-6.c | 6
-rw-r--r--  gcc/testsuite/gcc.target/i386/isa-7.c | 37
-rw-r--r--  gcc/testsuite/gcc.target/i386/isa-8.c | 34
-rw-r--r--  gcc/testsuite/gcc.target/i386/isa-9.c | 34
-rw-r--r--  gcc/testsuite/gcc.target/i386/sse-12.c | 8
-rw-r--r--  gcc/testsuite/gcc.target/i386/sse-13.c | 128
-rw-r--r--  gcc/testsuite/gcc.target/i386/sse-14.c | 157
-rw-r--r--  gcc/testsuite/gcc.target/i386/sse-22.c | 161
56 files changed, 4119 insertions, 25 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index b8c0905..2fce0d5 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,125 @@
+2009-09-29 Harsha Jagasia <harsha.jagasia@amd.com>
+
+ * config.gcc (i[34567]86-*-*): Include fma4intrin.h.
+ (x86_64-*-*): Ditto.
+
+ * config/i386/fma4intrin.h: New file, provide common x86 compiler
+ intrinsics for FMA4.
+ * config/i386/cpuid.h (bit_FMA4): Define FMA4 bit.
+ * config/i386/x86intrin.h: Fix typo to SSE4A instead of SSE4a.
+ Add FMA4 check and fma4intrin.h.
+ * config/i386/i386-c.c (ix86_target_macros_internal): Check
+ ISA_FLAG for FMA4.
+ * config/i386/i386.h (TARGET_FMA4): New macro for FMA4.
+ * config/i386/i386.md (UNSPEC_FMA4_INTRINSIC): Add new UNSPEC
+ constant for FMA4 support.
+ (UNSPEC_FMA4_FMADDSUB): Ditto.
+ (UNSPEC_FMA4_FMSUBADD): Ditto.
+ * config/i386/i386.opt (-mfma4): New switch for FMA4 support.
+ * config/i386/i386-protos.h (ix86_fma4_valid_op_p): Add
+ declaration.
+ (ix86_expand_fma4_multiple_memory): Ditto.
+ * config/i386/i386.c (OPTION_MASK_ISA_FMA4_SET): New.
+ (OPTION_MASK_ISA_FMA4_UNSET): New.
+ (OPTION_MASK_ISA_SSE4A_UNSET): Change definition to
+ depend on FMA4.
+ (OPTION_MASK_ISA_AVX_UNSET): Change definition to
+ depend on FMA4.
+ (ix86_handle_option): Handle -mfma4.
+ (isa_opts): Handle -mfma4.
+ (enum pta_flags): Add PTA_FMA4.
+ (override_options): Add FMA4 support.
+ (IX86_BUILTIN_VFMADDSS): New for FMA4 intrinsic.
+ (IX86_BUILTIN_VFMADDSD): Ditto.
+ (IX86_BUILTIN_VFMADDPS): Ditto.
+ (IX86_BUILTIN_VFMADDPD): Ditto.
+ (IX86_BUILTIN_VFMSUBSS): Ditto.
+ (IX86_BUILTIN_VFMSUBSD): Ditto.
+ (IX86_BUILTIN_VFMSUBPS): Ditto.
+ (IX86_BUILTIN_VFMSUBPD): Ditto.
+ (IX86_BUILTIN_VFMADDSUBPS): Ditto.
+ (IX86_BUILTIN_VFMADDSUBPD): Ditto.
+ (IX86_BUILTIN_VFMSUBADDPS): Ditto.
+ (IX86_BUILTIN_VFMSUBADDPD): Ditto.
+ (IX86_BUILTIN_VFNMADDSS): Ditto.
+ (IX86_BUILTIN_VFNMADDSD): Ditto.
+ (IX86_BUILTIN_VFNMADDPS): Ditto.
+ (IX86_BUILTIN_VFNMADDPD): Ditto.
+ (IX86_BUILTIN_VFNMSUBSS): Ditto.
+ (IX86_BUILTIN_VFNMSUBSD): Ditto.
+ (IX86_BUILTIN_VFNMSUBPS): Ditto.
+ (IX86_BUILTIN_VFNMSUBPD): Ditto.
+ (IX86_BUILTIN_VFMADDPS256): Ditto.
+ (IX86_BUILTIN_VFMADDPD256): Ditto.
+ (IX86_BUILTIN_VFMSUBPS256): Ditto.
+ (IX86_BUILTIN_VFMSUBPD256): Ditto.
+ (IX86_BUILTIN_VFMADDSUBPS256): Ditto.
+ (IX86_BUILTIN_VFMADDSUBPD256): Ditto.
+ (IX86_BUILTIN_VFMSUBADDPS256): Ditto.
+ (IX86_BUILTIN_VFMSUBADDPD256): Ditto.
+ (IX86_BUILTIN_VFNMADDPS256): Ditto.
+ (IX86_BUILTIN_VFNMADDPD256): Ditto.
+ (IX86_BUILTIN_VFNMSUBPS256): Ditto.
+ (IX86_BUILTIN_VFNMSUBPD256): Ditto.
+ (enum multi_arg_type): New enum for describing the various FMA4
+ intrinsic argument types.
+ (bdesc_multi_arg): New table for FMA4 intrinsics.
+ (ix86_init_mmx_sse_builtins): Add FMA4 intrinsic support.
+ (ix86_expand_multi_arg_builtin): New function for creating FMA4
+ intrinsics.
+ (ix86_expand_builtin): Add FMA4 intrinsic support.
+ (ix86_fma4_valid_op_p): New function to validate FMA4 3 and 4
+ operand instructions.
+ (ix86_expand_fma4_multiple_memory): New function to split the
+ second memory reference from FMA4 instructions.
+ * config/i386/sse.md (ssemodesuffixf4): New mode attribute for FMA4.
+ (ssemodesuffixf2s): Ditto.
+ (fma4_fmadd<mode>4): Add FMA4 floating point multiply/add
+ instructions.
+ (fma4_fmsub<mode>4): Ditto.
+ (fma4_fnmadd<mode>4): Ditto.
+ (fma4_fnmsub<mode>4): Ditto.
+ (fma4_vmfmadd<mode>4): Ditto.
+ (fma4_vmfmsub<mode>4): Ditto.
+ (fma4_vmfnmadd<mode>4): Ditto.
+ (fma4_vmfnmsub<mode>4): Ditto.
+ (fma4_fmadd<mode>4256): Ditto.
+ (fma4_fmsub<mode>4256): Ditto.
+ (fma4_fnmadd<mode>4256): Ditto.
+ (fma4_fnmsub<mode>4256): Ditto.
+ (fma4_fmaddsubv8sf4): Ditto.
+ (fma4_fmaddsubv4sf4): Ditto.
+ (fma4_fmaddsubv4df4): Ditto.
+ (fma4_fmaddsubv2df4): Ditto.
+ (fma4_fmsubaddv8sf4): Ditto.
+ (fma4_fmsubaddv4sf4): Ditto.
+ (fma4_fmsubaddv4df4): Ditto.
+ (fma4_fmsubaddv2df4): Ditto.
+ (fma4i_fmadd<mode>4): Add FMA4 floating point multiply/add
+ instructions for intrinsics.
+ (fma4i_fmsub<mode>4): Ditto.
+ (fma4i_fnmadd<mode>4): Ditto.
+ (fma4i_fnmsub<mode>4): Ditto.
+ (fma4i_vmfmadd<mode>4): Ditto.
+ (fma4i_vmfmsub<mode>4): Ditto.
+ (fma4i_vmfnmadd<mode>4): Ditto.
+ (fma4i_vmfnmsub<mode>4): Ditto.
+ (fma4i_fmadd<mode>4256): Ditto.
+ (fma4i_fmsub<mode>4256): Ditto.
+ (fma4i_fnmadd<mode>4256): Ditto.
+ (fma4i_fnmsub<mode>4256): Ditto.
+ (fma4i_fmaddsubv8sf4): Ditto.
+ (fma4i_fmaddsubv4sf4): Ditto.
+ (fma4i_fmaddsubv4df4): Ditto.
+ (fma4i_fmaddsubv2df4): Ditto.
+ (fma4i_fmsubaddv8sf4): Ditto.
+ (fma4i_fmsubaddv4sf4): Ditto.
+ (fma4i_fmsubaddv4df4): Ditto.
+ (fma4i_fmsubaddv2df4): Ditto.
+
+ * doc/invoke.texi (-mfma4): Add documentation.
+ * doc/extend.texi (x86 intrinsics): Add FMA4 intrinsics.
+
2009-09-29 Richard Henderson <rth@redhat.com>
* tree-eh.c (unsplit_eh): Do not unsplit if there's already
diff --git a/gcc/config.gcc b/gcc/config.gcc
index 5ad5770..6351aa5 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -286,8 +286,9 @@ i[34567]86-*-*)
cxx_target_objs="i386-c.o"
extra_headers="cpuid.h mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h
pmmintrin.h tmmintrin.h ammintrin.h smmintrin.h
- nmmintrin.h bmmintrin.h wmmintrin.h immintrin.h
- x86intrin.h avxintrin.h ia32intrin.h cross-stdarg.h"
+ nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h
+ immintrin.h x86intrin.h avxintrin.h
+ ia32intrin.h cross-stdarg.h"
;;
x86_64-*-*)
cpu_type=i386
@@ -295,8 +296,9 @@ x86_64-*-*)
cxx_target_objs="i386-c.o"
extra_headers="cpuid.h mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h
pmmintrin.h tmmintrin.h ammintrin.h smmintrin.h
- nmmintrin.h bmmintrin.h wmmintrin.h immintrin.h
- x86intrin.h avxintrin.h ia32intrin.h cross-stdarg.h"
+ nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h
+ immintrin.h x86intrin.h avxintrin.h
+ ia32intrin.h cross-stdarg.h"
need_64bit_hwint=yes
;;
ia64-*-*)
diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h
index 61a3dca..49acfa7 100644
--- a/gcc/config/i386/cpuid.h
+++ b/gcc/config/i386/cpuid.h
@@ -48,6 +48,7 @@
/* %ecx */
#define bit_LAHF_LM (1 << 0)
#define bit_SSE4a (1 << 6)
+#define bit_FMA4 (1 << 16)
/* %edx */
#define bit_LM (1 << 29)
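
(Aside, not part of the patch: bit_FMA4 is tested against the %ecx result of
CPUID leaf 0x80000001, next to bit_SSE4a above.  A minimal runtime check using
the __get_cpuid helper from this header could look like the sketch below; the
function name has_fma4 is illustrative only.)

	#include <cpuid.h>

	/* Sketch: return nonzero if the CPU advertises FMA4
	   (CPUID leaf 0x80000001, %ecx bit 16, i.e. bit_FMA4).  */
	static int
	has_fma4 (void)
	{
	  unsigned int eax, ebx, ecx, edx;
	  if (!__get_cpuid (0x80000001, &eax, &ebx, &ecx, &edx))
	    return 0;
	  return (ecx & bit_FMA4) != 0;
	}
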
diff --git a/gcc/config/i386/fma4intrin.h b/gcc/config/i386/fma4intrin.h
new file mode 100644
index 0000000..42782ad
--- /dev/null
+++ b/gcc/config/i386/fma4intrin.h
@@ -0,0 +1,245 @@
+/* Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86INTRIN_H_INCLUDED
+# error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _FMA4INTRIN_H_INCLUDED
+#define _FMA4INTRIN_H_INCLUDED
+
+#ifndef __FMA4__
+# error "FMA4 instruction set not enabled"
+#else
+
+/* We need definitions from the SSE4A, SSE3, SSE2 and SSE header files. */
+#include <ammintrin.h>
+
+/* Internal data types for implementing the intrinsics. */
+typedef float __v8sf __attribute__ ((__vector_size__ (32)));
+typedef double __v4df __attribute__ ((__vector_size__ (32)));
+
+typedef float __m256 __attribute__ ((__vector_size__ (32),
+ __may_alias__));
+typedef double __m256d __attribute__ ((__vector_size__ (32),
+ __may_alias__));
+
+/* 128b Floating point multiply/add type instructions. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msub_ps (__m128 __A, __m128 __B, __m128 __C)
+
+{
+ return (__m128) __builtin_ia32_vfmsubps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmsubpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msub_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmsubss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msub_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmsubsd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmacc_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfnmaddps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmacc_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfnmaddpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmacc_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfnmaddss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmacc_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfnmaddsd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmsub_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfnmsubps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmsub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfnmsubpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmsub_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfnmsubss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmsub_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfnmsubsd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddsub_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddsub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msubadd_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmsubaddps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msubadd_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmsubaddpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+/* 256b Floating point multiply/add type instructions. */
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_macc_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256) __builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_macc_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d) __builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_msub_ps (__m256 __A, __m256 __B, __m256 __C)
+
+{
+ return (__m256) __builtin_ia32_vfmsubps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_msub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d) __builtin_ia32_vfmsubpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_nmacc_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256) __builtin_ia32_vfnmaddps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_nmacc_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d) __builtin_ia32_vfnmaddpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_nmsub_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256) __builtin_ia32_vfnmsubps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_nmsub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d) __builtin_ia32_vfnmsubpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maddsub_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256) __builtin_ia32_vfmaddsubps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maddsub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d) __builtin_ia32_vfmaddsubpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_msubadd_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256) __builtin_ia32_vfmsubaddps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_msubadd_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d) __builtin_ia32_vfmsubaddpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+#endif
+
+#endif
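
(Aside, not part of the patch: these intrinsics are reached through
<x86intrin.h> once -mfma4 defines __FMA4__.  A minimal, hypothetical use of the
new _mm_macc_ps intrinsic is sketched below.)

	#include <x86intrin.h>

	/* Sketch: compute a*b + c on four packed floats with the new
	   _mm_macc_ps intrinsic; compile with -mfma4.  */
	__m128
	macc_example (__m128 a, __m128 b, __m128 c)
	{
	  return _mm_macc_ps (a, b, c);
	}
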
diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c
index 4c960e7..12a3f17 100644
--- a/gcc/config/i386/i386-c.c
+++ b/gcc/config/i386/i386-c.c
@@ -230,6 +230,8 @@ ix86_target_macros_internal (int isa_flag,
def_or_undef (parse_in, "__FMA__");
if (isa_flag & OPTION_MASK_ISA_SSE4A)
def_or_undef (parse_in, "__SSE4A__");
+ if (isa_flag & OPTION_MASK_ISA_FMA4)
+ def_or_undef (parse_in, "__FMA4__");
if ((fpmath & FPMATH_SSE) && (isa_flag & OPTION_MASK_ISA_SSE))
def_or_undef (parse_in, "__SSE_MATH__");
if ((fpmath & FPMATH_SSE) && (isa_flag & OPTION_MASK_ISA_SSE2))
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 389fc3c..58da131 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -214,6 +214,9 @@ extern void ix86_expand_vector_set (bool, rtx, rtx, int);
extern void ix86_expand_vector_extract (bool, rtx, rtx, int);
extern void ix86_expand_reduc_v4sf (rtx (*)(rtx, rtx, rtx), rtx, rtx);
+extern bool ix86_fma4_valid_op_p (rtx [], rtx, int, bool, int, bool);
+extern void ix86_expand_fma4_multiple_memory (rtx [], int, enum machine_mode);
+
/* In i386-c.c */
extern void ix86_target_macros (void);
extern void ix86_register_pragmas (void);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 3b11b91..9df01ba 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1955,6 +1955,9 @@ static int ix86_isa_flags_explicit;
#define OPTION_MASK_ISA_SSE4A_SET \
(OPTION_MASK_ISA_SSE4A | OPTION_MASK_ISA_SSE3_SET)
+#define OPTION_MASK_ISA_FMA4_SET \
+ (OPTION_MASK_ISA_FMA4 | OPTION_MASK_ISA_SSE4A_SET \
+ | OPTION_MASK_ISA_AVX_SET)
/* AES and PCLMUL need SSE2 because they use xmm registers */
#define OPTION_MASK_ISA_AES_SET \
@@ -1995,7 +1998,8 @@ static int ix86_isa_flags_explicit;
#define OPTION_MASK_ISA_SSE4_2_UNSET \
(OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_AVX_UNSET )
#define OPTION_MASK_ISA_AVX_UNSET \
- (OPTION_MASK_ISA_AVX | OPTION_MASK_ISA_FMA_UNSET)
+ (OPTION_MASK_ISA_AVX | OPTION_MASK_ISA_FMA_UNSET \
+ | OPTION_MASK_ISA_FMA4_UNSET)
#define OPTION_MASK_ISA_FMA_UNSET OPTION_MASK_ISA_FMA
/* SSE4 includes both SSE4.1 and SSE4.2. -mno-sse4 should the same
@@ -2003,7 +2007,10 @@ static int ix86_isa_flags_explicit;
#define OPTION_MASK_ISA_SSE4_UNSET OPTION_MASK_ISA_SSE4_1_UNSET
#define OPTION_MASK_ISA_SSE4A_UNSET \
- (OPTION_MASK_ISA_SSE4A)
+ (OPTION_MASK_ISA_SSE4A | OPTION_MASK_ISA_FMA4_UNSET)
+
+#define OPTION_MASK_ISA_FMA4_UNSET OPTION_MASK_ISA_FMA4
+
#define OPTION_MASK_ISA_AES_UNSET OPTION_MASK_ISA_AES
#define OPTION_MASK_ISA_PCLMUL_UNSET OPTION_MASK_ISA_PCLMUL
#define OPTION_MASK_ISA_ABM_UNSET OPTION_MASK_ISA_ABM
@@ -2237,6 +2244,19 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value)
}
return true;
+ case OPT_mfma4:
+ if (value)
+ {
+ ix86_isa_flags |= OPTION_MASK_ISA_FMA4_SET;
+ ix86_isa_flags_explicit |= OPTION_MASK_ISA_FMA4_SET;
+ }
+ else
+ {
+ ix86_isa_flags &= ~OPTION_MASK_ISA_FMA4_UNSET;
+ ix86_isa_flags_explicit |= OPTION_MASK_ISA_FMA4_UNSET;
+ }
+ return true;
+
case OPT_mabm:
if (value)
{
@@ -2364,6 +2384,7 @@ ix86_target_string (int isa, int flags, const char *arch, const char *tune,
static struct ix86_target_opts isa_opts[] =
{
{ "-m64", OPTION_MASK_ISA_64BIT },
+ { "-mfma4", OPTION_MASK_ISA_FMA4 },
{ "-msse4a", OPTION_MASK_ISA_SSE4A },
{ "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
{ "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
@@ -2593,7 +2614,8 @@ override_options (bool main_args_p)
PTA_PCLMUL = 1 << 17,
PTA_AVX = 1 << 18,
PTA_FMA = 1 << 19,
- PTA_MOVBE = 1 << 20
+ PTA_MOVBE = 1 << 20,
+ PTA_FMA4 = 1 << 21
};
static struct pta
@@ -2936,6 +2958,9 @@ override_options (bool main_args_p)
if (processor_alias_table[i].flags & PTA_SSE4A
&& !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
+ if (processor_alias_table[i].flags & PTA_FMA4
+ && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
+ ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
if (processor_alias_table[i].flags & PTA_ABM
&& !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
ix86_isa_flags |= OPTION_MASK_ISA_ABM;
@@ -3619,6 +3644,7 @@ ix86_valid_target_attribute_inner_p (tree args, char *p_strings[])
IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
IX86_ATTR_ISA ("sse4a", OPT_msse4a),
IX86_ATTR_ISA ("ssse3", OPT_mssse3),
+ IX86_ATTR_ISA ("fma4", OPT_mfma4),
/* string options */
IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
@@ -20686,6 +20712,39 @@ enum ix86_builtins
IX86_BUILTIN_CVTUDQ2PS,
+ /* FMA4 instructions. */
+ IX86_BUILTIN_VFMADDSS,
+ IX86_BUILTIN_VFMADDSD,
+ IX86_BUILTIN_VFMADDPS,
+ IX86_BUILTIN_VFMADDPD,
+ IX86_BUILTIN_VFMSUBSS,
+ IX86_BUILTIN_VFMSUBSD,
+ IX86_BUILTIN_VFMSUBPS,
+ IX86_BUILTIN_VFMSUBPD,
+ IX86_BUILTIN_VFMADDSUBPS,
+ IX86_BUILTIN_VFMADDSUBPD,
+ IX86_BUILTIN_VFMSUBADDPS,
+ IX86_BUILTIN_VFMSUBADDPD,
+ IX86_BUILTIN_VFNMADDSS,
+ IX86_BUILTIN_VFNMADDSD,
+ IX86_BUILTIN_VFNMADDPS,
+ IX86_BUILTIN_VFNMADDPD,
+ IX86_BUILTIN_VFNMSUBSS,
+ IX86_BUILTIN_VFNMSUBSD,
+ IX86_BUILTIN_VFNMSUBPS,
+ IX86_BUILTIN_VFNMSUBPD,
+ IX86_BUILTIN_VFMADDPS256,
+ IX86_BUILTIN_VFMADDPD256,
+ IX86_BUILTIN_VFMSUBPS256,
+ IX86_BUILTIN_VFMSUBPD256,
+ IX86_BUILTIN_VFMADDSUBPS256,
+ IX86_BUILTIN_VFMADDSUBPD256,
+ IX86_BUILTIN_VFMSUBADDPS256,
+ IX86_BUILTIN_VFMSUBADDPD256,
+ IX86_BUILTIN_VFNMADDPS256,
+ IX86_BUILTIN_VFNMADDPD256,
+ IX86_BUILTIN_VFNMSUBPS256,
+ IX86_BUILTIN_VFNMSUBPD256,
IX86_BUILTIN_MAX
};
@@ -21759,6 +21818,56 @@ static const struct builtin_description bdesc_args[] =
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
};
+/* FMA4. */
+enum multi_arg_type {
+ MULTI_ARG_UNKNOWN,
+ MULTI_ARG_3_SF,
+ MULTI_ARG_3_DF,
+ MULTI_ARG_3_SF2,
+ MULTI_ARG_3_DF2
+};
+
+static const struct builtin_description bdesc_multi_arg[] =
+{
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmaddv4sf4, "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS, UNKNOWN, (int)MULTI_ARG_3_SF },
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmaddv2df4, "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD, UNKNOWN, (int)MULTI_ARG_3_DF },
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddv4sf4, "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS, UNKNOWN, (int)MULTI_ARG_3_SF },
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddv2df4, "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD, UNKNOWN, (int)MULTI_ARG_3_DF },
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmsubv4sf4, "__builtin_ia32_vfmsubss", IX86_BUILTIN_VFMSUBSS, UNKNOWN, (int)MULTI_ARG_3_SF },
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmsubv2df4, "__builtin_ia32_vfmsubsd", IX86_BUILTIN_VFMSUBSD, UNKNOWN, (int)MULTI_ARG_3_DF },
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubv4sf4, "__builtin_ia32_vfmsubps", IX86_BUILTIN_VFMSUBPS, UNKNOWN, (int)MULTI_ARG_3_SF },
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubv2df4, "__builtin_ia32_vfmsubpd", IX86_BUILTIN_VFMSUBPD, UNKNOWN, (int)MULTI_ARG_3_DF },
+
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfnmaddv4sf4, "__builtin_ia32_vfnmaddss", IX86_BUILTIN_VFNMADDSS, UNKNOWN, (int)MULTI_ARG_3_SF },
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfnmaddv2df4, "__builtin_ia32_vfnmaddsd", IX86_BUILTIN_VFNMADDSD, UNKNOWN, (int)MULTI_ARG_3_DF },
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmaddv4sf4, "__builtin_ia32_vfnmaddps", IX86_BUILTIN_VFNMADDPS, UNKNOWN, (int)MULTI_ARG_3_SF },
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmaddv2df4, "__builtin_ia32_vfnmaddpd", IX86_BUILTIN_VFNMADDPD, UNKNOWN, (int)MULTI_ARG_3_DF },
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfnmsubv4sf4, "__builtin_ia32_vfnmsubss", IX86_BUILTIN_VFNMSUBSS, UNKNOWN, (int)MULTI_ARG_3_SF },
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfnmsubv2df4, "__builtin_ia32_vfnmsubsd", IX86_BUILTIN_VFNMSUBSD, UNKNOWN, (int)MULTI_ARG_3_DF },
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmsubv4sf4, "__builtin_ia32_vfnmsubps", IX86_BUILTIN_VFNMSUBPS, UNKNOWN, (int)MULTI_ARG_3_SF },
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmsubv2df4, "__builtin_ia32_vfnmsubpd", IX86_BUILTIN_VFNMSUBPD, UNKNOWN, (int)MULTI_ARG_3_DF },
+
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddsubv4sf4, "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS, UNKNOWN, (int)MULTI_ARG_3_SF },
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddsubv2df4, "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD, UNKNOWN, (int)MULTI_ARG_3_DF },
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubaddv4sf4, "__builtin_ia32_vfmsubaddps", IX86_BUILTIN_VFMSUBADDPS, UNKNOWN, (int)MULTI_ARG_3_SF },
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubaddv2df4, "__builtin_ia32_vfmsubaddpd", IX86_BUILTIN_VFMSUBADDPD, UNKNOWN, (int)MULTI_ARG_3_DF },
+
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddv8sf4256, "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddv4df4256, "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubv8sf4256, "__builtin_ia32_vfmsubps256", IX86_BUILTIN_VFMSUBPS256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubv4df4256, "__builtin_ia32_vfmsubpd256", IX86_BUILTIN_VFMSUBPD256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
+
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmaddv8sf4256, "__builtin_ia32_vfnmaddps256", IX86_BUILTIN_VFNMADDPS256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmaddv4df4256, "__builtin_ia32_vfnmaddpd256", IX86_BUILTIN_VFNMADDPD256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmsubv8sf4256, "__builtin_ia32_vfnmsubps256", IX86_BUILTIN_VFNMSUBPS256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmsubv4df4256, "__builtin_ia32_vfnmsubpd256", IX86_BUILTIN_VFNMSUBPD256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
+
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddsubv8sf4, "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddsubv4df4, "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubaddv8sf4, "__builtin_ia32_vfmsubaddps256", IX86_BUILTIN_VFMSUBADDPS256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
+ { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubaddv4df4, "__builtin_ia32_vfmsubaddpd256", IX86_BUILTIN_VFMSUBADDPD256, UNKNOWN, (int)MULTI_ARG_3_DF2 }
+
+};
/* Set up all the MMX/SSE builtins, even builtins for instructions that are not
in the current target ISA to allow the user to compile particular modules
@@ -23192,6 +23301,29 @@ ix86_init_mmx_sse_builtins (void)
intQI_type_node,
integer_type_node, NULL_TREE);
def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi", ftype, IX86_BUILTIN_VEC_SET_V16QI);
+ /* Add FMA4 multi-argument builtin functions.  */
+ for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
+ {
+ tree mtype = NULL_TREE;
+
+ if (d->name == 0)
+ continue;
+
+ switch ((enum multi_arg_type)d->flag)
+ {
+ case MULTI_ARG_3_SF: mtype = v4sf_ftype_v4sf_v4sf_v4sf; break;
+ case MULTI_ARG_3_DF: mtype = v2df_ftype_v2df_v2df_v2df; break;
+ case MULTI_ARG_3_SF2: mtype = v8sf_ftype_v8sf_v8sf_v8sf; break;
+ case MULTI_ARG_3_DF2: mtype = v4df_ftype_v4df_v4df_v4df; break;
+
+ case MULTI_ARG_UNKNOWN:
+ default:
+ gcc_unreachable ();
+ }
+
+ if (mtype)
+ def_builtin_const (d->mask, d->name, mtype, d->code);
+ }
}
/* Internal method for ix86_init_builtins. */
@@ -23364,6 +23496,122 @@ ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
return target;
}
+/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
+
+static rtx
+ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
+ enum multi_arg_type m_type,
+ enum rtx_code sub_code)
+{
+ rtx pat;
+ int i;
+ int nargs;
+ bool comparison_p = false;
+ bool tf_p = false;
+ bool last_arg_constant = false;
+ int num_memory = 0;
+ struct {
+ rtx op;
+ enum machine_mode mode;
+ } args[4];
+
+ enum machine_mode tmode = insn_data[icode].operand[0].mode;
+
+ switch (m_type)
+ {
+ case MULTI_ARG_3_SF:
+ case MULTI_ARG_3_DF:
+ case MULTI_ARG_3_SF2:
+ case MULTI_ARG_3_DF2:
+ nargs = 3;
+ break;
+
+ case MULTI_ARG_UNKNOWN:
+ default:
+ gcc_unreachable ();
+ }
+
+ if (optimize || !target
+ || GET_MODE (target) != tmode
+ || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
+ target = gen_reg_rtx (tmode);
+
+ gcc_assert (nargs <= 4);
+
+ for (i = 0; i < nargs; i++)
+ {
+ tree arg = CALL_EXPR_ARG (exp, i);
+ rtx op = expand_normal (arg);
+ int adjust = (comparison_p) ? 1 : 0;
+ enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
+
+ if (last_arg_constant && i == nargs-1)
+ {
+ if (!CONST_INT_P (op))
+ {
+ error ("last argument must be an immediate");
+ return gen_reg_rtx (tmode);
+ }
+ }
+ else
+ {
+ if (VECTOR_MODE_P (mode))
+ op = safe_vector_operand (op, mode);
+
+ /* If we aren't optimizing, only allow one memory operand to be
+ generated. */
+ if (memory_operand (op, mode))
+ num_memory++;
+
+ gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
+
+ if (optimize
+ || ! (*insn_data[icode].operand[i+adjust+1].predicate) (op, mode)
+ || num_memory > 1)
+ op = force_reg (mode, op);
+ }
+
+ args[i].op = op;
+ args[i].mode = mode;
+ }
+
+ switch (nargs)
+ {
+ case 1:
+ pat = GEN_FCN (icode) (target, args[0].op);
+ break;
+
+ case 2:
+ if (tf_p)
+ pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
+ GEN_INT ((int)sub_code));
+ else if (! comparison_p)
+ pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
+ else
+ {
+ rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
+ args[0].op,
+ args[1].op);
+
+ pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
+ }
+ break;
+
+ case 3:
+ pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ if (! pat)
+ return 0;
+
+ emit_insn (pat);
+ return target;
+}
+
/* Subroutine of ix86_expand_args_builtin to take care of scalar unop
insns with vec_merge. */
@@ -24633,6 +24881,12 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
if (d->code == fcode)
return ix86_expand_sse_pcmpistr (d, exp, target);
+ for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
+ if (d->code == fcode)
+ return ix86_expand_multi_arg_builtin (d->icode, exp, target,
+ (enum multi_arg_type)d->flag,
+ d->comparison);
+
gcc_unreachable ();
}
@@ -29015,6 +29269,200 @@ ix86_expand_round (rtx operand0, rtx operand1)
emit_move_insn (operand0, res);
}
+/* Check whether an FMA4 instruction has a valid operand combination.
+   OPERANDS is the array of operands.
+   NUM is the number of operands.
+   USES_OC0 is true if the instruction uses OC0 and provides 4 variants.
+   NUM_MEMORY is the maximum number of memory operands to accept.
+   NUM_MEMORY less than zero is a special case that additionally allows
+   the last operand of an instruction to be a memory operand.
+   When COMMUTATIVE is set, operands 1 and 2 can be swapped.  */
+
+bool
+ix86_fma4_valid_op_p (rtx operands[], rtx insn ATTRIBUTE_UNUSED, int num,
+ bool uses_oc0, int num_memory, bool commutative)
+{
+ int mem_mask;
+ int mem_count;
+ int i;
+
+ /* Count the number of memory arguments */
+ mem_mask = 0;
+ mem_count = 0;
+ for (i = 0; i < num; i++)
+ {
+ enum machine_mode mode = GET_MODE (operands[i]);
+ if (register_operand (operands[i], mode))
+ ;
+
+ else if (memory_operand (operands[i], mode))
+ {
+ mem_mask |= (1 << i);
+ mem_count++;
+ }
+
+ else
+ {
+ rtx pattern = PATTERN (insn);
+
+ /* allow 0 for pcmov */
+ if (GET_CODE (pattern) != SET
+ || GET_CODE (SET_SRC (pattern)) != IF_THEN_ELSE
+ || i < 2
+ || operands[i] != CONST0_RTX (mode))
+ return false;
+ }
+ }
+
+ /* Special case pmacsdq{l,h} where we allow the 3rd argument to be
+ a memory operation. */
+ if (num_memory < 0)
+ {
+ num_memory = -num_memory;
+ if ((mem_mask & (1 << (num-1))) != 0)
+ {
+ mem_mask &= ~(1 << (num-1));
+ mem_count--;
+ }
+ }
+
+ /* If there were no memory operations, allow the insn */
+ if (mem_mask == 0)
+ return true;
+
+ /* Do not allow the destination register to be a memory operand. */
+ else if (mem_mask & (1 << 0))
+ return false;
+
+  /* If there are too many memory operations, disallow the instruction.  While
+     the hardware only allows 1 memory reference, for some insns we allow two
+     memory operations before register allocation so that code like the
+     following can be optimized:
+
+ float fmadd (float *a, float *b, float *c) { return (*a * *b) + *c; }
+
+ or similar cases that are vectorized into using the vfmaddss
+ instruction. */
+ else if (mem_count > num_memory)
+ return false;
+
+ /* Don't allow more than one memory operation if not optimizing. */
+ else if (mem_count > 1 && !optimize)
+ return false;
+
+ else if (num == 4 && mem_count == 1)
+ {
+ /* formats (destination is the first argument), example vfmaddss:
+ xmm1, xmm1, xmm2, xmm3/mem
+ xmm1, xmm1, xmm2/mem, xmm3
+ xmm1, xmm2, xmm3/mem, xmm1
+ xmm1, xmm2/mem, xmm3, xmm1 */
+ if (uses_oc0)
+ return ((mem_mask == (1 << 1))
+ || (mem_mask == (1 << 2))
+ || (mem_mask == (1 << 3)));
+
+ /* format, example vpmacsdd:
+ xmm1, xmm2, xmm3/mem, xmm1 */
+ if (commutative)
+ return (mem_mask == (1 << 2) || mem_mask == (1 << 1));
+ else
+ return (mem_mask == (1 << 2));
+ }
+
+ else if (num == 4 && num_memory == 2)
+ {
+      /* If there are two memory operations, we can load one of the memory ops
+	 into the destination register.  This is for optimizing the
+	 multiply/add ops, where the combiner has folded a memory operation into
+	 both the multiply and the add insns.  We have to be careful that the
+	 destination doesn't overlap with the inputs.  */
+ rtx op0 = operands[0];
+
+ if (reg_mentioned_p (op0, operands[1])
+ || reg_mentioned_p (op0, operands[2])
+ || reg_mentioned_p (op0, operands[3]))
+ return false;
+
+ /* formats (destination is the first argument), example vfmaddss:
+ xmm1, xmm1, xmm2, xmm3/mem
+ xmm1, xmm1, xmm2/mem, xmm3
+ xmm1, xmm2, xmm3/mem, xmm1
+ xmm1, xmm2/mem, xmm3, xmm1
+
+ For the oc0 case, we will load either operands[1] or operands[3] into
+ operands[0], so any combination of 2 memory operands is ok. */
+ if (uses_oc0)
+ return true;
+
+ /* format, example vpmacsdd:
+ xmm1, xmm2, xmm3/mem, xmm1
+
+	 For the integer multiply/add instructions, be more restrictive and
+	 require operands[2] and operands[3] to be the memory operands.  */
+      if (commutative)
+	return (mem_mask == ((1 << 1) | (1 << 3))
+		|| mem_mask == ((1 << 2) | (1 << 3)));
+ else
+ return (mem_mask == ((1 << 2) | (1 << 3)));
+ }
+
+ else if (num == 3 && num_memory == 1)
+ {
+ /* formats, example vprotb:
+ xmm1, xmm2, xmm3/mem
+ xmm1, xmm2/mem, xmm3 */
+ if (uses_oc0)
+ return ((mem_mask == (1 << 1)) || (mem_mask == (1 << 2)));
+
+ /* format, example vpcomeq:
+ xmm1, xmm2, xmm3/mem */
+ else
+ return (mem_mask == (1 << 2));
+ }
+
+ else
+ gcc_unreachable ();
+
+ return false;
+}
+
+
+/* Fix up an FMA4 instruction that has 2 memory input references into a form
+   the hardware will allow, by using the destination register to load one of
+   the memory operands.  Presently this is used by the multiply/add routines
+   to allow 2 memory references.  */
+
+void
+ix86_expand_fma4_multiple_memory (rtx operands[],
+ int num,
+ enum machine_mode mode)
+{
+ rtx op0 = operands[0];
+ if (num != 4
+ || memory_operand (op0, mode)
+ || reg_mentioned_p (op0, operands[1])
+ || reg_mentioned_p (op0, operands[2])
+ || reg_mentioned_p (op0, operands[3]))
+ gcc_unreachable ();
+
+ /* For 2 memory operands, pick either operands[1] or operands[3] to move into
+ the destination register. */
+ if (memory_operand (operands[1], mode))
+ {
+ emit_move_insn (op0, operands[1]);
+ operands[1] = op0;
+ }
+ else if (memory_operand (operands[3], mode))
+ {
+ emit_move_insn (op0, operands[3]);
+ operands[3] = op0;
+ }
+ else
+ gcc_unreachable ();
+
+ return;
+}
+
/* Table of valid machine attributes. */
static const struct attribute_spec ix86_attribute_table[] =
{
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index f8259480..8d52572 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -54,6 +54,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
#define TARGET_AVX OPTION_ISA_AVX
#define TARGET_FMA OPTION_ISA_FMA
#define TARGET_SSE4A OPTION_ISA_SSE4A
+#define TARGET_FMA4 OPTION_ISA_FMA4
#define TARGET_ROUND OPTION_ISA_ROUND
#define TARGET_ABM OPTION_ISA_ABM
#define TARGET_POPCNT OPTION_ISA_POPCNT
@@ -65,8 +66,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
#define TARGET_CMPXCHG16B OPTION_ISA_CX16
-/* SSE4.1 define round instructions */
-#define OPTION_MASK_ISA_ROUND (OPTION_MASK_ISA_SSE4_1)
+/* SSE4.1 defines round instructions */
+#define OPTION_MASK_ISA_ROUND OPTION_MASK_ISA_SSE4_1
#define OPTION_ISA_ROUND ((ix86_isa_flags & OPTION_MASK_ISA_ROUND) != 0)
#include "config/vxworks-dummy.h"
@@ -1351,6 +1352,10 @@ enum reg_class
(TARGET_AVX && ((MODE) == V4SFmode || (MODE) == V2DFmode \
|| (MODE) == V8SFmode || (MODE) == V4DFmode))
+#define FMA4_VEC_FLOAT_MODE_P(MODE) \
+ (TARGET_FMA4 && ((MODE) == V4SFmode || (MODE) == V2DFmode \
+ || (MODE) == V8SFmode || (MODE) == V4DFmode))
+
#define MMX_REG_P(XOP) (REG_P (XOP) && MMX_REGNO_P (REGNO (XOP)))
#define MMX_REGNO_P(N) IN_RANGE ((N), FIRST_MMX_REG, LAST_MMX_REG)
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 26bbc9a..5c2564e 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -195,6 +195,10 @@
(UNSPEC_PCMPESTR 144)
(UNSPEC_PCMPISTR 145)
+ ; For FMA4 support
+ (UNSPEC_FMA4_INTRINSIC 150)
+ (UNSPEC_FMA4_FMADDSUB 151)
+ (UNSPEC_FMA4_FMSUBADD 152)
; For AES support
(UNSPEC_AESENC 159)
(UNSPEC_AESENCLAST 160)
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index f23763b..9668ff6 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -310,6 +310,10 @@ msse4a
Target Report Mask(ISA_SSE4A) Var(ix86_isa_flags) VarExists Save
Support MMX, SSE, SSE2, SSE3 and SSE4A built-in functions and code generation
+mfma4
+Target Report Mask(ISA_FMA4) Var(ix86_isa_flags) VarExists Save
+Support FMA4 built-in functions and code generation
+
mabm
Target Report Mask(ISA_ABM) Var(ix86_isa_flags) VarExists Save
Support code generation of Advanced Bit Manipulation (ABM) instructions.
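
(Aside, not part of the patch: -mfma4 only enables the FMA4 builtins and insn
patterns; whether a multiply/add is actually contracted depends on
optimization.  Assuming -O2 -mfma4, the example quoted in the
ix86_fma4_valid_op_p comment above may be combined into a single vfmaddss:)

	/* Sketch: with -O2 -mfma4 the combiner may fold this into one
	   vfmaddss, as described in the ix86_fma4_valid_op_p comments.  */
	float
	fmadd (float *a, float *b, float *c)
	{
	  return (*a * *b) + *c;
	}
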
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 2ddbbf5..e902965 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -49,6 +49,7 @@
(define_mode_iterator SSEMODE248 [V8HI V4SI V2DI])
(define_mode_iterator SSEMODE1248 [V16QI V8HI V4SI V2DI])
(define_mode_iterator SSEMODEF4 [SF DF V4SF V2DF])
+(define_mode_iterator FMA4MODEF4 [V8SF V4DF])
(define_mode_iterator SSEMODEF2P [V4SF V2DF])
(define_mode_iterator AVX256MODEF2P [V8SF V4DF])
@@ -74,6 +75,11 @@
;; Mapping from integer vector mode to mnemonic suffix
(define_mode_attr ssevecsize [(V16QI "b") (V8HI "w") (V4SI "d") (V2DI "q")])
+;; Mapping of the fma4 suffix
+(define_mode_attr fma4modesuffixf4 [(V8SF "ps") (V4DF "pd")])
+(define_mode_attr ssemodesuffixf2s [(SF "ss") (DF "sd")
+ (V4SF "ss") (V2DF "sd")])
+
;; Mapping of the avx suffix
(define_mode_attr ssemodesuffixf4 [(SF "ss") (DF "sd")
(V4SF "ps") (V2DF "pd")])
@@ -1661,6 +1667,936 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
+;; FMA4 floating point multiply/accumulate instructions.  This includes the
+;; scalar versions of the instructions as well as the vector versions.
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; In order to match (*a * *b) + *c, particularly when vectorizing, allow
+;; combine to generate a multiply/add with two memory references. We then
+;; split this insn, into loading up the destination register with one of the
+;; memory operations. If we don't manage to split the insn, reload will
+;; generate the appropriate moves. The reason this is needed, is that combine
+;; has already folded one of the memory references into both the multiply and
+;; add insns, and it can't generate a new pseudo. I.e.:
+;; (set (reg1) (mem (addr1)))
+;; (set (reg2) (mult (reg1) (mem (addr2))))
+;; (set (reg3) (plus (reg2) (mem (addr3))))
+
+(define_insn "fma4_fmadd<mode>4256"
+ [(set (match_operand:FMA4MODEF4 0 "register_operand" "=x,x,x")
+ (plus:FMA4MODEF4
+ (mult:FMA4MODEF4
+ (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "x,x,xm")
+ (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "x,xm,x"))
+ (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "xm,x,x")))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
+ "vfmadd<fma4modesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "<MODE>")])
+
+;; Split fmadd with two memory operands into a load and the fmadd.
+(define_split
+ [(set (match_operand:FMA4MODEF4 0 "register_operand" "")
+ (plus:FMA4MODEF4
+ (mult:FMA4MODEF4
+ (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "")
+ (match_operand:FMA4MODEF4 2 "nonimmediate_operand" ""))
+ (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "")))]
+ "TARGET_FMA4
+ && !ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)
+ && !reg_mentioned_p (operands[0], operands[1])
+ && !reg_mentioned_p (operands[0], operands[2])
+ && !reg_mentioned_p (operands[0], operands[3])"
+ [(const_int 0)]
+{
+ ix86_expand_fma4_multiple_memory (operands, 4, <MODE>mode);
+ emit_insn (gen_fma4_fmadd<mode>4256 (operands[0], operands[1],
+ operands[2], operands[3]));
+ DONE;
+})
+
+;; Floating multiply and subtract
+;; Allow two memory operands the same as fmadd
+(define_insn "fma4_fmsub<mode>4256"
+ [(set (match_operand:FMA4MODEF4 0 "register_operand" "=x,x,x")
+ (minus:FMA4MODEF4
+ (mult:FMA4MODEF4
+ (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "x,x,xm")
+ (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "x,xm,x"))
+ (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "xm,x,x")))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
+ "vfmsub<fma4modesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "<MODE>")])
+
+;; Split fmsub with two memory operands into a load and the fmsub.
+(define_split
+ [(set (match_operand:FMA4MODEF4 0 "register_operand" "")
+ (minus:FMA4MODEF4
+ (mult:FMA4MODEF4
+ (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "")
+ (match_operand:FMA4MODEF4 2 "nonimmediate_operand" ""))
+ (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "")))]
+ "TARGET_FMA4
+ && !ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)
+ && !reg_mentioned_p (operands[0], operands[1])
+ && !reg_mentioned_p (operands[0], operands[2])
+ && !reg_mentioned_p (operands[0], operands[3])"
+ [(const_int 0)]
+{
+ ix86_expand_fma4_multiple_memory (operands, 4, <MODE>mode);
+ emit_insn (gen_fma4_fmsub<mode>4256 (operands[0], operands[1],
+ operands[2], operands[3]));
+ DONE;
+})
+
+;; Floating point negative multiply and add
+;; Rewrite (- (a * b) + c) into the canonical form: c - (a * b)
+;; Note operands are out of order to simplify the call to ix86_fma4_valid_op_p
+;; Allow two memory operands to help in optimizing.
+(define_insn "fma4_fnmadd<mode>4256"
+ [(set (match_operand:FMA4MODEF4 0 "register_operand" "=x,x,x")
+ (minus:FMA4MODEF4
+ (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "xm,x,x")
+ (mult:FMA4MODEF4
+ (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "x,x,xm")
+ (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "x,xm,x"))))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
+ "vfnmadd<fma4modesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "<MODE>")])
+
+;; Split fnmadd with two memory operands into a load and the fnmadd.
+(define_split
+ [(set (match_operand:FMA4MODEF4 0 "register_operand" "")
+ (minus:FMA4MODEF4
+ (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "")
+ (mult:FMA4MODEF4
+ (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "")
+ (match_operand:FMA4MODEF4 2 "nonimmediate_operand" ""))))]
+ "TARGET_FMA4
+ && !ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)
+ && !reg_mentioned_p (operands[0], operands[1])
+ && !reg_mentioned_p (operands[0], operands[2])
+ && !reg_mentioned_p (operands[0], operands[3])"
+ [(const_int 0)]
+{
+ ix86_expand_fma4_multiple_memory (operands, 4, <MODE>mode);
+ emit_insn (gen_fma4_fnmadd<mode>4256 (operands[0], operands[1],
+ operands[2], operands[3]));
+ DONE;
+})
+
+;; Floating point negative multiply and subtract
+;; Rewrite (- (a * b) - c) into the canonical form: ((-a) * b) - c
+;; Allow 2 memory operands to help with optimization
+(define_insn "fma4_fnmsub<mode>4256"
+ [(set (match_operand:FMA4MODEF4 0 "register_operand" "=x,x")
+ (minus:FMA4MODEF4
+ (mult:FMA4MODEF4
+ (neg:FMA4MODEF4
+ (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "x,x"))
+ (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "xm,x")))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, false)"
+ "vfnmsub<fma4modesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "<MODE>")])
+
+;; Split fnmsub with two memory operands into a load and the fmsub.
+(define_split
+ [(set (match_operand:FMA4MODEF4 0 "register_operand" "")
+ (minus:FMA4MODEF4
+ (mult:FMA4MODEF4
+ (neg:FMA4MODEF4
+ (match_operand:FMA4MODEF4 1 "nonimmediate_operand" ""))
+ (match_operand:FMA4MODEF4 2 "nonimmediate_operand" ""))
+ (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "")))]
+ "TARGET_FMA4
+ && !ix86_fma4_valid_op_p (operands, insn, 4, true, 1, false)
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, false)
+ && !reg_mentioned_p (operands[0], operands[1])
+ && !reg_mentioned_p (operands[0], operands[2])
+ && !reg_mentioned_p (operands[0], operands[3])"
+ [(const_int 0)]
+{
+ ix86_expand_fma4_multiple_memory (operands, 4, <MODE>mode);
+ emit_insn (gen_fma4_fnmsub<mode>4256 (operands[0], operands[1],
+ operands[2], operands[3]));
+ DONE;
+})
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+(define_insn "fma4_fmadd<mode>4"
+ [(set (match_operand:SSEMODEF4 0 "register_operand" "=x,x,x")
+ (plus:SSEMODEF4
+ (mult:SSEMODEF4
+ (match_operand:SSEMODEF4 1 "nonimmediate_operand" "x,x,xm")
+ (match_operand:SSEMODEF4 2 "nonimmediate_operand" "x,xm,x"))
+ (match_operand:SSEMODEF4 3 "nonimmediate_operand" "xm,x,x")))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
+ "vfmadd<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "<MODE>")])
+
+;; Split fmadd with two memory operands into a load and the fmadd.
+(define_split
+ [(set (match_operand:SSEMODEF4 0 "register_operand" "")
+ (plus:SSEMODEF4
+ (mult:SSEMODEF4
+ (match_operand:SSEMODEF4 1 "nonimmediate_operand" "")
+ (match_operand:SSEMODEF4 2 "nonimmediate_operand" ""))
+ (match_operand:SSEMODEF4 3 "nonimmediate_operand" "")))]
+ "TARGET_FMA4
+ && !ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)
+ && !reg_mentioned_p (operands[0], operands[1])
+ && !reg_mentioned_p (operands[0], operands[2])
+ && !reg_mentioned_p (operands[0], operands[3])"
+ [(const_int 0)]
+{
+ ix86_expand_fma4_multiple_memory (operands, 4, <MODE>mode);
+ emit_insn (gen_fma4_fmadd<mode>4 (operands[0], operands[1],
+ operands[2], operands[3]));
+ DONE;
+})
+
+;; For the scalar operations, use operand1 for the upper words that aren't
+;; modified, so restrict the forms that are generated.
+;; Scalar version of fmadd
+(define_insn "fma4_vmfmadd<mode>4"
+ [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
+ (vec_merge:SSEMODEF2P
+ (plus:SSEMODEF2P
+ (mult:SSEMODEF2P
+ (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x")
+ (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))
+ (match_dup 0)
+ (const_int 1)))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)"
+ "vfmadd<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "<MODE>")])
+
+;; Floating multiply and subtract
+;; Allow two memory operands the same as fmadd
+(define_insn "fma4_fmsub<mode>4"
+ [(set (match_operand:SSEMODEF4 0 "register_operand" "=x,x,x")
+ (minus:SSEMODEF4
+ (mult:SSEMODEF4
+ (match_operand:SSEMODEF4 1 "nonimmediate_operand" "x,x,xm")
+ (match_operand:SSEMODEF4 2 "nonimmediate_operand" "x,xm,x"))
+ (match_operand:SSEMODEF4 3 "nonimmediate_operand" "xm,x,x")))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
+ "vfmsub<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "<MODE>")])
+
+;; Split fmsub with two memory operands into a load and the fmsub.
+(define_split
+ [(set (match_operand:SSEMODEF4 0 "register_operand" "")
+ (minus:SSEMODEF4
+ (mult:SSEMODEF4
+ (match_operand:SSEMODEF4 1 "nonimmediate_operand" "")
+ (match_operand:SSEMODEF4 2 "nonimmediate_operand" ""))
+ (match_operand:SSEMODEF4 3 "nonimmediate_operand" "")))]
+ "TARGET_FMA4
+ && !ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)
+ && !reg_mentioned_p (operands[0], operands[1])
+ && !reg_mentioned_p (operands[0], operands[2])
+ && !reg_mentioned_p (operands[0], operands[3])"
+ [(const_int 0)]
+{
+ ix86_expand_fma4_multiple_memory (operands, 4, <MODE>mode);
+ emit_insn (gen_fma4_fmsub<mode>4 (operands[0], operands[1],
+ operands[2], operands[3]));
+ DONE;
+})
+
+;; For the scalar operations, use operand1 for the upper words that aren't
+;; modified, so restrict the forms that are generated.
+;; Scalar version of fmsub
+(define_insn "fma4_vmfmsub<mode>4"
+ [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
+ (vec_merge:SSEMODEF2P
+ (minus:SSEMODEF2P
+ (mult:SSEMODEF2P
+ (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x")
+ (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))
+ (match_dup 0)
+ (const_int 1)))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, false)"
+ "vfmsub<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "<MODE>")])
+
+;; Floating point negative multiply and add
+;; Rewrite (- (a * b) + c) into the canonical form: c - (a * b)
+;; Note operands are out of order to simplify the call to ix86_fma4_valid_op_p
+;; Allow two memory operands to help in optimizing.
+(define_insn "fma4_fnmadd<mode>4"
+ [(set (match_operand:SSEMODEF4 0 "register_operand" "=x,x,x")
+ (minus:SSEMODEF4
+ (match_operand:SSEMODEF4 3 "nonimmediate_operand" "xm,x,x")
+ (mult:SSEMODEF4
+ (match_operand:SSEMODEF4 1 "nonimmediate_operand" "x,x,xm")
+ (match_operand:SSEMODEF4 2 "nonimmediate_operand" "x,xm,x"))))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
+ "vfnmadd<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "<MODE>")])
+
+;; Split fnmadd with two memory operands into a load and the fnmadd.
+(define_split
+ [(set (match_operand:SSEMODEF4 0 "register_operand" "")
+ (minus:SSEMODEF4
+ (match_operand:SSEMODEF4 3 "nonimmediate_operand" "")
+ (mult:SSEMODEF4
+ (match_operand:SSEMODEF4 1 "nonimmediate_operand" "")
+ (match_operand:SSEMODEF4 2 "nonimmediate_operand" ""))))]
+ "TARGET_FMA4
+ && !ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)
+ && !reg_mentioned_p (operands[0], operands[1])
+ && !reg_mentioned_p (operands[0], operands[2])
+ && !reg_mentioned_p (operands[0], operands[3])"
+ [(const_int 0)]
+{
+ ix86_expand_fma4_multiple_memory (operands, 4, <MODE>mode);
+ emit_insn (gen_fma4_fnmadd<mode>4 (operands[0], operands[1],
+ operands[2], operands[3]));
+ DONE;
+})
+
+;; For the scalar operations, use operand1 for the upper words that aren't
+;; modified, so restrict the forms that are generated.
+;; Scalar version of fnmadd
+(define_insn "fma4_vmfnmadd<mode>4"
+ [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
+ (vec_merge:SSEMODEF2P
+ (minus:SSEMODEF2P
+ (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x")
+ (mult:SSEMODEF2P
+ (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x")
+ (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm")))
+ (match_dup 0)
+ (const_int 1)))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)"
+ "vfnmadd<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "<MODE>")])
+
+;; Floating point negative multiply and subtract
+;; Rewrite (- (a * b) - c) into the canonical form: ((-a) * b) - c
+;; Allow 2 memory operands to help with optimization
+(define_insn "fma4_fnmsub<mode>4"
+ [(set (match_operand:SSEMODEF4 0 "register_operand" "=x,x")
+ (minus:SSEMODEF4
+ (mult:SSEMODEF4
+ (neg:SSEMODEF4
+ (match_operand:SSEMODEF4 1 "nonimmediate_operand" "x,x"))
+ (match_operand:SSEMODEF4 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:SSEMODEF4 3 "nonimmediate_operand" "xm,x")))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, false)"
+ "vfnmsub<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "<MODE>")])
+
+;; Split fnmsub with two memory operands into a load and the fmsub.
+(define_split
+ [(set (match_operand:SSEMODEF4 0 "register_operand" "")
+ (minus:SSEMODEF4
+ (mult:SSEMODEF4
+ (neg:SSEMODEF4
+ (match_operand:SSEMODEF4 1 "nonimmediate_operand" ""))
+ (match_operand:SSEMODEF4 2 "nonimmediate_operand" ""))
+ (match_operand:SSEMODEF4 3 "nonimmediate_operand" "")))]
+ "TARGET_FMA4
+ && !ix86_fma4_valid_op_p (operands, insn, 4, true, 1, false)
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, false)
+ && !reg_mentioned_p (operands[0], operands[1])
+ && !reg_mentioned_p (operands[0], operands[2])
+ && !reg_mentioned_p (operands[0], operands[3])"
+ [(const_int 0)]
+{
+ ix86_expand_fma4_multiple_memory (operands, 4, <MODE>mode);
+ emit_insn (gen_fma4_fnmsub<mode>4 (operands[0], operands[1],
+ operands[2], operands[3]));
+ DONE;
+})
+
+;; For the scalar operations, use operand1 for the upper words that aren't
+;; modified, so restrict the forms that are generated.
+;; Scalar version of fnmsub
+(define_insn "fma4_vmfnmsub<mode>4"
+ [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
+ (vec_merge:SSEMODEF2P
+ (minus:SSEMODEF2P
+ (mult:SSEMODEF2P
+ (neg:SSEMODEF2P
+ (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x"))
+ (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))
+ (match_dup 0)
+ (const_int 1)))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, false)"
+ "vfnmsub<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "<MODE>")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn "fma4i_fmadd<mode>4256"
+ [(set (match_operand:FMA4MODEF4 0 "register_operand" "=x,x")
+ (unspec:FMA4MODEF4
+ [(plus:FMA4MODEF4
+ (mult:FMA4MODEF4
+ (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "x,x")
+ (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "xm,x"))]
+ UNSPEC_FMA4_INTRINSIC))]
+ "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)"
+ "vfmadd<fma4modesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "<MODE>")])
+
+(define_insn "fma4i_fmsub<mode>4256"
+ [(set (match_operand:FMA4MODEF4 0 "register_operand" "=x,x")
+ (unspec:FMA4MODEF4
+ [(minus:FMA4MODEF4
+ (mult:FMA4MODEF4
+ (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "x,x")
+ (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "xm,x"))]
+ UNSPEC_FMA4_INTRINSIC))]
+ "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)"
+ "vfmsub<fma4modesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "<MODE>")])
+
+(define_insn "fma4i_fnmadd<mode>4256"
+ [(set (match_operand:FMA4MODEF4 0 "register_operand" "=x,x")
+ (unspec:FMA4MODEF4
+ [(minus:FMA4MODEF4
+ (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "xm,x")
+ (mult:FMA4MODEF4
+ (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "x,x")
+ (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "x,xm")))]
+ UNSPEC_FMA4_INTRINSIC))]
+ "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)"
+ "vfnmadd<fma4modesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "<MODE>")])
+
+(define_insn "fma4i_fnmsub<mode>4256"
+ [(set (match_operand:FMA4MODEF4 0 "register_operand" "=x,x")
+ (unspec:FMA4MODEF4
+ [(minus:FMA4MODEF4
+ (mult:FMA4MODEF4
+ (neg:FMA4MODEF4
+ (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "x,x"))
+ (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "xm,x"))]
+ UNSPEC_FMA4_INTRINSIC))]
+ "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, false)"
+ "vfnmsub<fma4modesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "<MODE>")])
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn "fma4i_fmadd<mode>4"
+ [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
+ (unspec:SSEMODEF2P
+ [(plus:SSEMODEF2P
+ (mult:SSEMODEF2P
+ (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x")
+ (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))]
+ UNSPEC_FMA4_INTRINSIC))]
+ "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)"
+ "vfmadd<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "<MODE>")])
+
+(define_insn "fma4i_fmsub<mode>4"
+ [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
+ (unspec:SSEMODEF2P
+ [(minus:SSEMODEF2P
+ (mult:SSEMODEF2P
+ (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x")
+ (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))]
+ UNSPEC_FMA4_INTRINSIC))]
+ "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)"
+ "vfmsub<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "<MODE>")])
+
+(define_insn "fma4i_fnmadd<mode>4"
+ [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
+ (unspec:SSEMODEF2P
+ [(minus:SSEMODEF2P
+ (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x")
+ (mult:SSEMODEF2P
+ (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x")
+ (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm")))]
+ UNSPEC_FMA4_INTRINSIC))]
+ "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)"
+ "vfnmadd<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "<MODE>")])
+
+(define_insn "fma4i_fnmsub<mode>4"
+ [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
+ (unspec:SSEMODEF2P
+ [(minus:SSEMODEF2P
+ (mult:SSEMODEF2P
+ (neg:SSEMODEF2P
+ (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x"))
+ (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))]
+ UNSPEC_FMA4_INTRINSIC))]
+ "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, false)"
+ "vfnmsub<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "<MODE>")])
+
+;; For the scalar operations, use operand1 for the upper words that aren't
+;; modified, so restrict the forms that are accepted.
+(define_insn "fma4i_vmfmadd<mode>4"
+ [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
+ (unspec:SSEMODEF2P
+ [(vec_merge:SSEMODEF2P
+ (plus:SSEMODEF2P
+ (mult:SSEMODEF2P
+ (match_operand:SSEMODEF2P 1 "register_operand" "x,x")
+ (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))
+ (match_dup 0)
+ (const_int 1))]
+ UNSPEC_FMA4_INTRINSIC))]
+ "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, false)"
+ "vfmadd<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "<ssescalarmode>")])
+
+(define_insn "fma4i_vmfmsub<mode>4"
+ [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
+ (unspec:SSEMODEF2P
+ [(vec_merge:SSEMODEF2P
+ (minus:SSEMODEF2P
+ (mult:SSEMODEF2P
+ (match_operand:SSEMODEF2P 1 "register_operand" "x,x")
+ (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))
+ (match_dup 0)
+ (const_int 1))]
+ UNSPEC_FMA4_INTRINSIC))]
+ "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, false)"
+ "vfmsub<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "<ssescalarmode>")])
+
+(define_insn "fma4i_vmfnmadd<mode>4"
+ [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
+ (unspec:SSEMODEF2P
+ [(vec_merge:SSEMODEF2P
+ (minus:SSEMODEF2P
+ (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x")
+ (mult:SSEMODEF2P
+ (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x")
+ (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm")))
+ (match_dup 0)
+ (const_int 1))]
+ UNSPEC_FMA4_INTRINSIC))]
+ "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)"
+ "vfnmadd<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "<ssescalarmode>")])
+
+(define_insn "fma4i_vmfnmsub<mode>4"
+ [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
+ (unspec:SSEMODEF2P
+ [(vec_merge:SSEMODEF2P
+ (minus:SSEMODEF2P
+ (mult:SSEMODEF2P
+ (neg:SSEMODEF2P
+ (match_operand:SSEMODEF2P 1 "register_operand" "x,x"))
+ (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))
+ (match_dup 0)
+ (const_int 1))]
+ UNSPEC_FMA4_INTRINSIC))]
+ "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, false)"
+ "vfnmsub<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "<ssescalarmode>")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; FMA4 Parallel floating point multiply addsub and subadd operations
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn "fma4_fmaddsubv8sf4"
+ [(set (match_operand:V8SF 0 "register_operand" "=x,x")
+ (vec_merge:V8SF
+ (plus:V8SF
+ (mult:V8SF
+ (match_operand:V8SF 1 "nonimmediate_operand" "x,x")
+ (match_operand:V8SF 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:V8SF 3 "nonimmediate_operand" "xm,x"))
+ (minus:V8SF
+ (mult:V8SF
+ (match_dup 1)
+ (match_dup 2))
+ (match_dup 3))
+ (const_int 170)))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
+ "vfmaddsubps\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "V8SF")])
+
+(define_insn "fma4_fmaddsubv4df4"
+ [(set (match_operand:V4DF 0 "register_operand" "=x,x")
+ (vec_merge:V4DF
+ (plus:V4DF
+ (mult:V4DF
+ (match_operand:V4DF 1 "nonimmediate_operand" "x,x")
+ (match_operand:V4DF 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:V4DF 3 "nonimmediate_operand" "xm,x"))
+ (minus:V4DF
+ (mult:V4DF
+ (match_dup 1)
+ (match_dup 2))
+ (match_dup 3))
+ (const_int 10)))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
+ "vfmaddsubpd\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "V4DF")])
+
+(define_insn "fma4_fmaddsubv4sf4"
+ [(set (match_operand:V4SF 0 "register_operand" "=x,x")
+ (vec_merge:V4SF
+ (plus:V4SF
+ (mult:V4SF
+ (match_operand:V4SF 1 "nonimmediate_operand" "x,x")
+ (match_operand:V4SF 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:V4SF 3 "nonimmediate_operand" "xm,x"))
+ (minus:V4SF
+ (mult:V4SF
+ (match_dup 1)
+ (match_dup 2))
+ (match_dup 3))
+ (const_int 10)))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
+ "vfmaddsubps\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "V4SF")])
+
+(define_insn "fma4_fmaddsubv2df4"
+ [(set (match_operand:V2DF 0 "register_operand" "=x,x")
+ (vec_merge:V2DF
+ (plus:V2DF
+ (mult:V2DF
+ (match_operand:V2DF 1 "nonimmediate_operand" "x,x")
+ (match_operand:V2DF 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:V2DF 3 "nonimmediate_operand" "xm,x"))
+ (minus:V2DF
+ (mult:V2DF
+ (match_dup 1)
+ (match_dup 2))
+ (match_dup 3))
+ (const_int 2)))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
+ "vfmaddsubpd\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "V2DF")])
+
+(define_insn "fma4_fmsubaddv8sf4"
+ [(set (match_operand:V8SF 0 "register_operand" "=x,x")
+ (vec_merge:V8SF
+ (plus:V8SF
+ (mult:V8SF
+ (match_operand:V8SF 1 "nonimmediate_operand" "x,x")
+ (match_operand:V8SF 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:V8SF 3 "nonimmediate_operand" "xm,x"))
+ (minus:V8SF
+ (mult:V8SF
+ (match_dup 1)
+ (match_dup 2))
+ (match_dup 3))
+ (const_int 85)))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
+ "vfmsubaddps\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "V8SF")])
+
+(define_insn "fma4_fmsubaddv4df4"
+ [(set (match_operand:V4DF 0 "register_operand" "=x,x")
+ (vec_merge:V4DF
+ (plus:V4DF
+ (mult:V4DF
+ (match_operand:V4DF 1 "nonimmediate_operand" "x,x")
+ (match_operand:V4DF 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:V4DF 3 "nonimmediate_operand" "xm,x"))
+ (minus:V4DF
+ (mult:V4DF
+ (match_dup 1)
+ (match_dup 2))
+ (match_dup 3))
+ (const_int 5)))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
+ "vfmsubaddpd\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "V4DF")])
+
+(define_insn "fma4_fmsubaddv4sf4"
+ [(set (match_operand:V4SF 0 "register_operand" "=x,x")
+ (vec_merge:V4SF
+ (plus:V4SF
+ (mult:V4SF
+ (match_operand:V4SF 1 "nonimmediate_operand" "x,x")
+ (match_operand:V4SF 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:V4SF 3 "nonimmediate_operand" "xm,x"))
+ (minus:V4SF
+ (mult:V4SF
+ (match_dup 1)
+ (match_dup 2))
+ (match_dup 3))
+ (const_int 5)))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
+ "vfmsubaddps\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "V4SF")])
+
+(define_insn "fma4_fmsubaddv2df4"
+ [(set (match_operand:V2DF 0 "register_operand" "=x,x")
+ (vec_merge:V2DF
+ (plus:V2DF
+ (mult:V2DF
+ (match_operand:V2DF 1 "nonimmediate_operand" "x,x")
+ (match_operand:V2DF 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:V2DF 3 "nonimmediate_operand" "xm,x"))
+ (minus:V2DF
+ (mult:V2DF
+ (match_dup 1)
+ (match_dup 2))
+ (match_dup 3))
+ (const_int 1)))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
+ "vfmsubaddpd\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "V2DF")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn "fma4i_fmaddsubv8sf4"
+ [(set (match_operand:V8SF 0 "register_operand" "=x,x")
+ (unspec:V8SF
+ [(vec_merge:V8SF
+ (plus:V8SF
+ (mult:V8SF
+ (match_operand:V8SF 1 "nonimmediate_operand" "x,x")
+ (match_operand:V8SF 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:V8SF 3 "nonimmediate_operand" "xm,x"))
+ (minus:V8SF
+ (mult:V8SF
+ (match_dup 1)
+ (match_dup 2))
+ (match_dup 3))
+ (const_int 170))]
+ UNSPEC_FMA4_INTRINSIC))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
+ "vfmaddsubps\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "V8SF")])
+
+(define_insn "fma4i_fmaddsubv4df4"
+ [(set (match_operand:V4DF 0 "register_operand" "=x,x")
+ (unspec:V4DF
+ [(vec_merge:V4DF
+ (plus:V4DF
+ (mult:V4DF
+ (match_operand:V4DF 1 "nonimmediate_operand" "x,x")
+ (match_operand:V4DF 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:V4DF 3 "nonimmediate_operand" "xm,x"))
+ (minus:V4DF
+ (mult:V4DF
+ (match_dup 1)
+ (match_dup 2))
+ (match_dup 3))
+ (const_int 10))]
+ UNSPEC_FMA4_INTRINSIC))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
+ "vfmaddsubpd\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "V4DF")])
+
+(define_insn "fma4i_fmaddsubv4sf4"
+ [(set (match_operand:V4SF 0 "register_operand" "=x,x")
+ (unspec:V4SF
+ [(vec_merge:V4SF
+ (plus:V4SF
+ (mult:V4SF
+ (match_operand:V4SF 1 "nonimmediate_operand" "x,x")
+ (match_operand:V4SF 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:V4SF 3 "nonimmediate_operand" "xm,x"))
+ (minus:V4SF
+ (mult:V4SF
+ (match_dup 1)
+ (match_dup 2))
+ (match_dup 3))
+ (const_int 10))]
+ UNSPEC_FMA4_INTRINSIC))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
+ "vfmaddsubps\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "V4SF")])
+
+(define_insn "fma4i_fmaddsubv2df4"
+ [(set (match_operand:V2DF 0 "register_operand" "=x,x")
+ (unspec:V2DF
+ [(vec_merge:V2DF
+ (plus:V2DF
+ (mult:V2DF
+ (match_operand:V2DF 1 "nonimmediate_operand" "x,x")
+ (match_operand:V2DF 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:V2DF 3 "nonimmediate_operand" "xm,x"))
+ (minus:V2DF
+ (mult:V2DF
+ (match_dup 1)
+ (match_dup 2))
+ (match_dup 3))
+ (const_int 2))]
+ UNSPEC_FMA4_INTRINSIC))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
+ "vfmaddsubpd\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "V2DF")])
+
+(define_insn "fma4i_fmsubaddv8sf4"
+ [(set (match_operand:V8SF 0 "register_operand" "=x,x")
+ (unspec:V8SF
+ [(vec_merge:V8SF
+ (plus:V8SF
+ (mult:V8SF
+ (match_operand:V8SF 1 "nonimmediate_operand" "x,x")
+ (match_operand:V8SF 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:V8SF 3 "nonimmediate_operand" "xm,x"))
+ (minus:V8SF
+ (mult:V8SF
+ (match_dup 1)
+ (match_dup 2))
+ (match_dup 3))
+ (const_int 85))]
+ UNSPEC_FMA4_INTRINSIC))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
+ "vfmsubaddps\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "V8SF")])
+
+(define_insn "fma4i_fmsubaddv4df4"
+ [(set (match_operand:V4DF 0 "register_operand" "=x,x")
+ (unspec:V4DF
+ [(vec_merge:V4DF
+ (plus:V4DF
+ (mult:V4DF
+ (match_operand:V4DF 1 "nonimmediate_operand" "x,x")
+ (match_operand:V4DF 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:V4DF 3 "nonimmediate_operand" "xm,x"))
+ (minus:V4DF
+ (mult:V4DF
+ (match_dup 1)
+ (match_dup 2))
+ (match_dup 3))
+ (const_int 5))]
+ UNSPEC_FMA4_INTRINSIC))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
+ "vfmsubaddpd\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "V4DF")])
+
+(define_insn "fma4i_fmsubaddv4sf4"
+ [(set (match_operand:V4SF 0 "register_operand" "=x,x")
+ (unspec:V4SF
+ [(vec_merge:V4SF
+ (plus:V4SF
+ (mult:V4SF
+ (match_operand:V4SF 1 "nonimmediate_operand" "x,x")
+ (match_operand:V4SF 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:V4SF 3 "nonimmediate_operand" "xm,x"))
+ (minus:V4SF
+ (mult:V4SF
+ (match_dup 1)
+ (match_dup 2))
+ (match_dup 3))
+ (const_int 5))]
+ UNSPEC_FMA4_INTRINSIC))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
+ "vfmsubaddps\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "V4SF")])
+
+(define_insn "fma4i_fmsubaddv2df4"
+ [(set (match_operand:V2DF 0 "register_operand" "=x,x")
+ (unspec:V2DF
+ [(vec_merge:V2DF
+ (plus:V2DF
+ (mult:V2DF
+ (match_operand:V2DF 1 "nonimmediate_operand" "x,x")
+ (match_operand:V2DF 2 "nonimmediate_operand" "x,xm"))
+ (match_operand:V2DF 3 "nonimmediate_operand" "xm,x"))
+ (minus:V2DF
+ (mult:V2DF
+ (match_dup 1)
+ (match_dup 2))
+ (match_dup 3))
+ (const_int 1))]
+ UNSPEC_FMA4_INTRINSIC))]
+ "TARGET_FMA4
+ && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
+ "vfmsubaddpd\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "V2DF")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
;; Parallel single-precision floating point conversion operations
;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
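The vec_merge masks in the fmaddsub/fmsubadd patterns above are per-lane selectors: bit i set picks the (a*b)+c form for element i, bit i clear picks (a*b)-c. Hence 170 (0b10101010), 10 and 2 give add in the odd lanes (fmaddsub), while 85 (0b01010101), 5 and 1 give add in the even lanes (fmsubadd). A minimal C sketch of that reference semantics, not part of the patch and with illustrative names:

static void
fmaddsub_ref (float *dst, const float *a, const float *b, const float *c,
              int n, unsigned mask)
{
  int i;
  for (i = 0; i < n; i++)
    dst[i] = ((mask >> i) & 1)
             ? (a[i] * b[i]) + c[i]   /* bit set: add lane */
             : (a[i] * b[i]) - c[i];  /* bit clear: subtract lane */
}

With n == 8 and mask == 170 this mirrors the V8SF fmaddsub pattern; mask == 85 mirrors fmsubadd.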
diff --git a/gcc/config/i386/x86intrin.h b/gcc/config/i386/x86intrin.h
index 939ea4f..7bc47f8 100644
--- a/gcc/config/i386/x86intrin.h
+++ b/gcc/config/i386/x86intrin.h
@@ -46,7 +46,7 @@
#include <tmmintrin.h>
#endif
-#ifdef __SSE4a__
+#ifdef __SSE4A__
#include <ammintrin.h>
#endif
@@ -54,6 +54,10 @@
#include <smmintrin.h>
#endif
+#ifdef __FMA4__
+#include <fma4intrin.h>
+#endif
+
#if defined (__AES__) || defined (__PCLMUL__)
#include <wmmintrin.h>
#endif
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 993863f..6f09555 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -3168,6 +3168,11 @@ Enable/disable the generation of the sse4.2 instructions.
@cindex @code{target("sse4a")} attribute
Enable/disable the generation of the SSE4A instructions.
+@item fma4
+@itemx no-fma4
+@cindex @code{target("fma4")} attribute
+Enable/disable the generation of the FMA4 instructions.
+
@item ssse3
@itemx no-ssse3
@cindex @code{target("ssse3")} attribute
@@ -8888,6 +8893,46 @@ v2di __builtin_ia32_insertq (v2di, v2di)
v2di __builtin_ia32_insertqi (v2di, v2di, const unsigned int, const unsigned int)
@end smallexample
+The following built-in functions are available when @option{-mfma4} is used.
+All of them generate the machine instruction that is part of the name
+with SSE registers.
+
+@smallexample
+v2df __builtin_ia32_fmaddpd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fmaddps (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_fmaddsd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fmaddss (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_fmsubpd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fmsubps (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_fmsubsd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fmsubss (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_fnmaddpd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fnmaddps (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_fnmaddsd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fnmaddss (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_fnmsubpd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fnmsubps (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_fnmsubsd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fnmsubss (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_fmaddsubpd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fmaddsubps (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_fmsubaddpd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fmsubaddps (v4sf, v4sf, v4sf)
+v4df __builtin_ia32_fmaddpd256 (v4df, v4df, v4df)
+v8sf __builtin_ia32_fmaddps256 (v8sf, v8sf, v8sf)
+v4df __builtin_ia32_fmsubpd256 (v4df, v4df, v4df)
+v8sf __builtin_ia32_fmsubps256 (v8sf, v8sf, v8sf)
+v4df __builtin_ia32_fnmaddpd256 (v4df, v4df, v4df)
+v8sf __builtin_ia32_fnmaddps256 (v8sf, v8sf, v8sf)
+v4df __builtin_ia32_fnmsubpd256 (v4df, v4df, v4df)
+v8sf __builtin_ia32_fnmsubps256 (v8sf, v8sf, v8sf)
+v4df __builtin_ia32_fmaddsubpd256 (v4df, v4df, v4df)
+v8sf __builtin_ia32_fmaddsubps256 (v8sf, v8sf, v8sf)
+v4df __builtin_ia32_fmsubaddpd256 (v4df, v4df, v4df)
+v8sf __builtin_ia32_fmsubaddps256 (v8sf, v8sf, v8sf)
+
+@end smallexample
+
The following built-in functions are available when @option{-m3dnow} is used.
All of them generate the machine instruction that is part of the name.
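These built-in functions are normally reached through the wrappers in the new fma4intrin.h, which x86intrin.h includes when __FMA4__ is defined; the testsuite below uses the _mm_macc_*/_mm_msub_* style names. A minimal usage sketch assuming those wrapper names, compiled with -O2 -mfma4:

#include <x86intrin.h>

__m128
macc_ps (__m128 a, __m128 b, __m128 c)
{
  /* Expands to a single vfmaddps instruction.  */
  return _mm_macc_ps (a, b, c);
}

__m128d
msub_pd (__m128d a, __m128d b, __m128d c)
{
  /* vfmsubpd: (a * b) - c, with the product left unrounded.  */
  return _mm_msub_pd (a, b, c);
}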
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 4ae8a02..e12241c 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -592,7 +592,7 @@ Objective-C and Objective-C++ Dialects}.
-mcld -mcx16 -msahf -mmovbe -mcrc32 -mrecip @gol
-mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -msse4 -mavx @gol
-maes -mpclmul @gol
--msse4a -m3dnow -mpopcnt -mabm @gol
+-msse4a -m3dnow -mpopcnt -mabm -mfma4 @gol
-mthreads -mno-align-stringops -minline-all-stringops @gol
-minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
-mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol
@@ -11727,6 +11727,8 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}.
@itemx -mno-pclmul
@itemx -msse4a
@itemx -mno-sse4a
+@itemx -mfma4
+@itemx -mno-fma4
@itemx -m3dnow
@itemx -mno-3dnow
@itemx -mpopcnt
@@ -11740,7 +11742,7 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}.
@opindex m3dnow
@opindex mno-3dnow
These switches enable or disable the use of instructions in the MMX,
-SSE, SSE2, SSE3, SSSE3, SSE4.1, AVX, AES, PCLMUL, SSE4A, ABM or
+SSE, SSE2, SSE3, SSSE3, SSE4.1, AVX, AES, PCLMUL, SSE4A, FMA4, ABM or
3DNow!@: extended instruction sets.
These extensions are also available as built-in functions: see
@ref{X86 Built-in Functions}, for details of the functions enabled and
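For reference, a typical invocation matching the dg-options of the new vectorization test (file name from the testsuite):

    gcc -O2 -mfma4 -ftree-vectorize -S fma4-vector.c

after which the dg-final directives scan the generated assembly for vfmaddps, vfmsubps, vfnmaddps and their double-precision counterparts.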
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index eec705f..8128a90 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,56 @@
+2009-09-29 Harsha Jagasia <harsha.jagasia@amd.com>
+
+ * gcc.target/i386/fma4-check.h
+ * gcc.target/i386/fma4-fma.c
+ * gcc.target/i386/fma4-maccXX.c
+ * gcc.target/i386/fma4-msubXX.c
+ * gcc.target/i386/fma4-nmaccXX.c
+ * gcc.target/i386/fma4-nmsubXX.c
+ * gcc.target/i386/fma4-vector.c
+ * gcc.target/i386/fma4-256-maccXX.c
+ * gcc.target/i386/fma4-256-msubXX.c
+ * gcc.target/i386/fma4-256-nmaccXX.c
+ * gcc.target/i386/fma4-256-nmsubXX.c
+ * gcc.target/i386/fma4-256-vector.c
+ * gcc.target/i386/funcspec-2.c: New file.
+
+ * gcc.target/i386/funcspec-4.c: Test error conditions
+ related to FMA4.
+
+ * gcc.target/i386/funcspec-5.c
+ * gcc.target/i386/funcspec-6.c
+ * gcc.target/i386/funcspec-8.c: Add FMA4.
+
+ * gcc.target/i386/funcspec-9.c: New file.
+
+ * gcc.target/i386/i386.exp: Add check_effective_target_fma4.
+
+ * gcc.target/i386/isa-10.c
+ * gcc.target/i386/isa-11.c
+ * gcc.target/i386/isa-12.c
+ * gcc.target/i386/isa-13.c
+ * gcc.target/i386/isa-2.c
+ * gcc.target/i386/isa-3.c
+ * gcc.target/i386/isa-4.c
+ * gcc.target/i386/isa-7.c
+ * gcc.target/i386/isa-8.c
+ * gcc.target/i386/isa-9.c: New file.
+
+ * gcc.target/i386/isa-14.c
+ * gcc.target/i386/isa-1.c
+ * gcc.target/i386/isa-5.c
+ * gcc.target/i386/isa-6.c: Add FMA4.
+
+ * gcc.target/i386/sse-12.c
+ * gcc.target/i386/sse-13.c
+ * gcc.target/i386/sse-14.c
+ * gcc.target/i386/sse-22.c: New file.
+
+ * g++.dg/other/i386-2.C
+ * g++.dg/other/i386-3.C
+ * g++.dg/other/i386-5.C
+ * g++.dg/other/i386-6.C: Add -mfma4 in dg-options.
+
2009-09-29 H.J. Lu <hongjiu.lu@intel.com>
PR testsuite/41496
diff --git a/gcc/testsuite/g++.dg/other/i386-2.C b/gcc/testsuite/g++.dg/other/i386-2.C
index 3b05101..4c9579d 100644
--- a/gcc/testsuite/g++.dg/other/i386-2.C
+++ b/gcc/testsuite/g++.dg/other/i386-2.C
@@ -1,7 +1,7 @@
-/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h and mm_malloc.h are
- usable with -O -pedantic-errors. */
+/* Test that {,x,e,p,t,s,w,a,i}mmintrin.h, fma4intrin.h, mm3dnow.h and
+ mm_malloc.h are usable with -O -pedantic-errors. */
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -pedantic-errors -march=k8 -m3dnow -mavx -msse4a -maes -mpclmul" } */
+/* { dg-options "-O -pedantic-errors -march=k8 -m3dnow -mavx -msse4a -mfma4 -maes -mpclmul" } */
#include <x86intrin.h>
diff --git a/gcc/testsuite/g++.dg/other/i386-3.C b/gcc/testsuite/g++.dg/other/i386-3.C
index 377891d..b9e8916 100644
--- a/gcc/testsuite/g++.dg/other/i386-3.C
+++ b/gcc/testsuite/g++.dg/other/i386-3.C
@@ -1,6 +1,6 @@
-/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h and mm_malloc.h are
- usable with -O -fkeep-inline-functions. */
+/* Test that {,x,e,p,t,s,w,a,i}mmintrin.h, fma4intrin.h, mm3dnow.h and
+ mm_malloc.h are usable with -O -fkeep-inline-functions. */
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -fkeep-inline-functions -march=k8 -m3dnow -mavx -msse4a -maes -mpclmul" } */
+/* { dg-options "-O -fkeep-inline-functions -march=k8 -m3dnow -mavx -msse4a -mfma4 -maes -mpclmul" } */
#include <x86intrin.h>
diff --git a/gcc/testsuite/g++.dg/other/i386-5.C b/gcc/testsuite/g++.dg/other/i386-5.C
index 377891d..6dcb2d3 100644
--- a/gcc/testsuite/g++.dg/other/i386-5.C
+++ b/gcc/testsuite/g++.dg/other/i386-5.C
@@ -1,6 +1,6 @@
-/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h and mm_malloc.h are
- usable with -O -fkeep-inline-functions. */
+/* Test that {,x,e,p,t,s,w,a,i}mmintrin.h, fma4intrin.h, mm3dnow.h and
+ mm_malloc.h are usable with -O -fkeep-inline-functions. */
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -fkeep-inline-functions -march=k8 -m3dnow -mavx -msse4a -maes -mpclmul" } */
+/* { dg-options "-O -fkeep-inline-functions -march=k8 -m3dnow -mavx -msse4a -mfma4 -maes -mpclmul" } */
#include <x86intrin.h>
diff --git a/gcc/testsuite/g++.dg/other/i386-6.C b/gcc/testsuite/g++.dg/other/i386-6.C
index 3b05101..4c9579d 100644
--- a/gcc/testsuite/g++.dg/other/i386-6.C
+++ b/gcc/testsuite/g++.dg/other/i386-6.C
@@ -1,7 +1,7 @@
-/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h and mm_malloc.h are
- usable with -O -pedantic-errors. */
+/* Test that {,x,e,p,t,s,w,a,i}mmintrin.h, fma4intrin.h, mm3dnow.h and
+ mm_malloc.h are usable with -O -pedantic-errors. */
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -pedantic-errors -march=k8 -m3dnow -mavx -msse4a -maes -mpclmul" } */
+/* { dg-options "-O -pedantic-errors -march=k8 -m3dnow -mavx -msse4a -mfma4 -maes -mpclmul" } */
#include <x86intrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/fma4-256-maccXX.c b/gcc/testsuite/gcc.target/i386/fma4-256-maccXX.c
new file mode 100644
index 0000000..134200a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/fma4-256-maccXX.c
@@ -0,0 +1,96 @@
+/* { dg-do run } */
+/* { dg-require-effective-target fma4 } */
+/* { dg-options "-O2 -mfma4" } */
+
+#include "fma4-check.h"
+
+#include <x86intrin.h>
+#include <string.h>
+
+#define NUM 20
+
+union
+{
+ __m256 x[NUM];
+ float f[NUM * 8];
+ __m256d y[NUM];
+ double d[NUM * 4];
+} dst, res, src1, src2, src3;
+
+
+/* Note that in the macc*, msub*, nmacc* and nmsub* instructions, the
+   intermediate product is not rounded; only the addition is rounded.  */
+
+static void
+init_maccps ()
+{
+ int i;
+ for (i = 0; i < NUM * 8; i++)
+ {
+ src1.f[i] = i;
+ src2.f[i] = i + 10;
+ src3.f[i] = i + 20;
+ }
+}
+
+static void
+init_maccpd ()
+{
+ int i;
+ for (i = 0; i < NUM * 4; i++)
+ {
+ src1.d[i] = i;
+ src2.d[i] = i + 10;
+ src3.d[i] = i + 20;
+ }
+}
+
+static int
+check_maccps ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 8; i = i + 8)
+ for (j = 0; j < 8; j++)
+ {
+ res.f[i + j] = (src1.f[i + j] * src2.f[i + j]) + src3.f[i + j];
+ if (dst.f[i + j] != res.f[i + j])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static int
+check_maccpd ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 4; i = i + 4)
+ for (j = 0; j < 4; j++)
+ {
+ res.d[i + j] = (src1.d[i + j] * src2.d[i + j]) + src3.d[i + j];
+ if (dst.d[i + j] != res.d[i + j])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static void
+fma4_test (void)
+{
+ int i;
+
+ init_maccps ();
+
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm256_macc_ps (src1.x[i], src2.x[i], src3.x[i]);
+
+ if (check_maccps ())
+ abort ();
+
+ init_maccpd ();
+
+ for (i = 0; i < NUM; i++)
+ dst.y[i] = _mm256_macc_pd (src1.y[i], src2.y[i], src3.y[i]);
+
+ if (check_maccpd ())
+ abort ();
+}
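The rounding note repeated in these tests (the intermediate product is not rounded; only the addition is) is the defining property of a fused multiply-add. A small standalone illustration, independent of the patch, using C99's fmaf as the fused reference (link with -lm if needed):

#include <math.h>
#include <stdio.h>

int
main (void)
{
  float a = 1.0f + 0x1p-12f;      /* 1 + 2^-12, exactly representable */
  float c = -(1.0f + 0x1p-11f);   /* minus the separately rounded product */
  float p = a * a;                /* rounds to 1 + 2^-11, dropping the 2^-24 bit */
  float separate = p + c;         /* 0.0f: the low bit of the product is lost */
  float fused = fmaf (a, a, c);   /* 2^-24: the product feeds the add unrounded */

  printf ("separate = %a, fused = %a\n", separate, fused);
  return 0;
}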
diff --git a/gcc/testsuite/gcc.target/i386/fma4-256-msubXX.c b/gcc/testsuite/gcc.target/i386/fma4-256-msubXX.c
new file mode 100644
index 0000000..d6cafb4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/fma4-256-msubXX.c
@@ -0,0 +1,96 @@
+/* { dg-do run } */
+/* { dg-require-effective-target fma4 } */
+/* { dg-options "-O2 -mfma4" } */
+
+#include "fma4-check.h"
+
+#include <x86intrin.h>
+#include <string.h>
+
+#define NUM 20
+
+union
+{
+ __m256 x[NUM];
+ float f[NUM * 8];
+ __m256d y[NUM];
+ double d[NUM * 4];
+} dst, res, src1, src2, src3;
+
+/* Note that in the macc*, msub*, nmacc* and nmsub* instructions, the
+   intermediate product is not rounded; only the addition is rounded.  */
+
+static void
+init_msubps ()
+{
+ int i;
+ for (i = 0; i < NUM * 8; i++)
+ {
+ src1.f[i] = i;
+ src2.f[i] = i + 10;
+ src3.f[i] = i + 20;
+ }
+}
+
+static void
+init_msubpd ()
+{
+ int i;
+ for (i = 0; i < NUM * 4; i++)
+ {
+ src1.d[i] = i;
+ src2.d[i] = i + 10;
+ src3.d[i] = i + 20;
+ }
+}
+
+static int
+check_msubps ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 8; i = i + 8)
+ for (j = 0; j < 8; j++)
+ {
+ res.f[i + j] = (src1.f[i + j] * src2.f[i + j]) - src3.f[i + j];
+ if (dst.f[i + j] != res.f[i + j])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static int
+check_msubpd ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 4; i = i + 4)
+ for (j = 0; j < 4; j++)
+ {
+ res.d[i + j] = (src1.d[i + j] * src2.d[i + j]) - src3.d[i + j];
+ if (dst.d[i + j] != res.d[i + j])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static void
+fma4_test (void)
+{
+ int i;
+
+ init_msubps ();
+
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm256_msub_ps (src1.x[i], src2.x[i], src3.x[i]);
+
+ if (check_msubps ())
+ abort ();
+
+ init_msubpd ();
+
+ for (i = 0; i < NUM; i++)
+ dst.y[i] = _mm256_msub_pd (src1.y[i], src2.y[i], src3.y[i]);
+
+ if (check_msubpd ())
+ abort ();
+
+}
diff --git a/gcc/testsuite/gcc.target/i386/fma4-256-nmaccXX.c b/gcc/testsuite/gcc.target/i386/fma4-256-nmaccXX.c
new file mode 100644
index 0000000..261f302
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/fma4-256-nmaccXX.c
@@ -0,0 +1,96 @@
+/* { dg-do run } */
+/* { dg-require-effective-target fma4 } */
+/* { dg-options "-O2 -mfma4" } */
+
+#include "fma4-check.h"
+
+#include <x86intrin.h>
+#include <string.h>
+
+#define NUM 20
+
+union
+{
+ __m256 x[NUM];
+ float f[NUM * 8];
+ __m256d y[NUM];
+ double d[NUM * 4];
+} dst, res, src1, src2, src3;
+
+/* Note that in the macc*, msub*, nmacc* and nmsub* instructions, the
+   intermediate product is not rounded; only the addition is rounded.  */
+
+static void
+init_nmaccps ()
+{
+ int i;
+ for (i = 0; i < NUM * 8; i++)
+ {
+ src1.f[i] = i;
+ src2.f[i] = i + 10;
+ src3.f[i] = i + 20;
+ }
+}
+
+static void
+init_nmaccpd ()
+{
+ int i;
+ for (i = 0; i < NUM * 4; i++)
+ {
+ src1.d[i] = i;
+ src2.d[i] = i + 10;
+ src3.d[i] = i + 20;
+ }
+}
+
+static int
+check_nmaccps ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 8; i = i + 8)
+ for (j = 0; j < 8; j++)
+ {
+ res.f[i + j] = - (src1.f[i + j] * src2.f[i + j]) + src3.f[i + j];
+ if (dst.f[i + j] != res.f[i + j])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static int
+check_nmaccpd ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 4; i = i + 4)
+ for (j = 0; j < 4; j++)
+ {
+ res.d[i + j] = - (src1.d[i + j] * src2.d[i + j]) + src3.d[i + j];
+ if (dst.d[i + j] != res.d[i + j])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static void
+fma4_test (void)
+{
+ int i;
+
+ init_nmaccps ();
+
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm256_nmacc_ps (src1.x[i], src2.x[i], src3.x[i]);
+
+ if (check_nmaccps ())
+ abort ();
+
+ init_nmaccpd ();
+
+ for (i = 0; i < NUM; i++)
+ dst.y[i] = _mm256_nmacc_pd (src1.y[i], src2.y[i], src3.y[i]);
+
+ if (check_nmaccpd ())
+ abort ();
+
+}
diff --git a/gcc/testsuite/gcc.target/i386/fma4-256-nmsubXX.c b/gcc/testsuite/gcc.target/i386/fma4-256-nmsubXX.c
new file mode 100644
index 0000000..3205715
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/fma4-256-nmsubXX.c
@@ -0,0 +1,95 @@
+/* { dg-require-effective-target fma4 } */
+/* { dg-options "-O2 -mfma4" } */
+
+#include "fma4-check.h"
+
+#include <x86intrin.h>
+#include <string.h>
+
+#define NUM 20
+
+union
+{
+ __m256 x[NUM];
+ float f[NUM * 8];
+ __m256d y[NUM];
+ double d[NUM * 4];
+} dst, res, src1, src2, src3;
+
+/* Note that in the macc*, msub*, nmacc* and nmsub* instructions, the
+   intermediate product is not rounded; only the addition is rounded.  */
+
+static void
+init_nmsubps ()
+{
+ int i;
+ for (i = 0; i < NUM * 8; i++)
+ {
+ src1.f[i] = i;
+ src2.f[i] = i + 10;
+ src3.f[i] = i + 20;
+ }
+}
+
+static void
+init_nmsubpd ()
+{
+ int i;
+ for (i = 0; i < NUM * 4; i++)
+ {
+ src1.d[i] = i;
+ src2.d[i] = i + 10;
+ src3.d[i] = i + 20;
+ }
+}
+
+static int
+check_nmsubps ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 8; i = i + 8)
+ for (j = 0; j < 8; j++)
+ {
+ res.f[i + j] = - (src1.f[i + j] * src2.f[i + j]) - src3.f[i + j];
+ if (dst.f[i + j] != res.f[i + j])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static int
+check_nmsubpd ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 4; i = i + 4)
+ for (j = 0; j < 4; j++)
+ {
+ res.d[i + j] = - (src1.d[i + j] * src2.d[i + j]) - src3.d[i + j];
+ if (dst.d[i + j] != res.d[i + j])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static void
+fma4_test (void)
+{
+ int i;
+
+ init_nmsubps ();
+
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm256_nmsub_ps (src1.x[i], src2.x[i], src3.x[i]);
+
+  if (check_nmsubps ())
+ abort ();
+
+ init_nmsubpd ();
+
+ for (i = 0; i < NUM; i++)
+ dst.y[i] = _mm256_nmsub_pd (src1.y[i], src2.y[i], src3.y[i]);
+
+  if (check_nmsubpd ())
+ abort ();
+
+}
diff --git a/gcc/testsuite/gcc.target/i386/fma4-256-vector.c b/gcc/testsuite/gcc.target/i386/fma4-256-vector.c
new file mode 100644
index 0000000..714b743
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/fma4-256-vector.c
@@ -0,0 +1,93 @@
+/* Test that the compiler properly optimizes vector floating point multiply
+   and add instructions into vfmaddps on FMA4 systems.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -mfma4 -ftree-vectorize" } */
+
+extern void exit (int);
+
+typedef float __m256 __attribute__ ((__vector_size__ (32), __may_alias__));
+typedef double __m256d __attribute__ ((__vector_size__ (32), __may_alias__));
+
+#define SIZE 10240
+
+union {
+ __m256 f_align;
+ __m256d d_align;
+ float f[SIZE];
+ double d[SIZE];
+} a, b, c, d;
+
+void
+flt_mul_add (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.f[i] = (b.f[i] * c.f[i]) + d.f[i];
+}
+
+void
+dbl_mul_add (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.d[i] = (b.d[i] * c.d[i]) + d.d[i];
+}
+
+void
+flt_mul_sub (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.f[i] = (b.f[i] * c.f[i]) - d.f[i];
+}
+
+void
+dbl_mul_sub (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.d[i] = (b.d[i] * c.d[i]) - d.d[i];
+}
+
+void
+flt_neg_mul_add (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.f[i] = (-(b.f[i] * c.f[i])) + d.f[i];
+}
+
+void
+dbl_neg_mul_add (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.d[i] = (-(b.d[i] * c.d[i])) + d.d[i];
+}
+
+int main ()
+{
+ flt_mul_add ();
+ flt_mul_sub ();
+ flt_neg_mul_add ();
+
+ dbl_mul_add ();
+ dbl_mul_sub ();
+ dbl_neg_mul_add ();
+ exit (0);
+}
+
+/* { dg-final { scan-assembler "vfmaddps" } } */
+/* { dg-final { scan-assembler "vfmaddpd" } } */
+/* { dg-final { scan-assembler "vfmsubps" } } */
+/* { dg-final { scan-assembler "vfmsubpd" } } */
+/* { dg-final { scan-assembler "vfnmaddps" } } */
+/* { dg-final { scan-assembler "vfnmaddpd" } } */
diff --git a/gcc/testsuite/gcc.target/i386/fma4-check.h b/gcc/testsuite/gcc.target/i386/fma4-check.h
new file mode 100644
index 0000000..76fcdef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/fma4-check.h
@@ -0,0 +1,20 @@
+#include <stdlib.h>
+
+#include "cpuid.h"
+
+static void fma4_test (void);
+
+int
+main ()
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ if (!__get_cpuid (0x80000001, &eax, &ebx, &ecx, &edx))
+ return 0;
+
+ /* Run FMA4 test only if host has FMA4 support. */
+ if (ecx & bit_FMA4)
+ fma4_test ();
+
+ exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/i386/fma4-fma.c b/gcc/testsuite/gcc.target/i386/fma4-fma.c
new file mode 100644
index 0000000..cb90691
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/fma4-fma.c
@@ -0,0 +1,83 @@
+/* Test that the compiler properly optimizes floating point multiply
+ and add instructions into vfmaddss, vfmsubss, vfnmaddss,
+ vfnmsubss on FMA4 systems. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -mfma4" } */
+
+extern void exit (int);
+
+float
+flt_mul_add (float a, float b, float c)
+{
+ return (a * b) + c;
+}
+
+double
+dbl_mul_add (double a, double b, double c)
+{
+ return (a * b) + c;
+}
+
+float
+flt_mul_sub (float a, float b, float c)
+{
+ return (a * b) - c;
+}
+
+double
+dbl_mul_sub (double a, double b, double c)
+{
+ return (a * b) - c;
+}
+
+float
+flt_neg_mul_add (float a, float b, float c)
+{
+ return (-(a * b)) + c;
+}
+
+double
+dbl_neg_mul_add (double a, double b, double c)
+{
+ return (-(a * b)) + c;
+}
+
+float
+flt_neg_mul_sub (float a, float b, float c)
+{
+ return (-(a * b)) - c;
+}
+
+double
+dbl_neg_mul_sub (double a, double b, double c)
+{
+ return (-(a * b)) - c;
+}
+
+float f[10] = { 2, 3, 4 };
+double d[10] = { 2, 3, 4 };
+
+int main ()
+{
+ f[3] = flt_mul_add (f[0], f[1], f[2]);
+ f[4] = flt_mul_sub (f[0], f[1], f[2]);
+ f[5] = flt_neg_mul_add (f[0], f[1], f[2]);
+ f[6] = flt_neg_mul_sub (f[0], f[1], f[2]);
+
+ d[3] = dbl_mul_add (d[0], d[1], d[2]);
+ d[4] = dbl_mul_sub (d[0], d[1], d[2]);
+ d[5] = dbl_neg_mul_add (d[0], d[1], d[2]);
+ d[6] = dbl_neg_mul_sub (d[0], d[1], d[2]);
+ exit (0);
+}
+
+/* { dg-final { scan-assembler "vfmaddss" } } */
+/* { dg-final { scan-assembler "vfmaddsd" } } */
+/* { dg-final { scan-assembler "vfmsubss" } } */
+/* { dg-final { scan-assembler "vfmsubsd" } } */
+/* { dg-final { scan-assembler "vfnmaddss" } } */
+/* { dg-final { scan-assembler "vfnmaddsd" } } */
+/* { dg-final { scan-assembler "vfnmsubss" } } */
+/* { dg-final { scan-assembler "vfnmsubsd" } } */
diff --git a/gcc/testsuite/gcc.target/i386/fma4-maccXX.c b/gcc/testsuite/gcc.target/i386/fma4-maccXX.c
new file mode 100644
index 0000000..4b4c005
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/fma4-maccXX.c
@@ -0,0 +1,136 @@
+/* { dg-do run } */
+/* { dg-require-effective-target fma4 } */
+/* { dg-options "-O0 -mfma4" } */
+
+#include "fma4-check.h"
+
+#include <x86intrin.h>
+#include <string.h>
+
+#define NUM 20
+
+union
+{
+ __m128 x[NUM];
+ float f[NUM * 4];
+ __m128d y[NUM];
+ double d[NUM * 2];
+} dst, res, src1, src2, src3;
+
+
+/* Note that in the macc*, msub*, nmacc* and nmsub* instructions, the
+   intermediate product is not rounded; only the addition is rounded.  */
+
+static void
+init_maccps ()
+{
+ int i;
+ for (i = 0; i < NUM * 4; i++)
+ {
+ src1.f[i] = i;
+ src2.f[i] = i + 10;
+ src3.f[i] = i + 20;
+ }
+}
+
+static void
+init_maccpd ()
+{
+ int i;
+  for (i = 0; i < NUM * 2; i++)
+ {
+ src1.d[i] = i;
+ src2.d[i] = i + 10;
+ src3.d[i] = i + 20;
+ }
+}
+
+static int
+check_maccps ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 4; i = i + 4)
+ for (j = 0; j < 4; j++)
+ {
+ res.f[i + j] = (src1.f[i + j] * src2.f[i + j]) + src3.f[i + j];
+ if (dst.f[i + j] != res.f[i + j])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static int
+check_maccpd ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 2; i = i + 2)
+ for (j = 0; j < 2; j++)
+ {
+ res.d[i + j] = (src1.d[i + j] * src2.d[i + j]) + src3.d[i + j];
+ if (dst.d[i + j] != res.d[i + j])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+
+static int
+check_maccss ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 4; i= i + 4)
+ {
+ res.f[i] = (src1.f[i] * src2.f[i]) + src3.f[i];
+ if (dst.f[i] != res.f[i])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static int
+check_maccsd ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 2; i = i + 2)
+ {
+ res.d[i] = (src1.d[i] * src2.d[i]) + src3.d[i];
+ if (dst.d[i] != res.d[i])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static void
+fma4_test (void)
+{
+ int i;
+
+ init_maccps ();
+
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm_macc_ps (src1.x[i], src2.x[i], src3.x[i]);
+
+ if (check_maccps ())
+ abort ();
+
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm_macc_ss (src1.x[i], src2.x[i], src3.x[i]);
+
+ if (check_maccss ())
+ abort ();
+
+ init_maccpd ();
+
+ for (i = 0; i < NUM; i++)
+ dst.y[i] = _mm_macc_pd (src1.y[i], src2.y[i], src3.y[i]);
+
+ if (check_maccpd ())
+ abort ();
+
+ for (i = 0; i < NUM; i++)
+ dst.y[i] = _mm_macc_sd (src1.y[i], src2.y[i], src3.y[i]);
+
+ if (check_maccsd ())
+ abort ();
+
+}
diff --git a/gcc/testsuite/gcc.target/i386/fma4-msubXX.c b/gcc/testsuite/gcc.target/i386/fma4-msubXX.c
new file mode 100644
index 0000000..eed7558
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/fma4-msubXX.c
@@ -0,0 +1,134 @@
+/* { dg-do run } */
+/* { dg-require-effective-target fma4 } */
+/* { dg-options "-O0 -mfma4" } */
+
+#include "fma4-check.h"
+
+#include <x86intrin.h>
+#include <string.h>
+
+#define NUM 20
+
+union
+{
+ __m128 x[NUM];
+ float f[NUM * 4];
+ __m128d y[NUM];
+ double d[NUM * 2];
+} dst, res, src1, src2, src3;
+
+/* Note that in the macc*, msub*, nmacc* and nmsub* instructions, the
+   intermediate product is not rounded; only the addition is rounded.  */
+
+static void
+init_msubps ()
+{
+ int i;
+ for (i = 0; i < NUM * 4; i++)
+ {
+ src1.f[i] = i;
+ src2.f[i] = i + 10;
+ src3.f[i] = i + 20;
+ }
+}
+
+static void
+init_msubpd ()
+{
+ int i;
+  for (i = 0; i < NUM * 2; i++)
+ {
+ src1.d[i] = i;
+ src2.d[i] = i + 10;
+ src3.d[i] = i + 20;
+ }
+}
+
+static int
+check_msubps ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 4; i = i + 4)
+ for (j = 0; j < 4; j++)
+ {
+ res.f[i + j] = (src1.f[i + j] * src2.f[i + j]) - src3.f[i + j];
+ if (dst.f[i + j] != res.f[i + j])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static int
+check_msubpd ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 2; i = i + 2)
+ for (j = 0; j < 2; j++)
+ {
+ res.d[i + j] = (src1.d[i + j] * src2.d[i + j]) - src3.d[i + j];
+ if (dst.d[i + j] != res.d[i + j])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+
+static int
+check_msubss ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 4; i = i + 4)
+ {
+ res.f[i] = (src1.f[i] * src2.f[i]) - src3.f[i];
+ if (dst.f[i] != res.f[i])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static int
+check_msubsd ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 2; i = i + 2)
+ {
+ res.d[i] = (src1.d[i] * src2.d[i]) - src3.d[i];
+ if (dst.d[i] != res.d[i])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static void
+fma4_test (void)
+{
+ int i;
+
+ init_msubps ();
+
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm_msub_ps (src1.x[i], src2.x[i], src3.x[i]);
+
+ if (check_msubps ())
+ abort ();
+
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm_msub_ss (src1.x[i], src2.x[i], src3.x[i]);
+
+ if (check_msubss ())
+ abort ();
+
+ init_msubpd ();
+
+ for (i = 0; i < NUM; i++)
+ dst.y[i] = _mm_msub_pd (src1.y[i], src2.y[i], src3.y[i]);
+
+ if (check_msubpd ())
+ abort ();
+
+ for (i = 0; i < NUM; i++)
+ dst.y[i] = _mm_msub_sd (src1.y[i], src2.y[i], src3.y[i]);
+
+ if (check_msubsd ())
+ abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/fma4-nmaccXX.c b/gcc/testsuite/gcc.target/i386/fma4-nmaccXX.c
new file mode 100644
index 0000000..9abf746
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/fma4-nmaccXX.c
@@ -0,0 +1,137 @@
+/* { dg-do run } */
+/* { dg-require-effective-target fma4 } */
+/* { dg-options "-O0 -mfma4" } */
+
+#include "fma4-check.h"
+
+#include <x86intrin.h>
+#include <string.h>
+
+#define NUM 20
+
+union
+{
+ __m128 x[NUM];
+ float f[NUM * 4];
+ __m128d y[NUM];
+ double d[NUM * 2];
+} dst, res, src1, src2, src3;
+
+/* Note that in the macc*, msub*, nmacc* and nmsub* instructions, the
+   intermediate product is not rounded; only the addition is rounded.  */
+
+static void
+init_nmaccps ()
+{
+ int i;
+ for (i = 0; i < NUM * 4; i++)
+ {
+ src1.f[i] = i;
+ src2.f[i] = i + 10;
+ src3.f[i] = i + 20;
+ }
+}
+
+static void
+init_nmaccpd ()
+{
+ int i;
+  for (i = 0; i < NUM * 2; i++)
+ {
+ src1.d[i] = i;
+ src2.d[i] = i + 10;
+ src3.d[i] = i + 20;
+ }
+}
+
+static int
+check_nmaccps ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 4; i = i + 4)
+ for (j = 0; j < 4; j++)
+ {
+ res.f[i + j] = - (src1.f[i + j] * src2.f[i + j]) + src3.f[i + j];
+ if (dst.f[i + j] != res.f[i + j])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static int
+check_nmaccpd ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 2; i = i + 2)
+ for (j = 0; j < 2; j++)
+ {
+ res.d[i + j] = - (src1.d[i + j] * src2.d[i + j]) + src3.d[i + j];
+ if (dst.d[i + j] != res.d[i + j])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+
+static int
+check_nmaccss ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 4; i = i + 4)
+ {
+ res.f[i] = - (src1.f[i] * src2.f[i]) + src3.f[i];
+ if (dst.f[i] != res.f[i])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static int
+check_nmaccsd ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 2; i = i + 2)
+ {
+ res.d[i] = - (src1.d[i] * src2.d[i]) + src3.d[i];
+ if (dst.d[i] != res.d[i])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static void
+fma4_test (void)
+{
+ int i;
+
+ init_nmaccps ();
+
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm_nmacc_ps (src1.x[i], src2.x[i], src3.x[i]);
+
+ if (check_nmaccps ())
+ abort ();
+
+
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm_nmacc_ss (src1.x[i], src2.x[i], src3.x[i]);
+
+ if (check_nmaccss ())
+ abort ();
+
+ init_nmaccpd ();
+
+ for (i = 0; i < NUM; i++)
+ dst.y[i] = _mm_nmacc_pd (src1.y[i], src2.y[i], src3.y[i]);
+
+ if (check_nmaccpd ())
+ abort ();
+
+
+ for (i = 0; i < NUM; i++)
+ dst.y[i] = _mm_nmacc_sd (src1.y[i], src2.y[i], src3.y[i]);
+
+ if (check_nmaccsd ())
+ abort ();
+
+}
diff --git a/gcc/testsuite/gcc.target/i386/fma4-nmsubXX.c b/gcc/testsuite/gcc.target/i386/fma4-nmsubXX.c
new file mode 100644
index 0000000..85fbecd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/fma4-nmsubXX.c
@@ -0,0 +1,137 @@
+/* { dg-do run } */
+/* { dg-require-effective-target fma4 } */
+/* { dg-options "-O0 -mfma4" } */
+
+#include "fma4-check.h"
+
+#include <x86intrin.h>
+#include <string.h>
+
+#define NUM 20
+
+union
+{
+ __m128 x[NUM];
+ float f[NUM * 4];
+ __m128d y[NUM];
+ double d[NUM * 2];
+} dst, res, src1, src2, src3;
+
+/* Note that in the macc*, msub*, nmacc* and nmsub* instructions, the
+   intermediate product is not rounded; only the addition is rounded.  */
+
+static void
+init_nmsubps ()
+{
+ int i;
+ for (i = 0; i < NUM * 4; i++)
+ {
+ src1.f[i] = i;
+ src2.f[i] = i + 10;
+ src3.f[i] = i + 20;
+ }
+}
+
+static void
+init_nmsubpd ()
+{
+ int i;
+  for (i = 0; i < NUM * 2; i++)
+ {
+ src1.d[i] = i;
+ src2.d[i] = i + 10;
+ src3.d[i] = i + 20;
+ }
+}
+
+static int
+check_nmsubps ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 4; i = i + 4)
+ for (j = 0; j < 4; j++)
+ {
+ res.f[i + j] = - (src1.f[i + j] * src2.f[i + j]) - src3.f[i + j];
+ if (dst.f[i + j] != res.f[i + j])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static int
+check_nmsubpd ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 2; i = i + 2)
+ for (j = 0; j < 2; j++)
+ {
+ res.d[i + j] = - (src1.d[i + j] * src2.d[i + j]) - src3.d[i + j];
+ if (dst.d[i + j] != res.d[i + j])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+
+static int
+check_nmsubss ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 4; i = i + 4)
+ {
+ res.f[i] = - (src1.f[i] * src2.f[i]) - src3.f[i];
+ if (dst.f[i] != res.f[i])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static int
+check_nmsubsd ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 2; i = i + 2)
+ {
+ res.d[i] = - (src1.d[i] * src2.d[i]) - src3.d[i];
+ if (dst.d[i] != res.d[i])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static void
+fma4_test (void)
+{
+ int i;
+
+ init_nmsubps ();
+
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm_nmsub_ps (src1.x[i], src2.x[i], src3.x[i]);
+
+  if (check_nmsubps ())
+ abort ();
+
+
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm_nmsub_ss (src1.x[i], src2.x[i], src3.x[i]);
+
+  if (check_nmsubss ())
+ abort ();
+
+ init_nmsubpd ();
+
+ for (i = 0; i < NUM; i++)
+ dst.y[i] = _mm_nmsub_pd (src1.y[i], src2.y[i], src3.y[i]);
+
+  if (check_nmsubpd ())
+ abort ();
+
+
+ for (i = 0; i < NUM; i++)
+ dst.y[i] = _mm_nmsub_sd (src1.y[i], src2.y[i], src3.y[i]);
+
+  if (check_nmsubsd ())
+ abort ();
+
+}
diff --git a/gcc/testsuite/gcc.target/i386/fma4-vector.c b/gcc/testsuite/gcc.target/i386/fma4-vector.c
new file mode 100644
index 0000000..df8463e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/fma4-vector.c
@@ -0,0 +1,93 @@
+/* Test that the compiler properly optimizes vector floating point multiply
+   and add instructions into vfmaddps on FMA4 systems.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -mfma4 -ftree-vectorize" } */
+
+extern void exit (int);
+
+typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
+typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
+
+#define SIZE 10240
+
+union {
+ __m128 f_align;
+ __m128d d_align;
+ float f[SIZE];
+ double d[SIZE];
+} a, b, c, d;
+
+void
+flt_mul_add (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.f[i] = (b.f[i] * c.f[i]) + d.f[i];
+}
+
+void
+dbl_mul_add (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.d[i] = (b.d[i] * c.d[i]) + d.d[i];
+}
+
+void
+flt_mul_sub (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.f[i] = (b.f[i] * c.f[i]) - d.f[i];
+}
+
+void
+dbl_mul_sub (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.d[i] = (b.d[i] * c.d[i]) - d.d[i];
+}
+
+void
+flt_neg_mul_add (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.f[i] = (-(b.f[i] * c.f[i])) + d.f[i];
+}
+
+void
+dbl_neg_mul_add (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.d[i] = (-(b.d[i] * c.d[i])) + d.d[i];
+}
+
+int main ()
+{
+ flt_mul_add ();
+ flt_mul_sub ();
+ flt_neg_mul_add ();
+
+ dbl_mul_add ();
+ dbl_mul_sub ();
+ dbl_neg_mul_add ();
+ exit (0);
+}
+
+/* { dg-final { scan-assembler "vfmaddps" } } */
+/* { dg-final { scan-assembler "vfmaddpd" } } */
+/* { dg-final { scan-assembler "vfmsubps" } } */
+/* { dg-final { scan-assembler "vfmsubpd" } } */
+/* { dg-final { scan-assembler "vfnmaddps" } } */
+/* { dg-final { scan-assembler "vfnmaddpd" } } */
diff --git a/gcc/testsuite/gcc.target/i386/funcspec-2.c b/gcc/testsuite/gcc.target/i386/funcspec-2.c
new file mode 100644
index 0000000..c132fc9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/funcspec-2.c
@@ -0,0 +1,99 @@
+/* Test whether we can generate FMA4 code using target-specific options.  */
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -march=k8" } */
+
+extern void exit (int);
+
+#define FMA4_ATTR __attribute__((__target__("fma4")))
+extern float flt_mul_add (float a, float b, float c) FMA4_ATTR;
+extern float flt_mul_sub (float a, float b, float c) FMA4_ATTR;
+extern float flt_neg_mul_add (float a, float b, float c) FMA4_ATTR;
+extern float flt_neg_mul_sub (float a, float b, float c) FMA4_ATTR;
+
+extern double dbl_mul_add (double a, double b, double c) FMA4_ATTR;
+extern double dbl_mul_sub (double a, double b, double c) FMA4_ATTR;
+extern double dbl_neg_mul_add (double a, double b, double c) FMA4_ATTR;
+extern double dbl_neg_mul_sub (double a, double b, double c) FMA4_ATTR;
+
+float
+flt_mul_add (float a, float b, float c)
+{
+ return (a * b) + c;
+}
+
+double
+dbl_mul_add (double a, double b, double c)
+{
+ return (a * b) + c;
+}
+
+float
+flt_mul_sub (float a, float b, float c)
+{
+ return (a * b) - c;
+}
+
+double
+dbl_mul_sub (double a, double b, double c)
+{
+ return (a * b) - c;
+}
+
+float
+flt_neg_mul_add (float a, float b, float c)
+{
+ return (-(a * b)) + c;
+}
+
+double
+dbl_neg_mul_add (double a, double b, double c)
+{
+ return (-(a * b)) + c;
+}
+
+float
+flt_neg_mul_sub (float a, float b, float c)
+{
+ return (-(a * b)) - c;
+}
+
+double
+dbl_neg_mul_sub (double a, double b, double c)
+{
+ return (-(a * b)) - c;
+}
+
+float f[10] = { 2, 3, 4 };
+double d[10] = { 2, 3, 4 };
+
+int main ()
+{
+ f[3] = flt_mul_add (f[0], f[1], f[2]);
+ f[4] = flt_mul_sub (f[0], f[1], f[2]);
+ f[5] = flt_neg_mul_add (f[0], f[1], f[2]);
+ f[6] = flt_neg_mul_sub (f[0], f[1], f[2]);
+
+ d[3] = dbl_mul_add (d[0], d[1], d[2]);
+ d[4] = dbl_mul_sub (d[0], d[1], d[2]);
+ d[5] = dbl_neg_mul_add (d[0], d[1], d[2]);
+ d[6] = dbl_neg_mul_sub (d[0], d[1], d[2]);
+ exit (0);
+}
+
+/* { dg-final { scan-assembler "vfmaddss" } } */
+/* { dg-final { scan-assembler "vfmaddsd" } } */
+/* { dg-final { scan-assembler "vfmsubss" } } */
+/* { dg-final { scan-assembler "vfmsubsd" } } */
+/* { dg-final { scan-assembler "vfnmaddss" } } */
+/* { dg-final { scan-assembler "vfnmaddsd" } } */
+/* { dg-final { scan-assembler "vfnmsubss" } } */
+/* { dg-final { scan-assembler "vfnmsubsd" } } */
+/* { dg-final { scan-assembler "call\t(.*)flt_mul_add" } } */
+/* { dg-final { scan-assembler "call\t(.*)flt_mul_sub" } } */
+/* { dg-final { scan-assembler "call\t(.*)flt_neg_mul_add" } } */
+/* { dg-final { scan-assembler "call\t(.*)flt_neg_mul_sub" } } */
+/* { dg-final { scan-assembler "call\t(.*)dbl_mul_add" } } */
+/* { dg-final { scan-assembler "call\t(.*)dbl_mul_sub" } } */
+/* { dg-final { scan-assembler "call\t(.*)dbl_neg_mul_add" } } */
+/* { dg-final { scan-assembler "call\t(.*)dbl_neg_mul_sub" } } */
diff --git a/gcc/testsuite/gcc.target/i386/funcspec-4.c b/gcc/testsuite/gcc.target/i386/funcspec-4.c
index e2eef41..025b97d 100644
--- a/gcc/testsuite/gcc.target/i386/funcspec-4.c
+++ b/gcc/testsuite/gcc.target/i386/funcspec-4.c
@@ -1,6 +1,9 @@
/* Test some error conditions with function specific options. */
/* { dg-do compile } */
+/* no fma400 switch */
+extern void error1 (void) __attribute__((__target__("fma400"))); /* { dg-error "unknown" } */
+
/* Multiple arch switches */
extern void error2 (void) __attribute__((__target__("arch=core2,arch=k8"))); /* { dg-error "already specified" } */
diff --git a/gcc/testsuite/gcc.target/i386/funcspec-5.c b/gcc/testsuite/gcc.target/i386/funcspec-5.c
index cbecfaa..34da51c 100644
--- a/gcc/testsuite/gcc.target/i386/funcspec-5.c
+++ b/gcc/testsuite/gcc.target/i386/funcspec-5.c
@@ -16,6 +16,7 @@ extern void test_sse4 (void) __attribute__((__target__("sse4")));
extern void test_sse4_1 (void) __attribute__((__target__("sse4.1")));
extern void test_sse4_2 (void) __attribute__((__target__("sse4.2")));
extern void test_sse4a (void) __attribute__((__target__("sse4a")));
+extern void test_fma4 (void) __attribute__((__target__("fma4")));
extern void test_ssse3 (void) __attribute__((__target__("ssse3")));
extern void test_no_abm (void) __attribute__((__target__("no-abm")));
@@ -31,6 +32,7 @@ extern void test_no_sse4 (void) __attribute__((__target__("no-sse4")));
extern void test_no_sse4_1 (void) __attribute__((__target__("no-sse4.1")));
extern void test_no_sse4_2 (void) __attribute__((__target__("no-sse4.2")));
extern void test_no_sse4a (void) __attribute__((__target__("no-sse4a")));
+extern void test_no_fma4 (void) __attribute__((__target__("no-fma4")));
extern void test_no_ssse3 (void) __attribute__((__target__("no-ssse3")));
extern void test_arch_i386 (void) __attribute__((__target__("arch=i386")));
diff --git a/gcc/testsuite/gcc.target/i386/funcspec-6.c b/gcc/testsuite/gcc.target/i386/funcspec-6.c
index 7f46ad0..575be9b 100644
--- a/gcc/testsuite/gcc.target/i386/funcspec-6.c
+++ b/gcc/testsuite/gcc.target/i386/funcspec-6.c
@@ -16,6 +16,7 @@ extern void test_sse4 (void) __attribute__((__target__("sse4")));
extern void test_sse4_1 (void) __attribute__((__target__("sse4.1")));
extern void test_sse4_2 (void) __attribute__((__target__("sse4.2")));
extern void test_sse4a (void) __attribute__((__target__("sse4a")));
+extern void test_fma4 (void) __attribute__((__target__("fma4")));
extern void test_ssse3 (void) __attribute__((__target__("ssse3")));
extern void test_no_abm (void) __attribute__((__target__("no-abm")));
@@ -31,6 +32,7 @@ extern void test_no_sse4 (void) __attribute__((__target__("no-sse4")));
extern void test_no_sse4_1 (void) __attribute__((__target__("no-sse4.1")));
extern void test_no_sse4_2 (void) __attribute__((__target__("no-sse4.2")));
extern void test_no_sse4a (void) __attribute__((__target__("no-sse4a")));
+extern void test_no_fma4 (void) __attribute__((__target__("no-fma4")));
extern void test_no_ssse3 (void) __attribute__((__target__("no-ssse3")));
extern void test_arch_nocona (void) __attribute__((__target__("arch=nocona")));
diff --git a/gcc/testsuite/gcc.target/i386/funcspec-8.c b/gcc/testsuite/gcc.target/i386/funcspec-8.c
index c370733..ba4b7f2 100644
--- a/gcc/testsuite/gcc.target/i386/funcspec-8.c
+++ b/gcc/testsuite/gcc.target/i386/funcspec-8.c
@@ -104,6 +104,25 @@ generic_insertq (__m128i a, __m128i b)
return __builtin_ia32_insertq (a, b); /* { dg-error "needs isa option" } */
}
+#ifdef __FMA4__
+#error "-mfma4 should not be set for this test"
+#endif
+
+__m128d fma4_fmaddpd (__m128d a, __m128d b, __m128d c) __attribute__((__target__("fma4")));
+__m128d generic_fmaddpd (__m128d a, __m128d b, __m128d c);
+
+__m128d
+fma4_fmaddpd (__m128d a, __m128d b, __m128d c)
+{
+ return __builtin_ia32_vfmaddpd (a, b, c);
+}
+
+__m128d
+generic_fmaddpd (__m128d a, __m128d b, __m128d c)
+{
+ return __builtin_ia32_vfmaddpd (a, b, c); /* { dg-error "needs isa option" } */
+}
+
#ifdef __AES__
#error "-maes should not be set for this test"
#endif
diff --git a/gcc/testsuite/gcc.target/i386/funcspec-9.c b/gcc/testsuite/gcc.target/i386/funcspec-9.c
new file mode 100644
index 0000000..78714e1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/funcspec-9.c
@@ -0,0 +1,36 @@
+/* Test whether we can generate FMA4 code using target-specific options. */
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=k8 -mfpmath=sse -msse2" } */
+
+extern void exit (int);
+
+#ifdef __FMA4__
+#warning "__FMA4__ should not be defined before #pragma GCC target."
+#endif
+
+#pragma GCC push_options
+#pragma GCC target ("fma4")
+
+#ifndef __FMA4__
+#warning "__FMA4__ should have be defined after #pragma GCC target."
+#endif
+
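+/* With the "fma4" target pragma in effect, the dg-final check below expects
+   this function to use vfmaddss; dbl_mul_add, defined after pop_options,
+   should fall back to a plain addsd.  */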
+float
+flt_mul_add (float a, float b, float c)
+{
+ return (a * b) + c;
+}
+
+#pragma GCC pop_options
+#ifdef __FMA4__
+#warning "__FMA4__ should not be defined after #pragma GCC pop target."
+#endif
+
+double
+dbl_mul_add (double a, double b, double c)
+{
+ return (a * b) + c;
+}
+
+/* { dg-final { scan-assembler "vfmaddss" } } */
+/* { dg-final { scan-assembler "addsd" } } */
diff --git a/gcc/testsuite/gcc.target/i386/i386.exp b/gcc/testsuite/gcc.target/i386/i386.exp
index 242c40f..c7c6e12 100644
--- a/gcc/testsuite/gcc.target/i386/i386.exp
+++ b/gcc/testsuite/gcc.target/i386/i386.exp
@@ -120,6 +120,20 @@ proc check_effective_target_sse4a { } {
} "-O2 -msse4a" ]
}
+# Return 1 if fma4 instructions can be compiled.
+proc check_effective_target_fma4 { } {
+ return [check_no_compiler_messages fma4 object {
+ typedef float __m128 __attribute__ ((__vector_size__ (16)));
+ typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+ __m128 _mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
+ {
+ return (__m128) __builtin_ia32_vfmaddps ((__v4sf)__A,
+ (__v4sf)__B,
+ (__v4sf)__C);
+ }
+ } "-O2 -mfma4" ]
+}
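+
+# Usage sketch (illustrative, not added by this patch): a test can gate
+# itself on this new effective target with a directive such as
+#   /* { dg-require-effective-target fma4 } */
+# so it is skipped where -mfma4 code cannot be compiled and assembled.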
+
# If a testcase doesn't have special options, use these.
global DEFAULT_CFLAGS
if ![info exists DEFAULT_CFLAGS] then {
diff --git a/gcc/testsuite/gcc.target/i386/isa-1.c b/gcc/testsuite/gcc.target/i386/isa-1.c
index b2040b3..d98c14f 100644
--- a/gcc/testsuite/gcc.target/i386/isa-1.c
+++ b/gcc/testsuite/gcc.target/i386/isa-1.c
@@ -27,5 +27,11 @@ main ()
#if defined __SSE4A__
abort ();
#endif
+#if defined __AVX__
+ abort ();
+#endif
+#if defined __FMA4__
+ abort ();
+#endif
return 0;
}
diff --git a/gcc/testsuite/gcc.target/i386/isa-10.c b/gcc/testsuite/gcc.target/i386/isa-10.c
new file mode 100644
index 0000000..5f57be9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/isa-10.c
@@ -0,0 +1,34 @@
+/* { dg-do run } */
+/* { dg-options "-march=x86-64 -mfma4 -mno-sse4" } */
+
+extern void abort (void);
+
+int
+main ()
+{
+#if !defined __SSE__
+ abort ();
+#endif
+#if !defined __SSE2__
+ abort ();
+#endif
+#if !defined __SSE3__
+ abort ();
+#endif
+#if !defined __SSSE3__
+ abort ();
+#endif
+#if defined __SSE4_1__
+ abort ();
+#endif
+#if defined __SSE4_2__
+ abort ();
+#endif
+#if !defined __SSE4A__
+ abort ();
+#endif
+#if defined __FMA4__
+ abort ();
+#endif
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/isa-11.c b/gcc/testsuite/gcc.target/i386/isa-11.c
new file mode 100644
index 0000000..64755b09
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/isa-11.c
@@ -0,0 +1,34 @@
+/* { dg-do run } */
+/* { dg-options "-march=x86-64 -mfma4 -mno-ssse3" } */
+
+extern void abort (void);
+
+int
+main ()
+{
+#if !defined __SSE__
+ abort ();
+#endif
+#if !defined __SSE2__
+ abort ();
+#endif
+#if !defined __SSE3__
+ abort ();
+#endif
+#if defined __SSSE3__
+ abort ();
+#endif
+#if defined __SSE4_1__
+ abort ();
+#endif
+#if defined __SSE4_2__
+ abort ();
+#endif
+#if !defined __SSE4A__
+ abort ();
+#endif
+#if defined __FMA4__
+ abort ();
+#endif
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/isa-12.c b/gcc/testsuite/gcc.target/i386/isa-12.c
new file mode 100644
index 0000000..fde84a2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/isa-12.c
@@ -0,0 +1,34 @@
+/* { dg-do run } */
+/* { dg-options "-march=x86-64 -mfma4 -mno-sse3" } */
+
+extern void abort (void);
+
+int
+main ()
+{
+#if !defined __SSE__
+ abort ();
+#endif
+#if !defined __SSE2__
+ abort ();
+#endif
+#if defined __SSE3__
+ abort ();
+#endif
+#if defined __SSSE3__
+ abort ();
+#endif
+#if defined __SSE4_1__
+ abort ();
+#endif
+#if defined __SSE4_2__
+ abort ();
+#endif
+#if defined __SSE4A__
+ abort ();
+#endif
+#if defined __FMA4__
+ abort ();
+#endif
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/isa-13.c b/gcc/testsuite/gcc.target/i386/isa-13.c
new file mode 100644
index 0000000..74e37d9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/isa-13.c
@@ -0,0 +1,34 @@
+/* { dg-do run } */
+/* { dg-options "-march=x86-64 -mfma4 -mno-sse2" } */
+
+extern void abort (void);
+
+int
+main ()
+{
+#if !defined __SSE__
+ abort ();
+#endif
+#if defined __SSE2__
+ abort ();
+#endif
+#if defined __SSE3__
+ abort ();
+#endif
+#if defined __SSSE3__
+ abort ();
+#endif
+#if defined __SSE4_1__
+ abort ();
+#endif
+#if defined __SSE4_2__
+ abort ();
+#endif
+#if defined __SSE4A__
+ abort ();
+#endif
+#if defined __FMA4__
+ abort ();
+#endif
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/isa-14.c b/gcc/testsuite/gcc.target/i386/isa-14.c
index 09dda6d..5d49e6e 100644
--- a/gcc/testsuite/gcc.target/i386/isa-14.c
+++ b/gcc/testsuite/gcc.target/i386/isa-14.c
@@ -1,5 +1,5 @@
/* { dg-do run } */
-/* { dg-options "-march=x86-64 -msse4a -mno-sse" } */
+/* { dg-options "-march=x86-64 -msse4a -mfma4 -mno-sse" } */
extern void abort (void);
@@ -27,5 +27,8 @@ main ()
#if defined __SSE4A__
abort ();
#endif
+#if defined __FMA4__
+ abort ();
+#endif
return 0;
}
diff --git a/gcc/testsuite/gcc.target/i386/isa-2.c b/gcc/testsuite/gcc.target/i386/isa-2.c
new file mode 100644
index 0000000..aa8958c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/isa-2.c
@@ -0,0 +1,37 @@
+/* { dg-do run } */
+/* { dg-options "-march=x86-64 -msse4 -mfma4" } */
+
+extern void abort (void);
+
+int
+main ()
+{
+#if !defined __SSE__
+ abort ();
+#endif
+#if !defined __SSE2__
+ abort ();
+#endif
+#if !defined __SSE3__
+ abort ();
+#endif
+#if !defined __SSSE3__
+ abort ();
+#endif
+#if !defined __SSE4_1__
+ abort ();
+#endif
+#if !defined __SSE4_2__
+ abort ();
+#endif
+#if !defined __SSE4A__
+ abort ();
+#endif
+#if !defined __AVX__
+ abort ();
+#endif
+#if !defined __FMA4__
+ abort ();
+#endif
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/isa-3.c b/gcc/testsuite/gcc.target/i386/isa-3.c
new file mode 100644
index 0000000..a4d93f4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/isa-3.c
@@ -0,0 +1,37 @@
+/* { dg-do run } */
+/* { dg-options "-march=x86-64 -msse4 -mfma4 -msse4a" } */
+
+extern void abort (void);
+
+int
+main ()
+{
+#if !defined __SSE__
+ abort ();
+#endif
+#if !defined __SSE2__
+ abort ();
+#endif
+#if !defined __SSE3__
+ abort ();
+#endif
+#if !defined __SSSE3__
+ abort ();
+#endif
+#if !defined __SSE4_1__
+ abort ();
+#endif
+#if !defined __SSE4_2__
+ abort ();
+#endif
+#if !defined __SSE4A__
+ abort ();
+#endif
+#if !defined __AVX__
+ abort ();
+#endif
+#if !defined __FMA4__
+ abort ();
+#endif
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/isa-4.c b/gcc/testsuite/gcc.target/i386/isa-4.c
new file mode 100644
index 0000000..0137257
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/isa-4.c
@@ -0,0 +1,37 @@
+/* { dg-do run } */
+/* { dg-options "-march=core2 -mfma4 -mno-sse4" } */
+
+extern void abort (void);
+
+int
+main ()
+{
+#if !defined __SSE__
+ abort ();
+#endif
+#if !defined __SSE2__
+ abort ();
+#endif
+#if !defined __SSE3__
+ abort ();
+#endif
+#if !defined __SSSE3__
+ abort ();
+#endif
+#if defined __SSE4_1__
+ abort ();
+#endif
+#if defined __SSE4_2__
+ abort ();
+#endif
+#if !defined __SSE4A__
+ abort ();
+#endif
+#if defined __AVX__
+ abort ();
+#endif
+#if defined __FMA4__
+ abort ();
+#endif
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/isa-5.c b/gcc/testsuite/gcc.target/i386/isa-5.c
index 2157499..39d065e 100644
--- a/gcc/testsuite/gcc.target/i386/isa-5.c
+++ b/gcc/testsuite/gcc.target/i386/isa-5.c
@@ -27,5 +27,11 @@ main ()
#if !defined __SSE4A__
abort ();
#endif
+#if defined __AVX__
+ abort ();
+#endif
+#if defined __FMA4__
+ abort ();
+#endif
return 0;
}
diff --git a/gcc/testsuite/gcc.target/i386/isa-6.c b/gcc/testsuite/gcc.target/i386/isa-6.c
index 389621b..a9a0ddb 100644
--- a/gcc/testsuite/gcc.target/i386/isa-6.c
+++ b/gcc/testsuite/gcc.target/i386/isa-6.c
@@ -28,5 +28,11 @@ main ()
#if !defined __SSE4A__
abort ();
#endif
+#if defined __AVX__
+ abort ();
+#endif
+#if defined __FMA4__
+ abort ();
+#endif
return 0;
}
diff --git a/gcc/testsuite/gcc.target/i386/isa-7.c b/gcc/testsuite/gcc.target/i386/isa-7.c
new file mode 100644
index 0000000..8dd628e9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/isa-7.c
@@ -0,0 +1,37 @@
+/* { dg-do run } */
+/* { dg-options "-march=amdfam10 -mfma4 -mno-sse4" } */
+
+extern void abort (void);
+
+int
+main ()
+{
+#if !defined __SSE__
+ abort ();
+#endif
+#if !defined __SSE2__
+ abort ();
+#endif
+#if !defined __SSE3__
+ abort ();
+#endif
+#if !defined __SSSE3__
+ abort ();
+#endif
+#if defined __SSE4_1__
+ abort ();
+#endif
+#if defined __SSE4_2__
+ abort ();
+#endif
+#if !defined __SSE4A__
+ abort ();
+#endif
+#if defined __AVX__
+ abort ();
+#endif
+#if defined __FMA4__
+ abort ();
+#endif
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/isa-8.c b/gcc/testsuite/gcc.target/i386/isa-8.c
new file mode 100644
index 0000000..2ffd80f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/isa-8.c
@@ -0,0 +1,34 @@
+/* { dg-do run } */
+/* { dg-options "-march=amdfam10 -mfma4 -mno-sse4a" } */
+
+extern void abort (void);
+
+int
+main ()
+{
+#if !defined __SSE__
+ abort ();
+#endif
+#if !defined __SSE2__
+ abort ();
+#endif
+#if !defined __SSE3__
+ abort ();
+#endif
+#if !defined __SSSE3__
+ abort ();
+#endif
+#if !defined __SSE4_1__
+ abort ();
+#endif
+#if !defined __SSE4_2__
+ abort ();
+#endif
+#if defined __SSE4A__
+ abort ();
+#endif
+#if defined __FMA4__
+ abort ();
+#endif
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/isa-9.c b/gcc/testsuite/gcc.target/i386/isa-9.c
new file mode 100644
index 0000000..64cbdbd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/isa-9.c
@@ -0,0 +1,34 @@
+/* { dg-do run } */
+/* { dg-options "-march=amdfam10 -mno-fma4" } */
+
+extern void abort (void);
+
+int
+main ()
+{
+#if !defined __SSE__
+ abort ();
+#endif
+#if !defined __SSE2__
+ abort ();
+#endif
+#if !defined __SSE3__
+ abort ();
+#endif
+#if defined __SSSE3__
+ abort ();
+#endif
+#if defined __SSE4_1__
+ abort ();
+#endif
+#if defined __SSE4_2__
+ abort ();
+#endif
+#if !defined __SSE4A__
+ abort ();
+#endif
+#if defined __FMA4__
+ abort ();
+#endif
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse-12.c b/gcc/testsuite/gcc.target/i386/sse-12.c
new file mode 100644
index 0000000..85c36c8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse-12.c
@@ -0,0 +1,8 @@
+/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h and mm_malloc.h are
+ usable with -O -std=c89 -pedantic-errors. */
+/* { dg-do compile } */
+/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -m3dnow -mavx -mfma4 -maes -mpclmul" } */
+
+#include <x86intrin.h>
+
+int dummy;
diff --git a/gcc/testsuite/gcc.target/i386/sse-13.c b/gcc/testsuite/gcc.target/i386/sse-13.c
new file mode 100644
index 0000000..1ce9d96
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse-13.c
@@ -0,0 +1,128 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -maes -mpclmul" } */
+
+#include <mm_malloc.h>
+
+/* Test that the intrinsics compile with optimization. All of them are
+ defined as inline functions in {,x,e,p,t,s,w,a,b,i}mmintrin.h and mm3dnow.h
+ that reference the proper builtin functions. Defining away "extern" and
+ "__inline" results in all of them being compiled as proper functions. */
+
+#define extern
+#define __inline
+
+/* Following intrinsics require immediate arguments. */
+
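+/* Illustrative sketch of the trick: each redefinition below shadows the
+   builtin so that its immediate operands become literal constants, e.g.
+     __builtin_ia32_extrqi (x, i, l)  ->  __builtin_ia32_extrqi (x, 1, 1)
+   which lets the header wrappers compile even though, with "extern" and
+   "__inline" defined away, their arguments are ordinary parameters.  */
+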
+/* ammintrin.h */
+#define __builtin_ia32_extrqi(X, I, L) __builtin_ia32_extrqi(X, 1, 1)
+#define __builtin_ia32_insertqi(X, Y, I, L) __builtin_ia32_insertqi(X, Y, 1, 1)
+
+/* immintrin.h */
+#define __builtin_ia32_blendpd256(X, Y, M) __builtin_ia32_blendpd256(X, Y, 1)
+#define __builtin_ia32_blendps256(X, Y, M) __builtin_ia32_blendps256(X, Y, 1)
+#define __builtin_ia32_dpps256(X, Y, M) __builtin_ia32_dpps256(X, Y, 1)
+#define __builtin_ia32_shufpd256(X, Y, M) __builtin_ia32_shufpd256(X, Y, 1)
+#define __builtin_ia32_shufps256(X, Y, M) __builtin_ia32_shufps256(X, Y, 1)
+#define __builtin_ia32_cmpsd(X, Y, O) __builtin_ia32_cmpsd(X, Y, 1)
+#define __builtin_ia32_cmpss(X, Y, O) __builtin_ia32_cmpss(X, Y, 1)
+#define __builtin_ia32_cmppd(X, Y, O) __builtin_ia32_cmppd(X, Y, 1)
+#define __builtin_ia32_cmpps(X, Y, O) __builtin_ia32_cmpps(X, Y, 1)
+#define __builtin_ia32_cmppd256(X, Y, O) __builtin_ia32_cmppd256(X, Y, 1)
+#define __builtin_ia32_cmpps256(X, Y, O) __builtin_ia32_cmpps256(X, Y, 1)
+#define __builtin_ia32_vextractf128_pd256(X, N) __builtin_ia32_vextractf128_pd256(X, 1)
+#define __builtin_ia32_vextractf128_ps256(X, N) __builtin_ia32_vextractf128_ps256(X, 1)
+#define __builtin_ia32_vextractf128_si256(X, N) __builtin_ia32_vextractf128_si256(X, 1)
+#define __builtin_ia32_vpermilpd(X, N) __builtin_ia32_vpermilpd(X, 1)
+#define __builtin_ia32_vpermilpd256(X, N) __builtin_ia32_vpermilpd256(X, 1)
+#define __builtin_ia32_vpermilps(X, N) __builtin_ia32_vpermilps(X, 1)
+#define __builtin_ia32_vpermilps256(X, N) __builtin_ia32_vpermilps256(X, 1)
+#define __builtin_ia32_vpermil2pd(X, Y, C, I) __builtin_ia32_vpermil2pd(X, Y, C, 1)
+#define __builtin_ia32_vpermil2pd256(X, Y, C, I) __builtin_ia32_vpermil2pd256(X, Y, C, 1)
+#define __builtin_ia32_vpermil2ps(X, Y, C, I) __builtin_ia32_vpermil2ps(X, Y, C, 1)
+#define __builtin_ia32_vpermil2ps256(X, Y, C, I) __builtin_ia32_vpermil2ps256(X, Y, C, 1)
+#define __builtin_ia32_vperm2f128_pd256(X, Y, C) __builtin_ia32_vperm2f128_pd256(X, Y, 1)
+#define __builtin_ia32_vperm2f128_ps256(X, Y, C) __builtin_ia32_vperm2f128_ps256(X, Y, 1)
+#define __builtin_ia32_vperm2f128_si256(X, Y, C) __builtin_ia32_vperm2f128_si256(X, Y, 1)
+#define __builtin_ia32_vinsertf128_pd256(X, Y, C) __builtin_ia32_vinsertf128_pd256(X, Y, 1)
+#define __builtin_ia32_vinsertf128_ps256(X, Y, C) __builtin_ia32_vinsertf128_ps256(X, Y, 1)
+#define __builtin_ia32_vinsertf128_si256(X, Y, C) __builtin_ia32_vinsertf128_si256(X, Y, 1)
+#define __builtin_ia32_roundpd256(V, M) __builtin_ia32_roundpd256(V, 1)
+#define __builtin_ia32_roundps256(V, M) __builtin_ia32_roundps256(V, 1)
+
+/* wmmintrin.h */
+#define __builtin_ia32_aeskeygenassist128(X, C) __builtin_ia32_aeskeygenassist128(X, 1)
+#define __builtin_ia32_pclmulqdq128(X, Y, I) __builtin_ia32_pclmulqdq128(X, Y, 1)
+
+/* smmintrin.h */
+#define __builtin_ia32_roundpd(V, M) __builtin_ia32_roundpd(V, 1)
+#define __builtin_ia32_roundsd(D, V, M) __builtin_ia32_roundsd(D, V, 1)
+#define __builtin_ia32_roundps(V, M) __builtin_ia32_roundps(V, 1)
+#define __builtin_ia32_roundss(D, V, M) __builtin_ia32_roundss(D, V, 1)
+
+#define __builtin_ia32_pblendw128(X, Y, M) __builtin_ia32_pblendw128 (X, Y, 1)
+#define __builtin_ia32_blendps(X, Y, M) __builtin_ia32_blendps(X, Y, 1)
+#define __builtin_ia32_blendpd(X, Y, M) __builtin_ia32_blendpd(X, Y, 1)
+#define __builtin_ia32_dpps(X, Y, M) __builtin_ia32_dpps(X, Y, 1)
+#define __builtin_ia32_dppd(X, Y, M) __builtin_ia32_dppd(X, Y, 1)
+#define __builtin_ia32_insertps128(D, S, N) __builtin_ia32_insertps128(D, S, 1)
+#define __builtin_ia32_vec_ext_v4sf(X, N) __builtin_ia32_vec_ext_v4sf(X, 1)
+#define __builtin_ia32_vec_set_v16qi(D, S, N) __builtin_ia32_vec_set_v16qi(D, S, 1)
+#define __builtin_ia32_vec_set_v4si(D, S, N) __builtin_ia32_vec_set_v4si(D, S, 1)
+#define __builtin_ia32_vec_set_v2di(D, S, N) __builtin_ia32_vec_set_v2di(D, S, 1)
+#define __builtin_ia32_vec_ext_v16qi(X, N) __builtin_ia32_vec_ext_v16qi(X, 1)
+#define __builtin_ia32_vec_ext_v4si(X, N) __builtin_ia32_vec_ext_v4si(X, 1)
+#define __builtin_ia32_vec_ext_v2di(X, N) __builtin_ia32_vec_ext_v2di(X, 1)
+#define __builtin_ia32_mpsadbw128(X, Y, M) __builtin_ia32_mpsadbw128(X, Y, 1)
+#define __builtin_ia32_pcmpistrm128(X, Y, M) \
+ __builtin_ia32_pcmpistrm128(X, Y, 1)
+#define __builtin_ia32_pcmpistri128(X, Y, M) \
+ __builtin_ia32_pcmpistri128(X, Y, 1)
+#define __builtin_ia32_pcmpestrm128(X, LX, Y, LY, M) \
+ __builtin_ia32_pcmpestrm128(X, LX, Y, LY, 1)
+#define __builtin_ia32_pcmpestri128(X, LX, Y, LY, M) \
+ __builtin_ia32_pcmpestri128(X, LX, Y, LY, 1)
+#define __builtin_ia32_pcmpistria128(X, Y, M) \
+ __builtin_ia32_pcmpistria128(X, Y, 1)
+#define __builtin_ia32_pcmpistric128(X, Y, M) \
+ __builtin_ia32_pcmpistric128(X, Y, 1)
+#define __builtin_ia32_pcmpistrio128(X, Y, M) \
+ __builtin_ia32_pcmpistrio128(X, Y, 1)
+#define __builtin_ia32_pcmpistris128(X, Y, M) \
+ __builtin_ia32_pcmpistris128(X, Y, 1)
+#define __builtin_ia32_pcmpistriz128(X, Y, M) \
+ __builtin_ia32_pcmpistriz128(X, Y, 1)
+#define __builtin_ia32_pcmpestria128(X, LX, Y, LY, M) \
+ __builtin_ia32_pcmpestria128(X, LX, Y, LY, 1)
+#define __builtin_ia32_pcmpestric128(X, LX, Y, LY, M) \
+ __builtin_ia32_pcmpestric128(X, LX, Y, LY, 1)
+#define __builtin_ia32_pcmpestrio128(X, LX, Y, LY, M) \
+ __builtin_ia32_pcmpestrio128(X, LX, Y, LY, 1)
+#define __builtin_ia32_pcmpestris128(X, LX, Y, LY, M) \
+ __builtin_ia32_pcmpestris128(X, LX, Y, LY, 1)
+#define __builtin_ia32_pcmpestriz128(X, LX, Y, LY, M) \
+ __builtin_ia32_pcmpestriz128(X, LX, Y, LY, 1)
+
+/* tmmintrin.h */
+#define __builtin_ia32_palignr128(X, Y, N) __builtin_ia32_palignr128(X, Y, 8)
+#define __builtin_ia32_palignr(X, Y, N) __builtin_ia32_palignr(X, Y, 8)
+
+/* emmintrin.h */
+#define __builtin_ia32_psrldqi128(A, B) __builtin_ia32_psrldqi128(A, 8)
+#define __builtin_ia32_pslldqi128(A, B) __builtin_ia32_pslldqi128(A, 8)
+#define __builtin_ia32_pshufhw(A, N) __builtin_ia32_pshufhw(A, 0)
+#define __builtin_ia32_pshuflw(A, N) __builtin_ia32_pshuflw(A, 0)
+#define __builtin_ia32_pshufd(A, N) __builtin_ia32_pshufd(A, 0)
+#define __builtin_ia32_vec_set_v8hi(A, D, N) \
+ __builtin_ia32_vec_set_v8hi(A, D, 0)
+#define __builtin_ia32_vec_ext_v8hi(A, N) __builtin_ia32_vec_ext_v8hi(A, 0)
+#define __builtin_ia32_shufpd(A, B, N) __builtin_ia32_shufpd(A, B, 0)
+
+/* xmmintrin.h */
+#define __builtin_prefetch(P, A, I) __builtin_prefetch(P, A, _MM_HINT_NTA)
+#define __builtin_ia32_pshufw(A, N) __builtin_ia32_pshufw(A, 0)
+#define __builtin_ia32_vec_set_v4hi(A, D, N) \
+ __builtin_ia32_vec_set_v4hi(A, D, 0)
+#define __builtin_ia32_vec_ext_v4hi(A, N) __builtin_ia32_vec_ext_v4hi(A, 0)
+#define __builtin_ia32_shufps(A, B, N) __builtin_ia32_shufps(A, B, 0)
+
+#include <x86intrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/sse-14.c b/gcc/testsuite/gcc.target/i386/sse-14.c
new file mode 100644
index 0000000..c1ddb96
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse-14.c
@@ -0,0 +1,157 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -msse4a -maes -mpclmul" } */
+
+#include <mm_malloc.h>
+
+/* Test that the intrinsics compile without optimization. All of them are
+ defined as inline functions in {,x,e,p,t,s,w,a}mmintrin.h and mm3dnow.h
+ that reference the proper builtin functions. Defining away "extern" and
+ "__inline" results in all of them being compiled as proper functions. */
+
+#define extern
+#define __inline
+
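+/* Illustrative sketch (assuming the usual GCC header style): a definition
+   along the lines of
+     extern __inline __m128 _mm_setzero_ps (void) { ... }
+   loses its "extern" and "__inline" keywords here, so each intrinsic wrapper
+   is compiled as a proper function and its builtin call is type-checked.  */
+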
+#include <x86intrin.h>
+
+#define _CONCAT(x,y) x ## y
+
+#define test_1(func, type, op1_type, imm) \
+ type _CONCAT(_,func) (op1_type A, int const I) \
+ { return func (A, imm); }
+
+#define test_1x(func, type, op1_type, imm1, imm2) \
+ type _CONCAT(_,func) (op1_type A, int const I, int const L) \
+ { return func (A, imm1, imm2); }
+
+#define test_2(func, type, op1_type, op2_type, imm) \
+ type _CONCAT(_,func) (op1_type A, op2_type B, int const I) \
+ { return func (A, B, imm); }
+
+#define test_2x(func, type, op1_type, op2_type, imm1, imm2) \
+ type _CONCAT(_,func) (op1_type A, op2_type B, int const I, int const L) \
+ { return func (A, B, imm1, imm2); }
+
+#define test_3(func, type, op1_type, op2_type, op3_type, imm) \
+ type _CONCAT(_,func) (op1_type A, op2_type B, \
+ op3_type C, int const I) \
+ { return func (A, B, C, imm); }
+
+#define test_4(func, type, op1_type, op2_type, op3_type, op4_type, imm) \
+ type _CONCAT(_,func) (op1_type A, op2_type B, \
+ op3_type C, op4_type D, int const I) \
+ { return func (A, B, C, D, imm); }
+
+
+/* Following intrinsics require immediate arguments. They
+ are defined as macros for non-optimized compilations. */
+
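+/* For illustration (not part of the checks): with -O0 the wrapper
+     test_1 (_mm_round_pd, __m128d, __m128d, 1)
+   used below expands to roughly
+     __m128d __mm_round_pd (__m128d A, int const I)
+     { return _mm_round_pd (A, 1); }
+   so the macro form of the intrinsic still receives a literal immediate.  */
+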
+/* ammintrin.h */
+test_1x (_mm_extracti_si64, __m128i, __m128i, 1, 1)
+test_2x (_mm_inserti_si64, __m128i, __m128i, __m128i, 1, 1)
+
+/* immintrin.h */
+test_2 (_mm256_blend_pd, __m256d, __m256d, __m256d, 1)
+test_2 (_mm256_blend_ps, __m256, __m256, __m256, 1)
+test_2 (_mm256_dp_ps, __m256, __m256, __m256, 1)
+test_2 (_mm256_shuffle_pd, __m256d, __m256d, __m256d, 1)
+test_2 (_mm256_shuffle_ps, __m256, __m256, __m256, 1)
+test_2 (_mm_cmp_sd, __m128d, __m128d, __m128d, 1)
+test_2 (_mm_cmp_ss, __m128, __m128, __m128, 1)
+test_2 (_mm_cmp_pd, __m128d, __m128d, __m128d, 1)
+test_2 (_mm_cmp_ps, __m128, __m128, __m128, 1)
+test_2 (_mm256_cmp_pd, __m256d, __m256d, __m256d, 1)
+test_2 (_mm256_cmp_ps, __m256, __m256, __m256, 1)
+test_1 (_mm256_extractf128_pd, __m128d, __m256d, 1)
+test_1 (_mm256_extractf128_ps, __m128, __m256, 1)
+test_1 (_mm256_extractf128_si256, __m128i, __m256i, 1)
+test_1 (_mm256_extract_epi8, int, __m256i, 20)
+test_1 (_mm256_extract_epi16, int, __m256i, 13)
+test_1 (_mm256_extract_epi32, int, __m256i, 6)
+#ifdef __x86_64__
+test_1 (_mm256_extract_epi64, long long, __m256i, 2)
+#endif
+test_1 (_mm_permute_pd, __m128d, __m128d, 1)
+test_1 (_mm256_permute_pd, __m256d, __m256d, 1)
+test_1 (_mm_permute_ps, __m128, __m128, 1)
+test_1 (_mm256_permute_ps, __m256, __m256, 1)
+test_2 (_mm256_permute2f128_pd, __m256d, __m256d, __m256d, 1)
+test_2 (_mm256_permute2f128_ps, __m256, __m256, __m256, 1)
+test_2 (_mm256_permute2f128_si256, __m256i, __m256i, __m256i, 1)
+test_2 (_mm256_insertf128_pd, __m256d, __m256d, __m128d, 1)
+test_2 (_mm256_insertf128_ps, __m256, __m256, __m128, 1)
+test_2 (_mm256_insertf128_si256, __m256i, __m256i, __m128i, 1)
+test_2 (_mm256_insert_epi8, __m256i, __m256i, int, 30)
+test_2 (_mm256_insert_epi16, __m256i, __m256i, int, 7)
+test_2 (_mm256_insert_epi32, __m256i, __m256i, int, 3)
+#ifdef __x86_64__
+test_2 (_mm256_insert_epi64, __m256i, __m256i, long long, 1)
+#endif
+test_1 (_mm256_round_pd, __m256d, __m256d, 1)
+test_1 (_mm256_round_ps, __m256, __m256, 1)
+
+/* wmmintrin.h */
+test_1 (_mm_aeskeygenassist_si128, __m128i, __m128i, 1)
+test_2 (_mm_clmulepi64_si128, __m128i, __m128i, __m128i, 1)
+
+/* smmintrin.h */
+test_1 (_mm_round_pd, __m128d, __m128d, 1)
+test_1 (_mm_round_ps, __m128, __m128, 1)
+test_2 (_mm_round_sd, __m128d, __m128d, __m128d, 1)
+test_2 (_mm_round_ss, __m128, __m128, __m128, 1)
+
+test_2 (_mm_blend_epi16, __m128i, __m128i, __m128i, 1)
+test_2 (_mm_blend_ps, __m128, __m128, __m128, 1)
+test_2 (_mm_blend_pd, __m128d, __m128d, __m128d, 1)
+test_2 (_mm_dp_ps, __m128, __m128, __m128, 1)
+test_2 (_mm_dp_pd, __m128d, __m128d, __m128d, 1)
+test_2 (_mm_insert_ps, __m128, __m128, __m128, 1)
+test_1 (_mm_extract_ps, int, __m128, 1)
+test_2 (_mm_insert_epi8, __m128i, __m128i, int, 1)
+test_2 (_mm_insert_epi32, __m128i, __m128i, int, 1)
+#ifdef __x86_64__
+test_2 (_mm_insert_epi64, __m128i, __m128i, long long, 1)
+#endif
+test_1 (_mm_extract_epi8, int, __m128i, 1)
+test_1 (_mm_extract_epi32, int, __m128i, 1)
+#ifdef __x86_64__
+test_1 (_mm_extract_epi64, long long, __m128i, 1)
+#endif
+test_2 (_mm_mpsadbw_epu8, __m128i, __m128i, __m128i, 1)
+test_2 (_mm_cmpistrm, __m128i, __m128i, __m128i, 1)
+test_2 (_mm_cmpistri, int, __m128i, __m128i, 1)
+test_4 (_mm_cmpestrm, __m128i, __m128i, int, __m128i, int, 1)
+test_4 (_mm_cmpestri, int, __m128i, int, __m128i, int, 1)
+test_2 (_mm_cmpistra, int, __m128i, __m128i, 1)
+test_2 (_mm_cmpistrc, int, __m128i, __m128i, 1)
+test_2 (_mm_cmpistro, int, __m128i, __m128i, 1)
+test_2 (_mm_cmpistrs, int, __m128i, __m128i, 1)
+test_2 (_mm_cmpistrz, int, __m128i, __m128i, 1)
+test_4 (_mm_cmpestra, int, __m128i, int, __m128i, int, 1)
+test_4 (_mm_cmpestrc, int, __m128i, int, __m128i, int, 1)
+test_4 (_mm_cmpestro, int, __m128i, int, __m128i, int, 1)
+test_4 (_mm_cmpestrs, int, __m128i, int, __m128i, int, 1)
+test_4 (_mm_cmpestrz, int, __m128i, int, __m128i, int, 1)
+
+/* tmmintrin.h */
+test_2 (_mm_alignr_epi8, __m128i, __m128i, __m128i, 1)
+test_2 (_mm_alignr_pi8, __m64, __m64, __m64, 1)
+
+/* emmintrin.h */
+test_2 (_mm_shuffle_pd, __m128d, __m128d, __m128d, 1)
+test_1 (_mm_srli_si128, __m128i, __m128i, 1)
+test_1 (_mm_slli_si128, __m128i, __m128i, 1)
+test_1 (_mm_extract_epi16, int, __m128i, 1)
+test_2 (_mm_insert_epi16, __m128i, __m128i, int, 1)
+test_1 (_mm_shufflehi_epi16, __m128i, __m128i, 1)
+test_1 (_mm_shufflelo_epi16, __m128i, __m128i, 1)
+test_1 (_mm_shuffle_epi32, __m128i, __m128i, 1)
+
+/* xmmintrin.h */
+test_2 (_mm_shuffle_ps, __m128, __m128, __m128, 1)
+test_1 (_mm_extract_pi16, int, __m64, 1)
+test_1 (_m_pextrw, int, __m64, 1)
+test_2 (_mm_insert_pi16, __m64, __m64, int, 1)
+test_2 (_m_pinsrw, __m64, __m64, int, 1)
+test_1 (_mm_shuffle_pi16, __m64, __m64, 1)
+test_1 (_m_pshufw, __m64, __m64, 1)
+test_1 (_mm_prefetch, void, void *, _MM_HINT_NTA)
diff --git a/gcc/testsuite/gcc.target/i386/sse-22.c b/gcc/testsuite/gcc.target/i386/sse-22.c
new file mode 100644
index 0000000..eeae0fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse-22.c
@@ -0,0 +1,161 @@
+/* Same as sse-14, except converted to use #pragma GCC target. */
+/* { dg-do compile } */
+/* { dg-options "-O0 -Werror-implicit-function-declaration" } */
+
+#include <mm_malloc.h>
+
+/* Test that the intrinsics compile without optimization. All of them are
+ defined as inline functions in {,x,e,p,t,s,w,a}mmintrin.h and mm3dnow.h
+ that reference the proper builtin functions. Defining away "extern" and
+ "__inline" results in all of them being compiled as proper functions. */
+
+#define extern
+#define __inline
+
+#define _CONCAT(x,y) x ## y
+
+#define test_1(func, type, op1_type, imm) \
+ type _CONCAT(_,func) (op1_type A, int const I) \
+ { return func (A, imm); }
+
+#define test_1x(func, type, op1_type, imm1, imm2) \
+ type _CONCAT(_,func) (op1_type A, int const I, int const L) \
+ { return func (A, imm1, imm2); }
+
+#define test_2(func, type, op1_type, op2_type, imm) \
+ type _CONCAT(_,func) (op1_type A, op2_type B, int const I) \
+ { return func (A, B, imm); }
+
+#define test_2x(func, type, op1_type, op2_type, imm1, imm2) \
+ type _CONCAT(_,func) (op1_type A, op2_type B, int const I, int const L) \
+ { return func (A, B, imm1, imm2); }
+
+#define test_4(func, type, op1_type, op2_type, op3_type, op4_type, imm) \
+ type _CONCAT(_,func) (op1_type A, op2_type B, \
+ op3_type C, op4_type D, int const I) \
+ { return func (A, B, C, D, imm); }
+
+
+#ifndef DIFFERENT_PRAGMAS
+#pragma GCC target ("mmx,3dnow,sse,sse2,sse3,ssse3,sse4.1,sse4.2,sse4a,aes,pclmul")
+#endif
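+
+/* Illustrative note: if DIFFERENT_PRAGMAS is defined (presumably by a
+   wrapper test that includes this file), the single pragma above is skipped
+   and each header below is instead enabled by its own narrower
+   #pragma GCC target, exercising per-header ISA switching.  */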
+
+/* Following intrinsics require immediate arguments. They
+ are defined as macros for non-optimized compilations. */
+
+/* mmintrin.h (MMX). */
+#ifdef DIFFERENT_PRAGMAS
+#pragma GCC target ("mmx")
+#endif
+#include <mmintrin.h>
+
+/* mm3dnow.h (3DNOW). */
+#ifdef DIFFERENT_PRAGMAS
+#pragma GCC target ("3dnow")
+#endif
+#include <mm3dnow.h>
+
+/* xmmintrin.h (SSE). */
+#ifdef DIFFERENT_PRAGMAS
+#pragma GCC target ("sse")
+#endif
+#include <xmmintrin.h>
+test_2 (_mm_shuffle_ps, __m128, __m128, __m128, 1)
+test_1 (_mm_extract_pi16, int, __m64, 1)
+test_1 (_m_pextrw, int, __m64, 1)
+test_2 (_mm_insert_pi16, __m64, __m64, int, 1)
+test_2 (_m_pinsrw, __m64, __m64, int, 1)
+test_1 (_mm_shuffle_pi16, __m64, __m64, 1)
+test_1 (_m_pshufw, __m64, __m64, 1)
+test_1 (_mm_prefetch, void, void *, _MM_HINT_NTA)
+
+/* emmintrin.h (SSE2). */
+#ifdef DIFFERENT_PRAGMAS
+#pragma GCC target ("sse2")
+#endif
+#include <emmintrin.h>
+test_2 (_mm_shuffle_pd, __m128d, __m128d, __m128d, 1)
+test_1 (_mm_srli_si128, __m128i, __m128i, 1)
+test_1 (_mm_slli_si128, __m128i, __m128i, 1)
+test_1 (_mm_extract_epi16, int, __m128i, 1)
+test_2 (_mm_insert_epi16, __m128i, __m128i, int, 1)
+test_1 (_mm_shufflehi_epi16, __m128i, __m128i, 1)
+test_1 (_mm_shufflelo_epi16, __m128i, __m128i, 1)
+test_1 (_mm_shuffle_epi32, __m128i, __m128i, 1)
+
+/* pmmintrin.h (SSE3). */
+#ifdef DIFFERENT_PRAGMAS
+#pragma GCC target ("sse3")
+#endif
+#include <pmmintrin.h>
+
+/* tmmintrin.h (SSSE3). */
+#ifdef DIFFERENT_PRAGMAS
+#pragma GCC target ("ssse3")
+#endif
+#include <tmmintrin.h>
+test_2 (_mm_alignr_epi8, __m128i, __m128i, __m128i, 1)
+test_2 (_mm_alignr_pi8, __m64, __m64, __m64, 1)
+
+/* ammintrin.h (SSE4A). */
+#ifdef DIFFERENT_PRAGMAS
+#pragma GCC target ("sse4a")
+#endif
+#include <ammintrin.h>
+test_1x (_mm_extracti_si64, __m128i, __m128i, 1, 1)
+test_2x (_mm_inserti_si64, __m128i, __m128i, __m128i, 1, 1)
+
+/* smmintrin.h (SSE4.1). */
+/* nmmintrin.h (SSE4.2). */
+/* Note: nmmintrin.h simply includes smmintrin.h, and smmintrin.h performs
+   its own ISA #ifdef checks, so just enable SSE4.2 here. */
+#ifdef DIFFERENT_PRAGMAS
+#pragma GCC target ("sse4.2")
+#endif
+#include <nmmintrin.h>
+test_2 (_mm_blend_epi16, __m128i, __m128i, __m128i, 1)
+test_2 (_mm_blend_ps, __m128, __m128, __m128, 1)
+test_2 (_mm_blend_pd, __m128d, __m128d, __m128d, 1)
+test_2 (_mm_dp_ps, __m128, __m128, __m128, 1)
+test_2 (_mm_dp_pd, __m128d, __m128d, __m128d, 1)
+test_2 (_mm_insert_ps, __m128, __m128, __m128, 1)
+test_1 (_mm_extract_ps, int, __m128, 1)
+test_2 (_mm_insert_epi8, __m128i, __m128i, int, 1)
+test_2 (_mm_insert_epi32, __m128i, __m128i, int, 1)
+#ifdef __x86_64__
+test_2 (_mm_insert_epi64, __m128i, __m128i, long long, 1)
+#endif
+test_1 (_mm_extract_epi8, int, __m128i, 1)
+test_1 (_mm_extract_epi32, int, __m128i, 1)
+#ifdef __x86_64__
+test_1 (_mm_extract_epi64, long long, __m128i, 1)
+#endif
+test_2 (_mm_mpsadbw_epu8, __m128i, __m128i, __m128i, 1)
+test_2 (_mm_cmpistrm, __m128i, __m128i, __m128i, 1)
+test_2 (_mm_cmpistri, int, __m128i, __m128i, 1)
+test_4 (_mm_cmpestrm, __m128i, __m128i, int, __m128i, int, 1)
+test_4 (_mm_cmpestri, int, __m128i, int, __m128i, int, 1)
+test_2 (_mm_cmpistra, int, __m128i, __m128i, 1)
+test_2 (_mm_cmpistrc, int, __m128i, __m128i, 1)
+test_2 (_mm_cmpistro, int, __m128i, __m128i, 1)
+test_2 (_mm_cmpistrs, int, __m128i, __m128i, 1)
+test_2 (_mm_cmpistrz, int, __m128i, __m128i, 1)
+test_4 (_mm_cmpestra, int, __m128i, int, __m128i, int, 1)
+test_4 (_mm_cmpestrc, int, __m128i, int, __m128i, int, 1)
+test_4 (_mm_cmpestro, int, __m128i, int, __m128i, int, 1)
+test_4 (_mm_cmpestrs, int, __m128i, int, __m128i, int, 1)
+test_4 (_mm_cmpestrz, int, __m128i, int, __m128i, int, 1)
+
+/* wmmintrin.h (AES/PCLMUL). */
+#ifdef DIFFERENT_PRAGMAS
+#pragma GCC target ("aes,pclmul")
+#endif
+#include <wmmintrin.h>
+test_1 (_mm_aeskeygenassist_si128, __m128i, __m128i, 1)
+test_2 (_mm_clmulepi64_si128, __m128i, __m128i, __m128i, 1)
+
+/* smmintrin.h (SSE4.1). */
+test_1 (_mm_round_pd, __m128d, __m128d, 1)
+test_1 (_mm_round_ps, __m128, __m128, 1)
+test_2 (_mm_round_sd, __m128d, __m128d, __m128d, 1)
+test_2 (_mm_round_ss, __m128, __m128, __m128, 1)