| author | Jeff Law <jlaw@ventanamicro.com> | 2022-10-17 17:33:52 -0600 |
|---|---|---|
| committer | Jeff Law <jlaw@ventanamicro.com> | 2022-10-17 17:36:42 -0600 |
| commit | f6e93b7b48195037d6c545104c952b97e05ad381 (patch) | |
| tree | 059db5d7abb81d2d77f98cd0970aada4b5813e29 /gcc | |
| parent | 566c5f1aaae120d2283103e68ecf1c1a83dd4459 (diff) | |
| download | gcc-f6e93b7b48195037d6c545104c952b97e05ad381.zip gcc-f6e93b7b48195037d6c545104c952b97e05ad381.tar.gz gcc-f6e93b7b48195037d6c545104c952b97e05ad381.tar.bz2 | |
Remove accidental commits
gcc/
* config/i386/cet.c: Remove accidental commit.
* config/i386/driver-mingw32.c: Likewise.
* config/i386/i386-builtins.c: Likewise.
* config/i386/i386-d.c: Likewise.
* config/i386/i386-expand.c: Likewise.
* config/i386/i386-features.c: Likewise.
* config/i386/i386-options.c: Likewise.
* config/i386/t-cet: Likewise.
* config/i386/x86-tune-sched-atom.c: Likewise.
* config/i386/x86-tune-sched-bd.c: Likewise.
* config/i386/x86-tune-sched-core.c: Likewise.
* config/i386/x86-tune-sched.c: Likewise.
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/config/i386/cet.c | 76
-rw-r--r-- | gcc/config/i386/driver-mingw32.c | 28
-rw-r--r-- | gcc/config/i386/i386-builtins.c | 2546
-rw-r--r-- | gcc/config/i386/i386-d.c | 44
-rw-r--r-- | gcc/config/i386/i386-expand.c | 20310
-rw-r--r-- | gcc/config/i386/i386-features.c | 2884
-rw-r--r-- | gcc/config/i386/i386-options.c | 3799
-rw-r--r-- | gcc/config/i386/t-cet | 21
-rw-r--r-- | gcc/config/i386/x86-tune-sched-atom.c | 246
-rw-r--r-- | gcc/config/i386/x86-tune-sched-bd.c | 824
-rw-r--r-- | gcc/config/i386/x86-tune-sched-core.c | 257
-rw-r--r-- | gcc/config/i386/x86-tune-sched.c | 636
12 files changed, 0 insertions, 31671 deletions
diff --git a/gcc/config/i386/cet.c b/gcc/config/i386/cet.c deleted file mode 100644 index 5450ac3..0000000 --- a/gcc/config/i386/cet.c +++ /dev/null @@ -1,76 +0,0 @@ -/* Functions for CET/x86. - Copyright (C) 2017-2020 Free Software Foundation, Inc. - -This file is part of GCC. - -GCC is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 3, or (at your option) -any later version. - -GCC is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with GCC; see the file COPYING3. If not see -<http://www.gnu.org/licenses/>. */ - -#include "config.h" -#include "system.h" -#include "coretypes.h" -#include "tm.h" -#include "output.h" -#include "linux-common.h" - -void -file_end_indicate_exec_stack_and_cet (void) -{ - file_end_indicate_exec_stack (); - - if (flag_cf_protection == CF_NONE) - return; - - unsigned int feature_1 = 0; - - if (flag_cf_protection & CF_BRANCH) - /* GNU_PROPERTY_X86_FEATURE_1_IBT. */ - feature_1 |= 0x1; - - if (flag_cf_protection & CF_RETURN) - /* GNU_PROPERTY_X86_FEATURE_1_SHSTK. */ - feature_1 |= 0x2; - - if (feature_1) - { - int p2align = ptr_mode == SImode ? 2 : 3; - - /* Generate GNU_PROPERTY_X86_FEATURE_1_XXX. */ - switch_to_section (get_section (".note.gnu.property", - SECTION_NOTYPE, NULL)); - - ASM_OUTPUT_ALIGN (asm_out_file, p2align); - /* name length. */ - fprintf (asm_out_file, ASM_LONG " 1f - 0f\n"); - /* data length. */ - fprintf (asm_out_file, ASM_LONG " 4f - 1f\n"); - /* note type: NT_GNU_PROPERTY_TYPE_0. */ - fprintf (asm_out_file, ASM_LONG " 5\n"); - fprintf (asm_out_file, "0:\n"); - /* vendor name: "GNU". */ - fprintf (asm_out_file, STRING_ASM_OP " \"GNU\"\n"); - fprintf (asm_out_file, "1:\n"); - ASM_OUTPUT_ALIGN (asm_out_file, p2align); - /* pr_type: GNU_PROPERTY_X86_FEATURE_1_AND. */ - fprintf (asm_out_file, ASM_LONG " 0xc0000002\n"); - /* pr_datasz. */\ - fprintf (asm_out_file, ASM_LONG " 3f - 2f\n"); - fprintf (asm_out_file, "2:\n"); - /* GNU_PROPERTY_X86_FEATURE_1_XXX. */ - fprintf (asm_out_file, ASM_LONG " 0x%x\n", feature_1); - fprintf (asm_out_file, "3:\n"); - ASM_OUTPUT_ALIGN (asm_out_file, p2align); - fprintf (asm_out_file, "4:\n"); - } -} diff --git a/gcc/config/i386/driver-mingw32.c b/gcc/config/i386/driver-mingw32.c deleted file mode 100644 index d0517e6..0000000 --- a/gcc/config/i386/driver-mingw32.c +++ /dev/null @@ -1,28 +0,0 @@ -/* Host OS specific configuration for the gcc driver. - Copyright (C) 2017-2020 Free Software Foundation, Inc. - -This file is part of GCC. - -GCC is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 3, or (at your option) -any later version. - -GCC is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with GCC; see the file COPYING3. If not see -<http://www.gnu.org/licenses/>. 
*/ - -#define IN_TARGET_CODE 1 - -#include "config.h" - -/* When defined, force the use (if non null) or not (otherwise) of CLI - globbing. */ -#ifdef MINGW_DOWILDCARD -int _dowildcard = MINGW_DOWILDCARD; -#endif diff --git a/gcc/config/i386/i386-builtins.c b/gcc/config/i386/i386-builtins.c deleted file mode 100644 index be3ed01..0000000 --- a/gcc/config/i386/i386-builtins.c +++ /dev/null @@ -1,2546 +0,0 @@ -/* Copyright (C) 1988-2020 Free Software Foundation, Inc. - -This file is part of GCC. - -GCC is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 3, or (at your option) -any later version. - -GCC is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with GCC; see the file COPYING3. If not see -<http://www.gnu.org/licenses/>. */ - -#define IN_TARGET_CODE 1 - -#include "config.h" -#include "system.h" -#include "coretypes.h" -#include "backend.h" -#include "rtl.h" -#include "tree.h" -#include "memmodel.h" -#include "gimple.h" -#include "cfghooks.h" -#include "cfgloop.h" -#include "df.h" -#include "tm_p.h" -#include "stringpool.h" -#include "expmed.h" -#include "optabs.h" -#include "regs.h" -#include "emit-rtl.h" -#include "recog.h" -#include "cgraph.h" -#include "diagnostic.h" -#include "cfgbuild.h" -#include "alias.h" -#include "fold-const.h" -#include "attribs.h" -#include "calls.h" -#include "stor-layout.h" -#include "varasm.h" -#include "output.h" -#include "insn-attr.h" -#include "flags.h" -#include "except.h" -#include "explow.h" -#include "expr.h" -#include "cfgrtl.h" -#include "common/common-target.h" -#include "langhooks.h" -#include "reload.h" -#include "gimplify.h" -#include "dwarf2.h" -#include "tm-constrs.h" -#include "cselib.h" -#include "sched-int.h" -#include "opts.h" -#include "tree-pass.h" -#include "context.h" -#include "pass_manager.h" -#include "target-globals.h" -#include "gimple-iterator.h" -#include "tree-vectorizer.h" -#include "shrink-wrap.h" -#include "builtins.h" -#include "rtl-iter.h" -#include "tree-iterator.h" -#include "dbgcnt.h" -#include "case-cfn-macros.h" -#include "dojump.h" -#include "fold-const-call.h" -#include "tree-vrp.h" -#include "tree-ssanames.h" -#include "selftest.h" -#include "selftest-rtl.h" -#include "print-rtl.h" -#include "intl.h" -#include "ifcvt.h" -#include "symbol-summary.h" -#include "ipa-prop.h" -#include "ipa-fnsummary.h" -#include "wide-int-bitmask.h" -#include "tree-vector-builder.h" -#include "debug.h" -#include "dwarf2out.h" -#include "i386-builtins.h" - -#undef BDESC -#undef BDESC_FIRST -#undef BDESC_END - -/* Macros for verification of enum ix86_builtins order. 
*/ -#define BDESC_VERIFY(x, y, z) \ - gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z))) -#define BDESC_VERIFYS(x, y, z) \ - STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z))) - -BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST, - IX86_BUILTIN__BDESC_COMI_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST, - IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, - IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST, - IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, - IX86_BUILTIN__BDESC_ARGS_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, - IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_FIRST, - IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, - IX86_BUILTIN__BDESC_CET_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN_MAX, - IX86_BUILTIN__BDESC_CET_NORMAL_LAST, 1); - - -/* Table for the ix86 builtin non-function types. */ -static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1]; - -/* Retrieve an element from the above table, building some of - the types lazily. */ - -static tree -ix86_get_builtin_type (enum ix86_builtin_type tcode) -{ - unsigned int index; - tree type, itype; - - gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab)); - - type = ix86_builtin_type_tab[(int) tcode]; - if (type != NULL) - return type; - - gcc_assert (tcode > IX86_BT_LAST_PRIM); - if (tcode <= IX86_BT_LAST_VECT) - { - machine_mode mode; - - index = tcode - IX86_BT_LAST_PRIM - 1; - itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]); - mode = ix86_builtin_type_vect_mode[index]; - - type = build_vector_type_for_mode (itype, mode); - } - else - { - int quals; - - index = tcode - IX86_BT_LAST_VECT - 1; - if (tcode <= IX86_BT_LAST_PTR) - quals = TYPE_UNQUALIFIED; - else - quals = TYPE_QUAL_CONST; - - itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]); - if (quals != TYPE_UNQUALIFIED) - itype = build_qualified_type (itype, quals); - - type = build_pointer_type (itype); - } - - ix86_builtin_type_tab[(int) tcode] = type; - return type; -} - -/* Table for the ix86 builtin function types. */ -static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1]; - -/* Retrieve an element from the above table, building some of - the types lazily. */ - -static tree -ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode) -{ - tree type; - - gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab)); - - type = ix86_builtin_func_type_tab[(int) tcode]; - if (type != NULL) - return type; - - if (tcode <= IX86_BT_LAST_FUNC) - { - unsigned start = ix86_builtin_func_start[(int) tcode]; - unsigned after = ix86_builtin_func_start[(int) tcode + 1]; - tree rtype, atype, args = void_list_node; - unsigned i; - - rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]); - for (i = after - 1; i > start; --i) - { - atype = ix86_get_builtin_type (ix86_builtin_func_args[i]); - args = tree_cons (NULL, atype, args); - } - - type = build_function_type (rtype, args); - } - else - { - unsigned index = tcode - IX86_BT_LAST_FUNC - 1; - enum ix86_builtin_func_type icode; - - icode = ix86_builtin_func_alias_base[index]; - type = ix86_get_builtin_func_type (icode); - } - - ix86_builtin_func_type_tab[(int) tcode] = type; - return type; -} - -/* Table for the ix86 builtin decls. 
*/ -static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX]; - -struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX]; - -tree get_ix86_builtin (enum ix86_builtins c) -{ - return ix86_builtins[c]; -} - -/* Bits that can still enable any inclusion of a builtin. */ -HOST_WIDE_INT deferred_isa_values = 0; -HOST_WIDE_INT deferred_isa_values2 = 0; - -/* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the - MASK and MASK2 of which isa_flags and ix86_isa_flags2 to use in the - ix86_builtins_isa array. Stores the function decl in the ix86_builtins - array. Returns the function decl or NULL_TREE, if the builtin was not - added. - - If the front end has a special hook for builtin functions, delay adding - builtin functions that aren't in the current ISA until the ISA is changed - with function specific optimization. Doing so, can save about 300K for the - default compiler. When the builtin is expanded, check at that time whether - it is valid. - - If the front end doesn't have a special hook, record all builtins, even if - it isn't an instruction set in the current ISA in case the user uses - function specific options for a different ISA, so that we don't get scope - errors if a builtin is added in the middle of a function scope. */ - -static inline tree -def_builtin (HOST_WIDE_INT mask, HOST_WIDE_INT mask2, - const char *name, - enum ix86_builtin_func_type tcode, - enum ix86_builtins code) -{ - tree decl = NULL_TREE; - - /* An instruction may be 64bit only regardless of ISAs. */ - if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT) - { - ix86_builtins_isa[(int) code].isa = mask; - ix86_builtins_isa[(int) code].isa2 = mask2; - - mask &= ~OPTION_MASK_ISA_64BIT; - - /* Filter out the masks most often ored together with others. */ - if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL) - && mask != OPTION_MASK_ISA_AVX512VL) - mask &= ~OPTION_MASK_ISA_AVX512VL; - if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512BW) - && mask != OPTION_MASK_ISA_AVX512BW) - mask &= ~OPTION_MASK_ISA_AVX512BW; - - if (((mask2 == 0 || (mask2 & ix86_isa_flags2) != 0) - && (mask == 0 || (mask & ix86_isa_flags) != 0)) - || ((mask & OPTION_MASK_ISA_MMX) != 0 && TARGET_MMX_WITH_SSE) - || (lang_hooks.builtin_function - == lang_hooks.builtin_function_ext_scope)) - { - tree type = ix86_get_builtin_func_type (tcode); - decl = add_builtin_function (name, type, code, BUILT_IN_MD, - NULL, NULL_TREE); - ix86_builtins[(int) code] = decl; - ix86_builtins_isa[(int) code].set_and_not_built_p = false; - } - else - { - /* Just MASK and MASK2 where set_and_not_built_p == true can potentially - include a builtin. */ - deferred_isa_values |= mask; - deferred_isa_values2 |= mask2; - ix86_builtins[(int) code] = NULL_TREE; - ix86_builtins_isa[(int) code].tcode = tcode; - ix86_builtins_isa[(int) code].name = name; - ix86_builtins_isa[(int) code].const_p = false; - ix86_builtins_isa[(int) code].pure_p = false; - ix86_builtins_isa[(int) code].set_and_not_built_p = true; - } - } - - return decl; -} - -/* Like def_builtin, but also marks the function decl "const". */ - -static inline tree -def_builtin_const (HOST_WIDE_INT mask, HOST_WIDE_INT mask2, const char *name, - enum ix86_builtin_func_type tcode, enum ix86_builtins code) -{ - tree decl = def_builtin (mask, mask2, name, tcode, code); - if (decl) - TREE_READONLY (decl) = 1; - else - ix86_builtins_isa[(int) code].const_p = true; - - return decl; -} - -/* Like def_builtin, but also marks the function decl "pure". 
*/ - -static inline tree -def_builtin_pure (HOST_WIDE_INT mask, HOST_WIDE_INT mask2, const char *name, - enum ix86_builtin_func_type tcode, enum ix86_builtins code) -{ - tree decl = def_builtin (mask, mask2, name, tcode, code); - if (decl) - DECL_PURE_P (decl) = 1; - else - ix86_builtins_isa[(int) code].pure_p = true; - - return decl; -} - -/* Add any new builtin functions for a given ISA that may not have been - declared. This saves a bit of space compared to adding all of the - declarations to the tree, even if we didn't use them. */ - -void -ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2) -{ - isa &= ~OPTION_MASK_ISA_64BIT; - - if ((isa & deferred_isa_values) == 0 - && (isa2 & deferred_isa_values2) == 0 - && ((deferred_isa_values & OPTION_MASK_ISA_MMX) == 0 - || !(TARGET_64BIT && (isa & OPTION_MASK_ISA_SSE2) != 0))) - return; - - /* Bits in ISA value can be removed from potential isa values. */ - deferred_isa_values &= ~isa; - deferred_isa_values2 &= ~isa2; - if (TARGET_64BIT && (isa & OPTION_MASK_ISA_SSE2) != 0) - deferred_isa_values &= ~OPTION_MASK_ISA_MMX; - - int i; - tree saved_current_target_pragma = current_target_pragma; - current_target_pragma = NULL_TREE; - - for (i = 0; i < (int)IX86_BUILTIN_MAX; i++) - { - if (((ix86_builtins_isa[i].isa & isa) != 0 - || (ix86_builtins_isa[i].isa2 & isa2) != 0 - || ((ix86_builtins_isa[i].isa & OPTION_MASK_ISA_MMX) != 0 - && TARGET_64BIT - && (isa & OPTION_MASK_ISA_SSE2) != 0)) - && ix86_builtins_isa[i].set_and_not_built_p) - { - tree decl, type; - - /* Don't define the builtin again. */ - ix86_builtins_isa[i].set_and_not_built_p = false; - - type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode); - decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name, - type, i, BUILT_IN_MD, NULL, - NULL_TREE); - - ix86_builtins[i] = decl; - if (ix86_builtins_isa[i].const_p) - TREE_READONLY (decl) = 1; - } - } - - current_target_pragma = saved_current_target_pragma; -} - -/* TM vector builtins. */ - -/* Reuse the existing x86-specific `struct builtin_description' cause - we're lazy. Add casts to make them fit. 
*/ -static const struct builtin_description bdesc_tm[] = -{ - { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI }, - { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI }, - { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI }, - { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, - { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, - { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, - { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, - - { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF }, - { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF }, - { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF }, - { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, - { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, - { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, - { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, - - { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF }, - { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF }, - { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF }, - { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, - { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, - { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, - { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, - - { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID }, - { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID }, - { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, 
"__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID }, -}; - -/* Initialize the transactional memory vector load/store builtins. */ - -static void -ix86_init_tm_builtins (void) -{ - enum ix86_builtin_func_type ftype; - const struct builtin_description *d; - size_t i; - tree decl; - tree attrs_load, attrs_type_load, attrs_store, attrs_type_store; - tree attrs_log, attrs_type_log; - - if (!flag_tm) - return; - - /* If there are no builtins defined, we must be compiling in a - language without trans-mem support. */ - if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1)) - return; - - /* Use whatever attributes a normal TM load has. */ - decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1); - attrs_load = DECL_ATTRIBUTES (decl); - attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl)); - /* Use whatever attributes a normal TM store has. */ - decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1); - attrs_store = DECL_ATTRIBUTES (decl); - attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl)); - /* Use whatever attributes a normal TM log has. */ - decl = builtin_decl_explicit (BUILT_IN_TM_LOG); - attrs_log = DECL_ATTRIBUTES (decl); - attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl)); - - for (i = 0, d = bdesc_tm; - i < ARRAY_SIZE (bdesc_tm); - i++, d++) - { - if ((d->mask & ix86_isa_flags) != 0 - || ((d->mask & OPTION_MASK_ISA_MMX) != 0 && TARGET_MMX_WITH_SSE) - || (lang_hooks.builtin_function - == lang_hooks.builtin_function_ext_scope)) - { - tree type, attrs, attrs_type; - enum built_in_function code = (enum built_in_function) d->code; - - ftype = (enum ix86_builtin_func_type) d->flag; - type = ix86_get_builtin_func_type (ftype); - - if (BUILTIN_TM_LOAD_P (code)) - { - attrs = attrs_load; - attrs_type = attrs_type_load; - } - else if (BUILTIN_TM_STORE_P (code)) - { - attrs = attrs_store; - attrs_type = attrs_type_store; - } - else - { - attrs = attrs_log; - attrs_type = attrs_type_log; - } - decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL, - /* The builtin without the prefix for - calling it directly. */ - d->name + strlen ("__builtin_"), - attrs); - /* add_builtin_function() will set the DECL_ATTRIBUTES, now - set the TYPE_ATTRIBUTES. */ - decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN); - - set_builtin_decl (code, decl, false); - } - } -} - -/* Set up all the MMX/SSE builtins, even builtins for instructions that are not - in the current target ISA to allow the user to compile particular modules - with different target specific options that differ from the command line - options. */ -static void -ix86_init_mmx_sse_builtins (void) -{ - const struct builtin_description * d; - enum ix86_builtin_func_type ftype; - size_t i; - - /* Add all special builtins with variable number of operands. */ - for (i = 0, d = bdesc_special_args; - i < ARRAY_SIZE (bdesc_special_args); - i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i); - if (d->name == 0) - continue; - - ftype = (enum ix86_builtin_func_type) d->flag; - def_builtin (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, - ARRAY_SIZE (bdesc_special_args) - 1); - - /* Add all builtins with variable number of operands. 
*/ - for (i = 0, d = bdesc_args; - i < ARRAY_SIZE (bdesc_args); - i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i); - if (d->name == 0) - continue; - - ftype = (enum ix86_builtin_func_type) d->flag; - def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST, - IX86_BUILTIN__BDESC_ARGS_FIRST, - ARRAY_SIZE (bdesc_args) - 1); - - /* Add all builtins with rounding. */ - for (i = 0, d = bdesc_round_args; - i < ARRAY_SIZE (bdesc_round_args); - i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i); - if (d->name == 0) - continue; - - ftype = (enum ix86_builtin_func_type) d->flag; - def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, - ARRAY_SIZE (bdesc_round_args) - 1); - - /* pcmpestr[im] insns. */ - for (i = 0, d = bdesc_pcmpestr; - i < ARRAY_SIZE (bdesc_pcmpestr); - i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i); - if (d->code == IX86_BUILTIN_PCMPESTRM128) - ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT; - else - ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT; - def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST, - IX86_BUILTIN__BDESC_PCMPESTR_FIRST, - ARRAY_SIZE (bdesc_pcmpestr) - 1); - - /* pcmpistr[im] insns. */ - for (i = 0, d = bdesc_pcmpistr; - i < ARRAY_SIZE (bdesc_pcmpistr); - i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i); - if (d->code == IX86_BUILTIN_PCMPISTRM128) - ftype = V16QI_FTYPE_V16QI_V16QI_INT; - else - ftype = INT_FTYPE_V16QI_V16QI_INT; - def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST, - IX86_BUILTIN__BDESC_PCMPISTR_FIRST, - ARRAY_SIZE (bdesc_pcmpistr) - 1); - - /* comi/ucomi insns. */ - for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i); - if (d->mask == OPTION_MASK_ISA_SSE2) - ftype = INT_FTYPE_V2DF_V2DF; - else - ftype = INT_FTYPE_V4SF_V4SF; - def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST, - IX86_BUILTIN__BDESC_COMI_FIRST, - ARRAY_SIZE (bdesc_comi) - 1); - - /* SSE */ - def_builtin (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_ldmxcsr", - VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR); - def_builtin_pure (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_stmxcsr", - UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR); - - /* SSE or 3DNow!A */ - def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A - /* As it uses V4HImode, we have to require -mmmx too. */ - | OPTION_MASK_ISA_MMX, 0, - "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR, - IX86_BUILTIN_MASKMOVQ); - - /* SSE2 */ - def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_maskmovdqu", - VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU); - - def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_clflush", - VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH); - x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_mfence", - VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE); - - /* SSE3. 
*/ - def_builtin (OPTION_MASK_ISA_SSE3, 0, "__builtin_ia32_monitor", - VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR); - def_builtin (OPTION_MASK_ISA_SSE3, 0, "__builtin_ia32_mwait", - VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT); - - /* AES */ - def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, - "__builtin_ia32_aesenc128", - V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128); - def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, - "__builtin_ia32_aesenclast128", - V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128); - def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, - "__builtin_ia32_aesdec128", - V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128); - def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, - "__builtin_ia32_aesdeclast128", - V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128); - def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, - "__builtin_ia32_aesimc128", - V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128); - def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, - "__builtin_ia32_aeskeygenassist128", - V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128); - - /* PCLMUL */ - def_builtin_const (OPTION_MASK_ISA_PCLMUL | OPTION_MASK_ISA_SSE2, 0, - "__builtin_ia32_pclmulqdq128", - V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128); - - /* RDRND */ - def_builtin (OPTION_MASK_ISA_RDRND, 0, "__builtin_ia32_rdrand16_step", - INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP); - def_builtin (OPTION_MASK_ISA_RDRND, 0, "__builtin_ia32_rdrand32_step", - INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP); - def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT, 0, - "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG, - IX86_BUILTIN_RDRAND64_STEP); - - /* AVX2 */ - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv2df", - V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT, - IX86_BUILTIN_GATHERSIV2DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4df", - V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT, - IX86_BUILTIN_GATHERSIV4DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv2df", - V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT, - IX86_BUILTIN_GATHERDIV2DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4df", - V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT, - IX86_BUILTIN_GATHERDIV4DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4sf", - V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT, - IX86_BUILTIN_GATHERSIV4SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv8sf", - V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT, - IX86_BUILTIN_GATHERSIV8SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4sf", - V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT, - IX86_BUILTIN_GATHERDIV4SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4sf256", - V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT, - IX86_BUILTIN_GATHERDIV8SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv2di", - V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT, - IX86_BUILTIN_GATHERSIV2DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4di", - V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT, - IX86_BUILTIN_GATHERSIV4DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv2di", - V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT, - IX86_BUILTIN_GATHERDIV2DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4di", - V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT, - 
IX86_BUILTIN_GATHERDIV4DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4si", - V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT, - IX86_BUILTIN_GATHERSIV4SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv8si", - V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT, - IX86_BUILTIN_GATHERSIV8SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4si", - V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT, - IX86_BUILTIN_GATHERDIV4SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4si256", - V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT, - IX86_BUILTIN_GATHERDIV8SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltsiv4df ", - V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT, - IX86_BUILTIN_GATHERALTSIV4DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltdiv8sf ", - V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT, - IX86_BUILTIN_GATHERALTDIV8SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltsiv4di ", - V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT, - IX86_BUILTIN_GATHERALTSIV4DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltdiv8si ", - V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT, - IX86_BUILTIN_GATHERALTDIV8SI); - - /* AVX512F */ - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv16sf", - V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT, - IX86_BUILTIN_GATHER3SIV16SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv8df", - V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT, - IX86_BUILTIN_GATHER3SIV8DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv16sf", - V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT, - IX86_BUILTIN_GATHER3DIV16SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv8df", - V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT, - IX86_BUILTIN_GATHER3DIV8DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv16si", - V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT, - IX86_BUILTIN_GATHER3SIV16SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv8di", - V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT, - IX86_BUILTIN_GATHER3SIV8DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv16si", - V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT, - IX86_BUILTIN_GATHER3DIV16SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv8di", - V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT, - IX86_BUILTIN_GATHER3DIV8DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altsiv8df ", - V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT, - IX86_BUILTIN_GATHER3ALTSIV8DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altdiv16sf ", - V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT, - IX86_BUILTIN_GATHER3ALTDIV16SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altsiv8di ", - V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT, - IX86_BUILTIN_GATHER3ALTSIV8DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altdiv16si ", - V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT, - IX86_BUILTIN_GATHER3ALTDIV16SI); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv16sf", - VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT, - IX86_BUILTIN_SCATTERSIV16SF); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv8df", - VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT, - IX86_BUILTIN_SCATTERSIV8DF); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv16sf", - VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT, - 
IX86_BUILTIN_SCATTERDIV16SF); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv8df", - VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT, - IX86_BUILTIN_SCATTERDIV8DF); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv16si", - VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT, - IX86_BUILTIN_SCATTERSIV16SI); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv8di", - VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT, - IX86_BUILTIN_SCATTERSIV8DI); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv16si", - VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT, - IX86_BUILTIN_SCATTERDIV16SI); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv8di", - VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT, - IX86_BUILTIN_SCATTERDIV8DI); - - /* AVX512VL */ - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv2df", - V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT, - IX86_BUILTIN_GATHER3SIV2DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4df", - V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT, - IX86_BUILTIN_GATHER3SIV4DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div2df", - V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT, - IX86_BUILTIN_GATHER3DIV2DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4df", - V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT, - IX86_BUILTIN_GATHER3DIV4DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4sf", - V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT, - IX86_BUILTIN_GATHER3SIV4SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv8sf", - V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT, - IX86_BUILTIN_GATHER3SIV8SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4sf", - V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT, - IX86_BUILTIN_GATHER3DIV4SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div8sf", - V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT, - IX86_BUILTIN_GATHER3DIV8SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv2di", - V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT, - IX86_BUILTIN_GATHER3SIV2DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4di", - V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT, - IX86_BUILTIN_GATHER3SIV4DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div2di", - V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT, - IX86_BUILTIN_GATHER3DIV2DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4di", - V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT, - IX86_BUILTIN_GATHER3DIV4DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4si", - V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT, - IX86_BUILTIN_GATHER3SIV4SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv8si", - V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT, - IX86_BUILTIN_GATHER3SIV8SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4si", - V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT, - IX86_BUILTIN_GATHER3DIV4SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div8si", - V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT, - IX86_BUILTIN_GATHER3DIV8SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altsiv4df ", - V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT, - IX86_BUILTIN_GATHER3ALTSIV4DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altdiv8sf ", - V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT, - IX86_BUILTIN_GATHER3ALTDIV8SF); - - def_builtin_pure 
(OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altsiv4di ", - V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT, - IX86_BUILTIN_GATHER3ALTSIV4DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altdiv8si ", - V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT, - IX86_BUILTIN_GATHER3ALTDIV8SI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv8sf", - VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT, - IX86_BUILTIN_SCATTERSIV8SF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4sf", - VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT, - IX86_BUILTIN_SCATTERSIV4SF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4df", - VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT, - IX86_BUILTIN_SCATTERSIV4DF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv2df", - VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT, - IX86_BUILTIN_SCATTERSIV2DF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv8sf", - VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT, - IX86_BUILTIN_SCATTERDIV8SF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4sf", - VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT, - IX86_BUILTIN_SCATTERDIV4SF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4df", - VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT, - IX86_BUILTIN_SCATTERDIV4DF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv2df", - VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT, - IX86_BUILTIN_SCATTERDIV2DF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv8si", - VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT, - IX86_BUILTIN_SCATTERSIV8SI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4si", - VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT, - IX86_BUILTIN_SCATTERSIV4SI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4di", - VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT, - IX86_BUILTIN_SCATTERSIV4DI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv2di", - VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT, - IX86_BUILTIN_SCATTERSIV2DI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv8si", - VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT, - IX86_BUILTIN_SCATTERDIV8SI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4si", - VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT, - IX86_BUILTIN_SCATTERDIV4SI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4di", - VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT, - IX86_BUILTIN_SCATTERDIV4DI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv2di", - VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT, - IX86_BUILTIN_SCATTERDIV2DI); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltsiv8df ", - VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT, - IX86_BUILTIN_SCATTERALTSIV8DF); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltdiv16sf ", - VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT, - IX86_BUILTIN_SCATTERALTDIV16SF); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltsiv8di ", - VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT, - IX86_BUILTIN_SCATTERALTSIV8DI); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltdiv16si ", - VOID_FTYPE_PINT_HI_V8DI_V16SI_INT, - IX86_BUILTIN_SCATTERALTDIV16SI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv4df ", - VOID_FTYPE_PDOUBLE_QI_V8SI_V4DF_INT, - IX86_BUILTIN_SCATTERALTSIV4DF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv8sf ", - VOID_FTYPE_PFLOAT_QI_V4DI_V8SF_INT, - 
IX86_BUILTIN_SCATTERALTDIV8SF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv4di ", - VOID_FTYPE_PLONGLONG_QI_V8SI_V4DI_INT, - IX86_BUILTIN_SCATTERALTSIV4DI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv8si ", - VOID_FTYPE_PINT_QI_V4DI_V8SI_INT, - IX86_BUILTIN_SCATTERALTDIV8SI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv2df ", - VOID_FTYPE_PDOUBLE_QI_V4SI_V2DF_INT, - IX86_BUILTIN_SCATTERALTSIV2DF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv4sf ", - VOID_FTYPE_PFLOAT_QI_V2DI_V4SF_INT, - IX86_BUILTIN_SCATTERALTDIV4SF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv2di ", - VOID_FTYPE_PLONGLONG_QI_V4SI_V2DI_INT, - IX86_BUILTIN_SCATTERALTSIV2DI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv4si ", - VOID_FTYPE_PINT_QI_V2DI_V4SI_INT, - IX86_BUILTIN_SCATTERALTDIV4SI); - - /* AVX512PF */ - def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfdpd", - VOID_FTYPE_QI_V8SI_PCVOID_INT_INT, - IX86_BUILTIN_GATHERPFDPD); - def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfdps", - VOID_FTYPE_HI_V16SI_PCVOID_INT_INT, - IX86_BUILTIN_GATHERPFDPS); - def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfqpd", - VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, - IX86_BUILTIN_GATHERPFQPD); - def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfqps", - VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, - IX86_BUILTIN_GATHERPFQPS); - def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfdpd", - VOID_FTYPE_QI_V8SI_PCVOID_INT_INT, - IX86_BUILTIN_SCATTERPFDPD); - def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfdps", - VOID_FTYPE_HI_V16SI_PCVOID_INT_INT, - IX86_BUILTIN_SCATTERPFDPS); - def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfqpd", - VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, - IX86_BUILTIN_SCATTERPFQPD); - def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfqps", - VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, - IX86_BUILTIN_SCATTERPFQPS); - - /* SHA */ - def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1msg1", - V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1); - def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1msg2", - V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2); - def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1nexte", - V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE); - def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1rnds4", - V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4); - def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha256msg1", - V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1); - def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha256msg2", - V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2); - def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha256rnds2", - V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2); - - /* RTM. */ - def_builtin (OPTION_MASK_ISA_RTM, 0, "__builtin_ia32_xabort", - VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT); - - /* MMX access to the vec_init patterns. 
*/ - def_builtin_const (OPTION_MASK_ISA_MMX, 0, - "__builtin_ia32_vec_init_v2si", - V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI); - - def_builtin_const (OPTION_MASK_ISA_MMX, 0, - "__builtin_ia32_vec_init_v4hi", - V4HI_FTYPE_HI_HI_HI_HI, - IX86_BUILTIN_VEC_INIT_V4HI); - - def_builtin_const (OPTION_MASK_ISA_MMX, 0, - "__builtin_ia32_vec_init_v8qi", - V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI, - IX86_BUILTIN_VEC_INIT_V8QI); - - /* Access to the vec_extract patterns. */ - def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v2df", - DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF); - def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v2di", - DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI); - def_builtin_const (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_vec_ext_v4sf", - FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF); - def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v4si", - SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI); - def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v8hi", - HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI); - - def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A - /* As it uses V4HImode, we have to require -mmmx too. */ - | OPTION_MASK_ISA_MMX, 0, - "__builtin_ia32_vec_ext_v4hi", - HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI); - - def_builtin_const (OPTION_MASK_ISA_MMX, 0, - "__builtin_ia32_vec_ext_v2si", - SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI); - - def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v16qi", - QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI); - - /* Access to the vec_set patterns. */ - def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT, 0, - "__builtin_ia32_vec_set_v2di", - V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI); - - def_builtin_const (OPTION_MASK_ISA_SSE4_1, 0, "__builtin_ia32_vec_set_v4sf", - V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF); - - def_builtin_const (OPTION_MASK_ISA_SSE4_1, 0, "__builtin_ia32_vec_set_v4si", - V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI); - - def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_set_v8hi", - V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI); - - def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A - /* As it uses V4HImode, we have to require -mmmx too. 
*/ - | OPTION_MASK_ISA_MMX, 0, - "__builtin_ia32_vec_set_v4hi", - V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI); - - def_builtin_const (OPTION_MASK_ISA_SSE4_1, 0, "__builtin_ia32_vec_set_v16qi", - V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI); - - /* RDSEED */ - def_builtin (OPTION_MASK_ISA_RDSEED, 0, "__builtin_ia32_rdseed_hi_step", - INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP); - def_builtin (OPTION_MASK_ISA_RDSEED, 0, "__builtin_ia32_rdseed_si_step", - INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP); - def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT, 0, - "__builtin_ia32_rdseed_di_step", - INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP); - - /* ADCX */ - def_builtin (0, 0, "__builtin_ia32_addcarryx_u32", - UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32); - def_builtin (OPTION_MASK_ISA_64BIT, 0, - "__builtin_ia32_addcarryx_u64", - UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG, - IX86_BUILTIN_ADDCARRYX64); - - /* SBB */ - def_builtin (0, 0, "__builtin_ia32_sbb_u32", - UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32); - def_builtin (OPTION_MASK_ISA_64BIT, 0, - "__builtin_ia32_sbb_u64", - UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG, - IX86_BUILTIN_SBB64); - - /* Read/write FLAGS. */ - if (TARGET_64BIT) - { - def_builtin (OPTION_MASK_ISA_64BIT, 0, "__builtin_ia32_readeflags_u64", - UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS); - def_builtin (OPTION_MASK_ISA_64BIT, 0, "__builtin_ia32_writeeflags_u64", - VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS); - } - else - { - def_builtin (0, 0, "__builtin_ia32_readeflags_u32", - UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS); - def_builtin (0, 0, "__builtin_ia32_writeeflags_u32", - VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS); - } - - /* CLFLUSHOPT. */ - def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, 0, "__builtin_ia32_clflushopt", - VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT); - - /* CLWB. */ - def_builtin (OPTION_MASK_ISA_CLWB, 0, "__builtin_ia32_clwb", - VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB); - - /* MONITORX and MWAITX. */ - def_builtin (0, OPTION_MASK_ISA2_MWAITX, "__builtin_ia32_monitorx", - VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX); - def_builtin (0, OPTION_MASK_ISA2_MWAITX, "__builtin_ia32_mwaitx", - VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX); - - /* CLZERO. */ - def_builtin (0, OPTION_MASK_ISA2_CLZERO, "__builtin_ia32_clzero", - VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO); - - /* WAITPKG. */ - def_builtin (0, OPTION_MASK_ISA2_WAITPKG, "__builtin_ia32_umonitor", - VOID_FTYPE_PVOID, IX86_BUILTIN_UMONITOR); - def_builtin (0, OPTION_MASK_ISA2_WAITPKG, "__builtin_ia32_umwait", - UINT8_FTYPE_UNSIGNED_UINT64, IX86_BUILTIN_UMWAIT); - def_builtin (0, OPTION_MASK_ISA2_WAITPKG, "__builtin_ia32_tpause", - UINT8_FTYPE_UNSIGNED_UINT64, IX86_BUILTIN_TPAUSE); - - /* CLDEMOTE. */ - def_builtin (0, OPTION_MASK_ISA2_CLDEMOTE, "__builtin_ia32_cldemote", - VOID_FTYPE_PCVOID, IX86_BUILTIN_CLDEMOTE); - - /* Add FMA4 multi-arg argument instructions */ - for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i); - if (d->name == 0) - continue; - - ftype = (enum ix86_builtin_func_type) d->flag; - def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST, - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, - ARRAY_SIZE (bdesc_multi_arg) - 1); - - /* Add CET inrinsics. 
*/ - for (i = 0, d = bdesc_cet; i < ARRAY_SIZE (bdesc_cet); i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_FIRST, i); - if (d->name == 0) - continue; - - ftype = (enum ix86_builtin_func_type) d->flag; - def_builtin (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_LAST, - IX86_BUILTIN__BDESC_CET_FIRST, - ARRAY_SIZE (bdesc_cet) - 1); - - for (i = 0, d = bdesc_cet_rdssp; - i < ARRAY_SIZE (bdesc_cet_rdssp); - i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, i); - if (d->name == 0) - continue; - - ftype = (enum ix86_builtin_func_type) d->flag; - def_builtin (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_LAST, - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, - ARRAY_SIZE (bdesc_cet_rdssp) - 1); -} - -#undef BDESC_VERIFY -#undef BDESC_VERIFYS - -/* Make builtins to detect cpu type and features supported. NAME is - the builtin name, CODE is the builtin code, and FTYPE is the function - type of the builtin. */ - -static void -make_cpu_type_builtin (const char* name, int code, - enum ix86_builtin_func_type ftype, bool is_const) -{ - tree decl; - tree type; - - type = ix86_get_builtin_func_type (ftype); - decl = add_builtin_function (name, type, code, BUILT_IN_MD, - NULL, NULL_TREE); - gcc_assert (decl != NULL_TREE); - ix86_builtins[(int) code] = decl; - TREE_READONLY (decl) = is_const; -} - -/* Make builtins to get CPU type and features supported. The created - builtins are : - - __builtin_cpu_init (), to detect cpu type and features, - __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>, - __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE> - */ - -static void -ix86_init_platform_type_builtins (void) -{ - make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT, - INT_FTYPE_VOID, false); - make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS, - INT_FTYPE_PCCHAR, true); - make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS, - INT_FTYPE_PCCHAR, true); -} - -/* Internal method for ix86_init_builtins. 
*/ - -static void -ix86_init_builtins_va_builtins_abi (void) -{ - tree ms_va_ref, sysv_va_ref; - tree fnvoid_va_end_ms, fnvoid_va_end_sysv; - tree fnvoid_va_start_ms, fnvoid_va_start_sysv; - tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv; - tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE; - - if (!TARGET_64BIT) - return; - fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE); - fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE); - ms_va_ref = build_reference_type (ms_va_list_type_node); - sysv_va_ref = build_pointer_type (TREE_TYPE (sysv_va_list_type_node)); - - fnvoid_va_end_ms = build_function_type_list (void_type_node, ms_va_ref, - NULL_TREE); - fnvoid_va_start_ms - = build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE); - fnvoid_va_end_sysv - = build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE); - fnvoid_va_start_sysv - = build_varargs_function_type_list (void_type_node, sysv_va_ref, - NULL_TREE); - fnvoid_va_copy_ms - = build_function_type_list (void_type_node, ms_va_ref, - ms_va_list_type_node, NULL_TREE); - fnvoid_va_copy_sysv - = build_function_type_list (void_type_node, sysv_va_ref, - sysv_va_ref, NULL_TREE); - - add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms, - BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms); - add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms, - BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms); - add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms, - BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms); - add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv, - BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv); - add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv, - BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv); - add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv, - BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv); -} - -static void -ix86_init_builtin_types (void) -{ - tree float80_type_node, const_string_type_node; - - /* The __float80 type. */ - float80_type_node = long_double_type_node; - if (TYPE_MODE (float80_type_node) != XFmode) - { - if (float64x_type_node != NULL_TREE - && TYPE_MODE (float64x_type_node) == XFmode) - float80_type_node = float64x_type_node; - else - { - /* The __float80 type. */ - float80_type_node = make_node (REAL_TYPE); - - TYPE_PRECISION (float80_type_node) = 80; - layout_type (float80_type_node); - } - } - lang_hooks.types.register_builtin_type (float80_type_node, "__float80"); - - /* The __float128 type. The node has already been created as - _Float128, so we only need to register the __float128 name for - it. */ - lang_hooks.types.register_builtin_type (float128_type_node, "__float128"); - - const_string_type_node - = build_pointer_type (build_qualified_type - (char_type_node, TYPE_QUAL_CONST)); - - /* This macro is built by i386-builtin-types.awk. */ - DEFINE_BUILTIN_PRIMITIVE_TYPES; -} - -void -ix86_init_builtins (void) -{ - tree ftype, decl; - - ix86_init_builtin_types (); - - /* Builtins to get CPU type and features. */ - ix86_init_platform_type_builtins (); - - /* TFmode support builtins. 
*/ - def_builtin_const (0, 0, "__builtin_infq", - FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ); - def_builtin_const (0, 0, "__builtin_huge_valq", - FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ); - - ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING); - decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ, - BUILT_IN_MD, "nanq", NULL_TREE); - TREE_READONLY (decl) = 1; - ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl; - - decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ, - BUILT_IN_MD, "nansq", NULL_TREE); - TREE_READONLY (decl) = 1; - ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl; - - /* We will expand them to normal call if SSE isn't available since - they are used by libgcc. */ - ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128); - decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ, - BUILT_IN_MD, "__fabstf2", NULL_TREE); - TREE_READONLY (decl) = 1; - ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl; - - ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128); - decl = add_builtin_function ("__builtin_copysignq", ftype, - IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD, - "__copysigntf3", NULL_TREE); - TREE_READONLY (decl) = 1; - ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl; - - ix86_init_tm_builtins (); - ix86_init_mmx_sse_builtins (); - - if (TARGET_LP64) - ix86_init_builtins_va_builtins_abi (); - -#ifdef SUBTARGET_INIT_BUILTINS - SUBTARGET_INIT_BUILTINS; -#endif -} - -/* Return the ix86 builtin for CODE. */ - -tree -ix86_builtin_decl (unsigned code, bool) -{ - if (code >= IX86_BUILTIN_MAX) - return error_mark_node; - - return ix86_builtins[code]; -} - -/* This returns the target-specific builtin with code CODE if - current_function_decl has visibility on this builtin, which is checked - using isa flags. Returns NULL_TREE otherwise. */ - -static tree ix86_get_builtin (enum ix86_builtins code) -{ - struct cl_target_option *opts; - tree target_tree = NULL_TREE; - - /* Determine the isa flags of current_function_decl. */ - - if (current_function_decl) - target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl); - - if (target_tree == NULL) - target_tree = target_option_default_node; - - opts = TREE_TARGET_OPTION (target_tree); - - if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags) - || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2)) - return ix86_builtin_decl (code, true); - else - return NULL_TREE; -} - -/* Vectorization library interface and handlers. */ -tree (*ix86_veclib_handler) (combined_fn, tree, tree); - -/* Returns a function decl for a vectorized version of the combined function - with combined_fn code FN and the result vector type TYPE, or NULL_TREE - if it is not available. */ - -tree -ix86_builtin_vectorized_function (unsigned int fn, tree type_out, - tree type_in) -{ - machine_mode in_mode, out_mode; - int in_n, out_n; - - if (TREE_CODE (type_out) != VECTOR_TYPE - || TREE_CODE (type_in) != VECTOR_TYPE) - return NULL_TREE; - - out_mode = TYPE_MODE (TREE_TYPE (type_out)); - out_n = TYPE_VECTOR_SUBPARTS (type_out); - in_mode = TYPE_MODE (TREE_TYPE (type_in)); - in_n = TYPE_VECTOR_SUBPARTS (type_in); - - switch (fn) - { - CASE_CFN_EXP2: - if (out_mode == SFmode && in_mode == SFmode) - { - if (out_n == 16 && in_n == 16) - return ix86_get_builtin (IX86_BUILTIN_EXP2PS); - } - break; - - CASE_CFN_IFLOOR: - CASE_CFN_LFLOOR: - CASE_CFN_LLFLOOR: - /* The round insn does not trap on denormals. 
*/ - if (flag_trapping_math || !TARGET_SSE4_1) - break; - - if (out_mode == SImode && in_mode == DFmode) - { - if (out_n == 4 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX); - else if (out_n == 8 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256); - else if (out_n == 16 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512); - } - if (out_mode == SImode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256); - else if (out_n == 16 && in_n == 16) - return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512); - } - break; - - CASE_CFN_ICEIL: - CASE_CFN_LCEIL: - CASE_CFN_LLCEIL: - /* The round insn does not trap on denormals. */ - if (flag_trapping_math || !TARGET_SSE4_1) - break; - - if (out_mode == SImode && in_mode == DFmode) - { - if (out_n == 4 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX); - else if (out_n == 8 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256); - else if (out_n == 16 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512); - } - if (out_mode == SImode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256); - else if (out_n == 16 && in_n == 16) - return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512); - } - break; - - CASE_CFN_IRINT: - CASE_CFN_LRINT: - CASE_CFN_LLRINT: - if (out_mode == SImode && in_mode == DFmode) - { - if (out_n == 4 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX); - else if (out_n == 8 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256); - else if (out_n == 16 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512); - } - if (out_mode == SImode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256); - else if (out_n == 16 && in_n == 16) - return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512); - } - break; - - CASE_CFN_IROUND: - CASE_CFN_LROUND: - CASE_CFN_LLROUND: - /* The round insn does not trap on denormals. */ - if (flag_trapping_math || !TARGET_SSE4_1) - break; - - if (out_mode == SImode && in_mode == DFmode) - { - if (out_n == 4 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX); - else if (out_n == 8 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256); - else if (out_n == 16 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512); - } - if (out_mode == SImode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256); - else if (out_n == 16 && in_n == 16) - return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512); - } - break; - - CASE_CFN_FLOOR: - /* The round insn does not trap on denormals. 
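/* Illustrative sketch of a loop the CASE_CFN_IRINT/LRINT mapping above can
   vectorize into CVTPS2DQ, e.g. at -O3 with SSE2 enabled; the (int) cast of
   lrintf is what produces the SImode result this hook looks for.  */
#include <math.h>

void
round_to_int (int *dst, const float *src, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = (int) lrintf (src[i]);
}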
*/ - if (flag_trapping_math || !TARGET_SSE4_1) - break; - - if (out_mode == DFmode && in_mode == DFmode) - { - if (out_n == 2 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_FLOORPD); - else if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_FLOORPD256); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_FLOORPD512); - } - if (out_mode == SFmode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_FLOORPS); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_FLOORPS256); - else if (out_n == 16 && in_n == 16) - return ix86_get_builtin (IX86_BUILTIN_FLOORPS512); - } - break; - - CASE_CFN_CEIL: - /* The round insn does not trap on denormals. */ - if (flag_trapping_math || !TARGET_SSE4_1) - break; - - if (out_mode == DFmode && in_mode == DFmode) - { - if (out_n == 2 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_CEILPD); - else if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_CEILPD256); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_CEILPD512); - } - if (out_mode == SFmode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_CEILPS); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_CEILPS256); - else if (out_n == 16 && in_n == 16) - return ix86_get_builtin (IX86_BUILTIN_CEILPS512); - } - break; - - CASE_CFN_TRUNC: - /* The round insn does not trap on denormals. */ - if (flag_trapping_math || !TARGET_SSE4_1) - break; - - if (out_mode == DFmode && in_mode == DFmode) - { - if (out_n == 2 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_TRUNCPD); - else if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512); - } - if (out_mode == SFmode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_TRUNCPS); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256); - else if (out_n == 16 && in_n == 16) - return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512); - } - break; - - CASE_CFN_FMA: - if (out_mode == DFmode && in_mode == DFmode) - { - if (out_n == 2 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_VFMADDPD); - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256); - } - if (out_mode == SFmode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_VFMADDPS); - if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256); - } - break; - - default: - break; - } - - /* Dispatch to a handler for a vectorization library. */ - if (ix86_veclib_handler) - return ix86_veclib_handler (combined_fn (fn), type_out, type_in); - - return NULL_TREE; -} - -/* Returns a decl of a function that implements gather load with - memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE. - Return NULL_TREE if it is not available. */ - -tree -ix86_vectorize_builtin_gather (const_tree mem_vectype, - const_tree index_type, int scale) -{ - bool si; - enum ix86_builtins code; - - if (! 
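/* Illustrative sketch of a loop the CASE_CFN_FMA mapping above can turn
   into VFMADDPS when compiled with -O3 -mfma.  */
#include <math.h>

void
axpy (float *y, const float *x, float a, int n)
{
  for (int i = 0; i < n; i++)
    y[i] = fmaf (a, x[i], y[i]);
}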
TARGET_AVX2 || !TARGET_USE_GATHER) - return NULL_TREE; - - if ((TREE_CODE (index_type) != INTEGER_TYPE - && !POINTER_TYPE_P (index_type)) - || (TYPE_MODE (index_type) != SImode - && TYPE_MODE (index_type) != DImode)) - return NULL_TREE; - - if (TYPE_PRECISION (index_type) > POINTER_SIZE) - return NULL_TREE; - - /* v*gather* insn sign extends index to pointer mode. */ - if (TYPE_PRECISION (index_type) < POINTER_SIZE - && TYPE_UNSIGNED (index_type)) - return NULL_TREE; - - if (scale <= 0 - || scale > 8 - || (scale & (scale - 1)) != 0) - return NULL_TREE; - - si = TYPE_MODE (index_type) == SImode; - switch (TYPE_MODE (mem_vectype)) - { - case E_V2DFmode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF; - else - code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF; - break; - case E_V4DFmode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF; - else - code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF; - break; - case E_V2DImode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI; - else - code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI; - break; - case E_V4DImode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI; - else - code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI; - break; - case E_V4SFmode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF; - else - code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF; - break; - case E_V8SFmode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF; - else - code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF; - break; - case E_V4SImode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI; - else - code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI; - break; - case E_V8SImode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI; - else - code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI; - break; - case E_V8DFmode: - if (TARGET_AVX512F) - code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF; - else - return NULL_TREE; - break; - case E_V8DImode: - if (TARGET_AVX512F) - code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI; - else - return NULL_TREE; - break; - case E_V16SFmode: - if (TARGET_AVX512F) - code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF; - else - return NULL_TREE; - break; - case E_V16SImode: - if (TARGET_AVX512F) - code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI; - else - return NULL_TREE; - break; - default: - return NULL_TREE; - } - - return ix86_get_builtin (code); -} - -/* Returns a code for a target-specific builtin that implements - reciprocal of the function, or NULL_TREE if not available. */ - -tree -ix86_builtin_reciprocal (tree fndecl) -{ - enum ix86_builtins fn_code - = (enum ix86_builtins) DECL_MD_FUNCTION_CODE (fndecl); - switch (fn_code) - { - /* Vectorized version of sqrt to rsqrt conversion. 
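/* Illustrative sketch of an indexed-load loop that
   ix86_vectorize_builtin_gather lets the vectorizer turn into v*gather*
   instructions, e.g. at -O3 -mavx2 on tunings where gather use is enabled.  */
void
gather_f (float *dst, const float *table, const int *idx, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = table[idx[i]];
}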
*/ - case IX86_BUILTIN_SQRTPS_NR: - return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR); - - case IX86_BUILTIN_SQRTPS_NR256: - return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256); - - default: - return NULL_TREE; - } -} - -/* Priority of i386 features, greater value is higher priority. This is - used to decide the order in which function dispatch must happen. For - instance, a version specialized for SSE4.2 should be checked for dispatch - before a version for SSE3, as SSE4.2 implies SSE3. */ -enum feature_priority -{ - P_ZERO = 0, - P_MMX, - P_SSE, - P_SSE2, - P_SSE3, - P_SSSE3, - P_PROC_SSSE3, - P_SSE4_A, - P_PROC_SSE4_A, - P_SSE4_1, - P_SSE4_2, - P_PROC_SSE4_2, - P_POPCNT, - P_AES, - P_PCLMUL, - P_AVX, - P_PROC_AVX, - P_BMI, - P_PROC_BMI, - P_FMA4, - P_XOP, - P_PROC_XOP, - P_FMA, - P_PROC_FMA, - P_BMI2, - P_AVX2, - P_PROC_AVX2, - P_AVX512F, - P_PROC_AVX512F -}; - -/* This is the order of bit-fields in __processor_features in cpuinfo.c */ -enum processor_features -{ - F_CMOV = 0, - F_MMX, - F_POPCNT, - F_SSE, - F_SSE2, - F_SSE3, - F_SSSE3, - F_SSE4_1, - F_SSE4_2, - F_AVX, - F_AVX2, - F_SSE4_A, - F_FMA4, - F_XOP, - F_FMA, - F_AVX512F, - F_BMI, - F_BMI2, - F_AES, - F_PCLMUL, - F_AVX512VL, - F_AVX512BW, - F_AVX512DQ, - F_AVX512CD, - F_AVX512ER, - F_AVX512PF, - F_AVX512VBMI, - F_AVX512IFMA, - F_AVX5124VNNIW, - F_AVX5124FMAPS, - F_AVX512VPOPCNTDQ, - F_AVX512VBMI2, - F_GFNI, - F_VPCLMULQDQ, - F_AVX512VNNI, - F_AVX512BITALG, - F_AVX512BF16, - F_AVX512VP2INTERSECT, - F_MAX -}; - -/* These are the values for vendor types and cpu types and subtypes - in cpuinfo.c. Cpu types and subtypes should be subtracted by - the corresponding start value. */ -enum processor_model -{ - M_INTEL = 1, - M_AMD, - M_CPU_TYPE_START, - M_INTEL_BONNELL, - M_INTEL_CORE2, - M_INTEL_COREI7, - M_AMDFAM10H, - M_AMDFAM15H, - M_INTEL_SILVERMONT, - M_INTEL_KNL, - M_AMD_BTVER1, - M_AMD_BTVER2, - M_AMDFAM17H, - M_INTEL_KNM, - M_INTEL_GOLDMONT, - M_INTEL_GOLDMONT_PLUS, - M_INTEL_TREMONT, - M_CPU_SUBTYPE_START, - M_INTEL_COREI7_NEHALEM, - M_INTEL_COREI7_WESTMERE, - M_INTEL_COREI7_SANDYBRIDGE, - M_AMDFAM10H_BARCELONA, - M_AMDFAM10H_SHANGHAI, - M_AMDFAM10H_ISTANBUL, - M_AMDFAM15H_BDVER1, - M_AMDFAM15H_BDVER2, - M_AMDFAM15H_BDVER3, - M_AMDFAM15H_BDVER4, - M_AMDFAM17H_ZNVER1, - M_INTEL_COREI7_IVYBRIDGE, - M_INTEL_COREI7_HASWELL, - M_INTEL_COREI7_BROADWELL, - M_INTEL_COREI7_SKYLAKE, - M_INTEL_COREI7_SKYLAKE_AVX512, - M_INTEL_COREI7_CANNONLAKE, - M_INTEL_COREI7_ICELAKE_CLIENT, - M_INTEL_COREI7_ICELAKE_SERVER, - M_AMDFAM17H_ZNVER2, - M_INTEL_COREI7_CASCADELAKE, - M_INTEL_COREI7_TIGERLAKE, - M_INTEL_COREI7_COOPERLAKE -}; - -struct _arch_names_table -{ - const char *const name; - const enum processor_model model; -}; - -static const _arch_names_table arch_names_table[] = -{ - {"amd", M_AMD}, - {"intel", M_INTEL}, - {"atom", M_INTEL_BONNELL}, - {"slm", M_INTEL_SILVERMONT}, - {"core2", M_INTEL_CORE2}, - {"corei7", M_INTEL_COREI7}, - {"nehalem", M_INTEL_COREI7_NEHALEM}, - {"westmere", M_INTEL_COREI7_WESTMERE}, - {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE}, - {"ivybridge", M_INTEL_COREI7_IVYBRIDGE}, - {"haswell", M_INTEL_COREI7_HASWELL}, - {"broadwell", M_INTEL_COREI7_BROADWELL}, - {"skylake", M_INTEL_COREI7_SKYLAKE}, - {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512}, - {"cannonlake", M_INTEL_COREI7_CANNONLAKE}, - {"icelake-client", M_INTEL_COREI7_ICELAKE_CLIENT}, - {"icelake-server", M_INTEL_COREI7_ICELAKE_SERVER}, - {"cascadelake", M_INTEL_COREI7_CASCADELAKE}, - {"tigerlake", M_INTEL_COREI7_TIGERLAKE}, - {"cooperlake", 
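/* Illustrative sketch: with -O3 -ffast-math -mrecip the vectorized 1/sqrtf
   computation below may use the RSQRTPS_NR builtins returned by
   ix86_builtin_reciprocal instead of a full square root and divide.  */
#include <math.h>

void
inv_sqrt (float *dst, const float *src, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = 1.0f / sqrtf (src[i]);
}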
M_INTEL_COREI7_COOPERLAKE}, - {"bonnell", M_INTEL_BONNELL}, - {"silvermont", M_INTEL_SILVERMONT}, - {"goldmont", M_INTEL_GOLDMONT}, - {"goldmont-plus", M_INTEL_GOLDMONT_PLUS}, - {"tremont", M_INTEL_TREMONT}, - {"knl", M_INTEL_KNL}, - {"knm", M_INTEL_KNM}, - {"amdfam10h", M_AMDFAM10H}, - {"barcelona", M_AMDFAM10H_BARCELONA}, - {"shanghai", M_AMDFAM10H_SHANGHAI}, - {"istanbul", M_AMDFAM10H_ISTANBUL}, - {"btver1", M_AMD_BTVER1}, - {"amdfam15h", M_AMDFAM15H}, - {"bdver1", M_AMDFAM15H_BDVER1}, - {"bdver2", M_AMDFAM15H_BDVER2}, - {"bdver3", M_AMDFAM15H_BDVER3}, - {"bdver4", M_AMDFAM15H_BDVER4}, - {"btver2", M_AMD_BTVER2}, - {"amdfam17h", M_AMDFAM17H}, - {"znver1", M_AMDFAM17H_ZNVER1}, - {"znver2", M_AMDFAM17H_ZNVER2}, -}; - -/* These are the target attribute strings for which a dispatcher is - available, from fold_builtin_cpu. */ -struct _isa_names_table -{ - const char *const name; - const enum processor_features feature; - const enum feature_priority priority; -}; - -static const _isa_names_table isa_names_table[] = -{ - {"cmov", F_CMOV, P_ZERO}, - {"mmx", F_MMX, P_MMX}, - {"popcnt", F_POPCNT, P_POPCNT}, - {"sse", F_SSE, P_SSE}, - {"sse2", F_SSE2, P_SSE2}, - {"sse3", F_SSE3, P_SSE3}, - {"ssse3", F_SSSE3, P_SSSE3}, - {"sse4a", F_SSE4_A, P_SSE4_A}, - {"sse4.1", F_SSE4_1, P_SSE4_1}, - {"sse4.2", F_SSE4_2, P_SSE4_2}, - {"avx", F_AVX, P_AVX}, - {"fma4", F_FMA4, P_FMA4}, - {"xop", F_XOP, P_XOP}, - {"fma", F_FMA, P_FMA}, - {"avx2", F_AVX2, P_AVX2}, - {"avx512f", F_AVX512F, P_AVX512F}, - {"bmi", F_BMI, P_BMI}, - {"bmi2", F_BMI2, P_BMI2}, - {"aes", F_AES, P_AES}, - {"pclmul", F_PCLMUL, P_PCLMUL}, - {"avx512vl",F_AVX512VL, P_ZERO}, - {"avx512bw",F_AVX512BW, P_ZERO}, - {"avx512dq",F_AVX512DQ, P_ZERO}, - {"avx512cd",F_AVX512CD, P_ZERO}, - {"avx512er",F_AVX512ER, P_ZERO}, - {"avx512pf",F_AVX512PF, P_ZERO}, - {"avx512vbmi",F_AVX512VBMI, P_ZERO}, - {"avx512ifma",F_AVX512IFMA, P_ZERO}, - {"avx5124vnniw",F_AVX5124VNNIW, P_ZERO}, - {"avx5124fmaps",F_AVX5124FMAPS, P_ZERO}, - {"avx512vpopcntdq",F_AVX512VPOPCNTDQ, P_ZERO}, - {"avx512vbmi2", F_AVX512VBMI2, P_ZERO}, - {"gfni", F_GFNI, P_ZERO}, - {"vpclmulqdq", F_VPCLMULQDQ, P_ZERO}, - {"avx512vnni", F_AVX512VNNI, P_ZERO}, - {"avx512bitalg", F_AVX512BITALG, P_ZERO}, - {"avx512bf16", F_AVX512BF16, P_ZERO}, - {"avx512vp2intersect",F_AVX512VP2INTERSECT, P_ZERO} -}; - -/* This parses the attribute arguments to target in DECL and determines - the right builtin to use to match the platform specification. - It returns the priority value for this version decl. If PREDICATE_LIST - is not NULL, it stores the list of cpu features that need to be checked - before dispatching this function. */ - -unsigned int -get_builtin_code_for_version (tree decl, tree *predicate_list) -{ - tree attrs; - struct cl_target_option cur_target; - tree target_node; - struct cl_target_option *new_target; - const char *arg_str = NULL; - const char *attrs_str = NULL; - char *tok_str = NULL; - char *token; - - enum feature_priority priority = P_ZERO; - - static unsigned int NUM_FEATURES - = sizeof (isa_names_table) / sizeof (_isa_names_table); - - unsigned int i; - - tree predicate_chain = NULL_TREE; - tree predicate_decl, predicate_arg; - - attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl)); - gcc_assert (attrs != NULL); - - attrs = TREE_VALUE (TREE_VALUE (attrs)); - - gcc_assert (TREE_CODE (attrs) == STRING_CST); - attrs_str = TREE_STRING_POINTER (attrs); - - /* Return priority zero for default function. 
*/ - if (strcmp (attrs_str, "default") == 0) - return 0; - - /* Handle arch= if specified. For priority, set it to be 1 more than - the best instruction set the processor can handle. For instance, if - there is a version for atom and a version for ssse3 (the highest ISA - priority for atom), the atom version must be checked for dispatch - before the ssse3 version. */ - if (strstr (attrs_str, "arch=") != NULL) - { - cl_target_option_save (&cur_target, &global_options); - target_node - = ix86_valid_target_attribute_tree (decl, attrs, &global_options, - &global_options_set, 0); - - gcc_assert (target_node); - if (target_node == error_mark_node) - return 0; - new_target = TREE_TARGET_OPTION (target_node); - gcc_assert (new_target); - - if (new_target->arch_specified && new_target->arch > 0) - { - switch (new_target->arch) - { - case PROCESSOR_CORE2: - arg_str = "core2"; - priority = P_PROC_SSSE3; - break; - case PROCESSOR_NEHALEM: - if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_PCLMUL) - { - arg_str = "westmere"; - priority = P_PCLMUL; - } - else - { - /* We translate "arch=corei7" and "arch=nehalem" to - "corei7" so that it will be mapped to M_INTEL_COREI7 - as cpu type to cover all M_INTEL_COREI7_XXXs. */ - arg_str = "corei7"; - priority = P_PROC_SSE4_2; - } - break; - case PROCESSOR_SANDYBRIDGE: - if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C) - arg_str = "ivybridge"; - else - arg_str = "sandybridge"; - priority = P_PROC_AVX; - break; - case PROCESSOR_HASWELL: - if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX) - arg_str = "broadwell"; - else - arg_str = "haswell"; - priority = P_PROC_AVX2; - break; - case PROCESSOR_SKYLAKE: - arg_str = "skylake"; - priority = P_PROC_AVX2; - break; - case PROCESSOR_SKYLAKE_AVX512: - arg_str = "skylake-avx512"; - priority = P_PROC_AVX512F; - break; - case PROCESSOR_CANNONLAKE: - arg_str = "cannonlake"; - priority = P_PROC_AVX512F; - break; - case PROCESSOR_ICELAKE_CLIENT: - arg_str = "icelake-client"; - priority = P_PROC_AVX512F; - break; - case PROCESSOR_ICELAKE_SERVER: - arg_str = "icelake-server"; - priority = P_PROC_AVX512F; - break; - case PROCESSOR_CASCADELAKE: - arg_str = "cascadelake"; - priority = P_PROC_AVX512F; - break; - case PROCESSOR_TIGERLAKE: - arg_str = "tigerlake"; - priority = P_PROC_AVX512F; - break; - case PROCESSOR_COOPERLAKE: - arg_str = "cooperlake"; - priority = P_PROC_AVX512F; - break; - case PROCESSOR_BONNELL: - arg_str = "bonnell"; - priority = P_PROC_SSSE3; - break; - case PROCESSOR_KNL: - arg_str = "knl"; - priority = P_PROC_AVX512F; - break; - case PROCESSOR_KNM: - arg_str = "knm"; - priority = P_PROC_AVX512F; - break; - case PROCESSOR_SILVERMONT: - arg_str = "silvermont"; - priority = P_PROC_SSE4_2; - break; - case PROCESSOR_GOLDMONT: - arg_str = "goldmont"; - priority = P_PROC_SSE4_2; - break; - case PROCESSOR_GOLDMONT_PLUS: - arg_str = "goldmont-plus"; - priority = P_PROC_SSE4_2; - break; - case PROCESSOR_TREMONT: - arg_str = "tremont"; - priority = P_PROC_SSE4_2; - break; - case PROCESSOR_AMDFAM10: - arg_str = "amdfam10h"; - priority = P_PROC_SSE4_A; - break; - case PROCESSOR_BTVER1: - arg_str = "btver1"; - priority = P_PROC_SSE4_A; - break; - case PROCESSOR_BTVER2: - arg_str = "btver2"; - priority = P_PROC_BMI; - break; - case PROCESSOR_BDVER1: - arg_str = "bdver1"; - priority = P_PROC_XOP; - break; - case PROCESSOR_BDVER2: - arg_str = "bdver2"; - priority = P_PROC_FMA; - break; - case PROCESSOR_BDVER3: - arg_str = "bdver3"; - priority = P_PROC_FMA; - break; - case PROCESSOR_BDVER4: - arg_str = 
"bdver4"; - priority = P_PROC_AVX2; - break; - case PROCESSOR_ZNVER1: - arg_str = "znver1"; - priority = P_PROC_AVX2; - break; - case PROCESSOR_ZNVER2: - arg_str = "znver2"; - priority = P_PROC_AVX2; - break; - } - } - - cl_target_option_restore (&global_options, &cur_target); - - if (predicate_list && arg_str == NULL) - { - error_at (DECL_SOURCE_LOCATION (decl), - "no dispatcher found for the versioning attributes"); - return 0; - } - - if (predicate_list) - { - predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS]; - /* For a C string literal the length includes the trailing NULL. */ - predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str); - predicate_chain = tree_cons (predicate_decl, predicate_arg, - predicate_chain); - } - } - - /* Process feature name. */ - tok_str = (char *) xmalloc (strlen (attrs_str) + 1); - strcpy (tok_str, attrs_str); - token = strtok (tok_str, ","); - predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS]; - - while (token != NULL) - { - /* Do not process "arch=" */ - if (strncmp (token, "arch=", 5) == 0) - { - token = strtok (NULL, ","); - continue; - } - for (i = 0; i < NUM_FEATURES; ++i) - { - if (strcmp (token, isa_names_table[i].name) == 0) - { - if (predicate_list) - { - predicate_arg = build_string_literal ( - strlen (isa_names_table[i].name) + 1, - isa_names_table[i].name); - predicate_chain = tree_cons (predicate_decl, predicate_arg, - predicate_chain); - } - /* Find the maximum priority feature. */ - if (isa_names_table[i].priority > priority) - priority = isa_names_table[i].priority; - - break; - } - } - if (predicate_list && priority == P_ZERO) - { - error_at (DECL_SOURCE_LOCATION (decl), - "ISA %qs is not supported in %<target%> attribute, " - "use %<arch=%> syntax", token); - return 0; - } - token = strtok (NULL, ","); - } - free (tok_str); - - if (predicate_list && predicate_chain == NULL_TREE) - { - error_at (DECL_SOURCE_LOCATION (decl), - "no dispatcher found for the versioning attributes: %s", - attrs_str); - return 0; - } - else if (predicate_list) - { - predicate_chain = nreverse (predicate_chain); - *predicate_list = predicate_chain; - } - - return priority; -} - -/* This builds the processor_model struct type defined in - libgcc/config/i386/cpuinfo.c */ - -static tree -build_processor_model_struct (void) -{ - const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype", - "__cpu_features"}; - tree field = NULL_TREE, field_chain = NULL_TREE; - int i; - tree type = make_node (RECORD_TYPE); - - /* The first 3 fields are unsigned int. */ - for (i = 0; i < 3; ++i) - { - field = build_decl (UNKNOWN_LOCATION, FIELD_DECL, - get_identifier (field_name[i]), unsigned_type_node); - if (field_chain != NULL_TREE) - DECL_CHAIN (field) = field_chain; - field_chain = field; - } - - /* The last field is an array of unsigned integers of size one. */ - field = build_decl (UNKNOWN_LOCATION, FIELD_DECL, - get_identifier (field_name[3]), - build_array_type (unsigned_type_node, - build_index_type (size_one_node))); - if (field_chain != NULL_TREE) - DECL_CHAIN (field) = field_chain; - field_chain = field; - - finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE); - return type; -} - -/* Returns a extern, comdat VAR_DECL of type TYPE and name NAME. 
*/ - -static tree -make_var_decl (tree type, const char *name) -{ - tree new_decl; - - new_decl = build_decl (UNKNOWN_LOCATION, - VAR_DECL, - get_identifier(name), - type); - - DECL_EXTERNAL (new_decl) = 1; - TREE_STATIC (new_decl) = 1; - TREE_PUBLIC (new_decl) = 1; - DECL_INITIAL (new_decl) = 0; - DECL_ARTIFICIAL (new_decl) = 0; - DECL_PRESERVE_P (new_decl) = 1; - - make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl)); - assemble_variable (new_decl, 0, 0, 0); - - return new_decl; -} - -/* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded - into an integer defined in libgcc/config/i386/cpuinfo.c */ - -tree -fold_builtin_cpu (tree fndecl, tree *args) -{ - unsigned int i; - enum ix86_builtins fn_code - = (enum ix86_builtins) DECL_MD_FUNCTION_CODE (fndecl); - tree param_string_cst = NULL; - - tree __processor_model_type = build_processor_model_struct (); - tree __cpu_model_var = make_var_decl (__processor_model_type, - "__cpu_model"); - - - varpool_node::add (__cpu_model_var); - - gcc_assert ((args != NULL) && (*args != NULL)); - - param_string_cst = *args; - while (param_string_cst - && TREE_CODE (param_string_cst) != STRING_CST) - { - /* *args must be a expr that can contain other EXPRS leading to a - STRING_CST. */ - if (!EXPR_P (param_string_cst)) - { - error ("parameter to builtin must be a string constant or literal"); - return integer_zero_node; - } - param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0); - } - - gcc_assert (param_string_cst); - - if (fn_code == IX86_BUILTIN_CPU_IS) - { - tree ref; - tree field; - tree final; - - unsigned int field_val = 0; - unsigned int NUM_ARCH_NAMES - = sizeof (arch_names_table) / sizeof (struct _arch_names_table); - - for (i = 0; i < NUM_ARCH_NAMES; i++) - if (strcmp (arch_names_table[i].name, - TREE_STRING_POINTER (param_string_cst)) == 0) - break; - - if (i == NUM_ARCH_NAMES) - { - error ("parameter to builtin not valid: %s", - TREE_STRING_POINTER (param_string_cst)); - return integer_zero_node; - } - - field = TYPE_FIELDS (__processor_model_type); - field_val = arch_names_table[i].model; - - /* CPU types are stored in the next field. */ - if (field_val > M_CPU_TYPE_START - && field_val < M_CPU_SUBTYPE_START) - { - field = DECL_CHAIN (field); - field_val -= M_CPU_TYPE_START; - } - - /* CPU subtypes are stored in the next field. */ - if (field_val > M_CPU_SUBTYPE_START) - { - field = DECL_CHAIN ( DECL_CHAIN (field)); - field_val -= M_CPU_SUBTYPE_START; - } - - /* Get the appropriate field in __cpu_model. */ - ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var, - field, NULL_TREE); - - /* Check the value. 
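/* Illustrative sketch of the user-level interface folded here; the string
   arguments must be names from arch_names_table / isa_names_table.  */
int
pick_impl (void)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_is ("intel") && __builtin_cpu_supports ("avx2"))
    return 2;
  if (__builtin_cpu_supports ("sse4.2"))
    return 1;
  return 0;
}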
*/ - final = build2 (EQ_EXPR, unsigned_type_node, ref, - build_int_cstu (unsigned_type_node, field_val)); - return build1 (CONVERT_EXPR, integer_type_node, final); - } - else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS) - { - tree ref; - tree array_elt; - tree field; - tree final; - - unsigned int field_val = 0; - unsigned int NUM_ISA_NAMES - = sizeof (isa_names_table) / sizeof (struct _isa_names_table); - - for (i = 0; i < NUM_ISA_NAMES; i++) - if (strcmp (isa_names_table[i].name, - TREE_STRING_POINTER (param_string_cst)) == 0) - break; - - if (i == NUM_ISA_NAMES) - { - error ("parameter to builtin not valid: %s", - TREE_STRING_POINTER (param_string_cst)); - return integer_zero_node; - } - - if (isa_names_table[i].feature >= 32) - { - tree __cpu_features2_var = make_var_decl (unsigned_type_node, - "__cpu_features2"); - - varpool_node::add (__cpu_features2_var); - field_val = (1U << (isa_names_table[i].feature - 32)); - /* Return __cpu_features2 & field_val */ - final = build2 (BIT_AND_EXPR, unsigned_type_node, - __cpu_features2_var, - build_int_cstu (unsigned_type_node, field_val)); - return build1 (CONVERT_EXPR, integer_type_node, final); - } - - field = TYPE_FIELDS (__processor_model_type); - /* Get the last field, which is __cpu_features. */ - while (DECL_CHAIN (field)) - field = DECL_CHAIN (field); - - /* Get the appropriate field: __cpu_model.__cpu_features */ - ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var, - field, NULL_TREE); - - /* Access the 0th element of __cpu_features array. */ - array_elt = build4 (ARRAY_REF, unsigned_type_node, ref, - integer_zero_node, NULL_TREE, NULL_TREE); - - field_val = (1U << isa_names_table[i].feature); - /* Return __cpu_model.__cpu_features[0] & field_val */ - final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt, - build_int_cstu (unsigned_type_node, field_val)); - return build1 (CONVERT_EXPR, integer_type_node, final); - } - gcc_unreachable (); -} - -#include "gt-i386-builtins.h" diff --git a/gcc/config/i386/i386-d.c b/gcc/config/i386/i386-d.c deleted file mode 100644 index 56fec11..0000000 --- a/gcc/config/i386/i386-d.c +++ /dev/null @@ -1,44 +0,0 @@ -/* Subroutines for the D front end on the x86 architecture. - Copyright (C) 2017-2020 Free Software Foundation, Inc. - -GCC is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 3, or (at your option) -any later version. - -GCC is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with GCC; see the file COPYING3. If not see -<http://www.gnu.org/licenses/>. */ - -#include "config.h" -#include "system.h" -#include "coretypes.h" -#include "tm.h" -#include "d/d-target.h" -#include "d/d-target-def.h" - -/* Implement TARGET_D_CPU_VERSIONS for x86 targets. 
*/ - -void -ix86_d_target_versions (void) -{ - if (TARGET_64BIT) - { - d_add_builtin_version ("X86_64"); - - if (TARGET_X32) - d_add_builtin_version ("D_X32"); - } - else - d_add_builtin_version ("X86"); - - if (TARGET_80387) - d_add_builtin_version ("D_HardFloat"); - else - d_add_builtin_version ("D_SoftFloat"); -} diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c deleted file mode 100644 index 270585d..0000000 --- a/gcc/config/i386/i386-expand.c +++ /dev/null @@ -1,20310 +0,0 @@ -/* Copyright (C) 1988-2020 Free Software Foundation, Inc. - -This file is part of GCC. - -GCC is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 3, or (at your option) -any later version. - -GCC is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with GCC; see the file COPYING3. If not see -<http://www.gnu.org/licenses/>. */ - -#define IN_TARGET_CODE 1 - -#include "config.h" -#include "system.h" -#include "coretypes.h" -#include "backend.h" -#include "rtl.h" -#include "tree.h" -#include "memmodel.h" -#include "gimple.h" -#include "cfghooks.h" -#include "cfgloop.h" -#include "df.h" -#include "tm_p.h" -#include "stringpool.h" -#include "expmed.h" -#include "optabs.h" -#include "regs.h" -#include "emit-rtl.h" -#include "recog.h" -#include "cgraph.h" -#include "diagnostic.h" -#include "cfgbuild.h" -#include "alias.h" -#include "fold-const.h" -#include "attribs.h" -#include "calls.h" -#include "stor-layout.h" -#include "varasm.h" -#include "output.h" -#include "insn-attr.h" -#include "flags.h" -#include "except.h" -#include "explow.h" -#include "expr.h" -#include "cfgrtl.h" -#include "common/common-target.h" -#include "langhooks.h" -#include "reload.h" -#include "gimplify.h" -#include "dwarf2.h" -#include "tm-constrs.h" -#include "cselib.h" -#include "sched-int.h" -#include "opts.h" -#include "tree-pass.h" -#include "context.h" -#include "pass_manager.h" -#include "target-globals.h" -#include "gimple-iterator.h" -#include "tree-vectorizer.h" -#include "shrink-wrap.h" -#include "builtins.h" -#include "rtl-iter.h" -#include "tree-iterator.h" -#include "dbgcnt.h" -#include "case-cfn-macros.h" -#include "dojump.h" -#include "fold-const-call.h" -#include "tree-vrp.h" -#include "tree-ssanames.h" -#include "selftest.h" -#include "selftest-rtl.h" -#include "print-rtl.h" -#include "intl.h" -#include "ifcvt.h" -#include "symbol-summary.h" -#include "ipa-prop.h" -#include "ipa-fnsummary.h" -#include "wide-int-bitmask.h" -#include "tree-vector-builder.h" -#include "debug.h" -#include "dwarf2out.h" -#include "i386-options.h" -#include "i386-builtins.h" -#include "i386-expand.h" - -/* Split one or more double-mode RTL references into pairs of half-mode - references. The RTL can be REG, offsettable MEM, integer constant, or - CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to - split and "num" is its length. lo_half and hi_half are output arrays - that parallel "operands". 
*/ - -void -split_double_mode (machine_mode mode, rtx operands[], - int num, rtx lo_half[], rtx hi_half[]) -{ - machine_mode half_mode; - unsigned int byte; - rtx mem_op = NULL_RTX; - int mem_num = 0; - - switch (mode) - { - case E_TImode: - half_mode = DImode; - break; - case E_DImode: - half_mode = SImode; - break; - default: - gcc_unreachable (); - } - - byte = GET_MODE_SIZE (half_mode); - - while (num--) - { - rtx op = operands[num]; - - /* simplify_subreg refuse to split volatile memory addresses, - but we still have to handle it. */ - if (MEM_P (op)) - { - if (mem_op && rtx_equal_p (op, mem_op)) - { - lo_half[num] = lo_half[mem_num]; - hi_half[num] = hi_half[mem_num]; - } - else - { - mem_op = op; - mem_num = num; - lo_half[num] = adjust_address (op, half_mode, 0); - hi_half[num] = adjust_address (op, half_mode, byte); - } - } - else - { - lo_half[num] = simplify_gen_subreg (half_mode, op, - GET_MODE (op) == VOIDmode - ? mode : GET_MODE (op), 0); - hi_half[num] = simplify_gen_subreg (half_mode, op, - GET_MODE (op) == VOIDmode - ? mode : GET_MODE (op), byte); - } - } -} - -/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate - for the target. */ - -void -ix86_expand_clear (rtx dest) -{ - rtx tmp; - - /* We play register width games, which are only valid after reload. */ - gcc_assert (reload_completed); - - /* Avoid HImode and its attendant prefix byte. */ - if (GET_MODE_SIZE (GET_MODE (dest)) < 4) - dest = gen_rtx_REG (SImode, REGNO (dest)); - tmp = gen_rtx_SET (dest, const0_rtx); - - if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ()) - { - rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob)); - } - - emit_insn (tmp); -} - -void -ix86_expand_move (machine_mode mode, rtx operands[]) -{ - rtx op0, op1; - rtx tmp, addend = NULL_RTX; - enum tls_model model; - - op0 = operands[0]; - op1 = operands[1]; - - switch (GET_CODE (op1)) - { - case CONST: - tmp = XEXP (op1, 0); - - if (GET_CODE (tmp) != PLUS - || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF) - break; - - op1 = XEXP (tmp, 0); - addend = XEXP (tmp, 1); - /* FALLTHRU */ - - case SYMBOL_REF: - model = SYMBOL_REF_TLS_MODEL (op1); - - if (model) - op1 = legitimize_tls_address (op1, model, true); - else if (ix86_force_load_from_GOT_p (op1)) - { - /* Load the external function address via GOT slot to avoid PLT. */ - op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1), - (TARGET_64BIT - ? UNSPEC_GOTPCREL - : UNSPEC_GOT)); - op1 = gen_rtx_CONST (Pmode, op1); - op1 = gen_const_mem (Pmode, op1); - set_mem_alias_set (op1, ix86_GOT_alias_set ()); - } - else - { - tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX); - if (tmp) - { - op1 = tmp; - if (!addend) - break; - } - else - { - op1 = operands[1]; - break; - } - } - - if (addend) - { - op1 = force_operand (op1, NULL_RTX); - op1 = expand_simple_binop (Pmode, PLUS, op1, addend, - op0, 1, OPTAB_DIRECT); - } - else - op1 = force_operand (op1, op0); - - if (op1 == op0) - return; - - op1 = convert_to_mode (mode, op1, 1); - - default: - break; - } - - if ((flag_pic || MACHOPIC_INDIRECT) - && symbolic_operand (op1, mode)) - { - if (TARGET_MACHO && !TARGET_64BIT) - { -#if TARGET_MACHO - /* dynamic-no-pic */ - if (MACHOPIC_INDIRECT) - { - rtx temp = (op0 && REG_P (op0) && mode == Pmode) - ? op0 : gen_reg_rtx (Pmode); - op1 = machopic_indirect_data_reference (op1, temp); - if (MACHOPIC_PURE) - op1 = machopic_legitimize_pic_address (op1, mode, - temp == op1 ? 
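/* Illustrative sketch of the source pattern split_double_mode serves on a
   32-bit target: a DImode addition compiled with -m32 is carried out on the
   two SImode halves (addl/adcl).  */
unsigned long long
add64 (unsigned long long a, unsigned long long b)
{
  return a + b;
}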
0 : temp); - } - if (op0 != op1 && GET_CODE (op0) != MEM) - { - rtx insn = gen_rtx_SET (op0, op1); - emit_insn (insn); - return; - } - if (GET_CODE (op0) == MEM) - op1 = force_reg (Pmode, op1); - else - { - rtx temp = op0; - if (GET_CODE (temp) != REG) - temp = gen_reg_rtx (Pmode); - temp = legitimize_pic_address (op1, temp); - if (temp == op0) - return; - op1 = temp; - } - /* dynamic-no-pic */ -#endif - } - else - { - if (MEM_P (op0)) - op1 = force_reg (mode, op1); - else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode))) - { - rtx reg = can_create_pseudo_p () ? NULL_RTX : op0; - op1 = legitimize_pic_address (op1, reg); - if (op0 == op1) - return; - op1 = convert_to_mode (mode, op1, 1); - } - } - } - else - { - if (MEM_P (op0) - && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode) - || !push_operand (op0, mode)) - && MEM_P (op1)) - op1 = force_reg (mode, op1); - - if (push_operand (op0, mode) - && ! general_no_elim_operand (op1, mode)) - op1 = copy_to_mode_reg (mode, op1); - - /* Force large constants in 64bit compilation into register - to get them CSEed. */ - if (can_create_pseudo_p () - && (mode == DImode) && TARGET_64BIT - && immediate_operand (op1, mode) - && !x86_64_zext_immediate_operand (op1, VOIDmode) - && !register_operand (op0, mode) - && optimize) - op1 = copy_to_mode_reg (mode, op1); - - if (can_create_pseudo_p () - && CONST_DOUBLE_P (op1)) - { - /* If we are loading a floating point constant to a register, - force the value to memory now, since we'll get better code - out the back end. */ - - op1 = validize_mem (force_const_mem (mode, op1)); - if (!register_operand (op0, mode)) - { - rtx temp = gen_reg_rtx (mode); - emit_insn (gen_rtx_SET (temp, op1)); - emit_move_insn (op0, temp); - return; - } - } - } - - emit_insn (gen_rtx_SET (op0, op1)); -} - -void -ix86_expand_vector_move (machine_mode mode, rtx operands[]) -{ - rtx op0 = operands[0], op1 = operands[1]; - /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU - psABI since the biggest alignment is 4 byte for IA MCU psABI. */ - unsigned int align = (TARGET_IAMCU - ? GET_MODE_BITSIZE (mode) - : GET_MODE_ALIGNMENT (mode)); - - if (push_operand (op0, VOIDmode)) - op0 = emit_move_resolve_push (mode, op0); - - /* Force constants other than zero into memory. We do not know how - the instructions used to build constants modify the upper 64 bits - of the register, once we have that information we may be able - to handle some of them more efficiently. */ - if (can_create_pseudo_p () - && (CONSTANT_P (op1) - || (SUBREG_P (op1) - && CONSTANT_P (SUBREG_REG (op1)))) - && ((register_operand (op0, mode) - && !standard_sse_constant_p (op1, mode)) - /* ix86_expand_vector_move_misalign() does not like constants. */ - || (SSE_REG_MODE_P (mode) - && MEM_P (op0) - && MEM_ALIGN (op0) < align))) - { - if (SUBREG_P (op1)) - { - machine_mode imode = GET_MODE (SUBREG_REG (op1)); - rtx r = force_const_mem (imode, SUBREG_REG (op1)); - if (r) - r = validize_mem (r); - else - r = force_reg (imode, SUBREG_REG (op1)); - op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1)); - } - else - op1 = validize_mem (force_const_mem (mode, op1)); - } - - /* We need to check memory alignment for SSE mode since attribute - can make operands unaligned. */ - if (can_create_pseudo_p () - && SSE_REG_MODE_P (mode) - && ((MEM_P (op0) && (MEM_ALIGN (op0) < align)) - || (MEM_P (op1) && (MEM_ALIGN (op1) < align)))) - { - rtx tmp[2]; - - /* ix86_expand_vector_move_misalign() does not like both - arguments in memory. 
*/ - if (!register_operand (op0, mode) - && !register_operand (op1, mode)) - op1 = force_reg (mode, op1); - - tmp[0] = op0; tmp[1] = op1; - ix86_expand_vector_move_misalign (mode, tmp); - return; - } - - /* Make operand1 a register if it isn't already. */ - if (can_create_pseudo_p () - && !register_operand (op0, mode) - && !register_operand (op1, mode)) - { - emit_move_insn (op0, force_reg (GET_MODE (op0), op1)); - return; - } - - emit_insn (gen_rtx_SET (op0, op1)); -} - -/* Split 32-byte AVX unaligned load and store if needed. */ - -static void -ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1) -{ - rtx m; - rtx (*extract) (rtx, rtx, rtx); - machine_mode mode; - - if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD) - || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE)) - { - emit_insn (gen_rtx_SET (op0, op1)); - return; - } - - rtx orig_op0 = NULL_RTX; - mode = GET_MODE (op0); - switch (GET_MODE_CLASS (mode)) - { - case MODE_VECTOR_INT: - case MODE_INT: - if (mode != V32QImode) - { - if (!MEM_P (op0)) - { - orig_op0 = op0; - op0 = gen_reg_rtx (V32QImode); - } - else - op0 = gen_lowpart (V32QImode, op0); - op1 = gen_lowpart (V32QImode, op1); - mode = V32QImode; - } - break; - case MODE_VECTOR_FLOAT: - break; - default: - gcc_unreachable (); - } - - switch (mode) - { - default: - gcc_unreachable (); - case E_V32QImode: - extract = gen_avx_vextractf128v32qi; - mode = V16QImode; - break; - case E_V8SFmode: - extract = gen_avx_vextractf128v8sf; - mode = V4SFmode; - break; - case E_V4DFmode: - extract = gen_avx_vextractf128v4df; - mode = V2DFmode; - break; - } - - if (MEM_P (op1)) - { - rtx r = gen_reg_rtx (mode); - m = adjust_address (op1, mode, 0); - emit_move_insn (r, m); - m = adjust_address (op1, mode, 16); - r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m); - emit_move_insn (op0, r); - } - else if (MEM_P (op0)) - { - m = adjust_address (op0, mode, 0); - emit_insn (extract (m, op1, const0_rtx)); - m = adjust_address (op0, mode, 16); - emit_insn (extract (m, copy_rtx (op1), const1_rtx)); - } - else - gcc_unreachable (); - - if (orig_op0) - emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0)); -} - -/* Implement the movmisalign patterns for SSE. Non-SSE modes go - straight to ix86_expand_vector_move. 
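/* Illustrative sketch of a source pattern that reaches the misaligned
   vector move expanders (compile with -mavx): a vector typedef whose
   declared alignment is below its natural alignment forces movmisalign
   handling, and the 256-bit access may be split per the tuning flags
   checked above.  */
typedef float v8sf __attribute__ ((vector_size (32), aligned (4)));

v8sf
load_v8sf (const v8sf *p)
{
  return *p;
}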
*/ -/* Code generation for scalar reg-reg moves of single and double precision data: - if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true) - movaps reg, reg - else - movss reg, reg - if (x86_sse_partial_reg_dependency == true) - movapd reg, reg - else - movsd reg, reg - - Code generation for scalar loads of double precision data: - if (x86_sse_split_regs == true) - movlpd mem, reg (gas syntax) - else - movsd mem, reg - - Code generation for unaligned packed loads of single precision data - (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency): - if (x86_sse_unaligned_move_optimal) - movups mem, reg - - if (x86_sse_partial_reg_dependency == true) - { - xorps reg, reg - movlps mem, reg - movhps mem+8, reg - } - else - { - movlps mem, reg - movhps mem+8, reg - } - - Code generation for unaligned packed loads of double precision data - (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs): - if (x86_sse_unaligned_move_optimal) - movupd mem, reg - - if (x86_sse_split_regs == true) - { - movlpd mem, reg - movhpd mem+8, reg - } - else - { - movsd mem, reg - movhpd mem+8, reg - } - */ - -void -ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[]) -{ - rtx op0, op1, m; - - op0 = operands[0]; - op1 = operands[1]; - - /* Use unaligned load/store for AVX512 or when optimizing for size. */ - if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ()) - { - emit_insn (gen_rtx_SET (op0, op1)); - return; - } - - if (TARGET_AVX) - { - if (GET_MODE_SIZE (mode) == 32) - ix86_avx256_split_vector_move_misalign (op0, op1); - else - /* Always use 128-bit mov<mode>_internal pattern for AVX. */ - emit_insn (gen_rtx_SET (op0, op1)); - return; - } - - if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL - || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) - { - emit_insn (gen_rtx_SET (op0, op1)); - return; - } - - /* ??? If we have typed data, then it would appear that using - movdqu is the only way to get unaligned data loaded with - integer type. */ - if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT) - { - emit_insn (gen_rtx_SET (op0, op1)); - return; - } - - if (MEM_P (op1)) - { - if (TARGET_SSE2 && mode == V2DFmode) - { - rtx zero; - - /* When SSE registers are split into halves, we can avoid - writing to the top half twice. */ - if (TARGET_SSE_SPLIT_REGS) - { - emit_clobber (op0); - zero = op0; - } - else - { - /* ??? Not sure about the best option for the Intel chips. - The following would seem to satisfy; the register is - entirely cleared, breaking the dependency chain. We - then store to the upper half, with a dependency depth - of one. A rumor has it that Intel recommends two movsd - followed by an unpacklpd, but this is unconfirmed. And - given that the dependency depth of the unpacklpd would - still be one, I'm not sure why this would be better. 
*/ - zero = CONST0_RTX (V2DFmode); - } - - m = adjust_address (op1, DFmode, 0); - emit_insn (gen_sse2_loadlpd (op0, zero, m)); - m = adjust_address (op1, DFmode, 8); - emit_insn (gen_sse2_loadhpd (op0, op0, m)); - } - else - { - rtx t; - - if (mode != V4SFmode) - t = gen_reg_rtx (V4SFmode); - else - t = op0; - - if (TARGET_SSE_PARTIAL_REG_DEPENDENCY) - emit_move_insn (t, CONST0_RTX (V4SFmode)); - else - emit_clobber (t); - - m = adjust_address (op1, V2SFmode, 0); - emit_insn (gen_sse_loadlps (t, t, m)); - m = adjust_address (op1, V2SFmode, 8); - emit_insn (gen_sse_loadhps (t, t, m)); - if (mode != V4SFmode) - emit_move_insn (op0, gen_lowpart (mode, t)); - } - } - else if (MEM_P (op0)) - { - if (TARGET_SSE2 && mode == V2DFmode) - { - m = adjust_address (op0, DFmode, 0); - emit_insn (gen_sse2_storelpd (m, op1)); - m = adjust_address (op0, DFmode, 8); - emit_insn (gen_sse2_storehpd (m, op1)); - } - else - { - if (mode != V4SFmode) - op1 = gen_lowpart (V4SFmode, op1); - - m = adjust_address (op0, V2SFmode, 0); - emit_insn (gen_sse_storelps (m, op1)); - m = adjust_address (op0, V2SFmode, 8); - emit_insn (gen_sse_storehps (m, copy_rtx (op1))); - } - } - else - gcc_unreachable (); -} - -/* Move bits 64:95 to bits 32:63. */ - -void -ix86_move_vector_high_sse_to_mmx (rtx op) -{ - rtx mask = gen_rtx_PARALLEL (VOIDmode, - gen_rtvec (4, GEN_INT (0), GEN_INT (2), - GEN_INT (0), GEN_INT (0))); - rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op)); - op = gen_rtx_VEC_SELECT (V4SImode, dest, mask); - rtx insn = gen_rtx_SET (dest, op); - emit_insn (insn); -} - -/* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */ - -void -ix86_split_mmx_pack (rtx operands[], enum rtx_code code) -{ - rtx op0 = operands[0]; - rtx op1 = operands[1]; - rtx op2 = operands[2]; - - machine_mode dmode = GET_MODE (op0); - machine_mode smode = GET_MODE (op1); - machine_mode inner_dmode = GET_MODE_INNER (dmode); - machine_mode inner_smode = GET_MODE_INNER (smode); - - /* Get the corresponding SSE mode for destination. */ - int nunits = 16 / GET_MODE_SIZE (inner_dmode); - machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode), - nunits).require (); - machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode), - nunits / 2).require (); - - /* Get the corresponding SSE mode for source. */ - nunits = 16 / GET_MODE_SIZE (inner_smode); - machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode), - nunits).require (); - - /* Generate SSE pack with signed/unsigned saturation. */ - rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0)); - op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1)); - op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2)); - - op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1); - op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2); - rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode, - op1, op2)); - emit_insn (insn); - - ix86_move_vector_high_sse_to_mmx (op0); -} - -/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */ - -void -ix86_split_mmx_punpck (rtx operands[], bool high_p) -{ - rtx op0 = operands[0]; - rtx op1 = operands[1]; - rtx op2 = operands[2]; - machine_mode mode = GET_MODE (op0); - rtx mask; - /* The corresponding SSE mode. 
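/* Illustrative sketch, assuming x86-64 where GCC implements MMX intrinsics
   with SSE registers: the 64-bit pack intrinsic below is handled by
   ix86_split_mmx_pack as an SSE pack plus a shuffle of the high part.  */
#include <mmintrin.h>

__m64
pack16to8 (__m64 a, __m64 b)
{
  return _mm_packs_pi16 (a, b);
}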
*/ - machine_mode sse_mode, double_sse_mode; - - switch (mode) - { - case E_V8QImode: - sse_mode = V16QImode; - double_sse_mode = V32QImode; - mask = gen_rtx_PARALLEL (VOIDmode, - gen_rtvec (16, - GEN_INT (0), GEN_INT (16), - GEN_INT (1), GEN_INT (17), - GEN_INT (2), GEN_INT (18), - GEN_INT (3), GEN_INT (19), - GEN_INT (4), GEN_INT (20), - GEN_INT (5), GEN_INT (21), - GEN_INT (6), GEN_INT (22), - GEN_INT (7), GEN_INT (23))); - break; - - case E_V4HImode: - sse_mode = V8HImode; - double_sse_mode = V16HImode; - mask = gen_rtx_PARALLEL (VOIDmode, - gen_rtvec (8, - GEN_INT (0), GEN_INT (8), - GEN_INT (1), GEN_INT (9), - GEN_INT (2), GEN_INT (10), - GEN_INT (3), GEN_INT (11))); - break; - - case E_V2SImode: - sse_mode = V4SImode; - double_sse_mode = V8SImode; - mask = gen_rtx_PARALLEL (VOIDmode, - gen_rtvec (4, - GEN_INT (0), GEN_INT (4), - GEN_INT (1), GEN_INT (5))); - break; - - default: - gcc_unreachable (); - } - - /* Generate SSE punpcklXX. */ - rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0)); - op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1)); - op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2)); - - op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2); - op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask); - rtx insn = gen_rtx_SET (dest, op2); - emit_insn (insn); - - if (high_p) - { - /* Move bits 64:127 to bits 0:63. */ - mask = gen_rtx_PARALLEL (VOIDmode, - gen_rtvec (4, GEN_INT (2), GEN_INT (3), - GEN_INT (0), GEN_INT (0))); - dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest)); - op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask); - insn = gen_rtx_SET (dest, op1); - emit_insn (insn); - } -} - -/* Helper function of ix86_fixup_binary_operands to canonicalize - operand order. Returns true if the operands should be swapped. */ - -static bool -ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode, - rtx operands[]) -{ - rtx dst = operands[0]; - rtx src1 = operands[1]; - rtx src2 = operands[2]; - - /* If the operation is not commutative, we can't do anything. */ - if (GET_RTX_CLASS (code) != RTX_COMM_ARITH - && GET_RTX_CLASS (code) != RTX_COMM_COMPARE) - return false; - - /* Highest priority is that src1 should match dst. */ - if (rtx_equal_p (dst, src1)) - return false; - if (rtx_equal_p (dst, src2)) - return true; - - /* Next highest priority is that immediate constants come second. */ - if (immediate_operand (src2, mode)) - return false; - if (immediate_operand (src1, mode)) - return true; - - /* Lowest priority is that memory references should come second. */ - if (MEM_P (src2)) - return false; - if (MEM_P (src1)) - return true; - - return false; -} - - -/* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the - destination to use for the operation. If different from the true - destination in operands[0], a copy operation will be required. */ - -rtx -ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode, - rtx operands[]) -{ - rtx dst = operands[0]; - rtx src1 = operands[1]; - rtx src2 = operands[2]; - - /* Canonicalize operand order. */ - if (ix86_swap_binary_operands_p (code, mode, operands)) - { - /* It is invalid to swap operands of different modes. */ - gcc_assert (GET_MODE (src1) == GET_MODE (src2)); - - std::swap (src1, src2); - } - - /* Both source operands cannot be in memory. */ - if (MEM_P (src1) && MEM_P (src2)) - { - /* Optimization: Only read from memory once. 
*/ - if (rtx_equal_p (src1, src2)) - { - src2 = force_reg (mode, src2); - src1 = src2; - } - else if (rtx_equal_p (dst, src1)) - src2 = force_reg (mode, src2); - else - src1 = force_reg (mode, src1); - } - - /* If the destination is memory, and we do not have matching source - operands, do things in registers. */ - if (MEM_P (dst) && !rtx_equal_p (dst, src1)) - dst = gen_reg_rtx (mode); - - /* Source 1 cannot be a constant. */ - if (CONSTANT_P (src1)) - src1 = force_reg (mode, src1); - - /* Source 1 cannot be a non-matching memory. */ - if (MEM_P (src1) && !rtx_equal_p (dst, src1)) - src1 = force_reg (mode, src1); - - /* Improve address combine. */ - if (code == PLUS - && GET_MODE_CLASS (mode) == MODE_INT - && MEM_P (src2)) - src2 = force_reg (mode, src2); - - operands[1] = src1; - operands[2] = src2; - return dst; -} - -/* Similarly, but assume that the destination has already been - set up properly. */ - -void -ix86_fixup_binary_operands_no_copy (enum rtx_code code, - machine_mode mode, rtx operands[]) -{ - rtx dst = ix86_fixup_binary_operands (code, mode, operands); - gcc_assert (dst == operands[0]); -} - -/* Attempt to expand a binary operator. Make the expansion closer to the - actual machine, then just general_operand, which will allow 3 separate - memory references (one output, two input) in a single insn. */ - -void -ix86_expand_binary_operator (enum rtx_code code, machine_mode mode, - rtx operands[]) -{ - rtx src1, src2, dst, op, clob; - - dst = ix86_fixup_binary_operands (code, mode, operands); - src1 = operands[1]; - src2 = operands[2]; - - /* Emit the instruction. */ - - op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2)); - - if (reload_completed - && code == PLUS - && !rtx_equal_p (dst, src1)) - { - /* This is going to be an LEA; avoid splitting it later. */ - emit_insn (op); - } - else - { - clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); - } - - /* Fix up the destination if needed. */ - if (dst != operands[0]) - emit_move_insn (operands[0], dst); -} - -/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with - the given OPERANDS. */ - -void -ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode, - rtx operands[]) -{ - rtx op1 = NULL_RTX, op2 = NULL_RTX; - if (SUBREG_P (operands[1])) - { - op1 = operands[1]; - op2 = operands[2]; - } - else if (SUBREG_P (operands[2])) - { - op1 = operands[2]; - op2 = operands[1]; - } - /* Optimize (__m128i) d | (__m128i) e and similar code - when d and e are float vectors into float vector logical - insn. In C/C++ without using intrinsics there is no other way - to express vector logical operation on float vectors than - to cast them temporarily to integer vectors. 
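/* Illustrative sketch of the casting idiom described above: a bitwise OR of
   float vectors written through integer-vector casts, which this expander
   can emit as a single float logical instruction (e.g. orps) instead of
   bouncing through integer vector registers.  */
typedef float     v4sf __attribute__ ((vector_size (16)));
typedef long long v2di __attribute__ ((vector_size (16)));

v4sf
vec_or (v4sf a, v4sf b)
{
  return (v4sf) ((v2di) a | (v2di) b);
}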
*/ - if (op1 - && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL - && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR) - && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT - && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode) - && SUBREG_BYTE (op1) == 0 - && (GET_CODE (op2) == CONST_VECTOR - || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2)) - && SUBREG_BYTE (op2) == 0)) - && can_create_pseudo_p ()) - { - rtx dst; - switch (GET_MODE (SUBREG_REG (op1))) - { - case E_V4SFmode: - case E_V8SFmode: - case E_V16SFmode: - case E_V2DFmode: - case E_V4DFmode: - case E_V8DFmode: - dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1))); - if (GET_CODE (op2) == CONST_VECTOR) - { - op2 = gen_lowpart (GET_MODE (dst), op2); - op2 = force_reg (GET_MODE (dst), op2); - } - else - { - op1 = operands[1]; - op2 = SUBREG_REG (operands[2]); - if (!vector_operand (op2, GET_MODE (dst))) - op2 = force_reg (GET_MODE (dst), op2); - } - op1 = SUBREG_REG (op1); - if (!vector_operand (op1, GET_MODE (dst))) - op1 = force_reg (GET_MODE (dst), op1); - emit_insn (gen_rtx_SET (dst, - gen_rtx_fmt_ee (code, GET_MODE (dst), - op1, op2))); - emit_move_insn (operands[0], gen_lowpart (mode, dst)); - return; - default: - break; - } - } - if (!vector_operand (operands[1], mode)) - operands[1] = force_reg (mode, operands[1]); - if (!vector_operand (operands[2], mode)) - operands[2] = force_reg (mode, operands[2]); - ix86_fixup_binary_operands_no_copy (code, mode, operands); - emit_insn (gen_rtx_SET (operands[0], - gen_rtx_fmt_ee (code, mode, operands[1], - operands[2]))); -} - -/* Return TRUE or FALSE depending on whether the binary operator meets the - appropriate constraints. */ - -bool -ix86_binary_operator_ok (enum rtx_code code, machine_mode mode, - rtx operands[3]) -{ - rtx dst = operands[0]; - rtx src1 = operands[1]; - rtx src2 = operands[2]; - - /* Both source operands cannot be in memory. */ - if (MEM_P (src1) && MEM_P (src2)) - return false; - - /* Canonicalize operand order for commutative operators. */ - if (ix86_swap_binary_operands_p (code, mode, operands)) - std::swap (src1, src2); - - /* If the destination is memory, we must have a matching source operand. */ - if (MEM_P (dst) && !rtx_equal_p (dst, src1)) - return false; - - /* Source 1 cannot be a constant. */ - if (CONSTANT_P (src1)) - return false; - - /* Source 1 cannot be a non-matching memory. */ - if (MEM_P (src1) && !rtx_equal_p (dst, src1)) - /* Support "andhi/andsi/anddi" as a zero-extending move. */ - return (code == AND - && (mode == HImode - || mode == SImode - || (TARGET_64BIT && mode == DImode)) - && satisfies_constraint_L (src2)); - - return true; -} - -/* Attempt to expand a unary operator. Make the expansion closer to the - actual machine, then just general_operand, which will allow 2 separate - memory references (one output, one input) in a single insn. */ - -void -ix86_expand_unary_operator (enum rtx_code code, machine_mode mode, - rtx operands[]) -{ - bool matching_memory = false; - rtx src, dst, op, clob; - - dst = operands[0]; - src = operands[1]; - - /* If the destination is memory, and we do not have matching source - operands, do things in registers. */ - if (MEM_P (dst)) - { - if (rtx_equal_p (dst, src)) - matching_memory = true; - else - dst = gen_reg_rtx (mode); - } - - /* When source operand is memory, destination must match. */ - if (MEM_P (src) && !matching_memory) - src = force_reg (mode, src); - - /* Emit the instruction. 
*/ - - op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src)); - - if (code == NOT) - emit_insn (op); - else - { - clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); - } - - /* Fix up the destination if needed. */ - if (dst != operands[0]) - emit_move_insn (operands[0], dst); -} - -/* Predict just emitted jump instruction to be taken with probability PROB. */ - -static void -predict_jump (int prob) -{ - rtx_insn *insn = get_last_insn (); - gcc_assert (JUMP_P (insn)); - add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob)); -} - -/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and - divisor are within the range [0-255]. */ - -void -ix86_split_idivmod (machine_mode mode, rtx operands[], - bool unsigned_p) -{ - rtx_code_label *end_label, *qimode_label; - rtx div, mod; - rtx_insn *insn; - rtx scratch, tmp0, tmp1, tmp2; - rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx); - - switch (mode) - { - case E_SImode: - if (GET_MODE (operands[0]) == SImode) - { - if (GET_MODE (operands[1]) == SImode) - gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1; - else - gen_divmod4_1 - = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2; - } - else - gen_divmod4_1 - = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1; - break; - - case E_DImode: - gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1; - break; - - default: - gcc_unreachable (); - } - - end_label = gen_label_rtx (); - qimode_label = gen_label_rtx (); - - scratch = gen_reg_rtx (mode); - - /* Use 8bit unsigned divimod if dividend and divisor are within - the range [0-255]. */ - emit_move_insn (scratch, operands[2]); - scratch = expand_simple_binop (mode, IOR, scratch, operands[3], - scratch, 1, OPTAB_DIRECT); - emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100))); - tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG); - tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx); - tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0, - gen_rtx_LABEL_REF (VOIDmode, qimode_label), - pc_rtx); - insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0)); - predict_jump (REG_BR_PROB_BASE * 50 / 100); - JUMP_LABEL (insn) = qimode_label; - - /* Generate original signed/unsigned divimod. */ - div = gen_divmod4_1 (operands[0], operands[1], - operands[2], operands[3]); - emit_insn (div); - - /* Branch to the end. */ - emit_jump_insn (gen_jump (end_label)); - emit_barrier (); - - /* Generate 8bit unsigned divide. */ - emit_label (qimode_label); - /* Don't use operands[0] for result of 8bit divide since not all - registers support QImode ZERO_EXTRACT. */ - tmp0 = lowpart_subreg (HImode, scratch, mode); - tmp1 = lowpart_subreg (HImode, operands[2], mode); - tmp2 = lowpart_subreg (QImode, operands[3], mode); - emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2)); - - if (unsigned_p) - { - div = gen_rtx_UDIV (mode, operands[2], operands[3]); - mod = gen_rtx_UMOD (mode, operands[2], operands[3]); - } - else - { - div = gen_rtx_DIV (mode, operands[2], operands[3]); - mod = gen_rtx_MOD (mode, operands[2], operands[3]); - } - if (mode == SImode) - { - if (GET_MODE (operands[0]) != SImode) - div = gen_rtx_ZERO_EXTEND (DImode, div); - if (GET_MODE (operands[1]) != SImode) - mod = gen_rtx_ZERO_EXTEND (DImode, mod); - } - - /* Extract remainder from AH. 
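A rough C model (illustrative, assuming b != 0; the function name is made up, this is not the emitted RTL) of the fast path chosen above: when dividend and divisor both fit in 8 bits, a single 8-bit DIV leaves the quotient in AL and the remainder in AH.

#include <stdint.h>

static void
divmod_fast_path (uint32_t a, uint32_t b, uint32_t *quo, uint32_t *rem)
{
  if (((a | b) & ~0xffu) == 0)        /* same check as the IOR + test above */
    {
      *quo = (uint8_t) a / (uint8_t) b;   /* ends up in AL */
      *rem = (uint8_t) a % (uint8_t) b;   /* ends up in AH */
    }
  else
    {
      *quo = a / b;                   /* full-width DIV on the slow path */
      *rem = a % b;
    }
}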
*/ - tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), - tmp0, GEN_INT (8), GEN_INT (8)); - if (REG_P (operands[1])) - insn = emit_move_insn (operands[1], tmp1); - else - { - /* Need a new scratch register since the old one has result - of 8bit divide. */ - scratch = gen_reg_rtx (GET_MODE (operands[1])); - emit_move_insn (scratch, tmp1); - insn = emit_move_insn (operands[1], scratch); - } - set_unique_reg_note (insn, REG_EQUAL, mod); - - /* Zero extend quotient from AL. */ - tmp1 = gen_lowpart (QImode, tmp0); - insn = emit_insn (gen_extend_insn - (operands[0], tmp1, - GET_MODE (operands[0]), QImode, 1)); - set_unique_reg_note (insn, REG_EQUAL, div); - - emit_label (end_label); -} - -/* Emit x86 binary operand CODE in mode MODE, where the first operand - matches destination. RTX includes clobber of FLAGS_REG. */ - -void -ix86_emit_binop (enum rtx_code code, machine_mode mode, - rtx dst, rtx src) -{ - rtx op, clob; - - op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src)); - clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - - emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); -} - -/* Return true if regno1 def is nearest to the insn. */ - -static bool -find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2) -{ - rtx_insn *prev = insn; - rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn)); - - if (insn == start) - return false; - while (prev && prev != start) - { - if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev)) - { - prev = PREV_INSN (prev); - continue; - } - if (insn_defines_reg (regno1, INVALID_REGNUM, prev)) - return true; - else if (insn_defines_reg (regno2, INVALID_REGNUM, prev)) - return false; - prev = PREV_INSN (prev); - } - - /* None of the regs is defined in the bb. */ - return false; -} - -/* Split lea instructions into a sequence of instructions - which are executed on ALU to avoid AGU stalls. - It is assumed that it is allowed to clobber flags register - at lea position. */ - -void -ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode) -{ - unsigned int regno0, regno1, regno2; - struct ix86_address parts; - rtx target, tmp; - int ok, adds; - - ok = ix86_decompose_address (operands[1], &parts); - gcc_assert (ok); - - target = gen_lowpart (mode, operands[0]); - - regno0 = true_regnum (target); - regno1 = INVALID_REGNUM; - regno2 = INVALID_REGNUM; - - if (parts.base) - { - parts.base = gen_lowpart (mode, parts.base); - regno1 = true_regnum (parts.base); - } - - if (parts.index) - { - parts.index = gen_lowpart (mode, parts.index); - regno2 = true_regnum (parts.index); - } - - if (parts.disp) - parts.disp = gen_lowpart (mode, parts.disp); - - if (parts.scale > 1) - { - /* Case r1 = r1 + ... */ - if (regno1 == regno0) - { - /* If we have a case r1 = r1 + C * r2 then we - should use multiplication which is very - expensive. Assume cost model is wrong if we - have such case here. */ - gcc_assert (regno2 != regno0); - - for (adds = parts.scale; adds > 0; adds--) - ix86_emit_binop (PLUS, mode, target, parts.index); - } - else - { - /* r1 = r2 + r3 * C case. Need to move r3 into r1. */ - if (regno0 != regno2) - emit_insn (gen_rtx_SET (target, parts.index)); - - /* Use shift for scaling. 
*/ - ix86_emit_binop (ASHIFT, mode, target, - GEN_INT (exact_log2 (parts.scale))); - - if (parts.base) - ix86_emit_binop (PLUS, mode, target, parts.base); - - if (parts.disp && parts.disp != const0_rtx) - ix86_emit_binop (PLUS, mode, target, parts.disp); - } - } - else if (!parts.base && !parts.index) - { - gcc_assert(parts.disp); - emit_insn (gen_rtx_SET (target, parts.disp)); - } - else - { - if (!parts.base) - { - if (regno0 != regno2) - emit_insn (gen_rtx_SET (target, parts.index)); - } - else if (!parts.index) - { - if (regno0 != regno1) - emit_insn (gen_rtx_SET (target, parts.base)); - } - else - { - if (regno0 == regno1) - tmp = parts.index; - else if (regno0 == regno2) - tmp = parts.base; - else - { - rtx tmp1; - - /* Find better operand for SET instruction, depending - on which definition is farther from the insn. */ - if (find_nearest_reg_def (insn, regno1, regno2)) - tmp = parts.index, tmp1 = parts.base; - else - tmp = parts.base, tmp1 = parts.index; - - emit_insn (gen_rtx_SET (target, tmp)); - - if (parts.disp && parts.disp != const0_rtx) - ix86_emit_binop (PLUS, mode, target, parts.disp); - - ix86_emit_binop (PLUS, mode, target, tmp1); - return; - } - - ix86_emit_binop (PLUS, mode, target, tmp); - } - - if (parts.disp && parts.disp != const0_rtx) - ix86_emit_binop (PLUS, mode, target, parts.disp); - } -} - -/* Post-reload splitter for converting an SF or DFmode value in an - SSE register into an unsigned SImode. */ - -void -ix86_split_convert_uns_si_sse (rtx operands[]) -{ - machine_mode vecmode; - rtx value, large, zero_or_two31, input, two31, x; - - large = operands[1]; - zero_or_two31 = operands[2]; - input = operands[3]; - two31 = operands[4]; - vecmode = GET_MODE (large); - value = gen_rtx_REG (vecmode, REGNO (operands[0])); - - /* Load up the value into the low element. We must ensure that the other - elements are valid floats -- zero is the easiest such value. */ - if (MEM_P (input)) - { - if (vecmode == V4SFmode) - emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input)); - else - emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input)); - } - else - { - input = gen_rtx_REG (vecmode, REGNO (input)); - emit_move_insn (value, CONST0_RTX (vecmode)); - if (vecmode == V4SFmode) - emit_insn (gen_sse_movss (value, value, input)); - else - emit_insn (gen_sse2_movsd (value, value, input)); - } - - emit_move_insn (large, two31); - emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31); - - x = gen_rtx_fmt_ee (LE, vecmode, large, value); - emit_insn (gen_rtx_SET (large, x)); - - x = gen_rtx_AND (vecmode, zero_or_two31, large); - emit_insn (gen_rtx_SET (zero_or_two31, x)); - - x = gen_rtx_MINUS (vecmode, value, zero_or_two31); - emit_insn (gen_rtx_SET (value, x)); - - large = gen_rtx_REG (V4SImode, REGNO (large)); - emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31))); - - x = gen_rtx_REG (V4SImode, REGNO (value)); - if (vecmode == V4SFmode) - emit_insn (gen_fix_truncv4sfv4si2 (x, value)); - else - emit_insn (gen_sse2_cvttpd2dq (x, value)); - value = x; - - emit_insn (gen_xorv4si3 (value, value, large)); -} - -static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok, - machine_mode mode, rtx target, - rtx var, int one_var); - -/* Convert an unsigned DImode value into a DFmode, using only SSE. - Expects the 64-bit DImode to be supplied in a pair of integral - registers. Requires SSE2; will use SSE3 if available. For x86_32, - -mfpmath=sse, !optimize_size only. 
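A scalar model of the exponent-bias trick the function below uses (illustrative only, assuming IEEE doubles and round-to-nearest; the function name is made up): gluing 0x43300000 in front of a 32-bit value yields the double 0x1.0p52 + value, and 0x45300000 plays the same role for the high half scaled by 2^32.

#include <stdint.h>
#include <string.h>

static double
u64_to_double_model (uint64_t x)
{
  uint64_t lo_bits = ((uint64_t) 0x43300000 << 32) | (uint32_t) x;
  uint64_t hi_bits = ((uint64_t) 0x45300000 << 32) | (uint32_t) (x >> 32);
  double lo, hi;
  memcpy (&lo, &lo_bits, sizeof lo);   /* 0x1.0p52 + low 32 bits   */
  memcpy (&hi, &hi_bits, sizeof hi);   /* 0x1.0p84 + high 32 * 2^32 */
  /* Both subtractions are exact; the final add does the one rounding.  */
  return (hi - 0x1.0p84) + (lo - 0x1.0p52);
}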
*/ - -void -ix86_expand_convert_uns_didf_sse (rtx target, rtx input) -{ - REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt; - rtx int_xmm, fp_xmm; - rtx biases, exponents; - rtx x; - - int_xmm = gen_reg_rtx (V4SImode); - if (TARGET_INTER_UNIT_MOVES_TO_VEC) - emit_insn (gen_movdi_to_sse (int_xmm, input)); - else if (TARGET_SSE_SPLIT_REGS) - { - emit_clobber (int_xmm); - emit_move_insn (gen_lowpart (DImode, int_xmm), input); - } - else - { - x = gen_reg_rtx (V2DImode); - ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0); - emit_move_insn (int_xmm, gen_lowpart (V4SImode, x)); - } - - x = gen_rtx_CONST_VECTOR (V4SImode, - gen_rtvec (4, GEN_INT (0x43300000UL), - GEN_INT (0x45300000UL), - const0_rtx, const0_rtx)); - exponents = validize_mem (force_const_mem (V4SImode, x)); - - /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */ - emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents)); - - /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm) - yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)). - Similarly (0x45300000UL ## fp_value_hi_xmm) yields - (0x1.0p84 + double(fp_value_hi_xmm)). - Note these exponents differ by 32. */ - - fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm)); - - /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values - in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */ - real_ldexp (&bias_lo_rvt, &dconst1, 52); - real_ldexp (&bias_hi_rvt, &dconst1, 84); - biases = const_double_from_real_value (bias_lo_rvt, DFmode); - x = const_double_from_real_value (bias_hi_rvt, DFmode); - biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x)); - biases = validize_mem (force_const_mem (V2DFmode, biases)); - emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases)); - - /* Add the upper and lower DFmode values together. */ - if (TARGET_SSE3) - emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm)); - else - { - x = copy_to_mode_reg (V2DFmode, fp_xmm); - emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm)); - emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x)); - } - - ix86_expand_vector_extract (false, target, fp_xmm, 0); -} - -/* Not used, but eases macroization of patterns. */ -void -ix86_expand_convert_uns_sixf_sse (rtx, rtx) -{ - gcc_unreachable (); -} - -/* Convert an unsigned SImode value into a DFmode. Only currently used - for SSE, but applicable anywhere. */ - -void -ix86_expand_convert_uns_sidf_sse (rtx target, rtx input) -{ - REAL_VALUE_TYPE TWO31r; - rtx x, fp; - - x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1), - NULL, 1, OPTAB_DIRECT); - - fp = gen_reg_rtx (DFmode); - emit_insn (gen_floatsidf2 (fp, x)); - - real_ldexp (&TWO31r, &dconst1, 31); - x = const_double_from_real_value (TWO31r, DFmode); - - x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT); - if (x != target) - emit_move_insn (target, x); -} - -/* Convert a signed DImode value into a DFmode. Only used for SSE in - 32-bit mode; otherwise we have a direct convert instruction. 
*/ - -void -ix86_expand_convert_sign_didf_sse (rtx target, rtx input) -{ - REAL_VALUE_TYPE TWO32r; - rtx fp_lo, fp_hi, x; - - fp_lo = gen_reg_rtx (DFmode); - fp_hi = gen_reg_rtx (DFmode); - - emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input))); - - real_ldexp (&TWO32r, &dconst1, 32); - x = const_double_from_real_value (TWO32r, DFmode); - fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT); - - ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input)); - - x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target, - 0, OPTAB_DIRECT); - if (x != target) - emit_move_insn (target, x); -} - -/* Convert an unsigned SImode value into a SFmode, using only SSE. - For x86_32, -mfpmath=sse, !optimize_size only. */ -void -ix86_expand_convert_uns_sisf_sse (rtx target, rtx input) -{ - REAL_VALUE_TYPE ONE16r; - rtx fp_hi, fp_lo, int_hi, int_lo, x; - - real_ldexp (&ONE16r, &dconst1, 16); - x = const_double_from_real_value (ONE16r, SFmode); - int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff), - NULL, 0, OPTAB_DIRECT); - int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16), - NULL, 0, OPTAB_DIRECT); - fp_hi = gen_reg_rtx (SFmode); - fp_lo = gen_reg_rtx (SFmode); - emit_insn (gen_floatsisf2 (fp_hi, int_hi)); - emit_insn (gen_floatsisf2 (fp_lo, int_lo)); - fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi, - 0, OPTAB_DIRECT); - fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target, - 0, OPTAB_DIRECT); - if (!rtx_equal_p (target, fp_hi)) - emit_move_insn (target, fp_hi); -} - -/* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert - a vector of unsigned ints VAL to vector of floats TARGET. */ - -void -ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val) -{ - rtx tmp[8]; - REAL_VALUE_TYPE TWO16r; - machine_mode intmode = GET_MODE (val); - machine_mode fltmode = GET_MODE (target); - rtx (*cvt) (rtx, rtx); - - if (intmode == V4SImode) - cvt = gen_floatv4siv4sf2; - else - cvt = gen_floatv8siv8sf2; - tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff)); - tmp[0] = force_reg (intmode, tmp[0]); - tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1, - OPTAB_DIRECT); - tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16), - NULL_RTX, 1, OPTAB_DIRECT); - tmp[3] = gen_reg_rtx (fltmode); - emit_insn (cvt (tmp[3], tmp[1])); - tmp[4] = gen_reg_rtx (fltmode); - emit_insn (cvt (tmp[4], tmp[2])); - real_ldexp (&TWO16r, &dconst1, 16); - tmp[5] = const_double_from_real_value (TWO16r, SFmode); - tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5])); - tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1, - OPTAB_DIRECT); - tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1, - OPTAB_DIRECT); - if (tmp[7] != target) - emit_move_insn (target, tmp[7]); -} - -/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc* - pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*. - This is done by doing just signed conversion if < 0x1p31, and otherwise by - subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */ - -rtx -ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp) -{ - REAL_VALUE_TYPE TWO31r; - rtx two31r, tmp[4]; - machine_mode mode = GET_MODE (val); - machine_mode scalarmode = GET_MODE_INNER (mode); - machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? 
V8SImode : V4SImode; - rtx (*cmp) (rtx, rtx, rtx, rtx); - int i; - - for (i = 0; i < 3; i++) - tmp[i] = gen_reg_rtx (mode); - real_ldexp (&TWO31r, &dconst1, 31); - two31r = const_double_from_real_value (TWO31r, scalarmode); - two31r = ix86_build_const_vector (mode, 1, two31r); - two31r = force_reg (mode, two31r); - switch (mode) - { - case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break; - case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break; - case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break; - case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break; - default: gcc_unreachable (); - } - tmp[3] = gen_rtx_LE (mode, two31r, val); - emit_insn (cmp (tmp[0], two31r, val, tmp[3])); - tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1], - 0, OPTAB_DIRECT); - if (intmode == V4SImode || TARGET_AVX2) - *xorp = expand_simple_binop (intmode, ASHIFT, - gen_lowpart (intmode, tmp[0]), - GEN_INT (31), NULL_RTX, 0, - OPTAB_DIRECT); - else - { - rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode); - two31 = ix86_build_const_vector (intmode, 1, two31); - *xorp = expand_simple_binop (intmode, AND, - gen_lowpart (intmode, tmp[0]), - two31, NULL_RTX, 0, - OPTAB_DIRECT); - } - return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2], - 0, OPTAB_DIRECT); -} - -/* Generate code for floating point ABS or NEG. */ - -void -ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode, - rtx operands[]) -{ - rtx set, dst, src; - bool use_sse = false; - bool vector_mode = VECTOR_MODE_P (mode); - machine_mode vmode = mode; - rtvec par; - - if (vector_mode || mode == TFmode) - use_sse = true; - else if (TARGET_SSE_MATH) - { - use_sse = SSE_FLOAT_MODE_P (mode); - if (mode == SFmode) - vmode = V4SFmode; - else if (mode == DFmode) - vmode = V2DFmode; - } - - dst = operands[0]; - src = operands[1]; - - set = gen_rtx_fmt_e (code, mode, src); - set = gen_rtx_SET (dst, set); - - if (use_sse) - { - rtx mask, use, clob; - - /* NEG and ABS performed with SSE use bitwise mask operations. - Create the appropriate mask now. */ - mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS); - use = gen_rtx_USE (VOIDmode, mask); - if (vector_mode || mode == TFmode) - par = gen_rtvec (2, set, use); - else - { - clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - par = gen_rtvec (3, set, use, clob); - } - } - else - { - rtx clob; - - /* Changing of sign for FP values is doable using integer unit too. */ - clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - par = gen_rtvec (2, set, clob); - } - - emit_insn (gen_rtx_PARALLEL (VOIDmode, par)); -} - -/* Deconstruct a floating point ABS or NEG operation - with integer registers into integer operations. 
*/ - -void -ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode, - rtx operands[]) -{ - enum rtx_code absneg_op; - rtx dst, set; - - gcc_assert (operands_match_p (operands[0], operands[1])); - - switch (mode) - { - case E_SFmode: - dst = gen_lowpart (SImode, operands[0]); - - if (code == ABS) - { - set = gen_int_mode (0x7fffffff, SImode); - absneg_op = AND; - } - else - { - set = gen_int_mode (0x80000000, SImode); - absneg_op = XOR; - } - set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set); - break; - - case E_DFmode: - if (TARGET_64BIT) - { - dst = gen_lowpart (DImode, operands[0]); - dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63)); - - if (code == ABS) - set = const0_rtx; - else - set = gen_rtx_NOT (DImode, dst); - } - else - { - dst = gen_highpart (SImode, operands[0]); - - if (code == ABS) - { - set = gen_int_mode (0x7fffffff, SImode); - absneg_op = AND; - } - else - { - set = gen_int_mode (0x80000000, SImode); - absneg_op = XOR; - } - set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set); - } - break; - - case E_XFmode: - dst = gen_rtx_REG (SImode, - REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2)); - if (code == ABS) - { - set = GEN_INT (0x7fff); - absneg_op = AND; - } - else - { - set = GEN_INT (0x8000); - absneg_op = XOR; - } - set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set); - break; - - default: - gcc_unreachable (); - } - - set = gen_rtx_SET (dst, set); - - rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - rtvec par = gen_rtvec (2, set, clob); - - emit_insn (gen_rtx_PARALLEL (VOIDmode, par)); -} - -/* Expand a copysign operation. Special case operand 0 being a constant. */ - -void -ix86_expand_copysign (rtx operands[]) -{ - machine_mode mode, vmode; - rtx dest, op0, op1, mask; - - dest = operands[0]; - op0 = operands[1]; - op1 = operands[2]; - - mode = GET_MODE (dest); - - if (mode == SFmode) - vmode = V4SFmode; - else if (mode == DFmode) - vmode = V2DFmode; - else if (mode == TFmode) - vmode = mode; - else - gcc_unreachable (); - - mask = ix86_build_signbit_mask (vmode, 0, 0); - - if (CONST_DOUBLE_P (op0)) - { - if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0))) - op0 = simplify_unary_operation (ABS, mode, op0, mode); - - if (mode == SFmode || mode == DFmode) - { - if (op0 == CONST0_RTX (mode)) - op0 = CONST0_RTX (vmode); - else - { - rtx v = ix86_build_const_vector (vmode, false, op0); - - op0 = force_reg (vmode, v); - } - } - else if (op0 != CONST0_RTX (mode)) - op0 = force_reg (mode, op0); - - emit_insn (gen_copysign3_const (mode, dest, op0, op1, mask)); - } - else - { - rtx nmask = ix86_build_signbit_mask (vmode, 0, 1); - - emit_insn (gen_copysign3_var - (mode, dest, NULL_RTX, op0, op1, nmask, mask)); - } -} - -/* Deconstruct a copysign operation into bit masks. Operand 0 is known to - be a constant, and so has already been expanded into a vector constant. */ - -void -ix86_split_copysign_const (rtx operands[]) -{ - machine_mode mode, vmode; - rtx dest, op0, mask, x; - - dest = operands[0]; - op0 = operands[1]; - mask = operands[3]; - - mode = GET_MODE (dest); - vmode = GET_MODE (mask); - - dest = lowpart_subreg (vmode, dest, mode); - x = gen_rtx_AND (vmode, dest, mask); - emit_insn (gen_rtx_SET (dest, x)); - - if (op0 != CONST0_RTX (vmode)) - { - x = gen_rtx_IOR (vmode, dest, op0); - emit_insn (gen_rtx_SET (dest, x)); - } -} - -/* Deconstruct a copysign operation into bit masks. Operand 0 is variable, - so we have to do two masks. 
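The bit-level identity behind the mask-based copysign used here, as a standalone scalar sketch (illustrative; the function name is made up, and ix86_build_signbit_mask supplies the vector forms of the two masks): clear the sign bit of the first operand, isolate the sign bit of the second, OR them.

#include <stdint.h>
#include <string.h>

static double
copysign_model (double x, double y)
{
  const uint64_t sign = 0x8000000000000000ull;
  uint64_t xb, yb, rb;
  memcpy (&xb, &x, sizeof xb);
  memcpy (&yb, &y, sizeof yb);
  rb = (xb & ~sign) | (yb & sign);   /* clear x's sign, install y's sign */
  double r;
  memcpy (&r, &rb, sizeof r);
  return r;
}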
*/ - -void -ix86_split_copysign_var (rtx operands[]) -{ - machine_mode mode, vmode; - rtx dest, scratch, op0, op1, mask, nmask, x; - - dest = operands[0]; - scratch = operands[1]; - op0 = operands[2]; - op1 = operands[3]; - nmask = operands[4]; - mask = operands[5]; - - mode = GET_MODE (dest); - vmode = GET_MODE (mask); - - if (rtx_equal_p (op0, op1)) - { - /* Shouldn't happen often (it's useless, obviously), but when it does - we'd generate incorrect code if we continue below. */ - emit_move_insn (dest, op0); - return; - } - - if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */ - { - gcc_assert (REGNO (op1) == REGNO (scratch)); - - x = gen_rtx_AND (vmode, scratch, mask); - emit_insn (gen_rtx_SET (scratch, x)); - - dest = mask; - op0 = lowpart_subreg (vmode, op0, mode); - x = gen_rtx_NOT (vmode, dest); - x = gen_rtx_AND (vmode, x, op0); - emit_insn (gen_rtx_SET (dest, x)); - } - else - { - if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */ - { - x = gen_rtx_AND (vmode, scratch, mask); - } - else /* alternative 2,4 */ - { - gcc_assert (REGNO (mask) == REGNO (scratch)); - op1 = lowpart_subreg (vmode, op1, mode); - x = gen_rtx_AND (vmode, scratch, op1); - } - emit_insn (gen_rtx_SET (scratch, x)); - - if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */ - { - dest = lowpart_subreg (vmode, op0, mode); - x = gen_rtx_AND (vmode, dest, nmask); - } - else /* alternative 3,4 */ - { - gcc_assert (REGNO (nmask) == REGNO (dest)); - dest = nmask; - op0 = lowpart_subreg (vmode, op0, mode); - x = gen_rtx_AND (vmode, dest, op0); - } - emit_insn (gen_rtx_SET (dest, x)); - } - - x = gen_rtx_IOR (vmode, dest, scratch); - emit_insn (gen_rtx_SET (dest, x)); -} - -/* Expand an xorsign operation. */ - -void -ix86_expand_xorsign (rtx operands[]) -{ - machine_mode mode, vmode; - rtx dest, op0, op1, mask; - - dest = operands[0]; - op0 = operands[1]; - op1 = operands[2]; - - mode = GET_MODE (dest); - - if (mode == SFmode) - vmode = V4SFmode; - else if (mode == DFmode) - vmode = V2DFmode; - else - gcc_unreachable (); - - mask = ix86_build_signbit_mask (vmode, 0, 0); - - emit_insn (gen_xorsign3_1 (mode, dest, op0, op1, mask)); -} - -/* Deconstruct an xorsign operation into bit masks. */ - -void -ix86_split_xorsign (rtx operands[]) -{ - machine_mode mode, vmode; - rtx dest, op0, mask, x; - - dest = operands[0]; - op0 = operands[1]; - mask = operands[3]; - - mode = GET_MODE (dest); - vmode = GET_MODE (mask); - - dest = lowpart_subreg (vmode, dest, mode); - x = gen_rtx_AND (vmode, dest, mask); - emit_insn (gen_rtx_SET (dest, x)); - - op0 = lowpart_subreg (vmode, op0, mode); - x = gen_rtx_XOR (vmode, dest, op0); - emit_insn (gen_rtx_SET (dest, x)); -} - -static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1); - -void -ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label) -{ - machine_mode mode = GET_MODE (op0); - rtx tmp; - - /* Handle special case - vector comparsion with boolean result, transform - it using ptest instruction. */ - if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) - { - rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG); - machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode; - - gcc_assert (code == EQ || code == NE); - /* Generate XOR since we can't check that one operand is zero vector. 
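A user-level equivalent of the vector EQ/NE branch expansion above (illustrative sketch, requires SSE4.1; the function name is made up): XOR the two operands and PTEST the result against itself, so ZF is set exactly when every bit is zero, i.e. when the vectors were equal.

#include <immintrin.h>

static int
v128_eq (__m128i a, __m128i b)
{
  __m128i x = _mm_xor_si128 (a, b);   /* zero iff the operands are equal */
  return _mm_testz_si128 (x, x);      /* PTEST sets ZF when (x & x) == 0 */
}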
*/ - tmp = gen_reg_rtx (mode); - emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1))); - tmp = gen_lowpart (p_mode, tmp); - emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG), - gen_rtx_UNSPEC (CCmode, - gen_rtvec (2, tmp, tmp), - UNSPEC_PTEST))); - tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx); - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, - gen_rtx_LABEL_REF (VOIDmode, label), - pc_rtx); - emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - return; - } - - switch (mode) - { - case E_SFmode: - case E_DFmode: - case E_XFmode: - case E_QImode: - case E_HImode: - case E_SImode: - simple: - tmp = ix86_expand_compare (code, op0, op1); - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, - gen_rtx_LABEL_REF (VOIDmode, label), - pc_rtx); - emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - return; - - case E_DImode: - if (TARGET_64BIT) - goto simple; - /* For 32-bit target DI comparison may be performed on - SSE registers. To allow this we should avoid split - to SI mode which is achieved by doing xor in DI mode - and then comparing with zero (which is recognized by - STV pass). We don't compare using xor when optimizing - for size. */ - if (!optimize_insn_for_size_p () - && TARGET_STV - && (code == EQ || code == NE)) - { - op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1)); - op1 = const0_rtx; - } - /* FALLTHRU */ - case E_TImode: - /* Expand DImode branch into multiple compare+branch. */ - { - rtx lo[2], hi[2]; - rtx_code_label *label2; - enum rtx_code code1, code2, code3; - machine_mode submode; - - if (CONSTANT_P (op0) && !CONSTANT_P (op1)) - { - std::swap (op0, op1); - code = swap_condition (code); - } - - split_double_mode (mode, &op0, 1, lo+0, hi+0); - split_double_mode (mode, &op1, 1, lo+1, hi+1); - - submode = mode == DImode ? SImode : DImode; - - /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to - avoid two branches. This costs one extra insn, so disable when - optimizing for size. */ - - if ((code == EQ || code == NE) - && (!optimize_insn_for_size_p () - || hi[1] == const0_rtx || lo[1] == const0_rtx)) - { - rtx xor0, xor1; - - xor1 = hi[0]; - if (hi[1] != const0_rtx) - xor1 = expand_binop (submode, xor_optab, xor1, hi[1], - NULL_RTX, 0, OPTAB_WIDEN); - - xor0 = lo[0]; - if (lo[1] != const0_rtx) - xor0 = expand_binop (submode, xor_optab, xor0, lo[1], - NULL_RTX, 0, OPTAB_WIDEN); - - tmp = expand_binop (submode, ior_optab, xor1, xor0, - NULL_RTX, 0, OPTAB_WIDEN); - - ix86_expand_branch (code, tmp, const0_rtx, label); - return; - } - - /* Otherwise, if we are doing less-than or greater-or-equal-than, - op1 is a constant and the low word is zero, then we can just - examine the high word. Similarly for low word -1 and - less-or-equal-than or greater-than. */ - - if (CONST_INT_P (hi[1])) - switch (code) - { - case LT: case LTU: case GE: case GEU: - if (lo[1] == const0_rtx) - { - ix86_expand_branch (code, hi[0], hi[1], label); - return; - } - break; - case LE: case LEU: case GT: case GTU: - if (lo[1] == constm1_rtx) - { - ix86_expand_branch (code, hi[0], hi[1], label); - return; - } - break; - default: - break; - } - - /* Emulate comparisons that do not depend on Zero flag with - double-word subtraction. Note that only Overflow, Sign - and Carry flags are valid, so swap arguments and condition - of comparisons that would otherwise test Zero flag. 
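The two double-word tricks above in plain C (illustrative identities on 32-bit halves; the function names are made up): equality needs only one compare of an OR of XORs, and unsigned less-than is exactly the carry produced by the CMP/SBB pair.

#include <stdint.h>

static int
di_eq (uint32_t lo0, uint32_t hi0, uint32_t lo1, uint32_t hi1)
{
  return ((hi0 ^ hi1) | (lo0 ^ lo1)) == 0;   /* one compare against zero */
}

static int
di_ltu (uint32_t lo0, uint32_t hi0, uint32_t lo1, uint32_t hi1)
{
  unsigned borrow = lo0 < lo1;               /* CF produced by CMP lo0,lo1 */
  /* SBB hi0,hi1 then leaves CF set exactly when the full value is below.  */
  return hi0 < hi1 || (hi0 == hi1 && borrow);
}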
*/ - - switch (code) - { - case LE: case LEU: case GT: case GTU: - std::swap (lo[0], lo[1]); - std::swap (hi[0], hi[1]); - code = swap_condition (code); - /* FALLTHRU */ - - case LT: case LTU: case GE: case GEU: - { - bool uns = (code == LTU || code == GEU); - rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx) - = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz; - - if (!nonimmediate_operand (lo[0], submode)) - lo[0] = force_reg (submode, lo[0]); - if (!x86_64_general_operand (lo[1], submode)) - lo[1] = force_reg (submode, lo[1]); - - if (!register_operand (hi[0], submode)) - hi[0] = force_reg (submode, hi[0]); - if ((uns && !nonimmediate_operand (hi[1], submode)) - || (!uns && !x86_64_general_operand (hi[1], submode))) - hi[1] = force_reg (submode, hi[1]); - - emit_insn (gen_cmp_1 (submode, lo[0], lo[1])); - - tmp = gen_rtx_SCRATCH (submode); - emit_insn (sbb_insn (submode, tmp, hi[0], hi[1])); - - tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG); - ix86_expand_branch (code, tmp, const0_rtx, label); - return; - } - - default: - break; - } - - /* Otherwise, we need two or three jumps. */ - - label2 = gen_label_rtx (); - - code1 = code; - code2 = swap_condition (code); - code3 = unsigned_condition (code); - - switch (code) - { - case LT: case GT: case LTU: case GTU: - break; - - case LE: code1 = LT; code2 = GT; break; - case GE: code1 = GT; code2 = LT; break; - case LEU: code1 = LTU; code2 = GTU; break; - case GEU: code1 = GTU; code2 = LTU; break; - - case EQ: code1 = UNKNOWN; code2 = NE; break; - case NE: code2 = UNKNOWN; break; - - default: - gcc_unreachable (); - } - - /* - * a < b => - * if (hi(a) < hi(b)) goto true; - * if (hi(a) > hi(b)) goto false; - * if (lo(a) < lo(b)) goto true; - * false: - */ - - if (code1 != UNKNOWN) - ix86_expand_branch (code1, hi[0], hi[1], label); - if (code2 != UNKNOWN) - ix86_expand_branch (code2, hi[0], hi[1], label2); - - ix86_expand_branch (code3, lo[0], lo[1], label); - - if (code2 != UNKNOWN) - emit_label (label2); - return; - } - - default: - gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC); - goto simple; - } -} - -/* Figure out whether to use unordered fp comparisons. */ - -static bool -ix86_unordered_fp_compare (enum rtx_code code) -{ - if (!TARGET_IEEE_FP) - return false; - - switch (code) - { - case LT: - case LE: - case GT: - case GE: - case LTGT: - return false; - - case EQ: - case NE: - - case UNORDERED: - case ORDERED: - case UNLT: - case UNLE: - case UNGT: - case UNGE: - case UNEQ: - return true; - - default: - gcc_unreachable (); - } -} - -/* Return a comparison we can do and that it is equivalent to - swap_condition (code) apart possibly from orderedness. - But, never change orderedness if TARGET_IEEE_FP, returning - UNKNOWN in that case if necessary. */ - -static enum rtx_code -ix86_fp_swap_condition (enum rtx_code code) -{ - switch (code) - { - case GT: /* GTU - CF=0 & ZF=0 */ - return TARGET_IEEE_FP ? UNKNOWN : UNLT; - case GE: /* GEU - CF=0 */ - return TARGET_IEEE_FP ? UNKNOWN : UNLE; - case UNLT: /* LTU - CF=1 */ - return TARGET_IEEE_FP ? UNKNOWN : GT; - case UNLE: /* LEU - CF=1 | ZF=1 */ - return TARGET_IEEE_FP ? UNKNOWN : GE; - default: - return swap_condition (code); - } -} - -/* Return cost of comparison CODE using the best strategy for performance. - All following functions do use number of instructions as a cost metrics. - In future this should be tweaked to compute bytes for optimize_size and - take into account performance of various instructions on various CPUs. 
*/ - -static int -ix86_fp_comparison_cost (enum rtx_code code) -{ - int arith_cost; - - /* The cost of code using bit-twiddling on %ah. */ - switch (code) - { - case UNLE: - case UNLT: - case LTGT: - case GT: - case GE: - case UNORDERED: - case ORDERED: - case UNEQ: - arith_cost = 4; - break; - case LT: - case NE: - case EQ: - case UNGE: - arith_cost = TARGET_IEEE_FP ? 5 : 4; - break; - case LE: - case UNGT: - arith_cost = TARGET_IEEE_FP ? 6 : 4; - break; - default: - gcc_unreachable (); - } - - switch (ix86_fp_comparison_strategy (code)) - { - case IX86_FPCMP_COMI: - return arith_cost > 4 ? 3 : 2; - case IX86_FPCMP_SAHF: - return arith_cost > 4 ? 4 : 3; - default: - return arith_cost; - } -} - -/* Swap, force into registers, or otherwise massage the two operands - to a fp comparison. The operands are updated in place; the new - comparison code is returned. */ - -static enum rtx_code -ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1) -{ - bool unordered_compare = ix86_unordered_fp_compare (code); - rtx op0 = *pop0, op1 = *pop1; - machine_mode op_mode = GET_MODE (op0); - bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode); - - /* All of the unordered compare instructions only work on registers. - The same is true of the fcomi compare instructions. The XFmode - compare instructions require registers except when comparing - against zero or when converting operand 1 from fixed point to - floating point. */ - - if (!is_sse - && (unordered_compare - || (op_mode == XFmode - && ! (standard_80387_constant_p (op0) == 1 - || standard_80387_constant_p (op1) == 1) - && GET_CODE (op1) != FLOAT) - || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI)) - { - op0 = force_reg (op_mode, op0); - op1 = force_reg (op_mode, op1); - } - else - { - /* %%% We only allow op1 in memory; op0 must be st(0). So swap - things around if they appear profitable, otherwise force op0 - into a register. */ - - if (standard_80387_constant_p (op0) == 0 - || (MEM_P (op0) - && ! (standard_80387_constant_p (op1) == 0 - || MEM_P (op1)))) - { - enum rtx_code new_code = ix86_fp_swap_condition (code); - if (new_code != UNKNOWN) - { - std::swap (op0, op1); - code = new_code; - } - } - - if (!REG_P (op0)) - op0 = force_reg (op_mode, op0); - - if (CONSTANT_P (op1)) - { - int tmp = standard_80387_constant_p (op1); - if (tmp == 0) - op1 = validize_mem (force_const_mem (op_mode, op1)); - else if (tmp == 1) - { - if (TARGET_CMOVE) - op1 = force_reg (op_mode, op1); - } - else - op1 = force_reg (op_mode, op1); - } - } - - /* Try to rearrange the comparison to make it cheaper. */ - if (ix86_fp_comparison_cost (code) - > ix86_fp_comparison_cost (swap_condition (code)) - && (REG_P (op1) || can_create_pseudo_p ())) - { - std::swap (op0, op1); - code = swap_condition (code); - if (!REG_P (op0)) - op0 = force_reg (op_mode, op0); - } - - *pop0 = op0; - *pop1 = op1; - return code; -} - -/* Generate insn patterns to do a floating point compare of OPERANDS. */ - -static rtx -ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1) -{ - bool unordered_compare = ix86_unordered_fp_compare (code); - machine_mode cmp_mode; - rtx tmp, scratch; - - code = ix86_prepare_fp_compare_args (code, &op0, &op1); - - tmp = gen_rtx_COMPARE (CCFPmode, op0, op1); - if (unordered_compare) - tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP); - - /* Do fcomi/sahf based test when profitable. 
*/ - switch (ix86_fp_comparison_strategy (code)) - { - case IX86_FPCMP_COMI: - cmp_mode = CCFPmode; - emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp)); - break; - - case IX86_FPCMP_SAHF: - cmp_mode = CCFPmode; - tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW); - scratch = gen_reg_rtx (HImode); - emit_insn (gen_rtx_SET (scratch, tmp)); - emit_insn (gen_x86_sahf_1 (scratch)); - break; - - case IX86_FPCMP_ARITH: - cmp_mode = CCNOmode; - tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW); - scratch = gen_reg_rtx (HImode); - emit_insn (gen_rtx_SET (scratch, tmp)); - - /* In the unordered case, we have to check C2 for NaN's, which - doesn't happen to work out to anything nice combination-wise. - So do some bit twiddling on the value we've got in AH to come - up with an appropriate set of condition codes. */ - - switch (code) - { - case GT: - case UNGT: - if (code == GT || !TARGET_IEEE_FP) - { - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45))); - code = EQ; - } - else - { - emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); - emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx)); - emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44))); - cmp_mode = CCmode; - code = GEU; - } - break; - case LT: - case UNLT: - if (code == LT && TARGET_IEEE_FP) - { - emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); - emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx)); - cmp_mode = CCmode; - code = EQ; - } - else - { - emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx)); - code = NE; - } - break; - case GE: - case UNGE: - if (code == GE || !TARGET_IEEE_FP) - { - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05))); - code = EQ; - } - else - { - emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); - emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx)); - code = NE; - } - break; - case LE: - case UNLE: - if (code == LE && TARGET_IEEE_FP) - { - emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); - emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx)); - emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40))); - cmp_mode = CCmode; - code = LTU; - } - else - { - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45))); - code = NE; - } - break; - case EQ: - case UNEQ: - if (code == EQ && TARGET_IEEE_FP) - { - emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); - emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40))); - cmp_mode = CCmode; - code = EQ; - } - else - { - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40))); - code = NE; - } - break; - case NE: - case LTGT: - if (code == NE && TARGET_IEEE_FP) - { - emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); - emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, - GEN_INT (0x40))); - code = NE; - } - else - { - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40))); - code = EQ; - } - break; - - case UNORDERED: - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04))); - code = NE; - break; - case ORDERED: - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04))); - code = EQ; - break; - - default: - gcc_unreachable (); - } - break; - - default: - gcc_unreachable(); - } - - /* Return the test that should be put into the flags user, i.e. - the bcc, scc, or cmov instruction. */ - return gen_rtx_fmt_ee (code, VOIDmode, - gen_rtx_REG (cmp_mode, FLAGS_REG), - const0_rtx); -} - -/* Generate insn patterns to do an integer compare of OPERANDS. 
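One reading of the magic constants in the bit-twiddling above, as a minimal sketch (illustrative; the helper names are made up): after FNSTSW the high status byte carries C0 at 0x01, C2 at 0x04 and C3 at 0x40, and FCOM sets C3:C2:C0 to 000 for >, 001 for <, 100 for == and 111 for unordered.

#include <stdbool.h>

static bool
fcom_gt (unsigned char ah)
{
  return (ah & 0x45) == 0;   /* the "testb $0x45; je" pattern: greater */
}

static bool
fcom_unordered (unsigned char ah)
{
  return (ah & 0x04) != 0;   /* C2 set: the operands were unordered */
}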
*/ - -static rtx -ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1) -{ - machine_mode cmpmode; - rtx tmp, flags; - - cmpmode = SELECT_CC_MODE (code, op0, op1); - flags = gen_rtx_REG (cmpmode, FLAGS_REG); - - /* This is very simple, but making the interface the same as in the - FP case makes the rest of the code easier. */ - tmp = gen_rtx_COMPARE (cmpmode, op0, op1); - emit_insn (gen_rtx_SET (flags, tmp)); - - /* Return the test that should be put into the flags user, i.e. - the bcc, scc, or cmov instruction. */ - return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx); -} - -static rtx -ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1) -{ - rtx ret; - - if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC) - ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1); - - else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0))) - { - gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0))); - ret = ix86_expand_fp_compare (code, op0, op1); - } - else - ret = ix86_expand_int_compare (code, op0, op1); - - return ret; -} - -void -ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1) -{ - rtx ret; - - gcc_assert (GET_MODE (dest) == QImode); - - ret = ix86_expand_compare (code, op0, op1); - PUT_MODE (ret, QImode); - emit_insn (gen_rtx_SET (dest, ret)); -} - -/* Expand comparison setting or clearing carry flag. Return true when - successful and set pop for the operation. */ -static bool -ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop) -{ - machine_mode mode - = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1); - - /* Do not handle double-mode compares that go through special path. */ - if (mode == (TARGET_64BIT ? TImode : DImode)) - return false; - - if (SCALAR_FLOAT_MODE_P (mode)) - { - rtx compare_op; - rtx_insn *compare_seq; - - gcc_assert (!DECIMAL_FLOAT_MODE_P (mode)); - - /* Shortcut: following common codes never translate - into carry flag compares. */ - if (code == EQ || code == NE || code == UNEQ || code == LTGT - || code == ORDERED || code == UNORDERED) - return false; - - /* These comparisons require zero flag; swap operands so they won't. */ - if ((code == GT || code == UNLE || code == LE || code == UNGT) - && !TARGET_IEEE_FP) - { - std::swap (op0, op1); - code = swap_condition (code); - } - - /* Try to expand the comparison and verify that we end up with - carry flag based comparison. This fails to be true only when - we decide to expand comparison using arithmetic that is not - too common scenario. */ - start_sequence (); - compare_op = ix86_expand_fp_compare (code, op0, op1); - compare_seq = get_insns (); - end_sequence (); - - if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode) - code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op)); - else - code = GET_CODE (compare_op); - - if (code != LTU && code != GEU) - return false; - - emit_insn (compare_seq); - *pop = compare_op; - return true; - } - - if (!INTEGRAL_MODE_P (mode)) - return false; - - switch (code) - { - case LTU: - case GEU: - break; - - /* Convert a==0 into (unsigned)a<1. */ - case EQ: - case NE: - if (op1 != const0_rtx) - return false; - op1 = const1_rtx; - code = (code == EQ ? LTU : GEU); - break; - - /* Convert a>b into b<a or a>=b-1. */ - case GTU: - case LEU: - if (CONST_INT_P (op1)) - { - op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0)); - /* Bail out on overflow. We still can swap operands but that - would force loading of the constant into register. 
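The rewrites performed above, stated as plain C identities (an illustrative sketch on 32-bit operands; the function name is made up): each right-hand side is decided purely by the carry flag of a single CMP.

#include <assert.h>
#include <stdint.h>

static void
carry_flag_identities (int32_t a, uint32_t ua, uint32_t ub)
{
  assert ((a == 0)  == ((uint32_t) a < 1u));            /* EQ  -> LTU */
  assert ((a >= 0)  == ((uint32_t) a < 0x80000000u));   /* GE  -> LTU */
  assert ((a <= -1) == ((uint32_t) a >= 0x80000000u));  /* LE  -> GEU */
  if (ub != UINT32_MAX)      /* the "bail out on overflow" case above */
    assert ((ua > ub) == (ua >= ub + 1u));              /* GTU -> GEU */
}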
*/ - if (op1 == const0_rtx - || !x86_64_immediate_operand (op1, GET_MODE (op1))) - return false; - code = (code == GTU ? GEU : LTU); - } - else - { - std::swap (op0, op1); - code = (code == GTU ? LTU : GEU); - } - break; - - /* Convert a>=0 into (unsigned)a<0x80000000. */ - case LT: - case GE: - if (mode == DImode || op1 != const0_rtx) - return false; - op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode); - code = (code == LT ? GEU : LTU); - break; - case LE: - case GT: - if (mode == DImode || op1 != constm1_rtx) - return false; - op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode); - code = (code == LE ? GEU : LTU); - break; - - default: - return false; - } - /* Swapping operands may cause constant to appear as first operand. */ - if (!nonimmediate_operand (op0, VOIDmode)) - { - if (!can_create_pseudo_p ()) - return false; - op0 = force_reg (mode, op0); - } - *pop = ix86_expand_compare (code, op0, op1); - gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU); - return true; -} - -/* Expand conditional increment or decrement using adb/sbb instructions. - The default case using setcc followed by the conditional move can be - done by generic code. */ -bool -ix86_expand_int_addcc (rtx operands[]) -{ - enum rtx_code code = GET_CODE (operands[1]); - rtx flags; - rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx); - rtx compare_op; - rtx val = const0_rtx; - bool fpcmp = false; - machine_mode mode; - rtx op0 = XEXP (operands[1], 0); - rtx op1 = XEXP (operands[1], 1); - - if (operands[3] != const1_rtx - && operands[3] != constm1_rtx) - return false; - if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op)) - return false; - code = GET_CODE (compare_op); - - flags = XEXP (compare_op, 0); - - if (GET_MODE (flags) == CCFPmode) - { - fpcmp = true; - code = ix86_fp_compare_code_to_integer (code); - } - - if (code != LTU) - { - val = constm1_rtx; - if (fpcmp) - PUT_CODE (compare_op, - reverse_condition_maybe_unordered - (GET_CODE (compare_op))); - else - PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op))); - } - - mode = GET_MODE (operands[0]); - - /* Construct either adc or sbb insn. */ - if ((code == LTU) == (operands[3] == constm1_rtx)) - insn = gen_sub3_carry; - else - insn = gen_add3_carry; - - emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op)); - - return true; -} - -bool -ix86_expand_int_movcc (rtx operands[]) -{ - enum rtx_code code = GET_CODE (operands[1]), compare_code; - rtx_insn *compare_seq; - rtx compare_op; - machine_mode mode = GET_MODE (operands[0]); - bool sign_bit_compare_p = false; - rtx op0 = XEXP (operands[1], 0); - rtx op1 = XEXP (operands[1], 1); - - if (GET_MODE (op0) == TImode - || (GET_MODE (op0) == DImode - && !TARGET_64BIT)) - return false; - - start_sequence (); - compare_op = ix86_expand_compare (code, op0, op1); - compare_seq = get_insns (); - end_sequence (); - - compare_code = GET_CODE (compare_op); - - if ((op1 == const0_rtx && (code == GE || code == LT)) - || (op1 == constm1_rtx && (code == GT || code == LE))) - sign_bit_compare_p = true; - - /* Don't attempt mode expansion here -- if we had to expand 5 or 6 - HImode insns, we'd be swallowed in word prefix ops. */ - - if ((mode != HImode || TARGET_FAST_PREFIX) - && (mode != (TARGET_64BIT ? 
TImode : DImode)) - && CONST_INT_P (operands[2]) - && CONST_INT_P (operands[3])) - { - rtx out = operands[0]; - HOST_WIDE_INT ct = INTVAL (operands[2]); - HOST_WIDE_INT cf = INTVAL (operands[3]); - HOST_WIDE_INT diff; - - diff = ct - cf; - /* Sign bit compares are better done using shifts than we do by using - sbb. */ - if (sign_bit_compare_p - || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op)) - { - /* Detect overlap between destination and compare sources. */ - rtx tmp = out; - - if (!sign_bit_compare_p) - { - rtx flags; - bool fpcmp = false; - - compare_code = GET_CODE (compare_op); - - flags = XEXP (compare_op, 0); - - if (GET_MODE (flags) == CCFPmode) - { - fpcmp = true; - compare_code - = ix86_fp_compare_code_to_integer (compare_code); - } - - /* To simplify rest of code, restrict to the GEU case. */ - if (compare_code == LTU) - { - std::swap (ct, cf); - compare_code = reverse_condition (compare_code); - code = reverse_condition (code); - } - else - { - if (fpcmp) - PUT_CODE (compare_op, - reverse_condition_maybe_unordered - (GET_CODE (compare_op))); - else - PUT_CODE (compare_op, - reverse_condition (GET_CODE (compare_op))); - } - diff = ct - cf; - - if (reg_overlap_mentioned_p (out, op0) - || reg_overlap_mentioned_p (out, op1)) - tmp = gen_reg_rtx (mode); - - if (mode == DImode) - emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op)); - else - emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp), - flags, compare_op)); - } - else - { - if (code == GT || code == GE) - code = reverse_condition (code); - else - { - std::swap (ct, cf); - diff = ct - cf; - } - tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1); - } - - if (diff == 1) - { - /* - * cmpl op0,op1 - * sbbl dest,dest - * [addl dest, ct] - * - * Size 5 - 8. - */ - if (ct) - tmp = expand_simple_binop (mode, PLUS, - tmp, GEN_INT (ct), - copy_rtx (tmp), 1, OPTAB_DIRECT); - } - else if (cf == -1) - { - /* - * cmpl op0,op1 - * sbbl dest,dest - * orl $ct, dest - * - * Size 8. - */ - tmp = expand_simple_binop (mode, IOR, - tmp, GEN_INT (ct), - copy_rtx (tmp), 1, OPTAB_DIRECT); - } - else if (diff == -1 && ct) - { - /* - * cmpl op0,op1 - * sbbl dest,dest - * notl dest - * [addl dest, cf] - * - * Size 8 - 11. - */ - tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1); - if (cf) - tmp = expand_simple_binop (mode, PLUS, - copy_rtx (tmp), GEN_INT (cf), - copy_rtx (tmp), 1, OPTAB_DIRECT); - } - else - { - /* - * cmpl op0,op1 - * sbbl dest,dest - * [notl dest] - * andl cf - ct, dest - * [addl dest, ct] - * - * Size 8 - 11. - */ - - if (cf == 0) - { - cf = ct; - ct = 0; - tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1); - } - - tmp = expand_simple_binop (mode, AND, - copy_rtx (tmp), - gen_int_mode (cf - ct, mode), - copy_rtx (tmp), 1, OPTAB_DIRECT); - if (ct) - tmp = expand_simple_binop (mode, PLUS, - copy_rtx (tmp), GEN_INT (ct), - copy_rtx (tmp), 1, OPTAB_DIRECT); - } - - if (!rtx_equal_p (tmp, out)) - emit_move_insn (copy_rtx (out), copy_rtx (tmp)); - - return true; - } - - if (diff < 0) - { - machine_mode cmp_mode = GET_MODE (op0); - enum rtx_code new_code; - - if (SCALAR_FLOAT_MODE_P (cmp_mode)) - { - gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode)); - - /* We may be reversing a non-trapping - comparison to a trapping comparison. 
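A branchless C model of the SBB-based constant selects above (illustrative; the function name is made up): CMP a,b followed by SBB t,t leaves t = 0 or -1, and the constant cases are finished with at most a NOT, an AND and an ADD.

#include <stdint.h>

static uint32_t
select_sbb (uint32_t a, uint32_t b, uint32_t ct, uint32_t cf)
{
  uint32_t m = -(uint32_t) (a < b);     /* cmp a,b ; sbb m,m  =>  0 or -1 */
  return ((~m) & (cf - ct)) + ct;       /* the "[notl] ; andl ; addl" tail:
                                           yields ct when a < b, else cf   */
}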
*/ - if (HONOR_NANS (cmp_mode) && flag_trapping_math - && code != EQ && code != NE - && code != ORDERED && code != UNORDERED) - new_code = UNKNOWN; - else - new_code = reverse_condition_maybe_unordered (code); - } - else - new_code = ix86_reverse_condition (code, cmp_mode); - if (new_code != UNKNOWN) - { - std::swap (ct, cf); - diff = -diff; - code = new_code; - } - } - - compare_code = UNKNOWN; - if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT - && CONST_INT_P (op1)) - { - if (op1 == const0_rtx - && (code == LT || code == GE)) - compare_code = code; - else if (op1 == constm1_rtx) - { - if (code == LE) - compare_code = LT; - else if (code == GT) - compare_code = GE; - } - } - - /* Optimize dest = (op0 < 0) ? -1 : cf. */ - if (compare_code != UNKNOWN - && GET_MODE (op0) == GET_MODE (out) - && (cf == -1 || ct == -1)) - { - /* If lea code below could be used, only optimize - if it results in a 2 insn sequence. */ - - if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8 - || diff == 3 || diff == 5 || diff == 9) - || (compare_code == LT && ct == -1) - || (compare_code == GE && cf == -1)) - { - /* - * notl op1 (if necessary) - * sarl $31, op1 - * orl cf, op1 - */ - if (ct != -1) - { - cf = ct; - ct = -1; - code = reverse_condition (code); - } - - out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1); - - out = expand_simple_binop (mode, IOR, - out, GEN_INT (cf), - out, 1, OPTAB_DIRECT); - if (out != operands[0]) - emit_move_insn (operands[0], out); - - return true; - } - } - - - if ((diff == 1 || diff == 2 || diff == 4 || diff == 8 - || diff == 3 || diff == 5 || diff == 9) - && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL) - && (mode != DImode - || x86_64_immediate_operand (GEN_INT (cf), VOIDmode))) - { - /* - * xorl dest,dest - * cmpl op1,op2 - * setcc dest - * lea cf(dest*(ct-cf)),dest - * - * Size 14. - * - * This also catches the degenerate setcc-only case. - */ - - rtx tmp; - int nops; - - out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1); - - nops = 0; - /* On x86_64 the lea instruction operates on Pmode, so we need - to get arithmetics done in proper mode to match. */ - if (diff == 1) - tmp = copy_rtx (out); - else - { - rtx out1; - out1 = copy_rtx (out); - tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1)); - nops++; - if (diff & 1) - { - tmp = gen_rtx_PLUS (mode, tmp, out1); - nops++; - } - } - if (cf != 0) - { - tmp = plus_constant (mode, tmp, cf); - nops++; - } - if (!rtx_equal_p (tmp, out)) - { - if (nops == 1) - out = force_operand (tmp, copy_rtx (out)); - else - emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp))); - } - if (!rtx_equal_p (out, operands[0])) - emit_move_insn (operands[0], copy_rtx (out)); - - return true; - } - - /* - * General case: Jumpful: - * xorl dest,dest cmpl op1, op2 - * cmpl op1, op2 movl ct, dest - * setcc dest jcc 1f - * decl dest movl cf, dest - * andl (cf-ct),dest 1: - * addl ct,dest - * - * Size 20. Size 14. - * - * This is reasonably steep, but branch mispredict costs are - * high on modern cpus, so consider failing only if optimizing - * for space. - */ - - if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL)) - && BRANCH_COST (optimize_insn_for_speed_p (), - false) >= 2) - { - if (cf == 0) - { - machine_mode cmp_mode = GET_MODE (op0); - enum rtx_code new_code; - - if (SCALAR_FLOAT_MODE_P (cmp_mode)) - { - gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode)); - - /* We may be reversing a non-trapping - comparison to a trapping comparison. 
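The setcc + lea path above in C form (illustrative; the function name is made up): materialize the condition as 0/1, then scale and offset it into the two constants, which a single LEA can do when ct - cf is one of 1, 2, 3, 4, 5, 8 or 9.

#include <stdint.h>

static uint32_t
select_lea (uint32_t a, uint32_t b, uint32_t ct, uint32_t cf)
{
  uint32_t bit = (a < b);          /* xorl dest,dest ; cmpl ; setcc dest */
  return cf + bit * (ct - cf);     /* lea cf(dest*(ct-cf)),dest */
}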
*/ - if (HONOR_NANS (cmp_mode) && flag_trapping_math - && code != EQ && code != NE - && code != ORDERED && code != UNORDERED) - new_code = UNKNOWN; - else - new_code = reverse_condition_maybe_unordered (code); - - } - else - { - new_code = ix86_reverse_condition (code, cmp_mode); - if (compare_code != UNKNOWN && new_code != UNKNOWN) - compare_code = reverse_condition (compare_code); - } - - if (new_code != UNKNOWN) - { - cf = ct; - ct = 0; - code = new_code; - } - } - - if (compare_code != UNKNOWN) - { - /* notl op1 (if needed) - sarl $31, op1 - andl (cf-ct), op1 - addl ct, op1 - - For x < 0 (resp. x <= -1) there will be no notl, - so if possible swap the constants to get rid of the - complement. - True/false will be -1/0 while code below (store flag - followed by decrement) is 0/-1, so the constants need - to be exchanged once more. */ - - if (compare_code == GE || !cf) - { - code = reverse_condition (code); - compare_code = LT; - } - else - std::swap (ct, cf); - - out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1); - } - else - { - out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1); - - out = expand_simple_binop (mode, PLUS, copy_rtx (out), - constm1_rtx, - copy_rtx (out), 1, OPTAB_DIRECT); - } - - out = expand_simple_binop (mode, AND, copy_rtx (out), - gen_int_mode (cf - ct, mode), - copy_rtx (out), 1, OPTAB_DIRECT); - if (ct) - out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct), - copy_rtx (out), 1, OPTAB_DIRECT); - if (!rtx_equal_p (out, operands[0])) - emit_move_insn (operands[0], copy_rtx (out)); - - return true; - } - } - - if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL)) - { - /* Try a few things more with specific constants and a variable. */ - - optab op; - rtx var, orig_out, out, tmp; - - if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2) - return false; - - /* If one of the two operands is an interesting constant, load a - constant with the above and mask it in with a logical operation. */ - - if (CONST_INT_P (operands[2])) - { - var = operands[3]; - if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx) - operands[3] = constm1_rtx, op = and_optab; - else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx) - operands[3] = const0_rtx, op = ior_optab; - else - return false; - } - else if (CONST_INT_P (operands[3])) - { - var = operands[2]; - if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx) - operands[2] = constm1_rtx, op = and_optab; - else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx) - operands[2] = const0_rtx, op = ior_optab; - else - return false; - } - else - return false; - - orig_out = operands[0]; - tmp = gen_reg_rtx (mode); - operands[0] = tmp; - - /* Recurse to get the constant loaded. */ - if (!ix86_expand_int_movcc (operands)) - return false; - - /* Mask in the interesting variable. */ - out = expand_binop (mode, op, var, tmp, orig_out, 0, - OPTAB_WIDEN); - if (!rtx_equal_p (out, orig_out)) - emit_move_insn (copy_rtx (orig_out), copy_rtx (out)); - - return true; - } - - /* - * For comparison with above, - * - * movl cf,dest - * movl ct,tmp - * cmpl op1,op2 - * cmovcc tmp,dest - * - * Size 15. - */ - - if (! nonimmediate_operand (operands[2], mode)) - operands[2] = force_reg (mode, operands[2]); - if (! nonimmediate_operand (operands[3], mode)) - operands[3] = force_reg (mode, operands[3]); - - if (! register_operand (operands[2], VOIDmode) - && (mode == QImode - || ! 
register_operand (operands[3], VOIDmode))) - operands[2] = force_reg (mode, operands[2]); - - if (mode == QImode - && ! register_operand (operands[3], VOIDmode)) - operands[3] = force_reg (mode, operands[3]); - - emit_insn (compare_seq); - emit_insn (gen_rtx_SET (operands[0], - gen_rtx_IF_THEN_ELSE (mode, - compare_op, operands[2], - operands[3]))); - return true; -} - -/* Detect conditional moves that exactly match min/max operational - semantics. Note that this is IEEE safe, as long as we don't - interchange the operands. - - Returns FALSE if this conditional move doesn't match a MIN/MAX, - and TRUE if the operation is successful and instructions are emitted. */ - -static bool -ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0, - rtx cmp_op1, rtx if_true, rtx if_false) -{ - machine_mode mode; - bool is_min; - rtx tmp; - - if (code == LT) - ; - else if (code == UNGE) - std::swap (if_true, if_false); - else - return false; - - if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false)) - is_min = true; - else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false)) - is_min = false; - else - return false; - - mode = GET_MODE (dest); - - /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here, - but MODE may be a vector mode and thus not appropriate. */ - if (!flag_finite_math_only || flag_signed_zeros) - { - int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX; - rtvec v; - - if_true = force_reg (mode, if_true); - v = gen_rtvec (2, if_true, if_false); - tmp = gen_rtx_UNSPEC (mode, v, u); - } - else - { - code = is_min ? SMIN : SMAX; - if (MEM_P (if_true) && MEM_P (if_false)) - if_true = force_reg (mode, if_true); - tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false); - } - - emit_insn (gen_rtx_SET (dest, tmp)); - return true; -} - -/* Return true if MODE is valid for vector compare to mask register, - Same result for conditionl vector move with mask register. */ -static bool -ix86_valid_mask_cmp_mode (machine_mode mode) -{ - /* XOP has its own vector conditional movement. */ - if (TARGET_XOP && !TARGET_AVX512F) - return false; - - /* AVX512F is needed for mask operation. */ - if (!(TARGET_AVX512F && VECTOR_MODE_P (mode))) - return false; - - /* AVX512BW is needed for vector QI/HImode, - AVX512VL is needed for 128/256-bit vector. */ - machine_mode inner_mode = GET_MODE_INNER (mode); - int vector_size = GET_MODE_SIZE (mode); - if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW) - return false; - - return vector_size == 64 || TARGET_AVX512VL; -} - -/* Expand an SSE comparison. Return the register with the result. */ - -static rtx -ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1, - rtx op_true, rtx op_false) -{ - machine_mode mode = GET_MODE (dest); - machine_mode cmp_ops_mode = GET_MODE (cmp_op0); - - /* In general case result of comparison can differ from operands' type. */ - machine_mode cmp_mode; - - /* In AVX512F the result of comparison is an integer mask. */ - bool maskcmp = false; - rtx x; - - if (ix86_valid_mask_cmp_mode (cmp_ops_mode)) - { - unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode); - maskcmp = true; - cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode; - } - else - cmp_mode = cmp_ops_mode; - - cmp_op0 = force_reg (cmp_ops_mode, cmp_op0); - - int (*op1_predicate)(rtx, machine_mode) - = VECTOR_MODE_P (cmp_ops_mode) ? 
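The exact source shape the min/max matcher above accepts, as a sketch (illustrative; the function names are made up): MINSS/MINPS return their second operand when the compare is unordered, which is also what the conditional does, so the rewrite is NaN-safe only if the operand order is preserved.

#include <immintrin.h>

static float
min_pattern (float x, float y)
{
  return x < y ? x : y;            /* the shape recognized as a min */
}

static __m128
min_sse (__m128 x, __m128 y)
{
  return _mm_min_ps (x, y);        /* per lane: x < y ? x : y */
}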
vector_operand : nonimmediate_operand; - - if (!op1_predicate (cmp_op1, cmp_ops_mode)) - cmp_op1 = force_reg (cmp_ops_mode, cmp_op1); - - if (optimize - || (maskcmp && cmp_mode != mode) - || (op_true && reg_overlap_mentioned_p (dest, op_true)) - || (op_false && reg_overlap_mentioned_p (dest, op_false))) - dest = gen_reg_rtx (maskcmp ? cmp_mode : mode); - - x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1); - - if (cmp_mode != mode && !maskcmp) - { - x = force_reg (cmp_ops_mode, x); - convert_move (dest, x, false); - } - else - emit_insn (gen_rtx_SET (dest, x)); - - return dest; -} - -/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical - operations. This is used for both scalar and vector conditional moves. */ - -void -ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) -{ - machine_mode mode = GET_MODE (dest); - machine_mode cmpmode = GET_MODE (cmp); - - /* In AVX512F the result of comparison is an integer mask. */ - bool maskcmp = mode != cmpmode && ix86_valid_mask_cmp_mode (mode); - - rtx t2, t3, x; - - /* If we have an integer mask and FP value then we need - to cast mask to FP mode. */ - if (mode != cmpmode && VECTOR_MODE_P (cmpmode)) - { - cmp = force_reg (cmpmode, cmp); - cmp = gen_rtx_SUBREG (mode, cmp, 0); - } - - if (maskcmp) - { - /* Using vector move with mask register. */ - cmp = force_reg (cmpmode, cmp); - /* Optimize for mask zero. */ - op_true = (op_true != CONST0_RTX (mode) - ? force_reg (mode, op_true) : op_true); - op_false = (op_false != CONST0_RTX (mode) - ? force_reg (mode, op_false) : op_false); - if (op_true == CONST0_RTX (mode)) - { - rtx (*gen_not) (rtx, rtx); - switch (cmpmode) - { - case E_QImode: gen_not = gen_knotqi; break; - case E_HImode: gen_not = gen_knothi; break; - case E_SImode: gen_not = gen_knotsi; break; - case E_DImode: gen_not = gen_knotdi; break; - default: gcc_unreachable (); - } - rtx n = gen_reg_rtx (cmpmode); - emit_insn (gen_not (n, cmp)); - cmp = n; - /* Reverse op_true op_false. 
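
As a side note on the lowering done by ix86_expand_sse_movcc: when no blend or mask instruction applies, the select is built from plain AND/ANDNOT/IOR on an all-ones/all-zeros comparison result. A minimal standalone C sketch of that idiom (helper name is illustrative; this is not part of the deleted file):

#include <assert.h>
#include <stdint.h>

/* Standalone illustration, not GCC code.  Select T when MASK is all ones
   and F when MASK is all zeros, mirroring the
   (cmp AND op_true) IOR (NOT cmp AND op_false) fallback.  */
static uint32_t
mask_select (uint32_t mask, uint32_t t, uint32_t f)
{
  return (mask & t) | (~mask & f);
}

int
main (void)
{
  assert (mask_select (0xffffffffu, 7, 9) == 7);
  assert (mask_select (0x00000000u, 7, 9) == 9);
  return 0;
}
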
*/ - std::swap (op_true, op_false); - } - - rtx vec_merge = gen_rtx_VEC_MERGE (mode, op_true, op_false, cmp); - emit_insn (gen_rtx_SET (dest, vec_merge)); - return; - } - else if (vector_all_ones_operand (op_true, mode) - && op_false == CONST0_RTX (mode)) - { - emit_insn (gen_rtx_SET (dest, cmp)); - return; - } - else if (op_false == CONST0_RTX (mode)) - { - op_true = force_reg (mode, op_true); - x = gen_rtx_AND (mode, cmp, op_true); - emit_insn (gen_rtx_SET (dest, x)); - return; - } - else if (op_true == CONST0_RTX (mode)) - { - op_false = force_reg (mode, op_false); - x = gen_rtx_NOT (mode, cmp); - x = gen_rtx_AND (mode, x, op_false); - emit_insn (gen_rtx_SET (dest, x)); - return; - } - else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)) - { - op_false = force_reg (mode, op_false); - x = gen_rtx_IOR (mode, cmp, op_false); - emit_insn (gen_rtx_SET (dest, x)); - return; - } - else if (TARGET_XOP) - { - op_true = force_reg (mode, op_true); - - if (!nonimmediate_operand (op_false, mode)) - op_false = force_reg (mode, op_false); - - emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp, - op_true, - op_false))); - return; - } - - rtx (*gen) (rtx, rtx, rtx, rtx) = NULL; - rtx d = dest; - - if (!vector_operand (op_true, mode)) - op_true = force_reg (mode, op_true); - - op_false = force_reg (mode, op_false); - - switch (mode) - { - case E_V4SFmode: - if (TARGET_SSE4_1) - gen = gen_sse4_1_blendvps; - break; - case E_V2DFmode: - if (TARGET_SSE4_1) - gen = gen_sse4_1_blendvpd; - break; - case E_SFmode: - if (TARGET_SSE4_1) - { - gen = gen_sse4_1_blendvss; - op_true = force_reg (mode, op_true); - } - break; - case E_DFmode: - if (TARGET_SSE4_1) - { - gen = gen_sse4_1_blendvsd; - op_true = force_reg (mode, op_true); - } - break; - case E_V16QImode: - case E_V8HImode: - case E_V4SImode: - case E_V2DImode: - if (TARGET_SSE4_1) - { - gen = gen_sse4_1_pblendvb; - if (mode != V16QImode) - d = gen_reg_rtx (V16QImode); - op_false = gen_lowpart (V16QImode, op_false); - op_true = gen_lowpart (V16QImode, op_true); - cmp = gen_lowpart (V16QImode, cmp); - } - break; - case E_V8SFmode: - if (TARGET_AVX) - gen = gen_avx_blendvps256; - break; - case E_V4DFmode: - if (TARGET_AVX) - gen = gen_avx_blendvpd256; - break; - case E_V32QImode: - case E_V16HImode: - case E_V8SImode: - case E_V4DImode: - if (TARGET_AVX2) - { - gen = gen_avx2_pblendvb; - if (mode != V32QImode) - d = gen_reg_rtx (V32QImode); - op_false = gen_lowpart (V32QImode, op_false); - op_true = gen_lowpart (V32QImode, op_true); - cmp = gen_lowpart (V32QImode, cmp); - } - break; - - case E_V64QImode: - gen = gen_avx512bw_blendmv64qi; - break; - case E_V32HImode: - gen = gen_avx512bw_blendmv32hi; - break; - case E_V16SImode: - gen = gen_avx512f_blendmv16si; - break; - case E_V8DImode: - gen = gen_avx512f_blendmv8di; - break; - case E_V8DFmode: - gen = gen_avx512f_blendmv8df; - break; - case E_V16SFmode: - gen = gen_avx512f_blendmv16sf; - break; - - default: - break; - } - - if (gen != NULL) - { - emit_insn (gen (d, op_false, op_true, cmp)); - if (d != dest) - emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d)); - } - else - { - op_true = force_reg (mode, op_true); - - t2 = gen_reg_rtx (mode); - if (optimize) - t3 = gen_reg_rtx (mode); - else - t3 = dest; - - x = gen_rtx_AND (mode, op_true, cmp); - emit_insn (gen_rtx_SET (t2, x)); - - x = gen_rtx_NOT (mode, cmp); - x = gen_rtx_AND (mode, x, op_false); - emit_insn (gen_rtx_SET (t3, x)); - - x = gen_rtx_IOR (mode, t3, t2); - emit_insn (gen_rtx_SET (dest, x)); - } -} - -/* Swap, force 
into registers, or otherwise massage the two operands - to an sse comparison with a mask result. Thus we differ a bit from - ix86_prepare_fp_compare_args which expects to produce a flags result. - - The DEST operand exists to help determine whether to commute commutative - operators. The POP0/POP1 operands are updated in place. The new - comparison code is returned, or UNKNOWN if not implementable. */ - -static enum rtx_code -ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code, - rtx *pop0, rtx *pop1) -{ - switch (code) - { - case LTGT: - case UNEQ: - /* AVX supports all the needed comparisons. */ - if (TARGET_AVX) - break; - /* We have no LTGT as an operator. We could implement it with - NE & ORDERED, but this requires an extra temporary. It's - not clear that it's worth it. */ - return UNKNOWN; - - case LT: - case LE: - case UNGT: - case UNGE: - /* These are supported directly. */ - break; - - case EQ: - case NE: - case UNORDERED: - case ORDERED: - /* AVX has 3 operand comparisons, no need to swap anything. */ - if (TARGET_AVX) - break; - /* For commutative operators, try to canonicalize the destination - operand to be first in the comparison - this helps reload to - avoid extra moves. */ - if (!dest || !rtx_equal_p (dest, *pop1)) - break; - /* FALLTHRU */ - - case GE: - case GT: - case UNLE: - case UNLT: - /* These are not supported directly before AVX, and furthermore - ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the - comparison operands to transform into something that is - supported. */ - std::swap (*pop0, *pop1); - code = swap_condition (code); - break; - - default: - gcc_unreachable (); - } - - return code; -} - -/* Expand a floating-point conditional move. Return true if successful. */ - -bool -ix86_expand_fp_movcc (rtx operands[]) -{ - machine_mode mode = GET_MODE (operands[0]); - enum rtx_code code = GET_CODE (operands[1]); - rtx tmp, compare_op; - rtx op0 = XEXP (operands[1], 0); - rtx op1 = XEXP (operands[1], 1); - - if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode)) - { - machine_mode cmode; - - /* Since we've no cmove for sse registers, don't force bad register - allocation just to gain access to it. Deny movcc when the - comparison mode doesn't match the move mode. */ - cmode = GET_MODE (op0); - if (cmode == VOIDmode) - cmode = GET_MODE (op1); - if (cmode != mode) - return false; - - code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1); - if (code == UNKNOWN) - return false; - - if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1, - operands[2], operands[3])) - return true; - - tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1, - operands[2], operands[3]); - ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]); - return true; - } - - if (GET_MODE (op0) == TImode - || (GET_MODE (op0) == DImode - && !TARGET_64BIT)) - return false; - - /* The floating point conditional move instructions don't directly - support conditions resulting from a signed integer comparison. */ - - compare_op = ix86_expand_compare (code, op0, op1); - if (!fcmov_comparison_operator (compare_op, VOIDmode)) - { - tmp = gen_reg_rtx (QImode); - ix86_expand_setcc (tmp, code, op0, op1); - - compare_op = ix86_expand_compare (NE, tmp, const0_rtx); - } - - emit_insn (gen_rtx_SET (operands[0], - gen_rtx_IF_THEN_ELSE (mode, compare_op, - operands[2], operands[3]))); - - return true; -} - -/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. 
*/ - -static int -ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code) -{ - switch (code) - { - case EQ: - return 0; - case LT: - case LTU: - return 1; - case LE: - case LEU: - return 2; - case NE: - return 4; - case GE: - case GEU: - return 5; - case GT: - case GTU: - return 6; - default: - gcc_unreachable (); - } -} - -/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */ - -static int -ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code) -{ - switch (code) - { - case EQ: - return 0x00; - case NE: - return 0x04; - case GT: - return 0x0e; - case LE: - return 0x02; - case GE: - return 0x0d; - case LT: - return 0x01; - case UNLE: - return 0x0a; - case UNLT: - return 0x09; - case UNGE: - return 0x05; - case UNGT: - return 0x06; - case UNEQ: - return 0x18; - case LTGT: - return 0x0c; - case ORDERED: - return 0x07; - case UNORDERED: - return 0x03; - default: - gcc_unreachable (); - } -} - -/* Return immediate value to be used in UNSPEC_PCMP - for comparison CODE in MODE. */ - -static int -ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode) -{ - if (FLOAT_MODE_P (mode)) - return ix86_fp_cmp_code_to_pcmp_immediate (code); - return ix86_int_cmp_code_to_pcmp_immediate (code); -} - -/* Expand AVX-512 vector comparison. */ - -bool -ix86_expand_mask_vec_cmp (rtx operands[]) -{ - machine_mode mask_mode = GET_MODE (operands[0]); - machine_mode cmp_mode = GET_MODE (operands[2]); - enum rtx_code code = GET_CODE (operands[1]); - rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode)); - int unspec_code; - rtx unspec; - - switch (code) - { - case LEU: - case GTU: - case GEU: - case LTU: - unspec_code = UNSPEC_UNSIGNED_PCMP; - break; - - default: - unspec_code = UNSPEC_PCMP; - } - - unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2], - operands[3], imm), - unspec_code); - emit_insn (gen_rtx_SET (operands[0], unspec)); - - return true; -} - -/* Expand fp vector comparison. */ - -bool -ix86_expand_fp_vec_cmp (rtx operands[]) -{ - enum rtx_code code = GET_CODE (operands[1]); - rtx cmp; - - code = ix86_prepare_sse_fp_compare_args (operands[0], code, - &operands[2], &operands[3]); - if (code == UNKNOWN) - { - rtx temp; - switch (GET_CODE (operands[1])) - { - case LTGT: - temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2], - operands[3], NULL, NULL); - cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2], - operands[3], NULL, NULL); - code = AND; - break; - case UNEQ: - temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2], - operands[3], NULL, NULL); - cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2], - operands[3], NULL, NULL); - code = IOR; - break; - default: - gcc_unreachable (); - } - cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1, - OPTAB_DIRECT); - } - else - cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3], - operands[1], operands[2]); - - if (operands[0] != cmp) - emit_move_insn (operands[0], cmp); - - return true; -} - -static rtx -ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1, - rtx op_true, rtx op_false, bool *negate) -{ - machine_mode data_mode = GET_MODE (dest); - machine_mode mode = GET_MODE (cop0); - rtx x; - - *negate = false; - - /* XOP supports all of the comparisons on all 128-bit vector int types. */ - if (TARGET_XOP - && (mode == V16QImode || mode == V8HImode - || mode == V4SImode || mode == V2DImode)) - ; - /* AVX512F supports all of the comparsions - on all 128/256/512-bit vector int types. 
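
Without AVX-512 masks or XOP, the canonicalization below reduces every integer vector comparison to EQ, GT or GTU, using operand swaps plus a final negation, since SSE2 only provides PCMPEQ and PCMPGT. The identities it relies on, checked as a small standalone C sketch (illustrative only, not part of the deleted file):

#include <assert.h>

int
main (void)
{
  int a = -3, b = 7;

  /* NE / LE / GE are expressed through EQ / GT plus negation or an
     operand swap, which is all PCMPEQ / PCMPGT can do directly.  */
  assert ((a != b) == !(a == b));
  assert ((a <= b) == !(a > b));
  assert ((a >= b) == !(b > a));   /* GE: reverse, then swap to GT.  */
  assert ((a <  b) ==  (b > a));   /* LT: swap operands, use GT.  */
  return 0;
}
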
*/ - else if (ix86_valid_mask_cmp_mode (mode)) - ; - else - { - /* Canonicalize the comparison to EQ, GT, GTU. */ - switch (code) - { - case EQ: - case GT: - case GTU: - break; - - case NE: - case LE: - case LEU: - code = reverse_condition (code); - *negate = true; - break; - - case GE: - case GEU: - code = reverse_condition (code); - *negate = true; - /* FALLTHRU */ - - case LT: - case LTU: - std::swap (cop0, cop1); - code = swap_condition (code); - break; - - default: - gcc_unreachable (); - } - - /* Only SSE4.1/SSE4.2 supports V2DImode. */ - if (mode == V2DImode) - { - switch (code) - { - case EQ: - /* SSE4.1 supports EQ. */ - if (!TARGET_SSE4_1) - return NULL; - break; - - case GT: - case GTU: - /* SSE4.2 supports GT/GTU. */ - if (!TARGET_SSE4_2) - return NULL; - break; - - default: - gcc_unreachable (); - } - } - - rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode); - rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode); - if (*negate) - std::swap (optrue, opfalse); - - /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when - not using integer masks into min (x, y) == x ? -1 : 0 (i.e. - min (x, y) == x). While we add one instruction (the minimum), - we remove the need for two instructions in the negation, as the - result is done this way. - When using masks, do it for SI/DImode element types, as it is shorter - than the two subtractions. */ - if ((code != EQ - && GET_MODE_SIZE (mode) != 64 - && vector_all_ones_operand (opfalse, data_mode) - && optrue == CONST0_RTX (data_mode)) - || (code == GTU - && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4 - /* Don't do it if not using integer masks and we'd end up with - the right values in the registers though. */ - && (GET_MODE_SIZE (mode) == 64 - || !vector_all_ones_operand (optrue, data_mode) - || opfalse != CONST0_RTX (data_mode)))) - { - rtx (*gen) (rtx, rtx, rtx) = NULL; - - switch (mode) - { - case E_V16SImode: - gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3; - break; - case E_V8DImode: - gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3; - cop0 = force_reg (mode, cop0); - cop1 = force_reg (mode, cop1); - break; - case E_V32QImode: - if (TARGET_AVX2) - gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3; - break; - case E_V16HImode: - if (TARGET_AVX2) - gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3; - break; - case E_V8SImode: - if (TARGET_AVX2) - gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3; - break; - case E_V4DImode: - if (TARGET_AVX512VL) - { - gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3; - cop0 = force_reg (mode, cop0); - cop1 = force_reg (mode, cop1); - } - break; - case E_V16QImode: - if (code == GTU && TARGET_SSE2) - gen = gen_uminv16qi3; - else if (code == GT && TARGET_SSE4_1) - gen = gen_sminv16qi3; - break; - case E_V8HImode: - if (code == GTU && TARGET_SSE4_1) - gen = gen_uminv8hi3; - else if (code == GT && TARGET_SSE2) - gen = gen_sminv8hi3; - break; - case E_V4SImode: - if (TARGET_SSE4_1) - gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3; - break; - case E_V2DImode: - if (TARGET_AVX512VL) - { - gen = (code == GTU) ? 
gen_uminv2di3 : gen_sminv2di3; - cop0 = force_reg (mode, cop0); - cop1 = force_reg (mode, cop1); - } - break; - default: - break; - } - - if (gen) - { - rtx tem = gen_reg_rtx (mode); - if (!vector_operand (cop0, mode)) - cop0 = force_reg (mode, cop0); - if (!vector_operand (cop1, mode)) - cop1 = force_reg (mode, cop1); - *negate = !*negate; - emit_insn (gen (tem, cop0, cop1)); - cop1 = tem; - code = EQ; - } - } - - /* Unsigned parallel compare is not supported by the hardware. - Play some tricks to turn this into a signed comparison - against 0. */ - if (code == GTU) - { - cop0 = force_reg (mode, cop0); - - switch (mode) - { - case E_V16SImode: - case E_V8DImode: - case E_V8SImode: - case E_V4DImode: - case E_V4SImode: - case E_V2DImode: - { - rtx t1, t2, mask; - - /* Subtract (-(INT MAX) - 1) from both operands to make - them signed. */ - mask = ix86_build_signbit_mask (mode, true, false); - t1 = gen_reg_rtx (mode); - emit_insn (gen_sub3_insn (t1, cop0, mask)); - - t2 = gen_reg_rtx (mode); - emit_insn (gen_sub3_insn (t2, cop1, mask)); - - cop0 = t1; - cop1 = t2; - code = GT; - } - break; - - case E_V64QImode: - case E_V32HImode: - case E_V32QImode: - case E_V16HImode: - case E_V16QImode: - case E_V8HImode: - /* Perform a parallel unsigned saturating subtraction. */ - x = gen_reg_rtx (mode); - emit_insn (gen_rtx_SET - (x, gen_rtx_US_MINUS (mode, cop0, cop1))); - cop0 = x; - cop1 = CONST0_RTX (mode); - code = EQ; - *negate = !*negate; - break; - - default: - gcc_unreachable (); - } - } - } - - if (*negate) - std::swap (op_true, op_false); - - /* Allow the comparison to be done in one mode, but the movcc to - happen in another mode. */ - if (data_mode == mode) - { - x = ix86_expand_sse_cmp (dest, code, cop0, cop1, - op_true, op_false); - } - else - { - gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode)); - x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1, - op_true, op_false); - if (GET_MODE (x) == mode) - x = gen_lowpart (data_mode, x); - } - - return x; -} - -/* Expand integer vector comparison. */ - -bool -ix86_expand_int_vec_cmp (rtx operands[]) -{ - rtx_code code = GET_CODE (operands[1]); - bool negate = false; - rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2], - operands[3], NULL, NULL, &negate); - - if (!cmp) - return false; - - if (negate) - cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp, - CONST0_RTX (GET_MODE (cmp)), - NULL, NULL, &negate); - - gcc_assert (!negate); - - if (operands[0] != cmp) - emit_move_insn (operands[0], cmp); - - return true; -} - -/* Expand a floating-point vector conditional move; a vcond operation - rather than a movcc operation. 
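
The GTU handling above flips the sign bit of both operands (subtracting the sign-bit mask elementwise) so that a signed PCMPGT yields the unsigned ordering. A scalar sketch of why that works (standalone illustration; the XOR below is equivalent to the subtraction the expander emits):

#include <assert.h>
#include <stdint.h>

/* Unsigned a > b computed with a signed comparison after flipping the
   sign bit of both operands.  Not GCC code; a model of the trick.  */
static int
gtu_via_signed (uint32_t a, uint32_t b)
{
  int32_t sa = (int32_t) (a ^ 0x80000000u);
  int32_t sb = (int32_t) (b ^ 0x80000000u);
  return sa > sb;
}

int
main (void)
{
  assert (gtu_via_signed (0xffffffffu, 1u) == 1);
  assert (gtu_via_signed (0u, 0x80000000u) == 0);
  assert (gtu_via_signed (5u, 5u) == 0);
  return 0;
}
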
*/ - -bool -ix86_expand_fp_vcond (rtx operands[]) -{ - enum rtx_code code = GET_CODE (operands[3]); - rtx cmp; - - code = ix86_prepare_sse_fp_compare_args (operands[0], code, - &operands[4], &operands[5]); - if (code == UNKNOWN) - { - rtx temp; - switch (GET_CODE (operands[3])) - { - case LTGT: - temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4], - operands[5], operands[0], operands[0]); - cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4], - operands[5], operands[1], operands[2]); - code = AND; - break; - case UNEQ: - temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4], - operands[5], operands[0], operands[0]); - cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4], - operands[5], operands[1], operands[2]); - code = IOR; - break; - default: - gcc_unreachable (); - } - cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1, - OPTAB_DIRECT); - ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]); - return true; - } - - if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4], - operands[5], operands[1], operands[2])) - return true; - - cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5], - operands[1], operands[2]); - ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]); - return true; -} - -/* Expand a signed/unsigned integral vector conditional move. */ - -bool -ix86_expand_int_vcond (rtx operands[]) -{ - machine_mode data_mode = GET_MODE (operands[0]); - machine_mode mode = GET_MODE (operands[4]); - enum rtx_code code = GET_CODE (operands[3]); - bool negate = false; - rtx x, cop0, cop1; - - cop0 = operands[4]; - cop1 = operands[5]; - - /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31 - and x < 0 ? 1 : 0 into (unsigned) x >> 31. */ - if ((code == LT || code == GE) - && data_mode == mode - && cop1 == CONST0_RTX (mode) - && operands[1 + (code == LT)] == CONST0_RTX (data_mode) - && GET_MODE_UNIT_SIZE (data_mode) > 1 - && GET_MODE_UNIT_SIZE (data_mode) <= 8 - && (GET_MODE_SIZE (data_mode) == 16 - || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32))) - { - rtx negop = operands[2 - (code == LT)]; - int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1; - if (negop == CONST1_RTX (data_mode)) - { - rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift), - operands[0], 1, OPTAB_DIRECT); - if (res != operands[0]) - emit_move_insn (operands[0], res); - return true; - } - else if (GET_MODE_INNER (data_mode) != DImode - && vector_all_ones_operand (negop, data_mode)) - { - rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift), - operands[0], 0, OPTAB_DIRECT); - if (res != operands[0]) - emit_move_insn (operands[0], res); - return true; - } - } - - if (!nonimmediate_operand (cop1, mode)) - cop1 = force_reg (mode, cop1); - if (!general_operand (operands[1], data_mode)) - operands[1] = force_reg (data_mode, operands[1]); - if (!general_operand (operands[2], data_mode)) - operands[2] = force_reg (data_mode, operands[2]); - - x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1, - operands[1], operands[2], &negate); - - if (!x) - return false; - - ix86_expand_sse_movcc (operands[0], x, operands[1+negate], - operands[2-negate]); - return true; -} - -static bool -ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1, - struct expand_vec_perm_d *d) -{ - /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const - expander, so args are either in d, or in op0, op1 etc. */ - machine_mode mode = GET_MODE (d ? 
d->op0 : op0); - machine_mode maskmode = mode; - rtx (*gen) (rtx, rtx, rtx, rtx) = NULL; - - switch (mode) - { - case E_V8HImode: - if (TARGET_AVX512VL && TARGET_AVX512BW) - gen = gen_avx512vl_vpermt2varv8hi3; - break; - case E_V16HImode: - if (TARGET_AVX512VL && TARGET_AVX512BW) - gen = gen_avx512vl_vpermt2varv16hi3; - break; - case E_V64QImode: - if (TARGET_AVX512VBMI) - gen = gen_avx512bw_vpermt2varv64qi3; - break; - case E_V32HImode: - if (TARGET_AVX512BW) - gen = gen_avx512bw_vpermt2varv32hi3; - break; - case E_V4SImode: - if (TARGET_AVX512VL) - gen = gen_avx512vl_vpermt2varv4si3; - break; - case E_V8SImode: - if (TARGET_AVX512VL) - gen = gen_avx512vl_vpermt2varv8si3; - break; - case E_V16SImode: - if (TARGET_AVX512F) - gen = gen_avx512f_vpermt2varv16si3; - break; - case E_V4SFmode: - if (TARGET_AVX512VL) - { - gen = gen_avx512vl_vpermt2varv4sf3; - maskmode = V4SImode; - } - break; - case E_V8SFmode: - if (TARGET_AVX512VL) - { - gen = gen_avx512vl_vpermt2varv8sf3; - maskmode = V8SImode; - } - break; - case E_V16SFmode: - if (TARGET_AVX512F) - { - gen = gen_avx512f_vpermt2varv16sf3; - maskmode = V16SImode; - } - break; - case E_V2DImode: - if (TARGET_AVX512VL) - gen = gen_avx512vl_vpermt2varv2di3; - break; - case E_V4DImode: - if (TARGET_AVX512VL) - gen = gen_avx512vl_vpermt2varv4di3; - break; - case E_V8DImode: - if (TARGET_AVX512F) - gen = gen_avx512f_vpermt2varv8di3; - break; - case E_V2DFmode: - if (TARGET_AVX512VL) - { - gen = gen_avx512vl_vpermt2varv2df3; - maskmode = V2DImode; - } - break; - case E_V4DFmode: - if (TARGET_AVX512VL) - { - gen = gen_avx512vl_vpermt2varv4df3; - maskmode = V4DImode; - } - break; - case E_V8DFmode: - if (TARGET_AVX512F) - { - gen = gen_avx512f_vpermt2varv8df3; - maskmode = V8DImode; - } - break; - default: - break; - } - - if (gen == NULL) - return false; - - /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const - expander, so args are either in d, or in op0, op1 etc. */ - if (d) - { - rtx vec[64]; - target = d->target; - op0 = d->op0; - op1 = d->op1; - for (int i = 0; i < d->nelt; ++i) - vec[i] = GEN_INT (d->perm[i]); - mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec)); - } - - emit_insn (gen (target, force_reg (maskmode, mask), op0, op1)); - return true; -} - -/* Expand a variable vector permutation. */ - -void -ix86_expand_vec_perm (rtx operands[]) -{ - rtx target = operands[0]; - rtx op0 = operands[1]; - rtx op1 = operands[2]; - rtx mask = operands[3]; - rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32]; - machine_mode mode = GET_MODE (op0); - machine_mode maskmode = GET_MODE (mask); - int w, e, i; - bool one_operand_shuffle = rtx_equal_p (op0, op1); - - /* Number of elements in the vector. */ - w = GET_MODE_NUNITS (mode); - e = GET_MODE_UNIT_SIZE (mode); - gcc_assert (w <= 64); - - if (TARGET_AVX512F && one_operand_shuffle) - { - rtx (*gen) (rtx, rtx, rtx) = NULL; - switch (mode) - { - case E_V16SImode: - gen =gen_avx512f_permvarv16si; - break; - case E_V16SFmode: - gen = gen_avx512f_permvarv16sf; - break; - case E_V8DImode: - gen = gen_avx512f_permvarv8di; - break; - case E_V8DFmode: - gen = gen_avx512f_permvarv8df; - break; - default: - break; - } - if (gen != NULL) - { - emit_insn (gen (target, op0, mask)); - return; - } - } - - if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL)) - return; - - if (TARGET_AVX2) - { - if (mode == V4DImode || mode == V4DFmode || mode == V16HImode) - { - /* Unfortunately, the VPERMQ and VPERMPD instructions only support - an constant shuffle operand. 
With a tiny bit of effort we can - use VPERMD instead. A re-interpretation stall for V4DFmode is - unfortunate but there's no avoiding it. - Similarly for V16HImode we don't have instructions for variable - shuffling, while for V32QImode we can use after preparing suitable - masks vpshufb; vpshufb; vpermq; vpor. */ - - if (mode == V16HImode) - { - maskmode = mode = V32QImode; - w = 32; - e = 1; - } - else - { - maskmode = mode = V8SImode; - w = 8; - e = 4; - } - t1 = gen_reg_rtx (maskmode); - - /* Replicate the low bits of the V4DImode mask into V8SImode: - mask = { A B C D } - t1 = { A A B B C C D D }. */ - for (i = 0; i < w / 2; ++i) - vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2); - vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec)); - vt = force_reg (maskmode, vt); - mask = gen_lowpart (maskmode, mask); - if (maskmode == V8SImode) - emit_insn (gen_avx2_permvarv8si (t1, mask, vt)); - else - emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt)); - - /* Multiply the shuffle indicies by two. */ - t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1, - OPTAB_DIRECT); - - /* Add one to the odd shuffle indicies: - t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */ - for (i = 0; i < w / 2; ++i) - { - vec[i * 2] = const0_rtx; - vec[i * 2 + 1] = const1_rtx; - } - vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec)); - vt = validize_mem (force_const_mem (maskmode, vt)); - t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1, - OPTAB_DIRECT); - - /* Continue as if V8SImode (resp. V32QImode) was used initially. */ - operands[3] = mask = t1; - target = gen_reg_rtx (mode); - op0 = gen_lowpart (mode, op0); - op1 = gen_lowpart (mode, op1); - } - - switch (mode) - { - case E_V8SImode: - /* The VPERMD and VPERMPS instructions already properly ignore - the high bits of the shuffle elements. No need for us to - perform an AND ourselves. */ - if (one_operand_shuffle) - { - emit_insn (gen_avx2_permvarv8si (target, op0, mask)); - if (target != operands[0]) - emit_move_insn (operands[0], - gen_lowpart (GET_MODE (operands[0]), target)); - } - else - { - t1 = gen_reg_rtx (V8SImode); - t2 = gen_reg_rtx (V8SImode); - emit_insn (gen_avx2_permvarv8si (t1, op0, mask)); - emit_insn (gen_avx2_permvarv8si (t2, op1, mask)); - goto merge_two; - } - return; - - case E_V8SFmode: - mask = gen_lowpart (V8SImode, mask); - if (one_operand_shuffle) - emit_insn (gen_avx2_permvarv8sf (target, op0, mask)); - else - { - t1 = gen_reg_rtx (V8SFmode); - t2 = gen_reg_rtx (V8SFmode); - emit_insn (gen_avx2_permvarv8sf (t1, op0, mask)); - emit_insn (gen_avx2_permvarv8sf (t2, op1, mask)); - goto merge_two; - } - return; - - case E_V4SImode: - /* By combining the two 128-bit input vectors into one 256-bit - input vector, we can use VPERMD and VPERMPS for the full - two-operand shuffle. 
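
The effect of the two-operand variable shuffle built here can be modelled in scalar C: each control element indexes into the concatenation of the two inputs, which is exactly what the V4SImode path achieves by concatenating into a 256-bit vector for VPERMD. A standalone sketch (names and vector width are illustrative):

#include <assert.h>

#define W 4   /* elements per vector in this sketch */

/* Standalone model, not GCC code: two-operand variable permutation.  */
static void
vec_perm2 (int *dst, const int *op0, const int *op1, const unsigned *sel)
{
  for (int i = 0; i < W; i++)
    {
      unsigned idx = sel[i] & (2 * W - 1);
      dst[i] = idx < W ? op0[idx] : op1[idx - W];
    }
}

int
main (void)
{
  int a[W] = { 10, 11, 12, 13 }, b[W] = { 20, 21, 22, 23 }, r[W];
  unsigned sel[W] = { 0, 5, 3, 6 };

  vec_perm2 (r, a, b, sel);
  assert (r[0] == 10 && r[1] == 21 && r[2] == 13 && r[3] == 22);
  return 0;
}
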
*/ - t1 = gen_reg_rtx (V8SImode); - t2 = gen_reg_rtx (V8SImode); - emit_insn (gen_avx_vec_concatv8si (t1, op0, op1)); - emit_insn (gen_avx_vec_concatv8si (t2, mask, mask)); - emit_insn (gen_avx2_permvarv8si (t1, t1, t2)); - emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx)); - return; - - case E_V4SFmode: - t1 = gen_reg_rtx (V8SFmode); - t2 = gen_reg_rtx (V8SImode); - mask = gen_lowpart (V4SImode, mask); - emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1)); - emit_insn (gen_avx_vec_concatv8si (t2, mask, mask)); - emit_insn (gen_avx2_permvarv8sf (t1, t1, t2)); - emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx)); - return; - - case E_V32QImode: - t1 = gen_reg_rtx (V32QImode); - t2 = gen_reg_rtx (V32QImode); - t3 = gen_reg_rtx (V32QImode); - vt2 = GEN_INT (-128); - vt = gen_const_vec_duplicate (V32QImode, vt2); - vt = force_reg (V32QImode, vt); - for (i = 0; i < 32; i++) - vec[i] = i < 16 ? vt2 : const0_rtx; - vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec)); - vt2 = force_reg (V32QImode, vt2); - /* From mask create two adjusted masks, which contain the same - bits as mask in the low 7 bits of each vector element. - The first mask will have the most significant bit clear - if it requests element from the same 128-bit lane - and MSB set if it requests element from the other 128-bit lane. - The second mask will have the opposite values of the MSB, - and additionally will have its 128-bit lanes swapped. - E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have - t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and - t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ... - stands for other 12 bytes. */ - /* The bit whether element is from the same lane or the other - lane is bit 4, so shift it up by 3 to the MSB position. */ - t5 = gen_reg_rtx (V4DImode); - emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask), - GEN_INT (3))); - /* Clear MSB bits from the mask just in case it had them set. */ - emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask)); - /* After this t1 will have MSB set for elements from other lane. */ - emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2)); - /* Clear bits other than MSB. */ - emit_insn (gen_andv32qi3 (t1, t1, vt)); - /* Or in the lower bits from mask into t3. */ - emit_insn (gen_iorv32qi3 (t3, t1, t2)); - /* And invert MSB bits in t1, so MSB is set for elements from the same - lane. */ - emit_insn (gen_xorv32qi3 (t1, t1, vt)); - /* Swap 128-bit lanes in t3. */ - t6 = gen_reg_rtx (V4DImode); - emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3), - const2_rtx, GEN_INT (3), - const0_rtx, const1_rtx)); - /* And or in the lower bits from mask into t1. */ - emit_insn (gen_iorv32qi3 (t1, t1, t2)); - if (one_operand_shuffle) - { - /* Each of these shuffles will put 0s in places where - element from the other 128-bit lane is needed, otherwise - will shuffle in the requested value. */ - emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, - gen_lowpart (V32QImode, t6))); - emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1)); - /* For t3 the 128-bit lanes are swapped again. */ - t7 = gen_reg_rtx (V4DImode); - emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3), - const2_rtx, GEN_INT (3), - const0_rtx, const1_rtx)); - /* And oring both together leads to the result. 
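
The cross-lane V32QImode path above leans on PSHUFB's behaviour within one 16-byte lane: a control byte with its MSB set selects zero, otherwise its low four bits index the source lane. A standalone scalar model of that building block (illustrative only):

#include <assert.h>
#include <stdint.h>

/* Standalone model of PSHUFB on one 16-byte lane; not GCC code.  */
static void
pshufb16 (uint8_t *dst, const uint8_t *src, const uint8_t *ctrl)
{
  for (int i = 0; i < 16; i++)
    dst[i] = (ctrl[i] & 0x80) ? 0 : src[ctrl[i] & 0x0f];
}

int
main (void)
{
  uint8_t src[16], ctrl[16], dst[16];

  for (int i = 0; i < 16; i++)
    {
      src[i] = (uint8_t) (0x40 + i);
      ctrl[i] = (uint8_t) (15 - i);   /* reverse the lane */
    }
  ctrl[0] = 0x80;                     /* force byte 0 to zero */

  pshufb16 (dst, src, ctrl);
  assert (dst[0] == 0);
  assert (dst[1] == 0x40 + 14);
  assert (dst[15] == 0x40 + 0);
  return 0;
}
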
*/ - emit_insn (gen_iorv32qi3 (target, t1, - gen_lowpart (V32QImode, t7))); - if (target != operands[0]) - emit_move_insn (operands[0], - gen_lowpart (GET_MODE (operands[0]), target)); - return; - } - - t4 = gen_reg_rtx (V32QImode); - /* Similarly to the above one_operand_shuffle code, - just for repeated twice for each operand. merge_two: - code will merge the two results together. */ - emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, - gen_lowpart (V32QImode, t6))); - emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, - gen_lowpart (V32QImode, t6))); - emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1)); - emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1)); - t7 = gen_reg_rtx (V4DImode); - emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4), - const2_rtx, GEN_INT (3), - const0_rtx, const1_rtx)); - t8 = gen_reg_rtx (V4DImode); - emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3), - const2_rtx, GEN_INT (3), - const0_rtx, const1_rtx)); - emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7))); - emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8))); - t1 = t4; - t2 = t3; - goto merge_two; - - default: - gcc_assert (GET_MODE_SIZE (mode) <= 16); - break; - } - } - - if (TARGET_XOP) - { - /* The XOP VPPERM insn supports three inputs. By ignoring the - one_operand_shuffle special case, we avoid creating another - set of constant vectors in memory. */ - one_operand_shuffle = false; - - /* mask = mask & {2*w-1, ...} */ - vt = GEN_INT (2*w - 1); - } - else - { - /* mask = mask & {w-1, ...} */ - vt = GEN_INT (w - 1); - } - - vt = gen_const_vec_duplicate (maskmode, vt); - mask = expand_simple_binop (maskmode, AND, mask, vt, - NULL_RTX, 0, OPTAB_DIRECT); - - /* For non-QImode operations, convert the word permutation control - into a byte permutation control. */ - if (mode != V16QImode) - { - mask = expand_simple_binop (maskmode, ASHIFT, mask, - GEN_INT (exact_log2 (e)), - NULL_RTX, 0, OPTAB_DIRECT); - - /* Convert mask to vector of chars. */ - mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask)); - - /* Replicate each of the input bytes into byte positions: - (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8} - (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12} - (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */ - for (i = 0; i < 16; ++i) - vec[i] = GEN_INT (i/e * e); - vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec)); - vt = validize_mem (force_const_mem (V16QImode, vt)); - if (TARGET_XOP) - emit_insn (gen_xop_pperm (mask, mask, mask, vt)); - else - emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt)); - - /* Convert it into the byte positions by doing - mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */ - for (i = 0; i < 16; ++i) - vec[i] = GEN_INT (i % e); - vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec)); - vt = validize_mem (force_const_mem (V16QImode, vt)); - emit_insn (gen_addv16qi3 (mask, mask, vt)); - } - - /* The actual shuffle operations all operate on V16QImode. 
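
The conversion just performed, from a word-level permutation control to the byte-level control PSHUFB needs, amounts to scaling each element index by the element size and adding the within-element byte offsets. A standalone scalar sketch of that arithmetic (not GCC code):

#include <assert.h>

/* Standalone model: turn element indices into PSHUFB byte controls.  */
static void
word_ctrl_to_byte_ctrl (unsigned *byte_ctrl, const unsigned *elem_ctrl,
			int nelts, int e)
{
  for (int i = 0; i < nelts * e; i++)
    byte_ctrl[i] = elem_ctrl[i / e] * e + (i % e);
}

int
main (void)
{
  /* V4SImode control { 2, 0, 3, 1 }, element size 4 bytes.  */
  unsigned elem[4] = { 2, 0, 3, 1 };
  unsigned bytes[16];

  word_ctrl_to_byte_ctrl (bytes, elem, 4, 4);
  assert (bytes[0] == 8 && bytes[3] == 11);    /* element 2 -> bytes 8..11 */
  assert (bytes[4] == 0 && bytes[7] == 3);     /* element 0 -> bytes 0..3 */
  assert (bytes[12] == 4 && bytes[15] == 7);   /* element 1 -> bytes 4..7 */
  return 0;
}
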
*/ - op0 = gen_lowpart (V16QImode, op0); - op1 = gen_lowpart (V16QImode, op1); - - if (TARGET_XOP) - { - if (GET_MODE (target) != V16QImode) - target = gen_reg_rtx (V16QImode); - emit_insn (gen_xop_pperm (target, op0, op1, mask)); - if (target != operands[0]) - emit_move_insn (operands[0], - gen_lowpart (GET_MODE (operands[0]), target)); - } - else if (one_operand_shuffle) - { - if (GET_MODE (target) != V16QImode) - target = gen_reg_rtx (V16QImode); - emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask)); - if (target != operands[0]) - emit_move_insn (operands[0], - gen_lowpart (GET_MODE (operands[0]), target)); - } - else - { - rtx xops[6]; - bool ok; - - /* Shuffle the two input vectors independently. */ - t1 = gen_reg_rtx (V16QImode); - t2 = gen_reg_rtx (V16QImode); - emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask)); - emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask)); - - merge_two: - /* Then merge them together. The key is whether any given control - element contained a bit set that indicates the second word. */ - mask = operands[3]; - vt = GEN_INT (w); - if (maskmode == V2DImode && !TARGET_SSE4_1) - { - /* Without SSE4.1, we don't have V2DImode EQ. Perform one - more shuffle to convert the V2DI input mask into a V4SI - input mask. At which point the masking that expand_int_vcond - will work as desired. */ - rtx t3 = gen_reg_rtx (V4SImode); - emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask), - const0_rtx, const0_rtx, - const2_rtx, const2_rtx)); - mask = t3; - maskmode = V4SImode; - e = w = 4; - } - - vt = gen_const_vec_duplicate (maskmode, vt); - vt = force_reg (maskmode, vt); - mask = expand_simple_binop (maskmode, AND, mask, vt, - NULL_RTX, 0, OPTAB_DIRECT); - - if (GET_MODE (target) != mode) - target = gen_reg_rtx (mode); - xops[0] = target; - xops[1] = gen_lowpart (mode, t2); - xops[2] = gen_lowpart (mode, t1); - xops[3] = gen_rtx_EQ (maskmode, mask, vt); - xops[4] = mask; - xops[5] = vt; - ok = ix86_expand_int_vcond (xops); - gcc_assert (ok); - if (target != operands[0]) - emit_move_insn (operands[0], - gen_lowpart (GET_MODE (operands[0]), target)); - } -} - -/* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is - true if we should do zero extension, else sign extension. HIGH_P is - true if we want the N/2 high elements, else the low elements. */ - -void -ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) -{ - machine_mode imode = GET_MODE (src); - rtx tmp; - - if (TARGET_SSE4_1) - { - rtx (*unpack)(rtx, rtx); - rtx (*extract)(rtx, rtx) = NULL; - machine_mode halfmode = BLKmode; - - switch (imode) - { - case E_V64QImode: - if (unsigned_p) - unpack = gen_avx512bw_zero_extendv32qiv32hi2; - else - unpack = gen_avx512bw_sign_extendv32qiv32hi2; - halfmode = V32QImode; - extract - = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi; - break; - case E_V32QImode: - if (unsigned_p) - unpack = gen_avx2_zero_extendv16qiv16hi2; - else - unpack = gen_avx2_sign_extendv16qiv16hi2; - halfmode = V16QImode; - extract - = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi; - break; - case E_V32HImode: - if (unsigned_p) - unpack = gen_avx512f_zero_extendv16hiv16si2; - else - unpack = gen_avx512f_sign_extendv16hiv16si2; - halfmode = V16HImode; - extract - = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi; - break; - case E_V16HImode: - if (unsigned_p) - unpack = gen_avx2_zero_extendv8hiv8si2; - else - unpack = gen_avx2_sign_extendv8hiv8si2; - halfmode = V8HImode; - extract - = high_p ? 
gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi; - break; - case E_V16SImode: - if (unsigned_p) - unpack = gen_avx512f_zero_extendv8siv8di2; - else - unpack = gen_avx512f_sign_extendv8siv8di2; - halfmode = V8SImode; - extract - = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si; - break; - case E_V8SImode: - if (unsigned_p) - unpack = gen_avx2_zero_extendv4siv4di2; - else - unpack = gen_avx2_sign_extendv4siv4di2; - halfmode = V4SImode; - extract - = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si; - break; - case E_V16QImode: - if (unsigned_p) - unpack = gen_sse4_1_zero_extendv8qiv8hi2; - else - unpack = gen_sse4_1_sign_extendv8qiv8hi2; - break; - case E_V8HImode: - if (unsigned_p) - unpack = gen_sse4_1_zero_extendv4hiv4si2; - else - unpack = gen_sse4_1_sign_extendv4hiv4si2; - break; - case E_V4SImode: - if (unsigned_p) - unpack = gen_sse4_1_zero_extendv2siv2di2; - else - unpack = gen_sse4_1_sign_extendv2siv2di2; - break; - default: - gcc_unreachable (); - } - - if (GET_MODE_SIZE (imode) >= 32) - { - tmp = gen_reg_rtx (halfmode); - emit_insn (extract (tmp, src)); - } - else if (high_p) - { - /* Shift higher 8 bytes to lower 8 bytes. */ - tmp = gen_reg_rtx (V1TImode); - emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src), - GEN_INT (64))); - tmp = gen_lowpart (imode, tmp); - } - else - tmp = src; - - emit_insn (unpack (dest, tmp)); - } - else - { - rtx (*unpack)(rtx, rtx, rtx); - - switch (imode) - { - case E_V16QImode: - if (high_p) - unpack = gen_vec_interleave_highv16qi; - else - unpack = gen_vec_interleave_lowv16qi; - break; - case E_V8HImode: - if (high_p) - unpack = gen_vec_interleave_highv8hi; - else - unpack = gen_vec_interleave_lowv8hi; - break; - case E_V4SImode: - if (high_p) - unpack = gen_vec_interleave_highv4si; - else - unpack = gen_vec_interleave_lowv4si; - break; - default: - gcc_unreachable (); - } - - if (unsigned_p) - tmp = force_reg (imode, CONST0_RTX (imode)); - else - tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode), - src, pc_rtx, pc_rtx); - - rtx tmp2 = gen_reg_rtx (imode); - emit_insn (unpack (tmp2, src, tmp)); - emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2)); - } -} - -/* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode, - but works for floating pointer parameters and nonoffsetable memories. - For pushes, it returns just stack offsets; the values will be saved - in the right order. Maximally three parts are generated. */ - -static int -ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode) -{ - int size; - - if (!TARGET_64BIT) - size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4; - else - size = (GET_MODE_SIZE (mode) + 4) / 8; - - gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand))); - gcc_assert (size >= 2 && size <= 4); - - /* Optimize constant pool reference to immediates. This is used by fp - moves, that force all constants to memory to allow combining. */ - if (MEM_P (operand) && MEM_READONLY_P (operand)) - operand = avoid_constant_pool_reference (operand); - - if (MEM_P (operand) && !offsettable_memref_p (operand)) - { - /* The only non-offsetable memories we handle are pushes. 
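
The pre-SSE4.1 path of ix86_expand_sse_unpack above widens each element by interleaving it either with zero (zero extension) or with a copy of its sign obtained from a "0 > x" compare (sign extension). A standalone scalar model of one element (illustrative only, not GCC code):

#include <assert.h>
#include <stdint.h>

/* Standalone model: widen a 16-bit element to 32 bits by pairing it
   with zero or with its replicated sign, as the interleave path does.  */
static int32_t
widen16 (int16_t x, int unsigned_p)
{
  uint16_t hi = (!unsigned_p && 0 > x) ? 0xffff : 0;
  uint32_t u = ((uint32_t) hi << 16) | (uint16_t) x;
  return (int32_t) u;
}

int
main (void)
{
  assert (widen16 (-5, 0) == -5);        /* sign extension */
  assert (widen16 (-5, 1) == 0xfffb);    /* zero extension */
  assert (widen16 (1234, 0) == 1234);
  return 0;
}
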
*/ - int ok = push_operand (operand, VOIDmode); - - gcc_assert (ok); - - operand = copy_rtx (operand); - PUT_MODE (operand, word_mode); - parts[0] = parts[1] = parts[2] = parts[3] = operand; - return size; - } - - if (GET_CODE (operand) == CONST_VECTOR) - { - scalar_int_mode imode = int_mode_for_mode (mode).require (); - /* Caution: if we looked through a constant pool memory above, - the operand may actually have a different mode now. That's - ok, since we want to pun this all the way back to an integer. */ - operand = simplify_subreg (imode, operand, GET_MODE (operand), 0); - gcc_assert (operand != NULL); - mode = imode; - } - - if (!TARGET_64BIT) - { - if (mode == DImode) - split_double_mode (mode, &operand, 1, &parts[0], &parts[1]); - else - { - int i; - - if (REG_P (operand)) - { - gcc_assert (reload_completed); - for (i = 0; i < size; i++) - parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i); - } - else if (offsettable_memref_p (operand)) - { - operand = adjust_address (operand, SImode, 0); - parts[0] = operand; - for (i = 1; i < size; i++) - parts[i] = adjust_address (operand, SImode, 4 * i); - } - else if (CONST_DOUBLE_P (operand)) - { - const REAL_VALUE_TYPE *r; - long l[4]; - - r = CONST_DOUBLE_REAL_VALUE (operand); - switch (mode) - { - case E_TFmode: - real_to_target (l, r, mode); - parts[3] = gen_int_mode (l[3], SImode); - parts[2] = gen_int_mode (l[2], SImode); - break; - case E_XFmode: - /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since - long double may not be 80-bit. */ - real_to_target (l, r, mode); - parts[2] = gen_int_mode (l[2], SImode); - break; - case E_DFmode: - REAL_VALUE_TO_TARGET_DOUBLE (*r, l); - break; - default: - gcc_unreachable (); - } - parts[1] = gen_int_mode (l[1], SImode); - parts[0] = gen_int_mode (l[0], SImode); - } - else - gcc_unreachable (); - } - } - else - { - if (mode == TImode) - split_double_mode (mode, &operand, 1, &parts[0], &parts[1]); - if (mode == XFmode || mode == TFmode) - { - machine_mode upper_mode = mode==XFmode ? SImode : DImode; - if (REG_P (operand)) - { - gcc_assert (reload_completed); - parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0); - parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1); - } - else if (offsettable_memref_p (operand)) - { - operand = adjust_address (operand, DImode, 0); - parts[0] = operand; - parts[1] = adjust_address (operand, upper_mode, 8); - } - else if (CONST_DOUBLE_P (operand)) - { - long l[4]; - - real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode); - - /* real_to_target puts 32-bit pieces in each long. */ - parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff)) - | ((l[1] & HOST_WIDE_INT_C (0xffffffff)) - << 32), DImode); - - if (upper_mode == SImode) - parts[1] = gen_int_mode (l[2], SImode); - else - parts[1] - = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff)) - | ((l[3] & HOST_WIDE_INT_C (0xffffffff)) - << 32), DImode); - } - else - gcc_unreachable (); - } - } - - return size; -} - -/* Emit insns to perform a move or push of DI, DF, XF, and TF values. - Return false when normal moves are needed; true when all required - insns have been emitted. Operands 2-4 contain the input values - int the correct order; operands 5-7 contain the output values. */ - -void -ix86_split_long_move (rtx operands[]) -{ - rtx part[2][4]; - int nparts, i, j; - int push = 0; - int collisions = 0; - machine_mode mode = GET_MODE (operands[0]); - bool collisionparts[4]; - - /* The DFmode expanders may ask us to move double. - For 64bit target this is single move. 
By hiding the fact - here we simplify i386.md splitters. */ - if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8) - { - /* Optimize constant pool reference to immediates. This is used by - fp moves, that force all constants to memory to allow combining. */ - - if (MEM_P (operands[1]) - && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF - && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0))) - operands[1] = get_pool_constant (XEXP (operands[1], 0)); - if (push_operand (operands[0], VOIDmode)) - { - operands[0] = copy_rtx (operands[0]); - PUT_MODE (operands[0], word_mode); - } - else - operands[0] = gen_lowpart (DImode, operands[0]); - operands[1] = gen_lowpart (DImode, operands[1]); - emit_move_insn (operands[0], operands[1]); - return; - } - - /* The only non-offsettable memory we handle is push. */ - if (push_operand (operands[0], VOIDmode)) - push = 1; - else - gcc_assert (!MEM_P (operands[0]) - || offsettable_memref_p (operands[0])); - - nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0])); - ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0])); - - /* When emitting push, take care for source operands on the stack. */ - if (push && MEM_P (operands[1]) - && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1])) - { - rtx src_base = XEXP (part[1][nparts - 1], 0); - - /* Compensate for the stack decrement by 4. */ - if (!TARGET_64BIT && nparts == 3 - && mode == XFmode && TARGET_128BIT_LONG_DOUBLE) - src_base = plus_constant (Pmode, src_base, 4); - - /* src_base refers to the stack pointer and is - automatically decreased by emitted push. */ - for (i = 0; i < nparts; i++) - part[1][i] = change_address (part[1][i], - GET_MODE (part[1][i]), src_base); - } - - /* We need to do copy in the right order in case an address register - of the source overlaps the destination. */ - if (REG_P (part[0][0]) && MEM_P (part[1][0])) - { - rtx tmp; - - for (i = 0; i < nparts; i++) - { - collisionparts[i] - = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0)); - if (collisionparts[i]) - collisions++; - } - - /* Collision in the middle part can be handled by reordering. */ - if (collisions == 1 && nparts == 3 && collisionparts [1]) - { - std::swap (part[0][1], part[0][2]); - std::swap (part[1][1], part[1][2]); - } - else if (collisions == 1 - && nparts == 4 - && (collisionparts [1] || collisionparts [2])) - { - if (collisionparts [1]) - { - std::swap (part[0][1], part[0][2]); - std::swap (part[1][1], part[1][2]); - } - else - { - std::swap (part[0][2], part[0][3]); - std::swap (part[1][2], part[1][3]); - } - } - - /* If there are more collisions, we can't handle it by reordering. - Do an lea to the last part and use only one colliding move. */ - else if (collisions > 1) - { - rtx base, addr; - - collisions = 1; - - base = part[0][nparts - 1]; - - /* Handle the case when the last part isn't valid for lea. - Happens in 64-bit mode storing the 12-byte XFmode. */ - if (GET_MODE (base) != Pmode) - base = gen_rtx_REG (Pmode, REGNO (base)); - - addr = XEXP (part[1][0], 0); - if (TARGET_TLS_DIRECT_SEG_REFS) - { - struct ix86_address parts; - int ok = ix86_decompose_address (addr, &parts); - gcc_assert (ok); - /* It is not valid to use %gs: or %fs: in lea. 
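
The collision handling in ix86_split_long_move exists because a destination register can overlap one of the source parts; copying in the wrong order would clobber data before it is read, so the parts are reordered (or an lea is emitted) first. A standalone scalar sketch of the hazard, with registers modelled as an array (illustrative only):

#include <assert.h>

int
main (void)
{
  /* Model registers r0..r3: the source value lives in parts {r1, r2}
     and the destination is the overlapping pair {r2, r3}.  Writing the
     low part first would clobber r2 before it is read as the source's
     high part, so the moves are emitted high-to-low instead.  */
  long reg[4] = { 0, 11, 22, 0 };

  reg[3] = reg[2];   /* high part first */
  reg[2] = reg[1];   /* then the low part */

  assert (reg[2] == 11 && reg[3] == 22);
  return 0;
}
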
*/ - gcc_assert (parts.seg == ADDR_SPACE_GENERIC); - } - emit_insn (gen_rtx_SET (base, addr)); - part[1][0] = replace_equiv_address (part[1][0], base); - for (i = 1; i < nparts; i++) - { - tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i); - part[1][i] = replace_equiv_address (part[1][i], tmp); - } - } - } - - if (push) - { - if (!TARGET_64BIT) - { - if (nparts == 3) - { - if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode) - emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4))); - emit_move_insn (part[0][2], part[1][2]); - } - else if (nparts == 4) - { - emit_move_insn (part[0][3], part[1][3]); - emit_move_insn (part[0][2], part[1][2]); - } - } - else - { - /* In 64bit mode we don't have 32bit push available. In case this is - register, it is OK - we will just use larger counterpart. We also - retype memory - these comes from attempt to avoid REX prefix on - moving of second half of TFmode value. */ - if (GET_MODE (part[1][1]) == SImode) - { - switch (GET_CODE (part[1][1])) - { - case MEM: - part[1][1] = adjust_address (part[1][1], DImode, 0); - break; - - case REG: - part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1])); - break; - - default: - gcc_unreachable (); - } - - if (GET_MODE (part[1][0]) == SImode) - part[1][0] = part[1][1]; - } - } - emit_move_insn (part[0][1], part[1][1]); - emit_move_insn (part[0][0], part[1][0]); - return; - } - - /* Choose correct order to not overwrite the source before it is copied. */ - if ((REG_P (part[0][0]) - && REG_P (part[1][1]) - && (REGNO (part[0][0]) == REGNO (part[1][1]) - || (nparts == 3 - && REGNO (part[0][0]) == REGNO (part[1][2])) - || (nparts == 4 - && REGNO (part[0][0]) == REGNO (part[1][3])))) - || (collisions > 0 - && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0)))) - { - for (i = 0, j = nparts - 1; i < nparts; i++, j--) - { - operands[2 + i] = part[0][j]; - operands[6 + i] = part[1][j]; - } - } - else - { - for (i = 0; i < nparts; i++) - { - operands[2 + i] = part[0][i]; - operands[6 + i] = part[1][i]; - } - } - - /* If optimizing for size, attempt to locally unCSE nonzero constants. */ - if (optimize_insn_for_size_p ()) - { - for (j = 0; j < nparts - 1; j++) - if (CONST_INT_P (operands[6 + j]) - && operands[6 + j] != const0_rtx - && REG_P (operands[2 + j])) - for (i = j; i < nparts - 1; i++) - if (CONST_INT_P (operands[7 + i]) - && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j])) - operands[7 + i] = operands[2 + j]; - } - - for (i = 0; i < nparts; i++) - emit_move_insn (operands[2 + i], operands[6 + i]); - - return; -} - -/* Helper function of ix86_split_ashl used to generate an SImode/DImode - left shift by a constant, either using a single shift or - a sequence of add instructions. */ - -static void -ix86_expand_ashl_const (rtx operand, int count, machine_mode mode) -{ - if (count == 1 - || (count * ix86_cost->add <= ix86_cost->shift_const - && !optimize_insn_for_size_p ())) - { - while (count-- > 0) - emit_insn (gen_add2_insn (operand, operand)); - } - else - { - rtx (*insn)(rtx, rtx, rtx); - - insn = mode == DImode ? 
gen_ashlsi3 : gen_ashldi3; - emit_insn (insn (operand, operand, GEN_INT (count))); - } -} - -void -ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode) -{ - rtx (*gen_ashl3)(rtx, rtx, rtx); - rtx (*gen_shld)(rtx, rtx, rtx); - int half_width = GET_MODE_BITSIZE (mode) >> 1; - machine_mode half_mode; - - rtx low[2], high[2]; - int count; - - if (CONST_INT_P (operands[2])) - { - split_double_mode (mode, operands, 2, low, high); - count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1); - - if (count >= half_width) - { - emit_move_insn (high[0], low[1]); - emit_move_insn (low[0], const0_rtx); - - if (count > half_width) - ix86_expand_ashl_const (high[0], count - half_width, mode); - } - else - { - gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld; - - if (!rtx_equal_p (operands[0], operands[1])) - emit_move_insn (operands[0], operands[1]); - - emit_insn (gen_shld (high[0], low[0], GEN_INT (count))); - ix86_expand_ashl_const (low[0], count, mode); - } - return; - } - - split_double_mode (mode, operands, 1, low, high); - half_mode = mode == DImode ? SImode : DImode; - - gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3; - - if (operands[1] == const1_rtx) - { - /* Assuming we've chosen a QImode capable registers, then 1 << N - can be done with two 32/64-bit shifts, no branches, no cmoves. */ - if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0])) - { - rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG); - - ix86_expand_clear (low[0]); - ix86_expand_clear (high[0]); - emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width))); - - d = gen_lowpart (QImode, low[0]); - d = gen_rtx_STRICT_LOW_PART (VOIDmode, d); - s = gen_rtx_EQ (QImode, flags, const0_rtx); - emit_insn (gen_rtx_SET (d, s)); - - d = gen_lowpart (QImode, high[0]); - d = gen_rtx_STRICT_LOW_PART (VOIDmode, d); - s = gen_rtx_NE (QImode, flags, const0_rtx); - emit_insn (gen_rtx_SET (d, s)); - } - - /* Otherwise, we can get the same results by manually performing - a bit extract operation on bit 5/6, and then performing the two - shifts. The two methods of getting 0/1 into low/high are exactly - the same size. Avoiding the shift in the bit extract case helps - pentium4 a bit; no one else seems to care much either way. */ - else - { - rtx (*gen_lshr3)(rtx, rtx, rtx); - rtx (*gen_and3)(rtx, rtx, rtx); - rtx (*gen_xor3)(rtx, rtx, rtx); - HOST_WIDE_INT bits; - rtx x; - - if (mode == DImode) - { - gen_lshr3 = gen_lshrsi3; - gen_and3 = gen_andsi3; - gen_xor3 = gen_xorsi3; - bits = 5; - } - else - { - gen_lshr3 = gen_lshrdi3; - gen_and3 = gen_anddi3; - gen_xor3 = gen_xordi3; - bits = 6; - } - - if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ()) - x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]); - else - x = gen_lowpart (half_mode, operands[2]); - emit_insn (gen_rtx_SET (high[0], x)); - - emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits))); - emit_insn (gen_and3 (high[0], high[0], const1_rtx)); - emit_move_insn (low[0], high[0]); - emit_insn (gen_xor3 (low[0], low[0], const1_rtx)); - } - - emit_insn (gen_ashl3 (low[0], low[0], operands[2])); - emit_insn (gen_ashl3 (high[0], high[0], operands[2])); - return; - } - - if (operands[1] == constm1_rtx) - { - /* For -1 << N, we can avoid the shld instruction, because we - know that we're shifting 0...31/63 ones into a -1. */ - emit_move_insn (low[0], constm1_rtx); - if (optimize_insn_for_size_p ()) - emit_move_insn (high[0], low[0]); - else - emit_move_insn (high[0], constm1_rtx); - } - else - { - gen_shld = mode == DImode ? 
gen_x86_shld : gen_x86_64_shld; - - if (!rtx_equal_p (operands[0], operands[1])) - emit_move_insn (operands[0], operands[1]); - - split_double_mode (mode, operands, 1, low, high); - emit_insn (gen_shld (high[0], low[0], operands[2])); - } - - emit_insn (gen_ashl3 (low[0], low[0], operands[2])); - - if (TARGET_CMOVE && scratch) - { - ix86_expand_clear (scratch); - emit_insn (gen_x86_shift_adj_1 - (half_mode, high[0], low[0], operands[2], scratch)); - } - else - emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2])); -} - -void -ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode) -{ - rtx (*gen_ashr3)(rtx, rtx, rtx) - = mode == DImode ? gen_ashrsi3 : gen_ashrdi3; - rtx (*gen_shrd)(rtx, rtx, rtx); - int half_width = GET_MODE_BITSIZE (mode) >> 1; - - rtx low[2], high[2]; - int count; - - if (CONST_INT_P (operands[2])) - { - split_double_mode (mode, operands, 2, low, high); - count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1); - - if (count == GET_MODE_BITSIZE (mode) - 1) - { - emit_move_insn (high[0], high[1]); - emit_insn (gen_ashr3 (high[0], high[0], - GEN_INT (half_width - 1))); - emit_move_insn (low[0], high[0]); - - } - else if (count >= half_width) - { - emit_move_insn (low[0], high[1]); - emit_move_insn (high[0], low[0]); - emit_insn (gen_ashr3 (high[0], high[0], - GEN_INT (half_width - 1))); - - if (count > half_width) - emit_insn (gen_ashr3 (low[0], low[0], - GEN_INT (count - half_width))); - } - else - { - gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; - - if (!rtx_equal_p (operands[0], operands[1])) - emit_move_insn (operands[0], operands[1]); - - emit_insn (gen_shrd (low[0], high[0], GEN_INT (count))); - emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count))); - } - } - else - { - machine_mode half_mode; - - gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; - - if (!rtx_equal_p (operands[0], operands[1])) - emit_move_insn (operands[0], operands[1]); - - split_double_mode (mode, operands, 1, low, high); - half_mode = mode == DImode ? SImode : DImode; - - emit_insn (gen_shrd (low[0], high[0], operands[2])); - emit_insn (gen_ashr3 (high[0], high[0], operands[2])); - - if (TARGET_CMOVE && scratch) - { - emit_move_insn (scratch, high[0]); - emit_insn (gen_ashr3 (scratch, scratch, - GEN_INT (half_width - 1))); - emit_insn (gen_x86_shift_adj_1 - (half_mode, low[0], high[0], operands[2], scratch)); - } - else - emit_insn (gen_x86_shift_adj_3 - (half_mode, low[0], high[0], operands[2])); - } -} - -void -ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode) -{ - rtx (*gen_lshr3)(rtx, rtx, rtx) - = mode == DImode ? gen_lshrsi3 : gen_lshrdi3; - rtx (*gen_shrd)(rtx, rtx, rtx); - int half_width = GET_MODE_BITSIZE (mode) >> 1; - - rtx low[2], high[2]; - int count; - - if (CONST_INT_P (operands[2])) - { - split_double_mode (mode, operands, 2, low, high); - count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1); - - if (count >= half_width) - { - emit_move_insn (low[0], high[1]); - ix86_expand_clear (high[0]); - - if (count > half_width) - emit_insn (gen_lshr3 (low[0], low[0], - GEN_INT (count - half_width))); - } - else - { - gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; - - if (!rtx_equal_p (operands[0], operands[1])) - emit_move_insn (operands[0], operands[1]); - - emit_insn (gen_shrd (low[0], high[0], GEN_INT (count))); - emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count))); - } - } - else - { - machine_mode half_mode; - - gen_shrd = mode == DImode ? 
gen_x86_shrd : gen_x86_64_shrd; - - if (!rtx_equal_p (operands[0], operands[1])) - emit_move_insn (operands[0], operands[1]); - - split_double_mode (mode, operands, 1, low, high); - half_mode = mode == DImode ? SImode : DImode; - - emit_insn (gen_shrd (low[0], high[0], operands[2])); - emit_insn (gen_lshr3 (high[0], high[0], operands[2])); - - if (TARGET_CMOVE && scratch) - { - ix86_expand_clear (scratch); - emit_insn (gen_x86_shift_adj_1 - (half_mode, low[0], high[0], operands[2], scratch)); - } - else - emit_insn (gen_x86_shift_adj_2 - (half_mode, low[0], high[0], operands[2])); - } -} - -/* Return mode for the memcpy/memset loop counter. Prefer SImode over - DImode for constant loop counts. */ - -static machine_mode -counter_mode (rtx count_exp) -{ - if (GET_MODE (count_exp) != VOIDmode) - return GET_MODE (count_exp); - if (!CONST_INT_P (count_exp)) - return Pmode; - if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff)) - return DImode; - return SImode; -} - -/* When ISSETMEM is FALSE, output simple loop to move memory pointer to SRCPTR - to DESTPTR via chunks of MODE unrolled UNROLL times, overall size is COUNT - specified in bytes. When ISSETMEM is TRUE, output the equivalent loop to set - memory by VALUE (supposed to be in MODE). - - The size is rounded down to whole number of chunk size moved at once. - SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. */ - - -static void -expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem, - rtx destptr, rtx srcptr, rtx value, - rtx count, machine_mode mode, int unroll, - int expected_size, bool issetmem) -{ - rtx_code_label *out_label, *top_label; - rtx iter, tmp; - machine_mode iter_mode = counter_mode (count); - int piece_size_n = GET_MODE_SIZE (mode) * unroll; - rtx piece_size = GEN_INT (piece_size_n); - rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1)); - rtx size; - int i; - - top_label = gen_label_rtx (); - out_label = gen_label_rtx (); - iter = gen_reg_rtx (iter_mode); - - size = expand_simple_binop (iter_mode, AND, count, piece_size_mask, - NULL, 1, OPTAB_DIRECT); - /* Those two should combine. */ - if (piece_size == const1_rtx) - { - emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode, - true, out_label); - predict_jump (REG_BR_PROB_BASE * 10 / 100); - } - emit_move_insn (iter, const0_rtx); - - emit_label (top_label); - - tmp = convert_modes (Pmode, iter_mode, iter, true); - - /* This assert could be relaxed - in this case we'll need to compute - smallest power of two, containing in PIECE_SIZE_N and pass it to - offset_address. */ - gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0); - destmem = offset_address (destmem, tmp, piece_size_n); - destmem = adjust_address (destmem, mode, 0); - - if (!issetmem) - { - srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n); - srcmem = adjust_address (srcmem, mode, 0); - - /* When unrolling for chips that reorder memory reads and writes, - we can save registers by using single temporary. - Also using 4 temporaries is overkill in 32bit mode. 
*/ - if (!TARGET_64BIT && 0) - { - for (i = 0; i < unroll; i++) - { - if (i) - { - destmem = adjust_address (copy_rtx (destmem), mode, - GET_MODE_SIZE (mode)); - srcmem = adjust_address (copy_rtx (srcmem), mode, - GET_MODE_SIZE (mode)); - } - emit_move_insn (destmem, srcmem); - } - } - else - { - rtx tmpreg[4]; - gcc_assert (unroll <= 4); - for (i = 0; i < unroll; i++) - { - tmpreg[i] = gen_reg_rtx (mode); - if (i) - srcmem = adjust_address (copy_rtx (srcmem), mode, - GET_MODE_SIZE (mode)); - emit_move_insn (tmpreg[i], srcmem); - } - for (i = 0; i < unroll; i++) - { - if (i) - destmem = adjust_address (copy_rtx (destmem), mode, - GET_MODE_SIZE (mode)); - emit_move_insn (destmem, tmpreg[i]); - } - } - } - else - for (i = 0; i < unroll; i++) - { - if (i) - destmem = adjust_address (copy_rtx (destmem), mode, - GET_MODE_SIZE (mode)); - emit_move_insn (destmem, value); - } - - tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter, - true, OPTAB_LIB_WIDEN); - if (tmp != iter) - emit_move_insn (iter, tmp); - - emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode, - true, top_label); - if (expected_size != -1) - { - expected_size /= GET_MODE_SIZE (mode) * unroll; - if (expected_size == 0) - predict_jump (0); - else if (expected_size > REG_BR_PROB_BASE) - predict_jump (REG_BR_PROB_BASE - 1); - else - predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) - / expected_size); - } - else - predict_jump (REG_BR_PROB_BASE * 80 / 100); - iter = ix86_zero_extend_to_Pmode (iter); - tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr, - true, OPTAB_LIB_WIDEN); - if (tmp != destptr) - emit_move_insn (destptr, tmp); - if (!issetmem) - { - tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr, - true, OPTAB_LIB_WIDEN); - if (tmp != srcptr) - emit_move_insn (srcptr, tmp); - } - emit_label (out_label); -} - -/* Divide COUNTREG by SCALE. */ -static rtx -scale_counter (rtx countreg, int scale) -{ - rtx sc; - - if (scale == 1) - return countreg; - if (CONST_INT_P (countreg)) - return GEN_INT (INTVAL (countreg) / scale); - gcc_assert (REG_P (countreg)); - - sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg, - GEN_INT (exact_log2 (scale)), - NULL, 1, OPTAB_DIRECT); - return sc; -} - -/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument. - When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored. - When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored. - For setmem case, VALUE is a promoted to a wider size ORIG_VALUE. - ORIG_VALUE is the original value passed to memset to fill the memory with. - Other arguments have same meaning as for previous function. */ - -static void -expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem, - rtx destptr, rtx srcptr, rtx value, rtx orig_value, - rtx count, - machine_mode mode, bool issetmem) -{ - rtx destexp; - rtx srcexp; - rtx countreg; - HOST_WIDE_INT rounded_count; - - /* If possible, it is shorter to use rep movs. - TODO: Maybe it is better to move this logic to decide_alg. 
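/* Illustrative sketch only (plain C, hypothetical names, not GCC RTL):
   the overall shape of the loop emitted by expand_set_or_cpymem_via_loop
   above.  The main loop copies COUNT rounded down to a whole number of
   PIECE_SIZE chunks; the remaining tail bytes are left to the epilogue
   expanders.  */

#include <stddef.h>
#include <string.h>

static void
copy_via_loop_sketch (unsigned char *dst, const unsigned char *src,
                      size_t count, size_t piece_size)
{
  /* PIECE_SIZE corresponds to GET_MODE_SIZE (mode) * unroll and is a
     power of two.  */
  size_t size = count & ~(piece_size - 1);

  for (size_t iter = 0; iter < size; iter += piece_size)
    memcpy (dst + iter, src + iter, piece_size);   /* one unrolled chunk */

  /* The expander then advances DST and SRC by SIZE, so the epilogue only
     has to deal with the remaining count & (piece_size - 1) bytes.  */
}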
*/ - if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3) - && (!issetmem || orig_value == const0_rtx)) - mode = SImode; - - if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode) - destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0); - - countreg = ix86_zero_extend_to_Pmode (scale_counter (count, - GET_MODE_SIZE (mode))); - if (mode != QImode) - { - destexp = gen_rtx_ASHIFT (Pmode, countreg, - GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); - destexp = gen_rtx_PLUS (Pmode, destexp, destptr); - } - else - destexp = gen_rtx_PLUS (Pmode, destptr, countreg); - if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count)) - { - rounded_count - = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode)); - destmem = shallow_copy_rtx (destmem); - set_mem_size (destmem, rounded_count); - } - else if (MEM_SIZE_KNOWN_P (destmem)) - clear_mem_size (destmem); - - if (issetmem) - { - value = force_reg (mode, gen_lowpart (mode, value)); - emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp)); - } - else - { - if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode) - srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0); - if (mode != QImode) - { - srcexp = gen_rtx_ASHIFT (Pmode, countreg, - GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); - srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr); - } - else - srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg); - if (CONST_INT_P (count)) - { - rounded_count - = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode)); - srcmem = shallow_copy_rtx (srcmem); - set_mem_size (srcmem, rounded_count); - } - else - { - if (MEM_SIZE_KNOWN_P (srcmem)) - clear_mem_size (srcmem); - } - emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg, - destexp, srcexp)); - } -} - -/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to - DESTMEM. - SRC is passed by pointer to be updated on return. - Return value is updated DST. */ -static rtx -emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr, - HOST_WIDE_INT size_to_move) -{ - rtx dst = destmem, src = *srcmem, tempreg; - enum insn_code code; - machine_mode move_mode; - int piece_size, i; - - /* Find the widest mode in which we could perform moves. - Start with the biggest power of 2 less than SIZE_TO_MOVE and half - it until move of such size is supported. */ - piece_size = 1 << floor_log2 (size_to_move); - while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode) - || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing) - { - gcc_assert (piece_size > 1); - piece_size >>= 1; - } - - /* Find the corresponding vector mode with the same size as MOVE_MODE. - MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */ - if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode)) - { - int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode); - if (!mode_for_vector (word_mode, nunits).exists (&move_mode) - || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing) - { - move_mode = word_mode; - piece_size = GET_MODE_SIZE (move_mode); - code = optab_handler (mov_optab, move_mode); - } - } - gcc_assert (code != CODE_FOR_nothing); - - dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0); - src = adjust_automodify_address_nv (src, move_mode, srcptr, 0); - - /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. 
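/* Minimal sketch (plain C, hypothetical helper names) of the chunk-size
   choice made by emit_memmov above: start from the largest power of two
   not exceeding SIZE and halve it until the target supports a move of
   that width; SIZE is assumed to be a multiple of the final chunk, as
   the gcc_assert just below requires.  */

#include <assert.h>
#include <stddef.h>
#include <string.h>

static size_t
widest_supported_piece (size_t size, size_t max_supported)
{
  size_t piece = 1;

  while (piece * 2 <= size)        /* 1 << floor_log2 (size) */
    piece *= 2;
  while (piece > max_supported)    /* halve until a move of this width exists */
    piece /= 2;
  return piece;
}

static void
memmov_sketch (unsigned char *dst, const unsigned char *src, size_t size)
{
  size_t piece = widest_supported_piece (size, sizeof (long long));

  assert (size % piece == 0);
  for (size_t i = 0; i < size; i += piece)
    memcpy (dst + i, src + i, piece);   /* one move per chunk */
}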
*/ - gcc_assert (size_to_move % piece_size == 0); - - for (i = 0; i < size_to_move; i += piece_size) - { - /* We move from memory to memory, so we'll need to do it via - a temporary register. */ - tempreg = gen_reg_rtx (move_mode); - emit_insn (GEN_FCN (code) (tempreg, src)); - emit_insn (GEN_FCN (code) (dst, tempreg)); - - emit_move_insn (destptr, - plus_constant (Pmode, copy_rtx (destptr), piece_size)); - emit_move_insn (srcptr, - plus_constant (Pmode, copy_rtx (srcptr), piece_size)); - - dst = adjust_automodify_address_nv (dst, move_mode, destptr, - piece_size); - src = adjust_automodify_address_nv (src, move_mode, srcptr, - piece_size); - } - - /* Update DST and SRC rtx. */ - *srcmem = src; - return dst; -} - -/* Helper function for the string operations below. Dest VARIABLE whether - it is aligned to VALUE bytes. If true, jump to the label. */ - -static rtx_code_label * -ix86_expand_aligntest (rtx variable, int value, bool epilogue) -{ - rtx_code_label *label = gen_label_rtx (); - rtx tmpcount = gen_reg_rtx (GET_MODE (variable)); - if (GET_MODE (variable) == DImode) - emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value))); - else - emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value))); - emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable), - 1, label); - if (epilogue) - predict_jump (REG_BR_PROB_BASE * 50 / 100); - else - predict_jump (REG_BR_PROB_BASE * 90 / 100); - return label; -} - - -/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */ - -static void -expand_cpymem_epilogue (rtx destmem, rtx srcmem, - rtx destptr, rtx srcptr, rtx count, int max_size) -{ - rtx src, dest; - if (CONST_INT_P (count)) - { - HOST_WIDE_INT countval = INTVAL (count); - HOST_WIDE_INT epilogue_size = countval % max_size; - int i; - - /* For now MAX_SIZE should be a power of 2. This assert could be - relaxed, but it'll require a bit more complicated epilogue - expanding. */ - gcc_assert ((max_size & (max_size - 1)) == 0); - for (i = max_size; i >= 1; i >>= 1) - { - if (epilogue_size & i) - destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i); - } - return; - } - if (max_size > 8) - { - count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1), - count, 1, OPTAB_DIRECT); - expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL, - count, QImode, 1, 4, false); - return; - } - - /* When there are stringops, we can cheaply increase dest and src pointers. - Otherwise we save code size by maintaining offset (zero is readily - available from preceding rep operation) and using x86 addressing modes. 
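/* Worked example (plain C sketch) for the constant-count epilogue above:
   the set bits of COUNT % MAX_SIZE select which power-of-two copies get
   emitted, largest first.  E.g. with max_size == 8 and count % 8 == 7,
   one 4-byte, one 2-byte and one 1-byte move are generated.  The helper
   name is made up for the illustration.  */

#include <stddef.h>
#include <string.h>

static void
cpymem_epilogue_sketch (unsigned char *dst, const unsigned char *src,
                        size_t count, size_t max_size)
{
  size_t epilogue = count % max_size;   /* MAX_SIZE is a power of two */
  size_t offset = 0;

  for (size_t i = max_size; i >= 1; i >>= 1)
    if (epilogue & i)
      {
        memcpy (dst + offset, src + offset, i);
        offset += i;
      }
}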
- */ - if (TARGET_SINGLE_STRINGOP) - { - if (max_size > 4) - { - rtx_code_label *label = ix86_expand_aligntest (count, 4, true); - src = change_address (srcmem, SImode, srcptr); - dest = change_address (destmem, SImode, destptr); - emit_insn (gen_strmov (destptr, dest, srcptr, src)); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (max_size > 2) - { - rtx_code_label *label = ix86_expand_aligntest (count, 2, true); - src = change_address (srcmem, HImode, srcptr); - dest = change_address (destmem, HImode, destptr); - emit_insn (gen_strmov (destptr, dest, srcptr, src)); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (max_size > 1) - { - rtx_code_label *label = ix86_expand_aligntest (count, 1, true); - src = change_address (srcmem, QImode, srcptr); - dest = change_address (destmem, QImode, destptr); - emit_insn (gen_strmov (destptr, dest, srcptr, src)); - emit_label (label); - LABEL_NUSES (label) = 1; - } - } - else - { - rtx offset = force_reg (Pmode, const0_rtx); - rtx tmp; - - if (max_size > 4) - { - rtx_code_label *label = ix86_expand_aligntest (count, 4, true); - src = change_address (srcmem, SImode, srcptr); - dest = change_address (destmem, SImode, destptr); - emit_move_insn (dest, src); - tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL, - true, OPTAB_LIB_WIDEN); - if (tmp != offset) - emit_move_insn (offset, tmp); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (max_size > 2) - { - rtx_code_label *label = ix86_expand_aligntest (count, 2, true); - tmp = gen_rtx_PLUS (Pmode, srcptr, offset); - src = change_address (srcmem, HImode, tmp); - tmp = gen_rtx_PLUS (Pmode, destptr, offset); - dest = change_address (destmem, HImode, tmp); - emit_move_insn (dest, src); - tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp, - true, OPTAB_LIB_WIDEN); - if (tmp != offset) - emit_move_insn (offset, tmp); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (max_size > 1) - { - rtx_code_label *label = ix86_expand_aligntest (count, 1, true); - tmp = gen_rtx_PLUS (Pmode, srcptr, offset); - src = change_address (srcmem, QImode, tmp); - tmp = gen_rtx_PLUS (Pmode, destptr, offset); - dest = change_address (destmem, QImode, tmp); - emit_move_insn (dest, src); - emit_label (label); - LABEL_NUSES (label) = 1; - } - } -} - -/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM - with value PROMOTED_VAL. - SRC is passed by pointer to be updated on return. - Return value is updated DST. */ -static rtx -emit_memset (rtx destmem, rtx destptr, rtx promoted_val, - HOST_WIDE_INT size_to_move) -{ - rtx dst = destmem; - enum insn_code code; - machine_mode move_mode; - int piece_size, i; - - /* Find the widest mode in which we could perform moves. - Start with the biggest power of 2 less than SIZE_TO_MOVE and half - it until move of such size is supported. */ - move_mode = GET_MODE (promoted_val); - if (move_mode == VOIDmode) - move_mode = QImode; - if (size_to_move < GET_MODE_SIZE (move_mode)) - { - unsigned int move_bits = size_to_move * BITS_PER_UNIT; - move_mode = int_mode_for_size (move_bits, 0).require (); - promoted_val = gen_lowpart (move_mode, promoted_val); - } - piece_size = GET_MODE_SIZE (move_mode); - code = optab_handler (mov_optab, move_mode); - gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX); - - dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0); - - /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. 
*/ - gcc_assert (size_to_move % piece_size == 0); - - for (i = 0; i < size_to_move; i += piece_size) - { - if (piece_size <= GET_MODE_SIZE (word_mode)) - { - emit_insn (gen_strset (destptr, dst, promoted_val)); - dst = adjust_automodify_address_nv (dst, move_mode, destptr, - piece_size); - continue; - } - - emit_insn (GEN_FCN (code) (dst, promoted_val)); - - emit_move_insn (destptr, - plus_constant (Pmode, copy_rtx (destptr), piece_size)); - - dst = adjust_automodify_address_nv (dst, move_mode, destptr, - piece_size); - } - - /* Update DST rtx. */ - return dst; -} -/* Output code to set at most count & (max_size - 1) bytes starting by DEST. */ -static void -expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value, - rtx count, int max_size) -{ - count = expand_simple_binop (counter_mode (count), AND, count, - GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT); - expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL, - gen_lowpart (QImode, value), count, QImode, - 1, max_size / 2, true); -} - -/* Output code to set at most count & (max_size - 1) bytes starting by DEST. */ -static void -expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value, - rtx count, int max_size) -{ - rtx dest; - - if (CONST_INT_P (count)) - { - HOST_WIDE_INT countval = INTVAL (count); - HOST_WIDE_INT epilogue_size = countval % max_size; - int i; - - /* For now MAX_SIZE should be a power of 2. This assert could be - relaxed, but it'll require a bit more complicated epilogue - expanding. */ - gcc_assert ((max_size & (max_size - 1)) == 0); - for (i = max_size; i >= 1; i >>= 1) - { - if (epilogue_size & i) - { - if (vec_value && i > GET_MODE_SIZE (GET_MODE (value))) - destmem = emit_memset (destmem, destptr, vec_value, i); - else - destmem = emit_memset (destmem, destptr, value, i); - } - } - return; - } - if (max_size > 32) - { - expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size); - return; - } - if (max_size > 16) - { - rtx_code_label *label = ix86_expand_aligntest (count, 16, true); - if (TARGET_64BIT) - { - dest = change_address (destmem, DImode, destptr); - emit_insn (gen_strset (destptr, dest, value)); - dest = adjust_automodify_address_nv (dest, DImode, destptr, 8); - emit_insn (gen_strset (destptr, dest, value)); - } - else - { - dest = change_address (destmem, SImode, destptr); - emit_insn (gen_strset (destptr, dest, value)); - dest = adjust_automodify_address_nv (dest, SImode, destptr, 4); - emit_insn (gen_strset (destptr, dest, value)); - dest = adjust_automodify_address_nv (dest, SImode, destptr, 8); - emit_insn (gen_strset (destptr, dest, value)); - dest = adjust_automodify_address_nv (dest, SImode, destptr, 12); - emit_insn (gen_strset (destptr, dest, value)); - } - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (max_size > 8) - { - rtx_code_label *label = ix86_expand_aligntest (count, 8, true); - if (TARGET_64BIT) - { - dest = change_address (destmem, DImode, destptr); - emit_insn (gen_strset (destptr, dest, value)); - } - else - { - dest = change_address (destmem, SImode, destptr); - emit_insn (gen_strset (destptr, dest, value)); - dest = adjust_automodify_address_nv (dest, SImode, destptr, 4); - emit_insn (gen_strset (destptr, dest, value)); - } - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (max_size > 4) - { - rtx_code_label *label = ix86_expand_aligntest (count, 4, true); - dest = change_address (destmem, SImode, destptr); - emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value))); - emit_label (label); - 
LABEL_NUSES (label) = 1; - } - if (max_size > 2) - { - rtx_code_label *label = ix86_expand_aligntest (count, 2, true); - dest = change_address (destmem, HImode, destptr); - emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value))); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (max_size > 1) - { - rtx_code_label *label = ix86_expand_aligntest (count, 1, true); - dest = change_address (destmem, QImode, destptr); - emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value))); - emit_label (label); - LABEL_NUSES (label) = 1; - } -} - -/* Adjust COUNTER by the VALUE. */ -static void -ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value) -{ - emit_insn (gen_add2_insn (countreg, GEN_INT (-value))); -} - -/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to - DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN. - Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are - ignored. - Return value is updated DESTMEM. */ - -static rtx -expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem, - rtx destptr, rtx srcptr, rtx value, - rtx vec_value, rtx count, int align, - int desired_alignment, bool issetmem) -{ - int i; - for (i = 1; i < desired_alignment; i <<= 1) - { - if (align <= i) - { - rtx_code_label *label = ix86_expand_aligntest (destptr, i, false); - if (issetmem) - { - if (vec_value && i > GET_MODE_SIZE (GET_MODE (value))) - destmem = emit_memset (destmem, destptr, vec_value, i); - else - destmem = emit_memset (destmem, destptr, value, i); - } - else - destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i); - ix86_adjust_counter (count, i); - emit_label (label); - LABEL_NUSES (label) = 1; - set_mem_align (destmem, i * 2 * BITS_PER_UNIT); - } - } - return destmem; -} - -/* Test if COUNT&SIZE is nonzero and if so, expand movme - or setmem sequence that is valid for SIZE..2*SIZE-1 bytes - and jump to DONE_LABEL. */ -static void -expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem, - rtx destptr, rtx srcptr, - rtx value, rtx vec_value, - rtx count, int size, - rtx done_label, bool issetmem) -{ - rtx_code_label *label = ix86_expand_aligntest (count, size, false); - machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk (); - rtx modesize; - int n; - - /* If we do not have vector value to copy, we must reduce size. */ - if (issetmem) - { - if (!vec_value) - { - if (GET_MODE (value) == VOIDmode && size > 8) - mode = Pmode; - else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value))) - mode = GET_MODE (value); - } - else - mode = GET_MODE (vec_value), value = vec_value; - } - else - { - /* Choose appropriate vector mode. */ - if (size >= 32) - mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode; - else if (size >= 16) - mode = TARGET_SSE ? 
V16QImode : DImode; - srcmem = change_address (srcmem, mode, srcptr); - } - destmem = change_address (destmem, mode, destptr); - modesize = GEN_INT (GET_MODE_SIZE (mode)); - gcc_assert (GET_MODE_SIZE (mode) <= size); - for (n = 0; n * GET_MODE_SIZE (mode) < size; n++) - { - if (issetmem) - emit_move_insn (destmem, gen_lowpart (mode, value)); - else - { - emit_move_insn (destmem, srcmem); - srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode)); - } - destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode)); - } - - destmem = offset_address (destmem, count, 1); - destmem = offset_address (destmem, GEN_INT (-2 * size), - GET_MODE_SIZE (mode)); - if (!issetmem) - { - srcmem = offset_address (srcmem, count, 1); - srcmem = offset_address (srcmem, GEN_INT (-2 * size), - GET_MODE_SIZE (mode)); - } - for (n = 0; n * GET_MODE_SIZE (mode) < size; n++) - { - if (issetmem) - emit_move_insn (destmem, gen_lowpart (mode, value)); - else - { - emit_move_insn (destmem, srcmem); - srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode)); - } - destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode)); - } - emit_jump_insn (gen_jump (done_label)); - emit_barrier (); - - emit_label (label); - LABEL_NUSES (label) = 1; -} - -/* Handle small memcpy (up to SIZE that is supposed to be small power of 2. - and get ready for the main memcpy loop by copying iniital DESIRED_ALIGN-ALIGN - bytes and last SIZE bytes adjusitng DESTPTR/SRCPTR/COUNT in a way we can - proceed with an loop copying SIZE bytes at once. Do moves in MODE. - DONE_LABEL is a label after the whole copying sequence. The label is created - on demand if *DONE_LABEL is NULL. - MIN_SIZE is minimal size of block copied. This value gets adjusted for new - bounds after the initial copies. - - DESTMEM/SRCMEM are memory expressions pointing to the copies block, - DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicate whether - we will dispatch to a library call for large blocks. - - In pseudocode we do: - - if (COUNT < SIZE) - { - Assume that SIZE is 4. Bigger sizes are handled analogously - if (COUNT & 4) - { - copy 4 bytes from SRCPTR to DESTPTR - copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4 - goto done_label - } - if (!COUNT) - goto done_label; - copy 1 byte from SRCPTR to DESTPTR - if (COUNT & 2) - { - copy 2 bytes from SRCPTR to DESTPTR - copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2 - } - } - else - { - copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR - copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE - - OLD_DESPTR = DESTPTR; - Align DESTPTR up to DESIRED_ALIGN - SRCPTR += DESTPTR - OLD_DESTPTR - COUNT -= DEST_PTR - OLD_DESTPTR - if (DYNAMIC_CHECK) - Round COUNT down to multiple of SIZE - << optional caller supplied zero size guard is here >> - << optional caller supplied dynamic check is here >> - << caller supplied main copy loop is here >> - } - done_label: - */ -static void -expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem, - rtx *destptr, rtx *srcptr, - machine_mode mode, - rtx value, rtx vec_value, - rtx *count, - rtx_code_label **done_label, - int size, - int desired_align, - int align, - unsigned HOST_WIDE_INT *min_size, - bool dynamic_check, - bool issetmem) -{ - rtx_code_label *loop_label = NULL, *label; - int n; - rtx modesize; - int prolog_size = 0; - rtx mode_value; - - /* Chose proper value to copy. 
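/* Standalone illustration (plain C, hypothetical name) of the idea behind
   expand_small_cpymem_or_setmem above: for a block with
   SIZE <= COUNT < 2 * SIZE, one copy of SIZE bytes from the front and one
   copy of SIZE bytes ending at the last byte cover the whole block, with
   no byte loop and no branching on the low bits of COUNT.  The two stores
   may overlap in the middle, which is harmless.  */

#include <assert.h>
#include <stddef.h>
#include <string.h>

static void
small_copy_sketch (unsigned char *dst, const unsigned char *src,
                   size_t count, size_t size)
{
  assert (size <= count && count < 2 * size);
  memcpy (dst, src, size);                                /* head */
  memcpy (dst + count - size, src + count - size, size);  /* tail */
}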
*/ - if (issetmem && VECTOR_MODE_P (mode)) - mode_value = vec_value; - else - mode_value = value; - gcc_assert (GET_MODE_SIZE (mode) <= size); - - /* See if block is big or small, handle small blocks. */ - if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size) - { - int size2 = size; - loop_label = gen_label_rtx (); - - if (!*done_label) - *done_label = gen_label_rtx (); - - emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count), - 1, loop_label); - size2 >>= 1; - - /* Handle sizes > 3. */ - for (;size2 > 2; size2 >>= 1) - expand_small_cpymem_or_setmem (destmem, srcmem, - *destptr, *srcptr, - value, vec_value, - *count, - size2, *done_label, issetmem); - /* Nothing to copy? Jump to DONE_LABEL if so */ - emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count), - 1, *done_label); - - /* Do a byte copy. */ - destmem = change_address (destmem, QImode, *destptr); - if (issetmem) - emit_move_insn (destmem, gen_lowpart (QImode, value)); - else - { - srcmem = change_address (srcmem, QImode, *srcptr); - emit_move_insn (destmem, srcmem); - } - - /* Handle sizes 2 and 3. */ - label = ix86_expand_aligntest (*count, 2, false); - destmem = change_address (destmem, HImode, *destptr); - destmem = offset_address (destmem, *count, 1); - destmem = offset_address (destmem, GEN_INT (-2), 2); - if (issetmem) - emit_move_insn (destmem, gen_lowpart (HImode, value)); - else - { - srcmem = change_address (srcmem, HImode, *srcptr); - srcmem = offset_address (srcmem, *count, 1); - srcmem = offset_address (srcmem, GEN_INT (-2), 2); - emit_move_insn (destmem, srcmem); - } - - emit_label (label); - LABEL_NUSES (label) = 1; - emit_jump_insn (gen_jump (*done_label)); - emit_barrier (); - } - else - gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size - || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size); - - /* Start memcpy for COUNT >= SIZE. */ - if (loop_label) - { - emit_label (loop_label); - LABEL_NUSES (loop_label) = 1; - } - - /* Copy first desired_align bytes. */ - if (!issetmem) - srcmem = change_address (srcmem, mode, *srcptr); - destmem = change_address (destmem, mode, *destptr); - modesize = GEN_INT (GET_MODE_SIZE (mode)); - for (n = 0; prolog_size < desired_align - align; n++) - { - if (issetmem) - emit_move_insn (destmem, mode_value); - else - { - emit_move_insn (destmem, srcmem); - srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode)); - } - destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode)); - prolog_size += GET_MODE_SIZE (mode); - } - - - /* Copy last SIZE bytes. */ - destmem = offset_address (destmem, *count, 1); - destmem = offset_address (destmem, - GEN_INT (-size - prolog_size), - 1); - if (issetmem) - emit_move_insn (destmem, mode_value); - else - { - srcmem = offset_address (srcmem, *count, 1); - srcmem = offset_address (srcmem, - GEN_INT (-size - prolog_size), - 1); - emit_move_insn (destmem, srcmem); - } - for (n = 1; n * GET_MODE_SIZE (mode) < size; n++) - { - destmem = offset_address (destmem, modesize, 1); - if (issetmem) - emit_move_insn (destmem, mode_value); - else - { - srcmem = offset_address (srcmem, modesize, 1); - emit_move_insn (destmem, srcmem); - } - } - - /* Align destination. */ - if (desired_align > 1 && desired_align > align) - { - rtx saveddest = *destptr; - - gcc_assert (desired_align <= size); - /* Align destptr up, place it to new register. 
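/* Arithmetic sketch (plain C, hypothetical helper) of the alignment step
   performed just below: DST is advanced past the PROLOG_SIZE bytes that
   the prologue already stored and rounded down to the desired
   power-of-two alignment, and SRC and COUNT are adjusted by the number
   of bytes that were skipped.  This assumes, as the expander guarantees,
   that the prologue covered at least the destination's misalignment.  */

#include <stddef.h>
#include <stdint.h>

static void
align_dst_sketch (unsigned char **dst, const unsigned char **src,
                  size_t *count, size_t prolog_size, size_t desired_align)
{
  uintptr_t old_dst = (uintptr_t) *dst;
  uintptr_t new_dst = (old_dst + prolog_size) & -(uintptr_t) desired_align;
  size_t skipped = new_dst - old_dst;   /* bytes already handled */

  *dst = (unsigned char *) new_dst;
  *src += skipped;
  *count -= skipped;
}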
*/ - *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr, - GEN_INT (prolog_size), - NULL_RTX, 1, OPTAB_DIRECT); - if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest)) - REG_POINTER (*destptr) = 1; - *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr, - GEN_INT (-desired_align), - *destptr, 1, OPTAB_DIRECT); - /* See how many bytes we skipped. */ - saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest, - *destptr, - saveddest, 1, OPTAB_DIRECT); - /* Adjust srcptr and count. */ - if (!issetmem) - *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr, - saveddest, *srcptr, 1, OPTAB_DIRECT); - *count = expand_simple_binop (GET_MODE (*count), PLUS, *count, - saveddest, *count, 1, OPTAB_DIRECT); - /* We copied at most size + prolog_size. */ - if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size)) - *min_size - = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size); - else - *min_size = 0; - - /* Our loops always round down the block size, but for dispatch to - library we need precise value. */ - if (dynamic_check) - *count = expand_simple_binop (GET_MODE (*count), AND, *count, - GEN_INT (-size), *count, 1, OPTAB_DIRECT); - } - else - { - gcc_assert (prolog_size == 0); - /* Decrease count, so we won't end up copying last word twice. */ - if (!CONST_INT_P (*count)) - *count = expand_simple_binop (GET_MODE (*count), PLUS, *count, - constm1_rtx, *count, 1, OPTAB_DIRECT); - else - *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1, - (unsigned HOST_WIDE_INT)size)); - if (*min_size) - *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size); - } -} - - -/* This function is like the previous one, except here we know how many bytes - need to be copied. That allows us to update alignment not only of DST, which - is returned, but also of SRC, which is passed as a pointer for that - reason. 
*/ -static rtx -expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg, - rtx srcreg, rtx value, rtx vec_value, - int desired_align, int align_bytes, - bool issetmem) -{ - rtx src = NULL; - rtx orig_dst = dst; - rtx orig_src = NULL; - int piece_size = 1; - int copied_bytes = 0; - - if (!issetmem) - { - gcc_assert (srcp != NULL); - src = *srcp; - orig_src = src; - } - - for (piece_size = 1; - piece_size <= desired_align && copied_bytes < align_bytes; - piece_size <<= 1) - { - if (align_bytes & piece_size) - { - if (issetmem) - { - if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value))) - dst = emit_memset (dst, destreg, vec_value, piece_size); - else - dst = emit_memset (dst, destreg, value, piece_size); - } - else - dst = emit_memmov (dst, &src, destreg, srcreg, piece_size); - copied_bytes += piece_size; - } - } - if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT) - set_mem_align (dst, desired_align * BITS_PER_UNIT); - if (MEM_SIZE_KNOWN_P (orig_dst)) - set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes); - - if (!issetmem) - { - int src_align_bytes = get_mem_align_offset (src, desired_align - * BITS_PER_UNIT); - if (src_align_bytes >= 0) - src_align_bytes = desired_align - src_align_bytes; - if (src_align_bytes >= 0) - { - unsigned int src_align; - for (src_align = desired_align; src_align >= 2; src_align >>= 1) - { - if ((src_align_bytes & (src_align - 1)) - == (align_bytes & (src_align - 1))) - break; - } - if (src_align > (unsigned int) desired_align) - src_align = desired_align; - if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT) - set_mem_align (src, src_align * BITS_PER_UNIT); - } - if (MEM_SIZE_KNOWN_P (orig_src)) - set_mem_size (src, MEM_SIZE (orig_src) - align_bytes); - *srcp = src; - } - - return dst; -} - -/* Return true if ALG can be used in current context. - Assume we expand memset if MEMSET is true. */ -static bool -alg_usable_p (enum stringop_alg alg, bool memset, bool have_as) -{ - if (alg == no_stringop) - return false; - if (alg == vector_loop) - return TARGET_SSE || TARGET_AVX; - /* Algorithms using the rep prefix want at least edi and ecx; - additionally, memset wants eax and memcpy wants esi. Don't - consider such algorithms if the user has appropriated those - registers for their own purposes, or if we have a non-default - address space, since some string insns cannot override the segment. */ - if (alg == rep_prefix_1_byte - || alg == rep_prefix_4_byte - || alg == rep_prefix_8_byte) - { - if (have_as) - return false; - if (fixed_regs[CX_REG] - || fixed_regs[DI_REG] - || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG])) - return false; - } - return true; -} - -/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */ -static enum stringop_alg -decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, - unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size, - bool memset, bool zero_memset, bool have_as, - int *dynamic_check, bool *noalign, bool recur) -{ - const struct stringop_algs *algs; - bool optimize_for_speed; - int max = 0; - const struct processor_costs *cost; - int i; - bool any_alg_usable_p = false; - - *noalign = false; - *dynamic_check = -1; - - /* Even if the string operation call is cold, we still might spend a lot - of time processing large blocks. 
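/* Toy sketch (plain C, made-up types) of the size-table lookup at the
   core of decide_alg here: each per-CPU cost table lists (max, alg)
   pairs, and the first usable entry whose MAX covers the expected block
   size wins, with -1 meaning "no upper bound" and UNKNOWN_SIZE as the
   fallback.  The user-override, -Os and dynamic-check paths handled by
   the real function are left out of this sketch.  */

enum toy_alg { TOY_LIBCALL, TOY_LOOP, TOY_REP_BYTE, TOY_REP_WORD };

struct toy_entry { long max; enum toy_alg alg; };

static enum toy_alg
toy_decide_alg (const struct toy_entry *table, int n,
                enum toy_alg unknown_size, long expected_size)
{
  if (expected_size < 0)
    return unknown_size;              /* size not known at compile time */

  for (int i = 0; i < n; i++)
    if (table[i].max == -1 || table[i].max >= expected_size)
      return table[i].alg;

  return unknown_size;
}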
*/ - if (optimize_function_for_size_p (cfun) - || (optimize_insn_for_size_p () - && (max_size < 256 - || (expected_size != -1 && expected_size < 256)))) - optimize_for_speed = false; - else - optimize_for_speed = true; - - cost = optimize_for_speed ? ix86_cost : &ix86_size_cost; - if (memset) - algs = &cost->memset[TARGET_64BIT != 0]; - else - algs = &cost->memcpy[TARGET_64BIT != 0]; - - /* See maximal size for user defined algorithm. */ - for (i = 0; i < MAX_STRINGOP_ALGS; i++) - { - enum stringop_alg candidate = algs->size[i].alg; - bool usable = alg_usable_p (candidate, memset, have_as); - any_alg_usable_p |= usable; - - if (candidate != libcall && candidate && usable) - max = algs->size[i].max; - } - - /* If expected size is not known but max size is small enough - so inline version is a win, set expected size into - the range. */ - if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1) - && expected_size == -1) - expected_size = min_size / 2 + max_size / 2; - - /* If user specified the algorithm, honor it if possible. */ - if (ix86_stringop_alg != no_stringop - && alg_usable_p (ix86_stringop_alg, memset, have_as)) - return ix86_stringop_alg; - /* rep; movq or rep; movl is the smallest variant. */ - else if (!optimize_for_speed) - { - *noalign = true; - if (!count || (count & 3) || (memset && !zero_memset)) - return alg_usable_p (rep_prefix_1_byte, memset, have_as) - ? rep_prefix_1_byte : loop_1_byte; - else - return alg_usable_p (rep_prefix_4_byte, memset, have_as) - ? rep_prefix_4_byte : loop; - } - /* Very tiny blocks are best handled via the loop, REP is expensive to - setup. */ - else if (expected_size != -1 && expected_size < 4) - return loop_1_byte; - else if (expected_size != -1) - { - enum stringop_alg alg = libcall; - bool alg_noalign = false; - for (i = 0; i < MAX_STRINGOP_ALGS; i++) - { - /* We get here if the algorithms that were not libcall-based - were rep-prefix based and we are unable to use rep prefixes - based on global register usage. Break out of the loop and - use the heuristic below. */ - if (algs->size[i].max == 0) - break; - if (algs->size[i].max >= expected_size || algs->size[i].max == -1) - { - enum stringop_alg candidate = algs->size[i].alg; - - if (candidate != libcall - && alg_usable_p (candidate, memset, have_as)) - { - alg = candidate; - alg_noalign = algs->size[i].noalign; - } - /* Honor TARGET_INLINE_ALL_STRINGOPS by picking - last non-libcall inline algorithm. */ - if (TARGET_INLINE_ALL_STRINGOPS) - { - /* When the current size is best to be copied by a libcall, - but we are still forced to inline, run the heuristic below - that will pick code for medium sized blocks. */ - if (alg != libcall) - { - *noalign = alg_noalign; - return alg; - } - else if (!any_alg_usable_p) - break; - } - else if (alg_usable_p (candidate, memset, have_as)) - { - *noalign = algs->size[i].noalign; - return candidate; - } - } - } - } - /* When asked to inline the call anyway, try to pick meaningful choice. - We look for maximal size of block that is faster to copy by hand and - take blocks of at most of that size guessing that average size will - be roughly half of the block. - - If this turns out to be bad, we might simply specify the preferred - choice in ix86_costs. */ - if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY) - && (algs->unknown_size == libcall - || !alg_usable_p (algs->unknown_size, memset, have_as))) - { - enum stringop_alg alg; - HOST_WIDE_INT new_expected_size = (max > 0 ? 
max : 4096) / 2; - - /* If there aren't any usable algorithms or if recursing already, - then recursing on smaller sizes or same size isn't going to - find anything. Just return the simple byte-at-a-time copy loop. */ - if (!any_alg_usable_p || recur) - { - /* Pick something reasonable. */ - if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur) - *dynamic_check = 128; - return loop_1_byte; - } - alg = decide_alg (count, new_expected_size, min_size, max_size, memset, - zero_memset, have_as, dynamic_check, noalign, true); - gcc_assert (*dynamic_check == -1); - if (TARGET_INLINE_STRINGOPS_DYNAMICALLY) - *dynamic_check = max; - else - gcc_assert (alg != libcall); - return alg; - } - return (alg_usable_p (algs->unknown_size, memset, have_as) - ? algs->unknown_size : libcall); -} - -/* Decide on alignment. We know that the operand is already aligned to ALIGN - (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */ -static int -decide_alignment (int align, - enum stringop_alg alg, - int expected_size, - machine_mode move_mode) -{ - int desired_align = 0; - - gcc_assert (alg != no_stringop); - - if (alg == libcall) - return 0; - if (move_mode == VOIDmode) - return 0; - - desired_align = GET_MODE_SIZE (move_mode); - /* PentiumPro has special logic triggering for 8 byte aligned blocks. - copying whole cacheline at once. */ - if (TARGET_PENTIUMPRO - && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte)) - desired_align = 8; - - if (optimize_size) - desired_align = 1; - if (desired_align < align) - desired_align = align; - if (expected_size != -1 && expected_size < 4) - desired_align = align; - - return desired_align; -} - - -/* Helper function for memcpy. For QImode value 0xXY produce - 0xXYXYXYXY of wide specified by MODE. This is essentially - a * 0x10101010, but we can do slightly better than - synth_mult by unwinding the sequence by hand on CPUs with - slow multiply. */ -static rtx -promote_duplicated_reg (machine_mode mode, rtx val) -{ - machine_mode valmode = GET_MODE (val); - rtx tmp; - int nops = mode == DImode ? 3 : 2; - - gcc_assert (mode == SImode || mode == DImode || val == const0_rtx); - if (val == const0_rtx) - return copy_to_mode_reg (mode, CONST0_RTX (mode)); - if (CONST_INT_P (val)) - { - HOST_WIDE_INT v = INTVAL (val) & 255; - - v |= v << 8; - v |= v << 16; - if (mode == DImode) - v |= (v << 16) << 16; - return copy_to_mode_reg (mode, gen_int_mode (v, mode)); - } - - if (valmode == VOIDmode) - valmode = QImode; - if (valmode != QImode) - val = gen_lowpart (QImode, val); - if (mode == QImode) - return val; - if (!TARGET_PARTIAL_REG_STALL) - nops--; - if (ix86_cost->mult_init[mode == DImode ? 3 : 2] - + ix86_cost->mult_bit * (mode == DImode ? 
8 : 4) - <= (ix86_cost->shift_const + ix86_cost->add) * nops - + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0))) - { - rtx reg = convert_modes (mode, QImode, val, true); - tmp = promote_duplicated_reg (mode, const1_rtx); - return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1, - OPTAB_DIRECT); - } - else - { - rtx reg = convert_modes (mode, QImode, val, true); - - if (!TARGET_PARTIAL_REG_STALL) - if (mode == SImode) - emit_insn (gen_insvsi_1 (reg, reg)); - else - emit_insn (gen_insvdi_1 (reg, reg)); - else - { - tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8), - NULL, 1, OPTAB_DIRECT); - reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, - OPTAB_DIRECT); - } - tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16), - NULL, 1, OPTAB_DIRECT); - reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); - if (mode == SImode) - return reg; - tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32), - NULL, 1, OPTAB_DIRECT); - reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); - return reg; - } -} - -/* Duplicate value VAL using promote_duplicated_reg into maximal size that will - be needed by main loop copying SIZE_NEEDED chunks and prologue getting - alignment from ALIGN to DESIRED_ALIGN. */ -static rtx -promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, - int align) -{ - rtx promoted_val; - - if (TARGET_64BIT - && (size_needed > 4 || (desired_align > align && desired_align > 4))) - promoted_val = promote_duplicated_reg (DImode, val); - else if (size_needed > 2 || (desired_align > align && desired_align > 2)) - promoted_val = promote_duplicated_reg (SImode, val); - else if (size_needed > 1 || (desired_align > align && desired_align > 1)) - promoted_val = promote_duplicated_reg (HImode, val); - else - promoted_val = val; - - return promoted_val; -} - -/* Copy the address to a Pmode register. This is used for x32 to - truncate DImode TLS address to a SImode register. */ - -static rtx -ix86_copy_addr_to_reg (rtx addr) -{ - rtx reg; - if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode) - { - reg = copy_addr_to_reg (addr); - REG_POINTER (reg) = 1; - return reg; - } - else - { - gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode); - reg = copy_to_mode_reg (DImode, addr); - REG_POINTER (reg) = 1; - return gen_rtx_SUBREG (SImode, reg, 0); - } -} - -/* Expand string move (memcpy) ot store (memset) operation. Use i386 string - operations when profitable. The code depends upon architecture, block size - and alignment, but always has one of the following overall structures: - - Aligned move sequence: - - 1) Prologue guard: Conditional that jumps up to epilogues for small - blocks that can be handled by epilogue alone. This is faster - but also needed for correctness, since prologue assume the block - is larger than the desired alignment. - - Optional dynamic check for size and libcall for large - blocks is emitted here too, with -minline-stringops-dynamically. - - 2) Prologue: copy first few bytes in order to get destination - aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less - than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be - copied. We emit either a jump tree on power of two sized - blocks, or a byte loop. - - 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks - with specified algorithm. - - 4) Epilogue: code copying tail of the block that is too small to be - handled by main body (or up to size guarded by prologue guard). 
- - Misaligned move sequence - - 1) missaligned move prologue/epilogue containing: - a) Prologue handling small memory blocks and jumping to done_label - (skipped if blocks are known to be large enough) - b) Signle move copying first DESIRED_ALIGN-ALIGN bytes if alignment is - needed by single possibly misaligned move - (skipped if alignment is not needed) - c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves - - 2) Zero size guard dispatching to done_label, if needed - - 3) dispatch to library call, if needed, - - 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks - with specified algorithm. */ -bool -ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp, - rtx align_exp, rtx expected_align_exp, - rtx expected_size_exp, rtx min_size_exp, - rtx max_size_exp, rtx probable_max_size_exp, - bool issetmem) -{ - rtx destreg; - rtx srcreg = NULL; - rtx_code_label *label = NULL; - rtx tmp; - rtx_code_label *jump_around_label = NULL; - HOST_WIDE_INT align = 1; - unsigned HOST_WIDE_INT count = 0; - HOST_WIDE_INT expected_size = -1; - int size_needed = 0, epilogue_size_needed; - int desired_align = 0, align_bytes = 0; - enum stringop_alg alg; - rtx promoted_val = NULL; - rtx vec_promoted_val = NULL; - bool force_loopy_epilogue = false; - int dynamic_check; - bool need_zero_guard = false; - bool noalign; - machine_mode move_mode = VOIDmode; - machine_mode wider_mode; - int unroll_factor = 1; - /* TODO: Once value ranges are available, fill in proper data. */ - unsigned HOST_WIDE_INT min_size = 0; - unsigned HOST_WIDE_INT max_size = -1; - unsigned HOST_WIDE_INT probable_max_size = -1; - bool misaligned_prologue_used = false; - bool have_as; - - if (CONST_INT_P (align_exp)) - align = INTVAL (align_exp); - /* i386 can do misaligned access on reasonably increased cost. */ - if (CONST_INT_P (expected_align_exp) - && INTVAL (expected_align_exp) > align) - align = INTVAL (expected_align_exp); - /* ALIGN is the minimum of destination and source alignment, but we care here - just about destination alignment. */ - else if (!issetmem - && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT) - align = MEM_ALIGN (dst) / BITS_PER_UNIT; - - if (CONST_INT_P (count_exp)) - { - min_size = max_size = probable_max_size = count = expected_size - = INTVAL (count_exp); - /* When COUNT is 0, there is nothing to do. */ - if (!count) - return true; - } - else - { - if (min_size_exp) - min_size = INTVAL (min_size_exp); - if (max_size_exp) - max_size = INTVAL (max_size_exp); - if (probable_max_size_exp) - probable_max_size = INTVAL (probable_max_size_exp); - if (CONST_INT_P (expected_size_exp)) - expected_size = INTVAL (expected_size_exp); - } - - /* Make sure we don't need to care about overflow later on. */ - if (count > (HOST_WIDE_INT_1U << 30)) - return false; - - have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst)); - if (!issetmem) - have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)); - - /* Step 0: Decide on preferred algorithm, desired alignment and - size of chunks to be copied by main loop. 
*/ - alg = decide_alg (count, expected_size, min_size, probable_max_size, - issetmem, - issetmem && val_exp == const0_rtx, have_as, - &dynamic_check, &noalign, false); - - if (dump_file) - fprintf (dump_file, "Selected stringop expansion strategy: %s\n", - stringop_alg_names[alg]); - - if (alg == libcall) - return false; - gcc_assert (alg != no_stringop); - - /* For now vector-version of memset is generated only for memory zeroing, as - creating of promoted vector value is very cheap in this case. */ - if (issetmem && alg == vector_loop && val_exp != const0_rtx) - alg = unrolled_loop; - - if (!count) - count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp); - destreg = ix86_copy_addr_to_reg (XEXP (dst, 0)); - if (!issetmem) - srcreg = ix86_copy_addr_to_reg (XEXP (src, 0)); - - unroll_factor = 1; - move_mode = word_mode; - switch (alg) - { - case libcall: - case no_stringop: - case last_alg: - gcc_unreachable (); - case loop_1_byte: - need_zero_guard = true; - move_mode = QImode; - break; - case loop: - need_zero_guard = true; - break; - case unrolled_loop: - need_zero_guard = true; - unroll_factor = (TARGET_64BIT ? 4 : 2); - break; - case vector_loop: - need_zero_guard = true; - unroll_factor = 4; - /* Find the widest supported mode. */ - move_mode = word_mode; - while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode) - && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing) - move_mode = wider_mode; - - if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128) - move_mode = TImode; - - /* Find the corresponding vector mode with the same size as MOVE_MODE. - MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */ - if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode)) - { - int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode); - if (!mode_for_vector (word_mode, nunits).exists (&move_mode) - || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing) - move_mode = word_mode; - } - gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing); - break; - case rep_prefix_8_byte: - move_mode = DImode; - break; - case rep_prefix_4_byte: - move_mode = SImode; - break; - case rep_prefix_1_byte: - move_mode = QImode; - break; - } - size_needed = GET_MODE_SIZE (move_mode) * unroll_factor; - epilogue_size_needed = size_needed; - - /* If we are going to call any library calls conditionally, make sure any - pending stack adjustment happen before the first conditional branch, - otherwise they will be emitted before the library call only and won't - happen from the other branches. */ - if (dynamic_check != -1) - do_pending_stack_adjust (); - - desired_align = decide_alignment (align, alg, expected_size, move_mode); - if (!TARGET_ALIGN_STRINGOPS || noalign) - align = desired_align; - - /* Step 1: Prologue guard. */ - - /* Alignment code needs count to be in register. */ - if (CONST_INT_P (count_exp) && desired_align > align) - { - if (INTVAL (count_exp) > desired_align - && INTVAL (count_exp) > size_needed) - { - align_bytes - = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT); - if (align_bytes <= 0) - align_bytes = 0; - else - align_bytes = desired_align - align_bytes; - } - if (align_bytes == 0) - count_exp = force_reg (counter_mode (count_exp), count_exp); - } - gcc_assert (desired_align >= 1 && align >= 1); - - /* Misaligned move sequences handle both prologue and epilogue at once. - Default code generation results in a smaller code for large alignments - and also avoids redundant job when sizes are known precisely. 
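/* Worked example (plain C) of the byte replication that
   promote_duplicated_reg above performs and that the memset value
   promotion just below relies on: a QImode value 0xXY becomes
   0xXYXYXYXY (0xXYXYXYXYXYXYXYXY for DImode), either by multiplying by
   0x0101...01 or, on CPUs with slow multiply, by the shift/or sequence
   shown here.  */

#include <stdint.h>

static uint64_t
promote_byte_sketch (uint8_t byte)
{
  uint64_t v = byte;

  v |= v << 8;    /* 0x00XY -> 0xXYXY */
  v |= v << 16;   /* 0xXYXY -> 0xXYXYXYXY */
  v |= v << 32;   /* -> 0xXYXYXYXYXYXYXYXY */
  return v;
}

/* E.g. promote_byte_sketch (0xAB) == 0xABABABABABABABAB, the same value
   as 0xAB * 0x0101010101010101.  */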
*/ - misaligned_prologue_used - = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES - && MAX (desired_align, epilogue_size_needed) <= 32 - && desired_align <= epilogue_size_needed - && ((desired_align > align && !align_bytes) - || (!count && epilogue_size_needed > 1))); - - /* Do the cheap promotion to allow better CSE across the - main loop and epilogue (ie one load of the big constant in the - front of all code. - For now the misaligned move sequences do not have fast path - without broadcasting. */ - if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used))) - { - if (alg == vector_loop) - { - gcc_assert (val_exp == const0_rtx); - vec_promoted_val = promote_duplicated_reg (move_mode, val_exp); - promoted_val = promote_duplicated_reg_to_size (val_exp, - GET_MODE_SIZE (word_mode), - desired_align, align); - } - else - { - promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed, - desired_align, align); - } - } - /* Misaligned move sequences handles both prologues and epilogues at once. - Default code generation results in smaller code for large alignments and - also avoids redundant job when sizes are known precisely. */ - if (misaligned_prologue_used) - { - /* Misaligned move prologue handled small blocks by itself. */ - expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves - (dst, src, &destreg, &srcreg, - move_mode, promoted_val, vec_promoted_val, - &count_exp, - &jump_around_label, - desired_align < align - ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed, - desired_align, align, &min_size, dynamic_check, issetmem); - if (!issetmem) - src = change_address (src, BLKmode, srcreg); - dst = change_address (dst, BLKmode, destreg); - set_mem_align (dst, desired_align * BITS_PER_UNIT); - epilogue_size_needed = 0; - if (need_zero_guard - && min_size < (unsigned HOST_WIDE_INT) size_needed) - { - /* It is possible that we copied enough so the main loop will not - execute. */ - gcc_assert (size_needed > 1); - if (jump_around_label == NULL_RTX) - jump_around_label = gen_label_rtx (); - emit_cmp_and_jump_insns (count_exp, - GEN_INT (size_needed), - LTU, 0, counter_mode (count_exp), 1, jump_around_label); - if (expected_size == -1 - || expected_size < (desired_align - align) / 2 + size_needed) - predict_jump (REG_BR_PROB_BASE * 20 / 100); - else - predict_jump (REG_BR_PROB_BASE * 60 / 100); - } - } - /* Ensure that alignment prologue won't copy past end of block. */ - else if (size_needed > 1 || (desired_align > 1 && desired_align > align)) - { - epilogue_size_needed = MAX (size_needed - 1, desired_align - align); - /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes. - Make sure it is power of 2. */ - epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1); - - /* To improve performance of small blocks, we jump around the VAL - promoting mode. This mean that if the promoted VAL is not constant, - we might not use it in the epilogue and have to use byte - loop variant. */ - if (issetmem && epilogue_size_needed > 2 && !promoted_val) - force_loopy_epilogue = true; - if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed) - || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed) - { - /* If main algorithm works on QImode, no epilogue is needed. - For small sizes just don't align anything. 
*/ - if (size_needed == 1) - desired_align = align; - else - goto epilogue; - } - else if (!count - && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed) - { - label = gen_label_rtx (); - emit_cmp_and_jump_insns (count_exp, - GEN_INT (epilogue_size_needed), - LTU, 0, counter_mode (count_exp), 1, label); - if (expected_size == -1 || expected_size < epilogue_size_needed) - predict_jump (REG_BR_PROB_BASE * 60 / 100); - else - predict_jump (REG_BR_PROB_BASE * 20 / 100); - } - } - - /* Emit code to decide on runtime whether library call or inline should be - used. */ - if (dynamic_check != -1) - { - if (!issetmem && CONST_INT_P (count_exp)) - { - if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check) - { - emit_block_copy_via_libcall (dst, src, count_exp); - count_exp = const0_rtx; - goto epilogue; - } - } - else - { - rtx_code_label *hot_label = gen_label_rtx (); - if (jump_around_label == NULL_RTX) - jump_around_label = gen_label_rtx (); - emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1), - LEU, 0, counter_mode (count_exp), - 1, hot_label); - predict_jump (REG_BR_PROB_BASE * 90 / 100); - if (issetmem) - set_storage_via_libcall (dst, count_exp, val_exp); - else - emit_block_copy_via_libcall (dst, src, count_exp); - emit_jump (jump_around_label); - emit_label (hot_label); - } - } - - /* Step 2: Alignment prologue. */ - /* Do the expensive promotion once we branched off the small blocks. */ - if (issetmem && !promoted_val) - promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed, - desired_align, align); - - if (desired_align > align && !misaligned_prologue_used) - { - if (align_bytes == 0) - { - /* Except for the first move in prologue, we no longer know - constant offset in aliasing info. It don't seems to worth - the pain to maintain it for the first move, so throw away - the info early. */ - dst = change_address (dst, BLKmode, destreg); - if (!issetmem) - src = change_address (src, BLKmode, srcreg); - dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg, - promoted_val, vec_promoted_val, - count_exp, align, desired_align, - issetmem); - /* At most desired_align - align bytes are copied. */ - if (min_size < (unsigned)(desired_align - align)) - min_size = 0; - else - min_size -= desired_align - align; - } - else - { - /* If we know how many bytes need to be stored before dst is - sufficiently aligned, maintain aliasing info accurately. */ - dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg, - srcreg, - promoted_val, - vec_promoted_val, - desired_align, - align_bytes, - issetmem); - - count_exp = plus_constant (counter_mode (count_exp), - count_exp, -align_bytes); - count -= align_bytes; - min_size -= align_bytes; - max_size -= align_bytes; - } - if (need_zero_guard - && min_size < (unsigned HOST_WIDE_INT) size_needed - && (count < (unsigned HOST_WIDE_INT) size_needed - || (align_bytes == 0 - && count < ((unsigned HOST_WIDE_INT) size_needed - + desired_align - align)))) - { - /* It is possible that we copied enough so the main loop will not - execute. 
*/ - gcc_assert (size_needed > 1); - if (label == NULL_RTX) - label = gen_label_rtx (); - emit_cmp_and_jump_insns (count_exp, - GEN_INT (size_needed), - LTU, 0, counter_mode (count_exp), 1, label); - if (expected_size == -1 - || expected_size < (desired_align - align) / 2 + size_needed) - predict_jump (REG_BR_PROB_BASE * 20 / 100); - else - predict_jump (REG_BR_PROB_BASE * 60 / 100); - } - } - if (label && size_needed == 1) - { - emit_label (label); - LABEL_NUSES (label) = 1; - label = NULL; - epilogue_size_needed = 1; - if (issetmem) - promoted_val = val_exp; - } - else if (label == NULL_RTX && !misaligned_prologue_used) - epilogue_size_needed = size_needed; - - /* Step 3: Main loop. */ - - switch (alg) - { - case libcall: - case no_stringop: - case last_alg: - gcc_unreachable (); - case loop_1_byte: - case loop: - case unrolled_loop: - expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val, - count_exp, move_mode, unroll_factor, - expected_size, issetmem); - break; - case vector_loop: - expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, - vec_promoted_val, count_exp, move_mode, - unroll_factor, expected_size, issetmem); - break; - case rep_prefix_8_byte: - case rep_prefix_4_byte: - case rep_prefix_1_byte: - expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val, - val_exp, count_exp, move_mode, issetmem); - break; - } - /* Adjust properly the offset of src and dest memory for aliasing. */ - if (CONST_INT_P (count_exp)) - { - if (!issetmem) - src = adjust_automodify_address_nv (src, BLKmode, srcreg, - (count / size_needed) * size_needed); - dst = adjust_automodify_address_nv (dst, BLKmode, destreg, - (count / size_needed) * size_needed); - } - else - { - if (!issetmem) - src = change_address (src, BLKmode, srcreg); - dst = change_address (dst, BLKmode, destreg); - } - - /* Step 4: Epilogue to copy the remaining bytes. */ - epilogue: - if (label) - { - /* When the main loop is done, COUNT_EXP might hold original count, - while we want to copy only COUNT_EXP & SIZE_NEEDED bytes. - Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED - bytes. Compensate if needed. */ - - if (size_needed < epilogue_size_needed) - { - tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp, - GEN_INT (size_needed - 1), count_exp, 1, - OPTAB_DIRECT); - if (tmp != count_exp) - emit_move_insn (count_exp, tmp); - } - emit_label (label); - LABEL_NUSES (label) = 1; - } - - if (count_exp != const0_rtx && epilogue_size_needed > 1) - { - if (force_loopy_epilogue) - expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp, - epilogue_size_needed); - else - { - if (issetmem) - expand_setmem_epilogue (dst, destreg, promoted_val, - vec_promoted_val, count_exp, - epilogue_size_needed); - else - expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp, - epilogue_size_needed); - } - } - if (jump_around_label) - emit_label (jump_around_label); - return true; -} - - -/* Expand the appropriate insns for doing strlen if not just doing - repnz; scasb - - out = result, initialized with the start address - align_rtx = alignment of the address. - scratch = scratch register, initialized with the startaddress when - not aligned, otherwise undefined - - This is just the body. It needs the initializations mentioned above and - some address computing at the end. These things are done in i386.md. 
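/* Standalone check (plain C) of the zero-byte test used a few lines
   below by the 4-bytes-at-a-time loop:
   (x - 0x01010101) & ~x & 0x80808080 is nonzero exactly when at least
   one byte of X is zero, so a single compare-and-branch replaces four
   per-byte tests.  */

#include <stdint.h>

static int
word_has_zero_byte (uint32_t x)
{
  return ((x - 0x01010101u) & ~x & 0x80808080u) != 0;
}

/* Spot checks: word_has_zero_byte (0x64636261) is 0 ("abcd", no NUL),
   word_has_zero_byte (0x00636261) is nonzero ("abc\0"), and
   word_has_zero_byte (0x80808080) is 0, i.e. bytes with only the high
   bit set do not give false positives.  */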
*/ - -static void -ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx) -{ - int align; - rtx tmp; - rtx_code_label *align_2_label = NULL; - rtx_code_label *align_3_label = NULL; - rtx_code_label *align_4_label = gen_label_rtx (); - rtx_code_label *end_0_label = gen_label_rtx (); - rtx mem; - rtx tmpreg = gen_reg_rtx (SImode); - rtx scratch = gen_reg_rtx (SImode); - rtx cmp; - - align = 0; - if (CONST_INT_P (align_rtx)) - align = INTVAL (align_rtx); - - /* Loop to check 1..3 bytes for null to get an aligned pointer. */ - - /* Is there a known alignment and is it less than 4? */ - if (align < 4) - { - rtx scratch1 = gen_reg_rtx (Pmode); - emit_move_insn (scratch1, out); - /* Is there a known alignment and is it not 2? */ - if (align != 2) - { - align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */ - align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */ - - /* Leave just the 3 lower bits. */ - align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3), - NULL_RTX, 0, OPTAB_WIDEN); - - emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL, - Pmode, 1, align_4_label); - emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL, - Pmode, 1, align_2_label); - emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL, - Pmode, 1, align_3_label); - } - else - { - /* Since the alignment is 2, we have to check 2 or 0 bytes; - check if is aligned to 4 - byte. */ - - align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx, - NULL_RTX, 0, OPTAB_WIDEN); - - emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL, - Pmode, 1, align_4_label); - } - - mem = change_address (src, QImode, out); - - /* Now compare the bytes. */ - - /* Compare the first n unaligned byte on a byte per byte basis. */ - emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, - QImode, 1, end_0_label); - - /* Increment the address. */ - emit_insn (gen_add2_insn (out, const1_rtx)); - - /* Not needed with an alignment of 2 */ - if (align != 2) - { - emit_label (align_2_label); - - emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1, - end_0_label); - - emit_insn (gen_add2_insn (out, const1_rtx)); - - emit_label (align_3_label); - } - - emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1, - end_0_label); - - emit_insn (gen_add2_insn (out, const1_rtx)); - } - - /* Generate loop to check 4 bytes at a time. It is not a good idea to - align this loop. It gives only huge programs, but does not help to - speed up. */ - emit_label (align_4_label); - - mem = change_address (src, SImode, out); - emit_move_insn (scratch, mem); - emit_insn (gen_add2_insn (out, GEN_INT (4))); - - /* This formula yields a nonzero result iff one of the bytes is zero. - This saves three branches inside loop and many cycles. */ - - emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101))); - emit_insn (gen_one_cmplsi2 (scratch, scratch)); - emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch)); - emit_insn (gen_andsi3 (tmpreg, tmpreg, - gen_int_mode (0x80808080, SImode))); - emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1, - align_4_label); - - if (TARGET_CMOVE) - { - rtx reg = gen_reg_rtx (SImode); - rtx reg2 = gen_reg_rtx (Pmode); - emit_move_insn (reg, tmpreg); - emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16))); - - /* If zero is not in the first two bytes, move two bytes forward. 
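
The three logical operations just emitted implement the classic zero-byte test: (x - 0x01010101) & ~x & 0x80808080 is nonzero exactly when some byte of x is zero, which is what lets the loop scan four bytes per iteration with a single conditional branch. The same test written as a small, self-contained C helper (illustrative only, not code from this file):

#include <stdint.h>

/* Nonzero iff some byte of X is zero: the subtraction borrows out of a
   zero byte, ~X keeps bytes below 0x80, and the 0x80808080 mask picks
   up the surviving high bits.  */
static inline int
word_has_zero_byte (uint32_t x)
{
  return ((x - 0x01010101u) & ~x & 0x80808080u) != 0;
}
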
*/ - emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080))); - tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); - tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx); - emit_insn (gen_rtx_SET (tmpreg, - gen_rtx_IF_THEN_ELSE (SImode, tmp, - reg, - tmpreg))); - /* Emit lea manually to avoid clobbering of flags. */ - emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2))); - - tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); - tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx); - emit_insn (gen_rtx_SET (out, - gen_rtx_IF_THEN_ELSE (Pmode, tmp, - reg2, - out))); - } - else - { - rtx_code_label *end_2_label = gen_label_rtx (); - /* Is zero in the first two bytes? */ - - emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080))); - tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); - tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx); - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, - gen_rtx_LABEL_REF (VOIDmode, end_2_label), - pc_rtx); - tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - JUMP_LABEL (tmp) = end_2_label; - - /* Not in the first two. Move two bytes forward. */ - emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16))); - emit_insn (gen_add2_insn (out, const2_rtx)); - - emit_label (end_2_label); - - } - - /* Avoid branch in fixing the byte. */ - tmpreg = gen_lowpart (QImode, tmpreg); - emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg)); - tmp = gen_rtx_REG (CCmode, FLAGS_REG); - cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx); - emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp)); - - emit_label (end_0_label); -} - -/* Expand strlen. */ - -bool -ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align) -{ -if (TARGET_UNROLL_STRLEN - && TARGET_INLINE_ALL_STRINGOPS - && eoschar == const0_rtx - && optimize > 1) - { - /* The generic case of strlen expander is long. Avoid it's - expanding unless TARGET_INLINE_ALL_STRINGOPS. */ - rtx addr = force_reg (Pmode, XEXP (src, 0)); - /* Well it seems that some optimizer does not combine a call like - foo(strlen(bar), strlen(bar)); - when the move and the subtraction is done here. It does calculate - the length just once when these instructions are done inside of - output_strlen_unroll(). But I think since &bar[strlen(bar)] is - often used and I use one fewer register for the lifetime of - output_strlen_unroll() this is better. */ - - emit_move_insn (out, addr); - - ix86_expand_strlensi_unroll_1 (out, src, align); - - /* strlensi_unroll_1 returns the address of the zero at the end of - the string, like memchr(), so compute the length by subtracting - the start address. */ - emit_insn (gen_sub2_insn (out, addr)); - return true; - } - else - return false; -} - -/* For given symbol (function) construct code to compute address of it's PLT - entry in large x86-64 PIC model. */ - -static rtx -construct_plt_address (rtx symbol) -{ - rtx tmp, unspec; - - gcc_assert (GET_CODE (symbol) == SYMBOL_REF); - gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF); - gcc_assert (Pmode == DImode); - - tmp = gen_reg_rtx (Pmode); - unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF); - - emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec)); - emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx)); - return tmp; -} - -/* Additional registers that are clobbered by SYSV calls. 
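
ix86_expand_strlen above depends on the helper leaving OUT at the terminating zero, so the length falls out of a single pointer subtraction, much like a memchr-style search. A plain-C sketch of that end-pointer contract (illustrative only, not code from this file):

#include <stddef.h>

/* Walk to the terminating zero, then recover the length by
   subtracting the start address, as the expander does with
   gen_sub2_insn.  */
static size_t
strlen_via_end_pointer (const char *s)
{
  const char *p = s;
  while (*p)
    p++;
  return (size_t) (p - s);
}
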
*/ - -static int const x86_64_ms_sysv_extra_clobbered_registers - [NUM_X86_64_MS_CLOBBERED_REGS] = -{ - SI_REG, DI_REG, - XMM6_REG, XMM7_REG, - XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG, - XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG -}; - -rtx_insn * -ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1, - rtx callarg2, - rtx pop, bool sibcall) -{ - rtx vec[3]; - rtx use = NULL, call; - unsigned int vec_len = 0; - tree fndecl; - - if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF) - { - fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0)); - if (fndecl - && (lookup_attribute ("interrupt", - TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))) - error ("interrupt service routine cannot be called directly"); - } - else - fndecl = NULL_TREE; - - if (pop == const0_rtx) - pop = NULL; - gcc_assert (!TARGET_64BIT || !pop); - - if (TARGET_MACHO && !TARGET_64BIT) - { -#if TARGET_MACHO - if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF) - fnaddr = machopic_indirect_call_target (fnaddr); -#endif - } - else - { - /* Static functions and indirect calls don't need the pic register. Also, - check if PLT was explicitly avoided via no-plt or "noplt" attribute, making - it an indirect call. */ - rtx addr = XEXP (fnaddr, 0); - if (flag_pic - && GET_CODE (addr) == SYMBOL_REF - && !SYMBOL_REF_LOCAL_P (addr)) - { - if (flag_plt - && (SYMBOL_REF_DECL (addr) == NULL_TREE - || !lookup_attribute ("noplt", - DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr))))) - { - if (!TARGET_64BIT - || (ix86_cmodel == CM_LARGE_PIC - && DEFAULT_ABI != MS_ABI)) - { - use_reg (&use, gen_rtx_REG (Pmode, - REAL_PIC_OFFSET_TABLE_REGNUM)); - if (ix86_use_pseudo_pic_reg ()) - emit_move_insn (gen_rtx_REG (Pmode, - REAL_PIC_OFFSET_TABLE_REGNUM), - pic_offset_table_rtx); - } - } - else if (!TARGET_PECOFF && !TARGET_MACHO) - { - if (TARGET_64BIT) - { - fnaddr = gen_rtx_UNSPEC (Pmode, - gen_rtvec (1, addr), - UNSPEC_GOTPCREL); - fnaddr = gen_rtx_CONST (Pmode, fnaddr); - } - else - { - fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), - UNSPEC_GOT); - fnaddr = gen_rtx_CONST (Pmode, fnaddr); - fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, - fnaddr); - } - fnaddr = gen_const_mem (Pmode, fnaddr); - /* Pmode may not be the same as word_mode for x32, which - doesn't support indirect branch via 32-bit memory slot. - Since x32 GOT slot is 64 bit with zero upper 32 bits, - indirect branch via x32 GOT slot is OK. */ - if (GET_MODE (fnaddr) != word_mode) - fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr); - fnaddr = gen_rtx_MEM (QImode, fnaddr); - } - } - } - - /* Skip setting up RAX register for -mskip-rax-setup when there are no - parameters passed in vector registers. */ - if (TARGET_64BIT - && (INTVAL (callarg2) > 0 - || (INTVAL (callarg2) == 0 - && (TARGET_SSE || !flag_skip_rax_setup)))) - { - rtx al = gen_rtx_REG (QImode, AX_REG); - emit_move_insn (al, callarg2); - use_reg (&use, al); - } - - if (ix86_cmodel == CM_LARGE_PIC - && !TARGET_PECOFF - && MEM_P (fnaddr) - && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF - && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode)) - fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0))); - /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect - branch via x32 GOT slot is OK. */ - else if (!(TARGET_X32 - && MEM_P (fnaddr) - && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND - && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode)) - && (sibcall - ? 
!sibcall_insn_operand (XEXP (fnaddr, 0), word_mode) - : !call_insn_operand (XEXP (fnaddr, 0), word_mode))) - { - fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1); - fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr)); - } - - call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1); - - if (retval) - call = gen_rtx_SET (retval, call); - vec[vec_len++] = call; - - if (pop) - { - pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop); - pop = gen_rtx_SET (stack_pointer_rtx, pop); - vec[vec_len++] = pop; - } - - if (cfun->machine->no_caller_saved_registers - && (!fndecl - || (!TREE_THIS_VOLATILE (fndecl) - && !lookup_attribute ("no_caller_saved_registers", - TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))) - { - static const char ix86_call_used_regs[] = CALL_USED_REGISTERS; - bool is_64bit_ms_abi = (TARGET_64BIT - && ix86_function_abi (fndecl) == MS_ABI); - char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi); - - /* If there are no caller-saved registers, add all registers - that are clobbered by the call which returns. */ - for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++) - if (!fixed_regs[i] - && (ix86_call_used_regs[i] == 1 - || (ix86_call_used_regs[i] & c_mask)) - && !STACK_REGNO_P (i) - && !MMX_REGNO_P (i)) - clobber_reg (&use, - gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i)); - } - else if (TARGET_64BIT_MS_ABI - && (!callarg2 || INTVAL (callarg2) != -2)) - { - unsigned i; - - for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++) - { - int regno = x86_64_ms_sysv_extra_clobbered_registers[i]; - machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode; - - clobber_reg (&use, gen_rtx_REG (mode, regno)); - } - - /* Set here, but it may get cleared later. */ - if (TARGET_CALL_MS2SYSV_XLOGUES) - { - if (!TARGET_SSE) - ; - - /* Don't break hot-patched functions. */ - else if (ix86_function_ms_hook_prologue (current_function_decl)) - ; - - /* TODO: Cases not yet examined. */ - else if (flag_split_stack) - warn_once_call_ms2sysv_xlogues ("-fsplit-stack"); - - else - { - gcc_assert (!reload_completed); - cfun->machine->call_ms2sysv = true; - } - } - } - - if (vec_len > 1) - call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec)); - rtx_insn *call_insn = emit_call_insn (call); - if (use) - CALL_INSN_FUNCTION_USAGE (call_insn) = use; - - return call_insn; -} - -/* Split simple return with popping POPC bytes from stack to indirect - branch with stack adjustment . */ - -void -ix86_split_simple_return_pop_internal (rtx popc) -{ - struct machine_function *m = cfun->machine; - rtx ecx = gen_rtx_REG (SImode, CX_REG); - rtx_insn *insn; - - /* There is no "pascal" calling convention in any 64bit ABI. */ - gcc_assert (!TARGET_64BIT); - - insn = emit_insn (gen_pop (ecx)); - m->fs.cfa_offset -= UNITS_PER_WORD; - m->fs.sp_offset -= UNITS_PER_WORD; - - rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); - x = gen_rtx_SET (stack_pointer_rtx, x); - add_reg_note (insn, REG_CFA_ADJUST_CFA, x); - add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx)); - RTX_FRAME_RELATED_P (insn) = 1; - - x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc); - x = gen_rtx_SET (stack_pointer_rtx, x); - insn = emit_insn (x); - add_reg_note (insn, REG_CFA_ADJUST_CFA, x); - RTX_FRAME_RELATED_P (insn) = 1; - - /* Now return address is in ECX. */ - emit_jump_insn (gen_simple_return_indirect_internal (ecx)); -} - -/* Errors in the source file can cause expand_expr to return const0_rtx - where we expect a vector. To avoid crashing, use one of the vector - clear instructions. 
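
For reference on the guard near the top of ix86_expand_call: an x86 interrupt handler can only be entered by the CPU, which is why a direct call is diagnosed. A minimal usage sketch of the attribute involved (illustrative; the function name is made up, and -mgeneral-regs-only is generally recommended when building such handlers):

struct interrupt_frame;

__attribute__ ((interrupt))
void
isr_handler (struct interrupt_frame *frame)   /* entered by the CPU only */
{
  (void) frame;
  /* Calling isr_handler () directly from C trips the
     "interrupt service routine cannot be called directly" error.  */
}
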
*/ - -static rtx -safe_vector_operand (rtx x, machine_mode mode) -{ - if (x == const0_rtx) - x = CONST0_RTX (mode); - return x; -} - -/* Subroutine of ix86_expand_builtin to take care of binop insns. */ - -static rtx -ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree arg1 = CALL_EXPR_ARG (exp, 1); - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - machine_mode tmode = insn_data[icode].operand[0].mode; - machine_mode mode0 = insn_data[icode].operand[1].mode; - machine_mode mode1 = insn_data[icode].operand[2].mode; - - if (VECTOR_MODE_P (mode0)) - op0 = safe_vector_operand (op0, mode0); - if (VECTOR_MODE_P (mode1)) - op1 = safe_vector_operand (op1, mode1); - - if (optimize || !target - || GET_MODE (target) != tmode - || !insn_data[icode].operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - - if (GET_MODE (op1) == SImode && mode1 == TImode) - { - rtx x = gen_reg_rtx (V4SImode); - emit_insn (gen_sse2_loadd (x, op1)); - op1 = gen_lowpart (TImode, x); - } - - if (!insn_data[icode].operand[1].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if (!insn_data[icode].operand[2].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - - pat = GEN_FCN (icode) (target, op0, op1); - if (! pat) - return 0; - - emit_insn (pat); - - return target; -} - -/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */ - -static rtx -ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target, - enum ix86_builtin_func_type m_type, - enum rtx_code sub_code) -{ - rtx pat; - int i; - int nargs; - bool comparison_p = false; - bool tf_p = false; - bool last_arg_constant = false; - int num_memory = 0; - struct { - rtx op; - machine_mode mode; - } args[4]; - - machine_mode tmode = insn_data[icode].operand[0].mode; - - switch (m_type) - { - case MULTI_ARG_4_DF2_DI_I: - case MULTI_ARG_4_DF2_DI_I1: - case MULTI_ARG_4_SF2_SI_I: - case MULTI_ARG_4_SF2_SI_I1: - nargs = 4; - last_arg_constant = true; - break; - - case MULTI_ARG_3_SF: - case MULTI_ARG_3_DF: - case MULTI_ARG_3_SF2: - case MULTI_ARG_3_DF2: - case MULTI_ARG_3_DI: - case MULTI_ARG_3_SI: - case MULTI_ARG_3_SI_DI: - case MULTI_ARG_3_HI: - case MULTI_ARG_3_HI_SI: - case MULTI_ARG_3_QI: - case MULTI_ARG_3_DI2: - case MULTI_ARG_3_SI2: - case MULTI_ARG_3_HI2: - case MULTI_ARG_3_QI2: - nargs = 3; - break; - - case MULTI_ARG_2_SF: - case MULTI_ARG_2_DF: - case MULTI_ARG_2_DI: - case MULTI_ARG_2_SI: - case MULTI_ARG_2_HI: - case MULTI_ARG_2_QI: - nargs = 2; - break; - - case MULTI_ARG_2_DI_IMM: - case MULTI_ARG_2_SI_IMM: - case MULTI_ARG_2_HI_IMM: - case MULTI_ARG_2_QI_IMM: - nargs = 2; - last_arg_constant = true; - break; - - case MULTI_ARG_1_SF: - case MULTI_ARG_1_DF: - case MULTI_ARG_1_SF2: - case MULTI_ARG_1_DF2: - case MULTI_ARG_1_DI: - case MULTI_ARG_1_SI: - case MULTI_ARG_1_HI: - case MULTI_ARG_1_QI: - case MULTI_ARG_1_SI_DI: - case MULTI_ARG_1_HI_DI: - case MULTI_ARG_1_HI_SI: - case MULTI_ARG_1_QI_DI: - case MULTI_ARG_1_QI_SI: - case MULTI_ARG_1_QI_HI: - nargs = 1; - break; - - case MULTI_ARG_2_DI_CMP: - case MULTI_ARG_2_SI_CMP: - case MULTI_ARG_2_HI_CMP: - case MULTI_ARG_2_QI_CMP: - nargs = 2; - comparison_p = true; - break; - - case MULTI_ARG_2_SF_TF: - case MULTI_ARG_2_DF_TF: - case MULTI_ARG_2_DI_TF: - case MULTI_ARG_2_SI_TF: - case MULTI_ARG_2_HI_TF: - case MULTI_ARG_2_QI_TF: - nargs = 2; - tf_p = true; - break; - - default: - gcc_unreachable (); - } - - if (optimize || !target - || GET_MODE 
(target) != tmode - || !insn_data[icode].operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - else if (memory_operand (target, tmode)) - num_memory++; - - gcc_assert (nargs <= 4); - - for (i = 0; i < nargs; i++) - { - tree arg = CALL_EXPR_ARG (exp, i); - rtx op = expand_normal (arg); - int adjust = (comparison_p) ? 1 : 0; - machine_mode mode = insn_data[icode].operand[i+adjust+1].mode; - - if (last_arg_constant && i == nargs - 1) - { - if (!insn_data[icode].operand[i + 1].predicate (op, mode)) - { - enum insn_code new_icode = icode; - switch (icode) - { - case CODE_FOR_xop_vpermil2v2df3: - case CODE_FOR_xop_vpermil2v4sf3: - case CODE_FOR_xop_vpermil2v4df3: - case CODE_FOR_xop_vpermil2v8sf3: - error ("the last argument must be a 2-bit immediate"); - return gen_reg_rtx (tmode); - case CODE_FOR_xop_rotlv2di3: - new_icode = CODE_FOR_rotlv2di3; - goto xop_rotl; - case CODE_FOR_xop_rotlv4si3: - new_icode = CODE_FOR_rotlv4si3; - goto xop_rotl; - case CODE_FOR_xop_rotlv8hi3: - new_icode = CODE_FOR_rotlv8hi3; - goto xop_rotl; - case CODE_FOR_xop_rotlv16qi3: - new_icode = CODE_FOR_rotlv16qi3; - xop_rotl: - if (CONST_INT_P (op)) - { - int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1; - op = GEN_INT (INTVAL (op) & mask); - gcc_checking_assert - (insn_data[icode].operand[i + 1].predicate (op, mode)); - } - else - { - gcc_checking_assert - (nargs == 2 - && insn_data[new_icode].operand[0].mode == tmode - && insn_data[new_icode].operand[1].mode == tmode - && insn_data[new_icode].operand[2].mode == mode - && insn_data[new_icode].operand[0].predicate - == insn_data[icode].operand[0].predicate - && insn_data[new_icode].operand[1].predicate - == insn_data[icode].operand[1].predicate); - icode = new_icode; - goto non_constant; - } - break; - default: - gcc_unreachable (); - } - } - } - else - { - non_constant: - if (VECTOR_MODE_P (mode)) - op = safe_vector_operand (op, mode); - - /* If we aren't optimizing, only allow one memory operand to be - generated. */ - if (memory_operand (op, mode)) - num_memory++; - - gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode); - - if (optimize - || !insn_data[icode].operand[i+adjust+1].predicate (op, mode) - || num_memory > 1) - op = force_reg (mode, op); - } - - args[i].op = op; - args[i].mode = mode; - } - - switch (nargs) - { - case 1: - pat = GEN_FCN (icode) (target, args[0].op); - break; - - case 2: - if (tf_p) - pat = GEN_FCN (icode) (target, args[0].op, args[1].op, - GEN_INT ((int)sub_code)); - else if (! comparison_p) - pat = GEN_FCN (icode) (target, args[0].op, args[1].op); - else - { - rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target), - args[0].op, - args[1].op); - - pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op); - } - break; - - case 3: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op); - break; - - case 4: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op); - break; - - default: - gcc_unreachable (); - } - - if (! pat) - return 0; - - emit_insn (pat); - return target; -} - -/* Subroutine of ix86_expand_args_builtin to take care of scalar unop - insns with vec_merge. 
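
User-visibly, the two-operand helpers above are what a plain binary SIMD intrinsic funnels through: each operand is forced into whatever the insn predicates accept before the pattern is emitted. A hedged usage sketch (assumes SSE2; the exact builtin routing is an implementation detail):

#include <emmintrin.h>

/* A two-operand intrinsic of the kind the binop expander handles.  */
__m128i
add_four_ints (__m128i a, __m128i b)
{
  return _mm_add_epi32 (a, b);
}
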
*/ - -static rtx -ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp, - rtx target) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - rtx op1, op0 = expand_normal (arg0); - machine_mode tmode = insn_data[icode].operand[0].mode; - machine_mode mode0 = insn_data[icode].operand[1].mode; - - if (optimize || !target - || GET_MODE (target) != tmode - || !insn_data[icode].operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - - if (VECTOR_MODE_P (mode0)) - op0 = safe_vector_operand (op0, mode0); - - if ((optimize && !register_operand (op0, mode0)) - || !insn_data[icode].operand[1].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - - op1 = op0; - if (!insn_data[icode].operand[2].predicate (op1, mode0)) - op1 = copy_to_mode_reg (mode0, op1); - - pat = GEN_FCN (icode) (target, op0, op1); - if (! pat) - return 0; - emit_insn (pat); - return target; -} - -/* Subroutine of ix86_expand_builtin to take care of comparison insns. */ - -static rtx -ix86_expand_sse_compare (const struct builtin_description *d, - tree exp, rtx target, bool swap) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree arg1 = CALL_EXPR_ARG (exp, 1); - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - rtx op2; - machine_mode tmode = insn_data[d->icode].operand[0].mode; - machine_mode mode0 = insn_data[d->icode].operand[1].mode; - machine_mode mode1 = insn_data[d->icode].operand[2].mode; - enum rtx_code comparison = d->comparison; - - if (VECTOR_MODE_P (mode0)) - op0 = safe_vector_operand (op0, mode0); - if (VECTOR_MODE_P (mode1)) - op1 = safe_vector_operand (op1, mode1); - - /* Swap operands if we have a comparison that isn't available in - hardware. */ - if (swap) - std::swap (op0, op1); - - if (optimize || !target - || GET_MODE (target) != tmode - || !insn_data[d->icode].operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - - if ((optimize && !register_operand (op0, mode0)) - || !insn_data[d->icode].operand[1].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if ((optimize && !register_operand (op1, mode1)) - || !insn_data[d->icode].operand[2].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - - op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1); - pat = GEN_FCN (d->icode) (target, op0, op1, op2); - if (! pat) - return 0; - emit_insn (pat); - return target; -} - -/* Subroutine of ix86_expand_builtin to take care of comi insns. */ - -static rtx -ix86_expand_sse_comi (const struct builtin_description *d, tree exp, - rtx target) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree arg1 = CALL_EXPR_ARG (exp, 1); - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - machine_mode mode0 = insn_data[d->icode].operand[0].mode; - machine_mode mode1 = insn_data[d->icode].operand[1].mode; - enum rtx_code comparison = d->comparison; - - if (VECTOR_MODE_P (mode0)) - op0 = safe_vector_operand (op0, mode0); - if (VECTOR_MODE_P (mode1)) - op1 = safe_vector_operand (op1, mode1); - - /* Swap operands if we have a comparison that isn't available in - hardware. 
*/ - if (d->flag & BUILTIN_DESC_SWAP_OPERANDS) - std::swap (op0, op1); - - target = gen_reg_rtx (SImode); - emit_move_insn (target, const0_rtx); - target = gen_rtx_SUBREG (QImode, target, 0); - - if ((optimize && !register_operand (op0, mode0)) - || !insn_data[d->icode].operand[0].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if ((optimize && !register_operand (op1, mode1)) - || !insn_data[d->icode].operand[1].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - - pat = GEN_FCN (d->icode) (op0, op1); - if (! pat) - return 0; - emit_insn (pat); - emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), - gen_rtx_fmt_ee (comparison, QImode, - SET_DEST (pat), - const0_rtx))); - - return SUBREG_REG (target); -} - -/* Subroutines of ix86_expand_args_builtin to take care of round insns. */ - -static rtx -ix86_expand_sse_round (const struct builtin_description *d, tree exp, - rtx target) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - rtx op1, op0 = expand_normal (arg0); - machine_mode tmode = insn_data[d->icode].operand[0].mode; - machine_mode mode0 = insn_data[d->icode].operand[1].mode; - - if (optimize || target == 0 - || GET_MODE (target) != tmode - || !insn_data[d->icode].operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - - if (VECTOR_MODE_P (mode0)) - op0 = safe_vector_operand (op0, mode0); - - if ((optimize && !register_operand (op0, mode0)) - || !insn_data[d->icode].operand[0].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - - op1 = GEN_INT (d->comparison); - - pat = GEN_FCN (d->icode) (target, op0, op1); - if (! pat) - return 0; - emit_insn (pat); - return target; -} - -static rtx -ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d, - tree exp, rtx target) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree arg1 = CALL_EXPR_ARG (exp, 1); - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - rtx op2; - machine_mode tmode = insn_data[d->icode].operand[0].mode; - machine_mode mode0 = insn_data[d->icode].operand[1].mode; - machine_mode mode1 = insn_data[d->icode].operand[2].mode; - - if (optimize || target == 0 - || GET_MODE (target) != tmode - || !insn_data[d->icode].operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - - op0 = safe_vector_operand (op0, mode0); - op1 = safe_vector_operand (op1, mode1); - - if ((optimize && !register_operand (op0, mode0)) - || !insn_data[d->icode].operand[0].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if ((optimize && !register_operand (op1, mode1)) - || !insn_data[d->icode].operand[1].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - - op2 = GEN_INT (d->comparison); - - pat = GEN_FCN (d->icode) (target, op0, op1, op2); - if (! pat) - return 0; - emit_insn (pat); - return target; -} - -/* Subroutine of ix86_expand_builtin to take care of ptest insns. 
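
The ptest helper that follows reads its result straight out of the flags register and materializes it as a QImode value. At the source level that is what the SSE4.1 test intrinsics return, as in this small usage sketch (assumes SSE4.1; illustrative only):

#include <smmintrin.h>

/* Returns 1 iff every bit of V is zero: a ptest followed by a setcc.  */
int
vector_is_all_zero (__m128i v)
{
  return _mm_testz_si128 (v, v);
}
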
*/ - -static rtx -ix86_expand_sse_ptest (const struct builtin_description *d, tree exp, - rtx target) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree arg1 = CALL_EXPR_ARG (exp, 1); - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - machine_mode mode0 = insn_data[d->icode].operand[0].mode; - machine_mode mode1 = insn_data[d->icode].operand[1].mode; - enum rtx_code comparison = d->comparison; - - if (VECTOR_MODE_P (mode0)) - op0 = safe_vector_operand (op0, mode0); - if (VECTOR_MODE_P (mode1)) - op1 = safe_vector_operand (op1, mode1); - - target = gen_reg_rtx (SImode); - emit_move_insn (target, const0_rtx); - target = gen_rtx_SUBREG (QImode, target, 0); - - if ((optimize && !register_operand (op0, mode0)) - || !insn_data[d->icode].operand[0].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if ((optimize && !register_operand (op1, mode1)) - || !insn_data[d->icode].operand[1].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - - pat = GEN_FCN (d->icode) (op0, op1); - if (! pat) - return 0; - emit_insn (pat); - emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), - gen_rtx_fmt_ee (comparison, QImode, - SET_DEST (pat), - const0_rtx))); - - return SUBREG_REG (target); -} - -/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */ - -static rtx -ix86_expand_sse_pcmpestr (const struct builtin_description *d, - tree exp, rtx target) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree arg1 = CALL_EXPR_ARG (exp, 1); - tree arg2 = CALL_EXPR_ARG (exp, 2); - tree arg3 = CALL_EXPR_ARG (exp, 3); - tree arg4 = CALL_EXPR_ARG (exp, 4); - rtx scratch0, scratch1; - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - rtx op2 = expand_normal (arg2); - rtx op3 = expand_normal (arg3); - rtx op4 = expand_normal (arg4); - machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm; - - tmode0 = insn_data[d->icode].operand[0].mode; - tmode1 = insn_data[d->icode].operand[1].mode; - modev2 = insn_data[d->icode].operand[2].mode; - modei3 = insn_data[d->icode].operand[3].mode; - modev4 = insn_data[d->icode].operand[4].mode; - modei5 = insn_data[d->icode].operand[5].mode; - modeimm = insn_data[d->icode].operand[6].mode; - - if (VECTOR_MODE_P (modev2)) - op0 = safe_vector_operand (op0, modev2); - if (VECTOR_MODE_P (modev4)) - op2 = safe_vector_operand (op2, modev4); - - if (!insn_data[d->icode].operand[2].predicate (op0, modev2)) - op0 = copy_to_mode_reg (modev2, op0); - if (!insn_data[d->icode].operand[3].predicate (op1, modei3)) - op1 = copy_to_mode_reg (modei3, op1); - if ((optimize && !register_operand (op2, modev4)) - || !insn_data[d->icode].operand[4].predicate (op2, modev4)) - op2 = copy_to_mode_reg (modev4, op2); - if (!insn_data[d->icode].operand[5].predicate (op3, modei5)) - op3 = copy_to_mode_reg (modei5, op3); - - if (!insn_data[d->icode].operand[6].predicate (op4, modeimm)) - { - error ("the fifth argument must be an 8-bit immediate"); - return const0_rtx; - } - - if (d->code == IX86_BUILTIN_PCMPESTRI128) - { - if (optimize || !target - || GET_MODE (target) != tmode0 - || !insn_data[d->icode].operand[0].predicate (target, tmode0)) - target = gen_reg_rtx (tmode0); - - scratch1 = gen_reg_rtx (tmode1); - - pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4); - } - else if (d->code == IX86_BUILTIN_PCMPESTRM128) - { - if (optimize || !target - || GET_MODE (target) != tmode1 - || !insn_data[d->icode].operand[1].predicate (target, tmode1)) - target = gen_reg_rtx (tmode1); - - 
scratch0 = gen_reg_rtx (tmode0); - - pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4); - } - else - { - gcc_assert (d->flag); - - scratch0 = gen_reg_rtx (tmode0); - scratch1 = gen_reg_rtx (tmode1); - - pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4); - } - - if (! pat) - return 0; - - emit_insn (pat); - - if (d->flag) - { - target = gen_reg_rtx (SImode); - emit_move_insn (target, const0_rtx); - target = gen_rtx_SUBREG (QImode, target, 0); - - emit_insn - (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), - gen_rtx_fmt_ee (EQ, QImode, - gen_rtx_REG ((machine_mode) d->flag, - FLAGS_REG), - const0_rtx))); - return SUBREG_REG (target); - } - else - return target; -} - - -/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */ - -static rtx -ix86_expand_sse_pcmpistr (const struct builtin_description *d, - tree exp, rtx target) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree arg1 = CALL_EXPR_ARG (exp, 1); - tree arg2 = CALL_EXPR_ARG (exp, 2); - rtx scratch0, scratch1; - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - rtx op2 = expand_normal (arg2); - machine_mode tmode0, tmode1, modev2, modev3, modeimm; - - tmode0 = insn_data[d->icode].operand[0].mode; - tmode1 = insn_data[d->icode].operand[1].mode; - modev2 = insn_data[d->icode].operand[2].mode; - modev3 = insn_data[d->icode].operand[3].mode; - modeimm = insn_data[d->icode].operand[4].mode; - - if (VECTOR_MODE_P (modev2)) - op0 = safe_vector_operand (op0, modev2); - if (VECTOR_MODE_P (modev3)) - op1 = safe_vector_operand (op1, modev3); - - if (!insn_data[d->icode].operand[2].predicate (op0, modev2)) - op0 = copy_to_mode_reg (modev2, op0); - if ((optimize && !register_operand (op1, modev3)) - || !insn_data[d->icode].operand[3].predicate (op1, modev3)) - op1 = copy_to_mode_reg (modev3, op1); - - if (!insn_data[d->icode].operand[4].predicate (op2, modeimm)) - { - error ("the third argument must be an 8-bit immediate"); - return const0_rtx; - } - - if (d->code == IX86_BUILTIN_PCMPISTRI128) - { - if (optimize || !target - || GET_MODE (target) != tmode0 - || !insn_data[d->icode].operand[0].predicate (target, tmode0)) - target = gen_reg_rtx (tmode0); - - scratch1 = gen_reg_rtx (tmode1); - - pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2); - } - else if (d->code == IX86_BUILTIN_PCMPISTRM128) - { - if (optimize || !target - || GET_MODE (target) != tmode1 - || !insn_data[d->icode].operand[1].predicate (target, tmode1)) - target = gen_reg_rtx (tmode1); - - scratch0 = gen_reg_rtx (tmode0); - - pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2); - } - else - { - gcc_assert (d->flag); - - scratch0 = gen_reg_rtx (tmode0); - scratch1 = gen_reg_rtx (tmode1); - - pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2); - } - - if (! pat) - return 0; - - emit_insn (pat); - - if (d->flag) - { - target = gen_reg_rtx (SImode); - emit_move_insn (target, const0_rtx); - target = gen_rtx_SUBREG (QImode, target, 0); - - emit_insn - (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), - gen_rtx_fmt_ee (EQ, QImode, - gen_rtx_REG ((machine_mode) d->flag, - FLAGS_REG), - const0_rtx))); - return SUBREG_REG (target); - } - else - return target; -} - -/* Fixup modeless constants to fit required mode. */ - -static rtx -fixup_modeless_constant (rtx x, machine_mode mode) -{ - if (GET_MODE (x) == VOIDmode) - x = convert_to_mode (mode, x, 1); - return x; -} - -/* Subroutine of ix86_expand_builtin to take care of insns with - variable number of operands. 
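
The immediate-operand checks in the pcmpestr/pcmpistr expanders above are what reject a non-constant control byte at the source level. A usage sketch with a compile-time-constant control word (assumes SSE4.2; illustrative only):

#include <nmmintrin.h>

/* Index of the first byte position at which A and B compare equal under
   pcmpistri's implicit-length string rules (16 when there is no match).
   The control byte must be a compile-time constant, as enforced above.  */
int
index_of_first_equal_byte (__m128i a, __m128i b)
{
  return _mm_cmpistri (a, b, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH);
}
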
*/ - -static rtx -ix86_expand_args_builtin (const struct builtin_description *d, - tree exp, rtx target) -{ - rtx pat, real_target; - unsigned int i, nargs; - unsigned int nargs_constant = 0; - unsigned int mask_pos = 0; - int num_memory = 0; - struct - { - rtx op; - machine_mode mode; - } args[6]; - bool second_arg_count = false; - enum insn_code icode = d->icode; - const struct insn_data_d *insn_p = &insn_data[icode]; - machine_mode tmode = insn_p->operand[0].mode; - machine_mode rmode = VOIDmode; - bool swap = false; - enum rtx_code comparison = d->comparison; - - switch ((enum ix86_builtin_func_type) d->flag) - { - case V2DF_FTYPE_V2DF_ROUND: - case V4DF_FTYPE_V4DF_ROUND: - case V8DF_FTYPE_V8DF_ROUND: - case V4SF_FTYPE_V4SF_ROUND: - case V8SF_FTYPE_V8SF_ROUND: - case V16SF_FTYPE_V16SF_ROUND: - case V4SI_FTYPE_V4SF_ROUND: - case V8SI_FTYPE_V8SF_ROUND: - case V16SI_FTYPE_V16SF_ROUND: - return ix86_expand_sse_round (d, exp, target); - case V4SI_FTYPE_V2DF_V2DF_ROUND: - case V8SI_FTYPE_V4DF_V4DF_ROUND: - case V16SI_FTYPE_V8DF_V8DF_ROUND: - return ix86_expand_sse_round_vec_pack_sfix (d, exp, target); - case INT_FTYPE_V8SF_V8SF_PTEST: - case INT_FTYPE_V4DI_V4DI_PTEST: - case INT_FTYPE_V4DF_V4DF_PTEST: - case INT_FTYPE_V4SF_V4SF_PTEST: - case INT_FTYPE_V2DI_V2DI_PTEST: - case INT_FTYPE_V2DF_V2DF_PTEST: - return ix86_expand_sse_ptest (d, exp, target); - case FLOAT128_FTYPE_FLOAT128: - case FLOAT_FTYPE_FLOAT: - case INT_FTYPE_INT: - case UINT_FTYPE_UINT: - case UINT16_FTYPE_UINT16: - case UINT64_FTYPE_INT: - case UINT64_FTYPE_UINT64: - case INT64_FTYPE_INT64: - case INT64_FTYPE_V4SF: - case INT64_FTYPE_V2DF: - case INT_FTYPE_V16QI: - case INT_FTYPE_V8QI: - case INT_FTYPE_V8SF: - case INT_FTYPE_V4DF: - case INT_FTYPE_V4SF: - case INT_FTYPE_V2DF: - case INT_FTYPE_V32QI: - case V16QI_FTYPE_V16QI: - case V8SI_FTYPE_V8SF: - case V8SI_FTYPE_V4SI: - case V8HI_FTYPE_V8HI: - case V8HI_FTYPE_V16QI: - case V8QI_FTYPE_V8QI: - case V8SF_FTYPE_V8SF: - case V8SF_FTYPE_V8SI: - case V8SF_FTYPE_V4SF: - case V8SF_FTYPE_V8HI: - case V4SI_FTYPE_V4SI: - case V4SI_FTYPE_V16QI: - case V4SI_FTYPE_V4SF: - case V4SI_FTYPE_V8SI: - case V4SI_FTYPE_V8HI: - case V4SI_FTYPE_V4DF: - case V4SI_FTYPE_V2DF: - case V4HI_FTYPE_V4HI: - case V4DF_FTYPE_V4DF: - case V4DF_FTYPE_V4SI: - case V4DF_FTYPE_V4SF: - case V4DF_FTYPE_V2DF: - case V4SF_FTYPE_V4SF: - case V4SF_FTYPE_V4SI: - case V4SF_FTYPE_V8SF: - case V4SF_FTYPE_V4DF: - case V4SF_FTYPE_V8HI: - case V4SF_FTYPE_V2DF: - case V2DI_FTYPE_V2DI: - case V2DI_FTYPE_V16QI: - case V2DI_FTYPE_V8HI: - case V2DI_FTYPE_V4SI: - case V2DF_FTYPE_V2DF: - case V2DF_FTYPE_V4SI: - case V2DF_FTYPE_V4DF: - case V2DF_FTYPE_V4SF: - case V2DF_FTYPE_V2SI: - case V2SI_FTYPE_V2SI: - case V2SI_FTYPE_V4SF: - case V2SI_FTYPE_V2SF: - case V2SI_FTYPE_V2DF: - case V2SF_FTYPE_V2SF: - case V2SF_FTYPE_V2SI: - case V32QI_FTYPE_V32QI: - case V32QI_FTYPE_V16QI: - case V16HI_FTYPE_V16HI: - case V16HI_FTYPE_V8HI: - case V8SI_FTYPE_V8SI: - case V16HI_FTYPE_V16QI: - case V8SI_FTYPE_V16QI: - case V4DI_FTYPE_V16QI: - case V8SI_FTYPE_V8HI: - case V4DI_FTYPE_V8HI: - case V4DI_FTYPE_V4SI: - case V4DI_FTYPE_V2DI: - case UQI_FTYPE_UQI: - case UHI_FTYPE_UHI: - case USI_FTYPE_USI: - case USI_FTYPE_UQI: - case USI_FTYPE_UHI: - case UDI_FTYPE_UDI: - case UHI_FTYPE_V16QI: - case USI_FTYPE_V32QI: - case UDI_FTYPE_V64QI: - case V16QI_FTYPE_UHI: - case V32QI_FTYPE_USI: - case V64QI_FTYPE_UDI: - case V8HI_FTYPE_UQI: - case V16HI_FTYPE_UHI: - case V32HI_FTYPE_USI: - case V4SI_FTYPE_UQI: - case V8SI_FTYPE_UQI: - case V4SI_FTYPE_UHI: - case 
V8SI_FTYPE_UHI: - case UQI_FTYPE_V8HI: - case UHI_FTYPE_V16HI: - case USI_FTYPE_V32HI: - case UQI_FTYPE_V4SI: - case UQI_FTYPE_V8SI: - case UHI_FTYPE_V16SI: - case UQI_FTYPE_V2DI: - case UQI_FTYPE_V4DI: - case UQI_FTYPE_V8DI: - case V16SI_FTYPE_UHI: - case V2DI_FTYPE_UQI: - case V4DI_FTYPE_UQI: - case V16SI_FTYPE_INT: - case V16SF_FTYPE_V8SF: - case V16SI_FTYPE_V8SI: - case V16SF_FTYPE_V4SF: - case V16SI_FTYPE_V4SI: - case V16SI_FTYPE_V16SF: - case V16SI_FTYPE_V16SI: - case V64QI_FTYPE_V64QI: - case V32HI_FTYPE_V32HI: - case V16SF_FTYPE_V16SF: - case V8DI_FTYPE_UQI: - case V8DI_FTYPE_V8DI: - case V8DF_FTYPE_V4DF: - case V8DF_FTYPE_V2DF: - case V8DF_FTYPE_V8DF: - case V4DI_FTYPE_V4DI: - case V16HI_FTYPE_V16SF: - case V8HI_FTYPE_V8SF: - case V8HI_FTYPE_V4SF: - nargs = 1; - break; - case V4SF_FTYPE_V4SF_VEC_MERGE: - case V2DF_FTYPE_V2DF_VEC_MERGE: - return ix86_expand_unop_vec_merge_builtin (icode, exp, target); - case FLOAT128_FTYPE_FLOAT128_FLOAT128: - case V16QI_FTYPE_V16QI_V16QI: - case V16QI_FTYPE_V8HI_V8HI: - case V16SF_FTYPE_V16SF_V16SF: - case V8QI_FTYPE_V8QI_V8QI: - case V8QI_FTYPE_V4HI_V4HI: - case V8HI_FTYPE_V8HI_V8HI: - case V8HI_FTYPE_V16QI_V16QI: - case V8HI_FTYPE_V4SI_V4SI: - case V8SF_FTYPE_V8SF_V8SF: - case V8SF_FTYPE_V8SF_V8SI: - case V8DF_FTYPE_V8DF_V8DF: - case V4SI_FTYPE_V4SI_V4SI: - case V4SI_FTYPE_V8HI_V8HI: - case V4SI_FTYPE_V2DF_V2DF: - case V4HI_FTYPE_V4HI_V4HI: - case V4HI_FTYPE_V8QI_V8QI: - case V4HI_FTYPE_V2SI_V2SI: - case V4DF_FTYPE_V4DF_V4DF: - case V4DF_FTYPE_V4DF_V4DI: - case V4SF_FTYPE_V4SF_V4SF: - case V4SF_FTYPE_V4SF_V4SI: - case V4SF_FTYPE_V4SF_V2SI: - case V4SF_FTYPE_V4SF_V2DF: - case V4SF_FTYPE_V4SF_UINT: - case V4SF_FTYPE_V4SF_DI: - case V4SF_FTYPE_V4SF_SI: - case V2DI_FTYPE_V2DI_V2DI: - case V2DI_FTYPE_V16QI_V16QI: - case V2DI_FTYPE_V4SI_V4SI: - case V2DI_FTYPE_V2DI_V16QI: - case V2SI_FTYPE_V2SI_V2SI: - case V2SI_FTYPE_V4HI_V4HI: - case V2SI_FTYPE_V2SF_V2SF: - case V2DF_FTYPE_V2DF_V2DF: - case V2DF_FTYPE_V2DF_V4SF: - case V2DF_FTYPE_V2DF_V2DI: - case V2DF_FTYPE_V2DF_DI: - case V2DF_FTYPE_V2DF_SI: - case V2DF_FTYPE_V2DF_UINT: - case V2SF_FTYPE_V2SF_V2SF: - case V1DI_FTYPE_V1DI_V1DI: - case V1DI_FTYPE_V8QI_V8QI: - case V1DI_FTYPE_V2SI_V2SI: - case V32QI_FTYPE_V16HI_V16HI: - case V16HI_FTYPE_V8SI_V8SI: - case V64QI_FTYPE_V64QI_V64QI: - case V32QI_FTYPE_V32QI_V32QI: - case V16HI_FTYPE_V32QI_V32QI: - case V16HI_FTYPE_V16HI_V16HI: - case V8SI_FTYPE_V4DF_V4DF: - case V8SI_FTYPE_V8SI_V8SI: - case V8SI_FTYPE_V16HI_V16HI: - case V4DI_FTYPE_V4DI_V4DI: - case V4DI_FTYPE_V8SI_V8SI: - case V8DI_FTYPE_V64QI_V64QI: - if (comparison == UNKNOWN) - return ix86_expand_binop_builtin (icode, exp, target); - nargs = 2; - break; - case V4SF_FTYPE_V4SF_V4SF_SWAP: - case V2DF_FTYPE_V2DF_V2DF_SWAP: - gcc_assert (comparison != UNKNOWN); - nargs = 2; - swap = true; - break; - case V16HI_FTYPE_V16HI_V8HI_COUNT: - case V16HI_FTYPE_V16HI_SI_COUNT: - case V8SI_FTYPE_V8SI_V4SI_COUNT: - case V8SI_FTYPE_V8SI_SI_COUNT: - case V4DI_FTYPE_V4DI_V2DI_COUNT: - case V4DI_FTYPE_V4DI_INT_COUNT: - case V8HI_FTYPE_V8HI_V8HI_COUNT: - case V8HI_FTYPE_V8HI_SI_COUNT: - case V4SI_FTYPE_V4SI_V4SI_COUNT: - case V4SI_FTYPE_V4SI_SI_COUNT: - case V4HI_FTYPE_V4HI_V4HI_COUNT: - case V4HI_FTYPE_V4HI_SI_COUNT: - case V2DI_FTYPE_V2DI_V2DI_COUNT: - case V2DI_FTYPE_V2DI_SI_COUNT: - case V2SI_FTYPE_V2SI_V2SI_COUNT: - case V2SI_FTYPE_V2SI_SI_COUNT: - case V1DI_FTYPE_V1DI_V1DI_COUNT: - case V1DI_FTYPE_V1DI_SI_COUNT: - nargs = 2; - second_arg_count = true; - break; - case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT: - case 
V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT: - case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT: - case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT: - case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT: - case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT: - case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT: - case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT: - case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT: - case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT: - case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT: - case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT: - case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT: - case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT: - case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT: - case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT: - case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT: - case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT: - nargs = 4; - second_arg_count = true; - break; - case UINT64_FTYPE_UINT64_UINT64: - case UINT_FTYPE_UINT_UINT: - case UINT_FTYPE_UINT_USHORT: - case UINT_FTYPE_UINT_UCHAR: - case UINT16_FTYPE_UINT16_INT: - case UINT8_FTYPE_UINT8_INT: - case UQI_FTYPE_UQI_UQI: - case UHI_FTYPE_UHI_UHI: - case USI_FTYPE_USI_USI: - case UDI_FTYPE_UDI_UDI: - case V16SI_FTYPE_V8DF_V8DF: - case V32HI_FTYPE_V16SF_V16SF: - case V16HI_FTYPE_V8SF_V8SF: - case V8HI_FTYPE_V4SF_V4SF: - case V16HI_FTYPE_V16SF_UHI: - case V8HI_FTYPE_V8SF_UQI: - case V8HI_FTYPE_V4SF_UQI: - nargs = 2; - break; - case V2DI_FTYPE_V2DI_INT_CONVERT: - nargs = 2; - rmode = V1TImode; - nargs_constant = 1; - break; - case V4DI_FTYPE_V4DI_INT_CONVERT: - nargs = 2; - rmode = V2TImode; - nargs_constant = 1; - break; - case V8DI_FTYPE_V8DI_INT_CONVERT: - nargs = 2; - rmode = V4TImode; - nargs_constant = 1; - break; - case V8HI_FTYPE_V8HI_INT: - case V8HI_FTYPE_V8SF_INT: - case V16HI_FTYPE_V16SF_INT: - case V8HI_FTYPE_V4SF_INT: - case V8SF_FTYPE_V8SF_INT: - case V4SF_FTYPE_V16SF_INT: - case V16SF_FTYPE_V16SF_INT: - case V4SI_FTYPE_V4SI_INT: - case V4SI_FTYPE_V8SI_INT: - case V4HI_FTYPE_V4HI_INT: - case V4DF_FTYPE_V4DF_INT: - case V4DF_FTYPE_V8DF_INT: - case V4SF_FTYPE_V4SF_INT: - case V4SF_FTYPE_V8SF_INT: - case V2DI_FTYPE_V2DI_INT: - case V2DF_FTYPE_V2DF_INT: - case V2DF_FTYPE_V4DF_INT: - case V16HI_FTYPE_V16HI_INT: - case V8SI_FTYPE_V8SI_INT: - case V16SI_FTYPE_V16SI_INT: - case V4SI_FTYPE_V16SI_INT: - case V4DI_FTYPE_V4DI_INT: - case V2DI_FTYPE_V4DI_INT: - case V4DI_FTYPE_V8DI_INT: - case UQI_FTYPE_UQI_UQI_CONST: - case UHI_FTYPE_UHI_UQI: - case USI_FTYPE_USI_UQI: - case UDI_FTYPE_UDI_UQI: - nargs = 2; - nargs_constant = 1; - break; - case V16QI_FTYPE_V16QI_V16QI_V16QI: - case V8SF_FTYPE_V8SF_V8SF_V8SF: - case V4DF_FTYPE_V4DF_V4DF_V4DF: - case V4SF_FTYPE_V4SF_V4SF_V4SF: - case V2DF_FTYPE_V2DF_V2DF_V2DF: - case V32QI_FTYPE_V32QI_V32QI_V32QI: - case UHI_FTYPE_V16SI_V16SI_UHI: - case UQI_FTYPE_V8DI_V8DI_UQI: - case V16HI_FTYPE_V16SI_V16HI_UHI: - case V16QI_FTYPE_V16SI_V16QI_UHI: - case V16QI_FTYPE_V8DI_V16QI_UQI: - case V16SF_FTYPE_V16SF_V16SF_UHI: - case V16SF_FTYPE_V4SF_V16SF_UHI: - case V16SI_FTYPE_SI_V16SI_UHI: - case V16SI_FTYPE_V16HI_V16SI_UHI: - case V16SI_FTYPE_V16QI_V16SI_UHI: - case V8SF_FTYPE_V4SF_V8SF_UQI: - case V4DF_FTYPE_V2DF_V4DF_UQI: - case V8SI_FTYPE_V4SI_V8SI_UQI: - case V8SI_FTYPE_SI_V8SI_UQI: - case V4SI_FTYPE_V4SI_V4SI_UQI: - case V4SI_FTYPE_SI_V4SI_UQI: - case V4DI_FTYPE_V2DI_V4DI_UQI: - case V4DI_FTYPE_DI_V4DI_UQI: - case V2DI_FTYPE_V2DI_V2DI_UQI: - case V2DI_FTYPE_DI_V2DI_UQI: - case V64QI_FTYPE_V64QI_V64QI_UDI: - case V64QI_FTYPE_V16QI_V64QI_UDI: - case V64QI_FTYPE_QI_V64QI_UDI: - case V32QI_FTYPE_V32QI_V32QI_USI: - case V32QI_FTYPE_V16QI_V32QI_USI: - case V32QI_FTYPE_QI_V32QI_USI: - case 
V16QI_FTYPE_V16QI_V16QI_UHI: - case V16QI_FTYPE_QI_V16QI_UHI: - case V32HI_FTYPE_V8HI_V32HI_USI: - case V32HI_FTYPE_HI_V32HI_USI: - case V16HI_FTYPE_V8HI_V16HI_UHI: - case V16HI_FTYPE_HI_V16HI_UHI: - case V8HI_FTYPE_V8HI_V8HI_UQI: - case V8HI_FTYPE_HI_V8HI_UQI: - case V8SF_FTYPE_V8HI_V8SF_UQI: - case V4SF_FTYPE_V8HI_V4SF_UQI: - case V8SI_FTYPE_V8SF_V8SI_UQI: - case V4SI_FTYPE_V4SF_V4SI_UQI: - case V4DI_FTYPE_V4SF_V4DI_UQI: - case V2DI_FTYPE_V4SF_V2DI_UQI: - case V4SF_FTYPE_V4DI_V4SF_UQI: - case V4SF_FTYPE_V2DI_V4SF_UQI: - case V4DF_FTYPE_V4DI_V4DF_UQI: - case V2DF_FTYPE_V2DI_V2DF_UQI: - case V16QI_FTYPE_V8HI_V16QI_UQI: - case V16QI_FTYPE_V16HI_V16QI_UHI: - case V16QI_FTYPE_V4SI_V16QI_UQI: - case V16QI_FTYPE_V8SI_V16QI_UQI: - case V8HI_FTYPE_V4SI_V8HI_UQI: - case V8HI_FTYPE_V8SI_V8HI_UQI: - case V16QI_FTYPE_V2DI_V16QI_UQI: - case V16QI_FTYPE_V4DI_V16QI_UQI: - case V8HI_FTYPE_V2DI_V8HI_UQI: - case V8HI_FTYPE_V4DI_V8HI_UQI: - case V4SI_FTYPE_V2DI_V4SI_UQI: - case V4SI_FTYPE_V4DI_V4SI_UQI: - case V32QI_FTYPE_V32HI_V32QI_USI: - case UHI_FTYPE_V16QI_V16QI_UHI: - case USI_FTYPE_V32QI_V32QI_USI: - case UDI_FTYPE_V64QI_V64QI_UDI: - case UQI_FTYPE_V8HI_V8HI_UQI: - case UHI_FTYPE_V16HI_V16HI_UHI: - case USI_FTYPE_V32HI_V32HI_USI: - case UQI_FTYPE_V4SI_V4SI_UQI: - case UQI_FTYPE_V8SI_V8SI_UQI: - case UQI_FTYPE_V2DI_V2DI_UQI: - case UQI_FTYPE_V4DI_V4DI_UQI: - case V4SF_FTYPE_V2DF_V4SF_UQI: - case V4SF_FTYPE_V4DF_V4SF_UQI: - case V16SI_FTYPE_V16SI_V16SI_UHI: - case V16SI_FTYPE_V4SI_V16SI_UHI: - case V2DI_FTYPE_V4SI_V2DI_UQI: - case V2DI_FTYPE_V8HI_V2DI_UQI: - case V2DI_FTYPE_V16QI_V2DI_UQI: - case V4DI_FTYPE_V4DI_V4DI_UQI: - case V4DI_FTYPE_V4SI_V4DI_UQI: - case V4DI_FTYPE_V8HI_V4DI_UQI: - case V4DI_FTYPE_V16QI_V4DI_UQI: - case V4DI_FTYPE_V4DF_V4DI_UQI: - case V2DI_FTYPE_V2DF_V2DI_UQI: - case V4SI_FTYPE_V4DF_V4SI_UQI: - case V4SI_FTYPE_V2DF_V4SI_UQI: - case V4SI_FTYPE_V8HI_V4SI_UQI: - case V4SI_FTYPE_V16QI_V4SI_UQI: - case V4DI_FTYPE_V4DI_V4DI_V4DI: - case V8DF_FTYPE_V2DF_V8DF_UQI: - case V8DF_FTYPE_V4DF_V8DF_UQI: - case V8DF_FTYPE_V8DF_V8DF_UQI: - case V8SF_FTYPE_V8SF_V8SF_UQI: - case V8SF_FTYPE_V8SI_V8SF_UQI: - case V4DF_FTYPE_V4DF_V4DF_UQI: - case V4SF_FTYPE_V4SF_V4SF_UQI: - case V2DF_FTYPE_V2DF_V2DF_UQI: - case V2DF_FTYPE_V4SF_V2DF_UQI: - case V2DF_FTYPE_V4SI_V2DF_UQI: - case V4SF_FTYPE_V4SI_V4SF_UQI: - case V4DF_FTYPE_V4SF_V4DF_UQI: - case V4DF_FTYPE_V4SI_V4DF_UQI: - case V8SI_FTYPE_V8SI_V8SI_UQI: - case V8SI_FTYPE_V8HI_V8SI_UQI: - case V8SI_FTYPE_V16QI_V8SI_UQI: - case V8DF_FTYPE_V8SI_V8DF_UQI: - case V8DI_FTYPE_DI_V8DI_UQI: - case V16SF_FTYPE_V8SF_V16SF_UHI: - case V16SI_FTYPE_V8SI_V16SI_UHI: - case V16HI_FTYPE_V16HI_V16HI_UHI: - case V8HI_FTYPE_V16QI_V8HI_UQI: - case V16HI_FTYPE_V16QI_V16HI_UHI: - case V32HI_FTYPE_V32HI_V32HI_USI: - case V32HI_FTYPE_V32QI_V32HI_USI: - case V8DI_FTYPE_V16QI_V8DI_UQI: - case V8DI_FTYPE_V2DI_V8DI_UQI: - case V8DI_FTYPE_V4DI_V8DI_UQI: - case V8DI_FTYPE_V8DI_V8DI_UQI: - case V8DI_FTYPE_V8HI_V8DI_UQI: - case V8DI_FTYPE_V8SI_V8DI_UQI: - case V8HI_FTYPE_V8DI_V8HI_UQI: - case V8SI_FTYPE_V8DI_V8SI_UQI: - case V4SI_FTYPE_V4SI_V4SI_V4SI: - case V16SI_FTYPE_V16SI_V16SI_V16SI: - case V8DI_FTYPE_V8DI_V8DI_V8DI: - case V32HI_FTYPE_V32HI_V32HI_V32HI: - case V2DI_FTYPE_V2DI_V2DI_V2DI: - case V16HI_FTYPE_V16HI_V16HI_V16HI: - case V8SI_FTYPE_V8SI_V8SI_V8SI: - case V8HI_FTYPE_V8HI_V8HI_V8HI: - case V32HI_FTYPE_V16SF_V16SF_USI: - case V16HI_FTYPE_V8SF_V8SF_UHI: - case V8HI_FTYPE_V4SF_V4SF_UQI: - case V16HI_FTYPE_V16SF_V16HI_UHI: - case V8HI_FTYPE_V8SF_V8HI_UQI: - case 
V8HI_FTYPE_V4SF_V8HI_UQI: - case V16SF_FTYPE_V16SF_V32HI_V32HI: - case V8SF_FTYPE_V8SF_V16HI_V16HI: - case V4SF_FTYPE_V4SF_V8HI_V8HI: - nargs = 3; - break; - case V32QI_FTYPE_V32QI_V32QI_INT: - case V16HI_FTYPE_V16HI_V16HI_INT: - case V16QI_FTYPE_V16QI_V16QI_INT: - case V4DI_FTYPE_V4DI_V4DI_INT: - case V8HI_FTYPE_V8HI_V8HI_INT: - case V8SI_FTYPE_V8SI_V8SI_INT: - case V8SI_FTYPE_V8SI_V4SI_INT: - case V8SF_FTYPE_V8SF_V8SF_INT: - case V8SF_FTYPE_V8SF_V4SF_INT: - case V4SI_FTYPE_V4SI_V4SI_INT: - case V4DF_FTYPE_V4DF_V4DF_INT: - case V16SF_FTYPE_V16SF_V16SF_INT: - case V16SF_FTYPE_V16SF_V4SF_INT: - case V16SI_FTYPE_V16SI_V4SI_INT: - case V4DF_FTYPE_V4DF_V2DF_INT: - case V4SF_FTYPE_V4SF_V4SF_INT: - case V2DI_FTYPE_V2DI_V2DI_INT: - case V4DI_FTYPE_V4DI_V2DI_INT: - case V2DF_FTYPE_V2DF_V2DF_INT: - case UQI_FTYPE_V8DI_V8UDI_INT: - case UQI_FTYPE_V8DF_V8DF_INT: - case UQI_FTYPE_V2DF_V2DF_INT: - case UQI_FTYPE_V4SF_V4SF_INT: - case UHI_FTYPE_V16SI_V16SI_INT: - case UHI_FTYPE_V16SF_V16SF_INT: - case V64QI_FTYPE_V64QI_V64QI_INT: - case V32HI_FTYPE_V32HI_V32HI_INT: - case V16SI_FTYPE_V16SI_V16SI_INT: - case V8DI_FTYPE_V8DI_V8DI_INT: - nargs = 3; - nargs_constant = 1; - break; - case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT: - nargs = 3; - rmode = V4DImode; - nargs_constant = 1; - break; - case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT: - nargs = 3; - rmode = V2DImode; - nargs_constant = 1; - break; - case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT: - nargs = 3; - rmode = DImode; - nargs_constant = 1; - break; - case V2DI_FTYPE_V2DI_UINT_UINT: - nargs = 3; - nargs_constant = 2; - break; - case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT: - nargs = 3; - rmode = V8DImode; - nargs_constant = 1; - break; - case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT: - nargs = 5; - rmode = V8DImode; - mask_pos = 2; - nargs_constant = 1; - break; - case QI_FTYPE_V8DF_INT_UQI: - case QI_FTYPE_V4DF_INT_UQI: - case QI_FTYPE_V2DF_INT_UQI: - case HI_FTYPE_V16SF_INT_UHI: - case QI_FTYPE_V8SF_INT_UQI: - case QI_FTYPE_V4SF_INT_UQI: - case V4SI_FTYPE_V4SI_V4SI_UHI: - case V8SI_FTYPE_V8SI_V8SI_UHI: - nargs = 3; - mask_pos = 1; - nargs_constant = 1; - break; - case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT: - nargs = 5; - rmode = V4DImode; - mask_pos = 2; - nargs_constant = 1; - break; - case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT: - nargs = 5; - rmode = V2DImode; - mask_pos = 2; - nargs_constant = 1; - break; - case V32QI_FTYPE_V32QI_V32QI_V32QI_USI: - case V32HI_FTYPE_V32HI_V32HI_V32HI_USI: - case V32HI_FTYPE_V64QI_V64QI_V32HI_USI: - case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI: - case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI: - case V32HI_FTYPE_V32HI_V8HI_V32HI_USI: - case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI: - case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI: - case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI: - case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI: - case V32QI_FTYPE_V16HI_V16HI_V32QI_USI: - case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI: - case V32HI_FTYPE_V16SI_V16SI_V32HI_USI: - case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI: - case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI: - case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI: - case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI: - case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI: - case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI: - case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI: - case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI: - case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI: - case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI: - case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI: - case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI: - case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI: - case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI: - case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI: - case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI: - case 
V16SI_FTYPE_V16SI_V4SI_V16SI_UHI: - case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI: - case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI: - case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI: - case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI: - case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI: - case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI: - case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI: - case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI: - case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI: - case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI: - case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI: - case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI: - case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI: - case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI: - case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI: - case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI: - case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI: - case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI: - case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI: - case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI: - case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI: - case V32HI_FTYPE_V16SF_V16SF_V32HI_USI: - case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI: - case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI: - nargs = 4; - break; - case V2DF_FTYPE_V2DF_V2DF_V2DI_INT: - case V4DF_FTYPE_V4DF_V4DF_V4DI_INT: - case V4SF_FTYPE_V4SF_V4SF_V4SI_INT: - case V8SF_FTYPE_V8SF_V8SF_V8SI_INT: - case V16SF_FTYPE_V16SF_V16SF_V16SI_INT: - nargs = 4; - nargs_constant = 1; - break; - case UQI_FTYPE_V4DI_V4DI_INT_UQI: - case UQI_FTYPE_V8SI_V8SI_INT_UQI: - case QI_FTYPE_V4DF_V4DF_INT_UQI: - case QI_FTYPE_V8SF_V8SF_INT_UQI: - case UQI_FTYPE_V2DI_V2DI_INT_UQI: - case UQI_FTYPE_V4SI_V4SI_INT_UQI: - case UQI_FTYPE_V2DF_V2DF_INT_UQI: - case UQI_FTYPE_V4SF_V4SF_INT_UQI: - case UDI_FTYPE_V64QI_V64QI_INT_UDI: - case USI_FTYPE_V32QI_V32QI_INT_USI: - case UHI_FTYPE_V16QI_V16QI_INT_UHI: - case USI_FTYPE_V32HI_V32HI_INT_USI: - case UHI_FTYPE_V16HI_V16HI_INT_UHI: - case UQI_FTYPE_V8HI_V8HI_INT_UQI: - nargs = 4; - mask_pos = 1; - nargs_constant = 1; - break; - case V2DI_FTYPE_V2DI_V2DI_UINT_UINT: - nargs = 4; - nargs_constant = 2; - break; - case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED: - case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG: - case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI: - case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI: - case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI: - nargs = 4; - break; - case UQI_FTYPE_V8DI_V8DI_INT_UQI: - case UHI_FTYPE_V16SI_V16SI_INT_UHI: - mask_pos = 1; - nargs = 4; - nargs_constant = 1; - break; - case V8SF_FTYPE_V8SF_INT_V8SF_UQI: - case V4SF_FTYPE_V4SF_INT_V4SF_UQI: - case V2DF_FTYPE_V4DF_INT_V2DF_UQI: - case V2DI_FTYPE_V4DI_INT_V2DI_UQI: - case V8SF_FTYPE_V16SF_INT_V8SF_UQI: - case V8SI_FTYPE_V16SI_INT_V8SI_UQI: - case V2DF_FTYPE_V8DF_INT_V2DF_UQI: - case V2DI_FTYPE_V8DI_INT_V2DI_UQI: - case V4SF_FTYPE_V8SF_INT_V4SF_UQI: - case V4SI_FTYPE_V8SI_INT_V4SI_UQI: - case V8HI_FTYPE_V8SF_INT_V8HI_UQI: - case V8HI_FTYPE_V4SF_INT_V8HI_UQI: - case V32HI_FTYPE_V32HI_INT_V32HI_USI: - case V16HI_FTYPE_V16HI_INT_V16HI_UHI: - case V8HI_FTYPE_V8HI_INT_V8HI_UQI: - case V4DI_FTYPE_V4DI_INT_V4DI_UQI: - case V2DI_FTYPE_V2DI_INT_V2DI_UQI: - case V8SI_FTYPE_V8SI_INT_V8SI_UQI: - case V4SI_FTYPE_V4SI_INT_V4SI_UQI: - case V4DF_FTYPE_V4DF_INT_V4DF_UQI: - case V2DF_FTYPE_V2DF_INT_V2DF_UQI: - case V8DF_FTYPE_V8DF_INT_V8DF_UQI: - case V16SF_FTYPE_V16SF_INT_V16SF_UHI: - case V16HI_FTYPE_V16SF_INT_V16HI_UHI: - case V16SI_FTYPE_V16SI_INT_V16SI_UHI: - case V4SI_FTYPE_V16SI_INT_V4SI_UQI: - case V4DI_FTYPE_V8DI_INT_V4DI_UQI: - case V4DF_FTYPE_V8DF_INT_V4DF_UQI: - case V4SF_FTYPE_V16SF_INT_V4SF_UQI: - case V8DI_FTYPE_V8DI_INT_V8DI_UQI: - nargs = 4; - mask_pos = 2; - nargs_constant = 1; - break; - case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI: - case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI: - case 
V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI: - case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI: - case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI: - case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI: - case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI: - case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI: - case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI: - case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI: - case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI: - case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI: - case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI: - case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI: - case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI: - case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI: - case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI: - case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI: - case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI: - case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI: - case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI: - case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI: - case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI: - case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI: - case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI: - case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI: - case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI: - nargs = 5; - mask_pos = 2; - nargs_constant = 1; - break; - case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI: - case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI: - case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI: - case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI: - case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI: - case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI: - case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI: - case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI: - case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI: - case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI: - nargs = 5; - mask_pos = 1; - nargs_constant = 1; - break; - case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI: - case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI: - case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI: - case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT: - case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT: - case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT: - case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT: - case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT: - case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT: - case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT: - case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT: - case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT: - nargs = 5; - mask_pos = 1; - nargs_constant = 2; - break; - - default: - gcc_unreachable (); - } - - gcc_assert (nargs <= ARRAY_SIZE (args)); - - if (comparison != UNKNOWN) - { - gcc_assert (nargs == 2); - return ix86_expand_sse_compare (d, exp, target, swap); - } - - if (rmode == VOIDmode || rmode == tmode) - { - if (optimize - || target == 0 - || GET_MODE (target) != tmode - || !insn_p->operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - else if (memory_operand (target, tmode)) - num_memory++; - real_target = target; - } - else - { - real_target = gen_reg_rtx (tmode); - target = lowpart_subreg (rmode, real_target, tmode); - } - - for (i = 0; i < nargs; i++) - { - tree arg = CALL_EXPR_ARG (exp, i); - rtx op = expand_normal (arg); - machine_mode mode = insn_p->operand[i + 1].mode; - bool match = insn_p->operand[i + 1].predicate (op, mode); - - if (second_arg_count && i == 1) - { - /* SIMD shift insns take either an 8-bit immediate or - register as count. But builtin functions take int as - count. If count doesn't match, we put it in register. - The instructions are using 64-bit count, if op is just - 32-bit, zero-extend it, as negative shift counts - are undefined behavior and zero-extension is more - efficient. 
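
The count handling described just above exists because the same logical shift can take its count either as an 8-bit immediate or in a vector register, while the builtin signature only sees an int that may need widening. A usage sketch of the two source-level forms (assumes SSE2; illustrative only):

#include <emmintrin.h>

/* Count as an immediate.  */
__m128i
shift_left_by_three (__m128i v)
{
  return _mm_slli_epi32 (v, 3);
}

/* Count decided at runtime, carried in the low element of a vector
   register; the expander normalizes whatever count form the builtin
   received.  */
__m128i
shift_left_by_n (__m128i v, int n)
{
  return _mm_sll_epi32 (v, _mm_cvtsi32_si128 (n));
}
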
*/ - if (!match) - { - if (SCALAR_INT_MODE_P (GET_MODE (op))) - op = convert_modes (mode, GET_MODE (op), op, 1); - else - op = lowpart_subreg (mode, op, GET_MODE (op)); - if (!insn_p->operand[i + 1].predicate (op, mode)) - op = copy_to_reg (op); - } - } - else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) || - (!mask_pos && (nargs - i) <= nargs_constant)) - { - if (!match) - switch (icode) - { - case CODE_FOR_avx_vinsertf128v4di: - case CODE_FOR_avx_vextractf128v4di: - error ("the last argument must be an 1-bit immediate"); - return const0_rtx; - - case CODE_FOR_avx512f_cmpv8di3_mask: - case CODE_FOR_avx512f_cmpv16si3_mask: - case CODE_FOR_avx512f_ucmpv8di3_mask: - case CODE_FOR_avx512f_ucmpv16si3_mask: - case CODE_FOR_avx512vl_cmpv4di3_mask: - case CODE_FOR_avx512vl_cmpv8si3_mask: - case CODE_FOR_avx512vl_ucmpv4di3_mask: - case CODE_FOR_avx512vl_ucmpv8si3_mask: - case CODE_FOR_avx512vl_cmpv2di3_mask: - case CODE_FOR_avx512vl_cmpv4si3_mask: - case CODE_FOR_avx512vl_ucmpv2di3_mask: - case CODE_FOR_avx512vl_ucmpv4si3_mask: - error ("the last argument must be a 3-bit immediate"); - return const0_rtx; - - case CODE_FOR_sse4_1_roundsd: - case CODE_FOR_sse4_1_roundss: - - case CODE_FOR_sse4_1_roundpd: - case CODE_FOR_sse4_1_roundps: - case CODE_FOR_avx_roundpd256: - case CODE_FOR_avx_roundps256: - - case CODE_FOR_sse4_1_roundpd_vec_pack_sfix: - case CODE_FOR_sse4_1_roundps_sfix: - case CODE_FOR_avx_roundpd_vec_pack_sfix256: - case CODE_FOR_avx_roundps_sfix256: - - case CODE_FOR_sse4_1_blendps: - case CODE_FOR_avx_blendpd256: - case CODE_FOR_avx_vpermilv4df: - case CODE_FOR_avx_vpermilv4df_mask: - case CODE_FOR_avx512f_getmantv8df_mask: - case CODE_FOR_avx512f_getmantv16sf_mask: - case CODE_FOR_avx512vl_getmantv8sf_mask: - case CODE_FOR_avx512vl_getmantv4df_mask: - case CODE_FOR_avx512vl_getmantv4sf_mask: - case CODE_FOR_avx512vl_getmantv2df_mask: - case CODE_FOR_avx512dq_rangepv8df_mask_round: - case CODE_FOR_avx512dq_rangepv16sf_mask_round: - case CODE_FOR_avx512dq_rangepv4df_mask: - case CODE_FOR_avx512dq_rangepv8sf_mask: - case CODE_FOR_avx512dq_rangepv2df_mask: - case CODE_FOR_avx512dq_rangepv4sf_mask: - case CODE_FOR_avx_shufpd256_mask: - error ("the last argument must be a 4-bit immediate"); - return const0_rtx; - - case CODE_FOR_sha1rnds4: - case CODE_FOR_sse4_1_blendpd: - case CODE_FOR_avx_vpermilv2df: - case CODE_FOR_avx_vpermilv2df_mask: - case CODE_FOR_xop_vpermil2v2df3: - case CODE_FOR_xop_vpermil2v4sf3: - case CODE_FOR_xop_vpermil2v4df3: - case CODE_FOR_xop_vpermil2v8sf3: - case CODE_FOR_avx512f_vinsertf32x4_mask: - case CODE_FOR_avx512f_vinserti32x4_mask: - case CODE_FOR_avx512f_vextractf32x4_mask: - case CODE_FOR_avx512f_vextracti32x4_mask: - case CODE_FOR_sse2_shufpd: - case CODE_FOR_sse2_shufpd_mask: - case CODE_FOR_avx512dq_shuf_f64x2_mask: - case CODE_FOR_avx512dq_shuf_i64x2_mask: - case CODE_FOR_avx512vl_shuf_i32x4_mask: - case CODE_FOR_avx512vl_shuf_f32x4_mask: - error ("the last argument must be a 2-bit immediate"); - return const0_rtx; - - case CODE_FOR_avx_vextractf128v4df: - case CODE_FOR_avx_vextractf128v8sf: - case CODE_FOR_avx_vextractf128v8si: - case CODE_FOR_avx_vinsertf128v4df: - case CODE_FOR_avx_vinsertf128v8sf: - case CODE_FOR_avx_vinsertf128v8si: - case CODE_FOR_avx512f_vinsertf64x4_mask: - case CODE_FOR_avx512f_vinserti64x4_mask: - case CODE_FOR_avx512f_vextractf64x4_mask: - case CODE_FOR_avx512f_vextracti64x4_mask: - case CODE_FOR_avx512dq_vinsertf32x8_mask: - case CODE_FOR_avx512dq_vinserti32x8_mask: - case CODE_FOR_avx512vl_vinsertv4df: - case 
CODE_FOR_avx512vl_vinsertv4di: - case CODE_FOR_avx512vl_vinsertv8sf: - case CODE_FOR_avx512vl_vinsertv8si: - error ("the last argument must be a 1-bit immediate"); - return const0_rtx; - - case CODE_FOR_avx_vmcmpv2df3: - case CODE_FOR_avx_vmcmpv4sf3: - case CODE_FOR_avx_cmpv2df3: - case CODE_FOR_avx_cmpv4sf3: - case CODE_FOR_avx_cmpv4df3: - case CODE_FOR_avx_cmpv8sf3: - case CODE_FOR_avx512f_cmpv8df3_mask: - case CODE_FOR_avx512f_cmpv16sf3_mask: - case CODE_FOR_avx512f_vmcmpv2df3_mask: - case CODE_FOR_avx512f_vmcmpv4sf3_mask: - error ("the last argument must be a 5-bit immediate"); - return const0_rtx; - - default: - switch (nargs_constant) - { - case 2: - if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) || - (!mask_pos && (nargs - i) == nargs_constant)) - { - error ("the next to last argument must be an 8-bit immediate"); - break; - } - /* FALLTHRU */ - case 1: - error ("the last argument must be an 8-bit immediate"); - break; - default: - gcc_unreachable (); - } - return const0_rtx; - } - } - else - { - if (VECTOR_MODE_P (mode)) - op = safe_vector_operand (op, mode); - - /* If we aren't optimizing, only allow one memory operand to - be generated. */ - if (memory_operand (op, mode)) - num_memory++; - - op = fixup_modeless_constant (op, mode); - - if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) - { - if (optimize || !match || num_memory > 1) - op = copy_to_mode_reg (mode, op); - } - else - { - op = copy_to_reg (op); - op = lowpart_subreg (mode, op, GET_MODE (op)); - } - } - - args[i].op = op; - args[i].mode = mode; - } - - switch (nargs) - { - case 1: - pat = GEN_FCN (icode) (real_target, args[0].op); - break; - case 2: - pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op); - break; - case 3: - pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, - args[2].op); - break; - case 4: - pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, - args[2].op, args[3].op); - break; - case 5: - pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, - args[2].op, args[3].op, args[4].op); - break; - case 6: - pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, - args[2].op, args[3].op, args[4].op, - args[5].op); - break; - default: - gcc_unreachable (); - } - - if (! pat) - return 0; - - emit_insn (pat); - return target; -} - -/* Transform pattern of following layout: - (set A - (unspec [B C] UNSPEC_EMBEDDED_ROUNDING)) - ) - into: - (set (A B)) */ - -static rtx -ix86_erase_embedded_rounding (rtx pat) -{ - if (GET_CODE (pat) == INSN) - pat = PATTERN (pat); - - gcc_assert (GET_CODE (pat) == SET); - rtx src = SET_SRC (pat); - gcc_assert (XVECLEN (src, 0) == 2); - rtx p0 = XVECEXP (src, 0, 0); - gcc_assert (GET_CODE (src) == UNSPEC - && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING); - rtx res = gen_rtx_SET (SET_DEST (pat), p0); - return res; -} - -/* Subroutine of ix86_expand_round_builtin to take care of comi insns - with rounding. */ -static rtx -ix86_expand_sse_comi_round (const struct builtin_description *d, - tree exp, rtx target) -{ - rtx pat, set_dst; - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree arg1 = CALL_EXPR_ARG (exp, 1); - tree arg2 = CALL_EXPR_ARG (exp, 2); - tree arg3 = CALL_EXPR_ARG (exp, 3); - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - rtx op2 = expand_normal (arg2); - rtx op3 = expand_normal (arg3); - enum insn_code icode = d->icode; - const struct insn_data_d *insn_p = &insn_data[icode]; - machine_mode mode0 = insn_p->operand[0].mode; - machine_mode mode1 = insn_p->operand[1].mode; - - /* See avxintrin.h for values. 
*/ - static const enum rtx_code comparisons[32] = - { - EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED, - UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED, - EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED, - UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED - }; - static const bool ordereds[32] = - { - true, true, true, false, false, false, false, true, - false, false, false, true, true, true, true, false, - true, true, true, false, false, false, false, true, - false, false, false, true, true, true, true, false - }; - static const bool non_signalings[32] = - { - true, false, false, true, true, false, false, true, - true, false, false, true, true, false, false, true, - false, true, true, false, false, true, true, false, - false, true, true, false, false, true, true, false - }; - - if (!CONST_INT_P (op2)) - { - error ("the third argument must be comparison constant"); - return const0_rtx; - } - if (INTVAL (op2) < 0 || INTVAL (op2) >= 32) - { - error ("incorrect comparison mode"); - return const0_rtx; - } - - if (!insn_p->operand[2].predicate (op3, SImode)) - { - error ("incorrect rounding operand"); - return const0_rtx; - } - - if (VECTOR_MODE_P (mode0)) - op0 = safe_vector_operand (op0, mode0); - if (VECTOR_MODE_P (mode1)) - op1 = safe_vector_operand (op1, mode1); - - enum rtx_code comparison = comparisons[INTVAL (op2)]; - bool ordered = ordereds[INTVAL (op2)]; - bool non_signaling = non_signalings[INTVAL (op2)]; - rtx const_val = const0_rtx; - - bool check_unordered = false; - machine_mode mode = CCFPmode; - switch (comparison) - { - case ORDERED: - if (!ordered) - { - /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */ - if (!non_signaling) - ordered = true; - mode = CCSmode; - } - else - { - /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */ - if (non_signaling) - ordered = false; - mode = CCPmode; - } - comparison = NE; - break; - case UNORDERED: - if (ordered) - { - /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */ - if (non_signaling) - ordered = false; - mode = CCSmode; - } - else - { - /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */ - if (!non_signaling) - ordered = true; - mode = CCPmode; - } - comparison = EQ; - break; - - case LE: /* -> GE */ - case LT: /* -> GT */ - case UNGE: /* -> UNLE */ - case UNGT: /* -> UNLT */ - std::swap (op0, op1); - comparison = swap_condition (comparison); - /* FALLTHRU */ - case GT: - case GE: - case UNEQ: - case UNLT: - case UNLE: - case LTGT: - /* These are supported by CCFPmode. NB: Use ordered/signaling - COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF - with NAN operands. */ - if (ordered == non_signaling) - ordered = !ordered; - break; - case EQ: - /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for - _CMP_EQ_OQ/_CMP_EQ_OS. */ - check_unordered = true; - mode = CCZmode; - break; - case NE: - /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for - _CMP_NEQ_UQ/_CMP_NEQ_US. */ - gcc_assert (!ordered); - check_unordered = true; - mode = CCZmode; - const_val = const1_rtx; - break; - default: - gcc_unreachable (); - } - - target = gen_reg_rtx (SImode); - emit_move_insn (target, const_val); - target = gen_rtx_SUBREG (QImode, target, 0); - - if ((optimize && !register_operand (op0, mode0)) - || !insn_p->operand[0].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if ((optimize && !register_operand (op1, mode1)) - || !insn_p->operand[1].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - - /* - 1. COMI: ordered and signaling. - 2. UCOMI: unordered and non-signaling. 
- */ - if (non_signaling) - icode = (icode == CODE_FOR_sse_comi_round - ? CODE_FOR_sse_ucomi_round - : CODE_FOR_sse2_ucomi_round); - - pat = GEN_FCN (icode) (op0, op1, op3); - if (! pat) - return 0; - - /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */ - if (INTVAL (op3) == NO_ROUND) - { - pat = ix86_erase_embedded_rounding (pat); - if (! pat) - return 0; - - set_dst = SET_DEST (pat); - } - else - { - gcc_assert (GET_CODE (pat) == SET); - set_dst = SET_DEST (pat); - } - - emit_insn (pat); - - rtx_code_label *label = NULL; - - /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient - with NAN operands. */ - if (check_unordered) - { - gcc_assert (comparison == EQ || comparison == NE); - - rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG); - label = gen_label_rtx (); - rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx); - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, - gen_rtx_LABEL_REF (VOIDmode, label), - pc_rtx); - emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - } - - /* NB: Set CCFPmode and check a different CCmode which is in subset - of CCFPmode. */ - if (GET_MODE (set_dst) != mode) - { - gcc_assert (mode == CCAmode || mode == CCCmode - || mode == CCOmode || mode == CCPmode - || mode == CCSmode || mode == CCZmode); - set_dst = gen_rtx_REG (mode, FLAGS_REG); - } - - emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), - gen_rtx_fmt_ee (comparison, QImode, - set_dst, - const0_rtx))); - - if (label) - emit_label (label); - - return SUBREG_REG (target); -} - -static rtx -ix86_expand_round_builtin (const struct builtin_description *d, - tree exp, rtx target) -{ - rtx pat; - unsigned int i, nargs; - struct - { - rtx op; - machine_mode mode; - } args[6]; - enum insn_code icode = d->icode; - const struct insn_data_d *insn_p = &insn_data[icode]; - machine_mode tmode = insn_p->operand[0].mode; - unsigned int nargs_constant = 0; - unsigned int redundant_embed_rnd = 0; - - switch ((enum ix86_builtin_func_type) d->flag) - { - case UINT64_FTYPE_V2DF_INT: - case UINT64_FTYPE_V4SF_INT: - case UINT_FTYPE_V2DF_INT: - case UINT_FTYPE_V4SF_INT: - case INT64_FTYPE_V2DF_INT: - case INT64_FTYPE_V4SF_INT: - case INT_FTYPE_V2DF_INT: - case INT_FTYPE_V4SF_INT: - nargs = 2; - break; - case V4SF_FTYPE_V4SF_UINT_INT: - case V4SF_FTYPE_V4SF_UINT64_INT: - case V2DF_FTYPE_V2DF_UINT64_INT: - case V4SF_FTYPE_V4SF_INT_INT: - case V4SF_FTYPE_V4SF_INT64_INT: - case V2DF_FTYPE_V2DF_INT64_INT: - case V4SF_FTYPE_V4SF_V4SF_INT: - case V2DF_FTYPE_V2DF_V2DF_INT: - case V4SF_FTYPE_V4SF_V2DF_INT: - case V2DF_FTYPE_V2DF_V4SF_INT: - nargs = 3; - break; - case V8SF_FTYPE_V8DF_V8SF_QI_INT: - case V8DF_FTYPE_V8DF_V8DF_QI_INT: - case V8SI_FTYPE_V8DF_V8SI_QI_INT: - case V8DI_FTYPE_V8DF_V8DI_QI_INT: - case V8SF_FTYPE_V8DI_V8SF_QI_INT: - case V8DF_FTYPE_V8DI_V8DF_QI_INT: - case V16SF_FTYPE_V16SF_V16SF_HI_INT: - case V8DI_FTYPE_V8SF_V8DI_QI_INT: - case V16SF_FTYPE_V16SI_V16SF_HI_INT: - case V16SI_FTYPE_V16SF_V16SI_HI_INT: - case V8DF_FTYPE_V8SF_V8DF_QI_INT: - case V16SF_FTYPE_V16HI_V16SF_HI_INT: - case V2DF_FTYPE_V2DF_V2DF_V2DF_INT: - case V4SF_FTYPE_V4SF_V4SF_V4SF_INT: - nargs = 4; - break; - case V4SF_FTYPE_V4SF_V4SF_INT_INT: - case V2DF_FTYPE_V2DF_V2DF_INT_INT: - nargs_constant = 2; - nargs = 4; - break; - case INT_FTYPE_V4SF_V4SF_INT_INT: - case INT_FTYPE_V2DF_V2DF_INT_INT: - return ix86_expand_sse_comi_round (d, exp, target); - case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT: - case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT: - case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT: - case 
V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT: - case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT: - case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT: - case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT: - case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT: - nargs = 5; - break; - case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT: - case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT: - nargs_constant = 4; - nargs = 5; - break; - case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT: - case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT: - case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT: - case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT: - nargs_constant = 3; - nargs = 5; - break; - case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT: - case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT: - case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT: - case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT: - case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT: - case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT: - nargs = 6; - nargs_constant = 4; - break; - case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT: - case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT: - case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT: - case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT: - nargs = 6; - nargs_constant = 3; - break; - default: - gcc_unreachable (); - } - gcc_assert (nargs <= ARRAY_SIZE (args)); - - if (optimize - || target == 0 - || GET_MODE (target) != tmode - || !insn_p->operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - - for (i = 0; i < nargs; i++) - { - tree arg = CALL_EXPR_ARG (exp, i); - rtx op = expand_normal (arg); - machine_mode mode = insn_p->operand[i + 1].mode; - bool match = insn_p->operand[i + 1].predicate (op, mode); - - if (i == nargs - nargs_constant) - { - if (!match) - { - switch (icode) - { - case CODE_FOR_avx512f_getmantv8df_mask_round: - case CODE_FOR_avx512f_getmantv16sf_mask_round: - case CODE_FOR_avx512f_vgetmantv2df_round: - case CODE_FOR_avx512f_vgetmantv2df_mask_round: - case CODE_FOR_avx512f_vgetmantv4sf_round: - case CODE_FOR_avx512f_vgetmantv4sf_mask_round: - error ("the immediate argument must be a 4-bit immediate"); - return const0_rtx; - case CODE_FOR_avx512f_cmpv8df3_mask_round: - case CODE_FOR_avx512f_cmpv16sf3_mask_round: - case CODE_FOR_avx512f_vmcmpv2df3_mask_round: - case CODE_FOR_avx512f_vmcmpv4sf3_mask_round: - error ("the immediate argument must be a 5-bit immediate"); - return const0_rtx; - default: - error ("the immediate argument must be an 8-bit immediate"); - return const0_rtx; - } - } - } - else if (i == nargs-1) - { - if (!insn_p->operand[nargs].predicate (op, SImode)) - { - error ("incorrect rounding operand"); - return const0_rtx; - } - - /* If there is no rounding use normal version of the pattern. 
*/ - if (INTVAL (op) == NO_ROUND) - redundant_embed_rnd = 1; - } - else - { - if (VECTOR_MODE_P (mode)) - op = safe_vector_operand (op, mode); - - op = fixup_modeless_constant (op, mode); - - if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) - { - if (optimize || !match) - op = copy_to_mode_reg (mode, op); - } - else - { - op = copy_to_reg (op); - op = lowpart_subreg (mode, op, GET_MODE (op)); - } - } - - args[i].op = op; - args[i].mode = mode; - } - - switch (nargs) - { - case 1: - pat = GEN_FCN (icode) (target, args[0].op); - break; - case 2: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op); - break; - case 3: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op, - args[2].op); - break; - case 4: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op, - args[2].op, args[3].op); - break; - case 5: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op, - args[2].op, args[3].op, args[4].op); - break; - case 6: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op, - args[2].op, args[3].op, args[4].op, - args[5].op); - break; - default: - gcc_unreachable (); - } - - if (!pat) - return 0; - - if (redundant_embed_rnd) - pat = ix86_erase_embedded_rounding (pat); - - emit_insn (pat); - return target; -} - -/* Subroutine of ix86_expand_builtin to take care of special insns - with variable number of operands. */ - -static rtx -ix86_expand_special_args_builtin (const struct builtin_description *d, - tree exp, rtx target) -{ - tree arg; - rtx pat, op; - unsigned int i, nargs, arg_adjust, memory; - bool aligned_mem = false; - struct - { - rtx op; - machine_mode mode; - } args[3]; - enum insn_code icode = d->icode; - bool last_arg_constant = false; - const struct insn_data_d *insn_p = &insn_data[icode]; - machine_mode tmode = insn_p->operand[0].mode; - enum { load, store } klass; - - switch ((enum ix86_builtin_func_type) d->flag) - { - case VOID_FTYPE_VOID: - emit_insn (GEN_FCN (icode) (target)); - return 0; - case VOID_FTYPE_UINT64: - case VOID_FTYPE_UNSIGNED: - nargs = 0; - klass = store; - memory = 0; - break; - - case INT_FTYPE_VOID: - case USHORT_FTYPE_VOID: - case UINT64_FTYPE_VOID: - case UINT_FTYPE_VOID: - case UNSIGNED_FTYPE_VOID: - nargs = 0; - klass = load; - memory = 0; - break; - case UINT64_FTYPE_PUNSIGNED: - case V2DI_FTYPE_PV2DI: - case V4DI_FTYPE_PV4DI: - case V32QI_FTYPE_PCCHAR: - case V16QI_FTYPE_PCCHAR: - case V8SF_FTYPE_PCV4SF: - case V8SF_FTYPE_PCFLOAT: - case V4SF_FTYPE_PCFLOAT: - case V4DF_FTYPE_PCV2DF: - case V4DF_FTYPE_PCDOUBLE: - case V2DF_FTYPE_PCDOUBLE: - case VOID_FTYPE_PVOID: - case V8DI_FTYPE_PV8DI: - nargs = 1; - klass = load; - memory = 0; - switch (icode) - { - case CODE_FOR_sse4_1_movntdqa: - case CODE_FOR_avx2_movntdqa: - case CODE_FOR_avx512f_movntdqa: - aligned_mem = true; - break; - default: - break; - } - break; - case VOID_FTYPE_PV2SF_V4SF: - case VOID_FTYPE_PV8DI_V8DI: - case VOID_FTYPE_PV4DI_V4DI: - case VOID_FTYPE_PV2DI_V2DI: - case VOID_FTYPE_PCHAR_V32QI: - case VOID_FTYPE_PCHAR_V16QI: - case VOID_FTYPE_PFLOAT_V16SF: - case VOID_FTYPE_PFLOAT_V8SF: - case VOID_FTYPE_PFLOAT_V4SF: - case VOID_FTYPE_PDOUBLE_V8DF: - case VOID_FTYPE_PDOUBLE_V4DF: - case VOID_FTYPE_PDOUBLE_V2DF: - case VOID_FTYPE_PLONGLONG_LONGLONG: - case VOID_FTYPE_PULONGLONG_ULONGLONG: - case VOID_FTYPE_PUNSIGNED_UNSIGNED: - case VOID_FTYPE_PINT_INT: - nargs = 1; - klass = store; - /* Reserve memory operand for target. */ - memory = ARRAY_SIZE (args); - switch (icode) - { - /* These builtins and instructions require the memory - to be properly aligned. 
*/ - case CODE_FOR_avx_movntv4di: - case CODE_FOR_sse2_movntv2di: - case CODE_FOR_avx_movntv8sf: - case CODE_FOR_sse_movntv4sf: - case CODE_FOR_sse4a_vmmovntv4sf: - case CODE_FOR_avx_movntv4df: - case CODE_FOR_sse2_movntv2df: - case CODE_FOR_sse4a_vmmovntv2df: - case CODE_FOR_sse2_movntidi: - case CODE_FOR_sse_movntq: - case CODE_FOR_sse2_movntisi: - case CODE_FOR_avx512f_movntv16sf: - case CODE_FOR_avx512f_movntv8df: - case CODE_FOR_avx512f_movntv8di: - aligned_mem = true; - break; - default: - break; - } - break; - case VOID_FTYPE_PVOID_PCVOID: - nargs = 1; - klass = store; - memory = 0; - - break; - case V4SF_FTYPE_V4SF_PCV2SF: - case V2DF_FTYPE_V2DF_PCDOUBLE: - nargs = 2; - klass = load; - memory = 1; - break; - case V8SF_FTYPE_PCV8SF_V8SI: - case V4DF_FTYPE_PCV4DF_V4DI: - case V4SF_FTYPE_PCV4SF_V4SI: - case V2DF_FTYPE_PCV2DF_V2DI: - case V8SI_FTYPE_PCV8SI_V8SI: - case V4DI_FTYPE_PCV4DI_V4DI: - case V4SI_FTYPE_PCV4SI_V4SI: - case V2DI_FTYPE_PCV2DI_V2DI: - case VOID_FTYPE_INT_INT64: - nargs = 2; - klass = load; - memory = 0; - break; - case VOID_FTYPE_PV8DF_V8DF_UQI: - case VOID_FTYPE_PV4DF_V4DF_UQI: - case VOID_FTYPE_PV2DF_V2DF_UQI: - case VOID_FTYPE_PV16SF_V16SF_UHI: - case VOID_FTYPE_PV8SF_V8SF_UQI: - case VOID_FTYPE_PV4SF_V4SF_UQI: - case VOID_FTYPE_PV8DI_V8DI_UQI: - case VOID_FTYPE_PV4DI_V4DI_UQI: - case VOID_FTYPE_PV2DI_V2DI_UQI: - case VOID_FTYPE_PV16SI_V16SI_UHI: - case VOID_FTYPE_PV8SI_V8SI_UQI: - case VOID_FTYPE_PV4SI_V4SI_UQI: - case VOID_FTYPE_PV64QI_V64QI_UDI: - case VOID_FTYPE_PV32HI_V32HI_USI: - case VOID_FTYPE_PV32QI_V32QI_USI: - case VOID_FTYPE_PV16QI_V16QI_UHI: - case VOID_FTYPE_PV16HI_V16HI_UHI: - case VOID_FTYPE_PV8HI_V8HI_UQI: - switch (icode) - { - /* These builtins and instructions require the memory - to be properly aligned. 
*/ - case CODE_FOR_avx512f_storev16sf_mask: - case CODE_FOR_avx512f_storev16si_mask: - case CODE_FOR_avx512f_storev8df_mask: - case CODE_FOR_avx512f_storev8di_mask: - case CODE_FOR_avx512vl_storev8sf_mask: - case CODE_FOR_avx512vl_storev8si_mask: - case CODE_FOR_avx512vl_storev4df_mask: - case CODE_FOR_avx512vl_storev4di_mask: - case CODE_FOR_avx512vl_storev4sf_mask: - case CODE_FOR_avx512vl_storev4si_mask: - case CODE_FOR_avx512vl_storev2df_mask: - case CODE_FOR_avx512vl_storev2di_mask: - aligned_mem = true; - break; - default: - break; - } - /* FALLTHRU */ - case VOID_FTYPE_PV8SF_V8SI_V8SF: - case VOID_FTYPE_PV4DF_V4DI_V4DF: - case VOID_FTYPE_PV4SF_V4SI_V4SF: - case VOID_FTYPE_PV2DF_V2DI_V2DF: - case VOID_FTYPE_PV8SI_V8SI_V8SI: - case VOID_FTYPE_PV4DI_V4DI_V4DI: - case VOID_FTYPE_PV4SI_V4SI_V4SI: - case VOID_FTYPE_PV2DI_V2DI_V2DI: - case VOID_FTYPE_PV8SI_V8DI_UQI: - case VOID_FTYPE_PV8HI_V8DI_UQI: - case VOID_FTYPE_PV16HI_V16SI_UHI: - case VOID_FTYPE_PUDI_V8DI_UQI: - case VOID_FTYPE_PV16QI_V16SI_UHI: - case VOID_FTYPE_PV4SI_V4DI_UQI: - case VOID_FTYPE_PUDI_V2DI_UQI: - case VOID_FTYPE_PUDI_V4DI_UQI: - case VOID_FTYPE_PUSI_V2DI_UQI: - case VOID_FTYPE_PV8HI_V8SI_UQI: - case VOID_FTYPE_PUDI_V4SI_UQI: - case VOID_FTYPE_PUSI_V4DI_UQI: - case VOID_FTYPE_PUHI_V2DI_UQI: - case VOID_FTYPE_PUDI_V8SI_UQI: - case VOID_FTYPE_PUSI_V4SI_UQI: - case VOID_FTYPE_PCHAR_V64QI_UDI: - case VOID_FTYPE_PCHAR_V32QI_USI: - case VOID_FTYPE_PCHAR_V16QI_UHI: - case VOID_FTYPE_PSHORT_V32HI_USI: - case VOID_FTYPE_PSHORT_V16HI_UHI: - case VOID_FTYPE_PSHORT_V8HI_UQI: - case VOID_FTYPE_PINT_V16SI_UHI: - case VOID_FTYPE_PINT_V8SI_UQI: - case VOID_FTYPE_PINT_V4SI_UQI: - case VOID_FTYPE_PINT64_V8DI_UQI: - case VOID_FTYPE_PINT64_V4DI_UQI: - case VOID_FTYPE_PINT64_V2DI_UQI: - case VOID_FTYPE_PDOUBLE_V8DF_UQI: - case VOID_FTYPE_PDOUBLE_V4DF_UQI: - case VOID_FTYPE_PDOUBLE_V2DF_UQI: - case VOID_FTYPE_PFLOAT_V16SF_UHI: - case VOID_FTYPE_PFLOAT_V8SF_UQI: - case VOID_FTYPE_PFLOAT_V4SF_UQI: - case VOID_FTYPE_PV32QI_V32HI_USI: - case VOID_FTYPE_PV16QI_V16HI_UHI: - case VOID_FTYPE_PUDI_V8HI_UQI: - nargs = 2; - klass = store; - /* Reserve memory operand for target. */ - memory = ARRAY_SIZE (args); - break; - case V4SF_FTYPE_PCV4SF_V4SF_UQI: - case V8SF_FTYPE_PCV8SF_V8SF_UQI: - case V16SF_FTYPE_PCV16SF_V16SF_UHI: - case V4SI_FTYPE_PCV4SI_V4SI_UQI: - case V8SI_FTYPE_PCV8SI_V8SI_UQI: - case V16SI_FTYPE_PCV16SI_V16SI_UHI: - case V2DF_FTYPE_PCV2DF_V2DF_UQI: - case V4DF_FTYPE_PCV4DF_V4DF_UQI: - case V8DF_FTYPE_PCV8DF_V8DF_UQI: - case V2DI_FTYPE_PCV2DI_V2DI_UQI: - case V4DI_FTYPE_PCV4DI_V4DI_UQI: - case V8DI_FTYPE_PCV8DI_V8DI_UQI: - case V64QI_FTYPE_PCV64QI_V64QI_UDI: - case V32HI_FTYPE_PCV32HI_V32HI_USI: - case V32QI_FTYPE_PCV32QI_V32QI_USI: - case V16QI_FTYPE_PCV16QI_V16QI_UHI: - case V16HI_FTYPE_PCV16HI_V16HI_UHI: - case V8HI_FTYPE_PCV8HI_V8HI_UQI: - switch (icode) - { - /* These builtins and instructions require the memory - to be properly aligned. 
*/ - case CODE_FOR_avx512f_loadv16sf_mask: - case CODE_FOR_avx512f_loadv16si_mask: - case CODE_FOR_avx512f_loadv8df_mask: - case CODE_FOR_avx512f_loadv8di_mask: - case CODE_FOR_avx512vl_loadv8sf_mask: - case CODE_FOR_avx512vl_loadv8si_mask: - case CODE_FOR_avx512vl_loadv4df_mask: - case CODE_FOR_avx512vl_loadv4di_mask: - case CODE_FOR_avx512vl_loadv4sf_mask: - case CODE_FOR_avx512vl_loadv4si_mask: - case CODE_FOR_avx512vl_loadv2df_mask: - case CODE_FOR_avx512vl_loadv2di_mask: - case CODE_FOR_avx512bw_loadv64qi_mask: - case CODE_FOR_avx512vl_loadv32qi_mask: - case CODE_FOR_avx512vl_loadv16qi_mask: - case CODE_FOR_avx512bw_loadv32hi_mask: - case CODE_FOR_avx512vl_loadv16hi_mask: - case CODE_FOR_avx512vl_loadv8hi_mask: - aligned_mem = true; - break; - default: - break; - } - /* FALLTHRU */ - case V64QI_FTYPE_PCCHAR_V64QI_UDI: - case V32QI_FTYPE_PCCHAR_V32QI_USI: - case V16QI_FTYPE_PCCHAR_V16QI_UHI: - case V32HI_FTYPE_PCSHORT_V32HI_USI: - case V16HI_FTYPE_PCSHORT_V16HI_UHI: - case V8HI_FTYPE_PCSHORT_V8HI_UQI: - case V16SI_FTYPE_PCINT_V16SI_UHI: - case V8SI_FTYPE_PCINT_V8SI_UQI: - case V4SI_FTYPE_PCINT_V4SI_UQI: - case V8DI_FTYPE_PCINT64_V8DI_UQI: - case V4DI_FTYPE_PCINT64_V4DI_UQI: - case V2DI_FTYPE_PCINT64_V2DI_UQI: - case V8DF_FTYPE_PCDOUBLE_V8DF_UQI: - case V4DF_FTYPE_PCDOUBLE_V4DF_UQI: - case V2DF_FTYPE_PCDOUBLE_V2DF_UQI: - case V16SF_FTYPE_PCFLOAT_V16SF_UHI: - case V8SF_FTYPE_PCFLOAT_V8SF_UQI: - case V4SF_FTYPE_PCFLOAT_V4SF_UQI: - nargs = 3; - klass = load; - memory = 0; - break; - case VOID_FTYPE_UINT_UINT_UINT: - case VOID_FTYPE_UINT64_UINT_UINT: - case UCHAR_FTYPE_UINT_UINT_UINT: - case UCHAR_FTYPE_UINT64_UINT_UINT: - nargs = 3; - klass = load; - memory = ARRAY_SIZE (args); - last_arg_constant = true; - break; - default: - gcc_unreachable (); - } - - gcc_assert (nargs <= ARRAY_SIZE (args)); - - if (klass == store) - { - arg = CALL_EXPR_ARG (exp, 0); - op = expand_normal (arg); - gcc_assert (target == 0); - if (memory) - { - op = ix86_zero_extend_to_Pmode (op); - target = gen_rtx_MEM (tmode, op); - /* target at this point has just BITS_PER_UNIT MEM_ALIGN - on it. Try to improve it using get_pointer_alignment, - and if the special builtin is one that requires strict - mode alignment, also from it's GET_MODE_ALIGNMENT. - Failure to do so could lead to ix86_legitimate_combined_insn - rejecting all changes to such insns. */ - unsigned int align = get_pointer_alignment (arg); - if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode)) - align = GET_MODE_ALIGNMENT (tmode); - if (MEM_ALIGN (target) < align) - set_mem_align (target, align); - } - else - target = force_reg (tmode, op); - arg_adjust = 1; - } - else - { - arg_adjust = 0; - if (optimize - || target == 0 - || !register_operand (target, tmode) - || GET_MODE (target) != tmode) - target = gen_reg_rtx (tmode); - } - - for (i = 0; i < nargs; i++) - { - machine_mode mode = insn_p->operand[i + 1].mode; - bool match; - - arg = CALL_EXPR_ARG (exp, i + arg_adjust); - op = expand_normal (arg); - match = insn_p->operand[i + 1].predicate (op, mode); - - if (last_arg_constant && (i + 1) == nargs) - { - if (!match) - { - if (icode == CODE_FOR_lwp_lwpvalsi3 - || icode == CODE_FOR_lwp_lwpinssi3 - || icode == CODE_FOR_lwp_lwpvaldi3 - || icode == CODE_FOR_lwp_lwpinsdi3) - error ("the last argument must be a 32-bit immediate"); - else - error ("the last argument must be an 8-bit immediate"); - return const0_rtx; - } - } - else - { - if (i == memory) - { - /* This must be the memory operand. 
*/ - op = ix86_zero_extend_to_Pmode (op); - op = gen_rtx_MEM (mode, op); - /* op at this point has just BITS_PER_UNIT MEM_ALIGN - on it. Try to improve it using get_pointer_alignment, - and if the special builtin is one that requires strict - mode alignment, also from it's GET_MODE_ALIGNMENT. - Failure to do so could lead to ix86_legitimate_combined_insn - rejecting all changes to such insns. */ - unsigned int align = get_pointer_alignment (arg); - if (aligned_mem && align < GET_MODE_ALIGNMENT (mode)) - align = GET_MODE_ALIGNMENT (mode); - if (MEM_ALIGN (op) < align) - set_mem_align (op, align); - } - else - { - /* This must be register. */ - if (VECTOR_MODE_P (mode)) - op = safe_vector_operand (op, mode); - - op = fixup_modeless_constant (op, mode); - - if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) - op = copy_to_mode_reg (mode, op); - else - { - op = copy_to_reg (op); - op = lowpart_subreg (mode, op, GET_MODE (op)); - } - } - } - - args[i].op = op; - args[i].mode = mode; - } - - switch (nargs) - { - case 0: - pat = GEN_FCN (icode) (target); - break; - case 1: - pat = GEN_FCN (icode) (target, args[0].op); - break; - case 2: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op); - break; - case 3: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op); - break; - default: - gcc_unreachable (); - } - - if (! pat) - return 0; - emit_insn (pat); - return klass == store ? 0 : target; -} - -/* Return the integer constant in ARG. Constrain it to be in the range - of the subparts of VEC_TYPE; issue an error if not. */ - -static int -get_element_number (tree vec_type, tree arg) -{ - unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1; - - if (!tree_fits_uhwi_p (arg) - || (elt = tree_to_uhwi (arg), elt > max)) - { - error ("selector must be an integer constant in the range " - "[0, %wi]", max); - return 0; - } - - return elt; -} - -/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around - ix86_expand_vector_init. We DO have language-level syntax for this, in - the form of (type){ init-list }. Except that since we can't place emms - instructions from inside the compiler, we can't allow the use of MMX - registers unless the user explicitly asks for it. So we do *not* define - vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead - we have builtins invoked by mmintrin.h that gives us license to emit - these sorts of instructions. */ - -static rtx -ix86_expand_vec_init_builtin (tree type, tree exp, rtx target) -{ - machine_mode tmode = TYPE_MODE (type); - machine_mode inner_mode = GET_MODE_INNER (tmode); - int i, n_elt = GET_MODE_NUNITS (tmode); - rtvec v = rtvec_alloc (n_elt); - - gcc_assert (VECTOR_MODE_P (tmode)); - gcc_assert (call_expr_nargs (exp) == n_elt); - - for (i = 0; i < n_elt; ++i) - { - rtx x = expand_normal (CALL_EXPR_ARG (exp, i)); - RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x); - } - - if (!target || !register_operand (target, tmode)) - target = gen_reg_rtx (tmode); - - ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v)); - return target; -} - -/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around - ix86_expand_vector_extract. They would be redundant (for non-MMX) if we - had a language-level syntax for referencing vector elements. 
*/ - -static rtx -ix86_expand_vec_ext_builtin (tree exp, rtx target) -{ - machine_mode tmode, mode0; - tree arg0, arg1; - int elt; - rtx op0; - - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - - op0 = expand_normal (arg0); - elt = get_element_number (TREE_TYPE (arg0), arg1); - - tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); - mode0 = TYPE_MODE (TREE_TYPE (arg0)); - gcc_assert (VECTOR_MODE_P (mode0)); - - op0 = force_reg (mode0, op0); - - if (optimize || !target || !register_operand (target, tmode)) - target = gen_reg_rtx (tmode); - - ix86_expand_vector_extract (true, target, op0, elt); - - return target; -} - -/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around - ix86_expand_vector_set. They would be redundant (for non-MMX) if we had - a language-level syntax for referencing vector elements. */ - -static rtx -ix86_expand_vec_set_builtin (tree exp) -{ - machine_mode tmode, mode1; - tree arg0, arg1, arg2; - int elt; - rtx op0, op1, target; - - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - arg2 = CALL_EXPR_ARG (exp, 2); - - tmode = TYPE_MODE (TREE_TYPE (arg0)); - mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); - gcc_assert (VECTOR_MODE_P (tmode)); - - op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL); - op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL); - elt = get_element_number (TREE_TYPE (arg0), arg2); - - if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode) - op1 = convert_modes (mode1, GET_MODE (op1), op1, true); - - op0 = force_reg (tmode, op0); - op1 = force_reg (mode1, op1); - - /* OP0 is the source of these builtin functions and shouldn't be - modified. Create a copy, use it and return it as target. */ - target = gen_reg_rtx (tmode); - emit_move_insn (target, op0); - ix86_expand_vector_set (true, target, op1, elt); - - return target; -} - -/* Expand an expression EXP that calls a built-in function, - with result going to TARGET if that's convenient - (and in mode MODE if that's convenient). - SUBTARGET may be used as the target for computing one of EXP's operands. - IGNORE is nonzero if the value is to be ignored. */ - -rtx -ix86_expand_builtin (tree exp, rtx target, rtx subtarget, - machine_mode mode, int ignore) -{ - size_t i; - enum insn_code icode, icode2; - tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); - tree arg0, arg1, arg2, arg3, arg4; - rtx op0, op1, op2, op3, op4, pat, pat2, insn; - machine_mode mode0, mode1, mode2, mode3, mode4; - unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl); - - /* For CPU builtins that can be folded, fold first and expand the fold. */ - switch (fcode) - { - case IX86_BUILTIN_CPU_INIT: - { - /* Make it call __cpu_indicator_init in libgcc. */ - tree call_expr, fndecl, type; - type = build_function_type_list (integer_type_node, NULL_TREE); - fndecl = build_fn_decl ("__cpu_indicator_init", type); - call_expr = build_call_expr (fndecl, 0); - return expand_expr (call_expr, target, mode, EXPAND_NORMAL); - } - case IX86_BUILTIN_CPU_IS: - case IX86_BUILTIN_CPU_SUPPORTS: - { - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree fold_expr = fold_builtin_cpu (fndecl, &arg0); - gcc_assert (fold_expr != NULL_TREE); - return expand_expr (fold_expr, target, mode, EXPAND_NORMAL); - } - } - - HOST_WIDE_INT isa = ix86_isa_flags; - HOST_WIDE_INT isa2 = ix86_isa_flags2; - HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa; - HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2; - /* The general case is we require all the ISAs specified in bisa{,2} - to be enabled. 
- The exceptions are: - OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A - OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 - OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4 - where for each such pair it is sufficient if either of the ISAs is - enabled, plus if it is ored with other options also those others. - OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */ - if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) - == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) - && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0) - isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A); - if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) - == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) - && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0) - isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32); - if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) - == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) - && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0) - isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4); - if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE) - { - bisa &= ~OPTION_MASK_ISA_MMX; - bisa |= OPTION_MASK_ISA_SSE2; - } - if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2) - { - bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT; - if (TARGET_ABI_X32) - bisa |= OPTION_MASK_ABI_X32; - else - bisa |= OPTION_MASK_ABI_64; - char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL, - (enum fpmath_unit) 0, - (enum prefer_vector_width) 0, - false, add_abi_p); - if (!opts) - error ("%qE needs unknown isa option", fndecl); - else - { - gcc_assert (opts != NULL); - error ("%qE needs isa option %s", fndecl, opts); - free (opts); - } - return expand_call (exp, target, ignore); - } - - switch (fcode) - { - case IX86_BUILTIN_MASKMOVQ: - case IX86_BUILTIN_MASKMOVDQU: - icode = (fcode == IX86_BUILTIN_MASKMOVQ - ? CODE_FOR_mmx_maskmovq - : CODE_FOR_sse2_maskmovdqu); - /* Note the arg order is different from the operand order. */ - arg1 = CALL_EXPR_ARG (exp, 0); - arg2 = CALL_EXPR_ARG (exp, 1); - arg0 = CALL_EXPR_ARG (exp, 2); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - op2 = expand_normal (arg2); - mode0 = insn_data[icode].operand[0].mode; - mode1 = insn_data[icode].operand[1].mode; - mode2 = insn_data[icode].operand[2].mode; - - op0 = ix86_zero_extend_to_Pmode (op0); - op0 = gen_rtx_MEM (mode1, op0); - - if (!insn_data[icode].operand[0].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if (!insn_data[icode].operand[1].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - if (!insn_data[icode].operand[2].predicate (op2, mode2)) - op2 = copy_to_mode_reg (mode2, op2); - pat = GEN_FCN (icode) (op0, op1, op2); - if (! 
pat) - return 0; - emit_insn (pat); - return 0; - - case IX86_BUILTIN_LDMXCSR: - op0 = expand_normal (CALL_EXPR_ARG (exp, 0)); - target = assign_386_stack_local (SImode, SLOT_TEMP); - emit_move_insn (target, op0); - emit_insn (gen_sse_ldmxcsr (target)); - return 0; - - case IX86_BUILTIN_STMXCSR: - target = assign_386_stack_local (SImode, SLOT_TEMP); - emit_insn (gen_sse_stmxcsr (target)); - return copy_to_mode_reg (SImode, target); - - case IX86_BUILTIN_CLFLUSH: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - icode = CODE_FOR_sse2_clflush; - if (!insn_data[icode].operand[0].predicate (op0, Pmode)) - op0 = ix86_zero_extend_to_Pmode (op0); - - emit_insn (gen_sse2_clflush (op0)); - return 0; - - case IX86_BUILTIN_CLWB: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - icode = CODE_FOR_clwb; - if (!insn_data[icode].operand[0].predicate (op0, Pmode)) - op0 = ix86_zero_extend_to_Pmode (op0); - - emit_insn (gen_clwb (op0)); - return 0; - - case IX86_BUILTIN_CLFLUSHOPT: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - icode = CODE_FOR_clflushopt; - if (!insn_data[icode].operand[0].predicate (op0, Pmode)) - op0 = ix86_zero_extend_to_Pmode (op0); - - emit_insn (gen_clflushopt (op0)); - return 0; - - case IX86_BUILTIN_MONITOR: - case IX86_BUILTIN_MONITORX: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - arg2 = CALL_EXPR_ARG (exp, 2); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - op2 = expand_normal (arg2); - if (!REG_P (op0)) - op0 = ix86_zero_extend_to_Pmode (op0); - if (!REG_P (op1)) - op1 = copy_to_mode_reg (SImode, op1); - if (!REG_P (op2)) - op2 = copy_to_mode_reg (SImode, op2); - - emit_insn (fcode == IX86_BUILTIN_MONITOR - ? gen_sse3_monitor (Pmode, op0, op1, op2) - : gen_monitorx (Pmode, op0, op1, op2)); - return 0; - - case IX86_BUILTIN_MWAIT: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - if (!REG_P (op0)) - op0 = copy_to_mode_reg (SImode, op0); - if (!REG_P (op1)) - op1 = copy_to_mode_reg (SImode, op1); - emit_insn (gen_sse3_mwait (op0, op1)); - return 0; - - case IX86_BUILTIN_MWAITX: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - arg2 = CALL_EXPR_ARG (exp, 2); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - op2 = expand_normal (arg2); - if (!REG_P (op0)) - op0 = copy_to_mode_reg (SImode, op0); - if (!REG_P (op1)) - op1 = copy_to_mode_reg (SImode, op1); - if (!REG_P (op2)) - op2 = copy_to_mode_reg (SImode, op2); - emit_insn (gen_mwaitx (op0, op1, op2)); - return 0; - - case IX86_BUILTIN_UMONITOR: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - - op0 = ix86_zero_extend_to_Pmode (op0); - emit_insn (gen_umonitor (Pmode, op0)); - return 0; - - case IX86_BUILTIN_UMWAIT: - case IX86_BUILTIN_TPAUSE: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - - if (!REG_P (op0)) - op0 = copy_to_mode_reg (SImode, op0); - - op1 = force_reg (DImode, op1); - - if (TARGET_64BIT) - { - op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32), - NULL, 1, OPTAB_DIRECT); - switch (fcode) - { - case IX86_BUILTIN_UMWAIT: - icode = CODE_FOR_umwait_rex64; - break; - case IX86_BUILTIN_TPAUSE: - icode = CODE_FOR_tpause_rex64; - break; - default: - gcc_unreachable (); - } - - op2 = gen_lowpart (SImode, op2); - op1 = gen_lowpart (SImode, op1); - pat = GEN_FCN (icode) (op0, op1, op2); - } - else - { - switch (fcode) - { - case 
IX86_BUILTIN_UMWAIT: - icode = CODE_FOR_umwait; - break; - case IX86_BUILTIN_TPAUSE: - icode = CODE_FOR_tpause; - break; - default: - gcc_unreachable (); - } - pat = GEN_FCN (icode) (op0, op1); - } - - if (!pat) - return 0; - - emit_insn (pat); - - if (target == 0 - || !register_operand (target, QImode)) - target = gen_reg_rtx (QImode); - - pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG), - const0_rtx); - emit_insn (gen_rtx_SET (target, pat)); - - return target; - - case IX86_BUILTIN_CLZERO: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - if (!REG_P (op0)) - op0 = ix86_zero_extend_to_Pmode (op0); - emit_insn (gen_clzero (Pmode, op0)); - return 0; - - case IX86_BUILTIN_CLDEMOTE: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - icode = CODE_FOR_cldemote; - if (!insn_data[icode].operand[0].predicate (op0, Pmode)) - op0 = ix86_zero_extend_to_Pmode (op0); - - emit_insn (gen_cldemote (op0)); - return 0; - - case IX86_BUILTIN_VEC_INIT_V2SI: - case IX86_BUILTIN_VEC_INIT_V4HI: - case IX86_BUILTIN_VEC_INIT_V8QI: - return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target); - - case IX86_BUILTIN_VEC_EXT_V2DF: - case IX86_BUILTIN_VEC_EXT_V2DI: - case IX86_BUILTIN_VEC_EXT_V4SF: - case IX86_BUILTIN_VEC_EXT_V4SI: - case IX86_BUILTIN_VEC_EXT_V8HI: - case IX86_BUILTIN_VEC_EXT_V2SI: - case IX86_BUILTIN_VEC_EXT_V4HI: - case IX86_BUILTIN_VEC_EXT_V16QI: - return ix86_expand_vec_ext_builtin (exp, target); - - case IX86_BUILTIN_VEC_SET_V2DI: - case IX86_BUILTIN_VEC_SET_V4SF: - case IX86_BUILTIN_VEC_SET_V4SI: - case IX86_BUILTIN_VEC_SET_V8HI: - case IX86_BUILTIN_VEC_SET_V4HI: - case IX86_BUILTIN_VEC_SET_V16QI: - return ix86_expand_vec_set_builtin (exp); - - case IX86_BUILTIN_NANQ: - case IX86_BUILTIN_NANSQ: - return expand_call (exp, target, ignore); - - case IX86_BUILTIN_RDPID: - - op0 = gen_reg_rtx (word_mode); - - if (TARGET_64BIT) - { - insn = gen_rdpid_rex64 (op0); - op0 = convert_to_mode (SImode, op0, 1); - } - else - insn = gen_rdpid (op0); - - emit_insn (insn); - - if (target == 0 - || !register_operand (target, SImode)) - target = gen_reg_rtx (SImode); - - emit_move_insn (target, op0); - return target; - - case IX86_BUILTIN_2INTERSECTD512: - case IX86_BUILTIN_2INTERSECTQ512: - case IX86_BUILTIN_2INTERSECTD256: - case IX86_BUILTIN_2INTERSECTQ256: - case IX86_BUILTIN_2INTERSECTD128: - case IX86_BUILTIN_2INTERSECTQ128: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - arg2 = CALL_EXPR_ARG (exp, 2); - arg3 = CALL_EXPR_ARG (exp, 3); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - op2 = expand_normal (arg2); - op3 = expand_normal (arg3); - - if (!address_operand (op0, VOIDmode)) - { - op0 = convert_memory_address (Pmode, op0); - op0 = copy_addr_to_reg (op0); - } - if (!address_operand (op1, VOIDmode)) - { - op1 = convert_memory_address (Pmode, op1); - op1 = copy_addr_to_reg (op1); - } - - switch (fcode) - { - case IX86_BUILTIN_2INTERSECTD512: - mode4 = P2HImode; - icode = CODE_FOR_avx512vp2intersect_2intersectv16si; - break; - case IX86_BUILTIN_2INTERSECTQ512: - mode4 = P2QImode; - icode = CODE_FOR_avx512vp2intersect_2intersectv8di; - break; - case IX86_BUILTIN_2INTERSECTD256: - mode4 = P2QImode; - icode = CODE_FOR_avx512vp2intersect_2intersectv8si; - break; - case IX86_BUILTIN_2INTERSECTQ256: - mode4 = P2QImode; - icode = CODE_FOR_avx512vp2intersect_2intersectv4di; - break; - case IX86_BUILTIN_2INTERSECTD128: - mode4 = P2QImode; - icode = CODE_FOR_avx512vp2intersect_2intersectv4si; - break; - case IX86_BUILTIN_2INTERSECTQ128: 
- mode4 = P2QImode; - icode = CODE_FOR_avx512vp2intersect_2intersectv2di; - break; - default: - gcc_unreachable (); - } - - mode2 = insn_data[icode].operand[1].mode; - mode3 = insn_data[icode].operand[2].mode; - if (!insn_data[icode].operand[1].predicate (op2, mode2)) - op2 = copy_to_mode_reg (mode2, op2); - if (!insn_data[icode].operand[2].predicate (op3, mode3)) - op3 = copy_to_mode_reg (mode3, op3); - - op4 = gen_reg_rtx (mode4); - emit_insn (GEN_FCN (icode) (op4, op2, op3)); - mode0 = mode4 == P2HImode ? HImode : QImode; - emit_move_insn (gen_rtx_MEM (mode0, op0), - gen_lowpart (mode0, op4)); - emit_move_insn (gen_rtx_MEM (mode0, op1), - gen_highpart (mode0, op4)); - - return 0; - - case IX86_BUILTIN_RDPMC: - case IX86_BUILTIN_RDTSC: - case IX86_BUILTIN_RDTSCP: - case IX86_BUILTIN_XGETBV: - - op0 = gen_reg_rtx (DImode); - op1 = gen_reg_rtx (DImode); - - if (fcode == IX86_BUILTIN_RDPMC) - { - arg0 = CALL_EXPR_ARG (exp, 0); - op2 = expand_normal (arg0); - if (!register_operand (op2, SImode)) - op2 = copy_to_mode_reg (SImode, op2); - - insn = (TARGET_64BIT - ? gen_rdpmc_rex64 (op0, op1, op2) - : gen_rdpmc (op0, op2)); - emit_insn (insn); - } - else if (fcode == IX86_BUILTIN_XGETBV) - { - arg0 = CALL_EXPR_ARG (exp, 0); - op2 = expand_normal (arg0); - if (!register_operand (op2, SImode)) - op2 = copy_to_mode_reg (SImode, op2); - - insn = (TARGET_64BIT - ? gen_xgetbv_rex64 (op0, op1, op2) - : gen_xgetbv (op0, op2)); - emit_insn (insn); - } - else if (fcode == IX86_BUILTIN_RDTSC) - { - insn = (TARGET_64BIT - ? gen_rdtsc_rex64 (op0, op1) - : gen_rdtsc (op0)); - emit_insn (insn); - } - else - { - op2 = gen_reg_rtx (SImode); - - insn = (TARGET_64BIT - ? gen_rdtscp_rex64 (op0, op1, op2) - : gen_rdtscp (op0, op2)); - emit_insn (insn); - - arg0 = CALL_EXPR_ARG (exp, 0); - op4 = expand_normal (arg0); - if (!address_operand (op4, VOIDmode)) - { - op4 = convert_memory_address (Pmode, op4); - op4 = copy_addr_to_reg (op4); - } - emit_move_insn (gen_rtx_MEM (SImode, op4), op2); - } - - if (target == 0 - || !register_operand (target, DImode)) - target = gen_reg_rtx (DImode); - - if (TARGET_64BIT) - { - op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32), - op1, 1, OPTAB_DIRECT); - op0 = expand_simple_binop (DImode, IOR, op0, op1, - op0, 1, OPTAB_DIRECT); - } - - emit_move_insn (target, op0); - return target; - - case IX86_BUILTIN_ENQCMD: - case IX86_BUILTIN_ENQCMDS: - case IX86_BUILTIN_MOVDIR64B: - - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - - op0 = ix86_zero_extend_to_Pmode (op0); - if (!address_operand (op1, VOIDmode)) - { - op1 = convert_memory_address (Pmode, op1); - op1 = copy_addr_to_reg (op1); - } - op1 = gen_rtx_MEM (XImode, op1); - - if (fcode == IX86_BUILTIN_MOVDIR64B) - { - emit_insn (gen_movdir64b (Pmode, op0, op1)); - return 0; - } - else - { - rtx pat; - - target = gen_reg_rtx (SImode); - emit_move_insn (target, const0_rtx); - target = gen_rtx_SUBREG (QImode, target, 0); - - if (fcode == IX86_BUILTIN_ENQCMD) - pat = gen_enqcmd (UNSPECV_ENQCMD, Pmode, op0, op1); - else - pat = gen_enqcmd (UNSPECV_ENQCMDS, Pmode, op0, op1); - - emit_insn (pat); - - emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), - gen_rtx_fmt_ee (EQ, QImode, - SET_DEST (pat), - const0_rtx))); - - return SUBREG_REG (target); - } - - case IX86_BUILTIN_FXSAVE: - case IX86_BUILTIN_FXRSTOR: - case IX86_BUILTIN_FXSAVE64: - case IX86_BUILTIN_FXRSTOR64: - case IX86_BUILTIN_FNSTENV: - case IX86_BUILTIN_FLDENV: - mode0 = 
BLKmode; - switch (fcode) - { - case IX86_BUILTIN_FXSAVE: - icode = CODE_FOR_fxsave; - break; - case IX86_BUILTIN_FXRSTOR: - icode = CODE_FOR_fxrstor; - break; - case IX86_BUILTIN_FXSAVE64: - icode = CODE_FOR_fxsave64; - break; - case IX86_BUILTIN_FXRSTOR64: - icode = CODE_FOR_fxrstor64; - break; - case IX86_BUILTIN_FNSTENV: - icode = CODE_FOR_fnstenv; - break; - case IX86_BUILTIN_FLDENV: - icode = CODE_FOR_fldenv; - break; - default: - gcc_unreachable (); - } - - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - - if (!address_operand (op0, VOIDmode)) - { - op0 = convert_memory_address (Pmode, op0); - op0 = copy_addr_to_reg (op0); - } - op0 = gen_rtx_MEM (mode0, op0); - - pat = GEN_FCN (icode) (op0); - if (pat) - emit_insn (pat); - return 0; - - case IX86_BUILTIN_XSETBV: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - - if (!REG_P (op0)) - op0 = copy_to_mode_reg (SImode, op0); - - op1 = force_reg (DImode, op1); - - if (TARGET_64BIT) - { - op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32), - NULL, 1, OPTAB_DIRECT); - - icode = CODE_FOR_xsetbv_rex64; - - op2 = gen_lowpart (SImode, op2); - op1 = gen_lowpart (SImode, op1); - pat = GEN_FCN (icode) (op0, op1, op2); - } - else - { - icode = CODE_FOR_xsetbv; - - pat = GEN_FCN (icode) (op0, op1); - } - if (pat) - emit_insn (pat); - return 0; - - case IX86_BUILTIN_XSAVE: - case IX86_BUILTIN_XRSTOR: - case IX86_BUILTIN_XSAVE64: - case IX86_BUILTIN_XRSTOR64: - case IX86_BUILTIN_XSAVEOPT: - case IX86_BUILTIN_XSAVEOPT64: - case IX86_BUILTIN_XSAVES: - case IX86_BUILTIN_XRSTORS: - case IX86_BUILTIN_XSAVES64: - case IX86_BUILTIN_XRSTORS64: - case IX86_BUILTIN_XSAVEC: - case IX86_BUILTIN_XSAVEC64: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - - if (!address_operand (op0, VOIDmode)) - { - op0 = convert_memory_address (Pmode, op0); - op0 = copy_addr_to_reg (op0); - } - op0 = gen_rtx_MEM (BLKmode, op0); - - op1 = force_reg (DImode, op1); - - if (TARGET_64BIT) - { - op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32), - NULL, 1, OPTAB_DIRECT); - switch (fcode) - { - case IX86_BUILTIN_XSAVE: - icode = CODE_FOR_xsave_rex64; - break; - case IX86_BUILTIN_XRSTOR: - icode = CODE_FOR_xrstor_rex64; - break; - case IX86_BUILTIN_XSAVE64: - icode = CODE_FOR_xsave64; - break; - case IX86_BUILTIN_XRSTOR64: - icode = CODE_FOR_xrstor64; - break; - case IX86_BUILTIN_XSAVEOPT: - icode = CODE_FOR_xsaveopt_rex64; - break; - case IX86_BUILTIN_XSAVEOPT64: - icode = CODE_FOR_xsaveopt64; - break; - case IX86_BUILTIN_XSAVES: - icode = CODE_FOR_xsaves_rex64; - break; - case IX86_BUILTIN_XRSTORS: - icode = CODE_FOR_xrstors_rex64; - break; - case IX86_BUILTIN_XSAVES64: - icode = CODE_FOR_xsaves64; - break; - case IX86_BUILTIN_XRSTORS64: - icode = CODE_FOR_xrstors64; - break; - case IX86_BUILTIN_XSAVEC: - icode = CODE_FOR_xsavec_rex64; - break; - case IX86_BUILTIN_XSAVEC64: - icode = CODE_FOR_xsavec64; - break; - default: - gcc_unreachable (); - } - - op2 = gen_lowpart (SImode, op2); - op1 = gen_lowpart (SImode, op1); - pat = GEN_FCN (icode) (op0, op1, op2); - } - else - { - switch (fcode) - { - case IX86_BUILTIN_XSAVE: - icode = CODE_FOR_xsave; - break; - case IX86_BUILTIN_XRSTOR: - icode = CODE_FOR_xrstor; - break; - case IX86_BUILTIN_XSAVEOPT: - icode = CODE_FOR_xsaveopt; - break; - case IX86_BUILTIN_XSAVES: - icode = CODE_FOR_xsaves; - break; - case IX86_BUILTIN_XRSTORS: - icode = 
CODE_FOR_xrstors; - break; - case IX86_BUILTIN_XSAVEC: - icode = CODE_FOR_xsavec; - break; - default: - gcc_unreachable (); - } - pat = GEN_FCN (icode) (op0, op1); - } - - if (pat) - emit_insn (pat); - return 0; - - case IX86_BUILTIN_LLWPCB: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - icode = CODE_FOR_lwp_llwpcb; - if (!insn_data[icode].operand[0].predicate (op0, Pmode)) - op0 = ix86_zero_extend_to_Pmode (op0); - emit_insn (gen_lwp_llwpcb (op0)); - return 0; - - case IX86_BUILTIN_SLWPCB: - icode = CODE_FOR_lwp_slwpcb; - if (!target - || !insn_data[icode].operand[0].predicate (target, Pmode)) - target = gen_reg_rtx (Pmode); - emit_insn (gen_lwp_slwpcb (target)); - return target; - - case IX86_BUILTIN_BEXTRI32: - case IX86_BUILTIN_BEXTRI64: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - icode = (fcode == IX86_BUILTIN_BEXTRI32 - ? CODE_FOR_tbm_bextri_si - : CODE_FOR_tbm_bextri_di); - if (!CONST_INT_P (op1)) - { - error ("last argument must be an immediate"); - return const0_rtx; - } - else - { - unsigned char length = (INTVAL (op1) >> 8) & 0xFF; - unsigned char lsb_index = INTVAL (op1) & 0xFF; - op1 = GEN_INT (length); - op2 = GEN_INT (lsb_index); - - mode1 = insn_data[icode].operand[1].mode; - if (!insn_data[icode].operand[1].predicate (op0, mode1)) - op0 = copy_to_mode_reg (mode1, op0); - - mode0 = insn_data[icode].operand[0].mode; - if (target == 0 - || !register_operand (target, mode0)) - target = gen_reg_rtx (mode0); - - pat = GEN_FCN (icode) (target, op0, op1, op2); - if (pat) - emit_insn (pat); - return target; - } - - case IX86_BUILTIN_RDRAND16_STEP: - icode = CODE_FOR_rdrandhi_1; - mode0 = HImode; - goto rdrand_step; - - case IX86_BUILTIN_RDRAND32_STEP: - icode = CODE_FOR_rdrandsi_1; - mode0 = SImode; - goto rdrand_step; - - case IX86_BUILTIN_RDRAND64_STEP: - icode = CODE_FOR_rdranddi_1; - mode0 = DImode; - -rdrand_step: - arg0 = CALL_EXPR_ARG (exp, 0); - op1 = expand_normal (arg0); - if (!address_operand (op1, VOIDmode)) - { - op1 = convert_memory_address (Pmode, op1); - op1 = copy_addr_to_reg (op1); - } - - op0 = gen_reg_rtx (mode0); - emit_insn (GEN_FCN (icode) (op0)); - - emit_move_insn (gen_rtx_MEM (mode0, op1), op0); - - op1 = gen_reg_rtx (SImode); - emit_move_insn (op1, CONST1_RTX (SImode)); - - /* Emit SImode conditional move. 
*/ - if (mode0 == HImode) - { - if (TARGET_ZERO_EXTEND_WITH_AND - && optimize_function_for_speed_p (cfun)) - { - op2 = force_reg (SImode, const0_rtx); - - emit_insn (gen_movstricthi - (gen_lowpart (HImode, op2), op0)); - } - else - { - op2 = gen_reg_rtx (SImode); - - emit_insn (gen_zero_extendhisi2 (op2, op0)); - } - } - else if (mode0 == SImode) - op2 = op0; - else - op2 = gen_rtx_SUBREG (SImode, op0, 0); - - if (target == 0 - || !register_operand (target, SImode)) - target = gen_reg_rtx (SImode); - - pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG), - const0_rtx); - emit_insn (gen_rtx_SET (target, - gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1))); - return target; - - case IX86_BUILTIN_RDSEED16_STEP: - icode = CODE_FOR_rdseedhi_1; - mode0 = HImode; - goto rdseed_step; - - case IX86_BUILTIN_RDSEED32_STEP: - icode = CODE_FOR_rdseedsi_1; - mode0 = SImode; - goto rdseed_step; - - case IX86_BUILTIN_RDSEED64_STEP: - icode = CODE_FOR_rdseeddi_1; - mode0 = DImode; - -rdseed_step: - arg0 = CALL_EXPR_ARG (exp, 0); - op1 = expand_normal (arg0); - if (!address_operand (op1, VOIDmode)) - { - op1 = convert_memory_address (Pmode, op1); - op1 = copy_addr_to_reg (op1); - } - - op0 = gen_reg_rtx (mode0); - emit_insn (GEN_FCN (icode) (op0)); - - emit_move_insn (gen_rtx_MEM (mode0, op1), op0); - - op2 = gen_reg_rtx (QImode); - - pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG), - const0_rtx); - emit_insn (gen_rtx_SET (op2, pat)); - - if (target == 0 - || !register_operand (target, SImode)) - target = gen_reg_rtx (SImode); - - emit_insn (gen_zero_extendqisi2 (target, op2)); - return target; - - case IX86_BUILTIN_SBB32: - icode = CODE_FOR_subborrowsi; - icode2 = CODE_FOR_subborrowsi_0; - mode0 = SImode; - mode1 = DImode; - mode2 = CCmode; - goto handlecarry; - - case IX86_BUILTIN_SBB64: - icode = CODE_FOR_subborrowdi; - icode2 = CODE_FOR_subborrowdi_0; - mode0 = DImode; - mode1 = TImode; - mode2 = CCmode; - goto handlecarry; - - case IX86_BUILTIN_ADDCARRYX32: - icode = CODE_FOR_addcarrysi; - icode2 = CODE_FOR_addcarrysi_0; - mode0 = SImode; - mode1 = DImode; - mode2 = CCCmode; - goto handlecarry; - - case IX86_BUILTIN_ADDCARRYX64: - icode = CODE_FOR_addcarrydi; - icode2 = CODE_FOR_addcarrydi_0; - mode0 = DImode; - mode1 = TImode; - mode2 = CCCmode; - - handlecarry: - arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */ - arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */ - arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */ - arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */ - - op1 = expand_normal (arg0); - if (!integer_zerop (arg0)) - op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1)); - - op2 = expand_normal (arg1); - if (!register_operand (op2, mode0)) - op2 = copy_to_mode_reg (mode0, op2); - - op3 = expand_normal (arg2); - if (!register_operand (op3, mode0)) - op3 = copy_to_mode_reg (mode0, op3); - - op4 = expand_normal (arg3); - if (!address_operand (op4, VOIDmode)) - { - op4 = convert_memory_address (Pmode, op4); - op4 = copy_addr_to_reg (op4); - } - - op0 = gen_reg_rtx (mode0); - if (integer_zerop (arg0)) - { - /* If arg0 is 0, optimize right away into add or sub - instruction that sets CCCmode flags. */ - op1 = gen_rtx_REG (mode2, FLAGS_REG); - emit_insn (GEN_FCN (icode2) (op0, op2, op3)); - } - else - { - /* Generate CF from input operand. */ - emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx)); - - /* Generate instruction that consumes CF. 
*/ - op1 = gen_rtx_REG (CCCmode, FLAGS_REG); - pat = gen_rtx_LTU (mode1, op1, const0_rtx); - pat2 = gen_rtx_LTU (mode0, op1, const0_rtx); - emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2)); - } - - /* Return current CF value. */ - if (target == 0) - target = gen_reg_rtx (QImode); - - pat = gen_rtx_LTU (QImode, op1, const0_rtx); - emit_insn (gen_rtx_SET (target, pat)); - - /* Store the result. */ - emit_move_insn (gen_rtx_MEM (mode0, op4), op0); - - return target; - - case IX86_BUILTIN_READ_FLAGS: - emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG))); - - if (optimize - || target == NULL_RTX - || !nonimmediate_operand (target, word_mode) - || GET_MODE (target) != word_mode) - target = gen_reg_rtx (word_mode); - - emit_insn (gen_pop (target)); - return target; - - case IX86_BUILTIN_WRITE_FLAGS: - - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - if (!general_no_elim_operand (op0, word_mode)) - op0 = copy_to_mode_reg (word_mode, op0); - - emit_insn (gen_push (op0)); - emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG))); - return 0; - - case IX86_BUILTIN_KTESTC8: - icode = CODE_FOR_ktestqi; - mode3 = CCCmode; - goto kortest; - - case IX86_BUILTIN_KTESTZ8: - icode = CODE_FOR_ktestqi; - mode3 = CCZmode; - goto kortest; - - case IX86_BUILTIN_KTESTC16: - icode = CODE_FOR_ktesthi; - mode3 = CCCmode; - goto kortest; - - case IX86_BUILTIN_KTESTZ16: - icode = CODE_FOR_ktesthi; - mode3 = CCZmode; - goto kortest; - - case IX86_BUILTIN_KTESTC32: - icode = CODE_FOR_ktestsi; - mode3 = CCCmode; - goto kortest; - - case IX86_BUILTIN_KTESTZ32: - icode = CODE_FOR_ktestsi; - mode3 = CCZmode; - goto kortest; - - case IX86_BUILTIN_KTESTC64: - icode = CODE_FOR_ktestdi; - mode3 = CCCmode; - goto kortest; - - case IX86_BUILTIN_KTESTZ64: - icode = CODE_FOR_ktestdi; - mode3 = CCZmode; - goto kortest; - - case IX86_BUILTIN_KORTESTC8: - icode = CODE_FOR_kortestqi; - mode3 = CCCmode; - goto kortest; - - case IX86_BUILTIN_KORTESTZ8: - icode = CODE_FOR_kortestqi; - mode3 = CCZmode; - goto kortest; - - case IX86_BUILTIN_KORTESTC16: - icode = CODE_FOR_kortesthi; - mode3 = CCCmode; - goto kortest; - - case IX86_BUILTIN_KORTESTZ16: - icode = CODE_FOR_kortesthi; - mode3 = CCZmode; - goto kortest; - - case IX86_BUILTIN_KORTESTC32: - icode = CODE_FOR_kortestsi; - mode3 = CCCmode; - goto kortest; - - case IX86_BUILTIN_KORTESTZ32: - icode = CODE_FOR_kortestsi; - mode3 = CCZmode; - goto kortest; - - case IX86_BUILTIN_KORTESTC64: - icode = CODE_FOR_kortestdi; - mode3 = CCCmode; - goto kortest; - - case IX86_BUILTIN_KORTESTZ64: - icode = CODE_FOR_kortestdi; - mode3 = CCZmode; - - kortest: - arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */ - arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */ - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - - mode0 = insn_data[icode].operand[0].mode; - mode1 = insn_data[icode].operand[1].mode; - - if (GET_MODE (op0) != VOIDmode) - op0 = force_reg (GET_MODE (op0), op0); - - op0 = gen_lowpart (mode0, op0); - - if (!insn_data[icode].operand[0].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - - if (GET_MODE (op1) != VOIDmode) - op1 = force_reg (GET_MODE (op1), op1); - - op1 = gen_lowpart (mode1, op1); - - if (!insn_data[icode].operand[1].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - - target = gen_reg_rtx (QImode); - - /* Emit kortest. */ - emit_insn (GEN_FCN (icode) (op0, op1)); - /* And use setcc to return result from flags. 
*/ - ix86_expand_setcc (target, EQ, - gen_rtx_REG (mode3, FLAGS_REG), const0_rtx); - return target; - - case IX86_BUILTIN_GATHERSIV2DF: - icode = CODE_FOR_avx2_gathersiv2df; - goto gather_gen; - case IX86_BUILTIN_GATHERSIV4DF: - icode = CODE_FOR_avx2_gathersiv4df; - goto gather_gen; - case IX86_BUILTIN_GATHERDIV2DF: - icode = CODE_FOR_avx2_gatherdiv2df; - goto gather_gen; - case IX86_BUILTIN_GATHERDIV4DF: - icode = CODE_FOR_avx2_gatherdiv4df; - goto gather_gen; - case IX86_BUILTIN_GATHERSIV4SF: - icode = CODE_FOR_avx2_gathersiv4sf; - goto gather_gen; - case IX86_BUILTIN_GATHERSIV8SF: - icode = CODE_FOR_avx2_gathersiv8sf; - goto gather_gen; - case IX86_BUILTIN_GATHERDIV4SF: - icode = CODE_FOR_avx2_gatherdiv4sf; - goto gather_gen; - case IX86_BUILTIN_GATHERDIV8SF: - icode = CODE_FOR_avx2_gatherdiv8sf; - goto gather_gen; - case IX86_BUILTIN_GATHERSIV2DI: - icode = CODE_FOR_avx2_gathersiv2di; - goto gather_gen; - case IX86_BUILTIN_GATHERSIV4DI: - icode = CODE_FOR_avx2_gathersiv4di; - goto gather_gen; - case IX86_BUILTIN_GATHERDIV2DI: - icode = CODE_FOR_avx2_gatherdiv2di; - goto gather_gen; - case IX86_BUILTIN_GATHERDIV4DI: - icode = CODE_FOR_avx2_gatherdiv4di; - goto gather_gen; - case IX86_BUILTIN_GATHERSIV4SI: - icode = CODE_FOR_avx2_gathersiv4si; - goto gather_gen; - case IX86_BUILTIN_GATHERSIV8SI: - icode = CODE_FOR_avx2_gathersiv8si; - goto gather_gen; - case IX86_BUILTIN_GATHERDIV4SI: - icode = CODE_FOR_avx2_gatherdiv4si; - goto gather_gen; - case IX86_BUILTIN_GATHERDIV8SI: - icode = CODE_FOR_avx2_gatherdiv8si; - goto gather_gen; - case IX86_BUILTIN_GATHERALTSIV4DF: - icode = CODE_FOR_avx2_gathersiv4df; - goto gather_gen; - case IX86_BUILTIN_GATHERALTDIV8SF: - icode = CODE_FOR_avx2_gatherdiv8sf; - goto gather_gen; - case IX86_BUILTIN_GATHERALTSIV4DI: - icode = CODE_FOR_avx2_gathersiv4di; - goto gather_gen; - case IX86_BUILTIN_GATHERALTDIV8SI: - icode = CODE_FOR_avx2_gatherdiv8si; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV16SF: - icode = CODE_FOR_avx512f_gathersiv16sf; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV8DF: - icode = CODE_FOR_avx512f_gathersiv8df; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV16SF: - icode = CODE_FOR_avx512f_gatherdiv16sf; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV8DF: - icode = CODE_FOR_avx512f_gatherdiv8df; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV16SI: - icode = CODE_FOR_avx512f_gathersiv16si; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV8DI: - icode = CODE_FOR_avx512f_gathersiv8di; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV16SI: - icode = CODE_FOR_avx512f_gatherdiv16si; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV8DI: - icode = CODE_FOR_avx512f_gatherdiv8di; - goto gather_gen; - case IX86_BUILTIN_GATHER3ALTSIV8DF: - icode = CODE_FOR_avx512f_gathersiv8df; - goto gather_gen; - case IX86_BUILTIN_GATHER3ALTDIV16SF: - icode = CODE_FOR_avx512f_gatherdiv16sf; - goto gather_gen; - case IX86_BUILTIN_GATHER3ALTSIV8DI: - icode = CODE_FOR_avx512f_gathersiv8di; - goto gather_gen; - case IX86_BUILTIN_GATHER3ALTDIV16SI: - icode = CODE_FOR_avx512f_gatherdiv16si; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV2DF: - icode = CODE_FOR_avx512vl_gathersiv2df; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV4DF: - icode = CODE_FOR_avx512vl_gathersiv4df; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV2DF: - icode = CODE_FOR_avx512vl_gatherdiv2df; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV4DF: - icode = CODE_FOR_avx512vl_gatherdiv4df; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV4SF: - icode = 
CODE_FOR_avx512vl_gathersiv4sf; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV8SF: - icode = CODE_FOR_avx512vl_gathersiv8sf; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV4SF: - icode = CODE_FOR_avx512vl_gatherdiv4sf; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV8SF: - icode = CODE_FOR_avx512vl_gatherdiv8sf; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV2DI: - icode = CODE_FOR_avx512vl_gathersiv2di; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV4DI: - icode = CODE_FOR_avx512vl_gathersiv4di; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV2DI: - icode = CODE_FOR_avx512vl_gatherdiv2di; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV4DI: - icode = CODE_FOR_avx512vl_gatherdiv4di; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV4SI: - icode = CODE_FOR_avx512vl_gathersiv4si; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV8SI: - icode = CODE_FOR_avx512vl_gathersiv8si; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV4SI: - icode = CODE_FOR_avx512vl_gatherdiv4si; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV8SI: - icode = CODE_FOR_avx512vl_gatherdiv8si; - goto gather_gen; - case IX86_BUILTIN_GATHER3ALTSIV4DF: - icode = CODE_FOR_avx512vl_gathersiv4df; - goto gather_gen; - case IX86_BUILTIN_GATHER3ALTDIV8SF: - icode = CODE_FOR_avx512vl_gatherdiv8sf; - goto gather_gen; - case IX86_BUILTIN_GATHER3ALTSIV4DI: - icode = CODE_FOR_avx512vl_gathersiv4di; - goto gather_gen; - case IX86_BUILTIN_GATHER3ALTDIV8SI: - icode = CODE_FOR_avx512vl_gatherdiv8si; - goto gather_gen; - case IX86_BUILTIN_SCATTERSIV16SF: - icode = CODE_FOR_avx512f_scattersiv16sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV8DF: - icode = CODE_FOR_avx512f_scattersiv8df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV16SF: - icode = CODE_FOR_avx512f_scatterdiv16sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV8DF: - icode = CODE_FOR_avx512f_scatterdiv8df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV16SI: - icode = CODE_FOR_avx512f_scattersiv16si; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV8DI: - icode = CODE_FOR_avx512f_scattersiv8di; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV16SI: - icode = CODE_FOR_avx512f_scatterdiv16si; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV8DI: - icode = CODE_FOR_avx512f_scatterdiv8di; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV8SF: - icode = CODE_FOR_avx512vl_scattersiv8sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV4SF: - icode = CODE_FOR_avx512vl_scattersiv4sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV4DF: - icode = CODE_FOR_avx512vl_scattersiv4df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV2DF: - icode = CODE_FOR_avx512vl_scattersiv2df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV8SF: - icode = CODE_FOR_avx512vl_scatterdiv8sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV4SF: - icode = CODE_FOR_avx512vl_scatterdiv4sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV4DF: - icode = CODE_FOR_avx512vl_scatterdiv4df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV2DF: - icode = CODE_FOR_avx512vl_scatterdiv2df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV8SI: - icode = CODE_FOR_avx512vl_scattersiv8si; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV4SI: - icode = CODE_FOR_avx512vl_scattersiv4si; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV4DI: - icode = CODE_FOR_avx512vl_scattersiv4di; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV2DI: - icode = CODE_FOR_avx512vl_scattersiv2di; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV8SI: - icode = CODE_FOR_avx512vl_scatterdiv8si; - goto 
scatter_gen; - case IX86_BUILTIN_SCATTERDIV4SI: - icode = CODE_FOR_avx512vl_scatterdiv4si; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV4DI: - icode = CODE_FOR_avx512vl_scatterdiv4di; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV2DI: - icode = CODE_FOR_avx512vl_scatterdiv2di; - goto scatter_gen; - case IX86_BUILTIN_GATHERPFDPD: - icode = CODE_FOR_avx512pf_gatherpfv8sidf; - goto vec_prefetch_gen; - case IX86_BUILTIN_SCATTERALTSIV8DF: - icode = CODE_FOR_avx512f_scattersiv8df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTDIV16SF: - icode = CODE_FOR_avx512f_scatterdiv16sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTSIV8DI: - icode = CODE_FOR_avx512f_scattersiv8di; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTDIV16SI: - icode = CODE_FOR_avx512f_scatterdiv16si; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTSIV4DF: - icode = CODE_FOR_avx512vl_scattersiv4df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTDIV8SF: - icode = CODE_FOR_avx512vl_scatterdiv8sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTSIV4DI: - icode = CODE_FOR_avx512vl_scattersiv4di; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTDIV8SI: - icode = CODE_FOR_avx512vl_scatterdiv8si; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTSIV2DF: - icode = CODE_FOR_avx512vl_scattersiv2df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTDIV4SF: - icode = CODE_FOR_avx512vl_scatterdiv4sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTSIV2DI: - icode = CODE_FOR_avx512vl_scattersiv2di; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTDIV4SI: - icode = CODE_FOR_avx512vl_scatterdiv4si; - goto scatter_gen; - case IX86_BUILTIN_GATHERPFDPS: - icode = CODE_FOR_avx512pf_gatherpfv16sisf; - goto vec_prefetch_gen; - case IX86_BUILTIN_GATHERPFQPD: - icode = CODE_FOR_avx512pf_gatherpfv8didf; - goto vec_prefetch_gen; - case IX86_BUILTIN_GATHERPFQPS: - icode = CODE_FOR_avx512pf_gatherpfv8disf; - goto vec_prefetch_gen; - case IX86_BUILTIN_SCATTERPFDPD: - icode = CODE_FOR_avx512pf_scatterpfv8sidf; - goto vec_prefetch_gen; - case IX86_BUILTIN_SCATTERPFDPS: - icode = CODE_FOR_avx512pf_scatterpfv16sisf; - goto vec_prefetch_gen; - case IX86_BUILTIN_SCATTERPFQPD: - icode = CODE_FOR_avx512pf_scatterpfv8didf; - goto vec_prefetch_gen; - case IX86_BUILTIN_SCATTERPFQPS: - icode = CODE_FOR_avx512pf_scatterpfv8disf; - goto vec_prefetch_gen; - - gather_gen: - rtx half; - rtx (*gen) (rtx, rtx); - - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - arg2 = CALL_EXPR_ARG (exp, 2); - arg3 = CALL_EXPR_ARG (exp, 3); - arg4 = CALL_EXPR_ARG (exp, 4); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - op2 = expand_normal (arg2); - op3 = expand_normal (arg3); - op4 = expand_normal (arg4); - /* Note the arg order is different from the operand order. 
*/ - mode0 = insn_data[icode].operand[1].mode; - mode2 = insn_data[icode].operand[3].mode; - mode3 = insn_data[icode].operand[4].mode; - mode4 = insn_data[icode].operand[5].mode; - - if (target == NULL_RTX - || GET_MODE (target) != insn_data[icode].operand[0].mode - || !insn_data[icode].operand[0].predicate (target, - GET_MODE (target))) - subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode); - else - subtarget = target; - - switch (fcode) - { - case IX86_BUILTIN_GATHER3ALTSIV8DF: - case IX86_BUILTIN_GATHER3ALTSIV8DI: - half = gen_reg_rtx (V8SImode); - if (!nonimmediate_operand (op2, V16SImode)) - op2 = copy_to_mode_reg (V16SImode, op2); - emit_insn (gen_vec_extract_lo_v16si (half, op2)); - op2 = half; - break; - case IX86_BUILTIN_GATHER3ALTSIV4DF: - case IX86_BUILTIN_GATHER3ALTSIV4DI: - case IX86_BUILTIN_GATHERALTSIV4DF: - case IX86_BUILTIN_GATHERALTSIV4DI: - half = gen_reg_rtx (V4SImode); - if (!nonimmediate_operand (op2, V8SImode)) - op2 = copy_to_mode_reg (V8SImode, op2); - emit_insn (gen_vec_extract_lo_v8si (half, op2)); - op2 = half; - break; - case IX86_BUILTIN_GATHER3ALTDIV16SF: - case IX86_BUILTIN_GATHER3ALTDIV16SI: - half = gen_reg_rtx (mode0); - if (mode0 == V8SFmode) - gen = gen_vec_extract_lo_v16sf; - else - gen = gen_vec_extract_lo_v16si; - if (!nonimmediate_operand (op0, GET_MODE (op0))) - op0 = copy_to_mode_reg (GET_MODE (op0), op0); - emit_insn (gen (half, op0)); - op0 = half; - op3 = lowpart_subreg (QImode, op3, HImode); - break; - case IX86_BUILTIN_GATHER3ALTDIV8SF: - case IX86_BUILTIN_GATHER3ALTDIV8SI: - case IX86_BUILTIN_GATHERALTDIV8SF: - case IX86_BUILTIN_GATHERALTDIV8SI: - half = gen_reg_rtx (mode0); - if (mode0 == V4SFmode) - gen = gen_vec_extract_lo_v8sf; - else - gen = gen_vec_extract_lo_v8si; - if (!nonimmediate_operand (op0, GET_MODE (op0))) - op0 = copy_to_mode_reg (GET_MODE (op0), op0); - emit_insn (gen (half, op0)); - op0 = half; - if (VECTOR_MODE_P (GET_MODE (op3))) - { - half = gen_reg_rtx (mode0); - if (!nonimmediate_operand (op3, GET_MODE (op3))) - op3 = copy_to_mode_reg (GET_MODE (op3), op3); - emit_insn (gen (half, op3)); - op3 = half; - } - break; - default: - break; - } - - /* Force memory operand only with base register here. But we - don't want to do it on memory operand for other builtin - functions. */ - op1 = ix86_zero_extend_to_Pmode (op1); - - if (!insn_data[icode].operand[1].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if (!insn_data[icode].operand[2].predicate (op1, Pmode)) - op1 = copy_to_mode_reg (Pmode, op1); - if (!insn_data[icode].operand[3].predicate (op2, mode2)) - op2 = copy_to_mode_reg (mode2, op2); - - op3 = fixup_modeless_constant (op3, mode3); - - if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode) - { - if (!insn_data[icode].operand[4].predicate (op3, mode3)) - op3 = copy_to_mode_reg (mode3, op3); - } - else - { - op3 = copy_to_reg (op3); - op3 = lowpart_subreg (mode3, op3, GET_MODE (op3)); - } - if (!insn_data[icode].operand[5].predicate (op4, mode4)) - { - error ("the last argument must be scale 1, 2, 4, 8"); - return const0_rtx; - } - - /* Optimize. If mask is known to have all high bits set, - replace op0 with pc_rtx to signal that the instruction - overwrites the whole destination and doesn't use its - previous contents. 
*/ - if (optimize) - { - if (TREE_CODE (arg3) == INTEGER_CST) - { - if (integer_all_onesp (arg3)) - op0 = pc_rtx; - } - else if (TREE_CODE (arg3) == VECTOR_CST) - { - unsigned int negative = 0; - for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i) - { - tree cst = VECTOR_CST_ELT (arg3, i); - if (TREE_CODE (cst) == INTEGER_CST - && tree_int_cst_sign_bit (cst)) - negative++; - else if (TREE_CODE (cst) == REAL_CST - && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst))) - negative++; - } - if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3))) - op0 = pc_rtx; - } - else if (TREE_CODE (arg3) == SSA_NAME - && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE) - { - /* Recognize also when mask is like: - __v2df src = _mm_setzero_pd (); - __v2df mask = _mm_cmpeq_pd (src, src); - or - __v8sf src = _mm256_setzero_ps (); - __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ); - as that is a cheaper way to load all ones into - a register than having to load a constant from - memory. */ - gimple *def_stmt = SSA_NAME_DEF_STMT (arg3); - if (is_gimple_call (def_stmt)) - { - tree fndecl = gimple_call_fndecl (def_stmt); - if (fndecl - && fndecl_built_in_p (fndecl, BUILT_IN_MD)) - switch (DECL_MD_FUNCTION_CODE (fndecl)) - { - case IX86_BUILTIN_CMPPD: - case IX86_BUILTIN_CMPPS: - case IX86_BUILTIN_CMPPD256: - case IX86_BUILTIN_CMPPS256: - if (!integer_zerop (gimple_call_arg (def_stmt, 2))) - break; - /* FALLTHRU */ - case IX86_BUILTIN_CMPEQPD: - case IX86_BUILTIN_CMPEQPS: - if (initializer_zerop (gimple_call_arg (def_stmt, 0)) - && initializer_zerop (gimple_call_arg (def_stmt, - 1))) - op0 = pc_rtx; - break; - default: - break; - } - } - } - } - - pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4); - if (! pat) - return const0_rtx; - emit_insn (pat); - - switch (fcode) - { - case IX86_BUILTIN_GATHER3DIV16SF: - if (target == NULL_RTX) - target = gen_reg_rtx (V8SFmode); - emit_insn (gen_vec_extract_lo_v16sf (target, subtarget)); - break; - case IX86_BUILTIN_GATHER3DIV16SI: - if (target == NULL_RTX) - target = gen_reg_rtx (V8SImode); - emit_insn (gen_vec_extract_lo_v16si (target, subtarget)); - break; - case IX86_BUILTIN_GATHER3DIV8SF: - case IX86_BUILTIN_GATHERDIV8SF: - if (target == NULL_RTX) - target = gen_reg_rtx (V4SFmode); - emit_insn (gen_vec_extract_lo_v8sf (target, subtarget)); - break; - case IX86_BUILTIN_GATHER3DIV8SI: - case IX86_BUILTIN_GATHERDIV8SI: - if (target == NULL_RTX) - target = gen_reg_rtx (V4SImode); - emit_insn (gen_vec_extract_lo_v8si (target, subtarget)); - break; - default: - target = subtarget; - break; - } - return target; - - scatter_gen: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - arg2 = CALL_EXPR_ARG (exp, 2); - arg3 = CALL_EXPR_ARG (exp, 3); - arg4 = CALL_EXPR_ARG (exp, 4); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - op2 = expand_normal (arg2); - op3 = expand_normal (arg3); - op4 = expand_normal (arg4); - mode1 = insn_data[icode].operand[1].mode; - mode2 = insn_data[icode].operand[2].mode; - mode3 = insn_data[icode].operand[3].mode; - mode4 = insn_data[icode].operand[4].mode; - - /* Scatter instruction stores operand op3 to memory with - indices from op2 and scale from op4 under writemask op1. - If index operand op2 has more elements then source operand - op3 one need to use only its low half. And vice versa. 
*/ - switch (fcode) - { - case IX86_BUILTIN_SCATTERALTSIV8DF: - case IX86_BUILTIN_SCATTERALTSIV8DI: - half = gen_reg_rtx (V8SImode); - if (!nonimmediate_operand (op2, V16SImode)) - op2 = copy_to_mode_reg (V16SImode, op2); - emit_insn (gen_vec_extract_lo_v16si (half, op2)); - op2 = half; - break; - case IX86_BUILTIN_SCATTERALTDIV16SF: - case IX86_BUILTIN_SCATTERALTDIV16SI: - half = gen_reg_rtx (mode3); - if (mode3 == V8SFmode) - gen = gen_vec_extract_lo_v16sf; - else - gen = gen_vec_extract_lo_v16si; - if (!nonimmediate_operand (op3, GET_MODE (op3))) - op3 = copy_to_mode_reg (GET_MODE (op3), op3); - emit_insn (gen (half, op3)); - op3 = half; - break; - case IX86_BUILTIN_SCATTERALTSIV4DF: - case IX86_BUILTIN_SCATTERALTSIV4DI: - half = gen_reg_rtx (V4SImode); - if (!nonimmediate_operand (op2, V8SImode)) - op2 = copy_to_mode_reg (V8SImode, op2); - emit_insn (gen_vec_extract_lo_v8si (half, op2)); - op2 = half; - break; - case IX86_BUILTIN_SCATTERALTDIV8SF: - case IX86_BUILTIN_SCATTERALTDIV8SI: - half = gen_reg_rtx (mode3); - if (mode3 == V4SFmode) - gen = gen_vec_extract_lo_v8sf; - else - gen = gen_vec_extract_lo_v8si; - if (!nonimmediate_operand (op3, GET_MODE (op3))) - op3 = copy_to_mode_reg (GET_MODE (op3), op3); - emit_insn (gen (half, op3)); - op3 = half; - break; - case IX86_BUILTIN_SCATTERALTSIV2DF: - case IX86_BUILTIN_SCATTERALTSIV2DI: - if (!nonimmediate_operand (op2, V4SImode)) - op2 = copy_to_mode_reg (V4SImode, op2); - break; - case IX86_BUILTIN_SCATTERALTDIV4SF: - case IX86_BUILTIN_SCATTERALTDIV4SI: - if (!nonimmediate_operand (op3, GET_MODE (op3))) - op3 = copy_to_mode_reg (GET_MODE (op3), op3); - break; - default: - break; - } - - /* Force memory operand only with base register here. But we - don't want to do it on memory operand for other builtin - functions. */ - op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1)); - - if (!insn_data[icode].operand[0].predicate (op0, Pmode)) - op0 = copy_to_mode_reg (Pmode, op0); - - op1 = fixup_modeless_constant (op1, mode1); - - if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode) - { - if (!insn_data[icode].operand[1].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - } - else - { - op1 = copy_to_reg (op1); - op1 = lowpart_subreg (mode1, op1, GET_MODE (op1)); - } - - if (!insn_data[icode].operand[2].predicate (op2, mode2)) - op2 = copy_to_mode_reg (mode2, op2); - - if (!insn_data[icode].operand[3].predicate (op3, mode3)) - op3 = copy_to_mode_reg (mode3, op3); - - if (!insn_data[icode].operand[4].predicate (op4, mode4)) - { - error ("the last argument must be scale 1, 2, 4, 8"); - return const0_rtx; - } - - pat = GEN_FCN (icode) (op0, op1, op2, op3, op4); - if (! 
pat) - return const0_rtx; - - emit_insn (pat); - return 0; - - vec_prefetch_gen: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - arg2 = CALL_EXPR_ARG (exp, 2); - arg3 = CALL_EXPR_ARG (exp, 3); - arg4 = CALL_EXPR_ARG (exp, 4); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - op2 = expand_normal (arg2); - op3 = expand_normal (arg3); - op4 = expand_normal (arg4); - mode0 = insn_data[icode].operand[0].mode; - mode1 = insn_data[icode].operand[1].mode; - mode3 = insn_data[icode].operand[3].mode; - mode4 = insn_data[icode].operand[4].mode; - - op0 = fixup_modeless_constant (op0, mode0); - - if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode) - { - if (!insn_data[icode].operand[0].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - } - else - { - op0 = copy_to_reg (op0); - op0 = lowpart_subreg (mode0, op0, GET_MODE (op0)); - } - - if (!insn_data[icode].operand[1].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - - /* Force memory operand only with base register here. But we - don't want to do it on memory operand for other builtin - functions. */ - op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1)); - - if (!insn_data[icode].operand[2].predicate (op2, Pmode)) - op2 = copy_to_mode_reg (Pmode, op2); - - if (!insn_data[icode].operand[3].predicate (op3, mode3)) - { - error ("the forth argument must be scale 1, 2, 4, 8"); - return const0_rtx; - } - - if (!insn_data[icode].operand[4].predicate (op4, mode4)) - { - error ("incorrect hint operand"); - return const0_rtx; - } - - pat = GEN_FCN (icode) (op0, op1, op2, op3, op4); - if (! pat) - return const0_rtx; - - emit_insn (pat); - - return 0; - - case IX86_BUILTIN_XABORT: - icode = CODE_FOR_xabort; - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - mode0 = insn_data[icode].operand[0].mode; - if (!insn_data[icode].operand[0].predicate (op0, mode0)) - { - error ("the argument to %<xabort%> intrinsic must " - "be an 8-bit immediate"); - return const0_rtx; - } - emit_insn (gen_xabort (op0)); - return 0; - - case IX86_BUILTIN_RSTORSSP: - case IX86_BUILTIN_CLRSSBSY: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - icode = (fcode == IX86_BUILTIN_RSTORSSP - ? 
CODE_FOR_rstorssp - : CODE_FOR_clrssbsy); - if (!address_operand (op0, VOIDmode)) - { - op1 = convert_memory_address (Pmode, op0); - op0 = copy_addr_to_reg (op1); - } - emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0))); - return 0; - - case IX86_BUILTIN_WRSSD: - case IX86_BUILTIN_WRSSQ: - case IX86_BUILTIN_WRUSSD: - case IX86_BUILTIN_WRUSSQ: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - arg1 = CALL_EXPR_ARG (exp, 1); - op1 = expand_normal (arg1); - switch (fcode) - { - case IX86_BUILTIN_WRSSD: - icode = CODE_FOR_wrsssi; - mode = SImode; - break; - case IX86_BUILTIN_WRSSQ: - icode = CODE_FOR_wrssdi; - mode = DImode; - break; - case IX86_BUILTIN_WRUSSD: - icode = CODE_FOR_wrusssi; - mode = SImode; - break; - case IX86_BUILTIN_WRUSSQ: - icode = CODE_FOR_wrussdi; - mode = DImode; - break; - } - op0 = force_reg (mode, op0); - if (!address_operand (op1, VOIDmode)) - { - op2 = convert_memory_address (Pmode, op1); - op1 = copy_addr_to_reg (op2); - } - emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1))); - return 0; - - default: - break; - } - - if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST - && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST; - return ix86_expand_special_args_builtin (bdesc_special_args + i, exp, - target); - } - - if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST - && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST; - rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL; - rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx); - rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx); - int masked = 1; - machine_mode mode, wide_mode, nar_mode; - - nar_mode = V4SFmode; - mode = V16SFmode; - wide_mode = V64SFmode; - fcn_mask = gen_avx5124fmaddps_4fmaddps_mask; - fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz; - - switch (fcode) - { - case IX86_BUILTIN_4FMAPS: - fcn = gen_avx5124fmaddps_4fmaddps; - masked = 0; - goto v4fma_expand; - - case IX86_BUILTIN_4DPWSSD: - nar_mode = V4SImode; - mode = V16SImode; - wide_mode = V64SImode; - fcn = gen_avx5124vnniw_vp4dpwssd; - masked = 0; - goto v4fma_expand; - - case IX86_BUILTIN_4DPWSSDS: - nar_mode = V4SImode; - mode = V16SImode; - wide_mode = V64SImode; - fcn = gen_avx5124vnniw_vp4dpwssds; - masked = 0; - goto v4fma_expand; - - case IX86_BUILTIN_4FNMAPS: - fcn = gen_avx5124fmaddps_4fnmaddps; - masked = 0; - goto v4fma_expand; - - case IX86_BUILTIN_4FNMAPS_MASK: - fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask; - fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz; - goto v4fma_expand; - - case IX86_BUILTIN_4DPWSSD_MASK: - nar_mode = V4SImode; - mode = V16SImode; - wide_mode = V64SImode; - fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask; - fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz; - goto v4fma_expand; - - case IX86_BUILTIN_4DPWSSDS_MASK: - nar_mode = V4SImode; - mode = V16SImode; - wide_mode = V64SImode; - fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask; - fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz; - goto v4fma_expand; - - case IX86_BUILTIN_4FMAPS_MASK: - { - tree args[4]; - rtx ops[4]; - rtx wide_reg; - rtx accum; - rtx addr; - rtx mem; - -v4fma_expand: - wide_reg = gen_reg_rtx (wide_mode); - for (i = 0; i < 4; i++) - { - args[i] = CALL_EXPR_ARG (exp, i); - ops[i] = expand_normal (args[i]); - - emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64), - ops[i]); - } - - accum = expand_normal (CALL_EXPR_ARG (exp, 4)); - accum = force_reg (mode, accum); - - addr = expand_normal (CALL_EXPR_ARG (exp, 5)); - addr = force_reg (Pmode, addr); - - mem = gen_rtx_MEM 
(nar_mode, addr); - - target = gen_reg_rtx (mode); - - emit_move_insn (target, accum); - - if (! masked) - emit_insn (fcn (target, accum, wide_reg, mem)); - else - { - rtx merge, mask; - merge = expand_normal (CALL_EXPR_ARG (exp, 6)); - - mask = expand_normal (CALL_EXPR_ARG (exp, 7)); - - if (CONST_INT_P (mask)) - mask = fixup_modeless_constant (mask, HImode); - - mask = force_reg (HImode, mask); - - if (GET_MODE (mask) != HImode) - mask = gen_rtx_SUBREG (HImode, mask, 0); - - /* If merge is 0 then we're about to emit z-masked variant. */ - if (const0_operand (merge, mode)) - emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask)); - /* If merge is the same as accum then emit merge-masked variant. */ - else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4)) - { - merge = force_reg (mode, merge); - emit_insn (fcn_mask (target, wide_reg, mem, merge, mask)); - } - /* Merge with something unknown might happen if we z-mask w/ -O0. */ - else - { - target = gen_reg_rtx (mode); - emit_move_insn (target, merge); - emit_insn (fcn_mask (target, wide_reg, mem, target, mask)); - } - } - return target; - } - - case IX86_BUILTIN_4FNMASS: - fcn = gen_avx5124fmaddps_4fnmaddss; - masked = 0; - goto s4fma_expand; - - case IX86_BUILTIN_4FMASS: - fcn = gen_avx5124fmaddps_4fmaddss; - masked = 0; - goto s4fma_expand; - - case IX86_BUILTIN_4FNMASS_MASK: - fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask; - fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz; - goto s4fma_expand; - - case IX86_BUILTIN_4FMASS_MASK: - { - tree args[4]; - rtx ops[4]; - rtx wide_reg; - rtx accum; - rtx addr; - rtx mem; - - fcn_mask = gen_avx5124fmaddps_4fmaddss_mask; - fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz; - -s4fma_expand: - mode = V4SFmode; - wide_reg = gen_reg_rtx (V64SFmode); - for (i = 0; i < 4; i++) - { - rtx tmp; - args[i] = CALL_EXPR_ARG (exp, i); - ops[i] = expand_normal (args[i]); - - tmp = gen_reg_rtx (SFmode); - emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0)); - - emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64), - gen_rtx_SUBREG (V16SFmode, tmp, 0)); - } - - accum = expand_normal (CALL_EXPR_ARG (exp, 4)); - accum = force_reg (V4SFmode, accum); - - addr = expand_normal (CALL_EXPR_ARG (exp, 5)); - addr = force_reg (Pmode, addr); - - mem = gen_rtx_MEM (V4SFmode, addr); - - target = gen_reg_rtx (V4SFmode); - - emit_move_insn (target, accum); - - if (! masked) - emit_insn (fcn (target, accum, wide_reg, mem)); - else - { - rtx merge, mask; - merge = expand_normal (CALL_EXPR_ARG (exp, 6)); - - mask = expand_normal (CALL_EXPR_ARG (exp, 7)); - - if (CONST_INT_P (mask)) - mask = fixup_modeless_constant (mask, QImode); - - mask = force_reg (QImode, mask); - - if (GET_MODE (mask) != QImode) - mask = gen_rtx_SUBREG (QImode, mask, 0); - - /* If merge is 0 then we're about to emit z-masked variant. */ - if (const0_operand (merge, mode)) - emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask)); - /* If merge is the same as accum then emit merge-masked - variant. */ - else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4)) - { - merge = force_reg (mode, merge); - emit_insn (fcn_mask (target, wide_reg, mem, merge, mask)); - } - /* Merge with something unknown might happen if we z-mask - w/ -O0. 
*/ - else - { - target = gen_reg_rtx (mode); - emit_move_insn (target, merge); - emit_insn (fcn_mask (target, wide_reg, mem, target, mask)); - } - } - return target; - } - case IX86_BUILTIN_RDPID: - return ix86_expand_special_args_builtin (bdesc_args + i, exp, - target); - case IX86_BUILTIN_FABSQ: - case IX86_BUILTIN_COPYSIGNQ: - if (!TARGET_SSE) - /* Emit a normal call if SSE isn't available. */ - return expand_call (exp, target, ignore); - /* FALLTHRU */ - default: - return ix86_expand_args_builtin (bdesc_args + i, exp, target); - } - } - - if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST - && fcode <= IX86_BUILTIN__BDESC_COMI_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST; - return ix86_expand_sse_comi (bdesc_comi + i, exp, target); - } - - if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST - && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST; - return ix86_expand_round_builtin (bdesc_round_args + i, exp, target); - } - - if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST - && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST; - return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target); - } - - if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST - && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST; - return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target); - } - - if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST - && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST; - const struct builtin_description *d = bdesc_multi_arg + i; - return ix86_expand_multi_arg_builtin (d->icode, exp, target, - (enum ix86_builtin_func_type) - d->flag, d->comparison); - } - - if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST - && fcode <= IX86_BUILTIN__BDESC_CET_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_CET_FIRST; - return ix86_expand_special_args_builtin (bdesc_cet + i, exp, - target); - } - - if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST - && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST; - return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp, - target); - } - - gcc_unreachable (); -} - -/* A subroutine of ix86_expand_vector_init_duplicate. Tries to - fill target with val via vec_duplicate. */ - -static bool -ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val) -{ - bool ok; - rtx_insn *insn; - rtx dup; - - /* First attempt to recognize VAL as-is. */ - dup = gen_vec_duplicate (mode, val); - insn = emit_insn (gen_rtx_SET (target, dup)); - if (recog_memoized (insn) < 0) - { - rtx_insn *seq; - machine_mode innermode = GET_MODE_INNER (mode); - rtx reg; - - /* If that fails, force VAL into a register. */ - - start_sequence (); - reg = force_reg (innermode, val); - if (GET_MODE (reg) != innermode) - reg = gen_lowpart (innermode, reg); - SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg); - seq = get_insns (); - end_sequence (); - if (seq) - emit_insn_before (seq, insn); - - ok = recog_memoized (insn) >= 0; - gcc_assert (ok); - } - return true; -} - -/* Get a vector mode of the same size as the original but with elements - twice as wide. This is only guaranteed to apply to integral vectors. */ - -static machine_mode -get_mode_wider_vector (machine_mode o) -{ - /* ??? Rely on the ordering that genmodes.c gives to vectors. 
*/ - machine_mode n = GET_MODE_WIDER_MODE (o).require (); - gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2); - gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n)); - return n; -} - -static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d); -static bool expand_vec_perm_1 (struct expand_vec_perm_d *d); - -/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector - with all elements equal to VAR. Return true if successful. */ - -static bool -ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, - rtx target, rtx val) -{ - bool ok; - - switch (mode) - { - case E_V2SImode: - case E_V2SFmode: - if (!mmx_ok) - return false; - /* FALLTHRU */ - - case E_V4DFmode: - case E_V4DImode: - case E_V8SFmode: - case E_V8SImode: - case E_V2DFmode: - case E_V2DImode: - case E_V4SFmode: - case E_V4SImode: - case E_V16SImode: - case E_V8DImode: - case E_V16SFmode: - case E_V8DFmode: - return ix86_vector_duplicate_value (mode, target, val); - - case E_V4HImode: - if (!mmx_ok) - return false; - if (TARGET_SSE || TARGET_3DNOW_A) - { - rtx x; - - val = gen_lowpart (SImode, val); - x = gen_rtx_TRUNCATE (HImode, val); - x = gen_rtx_VEC_DUPLICATE (mode, x); - emit_insn (gen_rtx_SET (target, x)); - return true; - } - goto widen; - - case E_V8QImode: - if (!mmx_ok) - return false; - goto widen; - - case E_V8HImode: - if (TARGET_AVX2) - return ix86_vector_duplicate_value (mode, target, val); - - if (TARGET_SSE2) - { - struct expand_vec_perm_d dperm; - rtx tmp1, tmp2; - - permute: - memset (&dperm, 0, sizeof (dperm)); - dperm.target = target; - dperm.vmode = mode; - dperm.nelt = GET_MODE_NUNITS (mode); - dperm.op0 = dperm.op1 = gen_reg_rtx (mode); - dperm.one_operand_p = true; - - /* Extend to SImode using a paradoxical SUBREG. */ - tmp1 = gen_reg_rtx (SImode); - emit_move_insn (tmp1, gen_lowpart (SImode, val)); - - /* Insert the SImode value as low element of a V4SImode vector. */ - tmp2 = gen_reg_rtx (V4SImode); - emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1)); - emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2)); - - ok = (expand_vec_perm_1 (&dperm) - || expand_vec_perm_broadcast_1 (&dperm)); - gcc_assert (ok); - return ok; - } - goto widen; - - case E_V16QImode: - if (TARGET_AVX2) - return ix86_vector_duplicate_value (mode, target, val); - - if (TARGET_SSE2) - goto permute; - goto widen; - - widen: - /* Replicate the value once into the next wider mode and recurse. */ - { - machine_mode smode, wsmode, wvmode; - rtx x; - - smode = GET_MODE_INNER (mode); - wvmode = get_mode_wider_vector (mode); - wsmode = GET_MODE_INNER (wvmode); - - val = convert_modes (wsmode, smode, val, true); - x = expand_simple_binop (wsmode, ASHIFT, val, - GEN_INT (GET_MODE_BITSIZE (smode)), - NULL_RTX, 1, OPTAB_LIB_WIDEN); - val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN); - - x = gen_reg_rtx (wvmode); - ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val); - gcc_assert (ok); - emit_move_insn (target, gen_lowpart (GET_MODE (target), x)); - return ok; - } - - case E_V16HImode: - case E_V32QImode: - if (TARGET_AVX2) - return ix86_vector_duplicate_value (mode, target, val); - else - { - machine_mode hvmode = (mode == V16HImode ? 
V8HImode : V16QImode); - rtx x = gen_reg_rtx (hvmode); - - ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val); - gcc_assert (ok); - - x = gen_rtx_VEC_CONCAT (mode, x, x); - emit_insn (gen_rtx_SET (target, x)); - } - return true; - - case E_V64QImode: - case E_V32HImode: - if (TARGET_AVX512BW) - return ix86_vector_duplicate_value (mode, target, val); - else - { - machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode); - rtx x = gen_reg_rtx (hvmode); - - ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val); - gcc_assert (ok); - - x = gen_rtx_VEC_CONCAT (mode, x, x); - emit_insn (gen_rtx_SET (target, x)); - } - return true; - - default: - return false; - } -} - -/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector - whose ONE_VAR element is VAR, and other elements are zero. Return true - if successful. */ - -static bool -ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, - rtx target, rtx var, int one_var) -{ - machine_mode vsimode; - rtx new_target; - rtx x, tmp; - bool use_vector_set = false; - rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL; - - switch (mode) - { - case E_V2DImode: - /* For SSE4.1, we normally use vector set. But if the second - element is zero and inter-unit moves are OK, we use movq - instead. */ - use_vector_set = (TARGET_64BIT && TARGET_SSE4_1 - && !(TARGET_INTER_UNIT_MOVES_TO_VEC - && one_var == 0)); - break; - case E_V16QImode: - case E_V4SImode: - case E_V4SFmode: - use_vector_set = TARGET_SSE4_1; - break; - case E_V8HImode: - use_vector_set = TARGET_SSE2; - break; - case E_V8QImode: - use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1; - break; - case E_V4HImode: - use_vector_set = TARGET_SSE || TARGET_3DNOW_A; - break; - case E_V32QImode: - case E_V16HImode: - use_vector_set = TARGET_AVX; - break; - case E_V8SImode: - use_vector_set = TARGET_AVX; - gen_vec_set_0 = gen_vec_setv8si_0; - break; - case E_V8SFmode: - use_vector_set = TARGET_AVX; - gen_vec_set_0 = gen_vec_setv8sf_0; - break; - case E_V4DFmode: - use_vector_set = TARGET_AVX; - gen_vec_set_0 = gen_vec_setv4df_0; - break; - case E_V4DImode: - /* Use ix86_expand_vector_set in 64bit mode only. */ - use_vector_set = TARGET_AVX && TARGET_64BIT; - gen_vec_set_0 = gen_vec_setv4di_0; - break; - case E_V16SImode: - use_vector_set = TARGET_AVX512F && one_var == 0; - gen_vec_set_0 = gen_vec_setv16si_0; - break; - case E_V16SFmode: - use_vector_set = TARGET_AVX512F && one_var == 0; - gen_vec_set_0 = gen_vec_setv16sf_0; - break; - case E_V8DFmode: - use_vector_set = TARGET_AVX512F && one_var == 0; - gen_vec_set_0 = gen_vec_setv8df_0; - break; - case E_V8DImode: - /* Use ix86_expand_vector_set in 64bit mode only. 
*/ - use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0; - gen_vec_set_0 = gen_vec_setv8di_0; - break; - default: - break; - } - - if (use_vector_set) - { - if (gen_vec_set_0 && one_var == 0) - { - var = force_reg (GET_MODE_INNER (mode), var); - emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var)); - return true; - } - emit_insn (gen_rtx_SET (target, CONST0_RTX (mode))); - var = force_reg (GET_MODE_INNER (mode), var); - ix86_expand_vector_set (mmx_ok, target, var, one_var); - return true; - } - - switch (mode) - { - case E_V2SFmode: - case E_V2SImode: - if (!mmx_ok) - return false; - /* FALLTHRU */ - - case E_V2DFmode: - case E_V2DImode: - if (one_var != 0) - return false; - var = force_reg (GET_MODE_INNER (mode), var); - x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode))); - emit_insn (gen_rtx_SET (target, x)); - return true; - - case E_V4SFmode: - case E_V4SImode: - if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER) - new_target = gen_reg_rtx (mode); - else - new_target = target; - var = force_reg (GET_MODE_INNER (mode), var); - x = gen_rtx_VEC_DUPLICATE (mode, var); - x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx); - emit_insn (gen_rtx_SET (new_target, x)); - if (one_var != 0) - { - /* We need to shuffle the value to the correct position, so - create a new pseudo to store the intermediate result. */ - - /* With SSE2, we can use the integer shuffle insns. */ - if (mode != V4SFmode && TARGET_SSE2) - { - emit_insn (gen_sse2_pshufd_1 (new_target, new_target, - const1_rtx, - GEN_INT (one_var == 1 ? 0 : 1), - GEN_INT (one_var == 2 ? 0 : 1), - GEN_INT (one_var == 3 ? 0 : 1))); - if (target != new_target) - emit_move_insn (target, new_target); - return true; - } - - /* Otherwise convert the intermediate result to V4SFmode and - use the SSE1 shuffle instructions. */ - if (mode != V4SFmode) - { - tmp = gen_reg_rtx (V4SFmode); - emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target)); - } - else - tmp = new_target; - - emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp, - const1_rtx, - GEN_INT (one_var == 1 ? 0 : 1), - GEN_INT (one_var == 2 ? 0+4 : 1+4), - GEN_INT (one_var == 3 ? 0+4 : 1+4))); - - if (mode != V4SFmode) - emit_move_insn (target, gen_lowpart (V4SImode, tmp)); - else if (tmp != target) - emit_move_insn (target, tmp); - } - else if (target != new_target) - emit_move_insn (target, new_target); - return true; - - case E_V8HImode: - case E_V16QImode: - vsimode = V4SImode; - goto widen; - case E_V4HImode: - case E_V8QImode: - if (!mmx_ok) - return false; - vsimode = V2SImode; - goto widen; - widen: - if (one_var != 0) - return false; - - /* Zero extend the variable element to SImode and recurse. */ - var = convert_modes (SImode, GET_MODE_INNER (mode), var, true); - - x = gen_reg_rtx (vsimode); - if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x, - var, one_var)) - gcc_unreachable (); - - emit_move_insn (target, gen_lowpart (mode, x)); - return true; - - default: - return false; - } -} - -/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector - consisting of the values in VALS. It is known that all elements - except ONE_VAR are constants. Return true if successful. 
*/ - -static bool -ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode, - rtx target, rtx vals, int one_var) -{ - rtx var = XVECEXP (vals, 0, one_var); - machine_mode wmode; - rtx const_vec, x; - - const_vec = copy_rtx (vals); - XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode)); - const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0)); - - switch (mode) - { - case E_V2DFmode: - case E_V2DImode: - case E_V2SFmode: - case E_V2SImode: - /* For the two element vectors, it's just as easy to use - the general case. */ - return false; - - case E_V4DImode: - /* Use ix86_expand_vector_set in 64bit mode only. */ - if (!TARGET_64BIT) - return false; - /* FALLTHRU */ - case E_V4DFmode: - case E_V8SFmode: - case E_V8SImode: - case E_V16HImode: - case E_V32QImode: - case E_V4SFmode: - case E_V4SImode: - case E_V8HImode: - case E_V4HImode: - break; - - case E_V16QImode: - if (TARGET_SSE4_1) - break; - wmode = V8HImode; - goto widen; - case E_V8QImode: - if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1) - break; - wmode = V4HImode; - goto widen; - widen: - /* There's no way to set one QImode entry easily. Combine - the variable value with its adjacent constant value, and - promote to an HImode set. */ - x = XVECEXP (vals, 0, one_var ^ 1); - if (one_var & 1) - { - var = convert_modes (HImode, QImode, var, true); - var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8), - NULL_RTX, 1, OPTAB_LIB_WIDEN); - x = GEN_INT (INTVAL (x) & 0xff); - } - else - { - var = convert_modes (HImode, QImode, var, true); - x = gen_int_mode (UINTVAL (x) << 8, HImode); - } - if (x != const0_rtx) - var = expand_simple_binop (HImode, IOR, var, x, var, - 1, OPTAB_LIB_WIDEN); - - x = gen_reg_rtx (wmode); - emit_move_insn (x, gen_lowpart (wmode, const_vec)); - ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1); - - emit_move_insn (target, gen_lowpart (mode, x)); - return true; - - default: - return false; - } - - emit_move_insn (target, const_vec); - ix86_expand_vector_set (mmx_ok, target, var, one_var); - return true; -} - -/* A subroutine of ix86_expand_vector_init_general. Use vector - concatenate to handle the most general case: all values variable, - and none identical. 
*/ - -static void -ix86_expand_vector_init_concat (machine_mode mode, - rtx target, rtx *ops, int n) -{ - machine_mode half_mode = VOIDmode; - rtx half[2]; - rtvec v; - int i, j; - - switch (n) - { - case 2: - switch (mode) - { - case E_V16SImode: - half_mode = V8SImode; - break; - case E_V16SFmode: - half_mode = V8SFmode; - break; - case E_V8DImode: - half_mode = V4DImode; - break; - case E_V8DFmode: - half_mode = V4DFmode; - break; - case E_V8SImode: - half_mode = V4SImode; - break; - case E_V8SFmode: - half_mode = V4SFmode; - break; - case E_V4DImode: - half_mode = V2DImode; - break; - case E_V4DFmode: - half_mode = V2DFmode; - break; - case E_V4SImode: - half_mode = V2SImode; - break; - case E_V4SFmode: - half_mode = V2SFmode; - break; - case E_V2DImode: - half_mode = DImode; - break; - case E_V2SImode: - half_mode = SImode; - break; - case E_V2DFmode: - half_mode = DFmode; - break; - case E_V2SFmode: - half_mode = SFmode; - break; - default: - gcc_unreachable (); - } - - if (!register_operand (ops[1], half_mode)) - ops[1] = force_reg (half_mode, ops[1]); - if (!register_operand (ops[0], half_mode)) - ops[0] = force_reg (half_mode, ops[0]); - emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0], - ops[1]))); - break; - - case 4: - switch (mode) - { - case E_V4DImode: - half_mode = V2DImode; - break; - case E_V4DFmode: - half_mode = V2DFmode; - break; - case E_V4SImode: - half_mode = V2SImode; - break; - case E_V4SFmode: - half_mode = V2SFmode; - break; - default: - gcc_unreachable (); - } - goto half; - - case 8: - switch (mode) - { - case E_V8DImode: - half_mode = V4DImode; - break; - case E_V8DFmode: - half_mode = V4DFmode; - break; - case E_V8SImode: - half_mode = V4SImode; - break; - case E_V8SFmode: - half_mode = V4SFmode; - break; - default: - gcc_unreachable (); - } - goto half; - - case 16: - switch (mode) - { - case E_V16SImode: - half_mode = V8SImode; - break; - case E_V16SFmode: - half_mode = V8SFmode; - break; - default: - gcc_unreachable (); - } - goto half; - -half: - /* FIXME: We process inputs backward to help RA. PR 36222. */ - i = n - 1; - for (j = 1; j != -1; j--) - { - half[j] = gen_reg_rtx (half_mode); - switch (n >> 1) - { - case 2: - v = gen_rtvec (2, ops[i-1], ops[i]); - i -= 2; - break; - case 4: - v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]); - i -= 4; - break; - case 8: - v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4], - ops[i-3], ops[i-2], ops[i-1], ops[i]); - i -= 8; - break; - default: - gcc_unreachable (); - } - ix86_expand_vector_init (false, half[j], - gen_rtx_PARALLEL (half_mode, v)); - } - - ix86_expand_vector_init_concat (mode, target, half, 2); - break; - - default: - gcc_unreachable (); - } -} - -/* A subroutine of ix86_expand_vector_init_general. Use vector - interleave to handle the most general case: all values variable, - and none identical. 
*/ - -static void -ix86_expand_vector_init_interleave (machine_mode mode, - rtx target, rtx *ops, int n) -{ - machine_mode first_imode, second_imode, third_imode, inner_mode; - int i, j; - rtx op0, op1; - rtx (*gen_load_even) (rtx, rtx, rtx); - rtx (*gen_interleave_first_low) (rtx, rtx, rtx); - rtx (*gen_interleave_second_low) (rtx, rtx, rtx); - - switch (mode) - { - case E_V8HImode: - gen_load_even = gen_vec_setv8hi; - gen_interleave_first_low = gen_vec_interleave_lowv4si; - gen_interleave_second_low = gen_vec_interleave_lowv2di; - inner_mode = HImode; - first_imode = V4SImode; - second_imode = V2DImode; - third_imode = VOIDmode; - break; - case E_V16QImode: - gen_load_even = gen_vec_setv16qi; - gen_interleave_first_low = gen_vec_interleave_lowv8hi; - gen_interleave_second_low = gen_vec_interleave_lowv4si; - inner_mode = QImode; - first_imode = V8HImode; - second_imode = V4SImode; - third_imode = V2DImode; - break; - default: - gcc_unreachable (); - } - - for (i = 0; i < n; i++) - { - /* Extend the odd elment to SImode using a paradoxical SUBREG. */ - op0 = gen_reg_rtx (SImode); - emit_move_insn (op0, gen_lowpart (SImode, ops [i + i])); - - /* Insert the SImode value as low element of V4SImode vector. */ - op1 = gen_reg_rtx (V4SImode); - op0 = gen_rtx_VEC_MERGE (V4SImode, - gen_rtx_VEC_DUPLICATE (V4SImode, - op0), - CONST0_RTX (V4SImode), - const1_rtx); - emit_insn (gen_rtx_SET (op1, op0)); - - /* Cast the V4SImode vector back to a vector in orignal mode. */ - op0 = gen_reg_rtx (mode); - emit_move_insn (op0, gen_lowpart (mode, op1)); - - /* Load even elements into the second position. */ - emit_insn (gen_load_even (op0, - force_reg (inner_mode, - ops [i + i + 1]), - const1_rtx)); - - /* Cast vector to FIRST_IMODE vector. */ - ops[i] = gen_reg_rtx (first_imode); - emit_move_insn (ops[i], gen_lowpart (first_imode, op0)); - } - - /* Interleave low FIRST_IMODE vectors. */ - for (i = j = 0; i < n; i += 2, j++) - { - op0 = gen_reg_rtx (first_imode); - emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1])); - - /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */ - ops[j] = gen_reg_rtx (second_imode); - emit_move_insn (ops[j], gen_lowpart (second_imode, op0)); - } - - /* Interleave low SECOND_IMODE vectors. */ - switch (second_imode) - { - case E_V4SImode: - for (i = j = 0; i < n / 2; i += 2, j++) - { - op0 = gen_reg_rtx (second_imode); - emit_insn (gen_interleave_second_low (op0, ops[i], - ops[i + 1])); - - /* Cast the SECOND_IMODE vector to the THIRD_IMODE - vector. */ - ops[j] = gen_reg_rtx (third_imode); - emit_move_insn (ops[j], gen_lowpart (third_imode, op0)); - } - second_imode = V2DImode; - gen_interleave_second_low = gen_vec_interleave_lowv2di; - /* FALLTHRU */ - - case E_V2DImode: - op0 = gen_reg_rtx (second_imode); - emit_insn (gen_interleave_second_low (op0, ops[0], - ops[1])); - - /* Cast the SECOND_IMODE vector back to a vector on original - mode. */ - emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0))); - break; - - default: - gcc_unreachable (); - } -} - -/* A subroutine of ix86_expand_vector_init. Handle the most general case: - all values variable, and none identical. 
*/ - -static void -ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode, - rtx target, rtx vals) -{ - rtx ops[64], op0, op1, op2, op3, op4, op5; - machine_mode half_mode = VOIDmode; - machine_mode quarter_mode = VOIDmode; - int n, i; - - switch (mode) - { - case E_V2SFmode: - case E_V2SImode: - if (!mmx_ok && !TARGET_SSE) - break; - /* FALLTHRU */ - - case E_V16SImode: - case E_V16SFmode: - case E_V8DFmode: - case E_V8DImode: - case E_V8SFmode: - case E_V8SImode: - case E_V4DFmode: - case E_V4DImode: - case E_V4SFmode: - case E_V4SImode: - case E_V2DFmode: - case E_V2DImode: - n = GET_MODE_NUNITS (mode); - for (i = 0; i < n; i++) - ops[i] = XVECEXP (vals, 0, i); - ix86_expand_vector_init_concat (mode, target, ops, n); - return; - - case E_V2TImode: - for (i = 0; i < 2; i++) - ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i)); - op0 = gen_reg_rtx (V4DImode); - ix86_expand_vector_init_concat (V4DImode, op0, ops, 2); - emit_move_insn (target, gen_lowpart (GET_MODE (target), op0)); - return; - - case E_V4TImode: - for (i = 0; i < 4; i++) - ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i)); - ops[4] = gen_reg_rtx (V4DImode); - ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2); - ops[5] = gen_reg_rtx (V4DImode); - ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2); - op0 = gen_reg_rtx (V8DImode); - ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2); - emit_move_insn (target, gen_lowpart (GET_MODE (target), op0)); - return; - - case E_V32QImode: - half_mode = V16QImode; - goto half; - - case E_V16HImode: - half_mode = V8HImode; - goto half; - -half: - n = GET_MODE_NUNITS (mode); - for (i = 0; i < n; i++) - ops[i] = XVECEXP (vals, 0, i); - op0 = gen_reg_rtx (half_mode); - op1 = gen_reg_rtx (half_mode); - ix86_expand_vector_init_interleave (half_mode, op0, ops, - n >> 2); - ix86_expand_vector_init_interleave (half_mode, op1, - &ops [n >> 1], n >> 2); - emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1))); - return; - - case E_V64QImode: - quarter_mode = V16QImode; - half_mode = V32QImode; - goto quarter; - - case E_V32HImode: - quarter_mode = V8HImode; - half_mode = V16HImode; - goto quarter; - -quarter: - n = GET_MODE_NUNITS (mode); - for (i = 0; i < n; i++) - ops[i] = XVECEXP (vals, 0, i); - op0 = gen_reg_rtx (quarter_mode); - op1 = gen_reg_rtx (quarter_mode); - op2 = gen_reg_rtx (quarter_mode); - op3 = gen_reg_rtx (quarter_mode); - op4 = gen_reg_rtx (half_mode); - op5 = gen_reg_rtx (half_mode); - ix86_expand_vector_init_interleave (quarter_mode, op0, ops, - n >> 3); - ix86_expand_vector_init_interleave (quarter_mode, op1, - &ops [n >> 2], n >> 3); - ix86_expand_vector_init_interleave (quarter_mode, op2, - &ops [n >> 1], n >> 3); - ix86_expand_vector_init_interleave (quarter_mode, op3, - &ops [(n >> 1) | (n >> 2)], n >> 3); - emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1))); - emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3))); - emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5))); - return; - - case E_V16QImode: - if (!TARGET_SSE4_1) - break; - /* FALLTHRU */ - - case E_V8HImode: - if (!TARGET_SSE2) - break; - - /* Don't use ix86_expand_vector_init_interleave if we can't - move from GPR to SSE register directly. 
*/ - if (!TARGET_INTER_UNIT_MOVES_TO_VEC) - break; - - n = GET_MODE_NUNITS (mode); - for (i = 0; i < n; i++) - ops[i] = XVECEXP (vals, 0, i); - ix86_expand_vector_init_interleave (mode, target, ops, n >> 1); - return; - - case E_V4HImode: - case E_V8QImode: - break; - - default: - gcc_unreachable (); - } - - { - int i, j, n_elts, n_words, n_elt_per_word; - machine_mode inner_mode; - rtx words[4], shift; - - inner_mode = GET_MODE_INNER (mode); - n_elts = GET_MODE_NUNITS (mode); - n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD; - n_elt_per_word = n_elts / n_words; - shift = GEN_INT (GET_MODE_BITSIZE (inner_mode)); - - for (i = 0; i < n_words; ++i) - { - rtx word = NULL_RTX; - - for (j = 0; j < n_elt_per_word; ++j) - { - rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1); - elt = convert_modes (word_mode, inner_mode, elt, true); - - if (j == 0) - word = elt; - else - { - word = expand_simple_binop (word_mode, ASHIFT, word, shift, - word, 1, OPTAB_LIB_WIDEN); - word = expand_simple_binop (word_mode, IOR, word, elt, - word, 1, OPTAB_LIB_WIDEN); - } - } - - words[i] = word; - } - - if (n_words == 1) - emit_move_insn (target, gen_lowpart (mode, words[0])); - else if (n_words == 2) - { - rtx tmp = gen_reg_rtx (mode); - emit_clobber (tmp); - emit_move_insn (gen_lowpart (word_mode, tmp), words[0]); - emit_move_insn (gen_highpart (word_mode, tmp), words[1]); - emit_move_insn (target, tmp); - } - else if (n_words == 4) - { - rtx tmp = gen_reg_rtx (V4SImode); - gcc_assert (word_mode == SImode); - vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words)); - ix86_expand_vector_init_general (false, V4SImode, tmp, vals); - emit_move_insn (target, gen_lowpart (mode, tmp)); - } - else - gcc_unreachable (); - } -} - -/* Initialize vector TARGET via VALS. Suppress the use of MMX - instructions unless MMX_OK is true. */ - -void -ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals) -{ - machine_mode mode = GET_MODE (target); - machine_mode inner_mode = GET_MODE_INNER (mode); - int n_elts = GET_MODE_NUNITS (mode); - int n_var = 0, one_var = -1; - bool all_same = true, all_const_zero = true; - int i; - rtx x; - - /* Handle first initialization from vector elts. */ - if (n_elts != XVECLEN (vals, 0)) - { - rtx subtarget = target; - x = XVECEXP (vals, 0, 0); - gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode); - if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts) - { - rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) }; - if (inner_mode == QImode || inner_mode == HImode) - { - unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode); - mode = mode_for_vector (SImode, n_bits / 4).require (); - inner_mode = mode_for_vector (SImode, n_bits / 8).require (); - ops[0] = gen_lowpart (inner_mode, ops[0]); - ops[1] = gen_lowpart (inner_mode, ops[1]); - subtarget = gen_reg_rtx (mode); - } - ix86_expand_vector_init_concat (mode, subtarget, ops, 2); - if (subtarget != target) - emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget)); - return; - } - gcc_unreachable (); - } - - for (i = 0; i < n_elts; ++i) - { - x = XVECEXP (vals, 0, i); - if (!(CONST_SCALAR_INT_P (x) - || CONST_DOUBLE_P (x) - || CONST_FIXED_P (x))) - n_var++, one_var = i; - else if (x != CONST0_RTX (inner_mode)) - all_const_zero = false; - if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0))) - all_same = false; - } - - /* Constants are best loaded from the constant pool. 
*/ - if (n_var == 0) - { - emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0))); - return; - } - - /* If all values are identical, broadcast the value. */ - if (all_same - && ix86_expand_vector_init_duplicate (mmx_ok, mode, target, - XVECEXP (vals, 0, 0))) - return; - - /* Values where only one field is non-constant are best loaded from - the pool and overwritten via move later. */ - if (n_var == 1) - { - if (all_const_zero - && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target, - XVECEXP (vals, 0, one_var), - one_var)) - return; - - if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var)) - return; - } - - ix86_expand_vector_init_general (mmx_ok, mode, target, vals); -} - -void -ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) -{ - machine_mode mode = GET_MODE (target); - machine_mode inner_mode = GET_MODE_INNER (mode); - machine_mode half_mode; - bool use_vec_merge = false; - rtx tmp; - static rtx (*gen_extract[6][2]) (rtx, rtx) - = { - { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi }, - { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi }, - { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si }, - { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di }, - { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf }, - { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df } - }; - static rtx (*gen_insert[6][2]) (rtx, rtx, rtx) - = { - { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi }, - { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi }, - { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si }, - { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di }, - { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf }, - { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df } - }; - int i, j, n; - machine_mode mmode = VOIDmode; - rtx (*gen_blendm) (rtx, rtx, rtx, rtx); - - switch (mode) - { - case E_V2SImode: - use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1; - if (use_vec_merge) - break; - /* FALLTHRU */ - - case E_V2SFmode: - if (mmx_ok) - { - tmp = gen_reg_rtx (GET_MODE_INNER (mode)); - ix86_expand_vector_extract (true, tmp, target, 1 - elt); - if (elt == 0) - tmp = gen_rtx_VEC_CONCAT (mode, val, tmp); - else - tmp = gen_rtx_VEC_CONCAT (mode, tmp, val); - emit_insn (gen_rtx_SET (target, tmp)); - return; - } - break; - - case E_V2DImode: - use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT; - if (use_vec_merge) - break; - - tmp = gen_reg_rtx (GET_MODE_INNER (mode)); - ix86_expand_vector_extract (false, tmp, target, 1 - elt); - if (elt == 0) - tmp = gen_rtx_VEC_CONCAT (mode, val, tmp); - else - tmp = gen_rtx_VEC_CONCAT (mode, tmp, val); - emit_insn (gen_rtx_SET (target, tmp)); - return; - - case E_V2DFmode: - /* NB: For ELT == 0, use standard scalar operation patterns which - preserve the rest of the vector for combiner: - - (vec_merge:V2DF - (vec_duplicate:V2DF (reg:DF)) - (reg:V2DF) - (const_int 1)) - */ - if (elt == 0) - goto do_vec_merge; - - { - rtx op0, op1; - - /* For the two element vectors, we implement a VEC_CONCAT with - the extraction of the other element. 
*/ - - tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt))); - tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp); - - if (elt == 0) - op0 = val, op1 = tmp; - else - op0 = tmp, op1 = val; - - tmp = gen_rtx_VEC_CONCAT (mode, op0, op1); - emit_insn (gen_rtx_SET (target, tmp)); - } - return; - - case E_V4SFmode: - use_vec_merge = TARGET_SSE4_1; - if (use_vec_merge) - break; - - switch (elt) - { - case 0: - use_vec_merge = true; - break; - - case 1: - /* tmp = target = A B C D */ - tmp = copy_to_reg (target); - /* target = A A B B */ - emit_insn (gen_vec_interleave_lowv4sf (target, target, target)); - /* target = X A B B */ - ix86_expand_vector_set (false, target, val, 0); - /* target = A X C D */ - emit_insn (gen_sse_shufps_v4sf (target, target, tmp, - const1_rtx, const0_rtx, - GEN_INT (2+4), GEN_INT (3+4))); - return; - - case 2: - /* tmp = target = A B C D */ - tmp = copy_to_reg (target); - /* tmp = X B C D */ - ix86_expand_vector_set (false, tmp, val, 0); - /* target = A B X D */ - emit_insn (gen_sse_shufps_v4sf (target, target, tmp, - const0_rtx, const1_rtx, - GEN_INT (0+4), GEN_INT (3+4))); - return; - - case 3: - /* tmp = target = A B C D */ - tmp = copy_to_reg (target); - /* tmp = X B C D */ - ix86_expand_vector_set (false, tmp, val, 0); - /* target = A B X D */ - emit_insn (gen_sse_shufps_v4sf (target, target, tmp, - const0_rtx, const1_rtx, - GEN_INT (2+4), GEN_INT (0+4))); - return; - - default: - gcc_unreachable (); - } - break; - - case E_V4SImode: - use_vec_merge = TARGET_SSE4_1; - if (use_vec_merge) - break; - - /* Element 0 handled by vec_merge below. */ - if (elt == 0) - { - use_vec_merge = true; - break; - } - - if (TARGET_SSE2) - { - /* With SSE2, use integer shuffles to swap element 0 and ELT, - store into element 0, then shuffle them back. */ - - rtx order[4]; - - order[0] = GEN_INT (elt); - order[1] = const1_rtx; - order[2] = const2_rtx; - order[3] = GEN_INT (3); - order[elt] = const0_rtx; - - emit_insn (gen_sse2_pshufd_1 (target, target, order[0], - order[1], order[2], order[3])); - - ix86_expand_vector_set (false, target, val, 0); - - emit_insn (gen_sse2_pshufd_1 (target, target, order[0], - order[1], order[2], order[3])); - } - else - { - /* For SSE1, we have to reuse the V4SF code. */ - rtx t = gen_reg_rtx (V4SFmode); - emit_move_insn (t, gen_lowpart (V4SFmode, target)); - ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt); - emit_move_insn (target, gen_lowpart (mode, t)); - } - return; - - case E_V8HImode: - use_vec_merge = TARGET_SSE2; - break; - case E_V4HImode: - use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A); - break; - - case E_V16QImode: - use_vec_merge = TARGET_SSE4_1; - break; - - case E_V8QImode: - use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1; - break; - - case E_V32QImode: - half_mode = V16QImode; - j = 0; - n = 16; - goto half; - - case E_V16HImode: - half_mode = V8HImode; - j = 1; - n = 8; - goto half; - - case E_V8SImode: - half_mode = V4SImode; - j = 2; - n = 4; - goto half; - - case E_V4DImode: - half_mode = V2DImode; - j = 3; - n = 2; - goto half; - - case E_V8SFmode: - half_mode = V4SFmode; - j = 4; - n = 4; - goto half; - - case E_V4DFmode: - half_mode = V2DFmode; - j = 5; - n = 2; - goto half; - -half: - /* Compute offset. */ - i = elt / n; - elt %= n; - - gcc_assert (i <= 1); - - /* Extract the half. */ - tmp = gen_reg_rtx (half_mode); - emit_insn (gen_extract[j][i] (tmp, target)); - - /* Put val in tmp at elt. */ - ix86_expand_vector_set (false, tmp, val, elt); - - /* Put it back. 
*/ - emit_insn (gen_insert[j][i] (target, target, tmp)); - return; - - case E_V8DFmode: - if (TARGET_AVX512F) - { - mmode = QImode; - gen_blendm = gen_avx512f_blendmv8df; - } - break; - - case E_V8DImode: - if (TARGET_AVX512F) - { - mmode = QImode; - gen_blendm = gen_avx512f_blendmv8di; - } - break; - - case E_V16SFmode: - if (TARGET_AVX512F) - { - mmode = HImode; - gen_blendm = gen_avx512f_blendmv16sf; - } - break; - - case E_V16SImode: - if (TARGET_AVX512F) - { - mmode = HImode; - gen_blendm = gen_avx512f_blendmv16si; - } - break; - - case E_V32HImode: - if (TARGET_AVX512BW) - { - mmode = SImode; - gen_blendm = gen_avx512bw_blendmv32hi; - } - else if (TARGET_AVX512F) - { - half_mode = E_V8HImode; - n = 8; - goto quarter; - } - break; - - case E_V64QImode: - if (TARGET_AVX512BW) - { - mmode = DImode; - gen_blendm = gen_avx512bw_blendmv64qi; - } - else if (TARGET_AVX512F) - { - half_mode = E_V16QImode; - n = 16; - goto quarter; - } - break; - -quarter: - /* Compute offset. */ - i = elt / n; - elt %= n; - - gcc_assert (i <= 3); - - { - /* Extract the quarter. */ - tmp = gen_reg_rtx (V4SImode); - rtx tmp2 = gen_lowpart (V16SImode, target); - rtx mask = gen_reg_rtx (QImode); - - emit_move_insn (mask, constm1_rtx); - emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i), - tmp, mask)); - - tmp2 = gen_reg_rtx (half_mode); - emit_move_insn (tmp2, gen_lowpart (half_mode, tmp)); - tmp = tmp2; - - /* Put val in tmp at elt. */ - ix86_expand_vector_set (false, tmp, val, elt); - - /* Put it back. */ - tmp2 = gen_reg_rtx (V16SImode); - rtx tmp3 = gen_lowpart (V16SImode, target); - mask = gen_reg_rtx (HImode); - emit_move_insn (mask, constm1_rtx); - tmp = gen_lowpart (V4SImode, tmp); - emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i), - tmp3, mask)); - emit_move_insn (target, gen_lowpart (mode, tmp2)); - } - return; - - default: - break; - } - - if (mmode != VOIDmode) - { - tmp = gen_reg_rtx (mode); - emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val))); - /* The avx512*_blendm<mode> expanders have different operand order - from VEC_MERGE. In VEC_MERGE, the first input operand is used for - elements where the mask is set and second input operand otherwise, - in {sse,avx}*_*blend* the first input operand is used for elements - where the mask is clear and second input operand otherwise. 
*/ - emit_insn (gen_blendm (target, target, tmp, - force_reg (mmode, - gen_int_mode (HOST_WIDE_INT_1U << elt, - mmode)))); - } - else if (use_vec_merge) - { -do_vec_merge: - tmp = gen_rtx_VEC_DUPLICATE (mode, val); - tmp = gen_rtx_VEC_MERGE (mode, tmp, target, - GEN_INT (HOST_WIDE_INT_1U << elt)); - emit_insn (gen_rtx_SET (target, tmp)); - } - else - { - rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode)); - - emit_move_insn (mem, target); - - tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode)); - emit_move_insn (tmp, val); - - emit_move_insn (target, mem); - } -} - -void -ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt) -{ - machine_mode mode = GET_MODE (vec); - machine_mode inner_mode = GET_MODE_INNER (mode); - bool use_vec_extr = false; - rtx tmp; - - switch (mode) - { - case E_V2SImode: - use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1; - if (use_vec_extr) - break; - /* FALLTHRU */ - - case E_V2SFmode: - if (!mmx_ok) - break; - /* FALLTHRU */ - - case E_V2DFmode: - case E_V2DImode: - case E_V2TImode: - case E_V4TImode: - use_vec_extr = true; - break; - - case E_V4SFmode: - use_vec_extr = TARGET_SSE4_1; - if (use_vec_extr) - break; - - switch (elt) - { - case 0: - tmp = vec; - break; - - case 1: - case 3: - tmp = gen_reg_rtx (mode); - emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec, - GEN_INT (elt), GEN_INT (elt), - GEN_INT (elt+4), GEN_INT (elt+4))); - break; - - case 2: - tmp = gen_reg_rtx (mode); - emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec)); - break; - - default: - gcc_unreachable (); - } - vec = tmp; - use_vec_extr = true; - elt = 0; - break; - - case E_V4SImode: - use_vec_extr = TARGET_SSE4_1; - if (use_vec_extr) - break; - - if (TARGET_SSE2) - { - switch (elt) - { - case 0: - tmp = vec; - break; - - case 1: - case 3: - tmp = gen_reg_rtx (mode); - emit_insn (gen_sse2_pshufd_1 (tmp, vec, - GEN_INT (elt), GEN_INT (elt), - GEN_INT (elt), GEN_INT (elt))); - break; - - case 2: - tmp = gen_reg_rtx (mode); - emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec)); - break; - - default: - gcc_unreachable (); - } - vec = tmp; - use_vec_extr = true; - elt = 0; - } - else - { - /* For SSE1, we have to reuse the V4SF code. 
*/ - ix86_expand_vector_extract (false, gen_lowpart (SFmode, target), - gen_lowpart (V4SFmode, vec), elt); - return; - } - break; - - case E_V8HImode: - use_vec_extr = TARGET_SSE2; - break; - case E_V4HImode: - use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A); - break; - - case E_V16QImode: - use_vec_extr = TARGET_SSE4_1; - if (!use_vec_extr - && TARGET_SSE2 - && elt == 0 - && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC)) - { - tmp = gen_reg_rtx (SImode); - ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec), - 0); - emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp))); - return; - } - break; - - case E_V8SFmode: - if (TARGET_AVX) - { - tmp = gen_reg_rtx (V4SFmode); - if (elt < 4) - emit_insn (gen_vec_extract_lo_v8sf (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v8sf (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 3); - return; - } - break; - - case E_V4DFmode: - if (TARGET_AVX) - { - tmp = gen_reg_rtx (V2DFmode); - if (elt < 2) - emit_insn (gen_vec_extract_lo_v4df (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v4df (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 1); - return; - } - break; - - case E_V32QImode: - if (TARGET_AVX) - { - tmp = gen_reg_rtx (V16QImode); - if (elt < 16) - emit_insn (gen_vec_extract_lo_v32qi (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v32qi (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 15); - return; - } - break; - - case E_V16HImode: - if (TARGET_AVX) - { - tmp = gen_reg_rtx (V8HImode); - if (elt < 8) - emit_insn (gen_vec_extract_lo_v16hi (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v16hi (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 7); - return; - } - break; - - case E_V8SImode: - if (TARGET_AVX) - { - tmp = gen_reg_rtx (V4SImode); - if (elt < 4) - emit_insn (gen_vec_extract_lo_v8si (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v8si (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 3); - return; - } - break; - - case E_V4DImode: - if (TARGET_AVX) - { - tmp = gen_reg_rtx (V2DImode); - if (elt < 2) - emit_insn (gen_vec_extract_lo_v4di (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v4di (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 1); - return; - } - break; - - case E_V32HImode: - if (TARGET_AVX512BW) - { - tmp = gen_reg_rtx (V16HImode); - if (elt < 16) - emit_insn (gen_vec_extract_lo_v32hi (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v32hi (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 15); - return; - } - break; - - case E_V64QImode: - if (TARGET_AVX512BW) - { - tmp = gen_reg_rtx (V32QImode); - if (elt < 32) - emit_insn (gen_vec_extract_lo_v64qi (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v64qi (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 31); - return; - } - break; - - case E_V16SFmode: - tmp = gen_reg_rtx (V8SFmode); - if (elt < 8) - emit_insn (gen_vec_extract_lo_v16sf (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v16sf (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 7); - return; - - case E_V8DFmode: - tmp = gen_reg_rtx (V4DFmode); - if (elt < 4) - emit_insn (gen_vec_extract_lo_v8df (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v8df (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 3); - return; - - case E_V16SImode: - tmp = gen_reg_rtx (V8SImode); - if (elt < 8) - emit_insn (gen_vec_extract_lo_v16si (tmp, vec)); - else - emit_insn 
(gen_vec_extract_hi_v16si (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 7); - return; - - case E_V8DImode: - tmp = gen_reg_rtx (V4DImode); - if (elt < 4) - emit_insn (gen_vec_extract_lo_v8di (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v8di (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 3); - return; - - case E_V8QImode: - use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1; - /* ??? Could extract the appropriate HImode element and shift. */ - break; - - default: - break; - } - - if (use_vec_extr) - { - tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt))); - tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp); - - /* Let the rtl optimizers know about the zero extension performed. */ - if (inner_mode == QImode || inner_mode == HImode) - { - tmp = gen_rtx_ZERO_EXTEND (SImode, tmp); - target = gen_lowpart (SImode, target); - } - - emit_insn (gen_rtx_SET (target, tmp)); - } - else - { - rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode)); - - emit_move_insn (mem, vec); - - tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode)); - emit_move_insn (target, tmp); - } -} - -/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC - to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode. - The upper bits of DEST are undefined, though they shouldn't cause - exceptions (some bits from src or all zeros are ok). */ - -static void -emit_reduc_half (rtx dest, rtx src, int i) -{ - rtx tem, d = dest; - switch (GET_MODE (src)) - { - case E_V4SFmode: - if (i == 128) - tem = gen_sse_movhlps (dest, src, src); - else - tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx, - GEN_INT (1 + 4), GEN_INT (1 + 4)); - break; - case E_V2DFmode: - tem = gen_vec_interleave_highv2df (dest, src, src); - break; - case E_V16QImode: - case E_V8HImode: - case E_V4SImode: - case E_V2DImode: - d = gen_reg_rtx (V1TImode); - tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src), - GEN_INT (i / 2)); - break; - case E_V8SFmode: - if (i == 256) - tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx); - else - tem = gen_avx_shufps256 (dest, src, src, - GEN_INT (i == 128 ? 2 + (3 << 2) : 1)); - break; - case E_V4DFmode: - if (i == 256) - tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx); - else - tem = gen_avx_shufpd256 (dest, src, src, const1_rtx); - break; - case E_V32QImode: - case E_V16HImode: - case E_V8SImode: - case E_V4DImode: - if (i == 256) - { - if (GET_MODE (dest) != V4DImode) - d = gen_reg_rtx (V4DImode); - tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src), - gen_lowpart (V4DImode, src), - const1_rtx); - } - else - { - d = gen_reg_rtx (V2TImode); - tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src), - GEN_INT (i / 2)); - } - break; - case E_V64QImode: - case E_V32HImode: - if (i < 64) - { - d = gen_reg_rtx (V4TImode); - tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src), - GEN_INT (i / 2)); - break; - } - /* FALLTHRU */ - case E_V16SImode: - case E_V16SFmode: - case E_V8DImode: - case E_V8DFmode: - if (i > 128) - tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest), - gen_lowpart (V16SImode, src), - gen_lowpart (V16SImode, src), - GEN_INT (0x4 + (i == 512 ? 4 : 0)), - GEN_INT (0x5 + (i == 512 ? 4 : 0)), - GEN_INT (0x6 + (i == 512 ? 4 : 0)), - GEN_INT (0x7 + (i == 512 ? 
4 : 0)), - GEN_INT (0xC), GEN_INT (0xD), - GEN_INT (0xE), GEN_INT (0xF), - GEN_INT (0x10), GEN_INT (0x11), - GEN_INT (0x12), GEN_INT (0x13), - GEN_INT (0x14), GEN_INT (0x15), - GEN_INT (0x16), GEN_INT (0x17)); - else - tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest), - gen_lowpart (V16SImode, src), - GEN_INT (i == 128 ? 0x2 : 0x1), - GEN_INT (0x3), - GEN_INT (0x3), - GEN_INT (0x3), - GEN_INT (i == 128 ? 0x6 : 0x5), - GEN_INT (0x7), - GEN_INT (0x7), - GEN_INT (0x7), - GEN_INT (i == 128 ? 0xA : 0x9), - GEN_INT (0xB), - GEN_INT (0xB), - GEN_INT (0xB), - GEN_INT (i == 128 ? 0xE : 0xD), - GEN_INT (0xF), - GEN_INT (0xF), - GEN_INT (0xF)); - break; - default: - gcc_unreachable (); - } - emit_insn (tem); - if (d != dest) - emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d)); -} - -/* Expand a vector reduction. FN is the binary pattern to reduce; - DEST is the destination; IN is the input vector. */ - -void -ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in) -{ - rtx half, dst, vec = in; - machine_mode mode = GET_MODE (in); - int i; - - /* SSE4 has a special instruction for V8HImode UMIN reduction. */ - if (TARGET_SSE4_1 - && mode == V8HImode - && fn == gen_uminv8hi3) - { - emit_insn (gen_sse4_1_phminposuw (dest, in)); - return; - } - - for (i = GET_MODE_BITSIZE (mode); - i > GET_MODE_UNIT_BITSIZE (mode); - i >>= 1) - { - half = gen_reg_rtx (mode); - emit_reduc_half (half, vec, i); - if (i == GET_MODE_UNIT_BITSIZE (mode) * 2) - dst = dest; - else - dst = gen_reg_rtx (mode); - emit_insn (fn (dst, half, vec)); - vec = dst; - } -} - -/* Output code to perform a conditional jump to LABEL, if C2 flag in - FP status register is set. */ - -void -ix86_emit_fp_unordered_jump (rtx label) -{ - rtx reg = gen_reg_rtx (HImode); - rtx_insn *insn; - rtx temp; - - emit_insn (gen_x86_fnstsw_1 (reg)); - - if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ())) - { - emit_insn (gen_x86_sahf_1 (reg)); - - temp = gen_rtx_REG (CCmode, FLAGS_REG); - temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx); - } - else - { - emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04))); - - temp = gen_rtx_REG (CCNOmode, FLAGS_REG); - temp = gen_rtx_NE (VOIDmode, temp, const0_rtx); - } - - temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp, - gen_rtx_LABEL_REF (VOIDmode, label), - pc_rtx); - insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp)); - predict_jump (REG_BR_PROB_BASE * 10 / 100); - JUMP_LABEL (insn) = label; -} - -/* Output code to perform an sinh XFmode calculation. 
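ix86_expand_reduc above reduces a vector by repeatedly combining it with a copy whose upper half has been moved into the lower half (emit_reduc_half), so the full reduction lands in element 0 after log2(n) steps. A scalar model of that loop, for illustration only (plain C, with addition standing in for FN):

    #include <stdio.h>

    int
    main (void)
    {
      float v[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
      for (int len = 8; len > 1; len >>= 1)     /* mirrors the i >>= 1 loop */
        for (int k = 0; k < len / 2; k++)
          v[k] = v[k] + v[k + len / 2];         /* fn (dst, half, vec) */
      printf ("%g\n", v[0]);                    /* 36 */
      return 0;
    }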
*/ - -void ix86_emit_i387_sinh (rtx op0, rtx op1) -{ - rtx e1 = gen_reg_rtx (XFmode); - rtx e2 = gen_reg_rtx (XFmode); - rtx scratch = gen_reg_rtx (HImode); - rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); - rtx half = const_double_from_real_value (dconsthalf, XFmode); - rtx cst1, tmp; - rtx_code_label *jump_label = gen_label_rtx (); - rtx_insn *insn; - - /* scratch = fxam (op1) */ - emit_insn (gen_fxamxf2_i387 (scratch, op1)); - - /* e1 = expm1 (|op1|) */ - emit_insn (gen_absxf2 (e2, op1)); - emit_insn (gen_expm1xf2 (e1, e2)); - - /* e2 = e1 / (e1 + 1.0) + e1 */ - cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); - emit_insn (gen_addxf3 (e2, e1, cst1)); - emit_insn (gen_divxf3 (e2, e1, e2)); - emit_insn (gen_addxf3 (e2, e2, e1)); - - /* flags = signbit (op1) */ - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); - - /* if (flags) then e2 = -e2 */ - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, - gen_rtx_EQ (VOIDmode, flags, const0_rtx), - gen_rtx_LABEL_REF (VOIDmode, jump_label), - pc_rtx); - insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - predict_jump (REG_BR_PROB_BASE * 50 / 100); - JUMP_LABEL (insn) = jump_label; - - emit_insn (gen_negxf2 (e2, e2)); - - emit_label (jump_label); - LABEL_NUSES (jump_label) = 1; - - /* op0 = 0.5 * e2 */ - half = force_reg (XFmode, half); - emit_insn (gen_mulxf3 (op0, e2, half)); -} - -/* Output code to perform an cosh XFmode calculation. */ - -void ix86_emit_i387_cosh (rtx op0, rtx op1) -{ - rtx e1 = gen_reg_rtx (XFmode); - rtx e2 = gen_reg_rtx (XFmode); - rtx half = const_double_from_real_value (dconsthalf, XFmode); - rtx cst1; - - /* e1 = exp (op1) */ - emit_insn (gen_expxf2 (e1, op1)); - - /* e2 = e1 + 1.0 / e1 */ - cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); - emit_insn (gen_divxf3 (e2, cst1, e1)); - emit_insn (gen_addxf3 (e2, e1, e2)); - - /* op0 = 0.5 * e2 */ - half = force_reg (XFmode, half); - emit_insn (gen_mulxf3 (op0, e2, half)); -} - -/* Output code to perform an tanh XFmode calculation. */ - -void ix86_emit_i387_tanh (rtx op0, rtx op1) -{ - rtx e1 = gen_reg_rtx (XFmode); - rtx e2 = gen_reg_rtx (XFmode); - rtx scratch = gen_reg_rtx (HImode); - rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); - rtx cst2, tmp; - rtx_code_label *jump_label = gen_label_rtx (); - rtx_insn *insn; - - /* scratch = fxam (op1) */ - emit_insn (gen_fxamxf2_i387 (scratch, op1)); - - /* e1 = expm1 (-|2 * op1|) */ - emit_insn (gen_addxf3 (e2, op1, op1)); - emit_insn (gen_absxf2 (e2, e2)); - emit_insn (gen_negxf2 (e2, e2)); - emit_insn (gen_expm1xf2 (e1, e2)); - - /* e2 = e1 / (e1 + 2.0) */ - cst2 = force_reg (XFmode, CONST2_RTX (XFmode)); - emit_insn (gen_addxf3 (e2, e1, cst2)); - emit_insn (gen_divxf3 (e2, e1, e2)); - - /* flags = signbit (op1) */ - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); - - /* if (!flags) then e2 = -e2 */ - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, - gen_rtx_NE (VOIDmode, flags, const0_rtx), - gen_rtx_LABEL_REF (VOIDmode, jump_label), - pc_rtx); - insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - predict_jump (REG_BR_PROB_BASE * 50 / 100); - JUMP_LABEL (insn) = jump_label; - - emit_insn (gen_negxf2 (e2, e2)); - - emit_label (jump_label); - LABEL_NUSES (jump_label) = 1; - - emit_move_insn (op0, e2); -} - -/* Output code to perform an asinh XFmode calculation. 
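The i387 sinh/cosh/tanh expanders above rewrite the hyperbolics in terms of expm1/exp to keep accuracy near zero. For sinh, the identity being used is sinh(x) = sign(x) * 0.5 * (t/(t+1) + t) with t = expm1(|x|); a stand-alone C check against the libm sinh (illustration only, not part of the removed file):

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      for (double x = -2.0; x <= 2.0; x += 0.5)
        {
          double t = expm1 (fabs (x));
          double r = copysign (0.5 * (t / (t + 1.0) + t), x);
          printf ("x=% .2f  identity=% .17g  sinh=% .17g\n", x, r, sinh (x));
        }
      return 0;
    }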
*/ - -void ix86_emit_i387_asinh (rtx op0, rtx op1) -{ - rtx e1 = gen_reg_rtx (XFmode); - rtx e2 = gen_reg_rtx (XFmode); - rtx scratch = gen_reg_rtx (HImode); - rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); - rtx cst1, tmp; - rtx_code_label *jump_label = gen_label_rtx (); - rtx_insn *insn; - - /* e2 = sqrt (op1^2 + 1.0) + 1.0 */ - emit_insn (gen_mulxf3 (e1, op1, op1)); - cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); - emit_insn (gen_addxf3 (e2, e1, cst1)); - emit_insn (gen_sqrtxf2 (e2, e2)); - emit_insn (gen_addxf3 (e2, e2, cst1)); - - /* e1 = e1 / e2 */ - emit_insn (gen_divxf3 (e1, e1, e2)); - - /* scratch = fxam (op1) */ - emit_insn (gen_fxamxf2_i387 (scratch, op1)); - - /* e1 = e1 + |op1| */ - emit_insn (gen_absxf2 (e2, op1)); - emit_insn (gen_addxf3 (e1, e1, e2)); - - /* e2 = log1p (e1) */ - ix86_emit_i387_log1p (e2, e1); - - /* flags = signbit (op1) */ - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); - - /* if (flags) then e2 = -e2 */ - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, - gen_rtx_EQ (VOIDmode, flags, const0_rtx), - gen_rtx_LABEL_REF (VOIDmode, jump_label), - pc_rtx); - insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - predict_jump (REG_BR_PROB_BASE * 50 / 100); - JUMP_LABEL (insn) = jump_label; - - emit_insn (gen_negxf2 (e2, e2)); - - emit_label (jump_label); - LABEL_NUSES (jump_label) = 1; - - emit_move_insn (op0, e2); -} - -/* Output code to perform an acosh XFmode calculation. */ - -void ix86_emit_i387_acosh (rtx op0, rtx op1) -{ - rtx e1 = gen_reg_rtx (XFmode); - rtx e2 = gen_reg_rtx (XFmode); - rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); - - /* e2 = sqrt (op1 + 1.0) */ - emit_insn (gen_addxf3 (e2, op1, cst1)); - emit_insn (gen_sqrtxf2 (e2, e2)); - - /* e1 = sqrt (op1 - 1.0) */ - emit_insn (gen_subxf3 (e1, op1, cst1)); - emit_insn (gen_sqrtxf2 (e1, e1)); - - /* e1 = e1 * e2 */ - emit_insn (gen_mulxf3 (e1, e1, e2)); - - /* e1 = e1 + op1 */ - emit_insn (gen_addxf3 (e1, e1, op1)); - - /* op0 = log (e1) */ - emit_insn (gen_logxf2 (op0, e1)); -} - -/* Output code to perform an atanh XFmode calculation. */ - -void ix86_emit_i387_atanh (rtx op0, rtx op1) -{ - rtx e1 = gen_reg_rtx (XFmode); - rtx e2 = gen_reg_rtx (XFmode); - rtx scratch = gen_reg_rtx (HImode); - rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); - rtx half = const_double_from_real_value (dconsthalf, XFmode); - rtx cst1, tmp; - rtx_code_label *jump_label = gen_label_rtx (); - rtx_insn *insn; - - /* scratch = fxam (op1) */ - emit_insn (gen_fxamxf2_i387 (scratch, op1)); - - /* e2 = |op1| */ - emit_insn (gen_absxf2 (e2, op1)); - - /* e1 = -(e2 + e2) / (e2 + 1.0) */ - cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); - emit_insn (gen_addxf3 (e1, e2, cst1)); - emit_insn (gen_addxf3 (e2, e2, e2)); - emit_insn (gen_negxf2 (e2, e2)); - emit_insn (gen_divxf3 (e1, e2, e1)); - - /* e2 = log1p (e1) */ - ix86_emit_i387_log1p (e2, e1); - - /* flags = signbit (op1) */ - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); - - /* if (!flags) then e2 = -e2 */ - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, - gen_rtx_NE (VOIDmode, flags, const0_rtx), - gen_rtx_LABEL_REF (VOIDmode, jump_label), - pc_rtx); - insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - predict_jump (REG_BR_PROB_BASE * 50 / 100); - JUMP_LABEL (insn) = jump_label; - - emit_insn (gen_negxf2 (e2, e2)); - - emit_label (jump_label); - LABEL_NUSES (jump_label) = 1; - - /* op0 = 0.5 * e2 */ - half = force_reg (XFmode, half); - emit_insn (gen_mulxf3 (op0, e2, half)); -} - -/* Output code to perform a log1p XFmode calculation. 
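The asinh expander above relies on asinh(x) = sign(x) * log1p(|x| + x*x / (sqrt(x*x + 1) + 1)), which avoids the cancellation in log(|x| + sqrt(x*x + 1)) for small |x|. A stand-alone check (illustration only, not part of the removed file):

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      double xs[] = { -3.0, -1e-9, 0.25, 2.0 };
      for (int i = 0; i < 4; i++)
        {
          double x = xs[i], ax = fabs (x);
          double r = copysign (log1p (ax + x * x / (sqrt (x * x + 1.0) + 1.0)), x);
          printf ("x=% g  identity=% .17g  asinh=% .17g\n", x, r, asinh (x));
        }
      return 0;
    }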
*/ - -void ix86_emit_i387_log1p (rtx op0, rtx op1) -{ - rtx_code_label *label1 = gen_label_rtx (); - rtx_code_label *label2 = gen_label_rtx (); - - rtx tmp = gen_reg_rtx (XFmode); - rtx res = gen_reg_rtx (XFmode); - rtx cst, cstln2, cst1; - rtx_insn *insn; - - cst = const_double_from_real_value - (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode); - cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */ - - emit_insn (gen_absxf2 (tmp, op1)); - - cst = force_reg (XFmode, cst); - ix86_expand_branch (GE, tmp, cst, label1); - predict_jump (REG_BR_PROB_BASE * 10 / 100); - insn = get_last_insn (); - JUMP_LABEL (insn) = label1; - - emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2)); - emit_jump (label2); - - emit_label (label1); - LABEL_NUSES (label1) = 1; - - cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); - emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1))); - emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2)); - - emit_label (label2); - LABEL_NUSES (label2) = 1; - - emit_move_insn (op0, res); -} - -/* Emit code for round calculation. */ -void ix86_emit_i387_round (rtx op0, rtx op1) -{ - machine_mode inmode = GET_MODE (op1); - machine_mode outmode = GET_MODE (op0); - rtx e1 = gen_reg_rtx (XFmode); - rtx e2 = gen_reg_rtx (XFmode); - rtx scratch = gen_reg_rtx (HImode); - rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); - rtx half = const_double_from_real_value (dconsthalf, XFmode); - rtx res = gen_reg_rtx (outmode); - rtx_code_label *jump_label = gen_label_rtx (); - rtx (*floor_insn) (rtx, rtx); - rtx (*neg_insn) (rtx, rtx); - rtx_insn *insn; - rtx tmp; - - switch (inmode) - { - case E_SFmode: - case E_DFmode: - tmp = gen_reg_rtx (XFmode); - - emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1))); - op1 = tmp; - break; - case E_XFmode: - break; - default: - gcc_unreachable (); - } - - switch (outmode) - { - case E_SFmode: - floor_insn = gen_frndintxf2_floor; - neg_insn = gen_negsf2; - break; - case E_DFmode: - floor_insn = gen_frndintxf2_floor; - neg_insn = gen_negdf2; - break; - case E_XFmode: - floor_insn = gen_frndintxf2_floor; - neg_insn = gen_negxf2; - break; - case E_HImode: - floor_insn = gen_lfloorxfhi2; - neg_insn = gen_neghi2; - break; - case E_SImode: - floor_insn = gen_lfloorxfsi2; - neg_insn = gen_negsi2; - break; - case E_DImode: - floor_insn = gen_lfloorxfdi2; - neg_insn = gen_negdi2; - break; - default: - gcc_unreachable (); - } - - /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */ - - /* scratch = fxam(op1) */ - emit_insn (gen_fxamxf2_i387 (scratch, op1)); - - /* e1 = fabs(op1) */ - emit_insn (gen_absxf2 (e1, op1)); - - /* e2 = e1 + 0.5 */ - half = force_reg (XFmode, half); - emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half))); - - /* res = floor(e2) */ - switch (outmode) - { - case E_SFmode: - case E_DFmode: - { - tmp = gen_reg_rtx (XFmode); - - emit_insn (floor_insn (tmp, e2)); - emit_insn (gen_rtx_SET (res, - gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp), - UNSPEC_TRUNC_NOOP))); - } - break; - default: - emit_insn (floor_insn (res, e2)); - } - - /* flags = signbit(a) */ - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); - - /* if (flags) then res = -res */ - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, - gen_rtx_EQ (VOIDmode, flags, const0_rtx), - gen_rtx_LABEL_REF (VOIDmode, jump_label), - pc_rtx); - insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - predict_jump (REG_BR_PROB_BASE * 50 / 100); - JUMP_LABEL (insn) = jump_label; - - emit_insn (neg_insn (res, res)); - - emit_label (jump_label); - 
LABEL_NUSES (jump_label) = 1; - - emit_move_insn (op0, res); -} - -/* Output code to perform a Newton-Rhapson approximation of a single precision - floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */ - -void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode) -{ - rtx x0, x1, e0, e1; - - x0 = gen_reg_rtx (mode); - e0 = gen_reg_rtx (mode); - e1 = gen_reg_rtx (mode); - x1 = gen_reg_rtx (mode); - - /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */ - - b = force_reg (mode, b); - - /* x0 = rcp(b) estimate */ - if (mode == V16SFmode || mode == V8DFmode) - { - if (TARGET_AVX512ER) - { - emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), - UNSPEC_RCP28))); - /* res = a * x0 */ - emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0))); - return; - } - else - emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), - UNSPEC_RCP14))); - } - else - emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), - UNSPEC_RCP))); - - /* e0 = x0 * b */ - emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b))); - - /* e0 = x0 * e0 */ - emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0))); - - /* e1 = x0 + x0 */ - emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0))); - - /* x1 = e1 - e0 */ - emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0))); - - /* res = a * x1 */ - emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1))); -} - -/* Output code to perform a Newton-Rhapson approximation of a - single precision floating point [reciprocal] square root. */ - -void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip) -{ - rtx x0, e0, e1, e2, e3, mthree, mhalf; - REAL_VALUE_TYPE r; - int unspec; - - x0 = gen_reg_rtx (mode); - e0 = gen_reg_rtx (mode); - e1 = gen_reg_rtx (mode); - e2 = gen_reg_rtx (mode); - e3 = gen_reg_rtx (mode); - - if (TARGET_AVX512ER && mode == V16SFmode) - { - if (recip) - /* res = rsqrt28(a) estimate */ - emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), - UNSPEC_RSQRT28))); - else - { - /* x0 = rsqrt28(a) estimate */ - emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), - UNSPEC_RSQRT28))); - /* res = rcp28(x0) estimate */ - emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0), - UNSPEC_RCP28))); - } - return; - } - - real_from_integer (&r, VOIDmode, -3, SIGNED); - mthree = const_double_from_real_value (r, SFmode); - - real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL); - mhalf = const_double_from_real_value (r, SFmode); - unspec = UNSPEC_RSQRT; - - if (VECTOR_MODE_P (mode)) - { - mthree = ix86_build_const_vector (mode, true, mthree); - mhalf = ix86_build_const_vector (mode, true, mhalf); - /* There is no 512-bit rsqrt. There is however rsqrt14. */ - if (GET_MODE_SIZE (mode) == 64) - unspec = UNSPEC_RSQRT14; - } - - /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) - rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */ - - a = force_reg (mode, a); - - /* x0 = rsqrt(a) estimate */ - emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), - unspec))); - - /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */ - if (!recip) - { - rtx zero = force_reg (mode, CONST0_RTX(mode)); - rtx mask; - - /* Handle masked compare. */ - if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64) - { - mask = gen_reg_rtx (HImode); - /* Imm value 0x4 corresponds to not-equal comparison. 
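The software divide above refines the hardware reciprocal estimate with one Newton-Raphson step, x1 = x0 + x0 - b*x0*x0 = x0*(2 - b*x0), which roughly doubles the number of correct bits before the final a*x1 multiply. A tiny numeric illustration (plain C, with a deliberately crude starting estimate standing in for the RCPSS result):

    #include <stdio.h>

    int
    main (void)
    {
      float a = 10.0f, b = 3.0f;
      float x0 = 0.3f;                      /* crude stand-in for rcp (b) */
      float x1 = x0 + x0 - b * x0 * x0;     /* x1 = x0 * (2 - b*x0) */
      printf ("a/b  = %.9g\n", a / b);
      printf ("a*x0 = %.9g\n", a * x0);     /* off by about 10%% */
      printf ("a*x1 = %.9g\n", a * x1);     /* noticeably closer to a/b */
      return 0;
    }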
*/ - emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4))); - emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask)); - } - else - { - mask = gen_reg_rtx (mode); - emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a))); - emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask))); - } - } - - /* e0 = x0 * a */ - emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a))); - /* e1 = e0 * x0 */ - emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0))); - - /* e2 = e1 - 3. */ - mthree = force_reg (mode, mthree); - emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree))); - - mhalf = force_reg (mode, mhalf); - if (recip) - /* e3 = -.5 * x0 */ - emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf))); - else - /* e3 = -.5 * e0 */ - emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf))); - /* ret = e2 * e3 */ - emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3))); -} - -/* Expand fabs (OP0) and return a new rtx that holds the result. The - mask for masking out the sign-bit is stored in *SMASK, if that is - non-null. */ - -static rtx -ix86_expand_sse_fabs (rtx op0, rtx *smask) -{ - machine_mode vmode, mode = GET_MODE (op0); - rtx xa, mask; - - xa = gen_reg_rtx (mode); - if (mode == SFmode) - vmode = V4SFmode; - else if (mode == DFmode) - vmode = V2DFmode; - else - vmode = mode; - mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true); - if (!VECTOR_MODE_P (mode)) - { - /* We need to generate a scalar mode mask in this case. */ - rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx)); - tmp = gen_rtx_VEC_SELECT (mode, mask, tmp); - mask = gen_reg_rtx (mode); - emit_insn (gen_rtx_SET (mask, tmp)); - } - emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask))); - - if (smask) - *smask = mask; - - return xa; -} - -/* Expands a comparison of OP0 with OP1 using comparison code CODE, - swapping the operands if SWAP_OPERANDS is true. The expanded - code is a forward jump to a newly created label in case the - comparison is true. The generated label rtx is returned. */ -static rtx_code_label * -ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1, - bool swap_operands) -{ - bool unordered_compare = ix86_unordered_fp_compare (code); - rtx_code_label *label; - rtx tmp, reg; - - if (swap_operands) - std::swap (op0, op1); - - label = gen_label_rtx (); - tmp = gen_rtx_COMPARE (CCFPmode, op0, op1); - if (unordered_compare) - tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP); - reg = gen_rtx_REG (CCFPmode, FLAGS_REG); - emit_insn (gen_rtx_SET (reg, tmp)); - tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx); - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, - gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx); - tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - JUMP_LABEL (tmp) = label; - - return label; -} - -/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1 - using comparison code CODE. Operands are swapped for the comparison if - SWAP_OPERANDS is true. Returns a rtx for the generated mask. */ -static rtx -ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1, - bool swap_operands) -{ - rtx (*insn)(rtx, rtx, rtx, rtx); - machine_mode mode = GET_MODE (op0); - rtx mask = gen_reg_rtx (mode); - - if (swap_operands) - std::swap (op0, op1); - - insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse; - - emit_insn (insn (mask, op0, op1, - gen_rtx_fmt_ee (code, mode, op0, op1))); - return mask; -} - -/* Expand copysign from SIGN to the positive value ABS_VALUE - storing in RESULT. 
If MASK is non-null, it shall be a mask to mask out - the sign-bit. */ - -static void -ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask) -{ - machine_mode mode = GET_MODE (sign); - rtx sgn = gen_reg_rtx (mode); - if (mask == NULL_RTX) - { - machine_mode vmode; - - if (mode == SFmode) - vmode = V4SFmode; - else if (mode == DFmode) - vmode = V2DFmode; - else - vmode = mode; - - mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false); - if (!VECTOR_MODE_P (mode)) - { - /* We need to generate a scalar mode mask in this case. */ - rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx)); - tmp = gen_rtx_VEC_SELECT (mode, mask, tmp); - mask = gen_reg_rtx (mode); - emit_insn (gen_rtx_SET (mask, tmp)); - } - } - else - mask = gen_rtx_NOT (mode, mask); - emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign))); - emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn))); -} - -/* Expand SSE sequence for computing lround from OP1 storing - into OP0. */ - -void -ix86_expand_lround (rtx op0, rtx op1) -{ - /* C code for the stuff we're doing below: - tmp = op1 + copysign (nextafter (0.5, 0.0), op1) - return (long)tmp; - */ - machine_mode mode = GET_MODE (op1); - const struct real_format *fmt; - REAL_VALUE_TYPE pred_half, half_minus_pred_half; - rtx adj; - - /* load nextafter (0.5, 0.0) */ - fmt = REAL_MODE_FORMAT (mode); - real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode); - real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half); - - /* adj = copysign (0.5, op1) */ - adj = force_reg (mode, const_double_from_real_value (pred_half, mode)); - ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX); - - /* adj = op1 + adj */ - adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT); - - /* op0 = (imode)adj */ - expand_fix (op0, adj, 0); -} - -/* Expand SSE2 sequence for computing lround from OPERAND1 storing - into OPERAND0. */ - -void -ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor) -{ - /* C code for the stuff we're doing below (for do_floor): - xi = (long)op1; - xi -= (double)xi > op1 ? 1 : 0; - return xi; - */ - machine_mode fmode = GET_MODE (op1); - machine_mode imode = GET_MODE (op0); - rtx ireg, freg, tmp; - rtx_code_label *label; - - /* reg = (long)op1 */ - ireg = gen_reg_rtx (imode); - expand_fix (ireg, op1, 0); - - /* freg = (double)reg */ - freg = gen_reg_rtx (fmode); - expand_float (freg, ireg, 0); - - /* ireg = (freg > op1) ? ireg - 1 : ireg */ - label = ix86_expand_sse_compare_and_jump (UNLE, - freg, op1, !do_floor); - tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS, - ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT); - emit_move_insn (ireg, tmp); - - emit_label (label); - LABEL_NUSES (label) = 1; - - emit_move_insn (op0, ireg); -} - -/* Generate and return a rtx of mode MODE for 2**n where n is the number - of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */ - -static rtx -ix86_gen_TWO52 (machine_mode mode) -{ - REAL_VALUE_TYPE TWO52r; - rtx TWO52; - - real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23); - TWO52 = const_double_from_real_value (TWO52r, mode); - TWO52 = force_reg (mode, TWO52); - - return TWO52; -} - -/* Expand rint rounding OPERAND1 and storing the result in OPERAND0. 
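ix86_expand_lround above implements round-half-away-from-zero as a truncating integer conversion of op1 + copysign (nextafter (0.5, 0.0), op1). The same computation in plain C, checked against the libm lround (lround_sketch is a hypothetical name, for illustration only):

    #include <math.h>
    #include <stdio.h>

    static long
    lround_sketch (double x)
    {
      double adj = copysign (nextafter (0.5, 0.0), x);
      return (long) (x + adj);   /* the C cast truncates, like cvttsd2si */
    }

    int
    main (void)
    {
      double xs[] = { 2.5, -2.5, 1.3, -7.5 };
      for (int i = 0; i < 4; i++)
        printf ("%g -> %ld (lround: %ld)\n",
                xs[i], lround_sketch (xs[i]), lround (xs[i]));
      return 0;
    }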
*/ - -void -ix86_expand_rint (rtx operand0, rtx operand1) -{ - /* C code for the stuff we're doing below: - xa = fabs (operand1); - if (!isless (xa, 2**52)) - return operand1; - two52 = 2**52; - if (flag_rounding_math) - { - two52 = copysign (two52, operand1); - xa = operand1; - } - xa = xa + two52 - two52; - return copysign (xa, operand1); - */ - machine_mode mode = GET_MODE (operand0); - rtx res, xa, TWO52, two52, mask; - rtx_code_label *label; - - res = gen_reg_rtx (mode); - emit_move_insn (res, operand1); - - /* xa = abs (operand1) */ - xa = ix86_expand_sse_fabs (res, &mask); - - /* if (!isless (xa, TWO52)) goto label; */ - TWO52 = ix86_gen_TWO52 (mode); - label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); - - two52 = TWO52; - if (flag_rounding_math) - { - two52 = gen_reg_rtx (mode); - ix86_sse_copysign_to_positive (two52, TWO52, res, mask); - xa = res; - } - - xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT); - xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT); - - ix86_sse_copysign_to_positive (res, xa, res, mask); - - emit_label (label); - LABEL_NUSES (label) = 1; - - emit_move_insn (operand0, res); -} - -/* Expand SSE2 sequence for computing floor or ceil - from OPERAND1 storing into OPERAND0. */ -void -ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor) -{ - /* C code for the stuff we expand below. - double xa = fabs (x), x2; - if (!isless (xa, TWO52)) - return x; - x2 = (double)(long)x; - Compensate. Floor: - if (x2 > x) - x2 -= 1; - Compensate. Ceil: - if (x2 < x) - x2 += 1; - if (HONOR_SIGNED_ZEROS (mode)) - return copysign (x2, x); - return x2; - */ - machine_mode mode = GET_MODE (operand0); - rtx xa, xi, TWO52, tmp, one, res, mask; - rtx_code_label *label; - - TWO52 = ix86_gen_TWO52 (mode); - - /* Temporary for holding the result, initialized to the input - operand to ease control flow. */ - res = gen_reg_rtx (mode); - emit_move_insn (res, operand1); - - /* xa = abs (operand1) */ - xa = ix86_expand_sse_fabs (res, &mask); - - /* if (!isless (xa, TWO52)) goto label; */ - label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); - - /* xa = (double)(long)x */ - xi = gen_reg_rtx (mode == DFmode ? DImode : SImode); - expand_fix (xi, res, 0); - expand_float (xa, xi, 0); - - /* generate 1.0 */ - one = force_reg (mode, const_double_from_real_value (dconst1, mode)); - - /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */ - tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor); - emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp))); - tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS, - xa, tmp, NULL_RTX, 0, OPTAB_DIRECT); - emit_move_insn (res, tmp); - - if (HONOR_SIGNED_ZEROS (mode)) - ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask); - - emit_label (label); - LABEL_NUSES (label) = 1; - - emit_move_insn (operand0, res); -} - -/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing - into OPERAND0 without relying on DImode truncation via cvttsd2siq - that is only available on 64bit targets. */ -void -ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor) -{ - /* C code for the stuff we expand below. - double xa = fabs (x), x2; - if (!isless (xa, TWO52)) - return x; - xa = xa + TWO52 - TWO52; - x2 = copysign (xa, x); - Compensate. Floor: - if (x2 > x) - x2 -= 1; - Compensate. 
Ceil: - if (x2 < x) - x2 += 1; - if (HONOR_SIGNED_ZEROS (mode)) - x2 = copysign (x2, x); - return x2; - */ - machine_mode mode = GET_MODE (operand0); - rtx xa, TWO52, tmp, one, res, mask; - rtx_code_label *label; - - TWO52 = ix86_gen_TWO52 (mode); - - /* Temporary for holding the result, initialized to the input - operand to ease control flow. */ - res = gen_reg_rtx (mode); - emit_move_insn (res, operand1); - - /* xa = abs (operand1) */ - xa = ix86_expand_sse_fabs (res, &mask); - - /* if (!isless (xa, TWO52)) goto label; */ - label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); - - /* xa = xa + TWO52 - TWO52; */ - xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); - xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT); - - /* xa = copysign (xa, operand1) */ - ix86_sse_copysign_to_positive (xa, xa, res, mask); - - /* generate 1.0 */ - one = force_reg (mode, const_double_from_real_value (dconst1, mode)); - - /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */ - tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor); - emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp))); - tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS, - xa, tmp, NULL_RTX, 0, OPTAB_DIRECT); - if (!do_floor && HONOR_SIGNED_ZEROS (mode)) - ix86_sse_copysign_to_positive (tmp, tmp, res, mask); - emit_move_insn (res, tmp); - - emit_label (label); - LABEL_NUSES (label) = 1; - - emit_move_insn (operand0, res); -} - -/* Expand SSE sequence for computing trunc - from OPERAND1 storing into OPERAND0. */ -void -ix86_expand_trunc (rtx operand0, rtx operand1) -{ - /* C code for SSE variant we expand below. - double xa = fabs (x), x2; - if (!isless (xa, TWO52)) - return x; - x2 = (double)(long)x; - if (HONOR_SIGNED_ZEROS (mode)) - return copysign (x2, x); - return x2; - */ - machine_mode mode = GET_MODE (operand0); - rtx xa, xi, TWO52, res, mask; - rtx_code_label *label; - - TWO52 = ix86_gen_TWO52 (mode); - - /* Temporary for holding the result, initialized to the input - operand to ease control flow. */ - res = gen_reg_rtx (mode); - emit_move_insn (res, operand1); - - /* xa = abs (operand1) */ - xa = ix86_expand_sse_fabs (res, &mask); - - /* if (!isless (xa, TWO52)) goto label; */ - label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); - - /* x = (double)(long)x */ - xi = gen_reg_rtx (mode == DFmode ? DImode : SImode); - expand_fix (xi, res, 0); - expand_float (res, xi, 0); - - if (HONOR_SIGNED_ZEROS (mode)) - ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask); - - emit_label (label); - LABEL_NUSES (label) = 1; - - emit_move_insn (operand0, res); -} - -/* Expand SSE sequence for computing trunc from OPERAND1 storing - into OPERAND0 without relying on DImode truncation via cvttsd2siq - that is only available on 64bit targets. */ -void -ix86_expand_truncdf_32 (rtx operand0, rtx operand1) -{ - machine_mode mode = GET_MODE (operand0); - rtx xa, mask, TWO52, one, res, smask, tmp; - rtx_code_label *label; - - /* C code for SSE variant we expand below. - double xa = fabs (x), x2; - if (!isless (xa, TWO52)) - return x; - xa2 = xa + TWO52 - TWO52; - Compensate: - if (xa2 > xa) - xa2 -= 1.0; - x2 = copysign (xa2, x); - return x2; - */ - - TWO52 = ix86_gen_TWO52 (mode); - - /* Temporary for holding the result, initialized to the input - operand to ease control flow. 
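The expanders above (rint, floor/ceil, trunc) all use the same device: for 0 <= xa < 2^52, computing xa + 2^52 - 2^52 in double precision forces the addition to discard the fraction bits, so the round-to-nearest rounding of the add itself produces the nearest integer, which the subtraction then recovers exactly; the compensation steps afterwards fix up the direction (floor subtracts 1 when the rounded value overshoots the input, ceil adds 1 when it undershoots). A plain-C illustration (assumes the arithmetic is done in double precision, as in the SSE2 sequences above):

    #include <stdio.h>

    int
    main (void)
    {
      volatile double two52 = 4503599627370496.0;   /* 2**52 */
      double xs[] = { 2.3, 2.5, 3.5, 7.9 };
      for (int i = 0; i < 4; i++)
        {
          volatile double t = xs[i] + two52;        /* fraction bits drop here */
          printf ("%g -> %g\n", xs[i], t - two52);  /* 2, 2, 4, 8 (ties to even) */
        }
      return 0;
    }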
*/ - res = gen_reg_rtx (mode); - emit_move_insn (res, operand1); - - /* xa = abs (operand1) */ - xa = ix86_expand_sse_fabs (res, &smask); - - /* if (!isless (xa, TWO52)) goto label; */ - label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); - - /* res = xa + TWO52 - TWO52; */ - tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); - tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT); - emit_move_insn (res, tmp); - - /* generate 1.0 */ - one = force_reg (mode, const_double_from_real_value (dconst1, mode)); - - /* Compensate: res = xa2 - (res > xa ? 1 : 0) */ - mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false); - emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one))); - tmp = expand_simple_binop (mode, MINUS, - res, mask, NULL_RTX, 0, OPTAB_DIRECT); - emit_move_insn (res, tmp); - - /* res = copysign (res, operand1) */ - ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask); - - emit_label (label); - LABEL_NUSES (label) = 1; - - emit_move_insn (operand0, res); -} - -/* Expand SSE sequence for computing round - from OPERAND1 storing into OPERAND0. */ -void -ix86_expand_round (rtx operand0, rtx operand1) -{ - /* C code for the stuff we're doing below: - double xa = fabs (x); - if (!isless (xa, TWO52)) - return x; - xa = (double)(long)(xa + nextafter (0.5, 0.0)); - return copysign (xa, x); - */ - machine_mode mode = GET_MODE (operand0); - rtx res, TWO52, xa, xi, half, mask; - rtx_code_label *label; - const struct real_format *fmt; - REAL_VALUE_TYPE pred_half, half_minus_pred_half; - - /* Temporary for holding the result, initialized to the input - operand to ease control flow. */ - res = gen_reg_rtx (mode); - emit_move_insn (res, operand1); - - TWO52 = ix86_gen_TWO52 (mode); - xa = ix86_expand_sse_fabs (res, &mask); - label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); - - /* load nextafter (0.5, 0.0) */ - fmt = REAL_MODE_FORMAT (mode); - real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode); - real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half); - - /* xa = xa + 0.5 */ - half = force_reg (mode, const_double_from_real_value (pred_half, mode)); - xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT); - - /* xa = (double)(int64_t)xa */ - xi = gen_reg_rtx (mode == DFmode ? DImode : SImode); - expand_fix (xi, xa, 0); - expand_float (xa, xi, 0); - - /* res = copysign (xa, operand1) */ - ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask); - - emit_label (label); - LABEL_NUSES (label) = 1; - - emit_move_insn (operand0, res); -} - -/* Expand SSE sequence for computing round from OPERAND1 storing - into OPERAND0 without relying on DImode truncation via cvttsd2siq - that is only available on 64bit targets. */ -void -ix86_expand_rounddf_32 (rtx operand0, rtx operand1) -{ - /* C code for the stuff we expand below. - double xa = fabs (x), xa2, x2; - if (!isless (xa, TWO52)) - return x; - Using the absolute value and copying back sign makes - -0.0 -> -0.0 correct. - xa2 = xa + TWO52 - TWO52; - Compensate. - dxa = xa2 - xa; - if (dxa <= -0.5) - xa2 += 1; - else if (dxa > 0.5) - xa2 -= 1; - x2 = copysign (xa2, x); - return x2; - */ - machine_mode mode = GET_MODE (operand0); - rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask; - rtx_code_label *label; - - TWO52 = ix86_gen_TWO52 (mode); - - /* Temporary for holding the result, initialized to the input - operand to ease control flow. 
*/ - res = gen_reg_rtx (mode); - emit_move_insn (res, operand1); - - /* xa = abs (operand1) */ - xa = ix86_expand_sse_fabs (res, &mask); - - /* if (!isless (xa, TWO52)) goto label; */ - label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); - - /* xa2 = xa + TWO52 - TWO52; */ - xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); - xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT); - - /* dxa = xa2 - xa; */ - dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT); - - /* generate 0.5, 1.0 and -0.5 */ - half = force_reg (mode, const_double_from_real_value (dconsthalf, mode)); - one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT); - mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX, - 0, OPTAB_DIRECT); - - /* Compensate. */ - /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */ - tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false); - emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one))); - xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT); - /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */ - tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false); - emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one))); - xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT); - - /* res = copysign (xa2, operand1) */ - ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask); - - emit_label (label); - LABEL_NUSES (label) = 1; - - emit_move_insn (operand0, res); -} - -/* Expand SSE sequence for computing round - from OP1 storing into OP0 using sse4 round insn. */ -void -ix86_expand_round_sse4 (rtx op0, rtx op1) -{ - machine_mode mode = GET_MODE (op0); - rtx e1, e2, res, half; - const struct real_format *fmt; - REAL_VALUE_TYPE pred_half, half_minus_pred_half; - rtx (*gen_copysign) (rtx, rtx, rtx); - rtx (*gen_round) (rtx, rtx, rtx); - - switch (mode) - { - case E_SFmode: - gen_copysign = gen_copysignsf3; - gen_round = gen_sse4_1_roundsf2; - break; - case E_DFmode: - gen_copysign = gen_copysigndf3; - gen_round = gen_sse4_1_rounddf2; - break; - default: - gcc_unreachable (); - } - - /* round (a) = trunc (a + copysign (0.5, a)) */ - - /* load nextafter (0.5, 0.0) */ - fmt = REAL_MODE_FORMAT (mode); - real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode); - real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half); - half = const_double_from_real_value (pred_half, mode); - - /* e1 = copysign (0.5, op1) */ - e1 = gen_reg_rtx (mode); - emit_insn (gen_copysign (e1, half, op1)); - - /* e2 = op1 + e1 */ - e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT); - - /* res = trunc (e2) */ - res = gen_reg_rtx (mode); - emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC))); - - emit_move_insn (op0, res); -} - -/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel []))) - insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh - insn every time. */ - -static GTY(()) rtx_insn *vselect_insn; - -/* Initialize vselect_insn. 
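Both round expanders above load nextafter (0.5, 0.0) rather than 0.5 before truncating. The likely reason (an inference, not stated in the code): for the largest double below 0.5, adding a full 0.5 already rounds the sum up to 1.0 inside the addition, so truncation would return 1 instead of 0; with the slightly smaller constant the sum stays below 1.0. Illustration only:

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      double x = nextafter (0.5, 0.0);          /* largest double below 0.5 */
      double pred_half = nextafter (0.5, 0.0);  /* the constant loaded above */
      printf ("trunc (x + 0.5)       = %g\n", trunc (x + 0.5));        /* 1 */
      printf ("trunc (x + pred_half) = %g\n", trunc (x + pred_half));  /* 0 */
      printf ("round (x)             = %g\n", round (x));              /* 0 */
      return 0;
    }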
*/ - -static void -init_vselect_insn (void) -{ - unsigned i; - rtx x; - - x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN)); - for (i = 0; i < MAX_VECT_LEN; ++i) - XVECEXP (x, 0, i) = const0_rtx; - x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx, - const0_rtx), x); - x = gen_rtx_SET (const0_rtx, x); - start_sequence (); - vselect_insn = emit_insn (x); - end_sequence (); -} - -/* Construct (set target (vec_select op0 (parallel perm))) and - return true if that's a valid instruction in the active ISA. */ - -static bool -expand_vselect (rtx target, rtx op0, const unsigned char *perm, - unsigned nelt, bool testing_p) -{ - unsigned int i; - rtx x, save_vconcat; - int icode; - - if (vselect_insn == NULL_RTX) - init_vselect_insn (); - - x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1); - PUT_NUM_ELEM (XVEC (x, 0), nelt); - for (i = 0; i < nelt; ++i) - XVECEXP (x, 0, i) = GEN_INT (perm[i]); - save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0); - XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0; - PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target)); - SET_DEST (PATTERN (vselect_insn)) = target; - icode = recog_memoized (vselect_insn); - - if (icode >= 0 && !testing_p) - emit_insn (copy_rtx (PATTERN (vselect_insn))); - - SET_DEST (PATTERN (vselect_insn)) = const0_rtx; - XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat; - INSN_CODE (vselect_insn) = -1; - - return icode >= 0; -} - -/* Similar, but generate a vec_concat from op0 and op1 as well. */ - -static bool -expand_vselect_vconcat (rtx target, rtx op0, rtx op1, - const unsigned char *perm, unsigned nelt, - bool testing_p) -{ - machine_mode v2mode; - rtx x; - bool ok; - - if (vselect_insn == NULL_RTX) - init_vselect_insn (); - - if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode)) - return false; - x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0); - PUT_MODE (x, v2mode); - XEXP (x, 0) = op0; - XEXP (x, 1) = op1; - ok = expand_vselect (target, x, perm, nelt, testing_p); - XEXP (x, 0) = const0_rtx; - XEXP (x, 1) = const0_rtx; - return ok; -} - -/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D - using movss or movsd. */ -static bool -expand_vec_perm_movs (struct expand_vec_perm_d *d) -{ - machine_mode vmode = d->vmode; - unsigned i, nelt = d->nelt; - rtx x; - - if (d->one_operand_p) - return false; - - if (!(TARGET_SSE && vmode == V4SFmode) - && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode) - && !(TARGET_SSE2 && vmode == V2DFmode)) - return false; - - /* Only the first element is changed. */ - if (d->perm[0] != nelt && d->perm[0] != 0) - return false; - for (i = 1; i < nelt; ++i) - if (d->perm[i] != i + nelt - d->perm[0]) - return false; - - if (d->testing_p) - return true; - - if (d->perm[0] == nelt) - x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1)); - else - x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1)); - - emit_insn (gen_rtx_SET (d->target, x)); - - return true; -} - -/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D - in terms of blendp[sd] / pblendw / pblendvb / vpblendd. 
*/ - -static bool -expand_vec_perm_blend (struct expand_vec_perm_d *d) -{ - machine_mode mmode, vmode = d->vmode; - unsigned i, nelt = d->nelt; - unsigned HOST_WIDE_INT mask; - rtx target, op0, op1, maskop, x; - rtx rperm[32], vperm; - - if (d->one_operand_p) - return false; - if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64 - && (TARGET_AVX512BW - || GET_MODE_UNIT_SIZE (vmode) >= 4)) - ; - else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) - ; - else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) - ; - else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16) - ; - else - return false; - - /* This is a blend, not a permute. Elements must stay in their - respective lanes. */ - for (i = 0; i < nelt; ++i) - { - unsigned e = d->perm[i]; - if (!(e == i || e == i + nelt)) - return false; - } - - if (d->testing_p) - return true; - - /* ??? Without SSE4.1, we could implement this with and/andn/or. This - decision should be extracted elsewhere, so that we only try that - sequence once all budget==3 options have been tried. */ - target = d->target; - op0 = d->op0; - op1 = d->op1; - mask = 0; - - switch (vmode) - { - case E_V8DFmode: - case E_V16SFmode: - case E_V4DFmode: - case E_V8SFmode: - case E_V2DFmode: - case E_V4SFmode: - case E_V8HImode: - case E_V8SImode: - case E_V32HImode: - case E_V64QImode: - case E_V16SImode: - case E_V8DImode: - for (i = 0; i < nelt; ++i) - mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i; - break; - - case E_V2DImode: - for (i = 0; i < 2; ++i) - mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4); - vmode = V8HImode; - goto do_subreg; - - case E_V4SImode: - for (i = 0; i < 4; ++i) - mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2); - vmode = V8HImode; - goto do_subreg; - - case E_V16QImode: - /* See if bytes move in pairs so we can use pblendw with - an immediate argument, rather than pblendvb with a vector - argument. */ - for (i = 0; i < 16; i += 2) - if (d->perm[i] + 1 != d->perm[i + 1]) - { - use_pblendvb: - for (i = 0; i < nelt; ++i) - rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx); - - finish_pblendvb: - vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm)); - vperm = force_reg (vmode, vperm); - - if (GET_MODE_SIZE (vmode) == 16) - emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm)); - else - emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm)); - if (target != d->target) - emit_move_insn (d->target, gen_lowpart (d->vmode, target)); - return true; - } - - for (i = 0; i < 8; ++i) - mask |= (d->perm[i * 2] >= 16) << i; - vmode = V8HImode; - /* FALLTHRU */ - - do_subreg: - target = gen_reg_rtx (vmode); - op0 = gen_lowpart (vmode, op0); - op1 = gen_lowpart (vmode, op1); - break; - - case E_V32QImode: - /* See if bytes move in pairs. If not, vpblendvb must be used. */ - for (i = 0; i < 32; i += 2) - if (d->perm[i] + 1 != d->perm[i + 1]) - goto use_pblendvb; - /* See if bytes move in quadruplets. If yes, vpblendd - with immediate can be used. */ - for (i = 0; i < 32; i += 4) - if (d->perm[i] + 2 != d->perm[i + 2]) - break; - if (i < 32) - { - /* See if bytes move the same in both lanes. If yes, - vpblendw with immediate can be used. */ - for (i = 0; i < 16; i += 2) - if (d->perm[i] + 16 != d->perm[i + 16]) - goto use_pblendvb; - - /* Use vpblendw. */ - for (i = 0; i < 16; ++i) - mask |= (d->perm[i * 2] >= 32) << i; - vmode = V16HImode; - goto do_subreg; - } - - /* Use vpblendd. 
*/ - for (i = 0; i < 8; ++i) - mask |= (d->perm[i * 4] >= 32) << i; - vmode = V8SImode; - goto do_subreg; - - case E_V16HImode: - /* See if words move in pairs. If yes, vpblendd can be used. */ - for (i = 0; i < 16; i += 2) - if (d->perm[i] + 1 != d->perm[i + 1]) - break; - if (i < 16) - { - /* See if words move the same in both lanes. If not, - vpblendvb must be used. */ - for (i = 0; i < 8; i++) - if (d->perm[i] + 8 != d->perm[i + 8]) - { - /* Use vpblendvb. */ - for (i = 0; i < 32; ++i) - rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx); - - vmode = V32QImode; - nelt = 32; - target = gen_reg_rtx (vmode); - op0 = gen_lowpart (vmode, op0); - op1 = gen_lowpart (vmode, op1); - goto finish_pblendvb; - } - - /* Use vpblendw. */ - for (i = 0; i < 16; ++i) - mask |= (d->perm[i] >= 16) << i; - break; - } - - /* Use vpblendd. */ - for (i = 0; i < 8; ++i) - mask |= (d->perm[i * 2] >= 16) << i; - vmode = V8SImode; - goto do_subreg; - - case E_V4DImode: - /* Use vpblendd. */ - for (i = 0; i < 4; ++i) - mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2); - vmode = V8SImode; - goto do_subreg; - - default: - gcc_unreachable (); - } - - switch (vmode) - { - case E_V8DFmode: - case E_V8DImode: - mmode = QImode; - break; - case E_V16SFmode: - case E_V16SImode: - mmode = HImode; - break; - case E_V32HImode: - mmode = SImode; - break; - case E_V64QImode: - mmode = DImode; - break; - default: - mmode = VOIDmode; - } - - if (mmode != VOIDmode) - maskop = force_reg (mmode, gen_int_mode (mask, mmode)); - else - maskop = GEN_INT (mask); - - /* This matches five different patterns with the different modes. */ - x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop); - x = gen_rtx_SET (target, x); - emit_insn (x); - if (target != d->target) - emit_move_insn (d->target, gen_lowpart (d->vmode, target)); - - return true; -} - -/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D - in terms of the variable form of vpermilps. - - Note that we will have already failed the immediate input vpermilps, - which requires that the high and low part shuffle be identical; the - variable form doesn't require that. */ - -static bool -expand_vec_perm_vpermil (struct expand_vec_perm_d *d) -{ - rtx rperm[8], vperm; - unsigned i; - - if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p) - return false; - - /* We can only permute within the 128-bit lane. */ - for (i = 0; i < 8; ++i) - { - unsigned e = d->perm[i]; - if (i < 4 ? e >= 4 : e < 4) - return false; - } - - if (d->testing_p) - return true; - - for (i = 0; i < 8; ++i) - { - unsigned e = d->perm[i]; - - /* Within each 128-bit lane, the elements of op0 are numbered - from 0 and the elements of op1 are numbered from 4. */ - if (e >= 8 + 4) - e -= 8; - else if (e >= 4) - e -= 4; - - rperm[i] = GEN_INT (e); - } - - vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm)); - vperm = force_reg (V8SImode, vperm); - emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm)); - - return true; -} - -/* Return true if permutation D can be performed as VMODE permutation - instead. 
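The helper that follows asks whether a permutation written on small elements could equally be written on wider ones. A standalone sketch of that test (illustration only; the name is made up, CHUNK is assumed to be a power of two):

/* Illustrative sketch, not from this file: a permutation on NELT narrow
   elements maps onto NELT/CHUNK wider elements iff every aligned group of
   CHUNK indices is itself aligned and consecutive, e.g. { 4, 5, 6, 7, 0, 1,
   2, 3 } qualifies for CHUNK == 4 but { 1, 2, 3, 4, ... } does not.  */
static int
perm_fits_wider_mode (const unsigned char *perm, unsigned nelt, unsigned chunk)
{
  for (unsigned i = 0; i < nelt; i += chunk)
    {
      if (perm[i] & (chunk - 1))
        return 0;
      for (unsigned j = 1; j < chunk; ++j)
        if (perm[i + j] != perm[i] + j)
          return 0;
    }
  return 1;
}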
*/ - -static bool -valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d) -{ - unsigned int i, j, chunk; - - if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT - || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT - || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode)) - return false; - - if (GET_MODE_NUNITS (vmode) >= d->nelt) - return true; - - chunk = d->nelt / GET_MODE_NUNITS (vmode); - for (i = 0; i < d->nelt; i += chunk) - if (d->perm[i] & (chunk - 1)) - return false; - else - for (j = 1; j < chunk; ++j) - if (d->perm[i] + j != d->perm[i + j]) - return false; - - return true; -} - -/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D - in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */ - -static bool -expand_vec_perm_pshufb (struct expand_vec_perm_d *d) -{ - unsigned i, nelt, eltsz, mask; - unsigned char perm[64]; - machine_mode vmode = V16QImode; - rtx rperm[64], vperm, target, op0, op1; - - nelt = d->nelt; - - if (!d->one_operand_p) - { - if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16) - { - if (TARGET_AVX2 - && valid_perm_using_mode_p (V2TImode, d)) - { - if (d->testing_p) - return true; - - /* Use vperm2i128 insn. The pattern uses - V4DImode instead of V2TImode. */ - target = d->target; - if (d->vmode != V4DImode) - target = gen_reg_rtx (V4DImode); - op0 = gen_lowpart (V4DImode, d->op0); - op1 = gen_lowpart (V4DImode, d->op1); - rperm[0] - = GEN_INT ((d->perm[0] / (nelt / 2)) - | ((d->perm[nelt / 2] / (nelt / 2)) * 16)); - emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0])); - if (target != d->target) - emit_move_insn (d->target, gen_lowpart (d->vmode, target)); - return true; - } - return false; - } - } - else - { - if (GET_MODE_SIZE (d->vmode) == 16) - { - if (!TARGET_SSSE3) - return false; - } - else if (GET_MODE_SIZE (d->vmode) == 32) - { - if (!TARGET_AVX2) - return false; - - /* V4DImode should be already handled through - expand_vselect by vpermq instruction. */ - gcc_assert (d->vmode != V4DImode); - - vmode = V32QImode; - if (d->vmode == V8SImode - || d->vmode == V16HImode - || d->vmode == V32QImode) - { - /* First see if vpermq can be used for - V8SImode/V16HImode/V32QImode. */ - if (valid_perm_using_mode_p (V4DImode, d)) - { - for (i = 0; i < 4; i++) - perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3; - if (d->testing_p) - return true; - target = gen_reg_rtx (V4DImode); - if (expand_vselect (target, gen_lowpart (V4DImode, d->op0), - perm, 4, false)) - { - emit_move_insn (d->target, - gen_lowpart (d->vmode, target)); - return true; - } - return false; - } - - /* Next see if vpermd can be used. */ - if (valid_perm_using_mode_p (V8SImode, d)) - vmode = V8SImode; - } - /* Or if vpermps can be used. */ - else if (d->vmode == V8SFmode) - vmode = V8SImode; - - if (vmode == V32QImode) - { - /* vpshufb only works intra lanes, it is not - possible to shuffle bytes in between the lanes. */ - for (i = 0; i < nelt; ++i) - if ((d->perm[i] ^ i) & (nelt / 2)) - return false; - } - } - else if (GET_MODE_SIZE (d->vmode) == 64) - { - if (!TARGET_AVX512BW) - return false; - - /* If vpermq didn't work, vpshufb won't work either. */ - if (d->vmode == V8DFmode || d->vmode == V8DImode) - return false; - - vmode = V64QImode; - if (d->vmode == V16SImode - || d->vmode == V32HImode - || d->vmode == V64QImode) - { - /* First see if vpermq can be used for - V16SImode/V32HImode/V64QImode. 
*/ - if (valid_perm_using_mode_p (V8DImode, d)) - { - for (i = 0; i < 8; i++) - perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7; - if (d->testing_p) - return true; - target = gen_reg_rtx (V8DImode); - if (expand_vselect (target, gen_lowpart (V8DImode, d->op0), - perm, 8, false)) - { - emit_move_insn (d->target, - gen_lowpart (d->vmode, target)); - return true; - } - return false; - } - - /* Next see if vpermd can be used. */ - if (valid_perm_using_mode_p (V16SImode, d)) - vmode = V16SImode; - } - /* Or if vpermps can be used. */ - else if (d->vmode == V16SFmode) - vmode = V16SImode; - if (vmode == V64QImode) - { - /* vpshufb only works intra lanes, it is not - possible to shuffle bytes in between the lanes. */ - for (i = 0; i < nelt; ++i) - if ((d->perm[i] ^ i) & (3 * nelt / 4)) - return false; - } - } - else - return false; - } - - if (d->testing_p) - return true; - - if (vmode == V8SImode) - for (i = 0; i < 8; ++i) - rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7); - else if (vmode == V16SImode) - for (i = 0; i < 16; ++i) - rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15); - else - { - eltsz = GET_MODE_UNIT_SIZE (d->vmode); - if (!d->one_operand_p) - mask = 2 * nelt - 1; - else if (vmode == V16QImode) - mask = nelt - 1; - else if (vmode == V64QImode) - mask = nelt / 4 - 1; - else - mask = nelt / 2 - 1; - - for (i = 0; i < nelt; ++i) - { - unsigned j, e = d->perm[i] & mask; - for (j = 0; j < eltsz; ++j) - rperm[i * eltsz + j] = GEN_INT (e * eltsz + j); - } - } - - vperm = gen_rtx_CONST_VECTOR (vmode, - gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm)); - vperm = force_reg (vmode, vperm); - - target = d->target; - if (d->vmode != vmode) - target = gen_reg_rtx (vmode); - op0 = gen_lowpart (vmode, d->op0); - if (d->one_operand_p) - { - if (vmode == V16QImode) - emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm)); - else if (vmode == V32QImode) - emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm)); - else if (vmode == V64QImode) - emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm)); - else if (vmode == V8SFmode) - emit_insn (gen_avx2_permvarv8sf (target, op0, vperm)); - else if (vmode == V8SImode) - emit_insn (gen_avx2_permvarv8si (target, op0, vperm)); - else if (vmode == V16SFmode) - emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm)); - else if (vmode == V16SImode) - emit_insn (gen_avx512f_permvarv16si (target, op0, vperm)); - else - gcc_unreachable (); - } - else - { - op1 = gen_lowpart (vmode, d->op1); - emit_insn (gen_xop_pperm (target, op0, op1, vperm)); - } - if (target != d->target) - emit_move_insn (d->target, gen_lowpart (d->vmode, target)); - - return true; -} - -/* For V*[QHS]Imode permutations, check if the same permutation - can't be performed in a 2x, 4x or 8x wider inner mode. 
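The expander above ultimately funnels everything into a byte-selector constant for pshufb/vpshufb (or vpperm for two operands). For reference, the per-lane semantics of SSSE3 pshufb can be modeled as follows (scalar illustration, not part of the original file):

#include <stdint.h>

/* Illustrative sketch, not from this file: each selector byte picks a source
   byte by its low four bits, or forces the result byte to zero when bit 7 is
   set; the AVX2/AVX-512 variants apply the same rule independently within
   each 16-byte lane, which is why the code above rejects cross-lane moves.  */
static void
pshufb_model (const uint8_t src[16], const uint8_t sel[16], uint8_t out[16])
{
  for (int i = 0; i < 16; ++i)
    out[i] = (sel[i] & 0x80) ? 0 : src[sel[i] & 0x0f];
}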
*/ - -static bool -canonicalize_vector_int_perm (const struct expand_vec_perm_d *d, - struct expand_vec_perm_d *nd) -{ - int i; - machine_mode mode = VOIDmode; - - switch (d->vmode) - { - case E_V16QImode: mode = V8HImode; break; - case E_V32QImode: mode = V16HImode; break; - case E_V64QImode: mode = V32HImode; break; - case E_V8HImode: mode = V4SImode; break; - case E_V16HImode: mode = V8SImode; break; - case E_V32HImode: mode = V16SImode; break; - case E_V4SImode: mode = V2DImode; break; - case E_V8SImode: mode = V4DImode; break; - case E_V16SImode: mode = V8DImode; break; - default: return false; - } - for (i = 0; i < d->nelt; i += 2) - if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1) - return false; - nd->vmode = mode; - nd->nelt = d->nelt / 2; - for (i = 0; i < nd->nelt; i++) - nd->perm[i] = d->perm[2 * i] / 2; - if (GET_MODE_INNER (mode) != DImode) - canonicalize_vector_int_perm (nd, nd); - if (nd != d) - { - nd->one_operand_p = d->one_operand_p; - nd->testing_p = d->testing_p; - if (d->op0 == d->op1) - nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0); - else - { - nd->op0 = gen_lowpart (nd->vmode, d->op0); - nd->op1 = gen_lowpart (nd->vmode, d->op1); - } - if (d->testing_p) - nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1); - else - nd->target = gen_reg_rtx (nd->vmode); - } - return true; -} - -/* Try to expand one-operand permutation with constant mask. */ - -static bool -ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d) -{ - machine_mode mode = GET_MODE (d->op0); - machine_mode maskmode = mode; - rtx (*gen) (rtx, rtx, rtx) = NULL; - rtx target, op0, mask; - rtx vec[64]; - - if (!rtx_equal_p (d->op0, d->op1)) - return false; - - if (!TARGET_AVX512F) - return false; - - switch (mode) - { - case E_V16SImode: - gen = gen_avx512f_permvarv16si; - break; - case E_V16SFmode: - gen = gen_avx512f_permvarv16sf; - maskmode = V16SImode; - break; - case E_V8DImode: - gen = gen_avx512f_permvarv8di; - break; - case E_V8DFmode: - gen = gen_avx512f_permvarv8df; - maskmode = V8DImode; - break; - default: - return false; - } - - target = d->target; - op0 = d->op0; - for (int i = 0; i < d->nelt; ++i) - vec[i] = GEN_INT (d->perm[i]); - mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec)); - emit_insn (gen (target, op0, force_reg (maskmode, mask))); - return true; -} - -static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool); - -/* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D - in a single instruction. */ - -static bool -expand_vec_perm_1 (struct expand_vec_perm_d *d) -{ - unsigned i, nelt = d->nelt; - struct expand_vec_perm_d nd; - - /* Check plain VEC_SELECT first, because AVX has instructions that could - match both SEL and SEL+CONCAT, but the plain SEL will allow a memory - input where SEL+CONCAT may not. */ - if (d->one_operand_p) - { - int mask = nelt - 1; - bool identity_perm = true; - bool broadcast_perm = true; - - for (i = 0; i < nelt; i++) - { - nd.perm[i] = d->perm[i] & mask; - if (nd.perm[i] != i) - identity_perm = false; - if (nd.perm[i]) - broadcast_perm = false; - } - - if (identity_perm) - { - if (!d->testing_p) - emit_move_insn (d->target, d->op0); - return true; - } - else if (broadcast_perm && TARGET_AVX2) - { - /* Use vpbroadcast{b,w,d}. 
*/ - rtx (*gen) (rtx, rtx) = NULL; - switch (d->vmode) - { - case E_V64QImode: - if (TARGET_AVX512BW) - gen = gen_avx512bw_vec_dupv64qi_1; - break; - case E_V32QImode: - gen = gen_avx2_pbroadcastv32qi_1; - break; - case E_V32HImode: - if (TARGET_AVX512BW) - gen = gen_avx512bw_vec_dupv32hi_1; - break; - case E_V16HImode: - gen = gen_avx2_pbroadcastv16hi_1; - break; - case E_V16SImode: - if (TARGET_AVX512F) - gen = gen_avx512f_vec_dupv16si_1; - break; - case E_V8SImode: - gen = gen_avx2_pbroadcastv8si_1; - break; - case E_V16QImode: - gen = gen_avx2_pbroadcastv16qi; - break; - case E_V8HImode: - gen = gen_avx2_pbroadcastv8hi; - break; - case E_V16SFmode: - if (TARGET_AVX512F) - gen = gen_avx512f_vec_dupv16sf_1; - break; - case E_V8SFmode: - gen = gen_avx2_vec_dupv8sf_1; - break; - case E_V8DFmode: - if (TARGET_AVX512F) - gen = gen_avx512f_vec_dupv8df_1; - break; - case E_V8DImode: - if (TARGET_AVX512F) - gen = gen_avx512f_vec_dupv8di_1; - break; - /* For other modes prefer other shuffles this function creates. */ - default: break; - } - if (gen != NULL) - { - if (!d->testing_p) - emit_insn (gen (d->target, d->op0)); - return true; - } - } - - if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p)) - return true; - - /* There are plenty of patterns in sse.md that are written for - SEL+CONCAT and are not replicated for a single op. Perhaps - that should be changed, to avoid the nastiness here. */ - - /* Recognize interleave style patterns, which means incrementing - every other permutation operand. */ - for (i = 0; i < nelt; i += 2) - { - nd.perm[i] = d->perm[i] & mask; - nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt; - } - if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt, - d->testing_p)) - return true; - - /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */ - if (nelt >= 4) - { - for (i = 0; i < nelt; i += 4) - { - nd.perm[i + 0] = d->perm[i + 0] & mask; - nd.perm[i + 1] = d->perm[i + 1] & mask; - nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt; - nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt; - } - - if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt, - d->testing_p)) - return true; - } - } - - /* Try movss/movsd instructions. */ - if (expand_vec_perm_movs (d)) - return true; - - /* Finally, try the fully general two operand permute. */ - if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt, - d->testing_p)) - return true; - - /* Recognize interleave style patterns with reversed operands. */ - if (!d->one_operand_p) - { - for (i = 0; i < nelt; ++i) - { - unsigned e = d->perm[i]; - if (e >= nelt) - e -= nelt; - else - e += nelt; - nd.perm[i] = e; - } - - if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt, - d->testing_p)) - return true; - } - - /* Try the SSE4.1 blend variable merge instructions. */ - if (expand_vec_perm_blend (d)) - return true; - - /* Try one of the AVX vpermil variable permutations. */ - if (expand_vec_perm_vpermil (d)) - return true; - - /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128, - vpshufb, vpermd, vpermps or vpermq variable permutation. */ - if (expand_vec_perm_pshufb (d)) - return true; - - /* Try the AVX2 vpalignr instruction. */ - if (expand_vec_perm_palignr (d, true)) - return true; - - /* Try the AVX512F vperm{s,d} instructions. */ - if (ix86_expand_vec_one_operand_perm_avx512 (d)) - return true; - - /* Try the AVX512F vpermt2/vpermi2 instructions. 
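vpermt2*/vpermi2* are full two-source table permutes: each index selects from the concatenation of both operands, so no lane restrictions apply. A scalar sketch (illustration only, names are made up):

/* Illustrative sketch, not from this file: out[i] = concat (a, b)[idx[i]],
   with indices 0..nelt-1 naming elements of A and nelt..2*nelt-1 naming
   elements of B.  */
static void
permt2_model (const int *a, const int *b, const unsigned char *idx,
              unsigned nelt, int *out)
{
  for (unsigned i = 0; i < nelt; ++i)
    out[i] = idx[i] < nelt ? a[idx[i]] : b[idx[i] - nelt];
}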
*/ - if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d)) - return true; - - /* See if we can get the same permutation in different vector integer - mode. */ - if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd)) - { - if (!d->testing_p) - emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target)); - return true; - } - return false; -} - -/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D - in terms of a pair of pshuflw + pshufhw instructions. */ - -static bool -expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d) -{ - unsigned char perm2[MAX_VECT_LEN]; - unsigned i; - bool ok; - - if (d->vmode != V8HImode || !d->one_operand_p) - return false; - - /* The two permutations only operate in 64-bit lanes. */ - for (i = 0; i < 4; ++i) - if (d->perm[i] >= 4) - return false; - for (i = 4; i < 8; ++i) - if (d->perm[i] < 4) - return false; - - if (d->testing_p) - return true; - - /* Emit the pshuflw. */ - memcpy (perm2, d->perm, 4); - for (i = 4; i < 8; ++i) - perm2[i] = i; - ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p); - gcc_assert (ok); - - /* Emit the pshufhw. */ - memcpy (perm2 + 4, d->perm + 4, 4); - for (i = 0; i < 4; ++i) - perm2[i] = i; - ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p); - gcc_assert (ok); - - return true; -} - -/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify - the permutation using the SSSE3 palignr instruction. This succeeds - when all of the elements in PERM fit within one vector and we merely - need to shift them down so that a single vector permutation has a - chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only - the vpalignr instruction itself can perform the requested permutation. */ - -static bool -expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p) -{ - unsigned i, nelt = d->nelt; - unsigned min, max, minswap, maxswap; - bool in_order, ok, swap = false; - rtx shift, target; - struct expand_vec_perm_d dcopy; - - /* Even with AVX, palignr only operates on 128-bit vectors, - in AVX2 palignr operates on both 128-bit lanes. */ - if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16) - && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32)) - return false; - - min = 2 * nelt; - max = 0; - minswap = 2 * nelt; - maxswap = 0; - for (i = 0; i < nelt; ++i) - { - unsigned e = d->perm[i]; - unsigned eswap = d->perm[i] ^ nelt; - if (GET_MODE_SIZE (d->vmode) == 32) - { - e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1); - eswap = e ^ (nelt / 2); - } - if (e < min) - min = e; - if (e > max) - max = e; - if (eswap < minswap) - minswap = eswap; - if (eswap > maxswap) - maxswap = eswap; - } - if (min == 0 - || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt)) - { - if (d->one_operand_p - || minswap == 0 - || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32 - ? nelt / 2 : nelt)) - return false; - swap = true; - min = minswap; - max = maxswap; - } - - /* Given that we have SSSE3, we know we'll be able to implement the - single operand permutation after the palignr with pshufb for - 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed - first. 
*/ - if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p) - return true; - - dcopy = *d; - if (swap) - { - dcopy.op0 = d->op1; - dcopy.op1 = d->op0; - for (i = 0; i < nelt; ++i) - dcopy.perm[i] ^= nelt; - } - - in_order = true; - for (i = 0; i < nelt; ++i) - { - unsigned e = dcopy.perm[i]; - if (GET_MODE_SIZE (d->vmode) == 32 - && e >= nelt - && (e & (nelt / 2 - 1)) < min) - e = e - min - (nelt / 2); - else - e = e - min; - if (e != i) - in_order = false; - dcopy.perm[i] = e; - } - dcopy.one_operand_p = true; - - if (single_insn_only_p && !in_order) - return false; - - /* For AVX2, test whether we can permute the result in one instruction. */ - if (d->testing_p) - { - if (in_order) - return true; - dcopy.op1 = dcopy.op0; - return expand_vec_perm_1 (&dcopy); - } - - shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode)); - if (GET_MODE_SIZE (d->vmode) == 16) - { - target = gen_reg_rtx (TImode); - emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1), - gen_lowpart (TImode, dcopy.op0), shift)); - } - else - { - target = gen_reg_rtx (V2TImode); - emit_insn (gen_avx2_palignrv2ti (target, - gen_lowpart (V2TImode, dcopy.op1), - gen_lowpart (V2TImode, dcopy.op0), - shift)); - } - - dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target); - - /* Test for the degenerate case where the alignment by itself - produces the desired permutation. */ - if (in_order) - { - emit_move_insn (d->target, dcopy.op0); - return true; - } - - ok = expand_vec_perm_1 (&dcopy); - gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32); - - return ok; -} - -/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify - the permutation using the SSE4_1 pblendv instruction. Potentially - reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */ - -static bool -expand_vec_perm_pblendv (struct expand_vec_perm_d *d) -{ - unsigned i, which, nelt = d->nelt; - struct expand_vec_perm_d dcopy, dcopy1; - machine_mode vmode = d->vmode; - bool ok; - - /* Use the same checks as in expand_vec_perm_blend. */ - if (d->one_operand_p) - return false; - if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) - ; - else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) - ; - else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16) - ; - else - return false; - - /* Figure out where permutation elements stay not in their - respective lanes. */ - for (i = 0, which = 0; i < nelt; ++i) - { - unsigned e = d->perm[i]; - if (e != i) - which |= (e < nelt ? 1 : 2); - } - /* We can pblend the part where elements stay not in their - respective lanes only when these elements are all in one - half of a permutation. - {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective - lanes, but both 8 and 9 >= 8 - {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their - respective lanes and 8 >= 8, but 2 not. */ - if (which != 1 && which != 2) - return false; - if (d->testing_p && GET_MODE_SIZE (vmode) == 16) - return true; - - /* First we apply one operand permutation to the part where - elements stay not in their respective lanes. 
*/ - dcopy = *d; - if (which == 2) - dcopy.op0 = dcopy.op1 = d->op1; - else - dcopy.op0 = dcopy.op1 = d->op0; - if (!d->testing_p) - dcopy.target = gen_reg_rtx (vmode); - dcopy.one_operand_p = true; - - for (i = 0; i < nelt; ++i) - dcopy.perm[i] = d->perm[i] & (nelt - 1); - - ok = expand_vec_perm_1 (&dcopy); - if (GET_MODE_SIZE (vmode) != 16 && !ok) - return false; - else - gcc_assert (ok); - if (d->testing_p) - return true; - - /* Next we put permuted elements into their positions. */ - dcopy1 = *d; - if (which == 2) - dcopy1.op1 = dcopy.target; - else - dcopy1.op0 = dcopy.target; - - for (i = 0; i < nelt; ++i) - dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i); - - ok = expand_vec_perm_blend (&dcopy1); - gcc_assert (ok); - - return true; -} - -static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d); - -/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify - a two vector permutation into a single vector permutation by using - an interleave operation to merge the vectors. */ - -static bool -expand_vec_perm_interleave2 (struct expand_vec_perm_d *d) -{ - struct expand_vec_perm_d dremap, dfinal; - unsigned i, nelt = d->nelt, nelt2 = nelt / 2; - unsigned HOST_WIDE_INT contents; - unsigned char remap[2 * MAX_VECT_LEN]; - rtx_insn *seq; - bool ok, same_halves = false; - - if (GET_MODE_SIZE (d->vmode) == 16) - { - if (d->one_operand_p) - return false; - } - else if (GET_MODE_SIZE (d->vmode) == 32) - { - if (!TARGET_AVX) - return false; - /* For 32-byte modes allow even d->one_operand_p. - The lack of cross-lane shuffling in some instructions - might prevent a single insn shuffle. */ - dfinal = *d; - dfinal.testing_p = true; - /* If expand_vec_perm_interleave3 can expand this into - a 3 insn sequence, give up and let it be expanded as - 3 insn sequence. While that is one insn longer, - it doesn't need a memory operand and in the common - case that both interleave low and high permutations - with the same operands are adjacent needs 4 insns - for both after CSE. */ - if (expand_vec_perm_interleave3 (&dfinal)) - return false; - } - else - return false; - - /* Examine from whence the elements come. */ - contents = 0; - for (i = 0; i < nelt; ++i) - contents |= HOST_WIDE_INT_1U << d->perm[i]; - - memset (remap, 0xff, sizeof (remap)); - dremap = *d; - - if (GET_MODE_SIZE (d->vmode) == 16) - { - unsigned HOST_WIDE_INT h1, h2, h3, h4; - - /* Split the two input vectors into 4 halves. */ - h1 = (HOST_WIDE_INT_1U << nelt2) - 1; - h2 = h1 << nelt2; - h3 = h2 << nelt2; - h4 = h3 << nelt2; - - /* If the elements from the low halves use interleave low, and similarly - for interleave high. If the elements are from mis-matched halves, we - can use shufps for V4SF/V4SI or do a DImode shuffle. 
*/ - if ((contents & (h1 | h3)) == contents) - { - /* punpckl* */ - for (i = 0; i < nelt2; ++i) - { - remap[i] = i * 2; - remap[i + nelt] = i * 2 + 1; - dremap.perm[i * 2] = i; - dremap.perm[i * 2 + 1] = i + nelt; - } - if (!TARGET_SSE2 && d->vmode == V4SImode) - dremap.vmode = V4SFmode; - } - else if ((contents & (h2 | h4)) == contents) - { - /* punpckh* */ - for (i = 0; i < nelt2; ++i) - { - remap[i + nelt2] = i * 2; - remap[i + nelt + nelt2] = i * 2 + 1; - dremap.perm[i * 2] = i + nelt2; - dremap.perm[i * 2 + 1] = i + nelt + nelt2; - } - if (!TARGET_SSE2 && d->vmode == V4SImode) - dremap.vmode = V4SFmode; - } - else if ((contents & (h1 | h4)) == contents) - { - /* shufps */ - for (i = 0; i < nelt2; ++i) - { - remap[i] = i; - remap[i + nelt + nelt2] = i + nelt2; - dremap.perm[i] = i; - dremap.perm[i + nelt2] = i + nelt + nelt2; - } - if (nelt != 4) - { - /* shufpd */ - dremap.vmode = V2DImode; - dremap.nelt = 2; - dremap.perm[0] = 0; - dremap.perm[1] = 3; - } - } - else if ((contents & (h2 | h3)) == contents) - { - /* shufps */ - for (i = 0; i < nelt2; ++i) - { - remap[i + nelt2] = i; - remap[i + nelt] = i + nelt2; - dremap.perm[i] = i + nelt2; - dremap.perm[i + nelt2] = i + nelt; - } - if (nelt != 4) - { - /* shufpd */ - dremap.vmode = V2DImode; - dremap.nelt = 2; - dremap.perm[0] = 1; - dremap.perm[1] = 2; - } - } - else - return false; - } - else - { - unsigned int nelt4 = nelt / 4, nzcnt = 0; - unsigned HOST_WIDE_INT q[8]; - unsigned int nonzero_halves[4]; - - /* Split the two input vectors into 8 quarters. */ - q[0] = (HOST_WIDE_INT_1U << nelt4) - 1; - for (i = 1; i < 8; ++i) - q[i] = q[0] << (nelt4 * i); - for (i = 0; i < 4; ++i) - if (((q[2 * i] | q[2 * i + 1]) & contents) != 0) - { - nonzero_halves[nzcnt] = i; - ++nzcnt; - } - - if (nzcnt == 1) - { - gcc_assert (d->one_operand_p); - nonzero_halves[1] = nonzero_halves[0]; - same_halves = true; - } - else if (d->one_operand_p) - { - gcc_assert (nonzero_halves[0] == 0); - gcc_assert (nonzero_halves[1] == 1); - } - - if (nzcnt <= 2) - { - if (d->perm[0] / nelt2 == nonzero_halves[1]) - { - /* Attempt to increase the likelihood that dfinal - shuffle will be intra-lane. */ - std::swap (nonzero_halves[0], nonzero_halves[1]); - } - - /* vperm2f128 or vperm2i128. 
*/ - for (i = 0; i < nelt2; ++i) - { - remap[i + nonzero_halves[1] * nelt2] = i + nelt2; - remap[i + nonzero_halves[0] * nelt2] = i; - dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2; - dremap.perm[i] = i + nonzero_halves[0] * nelt2; - } - - if (d->vmode != V8SFmode - && d->vmode != V4DFmode - && d->vmode != V8SImode) - { - dremap.vmode = V8SImode; - dremap.nelt = 8; - for (i = 0; i < 4; ++i) - { - dremap.perm[i] = i + nonzero_halves[0] * 4; - dremap.perm[i + 4] = i + nonzero_halves[1] * 4; - } - } - } - else if (d->one_operand_p) - return false; - else if (TARGET_AVX2 - && (contents & (q[0] | q[2] | q[4] | q[6])) == contents) - { - /* vpunpckl* */ - for (i = 0; i < nelt4; ++i) - { - remap[i] = i * 2; - remap[i + nelt] = i * 2 + 1; - remap[i + nelt2] = i * 2 + nelt2; - remap[i + nelt + nelt2] = i * 2 + nelt2 + 1; - dremap.perm[i * 2] = i; - dremap.perm[i * 2 + 1] = i + nelt; - dremap.perm[i * 2 + nelt2] = i + nelt2; - dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2; - } - } - else if (TARGET_AVX2 - && (contents & (q[1] | q[3] | q[5] | q[7])) == contents) - { - /* vpunpckh* */ - for (i = 0; i < nelt4; ++i) - { - remap[i + nelt4] = i * 2; - remap[i + nelt + nelt4] = i * 2 + 1; - remap[i + nelt2 + nelt4] = i * 2 + nelt2; - remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1; - dremap.perm[i * 2] = i + nelt4; - dremap.perm[i * 2 + 1] = i + nelt + nelt4; - dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4; - dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4; - } - } - else - return false; - } - - /* Use the remapping array set up above to move the elements from their - swizzled locations into their final destinations. */ - dfinal = *d; - for (i = 0; i < nelt; ++i) - { - unsigned e = remap[d->perm[i]]; - gcc_assert (e < nelt); - /* If same_halves is true, both halves of the remapped vector are the - same. Avoid cross-lane accesses if possible. */ - if (same_halves && i >= nelt2) - { - gcc_assert (e < nelt2); - dfinal.perm[i] = e + nelt2; - } - else - dfinal.perm[i] = e; - } - if (!d->testing_p) - { - dremap.target = gen_reg_rtx (dremap.vmode); - dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target); - } - dfinal.op1 = dfinal.op0; - dfinal.one_operand_p = true; - - /* Test if the final remap can be done with a single insn. For V4SFmode or - V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */ - start_sequence (); - ok = expand_vec_perm_1 (&dfinal); - seq = get_insns (); - end_sequence (); - - if (!ok) - return false; - - if (d->testing_p) - return true; - - if (dremap.vmode != dfinal.vmode) - { - dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0); - dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1); - } - - ok = expand_vec_perm_1 (&dremap); - gcc_assert (ok); - - emit_insn (seq); - return true; -} - -/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify - a single vector cross-lane permutation into vpermq followed - by any of the single insn permutations. 
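The merge performed above leans on the punpckl*/punpckh* family. Their basic effect, leaving aside the fact that the 256-bit forms repeat it within each 128-bit lane, is simply element-by-element interleaving (scalar illustration, not part of the original file):

/* Illustrative sketch, not from this file: interleave the low (HIGH == 0)
   or high (HIGH == 1) halves of A and B, giving { a0 b0 a1 b1 ... } resp.
   { a[n/2] b[n/2] a[n/2+1] b[n/2+1] ... }.  */
static void
interleave_model (const int *a, const int *b, unsigned nelt, int high,
                  int *out)
{
  unsigned base = high ? nelt / 2 : 0;
  for (unsigned i = 0; i < nelt / 2; ++i)
    {
      out[2 * i] = a[base + i];
      out[2 * i + 1] = b[base + i];
    }
}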
*/ - -static bool -expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d) -{ - struct expand_vec_perm_d dremap, dfinal; - unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4; - unsigned contents[2]; - bool ok; - - if (!(TARGET_AVX2 - && (d->vmode == V32QImode || d->vmode == V16HImode) - && d->one_operand_p)) - return false; - - contents[0] = 0; - contents[1] = 0; - for (i = 0; i < nelt2; ++i) - { - contents[0] |= 1u << (d->perm[i] / nelt4); - contents[1] |= 1u << (d->perm[i + nelt2] / nelt4); - } - - for (i = 0; i < 2; ++i) - { - unsigned int cnt = 0; - for (j = 0; j < 4; ++j) - if ((contents[i] & (1u << j)) != 0 && ++cnt > 2) - return false; - } - - if (d->testing_p) - return true; - - dremap = *d; - dremap.vmode = V4DImode; - dremap.nelt = 4; - dremap.target = gen_reg_rtx (V4DImode); - dremap.op0 = gen_lowpart (V4DImode, d->op0); - dremap.op1 = dremap.op0; - dremap.one_operand_p = true; - for (i = 0; i < 2; ++i) - { - unsigned int cnt = 0; - for (j = 0; j < 4; ++j) - if ((contents[i] & (1u << j)) != 0) - dremap.perm[2 * i + cnt++] = j; - for (; cnt < 2; ++cnt) - dremap.perm[2 * i + cnt] = 0; - } - - dfinal = *d; - dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target); - dfinal.op1 = dfinal.op0; - dfinal.one_operand_p = true; - for (i = 0, j = 0; i < nelt; ++i) - { - if (i == nelt2) - j = 2; - dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0); - if ((d->perm[i] / nelt4) == dremap.perm[j]) - ; - else if ((d->perm[i] / nelt4) == dremap.perm[j + 1]) - dfinal.perm[i] |= nelt4; - else - gcc_unreachable (); - } - - ok = expand_vec_perm_1 (&dremap); - gcc_assert (ok); - - ok = expand_vec_perm_1 (&dfinal); - gcc_assert (ok); - - return true; -} - -static bool canonicalize_perm (struct expand_vec_perm_d *d); - -/* A subroutine of ix86_expand_vec_perm_const_1. Try to expand - a vector permutation using two instructions, vperm2f128 resp. - vperm2i128 followed by any single in-lane permutation. */ - -static bool -expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d) -{ - struct expand_vec_perm_d dfirst, dsecond; - unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm; - bool ok; - - if (!TARGET_AVX - || GET_MODE_SIZE (d->vmode) != 32 - || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)) - return false; - - dsecond = *d; - dsecond.one_operand_p = false; - dsecond.testing_p = true; - - /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128 - immediate. For perm < 16 the second permutation uses - d->op0 as first operand, for perm >= 16 it uses d->op1 - as first operand. The second operand is the result of - vperm2[fi]128. */ - for (perm = 0; perm < 32; perm++) - { - /* Ignore permutations which do not move anything cross-lane. */ - if (perm < 16) - { - /* The second shuffle for e.g. V4DFmode has - 0123 and ABCD operands. - Ignore AB23, as 23 is already in the second lane - of the first operand. */ - if ((perm & 0xc) == (1 << 2)) continue; - /* And 01CD, as 01 is in the first lane of the first - operand. */ - if ((perm & 3) == 0) continue; - /* And 4567, as then the vperm2[fi]128 doesn't change - anything on the original 4567 second operand. */ - if ((perm & 0xf) == ((3 << 2) | 2)) continue; - } - else - { - /* The second shuffle for e.g. V4DFmode has - 4567 and ABCD operands. - Ignore AB67, as 67 is already in the second lane - of the first operand. */ - if ((perm & 0xc) == (3 << 2)) continue; - /* And 45CD, as 45 is in the first lane of the first - operand. 
*/ - if ((perm & 3) == 2) continue; - /* And 0123, as then the vperm2[fi]128 doesn't change - anything on the original 0123 first operand. */ - if ((perm & 0xf) == (1 << 2)) continue; - } - - for (i = 0; i < nelt; i++) - { - j = d->perm[i] / nelt2; - if (j == ((perm >> (2 * (i >= nelt2))) & 3)) - dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1)); - else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16)) - dsecond.perm[i] = d->perm[i] & (nelt - 1); - else - break; - } - - if (i == nelt) - { - start_sequence (); - ok = expand_vec_perm_1 (&dsecond); - end_sequence (); - } - else - ok = false; - - if (ok) - { - if (d->testing_p) - return true; - - /* Found a usable second shuffle. dfirst will be - vperm2f128 on d->op0 and d->op1. */ - dsecond.testing_p = false; - dfirst = *d; - dfirst.target = gen_reg_rtx (d->vmode); - for (i = 0; i < nelt; i++) - dfirst.perm[i] = (i & (nelt2 - 1)) - + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2; - - canonicalize_perm (&dfirst); - ok = expand_vec_perm_1 (&dfirst); - gcc_assert (ok); - - /* And dsecond is some single insn shuffle, taking - d->op0 and result of vperm2f128 (if perm < 16) or - d->op1 and result of vperm2f128 (otherwise). */ - if (perm >= 16) - dsecond.op0 = dsecond.op1; - dsecond.op1 = dfirst.target; - - ok = expand_vec_perm_1 (&dsecond); - gcc_assert (ok); - - return true; - } - - /* For one operand, the only useful vperm2f128 permutation is 0x01 - aka lanes swap. */ - if (d->one_operand_p) - return false; - } - - return false; -} - -/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify - a two vector permutation using 2 intra-lane interleave insns - and cross-lane shuffle for 32-byte vectors. */ - -static bool -expand_vec_perm_interleave3 (struct expand_vec_perm_d *d) -{ - unsigned i, nelt; - rtx (*gen) (rtx, rtx, rtx); - - if (d->one_operand_p) - return false; - if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32) - ; - else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode)) - ; - else - return false; - - nelt = d->nelt; - if (d->perm[0] != 0 && d->perm[0] != nelt / 2) - return false; - for (i = 0; i < nelt; i += 2) - if (d->perm[i] != d->perm[0] + i / 2 - || d->perm[i + 1] != d->perm[0] + i / 2 + nelt) - return false; - - if (d->testing_p) - return true; - - switch (d->vmode) - { - case E_V32QImode: - if (d->perm[0]) - gen = gen_vec_interleave_highv32qi; - else - gen = gen_vec_interleave_lowv32qi; - break; - case E_V16HImode: - if (d->perm[0]) - gen = gen_vec_interleave_highv16hi; - else - gen = gen_vec_interleave_lowv16hi; - break; - case E_V8SImode: - if (d->perm[0]) - gen = gen_vec_interleave_highv8si; - else - gen = gen_vec_interleave_lowv8si; - break; - case E_V4DImode: - if (d->perm[0]) - gen = gen_vec_interleave_highv4di; - else - gen = gen_vec_interleave_lowv4di; - break; - case E_V8SFmode: - if (d->perm[0]) - gen = gen_vec_interleave_highv8sf; - else - gen = gen_vec_interleave_lowv8sf; - break; - case E_V4DFmode: - if (d->perm[0]) - gen = gen_vec_interleave_highv4df; - else - gen = gen_vec_interleave_lowv4df; - break; - default: - gcc_unreachable (); - } - - emit_insn (gen (d->target, d->op0, d->op1)); - return true; -} - -/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement - a single vector permutation using a single intra-lane vector - permutation, vperm2f128 swapping the lanes and vblend* insn blending - the non-swapped and swapped vectors together. 
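Several of the routines here revolve around vperm2f128/vperm2i128, which assembles each 128-bit half of the result from any half of either source. A model on V4DF-sized data (scalar illustration, not part of the original file; the zeroing bits 3 and 7 of the immediate are ignored in this sketch):

/* Illustrative sketch, not from this file: halves 0 and 1 are the low/high
   128 bits of A, halves 2 and 3 those of B; imm bits 1:0 pick the source of
   the low result half, bits 5:4 the high result half.  */
static void
vperm2f128_model (const double a[4], const double b[4], unsigned imm,
                  double out[4])
{
  const double *half[4] = { a, a + 2, b, b + 2 };
  out[0] = half[imm & 3][0];
  out[1] = half[imm & 3][1];
  out[2] = half[(imm >> 4) & 3][0];
  out[3] = half[(imm >> 4) & 3][1];
}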
*/ - -static bool -expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d) -{ - struct expand_vec_perm_d dfirst, dsecond; - unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2; - rtx_insn *seq; - bool ok; - rtx (*blend) (rtx, rtx, rtx, rtx) = NULL; - - if (!TARGET_AVX - || TARGET_AVX2 - || (d->vmode != V8SFmode && d->vmode != V4DFmode) - || !d->one_operand_p) - return false; - - dfirst = *d; - for (i = 0; i < nelt; i++) - dfirst.perm[i] = 0xff; - for (i = 0, msk = 0; i < nelt; i++) - { - j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2; - if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i]) - return false; - dfirst.perm[j] = d->perm[i]; - if (j != i) - msk |= (1 << i); - } - for (i = 0; i < nelt; i++) - if (dfirst.perm[i] == 0xff) - dfirst.perm[i] = i; - - if (!d->testing_p) - dfirst.target = gen_reg_rtx (dfirst.vmode); - - start_sequence (); - ok = expand_vec_perm_1 (&dfirst); - seq = get_insns (); - end_sequence (); - - if (!ok) - return false; - - if (d->testing_p) - return true; - - emit_insn (seq); - - dsecond = *d; - dsecond.op0 = dfirst.target; - dsecond.op1 = dfirst.target; - dsecond.one_operand_p = true; - dsecond.target = gen_reg_rtx (dsecond.vmode); - for (i = 0; i < nelt; i++) - dsecond.perm[i] = i ^ nelt2; - - ok = expand_vec_perm_1 (&dsecond); - gcc_assert (ok); - - blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256; - emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk))); - return true; -} - -/* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF - permutation using two vperm2f128, followed by a vshufpd insn blending - the two vectors together. */ - -static bool -expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d) -{ - struct expand_vec_perm_d dfirst, dsecond, dthird; - bool ok; - - if (!TARGET_AVX || (d->vmode != V4DFmode)) - return false; - - if (d->testing_p) - return true; - - dfirst = *d; - dsecond = *d; - dthird = *d; - - dfirst.perm[0] = (d->perm[0] & ~1); - dfirst.perm[1] = (d->perm[0] & ~1) + 1; - dfirst.perm[2] = (d->perm[2] & ~1); - dfirst.perm[3] = (d->perm[2] & ~1) + 1; - dsecond.perm[0] = (d->perm[1] & ~1); - dsecond.perm[1] = (d->perm[1] & ~1) + 1; - dsecond.perm[2] = (d->perm[3] & ~1); - dsecond.perm[3] = (d->perm[3] & ~1) + 1; - dthird.perm[0] = (d->perm[0] % 2); - dthird.perm[1] = (d->perm[1] % 2) + 4; - dthird.perm[2] = (d->perm[2] % 2) + 2; - dthird.perm[3] = (d->perm[3] % 2) + 6; - - dfirst.target = gen_reg_rtx (dfirst.vmode); - dsecond.target = gen_reg_rtx (dsecond.vmode); - dthird.op0 = dfirst.target; - dthird.op1 = dsecond.target; - dthird.one_operand_p = false; - - canonicalize_perm (&dfirst); - canonicalize_perm (&dsecond); - - ok = expand_vec_perm_1 (&dfirst) - && expand_vec_perm_1 (&dsecond) - && expand_vec_perm_1 (&dthird); - - gcc_assert (ok); - - return true; -} - -static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *); - -/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement - a two vector permutation using two intra-lane vector - permutations, vperm2f128 swapping the lanes and vblend* insn blending - the non-swapped and swapped vectors together. 
*/ - -static bool -expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d) -{ - struct expand_vec_perm_d dfirst, dsecond, dthird; - unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0; - rtx_insn *seq1, *seq2; - bool ok; - rtx (*blend) (rtx, rtx, rtx, rtx) = NULL; - - if (!TARGET_AVX - || TARGET_AVX2 - || (d->vmode != V8SFmode && d->vmode != V4DFmode) - || d->one_operand_p) - return false; - - dfirst = *d; - dsecond = *d; - for (i = 0; i < nelt; i++) - { - dfirst.perm[i] = 0xff; - dsecond.perm[i] = 0xff; - } - for (i = 0, msk = 0; i < nelt; i++) - { - j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2; - if (j == i) - { - dfirst.perm[j] = d->perm[i]; - which1 |= (d->perm[i] < nelt ? 1 : 2); - } - else - { - dsecond.perm[j] = d->perm[i]; - which2 |= (d->perm[i] < nelt ? 1 : 2); - msk |= (1U << i); - } - } - if (msk == 0 || msk == (1U << nelt) - 1) - return false; - - if (!d->testing_p) - { - dfirst.target = gen_reg_rtx (dfirst.vmode); - dsecond.target = gen_reg_rtx (dsecond.vmode); - } - - for (i = 0; i < nelt; i++) - { - if (dfirst.perm[i] == 0xff) - dfirst.perm[i] = (which1 == 2 ? i + nelt : i); - if (dsecond.perm[i] == 0xff) - dsecond.perm[i] = (which2 == 2 ? i + nelt : i); - } - canonicalize_perm (&dfirst); - start_sequence (); - ok = ix86_expand_vec_perm_const_1 (&dfirst); - seq1 = get_insns (); - end_sequence (); - - if (!ok) - return false; - - canonicalize_perm (&dsecond); - start_sequence (); - ok = ix86_expand_vec_perm_const_1 (&dsecond); - seq2 = get_insns (); - end_sequence (); - - if (!ok) - return false; - - if (d->testing_p) - return true; - - emit_insn (seq1); - emit_insn (seq2); - - dthird = *d; - dthird.op0 = dsecond.target; - dthird.op1 = dsecond.target; - dthird.one_operand_p = true; - dthird.target = gen_reg_rtx (dthird.vmode); - for (i = 0; i < nelt; i++) - dthird.perm[i] = i ^ nelt2; - - ok = expand_vec_perm_1 (&dthird); - gcc_assert (ok); - - blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256; - emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk))); - return true; -} - -/* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word - permutation with two pshufb insns and an ior. We should have already - failed all two instruction sequences. */ - -static bool -expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d) -{ - rtx rperm[2][16], vperm, l, h, op, m128; - unsigned int i, nelt, eltsz; - - if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16) - return false; - gcc_assert (!d->one_operand_p); - - if (d->testing_p) - return true; - - nelt = d->nelt; - eltsz = GET_MODE_UNIT_SIZE (d->vmode); - - /* Generate two permutation masks. If the required element is within - the given vector it is shuffled into the proper lane. If the required - element is in the other vector, force a zero into the lane by setting - bit 7 in the permutation mask. 
*/ - m128 = GEN_INT (-128); - for (i = 0; i < nelt; ++i) - { - unsigned j, e = d->perm[i]; - unsigned which = (e >= nelt); - if (e >= nelt) - e -= nelt; - - for (j = 0; j < eltsz; ++j) - { - rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j); - rperm[1-which][i*eltsz + j] = m128; - } - } - - vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0])); - vperm = force_reg (V16QImode, vperm); - - l = gen_reg_rtx (V16QImode); - op = gen_lowpart (V16QImode, d->op0); - emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm)); - - vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1])); - vperm = force_reg (V16QImode, vperm); - - h = gen_reg_rtx (V16QImode); - op = gen_lowpart (V16QImode, d->op1); - emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm)); - - op = d->target; - if (d->vmode != V16QImode) - op = gen_reg_rtx (V16QImode); - emit_insn (gen_iorv16qi3 (op, l, h)); - if (op != d->target) - emit_move_insn (d->target, gen_lowpart (d->vmode, op)); - - return true; -} - -/* Implement arbitrary permutation of one V32QImode and V16QImode operand - with two vpshufb insns, vpermq and vpor. We should have already failed - all two or three instruction sequences. */ - -static bool -expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d) -{ - rtx rperm[2][32], vperm, l, h, hp, op, m128; - unsigned int i, nelt, eltsz; - - if (!TARGET_AVX2 - || !d->one_operand_p - || (d->vmode != V32QImode && d->vmode != V16HImode)) - return false; - - if (d->testing_p) - return true; - - nelt = d->nelt; - eltsz = GET_MODE_UNIT_SIZE (d->vmode); - - /* Generate two permutation masks. If the required element is within - the same lane, it is shuffled in. If the required element from the - other lane, force a zero by setting bit 7 in the permutation mask. - In the other mask the mask has non-negative elements if element - is requested from the other lane, but also moved to the other lane, - so that the result of vpshufb can have the two V2TImode halves - swapped. */ - m128 = GEN_INT (-128); - for (i = 0; i < nelt; ++i) - { - unsigned j, e = d->perm[i] & (nelt / 2 - 1); - unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz; - - for (j = 0; j < eltsz; ++j) - { - rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j); - rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128; - } - } - - vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1])); - vperm = force_reg (V32QImode, vperm); - - h = gen_reg_rtx (V32QImode); - op = gen_lowpart (V32QImode, d->op0); - emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm)); - - /* Swap the 128-byte lanes of h into hp. */ - hp = gen_reg_rtx (V4DImode); - op = gen_lowpart (V4DImode, h); - emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx, - const1_rtx)); - - vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0])); - vperm = force_reg (V32QImode, vperm); - - l = gen_reg_rtx (V32QImode); - op = gen_lowpart (V32QImode, d->op0); - emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm)); - - op = d->target; - if (d->vmode != V32QImode) - op = gen_reg_rtx (V32QImode); - emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp))); - if (op != d->target) - emit_move_insn (d->target, gen_lowpart (d->vmode, op)); - - return true; -} - -/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even - and extract-odd permutations of two V32QImode and V16QImode operand - with two vpshufb insns, vpor and vpermq. We should have already - failed all two or three instruction sequences. 
*/ - -static bool -expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d) -{ - rtx rperm[2][32], vperm, l, h, ior, op, m128; - unsigned int i, nelt, eltsz; - - if (!TARGET_AVX2 - || d->one_operand_p - || (d->vmode != V32QImode && d->vmode != V16HImode)) - return false; - - for (i = 0; i < d->nelt; ++i) - if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2)) - return false; - - if (d->testing_p) - return true; - - nelt = d->nelt; - eltsz = GET_MODE_UNIT_SIZE (d->vmode); - - /* Generate two permutation masks. In the first permutation mask - the first quarter will contain indexes for the first half - of the op0, the second quarter will contain bit 7 set, third quarter - will contain indexes for the second half of the op0 and the - last quarter bit 7 set. In the second permutation mask - the first quarter will contain bit 7 set, the second quarter - indexes for the first half of the op1, the third quarter bit 7 set - and last quarter indexes for the second half of the op1. - I.e. the first mask e.g. for V32QImode extract even will be: - 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128 - (all values masked with 0xf except for -128) and second mask - for extract even will be - -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */ - m128 = GEN_INT (-128); - for (i = 0; i < nelt; ++i) - { - unsigned j, e = d->perm[i] & (nelt / 2 - 1); - unsigned which = d->perm[i] >= nelt; - unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0; - - for (j = 0; j < eltsz; ++j) - { - rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j); - rperm[1 - which][(i * eltsz + j) ^ xorv] = m128; - } - } - - vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0])); - vperm = force_reg (V32QImode, vperm); - - l = gen_reg_rtx (V32QImode); - op = gen_lowpart (V32QImode, d->op0); - emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm)); - - vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1])); - vperm = force_reg (V32QImode, vperm); - - h = gen_reg_rtx (V32QImode); - op = gen_lowpart (V32QImode, d->op1); - emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm)); - - ior = gen_reg_rtx (V32QImode); - emit_insn (gen_iorv32qi3 (ior, l, h)); - - /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */ - op = gen_reg_rtx (V4DImode); - ior = gen_lowpart (V4DImode, ior); - emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx, - const1_rtx, GEN_INT (3))); - emit_move_insn (d->target, gen_lowpart (d->vmode, op)); - - return true; -} - -/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even - and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands - with two "and" and "pack" or two "shift" and "pack" insns. We should - have already failed all two instruction sequences. */ - -static bool -expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d) -{ - rtx op, dop0, dop1, t; - unsigned i, odd, c, s, nelt = d->nelt; - bool end_perm = false; - machine_mode half_mode; - rtx (*gen_and) (rtx, rtx, rtx); - rtx (*gen_pack) (rtx, rtx, rtx); - rtx (*gen_shift) (rtx, rtx, rtx); - - if (d->one_operand_p) - return false; - - switch (d->vmode) - { - case E_V8HImode: - /* Required for "pack". */ - if (!TARGET_SSE4_1) - return false; - c = 0xffff; - s = 16; - half_mode = V4SImode; - gen_and = gen_andv4si3; - gen_pack = gen_sse4_1_packusdw; - gen_shift = gen_lshrv4si3; - break; - case E_V16QImode: - /* No check as all instructions are SSE2. 
*/ - c = 0xff; - s = 8; - half_mode = V8HImode; - gen_and = gen_andv8hi3; - gen_pack = gen_sse2_packuswb; - gen_shift = gen_lshrv8hi3; - break; - case E_V16HImode: - if (!TARGET_AVX2) - return false; - c = 0xffff; - s = 16; - half_mode = V8SImode; - gen_and = gen_andv8si3; - gen_pack = gen_avx2_packusdw; - gen_shift = gen_lshrv8si3; - end_perm = true; - break; - case E_V32QImode: - if (!TARGET_AVX2) - return false; - c = 0xff; - s = 8; - half_mode = V16HImode; - gen_and = gen_andv16hi3; - gen_pack = gen_avx2_packuswb; - gen_shift = gen_lshrv16hi3; - end_perm = true; - break; - default: - /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than - general shuffles. */ - return false; - } - - /* Check that permutation is even or odd. */ - odd = d->perm[0]; - if (odd > 1) - return false; - - for (i = 1; i < nelt; ++i) - if (d->perm[i] != 2 * i + odd) - return false; - - if (d->testing_p) - return true; - - dop0 = gen_reg_rtx (half_mode); - dop1 = gen_reg_rtx (half_mode); - if (odd == 0) - { - t = gen_const_vec_duplicate (half_mode, GEN_INT (c)); - t = force_reg (half_mode, t); - emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0))); - emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1))); - } - else - { - emit_insn (gen_shift (dop0, - gen_lowpart (half_mode, d->op0), - GEN_INT (s))); - emit_insn (gen_shift (dop1, - gen_lowpart (half_mode, d->op1), - GEN_INT (s))); - } - /* In AVX2 for 256 bit case we need to permute pack result. */ - if (TARGET_AVX2 && end_perm) - { - op = gen_reg_rtx (d->vmode); - t = gen_reg_rtx (V4DImode); - emit_insn (gen_pack (op, dop0, dop1)); - emit_insn (gen_avx2_permv4di_1 (t, - gen_lowpart (V4DImode, op), - const0_rtx, - const2_rtx, - const1_rtx, - GEN_INT (3))); - emit_move_insn (d->target, gen_lowpart (d->vmode, t)); - } - else - emit_insn (gen_pack (d->target, dop0, dop1)); - - return true; -} - -/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even - and extract-odd permutations of two V64QI operands - with two "shifts", two "truncs" and one "concat" insns for "odd" - and two "truncs" and one concat insn for "even." - Have already failed all two instruction sequences. */ - -static bool -expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d) -{ - rtx t1, t2, t3, t4; - unsigned i, odd, nelt = d->nelt; - - if (!TARGET_AVX512BW - || d->one_operand_p - || d->vmode != V64QImode) - return false; - - /* Check that permutation is even or odd. */ - odd = d->perm[0]; - if (odd > 1) - return false; - - for (i = 1; i < nelt; ++i) - if (d->perm[i] != 2 * i + odd) - return false; - - if (d->testing_p) - return true; - - - if (odd) - { - t1 = gen_reg_rtx (V32HImode); - t2 = gen_reg_rtx (V32HImode); - emit_insn (gen_lshrv32hi3 (t1, - gen_lowpart (V32HImode, d->op0), - GEN_INT (8))); - emit_insn (gen_lshrv32hi3 (t2, - gen_lowpart (V32HImode, d->op1), - GEN_INT (8))); - } - else - { - t1 = gen_lowpart (V32HImode, d->op0); - t2 = gen_lowpart (V32HImode, d->op1); - } - - t3 = gen_reg_rtx (V32QImode); - t4 = gen_reg_rtx (V32QImode); - emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1)); - emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2)); - emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4)); - - return true; -} - -/* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even - and extract-odd permutations. 
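The pack-based expanders above implement extract-even and extract-odd by viewing element pairs as one wider element: the even half is isolated with an AND, the odd half with a logical right shift, and a pack then narrows the result. A scalar byte-from-word sketch (illustration only, little-endian as on x86):

#include <stdint.h>

/* Illustrative sketch, not from this file: treat the input as 16-bit words
   holding two bytes each; extract-even keeps the low byte of every word
   (mask 0xff), extract-odd the high byte (shift right by 8), matching the
   "and + pack" / "shift + pack" idea above.  */
static void
extract_even_odd_model (const uint16_t *words, unsigned nwords, int odd,
                        uint8_t *out)
{
  for (unsigned i = 0; i < nwords; ++i)
    out[i] = odd ? (uint8_t) (words[i] >> 8) : (uint8_t) (words[i] & 0xff);
}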
*/ - -static bool -expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd) -{ - rtx t1, t2, t3, t4, t5; - - switch (d->vmode) - { - case E_V4DFmode: - if (d->testing_p) - break; - t1 = gen_reg_rtx (V4DFmode); - t2 = gen_reg_rtx (V4DFmode); - - /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */ - emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20))); - emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31))); - - /* Now an unpck[lh]pd will produce the result required. */ - if (odd) - t3 = gen_avx_unpckhpd256 (d->target, t1, t2); - else - t3 = gen_avx_unpcklpd256 (d->target, t1, t2); - emit_insn (t3); - break; - - case E_V8SFmode: - { - int mask = odd ? 0xdd : 0x88; - - if (d->testing_p) - break; - t1 = gen_reg_rtx (V8SFmode); - t2 = gen_reg_rtx (V8SFmode); - t3 = gen_reg_rtx (V8SFmode); - - /* Shuffle within the 128-bit lanes to produce: - { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */ - emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1, - GEN_INT (mask))); - - /* Shuffle the lanes around to produce: - { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */ - emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1, - GEN_INT (0x3))); - - /* Shuffle within the 128-bit lanes to produce: - { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */ - emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44))); - - /* Shuffle within the 128-bit lanes to produce: - { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */ - emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee))); - - /* Shuffle the lanes around to produce: - { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */ - emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2, - GEN_INT (0x20))); - } - break; - - case E_V2DFmode: - case E_V4SFmode: - case E_V2DImode: - case E_V2SImode: - case E_V4SImode: - /* These are always directly implementable by expand_vec_perm_1. */ - gcc_unreachable (); - - case E_V2SFmode: - gcc_assert (TARGET_MMX_WITH_SSE); - /* We have no suitable instructions. */ - if (d->testing_p) - return false; - break; - - case E_V4HImode: - if (d->testing_p) - break; - /* We need 2*log2(N)-1 operations to achieve odd/even - with interleave. */ - t1 = gen_reg_rtx (V4HImode); - emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1)); - emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1)); - if (odd) - t2 = gen_mmx_punpckhwd (d->target, d->target, t1); - else - t2 = gen_mmx_punpcklwd (d->target, d->target, t1); - emit_insn (t2); - break; - - case E_V8HImode: - if (TARGET_SSE4_1) - return expand_vec_perm_even_odd_pack (d); - else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB) - return expand_vec_perm_pshufb2 (d); - else - { - if (d->testing_p) - break; - /* We need 2*log2(N)-1 operations to achieve odd/even - with interleave. 
*/ - t1 = gen_reg_rtx (V8HImode); - t2 = gen_reg_rtx (V8HImode); - emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1)); - emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1)); - emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1)); - emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1)); - if (odd) - t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2); - else - t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2); - emit_insn (t3); - } - break; - - case E_V16QImode: - return expand_vec_perm_even_odd_pack (d); - - case E_V16HImode: - case E_V32QImode: - return expand_vec_perm_even_odd_pack (d); - - case E_V64QImode: - return expand_vec_perm_even_odd_trunc (d); - - case E_V4DImode: - if (!TARGET_AVX2) - { - struct expand_vec_perm_d d_copy = *d; - d_copy.vmode = V4DFmode; - if (d->testing_p) - d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1); - else - d_copy.target = gen_reg_rtx (V4DFmode); - d_copy.op0 = gen_lowpart (V4DFmode, d->op0); - d_copy.op1 = gen_lowpart (V4DFmode, d->op1); - if (expand_vec_perm_even_odd_1 (&d_copy, odd)) - { - if (!d->testing_p) - emit_move_insn (d->target, - gen_lowpart (V4DImode, d_copy.target)); - return true; - } - return false; - } - - if (d->testing_p) - break; - - t1 = gen_reg_rtx (V4DImode); - t2 = gen_reg_rtx (V4DImode); - - /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */ - emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20))); - emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31))); - - /* Now an vpunpck[lh]qdq will produce the result required. */ - if (odd) - t3 = gen_avx2_interleave_highv4di (d->target, t1, t2); - else - t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2); - emit_insn (t3); - break; - - case E_V8SImode: - if (!TARGET_AVX2) - { - struct expand_vec_perm_d d_copy = *d; - d_copy.vmode = V8SFmode; - if (d->testing_p) - d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1); - else - d_copy.target = gen_reg_rtx (V8SFmode); - d_copy.op0 = gen_lowpart (V8SFmode, d->op0); - d_copy.op1 = gen_lowpart (V8SFmode, d->op1); - if (expand_vec_perm_even_odd_1 (&d_copy, odd)) - { - if (!d->testing_p) - emit_move_insn (d->target, - gen_lowpart (V8SImode, d_copy.target)); - return true; - } - return false; - } - - if (d->testing_p) - break; - - t1 = gen_reg_rtx (V8SImode); - t2 = gen_reg_rtx (V8SImode); - t3 = gen_reg_rtx (V4DImode); - t4 = gen_reg_rtx (V4DImode); - t5 = gen_reg_rtx (V4DImode); - - /* Shuffle the lanes around into - { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */ - emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0), - gen_lowpart (V4DImode, d->op1), - GEN_INT (0x20))); - emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0), - gen_lowpart (V4DImode, d->op1), - GEN_INT (0x31))); - - /* Swap the 2nd and 3rd position in each lane into - { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */ - emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3), - GEN_INT (2 * 4 + 1 * 16 + 3 * 64))); - emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4), - GEN_INT (2 * 4 + 1 * 16 + 3 * 64))); - - /* Now an vpunpck[lh]qdq will produce - { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. 
*/ - if (odd) - t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1), - gen_lowpart (V4DImode, t2)); - else - t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1), - gen_lowpart (V4DImode, t2)); - emit_insn (t3); - emit_move_insn (d->target, gen_lowpart (V8SImode, t5)); - break; - - default: - gcc_unreachable (); - } - - return true; -} - -/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match - extract-even and extract-odd permutations. */ - -static bool -expand_vec_perm_even_odd (struct expand_vec_perm_d *d) -{ - unsigned i, odd, nelt = d->nelt; - - odd = d->perm[0]; - if (odd != 0 && odd != 1) - return false; - - for (i = 1; i < nelt; ++i) - if (d->perm[i] != 2 * i + odd) - return false; - - return expand_vec_perm_even_odd_1 (d, odd); -} - -/* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast - permutations. We assume that expand_vec_perm_1 has already failed. */ - -static bool -expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d) -{ - unsigned elt = d->perm[0], nelt2 = d->nelt / 2; - machine_mode vmode = d->vmode; - unsigned char perm2[4]; - rtx op0 = d->op0, dest; - bool ok; - - switch (vmode) - { - case E_V4DFmode: - case E_V8SFmode: - /* These are special-cased in sse.md so that we can optionally - use the vbroadcast instruction. They expand to two insns - if the input happens to be in a register. */ - gcc_unreachable (); - - case E_V2DFmode: - case E_V2SFmode: - case E_V4SFmode: - case E_V2DImode: - case E_V2SImode: - case E_V4SImode: - /* These are always implementable using standard shuffle patterns. */ - gcc_unreachable (); - - case E_V8HImode: - case E_V16QImode: - /* These can be implemented via interleave. We save one insn by - stopping once we have promoted to V4SImode and then use pshufd. */ - if (d->testing_p) - return true; - do - { - rtx dest; - rtx (*gen) (rtx, rtx, rtx) - = vmode == V16QImode ? gen_vec_interleave_lowv16qi - : gen_vec_interleave_lowv8hi; - - if (elt >= nelt2) - { - gen = vmode == V16QImode ? gen_vec_interleave_highv16qi - : gen_vec_interleave_highv8hi; - elt -= nelt2; - } - nelt2 /= 2; - - dest = gen_reg_rtx (vmode); - emit_insn (gen (dest, op0, op0)); - vmode = get_mode_wider_vector (vmode); - op0 = gen_lowpart (vmode, dest); - } - while (vmode != V4SImode); - - memset (perm2, elt, 4); - dest = gen_reg_rtx (V4SImode); - ok = expand_vselect (dest, op0, perm2, 4, d->testing_p); - gcc_assert (ok); - if (!d->testing_p) - emit_move_insn (d->target, gen_lowpart (d->vmode, dest)); - return true; - - case E_V64QImode: - case E_V32QImode: - case E_V16HImode: - case E_V8SImode: - case E_V4DImode: - /* For AVX2 broadcasts of the first element vpbroadcast* or - vpermq should be used by expand_vec_perm_1. */ - gcc_assert (!TARGET_AVX2 || d->perm[0]); - return false; - - default: - gcc_unreachable (); - } -} - -/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match - broadcast permutations. */ - -static bool -expand_vec_perm_broadcast (struct expand_vec_perm_d *d) -{ - unsigned i, elt, nelt = d->nelt; - - if (!d->one_operand_p) - return false; - - elt = d->perm[0]; - for (i = 1; i < nelt; ++i) - if (d->perm[i] != elt) - return false; - - return expand_vec_perm_broadcast_1 (d); -} - -/* Implement arbitrary permutations of two V64QImode operands - with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. 
*/ -static bool -expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d) -{ - if (!TARGET_AVX512BW || !(d->vmode == V64QImode)) - return false; - - if (d->testing_p) - return true; - - struct expand_vec_perm_d ds[2]; - rtx rperm[128], vperm, target0, target1; - unsigned int i, nelt; - machine_mode vmode; - - nelt = d->nelt; - vmode = V64QImode; - - for (i = 0; i < 2; i++) - { - ds[i] = *d; - ds[i].vmode = V32HImode; - ds[i].nelt = 32; - ds[i].target = gen_reg_rtx (V32HImode); - ds[i].op0 = gen_lowpart (V32HImode, d->op0); - ds[i].op1 = gen_lowpart (V32HImode, d->op1); - } - - /* Prepare permutations such that the first one takes care of - putting the even bytes into the right positions or one higher - positions (ds[0]) and the second one takes care of - putting the odd bytes into the right positions or one below - (ds[1]). */ - - for (i = 0; i < nelt; i++) - { - ds[i & 1].perm[i / 2] = d->perm[i] / 2; - if (i & 1) - { - rperm[i] = constm1_rtx; - rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1)); - } - else - { - rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1)); - rperm[i + 64] = constm1_rtx; - } - } - - bool ok = expand_vec_perm_1 (&ds[0]); - gcc_assert (ok); - ds[0].target = gen_lowpart (V64QImode, ds[0].target); - - ok = expand_vec_perm_1 (&ds[1]); - gcc_assert (ok); - ds[1].target = gen_lowpart (V64QImode, ds[1].target); - - vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm)); - vperm = force_reg (vmode, vperm); - target0 = gen_reg_rtx (V64QImode); - emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm)); - - vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64)); - vperm = force_reg (vmode, vperm); - target1 = gen_reg_rtx (V64QImode); - emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm)); - - emit_insn (gen_iorv64qi3 (d->target, target0, target1)); - return true; -} - -/* Implement arbitrary permutation of two V32QImode and V16QImode operands - with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed - all the shorter instruction sequences. */ - -static bool -expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d) -{ - rtx rperm[4][32], vperm, l[2], h[2], op, m128; - unsigned int i, nelt, eltsz; - bool used[4]; - - if (!TARGET_AVX2 - || d->one_operand_p - || (d->vmode != V32QImode && d->vmode != V16HImode)) - return false; - - if (d->testing_p) - return true; - - nelt = d->nelt; - eltsz = GET_MODE_UNIT_SIZE (d->vmode); - - /* Generate 4 permutation masks. If the required element is within - the same lane, it is shuffled in. If the required element from the - other lane, force a zero by setting bit 7 in the permutation mask. - In the other mask the mask has non-negative elements if element - is requested from the other lane, but also moved to the other lane, - so that the result of vpshufb can have the two V2TImode halves - swapped. */ - m128 = GEN_INT (-128); - for (i = 0; i < 32; ++i) - { - rperm[0][i] = m128; - rperm[1][i] = m128; - rperm[2][i] = m128; - rperm[3][i] = m128; - } - used[0] = false; - used[1] = false; - used[2] = false; - used[3] = false; - for (i = 0; i < nelt; ++i) - { - unsigned j, e = d->perm[i] & (nelt / 2 - 1); - unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz; - unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 
1 : 0); - - for (j = 0; j < eltsz; ++j) - rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j); - used[which] = true; - } - - for (i = 0; i < 2; ++i) - { - if (!used[2 * i + 1]) - { - h[i] = NULL_RTX; - continue; - } - vperm = gen_rtx_CONST_VECTOR (V32QImode, - gen_rtvec_v (32, rperm[2 * i + 1])); - vperm = force_reg (V32QImode, vperm); - h[i] = gen_reg_rtx (V32QImode); - op = gen_lowpart (V32QImode, i ? d->op1 : d->op0); - emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm)); - } - - /* Swap the 128-byte lanes of h[X]. */ - for (i = 0; i < 2; ++i) - { - if (h[i] == NULL_RTX) - continue; - op = gen_reg_rtx (V4DImode); - emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]), - const2_rtx, GEN_INT (3), const0_rtx, - const1_rtx)); - h[i] = gen_lowpart (V32QImode, op); - } - - for (i = 0; i < 2; ++i) - { - if (!used[2 * i]) - { - l[i] = NULL_RTX; - continue; - } - vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i])); - vperm = force_reg (V32QImode, vperm); - l[i] = gen_reg_rtx (V32QImode); - op = gen_lowpart (V32QImode, i ? d->op1 : d->op0); - emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm)); - } - - for (i = 0; i < 2; ++i) - { - if (h[i] && l[i]) - { - op = gen_reg_rtx (V32QImode); - emit_insn (gen_iorv32qi3 (op, l[i], h[i])); - l[i] = op; - } - else if (h[i]) - l[i] = h[i]; - } - - gcc_assert (l[0] && l[1]); - op = d->target; - if (d->vmode != V32QImode) - op = gen_reg_rtx (V32QImode); - emit_insn (gen_iorv32qi3 (op, l[0], l[1])); - if (op != d->target) - emit_move_insn (d->target, gen_lowpart (d->vmode, op)); - return true; -} - -/* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits - taken care of, perform the expansion in D and return true on success. */ - -static bool -ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) -{ - /* Try a single instruction expansion. */ - if (expand_vec_perm_1 (d)) - return true; - - /* Try sequences of two instructions. */ - - if (expand_vec_perm_pshuflw_pshufhw (d)) - return true; - - if (expand_vec_perm_palignr (d, false)) - return true; - - if (expand_vec_perm_interleave2 (d)) - return true; - - if (expand_vec_perm_broadcast (d)) - return true; - - if (expand_vec_perm_vpermq_perm_1 (d)) - return true; - - if (expand_vec_perm_vperm2f128 (d)) - return true; - - if (expand_vec_perm_pblendv (d)) - return true; - - /* Try sequences of three instructions. */ - - if (expand_vec_perm_even_odd_pack (d)) - return true; - - if (expand_vec_perm_2vperm2f128_vshuf (d)) - return true; - - if (expand_vec_perm_pshufb2 (d)) - return true; - - if (expand_vec_perm_interleave3 (d)) - return true; - - if (expand_vec_perm_vperm2f128_vblend (d)) - return true; - - /* Try sequences of four instructions. */ - - if (expand_vec_perm_even_odd_trunc (d)) - return true; - if (expand_vec_perm_vpshufb2_vpermq (d)) - return true; - - if (expand_vec_perm_vpshufb2_vpermq_even_odd (d)) - return true; - - if (expand_vec_perm_vpermt2_vpshub2 (d)) - return true; - - /* ??? Look for narrow permutations whose element orderings would - allow the promotion to a wider mode. */ - - /* ??? Look for sequences of interleave or a wider permute that place - the data into the correct lanes for a half-vector shuffle like - pshuf[lh]w or vpermilps. */ - - /* ??? Look for sequences of interleave that produce the desired results. - The combinatorics of punpck[lh] get pretty ugly... */ - - if (expand_vec_perm_even_odd (d)) - return true; - - /* Even longer sequences. 
*/ - if (expand_vec_perm_vpshufb4_vpermq2 (d)) - return true; - - /* See if we can get the same permutation in different vector integer - mode. */ - struct expand_vec_perm_d nd; - if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd)) - { - if (!d->testing_p) - emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target)); - return true; - } - - /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */ - if (expand_vec_perm2_vperm2f128_vblend (d)) - return true; - - return false; -} - -/* If a permutation only uses one operand, make it clear. Returns true - if the permutation references both operands. */ - -static bool -canonicalize_perm (struct expand_vec_perm_d *d) -{ - int i, which, nelt = d->nelt; - - for (i = which = 0; i < nelt; ++i) - which |= (d->perm[i] < nelt ? 1 : 2); - - d->one_operand_p = true; - switch (which) - { - default: - gcc_unreachable(); - - case 3: - if (!rtx_equal_p (d->op0, d->op1)) - { - d->one_operand_p = false; - break; - } - /* The elements of PERM do not suggest that only the first operand - is used, but both operands are identical. Allow easier matching - of the permutation by folding the permutation into the single - input vector. */ - /* FALLTHRU */ - - case 2: - for (i = 0; i < nelt; ++i) - d->perm[i] &= nelt - 1; - d->op0 = d->op1; - break; - - case 1: - d->op1 = d->op0; - break; - } - - return (which == 3); -} - -/* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */ - -bool -ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, - rtx op1, const vec_perm_indices &sel) -{ - struct expand_vec_perm_d d; - unsigned char perm[MAX_VECT_LEN]; - unsigned int i, nelt, which; - bool two_args; - - d.target = target; - d.op0 = op0; - d.op1 = op1; - - d.vmode = vmode; - gcc_assert (VECTOR_MODE_P (d.vmode)); - d.nelt = nelt = GET_MODE_NUNITS (d.vmode); - d.testing_p = !target; - - gcc_assert (sel.length () == nelt); - gcc_checking_assert (sizeof (d.perm) == sizeof (perm)); - - /* Given sufficient ISA support we can just return true here - for selected vector modes. */ - switch (d.vmode) - { - case E_V16SFmode: - case E_V16SImode: - case E_V8DImode: - case E_V8DFmode: - if (!TARGET_AVX512F) - return false; - /* All implementable with a single vperm[it]2 insn. */ - if (d.testing_p) - return true; - break; - case E_V32HImode: - if (!TARGET_AVX512BW) - return false; - if (d.testing_p) - /* All implementable with a single vperm[it]2 insn. */ - return true; - break; - case E_V64QImode: - if (!TARGET_AVX512BW) - return false; - if (d.testing_p) - /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */ - return true; - break; - case E_V8SImode: - case E_V8SFmode: - case E_V4DFmode: - case E_V4DImode: - if (!TARGET_AVX) - return false; - if (d.testing_p && TARGET_AVX512VL) - /* All implementable with a single vperm[it]2 insn. */ - return true; - break; - case E_V16HImode: - if (!TARGET_SSE2) - return false; - if (d.testing_p && TARGET_AVX2) - /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */ - return true; - break; - case E_V32QImode: - if (!TARGET_SSE2) - return false; - if (d.testing_p && TARGET_AVX2) - /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */ - return true; - break; - case E_V8HImode: - case E_V16QImode: - if (!TARGET_SSE2) - return false; - /* Fall through. */ - case E_V4SImode: - case E_V4SFmode: - if (!TARGET_SSE) - return false; - /* All implementable with a single vpperm insn. */ - if (d.testing_p && TARGET_XOP) - return true; - /* All implementable with 2 pshufb + 1 ior. 
*/ - if (d.testing_p && TARGET_SSSE3) - return true; - break; - case E_V2SFmode: - case E_V2SImode: - case E_V4HImode: - if (!TARGET_MMX_WITH_SSE) - return false; - break; - case E_V2DImode: - case E_V2DFmode: - if (!TARGET_SSE) - return false; - /* All implementable with shufpd or unpck[lh]pd. */ - if (d.testing_p) - return true; - break; - default: - return false; - } - - for (i = which = 0; i < nelt; ++i) - { - unsigned char e = sel[i]; - gcc_assert (e < 2 * nelt); - d.perm[i] = e; - perm[i] = e; - which |= (e < nelt ? 1 : 2); - } - - if (d.testing_p) - { - /* For all elements from second vector, fold the elements to first. */ - if (which == 2) - for (i = 0; i < nelt; ++i) - d.perm[i] -= nelt; - - /* Check whether the mask can be applied to the vector type. */ - d.one_operand_p = (which != 3); - - /* Implementable with shufps or pshufd. */ - if (d.one_operand_p - && (d.vmode == V4SFmode || d.vmode == V2SFmode - || d.vmode == V4SImode || d.vmode == V2SImode)) - return true; - - /* Otherwise we have to go through the motions and see if we can - figure out how to generate the requested permutation. */ - d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1); - d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2); - if (!d.one_operand_p) - d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3); - - start_sequence (); - bool ret = ix86_expand_vec_perm_const_1 (&d); - end_sequence (); - - return ret; - } - - two_args = canonicalize_perm (&d); - - if (ix86_expand_vec_perm_const_1 (&d)) - return true; - - /* If the selector says both arguments are needed, but the operands are the - same, the above tried to expand with one_operand_p and flattened selector. - If that didn't work, retry without one_operand_p; we succeeded with that - during testing. */ - if (two_args && d.one_operand_p) - { - d.one_operand_p = false; - memcpy (d.perm, perm, sizeof (perm)); - return ix86_expand_vec_perm_const_1 (&d); - } - - return false; -} - -void -ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd) -{ - struct expand_vec_perm_d d; - unsigned i, nelt; - - d.target = targ; - d.op0 = op0; - d.op1 = op1; - d.vmode = GET_MODE (targ); - d.nelt = nelt = GET_MODE_NUNITS (d.vmode); - d.one_operand_p = false; - d.testing_p = false; - - for (i = 0; i < nelt; ++i) - d.perm[i] = i * 2 + odd; - - /* We'll either be able to implement the permutation directly... */ - if (expand_vec_perm_1 (&d)) - return; - - /* ... or we use the special-case patterns. */ - expand_vec_perm_even_odd_1 (&d, odd); -} - -static void -ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p) -{ - struct expand_vec_perm_d d; - unsigned i, nelt, base; - bool ok; - - d.target = targ; - d.op0 = op0; - d.op1 = op1; - d.vmode = GET_MODE (targ); - d.nelt = nelt = GET_MODE_NUNITS (d.vmode); - d.one_operand_p = false; - d.testing_p = false; - - base = high_p ? nelt / 2 : 0; - for (i = 0; i < nelt / 2; ++i) - { - d.perm[i * 2] = i + base; - d.perm[i * 2 + 1] = i + base + nelt; - } - - /* Note that for AVX this isn't one instruction. */ - ok = ix86_expand_vec_perm_const_1 (&d); - gcc_assert (ok); -} - - -/* Expand a vector operation CODE for a V*QImode in terms of the - same operation on V*HImode. 
*/ - -void -ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) -{ - machine_mode qimode = GET_MODE (dest); - machine_mode himode; - rtx (*gen_il) (rtx, rtx, rtx); - rtx (*gen_ih) (rtx, rtx, rtx); - rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h; - struct expand_vec_perm_d d; - bool ok, full_interleave; - bool uns_p = false; - int i; - - switch (qimode) - { - case E_V16QImode: - himode = V8HImode; - gen_il = gen_vec_interleave_lowv16qi; - gen_ih = gen_vec_interleave_highv16qi; - break; - case E_V32QImode: - himode = V16HImode; - gen_il = gen_avx2_interleave_lowv32qi; - gen_ih = gen_avx2_interleave_highv32qi; - break; - case E_V64QImode: - himode = V32HImode; - gen_il = gen_avx512bw_interleave_lowv64qi; - gen_ih = gen_avx512bw_interleave_highv64qi; - break; - default: - gcc_unreachable (); - } - - op2_l = op2_h = op2; - switch (code) - { - case MULT: - /* Unpack data such that we've got a source byte in each low byte of - each word. We don't care what goes into the high byte of each word. - Rather than trying to get zero in there, most convenient is to let - it be a copy of the low byte. */ - op2_l = gen_reg_rtx (qimode); - op2_h = gen_reg_rtx (qimode); - emit_insn (gen_il (op2_l, op2, op2)); - emit_insn (gen_ih (op2_h, op2, op2)); - - op1_l = gen_reg_rtx (qimode); - op1_h = gen_reg_rtx (qimode); - emit_insn (gen_il (op1_l, op1, op1)); - emit_insn (gen_ih (op1_h, op1, op1)); - full_interleave = qimode == V16QImode; - break; - - case ASHIFT: - case LSHIFTRT: - uns_p = true; - /* FALLTHRU */ - case ASHIFTRT: - op1_l = gen_reg_rtx (himode); - op1_h = gen_reg_rtx (himode); - ix86_expand_sse_unpack (op1_l, op1, uns_p, false); - ix86_expand_sse_unpack (op1_h, op1, uns_p, true); - full_interleave = true; - break; - default: - gcc_unreachable (); - } - - /* Perform the operation. */ - res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX, - 1, OPTAB_DIRECT); - res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX, - 1, OPTAB_DIRECT); - gcc_assert (res_l && res_h); - - /* Merge the data back into the right place. */ - d.target = dest; - d.op0 = gen_lowpart (qimode, res_l); - d.op1 = gen_lowpart (qimode, res_h); - d.vmode = qimode; - d.nelt = GET_MODE_NUNITS (qimode); - d.one_operand_p = false; - d.testing_p = false; - - if (full_interleave) - { - /* For SSE2, we used an full interleave, so the desired - results are in the even elements. */ - for (i = 0; i < d.nelt; ++i) - d.perm[i] = i * 2; - } - else - { - /* For AVX, the interleave used above was not cross-lane. So the - extraction is evens but with the second and third quarter swapped. - Happily, that is even one insn shorter than even extraction. - For AVX512BW we have 4 lanes. We extract evens from within a lane, - always first from the first and then from the second source operand, - the index bits above the low 4 bits remains the same. - Thus, for d.nelt == 32 we want permutation - 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62 - and for d.nelt == 64 we want permutation - 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94, - 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */ - for (i = 0; i < d.nelt; ++i) - d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15); - } - - ok = ix86_expand_vec_perm_const_1 (&d); - gcc_assert (ok); - - set_unique_reg_note (get_last_insn (), REG_EQUAL, - gen_rtx_fmt_ee (code, qimode, op1, op2)); -} - -/* Helper function of ix86_expand_mul_widen_evenodd. 
Return true - if op is CONST_VECTOR with all odd elements equal to their - preceding element. */ - -static bool -const_vector_equal_evenodd_p (rtx op) -{ - machine_mode mode = GET_MODE (op); - int i, nunits = GET_MODE_NUNITS (mode); - if (GET_CODE (op) != CONST_VECTOR - || nunits != CONST_VECTOR_NUNITS (op)) - return false; - for (i = 0; i < nunits; i += 2) - if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1)) - return false; - return true; -} - -void -ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2, - bool uns_p, bool odd_p) -{ - machine_mode mode = GET_MODE (op1); - machine_mode wmode = GET_MODE (dest); - rtx x; - rtx orig_op1 = op1, orig_op2 = op2; - - if (!nonimmediate_operand (op1, mode)) - op1 = force_reg (mode, op1); - if (!nonimmediate_operand (op2, mode)) - op2 = force_reg (mode, op2); - - /* We only play even/odd games with vectors of SImode. */ - gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode); - - /* If we're looking for the odd results, shift those members down to - the even slots. For some cpus this is faster than a PSHUFD. */ - if (odd_p) - { - /* For XOP use vpmacsdqh, but only for smult, as it is only - signed. */ - if (TARGET_XOP && mode == V4SImode && !uns_p) - { - x = force_reg (wmode, CONST0_RTX (wmode)); - emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x)); - return; - } - - x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode)); - if (!const_vector_equal_evenodd_p (orig_op1)) - op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1), - x, NULL, 1, OPTAB_DIRECT); - if (!const_vector_equal_evenodd_p (orig_op2)) - op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2), - x, NULL, 1, OPTAB_DIRECT); - op1 = gen_lowpart (mode, op1); - op2 = gen_lowpart (mode, op2); - } - - if (mode == V16SImode) - { - if (uns_p) - x = gen_vec_widen_umult_even_v16si (dest, op1, op2); - else - x = gen_vec_widen_smult_even_v16si (dest, op1, op2); - } - else if (mode == V8SImode) - { - if (uns_p) - x = gen_vec_widen_umult_even_v8si (dest, op1, op2); - else - x = gen_vec_widen_smult_even_v8si (dest, op1, op2); - } - else if (uns_p) - x = gen_vec_widen_umult_even_v4si (dest, op1, op2); - else if (TARGET_SSE4_1) - x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2); - else - { - rtx s1, s2, t0, t1, t2; - - /* The easiest way to implement this without PMULDQ is to go through - the motions as if we are performing a full 64-bit multiply. With - the exception that we need to do less shuffling of the elements. */ - - /* Compute the sign-extension, aka highparts, of the two operands. */ - s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode), - op1, pc_rtx, pc_rtx); - s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode), - op2, pc_rtx, pc_rtx); - - /* Multiply LO(A) * HI(B), and vice-versa. */ - t1 = gen_reg_rtx (wmode); - t2 = gen_reg_rtx (wmode); - emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2)); - emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1)); - - /* Multiply LO(A) * LO(B). */ - t0 = gen_reg_rtx (wmode); - emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2)); - - /* Combine and shift the highparts into place. */ - t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT); - t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1, - 1, OPTAB_DIRECT); - - /* Combine high and low parts. 
*/ - force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT); - return; - } - emit_insn (x); -} - -void -ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2, - bool uns_p, bool high_p) -{ - machine_mode wmode = GET_MODE (dest); - machine_mode mode = GET_MODE (op1); - rtx t1, t2, t3, t4, mask; - - switch (mode) - { - case E_V4SImode: - t1 = gen_reg_rtx (mode); - t2 = gen_reg_rtx (mode); - if (TARGET_XOP && !uns_p) - { - /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case, - shuffle the elements once so that all elements are in the right - place for immediate use: { A C B D }. */ - emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx, - const1_rtx, GEN_INT (3))); - emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx, - const1_rtx, GEN_INT (3))); - } - else - { - /* Put the elements into place for the multiply. */ - ix86_expand_vec_interleave (t1, op1, op1, high_p); - ix86_expand_vec_interleave (t2, op2, op2, high_p); - high_p = false; - } - ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p); - break; - - case E_V8SImode: - /* Shuffle the elements between the lanes. After this we - have { A B E F | C D G H } for each operand. */ - t1 = gen_reg_rtx (V4DImode); - t2 = gen_reg_rtx (V4DImode); - emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1), - const0_rtx, const2_rtx, - const1_rtx, GEN_INT (3))); - emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2), - const0_rtx, const2_rtx, - const1_rtx, GEN_INT (3))); - - /* Shuffle the elements within the lanes. After this we - have { A A B B | C C D D } or { E E F F | G G H H }. */ - t3 = gen_reg_rtx (V8SImode); - t4 = gen_reg_rtx (V8SImode); - mask = GEN_INT (high_p - ? 2 + (2 << 2) + (3 << 4) + (3 << 6) - : 0 + (0 << 2) + (1 << 4) + (1 << 6)); - emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask)); - emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask)); - - ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false); - break; - - case E_V8HImode: - case E_V16HImode: - t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX, - uns_p, OPTAB_DIRECT); - t2 = expand_binop (mode, - uns_p ? umul_highpart_optab : smul_highpart_optab, - op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT); - gcc_assert (t1 && t2); - - t3 = gen_reg_rtx (mode); - ix86_expand_vec_interleave (t3, t1, t2, high_p); - emit_move_insn (dest, gen_lowpart (wmode, t3)); - break; - - case E_V16QImode: - case E_V32QImode: - case E_V32HImode: - case E_V16SImode: - case E_V64QImode: - t1 = gen_reg_rtx (wmode); - t2 = gen_reg_rtx (wmode); - ix86_expand_sse_unpack (t1, op1, uns_p, high_p); - ix86_expand_sse_unpack (t2, op2, uns_p, high_p); - - emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2))); - break; - - default: - gcc_unreachable (); - } -} - -void -ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2) -{ - rtx res_1, res_2, res_3, res_4; - - res_1 = gen_reg_rtx (V4SImode); - res_2 = gen_reg_rtx (V4SImode); - res_3 = gen_reg_rtx (V2DImode); - res_4 = gen_reg_rtx (V2DImode); - ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false); - ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true); - - /* Move the results in element 2 down to element 1; we don't care - what goes in elements 2 and 3. Then we can merge the parts - back together with an interleave. - - Note that two other sequences were tried: - (1) Use interleaves at the start instead of psrldq, which allows - us to use a single shufps to merge things back at the end. 
- (2) Use shufps here to combine the two vectors, then pshufd to - put the elements in the correct order. - In both cases the cost of the reformatting stall was too high - and the overall sequence slower. */ - - emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3), - const0_rtx, const2_rtx, - const0_rtx, const0_rtx)); - emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4), - const0_rtx, const2_rtx, - const0_rtx, const0_rtx)); - res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2)); - - set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2)); -} - -void -ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2) -{ - machine_mode mode = GET_MODE (op0); - rtx t1, t2, t3, t4, t5, t6; - - if (TARGET_AVX512DQ && mode == V8DImode) - emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2)); - else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode) - emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2)); - else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode) - emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2)); - else if (TARGET_XOP && mode == V2DImode) - { - /* op1: A,B,C,D, op2: E,F,G,H */ - op1 = gen_lowpart (V4SImode, op1); - op2 = gen_lowpart (V4SImode, op2); - - t1 = gen_reg_rtx (V4SImode); - t2 = gen_reg_rtx (V4SImode); - t3 = gen_reg_rtx (V2DImode); - t4 = gen_reg_rtx (V2DImode); - - /* t1: B,A,D,C */ - emit_insn (gen_sse2_pshufd_1 (t1, op1, - GEN_INT (1), - GEN_INT (0), - GEN_INT (3), - GEN_INT (2))); - - /* t2: (B*E),(A*F),(D*G),(C*H) */ - emit_insn (gen_mulv4si3 (t2, t1, op2)); - - /* t3: (B*E)+(A*F), (D*G)+(C*H) */ - emit_insn (gen_xop_phadddq (t3, t2)); - - /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */ - emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32))); - - /* Multiply lower parts and add all */ - t5 = gen_reg_rtx (V2DImode); - emit_insn (gen_vec_widen_umult_even_v4si (t5, - gen_lowpart (V4SImode, op1), - gen_lowpart (V4SImode, op2))); - force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT); - } - else - { - machine_mode nmode; - rtx (*umul) (rtx, rtx, rtx); - - if (mode == V2DImode) - { - umul = gen_vec_widen_umult_even_v4si; - nmode = V4SImode; - } - else if (mode == V4DImode) - { - umul = gen_vec_widen_umult_even_v8si; - nmode = V8SImode; - } - else if (mode == V8DImode) - { - umul = gen_vec_widen_umult_even_v16si; - nmode = V16SImode; - } - else - gcc_unreachable (); - - - /* Multiply low parts. */ - t1 = gen_reg_rtx (mode); - emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2))); - - /* Shift input vectors right 32 bits so we can multiply high parts. */ - t6 = GEN_INT (32); - t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT); - t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT); - - /* Multiply high parts by low parts. */ - t4 = gen_reg_rtx (mode); - t5 = gen_reg_rtx (mode); - emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2))); - emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1))); - - /* Combine and shift the highparts back. */ - t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT); - t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT); - - /* Combine high and low parts. */ - force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT); - } - - set_unique_reg_note (get_last_insn (), REG_EQUAL, - gen_rtx_MULT (mode, op1, op2)); -} - -/* Return 1 if control tansfer instruction INSN - should be encoded with notrack prefix. 
*/ - -bool -ix86_notrack_prefixed_insn_p (rtx_insn *insn) -{ - if (!insn || !((flag_cf_protection & CF_BRANCH))) - return false; - - if (CALL_P (insn)) - { - rtx call = get_call_rtx_from (insn); - gcc_assert (call != NULL_RTX); - rtx addr = XEXP (call, 0); - - /* Do not emit 'notrack' if it's not an indirect call. */ - if (MEM_P (addr) - && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF) - return false; - else - return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0); - } - - if (JUMP_P (insn) && !flag_cet_switch) - { - rtx target = JUMP_LABEL (insn); - if (target == NULL_RTX || ANY_RETURN_P (target)) - return false; - - /* Check the jump is a switch table. */ - rtx_insn *label = as_a<rtx_insn *> (target); - rtx_insn *table = next_insn (label); - if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table)) - return false; - else - return true; - } - return false; -} - -/* Calculate integer abs() using only SSE2 instructions. */ - -void -ix86_expand_sse2_abs (rtx target, rtx input) -{ - machine_mode mode = GET_MODE (target); - rtx tmp0, tmp1, x; - - switch (mode) - { - case E_V2DImode: - case E_V4DImode: - /* For 64-bit signed integer X, with SSE4.2 use - pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X. - Otherwise handle it similarly to V4SImode, except use 64 as W instead of - 32 and use logical instead of arithmetic right shift (which is - unimplemented) and subtract. */ - if (TARGET_SSE4_2) - { - tmp0 = gen_reg_rtx (mode); - tmp1 = gen_reg_rtx (mode); - emit_move_insn (tmp1, CONST0_RTX (mode)); - if (mode == E_V2DImode) - emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input)); - else - emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input)); - } - else - { - tmp0 = expand_simple_binop (mode, LSHIFTRT, input, - GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - - 1), NULL, 0, OPTAB_DIRECT); - tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false); - } - - tmp1 = expand_simple_binop (mode, XOR, tmp0, input, - NULL, 0, OPTAB_DIRECT); - x = expand_simple_binop (mode, MINUS, tmp1, tmp0, - target, 0, OPTAB_DIRECT); - break; - - case E_V4SImode: - /* For 32-bit signed integer X, the best way to calculate the absolute - value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */ - tmp0 = expand_simple_binop (mode, ASHIFTRT, input, - GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1), - NULL, 0, OPTAB_DIRECT); - tmp1 = expand_simple_binop (mode, XOR, tmp0, input, - NULL, 0, OPTAB_DIRECT); - x = expand_simple_binop (mode, MINUS, tmp1, tmp0, - target, 0, OPTAB_DIRECT); - break; - - case E_V8HImode: - /* For 16-bit signed integer X, the best way to calculate the absolute - value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */ - tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0); - - x = expand_simple_binop (mode, SMAX, tmp0, input, - target, 0, OPTAB_DIRECT); - break; - - case E_V16QImode: - /* For 8-bit signed integer X, the best way to calculate the absolute - value of X is min ((unsigned char) X, (unsigned char) (-X)), - as SSE2 provides the PMINUB insn. */ - tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0); - - x = expand_simple_binop (V16QImode, UMIN, tmp0, input, - target, 0, OPTAB_DIRECT); - break; - - default: - gcc_unreachable (); - } - - if (x != target) - emit_move_insn (target, x); -} - -/* Expand an extract from a vector register through pextr insn. - Return true if successful. 
*/ - -bool -ix86_expand_pextr (rtx *operands) -{ - rtx dst = operands[0]; - rtx src = operands[1]; - - unsigned int size = INTVAL (operands[2]); - unsigned int pos = INTVAL (operands[3]); - - if (SUBREG_P (dst)) - { - /* Reject non-lowpart subregs. */ - if (SUBREG_BYTE (dst) > 0) - return false; - dst = SUBREG_REG (dst); - } - - if (SUBREG_P (src)) - { - pos += SUBREG_BYTE (src) * BITS_PER_UNIT; - src = SUBREG_REG (src); - } - - switch (GET_MODE (src)) - { - case E_V16QImode: - case E_V8HImode: - case E_V4SImode: - case E_V2DImode: - case E_V1TImode: - case E_TImode: - { - machine_mode srcmode, dstmode; - rtx d, pat; - - if (!int_mode_for_size (size, 0).exists (&dstmode)) - return false; - - switch (dstmode) - { - case E_QImode: - if (!TARGET_SSE4_1) - return false; - srcmode = V16QImode; - break; - - case E_HImode: - if (!TARGET_SSE2) - return false; - srcmode = V8HImode; - break; - - case E_SImode: - if (!TARGET_SSE4_1) - return false; - srcmode = V4SImode; - break; - - case E_DImode: - gcc_assert (TARGET_64BIT); - if (!TARGET_SSE4_1) - return false; - srcmode = V2DImode; - break; - - default: - return false; - } - - /* Reject extractions from misaligned positions. */ - if (pos & (size-1)) - return false; - - if (GET_MODE (dst) == dstmode) - d = dst; - else - d = gen_reg_rtx (dstmode); - - /* Construct insn pattern. */ - pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size))); - pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat); - - /* Let the rtl optimizers know about the zero extension performed. */ - if (dstmode == QImode || dstmode == HImode) - { - pat = gen_rtx_ZERO_EXTEND (SImode, pat); - d = gen_lowpart (SImode, d); - } - - emit_insn (gen_rtx_SET (d, pat)); - - if (d != dst) - emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d)); - return true; - } - - default: - return false; - } -} - -/* Expand an insert into a vector register through pinsr insn. - Return true if successful. */ - -bool -ix86_expand_pinsr (rtx *operands) -{ - rtx dst = operands[0]; - rtx src = operands[3]; - - unsigned int size = INTVAL (operands[1]); - unsigned int pos = INTVAL (operands[2]); - - if (SUBREG_P (dst)) - { - pos += SUBREG_BYTE (dst) * BITS_PER_UNIT; - dst = SUBREG_REG (dst); - } - - switch (GET_MODE (dst)) - { - case E_V16QImode: - case E_V8HImode: - case E_V4SImode: - case E_V2DImode: - case E_V1TImode: - case E_TImode: - { - machine_mode srcmode, dstmode; - rtx (*pinsr)(rtx, rtx, rtx, rtx); - rtx d; - - if (!int_mode_for_size (size, 0).exists (&srcmode)) - return false; - - switch (srcmode) - { - case E_QImode: - if (!TARGET_SSE4_1) - return false; - dstmode = V16QImode; - pinsr = gen_sse4_1_pinsrb; - break; - - case E_HImode: - if (!TARGET_SSE2) - return false; - dstmode = V8HImode; - pinsr = gen_sse2_pinsrw; - break; - - case E_SImode: - if (!TARGET_SSE4_1) - return false; - dstmode = V4SImode; - pinsr = gen_sse4_1_pinsrd; - break; - - case E_DImode: - gcc_assert (TARGET_64BIT); - if (!TARGET_SSE4_1) - return false; - dstmode = V2DImode; - pinsr = gen_sse4_1_pinsrq; - break; - - default: - return false; - } - - /* Reject insertions to misaligned positions. 
*/ - if (pos & (size-1)) - return false; - - if (SUBREG_P (src)) - { - unsigned int srcpos = SUBREG_BYTE (src); - - if (srcpos > 0) - { - rtx extr_ops[4]; - - extr_ops[0] = gen_reg_rtx (srcmode); - extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src)); - extr_ops[2] = GEN_INT (size); - extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT); - - if (!ix86_expand_pextr (extr_ops)) - return false; - - src = extr_ops[0]; - } - else - src = gen_lowpart (srcmode, SUBREG_REG (src)); - } - - if (GET_MODE (dst) == dstmode) - d = dst; - else - d = gen_reg_rtx (dstmode); - - emit_insn (pinsr (d, gen_lowpart (dstmode, dst), - gen_lowpart (srcmode, src), - GEN_INT (1 << (pos / size)))); - if (d != dst) - emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d)); - return true; - } - - default: - return false; - } -} - -/* All CPUs prefer to avoid cross-lane operations so perform reductions - upper against lower halves up to SSE reg size. */ - -machine_mode -ix86_split_reduction (machine_mode mode) -{ - /* Reduce lowpart against highpart until we reach SSE reg width to - avoid cross-lane operations. */ - switch (mode) - { - case E_V8DImode: - case E_V4DImode: - return V2DImode; - case E_V16SImode: - case E_V8SImode: - return V4SImode; - case E_V32HImode: - case E_V16HImode: - return V8HImode; - case E_V64QImode: - case E_V32QImode: - return V16QImode; - case E_V16SFmode: - case E_V8SFmode: - return V4SFmode; - case E_V8DFmode: - case E_V4DFmode: - return V2DFmode; - default: - return mode; - } -} - -/* Generate call to __divmoddi4. */ - -void -ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode, - rtx op0, rtx op1, - rtx *quot_p, rtx *rem_p) -{ - rtx rem = assign_386_stack_local (mode, SLOT_TEMP); - - rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL, - mode, op0, mode, op1, mode, - XEXP (rem, 0), Pmode); - *quot_p = quot; - *rem_p = rem; -} - -#include "gt-i386-expand.h" diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c deleted file mode 100644 index b9b764c..0000000 --- a/gcc/config/i386/i386-features.c +++ /dev/null @@ -1,2884 +0,0 @@ -/* Copyright (C) 1988-2020 Free Software Foundation, Inc. - -This file is part of GCC. - -GCC is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 3, or (at your option) -any later version. - -GCC is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with GCC; see the file COPYING3. If not see -<http://www.gnu.org/licenses/>. 
*/ - -#define IN_TARGET_CODE 1 - -#include "config.h" -#include "system.h" -#include "coretypes.h" -#include "backend.h" -#include "rtl.h" -#include "tree.h" -#include "memmodel.h" -#include "gimple.h" -#include "cfghooks.h" -#include "cfgloop.h" -#include "df.h" -#include "tm_p.h" -#include "stringpool.h" -#include "expmed.h" -#include "optabs.h" -#include "regs.h" -#include "emit-rtl.h" -#include "recog.h" -#include "cgraph.h" -#include "diagnostic.h" -#include "cfgbuild.h" -#include "alias.h" -#include "fold-const.h" -#include "attribs.h" -#include "calls.h" -#include "stor-layout.h" -#include "varasm.h" -#include "output.h" -#include "insn-attr.h" -#include "flags.h" -#include "except.h" -#include "explow.h" -#include "expr.h" -#include "cfgrtl.h" -#include "common/common-target.h" -#include "langhooks.h" -#include "reload.h" -#include "gimplify.h" -#include "dwarf2.h" -#include "tm-constrs.h" -#include "cselib.h" -#include "sched-int.h" -#include "opts.h" -#include "tree-pass.h" -#include "context.h" -#include "pass_manager.h" -#include "target-globals.h" -#include "gimple-iterator.h" -#include "tree-vectorizer.h" -#include "shrink-wrap.h" -#include "builtins.h" -#include "rtl-iter.h" -#include "tree-iterator.h" -#include "dbgcnt.h" -#include "case-cfn-macros.h" -#include "dojump.h" -#include "fold-const-call.h" -#include "tree-vrp.h" -#include "tree-ssanames.h" -#include "selftest.h" -#include "selftest-rtl.h" -#include "print-rtl.h" -#include "intl.h" -#include "ifcvt.h" -#include "symbol-summary.h" -#include "ipa-prop.h" -#include "ipa-fnsummary.h" -#include "wide-int-bitmask.h" -#include "tree-vector-builder.h" -#include "debug.h" -#include "dwarf2out.h" -#include "i386-builtins.h" -#include "i386-features.h" - -const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = { - "savms64", - "resms64", - "resms64x", - "savms64f", - "resms64f", - "resms64fx" -}; - -const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = { -/* The below offset values are where each register is stored for the layout - relative to incoming stack pointer. The value of each m_regs[].offset will - be relative to the incoming base pointer (rax or rsi) used by the stub. - - s_instances: 0 1 2 3 - Offset: realigned or aligned + 8 - Register aligned aligned + 8 aligned w/HFP w/HFP */ - XMM15_REG, /* 0x10 0x18 0x10 0x18 */ - XMM14_REG, /* 0x20 0x28 0x20 0x28 */ - XMM13_REG, /* 0x30 0x38 0x30 0x38 */ - XMM12_REG, /* 0x40 0x48 0x40 0x48 */ - XMM11_REG, /* 0x50 0x58 0x50 0x58 */ - XMM10_REG, /* 0x60 0x68 0x60 0x68 */ - XMM9_REG, /* 0x70 0x78 0x70 0x78 */ - XMM8_REG, /* 0x80 0x88 0x80 0x88 */ - XMM7_REG, /* 0x90 0x98 0x90 0x98 */ - XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */ - SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */ - DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */ - BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */ - BP_REG, /* 0xc0 0xc8 N/A N/A */ - R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */ - R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */ - R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */ - R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */ -}; - -/* Instantiate static const values. */ -const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET; -const unsigned xlogue_layout::MIN_REGS; -const unsigned xlogue_layout::MAX_REGS; -const unsigned xlogue_layout::MAX_EXTRA_REGS; -const unsigned xlogue_layout::VARIANT_COUNT; -const unsigned xlogue_layout::STUB_NAME_MAX_LEN; - -/* Initialize xlogue_layout::s_stub_names to zero. */ -char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT] - [STUB_NAME_MAX_LEN]; - -/* Instantiates all xlogue_layout instances. 
*/ -const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = { - xlogue_layout (0, false), - xlogue_layout (8, false), - xlogue_layout (0, true), - xlogue_layout (8, true) -}; - -/* Return an appropriate const instance of xlogue_layout based upon values - in cfun->machine and crtl. */ -const class xlogue_layout & -xlogue_layout::get_instance () -{ - enum xlogue_stub_sets stub_set; - bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in; - - if (stack_realign_fp) - stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN; - else if (frame_pointer_needed) - stub_set = aligned_plus_8 - ? XLOGUE_SET_HFP_ALIGNED_PLUS_8 - : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN; - else - stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED; - - return s_instances[stub_set]; -} - -/* Determine how many clobbered registers can be saved by the stub. - Returns the count of registers the stub will save and restore. */ -unsigned -xlogue_layout::count_stub_managed_regs () -{ - bool hfp = frame_pointer_needed || stack_realign_fp; - unsigned i, count; - unsigned regno; - - for (count = i = MIN_REGS; i < MAX_REGS; ++i) - { - regno = REG_ORDER[i]; - if (regno == BP_REG && hfp) - continue; - if (!ix86_save_reg (regno, false, false)) - break; - ++count; - } - return count; -} - -/* Determine if register REGNO is a stub managed register given the - total COUNT of stub managed registers. */ -bool -xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count) -{ - bool hfp = frame_pointer_needed || stack_realign_fp; - unsigned i; - - for (i = 0; i < count; ++i) - { - gcc_assert (i < MAX_REGS); - if (REG_ORDER[i] == BP_REG && hfp) - ++count; - else if (REG_ORDER[i] == regno) - return true; - } - return false; -} - -/* Constructor for xlogue_layout. */ -xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp) - : m_hfp (hfp) , m_nregs (hfp ? 17 : 18), - m_stack_align_off_in (stack_align_off_in) -{ - HOST_WIDE_INT offset = stack_align_off_in; - unsigned i, j; - - for (i = j = 0; i < MAX_REGS; ++i) - { - unsigned regno = REG_ORDER[i]; - - if (regno == BP_REG && hfp) - continue; - if (SSE_REGNO_P (regno)) - { - offset += 16; - /* Verify that SSE regs are always aligned. */ - gcc_assert (!((stack_align_off_in + offset) & 15)); - } - else - offset += 8; - - m_regs[j].regno = regno; - m_regs[j++].offset = offset - STUB_INDEX_OFFSET; - } - gcc_assert (j == m_nregs); -} - -const char * -xlogue_layout::get_stub_name (enum xlogue_stub stub, - unsigned n_extra_regs) -{ - const int have_avx = TARGET_AVX; - char *name = s_stub_names[!!have_avx][stub][n_extra_regs]; - - /* Lazy init */ - if (!*name) - { - int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u", - (have_avx ? "avx" : "sse"), - STUB_BASE_NAMES[stub], - MIN_REGS + n_extra_regs); - gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN); - } - - return name; -} - -/* Return rtx of a symbol ref for the entry point (based upon - cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */ -rtx -xlogue_layout::get_stub_rtx (enum xlogue_stub stub) -{ - const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs; - gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS); - gcc_assert (stub < XLOGUE_STUB_COUNT); - gcc_assert (crtl->stack_realign_finalized); - - return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs)); -} - -unsigned scalar_chain::max_id = 0; - -namespace { - -/* Initialize new chain. 
*/ - -scalar_chain::scalar_chain (enum machine_mode smode_, enum machine_mode vmode_) -{ - smode = smode_; - vmode = vmode_; - - chain_id = ++max_id; - - if (dump_file) - fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id); - - bitmap_obstack_initialize (NULL); - insns = BITMAP_ALLOC (NULL); - defs = BITMAP_ALLOC (NULL); - defs_conv = BITMAP_ALLOC (NULL); - queue = NULL; -} - -/* Free chain's data. */ - -scalar_chain::~scalar_chain () -{ - BITMAP_FREE (insns); - BITMAP_FREE (defs); - BITMAP_FREE (defs_conv); - bitmap_obstack_release (NULL); -} - -/* Add instruction into chains' queue. */ - -void -scalar_chain::add_to_queue (unsigned insn_uid) -{ - if (bitmap_bit_p (insns, insn_uid) - || bitmap_bit_p (queue, insn_uid)) - return; - - if (dump_file) - fprintf (dump_file, " Adding insn %d into chain's #%d queue\n", - insn_uid, chain_id); - bitmap_set_bit (queue, insn_uid); -} - -general_scalar_chain::general_scalar_chain (enum machine_mode smode_, - enum machine_mode vmode_) - : scalar_chain (smode_, vmode_) -{ - insns_conv = BITMAP_ALLOC (NULL); - n_sse_to_integer = 0; - n_integer_to_sse = 0; -} - -general_scalar_chain::~general_scalar_chain () -{ - BITMAP_FREE (insns_conv); -} - -/* For DImode conversion, mark register defined by DEF as requiring - conversion. */ - -void -general_scalar_chain::mark_dual_mode_def (df_ref def) -{ - gcc_assert (DF_REF_REG_DEF_P (def)); - - /* Record the def/insn pair so we can later efficiently iterate over - the defs to convert on insns not in the chain. */ - bool reg_new = bitmap_set_bit (defs_conv, DF_REF_REGNO (def)); - if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def))) - { - if (!bitmap_set_bit (insns_conv, DF_REF_INSN_UID (def)) - && !reg_new) - return; - n_integer_to_sse++; - } - else - { - if (!reg_new) - return; - n_sse_to_integer++; - } - - if (dump_file) - fprintf (dump_file, - " Mark r%d def in insn %d as requiring both modes in chain #%d\n", - DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id); -} - -/* For TImode conversion, it is unused. */ - -void -timode_scalar_chain::mark_dual_mode_def (df_ref) -{ - gcc_unreachable (); -} - -/* Check REF's chain to add new insns into a queue - and find registers requiring conversion. */ - -void -scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref) -{ - df_link *chain; - - gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)) - || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref))); - add_to_queue (DF_REF_INSN_UID (ref)); - - for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next) - { - unsigned uid = DF_REF_INSN_UID (chain->ref); - - if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref))) - continue; - - if (!DF_REF_REG_MEM_P (chain->ref)) - { - if (bitmap_bit_p (insns, uid)) - continue; - - if (bitmap_bit_p (candidates, uid)) - { - add_to_queue (uid); - continue; - } - } - - if (DF_REF_REG_DEF_P (chain->ref)) - { - if (dump_file) - fprintf (dump_file, " r%d def in insn %d isn't convertible\n", - DF_REF_REGNO (chain->ref), uid); - mark_dual_mode_def (chain->ref); - } - else - { - if (dump_file) - fprintf (dump_file, " r%d use in insn %d isn't convertible\n", - DF_REF_REGNO (chain->ref), uid); - mark_dual_mode_def (ref); - } - } -} - -/* Add instruction into a chain. 
*/ - -void -scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid) -{ - if (bitmap_bit_p (insns, insn_uid)) - return; - - if (dump_file) - fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id); - - bitmap_set_bit (insns, insn_uid); - - rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn; - rtx def_set = single_set (insn); - if (def_set && REG_P (SET_DEST (def_set)) - && !HARD_REGISTER_P (SET_DEST (def_set))) - bitmap_set_bit (defs, REGNO (SET_DEST (def_set))); - - /* ??? The following is quadratic since analyze_register_chain - iterates over all refs to look for dual-mode regs. Instead this - should be done separately for all regs mentioned in the chain once. */ - df_ref ref; - for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref)) - if (!HARD_REGISTER_P (DF_REF_REG (ref))) - analyze_register_chain (candidates, ref); - for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref)) - if (!DF_REF_REG_MEM_P (ref)) - analyze_register_chain (candidates, ref); -} - -/* Build new chain starting from insn INSN_UID recursively - adding all dependent uses and definitions. */ - -void -scalar_chain::build (bitmap candidates, unsigned insn_uid) -{ - queue = BITMAP_ALLOC (NULL); - bitmap_set_bit (queue, insn_uid); - - if (dump_file) - fprintf (dump_file, "Building chain #%d...\n", chain_id); - - while (!bitmap_empty_p (queue)) - { - insn_uid = bitmap_first_set_bit (queue); - bitmap_clear_bit (queue, insn_uid); - bitmap_clear_bit (candidates, insn_uid); - add_insn (candidates, insn_uid); - } - - if (dump_file) - { - fprintf (dump_file, "Collected chain #%d...\n", chain_id); - fprintf (dump_file, " insns: "); - dump_bitmap (dump_file, insns); - if (!bitmap_empty_p (defs_conv)) - { - bitmap_iterator bi; - unsigned id; - const char *comma = ""; - fprintf (dump_file, " defs to convert: "); - EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi) - { - fprintf (dump_file, "%sr%d", comma, id); - comma = ", "; - } - fprintf (dump_file, "\n"); - } - } - - BITMAP_FREE (queue); -} - -/* Return a cost of building a vector costant - instead of using a scalar one. */ - -int -general_scalar_chain::vector_const_cost (rtx exp) -{ - gcc_assert (CONST_INT_P (exp)); - - if (standard_sse_constant_p (exp, vmode)) - return ix86_cost->sse_op; - /* We have separate costs for SImode and DImode, use SImode costs - for smaller modes. */ - return ix86_cost->sse_load[smode == DImode ? 1 : 0]; -} - -/* Compute a gain for chain conversion. */ - -int -general_scalar_chain::compute_convert_gain () -{ - bitmap_iterator bi; - unsigned insn_uid; - int gain = 0; - int cost = 0; - - if (dump_file) - fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id); - - /* SSE costs distinguish between SImode and DImode loads/stores, for - int costs factor in the number of GPRs involved. When supporting - smaller modes than SImode the int load/store costs need to be - adjusted as well. */ - unsigned sse_cost_idx = smode == DImode ? 1 : 0; - unsigned m = smode == DImode ? (TARGET_64BIT ? 
1 : 2) : 1; - - EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi) - { - rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn; - rtx def_set = single_set (insn); - rtx src = SET_SRC (def_set); - rtx dst = SET_DEST (def_set); - int igain = 0; - - if (REG_P (src) && REG_P (dst)) - igain += 2 * m - ix86_cost->xmm_move; - else if (REG_P (src) && MEM_P (dst)) - igain - += m * ix86_cost->int_store[2] - ix86_cost->sse_store[sse_cost_idx]; - else if (MEM_P (src) && REG_P (dst)) - igain += m * ix86_cost->int_load[2] - ix86_cost->sse_load[sse_cost_idx]; - else if (GET_CODE (src) == ASHIFT - || GET_CODE (src) == ASHIFTRT - || GET_CODE (src) == LSHIFTRT) - { - if (m == 2) - { - if (INTVAL (XEXP (src, 1)) >= 32) - igain += ix86_cost->add; - else - igain += ix86_cost->shift_const; - } - - igain += ix86_cost->shift_const - ix86_cost->sse_op; - - if (CONST_INT_P (XEXP (src, 0))) - igain -= vector_const_cost (XEXP (src, 0)); - } - else if (GET_CODE (src) == PLUS - || GET_CODE (src) == MINUS - || GET_CODE (src) == IOR - || GET_CODE (src) == XOR - || GET_CODE (src) == AND) - { - igain += m * ix86_cost->add - ix86_cost->sse_op; - /* Additional gain for andnot for targets without BMI. */ - if (GET_CODE (XEXP (src, 0)) == NOT - && !TARGET_BMI) - igain += m * ix86_cost->add; - - if (CONST_INT_P (XEXP (src, 0))) - igain -= vector_const_cost (XEXP (src, 0)); - if (CONST_INT_P (XEXP (src, 1))) - igain -= vector_const_cost (XEXP (src, 1)); - } - else if (GET_CODE (src) == NEG - || GET_CODE (src) == NOT) - igain += m * ix86_cost->add - ix86_cost->sse_op - COSTS_N_INSNS (1); - else if (GET_CODE (src) == SMAX - || GET_CODE (src) == SMIN - || GET_CODE (src) == UMAX - || GET_CODE (src) == UMIN) - { - /* We do not have any conditional move cost, estimate it as a - reg-reg move. Comparisons are costed as adds. */ - igain += m * (COSTS_N_INSNS (2) + ix86_cost->add); - /* Integer SSE ops are all costed the same. */ - igain -= ix86_cost->sse_op; - } - else if (GET_CODE (src) == COMPARE) - { - /* Assume comparison cost is the same. */ - } - else if (CONST_INT_P (src)) - { - if (REG_P (dst)) - /* DImode can be immediate for TARGET_64BIT and SImode always. */ - igain += m * COSTS_N_INSNS (1); - else if (MEM_P (dst)) - igain += (m * ix86_cost->int_store[2] - - ix86_cost->sse_store[sse_cost_idx]); - igain -= vector_const_cost (src); - } - else - gcc_unreachable (); - - if (igain != 0 && dump_file) - { - fprintf (dump_file, " Instruction gain %d for ", igain); - dump_insn_slim (dump_file, insn); - } - gain += igain; - } - - if (dump_file) - fprintf (dump_file, " Instruction conversion gain: %d\n", gain); - - /* Cost the integer to sse and sse to integer moves. */ - cost += n_sse_to_integer * ix86_cost->sse_to_integer; - /* ??? integer_to_sse but we only have that in the RA cost table. - Assume sse_to_integer/integer_to_sse are the same which they - are at the moment. */ - cost += n_integer_to_sse * ix86_cost->sse_to_integer; - - if (dump_file) - fprintf (dump_file, " Registers conversion cost: %d\n", cost); - - gain -= cost; - - if (dump_file) - fprintf (dump_file, " Total gain: %d\n", gain); - - return gain; -} - -/* Insert generated conversion instruction sequence INSNS - after instruction AFTER. New BB may be required in case - instruction has EH region attached. 
*/ - -void -scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after) -{ - if (!control_flow_insn_p (after)) - { - emit_insn_after (insns, after); - return; - } - - basic_block bb = BLOCK_FOR_INSN (after); - edge e = find_fallthru_edge (bb->succs); - gcc_assert (e); - - basic_block new_bb = split_edge (e); - emit_insn_after (insns, BB_HEAD (new_bb)); -} - -} // anon namespace - -/* Generate the canonical SET_SRC to move GPR to a VMODE vector register, - zeroing the upper parts. */ - -static rtx -gen_gpr_to_xmm_move_src (enum machine_mode vmode, rtx gpr) -{ - switch (GET_MODE_NUNITS (vmode)) - { - case 1: - /* We are not using this case currently. */ - gcc_unreachable (); - case 2: - return gen_rtx_VEC_CONCAT (vmode, gpr, - CONST0_RTX (GET_MODE_INNER (vmode))); - default: - return gen_rtx_VEC_MERGE (vmode, gen_rtx_VEC_DUPLICATE (vmode, gpr), - CONST0_RTX (vmode), GEN_INT (HOST_WIDE_INT_1U)); - } -} - -/* Make vector copies for all register REGNO definitions - and replace its uses in a chain. */ - -void -general_scalar_chain::make_vector_copies (rtx_insn *insn, rtx reg) -{ - rtx vreg = *defs_map.get (reg); - - start_sequence (); - if (!TARGET_INTER_UNIT_MOVES_TO_VEC) - { - rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP); - if (smode == DImode && !TARGET_64BIT) - { - emit_move_insn (adjust_address (tmp, SImode, 0), - gen_rtx_SUBREG (SImode, reg, 0)); - emit_move_insn (adjust_address (tmp, SImode, 4), - gen_rtx_SUBREG (SImode, reg, 4)); - } - else - emit_move_insn (copy_rtx (tmp), reg); - emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0), - gen_gpr_to_xmm_move_src (vmode, tmp))); - } - else if (!TARGET_64BIT && smode == DImode) - { - if (TARGET_SSE4_1) - { - emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0), - CONST0_RTX (V4SImode), - gen_rtx_SUBREG (SImode, reg, 0))); - emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0), - gen_rtx_SUBREG (V4SImode, vreg, 0), - gen_rtx_SUBREG (SImode, reg, 4), - GEN_INT (2))); - } - else - { - rtx tmp = gen_reg_rtx (DImode); - emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0), - CONST0_RTX (V4SImode), - gen_rtx_SUBREG (SImode, reg, 0))); - emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0), - CONST0_RTX (V4SImode), - gen_rtx_SUBREG (SImode, reg, 4))); - emit_insn (gen_vec_interleave_lowv4si - (gen_rtx_SUBREG (V4SImode, vreg, 0), - gen_rtx_SUBREG (V4SImode, vreg, 0), - gen_rtx_SUBREG (V4SImode, tmp, 0))); - } - } - else - emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0), - gen_gpr_to_xmm_move_src (vmode, reg))); - rtx_insn *seq = get_insns (); - end_sequence (); - emit_conversion_insns (seq, insn); - - if (dump_file) - fprintf (dump_file, - " Copied r%d to a vector register r%d for insn %d\n", - REGNO (reg), REGNO (vreg), INSN_UID (insn)); -} - -/* Copy the definition SRC of INSN inside the chain to DST for - scalar uses outside of the chain. 
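gen_gpr_to_xmm_move_src above builds the SET_SRC of a GPR-to-XMM move that leaves the upper vector elements zeroed, either as a vec_concat with zero (two-element modes) or as a vec_merge of a vec_duplicate with the zero vector. At the instruction level this is the plain zero-extending movd/movq load, shown here with intrinsics purely as an illustration (the pass emits the RTL directly, not intrinsics):

#include <emmintrin.h>

/* Zero-extending GPR -> XMM moves corresponding to the forms built above.  */
__m128i load_si32 (int x)           { return _mm_cvtsi32_si128 (x); }  /* movd */
#ifdef __x86_64__
__m128i load_si64 (long long x)     { return _mm_cvtsi64_si128 (x); }  /* movq */
#endif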
*/ - -void -general_scalar_chain::convert_reg (rtx_insn *insn, rtx dst, rtx src) -{ - start_sequence (); - if (!TARGET_INTER_UNIT_MOVES_FROM_VEC) - { - rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP); - emit_move_insn (tmp, src); - if (!TARGET_64BIT && smode == DImode) - { - emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0), - adjust_address (tmp, SImode, 0)); - emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4), - adjust_address (tmp, SImode, 4)); - } - else - emit_move_insn (dst, copy_rtx (tmp)); - } - else if (!TARGET_64BIT && smode == DImode) - { - if (TARGET_SSE4_1) - { - rtx tmp = gen_rtx_PARALLEL (VOIDmode, - gen_rtvec (1, const0_rtx)); - emit_insn - (gen_rtx_SET - (gen_rtx_SUBREG (SImode, dst, 0), - gen_rtx_VEC_SELECT (SImode, - gen_rtx_SUBREG (V4SImode, src, 0), - tmp))); - - tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx)); - emit_insn - (gen_rtx_SET - (gen_rtx_SUBREG (SImode, dst, 4), - gen_rtx_VEC_SELECT (SImode, - gen_rtx_SUBREG (V4SImode, src, 0), - tmp))); - } - else - { - rtx vcopy = gen_reg_rtx (V2DImode); - emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, src, 0)); - emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0), - gen_rtx_SUBREG (SImode, vcopy, 0)); - emit_move_insn (vcopy, - gen_rtx_LSHIFTRT (V2DImode, - vcopy, GEN_INT (32))); - emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4), - gen_rtx_SUBREG (SImode, vcopy, 0)); - } - } - else - emit_move_insn (dst, src); - - rtx_insn *seq = get_insns (); - end_sequence (); - emit_conversion_insns (seq, insn); - - if (dump_file) - fprintf (dump_file, - " Copied r%d to a scalar register r%d for insn %d\n", - REGNO (src), REGNO (dst), INSN_UID (insn)); -} - -/* Convert operand OP in INSN. We should handle - memory operands and uninitialized registers. - All other register uses are converted during - registers conversion. */ - -void -general_scalar_chain::convert_op (rtx *op, rtx_insn *insn) -{ - *op = copy_rtx_if_shared (*op); - - if (GET_CODE (*op) == NOT) - { - convert_op (&XEXP (*op, 0), insn); - PUT_MODE (*op, vmode); - } - else if (MEM_P (*op)) - { - rtx tmp = gen_reg_rtx (GET_MODE (*op)); - - /* Handle movabs. */ - if (!memory_operand (*op, GET_MODE (*op))) - { - rtx tmp2 = gen_reg_rtx (GET_MODE (*op)); - - emit_insn_before (gen_rtx_SET (tmp2, *op), insn); - *op = tmp2; - } - - emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0), - gen_gpr_to_xmm_move_src (vmode, *op)), - insn); - *op = gen_rtx_SUBREG (vmode, tmp, 0); - - if (dump_file) - fprintf (dump_file, " Preloading operand for insn %d into r%d\n", - INSN_UID (insn), REGNO (tmp)); - } - else if (REG_P (*op)) - { - *op = gen_rtx_SUBREG (vmode, *op, 0); - } - else if (CONST_INT_P (*op)) - { - rtx vec_cst; - rtx tmp = gen_rtx_SUBREG (vmode, gen_reg_rtx (smode), 0); - - /* Prefer all ones vector in case of -1. */ - if (constm1_operand (*op, GET_MODE (*op))) - vec_cst = CONSTM1_RTX (vmode); - else - { - unsigned n = GET_MODE_NUNITS (vmode); - rtx *v = XALLOCAVEC (rtx, n); - v[0] = *op; - for (unsigned i = 1; i < n; ++i) - v[i] = const0_rtx; - vec_cst = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (n, v)); - } - - if (!standard_sse_constant_p (vec_cst, vmode)) - { - start_sequence (); - vec_cst = validize_mem (force_const_mem (vmode, vec_cst)); - rtx_insn *seq = get_insns (); - end_sequence (); - emit_insn_before (seq, insn); - } - - emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn); - *op = tmp; - } - else - { - gcc_assert (SUBREG_P (*op)); - gcc_assert (GET_MODE (*op) == vmode); - } -} - -/* Convert INSN to vector mode. 
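For a constant operand the conversion above keeps the scalar value in element zero and pads the remaining lanes with zeros, except that -1 is replaced by the all-ones vector so it stays a standard SSE constant. For instance, in a V4SImode chain:

/* Illustration: the operand 5 becomes { 5, 0, 0, 0 } and is loaded from the
   constant pool; -1 becomes { -1, -1, -1, -1 }, which standard_sse_constant_p
   accepts and which is typically materialised with pcmpeqd.  */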
*/ - -void -general_scalar_chain::convert_insn (rtx_insn *insn) -{ - /* Generate copies for out-of-chain uses of defs and adjust debug uses. */ - for (df_ref ref = DF_INSN_DEFS (insn); ref; ref = DF_REF_NEXT_LOC (ref)) - if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref))) - { - df_link *use; - for (use = DF_REF_CHAIN (ref); use; use = use->next) - if (NONDEBUG_INSN_P (DF_REF_INSN (use->ref)) - && (DF_REF_REG_MEM_P (use->ref) - || !bitmap_bit_p (insns, DF_REF_INSN_UID (use->ref)))) - break; - if (use) - convert_reg (insn, DF_REF_REG (ref), - *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)])); - else if (MAY_HAVE_DEBUG_BIND_INSNS) - { - /* If we generated a scalar copy we can leave debug-insns - as-is, if not, we have to adjust them. */ - auto_vec<rtx_insn *, 5> to_reset_debug_insns; - for (use = DF_REF_CHAIN (ref); use; use = use->next) - if (DEBUG_INSN_P (DF_REF_INSN (use->ref))) - { - rtx_insn *debug_insn = DF_REF_INSN (use->ref); - /* If there's a reaching definition outside of the - chain we have to reset. */ - df_link *def; - for (def = DF_REF_CHAIN (use->ref); def; def = def->next) - if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def->ref))) - break; - if (def) - to_reset_debug_insns.safe_push (debug_insn); - else - { - *DF_REF_REAL_LOC (use->ref) - = *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]); - df_insn_rescan (debug_insn); - } - } - /* Have to do the reset outside of the DF_CHAIN walk to not - disrupt it. */ - while (!to_reset_debug_insns.is_empty ()) - { - rtx_insn *debug_insn = to_reset_debug_insns.pop (); - INSN_VAR_LOCATION_LOC (debug_insn) = gen_rtx_UNKNOWN_VAR_LOC (); - df_insn_rescan_debug_internal (debug_insn); - } - } - } - - /* Replace uses in this insn with the defs we use in the chain. */ - for (df_ref ref = DF_INSN_USES (insn); ref; ref = DF_REF_NEXT_LOC (ref)) - if (!DF_REF_REG_MEM_P (ref)) - if (rtx *vreg = defs_map.get (regno_reg_rtx[DF_REF_REGNO (ref)])) - { - /* Also update a corresponding REG_DEAD note. */ - rtx note = find_reg_note (insn, REG_DEAD, DF_REF_REG (ref)); - if (note) - XEXP (note, 0) = *vreg; - *DF_REF_REAL_LOC (ref) = *vreg; - } - - rtx def_set = single_set (insn); - rtx src = SET_SRC (def_set); - rtx dst = SET_DEST (def_set); - rtx subreg; - - if (MEM_P (dst) && !REG_P (src)) - { - /* There are no scalar integer instructions and therefore - temporary register usage is required. */ - rtx tmp = gen_reg_rtx (smode); - emit_conversion_insns (gen_move_insn (dst, tmp), insn); - dst = gen_rtx_SUBREG (vmode, tmp, 0); - } - else if (REG_P (dst)) - { - /* Replace the definition with a SUBREG to the definition we - use inside the chain. */ - rtx *vdef = defs_map.get (dst); - if (vdef) - dst = *vdef; - dst = gen_rtx_SUBREG (vmode, dst, 0); - /* IRA doesn't like to have REG_EQUAL/EQUIV notes when the SET_DEST - is a non-REG_P. So kill those off. 
*/ - rtx note = find_reg_equal_equiv_note (insn); - if (note) - remove_note (insn, note); - } - - switch (GET_CODE (src)) - { - case ASHIFT: - case ASHIFTRT: - case LSHIFTRT: - convert_op (&XEXP (src, 0), insn); - PUT_MODE (src, vmode); - break; - - case PLUS: - case MINUS: - case IOR: - case XOR: - case AND: - case SMAX: - case SMIN: - case UMAX: - case UMIN: - convert_op (&XEXP (src, 0), insn); - convert_op (&XEXP (src, 1), insn); - PUT_MODE (src, vmode); - break; - - case NEG: - src = XEXP (src, 0); - convert_op (&src, insn); - subreg = gen_reg_rtx (vmode); - emit_insn_before (gen_move_insn (subreg, CONST0_RTX (vmode)), insn); - src = gen_rtx_MINUS (vmode, subreg, src); - break; - - case NOT: - src = XEXP (src, 0); - convert_op (&src, insn); - subreg = gen_reg_rtx (vmode); - emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (vmode)), insn); - src = gen_rtx_XOR (vmode, src, subreg); - break; - - case MEM: - if (!REG_P (dst)) - convert_op (&src, insn); - break; - - case REG: - if (!MEM_P (dst)) - convert_op (&src, insn); - break; - - case SUBREG: - gcc_assert (GET_MODE (src) == vmode); - break; - - case COMPARE: - src = SUBREG_REG (XEXP (XEXP (src, 0), 0)); - - gcc_assert (REG_P (src) && GET_MODE (src) == DImode); - subreg = gen_rtx_SUBREG (V2DImode, src, 0); - emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg), - copy_rtx_if_shared (subreg), - copy_rtx_if_shared (subreg)), - insn); - dst = gen_rtx_REG (CCmode, FLAGS_REG); - src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (subreg), - copy_rtx_if_shared (subreg)), - UNSPEC_PTEST); - break; - - case CONST_INT: - convert_op (&src, insn); - break; - - default: - gcc_unreachable (); - } - - SET_SRC (def_set) = src; - SET_DEST (def_set) = dst; - - /* Drop possible dead definitions. */ - PATTERN (insn) = def_set; - - INSN_CODE (insn) = -1; - int patt = recog_memoized (insn); - if (patt == -1) - fatal_insn_not_found (insn); - df_insn_rescan (insn); -} - -/* Fix uses of converted REG in debug insns. */ - -void -timode_scalar_chain::fix_debug_reg_uses (rtx reg) -{ - if (!flag_var_tracking) - return; - - df_ref ref, next; - for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next) - { - rtx_insn *insn = DF_REF_INSN (ref); - /* Make sure the next ref is for a different instruction, - so that we're not affected by the rescan. */ - next = DF_REF_NEXT_REG (ref); - while (next && DF_REF_INSN (next) == insn) - next = DF_REF_NEXT_REG (next); - - if (DEBUG_INSN_P (insn)) - { - /* It may be a debug insn with a TImode variable in - register. */ - bool changed = false; - for (; ref != next; ref = DF_REF_NEXT_REG (ref)) - { - rtx *loc = DF_REF_LOC (ref); - if (REG_P (*loc) && GET_MODE (*loc) == V1TImode) - { - *loc = gen_rtx_SUBREG (TImode, *loc, 0); - changed = true; - } - } - if (changed) - df_insn_rescan (insn); - } - } -} - -/* Convert INSN from TImode to V1T1mode. */ - -void -timode_scalar_chain::convert_insn (rtx_insn *insn) -{ - rtx def_set = single_set (insn); - rtx src = SET_SRC (def_set); - rtx dst = SET_DEST (def_set); - - switch (GET_CODE (dst)) - { - case REG: - { - rtx tmp = find_reg_equal_equiv_note (insn); - if (tmp) - PUT_MODE (XEXP (tmp, 0), V1TImode); - PUT_MODE (dst, V1TImode); - fix_debug_reg_uses (dst); - } - break; - case MEM: - PUT_MODE (dst, V1TImode); - break; - - default: - gcc_unreachable (); - } - - switch (GET_CODE (src)) - { - case REG: - PUT_MODE (src, V1TImode); - /* Call fix_debug_reg_uses only if SRC is never defined. 
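The NEG and NOT cases in general_scalar_chain::convert_insn above have no direct vector counterpart in the instruction forms used here, so they are lowered to a subtraction from zero and an XOR with the all-ones vector. The same lowering written with SSE2 intrinsics, purely as an illustration:

#include <emmintrin.h>

/* NEG x -> 0 - x and NOT x -> x ^ -1, V2DImode flavour.  */
__m128i neg_v2di (__m128i x) { return _mm_sub_epi64 (_mm_setzero_si128 (), x); }
__m128i not_v2di (__m128i x) { return _mm_xor_si128 (x, _mm_set1_epi32 (-1)); }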
*/ - if (!DF_REG_DEF_CHAIN (REGNO (src))) - fix_debug_reg_uses (src); - break; - - case MEM: - PUT_MODE (src, V1TImode); - break; - - case CONST_WIDE_INT: - if (NONDEBUG_INSN_P (insn)) - { - /* Since there are no instructions to store 128-bit constant, - temporary register usage is required. */ - rtx tmp = gen_reg_rtx (V1TImode); - start_sequence (); - src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src)); - src = validize_mem (force_const_mem (V1TImode, src)); - rtx_insn *seq = get_insns (); - end_sequence (); - if (seq) - emit_insn_before (seq, insn); - emit_conversion_insns (gen_rtx_SET (dst, tmp), insn); - dst = tmp; - } - break; - - case CONST_INT: - switch (standard_sse_constant_p (src, TImode)) - { - case 1: - src = CONST0_RTX (GET_MODE (dst)); - break; - case 2: - src = CONSTM1_RTX (GET_MODE (dst)); - break; - default: - gcc_unreachable (); - } - if (NONDEBUG_INSN_P (insn)) - { - rtx tmp = gen_reg_rtx (V1TImode); - /* Since there are no instructions to store standard SSE - constant, temporary register usage is required. */ - emit_conversion_insns (gen_rtx_SET (dst, tmp), insn); - dst = tmp; - } - break; - - default: - gcc_unreachable (); - } - - SET_SRC (def_set) = src; - SET_DEST (def_set) = dst; - - /* Drop possible dead definitions. */ - PATTERN (insn) = def_set; - - INSN_CODE (insn) = -1; - recog_memoized (insn); - df_insn_rescan (insn); -} - -/* Generate copies from defs used by the chain but not defined therein. - Also populates defs_map which is used later by convert_insn. */ - -void -general_scalar_chain::convert_registers () -{ - bitmap_iterator bi; - unsigned id; - EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi) - { - rtx chain_reg = gen_reg_rtx (smode); - defs_map.put (regno_reg_rtx[id], chain_reg); - } - EXECUTE_IF_SET_IN_BITMAP (insns_conv, 0, id, bi) - for (df_ref ref = DF_INSN_UID_DEFS (id); ref; ref = DF_REF_NEXT_LOC (ref)) - if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref))) - make_vector_copies (DF_REF_INSN (ref), DF_REF_REAL_REG (ref)); -} - -/* Convert whole chain creating required register - conversions and copies. */ - -int -scalar_chain::convert () -{ - bitmap_iterator bi; - unsigned id; - int converted_insns = 0; - - if (!dbg_cnt (stv_conversion)) - return 0; - - if (dump_file) - fprintf (dump_file, "Converting chain #%d...\n", chain_id); - - convert_registers (); - - EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi) - { - convert_insn (DF_INSN_UID_GET (id)->insn); - converted_insns++; - } - - return converted_insns; -} - -/* Return the SET expression if INSN doesn't reference hard register. - Return NULL if INSN uses or defines a hard register, excluding - pseudo register pushes, hard register uses in a memory address, - clobbers and flags definitions. */ - -static rtx -pseudo_reg_set (rtx_insn *insn) -{ - rtx set = single_set (insn); - if (!set) - return NULL; - - /* Check pseudo register push first. */ - if (REG_P (SET_SRC (set)) - && !HARD_REGISTER_P (SET_SRC (set)) - && push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))) - return set; - - df_ref ref; - FOR_EACH_INSN_DEF (ref, insn) - if (HARD_REGISTER_P (DF_REF_REAL_REG (ref)) - && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER) - && DF_REF_REGNO (ref) != FLAGS_REG) - return NULL; - - FOR_EACH_INSN_USE (ref, insn) - if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref))) - return NULL; - - return set; -} - -/* Check if comparison INSN may be transformed - into vector comparison. 
Currently we transform - zero checks only which look like: - - (set (reg:CCZ 17 flags) - (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4) - (subreg:SI (reg:DI x) 0)) - (const_int 0 [0]))) */ - -static bool -convertible_comparison_p (rtx_insn *insn, enum machine_mode mode) -{ - /* ??? Currently convertible for double-word DImode chain only. */ - if (TARGET_64BIT || mode != DImode) - return false; - - if (!TARGET_SSE4_1) - return false; - - rtx def_set = single_set (insn); - - gcc_assert (def_set); - - rtx src = SET_SRC (def_set); - rtx dst = SET_DEST (def_set); - - gcc_assert (GET_CODE (src) == COMPARE); - - if (GET_CODE (dst) != REG - || REGNO (dst) != FLAGS_REG - || GET_MODE (dst) != CCZmode) - return false; - - rtx op1 = XEXP (src, 0); - rtx op2 = XEXP (src, 1); - - if (op2 != CONST0_RTX (GET_MODE (op2))) - return false; - - if (GET_CODE (op1) != IOR) - return false; - - op2 = XEXP (op1, 1); - op1 = XEXP (op1, 0); - - if (!SUBREG_P (op1) - || !SUBREG_P (op2) - || GET_MODE (op1) != SImode - || GET_MODE (op2) != SImode - || ((SUBREG_BYTE (op1) != 0 - || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode)) - && (SUBREG_BYTE (op2) != 0 - || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode)))) - return false; - - op1 = SUBREG_REG (op1); - op2 = SUBREG_REG (op2); - - if (op1 != op2 - || !REG_P (op1) - || GET_MODE (op1) != DImode) - return false; - - return true; -} - -/* The general version of scalar_to_vector_candidate_p. */ - -static bool -general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode) -{ - rtx def_set = pseudo_reg_set (insn); - - if (!def_set) - return false; - - rtx src = SET_SRC (def_set); - rtx dst = SET_DEST (def_set); - - if (GET_CODE (src) == COMPARE) - return convertible_comparison_p (insn, mode); - - /* We are interested in "mode" only. */ - if ((GET_MODE (src) != mode - && !CONST_INT_P (src)) - || GET_MODE (dst) != mode) - return false; - - if (!REG_P (dst) && !MEM_P (dst)) - return false; - - switch (GET_CODE (src)) - { - case ASHIFTRT: - if (!TARGET_AVX512VL) - return false; - /* FALLTHRU */ - - case ASHIFT: - case LSHIFTRT: - if (!CONST_INT_P (XEXP (src, 1)) - || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, GET_MODE_BITSIZE (mode)-1)) - return false; - break; - - case SMAX: - case SMIN: - case UMAX: - case UMIN: - if ((mode == DImode && !TARGET_AVX512VL) - || (mode == SImode && !TARGET_SSE4_1)) - return false; - /* Fallthru. */ - - case PLUS: - case MINUS: - case IOR: - case XOR: - case AND: - if (!REG_P (XEXP (src, 1)) - && !MEM_P (XEXP (src, 1)) - && !CONST_INT_P (XEXP (src, 1))) - return false; - - if (GET_MODE (XEXP (src, 1)) != mode - && !CONST_INT_P (XEXP (src, 1))) - return false; - break; - - case NEG: - case NOT: - break; - - case REG: - return true; - - case MEM: - case CONST_INT: - return REG_P (dst); - - default: - return false; - } - - if (!REG_P (XEXP (src, 0)) - && !MEM_P (XEXP (src, 0)) - && !CONST_INT_P (XEXP (src, 0)) - /* Check for andnot case. */ - && (GET_CODE (src) != AND - || GET_CODE (XEXP (src, 0)) != NOT - || !REG_P (XEXP (XEXP (src, 0), 0)))) - return false; - - if (GET_MODE (XEXP (src, 0)) != mode - && !CONST_INT_P (XEXP (src, 0))) - return false; - - return true; -} - -/* The TImode version of scalar_to_vector_candidate_p. */ - -static bool -timode_scalar_to_vector_candidate_p (rtx_insn *insn) -{ - rtx def_set = pseudo_reg_set (insn); - - if (!def_set) - return false; - - rtx src = SET_SRC (def_set); - rtx dst = SET_DEST (def_set); - - /* Only TImode load and store are allowed. 
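The zero check accepted by convertible_comparison_p above is what a double-word equality test against zero looks like on ia32, where the two 32-bit halves are ORed before the compare. A typical source-level trigger (compiled with -m32 -msse4.1; the function name is illustrative):

/* On ia32 this compiles to (compare (ior (subreg hi) (subreg lo)) 0);
   the COMPARE case of convert_insn can then turn it into a ptest.  */
int
is_zero (unsigned long long x)
{
  return x == 0;
}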
*/ - if (GET_MODE (dst) != TImode) - return false; - - if (MEM_P (dst)) - { - /* Check for store. Memory must be aligned or unaligned store - is optimal. Only support store from register, standard SSE - constant or CONST_WIDE_INT generated from piecewise store. - - ??? Verify performance impact before enabling CONST_INT for - __int128 store. */ - if (misaligned_operand (dst, TImode) - && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL) - return false; - - switch (GET_CODE (src)) - { - default: - return false; - - case REG: - case CONST_WIDE_INT: - return true; - - case CONST_INT: - return standard_sse_constant_p (src, TImode); - } - } - else if (MEM_P (src)) - { - /* Check for load. Memory must be aligned or unaligned load is - optimal. */ - return (REG_P (dst) - && (!misaligned_operand (src, TImode) - || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)); - } - - return false; -} - -/* For a register REGNO, scan instructions for its defs and uses. - Put REGNO in REGS if a def or use isn't in CANDIDATES. */ - -static void -timode_check_non_convertible_regs (bitmap candidates, bitmap regs, - unsigned int regno) -{ - for (df_ref def = DF_REG_DEF_CHAIN (regno); - def; - def = DF_REF_NEXT_REG (def)) - { - if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) - { - if (dump_file) - fprintf (dump_file, - "r%d has non convertible def in insn %d\n", - regno, DF_REF_INSN_UID (def)); - - bitmap_set_bit (regs, regno); - break; - } - } - - for (df_ref ref = DF_REG_USE_CHAIN (regno); - ref; - ref = DF_REF_NEXT_REG (ref)) - { - /* Debug instructions are skipped. */ - if (NONDEBUG_INSN_P (DF_REF_INSN (ref)) - && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref))) - { - if (dump_file) - fprintf (dump_file, - "r%d has non convertible use in insn %d\n", - regno, DF_REF_INSN_UID (ref)); - - bitmap_set_bit (regs, regno); - break; - } - } -} - -/* The TImode version of remove_non_convertible_regs. */ - -static void -timode_remove_non_convertible_regs (bitmap candidates) -{ - bitmap_iterator bi; - unsigned id; - bitmap regs = BITMAP_ALLOC (NULL); - - EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi) - { - rtx def_set = single_set (DF_INSN_UID_GET (id)->insn); - rtx dest = SET_DEST (def_set); - rtx src = SET_SRC (def_set); - - if ((!REG_P (dest) - || bitmap_bit_p (regs, REGNO (dest)) - || HARD_REGISTER_P (dest)) - && (!REG_P (src) - || bitmap_bit_p (regs, REGNO (src)) - || HARD_REGISTER_P (src))) - continue; - - if (REG_P (dest)) - timode_check_non_convertible_regs (candidates, regs, - REGNO (dest)); - - if (REG_P (src)) - timode_check_non_convertible_regs (candidates, regs, - REGNO (src)); - } - - EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi) - { - for (df_ref def = DF_REG_DEF_CHAIN (id); - def; - def = DF_REF_NEXT_REG (def)) - if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) - { - if (dump_file) - fprintf (dump_file, "Removing insn %d from candidates list\n", - DF_REF_INSN_UID (def)); - - bitmap_clear_bit (candidates, DF_REF_INSN_UID (def)); - } - - for (df_ref ref = DF_REG_USE_CHAIN (id); - ref; - ref = DF_REF_NEXT_REG (ref)) - if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref))) - { - if (dump_file) - fprintf (dump_file, "Removing insn %d from candidates list\n", - DF_REF_INSN_UID (ref)); - - bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref)); - } - } - - BITMAP_FREE (regs); -} - -/* Main STV pass function. Find and convert scalar - instructions into vector mode when profitable. 
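Since only whole 128-bit loads and stores are accepted, the typical TImode chain is a plain 16-byte copy through an __int128 temporary on x86-64, which becomes a V1TImode SSE load/store pair after conversion. A minimal candidate (function name is illustrative):

/* A TImode load feeding a TImode store; convertible when the alignment or
   unaligned-optimal conditions above hold.  */
void
copy16 (__int128 *dst, const __int128 *src)
{
  *dst = *src;
}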
*/ - -static unsigned int -convert_scalars_to_vector (bool timode_p) -{ - basic_block bb; - int converted_insns = 0; - - bitmap_obstack_initialize (NULL); - const machine_mode cand_mode[3] = { SImode, DImode, TImode }; - const machine_mode cand_vmode[3] = { V4SImode, V2DImode, V1TImode }; - bitmap_head candidates[3]; /* { SImode, DImode, TImode } */ - for (unsigned i = 0; i < 3; ++i) - bitmap_initialize (&candidates[i], &bitmap_default_obstack); - - calculate_dominance_info (CDI_DOMINATORS); - df_set_flags (DF_DEFER_INSN_RESCAN); - df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN); - df_analyze (); - - /* Find all instructions we want to convert into vector mode. */ - if (dump_file) - fprintf (dump_file, "Searching for mode conversion candidates...\n"); - - FOR_EACH_BB_FN (bb, cfun) - { - rtx_insn *insn; - FOR_BB_INSNS (bb, insn) - if (timode_p - && timode_scalar_to_vector_candidate_p (insn)) - { - if (dump_file) - fprintf (dump_file, " insn %d is marked as a TImode candidate\n", - INSN_UID (insn)); - - bitmap_set_bit (&candidates[2], INSN_UID (insn)); - } - else if (!timode_p) - { - /* Check {SI,DI}mode. */ - for (unsigned i = 0; i <= 1; ++i) - if (general_scalar_to_vector_candidate_p (insn, cand_mode[i])) - { - if (dump_file) - fprintf (dump_file, " insn %d is marked as a %s candidate\n", - INSN_UID (insn), i == 0 ? "SImode" : "DImode"); - - bitmap_set_bit (&candidates[i], INSN_UID (insn)); - break; - } - } - } - - if (timode_p) - timode_remove_non_convertible_regs (&candidates[2]); - - for (unsigned i = 0; i <= 2; ++i) - if (!bitmap_empty_p (&candidates[i])) - break; - else if (i == 2 && dump_file) - fprintf (dump_file, "There are no candidates for optimization.\n"); - - for (unsigned i = 0; i <= 2; ++i) - while (!bitmap_empty_p (&candidates[i])) - { - unsigned uid = bitmap_first_set_bit (&candidates[i]); - scalar_chain *chain; - - if (cand_mode[i] == TImode) - chain = new timode_scalar_chain; - else - chain = new general_scalar_chain (cand_mode[i], cand_vmode[i]); - - /* Find instructions chain we want to convert to vector mode. - Check all uses and definitions to estimate all required - conversions. */ - chain->build (&candidates[i], uid); - - if (chain->compute_convert_gain () > 0) - converted_insns += chain->convert (); - else - if (dump_file) - fprintf (dump_file, "Chain #%d conversion is not profitable\n", - chain->chain_id); - - delete chain; - } - - if (dump_file) - fprintf (dump_file, "Total insns converted: %d\n", converted_insns); - - for (unsigned i = 0; i <= 2; ++i) - bitmap_release (&candidates[i]); - bitmap_obstack_release (NULL); - df_process_deferred_rescans (); - - /* Conversion means we may have 128bit register spills/fills - which require aligned stack. */ - if (converted_insns) - { - if (crtl->stack_alignment_needed < 128) - crtl->stack_alignment_needed = 128; - if (crtl->stack_alignment_estimated < 128) - crtl->stack_alignment_estimated = 128; - - crtl->stack_realign_needed - = INCOMING_STACK_BOUNDARY < crtl->stack_alignment_estimated; - crtl->stack_realign_tried = crtl->stack_realign_needed; - - crtl->stack_realign_processed = true; - - if (!crtl->drap_reg) - { - rtx drap_rtx = targetm.calls.get_drap_rtx (); - - /* stack_realign_drap and drap_rtx must match. */ - gcc_assert ((stack_realign_drap != 0) == (drap_rtx != NULL)); - - /* Do nothing if NULL is returned, - which means DRAP is not needed. */ - if (drap_rtx != NULL) - { - crtl->args.internal_arg_pointer = drap_rtx; - - /* Call fixup_tail_calls to clean up - REG_EQUIV note if DRAP is needed. 
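On ia32 each DImode operation is otherwise expanded into a pair of 32-bit GPR instructions, which is exactly the kind of chain this pass collects; when compute_convert_gain is positive, the whole chain runs in XMM registers instead. A small likely candidate (compiled with -m32 -msse2 -O2 -mstv; the constant and function name are illustrative):

/* Double-word AND followed by IOR: four GPR instructions as scalars,
   two XMM instructions once the chain is converted.  */
unsigned long long
mask_or (unsigned long long a, unsigned long long b)
{
  return (a & 0x00ff00ff00ff00ffULL) | b;
}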
*/ - fixup_tail_calls (); - } - } - - /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */ - if (TARGET_64BIT) - for (tree parm = DECL_ARGUMENTS (current_function_decl); - parm; parm = DECL_CHAIN (parm)) - { - if (TYPE_MODE (TREE_TYPE (parm)) != TImode) - continue; - if (DECL_RTL_SET_P (parm) - && GET_MODE (DECL_RTL (parm)) == V1TImode) - { - rtx r = DECL_RTL (parm); - if (REG_P (r)) - SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0)); - } - if (DECL_INCOMING_RTL (parm) - && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode) - { - rtx r = DECL_INCOMING_RTL (parm); - if (REG_P (r)) - DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0); - } - } - } - - return 0; -} - -/* Modify the vzeroupper pattern in INSN so that it describes the effect - that the instruction has on the SSE registers. LIVE_REGS are the set - of registers that are live across the instruction. - - For a live register R we use: - - (set (reg:V2DF R) (reg:V2DF R)) - - which preserves the low 128 bits but clobbers the upper bits. */ - -static void -ix86_add_reg_usage_to_vzeroupper (rtx_insn *insn, bitmap live_regs) -{ - rtx pattern = PATTERN (insn); - unsigned int nregs = TARGET_64BIT ? 16 : 8; - unsigned int npats = nregs; - for (unsigned int i = 0; i < nregs; ++i) - { - unsigned int regno = GET_SSE_REGNO (i); - if (!bitmap_bit_p (live_regs, regno)) - npats--; - } - if (npats == 0) - return; - rtvec vec = rtvec_alloc (npats + 1); - RTVEC_ELT (vec, 0) = XVECEXP (pattern, 0, 0); - for (unsigned int i = 0, j = 0; i < nregs; ++i) - { - unsigned int regno = GET_SSE_REGNO (i); - if (!bitmap_bit_p (live_regs, regno)) - continue; - rtx reg = gen_rtx_REG (V2DImode, regno); - ++j; - RTVEC_ELT (vec, j) = gen_rtx_SET (reg, reg); - } - XVEC (pattern, 0) = vec; - INSN_CODE (insn) = -1; - df_insn_rescan (insn); -} - -/* Walk the vzeroupper instructions in the function and annotate them - with the effect that they have on the SSE registers. */ - -static void -ix86_add_reg_usage_to_vzerouppers (void) -{ - basic_block bb; - rtx_insn *insn; - auto_bitmap live_regs; - - df_analyze (); - FOR_EACH_BB_FN (bb, cfun) - { - bitmap_copy (live_regs, df_get_live_out (bb)); - df_simulate_initialize_backwards (bb, live_regs); - FOR_BB_INSNS_REVERSE (bb, insn) - { - if (!NONDEBUG_INSN_P (insn)) - continue; - if (vzeroupper_pattern (PATTERN (insn), VOIDmode)) - ix86_add_reg_usage_to_vzeroupper (insn, live_regs); - df_simulate_one_insn_backwards (bb, insn, live_regs); - } - } -} - -static unsigned int -rest_of_handle_insert_vzeroupper (void) -{ - int i; - - /* vzeroupper instructions are inserted immediately after reload to - account for possible spills from 256bit or 512bit registers. The pass - reuses mode switching infrastructure by re-running mode insertion - pass, so disable entities that have already been processed. */ - for (i = 0; i < MAX_386_ENTITIES; i++) - ix86_optimize_mode_switching[i] = 0; - - ix86_optimize_mode_switching[AVX_U128] = 1; - - /* Call optimize_mode_switching. 
*/ - g->get_passes ()->execute_pass_mode_switching (); - ix86_add_reg_usage_to_vzerouppers (); - return 0; -} - -namespace { - -const pass_data pass_data_insert_vzeroupper = -{ - RTL_PASS, /* type */ - "vzeroupper", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ - TV_MACH_DEP, /* tv_id */ - 0, /* properties_required */ - 0, /* properties_provided */ - 0, /* properties_destroyed */ - 0, /* todo_flags_start */ - TODO_df_finish, /* todo_flags_finish */ -}; - -class pass_insert_vzeroupper : public rtl_opt_pass -{ -public: - pass_insert_vzeroupper(gcc::context *ctxt) - : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt) - {} - - /* opt_pass methods: */ - virtual bool gate (function *) - { - return TARGET_AVX - && TARGET_VZEROUPPER && flag_expensive_optimizations - && !optimize_size; - } - - virtual unsigned int execute (function *) - { - return rest_of_handle_insert_vzeroupper (); - } - -}; // class pass_insert_vzeroupper - -const pass_data pass_data_stv = -{ - RTL_PASS, /* type */ - "stv", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ - TV_MACH_DEP, /* tv_id */ - 0, /* properties_required */ - 0, /* properties_provided */ - 0, /* properties_destroyed */ - 0, /* todo_flags_start */ - TODO_df_finish, /* todo_flags_finish */ -}; - -class pass_stv : public rtl_opt_pass -{ -public: - pass_stv (gcc::context *ctxt) - : rtl_opt_pass (pass_data_stv, ctxt), - timode_p (false) - {} - - /* opt_pass methods: */ - virtual bool gate (function *) - { - return ((!timode_p || TARGET_64BIT) - && TARGET_STV && TARGET_SSE2 && optimize > 1); - } - - virtual unsigned int execute (function *) - { - return convert_scalars_to_vector (timode_p); - } - - opt_pass *clone () - { - return new pass_stv (m_ctxt); - } - - void set_pass_param (unsigned int n, bool param) - { - gcc_assert (n == 0); - timode_p = param; - } - -private: - bool timode_p; -}; // class pass_stv - -} // anon namespace - -rtl_opt_pass * -make_pass_insert_vzeroupper (gcc::context *ctxt) -{ - return new pass_insert_vzeroupper (ctxt); -} - -rtl_opt_pass * -make_pass_stv (gcc::context *ctxt) -{ - return new pass_stv (ctxt); -} - -/* Inserting ENDBRANCH instructions. */ - -static unsigned int -rest_of_insert_endbranch (void) -{ - timevar_push (TV_MACH_DEP); - - rtx cet_eb; - rtx_insn *insn; - basic_block bb; - - /* Currently emit EB if it's a tracking function, i.e. 'nocf_check' is - absent among function attributes. Later an optimization will be - introduced to make analysis if an address of a static function is - taken. A static function whose address is not taken will get a - nocf_check attribute. This will allow to reduce the number of EB. */ - - if (!lookup_attribute ("nocf_check", - TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))) - && (!flag_manual_endbr - || lookup_attribute ("cf_check", - DECL_ATTRIBUTES (cfun->decl))) - && (!cgraph_node::get (cfun->decl)->only_called_directly_p () - || ix86_cmodel == CM_LARGE - || ix86_cmodel == CM_LARGE_PIC - || flag_force_indirect_call - || (TARGET_DLLIMPORT_DECL_ATTRIBUTES - && DECL_DLLIMPORT_P (cfun->decl)))) - { - /* Queue ENDBR insertion to x86_function_profiler. 
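The pass entered above implements the branch half of -fcf-protection: function entries, address-taken labels, switch-table targets (under flag_cet_switch) and the continuations of setjmp-like or indirect_return calls receive an ENDBR marker so they are valid indirect-branch destinations under CET. The user-visible effect, with an illustrative function:

/* Compiled with -fcf-protection=branch, the entry of this function starts
   with endbr64 (endbr32 for -m32) inserted by this pass.  */
int plain (int x) { return x + 1; }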
*/ - if (crtl->profile && flag_fentry) - cfun->machine->endbr_queued_at_entrance = true; - else - { - cet_eb = gen_nop_endbr (); - - bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb; - insn = BB_HEAD (bb); - emit_insn_before (cet_eb, insn); - } - } - - bb = 0; - FOR_EACH_BB_FN (bb, cfun) - { - for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb)); - insn = NEXT_INSN (insn)) - { - if (CALL_P (insn)) - { - bool need_endbr; - need_endbr = find_reg_note (insn, REG_SETJMP, NULL) != NULL; - if (!need_endbr && !SIBLING_CALL_P (insn)) - { - rtx call = get_call_rtx_from (insn); - rtx fnaddr = XEXP (call, 0); - tree fndecl = NULL_TREE; - - /* Also generate ENDBRANCH for non-tail call which - may return via indirect branch. */ - if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF) - fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0)); - if (fndecl == NULL_TREE) - fndecl = MEM_EXPR (fnaddr); - if (fndecl - && TREE_CODE (TREE_TYPE (fndecl)) != FUNCTION_TYPE - && TREE_CODE (TREE_TYPE (fndecl)) != METHOD_TYPE) - fndecl = NULL_TREE; - if (fndecl && TYPE_ARG_TYPES (TREE_TYPE (fndecl))) - { - tree fntype = TREE_TYPE (fndecl); - if (lookup_attribute ("indirect_return", - TYPE_ATTRIBUTES (fntype))) - need_endbr = true; - } - } - if (!need_endbr) - continue; - /* Generate ENDBRANCH after CALL, which can return more than - twice, setjmp-like functions. */ - - cet_eb = gen_nop_endbr (); - emit_insn_after_setloc (cet_eb, insn, INSN_LOCATION (insn)); - continue; - } - - if (JUMP_P (insn) && flag_cet_switch) - { - rtx target = JUMP_LABEL (insn); - if (target == NULL_RTX || ANY_RETURN_P (target)) - continue; - - /* Check the jump is a switch table. */ - rtx_insn *label = as_a<rtx_insn *> (target); - rtx_insn *table = next_insn (label); - if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table)) - continue; - - /* For the indirect jump find out all places it jumps and insert - ENDBRANCH there. It should be done under a special flag to - control ENDBRANCH generation for switch stmts. */ - edge_iterator ei; - edge e; - basic_block dest_blk; - - FOR_EACH_EDGE (e, ei, bb->succs) - { - rtx_insn *insn; - - dest_blk = e->dest; - insn = BB_HEAD (dest_blk); - gcc_assert (LABEL_P (insn)); - cet_eb = gen_nop_endbr (); - emit_insn_after (cet_eb, insn); - } - continue; - } - - if (LABEL_P (insn) && LABEL_PRESERVE_P (insn)) - { - cet_eb = gen_nop_endbr (); - emit_insn_after (cet_eb, insn); - continue; - } - } - } - - timevar_pop (TV_MACH_DEP); - return 0; -} - -namespace { - -const pass_data pass_data_insert_endbranch = -{ - RTL_PASS, /* type. */ - "cet", /* name. */ - OPTGROUP_NONE, /* optinfo_flags. */ - TV_MACH_DEP, /* tv_id. */ - 0, /* properties_required. */ - 0, /* properties_provided. */ - 0, /* properties_destroyed. */ - 0, /* todo_flags_start. */ - 0, /* todo_flags_finish. 
*/ -}; - -class pass_insert_endbranch : public rtl_opt_pass -{ -public: - pass_insert_endbranch (gcc::context *ctxt) - : rtl_opt_pass (pass_data_insert_endbranch, ctxt) - {} - - /* opt_pass methods: */ - virtual bool gate (function *) - { - return ((flag_cf_protection & CF_BRANCH)); - } - - virtual unsigned int execute (function *) - { - return rest_of_insert_endbranch (); - } - -}; // class pass_insert_endbranch - -} // anon namespace - -rtl_opt_pass * -make_pass_insert_endbranch (gcc::context *ctxt) -{ - return new pass_insert_endbranch (ctxt); -} - -/* At entry of the nearest common dominator for basic blocks with - conversions, generate a single - vxorps %xmmN, %xmmN, %xmmN - for all - vcvtss2sd op, %xmmN, %xmmX - vcvtsd2ss op, %xmmN, %xmmX - vcvtsi2ss op, %xmmN, %xmmX - vcvtsi2sd op, %xmmN, %xmmX - - NB: We want to generate only a single vxorps to cover the whole - function. The LCM algorithm isn't appropriate here since it may - place a vxorps inside the loop. */ - -static unsigned int -remove_partial_avx_dependency (void) -{ - timevar_push (TV_MACH_DEP); - - bitmap_obstack_initialize (NULL); - bitmap convert_bbs = BITMAP_ALLOC (NULL); - - basic_block bb; - rtx_insn *insn, *set_insn; - rtx set; - rtx v4sf_const0 = NULL_RTX; - - auto_vec<rtx_insn *> control_flow_insns; - - FOR_EACH_BB_FN (bb, cfun) - { - FOR_BB_INSNS (bb, insn) - { - if (!NONDEBUG_INSN_P (insn)) - continue; - - set = single_set (insn); - if (!set) - continue; - - if (get_attr_avx_partial_xmm_update (insn) - != AVX_PARTIAL_XMM_UPDATE_TRUE) - continue; - - if (!v4sf_const0) - { - calculate_dominance_info (CDI_DOMINATORS); - df_set_flags (DF_DEFER_INSN_RESCAN); - df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN); - df_md_add_problem (); - df_analyze (); - v4sf_const0 = gen_reg_rtx (V4SFmode); - } - - /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF, - SI -> SF, SI -> DF, DI -> SF, DI -> DF, to vec_dup and - vec_merge with subreg. */ - rtx src = SET_SRC (set); - rtx dest = SET_DEST (set); - machine_mode dest_mode = GET_MODE (dest); - - rtx zero; - machine_mode dest_vecmode; - if (dest_mode == E_SFmode) - { - dest_vecmode = V4SFmode; - zero = v4sf_const0; - } - else - { - dest_vecmode = V2DFmode; - zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0); - } - - /* Change source to vector mode. */ - src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src); - src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero, - GEN_INT (HOST_WIDE_INT_1U)); - /* Change destination to vector mode. */ - rtx vec = gen_reg_rtx (dest_vecmode); - /* Generate an XMM vector SET. */ - set = gen_rtx_SET (vec, src); - set_insn = emit_insn_before (set, insn); - df_insn_rescan (set_insn); - - if (cfun->can_throw_non_call_exceptions) - { - /* Handle REG_EH_REGION note. */ - rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX); - if (note) - { - control_flow_insns.safe_push (set_insn); - add_reg_note (set_insn, REG_EH_REGION, XEXP (note, 0)); - } - } - - src = gen_rtx_SUBREG (dest_mode, vec, 0); - set = gen_rtx_SET (dest, src); - - /* Drop possible dead definitions. */ - PATTERN (insn) = set; - - INSN_CODE (insn) = -1; - recog_memoized (insn); - df_insn_rescan (insn); - bitmap_set_bit (convert_bbs, bb->index); - } - } - - if (v4sf_const0) - { - /* (Re-)discover loops so that bb->loop_father can be used in the - analysis below. 
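The rewrite above targets scalar conversions (cvtsi2ss, cvtsi2sd, cvtss2sd, cvtsd2ss) that update only part of their destination XMM register and therefore carry a false dependence on its previous contents; merging into a register cleared once by the shared vxorps removes that dependence. Typical sources that reach this path (function names are illustrative):

/* Scalar conversions with a partial XMM destination update; the pass turns
   them into vec_duplicate + vec_merge over the zeroed register.  */
float  int_to_float    (int i)   { return (float) i; }
double float_to_double (float f) { return (double) f; }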
*/ - loop_optimizer_init (AVOID_CFG_MODIFICATIONS); - - /* Generate a vxorps at entry of the nearest dominator for basic - blocks with conversions, which is in the fake loop that - contains the whole function, so that there is only a single - vxorps in the whole function. */ - bb = nearest_common_dominator_for_set (CDI_DOMINATORS, - convert_bbs); - while (bb->loop_father->latch - != EXIT_BLOCK_PTR_FOR_FN (cfun)) - bb = get_immediate_dominator (CDI_DOMINATORS, - bb->loop_father->header); - - set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode)); - - insn = BB_HEAD (bb); - while (insn && !NONDEBUG_INSN_P (insn)) - { - if (insn == BB_END (bb)) - { - insn = NULL; - break; - } - insn = NEXT_INSN (insn); - } - if (insn == BB_HEAD (bb)) - set_insn = emit_insn_before (set, insn); - else - set_insn = emit_insn_after (set, - insn ? PREV_INSN (insn) : BB_END (bb)); - df_insn_rescan (set_insn); - df_process_deferred_rescans (); - loop_optimizer_finalize (); - - if (!control_flow_insns.is_empty ()) - { - free_dominance_info (CDI_DOMINATORS); - - unsigned int i; - FOR_EACH_VEC_ELT (control_flow_insns, i, insn) - if (control_flow_insn_p (insn)) - { - /* Split the block after insn. There will be a fallthru - edge, which is OK so we keep it. We have to create - the exception edges ourselves. */ - bb = BLOCK_FOR_INSN (insn); - split_block (bb, insn); - rtl_make_eh_edge (NULL, bb, BB_END (bb)); - } - } - } - - bitmap_obstack_release (NULL); - BITMAP_FREE (convert_bbs); - - timevar_pop (TV_MACH_DEP); - return 0; -} - -namespace { - -const pass_data pass_data_remove_partial_avx_dependency = -{ - RTL_PASS, /* type */ - "rpad", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ - TV_MACH_DEP, /* tv_id */ - 0, /* properties_required */ - 0, /* properties_provided */ - 0, /* properties_destroyed */ - 0, /* todo_flags_start */ - TODO_df_finish, /* todo_flags_finish */ -}; - -class pass_remove_partial_avx_dependency : public rtl_opt_pass -{ -public: - pass_remove_partial_avx_dependency (gcc::context *ctxt) - : rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt) - {} - - /* opt_pass methods: */ - virtual bool gate (function *) - { - return (TARGET_AVX - && TARGET_SSE_PARTIAL_REG_DEPENDENCY - && TARGET_SSE_MATH - && optimize - && optimize_function_for_speed_p (cfun)); - } - - virtual unsigned int execute (function *) - { - return remove_partial_avx_dependency (); - } -}; // class pass_rpad - -} // anon namespace - -rtl_opt_pass * -make_pass_remove_partial_avx_dependency (gcc::context *ctxt) -{ - return new pass_remove_partial_avx_dependency (ctxt); -} - -/* This compares the priority of target features in function DECL1 - and DECL2. It returns positive value if DECL1 is higher priority, - negative value if DECL2 is higher priority and 0 if they are the - same. */ - -int -ix86_compare_version_priority (tree decl1, tree decl2) -{ - unsigned int priority1 = get_builtin_code_for_version (decl1, NULL); - unsigned int priority2 = get_builtin_code_for_version (decl2, NULL); - - return (int)priority1 - (int)priority2; -} - -/* V1 and V2 point to function versions with different priorities - based on the target ISA. This function compares their priorities. 
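The priority comparison above serves function multi-versioning: several definitions of one function, distinguished by target attributes, are dispatched at run time, and the most specific ISA the CPU supports has to win. A user-level C++ sketch of the feature this machinery implements (names and ISA choices are illustrative):

/* C++ function multi-versioning; the generated dispatcher prefers the
   highest-priority version the running CPU supports (avx2 over sse4.2).  */
__attribute__ ((target ("default"))) int foo () { return 0; }
__attribute__ ((target ("sse4.2")))  int foo () { return 1; }
__attribute__ ((target ("avx2")))    int foo () { return 2; }

int use () { return foo (); }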
*/ - -static int -feature_compare (const void *v1, const void *v2) -{ - typedef struct _function_version_info - { - tree version_decl; - tree predicate_chain; - unsigned int dispatch_priority; - } function_version_info; - - const function_version_info c1 = *(const function_version_info *)v1; - const function_version_info c2 = *(const function_version_info *)v2; - return (c2.dispatch_priority - c1.dispatch_priority); -} - -/* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL - to return a pointer to VERSION_DECL if the outcome of the expression - formed by PREDICATE_CHAIN is true. This function will be called during - version dispatch to decide which function version to execute. It returns - the basic block at the end, to which more conditions can be added. */ - -static basic_block -add_condition_to_bb (tree function_decl, tree version_decl, - tree predicate_chain, basic_block new_bb) -{ - gimple *return_stmt; - tree convert_expr, result_var; - gimple *convert_stmt; - gimple *call_cond_stmt; - gimple *if_else_stmt; - - basic_block bb1, bb2, bb3; - edge e12, e23; - - tree cond_var, and_expr_var = NULL_TREE; - gimple_seq gseq; - - tree predicate_decl, predicate_arg; - - push_cfun (DECL_STRUCT_FUNCTION (function_decl)); - - gcc_assert (new_bb != NULL); - gseq = bb_seq (new_bb); - - - convert_expr = build1 (CONVERT_EXPR, ptr_type_node, - build_fold_addr_expr (version_decl)); - result_var = create_tmp_var (ptr_type_node); - convert_stmt = gimple_build_assign (result_var, convert_expr); - return_stmt = gimple_build_return (result_var); - - if (predicate_chain == NULL_TREE) - { - gimple_seq_add_stmt (&gseq, convert_stmt); - gimple_seq_add_stmt (&gseq, return_stmt); - set_bb_seq (new_bb, gseq); - gimple_set_bb (convert_stmt, new_bb); - gimple_set_bb (return_stmt, new_bb); - pop_cfun (); - return new_bb; - } - - while (predicate_chain != NULL) - { - cond_var = create_tmp_var (integer_type_node); - predicate_decl = TREE_PURPOSE (predicate_chain); - predicate_arg = TREE_VALUE (predicate_chain); - call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg); - gimple_call_set_lhs (call_cond_stmt, cond_var); - - gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl)); - gimple_set_bb (call_cond_stmt, new_bb); - gimple_seq_add_stmt (&gseq, call_cond_stmt); - - predicate_chain = TREE_CHAIN (predicate_chain); - - if (and_expr_var == NULL) - and_expr_var = cond_var; - else - { - gimple *assign_stmt; - /* Use MIN_EXPR to check if any integer is zero?. 
- and_expr_var = min_expr <cond_var, and_expr_var> */ - assign_stmt = gimple_build_assign (and_expr_var, - build2 (MIN_EXPR, integer_type_node, - cond_var, and_expr_var)); - - gimple_set_block (assign_stmt, DECL_INITIAL (function_decl)); - gimple_set_bb (assign_stmt, new_bb); - gimple_seq_add_stmt (&gseq, assign_stmt); - } - } - - if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var, - integer_zero_node, - NULL_TREE, NULL_TREE); - gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl)); - gimple_set_bb (if_else_stmt, new_bb); - gimple_seq_add_stmt (&gseq, if_else_stmt); - - gimple_seq_add_stmt (&gseq, convert_stmt); - gimple_seq_add_stmt (&gseq, return_stmt); - set_bb_seq (new_bb, gseq); - - bb1 = new_bb; - e12 = split_block (bb1, if_else_stmt); - bb2 = e12->dest; - e12->flags &= ~EDGE_FALLTHRU; - e12->flags |= EDGE_TRUE_VALUE; - - e23 = split_block (bb2, return_stmt); - - gimple_set_bb (convert_stmt, bb2); - gimple_set_bb (return_stmt, bb2); - - bb3 = e23->dest; - make_edge (bb1, bb3, EDGE_FALSE_VALUE); - - remove_edge (e23); - make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0); - - pop_cfun (); - - return bb3; -} - -/* This function generates the dispatch function for - multi-versioned functions. DISPATCH_DECL is the function which will - contain the dispatch logic. FNDECLS are the function choices for - dispatch, and is a tree chain. EMPTY_BB is the basic block pointer - in DISPATCH_DECL in which the dispatch code is generated. */ - -static int -dispatch_function_versions (tree dispatch_decl, - void *fndecls_p, - basic_block *empty_bb) -{ - tree default_decl; - gimple *ifunc_cpu_init_stmt; - gimple_seq gseq; - int ix; - tree ele; - vec<tree> *fndecls; - unsigned int num_versions = 0; - unsigned int actual_versions = 0; - unsigned int i; - - struct _function_version_info - { - tree version_decl; - tree predicate_chain; - unsigned int dispatch_priority; - }*function_version_info; - - gcc_assert (dispatch_decl != NULL - && fndecls_p != NULL - && empty_bb != NULL); - - /*fndecls_p is actually a vector. */ - fndecls = static_cast<vec<tree> *> (fndecls_p); - - /* At least one more version other than the default. */ - num_versions = fndecls->length (); - gcc_assert (num_versions >= 2); - - function_version_info = (struct _function_version_info *) - XNEWVEC (struct _function_version_info, (num_versions - 1)); - - /* The first version in the vector is the default decl. */ - default_decl = (*fndecls)[0]; - - push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl)); - - gseq = bb_seq (*empty_bb); - /* Function version dispatch is via IFUNC. IFUNC resolvers fire before - constructors, so explicity call __builtin_cpu_init here. */ - ifunc_cpu_init_stmt - = gimple_build_call_vec (get_ix86_builtin (IX86_BUILTIN_CPU_INIT), vNULL); - gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt); - gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb); - set_bb_seq (*empty_bb, gseq); - - pop_cfun (); - - - for (ix = 1; fndecls->iterate (ix, &ele); ++ix) - { - tree version_decl = ele; - tree predicate_chain = NULL_TREE; - unsigned int priority; - /* Get attribute string, parse it and find the right predicate decl. - The predicate function could be a lengthy combination of many - features, like arch-type and various isa-variants. 
*/ - priority = get_builtin_code_for_version (version_decl, - &predicate_chain); - - if (predicate_chain == NULL_TREE) - continue; - - function_version_info [actual_versions].version_decl = version_decl; - function_version_info [actual_versions].predicate_chain - = predicate_chain; - function_version_info [actual_versions].dispatch_priority = priority; - actual_versions++; - } - - /* Sort the versions according to descending order of dispatch priority. The - priority is based on the ISA. This is not a perfect solution. There - could still be ambiguity. If more than one function version is suitable - to execute, which one should be dispatched? In future, allow the user - to specify a dispatch priority next to the version. */ - qsort (function_version_info, actual_versions, - sizeof (struct _function_version_info), feature_compare); - - for (i = 0; i < actual_versions; ++i) - *empty_bb = add_condition_to_bb (dispatch_decl, - function_version_info[i].version_decl, - function_version_info[i].predicate_chain, - *empty_bb); - - /* dispatch default version at the end. */ - *empty_bb = add_condition_to_bb (dispatch_decl, default_decl, - NULL, *empty_bb); - - free (function_version_info); - return 0; -} - -/* This function changes the assembler name for functions that are - versions. If DECL is a function version and has a "target" - attribute, it appends the attribute string to its assembler name. */ - -static tree -ix86_mangle_function_version_assembler_name (tree decl, tree id) -{ - tree version_attr; - const char *orig_name, *version_string; - char *attr_str, *assembler_name; - - if (DECL_DECLARED_INLINE_P (decl) - && lookup_attribute ("gnu_inline", - DECL_ATTRIBUTES (decl))) - error_at (DECL_SOURCE_LOCATION (decl), - "function versions cannot be marked as %<gnu_inline%>," - " bodies have to be generated"); - - if (DECL_VIRTUAL_P (decl) - || DECL_VINDEX (decl)) - sorry ("virtual function multiversioning not supported"); - - version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl)); - - /* target attribute string cannot be NULL. */ - gcc_assert (version_attr != NULL_TREE); - - orig_name = IDENTIFIER_POINTER (id); - version_string - = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr))); - - if (strcmp (version_string, "default") == 0) - return id; - - attr_str = sorted_attr_string (TREE_VALUE (version_attr)); - assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2); - - sprintf (assembler_name, "%s.%s", orig_name, attr_str); - - /* Allow assembler name to be modified if already set. */ - if (DECL_ASSEMBLER_NAME_SET_P (decl)) - SET_DECL_RTL (decl, NULL); - - tree ret = get_identifier (assembler_name); - XDELETEVEC (attr_str); - XDELETEVEC (assembler_name); - return ret; -} - -tree -ix86_mangle_decl_assembler_name (tree decl, tree id) -{ - /* For function version, add the target suffix to the assembler name. */ - if (TREE_CODE (decl) == FUNCTION_DECL - && DECL_FUNCTION_VERSIONED (decl)) - id = ix86_mangle_function_version_assembler_name (decl, id); -#ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME - id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id); -#endif - - return id; -} - -/* Make a dispatcher declaration for the multi-versioned function DECL. - Calls to DECL function will be replaced with calls to the dispatcher - by the front-end. Returns the decl of the dispatcher function. 
*/ - -tree -ix86_get_function_versions_dispatcher (void *decl) -{ - tree fn = (tree) decl; - struct cgraph_node *node = NULL; - struct cgraph_node *default_node = NULL; - struct cgraph_function_version_info *node_v = NULL; - struct cgraph_function_version_info *first_v = NULL; - - tree dispatch_decl = NULL; - - struct cgraph_function_version_info *default_version_info = NULL; - - gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn)); - - node = cgraph_node::get (fn); - gcc_assert (node != NULL); - - node_v = node->function_version (); - gcc_assert (node_v != NULL); - - if (node_v->dispatcher_resolver != NULL) - return node_v->dispatcher_resolver; - - /* Find the default version and make it the first node. */ - first_v = node_v; - /* Go to the beginning of the chain. */ - while (first_v->prev != NULL) - first_v = first_v->prev; - default_version_info = first_v; - while (default_version_info != NULL) - { - if (is_function_default_version - (default_version_info->this_node->decl)) - break; - default_version_info = default_version_info->next; - } - - /* If there is no default node, just return NULL. */ - if (default_version_info == NULL) - return NULL; - - /* Make default info the first node. */ - if (first_v != default_version_info) - { - default_version_info->prev->next = default_version_info->next; - if (default_version_info->next) - default_version_info->next->prev = default_version_info->prev; - first_v->prev = default_version_info; - default_version_info->next = first_v; - default_version_info->prev = NULL; - } - - default_node = default_version_info->this_node; - -#if defined (ASM_OUTPUT_TYPE_DIRECTIVE) - if (targetm.has_ifunc_p ()) - { - struct cgraph_function_version_info *it_v = NULL; - struct cgraph_node *dispatcher_node = NULL; - struct cgraph_function_version_info *dispatcher_version_info = NULL; - - /* Right now, the dispatching is done via ifunc. */ - dispatch_decl = make_dispatcher_decl (default_node->decl); - - dispatcher_node = cgraph_node::get_create (dispatch_decl); - gcc_assert (dispatcher_node != NULL); - dispatcher_node->dispatcher_function = 1; - dispatcher_version_info - = dispatcher_node->insert_new_function_version (); - dispatcher_version_info->next = default_version_info; - dispatcher_node->definition = 1; - - /* Set the dispatcher for all the versions. */ - it_v = default_version_info; - while (it_v != NULL) - { - it_v->dispatcher_resolver = dispatch_decl; - it_v = it_v->next; - } - } - else -#endif - { - error_at (DECL_SOURCE_LOCATION (default_node->decl), - "multiversioning needs %<ifunc%> which is not supported " - "on this target"); - } - - return dispatch_decl; -} - -/* Make the resolver function decl to dispatch the versions of - a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is - ifunc alias that will point to the created resolver. Create an - empty basic block in the resolver and store the pointer in - EMPTY_BB. Return the decl of the resolver function. */ - -static tree -make_resolver_func (const tree default_decl, - const tree ifunc_alias_decl, - basic_block *empty_bb) -{ - tree decl, type, t; - - /* Create resolver function name based on default_decl. */ - tree decl_name = clone_function_name (default_decl, "resolver"); - const char *resolver_name = IDENTIFIER_POINTER (decl_name); - - /* The resolver function should return a (void *). 
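The resolver described above is the piece that picks a version at load time: a (void *)-returning function that initialises the CPU model and walks the version predicates in priority order. A hand-written GNU C++ approximation of the generated resolver plus ifunc alias, with hypothetical function names:

/* foo_default/foo_sse42/foo_avx2 stand in for the versioned bodies.  */
static int foo_default (void) { return 0; }
static int foo_sse42   (void) { return 1; }
static int foo_avx2    (void) { return 2; }

static void *
foo_resolver (void)
{
  __builtin_cpu_init ();   /* as the generated dispatch code does */
  if (__builtin_cpu_supports ("avx2"))
    return (void *) foo_avx2;
  if (__builtin_cpu_supports ("sse4.2"))
    return (void *) foo_sse42;
  return (void *) foo_default;
}

int foo (void) __attribute__ ((ifunc ("foo_resolver")));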
*/ - type = build_function_type_list (ptr_type_node, NULL_TREE); - - decl = build_fn_decl (resolver_name, type); - SET_DECL_ASSEMBLER_NAME (decl, decl_name); - - DECL_NAME (decl) = decl_name; - TREE_USED (decl) = 1; - DECL_ARTIFICIAL (decl) = 1; - DECL_IGNORED_P (decl) = 1; - TREE_PUBLIC (decl) = 0; - DECL_UNINLINABLE (decl) = 1; - - /* Resolver is not external, body is generated. */ - DECL_EXTERNAL (decl) = 0; - DECL_EXTERNAL (ifunc_alias_decl) = 0; - - DECL_CONTEXT (decl) = NULL_TREE; - DECL_INITIAL (decl) = make_node (BLOCK); - DECL_STATIC_CONSTRUCTOR (decl) = 0; - - if (DECL_COMDAT_GROUP (default_decl) - || TREE_PUBLIC (default_decl)) - { - /* In this case, each translation unit with a call to this - versioned function will put out a resolver. Ensure it - is comdat to keep just one copy. */ - DECL_COMDAT (decl) = 1; - make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl)); - } - else - TREE_PUBLIC (ifunc_alias_decl) = 0; - - /* Build result decl and add to function_decl. */ - t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node); - DECL_CONTEXT (t) = decl; - DECL_ARTIFICIAL (t) = 1; - DECL_IGNORED_P (t) = 1; - DECL_RESULT (decl) = t; - - gimplify_function_tree (decl); - push_cfun (DECL_STRUCT_FUNCTION (decl)); - *empty_bb = init_lowered_empty_function (decl, false, - profile_count::uninitialized ()); - - cgraph_node::add_new_function (decl, true); - symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl)); - - pop_cfun (); - - gcc_assert (ifunc_alias_decl != NULL); - /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */ - DECL_ATTRIBUTES (ifunc_alias_decl) - = make_attribute ("ifunc", resolver_name, - DECL_ATTRIBUTES (ifunc_alias_decl)); - - /* Create the alias for dispatch to resolver here. */ - cgraph_node::create_same_body_alias (ifunc_alias_decl, decl); - return decl; -} - -/* Generate the dispatching code body to dispatch multi-versioned function - DECL. The target hook is called to process the "target" attributes and - provide the code to dispatch the right function at run-time. NODE points - to the dispatcher decl whose body will be created. */ - -tree -ix86_generate_version_dispatcher_body (void *node_p) -{ - tree resolver_decl; - basic_block empty_bb; - tree default_ver_decl; - struct cgraph_node *versn; - struct cgraph_node *node; - - struct cgraph_function_version_info *node_version_info = NULL; - struct cgraph_function_version_info *versn_info = NULL; - - node = (cgraph_node *)node_p; - - node_version_info = node->function_version (); - gcc_assert (node->dispatcher_function - && node_version_info != NULL); - - if (node_version_info->dispatcher_resolver) - return node_version_info->dispatcher_resolver; - - /* The first version in the chain corresponds to the default version. */ - default_ver_decl = node_version_info->next->this_node->decl; - - /* node is going to be an alias, so remove the finalized bit. */ - node->definition = false; - - resolver_decl = make_resolver_func (default_ver_decl, - node->decl, &empty_bb); - - node_version_info->dispatcher_resolver = resolver_decl; - - push_cfun (DECL_STRUCT_FUNCTION (resolver_decl)); - - auto_vec<tree, 2> fn_ver_vec; - - for (versn_info = node_version_info->next; versn_info; - versn_info = versn_info->next) - { - versn = versn_info->this_node; - /* Check for virtual functions here again, as by this time it should - have been determined if this function needs a vtable index or - not. 
This happens for methods in derived classes that override - virtual methods in base classes but are not explicitly marked as - virtual. */ - if (DECL_VINDEX (versn->decl)) - sorry ("virtual function multiversioning not supported"); - - fn_ver_vec.safe_push (versn->decl); - } - - dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb); - cgraph_edge::rebuild_edges (); - pop_cfun (); - return resolver_decl; -} - - diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c deleted file mode 100644 index 67480b2..0000000 --- a/gcc/config/i386/i386-options.c +++ /dev/null @@ -1,3799 +0,0 @@ -/* Copyright (C) 1988-2020 Free Software Foundation, Inc. - -This file is part of GCC. - -GCC is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 3, or (at your option) -any later version. - -GCC is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with GCC; see the file COPYING3. If not see -<http://www.gnu.org/licenses/>. */ - -#define IN_TARGET_CODE 1 - -#include "config.h" -#include "system.h" -#include "coretypes.h" -#include "backend.h" -#include "rtl.h" -#include "tree.h" -#include "memmodel.h" -#include "gimple.h" -#include "cfghooks.h" -#include "cfgloop.h" -#include "df.h" -#include "tm_p.h" -#include "stringpool.h" -#include "expmed.h" -#include "optabs.h" -#include "regs.h" -#include "emit-rtl.h" -#include "recog.h" -#include "cgraph.h" -#include "diagnostic.h" -#include "cfgbuild.h" -#include "alias.h" -#include "fold-const.h" -#include "attribs.h" -#include "calls.h" -#include "stor-layout.h" -#include "varasm.h" -#include "output.h" -#include "insn-attr.h" -#include "flags.h" -#include "except.h" -#include "explow.h" -#include "expr.h" -#include "cfgrtl.h" -#include "common/common-target.h" -#include "langhooks.h" -#include "reload.h" -#include "gimplify.h" -#include "dwarf2.h" -#include "tm-constrs.h" -#include "cselib.h" -#include "sched-int.h" -#include "opts.h" -#include "tree-pass.h" -#include "context.h" -#include "pass_manager.h" -#include "target-globals.h" -#include "gimple-iterator.h" -#include "tree-vectorizer.h" -#include "shrink-wrap.h" -#include "builtins.h" -#include "rtl-iter.h" -#include "tree-iterator.h" -#include "dbgcnt.h" -#include "case-cfn-macros.h" -#include "dojump.h" -#include "fold-const-call.h" -#include "tree-vrp.h" -#include "tree-ssanames.h" -#include "selftest.h" -#include "selftest-rtl.h" -#include "print-rtl.h" -#include "intl.h" -#include "ifcvt.h" -#include "symbol-summary.h" -#include "ipa-prop.h" -#include "ipa-fnsummary.h" -#include "wide-int-bitmask.h" -#include "tree-vector-builder.h" -#include "debug.h" -#include "dwarf2out.h" -#include "i386-options.h" - -#include "x86-tune-costs.h" - -#ifndef SUBTARGET32_DEFAULT_CPU -#define SUBTARGET32_DEFAULT_CPU "i386" -#endif - -/* Processor feature/optimization bitmasks. 
*/ -#define m_386 (HOST_WIDE_INT_1U<<PROCESSOR_I386) -#define m_486 (HOST_WIDE_INT_1U<<PROCESSOR_I486) -#define m_PENT (HOST_WIDE_INT_1U<<PROCESSOR_PENTIUM) -#define m_LAKEMONT (HOST_WIDE_INT_1U<<PROCESSOR_LAKEMONT) -#define m_PPRO (HOST_WIDE_INT_1U<<PROCESSOR_PENTIUMPRO) -#define m_PENT4 (HOST_WIDE_INT_1U<<PROCESSOR_PENTIUM4) -#define m_NOCONA (HOST_WIDE_INT_1U<<PROCESSOR_NOCONA) -#define m_P4_NOCONA (m_PENT4 | m_NOCONA) -#define m_CORE2 (HOST_WIDE_INT_1U<<PROCESSOR_CORE2) -#define m_NEHALEM (HOST_WIDE_INT_1U<<PROCESSOR_NEHALEM) -#define m_SANDYBRIDGE (HOST_WIDE_INT_1U<<PROCESSOR_SANDYBRIDGE) -#define m_HASWELL (HOST_WIDE_INT_1U<<PROCESSOR_HASWELL) -#define m_BONNELL (HOST_WIDE_INT_1U<<PROCESSOR_BONNELL) -#define m_SILVERMONT (HOST_WIDE_INT_1U<<PROCESSOR_SILVERMONT) -#define m_KNL (HOST_WIDE_INT_1U<<PROCESSOR_KNL) -#define m_KNM (HOST_WIDE_INT_1U<<PROCESSOR_KNM) -#define m_SKYLAKE (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE) -#define m_SKYLAKE_AVX512 (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE_AVX512) -#define m_CANNONLAKE (HOST_WIDE_INT_1U<<PROCESSOR_CANNONLAKE) -#define m_ICELAKE_CLIENT (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_CLIENT) -#define m_ICELAKE_SERVER (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_SERVER) -#define m_CASCADELAKE (HOST_WIDE_INT_1U<<PROCESSOR_CASCADELAKE) -#define m_TIGERLAKE (HOST_WIDE_INT_1U<<PROCESSOR_TIGERLAKE) -#define m_COOPERLAKE (HOST_WIDE_INT_1U<<PROCESSOR_COOPERLAKE) -#define m_CORE_AVX512 (m_SKYLAKE_AVX512 | m_CANNONLAKE \ - | m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_CASCADELAKE \ - | m_TIGERLAKE | m_COOPERLAKE) -#define m_CORE_AVX2 (m_HASWELL | m_SKYLAKE | m_CORE_AVX512) -#define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2) -#define m_GOLDMONT (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT) -#define m_GOLDMONT_PLUS (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT_PLUS) -#define m_TREMONT (HOST_WIDE_INT_1U<<PROCESSOR_TREMONT) -#define m_INTEL (HOST_WIDE_INT_1U<<PROCESSOR_INTEL) - -#define m_GEODE (HOST_WIDE_INT_1U<<PROCESSOR_GEODE) -#define m_K6 (HOST_WIDE_INT_1U<<PROCESSOR_K6) -#define m_K6_GEODE (m_K6 | m_GEODE) -#define m_K8 (HOST_WIDE_INT_1U<<PROCESSOR_K8) -#define m_ATHLON (HOST_WIDE_INT_1U<<PROCESSOR_ATHLON) -#define m_ATHLON_K8 (m_K8 | m_ATHLON) -#define m_AMDFAM10 (HOST_WIDE_INT_1U<<PROCESSOR_AMDFAM10) -#define m_BDVER1 (HOST_WIDE_INT_1U<<PROCESSOR_BDVER1) -#define m_BDVER2 (HOST_WIDE_INT_1U<<PROCESSOR_BDVER2) -#define m_BDVER3 (HOST_WIDE_INT_1U<<PROCESSOR_BDVER3) -#define m_BDVER4 (HOST_WIDE_INT_1U<<PROCESSOR_BDVER4) -#define m_ZNVER1 (HOST_WIDE_INT_1U<<PROCESSOR_ZNVER1) -#define m_ZNVER2 (HOST_WIDE_INT_1U<<PROCESSOR_ZNVER2) -#define m_BTVER1 (HOST_WIDE_INT_1U<<PROCESSOR_BTVER1) -#define m_BTVER2 (HOST_WIDE_INT_1U<<PROCESSOR_BTVER2) -#define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4) -#define m_BTVER (m_BTVER1 | m_BTVER2) -#define m_ZNVER (m_ZNVER1 | m_ZNVER2) -#define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \ - | m_ZNVER) - -#define m_GENERIC (HOST_WIDE_INT_1U<<PROCESSOR_GENERIC) - -const char* ix86_tune_feature_names[X86_TUNE_LAST] = { -#undef DEF_TUNE -#define DEF_TUNE(tune, name, selector) name, -#include "x86-tune.def" -#undef DEF_TUNE -}; - -/* Feature tests against the various tunings. */ -unsigned char ix86_tune_features[X86_TUNE_LAST]; - -/* Feature tests against the various tunings used to create ix86_tune_features - based on the processor mask. 
*/ -static unsigned HOST_WIDE_INT initial_ix86_tune_features[X86_TUNE_LAST] = { -#undef DEF_TUNE -#define DEF_TUNE(tune, name, selector) selector, -#include "x86-tune.def" -#undef DEF_TUNE -}; - -/* Feature tests against the various architecture variations. */ -unsigned char ix86_arch_features[X86_ARCH_LAST]; - -struct ix86_target_opts -{ - const char *option; /* option string */ - HOST_WIDE_INT mask; /* isa mask options */ -}; - -/* This table is ordered so that options like -msse4.2 that imply other - ISAs come first. Target string will be displayed in the same order. */ -static struct ix86_target_opts isa2_opts[] = -{ - { "-mcx16", OPTION_MASK_ISA2_CX16 }, - { "-mvaes", OPTION_MASK_ISA2_VAES }, - { "-mrdpid", OPTION_MASK_ISA2_RDPID }, - { "-mpconfig", OPTION_MASK_ISA2_PCONFIG }, - { "-mwbnoinvd", OPTION_MASK_ISA2_WBNOINVD }, - { "-mavx512vp2intersect", OPTION_MASK_ISA2_AVX512VP2INTERSECT }, - { "-msgx", OPTION_MASK_ISA2_SGX }, - { "-mavx5124vnniw", OPTION_MASK_ISA2_AVX5124VNNIW }, - { "-mavx5124fmaps", OPTION_MASK_ISA2_AVX5124FMAPS }, - { "-mhle", OPTION_MASK_ISA2_HLE }, - { "-mmovbe", OPTION_MASK_ISA2_MOVBE }, - { "-mclzero", OPTION_MASK_ISA2_CLZERO }, - { "-mmwaitx", OPTION_MASK_ISA2_MWAITX }, - { "-mmovdir64b", OPTION_MASK_ISA2_MOVDIR64B }, - { "-mwaitpkg", OPTION_MASK_ISA2_WAITPKG }, - { "-mcldemote", OPTION_MASK_ISA2_CLDEMOTE }, - { "-mptwrite", OPTION_MASK_ISA2_PTWRITE }, - { "-mavx512bf16", OPTION_MASK_ISA2_AVX512BF16 }, - { "-menqcmd", OPTION_MASK_ISA2_ENQCMD }, - { "-mserialize", OPTION_MASK_ISA2_SERIALIZE }, - { "-mtsxldtrk", OPTION_MASK_ISA2_TSXLDTRK } -}; -static struct ix86_target_opts isa_opts[] = -{ - { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ }, - { "-mavx512bitalg", OPTION_MASK_ISA_AVX512BITALG }, - { "-mvpclmulqdq", OPTION_MASK_ISA_VPCLMULQDQ }, - { "-mgfni", OPTION_MASK_ISA_GFNI }, - { "-mavx512vnni", OPTION_MASK_ISA_AVX512VNNI }, - { "-mavx512vbmi2", OPTION_MASK_ISA_AVX512VBMI2 }, - { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI }, - { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA }, - { "-mavx512vl", OPTION_MASK_ISA_AVX512VL }, - { "-mavx512bw", OPTION_MASK_ISA_AVX512BW }, - { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ }, - { "-mavx512er", OPTION_MASK_ISA_AVX512ER }, - { "-mavx512pf", OPTION_MASK_ISA_AVX512PF }, - { "-mavx512cd", OPTION_MASK_ISA_AVX512CD }, - { "-mavx512f", OPTION_MASK_ISA_AVX512F }, - { "-mavx2", OPTION_MASK_ISA_AVX2 }, - { "-mfma", OPTION_MASK_ISA_FMA }, - { "-mxop", OPTION_MASK_ISA_XOP }, - { "-mfma4", OPTION_MASK_ISA_FMA4 }, - { "-mf16c", OPTION_MASK_ISA_F16C }, - { "-mavx", OPTION_MASK_ISA_AVX }, -/*{ "-msse4" OPTION_MASK_ISA_SSE4 }, */ - { "-msse4.2", OPTION_MASK_ISA_SSE4_2 }, - { "-msse4.1", OPTION_MASK_ISA_SSE4_1 }, - { "-msse4a", OPTION_MASK_ISA_SSE4A }, - { "-mssse3", OPTION_MASK_ISA_SSSE3 }, - { "-msse3", OPTION_MASK_ISA_SSE3 }, - { "-maes", OPTION_MASK_ISA_AES }, - { "-msha", OPTION_MASK_ISA_SHA }, - { "-mpclmul", OPTION_MASK_ISA_PCLMUL }, - { "-msse2", OPTION_MASK_ISA_SSE2 }, - { "-msse", OPTION_MASK_ISA_SSE }, - { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A }, - { "-m3dnow", OPTION_MASK_ISA_3DNOW }, - { "-mmmx", OPTION_MASK_ISA_MMX }, - { "-mrtm", OPTION_MASK_ISA_RTM }, - { "-mprfchw", OPTION_MASK_ISA_PRFCHW }, - { "-mrdseed", OPTION_MASK_ISA_RDSEED }, - { "-madx", OPTION_MASK_ISA_ADX }, - { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 }, - { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT }, - { "-mxsaves", OPTION_MASK_ISA_XSAVES }, - { "-mxsavec", OPTION_MASK_ISA_XSAVEC }, - { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT }, - { "-mxsave", 
OPTION_MASK_ISA_XSAVE }, - { "-mabm", OPTION_MASK_ISA_ABM }, - { "-mbmi", OPTION_MASK_ISA_BMI }, - { "-mbmi2", OPTION_MASK_ISA_BMI2 }, - { "-mlzcnt", OPTION_MASK_ISA_LZCNT }, - { "-mtbm", OPTION_MASK_ISA_TBM }, - { "-mpopcnt", OPTION_MASK_ISA_POPCNT }, - { "-msahf", OPTION_MASK_ISA_SAHF }, - { "-mcrc32", OPTION_MASK_ISA_CRC32 }, - { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE }, - { "-mrdrnd", OPTION_MASK_ISA_RDRND }, - { "-mpku", OPTION_MASK_ISA_PKU }, - { "-mlwp", OPTION_MASK_ISA_LWP }, - { "-mfxsr", OPTION_MASK_ISA_FXSR }, - { "-mclwb", OPTION_MASK_ISA_CLWB }, - { "-mshstk", OPTION_MASK_ISA_SHSTK }, - { "-mmovdiri", OPTION_MASK_ISA_MOVDIRI } -}; - -/* Return 1 if TRAIT NAME is present in the OpenMP context's - device trait set, return 0 if not present in any OpenMP context in the - whole translation unit, or -1 if not present in the current OpenMP context - but might be present in another OpenMP context in the same TU. */ - -int -ix86_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait, - const char *name) -{ - switch (trait) - { - case omp_device_kind: - return strcmp (name, "cpu") == 0; - case omp_device_arch: - if (strcmp (name, "x86") == 0) - return 1; - if (TARGET_64BIT) - { - if (TARGET_X32) - return strcmp (name, "x32") == 0; - else - return strcmp (name, "x86_64") == 0; - } - if (strcmp (name, "ia32") == 0 || strcmp (name, "i386") == 0) - return 1; - if (strcmp (name, "i486") == 0) - return ix86_arch != PROCESSOR_I386 ? 1 : -1; - if (strcmp (name, "i586") == 0) - return (ix86_arch != PROCESSOR_I386 - && ix86_arch != PROCESSOR_I486) ? 1 : -1; - if (strcmp (name, "i686") == 0) - return (ix86_arch != PROCESSOR_I386 - && ix86_arch != PROCESSOR_I486 - && ix86_arch != PROCESSOR_LAKEMONT - && ix86_arch != PROCESSOR_PENTIUM) ? 1 : -1; - return 0; - case omp_device_isa: - for (int i = 0; i < 2; i++) - { - struct ix86_target_opts *opts = i ? isa2_opts : isa_opts; - size_t nopts = i ? ARRAY_SIZE (isa2_opts) : ARRAY_SIZE (isa_opts); - HOST_WIDE_INT mask = i ? ix86_isa_flags2 : ix86_isa_flags; - for (size_t n = 0; n < nopts; n++) - { - /* Handle sse4 as an alias to sse4.2. */ - if (opts[n].mask == OPTION_MASK_ISA_SSE4_2) - { - if (strcmp (name, "sse4") == 0) - return (mask & opts[n].mask) != 0 ? 1 : -1; - } - if (strcmp (name, opts[n].option + 2) == 0) - return (mask & opts[n].mask) != 0 ? 1 : -1; - } - } - return 0; - default: - gcc_unreachable (); - } -} - -/* Return a string that documents the current -m options. The caller is - responsible for freeing the string. */ - -char * -ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2, - int flags, int flags2, - const char *arch, const char *tune, - enum fpmath_unit fpmath, - enum prefer_vector_width pvw, - bool add_nl_p, bool add_abi_p) -{ - /* Flag options. 
*/ - static struct ix86_target_opts flag_opts[] = - { - { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE }, - { "-mlong-double-128", MASK_LONG_DOUBLE_128 }, - { "-mlong-double-64", MASK_LONG_DOUBLE_64 }, - { "-m80387", MASK_80387 }, - { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS }, - { "-malign-double", MASK_ALIGN_DOUBLE }, - { "-mcld", MASK_CLD }, - { "-mfp-ret-in-387", MASK_FLOAT_RETURNS }, - { "-mieee-fp", MASK_IEEE_FP }, - { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS }, - { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY }, - { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT }, - { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS }, - { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 }, - { "-mno-push-args", MASK_NO_PUSH_ARGS }, - { "-mno-red-zone", MASK_NO_RED_ZONE }, - { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER }, - { "-mrecip", MASK_RECIP }, - { "-mrtd", MASK_RTD }, - { "-msseregparm", MASK_SSEREGPARM }, - { "-mstack-arg-probe", MASK_STACK_PROBE }, - { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS }, - { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS }, - { "-m8bit-idiv", MASK_USE_8BIT_IDIV }, - { "-mvzeroupper", MASK_VZEROUPPER }, - { "-mstv", MASK_STV }, - { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD }, - { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE }, - { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES } - }; - - /* Additional flag options. */ - static struct ix86_target_opts flag2_opts[] = - { - { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY } - }; - - const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts) - + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2]; - - char isa_other[40]; - char isa2_other[40]; - char flags_other[40]; - char flags2_other[40]; - unsigned num = 0; - unsigned i, j; - char *ret; - char *ptr; - size_t len; - size_t line_len; - size_t sep_len; - const char *abi; - - memset (opts, '\0', sizeof (opts)); - - /* Add -march= option. */ - if (arch) - { - opts[num][0] = "-march="; - opts[num++][1] = arch; - } - - /* Add -mtune= option. */ - if (tune) - { - opts[num][0] = "-mtune="; - opts[num++][1] = tune; - } - - /* Add -m32/-m64/-mx32. */ - if (add_abi_p) - { - if ((isa & OPTION_MASK_ISA_64BIT) != 0) - { - if ((isa & OPTION_MASK_ABI_64) != 0) - abi = "-m64"; - else - abi = "-mx32"; - } - else - abi = "-m32"; - opts[num++][0] = abi; - } - isa &= ~(OPTION_MASK_ISA_64BIT | OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32); - - /* Pick out the options in isa2 options. */ - for (i = 0; i < ARRAY_SIZE (isa2_opts); i++) - { - if ((isa2 & isa2_opts[i].mask) != 0) - { - opts[num++][0] = isa2_opts[i].option; - isa2 &= ~ isa2_opts[i].mask; - } - } - - if (isa2 && add_nl_p) - { - opts[num++][0] = isa2_other; - sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2); - } - - /* Pick out the options in isa options. */ - for (i = 0; i < ARRAY_SIZE (isa_opts); i++) - { - if ((isa & isa_opts[i].mask) != 0) - { - opts[num++][0] = isa_opts[i].option; - isa &= ~ isa_opts[i].mask; - } - } - - if (isa && add_nl_p) - { - opts[num++][0] = isa_other; - sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa); - } - - /* Add flag options. 
*/ - for (i = 0; i < ARRAY_SIZE (flag_opts); i++) - { - if ((flags & flag_opts[i].mask) != 0) - { - opts[num++][0] = flag_opts[i].option; - flags &= ~ flag_opts[i].mask; - } - } - - if (flags && add_nl_p) - { - opts[num++][0] = flags_other; - sprintf (flags_other, "(other flags: %#x)", flags); - } - - /* Add additional flag options. */ - for (i = 0; i < ARRAY_SIZE (flag2_opts); i++) - { - if ((flags2 & flag2_opts[i].mask) != 0) - { - opts[num++][0] = flag2_opts[i].option; - flags2 &= ~ flag2_opts[i].mask; - } - } - - if (flags2 && add_nl_p) - { - opts[num++][0] = flags2_other; - sprintf (flags2_other, "(other flags2: %#x)", flags2); - } - - /* Add -mfpmath= option. */ - if (fpmath) - { - opts[num][0] = "-mfpmath="; - switch ((int) fpmath) - { - case FPMATH_387: - opts[num++][1] = "387"; - break; - - case FPMATH_SSE: - opts[num++][1] = "sse"; - break; - - case FPMATH_387 | FPMATH_SSE: - opts[num++][1] = "sse+387"; - break; - - default: - gcc_unreachable (); - } - } - - /* Add -mprefer-vector-width= option. */ - if (pvw) - { - opts[num][0] = "-mprefer-vector-width="; - switch ((int) pvw) - { - case PVW_AVX128: - opts[num++][1] = "128"; - break; - - case PVW_AVX256: - opts[num++][1] = "256"; - break; - - case PVW_AVX512: - opts[num++][1] = "512"; - break; - - default: - gcc_unreachable (); - } - } - - /* Any options? */ - if (num == 0) - return NULL; - - gcc_assert (num < ARRAY_SIZE (opts)); - - /* Size the string. */ - len = 0; - sep_len = (add_nl_p) ? 3 : 1; - for (i = 0; i < num; i++) - { - len += sep_len; - for (j = 0; j < 2; j++) - if (opts[i][j]) - len += strlen (opts[i][j]); - } - - /* Build the string. */ - ret = ptr = (char *) xmalloc (len); - line_len = 0; - - for (i = 0; i < num; i++) - { - size_t len2[2]; - - for (j = 0; j < 2; j++) - len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0; - - if (i != 0) - { - *ptr++ = ' '; - line_len++; - - if (add_nl_p && line_len + len2[0] + len2[1] > 70) - { - *ptr++ = '\\'; - *ptr++ = '\n'; - line_len = 0; - } - } - - for (j = 0; j < 2; j++) - if (opts[i][j]) - { - memcpy (ptr, opts[i][j], len2[j]); - ptr += len2[j]; - line_len += len2[j]; - } - } - - *ptr = '\0'; - gcc_assert (ret + len >= ptr); - - return ret; -} - -/* Function that is callable from the debugger to print the current - options. 
*/ -void ATTRIBUTE_UNUSED -ix86_debug_options (void) -{ - char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2, - target_flags, ix86_target_flags, - ix86_arch_string, ix86_tune_string, - ix86_fpmath, prefer_vector_width_type, - true, true); - - if (opts) - { - fprintf (stderr, "%s\n\n", opts); - free (opts); - } - else - fputs ("<no options>\n\n", stderr); - - return; -} - -/* Save the current options */ - -void -ix86_function_specific_save (struct cl_target_option *ptr, - struct gcc_options *opts) -{ - ptr->arch = ix86_arch; - ptr->schedule = ix86_schedule; - ptr->prefetch_sse = x86_prefetch_sse; - ptr->tune = ix86_tune; - ptr->branch_cost = ix86_branch_cost; - ptr->tune_defaulted = ix86_tune_defaulted; - ptr->arch_specified = ix86_arch_specified; - ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit; - ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit; - ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit; - ptr->x_ix86_arch_string = opts->x_ix86_arch_string; - ptr->x_ix86_tune_string = opts->x_ix86_tune_string; - ptr->x_ix86_cmodel = opts->x_ix86_cmodel; - ptr->x_ix86_abi = opts->x_ix86_abi; - ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect; - ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost; - ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes; - ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer; - ptr->x_ix86_force_drap = opts->x_ix86_force_drap; - ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg; - ptr->x_ix86_pmode = opts->x_ix86_pmode; - ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg; - ptr->x_ix86_recip_name = opts->x_ix86_recip_name; - ptr->x_ix86_regparm = opts->x_ix86_regparm; - ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold; - ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx; - ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard; - ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg; - ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect; - ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string; - ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy; - ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy; - ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default; - ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type; - - /* The fields are char but the variables are not; make sure the - values fit in the fields. */ - gcc_assert (ptr->arch == ix86_arch); - gcc_assert (ptr->schedule == ix86_schedule); - gcc_assert (ptr->tune == ix86_tune); - gcc_assert (ptr->branch_cost == ix86_branch_cost); -} - -/* Feature tests against the various architecture variations, used to create - ix86_arch_features based on the processor mask. */ -static unsigned HOST_WIDE_INT initial_ix86_arch_features[X86_ARCH_LAST] = { - /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */ - ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6), - - /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */ - ~m_386, - - /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */ - ~(m_386 | m_486), - - /* X86_ARCH_XADD: Exchange and add was added for 80486. */ - ~m_386, - - /* X86_ARCH_BSWAP: Byteswap was added for 80486. */ - ~m_386, -}; - -/* This table must be in sync with enum processor_type in i386.h. 
*/ -static const struct processor_costs *processor_cost_table[] = -{ - &generic_cost, - &i386_cost, - &i486_cost, - &pentium_cost, - &lakemont_cost, - &pentiumpro_cost, - &pentium4_cost, - &nocona_cost, - &core_cost, - &core_cost, - &core_cost, - &core_cost, - &atom_cost, - &slm_cost, - &slm_cost, - &slm_cost, - &slm_cost, - &slm_cost, - &slm_cost, - &skylake_cost, - &skylake_cost, - &skylake_cost, - &skylake_cost, - &skylake_cost, - &skylake_cost, - &skylake_cost, - &skylake_cost, - &intel_cost, - &geode_cost, - &k6_cost, - &athlon_cost, - &k8_cost, - &amdfam10_cost, - &bdver_cost, - &bdver_cost, - &bdver_cost, - &bdver_cost, - &btver1_cost, - &btver2_cost, - &znver1_cost, - &znver2_cost -}; - -/* Guarantee that the array is aligned with enum processor_type. */ -STATIC_ASSERT (ARRAY_SIZE (processor_cost_table) == PROCESSOR_max); - -static bool -ix86_option_override_internal (bool main_args_p, - struct gcc_options *opts, - struct gcc_options *opts_set); -static void -set_ix86_tune_features (struct gcc_options *opts, - enum processor_type ix86_tune, bool dump); - -/* Restore the current options */ - -void -ix86_function_specific_restore (struct gcc_options *opts, - struct cl_target_option *ptr) -{ - enum processor_type old_tune = ix86_tune; - enum processor_type old_arch = ix86_arch; - unsigned HOST_WIDE_INT ix86_arch_mask; - int i; - - /* We don't change -fPIC. */ - opts->x_flag_pic = flag_pic; - - ix86_arch = (enum processor_type) ptr->arch; - ix86_schedule = (enum attr_cpu) ptr->schedule; - ix86_tune = (enum processor_type) ptr->tune; - x86_prefetch_sse = ptr->prefetch_sse; - opts->x_ix86_branch_cost = ptr->branch_cost; - ix86_tune_defaulted = ptr->tune_defaulted; - ix86_arch_specified = ptr->arch_specified; - opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit; - opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit; - opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit; - opts->x_ix86_arch_string = ptr->x_ix86_arch_string; - opts->x_ix86_tune_string = ptr->x_ix86_tune_string; - opts->x_ix86_cmodel = ptr->x_ix86_cmodel; - opts->x_ix86_abi = ptr->x_ix86_abi; - opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect; - opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost; - opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes; - opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer; - opts->x_ix86_force_drap = ptr->x_ix86_force_drap; - opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg; - opts->x_ix86_pmode = ptr->x_ix86_pmode; - opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg; - opts->x_ix86_recip_name = ptr->x_ix86_recip_name; - opts->x_ix86_regparm = ptr->x_ix86_regparm; - opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold; - opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx; - opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard; - opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg; - opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect; - opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string; - opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy; - opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy; - opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default; - opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type; - ix86_tune_cost = processor_cost_table[ix86_tune]; - /* TODO: ix86_cost should be chosen at instruction or function granuality - so for cold code we use size_cost even in !optimize_size 
compilation. */ - if (opts->x_optimize_size) - ix86_cost = &ix86_size_cost; - else - ix86_cost = ix86_tune_cost; - - /* Recreate the arch feature tests if the arch changed */ - if (old_arch != ix86_arch) - { - ix86_arch_mask = HOST_WIDE_INT_1U << ix86_arch; - for (i = 0; i < X86_ARCH_LAST; ++i) - ix86_arch_features[i] - = !!(initial_ix86_arch_features[i] & ix86_arch_mask); - } - - /* Recreate the tune optimization tests */ - if (old_tune != ix86_tune) - set_ix86_tune_features (opts, ix86_tune, false); -} - -/* Adjust target options after streaming them in. This is mainly about - reconciling them with global options. */ - -void -ix86_function_specific_post_stream_in (struct cl_target_option *ptr) -{ - /* flag_pic is a global option, but ix86_cmodel is target saved option - partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel - for PIC, or error out. */ - if (flag_pic) - switch (ptr->x_ix86_cmodel) - { - case CM_SMALL: - ptr->x_ix86_cmodel = CM_SMALL_PIC; - break; - - case CM_MEDIUM: - ptr->x_ix86_cmodel = CM_MEDIUM_PIC; - break; - - case CM_LARGE: - ptr->x_ix86_cmodel = CM_LARGE_PIC; - break; - - case CM_KERNEL: - error ("code model %s does not support PIC mode", "kernel"); - break; - - default: - break; - } - else - switch (ptr->x_ix86_cmodel) - { - case CM_SMALL_PIC: - ptr->x_ix86_cmodel = CM_SMALL; - break; - - case CM_MEDIUM_PIC: - ptr->x_ix86_cmodel = CM_MEDIUM; - break; - - case CM_LARGE_PIC: - ptr->x_ix86_cmodel = CM_LARGE; - break; - - default: - break; - } -} - -/* Print the current options */ - -void -ix86_function_specific_print (FILE *file, int indent, - struct cl_target_option *ptr) -{ - char *target_string - = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2, - ptr->x_target_flags, ptr->x_ix86_target_flags, - NULL, NULL, ptr->x_ix86_fpmath, - ptr->x_prefer_vector_width_type, false, true); - - gcc_assert (ptr->arch < PROCESSOR_max); - fprintf (file, "%*sarch = %d (%s)\n", - indent, "", - ptr->arch, processor_names[ptr->arch]); - - gcc_assert (ptr->tune < PROCESSOR_max); - fprintf (file, "%*stune = %d (%s)\n", - indent, "", - ptr->tune, processor_names[ptr->tune]); - - fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost); - - if (target_string) - { - fprintf (file, "%*s%s\n", indent, "", target_string); - free (target_string); - } -} - - -/* Inner function to process the attribute((target(...))), take an argument and - set the current options from the argument. If we have a list, recursively go - over the list. 
*/ - -static bool -ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], - struct gcc_options *opts, - struct gcc_options *opts_set, - struct gcc_options *enum_opts_set, - bool target_clone_attr) -{ - char *next_optstr; - bool ret = true; - -#define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 } -#define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 } -#define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 } -#define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M } -#define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M } - - enum ix86_opt_type - { - ix86_opt_unknown, - ix86_opt_yes, - ix86_opt_no, - ix86_opt_str, - ix86_opt_enum, - ix86_opt_isa - }; - - static const struct - { - const char *string; - size_t len; - enum ix86_opt_type type; - int opt; - int mask; - } attrs[] = { - /* isa options */ - IX86_ATTR_ISA ("pconfig", OPT_mpconfig), - IX86_ATTR_ISA ("wbnoinvd", OPT_mwbnoinvd), - IX86_ATTR_ISA ("sgx", OPT_msgx), - IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps), - IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw), - IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq), - IX86_ATTR_ISA ("avx512vbmi2", OPT_mavx512vbmi2), - IX86_ATTR_ISA ("avx512vnni", OPT_mavx512vnni), - IX86_ATTR_ISA ("avx512bitalg", OPT_mavx512bitalg), - IX86_ATTR_ISA ("avx512vp2intersect", OPT_mavx512vp2intersect), - - IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi), - IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma), - IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl), - IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw), - IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq), - IX86_ATTR_ISA ("avx512er", OPT_mavx512er), - IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf), - IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd), - IX86_ATTR_ISA ("avx512f", OPT_mavx512f), - IX86_ATTR_ISA ("avx2", OPT_mavx2), - IX86_ATTR_ISA ("fma", OPT_mfma), - IX86_ATTR_ISA ("xop", OPT_mxop), - IX86_ATTR_ISA ("fma4", OPT_mfma4), - IX86_ATTR_ISA ("f16c", OPT_mf16c), - IX86_ATTR_ISA ("avx", OPT_mavx), - IX86_ATTR_ISA ("sse4", OPT_msse4), - IX86_ATTR_ISA ("sse4.2", OPT_msse4_2), - IX86_ATTR_ISA ("sse4.1", OPT_msse4_1), - IX86_ATTR_ISA ("sse4a", OPT_msse4a), - IX86_ATTR_ISA ("ssse3", OPT_mssse3), - IX86_ATTR_ISA ("sse3", OPT_msse3), - IX86_ATTR_ISA ("aes", OPT_maes), - IX86_ATTR_ISA ("sha", OPT_msha), - IX86_ATTR_ISA ("pclmul", OPT_mpclmul), - IX86_ATTR_ISA ("sse2", OPT_msse2), - IX86_ATTR_ISA ("sse", OPT_msse), - IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa), - IX86_ATTR_ISA ("3dnow", OPT_m3dnow), - IX86_ATTR_ISA ("mmx", OPT_mmmx), - IX86_ATTR_ISA ("rtm", OPT_mrtm), - IX86_ATTR_ISA ("prfchw", OPT_mprfchw), - IX86_ATTR_ISA ("rdseed", OPT_mrdseed), - IX86_ATTR_ISA ("adx", OPT_madx), - IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1), - IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt), - IX86_ATTR_ISA ("xsaves", OPT_mxsaves), - IX86_ATTR_ISA ("xsavec", OPT_mxsavec), - IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt), - IX86_ATTR_ISA ("xsave", OPT_mxsave), - IX86_ATTR_ISA ("abm", OPT_mabm), - IX86_ATTR_ISA ("bmi", OPT_mbmi), - IX86_ATTR_ISA ("bmi2", OPT_mbmi2), - IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt), - IX86_ATTR_ISA ("tbm", OPT_mtbm), - IX86_ATTR_ISA ("popcnt", OPT_mpopcnt), - IX86_ATTR_ISA ("cx16", OPT_mcx16), - IX86_ATTR_ISA ("sahf", OPT_msahf), - IX86_ATTR_ISA ("movbe", OPT_mmovbe), - IX86_ATTR_ISA ("crc32", OPT_mcrc32), - IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase), - IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd), - IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx), - IX86_ATTR_ISA ("clzero", OPT_mclzero), - IX86_ATTR_ISA ("pku", OPT_mpku), - 
IX86_ATTR_ISA ("lwp", OPT_mlwp), - IX86_ATTR_ISA ("hle", OPT_mhle), - IX86_ATTR_ISA ("fxsr", OPT_mfxsr), - IX86_ATTR_ISA ("clwb", OPT_mclwb), - IX86_ATTR_ISA ("rdpid", OPT_mrdpid), - IX86_ATTR_ISA ("gfni", OPT_mgfni), - IX86_ATTR_ISA ("shstk", OPT_mshstk), - IX86_ATTR_ISA ("vaes", OPT_mvaes), - IX86_ATTR_ISA ("vpclmulqdq", OPT_mvpclmulqdq), - IX86_ATTR_ISA ("movdiri", OPT_mmovdiri), - IX86_ATTR_ISA ("movdir64b", OPT_mmovdir64b), - IX86_ATTR_ISA ("waitpkg", OPT_mwaitpkg), - IX86_ATTR_ISA ("cldemote", OPT_mcldemote), - IX86_ATTR_ISA ("ptwrite", OPT_mptwrite), - IX86_ATTR_ISA ("avx512bf16", OPT_mavx512bf16), - IX86_ATTR_ISA ("enqcmd", OPT_menqcmd), - IX86_ATTR_ISA ("serialize", OPT_mserialize), - IX86_ATTR_ISA ("tsxldtrk", OPT_mtsxldtrk), - - /* enum options */ - IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_), - IX86_ATTR_ENUM ("prefer-vector-width=", OPT_mprefer_vector_width_), - - /* string options */ - IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH), - IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE), - - /* flag options */ - IX86_ATTR_YES ("cld", - OPT_mcld, - MASK_CLD), - - IX86_ATTR_NO ("fancy-math-387", - OPT_mfancy_math_387, - MASK_NO_FANCY_MATH_387), - - IX86_ATTR_YES ("ieee-fp", - OPT_mieee_fp, - MASK_IEEE_FP), - - IX86_ATTR_YES ("inline-all-stringops", - OPT_minline_all_stringops, - MASK_INLINE_ALL_STRINGOPS), - - IX86_ATTR_YES ("inline-stringops-dynamically", - OPT_minline_stringops_dynamically, - MASK_INLINE_STRINGOPS_DYNAMICALLY), - - IX86_ATTR_NO ("align-stringops", - OPT_mno_align_stringops, - MASK_NO_ALIGN_STRINGOPS), - - IX86_ATTR_YES ("recip", - OPT_mrecip, - MASK_RECIP), - }; - - location_t loc - = fndecl == NULL ? UNKNOWN_LOCATION : DECL_SOURCE_LOCATION (fndecl); - const char *attr_name = target_clone_attr ? "target_clone" : "target"; - - /* If this is a list, recurse to get the options. */ - if (TREE_CODE (args) == TREE_LIST) - { - bool ret = true; - - for (; args; args = TREE_CHAIN (args)) - if (TREE_VALUE (args) - && !ix86_valid_target_attribute_inner_p (fndecl, TREE_VALUE (args), - p_strings, opts, opts_set, - enum_opts_set, - target_clone_attr)) - ret = false; - - return ret; - } - - else if (TREE_CODE (args) != STRING_CST) - { - error_at (loc, "attribute %qs argument is not a string", attr_name); - return false; - } - - /* Handle multiple arguments separated by commas. */ - next_optstr = ASTRDUP (TREE_STRING_POINTER (args)); - - while (next_optstr && *next_optstr != '\0') - { - char *p = next_optstr; - char *orig_p = p; - char *comma = strchr (next_optstr, ','); - size_t len, opt_len; - int opt; - bool opt_set_p; - char ch; - unsigned i; - enum ix86_opt_type type = ix86_opt_unknown; - int mask = 0; - - if (comma) - { - *comma = '\0'; - len = comma - next_optstr; - next_optstr = comma + 1; - } - else - { - len = strlen (p); - next_optstr = NULL; - } - - /* Recognize no-xxx. */ - if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-') - { - opt_set_p = false; - p += 3; - len -= 3; - } - else - opt_set_p = true; - - /* Find the option. */ - ch = *p; - opt = N_OPTS; - for (i = 0; i < ARRAY_SIZE (attrs); i++) - { - type = attrs[i].type; - opt_len = attrs[i].len; - if (ch == attrs[i].string[0] - && ((type != ix86_opt_str && type != ix86_opt_enum) - ? len == opt_len - : len > opt_len) - && memcmp (p, attrs[i].string, opt_len) == 0) - { - opt = attrs[i].opt; - mask = attrs[i].mask; - break; - } - } - - /* Process the option. 
*/ - if (opt == N_OPTS) - { - error_at (loc, "attribute %qs argument %qs is unknown", - orig_p, attr_name); - ret = false; - } - - else if (type == ix86_opt_isa) - { - struct cl_decoded_option decoded; - - generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded); - ix86_handle_option (opts, opts_set, - &decoded, input_location); - } - - else if (type == ix86_opt_yes || type == ix86_opt_no) - { - if (type == ix86_opt_no) - opt_set_p = !opt_set_p; - - if (opt_set_p) - opts->x_target_flags |= mask; - else - opts->x_target_flags &= ~mask; - } - - else if (type == ix86_opt_str) - { - if (p_strings[opt]) - { - error_at (loc, "attribute value %qs was already specified " - "in %qs attribute", orig_p, attr_name); - ret = false; - } - else - { - p_strings[opt] = xstrdup (p + opt_len); - if (opt == IX86_FUNCTION_SPECIFIC_ARCH) - { - /* If arch= is set, clear all bits in x_ix86_isa_flags, - except for ISA_64BIT, ABI_64, ABI_X32, and CODE16 - and all bits in x_ix86_isa_flags2. */ - opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT - | OPTION_MASK_ABI_64 - | OPTION_MASK_ABI_X32 - | OPTION_MASK_CODE16); - opts->x_ix86_isa_flags_explicit &= (OPTION_MASK_ISA_64BIT - | OPTION_MASK_ABI_64 - | OPTION_MASK_ABI_X32 - | OPTION_MASK_CODE16); - opts->x_ix86_isa_flags2 = 0; - opts->x_ix86_isa_flags2_explicit = 0; - } - } - } - - else if (type == ix86_opt_enum) - { - bool arg_ok; - int value; - - arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET); - if (arg_ok) - set_option (opts, enum_opts_set, opt, value, - p + opt_len, DK_UNSPECIFIED, input_location, - global_dc); - else - { - error_at (loc, "attribute value %qs is unknown in %qs attribute", - orig_p, attr_name); - ret = false; - } - } - - else - gcc_unreachable (); - } - - return ret; -} - -/* Release allocated strings. */ -static void -release_options_strings (char **option_strings) -{ - /* Free up memory allocated to hold the strings */ - for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++) - free (option_strings[i]); -} - -/* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */ - -tree -ix86_valid_target_attribute_tree (tree fndecl, tree args, - struct gcc_options *opts, - struct gcc_options *opts_set, - bool target_clone_attr) -{ - const char *orig_arch_string = opts->x_ix86_arch_string; - const char *orig_tune_string = opts->x_ix86_tune_string; - enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath; - enum prefer_vector_width orig_pvw_set = opts_set->x_prefer_vector_width_type; - int orig_tune_defaulted = ix86_tune_defaulted; - int orig_arch_specified = ix86_arch_specified; - char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL }; - tree t = NULL_TREE; - struct cl_target_option *def - = TREE_TARGET_OPTION (target_option_default_node); - struct gcc_options enum_opts_set; - - memset (&enum_opts_set, 0, sizeof (enum_opts_set)); - - /* Process each of the options on the chain. */ - if (!ix86_valid_target_attribute_inner_p (fndecl, args, option_strings, opts, - opts_set, &enum_opts_set, - target_clone_attr)) - return error_mark_node; - - /* If the changed options are different from the default, rerun - ix86_option_override_internal, and then save the options away. - The string options are attribute options, and will be undone - when we copy the save structure. 
*/ - if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags - || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2 - || opts->x_target_flags != def->x_target_flags - || option_strings[IX86_FUNCTION_SPECIFIC_ARCH] - || option_strings[IX86_FUNCTION_SPECIFIC_TUNE] - || enum_opts_set.x_ix86_fpmath - || enum_opts_set.x_prefer_vector_width_type) - { - /* If we are using the default tune= or arch=, undo the string assigned, - and use the default. */ - if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]) - opts->x_ix86_arch_string - = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]); - else if (!orig_arch_specified) - opts->x_ix86_arch_string = NULL; - - if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]) - opts->x_ix86_tune_string - = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]); - else if (orig_tune_defaulted) - opts->x_ix86_tune_string = NULL; - - /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */ - if (enum_opts_set.x_ix86_fpmath) - opts_set->x_ix86_fpmath = (enum fpmath_unit) 1; - if (enum_opts_set.x_prefer_vector_width_type) - opts_set->x_prefer_vector_width_type = (enum prefer_vector_width) 1; - - /* Do any overrides, such as arch=xxx, or tune=xxx support. */ - bool r = ix86_option_override_internal (false, opts, opts_set); - if (!r) - { - release_options_strings (option_strings); - return error_mark_node; - } - - /* Add any builtin functions with the new isa if any. */ - ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2); - - /* Save the current options unless we are validating options for - #pragma. */ - t = build_target_option_node (opts); - - opts->x_ix86_arch_string = orig_arch_string; - opts->x_ix86_tune_string = orig_tune_string; - opts_set->x_ix86_fpmath = orig_fpmath_set; - opts_set->x_prefer_vector_width_type = orig_pvw_set; - - release_options_strings (option_strings); - } - - return t; -} - -/* Hook to validate attribute((target("string"))). */ - -bool -ix86_valid_target_attribute_p (tree fndecl, - tree ARG_UNUSED (name), - tree args, - int flags) -{ - struct gcc_options func_options; - tree new_target, new_optimize; - bool ret = true; - - /* attribute((target("default"))) does nothing, beyond - affecting multi-versioning. */ - if (TREE_VALUE (args) - && TREE_CODE (TREE_VALUE (args)) == STRING_CST - && TREE_CHAIN (args) == NULL_TREE - && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0) - return true; - - tree old_optimize = build_optimization_node (&global_options); - - /* Get the optimization options of the current function. */ - tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl); - - if (!func_optimize) - func_optimize = old_optimize; - - /* Init func_options. */ - memset (&func_options, 0, sizeof (func_options)); - init_options_struct (&func_options, NULL); - lang_hooks.init_options_struct (&func_options); - - cl_optimization_restore (&func_options, - TREE_OPTIMIZATION (func_optimize)); - - /* Initialize func_options to the default before its target options can - be set. */ - cl_target_option_restore (&func_options, - TREE_TARGET_OPTION (target_option_default_node)); - - /* FLAGS == 1 is used for target_clones attribute. 
*/ - new_target - = ix86_valid_target_attribute_tree (fndecl, args, &func_options, - &global_options_set, flags == 1); - - new_optimize = build_optimization_node (&func_options); - - if (new_target == error_mark_node) - ret = false; - - else if (fndecl && new_target) - { - DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target; - - if (old_optimize != new_optimize) - DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize; - } - - return ret; -} - -const char *stringop_alg_names[] = { -#define DEF_ENUM -#define DEF_ALG(alg, name) #name, -#include "stringop.def" -#undef DEF_ENUM -#undef DEF_ALG -}; - -/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=. - The string is of the following form (or comma separated list of it): - - strategy_alg:max_size:[align|noalign] - - where the full size range for the strategy is either [0, max_size] or - [min_size, max_size], in which min_size is the max_size + 1 of the - preceding range. The last size range must have max_size == -1. - - Examples: - - 1. - -mmemcpy-strategy=libcall:-1:noalign - - this is equivalent to (for known size memcpy) -mstringop-strategy=libcall - - - 2. - -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign - - This is to tell the compiler to use the following strategy for memset - 1) when the expected size is between [1, 16], use rep_8byte strategy; - 2) when the size is between [17, 2048], use vector_loop; - 3) when the size is > 2048, use libcall. */ - -struct stringop_size_range -{ - int max; - stringop_alg alg; - bool noalign; -}; - -static void -ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset) -{ - const struct stringop_algs *default_algs; - stringop_size_range input_ranges[MAX_STRINGOP_ALGS]; - char *curr_range_str, *next_range_str; - const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="; - int i = 0, n = 0; - - if (is_memset) - default_algs = &ix86_cost->memset[TARGET_64BIT != 0]; - else - default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0]; - - curr_range_str = strategy_str; - - do - { - int maxs; - char alg_name[128]; - char align[16]; - next_range_str = strchr (curr_range_str, ','); - if (next_range_str) - *next_range_str++ = '\0'; - - if (sscanf (curr_range_str, "%20[^:]:%d:%10s", alg_name, &maxs, - align) != 3) - { - error ("wrong argument %qs to option %qs", curr_range_str, opt); - return; - } - - if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1)) - { - error ("size ranges of option %qs should be increasing", opt); - return; - } - - for (i = 0; i < last_alg; i++) - if (!strcmp (alg_name, stringop_alg_names[i])) - break; - - if (i == last_alg) - { - error ("wrong strategy name %qs specified for option %qs", - alg_name, opt); - - auto_vec <const char *> candidates; - for (i = 0; i < last_alg; i++) - if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT) - candidates.safe_push (stringop_alg_names[i]); - - char *s; - const char *hint - = candidates_list_and_hint (alg_name, s, candidates); - if (hint) - inform (input_location, - "valid arguments to %qs are: %s; did you mean %qs?", - opt, s, hint); - else - inform (input_location, "valid arguments to %qs are: %s", - opt, s); - XDELETEVEC (s); - return; - } - - if ((stringop_alg) i == rep_prefix_8_byte - && !TARGET_64BIT) - { - /* rep; movq isn't available in 32-bit code. 
*/ - error ("strategy name %qs specified for option %qs " - "not supported for 32-bit code", alg_name, opt); - return; - } - - input_ranges[n].max = maxs; - input_ranges[n].alg = (stringop_alg) i; - if (!strcmp (align, "align")) - input_ranges[n].noalign = false; - else if (!strcmp (align, "noalign")) - input_ranges[n].noalign = true; - else - { - error ("unknown alignment %qs specified for option %qs", align, opt); - return; - } - n++; - curr_range_str = next_range_str; - } - while (curr_range_str); - - if (input_ranges[n - 1].max != -1) - { - error ("the max value for the last size range should be -1" - " for option %qs", opt); - return; - } - - if (n > MAX_STRINGOP_ALGS) - { - error ("too many size ranges specified in option %qs", opt); - return; - } - - /* Now override the default algs array. */ - for (i = 0; i < n; i++) - { - *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max; - *const_cast<stringop_alg *>(&default_algs->size[i].alg) - = input_ranges[i].alg; - *const_cast<int *>(&default_algs->size[i].noalign) - = input_ranges[i].noalign; - } -} - - -/* parse -mtune-ctrl= option. When DUMP is true, - print the features that are explicitly set. */ - -static void -parse_mtune_ctrl_str (struct gcc_options *opts, bool dump) -{ - if (!opts->x_ix86_tune_ctrl_string) - return; - - char *next_feature_string = NULL; - char *curr_feature_string = xstrdup (opts->x_ix86_tune_ctrl_string); - char *orig = curr_feature_string; - int i; - do - { - bool clear = false; - - next_feature_string = strchr (curr_feature_string, ','); - if (next_feature_string) - *next_feature_string++ = '\0'; - if (*curr_feature_string == '^') - { - curr_feature_string++; - clear = true; - } - for (i = 0; i < X86_TUNE_LAST; i++) - { - if (!strcmp (curr_feature_string, ix86_tune_feature_names[i])) - { - ix86_tune_features[i] = !clear; - if (dump) - fprintf (stderr, "Explicitly %s feature %s\n", - clear ? "clear" : "set", ix86_tune_feature_names[i]); - break; - } - } - if (i == X86_TUNE_LAST) - error ("unknown parameter to option %<-mtune-ctrl%>: %s", - clear ? curr_feature_string - 1 : curr_feature_string); - curr_feature_string = next_feature_string; - } - while (curr_feature_string); - free (orig); -} - -/* Helper function to set ix86_tune_features. IX86_TUNE is the - processor type. */ - -static void -set_ix86_tune_features (struct gcc_options *opts, - enum processor_type ix86_tune, bool dump) -{ - unsigned HOST_WIDE_INT ix86_tune_mask = HOST_WIDE_INT_1U << ix86_tune; - int i; - - for (i = 0; i < X86_TUNE_LAST; ++i) - { - if (ix86_tune_no_default) - ix86_tune_features[i] = 0; - else - ix86_tune_features[i] - = !!(initial_ix86_tune_features[i] & ix86_tune_mask); - } - - if (dump) - { - fprintf (stderr, "List of x86 specific tuning parameter names:\n"); - for (i = 0; i < X86_TUNE_LAST; i++) - fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i], - ix86_tune_features[i] ? "on" : "off"); - } - - parse_mtune_ctrl_str (opts, dump); -} - - -/* Default align_* from the processor table. */ - -static void -ix86_default_align (struct gcc_options *opts) -{ - /* -falign-foo without argument: supply one. 
*/ - if (opts->x_flag_align_loops && !opts->x_str_align_loops) - opts->x_str_align_loops = processor_cost_table[ix86_tune]->align_loop; - if (opts->x_flag_align_jumps && !opts->x_str_align_jumps) - opts->x_str_align_jumps = processor_cost_table[ix86_tune]->align_jump; - if (opts->x_flag_align_labels && !opts->x_str_align_labels) - opts->x_str_align_labels = processor_cost_table[ix86_tune]->align_label; - if (opts->x_flag_align_functions && !opts->x_str_align_functions) - opts->x_str_align_functions = processor_cost_table[ix86_tune]->align_func; -} - -#ifndef USE_IX86_FRAME_POINTER -#define USE_IX86_FRAME_POINTER 0 -#endif - -/* (Re)compute option overrides affected by optimization levels in - target-specific ways. */ - -static void -ix86_recompute_optlev_based_flags (struct gcc_options *opts, - struct gcc_options *opts_set) -{ - /* Set the default values for switches whose default depends on TARGET_64BIT - in case they weren't overwritten by command line options. */ - if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) - { - if (opts->x_optimize >= 1) - SET_OPTION_IF_UNSET (opts, opts_set, flag_omit_frame_pointer, - !USE_IX86_FRAME_POINTER); - if (opts->x_flag_asynchronous_unwind_tables - && TARGET_64BIT_MS_ABI) - SET_OPTION_IF_UNSET (opts, opts_set, flag_unwind_tables, 1); - if (opts->x_flag_asynchronous_unwind_tables == 2) - opts->x_flag_unwind_tables - = opts->x_flag_asynchronous_unwind_tables = 1; - if (opts->x_flag_pcc_struct_return == 2) - opts->x_flag_pcc_struct_return = 0; - } - else - { - if (opts->x_optimize >= 1) - SET_OPTION_IF_UNSET (opts, opts_set, flag_omit_frame_pointer, - !(USE_IX86_FRAME_POINTER || opts->x_optimize_size)); - if (opts->x_flag_asynchronous_unwind_tables == 2) - opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER; - if (opts->x_flag_pcc_struct_return == 2) - { - /* Intel MCU psABI specifies that -freg-struct-return should - be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1, - we check -miamcu so that -freg-struct-return is always - turned on if -miamcu is used. */ - if (TARGET_IAMCU_P (opts->x_target_flags)) - opts->x_flag_pcc_struct_return = 0; - else - opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN; - } - } -} - -/* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */ - -void -ix86_override_options_after_change (void) -{ - ix86_default_align (&global_options); - ix86_recompute_optlev_based_flags (&global_options, &global_options_set); -} - -/* Clear stack slot assignments remembered from previous functions. - This is called from INIT_EXPANDERS once before RTL is emitted for each - function. */ - -static struct machine_function * -ix86_init_machine_status (void) -{ - struct machine_function *f; - - f = ggc_cleared_alloc<machine_function> (); - f->call_abi = ix86_abi; - f->stack_frame_required = true; - - return f; -} - -/* Override various settings based on options. If MAIN_ARGS_P, the - options are from the command line, otherwise they are from - attributes. Return true if there's an error related to march - option. */ - -static bool -ix86_option_override_internal (bool main_args_p, - struct gcc_options *opts, - struct gcc_options *opts_set) -{ - int i; - unsigned HOST_WIDE_INT ix86_arch_mask; - const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL); - - /* -mrecip options. 
*/ - static struct - { - const char *string; /* option name */ - unsigned int mask; /* mask bits to set */ - } - const recip_options[] = - { - { "all", RECIP_MASK_ALL }, - { "none", RECIP_MASK_NONE }, - { "div", RECIP_MASK_DIV }, - { "sqrt", RECIP_MASK_SQRT }, - { "vec-div", RECIP_MASK_VEC_DIV }, - { "vec-sqrt", RECIP_MASK_VEC_SQRT }, - }; - - - /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if - TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */ - if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags)) - opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32); -#ifdef TARGET_BI_ARCH - else - { -#if TARGET_BI_ARCH == 1 - /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64 - is on and OPTION_MASK_ABI_X32 is off. We turn off - OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by - -mx32. */ - if (TARGET_X32_P (opts->x_ix86_isa_flags)) - opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64; -#else - /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is - on and OPTION_MASK_ABI_64 is off. We turn off - OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by - -m64 or OPTION_MASK_CODE16 is turned on by -m16. */ - if (TARGET_LP64_P (opts->x_ix86_isa_flags) - || TARGET_16BIT_P (opts->x_ix86_isa_flags)) - opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32; -#endif - if (TARGET_64BIT_P (opts->x_ix86_isa_flags) - && TARGET_IAMCU_P (opts->x_target_flags)) - sorry ("Intel MCU psABI isn%'t supported in %s mode", - TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit"); - } -#endif - - if (TARGET_X32_P (opts->x_ix86_isa_flags)) - { - /* Always turn on OPTION_MASK_ISA_64BIT and turn off - OPTION_MASK_ABI_64 for TARGET_X32. */ - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT; - opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64; - } - else if (TARGET_16BIT_P (opts->x_ix86_isa_flags)) - opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT - | OPTION_MASK_ABI_X32 - | OPTION_MASK_ABI_64); - else if (TARGET_LP64_P (opts->x_ix86_isa_flags)) - { - /* Always turn on OPTION_MASK_ISA_64BIT and turn off - OPTION_MASK_ABI_X32 for TARGET_LP64. */ - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT; - opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32; - } - -#ifdef SUBTARGET_OVERRIDE_OPTIONS - SUBTARGET_OVERRIDE_OPTIONS; -#endif - -#ifdef SUBSUBTARGET_OVERRIDE_OPTIONS - SUBSUBTARGET_OVERRIDE_OPTIONS; -#endif - - /* -fPIC is the default for x86_64. */ - if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags)) - opts->x_flag_pic = 2; - - /* Need to check -mtune=generic first. */ - if (opts->x_ix86_tune_string) - { - /* As special support for cross compilers we read -mtune=native - as -mtune=generic. With native compilers we won't see the - -mtune=native, as it was changed by the driver. */ - if (!strcmp (opts->x_ix86_tune_string, "native")) - { - opts->x_ix86_tune_string = "generic"; - } - else if (!strcmp (opts->x_ix86_tune_string, "x86-64")) - warning (OPT_Wdeprecated, - main_args_p - ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> " - "or %<-mtune=generic%> instead as appropriate") - : G_("%<target(\"tune=x86-64\")%> is deprecated; use " - "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%>" - " instead as appropriate")); - } - else - { - if (opts->x_ix86_arch_string) - opts->x_ix86_tune_string = opts->x_ix86_arch_string; - if (!opts->x_ix86_tune_string) - { - opts->x_ix86_tune_string = processor_names[TARGET_CPU_DEFAULT]; - ix86_tune_defaulted = 1; - } - - /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string - or defaulted. 
We need to use a sensible tune option. */ - if (!strcmp (opts->x_ix86_tune_string, "x86-64")) - { - opts->x_ix86_tune_string = "generic"; - } - } - - if (opts->x_ix86_stringop_alg == rep_prefix_8_byte - && !TARGET_64BIT_P (opts->x_ix86_isa_flags)) - { - /* rep; movq isn't available in 32-bit code. */ - error ("%<-mstringop-strategy=rep_8byte%> not supported for 32-bit code"); - opts->x_ix86_stringop_alg = no_stringop; - } - - if (!opts->x_ix86_arch_string) - opts->x_ix86_arch_string - = TARGET_64BIT_P (opts->x_ix86_isa_flags) - ? "x86-64" : SUBTARGET32_DEFAULT_CPU; - else - ix86_arch_specified = 1; - - if (opts_set->x_ix86_pmode) - { - if ((TARGET_LP64_P (opts->x_ix86_isa_flags) - && opts->x_ix86_pmode == PMODE_SI) - || (!TARGET_64BIT_P (opts->x_ix86_isa_flags) - && opts->x_ix86_pmode == PMODE_DI)) - error ("address mode %qs not supported in the %s bit mode", - TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long", - TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32"); - } - else - opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags) - ? PMODE_DI : PMODE_SI; - - SET_OPTION_IF_UNSET (opts, opts_set, ix86_abi, DEFAULT_ABI); - - if (opts->x_ix86_abi == MS_ABI && TARGET_X32_P (opts->x_ix86_isa_flags)) - error ("%<-mabi=ms%> not supported with X32 ABI"); - gcc_assert (opts->x_ix86_abi == SYSV_ABI || opts->x_ix86_abi == MS_ABI); - - const char *abi_name = opts->x_ix86_abi == MS_ABI ? "ms" : "sysv"; - if ((opts->x_flag_sanitize & SANITIZE_USER_ADDRESS) - && opts->x_ix86_abi != DEFAULT_ABI) - error ("%<-mabi=%s%> not supported with %<-fsanitize=address%>", abi_name); - if ((opts->x_flag_sanitize & SANITIZE_KERNEL_ADDRESS) - && opts->x_ix86_abi != DEFAULT_ABI) - error ("%<-mabi=%s%> not supported with %<-fsanitize=kernel-address%>", - abi_name); - if ((opts->x_flag_sanitize & SANITIZE_THREAD) - && opts->x_ix86_abi != DEFAULT_ABI) - error ("%<-mabi=%s%> not supported with %<-fsanitize=thread%>", abi_name); - - /* For targets using ms ABI enable ms-extensions, if not - explicit turned off. For non-ms ABI we turn off this - option. 
*/ - SET_OPTION_IF_UNSET (opts, opts_set, flag_ms_extensions, - (MS_ABI == DEFAULT_ABI)); - - if (opts_set->x_ix86_cmodel) - { - switch (opts->x_ix86_cmodel) - { - case CM_SMALL: - case CM_SMALL_PIC: - if (opts->x_flag_pic) - opts->x_ix86_cmodel = CM_SMALL_PIC; - if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) - error ("code model %qs not supported in the %s bit mode", - "small", "32"); - break; - - case CM_MEDIUM: - case CM_MEDIUM_PIC: - if (opts->x_flag_pic) - opts->x_ix86_cmodel = CM_MEDIUM_PIC; - if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) - error ("code model %qs not supported in the %s bit mode", - "medium", "32"); - else if (TARGET_X32_P (opts->x_ix86_isa_flags)) - error ("code model %qs not supported in x32 mode", - "medium"); - break; - - case CM_LARGE: - case CM_LARGE_PIC: - if (opts->x_flag_pic) - opts->x_ix86_cmodel = CM_LARGE_PIC; - if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) - error ("code model %qs not supported in the %s bit mode", - "large", "32"); - else if (TARGET_X32_P (opts->x_ix86_isa_flags)) - error ("code model %qs not supported in x32 mode", - "large"); - break; - - case CM_32: - if (opts->x_flag_pic) - error ("code model %s does not support PIC mode", "32"); - if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) - error ("code model %qs not supported in the %s bit mode", - "32", "64"); - break; - - case CM_KERNEL: - if (opts->x_flag_pic) - { - error ("code model %s does not support PIC mode", "kernel"); - opts->x_ix86_cmodel = CM_32; - } - if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) - error ("code model %qs not supported in the %s bit mode", - "kernel", "32"); - break; - - default: - gcc_unreachable (); - } - } - else - { - /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the - use of rip-relative addressing. This eliminates fixups that - would otherwise be needed if this object is to be placed in a - DLL, and is essentially just as efficient as direct addressing. */ - if (TARGET_64BIT_P (opts->x_ix86_isa_flags) - && (TARGET_RDOS || TARGET_PECOFF)) - opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1; - else if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) - opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL; - else - opts->x_ix86_cmodel = CM_32; - } - if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL) - { - error ("%<-masm=intel%> not supported in this configuration"); - opts->x_ix86_asm_dialect = ASM_ATT; - } - if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0) - != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0)) - sorry ("%i-bit mode not compiled in", - (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32); - - for (i = 0; i < pta_size; i++) - if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name)) - { - if (!strcmp (opts->x_ix86_arch_string, "generic")) - { - error (main_args_p - ? G_("%<generic%> CPU can be used only for %<-mtune=%> " - "switch") - : G_("%<generic%> CPU can be used only for " - "%<target(\"tune=\")%> attribute")); - return false; - } - else if (!strcmp (opts->x_ix86_arch_string, "intel")) - { - error (main_args_p - ? 
G_("%<intel%> CPU can be used only for %<-mtune=%> " - "switch") - : G_("%<intel%> CPU can be used only for " - "%<target(\"tune=\")%> attribute")); - return false; - } - - if (TARGET_64BIT_P (opts->x_ix86_isa_flags) - && !((processor_alias_table[i].flags & PTA_64BIT) != 0)) - { - error ("CPU you selected does not support x86-64 " - "instruction set"); - return false; - } - - ix86_schedule = processor_alias_table[i].schedule; - ix86_arch = processor_alias_table[i].processor; - /* Default cpu tuning to the architecture. */ - ix86_tune = ix86_arch; - - if (((processor_alias_table[i].flags & PTA_MMX) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX; - if (((processor_alias_table[i].flags & PTA_3DNOW) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW; - if (((processor_alias_table[i].flags & PTA_3DNOW_A) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A; - if (((processor_alias_table[i].flags & PTA_SSE) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE; - if (((processor_alias_table[i].flags & PTA_SSE2) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2; - if (((processor_alias_table[i].flags & PTA_SSE3) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3; - if (((processor_alias_table[i].flags & PTA_SSSE3) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3; - if (((processor_alias_table[i].flags & PTA_SSE4_1) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1; - if (((processor_alias_table[i].flags & PTA_SSE4_2) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2; - if (((processor_alias_table[i].flags & PTA_AVX) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX; - if (((processor_alias_table[i].flags & PTA_AVX2) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2; - if (((processor_alias_table[i].flags & PTA_FMA) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA; - if (((processor_alias_table[i].flags & PTA_SSE4A) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A; - if (((processor_alias_table[i].flags & PTA_FMA4) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4; - if (((processor_alias_table[i].flags & PTA_XOP) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP; - if (((processor_alias_table[i].flags & PTA_LWP) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP; - if (((processor_alias_table[i].flags & PTA_ABM) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM; - if (((processor_alias_table[i].flags & PTA_BMI) != 0) - && 
!(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI; - if (((processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT; - if (((processor_alias_table[i].flags & PTA_TBM) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM; - if (((processor_alias_table[i].flags & PTA_BMI2) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2; - if (((processor_alias_table[i].flags & PTA_CX16) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_CX16)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_CX16; - if (((processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT; - if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags) - && ((processor_alias_table[i].flags & PTA_NO_SAHF) != 0)) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF; - if (((processor_alias_table[i].flags & PTA_MOVBE) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_MOVBE)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_MOVBE; - if (((processor_alias_table[i].flags & PTA_AES) != 0) - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES)) - ix86_isa_flags |= OPTION_MASK_ISA_AES; - if (((processor_alias_table[i].flags & PTA_SHA) != 0) - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA)) - ix86_isa_flags |= OPTION_MASK_ISA_SHA; - if (((processor_alias_table[i].flags & PTA_PCLMUL) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL; - if (((processor_alias_table[i].flags & PTA_FSGSBASE) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE; - if (((processor_alias_table[i].flags & PTA_RDRND) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND; - if (((processor_alias_table[i].flags & PTA_F16C) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C; - if (((processor_alias_table[i].flags & PTA_RTM) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM; - if (((processor_alias_table[i].flags & PTA_HLE) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_HLE)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_HLE; - if (((processor_alias_table[i].flags & PTA_PRFCHW) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW; - if (((processor_alias_table[i].flags & PTA_RDSEED) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED; - if (((processor_alias_table[i].flags & PTA_ADX) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX; - if (((processor_alias_table[i].flags & PTA_FXSR) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR; - if (((processor_alias_table[i].flags & PTA_XSAVE) != 0) - && !(opts->x_ix86_isa_flags_explicit & 
OPTION_MASK_ISA_XSAVE)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE; - if (((processor_alias_table[i].flags & PTA_XSAVEOPT) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT; - if (((processor_alias_table[i].flags & PTA_AVX512F) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F; - if (((processor_alias_table[i].flags & PTA_AVX512ER) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER; - if (((processor_alias_table[i].flags & PTA_AVX512PF) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF; - if (((processor_alias_table[i].flags & PTA_AVX512CD) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD; - if (((processor_alias_table[i].flags & PTA_PREFETCHWT1) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1; - if (((processor_alias_table[i].flags & PTA_CLWB) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB; - if (((processor_alias_table[i].flags & PTA_CLFLUSHOPT) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT; - if (((processor_alias_table[i].flags & PTA_CLZERO) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_CLZERO)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_CLZERO; - if (((processor_alias_table[i].flags & PTA_XSAVEC) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC; - if (((processor_alias_table[i].flags & PTA_XSAVES) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES; - if (((processor_alias_table[i].flags & PTA_AVX512DQ) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ; - if (((processor_alias_table[i].flags & PTA_AVX512BW) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW; - if (((processor_alias_table[i].flags & PTA_AVX512VL) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL; - if (((processor_alias_table[i].flags & PTA_AVX512VBMI) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI; - if (((processor_alias_table[i].flags & PTA_AVX512IFMA) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA; - if (((processor_alias_table[i].flags & PTA_AVX512VNNI) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VNNI)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VNNI; - if (((processor_alias_table[i].flags & PTA_GFNI) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_GFNI)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_GFNI; - if (((processor_alias_table[i].flags & PTA_AVX512VBMI2) != 0) - && !(opts->x_ix86_isa_flags_explicit - & OPTION_MASK_ISA_AVX512VBMI2)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI2; 
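An aside on the pattern repeated throughout this block: each PTA_* capability bit of the selected -march= entry turns on the corresponding OPTION_MASK_ISA_* flag only when the user has not already set that ISA explicitly. A minimal, self-contained sketch of that rule, using simplified stand-in constants rather than GCC's real PTA_/OPTION_MASK_ISA_ definitions, is:

#include <stdio.h>

/* Illustrative stand-ins for the PTA_* capability bits and the
   OPTION_MASK_ISA_* masks; these values are not GCC's definitions.  */
#define PTA_SSE2  0x1u
#define PTA_AVX   0x2u
#define ISA_SSE2  0x10u
#define ISA_AVX   0x20u

/* Enable an ISA mask implied by a CPU entry, but only when the user
   has not decided that ISA explicitly (mirrors the
   "flags & PTA_FOO ... && !(isa_flags_explicit & ISA_FOO)" guards).  */
static void
apply_pta_default (unsigned pta_flags, unsigned pta_bit,
                   unsigned isa_bit, unsigned *isa_flags,
                   unsigned isa_flags_explicit)
{
  if ((pta_flags & pta_bit) != 0
      && !(isa_flags_explicit & isa_bit))
    *isa_flags |= isa_bit;
}

int
main (void)
{
  unsigned pta = PTA_SSE2 | PTA_AVX;   /* what the CPU entry advertises */
  unsigned isa = 0;                    /* effective ISA flags */
  unsigned explicit_isa = ISA_AVX;     /* user passed e.g. -mno-avx */

  apply_pta_default (pta, PTA_SSE2, ISA_SSE2, &isa, explicit_isa);
  apply_pta_default (pta, PTA_AVX, ISA_AVX, &isa, explicit_isa);

  /* SSE2 is enabled from the CPU entry; AVX is left to the user.  */
  printf ("isa = 0x%x\n", isa);
  return 0;
}

The removed loop simply repeats this guard once per capability bit, which is why the block above is so long.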
- if (((processor_alias_table[i].flags & PTA_VPCLMULQDQ) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_VPCLMULQDQ)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_VPCLMULQDQ; - if (((processor_alias_table[i].flags & PTA_AVX512BITALG) != 0) - && !(opts->x_ix86_isa_flags_explicit - & OPTION_MASK_ISA_AVX512BITALG)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BITALG; - - if (((processor_alias_table[i].flags & PTA_AVX512VP2INTERSECT) != 0) - && !(opts->x_ix86_isa_flags2_explicit - & OPTION_MASK_ISA2_AVX512VP2INTERSECT)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AVX512VP2INTERSECT; - if (((processor_alias_table[i].flags & PTA_AVX5124VNNIW) != 0) - && !(opts->x_ix86_isa_flags2_explicit - & OPTION_MASK_ISA2_AVX5124VNNIW)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AVX5124VNNIW; - if (((processor_alias_table[i].flags & PTA_AVX5124FMAPS) != 0) - && !(opts->x_ix86_isa_flags2_explicit - & OPTION_MASK_ISA2_AVX5124FMAPS)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AVX5124FMAPS; - if (((processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ) != 0) - && !(opts->x_ix86_isa_flags_explicit - & OPTION_MASK_ISA_AVX512VPOPCNTDQ)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VPOPCNTDQ; - if (((processor_alias_table[i].flags & PTA_AVX512BF16) != 0) - && !(opts->x_ix86_isa_flags2_explicit - & OPTION_MASK_ISA2_AVX512BF16)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AVX512BF16; - if (((processor_alias_table[i].flags & PTA_MOVDIRI) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVDIRI)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVDIRI; - if (((processor_alias_table[i].flags & PTA_MOVDIR64B) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_MOVDIR64B)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_MOVDIR64B; - if (((processor_alias_table[i].flags & PTA_SGX) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_SGX)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_SGX; - if (((processor_alias_table[i].flags & PTA_VAES) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_VAES)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_VAES; - if (((processor_alias_table[i].flags & PTA_RDPID) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_RDPID)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_RDPID; - if (((processor_alias_table[i].flags & PTA_PCONFIG) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_PCONFIG)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_PCONFIG; - if (((processor_alias_table[i].flags & PTA_WBNOINVD) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_WBNOINVD)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_WBNOINVD; - if (((processor_alias_table[i].flags & PTA_PTWRITE) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_PTWRITE)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_PTWRITE; - - if ((processor_alias_table[i].flags - & (PTA_PREFETCH_SSE | PTA_SSE)) != 0) - x86_prefetch_sse = true; - if (((processor_alias_table[i].flags & PTA_MWAITX) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_MWAITX)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_MWAITX; - if (((processor_alias_table[i].flags & PTA_PKU) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU; - - /* Don't enable x87 instructions if only - general registers are allowed. 
*/ - if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY) - && !(opts_set->x_target_flags & MASK_80387)) - { - if (((processor_alias_table[i].flags & PTA_NO_80387) != 0)) - opts->x_target_flags &= ~MASK_80387; - else - opts->x_target_flags |= MASK_80387; - } - break; - } - - if (i == pta_size) - { - error (main_args_p - ? G_("bad value (%qs) for %<-march=%> switch") - : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"), - opts->x_ix86_arch_string); - - auto_vec <const char *> candidates; - for (i = 0; i < pta_size; i++) - if (strcmp (processor_alias_table[i].name, "generic") - && strcmp (processor_alias_table[i].name, "intel") - && (!TARGET_64BIT_P (opts->x_ix86_isa_flags) - || ((processor_alias_table[i].flags & PTA_64BIT) != 0))) - candidates.safe_push (processor_alias_table[i].name); - -#ifdef HAVE_LOCAL_CPU_DETECT - /* Add also "native" as possible value. */ - candidates.safe_push ("native"); -#endif - - char *s; - const char *hint - = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates); - if (hint) - inform (input_location, - main_args_p - ? G_("valid arguments to %<-march=%> switch are: " - "%s; did you mean %qs?") - : G_("valid arguments to %<target(\"arch=\")%> attribute are: " - "%s; did you mean %qs?"), s, hint); - else - inform (input_location, - main_args_p - ? G_("valid arguments to %<-march=%> switch are: %s") - : G_("valid arguments to %<target(\"arch=\")%> attribute " - "are: %s"), s); - XDELETEVEC (s); - } - - ix86_arch_mask = HOST_WIDE_INT_1U << ix86_arch; - for (i = 0; i < X86_ARCH_LAST; ++i) - ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask); - - for (i = 0; i < pta_size; i++) - if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name)) - { - ix86_schedule = processor_alias_table[i].schedule; - ix86_tune = processor_alias_table[i].processor; - if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) - { - if (!((processor_alias_table[i].flags & PTA_64BIT) != 0)) - { - if (ix86_tune_defaulted) - { - opts->x_ix86_tune_string = "x86-64"; - for (i = 0; i < pta_size; i++) - if (! strcmp (opts->x_ix86_tune_string, - processor_alias_table[i].name)) - break; - ix86_schedule = processor_alias_table[i].schedule; - ix86_tune = processor_alias_table[i].processor; - } - else - error ("CPU you selected does not support x86-64 " - "instruction set"); - } - } - /* Intel CPUs have always interpreted SSE prefetch instructions as - NOPs; so, we can enable SSE prefetch instructions even when - -mtune (rather than -march) points us to a processor that has them. - However, the VIA C3 gives a SIGILL, so we only do that for i686 and - higher processors. */ - if (TARGET_CMOV - && ((processor_alias_table[i].flags - & (PTA_PREFETCH_SSE | PTA_SSE)) != 0)) - x86_prefetch_sse = true; - break; - } - - if (ix86_tune_specified && i == pta_size) - { - error (main_args_p - ? G_("bad value (%qs) for %<-mtune=%> switch") - : G_("bad value (%qs) for %<target(\"tune=\")%> attribute"), - opts->x_ix86_tune_string); - - auto_vec <const char *> candidates; - for (i = 0; i < pta_size; i++) - if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) - || ((processor_alias_table[i].flags & PTA_64BIT) != 0)) - candidates.safe_push (processor_alias_table[i].name); - -#ifdef HAVE_LOCAL_CPU_DETECT - /* Add also "native" as possible value. */ - candidates.safe_push ("native"); -#endif - - char *s; - const char *hint - = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates); - if (hint) - inform (input_location, - main_args_p - ? 
G_("valid arguments to %<-mtune=%> switch are: " - "%s; did you mean %qs?") - : G_("valid arguments to %<target(\"tune=\")%> attribute are: " - "%s; did you mean %qs?"), s, hint); - else - inform (input_location, - main_args_p - ? G_("valid arguments to %<-mtune=%> switch are: %s") - : G_("valid arguments to %<target(\"tune=\")%> attribute " - "are: %s"), s); - XDELETEVEC (s); - } - - set_ix86_tune_features (opts, ix86_tune, opts->x_ix86_dump_tunes); - - ix86_recompute_optlev_based_flags (opts, opts_set); - - ix86_tune_cost = processor_cost_table[ix86_tune]; - /* TODO: ix86_cost should be chosen at instruction or function granuality - so for cold code we use size_cost even in !optimize_size compilation. */ - if (opts->x_optimize_size) - ix86_cost = &ix86_size_cost; - else - ix86_cost = ix86_tune_cost; - - /* Arrange to set up i386_stack_locals for all functions. */ - init_machine_status = ix86_init_machine_status; - - /* Validate -mregparm= value. */ - if (opts_set->x_ix86_regparm) - { - if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) - warning (0, "%<-mregparm%> is ignored in 64-bit mode"); - else if (TARGET_IAMCU_P (opts->x_target_flags)) - warning (0, "%<-mregparm%> is ignored for Intel MCU psABI"); - if (opts->x_ix86_regparm > REGPARM_MAX) - { - error ("%<-mregparm=%d%> is not between 0 and %d", - opts->x_ix86_regparm, REGPARM_MAX); - opts->x_ix86_regparm = 0; - } - } - if (TARGET_IAMCU_P (opts->x_target_flags) - || TARGET_64BIT_P (opts->x_ix86_isa_flags)) - opts->x_ix86_regparm = REGPARM_MAX; - - /* Default align_* from the processor table. */ - ix86_default_align (opts); - - /* Provide default for -mbranch-cost= value. */ - SET_OPTION_IF_UNSET (opts, opts_set, ix86_branch_cost, - ix86_tune_cost->branch_cost); - - if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) - { - opts->x_target_flags - |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags; - - if (!ix86_arch_specified) - opts->x_ix86_isa_flags - |= TARGET_SUBTARGET64_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit; - - if (TARGET_RTD_P (opts->x_target_flags)) - warning (0, - main_args_p - ? G_("%<-mrtd%> is ignored in 64bit mode") - : G_("%<target(\"rtd\")%> is ignored in 64bit mode")); - } - else - { - opts->x_target_flags - |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags; - - if (!ix86_arch_specified) - opts->x_ix86_isa_flags - |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit; - - /* i386 ABI does not specify red zone. It still makes sense to use it - when programmer takes care to stack from being destroyed. */ - if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE)) - opts->x_target_flags |= MASK_NO_RED_ZONE; - } - - /* Keep nonleaf frame pointers. */ - if (opts->x_flag_omit_frame_pointer) - opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER; - else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags)) - opts->x_flag_omit_frame_pointer = 1; - - /* If we're doing fast math, we don't care about comparison order - wrt NaNs. This lets us use a shorter comparison sequence. */ - if (opts->x_flag_finite_math_only) - opts->x_target_flags &= ~MASK_IEEE_FP; - - /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387, - since the insns won't need emulation. */ - if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387]) - opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387; - - /* Likewise, if the target doesn't have a 387, or we've specified - software floating point, don't use 387 inline intrinsics. 
*/ - if (!TARGET_80387_P (opts->x_target_flags)) - opts->x_target_flags |= MASK_NO_FANCY_MATH_387; - - /* Turn on MMX builtins for -msse. */ - if (TARGET_SSE_P (opts->x_ix86_isa_flags)) - opts->x_ix86_isa_flags - |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit; - - /* Enable SSE prefetch. */ - if (TARGET_SSE_P (opts->x_ix86_isa_flags) - || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags) - && !TARGET_3DNOW_P (opts->x_ix86_isa_flags)) - || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags)) - x86_prefetch_sse = true; - - /* Enable popcnt instruction for -msse4.2 or -mabm. */ - if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags) - || TARGET_ABM_P (opts->x_ix86_isa_flags)) - opts->x_ix86_isa_flags - |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit; - - /* Enable lzcnt instruction for -mabm. */ - if (TARGET_ABM_P(opts->x_ix86_isa_flags)) - opts->x_ix86_isa_flags - |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit; - - /* Disable BMI, BMI2 and TBM instructions for -m16. */ - if (TARGET_16BIT_P(opts->x_ix86_isa_flags)) - opts->x_ix86_isa_flags - &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM) - & ~opts->x_ix86_isa_flags_explicit); - - /* Validate -mpreferred-stack-boundary= value or default it to - PREFERRED_STACK_BOUNDARY_DEFAULT. */ - ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT; - if (opts_set->x_ix86_preferred_stack_boundary_arg) - { - int min = TARGET_64BIT_P (opts->x_ix86_isa_flags)? 3 : 2; - int max = TARGET_SEH ? 4 : 12; - - if (opts->x_ix86_preferred_stack_boundary_arg < min - || opts->x_ix86_preferred_stack_boundary_arg > max) - { - if (min == max) - error ("%<-mpreferred-stack-boundary%> is not supported " - "for this target"); - else - error ("%<-mpreferred-stack-boundary=%d%> is not between %d and %d", - opts->x_ix86_preferred_stack_boundary_arg, min, max); - } - else - ix86_preferred_stack_boundary - = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT; - } - - /* Set the default value for -mstackrealign. */ - SET_OPTION_IF_UNSET (opts, opts_set, ix86_force_align_arg_pointer, - STACK_REALIGN_DEFAULT); - - ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY; - - /* Validate -mincoming-stack-boundary= value or default it to - MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */ - ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary; - if (opts_set->x_ix86_incoming_stack_boundary_arg) - { - int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2; - - if (opts->x_ix86_incoming_stack_boundary_arg < min - || opts->x_ix86_incoming_stack_boundary_arg > 12) - error ("%<-mincoming-stack-boundary=%d%> is not between %d and 12", - opts->x_ix86_incoming_stack_boundary_arg, min); - else - { - ix86_user_incoming_stack_boundary - = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT; - ix86_incoming_stack_boundary - = ix86_user_incoming_stack_boundary; - } - } - -#ifndef NO_PROFILE_COUNTERS - if (flag_nop_mcount) - error ("%<-mnop-mcount%> is not compatible with this target"); -#endif - if (flag_nop_mcount && flag_pic) - error ("%<-mnop-mcount%> is not implemented for %<-fPIC%>"); - - /* Accept -msseregparm only if at least SSE support is enabled. */ - if (TARGET_SSEREGPARM_P (opts->x_target_flags) - && ! TARGET_SSE_P (opts->x_ix86_isa_flags)) - error (main_args_p - ? 
G_("%<-msseregparm%> used without SSE enabled") - : G_("%<target(\"sseregparm\")%> used without SSE enabled")); - - if (opts_set->x_ix86_fpmath) - { - if (opts->x_ix86_fpmath & FPMATH_SSE) - { - if (!TARGET_SSE_P (opts->x_ix86_isa_flags)) - { - if (TARGET_80387_P (opts->x_target_flags)) - { - warning (0, "SSE instruction set disabled, using 387 arithmetics"); - opts->x_ix86_fpmath = FPMATH_387; - } - } - else if ((opts->x_ix86_fpmath & FPMATH_387) - && !TARGET_80387_P (opts->x_target_flags)) - { - warning (0, "387 instruction set disabled, using SSE arithmetics"); - opts->x_ix86_fpmath = FPMATH_SSE; - } - } - } - /* For all chips supporting SSE2, -mfpmath=sse performs better than - fpmath=387. The second is however default at many targets since the - extra 80bit precision of temporaries is considered to be part of ABI. - Overwrite the default at least for -ffast-math. - TODO: -mfpmath=both seems to produce same performing code with bit - smaller binaries. It is however not clear if register allocation is - ready for this setting. - Also -mfpmath=387 is overall a lot more compact (bout 4-5%) than SSE - codegen. We may switch to 387 with -ffast-math for size optimized - functions. */ - else if (fast_math_flags_set_p (&global_options) - && TARGET_SSE2_P (opts->x_ix86_isa_flags)) - opts->x_ix86_fpmath = FPMATH_SSE; - else - opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags); - - /* Use external vectorized library in vectorizing intrinsics. */ - if (opts_set->x_ix86_veclibabi_type) - switch (opts->x_ix86_veclibabi_type) - { - case ix86_veclibabi_type_svml: - ix86_veclib_handler = &ix86_veclibabi_svml; - break; - - case ix86_veclibabi_type_acml: - ix86_veclib_handler = &ix86_veclibabi_acml; - break; - - default: - gcc_unreachable (); - } - - if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS] - && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) - opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; - - /* If stack probes are required, the space used for large function - arguments on the stack must also be probed, so enable - -maccumulate-outgoing-args so this happens in the prologue. */ - if (TARGET_STACK_PROBE_P (opts->x_target_flags) - && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) - { - if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS) - warning (0, - main_args_p - ? G_("stack probing requires %<-maccumulate-outgoing-args%> " - "for correctness") - : G_("stack probing requires " - "%<target(\"accumulate-outgoing-args\")%> for " - "correctness")); - opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; - } - - /* Stack realignment without -maccumulate-outgoing-args requires %ebp, - so enable -maccumulate-outgoing-args when %ebp is fixed. */ - if (fixed_regs[BP_REG] - && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) - { - if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS) - warning (0, - main_args_p - ? G_("fixed ebp register requires " - "%<-maccumulate-outgoing-args%>") - : G_("fixed ebp register requires " - "%<target(\"accumulate-outgoing-args\")%>")); - opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; - } - - /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. 
*/ - { - char *p; - ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0); - p = strchr (internal_label_prefix, 'X'); - internal_label_prefix_len = p - internal_label_prefix; - *p = '\0'; - } - - /* When scheduling description is not available, disable scheduler pass - so it won't slow down the compilation and make x87 code slower. */ - if (!TARGET_SCHEDULE) - opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0; - - SET_OPTION_IF_UNSET (opts, opts_set, param_simultaneous_prefetches, - ix86_tune_cost->simultaneous_prefetches); - SET_OPTION_IF_UNSET (opts, opts_set, param_l1_cache_line_size, - ix86_tune_cost->prefetch_block); - SET_OPTION_IF_UNSET (opts, opts_set, param_l1_cache_size, - ix86_tune_cost->l1_cache_size); - SET_OPTION_IF_UNSET (opts, opts_set, param_l2_cache_size, - ix86_tune_cost->l2_cache_size); - - /* Enable sw prefetching at -O3 for CPUS that prefetching is helpful. */ - if (opts->x_flag_prefetch_loop_arrays < 0 - && HAVE_prefetch - && (opts->x_optimize >= 3 || opts->x_flag_profile_use) - && !opts->x_optimize_size - && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL) - opts->x_flag_prefetch_loop_arrays = 1; - - /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0) - can be opts->x_optimized to ap = __builtin_next_arg (0). */ - if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack) - targetm.expand_builtin_va_start = NULL; - -#ifdef USE_IX86_CLD - /* Use -mcld by default for 32-bit code if configured with --enable-cld. */ - if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) - opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags; -#endif - - /* Set the default value for -mfentry. */ - if (!opts_set->x_flag_fentry) - opts->x_flag_fentry = TARGET_SEH; - else - { - if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic - && opts->x_flag_fentry) - sorry ("%<-mfentry%> isn%'t supported for 32-bit in combination " - "with %<-fpic%>"); - else if (TARGET_SEH && !opts->x_flag_fentry) - sorry ("%<-mno-fentry%> isn%'t compatible with SEH"); - } - - if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES) - sorry ("%<-mcall-ms2sysv-xlogues%> isn%'t currently supported with SEH"); - - if (!(opts_set->x_target_flags & MASK_VZEROUPPER) - && TARGET_EMIT_VZEROUPPER) - opts->x_target_flags |= MASK_VZEROUPPER; - if (!(opts_set->x_target_flags & MASK_STV)) - opts->x_target_flags |= MASK_STV; - /* Disable STV if -mpreferred-stack-boundary={2,3} or - -mincoming-stack-boundary={2,3} or -mstackrealign - the needed - stack realignment will be extra cost the pass doesn't take into - account and the pass can't realign the stack. */ - if (ix86_preferred_stack_boundary < 128 - || ix86_incoming_stack_boundary < 128 - || opts->x_ix86_force_align_arg_pointer) - opts->x_target_flags &= ~MASK_STV; - if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL] - && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD)) - opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD; - if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL] - && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE)) - opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE; - - /* Enable 128-bit AVX instruction generation - for the auto-vectorizer. */ - if (ix86_tune_features[X86_TUNE_AVX128_OPTIMAL] - && (opts_set->x_prefer_vector_width_type == PVW_NONE)) - opts->x_prefer_vector_width_type = PVW_AVX128; - - /* Use 256-bit AVX instruction generation - in the auto-vectorizer. 
*/ - if (ix86_tune_features[X86_TUNE_AVX256_OPTIMAL] - && (opts_set->x_prefer_vector_width_type == PVW_NONE)) - opts->x_prefer_vector_width_type = PVW_AVX256; - - if (opts->x_ix86_recip_name) - { - char *p = ASTRDUP (opts->x_ix86_recip_name); - char *q; - unsigned int mask, i; - bool invert; - - while ((q = strtok (p, ",")) != NULL) - { - p = NULL; - if (*q == '!') - { - invert = true; - q++; - } - else - invert = false; - - if (!strcmp (q, "default")) - mask = RECIP_MASK_ALL; - else - { - for (i = 0; i < ARRAY_SIZE (recip_options); i++) - if (!strcmp (q, recip_options[i].string)) - { - mask = recip_options[i].mask; - break; - } - - if (i == ARRAY_SIZE (recip_options)) - { - error ("unknown option for %<-mrecip=%s%>", q); - invert = false; - mask = RECIP_MASK_NONE; - } - } - - opts->x_recip_mask_explicit |= mask; - if (invert) - opts->x_recip_mask &= ~mask; - else - opts->x_recip_mask |= mask; - } - } - - if (TARGET_RECIP_P (opts->x_target_flags)) - opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit; - else if (opts_set->x_target_flags & MASK_RECIP) - opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit); - - /* Default long double to 64-bit for 32-bit Bionic and to __float128 - for 64-bit Bionic. Also default long double to 64-bit for Intel - MCU psABI. */ - if ((TARGET_HAS_BIONIC || TARGET_IAMCU) - && !(opts_set->x_target_flags - & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128))) - opts->x_target_flags |= (TARGET_64BIT - ? MASK_LONG_DOUBLE_128 - : MASK_LONG_DOUBLE_64); - - /* Only one of them can be active. */ - gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0 - || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0); - - /* Handle stack protector */ - if (!opts_set->x_ix86_stack_protector_guard) - { -#ifdef TARGET_THREAD_SSP_OFFSET - if (!TARGET_HAS_BIONIC) - opts->x_ix86_stack_protector_guard = SSP_TLS; - else -#endif - opts->x_ix86_stack_protector_guard = SSP_GLOBAL; - } - - if (opts_set->x_ix86_stack_protector_guard_offset_str) - { - char *endp; - const char *str = opts->x_ix86_stack_protector_guard_offset_str; - - errno = 0; - int64_t offset; - -#if defined(INT64_T_IS_LONG) - offset = strtol (str, &endp, 0); -#else - offset = strtoll (str, &endp, 0); -#endif - - if (!*str || *endp || errno) - error ("%qs is not a valid number " - "in %<-mstack-protector-guard-offset=%>", str); - - if (!IN_RANGE (offset, HOST_WIDE_INT_C (-0x80000000), - HOST_WIDE_INT_C (0x7fffffff))) - error ("%qs is not a valid offset " - "in %<-mstack-protector-guard-offset=%>", str); - - opts->x_ix86_stack_protector_guard_offset = offset; - } -#ifdef TARGET_THREAD_SSP_OFFSET - else - opts->x_ix86_stack_protector_guard_offset = TARGET_THREAD_SSP_OFFSET; -#endif - - if (opts_set->x_ix86_stack_protector_guard_reg_str) - { - const char *str = opts->x_ix86_stack_protector_guard_reg_str; - addr_space_t seg = ADDR_SPACE_GENERIC; - - /* Discard optional register prefix. 
*/ - if (str[0] == '%') - str++; - - if (strlen (str) == 2 && str[1] == 's') - { - if (str[0] == 'f') - seg = ADDR_SPACE_SEG_FS; - else if (str[0] == 'g') - seg = ADDR_SPACE_SEG_GS; - } - - if (seg == ADDR_SPACE_GENERIC) - error ("%qs is not a valid base register " - "in %<-mstack-protector-guard-reg=%>", - opts->x_ix86_stack_protector_guard_reg_str); - - opts->x_ix86_stack_protector_guard_reg = seg; - } - else - { - opts->x_ix86_stack_protector_guard_reg = DEFAULT_TLS_SEG_REG; - - /* The kernel uses a different segment register for performance - reasons; a system call would not have to trash the userspace - segment register, which would be expensive. */ - if (opts->x_ix86_cmodel == CM_KERNEL) - opts->x_ix86_stack_protector_guard_reg = ADDR_SPACE_SEG_GS; - } - - /* Handle -mmemcpy-strategy= and -mmemset-strategy= */ - if (opts->x_ix86_tune_memcpy_strategy) - { - char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy); - ix86_parse_stringop_strategy_string (str, false); - free (str); - } - - if (opts->x_ix86_tune_memset_strategy) - { - char *str = xstrdup (opts->x_ix86_tune_memset_strategy); - ix86_parse_stringop_strategy_string (str, true); - free (str); - } - - /* Save the initial options in case the user does function specific - options. */ - if (main_args_p) - target_option_default_node = target_option_current_node - = build_target_option_node (opts); - - if (opts->x_flag_cf_protection != CF_NONE) - opts->x_flag_cf_protection - = (cf_protection_level) (opts->x_flag_cf_protection | CF_SET); - - if (ix86_tune_features [X86_TUNE_AVOID_256FMA_CHAINS]) - SET_OPTION_IF_UNSET (opts, opts_set, param_avoid_fma_max_bits, 256); - else if (ix86_tune_features [X86_TUNE_AVOID_128FMA_CHAINS]) - SET_OPTION_IF_UNSET (opts, opts_set, param_avoid_fma_max_bits, 128); - - /* PR86952: jump table usage with retpolines is slow. - The PR provides some numbers about the slowness. */ - if (ix86_indirect_branch != indirect_branch_keep) - SET_OPTION_IF_UNSET (opts, opts_set, flag_jump_tables, 0); - - return true; -} - -/* Implement the TARGET_OPTION_OVERRIDE hook. */ - -void -ix86_option_override (void) -{ - ix86_option_override_internal (true, &global_options, &global_options_set); -} - -/* Remember the last target of ix86_set_current_function. */ -static GTY(()) tree ix86_previous_fndecl; - -/* Set targets globals to the default (or current #pragma GCC target - if active). Invalidate ix86_previous_fndecl cache. */ - -void -ix86_reset_previous_fndecl (void) -{ - tree new_tree = target_option_current_node; - cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree)); - if (TREE_TARGET_GLOBALS (new_tree)) - restore_target_globals (TREE_TARGET_GLOBALS (new_tree)); - else if (new_tree == target_option_default_node) - restore_target_globals (&default_target_globals); - else - TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts (); - ix86_previous_fndecl = NULL_TREE; -} - -/* Add target attribute to SIMD clone NODE if needed. */ - -void -ix86_simd_clone_adjust (struct cgraph_node *node) -{ - const char *str = NULL; - - /* Attributes need to be adjusted for definitions, not declarations. 
*/ - if (!node->definition) - return; - - gcc_assert (node->decl == cfun->decl); - switch (node->simdclone->vecsize_mangle) - { - case 'b': - if (!TARGET_SSE2) - str = "sse2"; - break; - case 'c': - if (TARGET_PREFER_AVX128) - { - if (!TARGET_AVX) - str = "avx,prefer-vector-width=256"; - else - str = "prefer-vector-width=256"; - } - else if (!TARGET_AVX) - str = "avx"; - break; - case 'd': - if (TARGET_PREFER_AVX128) - { - if (!TARGET_AVX2) - str = "avx2,prefer-vector-width=256"; - else - str = "prefer-vector-width=256"; - } - else if (!TARGET_AVX2) - str = "avx2"; - break; - case 'e': - if (TARGET_PREFER_AVX256) - { - if (!TARGET_AVX512F) - str = "avx512f,prefer-vector-width=512"; - else - str = "prefer-vector-width=512"; - } - else if (!TARGET_AVX512F) - str = "avx512f"; - break; - default: - gcc_unreachable (); - } - if (str == NULL) - return; - push_cfun (NULL); - tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str)); - bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0); - gcc_assert (ok); - pop_cfun (); - ix86_reset_previous_fndecl (); - ix86_set_current_function (node->decl); -} - - - -/* Set the func_type field from the function FNDECL. */ - -static void -ix86_set_func_type (tree fndecl) -{ - if (cfun->machine->func_type == TYPE_UNKNOWN) - { - if (lookup_attribute ("interrupt", - TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))) - { - if (ix86_function_naked (fndecl)) - error_at (DECL_SOURCE_LOCATION (fndecl), - "interrupt and naked attributes are not compatible"); - - int nargs = 0; - for (tree arg = DECL_ARGUMENTS (fndecl); - arg; - arg = TREE_CHAIN (arg)) - nargs++; - cfun->machine->no_caller_saved_registers = true; - cfun->machine->func_type - = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT; - - ix86_optimize_mode_switching[X86_DIRFLAG] = 1; - - /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */ - if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG) - sorry ("only DWARF debug format is supported for interrupt " - "service routine"); - } - else - { - cfun->machine->func_type = TYPE_NORMAL; - if (lookup_attribute ("no_caller_saved_registers", - TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))) - cfun->machine->no_caller_saved_registers = true; - } - } -} - -/* Set the indirect_branch_type field from the function FNDECL. */ - -static void -ix86_set_indirect_branch_type (tree fndecl) -{ - if (cfun->machine->indirect_branch_type == indirect_branch_unset) - { - tree attr = lookup_attribute ("indirect_branch", - DECL_ATTRIBUTES (fndecl)); - if (attr != NULL) - { - tree args = TREE_VALUE (attr); - if (args == NULL) - gcc_unreachable (); - tree cst = TREE_VALUE (args); - if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0) - cfun->machine->indirect_branch_type = indirect_branch_keep; - else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0) - cfun->machine->indirect_branch_type = indirect_branch_thunk; - else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0) - cfun->machine->indirect_branch_type = indirect_branch_thunk_inline; - else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0) - cfun->machine->indirect_branch_type = indirect_branch_thunk_extern; - else - gcc_unreachable (); - } - else - cfun->machine->indirect_branch_type = ix86_indirect_branch; - - /* -mcmodel=large is not compatible with -mindirect-branch=thunk - nor -mindirect-branch=thunk-extern. 
*/ - if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC) - && ((cfun->machine->indirect_branch_type - == indirect_branch_thunk_extern) - || (cfun->machine->indirect_branch_type - == indirect_branch_thunk))) - error ("%<-mindirect-branch=%s%> and %<-mcmodel=large%> are not " - "compatible", - ((cfun->machine->indirect_branch_type - == indirect_branch_thunk_extern) - ? "thunk-extern" : "thunk")); - - if (cfun->machine->indirect_branch_type != indirect_branch_keep - && (cfun->machine->indirect_branch_type - != indirect_branch_thunk_extern) - && (flag_cf_protection & CF_RETURN)) - error ("%<-mindirect-branch%> and %<-fcf-protection%> are not " - "compatible"); - } - - if (cfun->machine->function_return_type == indirect_branch_unset) - { - tree attr = lookup_attribute ("function_return", - DECL_ATTRIBUTES (fndecl)); - if (attr != NULL) - { - tree args = TREE_VALUE (attr); - if (args == NULL) - gcc_unreachable (); - tree cst = TREE_VALUE (args); - if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0) - cfun->machine->function_return_type = indirect_branch_keep; - else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0) - cfun->machine->function_return_type = indirect_branch_thunk; - else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0) - cfun->machine->function_return_type = indirect_branch_thunk_inline; - else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0) - cfun->machine->function_return_type = indirect_branch_thunk_extern; - else - gcc_unreachable (); - } - else - cfun->machine->function_return_type = ix86_function_return; - - /* -mcmodel=large is not compatible with -mfunction-return=thunk - nor -mfunction-return=thunk-extern. */ - if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC) - && ((cfun->machine->function_return_type - == indirect_branch_thunk_extern) - || (cfun->machine->function_return_type - == indirect_branch_thunk))) - error ("%<-mfunction-return=%s%> and %<-mcmodel=large%> are not " - "compatible", - ((cfun->machine->function_return_type - == indirect_branch_thunk_extern) - ? "thunk-extern" : "thunk")); - - if (cfun->machine->function_return_type != indirect_branch_keep - && (cfun->machine->function_return_type - != indirect_branch_thunk_extern) - && (flag_cf_protection & CF_RETURN)) - error ("%<-mfunction-return%> and %<-fcf-protection%> are not " - "compatible"); - } -} - -/* Establish appropriate back-end context for processing the function - FNDECL. The argument might be NULL to indicate processing at top - level, outside of any function scope. */ -void -ix86_set_current_function (tree fndecl) -{ - /* Only change the context if the function changes. This hook is called - several times in the course of compiling a function, and we don't want to - slow things down too much or call target_reinit when it isn't safe. */ - if (fndecl == ix86_previous_fndecl) - { - /* There may be 2 function bodies for the same function FNDECL, - one is extern inline and one isn't. Call ix86_set_func_type - to set the func_type field. 
*/ - if (fndecl != NULL_TREE) - { - ix86_set_func_type (fndecl); - ix86_set_indirect_branch_type (fndecl); - } - return; - } - - tree old_tree; - if (ix86_previous_fndecl == NULL_TREE) - old_tree = target_option_current_node; - else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)) - old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl); - else - old_tree = target_option_default_node; - - if (fndecl == NULL_TREE) - { - if (old_tree != target_option_current_node) - ix86_reset_previous_fndecl (); - return; - } - - ix86_set_func_type (fndecl); - ix86_set_indirect_branch_type (fndecl); - - tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl); - if (new_tree == NULL_TREE) - new_tree = target_option_default_node; - - if (old_tree != new_tree) - { - cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree)); - if (TREE_TARGET_GLOBALS (new_tree)) - restore_target_globals (TREE_TARGET_GLOBALS (new_tree)); - else if (new_tree == target_option_default_node) - restore_target_globals (&default_target_globals); - else - TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts (); - } - ix86_previous_fndecl = fndecl; - - static bool prev_no_caller_saved_registers; - - /* 64-bit MS and SYSV ABI have different set of call used registers. - Avoid expensive re-initialization of init_regs each time we switch - function context. */ - if (TARGET_64BIT - && (call_used_or_fixed_reg_p (SI_REG) - == (cfun->machine->call_abi == MS_ABI))) - reinit_regs (); - /* Need to re-initialize init_regs if caller-saved registers are - changed. */ - else if (prev_no_caller_saved_registers - != cfun->machine->no_caller_saved_registers) - reinit_regs (); - - if (cfun->machine->func_type != TYPE_NORMAL - || cfun->machine->no_caller_saved_registers) - { - /* Don't allow SSE, MMX nor x87 instructions since they - may change processor state. */ - const char *isa; - if (TARGET_SSE) - isa = "SSE"; - else if (TARGET_MMX) - isa = "MMX/3Dnow"; - else if (TARGET_80387) - isa = "80387"; - else - isa = NULL; - if (isa != NULL) - { - if (cfun->machine->func_type != TYPE_NORMAL) - sorry (cfun->machine->func_type == TYPE_EXCEPTION - ? G_("%s instructions aren%'t allowed in an" - " exception service routine") - : G_("%s instructions aren%'t allowed in an" - " interrupt service routine"), - isa); - else - sorry ("%s instructions aren%'t allowed in a function with " - "the %<no_caller_saved_registers%> attribute", isa); - /* Don't issue the same error twice. */ - cfun->machine->func_type = TYPE_NORMAL; - cfun->machine->no_caller_saved_registers = false; - } - } - - prev_no_caller_saved_registers - = cfun->machine->no_caller_saved_registers; -} - -/* Implement the TARGET_OFFLOAD_OPTIONS hook. */ -char * -ix86_offload_options (void) -{ - if (TARGET_LP64) - return xstrdup ("-foffload-abi=lp64"); - return xstrdup ("-foffload-abi=ilp32"); -} - -/* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall", - and "sseregparm" calling convention attributes; - arguments as in struct attribute_spec.handler. */ - -static tree -ix86_handle_cconv_attribute (tree *node, tree name, tree args, int, - bool *no_add_attrs) -{ - if (TREE_CODE (*node) != FUNCTION_TYPE - && TREE_CODE (*node) != METHOD_TYPE - && TREE_CODE (*node) != FIELD_DECL - && TREE_CODE (*node) != TYPE_DECL) - { - warning (OPT_Wattributes, "%qE attribute only applies to functions", - name); - *no_add_attrs = true; - return NULL_TREE; - } - - /* Can combine regparm with all attributes but fastcall, and thiscall. 
*/ - if (is_attribute_p ("regparm", name)) - { - tree cst; - - if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) - { - error ("fastcall and regparm attributes are not compatible"); - } - - if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) - { - error ("regparam and thiscall attributes are not compatible"); - } - - cst = TREE_VALUE (args); - if (TREE_CODE (cst) != INTEGER_CST) - { - warning (OPT_Wattributes, - "%qE attribute requires an integer constant argument", - name); - *no_add_attrs = true; - } - else if (compare_tree_int (cst, REGPARM_MAX) > 0) - { - warning (OPT_Wattributes, "argument to %qE attribute larger than %d", - name, REGPARM_MAX); - *no_add_attrs = true; - } - - return NULL_TREE; - } - - if (TARGET_64BIT) - { - /* Do not warn when emulating the MS ABI. */ - if ((TREE_CODE (*node) != FUNCTION_TYPE - && TREE_CODE (*node) != METHOD_TYPE) - || ix86_function_type_abi (*node) != MS_ABI) - warning (OPT_Wattributes, "%qE attribute ignored", - name); - *no_add_attrs = true; - return NULL_TREE; - } - - /* Can combine fastcall with stdcall (redundant) and sseregparm. */ - if (is_attribute_p ("fastcall", name)) - { - if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) - { - error ("fastcall and cdecl attributes are not compatible"); - } - if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) - { - error ("fastcall and stdcall attributes are not compatible"); - } - if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node))) - { - error ("fastcall and regparm attributes are not compatible"); - } - if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) - { - error ("fastcall and thiscall attributes are not compatible"); - } - } - - /* Can combine stdcall with fastcall (redundant), regparm and - sseregparm. */ - else if (is_attribute_p ("stdcall", name)) - { - if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) - { - error ("stdcall and cdecl attributes are not compatible"); - } - if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) - { - error ("stdcall and fastcall attributes are not compatible"); - } - if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) - { - error ("stdcall and thiscall attributes are not compatible"); - } - } - - /* Can combine cdecl with regparm and sseregparm. */ - else if (is_attribute_p ("cdecl", name)) - { - if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) - { - error ("stdcall and cdecl attributes are not compatible"); - } - if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) - { - error ("fastcall and cdecl attributes are not compatible"); - } - if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) - { - error ("cdecl and thiscall attributes are not compatible"); - } - } - else if (is_attribute_p ("thiscall", name)) - { - if (TREE_CODE (*node) != METHOD_TYPE && pedantic) - warning (OPT_Wattributes, "%qE attribute is used for non-class method", - name); - if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) - { - error ("stdcall and thiscall attributes are not compatible"); - } - if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) - { - error ("fastcall and thiscall attributes are not compatible"); - } - if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) - { - error ("cdecl and thiscall attributes are not compatible"); - } - } - - /* Can combine sseregparm with all attributes. 
*/ - - return NULL_TREE; -} - -#ifndef CHECK_STACK_LIMIT -#define CHECK_STACK_LIMIT (-1) -#endif - -/* The transactional memory builtins are implicitly regparm or fastcall - depending on the ABI. Override the generic do-nothing attribute that - these builtins were declared with, and replace it with one of the two - attributes that we expect elsewhere. */ - -static tree -ix86_handle_tm_regparm_attribute (tree *node, tree, tree, - int flags, bool *no_add_attrs) -{ - tree alt; - - /* In no case do we want to add the placeholder attribute. */ - *no_add_attrs = true; - - /* The 64-bit ABI is unchanged for transactional memory. */ - if (TARGET_64BIT) - return NULL_TREE; - - /* ??? Is there a better way to validate 32-bit windows? We have - cfun->machine->call_abi, but that seems to be set only for 64-bit. */ - if (CHECK_STACK_LIMIT > 0) - alt = tree_cons (get_identifier ("fastcall"), NULL, NULL); - else - { - alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL); - alt = tree_cons (get_identifier ("regparm"), alt, NULL); - } - decl_attributes (node, alt, flags); - - return NULL_TREE; -} - -/* Handle a "force_align_arg_pointer" attribute. */ - -static tree -ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name, - tree, int, bool *no_add_attrs) -{ - if (TREE_CODE (*node) != FUNCTION_TYPE - && TREE_CODE (*node) != METHOD_TYPE - && TREE_CODE (*node) != FIELD_DECL - && TREE_CODE (*node) != TYPE_DECL) - { - warning (OPT_Wattributes, "%qE attribute only applies to functions", - name); - *no_add_attrs = true; - } - - return NULL_TREE; -} - -/* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in - struct attribute_spec.handler. */ - -static tree -ix86_handle_struct_attribute (tree *node, tree name, tree, int, - bool *no_add_attrs) -{ - tree *type = NULL; - if (DECL_P (*node)) - { - if (TREE_CODE (*node) == TYPE_DECL) - type = &TREE_TYPE (*node); - } - else - type = node; - - if (!(type && RECORD_OR_UNION_TYPE_P (*type))) - { - warning (OPT_Wattributes, "%qE attribute ignored", - name); - *no_add_attrs = true; - } - - else if ((is_attribute_p ("ms_struct", name) - && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type))) - || ((is_attribute_p ("gcc_struct", name) - && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type))))) - { - warning (OPT_Wattributes, "%qE incompatible attribute ignored", - name); - *no_add_attrs = true; - } - - return NULL_TREE; -} - -/* Handle a "callee_pop_aggregate_return" attribute; arguments as - in struct attribute_spec handler. 
*/ - -static tree -ix86_handle_callee_pop_aggregate_return (tree *node, tree name, tree args, int, - bool *no_add_attrs) -{ - if (TREE_CODE (*node) != FUNCTION_TYPE - && TREE_CODE (*node) != METHOD_TYPE - && TREE_CODE (*node) != FIELD_DECL - && TREE_CODE (*node) != TYPE_DECL) - { - warning (OPT_Wattributes, "%qE attribute only applies to functions", - name); - *no_add_attrs = true; - return NULL_TREE; - } - if (TARGET_64BIT) - { - warning (OPT_Wattributes, "%qE attribute only available for 32-bit", - name); - *no_add_attrs = true; - return NULL_TREE; - } - if (is_attribute_p ("callee_pop_aggregate_return", name)) - { - tree cst; - - cst = TREE_VALUE (args); - if (TREE_CODE (cst) != INTEGER_CST) - { - warning (OPT_Wattributes, - "%qE attribute requires an integer constant argument", - name); - *no_add_attrs = true; - } - else if (compare_tree_int (cst, 0) != 0 - && compare_tree_int (cst, 1) != 0) - { - warning (OPT_Wattributes, - "argument to %qE attribute is neither zero, nor one", - name); - *no_add_attrs = true; - } - - return NULL_TREE; - } - - return NULL_TREE; -} - -/* Handle a "ms_abi" or "sysv" attribute; arguments as in - struct attribute_spec.handler. */ - -static tree -ix86_handle_abi_attribute (tree *node, tree name, tree, int, - bool *no_add_attrs) -{ - if (TREE_CODE (*node) != FUNCTION_TYPE - && TREE_CODE (*node) != METHOD_TYPE - && TREE_CODE (*node) != FIELD_DECL - && TREE_CODE (*node) != TYPE_DECL) - { - warning (OPT_Wattributes, "%qE attribute only applies to functions", - name); - *no_add_attrs = true; - return NULL_TREE; - } - - /* Can combine regparm with all attributes but fastcall. */ - if (is_attribute_p ("ms_abi", name)) - { - if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node))) - { - error ("%qs and %qs attributes are not compatible", - "ms_abi", "sysv_abi"); - } - - return NULL_TREE; - } - else if (is_attribute_p ("sysv_abi", name)) - { - if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node))) - { - error ("%qs and %qs attributes are not compatible", - "ms_abi", "sysv_abi"); - } - - return NULL_TREE; - } - - return NULL_TREE; -} - -static tree -ix86_handle_fndecl_attribute (tree *node, tree name, tree args, int, - bool *no_add_attrs) -{ - if (TREE_CODE (*node) != FUNCTION_DECL) - { - warning (OPT_Wattributes, "%qE attribute only applies to functions", - name); - *no_add_attrs = true; - } - - if (is_attribute_p ("indirect_branch", name)) - { - tree cst = TREE_VALUE (args); - if (TREE_CODE (cst) != STRING_CST) - { - warning (OPT_Wattributes, - "%qE attribute requires a string constant argument", - name); - *no_add_attrs = true; - } - else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0 - && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0 - && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0 - && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0) - { - warning (OPT_Wattributes, - "argument to %qE attribute is not " - "(keep|thunk|thunk-inline|thunk-extern)", name); - *no_add_attrs = true; - } - } - - if (is_attribute_p ("function_return", name)) - { - tree cst = TREE_VALUE (args); - if (TREE_CODE (cst) != STRING_CST) - { - warning (OPT_Wattributes, - "%qE attribute requires a string constant argument", - name); - *no_add_attrs = true; - } - else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0 - && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0 - && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0 - && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0) - { - warning (OPT_Wattributes, - "argument to %qE attribute is not " - 
"(keep|thunk|thunk-inline|thunk-extern)", name); - *no_add_attrs = true; - } - } - - return NULL_TREE; -} - -static tree -ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree, - int, bool *) -{ - return NULL_TREE; -} - -static tree -ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *) -{ - /* DECL_RESULT and DECL_ARGUMENTS do not exist there yet, - but the function type contains args and return type data. */ - tree func_type = *node; - tree return_type = TREE_TYPE (func_type); - - int nargs = 0; - tree current_arg_type = TYPE_ARG_TYPES (func_type); - while (current_arg_type - && ! VOID_TYPE_P (TREE_VALUE (current_arg_type))) - { - if (nargs == 0) - { - if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type))) - error ("interrupt service routine should have a pointer " - "as the first argument"); - } - else if (nargs == 1) - { - if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE - || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode) - error ("interrupt service routine should have %qs " - "as the second argument", - TARGET_64BIT - ? (TARGET_X32 ? "unsigned long long int" - : "unsigned long int") - : "unsigned int"); - } - nargs++; - current_arg_type = TREE_CHAIN (current_arg_type); - } - if (!nargs || nargs > 2) - error ("interrupt service routine can only have a pointer argument " - "and an optional integer argument"); - if (! VOID_TYPE_P (return_type)) - error ("interrupt service routine must return %<void%>"); - - return NULL_TREE; -} - -/* Handle fentry_name / fentry_section attribute. */ - -static tree -ix86_handle_fentry_name (tree *node, tree name, tree args, - int, bool *no_add_attrs) -{ - if (TREE_CODE (*node) == FUNCTION_DECL - && TREE_CODE (TREE_VALUE (args)) == STRING_CST) - /* Do nothing else, just set the attribute. We'll get at - it later with lookup_attribute. */ - ; - else - { - warning (OPT_Wattributes, "%qE attribute ignored", name); - *no_add_attrs = true; - } - - return NULL_TREE; -} - -/* Table of valid machine attributes. */ -const struct attribute_spec ix86_attribute_table[] = -{ - /* { name, min_len, max_len, decl_req, type_req, fn_type_req, - affects_type_identity, handler, exclude } */ - /* Stdcall attribute says callee is responsible for popping arguments - if they are not variable. */ - { "stdcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, - NULL }, - /* Fastcall attribute says callee is responsible for popping arguments - if they are not variable. */ - { "fastcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, - NULL }, - /* Thiscall attribute says callee is responsible for popping arguments - if they are not variable. */ - { "thiscall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, - NULL }, - /* Cdecl attribute says the callee is a normal C declaration */ - { "cdecl", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, - NULL }, - /* Regparm attribute specifies how many integer arguments are to be - passed in registers. */ - { "regparm", 1, 1, false, true, true, true, ix86_handle_cconv_attribute, - NULL }, - /* Sseregparm attribute says we are using x86_64 calling conventions - for FP arguments. */ - { "sseregparm", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, - NULL }, - /* The transactional memory builtins are implicitly regparm or fastcall - depending on the ABI. Override the generic do-nothing attribute that - these builtins were declared with. 
*/ - { "*tm regparm", 0, 0, false, true, true, true, - ix86_handle_tm_regparm_attribute, NULL }, - /* force_align_arg_pointer says this function realigns the stack at entry. */ - { "force_align_arg_pointer", 0, 0, - false, true, true, false, ix86_handle_force_align_arg_pointer_attribute, - NULL }, -#if TARGET_DLLIMPORT_DECL_ATTRIBUTES - { "dllimport", 0, 0, false, false, false, false, handle_dll_attribute, - NULL }, - { "dllexport", 0, 0, false, false, false, false, handle_dll_attribute, - NULL }, - { "shared", 0, 0, true, false, false, false, - ix86_handle_shared_attribute, NULL }, -#endif - { "ms_struct", 0, 0, false, false, false, false, - ix86_handle_struct_attribute, NULL }, - { "gcc_struct", 0, 0, false, false, false, false, - ix86_handle_struct_attribute, NULL }, -#ifdef SUBTARGET_ATTRIBUTE_TABLE - SUBTARGET_ATTRIBUTE_TABLE, -#endif - /* ms_abi and sysv_abi calling convention function attributes. */ - { "ms_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute, NULL }, - { "sysv_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute, - NULL }, - { "ms_abi va_list", 0, 0, false, false, false, false, NULL, NULL }, - { "sysv_abi va_list", 0, 0, false, false, false, false, NULL, NULL }, - { "ms_hook_prologue", 0, 0, true, false, false, false, - ix86_handle_fndecl_attribute, NULL }, - { "callee_pop_aggregate_return", 1, 1, false, true, true, true, - ix86_handle_callee_pop_aggregate_return, NULL }, - { "interrupt", 0, 0, false, true, true, false, - ix86_handle_interrupt_attribute, NULL }, - { "no_caller_saved_registers", 0, 0, false, true, true, false, - ix86_handle_no_caller_saved_registers_attribute, NULL }, - { "naked", 0, 0, true, false, false, false, - ix86_handle_fndecl_attribute, NULL }, - { "indirect_branch", 1, 1, true, false, false, false, - ix86_handle_fndecl_attribute, NULL }, - { "function_return", 1, 1, true, false, false, false, - ix86_handle_fndecl_attribute, NULL }, - { "indirect_return", 0, 0, false, true, true, false, - NULL, NULL }, - { "fentry_name", 1, 1, true, false, false, false, - ix86_handle_fentry_name, NULL }, - { "fentry_section", 1, 1, true, false, false, false, - ix86_handle_fentry_name, NULL }, - { "cf_check", 0, 0, true, false, false, false, - ix86_handle_fndecl_attribute, NULL }, - - /* End element. */ - { NULL, 0, 0, false, false, false, false, NULL, NULL } -}; - -#include "gt-i386-options.h" diff --git a/gcc/config/i386/t-cet b/gcc/config/i386/t-cet deleted file mode 100644 index d685d31..0000000 --- a/gcc/config/i386/t-cet +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (C) 2017-2020 Free Software Foundation, Inc. -# -# This file is part of GCC. -# -# GCC is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3, or (at your option) -# any later version. -# -# GCC is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with GCC; see the file COPYING3. If not see -# <http://www.gnu.org/licenses/>. 
- -cet.o: $(srcdir)/config/i386/cet.c - $(COMPILE) $< - $(POSTCOMPILE) diff --git a/gcc/config/i386/x86-tune-sched-atom.c b/gcc/config/i386/x86-tune-sched-atom.c deleted file mode 100644 index 1318efa..0000000 --- a/gcc/config/i386/x86-tune-sched-atom.c +++ /dev/null @@ -1,246 +0,0 @@ -/* Scheduler hooks for IA-32 which implement atom+ specific logic. - Copyright (C) 1988-2020 Free Software Foundation, Inc. - -This file is part of GCC. - -GCC is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 3, or (at your option) -any later version. - -GCC is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with GCC; see the file COPYING3. If not see -<http://www.gnu.org/licenses/>. */ - -#define IN_TARGET_CODE 1 - -#include "config.h" -#include "system.h" -#include "coretypes.h" -#include "backend.h" -#include "rtl.h" -#include "tree.h" -#include "cfghooks.h" -#include "tm_p.h" -#include "insn-config.h" -#include "insn-attr.h" -#include "recog.h" -#include "target.h" -#include "rtl-iter.h" -#include "regset.h" -#include "sched-int.h" - -/* Try to reorder ready list to take advantage of Atom pipelined IMUL - execution. It is applied if - (1) IMUL instruction is on the top of list; - (2) There exists the only producer of independent IMUL instruction in - ready list. - Return index of IMUL producer if it was found and -1 otherwise. */ -static int -do_reorder_for_imul (rtx_insn **ready, int n_ready) -{ - rtx_insn *insn; - rtx set, insn1, insn2; - sd_iterator_def sd_it; - dep_t dep; - int index = -1; - int i; - - if (!TARGET_BONNELL) - return index; - - /* Check that IMUL instruction is on the top of ready list. */ - insn = ready[n_ready - 1]; - set = single_set (insn); - if (!set) - return index; - if (!(GET_CODE (SET_SRC (set)) == MULT - && GET_MODE (SET_SRC (set)) == SImode)) - return index; - - /* Search for producer of independent IMUL instruction. */ - for (i = n_ready - 2; i >= 0; i--) - { - insn = ready[i]; - if (!NONDEBUG_INSN_P (insn)) - continue; - /* Skip IMUL instruction. */ - insn2 = PATTERN (insn); - if (GET_CODE (insn2) == PARALLEL) - insn2 = XVECEXP (insn2, 0, 0); - if (GET_CODE (insn2) == SET - && GET_CODE (SET_SRC (insn2)) == MULT - && GET_MODE (SET_SRC (insn2)) == SImode) - continue; - - FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep) - { - rtx con; - con = DEP_CON (dep); - if (!NONDEBUG_INSN_P (con)) - continue; - insn1 = PATTERN (con); - if (GET_CODE (insn1) == PARALLEL) - insn1 = XVECEXP (insn1, 0, 0); - - if (GET_CODE (insn1) == SET - && GET_CODE (SET_SRC (insn1)) == MULT - && GET_MODE (SET_SRC (insn1)) == SImode) - { - sd_iterator_def sd_it1; - dep_t dep1; - /* Check if there is no other dependee for IMUL. */ - index = i; - FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1) - { - rtx pro; - pro = DEP_PRO (dep1); - if (!NONDEBUG_INSN_P (pro)) - continue; - if (pro != insn) - index = -1; - } - if (index >= 0) - break; - } - } - if (index >= 0) - break; - } - return index; -} - -/* Try to find the best candidate on the top of ready list if two insns - have the same priority - candidate is best if its dependees were - scheduled earlier. Applied for Silvermont only. - Return true if top 2 insns must be interchanged. 
*/ -static bool -swap_top_of_ready_list (rtx_insn **ready, int n_ready) -{ - rtx_insn *top = ready[n_ready - 1]; - rtx_insn *next = ready[n_ready - 2]; - rtx set; - sd_iterator_def sd_it; - dep_t dep; - int clock1 = -1; - int clock2 = -1; - #define INSN_TICK(INSN) (HID (INSN)->tick) - - if (!TARGET_SILVERMONT && !TARGET_INTEL) - return false; - - if (!NONDEBUG_INSN_P (top)) - return false; - if (!NONJUMP_INSN_P (top)) - return false; - if (!NONDEBUG_INSN_P (next)) - return false; - if (!NONJUMP_INSN_P (next)) - return false; - set = single_set (top); - if (!set) - return false; - set = single_set (next); - if (!set) - return false; - - if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next)) - { - if (INSN_PRIORITY (top) != INSN_PRIORITY (next)) - return false; - /* Determine winner more precise. */ - FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep) - { - rtx pro; - pro = DEP_PRO (dep); - if (!NONDEBUG_INSN_P (pro)) - continue; - if (INSN_TICK (pro) > clock1) - clock1 = INSN_TICK (pro); - } - FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep) - { - rtx pro; - pro = DEP_PRO (dep); - if (!NONDEBUG_INSN_P (pro)) - continue; - if (INSN_TICK (pro) > clock2) - clock2 = INSN_TICK (pro); - } - - if (clock1 == clock2) - { - /* Determine winner - load must win. */ - enum attr_memory memory1, memory2; - memory1 = get_attr_memory (top); - memory2 = get_attr_memory (next); - if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD) - return true; - } - return (bool) (clock2 < clock1); - } - return false; - #undef INSN_TICK -} - -/* Perform possible reodering of ready list for Atom/Silvermont only. - Return issue rate. */ -int -ix86_atom_sched_reorder (FILE *dump, int sched_verbose, rtx_insn **ready, - int *pn_ready, int clock_var) -{ - int issue_rate = -1; - int n_ready = *pn_ready; - int i; - rtx_insn *insn; - int index = -1; - - /* Set up issue rate. */ - issue_rate = ix86_issue_rate (); - - /* Do reodering for BONNELL/SILVERMONT only. */ - if (!TARGET_BONNELL && !TARGET_SILVERMONT && !TARGET_INTEL) - return issue_rate; - - /* Nothing to do if ready list contains only 1 instruction. */ - if (n_ready <= 1) - return issue_rate; - - /* Do reodering for post-reload scheduler only. */ - if (!reload_completed) - return issue_rate; - - if ((index = do_reorder_for_imul (ready, n_ready)) >= 0) - { - if (sched_verbose > 1) - fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n", - INSN_UID (ready[index])); - - /* Put IMUL producer (ready[index]) at the top of ready list. */ - insn = ready[index]; - for (i = index; i < n_ready - 1; i++) - ready[i] = ready[i + 1]; - ready[n_ready - 1] = insn; - return issue_rate; - } - - /* Skip selective scheduling since HID is not populated in it. */ - if (clock_var != 0 - && !sel_sched_p () - && swap_top_of_ready_list (ready, n_ready)) - { - if (sched_verbose > 1) - fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n", - INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2])); - /* Swap 2 top elements of ready list. */ - insn = ready[n_ready - 1]; - ready[n_ready - 1] = ready[n_ready - 2]; - ready[n_ready - 2] = insn; - } - return issue_rate; -} diff --git a/gcc/config/i386/x86-tune-sched-bd.c b/gcc/config/i386/x86-tune-sched-bd.c deleted file mode 100644 index 8c2abc4..0000000 --- a/gcc/config/i386/x86-tune-sched-bd.c +++ /dev/null @@ -1,824 +0,0 @@ -/* Scheduler hooks for IA-32 which implement bdver1-4 specific logic. - Copyright (C) 1988-2020 Free Software Foundation, Inc. - -This file is part of GCC. 
- -GCC is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 3, or (at your option) -any later version. - -GCC is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with GCC; see the file COPYING3. If not see -<http://www.gnu.org/licenses/>. */ - -#define IN_TARGET_CODE 1 - -#include "config.h" -#include "system.h" -#include "coretypes.h" -#include "backend.h" -#include "rtl.h" -#include "tree.h" -#include "cfghooks.h" -#include "tm_p.h" -#include "insn-config.h" -#include "insn-attr.h" -#include "recog.h" -#include "target.h" -#include "rtl-iter.h" -#include "regset.h" -#include "sched-int.h" - -/* The size of the dispatch window is the total number of bytes of - object code allowed in a window. */ -#define DISPATCH_WINDOW_SIZE 16 - -/* Number of dispatch windows considered for scheduling. */ -#define MAX_DISPATCH_WINDOWS 3 - -/* Maximum number of instructions in a window. */ -#define MAX_INSN 4 - -/* Maximum number of immediate operands in a window. */ -#define MAX_IMM 4 - -/* Maximum number of immediate bits allowed in a window. */ -#define MAX_IMM_SIZE 128 - -/* Maximum number of 32 bit immediates allowed in a window. */ -#define MAX_IMM_32 4 - -/* Maximum number of 64 bit immediates allowed in a window. */ -#define MAX_IMM_64 2 - -/* Maximum total of loads or prefetches allowed in a window. */ -#define MAX_LOAD 2 - -/* Maximum total of stores allowed in a window. */ -#define MAX_STORE 1 - -#undef BIG -#define BIG 100 - - -/* Dispatch groups. Istructions that affect the mix in a dispatch window. */ -enum dispatch_group { - disp_no_group = 0, - disp_load, - disp_store, - disp_load_store, - disp_prefetch, - disp_imm, - disp_imm_32, - disp_imm_64, - disp_branch, - disp_cmp, - disp_jcc, - disp_last -}; - -/* Number of allowable groups in a dispatch window. It is an array - indexed by dispatch_group enum. 100 is used as a big number, - because the number of these kind of operations does not have any - effect in dispatch window, but we need them for other reasons in - the table. */ -static unsigned int num_allowable_groups[disp_last] = { - 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG -}; - -char group_name[disp_last + 1][16] = { - "disp_no_group", "disp_load", "disp_store", "disp_load_store", - "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64", - "disp_branch", "disp_cmp", "disp_jcc", "disp_last" -}; - -/* Instruction path. */ -enum insn_path { - no_path = 0, - path_single, /* Single micro op. */ - path_double, /* Double micro op. */ - path_multi, /* Instructions with more than 2 micro op.. */ - last_path -}; - -/* sched_insn_info defines a window to the instructions scheduled in - the basic block. It contains a pointer to the insn_info table and - the instruction scheduled. - - Windows are allocated for each basic block and are linked - together. */ -typedef struct sched_insn_info_s { - rtx insn; - enum dispatch_group group; - enum insn_path path; - int byte_len; - int imm_bytes; -} sched_insn_info; - -/* Linked list of dispatch windows. This is a two way list of - dispatch windows of a basic block. 
It contains information about - the number of uops in the window and the total number of - instructions and of bytes in the object code for this dispatch - window. */ -typedef struct dispatch_windows_s { - int num_insn; /* Number of insn in the window. */ - int num_uops; /* Number of uops in the window. */ - int window_size; /* Number of bytes in the window. */ - int window_num; /* Window number between 0 or 1. */ - int num_imm; /* Number of immediates in an insn. */ - int num_imm_32; /* Number of 32 bit immediates in an insn. */ - int num_imm_64; /* Number of 64 bit immediates in an insn. */ - int imm_size; /* Total immediates in the window. */ - int num_loads; /* Total memory loads in the window. */ - int num_stores; /* Total memory stores in the window. */ - int violation; /* Violation exists in window. */ - sched_insn_info *window; /* Pointer to the window. */ - struct dispatch_windows_s *next; - struct dispatch_windows_s *prev; -} dispatch_windows; - -/* Immediate valuse used in an insn. */ -typedef struct imm_info_s - { - int imm; - int imm32; - int imm64; - } imm_info; - -static dispatch_windows *dispatch_window_list; -static dispatch_windows *dispatch_window_list1; - -/* Get dispatch group of insn. */ - -static enum dispatch_group -get_mem_group (rtx_insn *insn) -{ - enum attr_memory memory; - - if (INSN_CODE (insn) < 0) - return disp_no_group; - memory = get_attr_memory (insn); - if (memory == MEMORY_STORE) - return disp_store; - - if (memory == MEMORY_LOAD) - return disp_load; - - if (memory == MEMORY_BOTH) - return disp_load_store; - - return disp_no_group; -} - -/* Return true if insn is a compare instruction. */ - -static bool -is_cmp (rtx_insn *insn) -{ - enum attr_type type; - - type = get_attr_type (insn); - return (type == TYPE_TEST - || type == TYPE_ICMP - || type == TYPE_FCMP - || GET_CODE (PATTERN (insn)) == COMPARE); -} - -/* Return true if a dispatch violation encountered. */ - -static bool -dispatch_violation (void) -{ - if (dispatch_window_list->next) - return dispatch_window_list->next->violation; - return dispatch_window_list->violation; -} - -/* Return true if insn is a branch instruction. */ - -static bool -is_branch (rtx_insn *insn) -{ - return (CALL_P (insn) || JUMP_P (insn)); -} - -/* Return true if insn is a prefetch instruction. */ - -static bool -is_prefetch (rtx_insn *insn) -{ - return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH; -} - -/* This function initializes a dispatch window and the list container holding a - pointer to the window. */ - -static void -init_window (int window_num) -{ - int i; - dispatch_windows *new_list; - - if (window_num == 0) - new_list = dispatch_window_list; - else - new_list = dispatch_window_list1; - - new_list->num_insn = 0; - new_list->num_uops = 0; - new_list->window_size = 0; - new_list->next = NULL; - new_list->prev = NULL; - new_list->window_num = window_num; - new_list->num_imm = 0; - new_list->num_imm_32 = 0; - new_list->num_imm_64 = 0; - new_list->imm_size = 0; - new_list->num_loads = 0; - new_list->num_stores = 0; - new_list->violation = false; - - for (i = 0; i < MAX_INSN; i++) - { - new_list->window[i].insn = NULL; - new_list->window[i].group = disp_no_group; - new_list->window[i].path = no_path; - new_list->window[i].byte_len = 0; - new_list->window[i].imm_bytes = 0; - } - return; -} - -/* This function allocates and initializes a dispatch window and the - list container holding a pointer to the window. 
*/ - -static dispatch_windows * -allocate_window (void) -{ - dispatch_windows *new_list = XNEW (struct dispatch_windows_s); - new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1); - - return new_list; -} - -/* This routine initializes the dispatch scheduling information. It - initiates building dispatch scheduler tables and constructs the - first dispatch window. */ - -static void -init_dispatch_sched (void) -{ - /* Allocate a dispatch list and a window. */ - dispatch_window_list = allocate_window (); - dispatch_window_list1 = allocate_window (); - init_window (0); - init_window (1); -} - -/* This function returns true if a branch is detected. End of a basic block - does not have to be a branch, but here we assume only branches end a - window. */ - -static bool -is_end_basic_block (enum dispatch_group group) -{ - return group == disp_branch; -} - -/* This function is called when the end of a window processing is reached. */ - -static void -process_end_window (void) -{ - gcc_assert (dispatch_window_list->num_insn <= MAX_INSN); - if (dispatch_window_list->next) - { - gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN); - gcc_assert (dispatch_window_list->window_size - + dispatch_window_list1->window_size <= 48); - init_window (1); - } - init_window (0); -} - -/* Allocates a new dispatch window and adds it to WINDOW_LIST. - WINDOW_NUM is either 0 or 1. A maximum of two windows are generated - for 48 bytes of instructions. Note that these windows are not dispatch - windows that their sizes are DISPATCH_WINDOW_SIZE. */ - -static dispatch_windows * -allocate_next_window (int window_num) -{ - if (window_num == 0) - { - if (dispatch_window_list->next) - init_window (1); - init_window (0); - return dispatch_window_list; - } - - dispatch_window_list->next = dispatch_window_list1; - dispatch_window_list1->prev = dispatch_window_list; - - return dispatch_window_list1; -} - -/* Compute number of immediate operands of an instruction. */ - -static void -find_constant (rtx in_rtx, imm_info *imm_values) -{ - if (INSN_P (in_rtx)) - in_rtx = PATTERN (in_rtx); - subrtx_iterator::array_type array; - FOR_EACH_SUBRTX (iter, array, in_rtx, ALL) - if (const_rtx x = *iter) - switch (GET_CODE (x)) - { - case CONST: - case SYMBOL_REF: - case CONST_INT: - (imm_values->imm)++; - if (x86_64_immediate_operand (CONST_CAST_RTX (x), SImode)) - (imm_values->imm32)++; - else - (imm_values->imm64)++; - break; - - case CONST_DOUBLE: - case CONST_WIDE_INT: - (imm_values->imm)++; - (imm_values->imm64)++; - break; - - case CODE_LABEL: - if (LABEL_KIND (x) == LABEL_NORMAL) - { - (imm_values->imm)++; - (imm_values->imm32)++; - } - break; - - default: - break; - } -} - -/* Return total size of immediate operands of an instruction along with number - of corresponding immediate-operands. It initializes its parameters to zero - befor calling FIND_CONSTANT. - INSN is the input instruction. IMM is the total of immediates. - IMM32 is the number of 32 bit immediates. IMM64 is the number of 64 - bit immediates. */ - -static int -get_num_immediates (rtx_insn *insn, int *imm, int *imm32, int *imm64) -{ - imm_info imm_values = {0, 0, 0}; - - find_constant (insn, &imm_values); - *imm = imm_values.imm; - *imm32 = imm_values.imm32; - *imm64 = imm_values.imm64; - return imm_values.imm32 * 4 + imm_values.imm64 * 8; -} - -/* This function indicates if an operand of an instruction is an - immediate. 
*/ - -static bool -has_immediate (rtx_insn *insn) -{ - int num_imm_operand; - int num_imm32_operand; - int num_imm64_operand; - - if (insn) - return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand, - &num_imm64_operand); - return false; -} - -/* Return single or double path for instructions. */ - -static enum insn_path -get_insn_path (rtx_insn *insn) -{ - enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn); - - if ((int)path == 0) - return path_single; - - if ((int)path == 1) - return path_double; - - return path_multi; -} - -/* Return insn dispatch group. */ - -static enum dispatch_group -get_insn_group (rtx_insn *insn) -{ - enum dispatch_group group = get_mem_group (insn); - if (group) - return group; - - if (is_branch (insn)) - return disp_branch; - - if (is_cmp (insn)) - return disp_cmp; - - if (has_immediate (insn)) - return disp_imm; - - if (is_prefetch (insn)) - return disp_prefetch; - - return disp_no_group; -} - -/* Count number of GROUP restricted instructions in a dispatch - window WINDOW_LIST. */ - -static int -count_num_restricted (rtx_insn *insn, dispatch_windows *window_list) -{ - enum dispatch_group group = get_insn_group (insn); - int imm_size; - int num_imm_operand; - int num_imm32_operand; - int num_imm64_operand; - - if (group == disp_no_group) - return 0; - - if (group == disp_imm) - { - imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand, - &num_imm64_operand); - if (window_list->imm_size + imm_size > MAX_IMM_SIZE - || num_imm_operand + window_list->num_imm > MAX_IMM - || (num_imm32_operand > 0 - && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32 - || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32)) - || (num_imm64_operand > 0 - && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64 - || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32)) - || (window_list->imm_size + imm_size == MAX_IMM_SIZE - && num_imm64_operand > 0 - && ((window_list->num_imm_64 > 0 - && window_list->num_insn >= 2) - || window_list->num_insn >= 3))) - return BIG; - - return 1; - } - - if ((group == disp_load_store - && (window_list->num_loads >= MAX_LOAD - || window_list->num_stores >= MAX_STORE)) - || ((group == disp_load - || group == disp_prefetch) - && window_list->num_loads >= MAX_LOAD) - || (group == disp_store - && window_list->num_stores >= MAX_STORE)) - return BIG; - - return 1; -} - -/* This function returns true if insn satisfies dispatch rules on the - last window scheduled. */ - -static bool -fits_dispatch_window (rtx_insn *insn) -{ - dispatch_windows *window_list = dispatch_window_list; - dispatch_windows *window_list_next = dispatch_window_list->next; - unsigned int num_restrict; - enum dispatch_group group = get_insn_group (insn); - enum insn_path path = get_insn_path (insn); - int sum; - - /* Make disp_cmp and disp_jcc get scheduled at the latest. These - instructions should be given the lowest priority in the - scheduling process in Haifa scheduler to make sure they will be - scheduled in the same dispatch window as the reference to them. */ - if (group == disp_jcc || group == disp_cmp) - return false; - - /* Check nonrestricted. */ - if (group == disp_no_group || group == disp_branch) - return true; - - /* Get last dispatch window. */ - if (window_list_next) - window_list = window_list_next; - - if (window_list->window_num == 1) - { - sum = window_list->prev->window_size + window_list->window_size; - - if (sum == 32 - || (ix86_min_insn_size (insn) + sum) >= 48) - /* Window 1 is full. 
Go for next window. */ - return true; - } - - num_restrict = count_num_restricted (insn, window_list); - - if (num_restrict > num_allowable_groups[group]) - return false; - - /* See if it fits in the first window. */ - if (window_list->window_num == 0) - { - /* The first widow should have only single and double path - uops. */ - if (path == path_double - && (window_list->num_uops + 2) > MAX_INSN) - return false; - else if (path != path_single) - return false; - } - return true; -} - -/* Add an instruction INSN with NUM_UOPS micro-operations to the - dispatch window WINDOW_LIST. */ - -static void -add_insn_window (rtx_insn *insn, dispatch_windows *window_list, int num_uops) -{ - int byte_len = ix86_min_insn_size (insn); - int num_insn = window_list->num_insn; - int imm_size; - sched_insn_info *window = window_list->window; - enum dispatch_group group = get_insn_group (insn); - enum insn_path path = get_insn_path (insn); - int num_imm_operand; - int num_imm32_operand; - int num_imm64_operand; - - if (!window_list->violation && group != disp_cmp - && !fits_dispatch_window (insn)) - window_list->violation = true; - - imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand, - &num_imm64_operand); - - /* Initialize window with new instruction. */ - window[num_insn].insn = insn; - window[num_insn].byte_len = byte_len; - window[num_insn].group = group; - window[num_insn].path = path; - window[num_insn].imm_bytes = imm_size; - - window_list->window_size += byte_len; - window_list->num_insn = num_insn + 1; - window_list->num_uops = window_list->num_uops + num_uops; - window_list->imm_size += imm_size; - window_list->num_imm += num_imm_operand; - window_list->num_imm_32 += num_imm32_operand; - window_list->num_imm_64 += num_imm64_operand; - - if (group == disp_store) - window_list->num_stores += 1; - else if (group == disp_load - || group == disp_prefetch) - window_list->num_loads += 1; - else if (group == disp_load_store) - { - window_list->num_stores += 1; - window_list->num_loads += 1; - } -} - -/* Adds a scheduled instruction, INSN, to the current dispatch window. - If the total bytes of instructions or the number of instructions in - the window exceed allowable, it allocates a new window. */ - -static void -add_to_dispatch_window (rtx_insn *insn) -{ - int byte_len; - dispatch_windows *window_list; - dispatch_windows *next_list; - dispatch_windows *window0_list; - enum insn_path path; - enum dispatch_group insn_group; - bool insn_fits; - int num_insn; - int num_uops; - int window_num; - int insn_num_uops; - int sum; - - if (INSN_CODE (insn) < 0) - return; - - byte_len = ix86_min_insn_size (insn); - window_list = dispatch_window_list; - next_list = window_list->next; - path = get_insn_path (insn); - insn_group = get_insn_group (insn); - - /* Get the last dispatch window. */ - if (next_list) - window_list = dispatch_window_list->next; - - if (path == path_single) - insn_num_uops = 1; - else if (path == path_double) - insn_num_uops = 2; - else - insn_num_uops = (int) path; - - /* If current window is full, get a new window. - Window number zero is full, if MAX_INSN uops are scheduled in it. - Window number one is full, if window zero's bytes plus window - one's bytes is 32, or if the bytes of the new instruction added - to the total makes it greater than 48, or it has already MAX_INSN - instructions in it. 
*/ - num_insn = window_list->num_insn; - num_uops = window_list->num_uops; - window_num = window_list->window_num; - insn_fits = fits_dispatch_window (insn); - - if (num_insn >= MAX_INSN - || num_uops + insn_num_uops > MAX_INSN - || !(insn_fits)) - { - window_num = ~window_num & 1; - window_list = allocate_next_window (window_num); - } - - if (window_num == 0) - { - add_insn_window (insn, window_list, insn_num_uops); - if (window_list->num_insn >= MAX_INSN - && insn_group == disp_branch) - { - process_end_window (); - return; - } - } - else if (window_num == 1) - { - window0_list = window_list->prev; - sum = window0_list->window_size + window_list->window_size; - if (sum == 32 - || (byte_len + sum) >= 48) - { - process_end_window (); - window_list = dispatch_window_list; - } - - add_insn_window (insn, window_list, insn_num_uops); - } - else - gcc_unreachable (); - - if (is_end_basic_block (insn_group)) - { - /* End of basic block is reached do end-basic-block process. */ - process_end_window (); - return; - } -} - -/* Print the dispatch window, WINDOW_NUM, to FILE. */ - -DEBUG_FUNCTION static void -debug_dispatch_window_file (FILE *file, int window_num) -{ - dispatch_windows *list; - int i; - - if (window_num == 0) - list = dispatch_window_list; - else - list = dispatch_window_list1; - - fprintf (file, "Window #%d:\n", list->window_num); - fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n", - list->num_insn, list->num_uops, list->window_size); - fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n", - list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size); - - fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads, - list->num_stores); - fprintf (file, " insn info:\n"); - - for (i = 0; i < MAX_INSN; i++) - { - if (!list->window[i].insn) - break; - fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n", - i, group_name[list->window[i].group], - i, (void *)list->window[i].insn, - i, list->window[i].path, - i, list->window[i].byte_len, - i, list->window[i].imm_bytes); - } -} - -/* Print to stdout a dispatch window. */ - -DEBUG_FUNCTION void -debug_dispatch_window (int window_num) -{ - debug_dispatch_window_file (stdout, window_num); -} - -/* Print INSN dispatch information to FILE. */ - -DEBUG_FUNCTION static void -debug_insn_dispatch_info_file (FILE *file, rtx_insn *insn) -{ - int byte_len; - enum insn_path path; - enum dispatch_group group; - int imm_size; - int num_imm_operand; - int num_imm32_operand; - int num_imm64_operand; - - if (INSN_CODE (insn) < 0) - return; - - byte_len = ix86_min_insn_size (insn); - path = get_insn_path (insn); - group = get_insn_group (insn); - imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand, - &num_imm64_operand); - - fprintf (file, " insn info:\n"); - fprintf (file, " group = %s, path = %d, byte_len = %d\n", - group_name[group], path, byte_len); - fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n", - num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size); -} - -/* Print to STDERR the status of the ready list with respect to - dispatch windows. */ - -DEBUG_FUNCTION void -debug_ready_dispatch (void) -{ - int i; - int no_ready = number_in_ready (); - - fprintf (stdout, "Number of ready: %d\n", no_ready); - - for (i = 0; i < no_ready; i++) - debug_insn_dispatch_info_file (stdout, get_ready_element (i)); -} - -/* This routine is the driver of the dispatch scheduler. 
*/ - -void -ix86_bd_do_dispatch (rtx_insn *insn, int mode) -{ - if (mode == DISPATCH_INIT) - init_dispatch_sched (); - else if (mode == ADD_TO_DISPATCH_WINDOW) - add_to_dispatch_window (insn); -} - -/* Return TRUE if Dispatch Scheduling is supported. */ - -bool -ix86_bd_has_dispatch (rtx_insn *insn, int action) -{ - /* Current implementation of dispatch scheduler models buldozer only. */ - if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3 - || TARGET_BDVER4) && flag_dispatch_scheduler) - switch (action) - { - default: - return false; - - case IS_DISPATCH_ON: - return true; - - case IS_CMP: - return is_cmp (insn); - - case DISPATCH_VIOLATION: - return dispatch_violation (); - - case FITS_DISPATCH_WINDOW: - return fits_dispatch_window (insn); - } - - return false; -} diff --git a/gcc/config/i386/x86-tune-sched-core.c b/gcc/config/i386/x86-tune-sched-core.c deleted file mode 100644 index 076368c..0000000 --- a/gcc/config/i386/x86-tune-sched-core.c +++ /dev/null @@ -1,257 +0,0 @@ -/* Scheduler hooks for IA-32 which implement bdver1-4 specific logic. - Copyright (C) 1988-2020 Free Software Foundation, Inc. - -This file is part of GCC. - -GCC is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 3, or (at your option) -any later version. - -GCC is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with GCC; see the file COPYING3. If not see -<http://www.gnu.org/licenses/>. */ - -#define IN_TARGET_CODE 1 - -#include "config.h" -#include "system.h" -#include "coretypes.h" -#include "backend.h" -#include "rtl.h" -#include "tree.h" -#include "cfghooks.h" -#include "tm_p.h" -#include "insn-config.h" -#include "insn-attr.h" -#include "recog.h" -#include "target.h" -#include "rtl-iter.h" -#include "regset.h" -#include "sched-int.h" - - -/* Model decoder of Core 2/i7. - Below hooks for multipass scheduling (see haifa-sched.c:max_issue) - track the instruction fetch block boundaries and make sure that long - (9+ bytes) instructions are assigned to D0. */ - -/* Maximum length of an insn that can be handled by - a secondary decoder unit. '8' for Core 2/i7. */ -static int core2i7_secondary_decoder_max_insn_size; - -/* Ifetch block size, i.e., number of bytes decoder reads per cycle. - '16' for Core 2/i7. */ -static int core2i7_ifetch_block_size; - -/* Maximum number of instructions decoder can handle per cycle. - '6' for Core 2/i7. */ -static int core2i7_ifetch_block_max_insns; - -typedef struct ix86_first_cycle_multipass_data_ * - ix86_first_cycle_multipass_data_t; -typedef const struct ix86_first_cycle_multipass_data_ * - const_ix86_first_cycle_multipass_data_t; - -/* A variable to store target state across calls to max_issue within - one cycle. */ -static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data, - *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data; - -/* Initialize DATA. 
*/ -static void -core2i7_first_cycle_multipass_init (void *_data) -{ - ix86_first_cycle_multipass_data_t data - = (ix86_first_cycle_multipass_data_t) _data; - - data->ifetch_block_len = 0; - data->ifetch_block_n_insns = 0; - data->ready_try_change = NULL; - data->ready_try_change_size = 0; -} - -/* Advancing the cycle; reset ifetch block counts. */ -static void -core2i7_dfa_post_advance_cycle (void) -{ - ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data; - - gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns); - - data->ifetch_block_len = 0; - data->ifetch_block_n_insns = 0; -} - -/* Filter out insns from ready_try that the core will not be able to issue - on current cycle due to decoder. */ -static void -core2i7_first_cycle_multipass_filter_ready_try -(const_ix86_first_cycle_multipass_data_t data, - signed char *ready_try, int n_ready, bool first_cycle_insn_p) -{ - while (n_ready--) - { - rtx_insn *insn; - int insn_size; - - if (ready_try[n_ready]) - continue; - - insn = get_ready_element (n_ready); - insn_size = ix86_min_insn_size (insn); - - if (/* If this is a too long an insn for a secondary decoder ... */ - (!first_cycle_insn_p - && insn_size > core2i7_secondary_decoder_max_insn_size) - /* ... or it would not fit into the ifetch block ... */ - || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size - /* ... or the decoder is full already ... */ - || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns) - /* ... mask the insn out. */ - { - ready_try[n_ready] = 1; - - if (data->ready_try_change) - bitmap_set_bit (data->ready_try_change, n_ready); - } - } -} - -/* Prepare for a new round of multipass lookahead scheduling. */ -static void -core2i7_first_cycle_multipass_begin (void *_data, - signed char *ready_try, int n_ready, - bool first_cycle_insn_p) -{ - ix86_first_cycle_multipass_data_t data - = (ix86_first_cycle_multipass_data_t) _data; - const_ix86_first_cycle_multipass_data_t prev_data - = ix86_first_cycle_multipass_data; - - /* Restore the state from the end of the previous round. */ - data->ifetch_block_len = prev_data->ifetch_block_len; - data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns; - - /* Filter instructions that cannot be issued on current cycle due to - decoder restrictions. */ - core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready, - first_cycle_insn_p); -} - -/* INSN is being issued in current solution. Account for its impact on - the decoder model. */ -static void -core2i7_first_cycle_multipass_issue (void *_data, - signed char *ready_try, int n_ready, - rtx_insn *insn, const void *_prev_data) -{ - ix86_first_cycle_multipass_data_t data - = (ix86_first_cycle_multipass_data_t) _data; - const_ix86_first_cycle_multipass_data_t prev_data - = (const_ix86_first_cycle_multipass_data_t) _prev_data; - - int insn_size = ix86_min_insn_size (insn); - - data->ifetch_block_len = prev_data->ifetch_block_len + insn_size; - data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1; - gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size - && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns); - - /* Allocate or resize the bitmap for storing INSN's effect on ready_try. 
*/ - if (!data->ready_try_change) - { - data->ready_try_change = sbitmap_alloc (n_ready); - data->ready_try_change_size = n_ready; - } - else if (data->ready_try_change_size < n_ready) - { - data->ready_try_change = sbitmap_resize (data->ready_try_change, - n_ready, 0); - data->ready_try_change_size = n_ready; - } - bitmap_clear (data->ready_try_change); - - /* Filter out insns from ready_try that the core will not be able to issue - on current cycle due to decoder. */ - core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready, - false); -} - -/* Revert the effect on ready_try. */ -static void -core2i7_first_cycle_multipass_backtrack (const void *_data, - signed char *ready_try, - int n_ready ATTRIBUTE_UNUSED) -{ - const_ix86_first_cycle_multipass_data_t data - = (const_ix86_first_cycle_multipass_data_t) _data; - unsigned int i = 0; - sbitmap_iterator sbi; - - gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready); - EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi) - { - ready_try[i] = 0; - } -} - -/* Save the result of multipass lookahead scheduling for the next round. */ -static void -core2i7_first_cycle_multipass_end (const void *_data) -{ - const_ix86_first_cycle_multipass_data_t data - = (const_ix86_first_cycle_multipass_data_t) _data; - ix86_first_cycle_multipass_data_t next_data - = ix86_first_cycle_multipass_data; - - if (data != NULL) - { - next_data->ifetch_block_len = data->ifetch_block_len; - next_data->ifetch_block_n_insns = data->ifetch_block_n_insns; - } -} - -/* Deallocate target data. */ -static void -core2i7_first_cycle_multipass_fini (void *_data) -{ - ix86_first_cycle_multipass_data_t data - = (ix86_first_cycle_multipass_data_t) _data; - - if (data->ready_try_change) - { - sbitmap_free (data->ready_try_change); - data->ready_try_change = NULL; - data->ready_try_change_size = 0; - } -} - -void -ix86_core2i7_init_hooks (void) -{ - targetm.sched.dfa_post_advance_cycle - = core2i7_dfa_post_advance_cycle; - targetm.sched.first_cycle_multipass_init - = core2i7_first_cycle_multipass_init; - targetm.sched.first_cycle_multipass_begin - = core2i7_first_cycle_multipass_begin; - targetm.sched.first_cycle_multipass_issue - = core2i7_first_cycle_multipass_issue; - targetm.sched.first_cycle_multipass_backtrack - = core2i7_first_cycle_multipass_backtrack; - targetm.sched.first_cycle_multipass_end - = core2i7_first_cycle_multipass_end; - targetm.sched.first_cycle_multipass_fini - = core2i7_first_cycle_multipass_fini; - - /* Set decoder parameters. */ - core2i7_secondary_decoder_max_insn_size = 8; - core2i7_ifetch_block_size = 16; - core2i7_ifetch_block_max_insns = 6; -} diff --git a/gcc/config/i386/x86-tune-sched.c b/gcc/config/i386/x86-tune-sched.c deleted file mode 100644 index d4d8a12..0000000 --- a/gcc/config/i386/x86-tune-sched.c +++ /dev/null @@ -1,636 +0,0 @@ -/* Scheduler hooks for IA-32 which implement CPU specific logic. - Copyright (C) 1988-2020 Free Software Foundation, Inc. - -This file is part of GCC. - -GCC is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 3, or (at your option) -any later version. - -GCC is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. 
- -You should have received a copy of the GNU General Public License -along with GCC; see the file COPYING3. If not see -<http://www.gnu.org/licenses/>. */ - -#define IN_TARGET_CODE 1 - -#include "config.h" -#include "system.h" -#include "coretypes.h" -#include "backend.h" -#include "rtl.h" -#include "tree.h" -#include "cfghooks.h" -#include "tm_p.h" -#include "target.h" -#include "insn-config.h" -#include "insn-attr.h" -#include "insn-opinit.h" -#include "recog.h" - -/* Return the maximum number of instructions a cpu can issue. */ - -int -ix86_issue_rate (void) -{ - switch (ix86_tune) - { - case PROCESSOR_PENTIUM: - case PROCESSOR_LAKEMONT: - case PROCESSOR_BONNELL: - case PROCESSOR_SILVERMONT: - case PROCESSOR_KNL: - case PROCESSOR_KNM: - case PROCESSOR_INTEL: - case PROCESSOR_K6: - case PROCESSOR_BTVER2: - case PROCESSOR_PENTIUM4: - case PROCESSOR_NOCONA: - return 2; - - case PROCESSOR_PENTIUMPRO: - case PROCESSOR_ATHLON: - case PROCESSOR_K8: - case PROCESSOR_AMDFAM10: - case PROCESSOR_BTVER1: - return 3; - - case PROCESSOR_BDVER1: - case PROCESSOR_BDVER2: - case PROCESSOR_BDVER3: - case PROCESSOR_BDVER4: - case PROCESSOR_ZNVER1: - case PROCESSOR_ZNVER2: - case PROCESSOR_CORE2: - case PROCESSOR_NEHALEM: - case PROCESSOR_SANDYBRIDGE: - case PROCESSOR_HASWELL: - case PROCESSOR_GENERIC: - return 4; - - default: - return 1; - } -} - -/* Return true iff USE_INSN has a memory address with operands set by - SET_INSN. */ - -bool -ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn) -{ - int i; - extract_insn_cached (use_insn); - for (i = recog_data.n_operands - 1; i >= 0; --i) - if (MEM_P (recog_data.operand[i])) - { - rtx addr = XEXP (recog_data.operand[i], 0); - if (modified_in_p (addr, set_insn) != 0) - { - /* No AGI stall if SET_INSN is a push or pop and USE_INSN - has SP based memory (unless index reg is modified in a pop). */ - rtx set = single_set (set_insn); - if (set - && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set))) - || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set))))) - { - struct ix86_address parts; - if (ix86_decompose_address (addr, &parts) - && parts.base == stack_pointer_rtx - && (parts.index == NULL_RTX - || MEM_P (SET_DEST (set)) - || !modified_in_p (parts.index, set_insn))) - return false; - } - return true; - } - return false; - } - return false; -} - -/* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set - by DEP_INSN and nothing set by DEP_INSN. */ - -static bool -ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type) -{ - rtx set, set2; - - /* Simplify the test for uninteresting insns. */ - if (insn_type != TYPE_SETCC - && insn_type != TYPE_ICMOV - && insn_type != TYPE_FCMOV - && insn_type != TYPE_IBR) - return false; - - if ((set = single_set (dep_insn)) != 0) - { - set = SET_DEST (set); - set2 = NULL_RTX; - } - else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL - && XVECLEN (PATTERN (dep_insn), 0) == 2 - && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET - && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET) - { - set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0)); - set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0)); - } - else - return false; - - if (!REG_P (set) || REGNO (set) != FLAGS_REG) - return false; - - /* This test is true if the dependent insn reads the flags but - not any other potentially set register. 
*/ - if (!reg_overlap_mentioned_p (set, PATTERN (insn))) - return false; - - if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn))) - return false; - - return true; -} - -/* Helper function for exact_store_load_dependency. - Return true if addr is found in insn. */ -static bool -exact_dependency_1 (rtx addr, rtx insn) -{ - enum rtx_code code; - const char *format_ptr; - int i, j; - - code = GET_CODE (insn); - switch (code) - { - case MEM: - if (rtx_equal_p (addr, insn)) - return true; - break; - case REG: - CASE_CONST_ANY: - case SYMBOL_REF: - case CODE_LABEL: - case PC: - case CC0: - case EXPR_LIST: - return false; - default: - break; - } - - format_ptr = GET_RTX_FORMAT (code); - for (i = 0; i < GET_RTX_LENGTH (code); i++) - { - switch (*format_ptr++) - { - case 'e': - if (exact_dependency_1 (addr, XEXP (insn, i))) - return true; - break; - case 'E': - for (j = 0; j < XVECLEN (insn, i); j++) - if (exact_dependency_1 (addr, XVECEXP (insn, i, j))) - return true; - break; - } - } - return false; -} - -/* Return true if there exists exact dependency for store & load, i.e. - the same memory address is used in them. */ -static bool -exact_store_load_dependency (rtx_insn *store, rtx_insn *load) -{ - rtx set1, set2; - - set1 = single_set (store); - if (!set1) - return false; - if (!MEM_P (SET_DEST (set1))) - return false; - set2 = single_set (load); - if (!set2) - return false; - if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2))) - return true; - return false; -} - - -/* This function corrects the value of COST (latency) based on the relationship - between INSN and DEP_INSN through a dependence of type DEP_TYPE, and strength - DW. It should return the new value. - - On x86 CPUs this is most commonly used to model the fact that valus of - registers used to compute address of memory operand needs to be ready - earlier than values of registers used in the actual operation. */ - -int -ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost, - unsigned int) -{ - enum attr_type insn_type, dep_insn_type; - enum attr_memory memory; - rtx set, set2; - int dep_insn_code_number; - - /* Anti and output dependencies have zero cost on all CPUs. */ - if (dep_type != 0) - return 0; - - dep_insn_code_number = recog_memoized (dep_insn); - - /* If we can't recognize the insns, we can't really do anything. */ - if (dep_insn_code_number < 0 || recog_memoized (insn) < 0) - return cost; - - insn_type = get_attr_type (insn); - dep_insn_type = get_attr_type (dep_insn); - - switch (ix86_tune) - { - case PROCESSOR_PENTIUM: - case PROCESSOR_LAKEMONT: - /* Address Generation Interlock adds a cycle of latency. */ - if (insn_type == TYPE_LEA) - { - rtx addr = PATTERN (insn); - - if (GET_CODE (addr) == PARALLEL) - addr = XVECEXP (addr, 0, 0); - - gcc_assert (GET_CODE (addr) == SET); - - addr = SET_SRC (addr); - if (modified_in_p (addr, dep_insn)) - cost += 1; - } - else if (ix86_agi_dependent (dep_insn, insn)) - cost += 1; - - /* ??? Compares pair with jump/setcc. */ - if (ix86_flags_dependent (insn, dep_insn, insn_type)) - cost = 0; - - /* Floating point stores require value to be ready one cycle earlier. */ - if (insn_type == TYPE_FMOV - && get_attr_memory (insn) == MEMORY_STORE - && !ix86_agi_dependent (dep_insn, insn)) - cost += 1; - break; - - case PROCESSOR_PENTIUMPRO: - /* INT->FP conversion is expensive. */ - if (get_attr_fp_int_src (dep_insn)) - cost += 5; - - /* There is one cycle extra latency between an FP op and a store. 
*/ - if (insn_type == TYPE_FMOV - && (set = single_set (dep_insn)) != NULL_RTX - && (set2 = single_set (insn)) != NULL_RTX - && rtx_equal_p (SET_DEST (set), SET_SRC (set2)) - && MEM_P (SET_DEST (set2))) - cost += 1; - - memory = get_attr_memory (insn); - - /* Show ability of reorder buffer to hide latency of load by executing - in parallel with previous instruction in case - previous instruction is not needed to compute the address. */ - if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) - && !ix86_agi_dependent (dep_insn, insn)) - { - /* Claim moves to take one cycle, as core can issue one load - at time and the next load can start cycle later. */ - if (dep_insn_type == TYPE_IMOV - || dep_insn_type == TYPE_FMOV) - cost = 1; - else if (cost > 1) - cost--; - } - break; - - case PROCESSOR_K6: - /* The esp dependency is resolved before - the instruction is really finished. */ - if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP) - && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP)) - return 1; - - /* INT->FP conversion is expensive. */ - if (get_attr_fp_int_src (dep_insn)) - cost += 5; - - memory = get_attr_memory (insn); - - /* Show ability of reorder buffer to hide latency of load by executing - in parallel with previous instruction in case - previous instruction is not needed to compute the address. */ - if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) - && !ix86_agi_dependent (dep_insn, insn)) - { - /* Claim moves to take one cycle, as core can issue one load - at time and the next load can start cycle later. */ - if (dep_insn_type == TYPE_IMOV - || dep_insn_type == TYPE_FMOV) - cost = 1; - else if (cost > 2) - cost -= 2; - else - cost = 1; - } - break; - - case PROCESSOR_AMDFAM10: - case PROCESSOR_BDVER1: - case PROCESSOR_BDVER2: - case PROCESSOR_BDVER3: - case PROCESSOR_BDVER4: - case PROCESSOR_BTVER1: - case PROCESSOR_BTVER2: - /* Stack engine allows to execute push&pop instructions in parall. */ - if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP) - && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP)) - return 0; - /* FALLTHRU */ - - case PROCESSOR_ATHLON: - case PROCESSOR_K8: - memory = get_attr_memory (insn); - - /* Show ability of reorder buffer to hide latency of load by executing - in parallel with previous instruction in case - previous instruction is not needed to compute the address. */ - if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) - && !ix86_agi_dependent (dep_insn, insn)) - { - enum attr_unit unit = get_attr_unit (insn); - int loadcost = 3; - - /* Because of the difference between the length of integer and - floating unit pipeline preparation stages, the memory operands - for floating point are cheaper. - - ??? For Athlon it the difference is most probably 2. */ - if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN) - loadcost = 3; - else - loadcost = TARGET_ATHLON ? 2 : 0; - - if (cost >= loadcost) - cost -= loadcost; - else - cost = 0; - } - break; - - case PROCESSOR_ZNVER1: - case PROCESSOR_ZNVER2: - /* Stack engine allows to execute push&pop instructions in parall. */ - if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP) - && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP)) - return 0; - - memory = get_attr_memory (insn); - - /* Show ability of reorder buffer to hide latency of load by executing - in parallel with previous instruction in case - previous instruction is not needed to compute the address. 
*/ - if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) - && !ix86_agi_dependent (dep_insn, insn)) - { - enum attr_unit unit = get_attr_unit (insn); - int loadcost; - - if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN) - loadcost = 4; - else - loadcost = 7; - - if (cost >= loadcost) - cost -= loadcost; - else - cost = 0; - } - break; - - case PROCESSOR_CORE2: - case PROCESSOR_NEHALEM: - case PROCESSOR_SANDYBRIDGE: - case PROCESSOR_HASWELL: - case PROCESSOR_GENERIC: - /* Stack engine allows to execute push&pop instructions in parall. */ - if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP) - && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP)) - return 0; - - memory = get_attr_memory (insn); - - /* Show ability of reorder buffer to hide latency of load by executing - in parallel with previous instruction in case - previous instruction is not needed to compute the address. */ - if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) - && !ix86_agi_dependent (dep_insn, insn)) - { - if (cost >= 4) - cost -= 4; - else - cost = 0; - } - break; - - case PROCESSOR_SILVERMONT: - case PROCESSOR_KNL: - case PROCESSOR_KNM: - case PROCESSOR_INTEL: - if (!reload_completed) - return cost; - - /* Increase cost of integer loads. */ - memory = get_attr_memory (dep_insn); - if (memory == MEMORY_LOAD || memory == MEMORY_BOTH) - { - enum attr_unit unit = get_attr_unit (dep_insn); - if (unit == UNIT_INTEGER && cost == 1) - { - if (memory == MEMORY_LOAD) - cost = 3; - else - { - /* Increase cost of ld/st for short int types only - because of store forwarding issue. */ - rtx set = single_set (dep_insn); - if (set && (GET_MODE (SET_DEST (set)) == QImode - || GET_MODE (SET_DEST (set)) == HImode)) - { - /* Increase cost of store/load insn if exact - dependence exists and it is load insn. */ - enum attr_memory insn_memory = get_attr_memory (insn); - if (insn_memory == MEMORY_LOAD - && exact_store_load_dependency (dep_insn, insn)) - cost = 3; - } - } - } - } - - default: - break; - } - - return cost; -} - -/* How many alternative schedules to try. This should be as wide as the - scheduling freedom in the DFA, but no wider. Making this value too - large results extra work for the scheduler. */ - -int -ia32_multipass_dfa_lookahead (void) -{ - /* Generally, we want haifa-sched:max_issue() to look ahead as far - as many instructions can be executed on a cycle, i.e., - issue_rate. */ - if (reload_completed) - return ix86_issue_rate (); - /* Don't use lookahead for pre-reload schedule to save compile time. */ - return 0; -} - -/* Return true if target platform supports macro-fusion. */ - -bool -ix86_macro_fusion_p () -{ - return TARGET_FUSE_CMP_AND_BRANCH; -} - -/* Check whether current microarchitecture support macro fusion - for insn pair "CONDGEN + CONDJMP". Refer to - "Intel Architectures Optimization Reference Manual". 
*/ - -bool -ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp) -{ - rtx src, dest; - enum rtx_code ccode; - rtx compare_set = NULL_RTX, test_if, cond; - rtx alu_set = NULL_RTX, addr = NULL_RTX; - enum attr_type condgen_type; - - if (!any_condjump_p (condjmp)) - return false; - - unsigned int condreg1, condreg2; - rtx cc_reg_1; - targetm.fixed_condition_code_regs (&condreg1, &condreg2); - cc_reg_1 = gen_rtx_REG (CCmode, condreg1); - if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp)) - || !condgen - || !modified_in_p (cc_reg_1, condgen)) - return false; - - condgen_type = get_attr_type (condgen); - if (condgen_type == TYPE_MULTI - && INSN_CODE (condgen) == code_for_stack_protect_test_1 (ptr_mode) - && TARGET_FUSE_ALU_AND_BRANCH) - { - /* stack_protect_test_<mode> ends with a sub, which subtracts - a non-rip special memory operand from a GPR. */ - src = NULL_RTX; - alu_set = XVECEXP (PATTERN (condgen), 0, 1); - goto handle_stack_protect_test; - } - else if (condgen_type != TYPE_TEST - && condgen_type != TYPE_ICMP - && condgen_type != TYPE_INCDEC - && condgen_type != TYPE_ALU) - return false; - - compare_set = single_set (condgen); - if (compare_set == NULL_RTX && !TARGET_FUSE_ALU_AND_BRANCH) - return false; - - if (compare_set == NULL_RTX) - { - int i; - rtx pat = PATTERN (condgen); - for (i = 0; i < XVECLEN (pat, 0); i++) - if (GET_CODE (XVECEXP (pat, 0, i)) == SET) - { - rtx set_src = SET_SRC (XVECEXP (pat, 0, i)); - if (GET_CODE (set_src) == COMPARE) - compare_set = XVECEXP (pat, 0, i); - else - alu_set = XVECEXP (pat, 0, i); - } - } - if (compare_set == NULL_RTX) - return false; - src = SET_SRC (compare_set); - if (GET_CODE (src) != COMPARE) - return false; - - /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not - supported. */ - if ((MEM_P (XEXP (src, 0)) && CONST_INT_P (XEXP (src, 1))) - || (MEM_P (XEXP (src, 1)) && CONST_INT_P (XEXP (src, 0)))) - return false; - - /* No fusion for RIP-relative address. */ - if (MEM_P (XEXP (src, 0))) - addr = XEXP (XEXP (src, 0), 0); - else if (MEM_P (XEXP (src, 1))) - addr = XEXP (XEXP (src, 1), 0); - - if (addr) - { - ix86_address parts; - int ok = ix86_decompose_address (addr, &parts); - gcc_assert (ok); - - if (ix86_rip_relative_addr_p (&parts)) - return false; - } - - handle_stack_protect_test: - test_if = SET_SRC (pc_set (condjmp)); - cond = XEXP (test_if, 0); - ccode = GET_CODE (cond); - /* Check whether conditional jump use Sign or Overflow Flags. */ - if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS - && (ccode == GE || ccode == GT || ccode == LE || ccode == LT)) - return false; - - /* Return true for TYPE_TEST and TYPE_ICMP. */ - if (condgen_type == TYPE_TEST || condgen_type == TYPE_ICMP) - return true; - - /* The following is the case that macro-fusion for alu + jmp. */ - if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set) - return false; - - /* No fusion for alu op with memory destination operand. */ - dest = SET_DEST (alu_set); - if (MEM_P (dest)) - return false; - - /* Macro-fusion for inc/dec + unsigned conditional jump is not - supported. */ - if (condgen_type == TYPE_INCDEC - && (ccode == GEU || ccode == GTU || ccode == LEU || ccode == LTU)) - return false; - - return true; -} - |
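
Note on the removed dispatch-window logic: x86-tune-sched-bd.c above models AMD Bulldozer dispatch windows, where each window may hold at most MAX_INSN instructions, MAX_LOAD loads and MAX_STORE stores within DISPATCH_WINDOW_SIZE bytes of object code. The short standalone C program below is only a hedged illustration of that per-window bookkeeping using the same constants; the names toy_window, toy_fits and toy_add are hypothetical, and it deliberately ignores immediates, uop paths, the paired 48-byte window rule and everything else the real pass tracks. It is not the GCC implementation.

/* Hypothetical, simplified model of the per-window limits used by the
   removed bdver dispatch scheduler: at most MAX_INSN instructions,
   MAX_LOAD loads and MAX_STORE stores per DISPATCH_WINDOW_SIZE bytes.
   Illustration only.  */
#include <stdbool.h>
#include <stdio.h>

#define DISPATCH_WINDOW_SIZE 16
#define MAX_INSN 4
#define MAX_LOAD 2
#define MAX_STORE 1

struct toy_window {
  int bytes;   /* Object-code bytes accounted to this window.  */
  int insns;   /* Instructions placed in this window.  */
  int loads;   /* Memory loads in this window.  */
  int stores;  /* Memory stores in this window.  */
};

/* Return true if an instruction of BYTE_LEN bytes that is a load
   and/or store still fits into W without violating any limit.  */
static bool
toy_fits (const struct toy_window *w, int byte_len, bool is_load, bool is_store)
{
  return (w->bytes + byte_len <= DISPATCH_WINDOW_SIZE
	  && w->insns + 1 <= MAX_INSN
	  && w->loads + (is_load ? 1 : 0) <= MAX_LOAD
	  && w->stores + (is_store ? 1 : 0) <= MAX_STORE);
}

/* Account an instruction to W, opening a fresh window first if it
   does not fit.  */
static void
toy_add (struct toy_window *w, int byte_len, bool is_load, bool is_store)
{
  if (!toy_fits (w, byte_len, is_load, is_store))
    {
      printf ("window full (%d bytes, %d insns): start new window\n",
	      w->bytes, w->insns);
      w->bytes = w->insns = w->loads = w->stores = 0;
    }
  w->bytes += byte_len;
  w->insns += 1;
  w->loads += is_load ? 1 : 0;
  w->stores += is_store ? 1 : 0;
}

int
main (void)
{
  struct toy_window w = { 0, 0, 0, 0 };
  toy_add (&w, 3, false, false);  /* 3-byte ALU op.  */
  toy_add (&w, 5, true, false);   /* 5-byte load.  */
  toy_add (&w, 6, true, false);   /* Second load still fits.  */
  toy_add (&w, 4, true, false);   /* Exceeds 16 bytes: new window.  */
  return 0;
}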