diff options
author | Mayshao <mayshao-oc@zhaoxin.com> | 2022-05-23 17:05:31 +0200 |
---|---|---|
committer | Uros Bizjak <ubizjak@gmail.com> | 2022-05-23 17:53:27 +0200 |
commit | a239aff82c3771242d957c0f744cf62b42ed2f2a (patch) | |
tree | d1533cb2eaa1651152f0a3be15411fb825d1ee49 | |
parent | e6c04ac9fd9c7b5538a6f5f45e5f9dc22954764b (diff) | |
download | gcc-a239aff82c3771242d957c0f744cf62b42ed2f2a.zip gcc-a239aff82c3771242d957c0f744cf62b42ed2f2a.tar.gz gcc-a239aff82c3771242d957c0f744cf62b42ed2f2a.tar.bz2 |
[x86_64]: Zhaoxin lujiazui enablement
This patch fix Zhaoxin CPU vendor ID detection problem and add zhaoxin
"lujiazui" processor support. Currently gcc can't recognize Zhaoxin CPU
(vendor ID "CentaurHauls" and "Shanghai") if user use -march=native option,
which is confusing for users. This patch enables -march=native in zhaoxin
family 7th processor and -march/-mtune=lujiazui, costs and tunning are set
according to the characteristics of the processor.
We add a new md file to describe lujiazui pipeline.
Testing:
Bootstrap is ok, and no regressions for i386/x86-64 testsuite.
Background:
Related Zhaoxin linux kernel patch can be found at:
https://lore.kernel.org/lkml/01042674b2f741b2aed1f797359bdffb@zhaoxin.com/
Related Zhaoxin glibc patch can be found at:
https://sourceware.org/git/?p=glibc.git;a=commit;h=32ac0b988466785d6e3cc1dffc364bb26fc63193
gcc/ChangeLog:
* common/config/i386/cpuinfo.h (get_zhaoxin_cpu): Detect
the specific type of Zhaoxin CPU, and return Zhaoxin CPU name.
(cpu_indicator_init): Handle Zhaoxin processors.
* common/config/i386/i386-common.cc: Add lujiazui.
* common/config/i386/i386-cpuinfo.h (enum processor_vendor): Add
VENDOR_ZHAOXIN.
(enum processor_types): Add ZHAOXIN_FAM7H.
(enum processor_subtypes): Add ZHAOXIN_FAM7H_LUJIAZUI.
* config.gcc: Add lujiazui.
* config/i386/cpuid.h (signature_SHANGHAI_ebx): Add
Signatures for zhaoxin
(signature_SHANGHAI_ecx): Ditto.
(signature_SHANGHAI_edx): Ditto.
* config/i386/driver-i386.cc (host_detect_local_cpu): Let
-march=native recognize lujiazui processors.
* config/i386/i386-c.cc (ix86_target_macros_internal): Add lujiazui.
* config/i386/i386-options.cc (m_LUJIAZUI): New_definition.
* config/i386/i386.h (enum processor_type): Ditto.
* config/i386/i386.md: Add lujiazui.
* config/i386/x86-tune-costs.h (struct processor_costs): Add
lujiazui costs.
* config/i386/x86-tune-sched.cc (ix86_issue_rate): Add lujiazui.
(ix86_adjust_cost): Ditto.
* config/i386/x86-tune.def (X86_TUNE_SCHEDULE): Add lujiazui Tunnings.
(X86_TUNE_PARTIAL_REG_DEPENDENCY): Ditto.
(X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY): Ditto.
(X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY): Ditto.
(X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY): Ditto.
(X86_TUNE_MOVX): Ditto.
(X86_TUNE_MEMORY_MISMATCH_STALL): Ditto.
(X86_TUNE_FUSE_CMP_AND_BRANCH_32): Ditto.
(X86_TUNE_FUSE_CMP_AND_BRANCH_64): Ditto.
(X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS): Ditto.
(X86_TUNE_FUSE_ALU_AND_BRANCH): Ditto.
(X86_TUNE_ACCUMULATE_OUTGOING_ARGS): Ditto.
(X86_TUNE_USE_LEAVE): Ditto.
(X86_TUNE_PUSH_MEMORY): Ditto.
(X86_TUNE_LCP_STALL): Ditto.
(X86_TUNE_USE_INCDEC): Ditto.
(X86_TUNE_INTEGER_DFMODE_MOVES): Ditto.
(X86_TUNE_OPT_AGU): Ditto.
(X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB): Ditto.
(X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES): Ditto.
(X86_TUNE_USE_SAHF): Ditto.
(X86_TUNE_USE_BT): Ditto.
(X86_TUNE_AVOID_FALSE_DEP_FOR_BMI): Ditto.
(X86_TUNE_ONE_IF_CONV_INSN): Ditto.
(X86_TUNE_AVOID_MFENCE): Ditto.
(X86_TUNE_EXPAND_ABS): Ditto.
(X86_TUNE_USE_SIMODE_FIOP): Ditto.
(X86_TUNE_USE_FFREEP): Ditto.
(X86_TUNE_EXT_80387_CONSTANTS): Ditto.
(X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL): Ditto.
(X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL): Ditto.
(X86_TUNE_SSE_TYPELESS_STORES): Ditto.
(X86_TUNE_SSE_LOAD0_BY_PXOR): Ditto.
* doc/extend.texi: Add details about lujiazui.
* doc/invoke.texi: Add details about lujiazui.
* config/i386/lujiazui.md: Introduce lujiazui cpu and include new md file.
gcc/testsuite/ChangeLog:
* gcc.target/i386/funcspec-56.inc: Test -arch=lujiauzi and -tune=lujiazui.
* g++.target/i386/mv32.C: Ditto.
Signed-off-by: mayshao <mayshao-oc@zhaoxin.com>
-rw-r--r-- | gcc/common/config/i386/cpuinfo.h | 54 | ||||
-rw-r--r-- | gcc/common/config/i386/i386-common.cc | 8 | ||||
-rw-r--r-- | gcc/common/config/i386/i386-cpuinfo.h | 3 | ||||
-rw-r--r-- | gcc/config.gcc | 10 | ||||
-rw-r--r-- | gcc/config/i386/cpuid.h | 4 | ||||
-rw-r--r-- | gcc/config/i386/driver-i386.cc | 20 | ||||
-rw-r--r-- | gcc/config/i386/i386-c.cc | 7 | ||||
-rw-r--r-- | gcc/config/i386/i386-options.cc | 3 | ||||
-rw-r--r-- | gcc/config/i386/i386.h | 1 | ||||
-rw-r--r-- | gcc/config/i386/i386.md | 5 | ||||
-rw-r--r-- | gcc/config/i386/lujiazui.md | 844 | ||||
-rw-r--r-- | gcc/config/i386/x86-tune-costs.h | 115 | ||||
-rw-r--r-- | gcc/config/i386/x86-tune-sched.cc | 2 | ||||
-rw-r--r-- | gcc/config/i386/x86-tune.def | 89 | ||||
-rw-r--r-- | gcc/doc/extend.texi | 3 | ||||
-rw-r--r-- | gcc/doc/invoke.texi | 5 | ||||
-rw-r--r-- | gcc/testsuite/g++.target/i386/mv32.C | 31 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/funcspec-56.inc | 2 |
18 files changed, 1159 insertions, 47 deletions
diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h index 6d6171f..adc02bc 100644 --- a/gcc/common/config/i386/cpuinfo.h +++ b/gcc/common/config/i386/cpuinfo.h @@ -526,6 +526,39 @@ get_intel_cpu (struct __processor_model *cpu_model, return cpu; } +/* Get the specific type of ZHAOXIN CPU and return ZHAOXIN CPU name. + Return NULL for unknown ZHAOXIN CPU. */ + +static inline const char * +get_zhaoxin_cpu (struct __processor_model *cpu_model, + struct __processor_model2 *cpu_model2, + unsigned int *cpu_features2) +{ + const char *cpu = NULL; + unsigned int family = cpu_model2->__cpu_family; + unsigned int model = cpu_model2->__cpu_model; + + switch (family) + { + /* ZHAOXIN family 7h. */ + case 0x07: + cpu_model->__cpu_type = ZHAOXIN_FAM7H; + if (model == 0x3b) + { + cpu = "lujiazui"; + CHECK___builtin_cpu_is ("lujiazui"); + cpu_model->__cpu_features[0] &= ~(1U <<(FEATURE_AVX & 31)); + cpu_features2[0] &= ~(1U <<((FEATURE_F16C - 32) & 31)); + cpu_model->__cpu_subtype = ZHAOXIN_FAM7H_LUJIAZUI; + } + break; + default: + break; + } + + return cpu; +} + /* ECX and EDX are output of CPUID at level one. */ static inline void get_available_features (struct __processor_model *cpu_model, @@ -936,8 +969,27 @@ cpu_indicator_init (struct __processor_model *cpu_model, get_amd_cpu (cpu_model, cpu_model2, cpu_features2); cpu_model->__cpu_vendor = VENDOR_AMD; } - else if (vendor == signature_CENTAUR_ebx) + else if (vendor == signature_CENTAUR_ebx && family < 0x07) cpu_model->__cpu_vendor = VENDOR_CENTAUR; + else if (vendor == signature_SHANGHAI_ebx + || vendor == signature_CENTAUR_ebx) + { + /* Adjust model and family for ZHAOXIN CPUS. */ + if (family == 0x07) + { + model += extended_model; + } + + cpu_model2->__cpu_family = family; + cpu_model2->__cpu_model = model; + + /* Find available features. */ + get_available_features (cpu_model, cpu_model2, cpu_features2, + ecx, edx); + /* Get CPU type. */ + get_zhaoxin_cpu (cpu_model, cpu_model2,cpu_features2); + cpu_model->__cpu_vendor = VENDOR_ZHAOXIN; + } else if (vendor == signature_CYRIX_ebx) cpu_model->__cpu_vendor = VENDOR_CYRIX; else if (vendor == signature_NSC_ebx) diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc index 07fdd04..cb878163 100644 --- a/gcc/common/config/i386/i386-common.cc +++ b/gcc/common/config/i386/i386-common.cc @@ -1817,6 +1817,7 @@ const char *const processor_names[] = "alderlake", "rocketlake", "intel", + "lujiazui", "geode", "k6", "athlon", @@ -1995,6 +1996,13 @@ const pta processor_alias_table[] = {"nano-x4", PROCESSOR_K8, CPU_K8, PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR, 0, P_NONE}, + {"lujiazui", PROCESSOR_LUJIAZUI, CPU_LUJIAZUI, + PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 + | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1 + | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_BMI | PTA_BMI2 + | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE + | PTA_RDRND | PTA_MOVBE | PTA_ADX | PTA_RDSEED | PTA_POPCNT, + M_CPU_SUBTYPE (ZHAOXIN_FAM7H_LUJIAZUI), P_NONE}, {"k8", PROCESSOR_K8, CPU_K8, PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR, 0, P_NONE}, diff --git a/gcc/common/config/i386/i386-cpuinfo.h b/gcc/common/config/i386/i386-cpuinfo.h index 3f6d201..643fbd9 100644 --- a/gcc/common/config/i386/i386-cpuinfo.h +++ b/gcc/common/config/i386/i386-cpuinfo.h @@ -29,6 +29,7 @@ enum processor_vendor { VENDOR_INTEL = 1, VENDOR_AMD, + VENDOR_ZHAOXIN, VENDOR_OTHER, VENDOR_CENTAUR, VENDOR_CYRIX, @@ -56,6 +57,7 @@ enum processor_types INTEL_GOLDMONT_PLUS, INTEL_TREMONT, AMDFAM19H, + ZHAOXIN_FAM7H, CPU_TYPE_MAX, BUILTIN_CPU_TYPE_MAX = CPU_TYPE_MAX }; @@ -89,6 +91,7 @@ enum processor_subtypes INTEL_COREI7_ALDERLAKE, AMDFAM19H_ZNVER3, INTEL_COREI7_ROCKETLAKE, + ZHAOXIN_FAM7H_LUJIAZUI, CPU_SUBTYPE_MAX }; diff --git a/gcc/config.gcc b/gcc/config.gcc index b48d545..600ac35 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -665,7 +665,7 @@ slm nehalem westmere sandybridge ivybridge haswell broadwell bonnell \ silvermont knl knm skylake-avx512 cannonlake icelake-client icelake-server \ skylake goldmont goldmont-plus tremont cascadelake tigerlake cooperlake \ sapphirerapids alderlake rocketlake eden-x2 nano nano-1000 nano-2000 nano-3000 \ -nano-x2 eden-x4 nano-x4 x86-64 x86-64-v2 x86-64-v3 x86-64-v4 native" +nano-x2 eden-x4 nano-x4 lujiazui x86-64 x86-64-v2 x86-64-v3 x86-64-v4 native" # Additional x86 processors supported by --with-cpu=. Each processor # MUST be separated by exactly one space. @@ -3790,6 +3790,10 @@ case ${target} in cpu=geode arch_without_sse2=yes ;; + lujiazui-*) + arch=lujiazui + cpu=lujiazui + ;; pentium2-*) arch=pentium2 cpu=pentium2 @@ -3899,6 +3903,10 @@ case ${target} in arch=k8 cpu=k8 ;; + lujiazui-*) + arch=lujiazui + cpu=lujiazui + ;; nocona-*) arch=nocona cpu=nocona diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h index 8b3dc2b..a4c2fed 100644 --- a/gcc/config/i386/cpuid.h +++ b/gcc/config/i386/cpuid.h @@ -204,6 +204,10 @@ #define signature_VORTEX_ecx 0x436f5320 #define signature_VORTEX_edx 0x36387865 +#define signature_SHANGHAI_ebx 0x68532020 +#define signature_SHANGHAI_ecx 0x20206961 +#define signature_SHANGHAI_edx 0x68676e61 + #ifndef __x86_64__ /* At least one cpu (Winchip 2) does not set %ebx and %ecx for cpuid leaf 1. Forcibly zero the two registers before diff --git a/gcc/config/i386/driver-i386.cc b/gcc/config/i386/driver-i386.cc index 9e0ae0b..3c702fd 100644 --- a/gcc/config/i386/driver-i386.cc +++ b/gcc/config/i386/driver-i386.cc @@ -438,7 +438,8 @@ const char *host_detect_local_cpu (int argc, const char **argv) || vendor == VENDOR_CYRIX || vendor == VENDOR_NSC) cache = detect_caches_amd (ext_level); - else if (vendor == VENDOR_INTEL) + else if (vendor == VENDOR_INTEL + || vendor == VENDOR_ZHAOXIN) { bool xeon_mp = (family == 15 && model == 6); cache = detect_caches_intel (xeon_mp, max_level, @@ -518,6 +519,20 @@ const char *host_detect_local_cpu (int argc, const char **argv) processor = PROCESSOR_I486; } } + else if (vendor == VENDOR_ZHAOXIN) + { + processor = PROCESSOR_GENERIC; + + switch (family) + { + case 7: + if (model == 0x3b) + processor = PROCESSOR_LUJIAZUI; + break; + default: + break; + } + } else { switch (family) @@ -773,6 +788,9 @@ const char *host_detect_local_cpu (int argc, const char **argv) case PROCESSOR_BTVER2: cpu = "btver2"; break; + case PROCESSOR_LUJIAZUI: + cpu = "lujiazui"; + break; default: /* Use something reasonable. */ diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc index c73c1b1..eb0e3b3 100644 --- a/gcc/config/i386/i386-c.cc +++ b/gcc/config/i386/i386-c.cc @@ -140,6 +140,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, def_or_undef (parse_in, "__btver2"); def_or_undef (parse_in, "__btver2__"); break; + case PROCESSOR_LUJIAZUI: + def_or_undef (parse_in, "__lujiazui"); + def_or_undef (parse_in, "__lujiazui__"); + break; case PROCESSOR_PENTIUM4: def_or_undef (parse_in, "__pentium4"); def_or_undef (parse_in, "__pentium4__"); @@ -332,6 +336,9 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, case PROCESSOR_BTVER2: def_or_undef (parse_in, "__tune_btver2__"); break; + case PROCESSOR_LUJIAZUI: + def_or_undef (parse_in, "__tune_lujiazui__"); + break; case PROCESSOR_PENTIUM4: def_or_undef (parse_in, "__tune_pentium4__"); break; diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc index 32cc58a..e11f681 100644 --- a/gcc/config/i386/i386-options.cc +++ b/gcc/config/i386/i386-options.cc @@ -138,6 +138,8 @@ along with GCC; see the file COPYING3. If not see #define m_TREMONT (HOST_WIDE_INT_1U<<PROCESSOR_TREMONT) #define m_INTEL (HOST_WIDE_INT_1U<<PROCESSOR_INTEL) +#define m_LUJIAZUI (HOST_WIDE_INT_1U<<PROCESSOR_LUJIAZUI) + #define m_GEODE (HOST_WIDE_INT_1U<<PROCESSOR_GEODE) #define m_K6 (HOST_WIDE_INT_1U<<PROCESSOR_K6) #define m_K6_GEODE (m_K6 | m_GEODE) @@ -755,6 +757,7 @@ static const struct processor_costs *processor_cost_table[] = &alderlake_cost, &icelake_cost, &intel_cost, + &lujiazui_cost, &geode_cost, &k6_cost, &athlon_cost, diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 363082b..f16df63 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -2243,6 +2243,7 @@ enum processor_type PROCESSOR_ALDERLAKE, PROCESSOR_ROCKETLAKE, PROCESSOR_INTEL, + PROCESSOR_LUJIAZUI, PROCESSOR_GEODE, PROCESSOR_K6, PROCESSOR_ATHLON, diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 792bae1..050dee7 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -473,8 +473,8 @@ ;; Processor type. (define_attr "cpu" "none,pentium,pentiumpro,geode,k6,athlon,k8,core2,nehalem, - atom,slm,glm,haswell,generic,amdfam10,bdver1,bdver2,bdver3, - bdver4,btver2,znver1,znver2,znver3" + atom,slm,glm,haswell,generic,lujiazui,amdfam10,bdver1, + bdver2,bdver3,bdver4,btver2,znver1,znver2,znver3" (const (symbol_ref "ix86_schedule"))) ;; A basic instruction type. Refinements due to arguments to be @@ -1310,6 +1310,7 @@ (include "glm.md") (include "core2.md") (include "haswell.md") +(include "lujiazui.md") ;; Operand and operator predicates and constraints diff --git a/gcc/config/i386/lujiazui.md b/gcc/config/i386/lujiazui.md new file mode 100644 index 0000000..9046c09f --- /dev/null +++ b/gcc/config/i386/lujiazui.md @@ -0,0 +1,844 @@ +;; Copyright (C) 2012-2022 Free Software Foundation, Inc. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; <http://www.gnu.org/licenses/>. +;; + +;; Scheduling for ZHAOXIN lujiazui processor. + +;; Modeling automatons for decoders, execution pipes and AGU pipes. +(define_automaton "lujiazui_decoder,lujiazui_core,lujiazui_agu") + +;; The rules for the decoder are simple: +;; - an instruction with 1 uop can be decoded by any of the three +;; decoders in one cycle. +;; - an instruction with 2 uops can be decoded by decoder 0 or decoder 1 +;; but still in only one cycle. +;; - a complex (microcode) instruction can only be decoded by +;; decoder 0, and this takes an unspecified number of cycles. +;; +;; The goal is to schedule such that we have a few-one-two uops sequence +;; in each cycle, to decode as many instructions per cycle as possible. +(define_cpu_unit "lua_decoder0" "lujiazui_decoder") +(define_cpu_unit "lua_decoder1" "lujiazui_decoder") +(define_cpu_unit "lua_decoder2" "lujiazui_decoder") + +;; We first wish to find an instruction for lua_decoder0, so exclude +;; lua_decoder1 and lua_decoder2 from being reserved until +;; lua_decoder0 is reserved, and also exclude lua_decoder2 +;; from being reserved until lua_decoder1 is reserved. +(presence_set "lua_decoder1" "lua_decoder0") +(presence_set "lua_decoder2" "lua_decoder0") +(presence_set "lua_decoder2" "lua_decoder1") + +;; Most instructions can be decoded on any of the three decoders. +(define_reservation "lua_decodern" "lua_decoder0|lua_decoder1|lua_decoder2") +(define_reservation "lua_decoder01" "lua_decoder0|lua_decoder1") + +;; The out-of-order core has six pipelines. +;; Port 4, 5 are responsible for address calculations, load or store. +;; Port 0, 1, 2, 3 for everything else. + +(define_cpu_unit "lua_p0,lua_p1,lua_p2,lua_p3" "lujiazui_core") +(define_cpu_unit "lua_p4,lua_p5" "lujiazui_agu") + +(define_reservation "lua_p03" "lua_p0|lua_p3") +(define_reservation "lua_p12" "lua_p1|lua_p2") +(define_reservation "lua_p1p2" "lua_p1+lua_p2") +(define_reservation "lua_p45" "lua_p4|lua_p5") +(define_reservation "lua_p4p5" "lua_p4+lua_p5") +(define_reservation "lua_p0p1p2p3" "lua_p0+lua_p1+lua_p2+lua_p3") + +;; Only the irregular instructions have to be modeled here. + +;; Complex instruction. +(define_insn_reservation "lua_complex_insn" 6 + (and (eq_attr "cpu" "lujiazui") + (eq_attr "type" "other,multi,str")) + "lua_decoder0") + +;; Call instruction. +(define_insn_reservation "lua_call" 1 + (and (eq_attr "cpu" "lujiazui") + (eq_attr "type" "call,callv")) + "lua_decoder0,lua_p45,lua_p1") + +;; MOV - integer moves. +(define_insn_reservation "lua_imov" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "imov,imovx"))) + "lua_decodern,lua_p12") + +(define_insn_reservation "lua_imov_load" 4 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "imov,imovx"))) + "lua_decoder01,lua_p45") + +(define_insn_reservation "lua_imov_store" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "store") + (eq_attr "type" "imov"))) + "lua_decodern,lua_p12+lua_p45") + +(define_insn_reservation "lua_icmov" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none,unknown") + (eq_attr "type" "icmov"))) + "lua_decodern,lua_p2") + +(define_insn_reservation "lua_icmov_load" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "icmov"))) + "lua_decoder01,lua_p45,lua_p2") + +;; Push and pop. +(define_insn_reservation "lua_push_reg" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "store") + (eq_attr "type" "push"))) + "lua_decodern,lua_p12+lua_p45") + +(define_insn_reservation "lua_push_mem" 4 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "both") + (eq_attr "type" "push"))) + "lua_decoder01,lua_p45,lua_p12+lua_p45") + +(define_insn_reservation "lua_pop_reg" 4 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "pop"))) + "lua_decoder01,lua_p45") + +(define_insn_reservation "lua_pop_mem" 4 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "both") + (eq_attr "type" "pop"))) + "lua_decoder0,lua_p45,lua_p12+lua_p45") + +(define_insn_reservation "lua_lea" 1 + (and (eq_attr "cpu" "lujiazui") + (eq_attr "type" "lea")) + "hsw_decodern,lua_p45") + +(define_insn_reservation "lua_shift_rotate" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none,unknown") + (eq_attr "type" "ishift,ishift1,rotate,rotate1"))) + "lua_decodern,lua_p2") + +(define_insn_reservation "lua_shift_rotate_load" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "ishift,ishift1,rotate,rotate1"))) + "lua_decoder01,lua_p45,lua_p2") + +(define_insn_reservation "lua_shift_rotate_store" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "store") + (eq_attr "type" "ishift,ishift1,rotate,rotate1"))) + "lua_decoder01,lua_p2,lua_p45") + +(define_insn_reservation "lua_shift_rotate_both" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "both") + (eq_attr "type" "ishift,ishift1,rotate,rotate1"))) + "lua_decoder0,lua_p45,lua_p2,lua_p45") + +(define_insn_reservation "lua_branch" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "ibr"))) + "lua_decodern,lua_p1") + +(define_insn_reservation "lua_indirect_branch_load" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "ibr"))) + "lua_decodern,lua_p45,lua_p1") + +(define_insn_reservation "lua_leave" 4 + (and (eq_attr "cpu" "lujiazui") + (eq_attr "type" "leave")) + "lua_decoder0,lua_p45+lua_p12,lua_p12") + +;; Multiplication instructions. + +(define_insn_reservation "lua_imul_qi" 2 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "QI") + (eq_attr "type" "imul,imulx")))) + "lua_decodern,lua_p1p2") + +(define_insn_reservation "lua_imul_qi_mem" 6 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "!none") + (and (eq_attr "mode" "QI") + (eq_attr "type" "imul,imulx")))) + "lua_decoder01,lua_p1p2+lua_p45") + +(define_insn_reservation "lua_imul_hisi" 3 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "HI,SI") + (eq_attr "type" "imul,imulx")))) + "lua_decoder0,lua_p1p2") + +(define_insn_reservation "lua_imul_hisi_mem" 7 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "!none") + (and (eq_attr "mode" "HI,SI") + (eq_attr "type" "imul,imulx")))) + "lua_decoder0,lua_p1p2+lua_p45") + +(define_insn_reservation "lua_imul_di" 12 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "DI") + (eq_attr "type" "imul,imulx")))) + "lua_decoder0,lua_p0p1p2p3") + +(define_insn_reservation "lua_imul_di_mem" 16 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "!none") + (and (eq_attr "mode" "DI") + (eq_attr "type" "imul,imulx")))) + "lua_decoder0,lua_p0p1p2p3+lua_p45") + +;; Division instructions. + +(define_insn_reservation "lua_idiv_qi" 21 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "QI") + (eq_attr "type" "idiv")))) + "lua_decoder0,lua_p0p1p2p3*21") + +(define_insn_reservation "lua_idiv_qi_load" 25 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "QI") + (eq_attr "type" "idiv")))) + "lua_decoder0,lua_p45,lua_p0p1p2p3*21") + +(define_insn_reservation "lua_idiv_hi" 22 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "HI") + (eq_attr "type" "idiv")))) + "lua_decoder0,lua_p0p1p2p3*22") + +(define_insn_reservation "lua_idiv_hi_load" 26 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "HI") + (eq_attr "type" "idiv")))) + "lua_decoder0,lua_p45,lua_p0p1p2p3*22") + +(define_insn_reservation "lua_idiv_si" 20 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SI") + (eq_attr "type" "idiv")))) + "lua_decoder0,lua_p0p1p2p3*20") + +(define_insn_reservation "lua_idiv_si_load" 24 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SI") + (eq_attr "type" "idiv")))) + "lua_decoder0,lua_p45,lua_p0p1p2p3*20") + +(define_insn_reservation "lua_idiv_di" 150 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "DI") + (eq_attr "type" "idiv")))) + "lua_decoder0,lua_p0p1p2p3*150") + +(define_insn_reservation "lua_idiv_di_load" 154 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "DI") + (eq_attr "type" "idiv")))) + "lua_decoder0,lua_p45,lua_p0p1p2p3*150") + +;; x87 floating point operations. + +(define_insn_reservation "lua_fxch" 1 + (and (eq_attr "cpu" "lujiazui") + (eq_attr "type" "fxch")) + "lua_decodern,lua_p1") + +(define_insn_reservation "lua_fop" 3 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none,unknown") + (eq_attr "type" "fop"))) + "lua_decodern,lua_p0") + +(define_insn_reservation "lua_fop_load" 7 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "fop"))) + "lua_decoder01,lua_p45,lua_p0") + +(define_insn_reservation "lua_fop_store" 3 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "store") + (eq_attr "type" "fop"))) + "lua_decodern,lua_p0,lua_p45") + +(define_insn_reservation "lua_fop_both" 7 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "both") + (eq_attr "type" "fop"))) + "lua_decoder0,lua_p45,lua_p0,lua_p45") + +(define_insn_reservation "lua_fsgn" 1 + (and (eq_attr "cpu" "lujiazui") + (eq_attr "type" "fsgn")) + "lua_decodern,lua_p0") + +(define_insn_reservation "lua_fistp" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "fistp"))) + "lua_decodern,lua_p0") + +(define_insn_reservation "lua_fistp_mem" 4 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "!none") + (eq_attr "type" "fistp"))) + "lua_decoder0,lua_p0+lua_p45") + +(define_insn_reservation "lua_fcmov" 3 + (and (eq_attr "cpu" "lujiazui") + (eq_attr "type" "fcmov")) + "lua_decodern,lua_p0") + +(define_insn_reservation "lua_fcmp" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "fcmp"))) + "lua_decodern,lua_p0") + +(define_insn_reservation "lua_fcmp_load" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "fcmp"))) + "lua_decoder01,lua_p45,lua_p0") + +(define_insn_reservation "lua_fmov" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "fmov"))) + "lua_decodern,lua_p0") + +(define_insn_reservation "lua_fmov_load" 4 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "!XF") + (eq_attr "type" "fmov")))) + "lua_decoder01,lua_p45,lua_p0") + +(define_insn_reservation "lua_fmov_XF_load" 3 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "XF") + (eq_attr "type" "fmov")))) + "lua_decoder0,lua_p45,lua_p0") + +(define_insn_reservation "lua_fmov_store" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "store") + (and (eq_attr "mode" "!XF") + (eq_attr "type" "fmov")))) + "lua_decoder0,lua_p0,lua_p45") + +(define_insn_reservation "lua_fmov_XF_store" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "store") + (and (eq_attr "mode" "XF") + (eq_attr "type" "fmov")))) + "lua_decoder0,lua_p0,lua_p45") + +(define_insn_reservation "lua_fmul" 4 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "fmul"))) + "lua_decodern,lua_p3") + +(define_insn_reservation "lua_fmul_load" 8 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "fp_int_src" "false") + (and (eq_attr "memory" "load") + (eq_attr "type" "fmul")))) + "lua_decoder01,lua_p45,lua_p3") + +(define_insn_reservation "lua_fimul_load" 8 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "fp_int_src" "true") + (and (eq_attr "memory" "load") + (eq_attr "type" "fmul")))) + "lua_decoder0,lua_p45,lua_p3") + +;; fdiv instructions. + +(define_insn_reservation "lua_fdiv_SF" 15 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF") + (eq_attr "type" "fdiv,fpspc")))) + "lua_decodern,lua_p0*15") + +(define_insn_reservation "lua_fdiv_SF_load" 19 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SF") + (eq_attr "type" "fdiv,fpspc")))) + "lua_decoder01,lua_p45,lua_p0*15") + +(define_insn_reservation "lua_fdiv_DF" 18 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "DF") + (eq_attr "type" "fdiv,fpspc")))) + "lua_decodern,lua_p0*18") + +(define_insn_reservation "lua_fdiv_DF_load" 22 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "DF") + (eq_attr "type" "fdiv,fpspc")))) + "lua_decoder01,lua_p45,lua_p0*18") + +(define_insn_reservation "lua_fdiv_XF" 22 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "XF") + (eq_attr "type" "fdiv,fpspc")))) + "lua_decoder0,lua_p0*22") + +(define_insn_reservation "lua_fdiv_XF_load" 26 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "XF") + (eq_attr "type" "fdiv,fpspc")))) + "lua_decoder0,lua_p45,lua_p0*22") + +;; MMX instructions. + +(define_insn_reservation "lua_mmx_sse_add_shft" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "mmxadd,sseiadd,mmxshft,sseishft"))) + "lua_decodern,lua_p0") + +(define_insn_reservation "lua_mmx_sse_add_shft_load" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "mmxadd,sseiadd,mmxshft,sseishft"))) + "lua_decoder01,lua_p45,lua_p0") + +(define_insn_reservation "lua_mmx_sse_add_shft_store" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "store") + (eq_attr "type" "mmxadd,sseiadd,mmxshft,sseishft"))) + "lua_decodern,lua_p0,lua_p45") + +(define_insn_reservation "lua_mmx_mul" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "mmxmul,sseimul"))) + "lua_decodern,lua_p3") + +(define_insn_reservation "lua_mmx_mul_load" 9 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "mmxmul,sseimul"))) + "lua_decoder01,lua_p45,lua_p3") + +(define_insn_reservation "lua_mmxcvt" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "mmxcvt"))) + "lua_decodern,lua_p03") + +(define_insn_reservation "lua_mmxcvt_load" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "mmxcvt"))) + "lua_decoder01,lua_p45,lua_p03") + +;; The sfence instruction. +(define_insn_reservation "lua_sse_sfence" 13 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "unknown") + (eq_attr "type" "sse"))) + "lua_decoder0,lua_p45") + +(define_insn_reservation "lua_sse_SFDF" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "mode" "SF,DF") + (eq_attr "type" "sse"))) + "lua_decodern,lua_p0") + +(define_insn_reservation "lua_sse_V4SF" 13 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "sse"))) + "lua_decodern,lua_p0") + +(define_insn_reservation "lua_sse_V8SF" 19 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "mode" "V8SF,V4DF") + (eq_attr "type" "sse"))) + "lua_decodern,lua_p0") + +(define_insn_reservation "lua_sse_add1" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "sseadd1"))) + "lua_decoder0,lua_p0") + +(define_insn_reservation "lua_sse_add1_load" 8 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "sseadd1"))) + "lua_decoder0,lua_p45,lua_p0") + +(define_insn_reservation "lua_sse_cmp" 3 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "ssecmp,ssecomi"))) + "lua_decodern,lua_p0") + +(define_insn_reservation "lua_sse_cmp_load" 7 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "ssecmp,ssecomi"))) + "lua_decoder01,lua_p45,lua_p0") + +(define_insn_reservation "lua_sse_logic" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "sselog,sselog1"))) + "lua_decodern,lua_p03") + +(define_insn_reservation "lua_sse_logic_load" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "sselog,sselog1"))) + "lua_decoder01,lua_p45,lua_p03") + +(define_insn_reservation "lua_sse_add" 3 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "sseadd"))) + "lua_decodern,lua_p0") + +(define_insn_reservation "lua_sse_add_load" 7 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "sseadd"))) + "lua_decoder01,lua_p45,lua_p0") + +(define_insn_reservation "lua_ssemul_ss_ps" 3 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF,V4SF,V8SF") + (eq_attr "type" "ssemul")))) + "lua_decodern,lua_p3") + +(define_insn_reservation "lua_ssemul_ss_ps_load" 7 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SF,V4SF,V8SF") + (eq_attr "type" "ssemul")))) + "lua_decoder01,lua_p45,lua_p3") + +(define_insn_reservation "lua_ssemul_sd_pd" 4 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "DF,V2DF,V4DF") + (eq_attr "type" "ssemul")))) + "lua_decodern,lua_p3") + +(define_insn_reservation "lua_ssemul_sd_pd_load" 8 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "DF,V2DF,V4DF") + (eq_attr "type" "ssemul")))) + "lua_decoder01,lua_p45,lua_p3") + +(define_insn_reservation "lua_ssediv_SF" 13 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF") + (eq_attr "type" "ssediv")))) + "lua_decodern,lua_p0*13") + +(define_insn_reservation "lua_ssediv_load_SF" 17 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SF") + (eq_attr "type" "ssediv")))) + "lua_decoder01,lua_p45,lua_p0*13") + +(define_insn_reservation "lua_ssediv_V4SF" 23 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "ssediv")))) + "lua_decodern,lua_p0*23") + +(define_insn_reservation "lua_ssediv_load_V4SF" 27 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "ssediv")))) + "lua_decoder01,lua_p45,lua_p0*23") + +(define_insn_reservation "lua_ssediv_V8SF" 47 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "V8SF") + (eq_attr "type" "ssediv")))) + "lua_decoder0,lua_p0*47") + +(define_insn_reservation "lua_ssediv_load_V8SF" 51 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "V8SF") + (eq_attr "type" "ssediv")))) + "lua_decoder0,lua_p45,lua_p0*47") + +(define_insn_reservation "lua_ssediv_SD" 17 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "DF") + (eq_attr "type" "ssediv")))) + "lua_decodern,lua_p0*17") + +(define_insn_reservation "lua_ssediv_load_SD" 21 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "DF") + (eq_attr "type" "ssediv")))) + "lua_decoder01,lua_p45,lua_p0*17") + +(define_insn_reservation "lua_ssediv_V2DF" 30 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "V2DF") + (eq_attr "type" "ssediv")))) + "lua_decodern,lua_p0*30") + +(define_insn_reservation "lua_ssediv_load_V2DF" 34 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "V2DF") + (eq_attr "type" "ssediv")))) + "lua_decoder01,lua_p45,lua_p0*30") + +(define_insn_reservation "lua_ssediv_V4DF" 56 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "V4DF") + (eq_attr "type" "ssediv")))) + "lua_decoder0,lua_p0*56") + +(define_insn_reservation "lua_ssediv_load_V4DF" 60 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "V4DF") + (eq_attr "type" "ssediv")))) + "lua_decoder0,lua_p4p5,lua_p0*56") + + +(define_insn_reservation "lua_sseicvt_si" 2 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SI") + (and (match_operand:SF 1 "memory_operand") + (eq_attr "type" "sseicvt"))))) + "lua_decoder01,lua_p0") + +(define_insn_reservation "lua_sseicvt_si_load" 6 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SI") + (and (match_operand:SF 1 "memory_operand") + (eq_attr "type" "sseicvt"))))) + "lua_decoder0,lua_p45,lua_p0") + +(define_insn_reservation "lua_sseicvtdf_si" 3 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SI") + (and (match_operand:DF 1 "memory_operand") + (eq_attr "type" "sseicvt"))))) + "lua_decodern,lua_p0") + +(define_insn_reservation "lua_sseicvtdf_si_load" 7 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SI") + (and (match_operand:DF 1 "memory_operand") + (eq_attr "type" "sseicvt"))))) + "lua_decoder01,lua_p45,lua_p0") + +(define_insn_reservation "lua_ssecvt" 6 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "ssecvt"))) + "lua_decoder01,lua_p03") + +(define_insn_reservation "lua_ssecvt_load" 10 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "ssecvt"))) + "lua_decoder0,lua_p45,lua_p03") + +(define_insn_reservation "lua_sse_mov" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "ssemov"))) + "lua_decodern,lua_p03") + +(define_insn_reservation "lua_sse_mov_load" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "ssemov"))) + "lua_decoder01,lua_p45,lua_p03") + +(define_insn_reservation "lua_sse_mov_store" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "store") + (eq_attr "type" "ssemov"))) + "lua_decoder01,lua_p0,lua_p45") + +(define_insn_reservation "lua_insn_alu" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none,unknown") + (eq_attr "type" "alu"))) + "lua_decodern,lua_p12") + +(define_insn_reservation "lua_insn_alu_load" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "alu"))) + "lua_decoder01,lua_p45,lua_p12") + +(define_insn_reservation "lua_insn_alu_store" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "store") + (eq_attr "type" "alu"))) + "lua_decoder01,lua_p12,lua_p45") + +(define_insn_reservation "lua_insn_alu_both" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "both") + (eq_attr "type" "alu"))) + "lua_decoder0,lua_p45,lua_p12,lua_p45") + +(define_insn_reservation "lua_insn_alu1" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none,unknown") + (eq_attr "type" "alu1"))) + "lua_decodern,lua_p12") + +(define_insn_reservation "lua_insn_alu1_load" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "alu1"))) + "lua_decoder01,lua_p45,lua_p12") + +(define_insn_reservation "lua_insn_alu1_store" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "store") + (eq_attr "type" "alu1"))) + "lua_decoder01,lua_p12,lua_p45") + +(define_insn_reservation "lua_insn_alu1_both" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "both") + (eq_attr "type" "alu1"))) + "lua_decoder0,lua_p45,lua_p12,lua_p45") + +(define_insn_reservation "lua_insn_negnot_incdec" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "negnot,incdec"))) + "lua_decodern,lua_p12") + +(define_insn_reservation "lua_insn_negnot_setcc" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "setcc"))) + "lua_decodern,lua_p2") + +(define_insn_reservation "lua_insn_negnot_setcc_mem" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "!none") + (eq_attr "type" "negnot,setcc"))) + "lua_decoder01,lua_p45,lua_p2,lua_p45") + +(define_insn_reservation "lua_insn_incdec_mem" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "!none") + (eq_attr "type" "incdec"))) + "lua_decoder0,lua_p45,lua_p12,lua_p45") + +(define_insn_reservation "lua_insn_icmptest" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none,unknown") + (eq_attr "type" "icmp,test"))) + "lua_decodern,lua_p12") + +(define_insn_reservation "lua_insn_icmptest_load" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "icmp,test"))) + "lua_decoder01,lua_p45,lua_p12") + +(define_insn_reservation "lua_insn_icmptest_store" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "store") + (eq_attr "type" "icmp,test"))) + "lua_decoder01,lua_p12,lua_p45") + +(define_insn_reservation "lua_insn_icmptest_both" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "both") + (eq_attr "type" "icmp,test"))) + "lua_decoder0,lua_p45,lua_p12,lua_p45") + +(define_insn_reservation "lua_insn_sseishft1_mmx" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "sseishft1,mmx,mmxcmp"))) + "lua_decodern,lua_p03") + +(define_insn_reservation "lua_insn_sseishft1_mmx_mem" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "sseishft1,mmx,mmxcmp"))) + "lua_decoder01,lua_p45,lua_p03") diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index 05cbd49..ea34a93 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -3088,6 +3088,121 @@ struct processor_costs intel_cost = { "16", /* Func alignment. */ }; +/* lujiazui_cost should produce code tuned for ZHAOXIN lujiazui CPU. */ +static stringop_algs lujiazui_memcpy[2] = { + {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, + {-1, libcall, false}}}, + {libcall, {{12, unrolled_loop, true}, {32, loop, false}, + {6144, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +static stringop_algs lujiazui_memset[2] = { + {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, + {-1, libcall, false}}}, + {libcall, {{12, loop, true}, {32, loop, false}, + {640, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +static const +struct processor_costs lujiazui_cost = { + { + /* Start of register allocator costs. integer->integer move cost is 2. */ + 6, /* cost for loading QImode using movzbl. */ + {6, 6, 6}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {6, 6, 6}, /* cost of storing integer registers. */ + 2, /* cost of reg,reg fld/fst. */ + {6, 6, 8}, /* cost of loading fp registers + in SFmode, DFmode and XFmode. */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode. */ + 2, /* cost of moving MMX register. */ + {6, 6}, /* cost of loading MMX registers + in SImode and DImode. */ + {6, 6}, /* cost of storing MMX registers + in SImode and DImode. */ + 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */ + {6, 6, 6, 10, 15}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit. */ + {6, 6, 6, 10, 15}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit. */ + 6, 6, /* SSE->integer and integer->SSE moves. */ + 6, 6, /* mask->integer and integer->mask moves. */ + {6, 6, 6}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {6, 6, 6}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ + /* End of register allocator costs. */ + }, + + COSTS_N_INSNS (1), /* cost of an add instruction. */ + COSTS_N_INSNS (1) + 1, /* cost of a lea instruction. */ + COSTS_N_INSNS (1), /* variable shift costs. */ + COSTS_N_INSNS (1), /* constant shift costs. */ + {COSTS_N_INSNS (2), /* cost of starting multiply for QI. */ + COSTS_N_INSNS (3), /* HI. */ + COSTS_N_INSNS (3), /* SI. */ + COSTS_N_INSNS (12), /* DI. */ + COSTS_N_INSNS (14)}, /* other. */ + 0, /* cost of multiply per each bit set. */ + {COSTS_N_INSNS (22), /* cost of a divide/mod for QI. */ + COSTS_N_INSNS (24), /* HI. */ + COSTS_N_INSNS (24), /* SI. */ + COSTS_N_INSNS (150), /* DI. */ + COSTS_N_INSNS (152)}, /* other. */ + COSTS_N_INSNS (1), /* cost of movsx. */ + COSTS_N_INSNS (1), /* cost of movzx. */ + 8, /* "large" insn. */ + 17, /* MOVE_RATIO. */ + 6, /* CLEAR_RATIO. */ + {6, 6, 6}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {6, 6, 6}, /* cost of storing integer registers. */ + {6, 6, 6, 10, 15}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit. */ + {6, 6, 6, 10, 15}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit. */ + {6, 6, 6, 10, 15}, /* cost of unaligned loads. */ + {6, 6, 6, 10, 15}, /* cost of unaligned storess. */ + 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */ + 6, /* cost of moving SSE register to integer. */ + 18, 6, /* Gather load static, per_elt. */ + 18, 6, /* Gather store static, per_elt. */ + 32, /* size of l1 cache. */ + 4096, /* size of l2 cache. */ + 64, /* size of prefetch block. */ + /* Lujiazui processor never drop prefetches, like AMD processors. */ + 100, /* number of parallel prefetches. */ + 3, /* Branch cost. */ + COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (4), /* cost of FMUL instruction. */ + COSTS_N_INSNS (22), /* cost of FDIV instruction. */ + COSTS_N_INSNS (1), /* cost of FABS instruction. */ + COSTS_N_INSNS (1), /* cost of FCHS instruction. */ + COSTS_N_INSNS (44), /* cost of FSQRT instruction. */ + + COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ + COSTS_N_INSNS (3), /* cost of MULSS instruction. */ + COSTS_N_INSNS (4), /* cost of MULSD instruction. */ + COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ + COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ + COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ + COSTS_N_INSNS (17), /* cost of DIVSD instruction. */ + COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */ + COSTS_N_INSNS (60), /* cost of SQRTSD instruction. */ + 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ + lujiazui_memcpy, + lujiazui_memset, + COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ + COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ + "16:11:8", /* Loop alignment. */ + "16:11:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ +}; + /* Generic should produce code tuned for Core-i7 (and newer chips) and btver1 (and newer chips). */ diff --git a/gcc/config/i386/x86-tune-sched.cc b/gcc/config/i386/x86-tune-sched.cc index e413d04..1ffaeef 100644 --- a/gcc/config/i386/x86-tune-sched.cc +++ b/gcc/config/i386/x86-tune-sched.cc @@ -58,6 +58,7 @@ ix86_issue_rate (void) case PROCESSOR_K8: case PROCESSOR_AMDFAM10: case PROCESSOR_BTVER1: + case PROCESSOR_LUJIAZUI: return 3; case PROCESSOR_BDVER1: @@ -368,6 +369,7 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost, case PROCESSOR_ATHLON: case PROCESSOR_K8: + case PROCESSOR_LUJIAZUI: memory = get_attr_memory (insn); /* Show ability of reorder buffer to hide latency of load by executing diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index d983e2f..540e45d 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -41,8 +41,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see /* X86_TUNE_SCHEDULE: Enable scheduling. */ DEF_TUNE (X86_TUNE_SCHEDULE, "schedule", m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT - | m_INTEL | m_KNL | m_KNM | m_K6_GEODE | m_AMD_MULTIPLE | m_GOLDMONT - | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC) + | m_INTEL | m_KNL | m_KNM | m_K6_GEODE | m_AMD_MULTIPLE | m_LUJIAZUI + | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC) /* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming on modern chips. Prefer stores affecting whole integer register @@ -51,8 +51,8 @@ DEF_TUNE (X86_TUNE_SCHEDULE, "schedule", DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency", m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL - | m_KNL | m_KNM | m_AMD_MULTIPLE | m_TREMONT | m_ALDERLAKE - | m_GENERIC) + | m_KNL | m_KNM | m_AMD_MULTIPLE | m_LUJIAZUI | m_TREMONT + | m_ALDERLAKE | m_GENERIC) /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store destinations to be 128bit to allow register renaming on 128bit SSE units, @@ -62,7 +62,8 @@ DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency", that can be partly masked by careful scheduling of moves. */ DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency", m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10 - | m_BDVER | m_ZNVER | m_TREMONT | m_ALDERLAKE | m_GENERIC) + | m_BDVER | m_ZNVER | m_LUJIAZUI | m_TREMONT | m_ALDERLAKE + | m_GENERIC) /* X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY: This knob avoids partial write to the destination in scalar SSE conversion from FP @@ -70,14 +71,14 @@ DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency", DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY, "sse_partial_reg_fp_converts_dependency", m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10 - | m_BDVER | m_ZNVER | m_ALDERLAKE | m_GENERIC) + | m_BDVER | m_ZNVER | m_LUJIAZUI | m_ALDERLAKE | m_GENERIC) /* X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY: This knob avoids partial write to the destination in scalar SSE conversion from integer to FP. */ DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY, "sse_partial_reg_converts_dependency", m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10 - | m_BDVER | m_ZNVER | m_ALDERLAKE | m_GENERIC) + | m_BDVER | m_ZNVER | m_LUJIAZUI | m_ALDERLAKE | m_GENERIC) /* X86_TUNE_DEST_FALSE_DEP_FOR_GLC: This knob inserts zero-idiom before several insns to break false dependency on the dest register for GLC @@ -108,7 +109,7 @@ DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall", DEF_TUNE (X86_TUNE_MOVX, "movx", m_PPRO | m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_KNL | m_KNM | m_INTEL - | m_GOLDMONT_PLUS | m_GEODE | m_AMD_MULTIPLE + | m_GOLDMONT_PLUS | m_GEODE | m_AMD_MULTIPLE | m_LUJIAZUI | m_CORE_AVX2 | m_TREMONT | m_ALDERLAKE | m_GENERIC) /* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by @@ -116,31 +117,31 @@ DEF_TUNE (X86_TUNE_MOVX, "movx", DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall", m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL | m_KNL | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS | m_AMD_MULTIPLE - | m_TREMONT | m_ALDERLAKE | m_GENERIC) + | m_LUJIAZUI | m_TREMONT | m_ALDERLAKE | m_GENERIC) /* X86_TUNE_FUSE_CMP_AND_BRANCH_32: Fuse compare with a subsequent conditional jump instruction for 32 bit TARGET. */ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32", - m_CORE_ALL | m_BDVER | m_ZNVER | m_GENERIC) + m_CORE_ALL | m_BDVER | m_ZNVER | m_LUJIAZUI | m_GENERIC) /* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent conditional jump instruction for TARGET_64BIT. */ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64", m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER - | m_ZNVER | m_GENERIC) + | m_ZNVER | m_LUJIAZUI | m_GENERIC) /* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a subsequent conditional jump instruction when the condition jump check sign flag (SF) or overflow flag (OF). */ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags", m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER - | m_ZNVER | m_GENERIC) + | m_ZNVER | m_LUJIAZUI | m_GENERIC) /* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional jump instruction when the alu instruction produces the CCFLAG consumed by the conditional jump instruction. */ DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch", - m_SANDYBRIDGE | m_CORE_AVX2 | m_GENERIC) + m_SANDYBRIDGE | m_CORE_AVX2 | m_LUJIAZUI | m_GENERIC) /*****************************************************************************/ @@ -157,7 +158,7 @@ DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch", DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args", m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL - | m_GOLDMONT | m_GOLDMONT_PLUS | m_ATHLON_K8) + | m_GOLDMONT | m_GOLDMONT_PLUS | m_ATHLON_K8 | m_LUJIAZUI) /* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are considered on critical path. */ @@ -171,15 +172,15 @@ DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move", /* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits. */ DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave", - m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_TREMONT - | m_ALDERLAKE | m_GENERIC) + m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_LUJIAZUI + | m_TREMONT | m_ALDERLAKE | m_GENERIC) /* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions. Some chips, like 486 and Pentium works faster with separate load and push instructions. */ DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory", m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE - | m_TREMONT | m_ALDERLAKE | m_GENERIC) + | m_LUJIAZUI | m_TREMONT | m_ALDERLAKE | m_GENERIC) /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred over esp subtraction. */ @@ -234,7 +235,7 @@ DEF_TUNE (X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL, "software_prefetching_benefi /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall on 16-bit immediate moves into memory on Core2 and Corei7. */ -DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC) +DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_LUJIAZUI | m_GENERIC) /* X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such as "add mem, reg". */ @@ -249,19 +250,20 @@ DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_LAKEMONT | m_PPRO)) DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec", ~(m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_BONNELL | m_SILVERMONT | m_INTEL | m_KNL | m_KNM | m_GOLDMONT - | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC)) + | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_LUJIAZUI + | m_GENERIC)) /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred for DFmode copies */ DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves", ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT - | m_KNL | m_KNM | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_GOLDMONT - | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC)) + | m_KNL | m_KNM | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_LUJIAZUI + | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC)) /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag will impact LEA instruction selection. */ DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_KNL - | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL) + | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL | m_LUJIAZUI) /* X86_TUNE_AVOID_LEA_FOR_ADDR: Avoid lea for address computation. */ DEF_TUNE (X86_TUNE_AVOID_LEA_FOR_ADDR, "avoid_lea_for_addr", @@ -294,7 +296,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA) move/set sequences of bytes with known size. */ DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB, "prefer_known_rep_movsb_stosb", - m_SKYLAKE | m_ALDERLAKE | m_TREMONT | m_CORE_AVX512) + m_SKYLAKE | m_ALDERLAKE | m_TREMONT | m_CORE_AVX512 | m_LUJIAZUI) /* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of compact prologues and epilogues by issuing a misaligned moves. This @@ -303,15 +305,15 @@ DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB, FIXME: This may actualy be a win on more targets than listed here. */ DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES, "misaligned_move_string_pro_epilogues", - m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_TREMONT + m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_LUJIAZUI | m_TREMONT | m_ALDERLAKE | m_GENERIC) /* X86_TUNE_USE_SAHF: Controls use of SAHF. */ DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf", m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER - | m_BTVER | m_ZNVER | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT - | m_ALDERLAKE | m_GENERIC) + | m_BTVER | m_ZNVER | m_LUJIAZUI | m_GOLDMONT | m_GOLDMONT_PLUS + | m_TREMONT | m_ALDERLAKE | m_GENERIC) /* X86_TUNE_USE_CLTD: Controls use of CLTD and CTQO instructions. */ DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd", @@ -321,13 +323,14 @@ DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd", /* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions. */ DEF_TUNE (X86_TUNE_USE_BT, "use_bt", m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL - | m_LAKEMONT | m_AMD_MULTIPLE | m_GOLDMONT | m_GOLDMONT_PLUS - | m_TREMONT | m_ALDERLAKE | m_GENERIC) + | m_LAKEMONT | m_AMD_MULTIPLE | m_LUJIAZUI | m_GOLDMONT + | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC) /* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency for bit-manipulation instructions. */ DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi", - m_SANDYBRIDGE | m_CORE_AVX2 | m_TREMONT | m_ALDERLAKE | m_GENERIC) + m_SANDYBRIDGE | m_CORE_AVX2 | m_TREMONT | m_ALDERLAKE | m_LUJIAZUI + | m_GENERIC) /* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based on hardware capabilities. Bdver3 hardware has a loop buffer which makes @@ -339,18 +342,19 @@ DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor", m_BDVER3 | m_BDVER4) if-converted sequence to one. */ DEF_TUNE (X86_TUNE_ONE_IF_CONV_INSN, "one_if_conv_insn", m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_CORE_ALL | m_GOLDMONT - | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC) + | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_LUJIAZUI | m_GENERIC) /* X86_TUNE_AVOID_MFENCE: Use lock prefixed instructions instead of mfence. */ DEF_TUNE (X86_TUNE_AVOID_MFENCE, "avoid_mfence", - m_CORE_ALL | m_BDVER | m_ZNVER | m_TREMONT | m_ALDERLAKE | m_GENERIC) + m_CORE_ALL | m_BDVER | m_ZNVER | m_LUJIAZUI | m_TREMONT | m_ALDERLAKE + | m_GENERIC) /* X86_TUNE_EXPAND_ABS: This enables a new abs pattern by generating instructions for abs (x) = (((signed) x >> (W-1) ^ x) - (signed) x >> (W-1)) instead of cmove or SSE max/abs instructions. */ DEF_TUNE (X86_TUNE_EXPAND_ABS, "expand_abs", m_CORE_ALL | m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT - | m_GOLDMONT_PLUS) + | m_GOLDMONT_PLUS | m_LUJIAZUI) /*****************************************************************************/ /* 387 instruction selection tuning */ @@ -367,17 +371,17 @@ DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop", DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop", ~(m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_AMD_MULTIPLE - | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE - | m_GENERIC)) + | m_LUJIAZUI | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT + | m_ALDERLAKE | m_GENERIC)) /* X86_TUNE_USE_FFREEP: Use freep instruction instead of fstp. */ -DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE) +DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE | m_LUJIAZUI) /* X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI. */ DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants", m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT - | m_KNL | m_KNM | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_GOLDMONT - | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC) + | m_KNL | m_KNM | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_LUJIAZUI + | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC) /*****************************************************************************/ /* SSE instruction selection tuning */ @@ -393,14 +397,14 @@ DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill", DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal", m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE - | m_AMDFAM10 | m_BDVER | m_BTVER | m_ZNVER | m_GENERIC) + | m_AMDFAM10 | m_BDVER | m_BTVER | m_ZNVER | m_LUJIAZUI | m_GENERIC) /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores instead of a sequence loading registers by parts. */ DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal", m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM - | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS - | m_TREMONT | m_ALDERLAKE | m_BDVER | m_ZNVER | m_GENERIC) + | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE + | m_BDVER | m_ZNVER | m_LUJIAZUI | m_GENERIC) /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL: Use packed single precision 128bit instructions instead of double where possible. */ @@ -409,13 +413,14 @@ DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optim /* X86_TUNE_SSE_TYPELESS_STORES: Always movaps/movups for 128bit stores. */ DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores", - m_AMD_MULTIPLE | m_CORE_ALL | m_TREMONT | m_ALDERLAKE | m_GENERIC) + m_AMD_MULTIPLE | m_LUJIAZUI | m_CORE_ALL | m_TREMONT | m_ALDERLAKE + | m_GENERIC) /* X86_TUNE_SSE_LOAD0_BY_PXOR: Always use pxor to load0 as opposed to xorps/xorpd and other variants. */ DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor", m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BDVER | m_BTVER | m_ZNVER - | m_TREMONT | m_ALDERLAKE | m_GENERIC) + | m_LUJIAZUI | m_TREMONT | m_ALDERLAKE | m_GENERIC) /* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves in from integer to SSE registers. If disabled, the moves will be done by storing diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index 84e6f66..a2e2a30 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -21840,6 +21840,9 @@ Intel Knights Landing CPU. @item knm Intel Knights Mill CPU. +@item lujiazui +ZHAOXIN lujiazui CPU. + @item amdfam10h AMD Family 10h CPU. diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index d8095e3..8becba3 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -31816,6 +31816,11 @@ VIA Nano Quad Core CPU with x86-64, MMX, SSE, SSE2, SSE3, SSSE3 and SSE4.1 instruction set support. (No scheduling is implemented for this chip.) +@item lujiazui +ZHAOXIN lujiazui CPU with x86-64, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, +SSE4.2, AVX, POPCNT, AES, PCLMUL, RDRND, XSAVE, XSAVEOPT, FSGSBASE, CX16, +ABM, BMI, BMI2, F16C, FXSR, RDSEED instruction set support. + @item geode AMD Geode embedded processor with MMX and 3DNow!@: instruction set support. @end table diff --git a/gcc/testsuite/g++.target/i386/mv32.C b/gcc/testsuite/g++.target/i386/mv32.C new file mode 100644 index 0000000..8f74352 --- /dev/null +++ b/gcc/testsuite/g++.target/i386/mv32.C @@ -0,0 +1,31 @@ +// Test that dispatching can choose the right multiversion +// for ZHAOXIN CPU with the same internal GCC processor id + +// { dg-do run } +// { dg-require-ifunc "" } +// { dg-options "-O2" } + +#include <assert.h> + +int __attribute__ ((target("default"))) +foo () +{ + return 0; +} + +int __attribute__ ((target("arch=lujiazui"))) foo () { + return 1; +} + + +int main () +{ + int val = foo (); + + if (__builtin_cpu_is ("lujiazui")) + assert (val == 1); + else + assert (val == 0); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/funcspec-56.inc b/gcc/testsuite/gcc.target/i386/funcspec-56.inc index 8499fdf..b76dddb 100644 --- a/gcc/testsuite/gcc.target/i386/funcspec-56.inc +++ b/gcc/testsuite/gcc.target/i386/funcspec-56.inc @@ -184,6 +184,7 @@ extern void test_arch_cooperlake (void) __attribute__((__target__("arch= extern void test_arch_sapphirerapids (void) __attribute__((__target__("arch=sapphirerapids"))); extern void test_arch_alderlake (void) __attribute__((__target__("arch=alderlake"))); extern void test_arch_rocketlake (void) __attribute__((__target__("arch=rocketlake"))); +extern void test_arch_lujiazui (void) __attribute__((__target__("arch=lujiazui"))); extern void test_arch_k8 (void) __attribute__((__target__("arch=k8"))); extern void test_arch_k8_sse3 (void) __attribute__((__target__("arch=k8-sse3"))); extern void test_arch_opteron (void) __attribute__((__target__("arch=opteron"))); @@ -205,6 +206,7 @@ extern void test_tune_core2 (void) __attribute__((__target__("tune=core2"))); extern void test_tune_corei7 (void) __attribute__((__target__("tune=corei7"))); extern void test_tune_corei7_avx (void) __attribute__((__target__("tune=corei7-avx"))); extern void test_tune_core_avx2 (void) __attribute__((__target__("tune=core-avx2"))); +extern void test_tune_lujiazui (void) __attribute__((__target__("tune=lujiazui"))); extern void test_tune_k8 (void) __attribute__((__target__("tune=k8"))); extern void test_tune_k8_sse3 (void) __attribute__((__target__("tune=k8-sse3"))); extern void test_tune_opteron (void) __attribute__((__target__("tune=opteron"))); |