diff options
author | Jan Hubicka <jh@suse.cz> | 2024-09-03 16:26:16 +0200 |
---|---|---|
committer | Jan Hubicka <jh@suse.cz> | 2024-09-03 16:27:40 +0200 |
commit | e2125a600552bc6e0329e3f1224eea14804db8d3 (patch) | |
tree | 83d405de615b956aab6e5d53439cd45ae3eebf93 /gcc | |
parent | dee3c5c6ff9952204af3014383593e8d316250e4 (diff) | |
download | gcc-e2125a600552bc6e0329e3f1224eea14804db8d3.zip gcc-e2125a600552bc6e0329e3f1224eea14804db8d3.tar.gz gcc-e2125a600552bc6e0329e3f1224eea14804db8d3.tar.bz2 |
Zen5 tuning part 3: scheduler tweaks
this patch adds support for new fussion in znver5 documented in the
optimization manual:
The Zen5 microarchitecture adds support to fuse reg-reg MOV Instructions
with certain ALU instructions. The following conditions need to be met for
fusion to happen:
- The MOV should be reg-reg mov with Opcode 0x89 or 0x8B
- The MOV is followed by an ALU instruction where the MOV and ALU destination register match.
- The ALU instruction may source only registers or immediate data. There cannot be any memory source.
- The ALU instruction sources either the source or dest of MOV instruction.
- If ALU instruction has 2 reg sources, they should be different.
- The following ALU instructions can fuse with an older qualified MOV instruction:
ADD ADC AND XOR OP SUB SBB INC DEC NOT SAL / SHL SHR SAR
(I assume OP is OR)
I also increased issue rate from 4 to 6. Theoretically znver5 can do more, but
with our model we can't realy use it.
Increasing issue rate to 8 leads to infinite loop in scheduler.
Finally, I also enabled fuse_alu_and_branch since it is supported by
znver5 (I think by earlier zens too).
New fussion pattern moves quite few instructions around in common code:
@@ -2210,13 +2210,13 @@
.cfi_offset 3, -32
leaq 63(%rsi), %rbx
movq %rbx, %rbp
+ shrq $6, %rbp
+ salq $3, %rbp
subq $16, %rsp
.cfi_def_cfa_offset 48
movq %rdi, %r12
- shrq $6, %rbp
- movq %rsi, 8(%rsp)
- salq $3, %rbp
movq %rbp, %rdi
+ movq %rsi, 8(%rsp)
call _Znwm
movq 8(%rsp), %rsi
movl $0, 8(%r12)
@@ -2224,8 +2224,8 @@
movq %rax, (%r12)
movq %rbp, 32(%r12)
testq %rsi, %rsi
- movq %rsi, %rdx
cmovns %rsi, %rbx
+ movq %rsi, %rdx
sarq $63, %rdx
shrq $58, %rdx
sarq $6, %rbx
which should help decoder bandwidth and perhaps also cache, though I was not
able to measure off-noise effect on SPEC.
gcc/ChangeLog:
* config/i386/i386.h (TARGET_FUSE_MOV_AND_ALU): New tune.
* config/i386/x86-tune-sched.cc (ix86_issue_rate): Updat for znver5.
(ix86_adjust_cost): Add TODO about znver5 memory latency.
(ix86_fuse_mov_alu_p): New.
(ix86_macro_fusion_pair_p): Use it.
* config/i386/x86-tune.def (X86_TUNE_FUSE_ALU_AND_BRANCH): Add ZNVER5.
(X86_TUNE_FUSE_MOV_AND_ALU): New tune;
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/config/i386/i386.h | 2 | ||||
-rw-r--r-- | gcc/config/i386/x86-tune-sched.cc | 67 | ||||
-rw-r--r-- | gcc/config/i386/x86-tune.def | 11 |
3 files changed, 77 insertions, 3 deletions
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index eabb324..c1ec92f 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -430,6 +430,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS] #define TARGET_FUSE_ALU_AND_BRANCH \ ix86_tune_features[X86_TUNE_FUSE_ALU_AND_BRANCH] +#define TARGET_FUSE_MOV_AND_ALU \ + ix86_tune_features[X86_TUNE_FUSE_MOV_AND_ALU] #define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU] #define TARGET_AVOID_LEA_FOR_ADDR \ ix86_tune_features[X86_TUNE_AVOID_LEA_FOR_ADDR] diff --git a/gcc/config/i386/x86-tune-sched.cc b/gcc/config/i386/x86-tune-sched.cc index d77298b..c6d5426 100644 --- a/gcc/config/i386/x86-tune-sched.cc +++ b/gcc/config/i386/x86-tune-sched.cc @@ -67,7 +67,6 @@ ix86_issue_rate (void) case PROCESSOR_ZNVER2: case PROCESSOR_ZNVER3: case PROCESSOR_ZNVER4: - case PROCESSOR_ZNVER5: case PROCESSOR_CORE2: case PROCESSOR_NEHALEM: case PROCESSOR_SANDYBRIDGE: @@ -91,6 +90,13 @@ ix86_issue_rate (void) return 5; case PROCESSOR_SAPPHIRERAPIDS: + /* For znver5 decoder can handle 4 or 8 instructions per cycle, + op cache 12 instruction/cycle, dispatch 8 instructions + integer rename 8 instructions and Fp 6 instructions. + + The scheduler, without understanding out of order nature of the CPU + is unlikely going to be able to fill all of these. */ + case PROCESSOR_ZNVER5: return 6; default: @@ -434,6 +440,8 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost, enum attr_unit unit = get_attr_unit (insn); int loadcost; + /* TODO: On znver5 complex addressing modes have + greater latency. */ if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN) loadcost = 4; else @@ -563,6 +571,60 @@ ix86_macro_fusion_p () return TARGET_FUSE_CMP_AND_BRANCH; } +static bool +ix86_fuse_mov_alu_p (rtx_insn *mov, rtx_insn *alu) +{ + /* Validate mov: + - It should be reg-reg move with opcode 0x89 or 0x8B. */ + rtx set1 = PATTERN (mov); + if (GET_CODE (set1) != SET + || !GENERAL_REG_P (SET_SRC (set1)) + || !GENERAL_REG_P (SET_DEST (set1))) + return false; + rtx reg = SET_DEST (set1); + /* - it should have 0x89 or 0x8B opcode. */ + if (!INTEGRAL_MODE_P (GET_MODE (reg)) + || GET_MODE_SIZE (GET_MODE (reg)) < 2 + || GET_MODE_SIZE (GET_MODE (reg)) > 8) + return false; + /* Validate ALU. */ + if (GET_CODE (PATTERN (alu)) != PARALLEL) + return false; + rtx set2 = XVECEXP (PATTERN (alu), 0, 0); + if (GET_CODE (set2) != SET) + return false; + /* Match one of: + ADD ADC AND XOR OR SUB SBB INC DEC NOT SAL SHL SHR SAR + We also may add insn attribute to handle some of sporadic + case we output those with different RTX expressions. */ + + if (GET_CODE (SET_SRC (set2)) != PLUS + && GET_CODE (SET_SRC (set2)) != MINUS + && GET_CODE (SET_SRC (set2)) != XOR + && GET_CODE (SET_SRC (set2)) != AND + && GET_CODE (SET_SRC (set2)) != IOR + && GET_CODE (SET_SRC (set2)) != NOT + && GET_CODE (SET_SRC (set2)) != ASHIFT + && GET_CODE (SET_SRC (set2)) != ASHIFTRT + && GET_CODE (SET_SRC (set2)) != LSHIFTRT) + return false; + rtx op0 = XEXP (SET_SRC (set2), 0); + rtx op1 = GET_CODE (SET_SRC (set2)) != NOT ? XEXP (SET_SRC (set2), 1) : NULL; + /* One of operands should be register. */ + if (op1 && (!REG_P (op0) || REGNO (op0) != REGNO (reg))) + std::swap (op0, op1); + if (!REG_P (op0) || REGNO (op1) != REGNO (reg)) + return false; + if (op1 + && !REG_P (op1) + && !x86_64_immediate_operand (op1, VOIDmode)) + return false; + /* Only one of two paramters must be move destination. */ + if (op1 && REG_P (op1) && REGNO (op1) == REGNO (reg)) + return false; + return true; +} + /* Check whether current microarchitecture support macro fusion for insn pair "CONDGEN + CONDJMP". Refer to "Intel Architectures Optimization Reference Manual". */ @@ -570,6 +632,9 @@ ix86_macro_fusion_p () bool ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp) { + if (TARGET_FUSE_MOV_AND_ALU + && ix86_fuse_mov_alu_p (condgen, condjmp)) + return true; rtx src, dest; enum rtx_code ccode; rtx compare_set = NULL_RTX, test_if, cond; diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index ed26136..d7e2ad7 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -143,10 +143,17 @@ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags", /* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional jump instruction when the alu instruction produces the CCFLAG consumed by - the conditional jump instruction. */ + the conditional jump instruction. + + TODO: znver5 supports fusing with SUB, ADD, INC, DEC, OR, AND, + There is also limitation for immediate and displacement supported. */ DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch", - m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC) + m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC | m_ZNVER5) +/* X86_TUNE_FUSE_MOV_AND_ALU: mov and alu in case mov is reg-reg mov + and the destination is used by alu. alu must be one of + ADD, ADC, AND, XOR, OR, SUB, SBB, INC, DEC, NOT, SAL, SHL, SHR, SAR. */ +DEF_TUNE (X86_TUNE_FUSE_MOV_AND_ALU, "fuse_mov_and_alu", m_ZNVER5) /*****************************************************************************/ /* Function prologue, epilogue and function calling sequences. */ |