aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorJan Hubicka <jh@suse.cz>2024-09-03 16:26:16 +0200
committerJan Hubicka <jh@suse.cz>2024-09-03 16:27:40 +0200
commite2125a600552bc6e0329e3f1224eea14804db8d3 (patch)
tree83d405de615b956aab6e5d53439cd45ae3eebf93 /gcc
parentdee3c5c6ff9952204af3014383593e8d316250e4 (diff)
downloadgcc-e2125a600552bc6e0329e3f1224eea14804db8d3.zip
gcc-e2125a600552bc6e0329e3f1224eea14804db8d3.tar.gz
gcc-e2125a600552bc6e0329e3f1224eea14804db8d3.tar.bz2
Zen5 tuning part 3: scheduler tweaks
this patch adds support for new fussion in znver5 documented in the optimization manual: The Zen5 microarchitecture adds support to fuse reg-reg MOV Instructions with certain ALU instructions. The following conditions need to be met for fusion to happen: - The MOV should be reg-reg mov with Opcode 0x89 or 0x8B - The MOV is followed by an ALU instruction where the MOV and ALU destination register match. - The ALU instruction may source only registers or immediate data. There cannot be any memory source. - The ALU instruction sources either the source or dest of MOV instruction. - If ALU instruction has 2 reg sources, they should be different. - The following ALU instructions can fuse with an older qualified MOV instruction: ADD ADC AND XOR OP SUB SBB INC DEC NOT SAL / SHL SHR SAR (I assume OP is OR) I also increased issue rate from 4 to 6. Theoretically znver5 can do more, but with our model we can't realy use it. Increasing issue rate to 8 leads to infinite loop in scheduler. Finally, I also enabled fuse_alu_and_branch since it is supported by znver5 (I think by earlier zens too). New fussion pattern moves quite few instructions around in common code: @@ -2210,13 +2210,13 @@ .cfi_offset 3, -32 leaq 63(%rsi), %rbx movq %rbx, %rbp + shrq $6, %rbp + salq $3, %rbp subq $16, %rsp .cfi_def_cfa_offset 48 movq %rdi, %r12 - shrq $6, %rbp - movq %rsi, 8(%rsp) - salq $3, %rbp movq %rbp, %rdi + movq %rsi, 8(%rsp) call _Znwm movq 8(%rsp), %rsi movl $0, 8(%r12) @@ -2224,8 +2224,8 @@ movq %rax, (%r12) movq %rbp, 32(%r12) testq %rsi, %rsi - movq %rsi, %rdx cmovns %rsi, %rbx + movq %rsi, %rdx sarq $63, %rdx shrq $58, %rdx sarq $6, %rbx which should help decoder bandwidth and perhaps also cache, though I was not able to measure off-noise effect on SPEC. gcc/ChangeLog: * config/i386/i386.h (TARGET_FUSE_MOV_AND_ALU): New tune. * config/i386/x86-tune-sched.cc (ix86_issue_rate): Updat for znver5. (ix86_adjust_cost): Add TODO about znver5 memory latency. (ix86_fuse_mov_alu_p): New. (ix86_macro_fusion_pair_p): Use it. * config/i386/x86-tune.def (X86_TUNE_FUSE_ALU_AND_BRANCH): Add ZNVER5. (X86_TUNE_FUSE_MOV_AND_ALU): New tune;
Diffstat (limited to 'gcc')
-rw-r--r--gcc/config/i386/i386.h2
-rw-r--r--gcc/config/i386/x86-tune-sched.cc67
-rw-r--r--gcc/config/i386/x86-tune.def11
3 files changed, 77 insertions, 3 deletions
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index eabb324..c1ec92f 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -430,6 +430,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS]
#define TARGET_FUSE_ALU_AND_BRANCH \
ix86_tune_features[X86_TUNE_FUSE_ALU_AND_BRANCH]
+#define TARGET_FUSE_MOV_AND_ALU \
+ ix86_tune_features[X86_TUNE_FUSE_MOV_AND_ALU]
#define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU]
#define TARGET_AVOID_LEA_FOR_ADDR \
ix86_tune_features[X86_TUNE_AVOID_LEA_FOR_ADDR]
diff --git a/gcc/config/i386/x86-tune-sched.cc b/gcc/config/i386/x86-tune-sched.cc
index d77298b..c6d5426 100644
--- a/gcc/config/i386/x86-tune-sched.cc
+++ b/gcc/config/i386/x86-tune-sched.cc
@@ -67,7 +67,6 @@ ix86_issue_rate (void)
case PROCESSOR_ZNVER2:
case PROCESSOR_ZNVER3:
case PROCESSOR_ZNVER4:
- case PROCESSOR_ZNVER5:
case PROCESSOR_CORE2:
case PROCESSOR_NEHALEM:
case PROCESSOR_SANDYBRIDGE:
@@ -91,6 +90,13 @@ ix86_issue_rate (void)
return 5;
case PROCESSOR_SAPPHIRERAPIDS:
+ /* For znver5 decoder can handle 4 or 8 instructions per cycle,
+ op cache 12 instruction/cycle, dispatch 8 instructions
+ integer rename 8 instructions and Fp 6 instructions.
+
+ The scheduler, without understanding out of order nature of the CPU
+ is unlikely going to be able to fill all of these. */
+ case PROCESSOR_ZNVER5:
return 6;
default:
@@ -434,6 +440,8 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
enum attr_unit unit = get_attr_unit (insn);
int loadcost;
+ /* TODO: On znver5 complex addressing modes have
+ greater latency. */
if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
loadcost = 4;
else
@@ -563,6 +571,60 @@ ix86_macro_fusion_p ()
return TARGET_FUSE_CMP_AND_BRANCH;
}
+static bool
+ix86_fuse_mov_alu_p (rtx_insn *mov, rtx_insn *alu)
+{
+ /* Validate mov:
+ - It should be reg-reg move with opcode 0x89 or 0x8B. */
+ rtx set1 = PATTERN (mov);
+ if (GET_CODE (set1) != SET
+ || !GENERAL_REG_P (SET_SRC (set1))
+ || !GENERAL_REG_P (SET_DEST (set1)))
+ return false;
+ rtx reg = SET_DEST (set1);
+ /* - it should have 0x89 or 0x8B opcode. */
+ if (!INTEGRAL_MODE_P (GET_MODE (reg))
+ || GET_MODE_SIZE (GET_MODE (reg)) < 2
+ || GET_MODE_SIZE (GET_MODE (reg)) > 8)
+ return false;
+ /* Validate ALU. */
+ if (GET_CODE (PATTERN (alu)) != PARALLEL)
+ return false;
+ rtx set2 = XVECEXP (PATTERN (alu), 0, 0);
+ if (GET_CODE (set2) != SET)
+ return false;
+ /* Match one of:
+ ADD ADC AND XOR OR SUB SBB INC DEC NOT SAL SHL SHR SAR
+ We also may add insn attribute to handle some of sporadic
+ case we output those with different RTX expressions. */
+
+ if (GET_CODE (SET_SRC (set2)) != PLUS
+ && GET_CODE (SET_SRC (set2)) != MINUS
+ && GET_CODE (SET_SRC (set2)) != XOR
+ && GET_CODE (SET_SRC (set2)) != AND
+ && GET_CODE (SET_SRC (set2)) != IOR
+ && GET_CODE (SET_SRC (set2)) != NOT
+ && GET_CODE (SET_SRC (set2)) != ASHIFT
+ && GET_CODE (SET_SRC (set2)) != ASHIFTRT
+ && GET_CODE (SET_SRC (set2)) != LSHIFTRT)
+ return false;
+ rtx op0 = XEXP (SET_SRC (set2), 0);
+ rtx op1 = GET_CODE (SET_SRC (set2)) != NOT ? XEXP (SET_SRC (set2), 1) : NULL;
+ /* One of operands should be register. */
+ if (op1 && (!REG_P (op0) || REGNO (op0) != REGNO (reg)))
+ std::swap (op0, op1);
+ if (!REG_P (op0) || REGNO (op1) != REGNO (reg))
+ return false;
+ if (op1
+ && !REG_P (op1)
+ && !x86_64_immediate_operand (op1, VOIDmode))
+ return false;
+ /* Only one of two paramters must be move destination. */
+ if (op1 && REG_P (op1) && REGNO (op1) == REGNO (reg))
+ return false;
+ return true;
+}
+
/* Check whether current microarchitecture support macro fusion
for insn pair "CONDGEN + CONDJMP". Refer to
"Intel Architectures Optimization Reference Manual". */
@@ -570,6 +632,9 @@ ix86_macro_fusion_p ()
bool
ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
{
+ if (TARGET_FUSE_MOV_AND_ALU
+ && ix86_fuse_mov_alu_p (condgen, condjmp))
+ return true;
rtx src, dest;
enum rtx_code ccode;
rtx compare_set = NULL_RTX, test_if, cond;
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index ed26136..d7e2ad7 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -143,10 +143,17 @@ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags",
/* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional
jump instruction when the alu instruction produces the CCFLAG consumed by
- the conditional jump instruction. */
+ the conditional jump instruction.
+
+ TODO: znver5 supports fusing with SUB, ADD, INC, DEC, OR, AND,
+ There is also limitation for immediate and displacement supported. */
DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
- m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC)
+ m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC | m_ZNVER5)
+/* X86_TUNE_FUSE_MOV_AND_ALU: mov and alu in case mov is reg-reg mov
+ and the destination is used by alu. alu must be one of
+ ADD, ADC, AND, XOR, OR, SUB, SBB, INC, DEC, NOT, SAL, SHL, SHR, SAR. */
+DEF_TUNE (X86_TUNE_FUSE_MOV_AND_ALU, "fuse_mov_and_alu", m_ZNVER5)
/*****************************************************************************/
/* Function prologue, epilogue and function calling sequences. */