author | liuhongt <hongtao.liu@intel.com> | 2024-05-29 11:14:26 +0800
---|---|---
committer | Haochen Jiang <haochen.jiang@intel.com> | 2024-05-29 11:14:26 +0800
commit | b644126237a1aa8599f767a5e0bbada1d7286f44 |
tree | 7536f7dff5c2ddda9fedfb2c0815ec03bbdd1ddf /gcc/config |
parent | 00ed5424b1d4dcccfa187f55205521826794898c |
Align tight&hot loop without considering max skipping bytes.
When a hot loop is small enough to fit into one cache line, we should align
the loop to ceil_log2 (loop_size) without considering the maximum skip
bytes. This helps code prefetch.
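As a rough illustration of the computation described above, here is a standalone C sketch, not GCC source: the helper name, the 30-byte loop size, and the 64-byte cache-line value are made up for the example.

```c
#include <stdio.h>

/* Sketch of ceil_log2: smallest log such that (1 << log) >= x.  */
static int
ceil_log2_sketch (unsigned long x)
{
  int log = 0;
  while ((1UL << log) < x)
    log++;
  return log;
}

int
main (void)
{
  unsigned long loop_size = 30;   /* hypothetical hot-loop size in bytes */
  unsigned long cache_line = 64;  /* typical x86 cache-line size */

  /* A 30-byte loop fits in one cache line, so it is aligned to
     2^5 = 32 bytes unconditionally (no max-skip limit), which
     guarantees the whole loop lands in a single cache line.  */
  if (loop_size <= cache_line)
    printf (".p2align %d\n", ceil_log2_sketch (loop_size));
  return 0;
}
```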
gcc/ChangeLog:
* config/i386/i386.cc (ix86_avoid_jump_mispredicts): Change
gen_pad to gen_max_skip_align.
(ix86_align_loops): New function.
(ix86_reorg): Call ix86_align_loops.
* config/i386/i386.md (pad): Rename to ..
(max_skip_align): .. this, and accept 2 operands for align and
skip.
Diffstat (limited to 'gcc/config')
-rw-r--r-- | gcc/config/i386/i386.cc | 148
-rw-r--r-- | gcc/config/i386/i386.md | 10
2 files changed, 153 insertions, 5 deletions
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 85d87b9..1a0206a 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -23146,7 +23146,7 @@ ix86_avoid_jump_mispredicts (void)
 	  if (dump_file)
 	    fprintf (dump_file, "Padding insn %i by %i bytes!\n",
 		     INSN_UID (insn), padsize);
-	  emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
+	  emit_insn_before (gen_max_skip_align (GEN_INT (4), GEN_INT (padsize)), insn);
 	}
     }
 }
@@ -23419,6 +23419,150 @@ ix86_split_stlf_stall_load ()
     }
 }
 
+/* When a hot loop can fit into one cacheline,
+   force align the loop without considering the max skip.  */
+static void
+ix86_align_loops ()
+{
+  basic_block bb;
+
+  /* Don't do this when we don't know cache line size.  */
+  if (ix86_cost->prefetch_block == 0)
+    return;
+
+  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
+  profile_count count_threshold = cfun->cfg->count_max / param_align_threshold;
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      rtx_insn *label = BB_HEAD (bb);
+      bool has_fallthru = 0;
+      edge e;
+      edge_iterator ei;
+
+      if (!LABEL_P (label))
+        continue;
+
+      profile_count fallthru_count = profile_count::zero ();
+      profile_count branch_count = profile_count::zero ();
+
+      FOR_EACH_EDGE (e, ei, bb->preds)
+        {
+          if (e->flags & EDGE_FALLTHRU)
+            has_fallthru = 1, fallthru_count += e->count ();
+          else
+            branch_count += e->count ();
+        }
+
+      if (!fallthru_count.initialized_p () || !branch_count.initialized_p ())
+        continue;
+
+      if (bb->loop_father
+          && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun)
+          && (has_fallthru
+              ? (!(single_succ_p (bb)
+                   && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun))
+                 && optimize_bb_for_speed_p (bb)
+                 && branch_count + fallthru_count > count_threshold
+                 && (branch_count > fallthru_count * param_align_loop_iterations))
+              /* In case there's no fallthru for the loop.
+                 Nops inserted won't be executed.  */
+              : (branch_count > count_threshold
+                 || (bb->count > bb->prev_bb->count * 10
+                     && (bb->prev_bb->count
+                         <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2)))))
+        {
+          rtx_insn* insn, *end_insn;
+          HOST_WIDE_INT size = 0;
+          bool padding_p = true;
+          basic_block tbb = bb;
+          unsigned cond_branch_num = 0;
+          bool detect_tight_loop_p = false;
+
+          for (unsigned int i = 0; i != bb->loop_father->num_nodes;
+               i++, tbb = tbb->next_bb)
+            {
+              /* Only handle continuous cfg layout.  */
+              if (bb->loop_father != tbb->loop_father)
+                {
+                  padding_p = false;
+                  break;
+                }
+
+              FOR_BB_INSNS (tbb, insn)
+                {
+                  if (!NONDEBUG_INSN_P (insn))
+                    continue;
+                  size += ix86_min_insn_size (insn);
+
+                  /* We don't know size of inline asm.
+                     Don't align loop for call.  */
+                  if (asm_noperands (PATTERN (insn)) >= 0
+                      || CALL_P (insn))
+                    {
+                      size = -1;
+                      break;
+                    }
+                }
+
+              if (size == -1 || size > ix86_cost->prefetch_block)
+                {
+                  padding_p = false;
+                  break;
+                }
+
+              FOR_EACH_EDGE (e, ei, tbb->succs)
+                {
+                  /* It could be part of the loop.  */
+                  if (e->dest == bb)
+                    {
+                      detect_tight_loop_p = true;
+                      break;
+                    }
+                }
+
+              if (detect_tight_loop_p)
+                break;
+
+              end_insn = BB_END (tbb);
+              if (JUMP_P (end_insn))
+                {
+                  /* For decoded icache:
+                     1. Up to two branches are allowed per Way.
+                     2. A non-conditional branch is the last micro-op in a Way.
+                  */
+                  if (onlyjump_p (end_insn)
+                      && (any_uncondjump_p (end_insn)
+                          || single_succ_p (tbb)))
+                    {
+                      padding_p = false;
+                      break;
+                    }
+                  else if (++cond_branch_num >= 2)
+                    {
+                      padding_p = false;
+                      break;
+                    }
+                }
+
+            }
+
+          if (padding_p && detect_tight_loop_p)
+            {
+              emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)),
+                                                    GEN_INT (0)), label);
+              /* End of function.  */
+              if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun))
+                break;
+              /* Skip bb which already fits into one cacheline.  */
+              bb = tbb;
+            }
+        }
+    }
+
+  loop_optimizer_finalize ();
+  free_dominance_info (CDI_DOMINATORS);
+}
+
 /* Implement machine specific optimizations.  We implement padding of returns
    for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
 static void
@@ -23442,6 +23586,8 @@ ix86_reorg (void)
 #ifdef ASM_OUTPUT_MAX_SKIP_ALIGN
       if (TARGET_FOUR_JUMP_LIMIT)
 	ix86_avoid_jump_mispredicts ();
+
+      ix86_align_loops ();
 #endif
     }
 }
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e8073f5..c162cd4 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -19150,16 +19150,18 @@
    (set_attr "length_immediate" "0")
    (set_attr "modrm" "0")])
 
-;; Pad to 16-byte boundary, max skip in op0.  Used to avoid
+;; Pad to 1 << op0 byte boundary, max skip in op1.  Used to avoid
 ;; branch prediction penalty for the third jump in a 16-byte
 ;; block on K8.
+;; Also it's used to align tight loops which can fit into 1 cacheline.
+;; It can help code prefetch and reduce DSB misses.
 
-(define_insn "pad"
-  [(unspec_volatile [(match_operand 0)] UNSPECV_ALIGN)]
+(define_insn "max_skip_align"
+  [(unspec_volatile [(match_operand 0) (match_operand 1)] UNSPECV_ALIGN)]
   ""
 {
 #ifdef ASM_OUTPUT_MAX_SKIP_ALIGN
-  ASM_OUTPUT_MAX_SKIP_ALIGN (asm_out_file, 4, (int)INTVAL (operands[0]));
+  ASM_OUTPUT_MAX_SKIP_ALIGN (asm_out_file, (int)INTVAL (operands[0]), (int)INTVAL (operands[1]));
 #else
   /* It is tempting to use ASM_OUTPUT_ALIGN here, but we don't want to do that.
     The align insn is used to avoid 3 jump instructions in the row to improve
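To make the renamed pattern's two operands concrete, here is a minimal sketch of how (align_log2, max_skip) plausibly map onto the GNU as `.p2align` directive. This is an illustration under stated assumptions, not the target macro itself; the real ASM_OUTPUT_MAX_SKIP_ALIGN definition lives in the target headers and may differ in detail.

```c
#include <stdio.h>

/* Sketch (not GCC source) of the directive emitted for the two
   max_skip_align operands: operand 0 is the log2 of the alignment,
   operand 1 is the maximum number of padding bytes allowed.  */
static void
emit_max_skip_align (FILE *file, int align_log2, int max_skip)
{
  if (max_skip == 0 || max_skip >= (1 << align_log2) - 1)
    /* No effective skip limit: align unconditionally.  This is the
       tight-loop case, gen_max_skip_align (GEN_INT (ceil_log2 (size)),
       GEN_INT (0)).  */
    fprintf (file, "\t.p2align %d\n", align_log2);
  else
    /* Bounded padding: give up if more than max_skip bytes would be
       inserted.  This is the jump-mispredict case,
       gen_max_skip_align (GEN_INT (4), GEN_INT (padsize)).  */
    fprintf (file, "\t.p2align %d,,%d\n", align_log2, max_skip);
}

int
main (void)
{
  emit_max_skip_align (stdout, 4, 10); /* -> .p2align 4,,10 */
  emit_max_skip_align (stdout, 5, 0);  /* -> .p2align 5 */
  return 0;
}
```

The key behavioral difference from the old `pad` pattern: `pad` always aligned to 16 bytes with a caller-supplied skip limit, while `max_skip_align` lets the caller pick both the boundary and the limit, so a tight loop can be aligned to exactly ceil_log2 (size) with no skip limit at all.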