author	liuhongt <hongtao.liu@intel.com>	2024-05-29 11:14:26 +0800
committer	Haochen Jiang <haochen.jiang@intel.com>	2024-05-29 11:14:26 +0800
commit	b644126237a1aa8599f767a5e0bbada1d7286f44 (patch)
tree	7536f7dff5c2ddda9fedfb2c0815ec03bbdd1ddf /gcc/config
parent	00ed5424b1d4dcccfa187f55205521826794898c (diff)
Align tight & hot loops without considering max skip bytes.
When a hot loop is small enough to fit into one cacheline, we should align
the loop to ceil_log2 (loop_size) without considering the maximum skip
bytes. This helps code prefetch.

gcc/ChangeLog:

	* config/i386/i386.cc (ix86_avoid_jump_mispredicts): Change
	gen_pad to gen_max_skip_align.
	(ix86_align_loops): New function.
	(ix86_reorg): Call ix86_align_loops.
	* config/i386/i386.md (pad): Rename to ..
	(max_skip_align): .. this, and accept 2 operands for align and
	skip.
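
As an illustration of the alignment choice (a minimal standalone sketch, not
part of the patch): ceil_log2 rounds a size up to the next power-of-two
exponent, so a 48-byte loop body is aligned to 1 << 6 = 64 bytes and, with a
64-byte cacheline, is guaranteed to land in a single line. The ceil_log2
below is a stand-in for GCC's own, and loop_size is a made-up example value.

    #include <cstdio>

    /* Smallest n such that (1 << n) >= x; mimics GCC's ceil_log2.  */
    static int
    ceil_log2 (unsigned long x)
    {
      int n = 0;
      while ((1UL << n) < x)
        n++;
      return n;
    }

    int main ()
    {
      unsigned long loop_size = 48;  /* example hot-loop body size, bytes */
      int align = ceil_log2 (loop_size);
      /* With max skip 0 the padding is emitted unconditionally, so the
         loop starts on a 64-byte boundary and fits one cacheline.  */
      std::printf (".p2align %d\n", align);  /* prints ".p2align 6" */
      return 0;
    }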
Diffstat (limited to 'gcc/config')
-rw-r--r--	gcc/config/i386/i386.cc	148
-rw-r--r--	gcc/config/i386/i386.md	10
2 files changed, 153 insertions, 5 deletions
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 85d87b9..1a0206a 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -23146,7 +23146,7 @@ ix86_avoid_jump_mispredicts (void)
           if (dump_file)
             fprintf (dump_file, "Padding insn %i by %i bytes!\n",
                      INSN_UID (insn), padsize);
-          emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
+          emit_insn_before (gen_max_skip_align (GEN_INT (4), GEN_INT (padsize)), insn);
         }
     }
 }
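
(For context, not part of the patch: on x86 ELF targets
ASM_OUTPUT_MAX_SKIP_ALIGN typically expands to a .p2align directive. The
standalone sketch below assumes that definition and shows the shape of the
output produced by the call above, with padsize == 7 as an example value.)

    #include <cstdio>

    /* Hedged stand-in for the target macro: ".p2align ALIGN,,MAX_SKIP"
       aligns to 1 << ALIGN bytes, but gives up if more than MAX_SKIP
       bytes of nop padding would be needed.  */
    static void
    output_max_skip_align (std::FILE *file, int align, int max_skip)
    {
      if (max_skip)
        std::fprintf (file, "\t.p2align %d,,%d\n", align, max_skip);
      else
        std::fprintf (file, "\t.p2align %d\n", align);
    }

    int main ()
    {
      /* gen_max_skip_align (GEN_INT (4), GEN_INT (padsize)) with
         padsize == 7 ends up as ".p2align 4,,7".  */
      output_max_skip_align (stdout, 4, 7);
      return 0;
    }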
@@ -23419,6 +23419,150 @@ ix86_split_stlf_stall_load ()
     }
 }
+/* When a hot loop can fit into one cacheline,
+   force align the loop without considering the max skip.  */
+static void
+ix86_align_loops ()
+{
+  basic_block bb;
+
+  /* Don't do this when we don't know the cache line size.  */
+  if (ix86_cost->prefetch_block == 0)
+    return;
+
+  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
+  profile_count count_threshold = cfun->cfg->count_max / param_align_threshold;
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      rtx_insn *label = BB_HEAD (bb);
+      bool has_fallthru = false;
+      edge e;
+      edge_iterator ei;
+
+      if (!LABEL_P (label))
+        continue;
+
+      profile_count fallthru_count = profile_count::zero ();
+      profile_count branch_count = profile_count::zero ();
+
+      FOR_EACH_EDGE (e, ei, bb->preds)
+        {
+          if (e->flags & EDGE_FALLTHRU)
+            has_fallthru = true, fallthru_count += e->count ();
+          else
+            branch_count += e->count ();
+        }
+
+      if (!fallthru_count.initialized_p () || !branch_count.initialized_p ())
+        continue;
+
+      if (bb->loop_father
+          && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun)
+          && (has_fallthru
+              ? (!(single_succ_p (bb)
+                   && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun))
+                 && optimize_bb_for_speed_p (bb)
+                 && branch_count + fallthru_count > count_threshold
+                 && (branch_count > fallthru_count * param_align_loop_iterations))
+              /* In case there's no fallthru for the loop,
+                 nops inserted won't be executed.  */
+              : (branch_count > count_threshold
+                 || (bb->count > bb->prev_bb->count * 10
+                     && (bb->prev_bb->count
+                         <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2)))))
+        {
+          rtx_insn *insn, *end_insn;
+          HOST_WIDE_INT size = 0;
+          bool padding_p = true;
+          basic_block tbb = bb;
+          unsigned cond_branch_num = 0;
+          bool detect_tight_loop_p = false;
+
+          for (unsigned int i = 0; i != bb->loop_father->num_nodes;
+               i++, tbb = tbb->next_bb)
+            {
+              /* Only handle continuous cfg layout.  */
+              if (bb->loop_father != tbb->loop_father)
+                {
+                  padding_p = false;
+                  break;
+                }
+
+              FOR_BB_INSNS (tbb, insn)
+                {
+                  if (!NONDEBUG_INSN_P (insn))
+                    continue;
+                  size += ix86_min_insn_size (insn);
+
+                  /* We don't know the size of inline asm.
+                     Don't align a loop containing a call.  */
+                  if (asm_noperands (PATTERN (insn)) >= 0
+                      || CALL_P (insn))
+                    {
+                      size = -1;
+                      break;
+                    }
+                }
+
+              if (size == -1 || size > ix86_cost->prefetch_block)
+                {
+                  padding_p = false;
+                  break;
+                }
+
+              FOR_EACH_EDGE (e, ei, tbb->succs)
+                {
+                  /* It could be part of the loop.  */
+                  if (e->dest == bb)
+                    {
+                      detect_tight_loop_p = true;
+                      break;
+                    }
+                }
+
+              if (detect_tight_loop_p)
+                break;
+
+              end_insn = BB_END (tbb);
+              if (JUMP_P (end_insn))
+                {
+                  /* For the decoded icache:
+                     1. Up to two branches are allowed per Way.
+                     2. A non-conditional branch is the last micro-op in a Way.
+                  */
+                  if (onlyjump_p (end_insn)
+                      && (any_uncondjump_p (end_insn)
+                          || single_succ_p (tbb)))
+                    {
+                      padding_p = false;
+                      break;
+                    }
+                  else if (++cond_branch_num >= 2)
+                    {
+                      padding_p = false;
+                      break;
+                    }
+                }
+
+            }
+
+          if (padding_p && detect_tight_loop_p)
+            {
+              emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)),
+                                                    GEN_INT (0)), label);
+              /* End of function.  */
+              if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun))
+                break;
+              /* Skip bbs which already fit into one cacheline.  */
+              bb = tbb;
+            }
+        }
+    }
+
+  loop_optimizer_finalize ();
+  free_dominance_info (CDI_DOMINATORS);
+}
+
 /* Implement machine specific optimizations.  We implement padding of returns
    for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
 static void
@@ -23442,6 +23586,8 @@ ix86_reorg (void)
 #ifdef ASM_OUTPUT_MAX_SKIP_ALIGN
       if (TARGET_FOUR_JUMP_LIMIT)
         ix86_avoid_jump_mispredicts ();
+
+      ix86_align_loops ();
 #endif
     }
 }
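
(Aside, not part of the patch: the profile test that gates ix86_align_loops
above can be read as the predicate below. align_threshold and
align_loop_iterations correspond to the existing --param align-threshold
and --param align-loop-iterations knobs; the counts and the values 100 and
4 used here are example numbers for illustration.)

    #include <cstdint>
    #include <cstdio>

    /* Hedged restatement of the fallthru case in ix86_align_loops: align
       only when the loop is hot overall and is entered via the back edge
       far more often than by falling in from above.  */
    static bool
    worth_aligning (uint64_t branch_count, uint64_t fallthru_count,
                    uint64_t count_max, uint64_t align_threshold,
                    uint64_t align_loop_iterations)
    {
      uint64_t count_threshold = count_max / align_threshold;
      return branch_count + fallthru_count > count_threshold
             && branch_count > fallthru_count * align_loop_iterations;
    }

    int main ()
    {
      /* Example: back edge taken 100000 times, fallen into 100 times.  */
      std::printf ("%s\n",
                   worth_aligning (100000, 100, 1000000, 100, 4)
                   ? "align" : "skip");
      return 0;
    }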
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e8073f5..c162cd4 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -19150,16 +19150,18 @@
    (set_attr "length_immediate" "0")
    (set_attr "modrm" "0")])
 
-;; Pad to 16-byte boundary, max skip in op0.  Used to avoid
+;; Pad to 1 << op0 byte boundary, max skip in op1.  Used to avoid
 ;; branch prediction penalty for the third jump in a 16-byte
 ;; block on K8.
+;; Also it's used to align tight loops which can fit into one cacheline.
+;; It can help code prefetch and reduce DSB misses.
-(define_insn "pad"
-  [(unspec_volatile [(match_operand 0)] UNSPECV_ALIGN)]
+(define_insn "max_skip_align"
+  [(unspec_volatile [(match_operand 0) (match_operand 1)] UNSPECV_ALIGN)]
   ""
 {
 #ifdef ASM_OUTPUT_MAX_SKIP_ALIGN
-  ASM_OUTPUT_MAX_SKIP_ALIGN (asm_out_file, 4, (int)INTVAL (operands[0]));
+  ASM_OUTPUT_MAX_SKIP_ALIGN (asm_out_file, (int)INTVAL (operands[0]), (int)INTVAL (operands[1]));
 #else
   /* It is tempting to use ASM_OUTPUT_ALIGN here, but we don't want to do that.
      The align insn is used to avoid 3 jump instructions in the row to improve