author    Ju-Zhe Zhong <juzhe.zhong@rivai.ai>  2023-07-23 12:23:33 +0800
committer Lehua Ding <lehua.ding@rivai.ai>     2023-07-24 16:22:30 +0800
commit    42b17d00c33dd9994dee973c001d85dc09b79b24 (patch)
tree      adf8a0e1ec69f261df6e55babc49de86f588f726 /gcc/tree-vect-loop.cc
parent    50b5feaa94c26d01fed13f1119f025ae2bc75d2b (diff)
VECT: Support floating-point in-order reduction for length loop control
Hi, Richard and Richi.

This patch supports floating-point in-order reduction under length-based
loop control.  Consider the following case:

  float
  foo (float *__restrict a, int n)
  {
    float result = 1.0;
    for (int i = 0; i < n; i++)
      result += a[i];
    return result;
  }

When compiled without -ffast-math on ARM SVE, we end up with:

  loop_mask = WHILE_ULT
  result = MASK_FOLD_LEFT_PLUS (...loop_mask...)

For RVV, we use length-based loop control instead of a mask.  So, with this
patch, we expect to see:

  loop_len = SELECT_VL
  result = MASK_LEN_FOLD_LEFT_PLUS (...loop_len...)

gcc/ChangeLog:

        * tree-vect-loop.cc (get_masked_reduction_fn): Add
        mask_len_fold_left_plus.
        (vectorize_fold_left_reduction): Ditto.
        (vectorizable_reduction): Ditto.
        (vect_transform_reduction): Ditto.
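To make the semantics of the new internal function concrete, here is a
hypothetical scalar model (illustrative only, not GCC source; the function
name, the mask representation, and the bias handling are assumptions):
MASK_LEN_FOLD_LEFT_PLUS folds in, strictly from left to right, only the
lanes that are active in the mask and whose index is below len + bias,
which is why it can stand in for the scalar loop without -ffast-math.

  /* Hypothetical scalar model of MASK_LEN_FOLD_LEFT_PLUS (not GCC code).
     Starting from INIT, accumulate in order the lanes of VEC that are
     active in MASK and whose index is below LEN + BIAS.  */
  static float
  mask_len_fold_left_plus_model (float init, const float *vec, int nunits,
                                 const unsigned char *mask, int len, int bias)
  {
    float result = init;
    for (int i = 0; i < nunits; i++)
      if (i < len + bias && mask[i])
        result += vec[i];  /* strictly in order, no reassociation */
    return result;
  }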
Diffstat (limited to 'gcc/tree-vect-loop.cc')
-rw-r--r--  gcc/tree-vect-loop.cc | 41
1 file changed, 36 insertions(+), 5 deletions(-)
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 71589b2..ac86b09 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -6813,11 +6813,13 @@ static internal_fn
get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
{
internal_fn mask_reduc_fn;
+ internal_fn mask_len_reduc_fn;
switch (reduc_fn)
{
case IFN_FOLD_LEFT_PLUS:
mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
+ mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
break;
default:
@@ -6827,6 +6829,9 @@ get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
OPTIMIZE_FOR_SPEED))
return mask_reduc_fn;
+ if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
+ OPTIMIZE_FOR_SPEED))
+ return mask_len_reduc_fn;
return IFN_LAST;
}
@@ -6847,7 +6852,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
gimple *reduc_def_stmt,
tree_code code, internal_fn reduc_fn,
tree ops[3], tree vectype_in,
- int reduc_index, vec_loop_masks *masks)
+ int reduc_index, vec_loop_masks *masks,
+ vec_loop_lens *lens)
{
class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
@@ -6909,8 +6915,18 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
{
gimple *new_stmt;
tree mask = NULL_TREE;
+ tree len = NULL_TREE;
+ tree bias = NULL_TREE;
if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
+ if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+ {
+ len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
+ i, 1);
+ signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+ bias = build_int_cst (intQI_type_node, biasval);
+ mask = build_minus_one_cst (truth_type_for (vectype_in));
+ }
/* Handle MINUS by adding the negative. */
if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
@@ -6930,7 +6946,10 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
the preceding operation. */
if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
{
- if (mask && mask_reduc_fn != IFN_LAST)
+ if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
+ new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
+ def0, mask, len, bias);
+ else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
def0, mask);
else
@@ -7992,6 +8011,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
{
vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+ vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
if (reduction_type != FOLD_LEFT_REDUCTION
@@ -8019,8 +8039,17 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
}
else
- vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
- vectype_in, NULL);
+ {
+ internal_fn mask_reduc_fn
+ = get_masked_reduction_fn (reduc_fn, vectype_in);
+
+ if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
+ vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
+ vectype_in, 1);
+ else
+ vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
+ vectype_in, NULL);
+ }
}
return true;
}
@@ -8150,6 +8179,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
code_helper code = canonicalize_code (op.code, op.type);
internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+ vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
/* Transform. */
@@ -8175,7 +8205,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
gcc_assert (code.is_tree_code ());
return vectorize_fold_left_reduction
(loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
- tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks);
+ tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
+ lens);
}
bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
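As a usage sketch building on the scalar model above (again hypothetical;
the vectorization factor and the zero bias are assumptions, and the all-ones
mask mirrors what the patch builds when only the length controls the loop),
a length-controlled strip-mined loop reproduces foo's result bit-for-bit
because every partial sum is folded in order:

  /* Hypothetical strip-mined driver: LEN plays the role of the value
     SELECT_VL would produce on each iteration.  */
  float
  foo_len_model (const float *a, int n)
  {
    enum { VF = 4 };                          /* assumed vectorization factor */
    unsigned char mask[VF] = { 1, 1, 1, 1 };  /* all lanes active */
    float result = 1.0f;
    for (int i = 0; i < n; i += VF)
      {
        int len = (n - i < VF) ? n - i : VF;  /* what SELECT_VL would pick */
        result = mask_len_fold_left_plus_model (result, a + i, VF, mask,
                                                len, 0);
      }
    return result;
  }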