author     Juzhe-Zhong <juzhe.zhong@rivai.ai>  2023-08-15 20:29:17 +0800
committer  Pan Li <pan2.li@intel.com>          2023-08-16 14:05:29 +0800
commit     d5acdd62f090a472026c36fee3dfeb45f2de8429 (patch)
tree       e372ac65f97d12fafd9be83da700c0f295742350 /gcc/tree-vectorizer.h
parent     c6f65ce9483131b1996cbddf8aaaebe0d8e5141c (diff)
VECT: Apply MASK_LEN_{LOAD_LANES, STORE_LANES} into vectorizer
Hi, Richard and Richi.

This patch adds MASK_LEN_{LOAD_LANES,STORE_LANES} support to the vectorizer.
Consider this simple case:

void __attribute__ ((noinline, noclone))
foo (int *__restrict a, int *__restrict b, int *__restrict c,
     int *__restrict d, int *__restrict e, int *__restrict f,
     int *__restrict g, int *__restrict h, int *__restrict j, int n)
{
  for (int i = 0; i < n; ++i)
    {
      a[i] = j[i * 8];
      b[i] = j[i * 8 + 1];
      c[i] = j[i * 8 + 2];
      d[i] = j[i * 8 + 3];
      e[i] = j[i * 8 + 4];
      f[i] = j[i * 8 + 5];
      g[i] = j[i * 8 + 6];
      h[i] = j[i * 8 + 7];
    }
}

RVV Gimple IR:

  _79 = .SELECT_VL (ivtmp_81, POLY_INT_CST [4, 4]);
  ivtmp_125 = _79 * 32;
  vect_array.8 = .MASK_LEN_LOAD_LANES (vectp_j.6_124, 32B, { -1, ... }, _79, 0);
  vect__8.9_122 = vect_array.8[0];
  vect__8.10_121 = vect_array.8[1];
  vect__8.11_120 = vect_array.8[2];
  vect__8.12_119 = vect_array.8[3];
  vect__8.13_118 = vect_array.8[4];
  vect__8.14_117 = vect_array.8[5];
  vect__8.15_116 = vect_array.8[6];
  vect__8.16_115 = vect_array.8[7];
  vect_array.8 ={v} {CLOBBER};
  ivtmp_114 = _79 * 4;
  .MASK_LEN_STORE (vectp_a.17_113, 32B, { -1, ... }, _79, 0, vect__8.9_122);
  .MASK_LEN_STORE (vectp_b.19_109, 32B, { -1, ... }, _79, 0, vect__8.10_121);
  .MASK_LEN_STORE (vectp_c.21_105, 32B, { -1, ... }, _79, 0, vect__8.11_120);
  .MASK_LEN_STORE (vectp_d.23_101, 32B, { -1, ... }, _79, 0, vect__8.12_119);
  .MASK_LEN_STORE (vectp_e.25_97, 32B, { -1, ... }, _79, 0, vect__8.13_118);
  .MASK_LEN_STORE (vectp_f.27_93, 32B, { -1, ... }, _79, 0, vect__8.14_117);
  .MASK_LEN_STORE (vectp_g.29_89, 32B, { -1, ... }, _79, 0, vect__8.15_116);
  .MASK_LEN_STORE (vectp_h.31_85, 32B, { -1, ... }, _79, 0, vect__8.16_115);

ASM:

foo:
        lw      t4,8(sp)
        ld      t5,0(sp)
        ble     t4,zero,.L5
.L3:
        vsetvli t1,t4,e8,mf4,ta,ma
        vlseg8e32.v     v8,(t5)
        slli    t3,t1,2
        slli    t6,t1,5
        vse32.v v8,0(a0)
        vse32.v v9,0(a1)
        vse32.v v10,0(a2)
        vse32.v v11,0(a3)
        vse32.v v12,0(a4)
        vse32.v v13,0(a5)
        vse32.v v14,0(a6)
        vse32.v v15,0(a7)
        sub     t4,t4,t1
        add     t5,t5,t6
        add     a0,a0,t3
        add     a1,a1,t3
        add     a2,a2,t3
        add     a3,a3,t3
        add     a4,a4,t3
        add     a5,a5,t3
        add     a6,a6,t3
        add     a7,a7,t3
        bne     t4,zero,.L3
.L5:
        ret

The details of the approach:

Step 1 - Modify the LANES LOAD/STORE support functions
(vect_load_lanes_supported/vect_store_lanes_supported):

+/* Return FN if vec_{masked_,mask_len,}load_lanes is available for COUNT
+   vectors of type VECTYPE.  MASKED_P says whether the masked form is needed. */

-bool
+internal_fn
 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
                            bool masked_p)
 {
-  if (masked_p)
-    return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
-                                         vec_mask_load_lanes_optab,
-                                         vectype, count);
+  if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
+                                    vec_mask_len_load_lanes_optab,
+                                    vectype, count))
+    return IFN_MASK_LEN_LOAD_LANES;
+  else if (masked_p)
+    {
+      if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
+                                        vec_mask_load_lanes_optab,
+                                        vectype, count))
+        return IFN_MASK_LOAD_LANES;
+    }
   else
-    return vect_lanes_optab_supported_p ("vec_load_lanes",
-                                         vec_load_lanes_optab,
-                                         vectype, count);
+    {
+      if (vect_lanes_optab_supported_p ("vec_load_lanes",
+                                        vec_load_lanes_optab,
+                                        vectype, count))
+        return IFN_LOAD_LANES;
+    }
+  return IFN_LAST;
 }

Instead of returning TRUE or FALSE depending on whether the target supports
LANES LOAD/STORE, the function now returns the internal_fn of the LANES
LOAD/STORE variant the target supports, or IFN_LAST if the target supports
none of the LANES LOAD/STORE optabs.
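[Editorial note, not part of the quoted patch text: vect_store_lanes_supported is
reworked the same way (see the ChangeLog below).  A minimal sketch of the
store-side counterpart, assuming the store-lanes optabs and internal functions
mirror the load-side ones shown above; the exact body in the patch may differ:]

internal_fn
vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
                            bool masked_p)
{
  /* Prefer the combined mask+len form when the target provides it.  */
  if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
                                    vec_mask_len_store_lanes_optab,
                                    vectype, count))
    return IFN_MASK_LEN_STORE_LANES;
  else if (masked_p)
    {
      if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
                                        vec_mask_store_lanes_optab,
                                        vectype, count))
        return IFN_MASK_STORE_LANES;
    }
  else
    {
      if (vect_lanes_optab_supported_p ("vec_store_lanes",
                                        vec_store_lanes_optab,
                                        vectype, count))
        return IFN_STORE_LANES;
    }
  /* No store-lanes variant is available for this mode/count.  */
  return IFN_LAST;
}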
Step 2 - Compute the IFN for LANES LOAD/STORE (computed only once):

  if (!STMT_VINFO_STRIDED_P (first_stmt_info)
      && (can_overrun_p || !would_overrun_p)
      && compare_step_with_zero (vinfo, stmt_info) > 0)
    {
      /* First cope with the degenerate case of a single-element
         vector.  */
      if (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U))
        ;
      else
        {
          /* Otherwise try using LOAD/STORE_LANES.  */
          *lanes_ifn
            = vls_type == VLS_LOAD
                ? vect_load_lanes_supported (vectype, group_size, masked_p)
                : vect_store_lanes_supported (vectype, group_size, masked_p);
          if (*lanes_ifn != IFN_LAST)
            {
              *memory_access_type = VMAT_LOAD_STORE_LANES;
              overrun_p = would_overrun_p;
            }
          /* If that fails, try using permuting loads.  */
          else if (vls_type == VLS_LOAD
                   ? vect_grouped_load_supported (vectype, single_element_p,
                                                  group_size)
                   : vect_grouped_store_supported (vectype, group_size))
            {
              *memory_access_type = VMAT_CONTIGUOUS_PERMUTE;
              overrun_p = would_overrun_p;
            }
        }
    }

Step 3 - Build the MASK_LEN_{LOAD_LANES,STORE_LANES} Gimple IR:

+  if (lanes_ifn == IFN_MASK_LEN_STORE_LANES)
+    {
+      if (loop_lens)
+        final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+                                       ncopies, vectype, j, 1);
+      else
+        final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
+      signed char biasval
+        = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+      bias = build_int_cst (intQI_type_node, biasval);
+      if (!final_mask)
+        {
+          mask_vectype = truth_type_for (vectype);
+          final_mask = build_minus_one_cst (mask_vectype);
+        }
+    }
+
   gcall *call;
-  if (final_mask)
+  if (final_len && final_mask)
+    {
+      /* Emit:
+           MASK_LEN_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
+                                 LEN, BIAS, VEC_ARRAY).  */
+      unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
+      tree alias_ptr = build_int_cst (ref_type, align);
+      call = gimple_build_call_internal (IFN_MASK_LEN_STORE_LANES, 6,
+                                         dataref_ptr, alias_ptr,
+                                         final_mask, final_len, bias,
+                                         vec_array);
+    }
+  else if (final_mask)

The LEN and MASK handling is exactly the same as for the other MASK_LEN_*
loads/stores (an illustrative load-side emission sketch is given after the
ChangeLog below).

gcc/ChangeLog:

        * internal-fn.cc (internal_load_fn_p): Apply
        MASK_LEN_{LOAD_LANES,STORE_LANES} into vectorizer.
        (internal_store_fn_p): Ditto.
        (internal_fn_len_index): Ditto.
        (internal_fn_mask_index): Ditto.
        (internal_fn_stored_value_index): Ditto.
        * tree-vect-data-refs.cc (vect_store_lanes_supported): Ditto.
        (vect_load_lanes_supported): Ditto.
        * tree-vect-loop.cc: Ditto.
        * tree-vect-slp.cc (vect_slp_prefer_store_lanes_p): Ditto.
        * tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
        (get_group_load_store_type): Ditto.
        (vectorizable_store): Ditto.
        (vectorizable_load): Ditto.
        * tree-vectorizer.h (vect_store_lanes_supported): Ditto.
        (vect_load_lanes_supported): Ditto.
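[Editorial note, not part of the quoted patch text: on the load side,
vectorizable_load emits the matching five-argument IFN_MASK_LEN_LOAD_LANES call
whose result is the whole vector array, as in the Gimple IR shown earlier.  A
minimal sketch, assuming the same surrounding variables as the store hunk above
(dataref_ptr, ref_type, final_mask, final_len, bias, vec_array); the exact hunk
in the patch may differ:]

  gcall *call;
  if (final_len && final_mask)
    {
      /* Emit:
           VEC_ARRAY = MASK_LEN_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
                                            VEC_MASK, LEN, BIAS).  */
      unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
      tree alias_ptr = build_int_cst (ref_type, align);
      call = gimple_build_call_internal (IFN_MASK_LEN_LOAD_LANES, 5,
                                         dataref_ptr, alias_ptr,
                                         final_mask, final_len, bias);
      /* The call produces the whole vector array; each lane vector
         (vect_array.8[0], vect_array.8[1], ...) is extracted afterwards.  */
      gimple_call_set_lhs (call, vec_array);
      gimple_call_set_nothrow (call, true);
      vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
    }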
Diffstat (limited to 'gcc/tree-vectorizer.h')
-rw-r--r--  gcc/tree-vectorizer.h  4
1 file changed, 2 insertions, 2 deletions
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 1de1449..53a3d78 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2297,9 +2297,9 @@ extern tree bump_vector_ptr (vec_info *, tree, gimple *, gimple_stmt_iterator *,
extern void vect_copy_ref_info (tree, tree);
extern tree vect_create_destination_var (tree, tree);
extern bool vect_grouped_store_supported (tree, unsigned HOST_WIDE_INT);
-extern bool vect_store_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
+extern internal_fn vect_store_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
extern bool vect_grouped_load_supported (tree, bool, unsigned HOST_WIDE_INT);
-extern bool vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
+extern internal_fn vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
 extern void vect_permute_store_chain (vec_info *, vec<tree> &,
                                       unsigned int, stmt_vec_info,
                                       gimple_stmt_iterator *, vec<tree> *);