aboutsummaryrefslogtreecommitdiff
path: root/gcc/tree-vectorizer.h
diff options
context:
space:
mode:
authorFeng Xue <fxue@os.amperecomputing.com>2024-05-29 17:22:36 +0800
committerFeng Xue <fxue@os.amperecomputing.com>2024-07-17 21:54:05 +0800
commit178cc419512f7e358f88dfe2336625aa99cd7438 (patch)
tree518eacbefd2e5457b33c4d3a0cc87c3943ff6306 /gcc/tree-vectorizer.h
parent8b59fa9d8ca25bdf0792390a8bdeae151532a530 (diff)
downloadgcc-178cc419512f7e358f88dfe2336625aa99cd7438.zip
gcc-178cc419512f7e358f88dfe2336625aa99cd7438.tar.gz
gcc-178cc419512f7e358f88dfe2336625aa99cd7438.tar.bz2
vect: Support multiple lane-reducing operations for loop reduction [PR114440]
For lane-reducing operation(dot-prod/widen-sum/sad) in loop reduction, current vectorizer could only handle the pattern if the reduction chain does not contain other operation, no matter the other is normal or lane-reducing. This patches removes some constraints in reduction analysis to allow multiple arbitrary lane-reducing operations with mixed input vectypes in a loop reduction chain. For example: int sum = 1; for (i) { sum += d0[i] * d1[i]; // dot-prod <vector(16) char> sum += w[i]; // widen-sum <vector(16) char> sum += abs(s0[i] - s1[i]); // sad <vector(8) short> } The vector size is 128-bit vectorization factor is 16. Reduction statements would be transformed as: vector<4> int sum_v0 = { 0, 0, 0, 1 }; vector<4> int sum_v1 = { 0, 0, 0, 0 }; vector<4> int sum_v2 = { 0, 0, 0, 0 }; vector<4> int sum_v3 = { 0, 0, 0, 0 }; for (i / 16) { sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0); sum_v1 = sum_v1; // copy sum_v2 = sum_v2; // copy sum_v3 = sum_v3; // copy sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0); sum_v1 = sum_v1; // copy sum_v2 = sum_v2; // copy sum_v3 = sum_v3; // copy sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0); sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1); sum_v2 = sum_v2; // copy sum_v3 = sum_v3; // copy } sum_v = sum_v0 + sum_v1 + sum_v2 + sum_v3; // = sum_v0 + sum_v1 2024-03-22 Feng Xue <fxue@os.amperecomputing.com> gcc/ PR tree-optimization/114440 * tree-vectorizer.h (vectorizable_lane_reducing): New function declaration. * tree-vect-stmts.cc (vect_analyze_stmt): Call new function vectorizable_lane_reducing to analyze lane-reducing operation. * tree-vect-loop.cc (vect_model_reduction_cost): Remove cost computation code related to emulated_mixed_dot_prod. (vectorizable_lane_reducing): New function. (vectorizable_reduction): Allow multiple lane-reducing operations in loop reduction. Move some original lane-reducing related code to vectorizable_lane_reducing. (vect_transform_reduction): Adjust comments with updated example. gcc/testsuite/ PR tree-optimization/114440 * gcc.dg/vect/vect-reduc-chain-1.c * gcc.dg/vect/vect-reduc-chain-2.c * gcc.dg/vect/vect-reduc-chain-3.c * gcc.dg/vect/vect-reduc-chain-dot-slp-1.c * gcc.dg/vect/vect-reduc-chain-dot-slp-2.c * gcc.dg/vect/vect-reduc-chain-dot-slp-3.c * gcc.dg/vect/vect-reduc-chain-dot-slp-4.c * gcc.dg/vect/vect-reduc-dot-slp-1.c
Diffstat (limited to 'gcc/tree-vectorizer.h')
-rw-r--r--gcc/tree-vectorizer.h2
1 files changed, 2 insertions, 0 deletions
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 1e2121a..d8be89c 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2485,6 +2485,8 @@ extern loop_vec_info vect_create_loop_vinfo (class loop *, vec_info_shared *,
extern bool vectorizable_live_operation (vec_info *, stmt_vec_info,
slp_tree, slp_instance, int,
bool, stmt_vector_for_cost *);
+extern bool vectorizable_lane_reducing (loop_vec_info, stmt_vec_info,
+ slp_tree, stmt_vector_for_cost *);
extern bool vectorizable_reduction (loop_vec_info, stmt_vec_info,
slp_tree, slp_instance,
stmt_vector_for_cost *);