aboutsummaryrefslogtreecommitdiff
path: root/gcc/tree-vectorizer.h
diff options
context:
space:
mode:
authorFeng Xue <fxue@os.amperecomputing.com>2024-05-29 17:28:14 +0800
committerFeng Xue <fxue@os.amperecomputing.com>2024-07-17 21:54:06 +0800
commitdb3c8c9726d0bafbb9f85b6d7027fe83602643e7 (patch)
tree8acb3e367d203bfb78eb13aeae7a125ad9eda411 /gcc/tree-vectorizer.h
parent178cc419512f7e358f88dfe2336625aa99cd7438 (diff)
downloadgcc-db3c8c9726d0bafbb9f85b6d7027fe83602643e7.zip
gcc-db3c8c9726d0bafbb9f85b6d7027fe83602643e7.tar.gz
gcc-db3c8c9726d0bafbb9f85b6d7027fe83602643e7.tar.bz2
vect: Optimize order of lane-reducing operations in loop def-use cycles
When transforming multiple lane-reducing operations in a loop reduction chain, the corresponding vectorized statements were originally generated into def-use cycles starting from 0. A def-use cycle with a smaller index would therefore contain more statements, which means more instruction dependencies. For example:

  int sum = 1;
  for (i)
    {
      sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
      sum += w[i];               // widen-sum <vector(16) char>
      sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
      sum += n[i];               // normal <vector(4) int>
    }

Original transformation result:

  for (i / 16)
    {
      sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
      sum_v1 = sum_v1;  // copy
      sum_v2 = sum_v2;  // copy
      sum_v3 = sum_v3;  // copy

      sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0);
      sum_v1 = sum_v1;  // copy
      sum_v2 = sum_v2;  // copy
      sum_v3 = sum_v3;  // copy

      sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
      sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
      sum_v2 = sum_v2;  // copy
      sum_v3 = sum_v3;  // copy

      ...
    }

For higher instruction parallelism in the final vectorized loop, a better approach is to distribute those effective vector lane-reducing ops evenly among all def-use cycles. Transformed as below, DOT_PROD, WIDEN_SUM and SADs are generated into disparate cycles, so the instruction dependency among them can be eliminated.

  for (i / 16)
    {
      sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
      sum_v1 = sum_v1;  // copy
      sum_v2 = sum_v2;  // copy
      sum_v3 = sum_v3;  // copy

      sum_v0 = sum_v0;  // copy
      sum_v1 = WIDEN_SUM (w_v1[i: 0 ~ 15], sum_v1);
      sum_v2 = sum_v2;  // copy
      sum_v3 = sum_v3;  // copy

      sum_v0 = sum_v0;  // copy
      sum_v1 = sum_v1;  // copy
      sum_v2 = SAD (s0_v2[i: 0 ~ 7 ], s1_v2[i: 0 ~ 7 ], sum_v2);
      sum_v3 = SAD (s0_v3[i: 8 ~ 15], s1_v3[i: 8 ~ 15], sum_v3);

      ...
    }

2024-03-22 Feng Xue <fxue@os.amperecomputing.com>

gcc/
PR tree-optimization/114440
* tree-vectorizer.h (struct _stmt_vec_info): Add a new field reduc_result_pos.
* tree-vect-loop.cc (vect_transform_reduction): Generate lane-reducing statements in an optimized order.
Diffstat (limited to 'gcc/tree-vectorizer.h')
-rw-r--r--gcc/tree-vectorizer.h6
1 files changed, 6 insertions, 0 deletions
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index d8be89c..df6c8ad 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1402,6 +1402,12 @@ public:
/* The vector type for performing the actual reduction. */
tree reduc_vectype;
+ /* For loop reduction with multiple vectorized results (ncopies > 1), a
+ lane-reducing operation participating in it may not use all of those
+ results.  This field specifies the result index starting from which any
+ following lane-reducing operation would be assigned.  */
+ unsigned int reduc_result_pos;
+
/* If IS_REDUC_INFO is true and if the vector code is performing
N scalar reductions in parallel, this variable gives the initial
scalar values of those N reductions. */