diff options
author | chenxiaolong <chenxiaolong@loongson.cn> | 2025-01-07 21:04:51 +0800 |
---|---|---|
committer | Lulu Cheng <chenglulu@loongson.cn> | 2025-01-10 10:02:56 +0800 |
commit | e8a57884ad4898fdec5c13a8933d73bcbaf06099 (patch) | |
tree | a6a30e97a0fe08683afe9cd4660d6034d271e2e2 | |
parent | 979ca3ba366da7177f427e049f67673ec3e35442 (diff) | |
download | gcc-e8a57884ad4898fdec5c13a8933d73bcbaf06099.zip gcc-e8a57884ad4898fdec5c13a8933d73bcbaf06099.tar.gz gcc-e8a57884ad4898fdec5c13a8933d73bcbaf06099.tar.bz2 |
LoongArch: Optimize the cost of vec_construct.
When analyzing the 525.x264_r benchmark on the LoongArch architecture, it was
found that the for loop of the hotspot function x264_pixel_satd_8x4 could not
be vectorized with 256-bit vectors because of the vec_construct cost setting.
After re-adjusting the vec_construct cost, the performance of 525.x264_r
improved by 16.57%.
It was found that this function can be vectorized on the aarch64 and
x86 architectures, see [PR98138].
Co-Authored-By: Deng Jianbo <dengjianbo@loongson.cn>.
gcc/ChangeLog:
* config/loongarch/loongarch.cc
(loongarch_builtin_vectorization_cost): Modify the
construction cost of the vec_construct vector.
gcc/testsuite/ChangeLog:
* gcc.target/loongarch/vect-slp-two-operator.c: New test.
-rw-r--r-- | gcc/config/loongarch/loongarch.cc | 6 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/loongarch/vect-slp-two-operator.c | 38 |
2 files changed, 41 insertions, 3 deletions
diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc index d506354..24c1903 100644 --- a/gcc/config/loongarch/loongarch.cc +++ b/gcc/config/loongarch/loongarch.cc @@ -4127,10 +4127,10 @@ loongarch_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, case vec_construct: elements = TYPE_VECTOR_SUBPARTS (vectype); - if (ISA_HAS_LASX) - return elements + 1; + if (LASX_SUPPORTED_MODE_P (mode) && !LSX_SUPPORTED_MODE_P (mode)) + return elements / 2 + 3; else - return elements; + return elements / 2 + 1; default: gcc_unreachable (); diff --git a/gcc/testsuite/gcc.target/loongarch/vect-slp-two-operator.c b/gcc/testsuite/gcc.target/loongarch/vect-slp-two-operator.c new file mode 100644 index 0000000..43b4675 --- /dev/null +++ b/gcc/testsuite/gcc.target/loongarch/vect-slp-two-operator.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mlasx -ftree-vectorize -fdump-tree-vect -fdump-tree-vect-details" } */ + +typedef unsigned char uint8_t; +typedef unsigned int uint32_t; + +#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) \ + { \ + int t0 = s0 + s1; \ + int t1 = s0 - s1; \ + int t2 = s2 + s3; \ + int t3 = s2 - s3; \ + d0 = t0 + t2; \ + d1 = t1 + t3; \ + d2 = t0 - t2; \ + d3 = t1 - t3; \ + } + +void sink (uint32_t tmp[4][4]); + +void +x264_pixel_satd_8x4 (uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2) +{ + uint32_t tmp[4][4]; + int sum = 0; + for (int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2) + { + uint32_t a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16); + uint32_t a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16); + uint32_t a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16); + uint32_t a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16); + HADAMARD4 (tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3); + } + sink (tmp); +} + +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ |