diff options
author | Jakub Jelinek <jakub@redhat.com> | 2020-11-26 08:44:15 +0100 |
---|---|---|
committer | Jakub Jelinek <jakub@redhat.com> | 2020-11-26 08:46:14 +0100 |
commit | 32b0abb24b8702ec9954448739682ace6fa5ccf5 (patch) | |
tree | 4e36a72b8c81c870020720f0a26d9f166873b462 | |
parent | 768ce4f0ceb030e38427e85e483ed44330cd5da7 (diff) | |
download | gcc-32b0abb24b8702ec9954448739682ace6fa5ccf5.zip gcc-32b0abb24b8702ec9954448739682ace6fa5ccf5.tar.gz gcc-32b0abb24b8702ec9954448739682ace6fa5ccf5.tar.bz2 |
i386: Optimize psubusw compared to 0 into pminuw compared to op0 [PR96906]
The following patch renames VI12_AVX2 iterator to VI12_AVX2_AVX512BW
for consistency with some other iterators, as I need VI12_AVX2 without
AVX512BW for this change.
The real meat is a combiner split which combine
can use to optimize psubusw compared to 0 into pminuw compared to op0
(and similarly for psubusb compared to 0 into pminub compared to op0).
According to Agner Fog's tables, psubus[bw] and pminu[bw] timings
are the same, but the advantage of pminu[bw] is that the comparison
doesn't need a zero operand, so e.g. for -msse4.1 it causes changes like
- psubusw %xmm1, %xmm0
- pxor %xmm1, %xmm1
+ pminuw %xmm0, %xmm1
pcmpeqw %xmm1, %xmm0
and similarly for avx2:
- vpsubusb %ymm1, %ymm0, %ymm0
- vpxor %xmm1, %xmm1, %xmm1
- vpcmpeqb %ymm1, %ymm0, %ymm0
+ vpminub %ymm1, %ymm0, %ymm1
+ vpcmpeqb %ymm0, %ymm1, %ymm0
I haven't done the AVX512{BW,VL} define_split(s), they'll need
to match the UNSPEC_PCMP which are used for avx512 comparisons.
2020-11-26 Jakub Jelinek <jakub@redhat.com>
PR target/96906
* config/i386/sse.md (VI12_AVX2): Remove V64QI/V32HI modes.
(VI12_AVX2_AVX512BW): New mode iterator.
(<sse2_avx2>_<plusminus_insn><mode>3<mask_name>,
uavg<mode>3_ceil, <sse2_avx2>_uavg<mode>3<mask_name>): Use
VI12_AVX2_AVX512BW iterator instead of VI12_AVX2.
(*<sse2_avx2>_<plusminus_insn><mode>3<mask_name>): Likewise.
(*<sse2_avx2>_uavg<mode>3<mask_name>): Likewise.
(*<sse2_avx2>_<plusminus_insn><mode>3<mask_name>): Add a new
define_split after this insn.
* gcc.target/i386/pr96906-1.c: New test.
-rw-r--r-- | gcc/config/i386/sse.md | 61 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr96906-1.c | 62 |
2 files changed, 103 insertions, 20 deletions
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 1193680..4aad462 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -466,6 +466,10 @@ [(V4TI "TARGET_AVX512BW") (V2TI "TARGET_AVX2") TI]) (define_mode_iterator VI12_AVX2 + [(V32QI "TARGET_AVX2") V16QI + (V16HI "TARGET_AVX2") V8HI]) + +(define_mode_iterator VI12_AVX2_AVX512BW [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX2") V16QI (V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX2") V8HI]) @@ -11395,18 +11399,18 @@ (set_attr "mode" "<sseinsnmode>")]) (define_expand "<sse2_avx2>_<plusminus_insn><mode>3<mask_name>" - [(set (match_operand:VI12_AVX2 0 "register_operand") - (sat_plusminus:VI12_AVX2 - (match_operand:VI12_AVX2 1 "vector_operand") - (match_operand:VI12_AVX2 2 "vector_operand")))] + [(set (match_operand:VI12_AVX2_AVX512BW 0 "register_operand") + (sat_plusminus:VI12_AVX2_AVX512BW + (match_operand:VI12_AVX2_AVX512BW 1 "vector_operand") + (match_operand:VI12_AVX2_AVX512BW 2 "vector_operand")))] "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>" "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);") (define_insn "*<sse2_avx2>_<plusminus_insn><mode>3<mask_name>" - [(set (match_operand:VI12_AVX2 0 "register_operand" "=x,v") - (sat_plusminus:VI12_AVX2 - (match_operand:VI12_AVX2 1 "vector_operand" "<comm>0,v") - (match_operand:VI12_AVX2 2 "vector_operand" "xBm,vm")))] + [(set (match_operand:VI12_AVX2_AVX512BW 0 "register_operand" "=x,v") + (sat_plusminus:VI12_AVX2_AVX512BW + (match_operand:VI12_AVX2_AVX512BW 1 "vector_operand" "<comm>0,v") + (match_operand:VI12_AVX2_AVX512BW 2 "vector_operand" "xBm,vm")))] "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition> && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)" "@ @@ -11418,6 +11422,23 @@ (set_attr "prefix" "orig,maybe_evex") (set_attr "mode" "TI")]) +;; PR96906 - optimize psubusw compared to 0 into pminuw compared to op0. +(define_split + [(set (match_operand:VI12_AVX2 0 "register_operand") + (eq:VI12_AVX2 + (us_minus:VI12_AVX2 + (match_operand:VI12_AVX2 1 "vector_operand") + (match_operand:VI12_AVX2 2 "vector_operand")) + (match_operand:VI12_AVX2 3 "const0_operand")))] + "TARGET_SSE2 + && (<MODE>mode != V8HImode || TARGET_SSE4_1) + && ix86_binary_operator_ok (US_MINUS, <MODE>mode, operands)" + [(set (match_dup 4) + (umin:VI12_AVX2 (match_dup 1) (match_dup 2))) + (set (match_dup 0) + (eq:VI12_AVX2 (match_dup 4) (match_dup 1)))] + "operands[4] = gen_reg_rtx (<MODE>mode);") + (define_expand "mulv8qi3" [(set (match_operand:V8QI 0 "register_operand") (mult:V8QI (match_operand:V8QI 1 "register_operand") @@ -12022,15 +12043,15 @@ }) (define_expand "uavg<mode>3_ceil" - [(set (match_operand:VI12_AVX2 0 "register_operand") - (truncate:VI12_AVX2 + [(set (match_operand:VI12_AVX2_AVX512BW 0 "register_operand") + (truncate:VI12_AVX2_AVX512BW (lshiftrt:<ssedoublemode> (plus:<ssedoublemode> (plus:<ssedoublemode> (zero_extend:<ssedoublemode> - (match_operand:VI12_AVX2 1 "vector_operand")) + (match_operand:VI12_AVX2_AVX512BW 1 "vector_operand")) (zero_extend:<ssedoublemode> - (match_operand:VI12_AVX2 2 "vector_operand"))) + (match_operand:VI12_AVX2_AVX512BW 2 "vector_operand"))) (match_dup 3)) (const_int 1))))] "TARGET_SSE2" @@ -15744,15 +15765,15 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define_expand "<sse2_avx2>_uavg<mode>3<mask_name>" - [(set (match_operand:VI12_AVX2 0 "register_operand") - (truncate:VI12_AVX2 + [(set (match_operand:VI12_AVX2_AVX512BW 0 "register_operand") + (truncate:VI12_AVX2_AVX512BW (lshiftrt:<ssedoublemode> (plus:<ssedoublemode> (plus:<ssedoublemode> (zero_extend:<ssedoublemode> - (match_operand:VI12_AVX2 1 "vector_operand")) + (match_operand:VI12_AVX2_AVX512BW 1 "vector_operand")) (zero_extend:<ssedoublemode> - (match_operand:VI12_AVX2 2 "vector_operand"))) + (match_operand:VI12_AVX2_AVX512BW 2 "vector_operand"))) (match_dup <mask_expand_op3>)) (const_int 1))))] "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>" @@ -15762,15 +15783,15 @@ }) (define_insn "*<sse2_avx2>_uavg<mode>3<mask_name>" - [(set (match_operand:VI12_AVX2 0 "register_operand" "=x,v") - (truncate:VI12_AVX2 + [(set (match_operand:VI12_AVX2_AVX512BW 0 "register_operand" "=x,v") + (truncate:VI12_AVX2_AVX512BW (lshiftrt:<ssedoublemode> (plus:<ssedoublemode> (plus:<ssedoublemode> (zero_extend:<ssedoublemode> - (match_operand:VI12_AVX2 1 "vector_operand" "%0,v")) + (match_operand:VI12_AVX2_AVX512BW 1 "vector_operand" "%0,v")) (zero_extend:<ssedoublemode> - (match_operand:VI12_AVX2 2 "vector_operand" "xBm,vm"))) + (match_operand:VI12_AVX2_AVX512BW 2 "vector_operand" "xBm,vm"))) (match_operand:<ssedoublemode> <mask_expand_op3> "const1_operand")) (const_int 1))))] "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition> diff --git a/gcc/testsuite/gcc.target/i386/pr96906-1.c b/gcc/testsuite/gcc.target/i386/pr96906-1.c new file mode 100644 index 0000000..9d836eb --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr96906-1.c @@ -0,0 +1,62 @@ +/* PR target/96906 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx2" } */ +/* { dg-final { scan-assembler-times "\tvpminub\[^\n\r]*xmm" 2 } } */ +/* { dg-final { scan-assembler-times "\tvpminuw\[^\n\r]*xmm" 2 } } */ +/* { dg-final { scan-assembler-times "\tvpminub\[^\n\r]*ymm" 2 } } */ +/* { dg-final { scan-assembler-times "\tvpminuw\[^\n\r]*ymm" 2 } } */ +/* { dg-final { scan-assembler-times "\tvpcmpeqb\[^\n\r]*xmm" 2 } } */ +/* { dg-final { scan-assembler-times "\tvpcmpeqw\[^\n\r]*xmm" 2 } } */ +/* { dg-final { scan-assembler-times "\tvpcmpeqb\[^\n\r]*ymm" 2 } } */ +/* { dg-final { scan-assembler-times "\tvpcmpeqw\[^\n\r]*ymm" 2 } } */ +/* { dg-final { scan-assembler-not "\tvpsubus\[bw]" } } */ + +#include <x86intrin.h> + +__m128i +f1 (__m128i x, __m128i y) +{ + return _mm_cmpeq_epi16 (_mm_subs_epu16 (x, y), _mm_setzero_si128 ()); +} + +__m128i +f2 (__m128i x, __m128i y) +{ + return _mm_cmpeq_epi16 (_mm_min_epu16 (x, y), x); +} + +__m128i +f3 (__m128i x, __m128i y) +{ + return _mm_cmpeq_epi8 (_mm_subs_epu8 (x, y), _mm_setzero_si128 ()); +} + +__m128i +f4 (__m128i x, __m128i y) +{ + return _mm_cmpeq_epi8 (_mm_min_epu8 (x, y), x); +} + +__m256i +f5 (__m256i x, __m256i y) +{ + return _mm256_cmpeq_epi16 (_mm256_subs_epu16 (x, y), _mm256_setzero_si256 ()); +} + +__m256i +f6 (__m256i x, __m256i y) +{ + return _mm256_cmpeq_epi16 (_mm256_min_epu16 (x, y), x); +} + +__m256i +f7 (__m256i x, __m256i y) +{ + return _mm256_cmpeq_epi8 (_mm256_subs_epu8 (x, y), _mm256_setzero_si256 ()); +} + +__m256i +f8 (__m256i x, __m256i y) +{ + return _mm256_cmpeq_epi8 (_mm256_min_epu8 (x, y), x); +} |