diff options
author | Haochen Jiang <haochen.jiang@intel.com> | 2024-09-02 10:24:29 +0800 |
---|---|---|
committer | Haochen Jiang <haochen.jiang@intel.com> | 2024-09-02 10:24:29 +0800 |
commit | b1f9fbb6da1a3ced57c3668cecc9f9449e1b237e (patch) | |
tree | 16a58b388fc5c081a659b9b22f34bbf5cb0304f9 /gcc/config | |
parent | 5239902210a16b22d59d2cf8b535d615922a5c00 (diff) | |
download | gcc-b1f9fbb6da1a3ced57c3668cecc9f9449e1b237e.zip gcc-b1f9fbb6da1a3ced57c3668cecc9f9449e1b237e.tar.gz gcc-b1f9fbb6da1a3ced57c3668cecc9f9449e1b237e.tar.bz2 |
i386: Auto vectorize sdot_prod, usdot_prod, udot_prod with AVX10.2 instructions
gcc/ChangeLog:
* config/i386/sse.md (VI1_AVX512VNNIBW): New.
(VI2_AVX10_2): Ditto.
(sdot_prod<mode>): Use AVX10.2 instructions for auto-vectorization
and merge the 512-bit expander into this pattern.
(udot_prod<mode>): Ditto.
(sdot_prodv64qi): Removed.
(udot_prodv64qi): Ditto.
(usdot_prod<mode>): Use AVX10.2 instructions for auto-vectorization.
(udot_prod<mode>): Ditto, for the VI2_AVX10_2 (16-bit element) expander.
gcc/testsuite/ChangeLog:
* gcc.target/i386/vnniint16-auto-vectorize-2.c: Only define
TEST when not defined.
* gcc.target/i386/vnniint8-auto-vectorize-2.c: Ditto.
* gcc.target/i386/vnniint16-auto-vectorize-3.c: New test.
* gcc.target/i386/vnniint16-auto-vectorize-4.c: Ditto.
* gcc.target/i386/vnniint8-auto-vectorize-3.c: Ditto.
* gcc.target/i386/vnniint8-auto-vectorize-4.c: Ditto.
Diffstat (limited to 'gcc/config')
-rw-r--r-- | gcc/config/i386/sse.md | 93 |
1 files changed, 21 insertions, 72 deletions
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index da91d39..442ac93 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -610,6 +610,10 @@ (define_mode_iterator VI1_AVX512VNNI [(V64QI "TARGET_AVX512VNNI && TARGET_EVEX512") (V32QI "TARGET_AVX2") V16QI]) +(define_mode_iterator VI1_AVX512VNNIBW + [(V64QI "(TARGET_AVX512BW || TARGET_AVX512VNNI) && TARGET_EVEX512") + (V32QI "TARGET_AVX2") V16QI]) + (define_mode_iterator VI12_256_512_AVX512VL [(V64QI "TARGET_EVEX512") (V32QI "TARGET_AVX512VL") (V32HI "TARGET_EVEX512") (V16HI "TARGET_AVX512VL")]) @@ -627,6 +631,9 @@ [(V32HI "(TARGET_AVX512BW || TARGET_AVX512VNNI) && TARGET_EVEX512") (V16HI "TARGET_AVX2") V8HI]) +(define_mode_iterator VI2_AVX10_2 + [(V32HI "TARGET_AVX10_2_512") V16HI V8HI]) + (define_mode_iterator VI4_AVX [(V8SI "TARGET_AVX") V4SI]) @@ -31232,12 +31239,13 @@ (define_expand "sdot_prod<mode>" [(match_operand:<ssedvecmode> 0 "register_operand") - (match_operand:VI1_AVX2 1 "register_operand") - (match_operand:VI1_AVX2 2 "register_operand") + (match_operand:VI1_AVX512VNNIBW 1 "register_operand") + (match_operand:VI1_AVX512VNNIBW 2 "register_operand") (match_operand:<ssedvecmode> 3 "register_operand")] "TARGET_SSE2" { - if (TARGET_AVXVNNIINT8) + if ((<MODE_SIZE> == 64 && TARGET_AVX10_2_512) + || (<MODE_SIZE> < 64 && (TARGET_AVXVNNIINT8 || TARGET_AVX10_2_256))) { operands[1] = lowpart_subreg (<ssedvecmode>mode, force_reg (<MODE>mode, operands[1]), @@ -31276,44 +31284,15 @@ DONE; }) -(define_expand "sdot_prodv64qi" - [(match_operand:V16SI 0 "register_operand") - (match_operand:V64QI 1 "register_operand") - (match_operand:V64QI 2 "register_operand") - (match_operand:V16SI 3 "register_operand")] - "(TARGET_AVX512VNNI || TARGET_AVX512BW) && TARGET_EVEX512" -{ - /* Emulate with vpdpwssd. 
*/ - rtx op1_lo = gen_reg_rtx (V32HImode); - rtx op1_hi = gen_reg_rtx (V32HImode); - rtx op2_lo = gen_reg_rtx (V32HImode); - rtx op2_hi = gen_reg_rtx (V32HImode); - - emit_insn (gen_vec_unpacks_lo_v64qi (op1_lo, operands[1])); - emit_insn (gen_vec_unpacks_lo_v64qi (op2_lo, operands[2])); - emit_insn (gen_vec_unpacks_hi_v64qi (op1_hi, operands[1])); - emit_insn (gen_vec_unpacks_hi_v64qi (op2_hi, operands[2])); - - rtx res1 = gen_reg_rtx (V16SImode); - rtx res2 = gen_reg_rtx (V16SImode); - rtx sum = gen_reg_rtx (V16SImode); - - emit_move_insn (sum, CONST0_RTX (V16SImode)); - emit_insn (gen_sdot_prodv32hi (res1, op1_lo, op2_lo, sum)); - emit_insn (gen_sdot_prodv32hi (res2, op1_hi, op2_hi, operands[3])); - - emit_insn (gen_addv16si3 (operands[0], res1, res2)); - DONE; -}) - (define_expand "udot_prod<mode>" [(match_operand:<ssedvecmode> 0 "register_operand") - (match_operand:VI1_AVX2 1 "register_operand") - (match_operand:VI1_AVX2 2 "register_operand") + (match_operand:VI1_AVX512VNNIBW 1 "register_operand") + (match_operand:VI1_AVX512VNNIBW 2 "register_operand") (match_operand:<ssedvecmode> 3 "register_operand")] "TARGET_SSE2" { - if (TARGET_AVXVNNIINT8) + if ((<MODE_SIZE> == 64 && TARGET_AVX10_2_512) + || (<MODE_SIZE> < 64 && (TARGET_AVXVNNIINT8 || TARGET_AVX10_2_256))) { operands[1] = lowpart_subreg (<ssedvecmode>mode, force_reg (<MODE>mode, operands[1]), @@ -31352,36 +31331,6 @@ DONE; }) -(define_expand "udot_prodv64qi" - [(match_operand:V16SI 0 "register_operand") - (match_operand:V64QI 1 "register_operand") - (match_operand:V64QI 2 "register_operand") - (match_operand:V16SI 3 "register_operand")] - "(TARGET_AVX512VNNI || TARGET_AVX512BW) && TARGET_EVEX512" -{ - /* Emulate with vpdpwssd. 
*/ - rtx op1_lo = gen_reg_rtx (V32HImode); - rtx op1_hi = gen_reg_rtx (V32HImode); - rtx op2_lo = gen_reg_rtx (V32HImode); - rtx op2_hi = gen_reg_rtx (V32HImode); - - emit_insn (gen_vec_unpacku_lo_v64qi (op1_lo, operands[1])); - emit_insn (gen_vec_unpacku_lo_v64qi (op2_lo, operands[2])); - emit_insn (gen_vec_unpacku_hi_v64qi (op1_hi, operands[1])); - emit_insn (gen_vec_unpacku_hi_v64qi (op2_hi, operands[2])); - - rtx res1 = gen_reg_rtx (V16SImode); - rtx res2 = gen_reg_rtx (V16SImode); - rtx sum = gen_reg_rtx (V16SImode); - - emit_move_insn (sum, CONST0_RTX (V16SImode)); - emit_insn (gen_sdot_prodv32hi (res1, op1_lo, op2_lo, sum)); - emit_insn (gen_sdot_prodv32hi (res2, op1_hi, op2_hi, operands[3])); - - emit_insn (gen_addv16si3 (operands[0], res1, res2)); - DONE; -}) - (define_insn "vpdp<vpdotprodtype>_<mode>" [(set (match_operand:VI4_AVX 0 "register_operand" "=v") (unspec:VI4_AVX @@ -31757,10 +31706,10 @@ (define_expand "usdot_prod<mode>" [(match_operand:<sseunpackmode> 0 "register_operand") - (match_operand:VI2_AVX2 1 "register_operand") - (match_operand:VI2_AVX2 2 "register_operand") + (match_operand:VI2_AVX10_2 1 "register_operand") + (match_operand:VI2_AVX10_2 2 "register_operand") (match_operand:<sseunpackmode> 3 "register_operand")] - "TARGET_AVXVNNIINT16" + "TARGET_AVXVNNIINT16 || TARGET_AVX10_2_256" { operands[1] = lowpart_subreg (<sseunpackmode>mode, force_reg (<MODE>mode, operands[1]), @@ -31775,10 +31724,10 @@ (define_expand "udot_prod<mode>" [(match_operand:<sseunpackmode> 0 "register_operand") - (match_operand:VI2_AVX2 1 "register_operand") - (match_operand:VI2_AVX2 2 "register_operand") + (match_operand:VI2_AVX10_2 1 "register_operand") + (match_operand:VI2_AVX10_2 2 "register_operand") (match_operand:<sseunpackmode> 3 "register_operand")] - "TARGET_AVXVNNIINT16" + "TARGET_AVXVNNIINT16 || TARGET_AVX10_2_256" { operands[1] = lowpart_subreg (<sseunpackmode>mode, force_reg (<MODE>mode, operands[1]), |