diff options
author | Levy Hsu <admin@levyhsu.com> | 2024-09-02 10:24:45 +0800 |
---|---|---|
committer | Haochen Jiang <haochen.jiang@intel.com> | 2024-09-02 10:24:45 +0800 |
commit | f82fa0da4d9e1fdaf5e4edd70364d5781534ce11 (patch) | |
tree | 41b7040f125db52c7b014bede5582019efa1e38b | |
parent | 3b1decef83003db9cf8667977c293435c0f3d024 (diff) | |
download | gcc-f82fa0da4d9e1fdaf5e4edd70364d5781534ce11.zip gcc-f82fa0da4d9e1fdaf5e4edd70364d5781534ce11.tar.gz gcc-f82fa0da4d9e1fdaf5e4edd70364d5781534ce11.tar.bz2 |
i386: Support vectorized BF16 add/sub/mul/div with AVX10.2 instructions
AVX10.2 introduces several non-exception instructions for BF16 vectors.
Enable vectorized BF16 add/sub/mul/div operations by supporting the
standard optabs for them.
gcc/ChangeLog:
* config/i386/sse.md (div<mode>3): New expander for BFmode div.
(VF_BHSD): New mode iterator with vector BFmodes.
(<insn><mode>3<mask_name><round_name>): Change mode to VF_BHSD.
(mul<mode>3<mask_name><round_name>): Likewise.
gcc/testsuite/ChangeLog:
* gcc.target/i386/avx10_2-512-bf-vector-operations-1.c: New test.
* gcc.target/i386/avx10_2-bf-vector-operations-1.c: Ditto.
-rw-r--r-- | gcc/config/i386/sse.md | 49 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/avx10_2-512-bf-vector-operations-1.c | 42 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/avx10_2-bf-vector-operations-1.c | 79 |
3 files changed, 162 insertions, 8 deletions
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 442ac93..ebca462 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -391,6 +391,19 @@ (V8DF "TARGET_AVX512F && TARGET_EVEX512") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")]) +(define_mode_iterator VF_BHSD + [(V32HF "TARGET_AVX512FP16 && TARGET_EVEX512") + (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + (V16SF "TARGET_AVX512F && TARGET_EVEX512") + (V8SF "TARGET_AVX") V4SF + (V8DF "TARGET_AVX512F && TARGET_EVEX512") + (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2") + (V32BF "TARGET_AVX10_2_512") + (V16BF "TARGET_AVX10_2_256") + (V8BF "TARGET_AVX10_2_256") + ]) + ;; 128-, 256- and 512-bit float vector modes for bitwise operations (define_mode_iterator VFB [(V32BF "TARGET_AVX512F && TARGET_EVEX512") @@ -2527,10 +2540,10 @@ }) (define_expand "<insn><mode>3<mask_name><round_name>" - [(set (match_operand:VFH 0 "register_operand") - (plusminus:VFH - (match_operand:VFH 1 "<round_nimm_predicate>") - (match_operand:VFH 2 "<round_nimm_predicate>")))] + [(set (match_operand:VF_BHSD 0 "register_operand") + (plusminus:VF_BHSD + (match_operand:VF_BHSD 1 "<round_nimm_predicate>") + (match_operand:VF_BHSD 2 "<round_nimm_predicate>")))] "TARGET_SSE && <mask_mode512bit_condition> && <round_mode_condition>" "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);") @@ -2616,10 +2629,10 @@ }) (define_expand "mul<mode>3<mask_name><round_name>" - [(set (match_operand:VFH 0 "register_operand") - (mult:VFH - (match_operand:VFH 1 "<round_nimm_predicate>") - (match_operand:VFH 2 "<round_nimm_predicate>")))] + [(set (match_operand:VF_BHSD 0 "register_operand") + (mult:VF_BHSD + (match_operand:VF_BHSD 1 "<round_nimm_predicate>") + (match_operand:VF_BHSD 2 "<round_nimm_predicate>")))] "TARGET_SSE && <mask_mode512bit_condition> && <round_mode_condition>" "ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);") @@ -2734,6 +2747,26 @@ } }) +(define_expand 
"div<mode>3" + [(set (match_operand:VBF_AVX10_2 0 "register_operand") + (div:VBF_AVX10_2 + (match_operand:VBF_AVX10_2 1 "register_operand") + (match_operand:VBF_AVX10_2 2 "vector_operand")))] + "TARGET_AVX10_2_256" +{ + if (TARGET_RECIP_VEC_DIV + && optimize_insn_for_speed_p () + && flag_finite_math_only + && flag_unsafe_math_optimizations) + { + rtx op = gen_reg_rtx (<MODE>mode); + operands[2] = force_reg (<MODE>mode, operands[2]); + emit_insn (gen_avx10_2_rcppbf16_<mode> (op, operands[2])); + emit_insn (gen_avx10_2_mulnepbf16_<mode> (operands[0], operands[1], op)); + DONE; + } +}) + (define_expand "cond_div<mode>" [(set (match_operand:VFH 0 "register_operand") (vec_merge:VFH diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-bf-vector-operations-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-bf-vector-operations-1.c new file mode 100644 index 0000000..d6b0750 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-bf-vector-operations-1.c @@ -0,0 +1,42 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx10.2-512 -O2" } */ +/* { dg-final { scan-assembler-times "vmulnepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vaddnepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vdivnepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vsubnepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrcppbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ + +#include <immintrin.h> + +typedef __bf16 v32bf __attribute__ ((__vector_size__ (64))); + +v32bf +foo_mul (v32bf a, v32bf b) +{ + return a * b; +} + +v32bf +foo_add (v32bf a, v32bf b) +{ + return a + b; +} + +v32bf 
+foo_div (v32bf a, v32bf b) +{ + return a / b; +} + +v32bf +foo_sub (v32bf a, v32bf b) +{ + return a - b; +} + +__attribute__((optimize("fast-math"))) +v32bf +foo_div_fast_math (v32bf a, v32bf b) +{ + return a / b; +} diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-bf-vector-operations-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-bf-vector-operations-1.c new file mode 100644 index 0000000..77092b9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-bf-vector-operations-1.c @@ -0,0 +1,79 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx10.2 -O2" } */ +/* { dg-final { scan-assembler-times "vmulnepbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vaddnepbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vdivnepbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vsubnepbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vmulnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vaddnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vdivnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vsubnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrcppbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrcppbf16\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ + +#include <immintrin.h> + +typedef __bf16 v16bf __attribute__ ((__vector_size__ (32))); +typedef __bf16 v8bf __attribute__ ((__vector_size__ (16))); + +v16bf +foo_mul_256 (v16bf a, v16bf b) +{ + return a * b; +} + +v16bf +foo_add_256 (v16bf a, v16bf b) +{ + return a + b; +} + +v16bf +foo_div_256 (v16bf a, v16bf b) +{ + return a / b; +} + +v16bf +foo_sub_256 (v16bf a, v16bf b) +{ + return a - b; +} + +__attribute__((optimize("fast-math"))) +v16bf +foo_div_fast_math_256 (v16bf a, v16bf b) +{ + return a / b; +} + +v8bf +foo_mul_128 (v8bf a, v8bf b) +{ + return a * b; +} + +v8bf +foo_add_128 (v8bf a, v8bf b) +{ + return a + b; +} + +v8bf +foo_div_128 (v8bf a, v8bf b) +{ + return a / b; +} + +v8bf +foo_sub_128 (v8bf a, v8bf b) +{ + return a - b; +} + +__attribute__((optimize("fast-math"))) +v8bf +foo_div_fast_math_128 (v8bf a, v8bf b) +{ + return a / b; +} |