author     Jakub Jelinek <jakub@redhat.com>    2024-09-25 20:17:11 +0200
committer  Jakub Jelinek <jakub@gcc.gnu.org>  2024-09-25 20:19:28 +0200
commit     cc40795d8956d78e719a6acc83d5abad7032a6c3 (patch)
tree       279e4ffa8ed01fbb78caf5d1dc345b7babd5afb1 /gcc
parent     c79cc30862d7255ca15884aa956d1ccfa279d86a (diff)
i386: Add GENERIC and GIMPLE folders of __builtin_ia32_{min,max}* [PR116738]
The following patch adds GENERIC and GIMPLE folders for various
x86 min/max builtins.
As discussed, these builtins effectively behave like x < y ? x : y
(or x > y ? x : y).
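For reference, here is a minimal scalar model of that behavior (my sketch, not part of the patch). Because the comparison is ordered, a NaN in either operand selects the second operand, and min(-0.0, +0.0) also yields the second operand, matching the minps/maxps instructions:

  /* Elementwise model of the min builtins: x < y is false when either
     operand is NaN, and when -0.0 compares equal to +0.0, so the second
     operand is returned in both cases, exactly as minps does.  */
  static inline float
  minps_elem (float x, float y)
  {
    return x < y ? x : y;
  }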
The GENERIC folding is done only if all the (relevant) arguments are
constants (e.g. VECTOR_CST for vectors); it exists because the GIMPLE
folding can't easily handle masking, rounding and the ss/sd cases
(in a way that would be pattern recognized back to the corresponding
instructions).  The GIMPLE folding is done only for TARGET_SSE4_1 or
later and only when optimizing; otherwise the result is apparently not
matched back to the instruction.
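As an illustration (my example, not from the patch), with all-constant operands a call like the following can now be folded at GENERIC time to a VECTOR_CST, so no minps is emitted at all:

  #include <immintrin.h>

  __m128
  fold_min_const (void)
  {
    /* Both operands are constants, so ix86_fold_builtin can evaluate the
       min elementwise and return the vector { 1.0f, 2.0f, 2.0f, 1.0f }.  */
    return _mm_min_ps (_mm_setr_ps (1.0f, 2.0f, 3.0f, 4.0f),
                       _mm_setr_ps (4.0f, 3.0f, 2.0f, 1.0f));
  }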
2024-09-25  Jakub Jelinek  <jakub@redhat.com>

        PR target/116738
        * config/i386/i386.cc (ix86_fold_builtin): Handle
        IX86_BUILTIN_M{IN,AX}{S,P}{S,H,D}*.
        (ix86_gimple_fold_builtin): Handle IX86_BUILTIN_M{IN,AX}P{S,H,D}*.

        * gcc.target/i386/avx512f-pr116738-1.c: New test.
        * gcc.target/i386/avx512f-pr116738-2.c: New test.
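In GIMPLE terms, the new folding roughly rewrites (my sketch of the transformation, not a verbatim dump):

  lhs = __builtin_ia32_minps (a, b);

into

  cmp = a < b;
  lhs = VEC_COND_EXPR <cmp, a, b>;

which the backend then pattern-matches back to minps when SSE4.1 or later is available.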
Diffstat (limited to 'gcc')
 gcc/config/i386/i386.cc                            | 195 +
 gcc/testsuite/gcc.target/i386/avx512f-pr116738-1.c |  56 +
 gcc/testsuite/gcc.target/i386/avx512f-pr116738-2.c |  15 +
 3 files changed, 266 insertions(+), 0 deletions(-)
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index cfa84ed..ad2e7b4 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -18514,6 +18514,8 @@ ix86_fold_builtin (tree fndecl, int n_args,
     = (enum ix86_builtins) DECL_MD_FUNCTION_CODE (fndecl);
       enum rtx_code rcode;
       bool is_vshift;
+      enum tree_code tcode;
+      bool is_scalar;
       unsigned HOST_WIDE_INT mask;
 
       switch (fn_code)
@@ -18963,6 +18965,131 @@ ix86_fold_builtin (tree fndecl, int n_args,
            }
          break;
 
+       case IX86_BUILTIN_MINSS:
+       case IX86_BUILTIN_MINSH_MASK:
+         tcode = LT_EXPR;
+         is_scalar = true;
+         goto do_minmax;
+
+       case IX86_BUILTIN_MAXSS:
+       case IX86_BUILTIN_MAXSH_MASK:
+         tcode = GT_EXPR;
+         is_scalar = true;
+         goto do_minmax;
+
+       case IX86_BUILTIN_MINPS:
+       case IX86_BUILTIN_MINPD:
+       case IX86_BUILTIN_MINPS256:
+       case IX86_BUILTIN_MINPD256:
+       case IX86_BUILTIN_MINPS512:
+       case IX86_BUILTIN_MINPD512:
+       case IX86_BUILTIN_MINPS128_MASK:
+       case IX86_BUILTIN_MINPD128_MASK:
+       case IX86_BUILTIN_MINPS256_MASK:
+       case IX86_BUILTIN_MINPD256_MASK:
+       case IX86_BUILTIN_MINPH128_MASK:
+       case IX86_BUILTIN_MINPH256_MASK:
+       case IX86_BUILTIN_MINPH512_MASK:
+         tcode = LT_EXPR;
+         is_scalar = false;
+         goto do_minmax;
+
+       case IX86_BUILTIN_MAXPS:
+       case IX86_BUILTIN_MAXPD:
+       case IX86_BUILTIN_MAXPS256:
+       case IX86_BUILTIN_MAXPD256:
+       case IX86_BUILTIN_MAXPS512:
+       case IX86_BUILTIN_MAXPD512:
+       case IX86_BUILTIN_MAXPS128_MASK:
+       case IX86_BUILTIN_MAXPD128_MASK:
+       case IX86_BUILTIN_MAXPS256_MASK:
+       case IX86_BUILTIN_MAXPD256_MASK:
+       case IX86_BUILTIN_MAXPH128_MASK:
+       case IX86_BUILTIN_MAXPH256_MASK:
+       case IX86_BUILTIN_MAXPH512_MASK:
+         tcode = GT_EXPR;
+         is_scalar = false;
+       do_minmax:
+         gcc_assert (n_args >= 2);
+         if (TREE_CODE (args[0]) != VECTOR_CST
+             || TREE_CODE (args[1]) != VECTOR_CST)
+           break;
+         mask = HOST_WIDE_INT_M1U;
+         if (n_args > 2)
+           {
+             gcc_assert (n_args >= 4);
+             /* This is masked minmax.  */
+             if (TREE_CODE (args[3]) != INTEGER_CST
+                 || TREE_SIDE_EFFECTS (args[2]))
+               break;
+             mask = TREE_INT_CST_LOW (args[3]);
+             unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0]));
+             mask |= HOST_WIDE_INT_M1U << elems;
+             if (mask != HOST_WIDE_INT_M1U
+                 && TREE_CODE (args[2]) != VECTOR_CST)
+               break;
+             if (n_args >= 5)
+               {
+                 if (!tree_fits_uhwi_p (args[4]))
+                   break;
+                 if (tree_to_uhwi (args[4]) != 4
+                     && tree_to_uhwi (args[4]) != 8)
+                   break;
+               }
+             if (mask == (HOST_WIDE_INT_M1U << elems))
+               return args[2];
+           }
+         /* Punt on NaNs, unless exceptions are disabled.  */
+         if (HONOR_NANS (args[0])
+             && (n_args < 5 || tree_to_uhwi (args[4]) != 8))
+           for (int i = 0; i < 2; ++i)
+             {
+               unsigned count = vector_cst_encoded_nelts (args[i]);
+               for (unsigned j = 0; j < count; ++j)
+                 if (tree_expr_nan_p (VECTOR_CST_ENCODED_ELT (args[i], j)))
+                   return NULL_TREE;
+             }
+         {
+           tree res = const_binop (tcode,
+                                   truth_type_for (TREE_TYPE (args[0])),
+                                   args[0], args[1]);
+           if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST)
+             break;
+           res = fold_ternary (VEC_COND_EXPR, TREE_TYPE (args[0]), res,
+                               args[0], args[1]);
+           if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST)
+             break;
+           if (mask != HOST_WIDE_INT_M1U)
+             {
+               unsigned nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0]));
+               vec_perm_builder sel (nelts, nelts, 1);
+               for (unsigned int i = 0; i < nelts; i++)
+                 if (mask & (HOST_WIDE_INT_1U << i))
+                   sel.quick_push (i);
+                 else
+                   sel.quick_push (nelts + i);
+               vec_perm_indices indices (sel, 2, nelts);
+               res = fold_vec_perm (TREE_TYPE (args[0]), res, args[2],
+                                    indices);
+               if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST)
+                 break;
+             }
+           if (is_scalar)
+             {
+               unsigned nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0]));
+               vec_perm_builder sel (nelts, nelts, 1);
+               sel.quick_push (0);
+               for (unsigned int i = 1; i < nelts; i++)
+                 sel.quick_push (nelts + i);
+               vec_perm_indices indices (sel, 2, nelts);
+               res = fold_vec_perm (TREE_TYPE (args[0]), res, args[0],
+                                    indices);
+               if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST)
+                 break;
+             }
+           return res;
+         }
+
        default:
          break;
        }
@@ -19508,6 +19635,74 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
        }
       return true;
 
+    case IX86_BUILTIN_MINPS:
+    case IX86_BUILTIN_MINPD:
+    case IX86_BUILTIN_MINPS256:
+    case IX86_BUILTIN_MINPD256:
+    case IX86_BUILTIN_MINPS512:
+    case IX86_BUILTIN_MINPD512:
+    case IX86_BUILTIN_MINPS128_MASK:
+    case IX86_BUILTIN_MINPD128_MASK:
+    case IX86_BUILTIN_MINPS256_MASK:
+    case IX86_BUILTIN_MINPD256_MASK:
+    case IX86_BUILTIN_MINPH128_MASK:
+    case IX86_BUILTIN_MINPH256_MASK:
+    case IX86_BUILTIN_MINPH512_MASK:
+      tcode = LT_EXPR;
+      goto do_minmax;
+
+    case IX86_BUILTIN_MAXPS:
+    case IX86_BUILTIN_MAXPD:
+    case IX86_BUILTIN_MAXPS256:
+    case IX86_BUILTIN_MAXPD256:
+    case IX86_BUILTIN_MAXPS512:
+    case IX86_BUILTIN_MAXPD512:
+    case IX86_BUILTIN_MAXPS128_MASK:
+    case IX86_BUILTIN_MAXPD128_MASK:
+    case IX86_BUILTIN_MAXPS256_MASK:
+    case IX86_BUILTIN_MAXPD256_MASK:
+    case IX86_BUILTIN_MAXPH128_MASK:
+    case IX86_BUILTIN_MAXPH256_MASK:
+    case IX86_BUILTIN_MAXPH512_MASK:
+      tcode = GT_EXPR;
+    do_minmax:
+      gcc_assert (n_args >= 2);
+      /* Without SSE4.1 we often aren't able to pattern match it back to the
+         desired instruction.  */
+      if (!gimple_call_lhs (stmt) || !optimize || !TARGET_SSE4_1)
+        break;
+      arg0 = gimple_call_arg (stmt, 0);
+      arg1 = gimple_call_arg (stmt, 1);
+      elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+      /* For masked minmax, only optimize if the mask is all ones.  */
+      if (n_args > 2
+          && !ix86_masked_all_ones (elems, gimple_call_arg (stmt, 3)))
+        break;
+      if (n_args >= 5)
+        {
+          tree arg4 = gimple_call_arg (stmt, 4);
+          if (!tree_fits_uhwi_p (arg4))
+            break;
+          if (tree_to_uhwi (arg4) == 4)
+            /* Ok.  */;
+          else if (tree_to_uhwi (arg4) != 8)
+            /* Invalid round argument.  */
+            break;
+          else if (HONOR_NANS (arg0))
+            /* Lowering to comparison would raise exceptions which
+               shouldn't be raised.  */
+            break;
+        }
+      {
+        tree type = truth_type_for (TREE_TYPE (arg0));
+        tree cmpres = gimple_build (&stmts, tcode, type, arg0, arg1);
+        gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+        g = gimple_build_assign (gimple_call_lhs (stmt),
+                                 VEC_COND_EXPR, cmpres, arg0, arg1);
+        gsi_replace (gsi, g, false);
+      }
+      return true;
+
     default:
       break;
     }
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr116738-1.c b/gcc/testsuite/gcc.target/i386/avx512f-pr116738-1.c
new file mode 100644
index 0000000..9b58614
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr116738-1.c
@@ -0,0 +1,56 @@
+/* PR target/116738 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-not "__builtin_ia32_min" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "__builtin_ia32_max" "optimized" } } */
+
+#include <x86intrin.h>
+
+void
+test_pr116738 (void)
+{
+  __m512 a = _mm512_setr_ps (1.f, 2.f, 0.f, -0.f, -0.f, 0.f, 5.f, 6.f, 7.f,
+                             8.f, 9.f, 10.f, 11.f, -__builtin_inff (),
+                             __builtin_inff (), -42.f);
+  __m512 b = _mm512_setr_ps (-0.f, 3.f, -0.f, 0.f, -0.f, 0.f, 5.f, 5.f, 8.f,
+                             7.f, 10.f, -9.f, 12.f, 0.f, -0.f, 42.f);
+  __m512 w = _mm512_setr_ps (4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 0.f, 1.f,
+                             2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f);
+  __m512 c = _mm512_mask_min_ps (w, -1, a, b);
+  __m512 d = _mm512_mask_min_ps (w, 18658, a, b);
+  __m512 e = _mm512_mask_min_ps (w, 54649, a, b);
+  __m512 f = _mm512_mask_max_ps (w, -1, a, b);
+  __m512 g = _mm512_mask_max_ps (w, 18658, a, b);
+  __m512 h = _mm512_mask_max_ps (w, 54649, a, b);
+  __m128 i = _mm_setr_ps (1.f, 2.f, 0.f, -0.f);
+  __m128 j = _mm_setr_ps (-0.f, 3.f, -0.f, 0.f);
+  __m128 k = _mm_min_ss (i, j);
+  __m128 l = _mm_max_ss (j, i);
+  __m512 ce = _mm512_setr_ps (-0.f, 2.f, -0.f, 0.f, -0.f, 0.f, 5.f, 5.f, 7.f,
+                              7.f, 9.f, -9.f, 11.f, -__builtin_inff (),
+                              -0.f, -42.f);
+  __m512 de = _mm512_setr_ps (4.f, 2.f, 6.f, 7.f, 8.f, 0.f, 5.f, 5.f, 1.f,
+                              2.f, 3.f, -9.f, 5.f, 6.f, -0.f, 8.f);
+  __m512 ee = _mm512_setr_ps (-0.f, 5.f, 6.f, 0.f, -0.f, 0.f, 5.f, 0.f, 7.f,
+                              2.f, 9.f, 4.f, 11.f, 6.f, -0.f, -42.f);
+  __m512 fe = _mm512_setr_ps (1.f, 3.f, -0.f, 0.f, -0.f, 0.f, 5.f, 6.f, 8.f,
+                              8.f, 10.f, 10.f, 12.f, 0.f, __builtin_inff (),
+                              42.f);
+  __m512 ge = _mm512_setr_ps (4.f, 3.f, 6.f, 7.f, 8.f, 0.f, 5.f, 6.f, 1.f,
+                              2.f, 3.f, 10.f, 5.f, 6.f, __builtin_inff (),
+                              8.f);
+  __m512 he = _mm512_setr_ps (1.f, 5.f, 6.f, 0.f, -0.f, 0.f, 5.f, 0.f, 8.f,
+                              2.f, 10.f, 4.f, 12.f, 6.f, __builtin_inff (),
+                              42.f);
+  __m128 ke = _mm_setr_ps (-0.f, 2.f, 0.f, -0.f);
+  __m128 le = _mm_setr_ps (1.f, 3.f, -0.f, 0.f);
+  if (__builtin_memcmp (&c, &ce, sizeof (c))
+      || __builtin_memcmp (&d, &de, sizeof (d))
+      || __builtin_memcmp (&e, &ee, sizeof (e))
+      || __builtin_memcmp (&f, &fe, sizeof (f))
+      || __builtin_memcmp (&g, &ge, sizeof (g))
+      || __builtin_memcmp (&h, &he, sizeof (h))
+      || __builtin_memcmp (&k, &ke, sizeof (k))
+      || __builtin_memcmp (&l, &le, sizeof (l)))
+    __builtin_abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr116738-2.c b/gcc/testsuite/gcc.target/i386/avx512f-pr116738-2.c
new file mode 100644
index 0000000..8a429a2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr116738-2.c
@@ -0,0 +1,15 @@
+/* PR target/116738 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-require-effective-target avx512f } */
+
+#define AVX512F
+#include "avx512f-helper.h"
+
+#include "avx512f-pr116738-1.c"
+
+void
+TEST (void)
+{
+  test_pr116738 ();
+}
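For completeness, a scalar model of the masked variants exercised by the first test (my sketch; the helper name is hypothetical): each result element is the min of the two inputs where the corresponding mask bit is set, and the pass-through element otherwise.

  /* One element of _mm512_mask_min_ps (w, mask, x, y): minps semantics
     where the mask bit is set, the pass-through element of w otherwise.  */
  static inline float
  mask_min_elem (float w, unsigned short mask, unsigned i, float x, float y)
  {
    float m = x < y ? x : y;  /* minps: NaN or equal operands pick y.  */
    return (mask >> i) & 1 ? m : w;
  }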