author     Jakub Jelinek <jakub@redhat.com>   2024-09-25 20:17:11 +0200
committer  Jakub Jelinek <jakub@gcc.gnu.org>  2024-09-25 20:19:28 +0200
commit     cc40795d8956d78e719a6acc83d5abad7032a6c3 (patch)
tree       279e4ffa8ed01fbb78caf5d1dc345b7babd5afb1 /gcc
parent     c79cc30862d7255ca15884aa956d1ccfa279d86a (diff)
i386: Add GENERIC and GIMPLE folders of __builtin_ia32_{min,max}* [PR116738]
The following patch adds GENERIC and GIMPLE folders for various x86
min/max builtins.  As discussed, these builtins effectively have
x < y ? x : y (or x > y ? x : y) behavior.

The GENERIC folding is done if all the (relevant) arguments are
constants (such as VECTOR_CST for vectors), and is done because the
GIMPLE folding can't easily handle masking, rounding and the ss/sd
cases (in a way that would be pattern recognized back to the
corresponding instructions).  The GIMPLE folding is also done only for
TARGET_SSE4_1 or later when optimizing; otherwise it is apparently not
matched back.

2024-09-25  Jakub Jelinek  <jakub@redhat.com>

	PR target/116738
	* config/i386/i386.cc (ix86_fold_builtin): Handle
	IX86_BUILTIN_M{IN,AX}{S,P}{S,H,D}*.
	(ix86_gimple_fold_builtin): Handle
	IX86_BUILTIN_M{IN,AX}P{S,H,D}*.

	* gcc.target/i386/avx512f-pr116738-1.c: New test.
	* gcc.target/i386/avx512f-pr116738-2.c: New test.
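As a concrete illustration of those semantics, here is a minimal
standalone sketch (not part of the patch; the file name, values and
output are made up) checking that _mm_min_ps agrees with the
element-wise x < y ? x : y form the new folders rely on.  Compile with
e.g. gcc -O2 -msse2 min-demo.c:

#include <xmmintrin.h>
#include <stdio.h>

int
main (void)
{
  /* Includes a -0.0f/0.0f pair and an equal pair, where min is not
     commutative and always yields the second operand.  */
  float x[4] = { 1.0f, -0.0f, 3.0f, 5.0f };
  float y[4] = { 2.0f, 0.0f, 3.0f, 4.0f };
  float r[4], s[4];
  __m128 vx = _mm_loadu_ps (x), vy = _mm_loadu_ps (y);
  _mm_storeu_ps (r, _mm_min_ps (vx, vy));
  for (int i = 0; i < 4; i++)
    s[i] = x[i] < y[i] ? x[i] : y[i];	/* the folded form */
  for (int i = 0; i < 4; i++)
    if (r[i] != s[i])
      __builtin_abort ();
  puts ("minps matches x < y ? x : y");
  return 0;
}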
Diffstat (limited to 'gcc')
-rw-r--r--  gcc/config/i386/i386.cc                            | 195
-rw-r--r--  gcc/testsuite/gcc.target/i386/avx512f-pr116738-1.c |  56
-rw-r--r--  gcc/testsuite/gcc.target/i386/avx512f-pr116738-2.c |  15
3 files changed, 266 insertions(+), 0 deletions(-)
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index cfa84ed..ad2e7b4 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -18514,6 +18514,8 @@ ix86_fold_builtin (tree fndecl, int n_args,
= (enum ix86_builtins) DECL_MD_FUNCTION_CODE (fndecl);
enum rtx_code rcode;
bool is_vshift;
+ enum tree_code tcode;
+ bool is_scalar;
unsigned HOST_WIDE_INT mask;
switch (fn_code)
@@ -18963,6 +18965,131 @@ ix86_fold_builtin (tree fndecl, int n_args,
}
break;
+ case IX86_BUILTIN_MINSS:
+ case IX86_BUILTIN_MINSH_MASK:
+ tcode = LT_EXPR;
+ is_scalar = true;
+ goto do_minmax;
+
+ case IX86_BUILTIN_MAXSS:
+ case IX86_BUILTIN_MAXSH_MASK:
+ tcode = GT_EXPR;
+ is_scalar = true;
+ goto do_minmax;
+
+ case IX86_BUILTIN_MINPS:
+ case IX86_BUILTIN_MINPD:
+ case IX86_BUILTIN_MINPS256:
+ case IX86_BUILTIN_MINPD256:
+ case IX86_BUILTIN_MINPS512:
+ case IX86_BUILTIN_MINPD512:
+ case IX86_BUILTIN_MINPS128_MASK:
+ case IX86_BUILTIN_MINPD128_MASK:
+ case IX86_BUILTIN_MINPS256_MASK:
+ case IX86_BUILTIN_MINPD256_MASK:
+ case IX86_BUILTIN_MINPH128_MASK:
+ case IX86_BUILTIN_MINPH256_MASK:
+ case IX86_BUILTIN_MINPH512_MASK:
+ tcode = LT_EXPR;
+ is_scalar = false;
+ goto do_minmax;
+
+ case IX86_BUILTIN_MAXPS:
+ case IX86_BUILTIN_MAXPD:
+ case IX86_BUILTIN_MAXPS256:
+ case IX86_BUILTIN_MAXPD256:
+ case IX86_BUILTIN_MAXPS512:
+ case IX86_BUILTIN_MAXPD512:
+ case IX86_BUILTIN_MAXPS128_MASK:
+ case IX86_BUILTIN_MAXPD128_MASK:
+ case IX86_BUILTIN_MAXPS256_MASK:
+ case IX86_BUILTIN_MAXPD256_MASK:
+ case IX86_BUILTIN_MAXPH128_MASK:
+ case IX86_BUILTIN_MAXPH256_MASK:
+ case IX86_BUILTIN_MAXPH512_MASK:
+ tcode = GT_EXPR;
+ is_scalar = false;
+ do_minmax:
+ gcc_assert (n_args >= 2);
+ if (TREE_CODE (args[0]) != VECTOR_CST
+ || TREE_CODE (args[1]) != VECTOR_CST)
+ break;
+ mask = HOST_WIDE_INT_M1U;
+ if (n_args > 2)
+ {
+ gcc_assert (n_args >= 4);
+ /* This is masked minmax. */
+ if (TREE_CODE (args[3]) != INTEGER_CST
+ || TREE_SIDE_EFFECTS (args[2]))
+ break;
+ mask = TREE_INT_CST_LOW (args[3]);
+ unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0]));
+ mask |= HOST_WIDE_INT_M1U << elems;
+ if (mask != HOST_WIDE_INT_M1U
+ && TREE_CODE (args[2]) != VECTOR_CST)
+ break;
+ if (n_args >= 5)
+ {
+ if (!tree_fits_uhwi_p (args[4]))
+ break;
+ if (tree_to_uhwi (args[4]) != 4
+ && tree_to_uhwi (args[4]) != 8)
+ break;
+ }
+ if (mask == (HOST_WIDE_INT_M1U << elems))
+ return args[2];
+ }
+ /* Punt on NaNs, unless exceptions are disabled. */
+ if (HONOR_NANS (args[0])
+ && (n_args < 5 || tree_to_uhwi (args[4]) != 8))
+ for (int i = 0; i < 2; ++i)
+ {
+ unsigned count = vector_cst_encoded_nelts (args[i]);
+ for (unsigned j = 0; j < count; ++j)
+ if (tree_expr_nan_p (VECTOR_CST_ENCODED_ELT (args[i], j)))
+ return NULL_TREE;
+ }
+ {
+ tree res = const_binop (tcode,
+ truth_type_for (TREE_TYPE (args[0])),
+ args[0], args[1]);
+ if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST)
+ break;
+ res = fold_ternary (VEC_COND_EXPR, TREE_TYPE (args[0]), res,
+ args[0], args[1]);
+ if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST)
+ break;
+ if (mask != HOST_WIDE_INT_M1U)
+ {
+ unsigned nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0]));
+ vec_perm_builder sel (nelts, nelts, 1);
+ for (unsigned int i = 0; i < nelts; i++)
+ if (mask & (HOST_WIDE_INT_1U << i))
+ sel.quick_push (i);
+ else
+ sel.quick_push (nelts + i);
+ vec_perm_indices indices (sel, 2, nelts);
+ res = fold_vec_perm (TREE_TYPE (args[0]), res, args[2],
+ indices);
+ if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST)
+ break;
+ }
+ if (is_scalar)
+ {
+ unsigned nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0]));
+ vec_perm_builder sel (nelts, nelts, 1);
+ sel.quick_push (0);
+ for (unsigned int i = 1; i < nelts; i++)
+ sel.quick_push (nelts + i);
+ vec_perm_indices indices (sel, 2, nelts);
+ res = fold_vec_perm (TREE_TYPE (args[0]), res, args[0],
+ indices);
+ if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST)
+ break;
+ }
+ return res;
+ }
+
default:
break;
}
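The fold_vec_perm based blending in the hunk above mirrors what the
masked intrinsics do at run time: lanes with a set mask bit take
min (a, b), the rest take the pass-through operand.  A hypothetical
standalone sketch of that per-lane select (not part of the patch; the
values and mask are invented), for an AVX512F machine, built with
e.g. gcc -O2 -mavx512f:

#include <immintrin.h>

int
main (void)
{
  __m512 a = _mm512_set1_ps (1.0f);
  __m512 b = _mm512_set1_ps (2.0f);
  __m512 w = _mm512_set1_ps (9.0f);	/* pass-through operand */
  __mmask16 m = 0x00ff;			/* low eight lanes selected */
  float r[16];
  _mm512_storeu_ps (r, _mm512_mask_min_ps (w, m, a, b));
  for (int i = 0; i < 16; i++)
    {
      /* Scalar model of the fold: mask bit set -> a < b ? a : b,
	 otherwise the pass-through lane from w.  */
      float e = (m >> i) & 1 ? 1.0f : 9.0f;
      if (r[i] != e)
	__builtin_abort ();
    }
  return 0;
}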
@@ -19508,6 +19635,74 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
}
return true;
+ case IX86_BUILTIN_MINPS:
+ case IX86_BUILTIN_MINPD:
+ case IX86_BUILTIN_MINPS256:
+ case IX86_BUILTIN_MINPD256:
+ case IX86_BUILTIN_MINPS512:
+ case IX86_BUILTIN_MINPD512:
+ case IX86_BUILTIN_MINPS128_MASK:
+ case IX86_BUILTIN_MINPD128_MASK:
+ case IX86_BUILTIN_MINPS256_MASK:
+ case IX86_BUILTIN_MINPD256_MASK:
+ case IX86_BUILTIN_MINPH128_MASK:
+ case IX86_BUILTIN_MINPH256_MASK:
+ case IX86_BUILTIN_MINPH512_MASK:
+ tcode = LT_EXPR;
+ goto do_minmax;
+
+ case IX86_BUILTIN_MAXPS:
+ case IX86_BUILTIN_MAXPD:
+ case IX86_BUILTIN_MAXPS256:
+ case IX86_BUILTIN_MAXPD256:
+ case IX86_BUILTIN_MAXPS512:
+ case IX86_BUILTIN_MAXPD512:
+ case IX86_BUILTIN_MAXPS128_MASK:
+ case IX86_BUILTIN_MAXPD128_MASK:
+ case IX86_BUILTIN_MAXPS256_MASK:
+ case IX86_BUILTIN_MAXPD256_MASK:
+ case IX86_BUILTIN_MAXPH128_MASK:
+ case IX86_BUILTIN_MAXPH256_MASK:
+ case IX86_BUILTIN_MAXPH512_MASK:
+ tcode = GT_EXPR;
+ do_minmax:
+ gcc_assert (n_args >= 2);
+ /* Without SSE4.1 we often aren't able to pattern match it back to the
+ desired instruction. */
+ if (!gimple_call_lhs (stmt) || !optimize || !TARGET_SSE4_1)
+ break;
+ arg0 = gimple_call_arg (stmt, 0);
+ arg1 = gimple_call_arg (stmt, 1);
+ elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+ /* For masked minmax, only optimize if the mask is all ones. */
+ if (n_args > 2
+ && !ix86_masked_all_ones (elems, gimple_call_arg (stmt, 3)))
+ break;
+ if (n_args >= 5)
+ {
+ tree arg4 = gimple_call_arg (stmt, 4);
+ if (!tree_fits_uhwi_p (arg4))
+ break;
+ if (tree_to_uhwi (arg4) == 4)
+ /* Ok. */;
+ else if (tree_to_uhwi (arg4) != 8)
+ /* Invalid round argument. */
+ break;
+ else if (HONOR_NANS (arg0))
+ /* Lowering to comparison would raise exceptions which
+ shouldn't be raised. */
+ break;
+ }
+ {
+ tree type = truth_type_for (TREE_TYPE (arg0));
+ tree cmpres = gimple_build (&stmts, tcode, type, arg0, arg1);
+ gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+ g = gimple_build_assign (gimple_call_lhs (stmt),
+ VEC_COND_EXPR, cmpres, arg0, arg1);
+ gsi_replace (gsi, g, false);
+ }
+ return true;
+
default:
break;
}
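To see the effect of the GIMPLE folding, consider a hypothetical
example (not from the patch; the function name is invented): once the
builtin call is rewritten into a comparison plus VEC_COND_EXPR,
generic optimizers can simplify through it, so with identical operands
the min should reduce to a plain copy and the optimized dump should
contain no __builtin_ia32_minps256 call.  Try e.g.
gcc -O2 -mavx -fdump-tree-optimized:

#include <immintrin.h>

__m256
f (__m256 x)
{
  /* After folding this is (x < x ? x : x), i.e. just x.  */
  return _mm256_min_ps (x, x);
}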
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr116738-1.c b/gcc/testsuite/gcc.target/i386/avx512f-pr116738-1.c
new file mode 100644
index 0000000..9b58614
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr116738-1.c
@@ -0,0 +1,56 @@
+/* PR target/116738 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-not "__builtin_ia32_min" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "__builtin_ia32_max" "optimized" } } */
+
+#include <x86intrin.h>
+
+void
+test_pr116738 (void)
+{
+ __m512 a = _mm512_setr_ps (1.f, 2.f, 0.f, -0.f, -0.f, 0.f, 5.f, 6.f, 7.f,
+ 8.f, 9.f, 10.f, 11.f, -__builtin_inff (),
+ __builtin_inff (), -42.f);
+ __m512 b = _mm512_setr_ps (-0.f, 3.f, -0.f, 0.f, -0.f, 0.f, 5.f, 5.f, 8.f,
+ 7.f, 10.f, -9.f, 12.f, 0.f, -0.f, 42.f);
+ __m512 w = _mm512_setr_ps (4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 0.f, 1.f,
+ 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f);
+ __m512 c = _mm512_mask_min_ps (w, -1, a, b);
+ __m512 d = _mm512_mask_min_ps (w, 18658, a, b);
+ __m512 e = _mm512_mask_min_ps (w, 54649, a, b);
+ __m512 f = _mm512_mask_max_ps (w, -1, a, b);
+ __m512 g = _mm512_mask_max_ps (w, 18658, a, b);
+ __m512 h = _mm512_mask_max_ps (w, 54649, a, b);
+ __m128 i = _mm_setr_ps (1.f, 2.f, 0.f, -0.f);
+ __m128 j = _mm_setr_ps (-0.f, 3.f, -0.f, 0.f);
+ __m128 k = _mm_min_ss (i, j);
+ __m128 l = _mm_max_ss (j, i);
+ __m512 ce = _mm512_setr_ps (-0.f, 2.f, -0.f, 0.f, -0.f, 0.f, 5.f, 5.f, 7.f,
+ 7.f, 9.f, -9.f, 11.f, -__builtin_inff (),
+ -0.f, -42.f);
+ __m512 de = _mm512_setr_ps (4.f, 2.f, 6.f, 7.f, 8.f, 0.f, 5.f, 5.f, 1.f,
+ 2.f, 3.f, -9.f, 5.f, 6.f, -0.f, 8.f);
+ __m512 ee = _mm512_setr_ps (-0.f, 5.f, 6.f, 0.f, -0.f, 0.f, 5.f, 0.f, 7.f,
+ 2.f, 9.f, 4.f, 11.f, 6.f, -0.f, -42.f);
+ __m512 fe = _mm512_setr_ps (1.f, 3.f, -0.f, 0.f, -0.f, 0.f, 5.f, 6.f, 8.f,
+ 8.f, 10.f, 10.f, 12.f, 0.f, __builtin_inff (),
+ 42.f);
+ __m512 ge = _mm512_setr_ps (4.f, 3.f, 6.f, 7.f, 8.f, 0.f, 5.f, 6.f, 1.f,
+ 2.f, 3.f, 10.f, 5.f, 6.f, __builtin_inff (),
+ 8.f);
+ __m512 he = _mm512_setr_ps (1.f, 5.f, 6.f, 0.f, -0.f, 0.f, 5.f, 0.f, 8.f,
+ 2.f, 10.f, 4.f, 12.f, 6.f, __builtin_inff (),
+ 42.f);
+ __m128 ke = _mm_setr_ps (-0.f, 2.f, 0.f, -0.f);
+ __m128 le = _mm_setr_ps (1.f, 3.f, -0.f, 0.f);
+ if (__builtin_memcmp (&c, &ce, sizeof (c))
+ || __builtin_memcmp (&d, &de, sizeof (d))
+ || __builtin_memcmp (&e, &ee, sizeof (e))
+ || __builtin_memcmp (&f, &fe, sizeof (f))
+ || __builtin_memcmp (&g, &ge, sizeof (g))
+ || __builtin_memcmp (&h, &he, sizeof (h))
+ || __builtin_memcmp (&k, &ke, sizeof (k))
+ || __builtin_memcmp (&l, &le, sizeof (l)))
+ __builtin_abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr116738-2.c b/gcc/testsuite/gcc.target/i386/avx512f-pr116738-2.c
new file mode 100644
index 0000000..8a429a2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr116738-2.c
@@ -0,0 +1,15 @@
+/* PR target/116738 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-require-effective-target avx512f } */
+
+#define AVX512F
+#include "avx512f-helper.h"
+
+#include "avx512f-pr116738-1.c"
+
+void
+TEST (void)
+{
+ test_pr116738 ();
+}