author     Roger Sayle <roger@nextmovesoftware.com>    2024-02-01 06:10:42 +0000
committer  Roger Sayle <roger@nextmovesoftware.com>    2024-02-01 06:10:42 +0000
commit     2f14c0dbb789852947cb58fdf7d3162413f053fa (patch)
tree       87b48a81a1d7734aecb79533c601a42ca4b7bdef
parent     fd4829dde46b9836c40c9ab27bde98521e692119 (diff)
PR target/113560: Enhance is_widening_mult_rhs_p.
This patch resolves PR113560, a code quality regression from GCC 12
affecting x86_64, by enhancing the middle-end's tree-ssa-math-opts.cc
to recognize more instances of widening multiplications.

The widening multiplication perception code identifies cases like:

        _1 = (unsigned __int128) x;
        __res = _1 * 100;

but in the reported test case, the original input looks like:

        _1 = (unsigned long long) x;
        _2 = (unsigned __int128) _1;
        __res = _2 * 100;

which gets optimized by constant folding during tree-ssa to:

        _2 = x & 18446744073709551615;  // x & 0xffffffffffffffff
        __res = _2 * 100;

where the BIT_AND_EXPR hides (has consumed) the extension operation.
This reveals a more general deficiency (missed optimization opportunity)
in widening multiplication perception: additionally, both

        __int128 foo(__int128 x, __int128 y)
        {
          return (x & 1000) * (y & 1000);
        }

and

        unsigned __int128 bar(unsigned __int128 x, unsigned __int128 y)
        {
          return (x >> 80) * (y >> 80);
        }

should be recognized as widening multiplications.  Hence, rather than
test explicitly for BIT_AND_EXPR (as in the first version of this patch),
the more general solution is to make use of range information, as
provided by tree_non_zero_bits.

As a demonstration of the observed improvements, function foo above
currently compiles with -O2 on x86_64 to:

foo:    movq    %rdi, %rsi
        movq    %rdx, %r8
        xorl    %edi, %edi
        xorl    %r9d, %r9d
        andl    $1000, %esi
        andl    $1000, %r8d
        movq    %rdi, %rcx
        movq    %r9, %rdx
        imulq   %rsi, %rdx
        movq    %rsi, %rax
        imulq   %r8, %rcx
        addq    %rdx, %rcx
        mulq    %r8
        addq    %rdx, %rcx
        movq    %rcx, %rdx
        ret

With this patch, GCC recognizes the widening multiplication (w*) and
instead generates:

foo:    movq    %rdi, %rsi
        movq    %rdx, %r8
        andl    $1000, %esi
        andl    $1000, %r8d
        movq    %rsi, %rax
        imulq   %r8
        ret

which is perhaps easier to understand at the tree level, where

__int128 foo (__int128 x, __int128 y)
{
  __int128 _1;
  __int128 _2;
  __int128 _5;

  <bb 2> [local count: 1073741824]:
  _1 = x_3(D) & 1000;
  _2 = y_4(D) & 1000;
  _5 = _1 * _2;
  return _5;
}

gets transformed to:

__int128 foo (__int128 x, __int128 y)
{
  __int128 _1;
  __int128 _2;
  __int128 _5;
  signed long _7;
  signed long _8;

  <bb 2> [local count: 1073741824]:
  _1 = x_3(D) & 1000;
  _2 = y_4(D) & 1000;
  _7 = (signed long) _1;
  _8 = (signed long) _2;
  _5 = _7 w* _8;
  return _5;
}

2024-02-01  Roger Sayle  <roger@nextmovesoftware.com>
            Richard Biener  <rguenther@suse.de>

gcc/ChangeLog
        PR target/113560
        * tree-ssa-math-opts.cc (is_widening_mult_rhs_p): Use range
        information via tree_non_zero_bits to check if this operand is
        suitably extended for a widening (or highpart) multiplication.
        (convert_mult_to_widen): Insert explicit casts if the RHS or LHS
        isn't already of the claimed type.

gcc/testsuite/ChangeLog
        PR target/113560
        * g++.target/i386/pr113560.C: New test case.
        * gcc.target/i386/pr113560.c: Likewise.
        * gcc.dg/pr87954.c: Update test case.
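For readers who want a self-contained picture of the fit test that the patch
performs via tree_nonzero_bits, the sketch below is an editor's illustration,
not part of the patch: the helper name fits_half_precision and the plain
uint64_t masks are hypothetical stand-ins for GCC's wide_int machinery.  The
idea it models is the one described above: an operand of a prec-bit multiply
can feed a prec/2-bit widening multiply when every possibly-nonzero bit lies
in the low half (unsigned), or below the half type's sign bit (signed).

#include <stdint.h>
#include <stdio.h>

/* Return nonzero if an operand whose possibly-set bits are NONZERO_BITS can
   be treated as a PREC/2-bit value in a widening multiplication.
   PREC must be even and at most 64 in this sketch.  */
static int
fits_half_precision (uint64_t nonzero_bits, unsigned prec, int is_unsigned)
{
  unsigned hprec = prec / 2;
  /* Bits that must be known zero: the whole upper half for unsigned, and
     additionally the half type's sign bit for signed, so the value is
     non-negative and sign-extension from hprec bits is a no-op.  */
  unsigned low_bits = is_unsigned ? hprec : hprec - 1;
  uint64_t full_mask = prec == 64 ? ~(uint64_t) 0
                                  : (((uint64_t) 1 << prec) - 1);
  uint64_t must_be_zero = full_mask & ~(((uint64_t) 1 << low_bits) - 1);
  return (nonzero_bits & must_be_zero) == 0;
}

int
main (void)
{
  /* x & 1000 leaves only bits 3..9 possibly set, so both the signed and the
     unsigned 64-bit products of two such values can become 32x32->64
     widening multiplies, as in the foo/bar functions of the new test.  */
  printf ("%d\n", fits_half_precision (1000, 64, 1));              /* 1 */
  printf ("%d\n", fits_half_precision (1000, 64, 0));              /* 1 */
  /* An unsigned 64-bit value shifted right by 16 may still have bits
     32..47 set, so it does not fit in the 32-bit half type.  */
  printf ("%d\n", fits_half_precision (0xffffffffffffULL, 64, 1)); /* 0 */
  return 0;
}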
-rw-r--r--  gcc/testsuite/g++.target/i386/pr113560.C  19
-rw-r--r--  gcc/testsuite/gcc.dg/pr87954.c             2
-rw-r--r--  gcc/testsuite/gcc.target/i386/pr113560.c  17
-rw-r--r--  gcc/tree-ssa-math-opts.cc                  53
4 files changed, 84 insertions, 7 deletions
diff --git a/gcc/testsuite/g++.target/i386/pr113560.C b/gcc/testsuite/g++.target/i386/pr113560.C
new file mode 100644
index 0000000..179b68f
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr113560.C
@@ -0,0 +1,19 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-Ofast -std=c++23 -march=znver4" } */
+
+#include <immintrin.h>
+auto f(char *buf, unsigned long long in) noexcept
+{
+ unsigned long long hi{};
+ auto lo{_mulx_u64(in, 0x2af31dc462ull, &hi)};
+ lo = _mulx_u64(lo, 100, &hi);
+ __builtin_memcpy(buf + 2, &hi, 2);
+ return buf + 10;
+}
+
+/* { dg-final { scan-assembler-times "mulx" 1 } } */
+/* { dg-final { scan-assembler-times "mulq" 1 } } */
+/* { dg-final { scan-assembler-not "addq" } } */
+/* { dg-final { scan-assembler-not "adcq" } } */
+/* { dg-final { scan-assembler-not "salq" } } */
+/* { dg-final { scan-assembler-not "shldq" } } */
diff --git a/gcc/testsuite/gcc.dg/pr87954.c b/gcc/testsuite/gcc.dg/pr87954.c
index 620657c..80d0543 100644
--- a/gcc/testsuite/gcc.dg/pr87954.c
+++ b/gcc/testsuite/gcc.dg/pr87954.c
@@ -18,4 +18,4 @@ imul(unsigned int flags)
return type_dma + (is_rec * !is_dma) * KMALLOC_RECLAIM;
}
-/* { dg-final { scan-tree-dump-times { \* } 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times { \*w? |WIDEN_MULT_PLUS_EXPR} 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr113560.c b/gcc/testsuite/gcc.target/i386/pr113560.c
new file mode 100644
index 0000000..ac2e01a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr113560.c
@@ -0,0 +1,17 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2" } */
+
+unsigned __int128 foo(unsigned __int128 x, unsigned __int128 y)
+{
+ return (x & 1000) * (y & 1000);
+}
+
+__int128 bar(__int128 x, __int128 y)
+{
+ return (x & 1000) * (y & 1000);
+}
+
+/* { dg-final { scan-assembler-times "\tmulq" 1 } } */
+/* { dg-final { scan-assembler-times "\timulq" 1 } } */
+/* { dg-final { scan-assembler-not "addq" } } */
+/* { dg-final { scan-assembler-not "xorl" } } */
diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
index 2db26e4..cffe757 100644
--- a/gcc/tree-ssa-math-opts.cc
+++ b/gcc/tree-ssa-math-opts.cc
@@ -2552,12 +2552,47 @@ is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
if (TREE_CODE (rhs) == SSA_NAME)
{
+ /* Use tree_non_zero_bits to see if this operand is zero_extended
+ for unsigned widening multiplications or non-negative for
+ signed widening multiplications. */
+ if (TREE_CODE (type) == INTEGER_TYPE
+ && (TYPE_PRECISION (type) & 1) == 0
+ && int_mode_for_size (TYPE_PRECISION (type) / 2, 1).exists ())
+ {
+ unsigned int prec = TYPE_PRECISION (type);
+ unsigned int hprec = prec / 2;
+ wide_int bits = wide_int::from (tree_nonzero_bits (rhs), prec,
+ TYPE_SIGN (TREE_TYPE (rhs)));
+ if (TYPE_UNSIGNED (type)
+ && wi::bit_and (bits, wi::mask (hprec, true, prec)) == 0)
+ {
+ *type_out = build_nonstandard_integer_type (hprec, true);
+ /* X & MODE_MASK can be simplified to (T)X. */
+ stmt = SSA_NAME_DEF_STMT (rhs);
+ if (is_gimple_assign (stmt)
+ && gimple_assign_rhs_code (stmt) == BIT_AND_EXPR
+ && TREE_CODE (gimple_assign_rhs2 (stmt)) == INTEGER_CST
+ && wi::to_wide (gimple_assign_rhs2 (stmt))
+ == wi::mask (hprec, false, prec))
+ *new_rhs_out = gimple_assign_rhs1 (stmt);
+ else
+ *new_rhs_out = rhs;
+ return true;
+ }
+ else if (!TYPE_UNSIGNED (type)
+ && wi::bit_and (bits, wi::mask (hprec - 1, true, prec)) == 0)
+ {
+ *type_out = build_nonstandard_integer_type (hprec, false);
+ *new_rhs_out = rhs;
+ return true;
+ }
+ }
+
stmt = SSA_NAME_DEF_STMT (rhs);
if (is_gimple_assign (stmt))
{
- if (! widening_mult_conversion_strippable_p (type, stmt))
- rhs1 = rhs;
- else
+
+ if (widening_mult_conversion_strippable_p (type, stmt))
{
rhs1 = gimple_assign_rhs1 (stmt);
@@ -2568,6 +2603,8 @@ is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
return true;
}
}
+ else
+ rhs1 = rhs;
}
else
rhs1 = rhs;
@@ -2827,12 +2864,16 @@ convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi)
if (2 * actual_precision > TYPE_PRECISION (type))
return false;
if (actual_precision != TYPE_PRECISION (type1)
- || from_unsigned1 != TYPE_UNSIGNED (type1))
+ || from_unsigned1 != TYPE_UNSIGNED (type1)
+ || (TREE_TYPE (rhs1) != type1
+ && TREE_CODE (rhs1) != INTEGER_CST))
rhs1 = build_and_insert_cast (gsi, loc,
build_nonstandard_integer_type
(actual_precision, from_unsigned1), rhs1);
if (actual_precision != TYPE_PRECISION (type2)
- || from_unsigned2 != TYPE_UNSIGNED (type2))
+ || from_unsigned2 != TYPE_UNSIGNED (type2)
+ || (TREE_TYPE (rhs2) != type2
+ && TREE_CODE (rhs2) != INTEGER_CST))
rhs2 = build_and_insert_cast (gsi, loc,
build_nonstandard_integer_type
(actual_precision, from_unsigned2), rhs2);
@@ -3046,7 +3087,7 @@ convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
|| from_unsigned1 != TYPE_UNSIGNED (type1))
mult_rhs1 = build_and_insert_cast (gsi, loc,
build_nonstandard_integer_type
- (actual_precision, from_unsigned1),
+ (actual_precision, from_unsigned1),
mult_rhs1);
if (actual_precision != TYPE_PRECISION (type2)
|| from_unsigned2 != TYPE_UNSIGNED (type2))