Introduce smul_highpart and umul_highpart RTX for high-part multiplications

This patch introduces new RTX codes to allow the RTL passes and backends to consistently represent high-part multiplications. Currently, the RTL used by different backends for expanding smul<mode>3_highpart and umul<mode>3_highpart varies greatly, with many but not all choosing to express this as something like: (define_insn "smuldi3_highpart" [(set (match_operand:DI 0 "nvptx_register_operand" "=R") (truncate:DI (lshiftrt:TI (mult:TI (sign_extend:TI (match_operand:DI 1 "nvptx_register_operand" "R")) (sign_extend:TI (match_operand:DI 2 "nvptx_register_operand" "R"))) (const_int 64))))] "" "%.\\tmul.hi.s64\\t%0, %1, %2;") One complication with using this "widening multiplication" representation is that it requires an intermediate in a wider mode, making it difficult or impossible to encode a high-part multiplication of the widest supported integer mode. A second is that it can interfere with optimization; for example simplify-rtx.c contains the comment: case TRUNCATE: /* Don't optimize (lshiftrt (mult ...)) as it would interfere with the umulXi3_highpart patterns. */ Hopefully these problems are solved (or reduced) by introducing a new canonical form for high-part multiplications in RTL passes. This also simplifies insn patterns when one operand is constant. Whilst implementing some constant folding simplifications and compile-time evaluation of these new RTX codes, I noticed that this functionality could also be added for the existing saturating arithmetic RTX codes. Then likewise when documenting these new RTX codes, I also took the opportunity to silence the @xref warnings in invoke.texi. 2021-10-07 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * rtl.def (SMUL_HIGHPART, UMUL_HIGHPART): New RTX codes for representing signed and unsigned high-part multiplication resp. * simplify-rtx.c (simplify_binary_operation_1) [SMUL_HIGHPART, UMUL_HIGHPART]: Simplify high-part multiplications by zero. 
[SS_PLUS, US_PLUS, SS_MINUS, US_MINUS, SS_MULT, US_MULT, SS_DIV, US_DIV]: Similar simplifications for saturating arithmetic. (simplify_const_binary_operation) [SS_PLUS, US_PLUS, SS_MINUS, US_MINUS, SS_MULT, US_MULT, SMUL_HIGHPART, UMUL_HIGHPART]: Implement compile-time evaluation for constant operands. * dwarf2out.c (mem_loc_descriptor): Skip SMUL_HIGHPART and UMUL_HIGHPART. * doc/rtl.texi (smul_highpart, umul_highpart): Document RTX codes. * doc/md.texi (smul@var{m}3_highpart, umul@var{m}3_highpart): Mention the new smul_highpart and umul_highpart RTX codes. * doc/invoke.texi: Silence @xref "compilation" warnings. gcc/testsuite/ChangeLog * gcc.target/i386/sse2-mmx-paddsb-2.c: New test case. * gcc.target/i386/sse2-mmx-paddusb-2.c: New test case. * gcc.target/i386/sse2-mmx-psubsb-2.c: New test case. * gcc.target/i386/sse2-mmx-psubusb-2.c: New test case.
author: Roger Sayle <roger@nextmovesoftware.com> 2021-10-07 15:42:09 +0100
committer: Roger Sayle <roger@nextmovesoftware.com> 2021-10-07 15:42:09 +0100
commit: 555fa3545efe23393ff21fe0928aa3942e1b90ed (patch)
tree: 8964ea5feb3382bf50c79d57766a3bb831a5e9f3 /gcc/testsuite
parent: 1a7d452c092be42a892d00c19561af10f42410b0 (diff)
download: gcc-555fa3545efe23393ff21fe0928aa3942e1b90ed.zip
gcc-555fa3545efe23393ff21fe0928aa3942e1b90ed.tar.gz
gcc-555fa3545efe23393ff21fe0928aa3942e1b90ed.tar.bz2
4 files changed, 116 insertions, 0 deletions
diff --git a/gcc/testsuite/gcc.target/i386/sse2-mmx-paddsb-2.c b/gcc/testsuite/gcc.target/i386/sse2-mmx-paddsb-2.c
new file mode 100644
index 0000000..c677884
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-mmx-paddsb-2.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef char v8qi __attribute__ ((vector_size (8)));
+
+char foo()  /* In-range case: 1 + 2 needs no saturation.  */
+{
+  v8qi tx = { 1, 0, 0, 0, 0, 0, 0, 0 };
+  v8qi ty = { 2, 0, 0, 0, 0, 0, 0, 0 };
+  v8qi t = __builtin_ia32_paddsb(tx, ty);  /* Constant operands only.  */
+  return t[0];  /* Matches the dg-final scan for "movl $3".  */
+}
+
+char bar()  /* Positive overflow case: 100 + 100 exceeds 127.  */
+{
+  v8qi tx = { 100, 0, 0, 0, 0, 0, 0, 0 };
+  v8qi ty = { 100, 0, 0, 0, 0, 0, 0, 0 };
+  v8qi t = __builtin_ia32_paddsb(tx, ty);  /* Constant operands only.  */
+  return t[0];  /* Matches the dg-final scan for "movl $127".  */
+}
+
+char baz()  /* Negative overflow case: -100 + -100 is below -128.  */
+{
+  v8qi tx = { -100, 0, 0, 0, 0, 0, 0, 0 };
+  v8qi ty = { -100, 0, 0, 0, 0, 0, 0, 0 };
+  v8qi t = __builtin_ia32_paddsb(tx, ty);  /* Constant operands only.  */
+  return t[0];  /* Matches the dg-final scan for "movl $-128".  */
+}
+
+/* { dg-final { scan-assembler-times "movl\[ \\t\]+\\\$3," 1 } } */
+/* { dg-final { scan-assembler-times "movl\[ \\t\]+\\\$127," 1 } } */
+/* { dg-final { scan-assembler-times "movl\[ \\t\]+\\\$-128," 1 } } */
+/* { dg-final { scan-assembler-not "paddsb\[ \\t\]+%xmm\[0-9\]+" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-mmx-paddusb-2.c b/gcc/testsuite/gcc.target/i386/sse2-mmx-paddusb-2.c
new file mode 100644
index 0000000..b20891c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-mmx-paddusb-2.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef char v8qi __attribute__ ((vector_size (8)));
+
+char foo()  /* In-range case: 1 + 2 needs no saturation.  */
+{
+  v8qi tx = { 1, 0, 0, 0, 0, 0, 0, 0 };
+  v8qi ty = { 2, 0, 0, 0, 0, 0, 0, 0 };
+  v8qi t = __builtin_ia32_paddusb(tx, ty);  /* Constant operands only.  */
+  return t[0];  /* Matches the dg-final scan for "movl $3".  */
+}
+
+char bar()  /* Overflow case: 200 + 200 does not fit in one byte.  */
+{
+  v8qi tx = { 200, 0, 0, 0, 0, 0, 0, 0 };  /* NOTE(review): 200 > SCHAR_MAX.  */
+  v8qi ty = { 200, 0, 0, 0, 0, 0, 0, 0 };  /* Presumably relies on byte 0xC8.  */
+  v8qi t = __builtin_ia32_paddusb(tx, ty);  /* Constant operands only.  */
+  return t[0];  /* Matches the dg-final scan for "movl $-1".  */
+}
+
+/* { dg-final { scan-assembler-times "movl\[ \\t\]+\\\$3," 1 } } */
+/* { dg-final { scan-assembler-times "movl\[ \\t\]+\\\$-1," 1 } } */
+/* { dg-final { scan-assembler-not "paddusb\[ \\t\]+%xmm\[0-9\]+" } } */
+
diff --git a/gcc/testsuite/gcc.target/i386/sse2-mmx-psubsb-2.c b/gcc/testsuite/gcc.target/i386/sse2-mmx-psubsb-2.c
new file mode 100644
index 0000000..4fc2920
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-mmx-psubsb-2.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef char v8qi __attribute__ ((vector_size (8)));
+
+char foo()  /* In-range case: 5 - 2 needs no saturation.  */
+{
+  v8qi tx = { 5, 0, 0, 0, 0, 0, 0, 0 };
+  v8qi ty = { 2, 0, 0, 0, 0, 0, 0, 0 };
+  v8qi t = __builtin_ia32_psubsb(tx, ty);  /* Constant operands only.  */
+  return t[0];  /* Matches the dg-final scan for "movl $3".  */
+}
+
+char bar()  /* Negative overflow case: -100 - 100 is below -128.  */
+{
+  v8qi tx = { -100, 0, 0, 0, 0, 0, 0, 0 };
+  v8qi ty = { 100, 0, 0, 0, 0, 0, 0, 0 };
+  v8qi t = __builtin_ia32_psubsb(tx, ty);  /* Constant operands only.  */
+  return t[0];  /* Matches the dg-final scan for "movl $-128".  */
+}
+
+char baz()  /* Positive overflow case: 100 - -100 exceeds 127.  */
+{
+  v8qi tx = { 100, 0, 0, 0, 0, 0, 0, 0 };
+  v8qi ty = { -100, 0, 0, 0, 0, 0, 0, 0 };
+  v8qi t = __builtin_ia32_psubsb(tx, ty);  /* Constant operands only.  */
+  return t[0];  /* Matches the dg-final scan for "movl $127".  */
+}
+
+/* { dg-final { scan-assembler-times "movl\[ \\t\]+\\\$3," 1 } } */
+/* { dg-final { scan-assembler-times "movl\[ \\t\]+\\\$-128," 1 } } */
+/* { dg-final { scan-assembler-times "movl\[ \\t\]+\\\$127," 1 } } */
+/* { dg-final { scan-assembler-not "psubsb\[ \\t\]+%xmm\[0-9\]+" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-mmx-psubusb-2.c b/gcc/testsuite/gcc.target/i386/sse2-mmx-psubusb-2.c
new file mode 100644
index 0000000..5fc58ef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-mmx-psubusb-2.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef char v8qi __attribute__ ((vector_size (8)));
+
+char foo()  /* In-range case: 5 - 2 needs no saturation.  */
+{
+  v8qi tx = { 5, 0, 0, 0, 0, 0, 0, 0 };
+  v8qi ty = { 2, 0, 0, 0, 0, 0, 0, 0 };
+  v8qi t = __builtin_ia32_psubusb(tx, ty);  /* Constant operands only.  */
+  return t[0];  /* Matches the dg-final scan for "movl $3".  */
+}
+
+char bar()  /* Underflow case: 100 - 200 is below zero.  */
+{
+  v8qi tx = { 100, 0, 0, 0, 0, 0, 0, 0 };
+  v8qi ty = { 200, 0, 0, 0, 0, 0, 0, 0 };  /* NOTE(review): 200 > SCHAR_MAX.  */
+  v8qi t = __builtin_ia32_psubusb(tx, ty);  /* Constant operands only.  */
+  return t[0];  /* Presumably the zero result is what the "xorl" scan matches.  */
+}
+
+/* { dg-final { scan-assembler-times "movl\[ \\t\]+\\\$3," 1 } } */
+/* { dg-final { scan-assembler-times "xorl\[ \\t\]+" 1 } } */
+/* { dg-final { scan-assembler-not "psubusb\[ \\t\]+%xmm\[0-9\]+" } } */
+
author	Roger Sayle <roger@nextmovesoftware.com>	2021-10-07 15:42:09 +0100
committer	Roger Sayle <roger@nextmovesoftware.com>	2021-10-07 15:42:09 +0100
commit	555fa3545efe23393ff21fe0928aa3942e1b90ed (patch)
tree	8964ea5feb3382bf50c79d57766a3bb831a5e9f3 /gcc/testsuite
parent	1a7d452c092be42a892d00c19561af10f42410b0 (diff)
download	gcc-555fa3545efe23393ff21fe0928aa3942e1b90ed.zip gcc-555fa3545efe23393ff21fe0928aa3942e1b90ed.tar.gz gcc-555fa3545efe23393ff21fe0928aa3942e1b90ed.tar.bz2