aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBill Schmidt <wschmidt@linux.vnet.ibm.com>2017-09-05 19:41:55 +0000
committerWilliam Schmidt <wschmidt@gcc.gnu.org>2017-09-05 19:41:55 +0000
commit55b2ce1c919ae1a5460f8bfd105f64155853d701 (patch)
treead2c38fccd0db73de054ee8975fbf8d7f37788a6
parente5fc06a3d339c7db391f199c51354f61a033b555 (diff)
downloadgcc-55b2ce1c919ae1a5460f8bfd105f64155853d701.zip
gcc-55b2ce1c919ae1a5460f8bfd105f64155853d701.tar.gz
gcc-55b2ce1c919ae1a5460f8bfd105f64155853d701.tar.bz2
re PR target/81833 (PowerPC: VSX: Miscompiles ffmpeg's scalarproduct_int16_vsx at -O1)
[gcc] 2017-09-05 Bill Schmidt <wschmidt@linux.vnet.ibm.com> PR target/81833 * config/rs6000/altivec.md (altivec_vsum2sws): Convert from a define_insn to a define_expand. (altivec_vsum2sws_direct): New define_insn. (altivec_vsumsws): Convert from a define_insn to a define_expand. [gcc/testsuite] 2017-09-05 Bill Schmidt <wschmidt@linux.vnet.ibm.com> PR target/81833 * gcc.target/powerpc/pr81833-1.c: New file. * gcc.target/powerpc/pr81833-2.c: New file. From-SVN: r251723
-rw-r--r--gcc/ChangeLog8
-rw-r--r--gcc/config/rs6000/altivec.md78
-rw-r--r--gcc/testsuite/ChangeLog6
-rw-r--r--gcc/testsuite/gcc.target/powerpc/pr81833-1.c59
-rw-r--r--gcc/testsuite/gcc.target/powerpc/pr81833-2.c59
5 files changed, 176 insertions, 34 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 99a3a76..158b524 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,11 @@
+2017-09-05 Bill Schmidt <wschmidt@linux.vnet.ibm.com>
+
+ PR target/81833
+ * config/rs6000/altivec.md (altivec_vsum2sws): Convert from a
+ define_insn to a define_expand.
+ (altivec_vsum2sws_direct): New define_insn.
+ (altivec_vsumsws): Convert from a define_insn to a define_expand.
+
2017-09-05 Wilco Dijkstra <wdijkstr@arm.com>
* config/arm/arm.c (arm_option_params_internal): Improve setting of
diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md
index c9dddda..0aa1e30 100644
--- a/gcc/config/rs6000/altivec.md
+++ b/gcc/config/rs6000/altivec.md
@@ -1804,51 +1804,61 @@
"vsum4s<VI_char>s %0,%1,%2"
[(set_attr "type" "veccomplex")])
-;; FIXME: For the following two patterns, the scratch should only be
-;; allocated for !VECTOR_ELT_ORDER_BIG, and the instructions should
-;; be emitted separately.
-(define_insn "altivec_vsum2sws"
- [(set (match_operand:V4SI 0 "register_operand" "=v")
- (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")
- (match_operand:V4SI 2 "register_operand" "v")]
- UNSPEC_VSUM2SWS))
- (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))
- (clobber (match_scratch:V4SI 3 "=v"))]
+(define_expand "altivec_vsum2sws"
+ [(use (match_operand:V4SI 0 "register_operand"))
+ (use (match_operand:V4SI 1 "register_operand"))
+ (use (match_operand:V4SI 2 "register_operand"))]
"TARGET_ALTIVEC"
{
if (VECTOR_ELT_ORDER_BIG)
- return "vsum2sws %0,%1,%2";
+ emit_insn (gen_altivec_vsum2sws_direct (operands[0], operands[1],
+ operands[2]));
else
- return "vsldoi %3,%2,%2,12\n\tvsum2sws %3,%1,%3\n\tvsldoi %0,%3,%3,4";
-}
- [(set_attr "type" "veccomplex")
- (set (attr "length")
- (if_then_else
- (match_test "VECTOR_ELT_ORDER_BIG")
- (const_string "4")
- (const_string "12")))])
-
-(define_insn "altivec_vsumsws"
+ {
+ rtx tmp1 = gen_reg_rtx (V4SImode);
+ rtx tmp2 = gen_reg_rtx (V4SImode);
+ emit_insn (gen_altivec_vsldoi_v4si (tmp1, operands[2],
+ operands[2], GEN_INT (12)));
+ emit_insn (gen_altivec_vsum2sws_direct (tmp2, operands[1], tmp1));
+ emit_insn (gen_altivec_vsldoi_v4si (operands[0], tmp2, tmp2,
+ GEN_INT (4)));
+ }
+ DONE;
+})
+
+; FIXME: This can probably be expressed without an UNSPEC.
+(define_insn "altivec_vsum2sws_direct"
[(set (match_operand:V4SI 0 "register_operand" "=v")
(unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")
- (match_operand:V4SI 2 "register_operand" "v")]
- UNSPEC_VSUMSWS))
- (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))
- (clobber (match_scratch:V4SI 3 "=v"))]
+ (match_operand:V4SI 2 "register_operand" "v")]
+ UNSPEC_VSUM2SWS))
+ (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))]
+ "TARGET_ALTIVEC"
+ "vsum2sws %0,%1,%2"
+ [(set_attr "type" "veccomplex")])
+
+(define_expand "altivec_vsumsws"
+ [(use (match_operand:V4SI 0 "register_operand"))
+ (use (match_operand:V4SI 1 "register_operand"))
+ (use (match_operand:V4SI 2 "register_operand"))]
"TARGET_ALTIVEC"
{
if (VECTOR_ELT_ORDER_BIG)
- return "vsumsws %0,%1,%2";
+ emit_insn (gen_altivec_vsumsws_direct (operands[0], operands[1],
+ operands[2]));
else
- return "vspltw %3,%2,0\n\tvsumsws %3,%1,%3\n\tvsldoi %0,%3,%3,12";
-}
- [(set_attr "type" "veccomplex")
- (set (attr "length")
- (if_then_else
- (match_test "(VECTOR_ELT_ORDER_BIG)")
- (const_string "4")
- (const_string "12")))])
+ {
+ rtx tmp1 = gen_reg_rtx (V4SImode);
+ rtx tmp2 = gen_reg_rtx (V4SImode);
+ emit_insn (gen_altivec_vspltw_direct (tmp1, operands[2], const0_rtx));
+ emit_insn (gen_altivec_vsumsws_direct (tmp2, operands[1], tmp1));
+ emit_insn (gen_altivec_vsldoi_v4si (operands[0], tmp2, tmp2,
+ GEN_INT (12)));
+ }
+ DONE;
+})
+; FIXME: This can probably be expressed without an UNSPEC.
(define_insn "altivec_vsumsws_direct"
[(set (match_operand:V4SI 0 "register_operand" "=v")
(unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index a34c692..8d945de 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,9 @@
+2017-09-05 Bill Schmidt <wschmidt@linux.vnet.ibm.com>
+
+ PR target/81833
+ * gcc.target/powerpc/pr81833-1.c: New file.
+ * gcc.target/powerpc/pr81833-2.c: New file.
+
2017-09-05 H.J. Lu <hongjiu.lu@intel.com>
PR target/59501
diff --git a/gcc/testsuite/gcc.target/powerpc/pr81833-1.c b/gcc/testsuite/gcc.target/powerpc/pr81833-1.c
new file mode 100644
index 0000000..ca5ce00
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr81833-1.c
@@ -0,0 +1,59 @@
+/* PR81833: This used to fail due to improper implementation of vec_msum. */
+/* Test case relies on -mcpu=power7 or later. Currently we don't have
+ machinery to express that, so we have two separate tests for -mcpu=power7
+ and -mcpu=power8 to catch 32-bit BE on P7 and 64-bit BE/LE on P8. */
+
+/* { dg-do run } */
+/* { dg-require-effective-target p8vector_hw } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O2" } */
+
+#include <altivec.h>
+
+#define vec_u8 vector unsigned char
+#define vec_s8 vector signed char
+#define vec_u16 vector unsigned short
+#define vec_s16 vector signed short
+#define vec_u32 vector unsigned int
+#define vec_s32 vector signed int
+#define vec_f vector float
+
+#define LOAD_ZERO const vec_u8 zerov = vec_splat_u8 (0)
+
+#define zero_u8v (vec_u8) zerov
+#define zero_s8v (vec_s8) zerov
+#define zero_u16v (vec_u16) zerov
+#define zero_s16v (vec_s16) zerov
+#define zero_u32v (vec_u32) zerov
+#define zero_s32v (vec_s32) zerov
+
+signed int __attribute__((noinline))
+scalarproduct_int16_vsx (const signed short *v1, const signed short *v2,
+ int order)
+{
+ int i;
+ LOAD_ZERO;
+ register vec_s16 vec1;
+ register vec_s32 res = vec_splat_s32 (0), t;
+ signed int ires;
+
+ for (i = 0; i < order; i += 8) {
+ vec1 = vec_vsx_ld (0, v1);
+ t = vec_msum (vec1, vec_vsx_ld (0, v2), zero_s32v);
+ res = vec_sums (t, res);
+ v1 += 8;
+ v2 += 8;
+ }
+ res = vec_splat (res, 3);
+ vec_ste (res, 0, &ires);
+
+ return ires;
+}
+
+int main(void)
+{
+ const signed short test_vec[] = { 1, 1, 1, 1, 1, 1, 1, 1 };
+ if (scalarproduct_int16_vsx (test_vec, test_vec, 8) != 8)
+ __builtin_abort ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/pr81833-2.c b/gcc/testsuite/gcc.target/powerpc/pr81833-2.c
new file mode 100644
index 0000000..e928683
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr81833-2.c
@@ -0,0 +1,59 @@
+/* PR81833: This used to fail due to improper implementation of vec_msum. */
+/* Test case relies on -mcpu=power7 or later. Currently we don't have
+ machinery to express that, so we have two separate tests for -mcpu=power7
+ and -mcpu=power8 to catch 32-bit BE on P7 and 64-bit BE/LE on P8. */
+
+/* { dg-do run } */
+/* { dg-require-effective-target vsx_hw } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power7" } } */
+/* { dg-options "-mcpu=power7 -O2" } */
+
+#include <altivec.h>
+
+#define vec_u8 vector unsigned char
+#define vec_s8 vector signed char
+#define vec_u16 vector unsigned short
+#define vec_s16 vector signed short
+#define vec_u32 vector unsigned int
+#define vec_s32 vector signed int
+#define vec_f vector float
+
+#define LOAD_ZERO const vec_u8 zerov = vec_splat_u8 (0)
+
+#define zero_u8v (vec_u8) zerov
+#define zero_s8v (vec_s8) zerov
+#define zero_u16v (vec_u16) zerov
+#define zero_s16v (vec_s16) zerov
+#define zero_u32v (vec_u32) zerov
+#define zero_s32v (vec_s32) zerov
+
+signed int __attribute__((noinline))
+scalarproduct_int16_vsx (const signed short *v1, const signed short *v2,
+ int order)
+{
+ int i;
+ LOAD_ZERO;
+ register vec_s16 vec1;
+ register vec_s32 res = vec_splat_s32 (0), t;
+ signed int ires;
+
+ for (i = 0; i < order; i += 8) {
+ vec1 = vec_vsx_ld (0, v1);
+ t = vec_msum (vec1, vec_vsx_ld (0, v2), zero_s32v);
+ res = vec_sums (t, res);
+ v1 += 8;
+ v2 += 8;
+ }
+ res = vec_splat (res, 3);
+ vec_ste (res, 0, &ires);
+
+ return ires;
+}
+
+int main(void)
+{
+ const signed short test_vec[] = { 1, 1, 1, 1, 1, 1, 1, 1 };
+ if (scalarproduct_int16_vsx (test_vec, test_vec, 8) != 8)
+ __builtin_abort ();
+ return 0;
+}