aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTom de Vries <tom@codesourcery.com>2017-07-11 12:25:01 +0000
committerTom de Vries <vries@gcc.gnu.org>2017-07-11 12:25:01 +0000
commit43c371e8b0dc21d3b1b6d87cc7f29d6d53ae5d82 (patch)
tree51a2189c41e1bf5a58a6b79774b0005b57b73b25
parent3d36348a32186307ce50d748834df29abda2a951 (diff)
downloadgcc-43c371e8b0dc21d3b1b6d87cc7f29d6d53ae5d82.zip
gcc-43c371e8b0dc21d3b1b6d87cc7f29d6d53ae5d82.tar.gz
gcc-43c371e8b0dc21d3b1b6d87cc7f29d6d53ae5d82.tar.bz2
Add extra initialization of broadcasted condition variables
2017-07-11 Tom de Vries <tom@codesourcery.com> * config/nvptx/nvptx.c (WORKAROUND_PTXJIT_BUG): New macro. (bb_first_real_insn): New function. (nvptx_single): Add extra initialization of broadcasted condition variables. From-SVN: r250129
-rw-r--r--gcc/ChangeLog7
-rw-r--r--gcc/config/nvptx/nvptx.c53
2 files changed, 60 insertions, 0 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 9755597..c960cd7 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2017-07-11 Tom de Vries <tom@codesourcery.com>
+
+ * config/nvptx/nvptx.c (WORKAROUND_PTXJIT_BUG): New macro.
+ (bb_first_real_insn): New function.
+ (nvptx_single): Add extra initialization of broadcasted condition
+ variables.
+
2017-07-11 Nathan Sidwell <nathan@acm.org>
* dwarf2out.c (gen_member_die): Remove useless check for anon ctors.
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index daeec27..c8847a5 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -74,6 +74,8 @@
/* This file should be included last. */
#include "target-def.h"
+#define WORKAROUND_PTXJIT_BUG 1
+
/* The various PTX memory areas an object might reside in. */
enum nvptx_data_area
{
@@ -3844,6 +3846,24 @@ nvptx_wsync (bool after)
return gen_nvptx_barsync (GEN_INT (after));
}
+#if WORKAROUND_PTXJIT_BUG
+/* Return the first real insn (one satisfying INSN_P) in BB, or a null
+ pointer if BB does not contain any real insns. */
+
+static rtx_insn *
+bb_first_real_insn (basic_block bb)
+{
+ rtx_insn *insn;
+
+ /* Scan the insns of BB and return at the first real one. */
+ FOR_BB_INSNS (bb, insn)
+ if (INSN_P (insn))
+ return insn;
+
+ return 0;
+}
+#endif
+
/* Single neutering according to MASK. FROM is the incoming block and
TO is the outgoing block. These may be the same block. Insert at
start of FROM:
@@ -3958,6 +3978,39 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
{
/* Vector mode only, do a shuffle. */
+#if WORKAROUND_PTXJIT_BUG
+ /* The branch condition %rcond is propagated like this:
+
+ {
+ .reg .u32 %x;
+ mov.u32 %x,%tid.x;
+ setp.ne.u32 %rnotvzero,%x,0;
+ }
+
+ @%rnotvzero bra Lskip;
+ setp.<op>.<type> %rcond,op1,op2;
+ Lskip:
+ selp.u32 %rcondu32,1,0,%rcond;
+ shfl.idx.b32 %rcondu32,%rcondu32,0,31;
+ setp.ne.u32 %rcond,%rcondu32,0;
+
+ There seems to be a bug in the ptx JIT compiler (observed at driver
+ version 381.22, at -O1 and higher for sm_61), that drops the shfl
+ unless %rcond is initialized to something before 'bra Lskip'. The
+ bug is not observed with ptxas from cuda 8.0.61.
+
+ It is true that the code is non-trivial: at Lskip, %rcond is
+ uninitialized in threads 1-31, and after the selp the same holds
+ for %rcondu32. But shfl propagates the defined value in thread 0
+ to threads 1-31, so after the shfl %rcondu32 is defined in threads
+ 0-31, and after the setp.ne %rcond is defined in threads 0-31.
+
+ There is nothing in the PTX spec to suggest that this is wrong, or
+ to explain why the extra initialization is needed. So, we classify
+ it as a JIT bug, and the extra initialization as workaround. */
+ emit_insn_before (gen_movbi (pvar, const0_rtx),
+ bb_first_real_insn (from));
+#endif
emit_insn_before (nvptx_gen_vcast (pvar), tail);
}
else