diff options
author | Thomas Schwinge <tschwinge@baylibre.com> | 2024-05-10 12:50:23 +0200 |
---|---|---|
committer | Thomas Schwinge <tschwinge@baylibre.com> | 2024-06-06 13:41:46 +0200 |
commit | b4e68dd9084e48ee3e83c11d7f27548d8cca7066 (patch) | |
tree | 0f55fe080dc99174fe265d9cc4a6f46a900dbbef | |
parent | 395ac0417a17ba6405873f891f895417d696b603 (diff) | |
download | gcc-b4e68dd9084e48ee3e83c11d7f27548d8cca7066.zip gcc-b4e68dd9084e48ee3e83c11d7f27548d8cca7066.tar.gz gcc-b4e68dd9084e48ee3e83c11d7f27548d8cca7066.tar.bz2 |
nvptx: Make 'nvptx_uniform_warp_check' fit for non-full-warp execution, via 'vote.all.pred'
For example, this allows for '-muniform-simt' code to be executed
single-threaded, which currently fails (device-side 'trap'): the '0xffffffff'
bitmask isn't correct if not all 32 threads of a warp are active. I suppose,
but have not verified, that the same issue/fix would apply if we were to allow
for OpenACC 'vector_length' smaller than 32, for example for OpenACC 'serial'.
We use 'nvptx_uniform_warp_check' only for PTX ISA version less than 6.0.
Otherwise we're using 'nvptx_warpsync', which emits 'bar.warp.sync 0xffffffff',
which evidently does the right thing. (I've tested '-muniform-simt'
code executing single-threaded.)
The change that I proposed on 2022-12-15 was to emit PTX code to calculate
'(1 << %ntid.x) - 1' as the actual bitmask to use instead of '0xffffffff'.
This works, but the PTX JIT generates SASS code to do this computation.
In turn, this change now uses PTX 'vote.all.pred' -- which even simplifies
the original code a little bit; see the following exemplary SASS 'diff' before
vs. after this change:
[...]
/*[...]*/ SYNC (*"BRANCH_TARGETS .L_x_332"*) }
.L_x_332:
- /*[...]*/ VOTE.ANY R9, PT, PT ;
+ /*[...]*/ VOTE.ALL P1, PT ;
- /*[...]*/ ISETP.NE.U32.AND P1, PT, R9, -0x1, PT ;
- /*[...]*/ @!P1 BRA `(.L_x_333) ;
+ /*[...]*/ @P1 BRA `(.L_x_333) ;
/*[...]*/ BPT.TRAP 0x1 ;
.L_x_333:
- /*[...]*/ @P1 EXIT ;
+ /*[...]*/ @!P1 EXIT ;
[...]
gcc/
* config/nvptx/nvptx.md (nvptx_uniform_warp_check): Make fit for
non-full-warp execution, via 'vote.all.pred'.
gcc/testsuite/
* gcc.target/nvptx/nvptx.exp
(check_effective_target_default_ptx_isa_version_at_least_6_0):
New.
* gcc.target/nvptx/uniform-simt-2.c: Adjust.
* gcc.target/nvptx/uniform-simt-5.c: New.
-rw-r--r-- | gcc/config/nvptx/nvptx.md | 13 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/nvptx/nvptx.exp | 5 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/nvptx/uniform-simt-2.c | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/nvptx/uniform-simt-5.c | 28 |
4 files changed, 39 insertions, 9 deletions
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index ef7e3fb..7878a3b 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -2316,14 +2316,11 @@ { const char *insns[] = { "{", - "\\t" ".reg.b32" "\\t" "%%r_act;", - "%.\\t" "vote.ballot.b32" "\\t" "%%r_act,1;", - "\\t" ".reg.pred" "\\t" "%%r_do_abort;", - "\\t" "mov.pred" "\\t" "%%r_do_abort,0;", - "%.\\t" "setp.ne.b32" "\\t" "%%r_do_abort,%%r_act," - "0xffffffff;", - "@ %%r_do_abort\\t" "trap;", - "@ %%r_do_abort\\t" "exit;", + "\\t" ".reg.pred" "\\t" "%%r_sync;", + "\\t" "mov.pred" "\\t" "%%r_sync, 1;", + "%.\\t" "vote.all.pred" "\\t" "%%r_sync, 1;", + "@!%%r_sync\\t" "trap;", + "@!%%r_sync\\t" "exit;", "}", NULL }; diff --git a/gcc/testsuite/gcc.target/nvptx/nvptx.exp b/gcc/testsuite/gcc.target/nvptx/nvptx.exp index 97aa7ae..3151381 100644 --- a/gcc/testsuite/gcc.target/nvptx/nvptx.exp +++ b/gcc/testsuite/gcc.target/nvptx/nvptx.exp @@ -49,6 +49,11 @@ proc check_effective_target_default_ptx_isa_version_at_least { major minor } { return $res } +# Return 1 if code by default compiles for at least PTX ISA version 6.0. +proc check_effective_target_default_ptx_isa_version_at_least_6_0 { } { + return [check_effective_target_default_ptx_isa_version_at_least 6 0] +} + # Return 1 if code with PTX ISA version major.minor or higher can be run. 
proc check_effective_target_runtime_ptx_isa_version_at_least { major minor } { set name runtime_ptx_isa_version_${major}_${minor} diff --git a/gcc/testsuite/gcc.target/nvptx/uniform-simt-2.c b/gcc/testsuite/gcc.target/nvptx/uniform-simt-2.c index b1eee0d..1d83c49 100644 --- a/gcc/testsuite/gcc.target/nvptx/uniform-simt-2.c +++ b/gcc/testsuite/gcc.target/nvptx/uniform-simt-2.c @@ -17,4 +17,4 @@ f (void) /* { dg-final { scan-assembler-times "@%r\[0-9\]*\tatom.global.cas" 1 } } */ /* { dg-final { scan-assembler-times "shfl.idx.b32" 1 } } */ -/* { dg-final { scan-assembler-times "vote.ballot.b32" 1 } } */ +/* { dg-final { scan-assembler-times "vote.all.pred" 1 } } */ diff --git a/gcc/testsuite/gcc.target/nvptx/uniform-simt-5.c b/gcc/testsuite/gcc.target/nvptx/uniform-simt-5.c new file mode 100644 index 0000000..cd6ea82 --- /dev/null +++ b/gcc/testsuite/gcc.target/nvptx/uniform-simt-5.c @@ -0,0 +1,28 @@ +/* Verify that '-muniform-simt' code may be executed single-threaded. + + { dg-do run } + { dg-options {-save-temps -O2 -muniform-simt} } */ + +enum memmodel +{ + MEMMODEL_RELAXED = 0 +}; + +unsigned long long int v64; +unsigned long long int *p64 = &v64; + +int +main() +{ + /* Trigger uniform-SIMT processing. */ + __atomic_fetch_add (p64, v64, MEMMODEL_RELAXED); + + return 0; +} + +/* Per 'omp_simt_exit': + - 'nvptx_warpsync' + { dg-final { scan-assembler-times {bar\.warp\.sync\t0xffffffff;} 1 { target default_ptx_isa_version_at_least_6_0 } } } + - 'nvptx_uniform_warp_check' + { dg-final { scan-assembler-times {vote\.all\.pred\t%r_sync, 1;} 1 { target { ! default_ptx_isa_version_at_least_6_0 } } } } +*/ |