diff options
author | Thomas Schwinge <thomas@codesourcery.com> | 2022-12-12 22:05:37 +0100 |
---|---|---|
committer | Kwok Cheung Yeung <kcy@codesourcery.com> | 2023-05-18 16:11:53 +0100 |
commit | 473a3169dee411397c24d7e338e27d7fb15b9f77 (patch) | |
tree | c96b404cb861b48c86eb4dcd95e40dab99ba5b6f /gcc | |
parent | b9c80c5b2c0c2e922f96b54c5ac96ae70113a87c (diff) | |
download | gcc-473a3169dee411397c24d7e338e27d7fb15b9f77.zip gcc-473a3169dee411397c24d7e338e27d7fb15b9f77.tar.gz gcc-473a3169dee411397c24d7e338e27d7fb15b9f77.tar.bz2 |
nvptx: Make 'nvptx_uniform_warp_check' fit for non-full-warp execution
For example, this allows for '-muniform-simt' code to be executed
single-threaded, which currently fails (device-side 'trap'), as the 0xffffffff
mask isn't correct if not all 32 threads of a warp are active. The same
issue/fix, I suppose but have not verified, would apply if we were to allow for
OpenACC 'vector_length' smaller than 32, for example for OpenACC 'serial'.
We use 'nvptx_uniform_warp_check' only for PTX ISA version less than 6.0.
Otherwise we're using 'nvptx_warpsync', which emits 'bar.warp.sync 0xffffffff',
which appears to do the right thing. (I've tested '-muniform-simt'
code executing single-threaded.)
gcc/
* config/nvptx/nvptx.md (nvptx_uniform_warp_check): Make fit for
non-full-warp execution.
gcc/testsuite/
* gcc.target/nvptx/nvptx.exp
(check_effective_target_default_ptx_isa_version_at_least_6_0):
New.
* gcc.target/nvptx/uniform-simt-5.c: New.
libgomp/
* plugin/plugin-nvptx.c (nvptx_exec): Assert what we know about
'blockDimX'.
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/ChangeLog.omp | 5 | ||||
-rw-r--r-- | gcc/config/nvptx/nvptx.md | 16 | ||||
-rw-r--r-- | gcc/testsuite/ChangeLog.omp | 7 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/nvptx/nvptx.exp | 5 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/nvptx/uniform-simt-5.c | 28 |
5 files changed, 60 insertions, 1 deletions
diff --git a/gcc/ChangeLog.omp b/gcc/ChangeLog.omp index fa6d53c..eefe99c 100644 --- a/gcc/ChangeLog.omp +++ b/gcc/ChangeLog.omp @@ -1,3 +1,8 @@ +2023-01-20 Thomas Schwinge <thomas@codesourcery.com> + + * config/nvptx/nvptx.md (nvptx_uniform_warp_check): Make fit for + non-full-warp execution. + 2022-11-01 Marcel Vollweiler <marcel@codesourcery.com> * omp-expand-metadirective.cc (omp_expand_metadirective): Add already diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index c5ea07e..0ecbeb8 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -2321,10 +2321,24 @@ "{", "\\t" ".reg.b32" "\\t" "%%r_act;", "%.\\t" "vote.ballot.b32" "\\t" "%%r_act,1;", + /* For '%r_exp', we essentially need 'activemask.b32', but that is "Introduced in PTX ISA version 6.2", and this code here is used only 'if (!TARGET_PTX_6_0)'. Thus, emulate it. + TODO Is that actually correct? Wouldn't 'activemask.b32' rather replace our 'vote.ballot.b32' given that it registers the *currently active threads*? */ + /* Compute the "membermask" of all threads of the warp that are expected to be converged here. + For OpenACC, '%ntid.x' is 'vector_length', which per 'nvptx_goacc_validate_dims' always is a multiple of 32. + For OpenMP, '%ntid.x' always is 32. + Thus, this is typically 0xffffffff, but is also correct for the case that not all 32 threads of the warp have been launched. + This assumes that lane IDs are assigned in ascending order. */ + //TODO Can we rely on '1 << 32 == 0', and '0 - 1 = 0xffffffff'? 
+ //TODO https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/ + //TODO https://stackoverflow.com/questions/54055195/activemask-vs-ballot-sync + "\\t" ".reg.b32" "\\t" "%%r_exp;", + "%.\\t" "mov.b32" "\\t" "%%r_exp, %%ntid.x;", + "%.\\t" "shl.b32" "\\t" "%%r_exp, 1, %%r_exp;", + "%.\\t" "sub.u32" "\\t" "%%r_exp, %%r_exp, 1;", "\\t" ".reg.pred" "\\t" "%%r_do_abort;", "\\t" "mov.pred" "\\t" "%%r_do_abort,0;", "%.\\t" "setp.ne.b32" "\\t" "%%r_do_abort,%%r_act," - "0xffffffff;", + "%%r_exp;", "@ %%r_do_abort\\t" "trap;", "@ %%r_do_abort\\t" "exit;", "}", diff --git a/gcc/testsuite/ChangeLog.omp b/gcc/testsuite/ChangeLog.omp index 11977f4..71fa411 100644 --- a/gcc/testsuite/ChangeLog.omp +++ b/gcc/testsuite/ChangeLog.omp @@ -1,3 +1,10 @@ +2023-01-20 Thomas Schwinge <thomas@codesourcery.com> + + * gcc.target/nvptx/nvptx.exp + (check_effective_target_default_ptx_isa_version_at_least_6_0): + New. + * gcc.target/nvptx/uniform-simt-5.c: New. + 2022-11-01 Marcel Vollweiler <marcel@codesourcery.com> * c-c++-common/gomp/metadirective-8.c: New test. diff --git a/gcc/testsuite/gcc.target/nvptx/nvptx.exp b/gcc/testsuite/gcc.target/nvptx/nvptx.exp index 4b4bdfe..25e09e9 100644 --- a/gcc/testsuite/gcc.target/nvptx/nvptx.exp +++ b/gcc/testsuite/gcc.target/nvptx/nvptx.exp @@ -49,6 +49,11 @@ proc check_effective_target_default_ptx_isa_version_at_least { major minor } { return $res } +# Return 1 if code by default compiles for at least PTX ISA version 6.0. +proc check_effective_target_default_ptx_isa_version_at_least_6_0 { } { + return [check_effective_target_default_ptx_isa_version_at_least 6 0] +} + # Return 1 if code with PTX ISA version major.minor or higher can be run. 
proc check_effective_target_runtime_ptx_isa_version_at_least { major minor } { set name runtime_ptx_isa_version_${major}_${minor} diff --git a/gcc/testsuite/gcc.target/nvptx/uniform-simt-5.c b/gcc/testsuite/gcc.target/nvptx/uniform-simt-5.c new file mode 100644 index 0000000..b2f7819 --- /dev/null +++ b/gcc/testsuite/gcc.target/nvptx/uniform-simt-5.c @@ -0,0 +1,28 @@ +/* Verify that '-muniform-simt' code may be executed single-threaded. + + { dg-do run } + { dg-options {-save-temps -O2 -muniform-simt} } */ + +enum memmodel +{ + MEMMODEL_RELAXED = 0 +}; + +unsigned long long int v64; +unsigned long long int *p64 = &v64; + +int +main() +{ + /* Trigger uniform-SIMT processing. */ + __atomic_fetch_add (p64, v64, MEMMODEL_RELAXED); + + return 0; +} + +/* Per 'omp_simt_exit': + - 'nvptx_warpsync' + { dg-final { scan-assembler-times {bar\.warp\.sync\t0xffffffff;} 1 { target default_ptx_isa_version_at_least_6_0 } } } + - 'nvptx_uniform_warp_check' + { dg-final { scan-assembler-times {vote\.ballot\.b32\t%r_act,1;} 1 { target { ! default_ptx_isa_version_at_least_6_0 } } } } +*/ |