diff options
author | Alexey Bataev <a.bataev@hotmail.com> | 2018-12-10 14:29:05 +0000 |
---|---|---|
committer | Alexey Bataev <a.bataev@hotmail.com> | 2018-12-10 14:29:05 +0000 |
commit | cc6cf64c38f49ad203aa2252c4209e21103f06f6 (patch) | |
tree | a01828afc2266ffa6cd3202bba8d456828e8ebed /openmp | |
parent | 4d62f6c3142b4b93e69e39a8238d1189744669b9 (diff) | |
download | llvm-cc6cf64c38f49ad203aa2252c4209e21103f06f6.zip llvm-cc6cf64c38f49ad203aa2252c4209e21103f06f6.tar.gz llvm-cc6cf64c38f49ad203aa2252c4209e21103f06f6.tar.bz2 |
[OPENMP][NVPTX] Enable fast shuffles on 64-bit values only if CUDA >= 9.
Summary:
Shuffles on 64-bit data are supported only with CUDA >= 9.0. Also fixed the
mask constant, which needed one extra `L` at the end (`LL` suffix instead of `L`).
Reviewers: gtbercea, kkwli0
Subscribers: guansong, caomhin, openmp-commits
Differential Revision: https://reviews.llvm.org/D55440
llvm-svn: 348758
Diffstat (limited to 'openmp')
-rw-r--r-- | openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu | 12 |
1 files changed, 11 insertions, 1 deletions
```diff
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
index a05a6e0..2546302 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -76,7 +76,17 @@ EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
 }
 
 EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
-  return __SHFL_DOWN_SYNC(0xFFFFFFFFFFFFFFFFL, val, delta, size);
+#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
+  return __SHFL_DOWN_SYNC(0xFFFFFFFFFFFFFFFFLL, (long long)val, (unsigned)delta,
+                          (int)size);
+#else
+  int lo, hi;
+  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
+  hi = __SHFL_DOWN_SYNC(0xFFFFFFFF, hi, delta, size);
+  lo = __SHFL_DOWN_SYNC(0xFFFFFFFF, lo, delta, size);
+  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
+  return val;
+#endif
 }
 
 static INLINE void gpu_regular_warp_reduce(void *reduce_data,
```