// RUN: %libomptarget-compilexx-run-and-check-generic // // REQUIRES: gpu #include #include #include #include #include #include #include #pragma omp begin declare variant match(device = {arch(amdgcn)}) unsigned get_warp_size() { return __builtin_amdgcn_wavefrontsize(); } #pragma omp end declare variant #pragma omp begin declare variant match(device = {arch(nvptx64)}) unsigned get_warp_size() { return __nvvm_read_ptx_sreg_warpsize(); } #pragma omp end declare variant #pragma omp begin declare variant match(device = {kind(cpu)}) unsigned get_warp_size() { return 1; } #pragma omp end declare variant template ::value, bool> = true> bool equal(T LHS, T RHS) { return LHS == RHS; } template ::value, bool> = true> bool equal(T LHS, T RHS) { return __builtin_fabs(LHS - RHS) < std::numeric_limits::epsilon(); } template void test() { constexpr const int num_blocks = 1; constexpr const int block_size = 256; constexpr const int N = num_blocks * block_size; int *res = new int[N]; #pragma omp target teams ompx_bare num_teams(num_blocks) thread_limit(block_size) \ map(from: res[0:N]) { int tid = ompx_thread_id_x(); T val = ompx::shfl_down_sync(~0U, static_cast(tid), 1); int warp_size = get_warp_size(); if ((tid & (warp_size - 1)) != warp_size - 1) res[tid] = equal(val, static_cast(tid + 1)); else res[tid] = equal(val, static_cast(tid)); } for (int i = 0; i < N; ++i) assert(res[i]); delete[] res; } int main(int argc, char *argv[]) { test(); test(); test(); test(); // CHECK: PASS printf("PASS\n"); return 0; }