aboutsummaryrefslogtreecommitdiff
path: root/openmp/libomptarget/DeviceRTL/src/Utils.cpp
blob: e4920ab90450c1b8bc890132deff6c90206ed3d7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
//===------- Utils.cpp - OpenMP device runtime utility functions -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
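//
// Device-side implementations of the pack/unpack, warp shuffle, and
// shared-memory pointer query utilities, with AMDGCN and NVPTX variants.
//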
//===----------------------------------------------------------------------===//

#include "Utils.h"

#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"

#pragma omp begin declare target device_type(nohost)

using namespace _OMP;

/// Weak, C-linkage declaration of an integer flag that is defined outside this
/// file. Within this file it is only read by __keep_alive(), which forwards it
/// to the simple barrier entry points.
// NOTE(review): presumably non-zero when the kernel executes in SPMD mode --
// confirm against the definition.
extern "C" __attribute__((weak)) int IsSPMDMode;

/// Helper to keep code alive without introducing a performance penalty.
///
/// The body exists only to reference a handful of runtime entry points; the
/// values they produce are never consumed. The attribute list marks the
/// function as weak, unoptimized (`optnone`), `cold`, and forces it to be
/// emitted and kept (`used`, `retain`).
extern "C" __attribute__((weak, optnone, cold, used, retain)) void
__keep_alive() {
  // Query-style entry points: results are deliberately discarded.
  (void)__kmpc_get_hardware_thread_id_in_block();
  (void)__kmpc_get_hardware_num_threads_in_block();
  (void)__kmpc_get_warp_size();

  // Both simple barrier flavors, fed with the externally defined mode flag.
  __kmpc_barrier_simple_spmd(nullptr, IsSPMDMode);
  __kmpc_barrier_simple_generic(nullptr, IsSPMDMode);
}

namespace impl {

/// Base definition of the shared-memory query: it ignores \p Ptr and always
/// answers `false`. The AMDGCN and NVPTX variant regions further down provide
/// target-specific definitions of the same function.
bool isSharedMemPtr(const void *Ptr) {
  (void)Ptr; // The base version never inspects the pointer.
  return false;
}
/// Split the 64-bit \p Val into two 32-bit values that are stored through
/// \p LowBits and \p HighBits. Only declared here; the definitions live in the
/// AMDGCN and NVPTX variant regions that follow.
void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits);
/// Combine \p LowBits and \p HighBits into a single 64-bit value. Only declared
/// here; the definitions live in the AMDGCN and NVPTX variant regions that
/// follow.
uint64_t Pack(uint32_t LowBits, uint32_t HighBits);

/// AMDGCN Implementation
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})

/// AMDGCN: split \p Val into its lower and upper 32 bits.
///
/// \param Val      64-bit value to split.
/// \param LowBits  Receives bits [0, 32) of \p Val.
/// \param HighBits Receives bits [32, 64) of \p Val.
void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
  // The `UL`-suffixed mask below is only correct if `unsigned long` is 64 bits.
  static_assert(sizeof(unsigned long) == 8, "");
  constexpr unsigned long LowHalfMask = 0x00000000FFFFFFFFUL;

  const uint64_t LowerHalf = Val & LowHalfMask;
  const uint64_t UpperHalf = Val >> 32;

  // Store the low half first, then the high half.
  *LowBits = (uint32_t)LowerHalf;
  *HighBits = (uint32_t)UpperHalf;
}

/// AMDGCN: build a 64-bit value whose upper 32 bits are \p HighBits and whose
/// lower 32 bits are \p LowBits.
uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
  uint64_t Packed = HighBits; // Widen before shifting so no bits are lost.
  Packed <<= 32;
  Packed |= LowBits;
  return Packed;
}

#pragma omp end declare variant
///}

/// NVPTX Implementation
///
///{
#pragma omp begin declare variant match(                                       \
    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})

/// NVPTX: split \p Val into two 32-bit values with a single PTX `mov.b64` whose
/// destination is a two-element vector. The first vector element is stored to
/// \p LowBits and the second to \p HighBits.
void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
  // The asm statement writes into locals; the results are copied out through
  // the caller's pointers afterwards.
  uint32_t LowBitsLocal, HighBitsLocal;
  // %0/%1 are 32-bit register outputs ("=r"), %2 is the 64-bit register input
  // ("l").
  asm("mov.b64 {%0,%1}, %2;"
      : "=r"(LowBitsLocal), "=r"(HighBitsLocal)
      : "l"(Val));
  *LowBits = LowBitsLocal;
  *HighBits = HighBitsLocal;
}

/// NVPTX: combine \p LowBits and \p HighBits into one 64-bit value with a
/// single PTX `mov.b64` whose source is the two-element vector
/// {LowBits, HighBits}.
uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
  uint64_t Val;
  // %0 is the 64-bit register output ("=l"); %1/%2 are 32-bit register inputs
  // ("r").
  asm("mov.b64 %0, {%1,%2};" : "=l"(Val) : "r"(LowBits), "r"(HighBits));
  return Val;
}

#pragma omp end declare variant
///}

/// Warp shuffle of \p Var addressed by \p SrcLane. Only declared here; the
/// definitions live in the AMDGCN and NVPTX variant regions that follow. The
/// NVPTX definition forwards \p Mask to the underlying builtin, while the
/// AMDGCN definition does not use it.
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
/// Warp shuffle of \p Var addressed by an offset \p LaneDelta relative to the
/// calling lane, parameterized by \p Width. Only declared here; the definitions
/// live in the AMDGCN and NVPTX variant regions that follow. As with shuffle(),
/// only the NVPTX definition uses \p Mask.
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
                    int32_t Width);

/// AMDGCN Implementation
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})

/// AMDGCN: shuffle \p Var through `ds_bpermute`, reading from the lane at
/// offset \p SrcLane within the calling lane's warp-size-aligned group.
/// \p Mask is not used by this implementation.
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
  const int WarpSize = mapping::getWarpSize();
  const int LaneId = mapping::getThreadIdInWarp();

  // Clear the low bits of the lane id to get the first lane of the group
  // (this relies on the warp size being a power of two).
  const int GroupBase = LaneId & ~(WarpSize - 1);
  const int SourceLane = SrcLane + GroupBase;

  // The lane index is scaled by 4 before being handed to the builtin.
  // NOTE(review): presumably because ds_bpermute addresses lanes in bytes --
  // confirm against the AMDGPU documentation.
  return __builtin_amdgcn_ds_bpermute(SourceLane << 2, Var);
}

/// AMDGCN: shuffle \p Var through `ds_bpermute`, reading from the lane
/// \p LaneDelta positions above the caller. If that would step past the end of
/// the caller's \p Width-sized group, the caller reads its own lane instead.
/// \p Mask is not used by this implementation.
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
                    int32_t Width) {
  const int LaneId = mapping::getThreadIdInWarp();

  // Position of this lane inside its group (relies on Width being a power of
  // two).
  const int PositionInGroup = LaneId & (Width - 1);
  const bool LeavesGroup = (int)(LaneDelta + PositionInGroup) >= Width;

  const int SourceLane = LeavesGroup ? LaneId : (int)(LaneId + LaneDelta);

  // The lane index is scaled by 4 before being handed to the builtin.
  return __builtin_amdgcn_ds_bpermute(SourceLane << 2, Var);
}

/// AMDGCN: answer the shared-memory query by casting \p Ptr to a pointer in
/// address space 0 and passing it to `__builtin_amdgcn_is_shared`.
bool isSharedMemPtr(const void *Ptr) {
  using AddrSpace0Ptr = const __attribute__((address_space(0))) void *;
  return __builtin_amdgcn_is_shared((AddrSpace0Ptr)Ptr);
}
#pragma omp end declare variant
///}

/// NVPTX Implementation
///
///{
#pragma omp begin declare variant match(                                       \
    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})

/// NVPTX: forward to `__nvvm_shfl_sync_idx_i32` with \p Mask, \p Var and
/// \p SrcLane, using 0x1f as the fixed last operand.
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
  // NOTE(review): 0x1f presumably selects the whole 32-lane warp -- confirm
  // against the PTX `shfl.sync` documentation.
  constexpr int LastOperand = 0x1f;
  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, LastOperand);
}

/// NVPTX: forward to `__nvvm_shfl_sync_down_i32`. The last operand is built
/// from the difference between the warp size and \p Width, shifted left by 8
/// bits, with 0x1f in the low bits.
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
  // `auto` keeps whatever arithmetic type the original expression produced.
  const auto WidthField = (mapping::getWarpSize() - Width) << 8;
  const int32_t LastOperand = WidthField | 0x1f;
  return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, LastOperand);
}

/// NVPTX: answer the shared-memory query with `__nvvm_isspacep_shared`.
bool isSharedMemPtr(const void *Ptr) {
  const bool InShared = __nvvm_isspacep_shared(Ptr);
  return InShared;
}

#pragma omp end declare variant
///}
} // namespace impl

/// Combine \p LowBits and \p HighBits into one 64-bit value by delegating to
/// the target-selected impl::Pack.
uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) {
  const uint64_t Packed = impl::Pack(LowBits, HighBits);
  return Packed;
}

/// Split \p Val into two 32-bit values written to \p LowBits and \p HighBits.
/// Adapts the reference-based public signature to the pointer-based
/// impl::Unpack.
void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) {
  uint32_t *const LowOut = &LowBits;
  uint32_t *const HighOut = &HighBits;
  impl::Unpack(Val, LowOut, HighOut);
}

/// Public wrapper around the target-selected impl::shuffle; all arguments are
/// passed through unchanged.
int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
  const int32_t Shuffled = impl::shuffle(Mask, Var, SrcLane);
  return Shuffled;
}

/// Public wrapper around the target-selected impl::shuffleDown; all arguments
/// are passed through unchanged.
int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
                           int32_t Width) {
  const int32_t Shuffled = impl::shuffleDown(Mask, Var, Delta, Width);
  return Shuffled;
}

/// Public wrapper around the target-selected impl::isSharedMemPtr.
bool utils::isSharedMemPtr(void *Ptr) {
  // The implementation takes a pointer-to-const; make the conversion explicit.
  const void *ConstPtr = Ptr;
  return impl::isSharedMemPtr(ConstPtr);
}

extern "C" {
/// C entry point for a 32-bit shuffle-down across all lanes (`lanes::All`).
///
/// Note that the third argument, although named \p SrcLane, is forwarded to
/// impl::shuffleDown in the position of its width parameter.
int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {
  FunctionTracingRAII();
  const int32_t Shuffled = impl::shuffleDown(lanes::All, Val, Delta, SrcLane);
  return Shuffled;
}

/// C entry point for a 64-bit shuffle-down across all lanes (`lanes::All`).
///
/// The value is split into two 32-bit halves, each half is shuffled with its
/// own 32-bit shuffle-down, and the results are reassembled.
int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
  FunctionTracingRAII();

  uint32_t LowHalf, HighHalf;
  utils::unpack(Val, LowHalf, HighHalf);

  // Shuffle the high half first, then the low half.
  HighHalf = impl::shuffleDown(lanes::All, HighHalf, Delta, Width);
  LowHalf = impl::shuffleDown(lanes::All, LowHalf, Delta, Width);

  return utils::pack(LowHalf, HighHalf);
}
}

#pragma omp end declare target