//===---------------- Implementation of GPU utils ---------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_UTILS_H
#define LLVM_LIBC_SRC___SUPPORT_GPU_UTILS_H

#include "src/__support/macros/attributes.h"
#include "src/__support/macros/config.h"
#include "src/__support/macros/properties/architectures.h"

#if !__has_include(<gpuintrin.h>)
#error "Unsupported compiler"
#endif

#include <gpuintrin.h>

namespace LIBC_NAMESPACE_DECL {
namespace gpu {
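
/// Aliases that qualify a type with the GPU's private, constant, local
/// (shared), or global address space.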
template <typename T> using Private = __gpu_private T;
template <typename T> using Constant = __gpu_constant T;
template <typename T> using Local = __gpu_local T;
template <typename T> using Global = __gpu_global T;
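
/// Returns the number of blocks (workgroups) in the given grid dimension, and
/// the total number of blocks in the grid.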
LIBC_INLINE uint32_t get_num_blocks_x() { return __gpu_num_blocks(0); }
LIBC_INLINE uint32_t get_num_blocks_y() { return __gpu_num_blocks(1); }
LIBC_INLINE uint32_t get_num_blocks_z() { return __gpu_num_blocks(2); }
LIBC_INLINE uint64_t get_num_blocks() {
return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z();
}
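
/// Returns the id of the block (workgroup) in the given grid dimension, and a
/// flattened id for the block within the grid.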
LIBC_INLINE uint32_t get_block_id_x() { return __gpu_block_id(0); }
LIBC_INLINE uint32_t get_block_id_y() { return __gpu_block_id(1); }
LIBC_INLINE uint32_t get_block_id_z() { return __gpu_block_id(2); }
LIBC_INLINE uint64_t get_block_id() {
return get_block_id_x() + get_num_blocks_x() * get_block_id_y() +
get_num_blocks_x() * get_num_blocks_y() * get_block_id_z();
}
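
/// Returns the number of threads (workitems) in the given block dimension, and
/// the total number of threads in the block.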
LIBC_INLINE uint32_t get_num_threads_x() { return __gpu_num_threads(0); }
LIBC_INLINE uint32_t get_num_threads_y() { return __gpu_num_threads(1); }
LIBC_INLINE uint32_t get_num_threads_z() { return __gpu_num_threads(2); }
LIBC_INLINE uint64_t get_num_threads() {
return get_num_threads_x() * get_num_threads_y() * get_num_threads_z();
}
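
/// Returns the id of the thread (workitem) in the given block dimension, and a
/// flattened id for the thread within the block.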
LIBC_INLINE uint32_t get_thread_id_x() { return __gpu_thread_id(0); }
LIBC_INLINE uint32_t get_thread_id_y() { return __gpu_thread_id(1); }
LIBC_INLINE uint32_t get_thread_id_z() { return __gpu_thread_id(2); }
LIBC_INLINE uint64_t get_thread_id() {
return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() +
get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
}
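
/// Returns the number of lanes in the warp or wavefront, the caller's lane id,
/// and the bitmask of currently active lanes.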
LIBC_INLINE uint32_t get_lane_size() { return __gpu_num_lanes(); }
LIBC_INLINE uint32_t get_lane_id() { return __gpu_lane_id(); }
LIBC_INLINE uint64_t get_lane_mask() { return __gpu_lane_mask(); }
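
/// Copies the value of 'x' from the first active lane in 'lane_mask' to every
/// other lane.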
LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask, uint32_t x) {
return __gpu_read_first_lane_u32(lane_mask, x);
}
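
/// Returns a bitmask of the lanes in 'lane_mask' for which 'x' is true.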
LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
return __gpu_ballot(lane_mask, x);
}
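
/// Synchronizes every thread in the block, or only the lanes named by
/// 'lane_mask'.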
LIBC_INLINE void sync_threads() { __gpu_sync_threads(); }
LIBC_INLINE void sync_lane(uint64_t lane_mask) { __gpu_sync_lane(lane_mask); }
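
/// Returns the value of 'x' held by the lane at index 'idx', read within
/// groups of 'width' lanes.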
LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x,
uint32_t width = __gpu_num_lanes()) {
return __gpu_shuffle_idx_u32(lane_mask, idx, x, width);
}
LIBC_INLINE uint64_t shuffle(uint64_t lane_mask, uint32_t idx, uint64_t x,
uint32_t width = __gpu_num_lanes()) {
return __gpu_shuffle_idx_u64(lane_mask, idx, x, width);
}
template <typename T>
LIBC_INLINE T *shuffle(uint64_t lane_mask, uint32_t idx, T *x,
uint32_t width = __gpu_num_lanes()) {
return reinterpret_cast<T *>(__gpu_shuffle_idx_u64(
lane_mask, idx, reinterpret_cast<uintptr_t>(x), width));
}
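
/// Returns a bitmask of the lanes whose value of 'x' matches the caller's
/// (match_any), or the full mask if every lane holds the same value and zero
/// otherwise (match_all).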
LIBC_INLINE uint64_t match_any(uint64_t lane_mask, uint32_t x) {
return __gpu_match_any_u32(lane_mask, x);
}
LIBC_INLINE uint64_t match_all(uint64_t lane_mask, uint32_t x) {
return __gpu_match_all_u32(lane_mask, x);
}
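
/// Terminates execution of the GPU program.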
[[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); }
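
/// Returns true if the caller is the first active lane in 'lane_mask'.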
LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
return __gpu_is_first_in_lane(lane_mask);
}
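
/// Returns the sum of 'x' across the active lanes in 'lane_mask'.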
LIBC_INLINE uint32_t reduce(uint64_t lane_mask, uint32_t x) {
return __gpu_lane_sum_u32(lane_mask, x);
}
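
/// Returns a prefix (scan) sum of 'x' across the active lanes in 'lane_mask'.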
LIBC_INLINE uint32_t scan(uint64_t lane_mask, uint32_t x) {
return __gpu_lane_scan_u32(lane_mask, x);
}
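
/// Returns a timestamp from a fixed-frequency clock, suitable for measuring
/// elapsed time.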
LIBC_INLINE uint64_t fixed_frequency_clock() {
return __builtin_readsteadycounter();
}
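
/// Returns the processor's cycle counter.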
LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
} // namespace gpu
} // namespace LIBC_NAMESPACE_DECL
#endif // LLVM_LIBC_SRC___SUPPORT_GPU_UTILS_H