aboutsummaryrefslogtreecommitdiff
path: root/openmp/runtime/src/kmp_collapse.h
blob: e4870185645de0729a83cc94297f133285072f6c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
/*
 * kmp_collapse.h -- header for loop collapse feature
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_COLLAPSE_H
#define KMP_COLLAPSE_H

#include <type_traits>

// Type of the index into the loop nest structures
// (with values from 0 to less than n from collapse(n)).
typedef kmp_int32 kmp_index_t;

// Type for the combined (collapsed) loop nest space IV (induction variable):
typedef kmp_uint64 kmp_loop_nest_iv_t;

// Kind of comparison the loop condition uses (<, <=, etc.).
// After canonicalization (see the APIs below) only <= and >= remain.
enum comparison_t : kmp_int32 {
  comp_less_or_eq = 0,    // iv <= ub (canonical)
  comp_greater_or_eq = 1, // iv >= ub (canonical)
  comp_not_eq = 2,        // iv != ub (removed by canonicalization)
  comp_less = 3,          // iv < ub  (removed by canonicalization)
  comp_greater = 4        // iv > ub  (removed by canonicalization)
};

// Type of the loop IV.
// The types of bounds and step, after the usual integer promotions,
// are a subset of these types (32- and 64-bit only):
enum loop_type_t : kmp_int32 {
  loop_type_uint8 = 0,
  loop_type_int8 = 1,
  loop_type_uint16 = 2,
  loop_type_int16 = 3,
  loop_type_uint32 = 4,
  loop_type_int32 = 5,
  loop_type_uint64 = 6,
  loop_type_int64 = 7
};

/*!
 @ingroup WORK_SHARING
 * Describes the structure for rectangular nested loops.
 * One instance per loop in the nest, templated on the loop bounds type T.
 * Kept the same size as the type-erased interface struct bounds_info_t below.
 */
template <typename T> struct bounds_infoXX_template {

  // typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;

  loop_type_t loop_type; // The differentiator
  loop_type_t loop_iv_type;
  comparison_t comparison;
  // outer_iv should be 0 (or any other value less than the number of
  // dimensions) if the loop doesn't depend on it (lb1 and ub1 will be 0).
  // This way we can do multiplication without a check.
  kmp_index_t outer_iv;

  // unions to keep the size constant:

  // Loop-invariant part of the lower bound:
  union {
    T lb0;
    kmp_uint64 lb0_u64; // real type can be signed
  };

  // Factor multiplied by the outer loop's IV when computing the lower bound
  // (0 for rectangular loops, i.e. when the loop doesn't depend on outer_iv):
  union {
    T lb1;
    kmp_uint64 lb1_u64; // real type can be signed
  };

  // Loop-invariant part of the upper bound:
  union {
    T ub0;
    kmp_uint64 ub0_u64; // real type can be signed
  };

  // Factor multiplied by the outer loop's IV when computing the upper bound
  // (0 for rectangular loops):
  union {
    T ub1;
    kmp_uint64 ub1_u64; // real type can be signed
  };

  union {
    ST step; // signed even if bounds type is unsigned
    kmp_int64 step_64; // signed
  };

  // Number of iterations of this loop (filled in by the processing APIs):
  kmp_loop_nest_iv_t trip_count;
};

/*!
 @ingroup WORK_SHARING
 * Interface struct for rectangular nested loops.
 * Type-erased counterpart of bounds_infoXX_template: bounds are stored as
 * kmp_uint64 regardless of the real loop type, which is recorded in
 * loop_type. Same size as bounds_infoXX_template.
 */
struct bounds_info_t {

  loop_type_t loop_type; // The differentiator
  loop_type_t loop_iv_type;
  comparison_t comparison;
  // outer_iv should be 0 (or any other value less than the number of
  // dimensions) if the loop doesn't depend on it (lb1 and ub1 will be 0).
  // This way we can do multiplication without a check.
  kmp_index_t outer_iv;

  kmp_uint64 lb0_u64; // real type can be signed
  kmp_uint64 lb1_u64; // real type can be signed
  kmp_uint64 ub0_u64; // real type can be signed
  kmp_uint64 ub1_u64; // real type can be signed
  kmp_int64 step_64; // signed

  // This is internal, but it's the only internal thing we need
  // in the rectangular case, so let's expose it here:
  kmp_loop_nest_iv_t trip_count;
};

//-------------------------------------------------------------------------
// Additional types for internal representation:

// Array for a point in the loop space, in the original space.
// It's represented in kmp_uint64, but each dimension is calculated in
// that loop's IV type. Also dimensions have to be converted to those types
// when used in generated code.
typedef kmp_uint64* kmp_point_t;

// Array: number of loop iterations on each nesting level needed to reach
// some point, in expanded space or in original space.
// OMPTODO: move from using iterations to using offsets (iterations multiplied
// by steps). For those we need to be careful with the types, as step can be
// negative, but it'll remove multiplications and divisions in several places.
typedef kmp_loop_nest_iv_t* kmp_iterations_t;

// Internal per-loop struct with additional info, templated on bounds type T:
template <typename T> struct bounds_info_internalXX_template {

  // OMPTODO: should span have type T or should it better be
  // kmp_uint64/kmp_int64 depending on T sign? (if kmp_uint64/kmp_int64 then
  // updated bounds should probably also be kmp_uint64/kmp_int64). I'd like to
  // use big_span_t, if it can be resolved at compile time.
  typedef
      typename std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
          big_span_t;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  bounds_infoXX_template<T> b; // possibly adjusted bounds

  // Leaving this as a union in case we'll switch to span_t with different sizes
  // (depending on T)
  union {
    // Smallest possible value of iv (may be smaller than actually possible)
    span_t span_smallest;
    kmp_uint64 span_smallest_u64;
  };

  // Leaving this as a union in case we'll switch to span_t with different sizes
  // (depending on T)
  union {
    // Biggest possible value of iv (may be bigger than actually possible)
    span_t span_biggest;
    kmp_uint64 span_biggest_u64;
  };

  // Did we adjust loop bounds (not counting canonicalization)?
  bool loop_bounds_adjusted;
};

// Internal per-loop struct with additional info
// (type-erased counterpart of bounds_info_internalXX_template):
struct bounds_info_internal_t {

  bounds_info_t b; // possibly adjusted bounds

  // Smallest possible value of iv (may be smaller than actually possible)
  kmp_uint64 span_smallest_u64;

  // Biggest possible value of iv (may be bigger than actually possible)
  kmp_uint64 span_biggest_u64;

  // Did we adjust loop bounds (not counting canonicalization)?
  bool loop_bounds_adjusted;
};

//----------APIs for rectangular loop nests--------------------------------

// Canonicalize the loop nest and calculate the overall trip count.
// "bounds_nest" has to be allocated per thread.
// The API will modify the original bounds_nest array to bring it to a
// canonical form (only <= and >=, no !=, <, >). If the original loop nest was
// already in a canonical form there will be no changes to the bounds in the
// bounds_nest array (only trip counts will be calculated).
// Returns the trip count of the overall (collapsed) iteration space.
extern "C" kmp_loop_nest_iv_t
__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
                                 /*in/out*/ bounds_info_t *original_bounds_nest,
                                 kmp_index_t n);

// Calculate the original induction variables corresponding to the overall
// (collapsed) new_iv.
// Note: original IVs will be returned as if they had kmp_uint64 type; they
// will have to be converted to the original types in user code.
// Note: trip counts should already have been calculated by
// __kmpc_process_loop_nest_rectang.
// OMPTODO: special-case 2 and 3 nested loops - if it'll be possible to inline
// that into user code.
extern "C" void
__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
                                 const bounds_info_t *original_bounds_nest,
                                 /*out*/ kmp_uint64 *original_ivs,
                                 kmp_index_t n);

//----------Init API for non-rectangular loops--------------------------------

// Init API for collapsed loops (static schedule, no chunks defined).
// "bounds_nest" has to be allocated per thread.
// The API will modify the original bounds_nest array to bring it to a
// canonical form (only <= and >=, no !=, <, >). If the original loop nest was
// already in a canonical form there will be no changes to the bounds in the
// bounds_nest array (only trip counts will be calculated). Internally the API
// will expand the space to a parallelogram/parallelepiped, calculate the
// total, calculate bounds for the chunks in terms of the new IV, re-calculate
// them in terms of the old IVs (especially important on the left side, to hit
// the lower bounds and not step over), and pick the correct chunk for this
// thread (so it will calculate chunks up to the needed one). It could be
// optimized to calculate just this chunk, potentially a bit less well
// distributed among threads. It is designed to make sure that threads will
// receive predictable chunks, deterministically (so that the next nest of
// loops with similar characteristics will get exactly the same chunks on the
// same threads).
// Current contract: chunk_bounds_nest has only lb0 and ub0,
// lb1 and ub1 are set to 0 and can be ignored. (This may change in the future).
// NOTE(review): the meaning of the kmp_int32 return value (presumably whether
// this thread received a chunk to execute) is not documented here - confirm
// against the implementation before relying on it.
extern "C" kmp_int32
__kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
                          /*in/out*/ bounds_info_t *original_bounds_nest,
                          /*out*/ bounds_info_t *chunk_bounds_nest,
                          kmp_index_t n,
                          /*out*/ kmp_int32 *plastiter);

#endif // KMP_COLLAPSE_H