/*
 * kmp_collapse.cpp -- loop collapse feature
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_collapse.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

// OMPTODO: different style of comments (see kmp_sched)
// OMPTODO: OMPT/OMPD

// Avoid inadvertently using a library-based abs.
template <typename T> T __kmp_abs(const T val) {
  return (val < 0) ? -val : val;
}
kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }
kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }

//----------------------------------------------------------------------------
// Common functions for working with rectangular and non-rectangular loops
//----------------------------------------------------------------------------

template <typename T> int __kmp_sign(T val) {
  return (T(0) < val) - (val < T(0));
}

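// Helper used below to hold small temporary arrays (iterations, IV points,
// updated bounds) on a per-thread basis. It uses an on-stack buffer of
// allocaSize bytes when the requested number of elements fits and falls back
// to __kmp_allocate / __kmp_free otherwise.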
template <typename T> class CollapseAllocator {
  typedef T *pT;

private:
  static const size_t allocaSize = 32; // size limit for stack allocations
                                       // (8 bytes x 4 nested loops)
  char stackAlloc[allocaSize];
  static constexpr size_t maxElemCount = allocaSize / sizeof(T);
  pT pTAlloc;

public:
  CollapseAllocator(size_t n) : pTAlloc(reinterpret_cast<pT>(stackAlloc)) {
    if (n > maxElemCount) {
      pTAlloc = reinterpret_cast<pT>(__kmp_allocate(n * sizeof(T)));
    }
  }
  ~CollapseAllocator() {
    if (pTAlloc != reinterpret_cast<pT>(stackAlloc)) {
      __kmp_free(pTAlloc);
    }
  }
  T &operator[](int index) { return pTAlloc[index]; }
  operator const pT() { return pTAlloc; }
};

//----------Loop canonicalization---------------------------------------------

// For a loop nest of any shape:
// convert != to < or >, then
// switch from using < or > to <= or >=.
// The "bounds" array has to be allocated per thread.
// All other internal functions work only with canonicalized loops.
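// Example: "for (i = 0; i < 10; i += 3)" arrives here as lb0=0, ub0=10,
// step=3 with comparison "<" and leaves as ub0=9 with comparison "<=";
// a "!=" comparison is first converted to "<" or ">" based on the sign of
// the step.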
template <typename T>
void kmp_canonicalize_one_loop_XX(
    ident_t *loc,
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (__kmp_env_consistency_check) {
    if (bounds->step == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
  }

  if (bounds->comparison == comparison_t::comp_not_eq) {
    // We can convert this to < or >, depending on the sign of the step:
    if (bounds->step > 0) {
      bounds->comparison = comparison_t::comp_less;
    } else {
      bounds->comparison = comparison_t::comp_greater;
    }
  }

  if (bounds->comparison == comparison_t::comp_less) {
    // Note: ub0 can be unsigned. It should be OK to hit overflow here,
    // because ub0 + ub1*j should still be positive (otherwise the loop was
    // not well formed).
    bounds->ub0 -= 1;
    bounds->comparison = comparison_t::comp_less_or_eq;
  } else if (bounds->comparison == comparison_t::comp_greater) {
    bounds->ub0 += 1;
    bounds->comparison = comparison_t::comp_greater_or_eq;
  }
}

// Canonicalize loop nest. original_bounds_nest is an array of length n.
void kmp_canonicalize_loop_nest(ident_t *loc,
                                /*in/out*/ bounds_info_t *original_bounds_nest,
                                kmp_index_t n) {

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    switch (bounds->loop_type) {
    case loop_type_t::loop_type_int32:
      kmp_canonicalize_one_loop_XX<kmp_int32>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
      break;
    case loop_type_t::loop_type_uint32:
      kmp_canonicalize_one_loop_XX<kmp_uint32>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
      break;
    case loop_type_t::loop_type_int64:
      kmp_canonicalize_one_loop_XX<kmp_int64>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
      break;
    case loop_type_t::loop_type_uint64:
      kmp_canonicalize_one_loop_XX<kmp_uint64>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
      break;
    default:
      KMP_ASSERT(false);
    }
  }
}

//----------Calculating trip count on one level-------------------------------

// Calculate trip count on this loop level.
// We do this either for a rectangular loop nest,
// or after an adjustment bringing the loops to a parallelepiped shape.
// This number should not depend on the value of the outer IV
// even if the formula has lb1 and ub1.
// Note: for non-rectangular loops don't use the span for this, it's too big.
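// Example: a canonicalized loop with lb0=0, ub0=9, step=3 and "<=" comparison
// has trip_count = (9 - 0) / 3 + 1 = 4 (iterations 0, 3, 6, 9).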

template <typename T>
kmp_loop_nest_iv_t kmp_calculate_trip_count_XX(
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    if (bounds->ub0 < bounds->lb0) {
      // Note: after this we don't need to calculate inner loops,
      // but that should be an edge case:
      bounds->trip_count = 0;
    } else {
      // ub - lb may exceed signed type range; we need to cast to
      // kmp_loop_nest_iv_t anyway
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->ub0 - bounds->lb0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else if (bounds->comparison == comparison_t::comp_greater_or_eq) {
    if (bounds->lb0 < bounds->ub0) {
      // Note: after this we don't need to calculate inner loops,
      // but that should be an edge case:
      bounds->trip_count = 0;
    } else {
      // lb - ub may exceed signed type range; we need to cast to
      // kmp_loop_nest_iv_t anyway
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->lb0 - bounds->ub0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else {
    KMP_ASSERT(false);
  }
  return bounds->trip_count;
}

// Calculate trip count on this loop level.
kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) {

  kmp_loop_nest_iv_t trip_count = 0;

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    trip_count = kmp_calculate_trip_count_XX<kmp_int32>(
        /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
    break;
  case loop_type_t::loop_type_uint32:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint32>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
    break;
  case loop_type_t::loop_type_int64:
    trip_count = kmp_calculate_trip_count_XX<kmp_int64>(
        /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
    break;
  case loop_type_t::loop_type_uint64:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint64>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
    break;
  default:
    KMP_ASSERT(false);
  }

  return trip_count;
}

//----------Trim original iv according to its type----------------------------

// Trim the original iv according to its type.
// Returns a kmp_uint64 value which can be used in all internal calculations
// and can be statically cast back to the original type in user code.
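// Example: for loop_type_int8 an original_iv of 0xFF is sign-extended, so the
// returned value is 0xFFFFFFFFFFFFFFFF and casts back to (kmp_int8)-1.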
kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) {
  kmp_uint64 res = 0;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_uint64>(static_cast<kmp_int8>(original_iv));
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint8>(original_iv));
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_uint64>(static_cast<kmp_int16>(original_iv));
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint16>(original_iv));
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_uint64>(static_cast<kmp_int32>(original_iv));
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint32>(original_iv));
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_uint64>(static_cast<kmp_int64>(original_iv));
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}

//----------Compare two IVs (remember they have a type)-----------------------

bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1,
                kmp_uint64 original_iv2) {
  bool res = false;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_int8>(original_iv1) ==
          static_cast<kmp_int8>(original_iv2);
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint8>(original_iv1) ==
          static_cast<kmp_uint8>(original_iv2);
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_int16>(original_iv1) ==
          static_cast<kmp_int16>(original_iv2);
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint16>(original_iv1) ==
          static_cast<kmp_uint16>(original_iv2);
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_int32>(original_iv1) ==
          static_cast<kmp_int32>(original_iv2);
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint32>(original_iv1) ==
          static_cast<kmp_uint32>(original_iv2);
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_int64>(original_iv1) ==
          static_cast<kmp_int64>(original_iv2);
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv1) ==
          static_cast<kmp_uint64>(original_iv2);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}

//----------Calculate original iv on one level--------------------------------

// Return true if the point fits into upper bounds on this level,
// false otherwise
template <typename T>
bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  T iv = static_cast<T>(original_ivs[ind]);
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (iv > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (iv < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // The calculated point is outside of loop upper boundary:
    return false;
  }

  return true;
}

// Calculate one iv corresponding to iteration on the level ind.
// Return true if it fits into lower-upper bounds on this level
// (if not, we need to re-calculate)
template <typename T>
bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds,
                        /*in/out*/ kmp_point_t original_ivs,
                        const kmp_iterations_t iterations, kmp_index_t ind,
                        bool start_with_lower_bound, bool checkBounds) {

  kmp_uint64 temp = 0;
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // we moved to the next iteration on one of the outer loops, so we should
    // start with the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    auto iteration = iterations[ind];
    temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration * bounds->step;
  }

  // Now trim original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (checkBounds) {
    return kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind);
  } else {
    return true;
  }
}

bool kmp_calc_one_iv(const bounds_info_t *bounds,
                     /*in/out*/ kmp_point_t original_ivs,
                     const kmp_iterations_t iterations, kmp_index_t ind,
                     bool start_with_lower_bound, bool checkBounds) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  default:
    KMP_ASSERT(false);
    return false;
  }
}

//----------Calculate original iv on one level for rectangular loop nest------

// Calculate one iv corresponding to the iteration on level ind for a
// rectangular loop nest (no bounds check is needed here, so nothing is
// returned).
template <typename T>
void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds,
                                /*in/out*/ kmp_uint64 *original_ivs,
                                const kmp_iterations_t iterations,
                                kmp_index_t ind) {

  auto iteration = iterations[ind];

  kmp_uint64 temp =
      bounds->lb0 +
      bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) +
      iteration * bounds->step;

  // Now trim original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}

void kmp_calc_one_iv_rectang(const bounds_info_t *bounds,
                             /*in/out*/ kmp_uint64 *original_ivs,
                             const kmp_iterations_t iterations,
                             kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_rectang_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_rectang_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_rectang_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_rectang_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  default:
    KMP_ASSERT(false);
  }
}

//----------------------------------------------------------------------------
// Rectangular loop nest
//----------------------------------------------------------------------------

//----------Canonicalize loop nest and calculate trip count-------------------

// Canonicalize loop nest and calculate overall trip count.
// "bounds_nest" has to be allocated per thread.
// API will modify original bounds_nest array to bring it to a canonical form
// (only <= and >=, no !=, <, >). If the original loop nest was already in a
// canonical form there will be no changes to bounds in bounds_nest array
// (only trip counts will be calculated).
// Returns trip count of overall space.
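// Example: for a rectangular nest "for (i = 0; i < 4; ++i) for (j = 0; j < 10;
// ++j)" the per-level trip counts are 4 and 10 and the returned total is 40.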
extern "C" kmp_loop_nest_iv_t
__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
                                 /*in/out*/ bounds_info_t *original_bounds_nest,
                                 kmp_index_t n) {

  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/*in/out*/ bounds);
    total *= trip_count;
  }

  return total;
}

//----------Calculate old induction variables---------------------------------

// Calculate old induction variables corresponding to overall new_iv.
// Note: original IVs will be returned as if they had kmp_uint64 type and will
// have to be converted to the original type in user code.
// Note: trip counts should be already calculated by
// __kmpc_process_loop_nest_rectang.
// OMPTODO: special case 2, 3 nested loops: either do different
// interface without array or possibly template this over n
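// Example: for the 4 x 10 nest above, new_iv = 25 decomposes (inner level
// first) into iteration 25 % 10 = 5 on the inner loop and 25 / 10 = 2 on the
// outer loop; those iteration numbers are then mapped to original IV values.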
extern "C" void
__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
                                 const bounds_info_t *original_bounds_nest,
                                 /*out*/ kmp_uint64 *original_ivs,
                                 kmp_index_t n) {

  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);

  // First, calc corresponding iteration in every original loop:
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto bounds = &(original_bounds_nest[ind]);

    // should be optimized to OPDIVREM:
    auto temp = new_iv / bounds->trip_count;
    auto iteration = new_iv % bounds->trip_count;
    new_iv = temp;

    iterations[ind] = iteration;
  }
  KMP_ASSERT(new_iv == 0);

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind);
  }
}

//----------------------------------------------------------------------------
// Non-rectangular loop nest
//----------------------------------------------------------------------------

//----------Calculate maximum possible span of iv values on one level---------

// Calculate span for IV on this loop level for "<=" case.
// Note: it's for <= on this loop nest level, so lower bound should be smallest
// value, upper bound should be the biggest value. If the loop won't execute,
// 'smallest' may be bigger than 'biggest', but we'd better not switch them
// around.
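// Illustration: for "for (j = i; j <= i + 5; ++j)" nested in
// "for (i = 0; i <= 7; ++i)" we have lb1 = ub1 = 1, lb0 = 0, ub0 = 5; with the
// outer span 0..7 this gives span_smallest = 0 and span_biggest = 12, i.e. the
// full range of j values that can occur over all iterations of the outer loop.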
template <typename T>
void kmp_calc_span_lessoreq_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  // typedef typename traits_t<T>::signed_t ST;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on one of the previous ones; it can't be the
    // outermost one.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    // OMPTODO: assert that T is compatible with loop variable type on
    // 'previous' loop

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to step, because
      // lower bound might be off after adjustments

      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }
  } else {
    // Rectangular:
    bounds->span_smallest = bbounds.lb0;
    bounds->span_biggest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the space to a multiple of the step.
    // OMPTODO: check if the formula is correct.
    // Also check if it would be safe to do this if we didn't adjust the left
    // side.
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
  }
}

// Calculate span for IV on this loop level for ">=" case.
template <typename T>
void kmp_calc_span_greateroreq_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  // typedef typename traits_t<T>::signed_t ST;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on one of the previous ones; it can't be the
    // outermost one.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    // OMPTODO: assert that T is compatible with loop variable type on
    // 'previous' loop

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to step, because
      // lower bound might be off after adjustments

      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }

  } else {
    // Rectangular:
    bounds->span_biggest = bbounds.lb0;
    bounds->span_smallest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the space to a multiple of the step.
    // OMPTODO: check if the formula is correct.
    // Also check if it would be safe to do this if we didn't adjust the left
    // side.
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
  }
}

// Calculate maximum possible span for IV on this loop level.
template <typename T>
void kmp_calc_span_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  if (bounds->b.comparison == comparison_t::comp_less_or_eq) {
    kmp_calc_span_lessoreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  } else {
    KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq);
    kmp_calc_span_greateroreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  }
}

//----------All initial processing of the loop nest---------------------------

// Calculate new bounds for this loop level.
// To be able to work with the nest we need to get it to a parallelepiped shape.
// We need to stay in the original range of values, so that there will be no
// overflow, for that we'll adjust both upper and lower bounds as needed.
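// Illustration: a triangular inner loop "for (j = 0; j <= i; ++j)" under
// "for (i = 0; i <= 7; ++i)" has lb1 = 0, ub1 = 1; since the signs differ, the
// loop is widened to a rectangle (lb1 = ub1 = 0) and ub0 is moved up by
// old_ub1 * span_biggest of the outer loop, giving 0 <= j <= 7.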
template <typename T>
void kmp_calc_new_bounds_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  auto &bbounds = bounds->b;

  if (bbounds.lb1 == bbounds.ub1) {
    // Already parallel, no need to adjust:
    bounds->loop_bounds_adjusted = false;
  } else {
    bounds->loop_bounds_adjusted = true;

    T old_lb1 = bbounds.lb1;
    T old_ub1 = bbounds.ub1;

    if (__kmp_sign(old_lb1) != __kmp_sign(old_ub1)) {
      // With this shape we can adjust to a rectangle:
      bbounds.lb1 = 0;
      bbounds.ub1 = 0;
    } else {
      // get upper and lower bounds to be parallel
      // with values in the old range.
      // Note: abs didn't work here.
      if (((old_lb1 < 0) && (old_lb1 < old_ub1)) ||
          ((old_lb1 > 0) && (old_lb1 > old_ub1))) {
        bbounds.lb1 = old_ub1;
      } else {
        bbounds.ub1 = old_lb1;
      }
    }

    // Now we need to adjust lb0, ub0, otherwise in some cases the space will
    // shrink. The idea here is that for this IV we now get the same span
    // irrespective of the previous IV value.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &bounds_nest[bbounds.outer_iv]);

    if (bbounds.comparison == comparison_t::comp_less_or_eq) {
      if (old_lb1 < bbounds.lb1) {
        KMP_ASSERT(old_lb1 < 0);
        // The length is good at the biggest outer_iv value,
        // so we can use it to find where to move the lower bound:

        T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest;
        bbounds.lb0 -= sub; // OMPTODO: what if it'll go out of unsigned space?
                            // e.g. it was 0?? (same below)
      } else if (old_lb1 > bbounds.lb1) {
        // still need to move lower bound:
        T add = (old_lb1 - bbounds.lb1) * previous->span_smallest;
        bbounds.lb0 += add;
      }

      if (old_ub1 > bbounds.ub1) {
        KMP_ASSERT(old_ub1 > 0);
        // The length is good at the biggest outer_iv value,
        // so we can use it to find where to move the upper bound:

        T add = (old_ub1 - bbounds.ub1) * previous->span_biggest;
        bbounds.ub0 += add;
      } else if (old_ub1 < bbounds.ub1) {
        // still need to move upper bound:
        T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest;
        bbounds.ub0 -= sub;
      }
    } else {
      KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq);
      if (old_lb1 < bbounds.lb1) {
        KMP_ASSERT(old_lb1 < 0);
        T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest;
        bbounds.lb0 -= sub;
      } else if (old_lb1 > bbounds.lb1) {
        T add = (old_lb1 - bbounds.lb1) * previous->span_biggest;
        bbounds.lb0 += add;
      }

      if (old_ub1 > bbounds.ub1) {
        KMP_ASSERT(old_ub1 > 0);
        T add = (old_ub1 - bbounds.ub1) * previous->span_smallest;
        bbounds.ub0 += add;
      } else if (old_ub1 < bbounds.ub1) {
        T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest;
        bbounds.ub0 -= sub;
      }
    }
  }
}

// Do all processing for one canonicalized loop in the nest
// (assuming that outer loops already were processed):
template <typename T>
kmp_loop_nest_iv_t kmp_process_one_loop_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /*in/out*/ bounds_info_internal_t *bounds_nest) {

  kmp_calc_new_bounds_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  kmp_calc_span_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  return kmp_calculate_trip_count_XX(/*in/out*/ &(bounds->b));
}

// Non-rectangular loop nest, canonicalized to use <= or >=.
// Process loop nest to have a parallelepiped shape,
// calculate biggest spans for IV's on all levels and calculate overall trip
// count. "bounds_nest" has to be allocated per thread.
// Returns overall trip count (for adjusted space).
kmp_loop_nest_iv_t kmp_process_loop_nest(
    /*in/out*/ bounds_info_internal_t *bounds_nest, kmp_index_t n) {

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind]);
    kmp_loop_nest_iv_t trip_count = 0;

    switch (bounds->b.loop_type) {
    case loop_type_t::loop_type_int32:
      trip_count = kmp_process_one_loop_XX<kmp_int32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint32:
      trip_count = kmp_process_one_loop_XX<kmp_uint32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_int64:
      trip_count = kmp_process_one_loop_XX<kmp_int64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint64:
      trip_count = kmp_process_one_loop_XX<kmp_uint64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    default:
      KMP_ASSERT(false);
    }
    total *= trip_count;
  }

  return total;
}

//----------Calculate iterations (in the original or updated space)-----------

// Calculate number of iterations in original or updated space resulting in
// original_ivs[ind] (only on this level, non-negative)
// (not counting initial iteration)
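// Example: for a "<=" loop with lb0 = 0, lb1 = 0 and step = 3, the IV value 9
// corresponds to iteration number (9 - 0) / 3 = 3 (the initial iteration
// being number 0).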
template <typename T>
kmp_loop_nest_iv_t
kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  kmp_loop_nest_iv_t iterations = 0;

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    iterations =
        (static_cast<T>(original_ivs[ind]) - bounds->lb0 -
         bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv])) /
        __kmp_abs(bounds->step);
  } else {
    KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq);
    iterations = (bounds->lb0 +
                  bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) -
                  static_cast<T>(original_ivs[ind])) /
                 __kmp_abs(bounds->step);
  }

  return iterations;
}

// Calculate number of iterations in the original or updated space resulting in
// original_ivs[ind] (only on this level, non-negative)
kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds,
                                                 const kmp_point_t original_ivs,
                                                 kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_number_of_iterations_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_number_of_iterations_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_number_of_iterations_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_number_of_iterations_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds), original_ivs, ind);
    break;
  default:
    KMP_ASSERT(false);
    return 0;
  }
}

//----------Calculate new iv corresponding to original ivs--------------------

// We got a point in the original loop nest.
// Take updated bounds and calculate what new_iv will correspond to this point.
// When we are getting original IVs from new_iv, we have to adjust them to fit
// into the original loop bounds. Getting new_iv for the adjusted original IVs
// will help with making more chunks non-empty.
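// Example: for two updated loops with trip counts 8 (outer) and 8 (inner), a
// point whose per-level iteration numbers are 2 and 5 maps to
// new_iv = 2 * 8 + 5 = 21.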
kmp_loop_nest_iv_t
kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest,
                                  const kmp_point_t original_ivs,
                                  kmp_index_t n) {

  kmp_loop_nest_iv_t new_iv = 0;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind].b);

    new_iv = new_iv * bounds->trip_count +
             kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  return new_iv;
}

//----------Calculate original ivs for provided iterations--------------------

// Calculate original IVs for provided iterations, assuming iterations are
// calculated in the original space.
// Loop nest is in canonical form (with <= / >=).
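// The walk goes from the outer level inwards; whenever a computed IV falls
// outside the bounds on some level, we step back one level, advance that
// level's iteration, reset all inner iterations to 0 and retry from there.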
bool kmp_calc_original_ivs_from_iterations(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    /*in/out*/ kmp_point_t original_ivs,
    /*in/out*/ kmp_iterations_t iterations, kmp_index_t ind) {

  kmp_index_t lengthened_ind = n;

  for (; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    bool good = kmp_calc_one_iv(bounds, /*in/out*/ original_ivs, iterations,
                                ind, (lengthened_ind < ind), true);

    if (!good) {
      // The calculated iv value is too big (or too small for >=):
      if (ind == 0) {
        // Space is empty:
        return false;
      } else {
        // Go to next iteration on the outer loop:
        --ind;
        ++iterations[ind];
        lengthened_ind = ind;
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }
    ++ind;
  }

  return true;
}

//----------Calculate original ivs for the beginning of the loop nest---------

// Calculate IVs for the beginning of the loop nest.
// Note: the lower bounds of all loops may not work, because on some iterations
// of the outer loops the inner loops may be empty.
// Loop nest is in canonical form (with <= / >=).
bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
                                     kmp_index_t n,
                                     /*out*/ kmp_point_t original_ivs) {

  // Iterations in the original space, multiplied by step:
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    iterations[ind] = 0;
  }

  // Now calculate the point:
  bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
                                                 /*in/out*/ original_ivs,
                                                 /*in/out*/ iterations, 0);
  return b;
}

//----------Calculate next point in the original loop space-------------------

// From current set of original IVs calculate next point.
// Return false if there is no next point in the loop bounds.
bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
                                kmp_index_t n, const kmp_point_t original_ivs,
                                /*out*/ kmp_point_t next_original_ivs) {
  // Iterations in the original space, multiplied by step (so can be negative):
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  // First, calc corresponding iteration in every original loop:
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    next_original_ivs[ind] = original_ivs[ind];
  }

  // Next add one step to the iterations on the inner-most level, and see if we
  // need to move up the nest:
  kmp_index_t ind = n - 1;
  ++iterations[ind];

  bool b = kmp_calc_original_ivs_from_iterations(
      original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind);

  return b;
}

//----------Calculate chunk end in the original loop space--------------------

// For one level calculate old induction variable corresponding to overall
// new_iv for the chunk end.
// Return true if it fits into upper bound on this level
// (if not, we need to re-calculate)
template <typename T>
bool kmp_calc_one_iv_for_chunk_end_XX(
    const bounds_infoXX_template<T> *bounds,
    const bounds_infoXX_template<T> *updated_bounds,
    /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations,
    kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start,
    const kmp_point_t original_ivs_start) {

  // typedef  std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
  // big_span_t;

  // OMPTODO: is it good enough, or do we need ST or do we need big_span_t?
  T temp = 0;

  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // we moved to the next iteration on one of the outer loops, so we may as
    // well use the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    // Start in expanded space, but:
    // - we need to hit original space lower bound, so need to account for
    // that
    // - we have to go into original space, even if that means adding more
    // iterations than was planned
    // - we have to go past (or equal to) previous point (which is the chunk
    // starting point)

    auto iteration = iterations[ind];

    auto step = bounds->step;

    // In case of >= it's negative:
    auto accountForStep =
        ((bounds->lb0 + bounds->lb1 * outer_iv) -
         (updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) %
        step;

    temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv +
           accountForStep + iteration * step;

    if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
         (temp < (bounds->lb0 + bounds->lb1 * outer_iv))) ||
        ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
         (temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) {
      // Too small (or too big), didn't reach the original lower bound. Use
      // heuristic:
      temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration / 2 * step;
    }

    if (compare_with_start) {

      T start = static_cast<T>(original_ivs_start[ind]);

      temp = kmp_fix_iv(bounds->loop_iv_type, temp);

      // On all previous levels the start of the chunk is the same as the end,
      // so we need to be really careful here:
      if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
           (temp < start)) ||
          ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
           (temp > start))) {
        // The end of the chunk can't be smaller (for >= bigger) than its
        // start. Use a heuristic:
        temp = start + iteration / 4 * step;
      }
    }
  }

  original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (temp > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // Too big (or too small for >=).
    return false;
  }

  return true;
}

// For one level calculate old induction variable corresponding to overall
// new_iv for the chunk end.
bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds,
                                   const bounds_info_t *updated_bounds,
                                   /*in/out*/ kmp_point_t original_ivs,
                                   const kmp_iterations_t iterations,
                                   kmp_index_t ind, bool start_with_lower_bound,
                                   bool compare_with_start,
                                   const kmp_point_t original_ivs_start) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        (bounds_infoXX_template<kmp_int32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        (bounds_infoXX_template<kmp_uint32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        (bounds_infoXX_template<kmp_int64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        (bounds_infoXX_template<kmp_uint64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  default:
    KMP_ASSERT(false);
    return false;
  }
}

// Calculate old induction variables corresponding to overall new_iv for the
// chunk end. If, due to the space extension, we get old IVs outside of the
// boundaries, bring them back into the boundaries. This needs to be done in
// the runtime, especially on the lower bounds side. When getting the result
// we need to make sure that the new chunk starts right after the old chunk
// and does not overlap with it (this is done elsewhere), and that the end of
// the chunk is not before its beginning. We don't need an exact ending point
// here, just something more-or-less close to the desired chunk length;
// bigger is fine (smaller would be fine too, but we risk going into an
// infinite loop, so go smaller only at the very end of the space).
// Returns false if the ending point could not be found in the original loop
// space. In this case the caller can use the original upper bounds as the end
// of the chunk. The chunk won't be empty, because it has at least the starting
// point, which is by construction in the original space.
bool kmp_calc_original_ivs_for_chunk_end(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    const bounds_info_internal_t *updated_bounds_nest,
    const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv,
    /*out*/ kmp_point_t original_ivs) {

  // Iterations in the expanded space:
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  // First, calc corresponding iteration in every modified loop:
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto &updated_bounds = updated_bounds_nest[ind];

    // should be optimized to OPDIVREM:
    auto new_ind = new_iv / updated_bounds.b.trip_count;
    auto iteration = new_iv % updated_bounds.b.trip_count;

    new_iv = new_ind;
    iterations[ind] = iteration;
  }
  KMP_DEBUG_ASSERT(new_iv == 0);

  kmp_index_t lengthened_ind = n;
  kmp_index_t equal_ind = -1;

  // Next calculate the point, but in original loop nest.
  for (kmp_index_t ind = 0; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    auto updated_bounds = &(updated_bounds_nest[ind].b);

    bool good = kmp_calc_one_iv_for_chunk_end(
        bounds, updated_bounds,
        /*in/out*/ original_ivs, iterations, ind, (lengthened_ind < ind),
        (equal_ind >= ind - 1), original_ivs_start);

    if (!good) {
      // Too big (or too small for >=).
      if (ind == 0) {
        // Need to reduce to the end.
        return false;
      } else {
        // Go to next iteration on outer loop:
        --ind;
        ++(iterations[ind]);
        lengthened_ind = ind;
        if (equal_ind >= lengthened_ind) {
          // We've changed the number of iterations here,
          // can't be same anymore:
          equal_ind = lengthened_ind - 1;
        }
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }

    if ((equal_ind == ind - 1) &&
        (kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                    original_ivs_start[ind]))) {
      equal_ind = ind;
    } else if ((equal_ind > ind - 1) &&
               !(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                            original_ivs_start[ind]))) {
      equal_ind = ind - 1;
    }
    ++ind;
  }

  return true;
}

//----------Calculate upper bounds for the last chunk-------------------------

// Calculate one upper bound for the end.
template <typename T>
void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds,
                            /*in/out*/ kmp_point_t original_ivs,
                            kmp_index_t ind) {

  T temp = bounds->ub0 +
           bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]);

  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}

void kmp_calc_one_iv_end(const bounds_info_t *bounds,
                         /*in/out*/ kmp_point_t original_ivs, kmp_index_t ind) {

  switch (bounds->loop_type) {
  default:
    KMP_ASSERT(false);
    break;
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  }
}

// Calculate upper bounds for the last loop iteration. Just use original upper
// bounds (adjusted when canonicalized to use <= / >=). No need to check that
// this point is in the original space (it's likely not)
void kmp_calc_original_ivs_for_end(
    const bounds_info_t *const original_bounds_nest, kmp_index_t n,
    /*out*/ kmp_point_t original_ivs) {
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    kmp_calc_one_iv_end(bounds, /*in/out*/ original_ivs, ind);
  }
}

//----------Init API for non-rectangular loops--------------------------------

// Init API for collapsed loops (static, no chunks defined).
// "bounds_nest" has to be allocated per thread.
// API will modify original bounds_nest array to bring it to a canonical form
// (only <= and >=, no !=, <, >). If the original loop nest was already in a
// canonical form there will be no changes to bounds in bounds_nest array
// (only trip counts will be calculated). Internally API will expand the space
// to parallelogram/parallelepiped, calculate total, calculate bounds for the
// chunks in terms of the new IV, re-calc them in terms of old IVs (especially
// important on the left side, to hit the lower bounds and not step over), and
// pick the correct chunk for this thread (so it will calculate chunks up to the
// needed one). It could be optimized to calculate just this chunk, potentially
// a bit less well distributed among threads. It is designed to make sure that
// threads will receive predictable chunks, deterministically (so that next nest
// of loops with similar characteristics will get exactly same chunks on same
// threads).
// Current contract: chunk_bounds_nest has only lb0 and ub0,
// lb1 and ub1 are set to 0 and can be ignored. (This may change in the future).
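// Chunk-size illustration: with total = 100 adjusted iterations and nth = 8
// threads, the first 100 % 8 = 4 chunks get 13 new-IV iterations and the
// remaining 4 chunks get 12; for non-rectangular nests each chunk end is then
// re-fitted to the original space, so the actual sizes can differ slightly.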
extern "C" kmp_int32
__kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
                          /*in/out*/ bounds_info_t *original_bounds_nest,
                          /*out*/ bounds_info_t *chunk_bounds_nest,
                          kmp_index_t n, /*out*/ kmp_int32 *plastiter) {

  KMP_DEBUG_ASSERT(plastiter && original_bounds_nest);
  KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid));

  if (__kmp_env_consistency_check) {
    __kmp_push_workshare(gtid, ct_pdo, loc);
  }

  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);

  CollapseAllocator<bounds_info_internal_t> updated_bounds_nest(n);

  for (kmp_index_t i = 0; i < n; ++i) {
    updated_bounds_nest[i].b = original_bounds_nest[i];
  }

  kmp_loop_nest_iv_t total =
      kmp_process_loop_nest(/*in/out*/ updated_bounds_nest, n);

  if (plastiter != NULL) {
    *plastiter = FALSE;
  }

  if (total == 0) {
    // Loop won't execute:
    return FALSE;
  }

  // OMPTODO: DISTRIBUTE is not supported yet
  __kmp_assert_valid_gtid(gtid);
  kmp_uint32 tid = __kmp_tid_from_gtid(gtid);

  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_uint32 nth = team->t.t_nproc; // Number of threads

  KMP_DEBUG_ASSERT(tid < nth);

  CollapseAllocator<kmp_uint64> original_ivs_start(n);

  if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
                                       /*out*/ original_ivs_start)) {
    // Loop won't execute:
    return FALSE;
  }

  // Not doing this optimization for one thread:
  // (1) more to test
  // (2) without it the current contract holds: chunk_bounds_nest has only lb0
  // and ub0, while lb1 and ub1 are set to 0 and can be ignored.
  // if (nth == 1) {
  //  // One thread:
  //  // Copy all info from original_bounds_nest, it'll be good enough.

  //  for (kmp_index_t i = 0; i < n; ++i) {
  //    chunk_bounds_nest[i] = original_bounds_nest[i];
  //  }

  //  if (plastiter != NULL) {
  //    *plastiter = TRUE;
  //  }
  //  return TRUE;
  //}

  kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs(
      updated_bounds_nest, original_ivs_start, n);

  bool last_iter = false;

  for (; nth > 0;) {
    // We could calculate the chunk size once, but this is to compensate for
    // the original space not being a parallelepiped and some threads possibly
    // being left without work:
    KMP_DEBUG_ASSERT(total >= new_iv);

    kmp_loop_nest_iv_t total_left = total - new_iv;
    kmp_loop_nest_iv_t chunk_size = total_left / nth;
    kmp_loop_nest_iv_t remainder = total_left % nth;

    kmp_loop_nest_iv_t curr_chunk_size = chunk_size;

    if (remainder > 0) {
      ++curr_chunk_size;
      --remainder;
    }

#if defined(KMP_DEBUG)
    kmp_loop_nest_iv_t new_iv_for_start = new_iv;
#endif

    if (curr_chunk_size > 1) {
      new_iv += curr_chunk_size - 1;
    }

    CollapseAllocator<kmp_uint64> original_ivs_end(n);
    if ((nth == 1) || (new_iv >= total - 1)) {
      // Do this one till the end - just in case we miscalculated
      // and either too much is left to process or new_iv is a bit too big:
      kmp_calc_original_ivs_for_end(original_bounds_nest, n,
                                    /*out*/ original_ivs_end);

      last_iter = true;
    } else {
      // Note: here we make sure it's past (or equal to) the previous point.
      if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n,
                                               updated_bounds_nest,
                                               original_ivs_start, new_iv,
                                               /*out*/ original_ivs_end)) {
        // We could not find the ending point, use the original upper bounds:
        kmp_calc_original_ivs_for_end(original_bounds_nest, n,
                                      /*out*/ original_ivs_end);

        last_iter = true;
      }
    }

#if defined(KMP_DEBUG)
    auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs(
        updated_bounds_nest, original_ivs_end, n);
    KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start);
#endif

    if (last_iter && (tid != 0)) {
      // We are done, this was the last chunk, but no chunk for the current
      // thread was found:
      return FALSE;
    }

    if (tid == 0) {
      // We found the chunk for this thread, now we need to check if it's the
      // last chunk or not:

      CollapseAllocator<kmp_uint64> original_ivs_next_start(n);
      if (last_iter ||
          !kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end,
                                      /*out*/ original_ivs_next_start)) {
        // no more loop iterations left to process,
        // this means that currently found chunk is the last chunk:
        if (plastiter != NULL) {
          *plastiter = TRUE;
        }
      }

      // Fill in chunk bounds:
      for (kmp_index_t i = 0; i < n; ++i) {
        chunk_bounds_nest[i] =
            original_bounds_nest[i]; // To fill in types, etc. - optional
        chunk_bounds_nest[i].lb0_u64 = original_ivs_start[i];
        chunk_bounds_nest[i].lb1_u64 = 0;

        chunk_bounds_nest[i].ub0_u64 = original_ivs_end[i];
        chunk_bounds_nest[i].ub1_u64 = 0;
      }

      return TRUE;
    }

    --tid;
    --nth;

    bool next_chunk = kmp_calc_next_original_ivs(
        original_bounds_nest, n, original_ivs_end, /*out*/ original_ivs_start);
    if (!next_chunk) {
      // no more loop iterations to process,
      // the previous chunk was the last chunk
      break;
    }

    // original_ivs_start is right after the previous chunk's original_ivs_end;
    // we need to start the new chunk here, so that chunks follow one after
    // another without any gap or overlap:
    new_iv = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest,
                                               original_ivs_start, n);
  }

  return FALSE;
}