//===-- GPU memory allocator implementation ---------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a parallel allocator intended for use on a GPU device.
// The core algorithm is a slab allocator that uses a random walk over a
// bitfield for maximum parallel progress. Slab handling is done by a wait-free
// reference counted guard. The first use of a slab creates it from system
// memory or reuses a cached one; the last use invalidates it and frees or
// caches the memory.
//
//===----------------------------------------------------------------------===//

#include "allocator.h"

#include "src/__support/CPP/algorithm.h"
#include "src/__support/CPP/atomic.h"
#include "src/__support/CPP/bit.h"
#include "src/__support/CPP/new.h"
#include "src/__support/GPU/fixedstack.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
#include "src/__support/threads/sleep.h"
#include "src/string/memory_utils/inline_memcpy.h"

namespace LIBC_NAMESPACE_DECL {

constexpr static uint64_t MAX_SIZE = /* 64 GiB */ 64ull * 1024 * 1024 * 1024;
constexpr static uint64_t SLAB_SIZE = /* 2 MiB */ 2ull * 1024 * 1024;
constexpr static uint64_t ARRAY_SIZE = MAX_SIZE / SLAB_SIZE;
constexpr static uint64_t SLAB_ALIGNMENT = SLAB_SIZE - 1;
constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
constexpr static uint32_t BITS_IN_DWORD = sizeof(uint64_t) * 8;
constexpr static uint32_t MIN_SIZE = 16;
constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;

// The number of times to attempt claiming an in-progress slab allocation.
constexpr static uint32_t MAX_TRIES = 1024;

// The number of previously allocated slabs we will keep in memory.
constexpr static uint32_t CACHED_SLABS = 8;

// Configuration for whether or not we will return unused slabs to memory.
constexpr static bool RECLAIM = true;

static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");
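// With the defaults above this gives ARRAY_SIZE = 64 GiB / 2 MiB = 32768 slab
// slots, a SLAB_ALIGNMENT mask of 0x1fffff, and a MIN_ALIGNMENT mask of 0xf.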

namespace impl {
// Allocates more memory from the system through the RPC interface. All
// allocations from the system MUST be aligned on a 2MiB boundary. The default
// HSA allocator has this behavior for any allocation >= 2MiB and the CUDA
// driver provides an alignment field for virtual memory allocations.
static void *rpc_allocate(uint64_t size) {
  void *ptr = nullptr;
  rpc::Client::Port port = rpc::client.open<LIBC_MALLOC>();
  port.send_and_recv(
      [=](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = size; },
      [&](rpc::Buffer *buffer, uint32_t) {
        ptr = reinterpret_cast<void *>(buffer->data[0]);
      });
  port.close();
  return ptr;
}

// Deallocates the associated system memory.
static void rpc_free(void *ptr) {
  rpc::Client::Port port = rpc::client.open<LIBC_FREE>();
  port.send([=](rpc::Buffer *buffer, uint32_t) {
    buffer->data[0] = reinterpret_cast<uintptr_t>(ptr);
  });
  port.close();
}

// Converts a potentially disjoint bitmask into an increasing per-lane integer
// suitable for indexing between GPU lanes.
static inline uint32_t lane_count(uint64_t lane_mask, uint32_t id) {
  return cpp::popcount(lane_mask & ((uint64_t(1) << id) - 1));
}
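// For example, lane_count(0b1101, 3) is popcount(0b0101) == 2: lanes 0 and 2
// are active below lane 3, so lane 3 receives the compacted index 2.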

// Obtain an initial value to seed a random number generator. We use the rounded
// multiples of the golden ratio from xorshift* as additional spreading.
static inline uint32_t entropy() {
  return (static_cast<uint32_t>(gpu::processor_clock()) ^
          (gpu::get_thread_id_x() * 0x632be59b) ^
          (gpu::get_block_id_x() * 0x85157af5)) *
         0x9e3779bb;
}

// Generate a random number and update the state using the xorshift32* PRNG.
static inline uint32_t xorshift32(uint32_t &state) {
  state ^= state << 13;
  state ^= state >> 17;
  state ^= state << 5;
  return state * 0x9e3779bb;
}

// Rounds the input value up to the closest permitted chunk size. Here we
// accept the sum of the closest three powers of two. For a 2MiB slab size this
// yields 46 distinct chunk sizes (one per entry of the indices table below),
// giving an average internal fragmentation of roughly 12.5%, i.e. about 87.5%
// of each chunk is used on average.
static inline constexpr uint32_t get_chunk_size(uint32_t x) {
  uint32_t y = x < MIN_SIZE ? MIN_SIZE : x;
  uint32_t pow2 = BITS_IN_WORD - cpp::countl_zero(y - 1);

  uint32_t s0 = 0b0100 << (pow2 - 3);
  uint32_t s1 = 0b0110 << (pow2 - 3);
  uint32_t s2 = 0b0111 << (pow2 - 3);
  uint32_t s3 = 0b1000 << (pow2 - 3);

  if (s0 > y)
    return (s0 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
  if (s1 > y)
    return (s1 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
  if (s2 > y)
    return (s2 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
  return (s3 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
}
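// For example, get_chunk_size(100) lands on the 7/8 step of the enclosing
// power of two, 112, while get_chunk_size(33) rounds up to 48. Sub-steps that
// are not multiples of 16, such as 24 or 56, are bumped to the next multiple
// by the MIN_ALIGNMENT rounding above.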

// Converts a chunk size into an index suitable for a statically sized array.
static inline constexpr uint32_t get_chunk_id(uint32_t x) {
  if (x <= MIN_SIZE)
    return 0;
  uint32_t y = x >> 4;
  if (x < MIN_SIZE << 2)
    return cpp::popcount(y);
  return cpp::popcount(y) + 3 * (BITS_IN_WORD - cpp::countl_zero(y)) - 7;
}
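// The mapping is dense: get_chunk_id(16) == 0, get_chunk_id(112) == 5, and
// get_chunk_id(SLAB_SIZE / 2) == 45, matching the 46-entry indices table
// defined further below.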

// Rounds the value up to the next multiple of N, which must be a power of two.
// A value that is already a multiple of N is bumped to the following one.
template <uint32_t N, typename T>
static inline constexpr T round_up(const T x) {
  static_assert(((N - 1) & N) == 0, "N must be a power of two");
  return (x + N) & ~(N - 1);
}
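// For example, round_up<16>(5) == 16 while round_up<16>(16) == 32; the sole
// caller below uses this to round an oversized request up to whole slabs.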

// Perform a lane parallel memset on a uint32_t pointer.
void uniform_memset(uint32_t *s, uint32_t c, uint32_t n, uint64_t uniform) {
  uint64_t mask = gpu::get_lane_mask();
  uint32_t workers = cpp::popcount(uniform);
  for (uint32_t i = impl::lane_count(mask & uniform, gpu::get_lane_id()); i < n;
       i += workers)
    s[i] = c;
}

// Returns true if the provided value is a nonzero power of two.
static inline constexpr bool is_pow2(uint64_t x) {
  return x && (x & (x - 1)) == 0;
}

// Where this chunk size should start looking in the global array. Small
// allocations are much more likely than large ones, so we give them the most
// space. We use a cubic easing function normalized on the possible chunks.
static inline constexpr uint32_t get_start_index(uint32_t chunk_size) {
  constexpr uint32_t max_chunk = impl::get_chunk_id(SLAB_SIZE / 2);
  uint64_t norm =
      (1 << 16) - (impl::get_chunk_id(chunk_size) << 16) / max_chunk;
  uint64_t bias = (norm * norm * norm) >> 32;
  uint64_t inv = (1 << 16) - bias;
  return static_cast<uint32_t>(((ARRAY_SIZE - 1) * inv) >> 16);
}
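// With the constants above, 16 byte chunks (id 0) start scanning at index 0
// while the largest 1 MiB chunks (id 45) start at index ARRAY_SIZE - 1; the
// cubic curve packs the rarer large sizes toward the end of the array so that
// the common small sizes keep most of the space.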

// Returns the id of the lane below this one that acts as its leader.
static inline uint32_t get_leader_id(uint64_t ballot, uint32_t id) {
  uint64_t mask = id < BITS_IN_DWORD - 1 ? ~0ull << (id + 1) : 0;
  return BITS_IN_DWORD - cpp::countl_zero(ballot & ~mask) - 1;
}
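// For example, with a ballot of 0b1001 lanes 1 and 2 resolve to leader 0 (the
// nearest balloted lane at or below them) while lane 3 is its own leader.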

// We use a sentinel value to indicate a failed or in-progress allocation.
template <typename T> bool is_sentinel(const T &x) {
  if constexpr (cpp::is_pointer_v<T>)
    return reinterpret_cast<uintptr_t>(x) ==
           cpp::numeric_limits<uintptr_t>::max();
  else
    return x == cpp::numeric_limits<T>::max();
}

} // namespace impl

/// A slab allocator used to hand out identically sized slabs of memory.
/// Allocation is done through random walks of a bitfield until a free bit is
/// encountered. This reduces contention and is highly parallel on a GPU.
///
/// 0       4           8       16                 ...                     2 MiB
/// ┌────────┬──────────┬────────┬──────────────────┬──────────────────────────┐
/// │ chunk  │  index   │  pad   │    bitfield[]    │         memory[]         │
/// └────────┴──────────┴────────┴──────────────────┴──────────────────────────┘
///
/// The size of the bitfield is the slab size divided by the chunk size,
/// divided by the number of bits per word. We pad the header to ensure 16 byte
/// alignment; because a pointer handed out from a slab is therefore never
/// aligned to 2MiB, it can be distinguished from a pointer that belongs to the
/// global (system) allocator.
struct Slab {
  // Header metadata for the slab, aligned to the minimum alignment.
  struct alignas(MIN_SIZE) Header {
    uint32_t chunk_size;
    uint32_t global_index;
    uint32_t cached_chunk_size;
  };

  // Initialize the slab with its chunk size and index in the global table for
  // use when freeing.
  Slab(uint32_t chunk_size, uint32_t global_index) {
    Header *header = reinterpret_cast<Header *>(memory);
    header->cached_chunk_size = cpp::numeric_limits<uint32_t>::max();
    header->chunk_size = chunk_size;
    header->global_index = global_index;
  }

  // Reset the slab with a new index and chunk size. This is not thread safe.
  Slab *reset(uint32_t chunk_size, uint32_t global_index) {
    Header *header = reinterpret_cast<Header *>(memory);
    header->cached_chunk_size = header->chunk_size;
    header->chunk_size = chunk_size;
    header->global_index = global_index;
    return this;
  }

  // Set the necessary bitfield bytes to zero in parallel using many lanes. This
  // must be called before the bitfield can be accessed safely; memory is not
  // guaranteed to be zero initialized in the current implementation.
  void initialize(uint64_t uniform) {
    // If this is a re-used slab the memory is already set to zero.
    if (get_cached_chunk_size() <= get_chunk_size())
      return;

    uint32_t size = (bitfield_bytes(get_chunk_size()) + sizeof(uint32_t) - 1) /
                    sizeof(uint32_t);
    impl::uniform_memset(get_bitfield(), 0, size, uniform);
  }

  // Get the number of chunks that can theoretically fit inside this slab.
  constexpr static uint32_t num_chunks(uint32_t chunk_size) {
    return SLAB_SIZE / chunk_size;
  }

  // Get the number of bytes needed to contain the bitfield bits.
  constexpr static uint32_t bitfield_bytes(uint32_t chunk_size) {
    return __builtin_align_up(
        ((num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD) * 8,
        MIN_ALIGNMENT + 1);
  }

  // The actual amount of memory available excluding the bitfield and metadata.
  constexpr static uint32_t available_bytes(uint32_t chunk_size) {
    return SLAB_SIZE - bitfield_bytes(chunk_size) - sizeof(Header);
  }

  // The number of chunks that can be stored in this slab.
  constexpr static uint32_t available_chunks(uint32_t chunk_size) {
    return available_bytes(chunk_size) / chunk_size;
  }

  // The length in bits of the bitfield.
  constexpr static uint32_t usable_bits(uint32_t chunk_size) {
    return available_bytes(chunk_size) / chunk_size;
  }
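  // As a concrete example of the helpers above: for 16 byte chunks a slab
  // nominally holds 131072 chunks, the bitfield reserves 32768 bytes, and
  // available_chunks(16) == usable_bits(16) == 129023 once the bitfield and
  // the 16 byte header are subtracted.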

  // Get the chunk size stored in the slab header.
  uint32_t get_chunk_size() const {
    return reinterpret_cast<const Header *>(memory)->chunk_size;
  }

  // Get the chunk size that was previously used.
  uint32_t get_cached_chunk_size() const {
    return reinterpret_cast<const Header *>(memory)->cached_chunk_size;
  }

  // Get the global index stored in the slab header.
  uint32_t get_global_index() const {
    return reinterpret_cast<const Header *>(memory)->global_index;
  }

  // Get a pointer to where the bitfield is located in the memory.
  uint32_t *get_bitfield() {
    return reinterpret_cast<uint32_t *>(memory + sizeof(Header));
  }

  // Get a pointer to where the actual memory to be allocated lives.
  uint8_t *get_memory(uint32_t chunk_size) {
    return reinterpret_cast<uint8_t *>(get_bitfield()) +
           bitfield_bytes(chunk_size);
  }

  // Get a pointer to the actual memory given an index into the bitfield.
  void *ptr_from_index(uint32_t index, uint32_t chunk_size) {
    return get_memory(chunk_size) + index * chunk_size;
  }

  // Convert a pointer back into its bitfield index using its offset.
  uint32_t index_from_ptr(void *ptr, uint32_t chunk_size) {
    return static_cast<uint32_t>(reinterpret_cast<uint8_t *>(ptr) -
                                 get_memory(chunk_size)) /
           chunk_size;
  }

  // Randomly walks the bitfield until it finds a free bit. Allocations attempt
  // to put lanes right next to each other for better caching and convergence.
  void *allocate(uint64_t uniform, uint32_t reserved) {
    uint32_t chunk_size = get_chunk_size();
    uint32_t state = impl::entropy();

    // Try to find an empty bit in the bitfield to finish the allocation. We
    // start at the number of chunks already reserved, as those bits are
    // guaranteed to be free until the user starts freeing memory.
    uint64_t lane_mask = gpu::get_lane_mask();
    uint32_t start = gpu::shuffle(
        lane_mask, cpp::countr_zero(uniform & lane_mask), reserved);
    for (;;) {
      uint64_t lane_mask = gpu::get_lane_mask();

      // Each lane tries to claim one bit in a single contiguous mask.
      uint32_t id = impl::lane_count(uniform & lane_mask, gpu::get_lane_id());
      uint32_t index = (start + id) % usable_bits(chunk_size);
      uint32_t slot = index / BITS_IN_WORD;
      uint32_t bit = index % BITS_IN_WORD;

      // Get the mask of bits destined for the same slot and coalesce it.
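      // For example, if four lanes with the same chunk size target bits 10
      // through 13 of slot 0, only the first of them satisfies the ballot, so
      // it leads all four and ORs in the mask 0b1111 << 10; each lane then
      // checks its own bit in the shuffled result below.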
      uint32_t leader = impl::get_leader_id(
          uniform & gpu::ballot(lane_mask, !id || index % BITS_IN_WORD == 0),
          gpu::get_lane_id());
      uint32_t length = cpp::popcount(uniform & lane_mask) -
                        impl::lane_count(uniform & lane_mask, leader);
      uint32_t bitmask =
          static_cast<uint32_t>(
              (uint64_t(1) << cpp::min(length, BITS_IN_WORD)) - 1)
          << bit;

      uint32_t before = 0;
      if (gpu::get_lane_id() == leader)
        before = cpp::AtomicRef(get_bitfield()[slot])
                     .fetch_or(bitmask, cpp::MemoryOrder::RELAXED);
      before = gpu::shuffle(lane_mask, leader, before);
      if (~before & (1 << bit)) {
        cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
        return ptr_from_index(index, chunk_size);
      }

      // If the previous operation found an empty bit we move there; otherwise
      // we generate a new random index to start at.
      uint32_t after = before | bitmask;
      start = gpu::shuffle(
          gpu::get_lane_mask(),
          cpp::countr_zero(uniform & gpu::get_lane_mask()),
          ~after ? __builtin_align_down(index, BITS_IN_WORD) +
                       cpp::countr_zero(~after)
                 : __builtin_align_down(impl::xorshift32(state), BITS_IN_WORD));
      sleep_briefly();
    }
  }

  // Deallocates memory by resetting its corresponding bit in the bitfield.
  void deallocate(void *ptr) {
    uint32_t chunk_size = get_chunk_size();
    uint32_t index = index_from_ptr(ptr, chunk_size);
    uint32_t slot = index / BITS_IN_WORD;
    uint32_t bit = index % BITS_IN_WORD;

    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
    cpp::AtomicRef(get_bitfield()[slot])
        .fetch_and(~(1u << bit), cpp::MemoryOrder::RELAXED);
  }

  // The actual memory the slab will manage. All offsets are calculated at
  // runtime with the chunk size to keep the interface convergent when a warp or
  // wavefront is handling multiple sizes at once.
  uint8_t memory[SLAB_SIZE];
};

// A global cache of previously allocated slabs for efficient reuse.
static FixedStack<Slab *, CACHED_SLABS> slab_cache;

/// A wait-free guard around a pointer resource to be created dynamically if
/// space is available and freed once there are no more users.
struct GuardPtr {
private:
  struct RefCounter {
    // Indicates that the object is in its deallocation phase and thus invalid.
    static constexpr uint32_t INVALID = uint32_t(1) << 31;

    // If a read preempts an unlock call we indicate this so the following
    // unlock call can swap out the helped bit and maintain exclusive ownership.
    static constexpr uint32_t HELPED = uint32_t(1) << 30;

    // Resets the reference counter. Note that it cannot be safely reset to zero.
    void reset(uint32_t n, uint32_t &count) {
      counter.store(n, cpp::MemoryOrder::RELAXED);
      count = n;
    }

    // Acquire a slot in the reference counter if it is not invalid.
    bool acquire(uint32_t n, uint32_t &count) {
      count = counter.fetch_add(n, cpp::MemoryOrder::RELAXED) + n;
      return (count & INVALID) == 0;
    }

    // Release a slot in the reference counter. This function should only be
    // called following a valid acquire call.
    bool release(uint32_t n) {
      // If this thread caused the counter to reach zero we try to invalidate it
      // and obtain exclusive rights to deconstruct it. If the CAS fails, either
      // another thread resurrected the counter and we quit, or a parallel read
      // helped us by invalidating it. For the latter, claim that flag and return.
      if (counter.fetch_sub(n, cpp::MemoryOrder::RELAXED) == n && RECLAIM) {
        uint32_t expected = 0;
        if (counter.compare_exchange_strong(expected, INVALID,
                                            cpp::MemoryOrder::RELAXED,
                                            cpp::MemoryOrder::RELAXED))
          return true;
        else if ((expected & HELPED) &&
                 (counter.exchange(INVALID, cpp::MemoryOrder::RELAXED) &
                  HELPED))
          return true;
      }
      return false;
    }

    // Returns the current reference count, potentially helping a releasing
    // thread.
    uint64_t read() {
      auto val = counter.load(cpp::MemoryOrder::RELAXED);
      if (val == 0 && RECLAIM &&
          counter.compare_exchange_strong(val, INVALID | HELPED,
                                          cpp::MemoryOrder::RELAXED))
        return 0;
      return (val & INVALID) ? 0 : val;
    }
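    // Example interleaving: the final release() drops the count to zero and
    // attempts to CAS 0 -> INVALID. If a concurrent read() already swapped in
    // INVALID | HELPED, that CAS fails, but the releaser observes the HELPED
    // flag, exchanges the value for INVALID, and still takes exclusive
    // ownership of the teardown while the reader simply reports a count of
    // zero.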

    cpp::Atomic<uint32_t> counter{0};
  };

  cpp::Atomic<Slab *> ptr;
  RefCounter ref;

  // Should be called by a single lane for each different pointer.
  template <typename... Args>
  Slab *try_lock_impl(uint32_t n, uint32_t &count, Args &&...args) {
    Slab *expected = ptr.load(cpp::MemoryOrder::RELAXED);
    if (!expected &&
        ptr.compare_exchange_strong(
            expected,
            reinterpret_cast<Slab *>(cpp::numeric_limits<uintptr_t>::max()),
            cpp::MemoryOrder::RELAXED, cpp::MemoryOrder::RELAXED)) {
      count = cpp::numeric_limits<uint32_t>::max();

      Slab *cached = nullptr;
      if (slab_cache.pop(cached))
        return cached->reset(cpp::forward<Args>(args)...);

      void *raw = impl::rpc_allocate(sizeof(Slab));
      if (!raw)
        return nullptr;
      return new (raw) Slab(cpp::forward<Args>(args)...);
    }

    // If there is a slab allocation in progress we retry a few times.
    for (uint32_t t = 0; impl::is_sentinel(expected) && t < MAX_TRIES; ++t) {
      sleep_briefly();
      expected = ptr.load(cpp::MemoryOrder::RELAXED);
    }

    if (!expected || impl::is_sentinel(expected))
      return nullptr;

    if (!ref.acquire(n, count))
      return nullptr;

    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
    return RECLAIM ? ptr.load(cpp::MemoryOrder::RELAXED) : expected;
  }

  // Finalize the associated memory and signal that it is ready to use by
  // resetting the counter.
  void finalize(Slab *mem, uint32_t n, uint32_t &count) {
    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
    ptr.store(mem, cpp::MemoryOrder::RELAXED);
    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
    if (!ref.acquire(n, count))
      ref.reset(n, count);
  }

public:
  // Attempt to lock access to the pointer, potentially creating it if empty.
  // The uniform mask represents which lanes share the same pointer. For each
  // uniform value we elect a leader to handle it on behalf of the other lanes.
  template <typename... Args>
  Slab *try_lock(uint64_t lane_mask, uint64_t uniform, uint32_t &count,
                 Args &&...args) {
    count = 0;
    Slab *result = nullptr;
    if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
      result = try_lock_impl(cpp::popcount(uniform), count,
                             cpp::forward<Args>(args)...);
    result = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), result);
    count = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), count);

    if (!result)
      return nullptr;

    // We defer storing the newly allocated slab until now so that we can use
    // multiple lanes to initialize it and release it for use.
    if (impl::is_sentinel(count)) {
      result->initialize(uniform);
      if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
        finalize(result, cpp::popcount(uniform), count);
      count =
          gpu::shuffle(gpu::get_lane_mask(), cpp::countr_zero(uniform), count);
    }

    if (!impl::is_sentinel(count))
      count = count - cpp::popcount(uniform) +
              impl::lane_count(uniform, gpu::get_lane_id());

    return result;
  }

  // Release the associated lock on the pointer, potentially destroying it.
  void unlock(uint64_t lane_mask, uint64_t mask) {
    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
    if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(mask)) &&
        ref.release(cpp::popcount(mask))) {
      Slab *p = ptr.load(cpp::MemoryOrder::RELAXED);
      if (!slab_cache.push(p)) {
        p->~Slab();
        impl::rpc_free(p);
      }
      cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
      ptr.store(nullptr, cpp::MemoryOrder::RELAXED);
    }
    gpu::sync_lane(lane_mask);
  }

  // Get the current value of the reference counter.
  uint64_t use_count() { return ref.read(); }
};

// The global array used to search for a valid slab to allocate from.
static GuardPtr slots[ARRAY_SIZE] = {};

// Keep a cache of the last successful slot for each chunk size. Initialize it
// to the biased spread computed by get_start_index. Must be updated if the
// chunking scheme changes.
#define S(X) (impl::get_start_index(X))
static cpp::Atomic<uint32_t> indices[] = {
    S(16),     S(32),     S(48),     S(64),     S(96),     S(112),    S(128),
    S(192),    S(224),    S(256),    S(384),    S(448),    S(512),    S(768),
    S(896),    S(1024),   S(1536),   S(1792),   S(2048),   S(3072),   S(3584),
    S(4096),   S(6144),   S(7168),   S(8192),   S(12288),  S(14336),  S(16384),
    S(24576),  S(28672),  S(32768),  S(49152),  S(57344),  S(65536),  S(98304),
    S(114688), S(131072), S(196608), S(229376), S(262144), S(393216), S(458752),
    S(524288), S(786432), S(917504), S(1048576)};
#undef S

// Tries to find a slab in the table that can support the given chunk size.
static Slab *find_slab(uint32_t chunk_size, uint64_t &uniform,
                       uint32_t &reserved) {
  // We start at the index of the last successful allocation for this chunk size.
  uint32_t chunk_id = impl::get_chunk_id(chunk_size);
  uint32_t start = indices[chunk_id].load(cpp::MemoryOrder::RELAXED);

  for (uint32_t offset = 0; offset <= ARRAY_SIZE; ++offset) {
    uint32_t index =
        !offset ? start
                : (impl::get_start_index(chunk_size) + offset - 1) % ARRAY_SIZE;

    if (!offset ||
        slots[index].use_count() < Slab::available_chunks(chunk_size)) {
      uint64_t lane_mask = gpu::get_lane_mask();

      Slab *slab = slots[index].try_lock(lane_mask, uniform & lane_mask,
                                         reserved, chunk_size, index);

      // If we find a slab with a matching chunk size then we store the result.
      // Otherwise, we need to free the claimed lock and continue. In the case
      // of out-of-memory we receive a sentinel value and return a failure.
      if (slab && reserved < Slab::available_chunks(chunk_size) &&
          slab->get_chunk_size() == chunk_size) {
        if (index != start)
          indices[chunk_id].store(index, cpp::MemoryOrder::RELAXED);
        uniform = uniform & gpu::get_lane_mask();
        return slab;
      } else if (slab && (reserved >= Slab::available_chunks(chunk_size) ||
                          slab->get_chunk_size() != chunk_size)) {
        slots[index].unlock(gpu::get_lane_mask(),
                            gpu::get_lane_mask() & uniform);
      } else if (!slab && impl::is_sentinel(reserved)) {
        uniform = uniform & gpu::get_lane_mask();
        return nullptr;
      } else {
        sleep_briefly();
      }
    }
  }
  return nullptr;
}

// Release the lock associated with a given slab.
static void release_slab(Slab *slab) {
  uint32_t index = slab->get_global_index();
  uint64_t lane_mask = gpu::get_lane_mask();
  uint64_t uniform = gpu::match_any(lane_mask, index);
  slots[index].unlock(lane_mask, uniform);
}

namespace gpu {

void *allocate(uint64_t size) {
  if (!size)
    return nullptr;

  // Allocations of half a slab or more go directly to the system allocator.
  if (size >= SLAB_SIZE / 2)
    return impl::rpc_allocate(impl::round_up<SLAB_SIZE>(size));

  // Try to find a slab for the rounded up chunk size and allocate from it.
  uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size));
  uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
  uint32_t reserved = 0;
  Slab *slab = find_slab(chunk_size, uniform, reserved);
  if (!slab)
    return nullptr;

  void *ptr = slab->allocate(uniform, reserved);
  return ptr;
}

void deallocate(void *ptr) {
  if (!ptr)
    return;

  // All non-slab allocations will be aligned on a 2MiB boundary.
  if (__builtin_is_aligned(ptr, SLAB_ALIGNMENT + 1))
    return impl::rpc_free(ptr);

  // The original slab pointer is found by rounding the given pointer down to
  // its 2MiB boundary.
  Slab *slab = cpp::launder(reinterpret_cast<Slab *>(
      (reinterpret_cast<uintptr_t>(ptr) & ~SLAB_ALIGNMENT)));
  slab->deallocate(ptr);
  release_slab(slab);
}

void *reallocate(void *ptr, uint64_t size) {
  if (ptr == nullptr)
    return gpu::allocate(size);

  // Non-slab allocations are considered foreign pointers, so we fail.
  if (__builtin_is_aligned(ptr, SLAB_ALIGNMENT + 1))
    return nullptr;

  // The original slab pointer is found by rounding the given pointer down to
  // its 2MiB boundary.
  Slab *slab = cpp::launder(reinterpret_cast<Slab *>(
      (reinterpret_cast<uintptr_t>(ptr) & ~SLAB_ALIGNMENT)));
  if (slab->get_chunk_size() >= size)
    return ptr;

  // If we need a new chunk we reallocate and copy it over.
  void *new_ptr = gpu::allocate(size);
  inline_memcpy(new_ptr, ptr, slab->get_chunk_size());
  gpu::deallocate(ptr);
  return new_ptr;
}

void *aligned_allocate(uint32_t alignment, uint64_t size) {
  // All alignment values must be a non-zero power of two.
  if (!impl::is_pow2(alignment))
    return nullptr;

  // If the requested alignment is less than what we already provide this is
  // just a normal allocation.
  if (alignment <= MIN_ALIGNMENT + 1)
    return gpu::allocate(size);

  // We can't handle alignments greater than 2MiB so we simply fail.
  if (alignment > SLAB_ALIGNMENT + 1)
    return nullptr;

  // Trying to handle the alignment internally would break the assumption that
  // all chunks are identical to each other. Instead, allocate enough memory for
  // the worst-case alignment and round the returned pointer up; the index logic
  // in deallocate will round back down to the owning chunk.
  uint64_t rounded = size + alignment - MIN_ALIGNMENT;
  void *ptr = gpu::allocate(rounded);
  return __builtin_align_up(ptr, alignment);
}
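
// An illustrative sequence of calls into the interface above. This is only a
// sketch; it assumes a device-side caller with the RPC server already
// servicing LIBC_MALLOC and LIBC_FREE on the host.
//
//   void *scratch = gpu::allocate(256);            // served from a 256B chunk
//   scratch = gpu::reallocate(scratch, 512);       // moves to a 512B chunk
//   void *aligned = gpu::aligned_allocate(128, 64);
//   gpu::deallocate(aligned);
//   gpu::deallocate(scratch);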

} // namespace gpu
} // namespace LIBC_NAMESPACE_DECL