//===-- Implementation header for qsort utilities ---------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #ifndef LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H #define LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H #include "hdr/stdint_proxy.h" #include "src/__support/CPP/bit.h" #include "src/__support/CPP/cstddef.h" #include "src/__support/macros/config.h" #include "src/stdlib/qsort_pivot.h" namespace LIBC_NAMESPACE_DECL { namespace internal { // Branchless Lomuto partition based on the implementation by Lukas // Bergdoll and Orson Peters // https://github.com/Voultapher/sort-research-rs/blob/main/writeup/lomcyc_partition/text.md. // Simplified to avoid having to stack allocate. template LIBC_INLINE size_t partition_lomuto_branchless(const A &array, const void *pivot, const F &is_less) { const size_t array_len = array.len(); size_t left = 0; size_t right = 0; while (right < array_len) { const bool right_is_lt = is_less(array.get(right), pivot); array.swap(left, right); left += static_cast(right_is_lt); right += 1; } return left; } // Optimized for large types that are expensive to move. Not optimized // for integers. It's possible to use a cyclic permutation here for // large types as done in ipnsort but the advantages of this are limited // as `is_less` is a small wrapper around a call to a function pointer // and won't incur much binary-size overhead. The other reason to use // cyclic permutation is to have more efficient swapping, but we don't // know the element size so this isn't applicable here either. template LIBC_INLINE size_t partition_hoare_branchy(const A &array, const void *pivot, const F &is_less) { const size_t array_len = array.len(); size_t left = 0; size_t right = array_len; while (true) { while (left < right && is_less(array.get(left), pivot)) ++left; while (true) { --right; if (left >= right || is_less(array.get(right), pivot)) { break; } } if (left >= right) break; array.swap(left, right); ++left; } return left; } template LIBC_INLINE size_t partition(const A &array, size_t pivot_index, const F &is_less) { // Place the pivot at the beginning of the array. if (pivot_index != 0) { array.swap(0, pivot_index); } const A array_without_pivot = array.make_array(1, array.len() - 1); const void *pivot = array.get(0); size_t num_lt; if constexpr (A::has_fixed_size()) { // Branchless Lomuto avoid branch misprediction penalties, but // it also swaps more often which is only faster if the swap is a fast // constant operation. num_lt = partition_lomuto_branchless(array_without_pivot, pivot, is_less); } else { num_lt = partition_hoare_branchy(array_without_pivot, pivot, is_less); } // Place the pivot between the two partitions. array.swap(0, num_lt); return num_lt; } template LIBC_INLINE void quick_sort_impl(A &array, const void *ancestor_pivot, size_t limit, const F &is_less) { while (true) { const size_t array_len = array.len(); if (array_len <= 1) return; // If too many bad pivot choices were made, simply fall back to // heapsort in order to guarantee `O(N x log(N))` worst-case. if (limit == 0) { heap_sort(array, is_less); return; } limit -= 1; const size_t pivot_index = choose_pivot(array, is_less); // If the chosen pivot is equal to the predecessor, then it's the smallest // element in the slice. Partition the slice into elements equal to and // elements greater than the pivot. This case is usually hit when the slice // contains many duplicate elements. if (ancestor_pivot) { if (!is_less(ancestor_pivot, array.get(pivot_index))) { const size_t num_lt = partition(array, pivot_index, [is_less](const void *a, const void *b) -> bool { return !is_less(b, a); }); // Continue sorting elements greater than the pivot. We know that // `num_lt` cont array.reset_bounds(num_lt + 1, array.len() - (num_lt + 1)); ancestor_pivot = nullptr; continue; } } size_t split_index = partition(array, pivot_index, is_less); if (array_len == 2) // The partition operation sorts the two element array. return; // Split the array into `left`, `pivot`, and `right`. A left = array.make_array(0, split_index); const void *pivot = array.get(split_index); const size_t right_start = split_index + 1; A right = array.make_array(right_start, array.len() - right_start); // Recurse into the left side. We have a fixed recursion limit, // testing shows no real benefit for recursing into the shorter // side. quick_sort_impl(left, ancestor_pivot, limit, is_less); // Continue with the right side. array = right; ancestor_pivot = pivot; } } constexpr size_t ilog2(size_t n) { return static_cast(cpp::bit_width(n)) - 1; } template LIBC_INLINE void quick_sort(A &array, const F &is_less) { const void *ancestor_pivot = nullptr; // Limit the number of imbalanced partitions to `2 * floor(log2(len))`. // The binary OR by one is used to eliminate the zero-check in the logarithm. const size_t limit = 2 * ilog2((array.len() | 1)); quick_sort_impl(array, ancestor_pivot, limit, is_less); } } // namespace internal } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H