7 files changed, 174 insertions, 103 deletions
diff --git a/libstdc++-v3/include/std/barrier b/libstdc++-v3/include/std/barrier
index 6c3cfd4..56270c9 100644
--- a/libstdc++-v3/include/std/barrier
+++ b/libstdc++-v3/include/std/barrier
@@ -81,105 +81,142 @@ It looks different from literature pseudocode for two main reasons:
 
   enum class __barrier_phase_t : unsigned char { };
 
-  template<typename _CompletionF>
-    class __tree_barrier
+  struct __tree_barrier_base
+  {
+    static constexpr ptrdiff_t
+    max() noexcept
+    { return __PTRDIFF_MAX__ - 1; }
+
+  protected:
+    using __atomic_phase_ref_t = std::__atomic_ref<__barrier_phase_t>;
+    using __atomic_phase_const_ref_t = std::__atomic_ref<const __barrier_phase_t>;
+    static constexpr auto __phase_alignment =
+		    __atomic_phase_ref_t::required_alignment;
+
+    using __tickets_t = std::array<__barrier_phase_t, 64>;
+    struct alignas(64) /* naturally-align the heap state */ __state_t
     {
-      using __atomic_phase_ref_t = std::__atomic_ref<__barrier_phase_t>;
-      using __atomic_phase_const_ref_t = std::__atomic_ref<const __barrier_phase_t>;
-      static constexpr auto __phase_alignment =
-		      __atomic_phase_ref_t::required_alignment;
+      alignas(__phase_alignment) __tickets_t __tickets;
+    };
 
-      using __tickets_t = std::array<__barrier_phase_t, 64>;
-      struct alignas(64) /* naturally-align the heap state */ __state_t
-      {
-	alignas(__phase_alignment) __tickets_t __tickets;
-      };
+    ptrdiff_t _M_expected;
+    __atomic_base<__state_t*> _M_state{nullptr};
+    __atomic_base<ptrdiff_t> _M_expected_adjustment{0};
+    alignas(__phase_alignment) __barrier_phase_t  _M_phase{};
 
-      ptrdiff_t _M_expected;
-      unique_ptr<__state_t[]> _M_state;
-      __atomic_base<ptrdiff_t> _M_expected_adjustment;
-      _CompletionF _M_completion;
+    explicit constexpr
+    __tree_barrier_base(ptrdiff_t __expected)
+    : _M_expected(__expected)
+    {
+      __glibcxx_assert(__expected >= 0 && __expected <= max());
 
-      alignas(__phase_alignment) __barrier_phase_t  _M_phase;
+      if (!std::is_constant_evaluated())
+	_M_state.store(_M_alloc_state().release(), memory_order_release);
+    }
 
-      bool
-      _M_arrive(__barrier_phase_t __old_phase, size_t __current)
-      {
-	const auto __old_phase_val = static_cast<unsigned char>(__old_phase);
-	const auto __half_step =
-			   static_cast<__barrier_phase_t>(__old_phase_val + 1);
-	const auto __full_step =
-			   static_cast<__barrier_phase_t>(__old_phase_val + 2);
+    unique_ptr<__state_t[]>
+    _M_alloc_state()
+    {
+      size_t const __count = (_M_expected + 1) >> 1;
+      return std::make_unique<__state_t[]>(__count);
+    }
 
-	size_t __current_expected = _M_expected;
-	__current %= ((_M_expected + 1) >> 1);
+    bool
+    _M_arrive(__barrier_phase_t __old_phase, size_t __current)
+    {
+      const auto __old_phase_val = static_cast<unsigned char>(__old_phase);
+      const auto __half_step =
+			 static_cast<__barrier_phase_t>(__old_phase_val + 1);
+      const auto __full_step =
+			 static_cast<__barrier_phase_t>(__old_phase_val + 2);
+
+      size_t __current_expected = _M_expected;
+      __current %= ((_M_expected + 1) >> 1);
+
+      __state_t* const __state = _M_state.load(memory_order_relaxed);
+
+      for (int __round = 0; ; ++__round)
+	{
+	  if (__current_expected <= 1)
+	      return true;
+	  size_t const __end_node = ((__current_expected + 1) >> 1),
+		       __last_node = __end_node - 1;
+	  for ( ; ; ++__current)
+	    {
+	      if (__current == __end_node)
+		__current = 0;
+	      auto __expect = __old_phase;
+	      __atomic_phase_ref_t __phase(__state[__current]
+					      .__tickets[__round]);
+	      if (__current == __last_node && (__current_expected & 1))
+		{
+		  if (__phase.compare_exchange_strong(__expect, __full_step,
+						      memory_order_acq_rel))
+		    break;     // I'm 1 in 1, go to next __round
+		}
+	      else if (__phase.compare_exchange_strong(__expect, __half_step,
+						       memory_order_acq_rel))
+		{
+		  return false; // I'm 1 in 2, done with arrival
+		}
+	      else if (__expect == __half_step)
+		{
+		  if (__phase.compare_exchange_strong(__expect, __full_step,
+						      memory_order_acq_rel))
+		    break;    // I'm 2 in 2, go to next __round
+		}
+	    }
+	  __current_expected = __last_node + 1;
+	  __current >>= 1;
+	}
+    }
+  };
 
-	for (int __round = 0; ; ++__round)
-	  {
-	    if (__current_expected <= 1)
-		return true;
-	    size_t const __end_node = ((__current_expected + 1) >> 1),
-			 __last_node = __end_node - 1;
-	    for ( ; ; ++__current)
-	      {
-		if (__current == __end_node)
-		  __current = 0;
-		auto __expect = __old_phase;
-		__atomic_phase_ref_t __phase(_M_state[__current]
-						.__tickets[__round]);
-		if (__current == __last_node && (__current_expected & 1))
-		  {
-		    if (__phase.compare_exchange_strong(__expect, __full_step,
-						        memory_order_acq_rel))
-		      break;     // I'm 1 in 1, go to next __round
-		  }
-		else if (__phase.compare_exchange_strong(__expect, __half_step,
-						         memory_order_acq_rel))
-		  {
-		    return false; // I'm 1 in 2, done with arrival
-		  }
-		else if (__expect == __half_step)
-		  {
-		    if (__phase.compare_exchange_strong(__expect, __full_step,
-						        memory_order_acq_rel))
-		      break;    // I'm 2 in 2, go to next __round
-		  }
-	      }
-	    __current_expected = __last_node + 1;
-	    __current >>= 1;
-	  }
-      }
+  template<typename _CompletionF>
+    class __tree_barrier : public __tree_barrier_base
+    {
+      [[no_unique_address]] _CompletionF _M_completion;
+
+      // _GLIBCXX_RESOLVE_LIB_DEFECTS
+      // 3898. Possibly unintended preconditions for completion functions
+      void _M_invoke_completion() noexcept { _M_completion(); }
 
     public:
       using arrival_token = __barrier_phase_t;
 
-      static constexpr ptrdiff_t
-      max() noexcept
-      { return __PTRDIFF_MAX__; }
-
+      constexpr
       __tree_barrier(ptrdiff_t __expected, _CompletionF __completion)
-	  : _M_expected(__expected), _M_expected_adjustment(0),
-	    _M_completion(move(__completion)),
-	    _M_phase(static_cast<__barrier_phase_t>(0))
-      {
-	size_t const __count = (_M_expected + 1) >> 1;
-
-	_M_state = std::make_unique<__state_t[]>(__count);
-      }
+      : __tree_barrier_base(__expected), _M_completion(std::move(__completion))
+      { }
 
       [[nodiscard]] arrival_token
       arrive(ptrdiff_t __update)
       {
+	__glibcxx_assert(__update > 0);
+	// FIXME: Check that update is less than or equal to the expected count
+	// for the current barrier phase.
+
 	std::hash<std::thread::id> __hasher;
 	size_t __current = __hasher(std::this_thread::get_id());
 	__atomic_phase_ref_t __phase(_M_phase);
 	const auto __old_phase = __phase.load(memory_order_relaxed);
 	const auto __cur = static_cast<unsigned char>(__old_phase);
-	for(; __update; --__update)
+
+	if (__cur == 0 && !_M_state.load(memory_order_relaxed)) [[unlikely]]
+	  {
+	    auto __p = _M_alloc_state();
+	    __state_t* __val = nullptr;
+	    if (_M_state.compare_exchange_strong(__val, __p.get(),
+						 memory_order_seq_cst,
+						 memory_order_acquire))
+	      __p.release();
+	  }
+
+	for (; __update; --__update)
 	  {
-	    if(_M_arrive(__old_phase, __current))
+	    if (_M_arrive(__old_phase, __current))
 	      {
-		_M_completion();
+		_M_invoke_completion();
 		_M_expected += _M_expected_adjustment.load(memory_order_relaxed);
 		_M_expected_adjustment.store(0, memory_order_relaxed);
 		auto __new_phase = static_cast<__barrier_phase_t>(__cur + 2);
@@ -194,11 +231,7 @@ It looks different from literature pseudocode for two main reasons:
       wait(arrival_token&& __old_phase) const
       {
 	__atomic_phase_const_ref_t __phase(_M_phase);
-	auto const __test_fn = [=]
-	  {
-	    return __phase.load(memory_order_acquire) != __old_phase;
-	  };
-	std::__atomic_wait_address(&_M_phase, __test_fn);
+	__phase.wait(__old_phase, memory_order_acquire);
       }
 
       void
@@ -212,6 +245,8 @@ It looks different from literature pseudocode for two main reasons:
   template<typename _CompletionF = __empty_completion>
     class barrier
     {
+      static_assert(is_invocable_v<_CompletionF&>);
+
       // Note, we may introduce a "central" barrier algorithm at some point
       // for more space constrained targets
       using __algorithm_t = __tree_barrier<_CompletionF>;
@@ -236,7 +271,7 @@ It looks different from literature pseudocode for two main reasons:
       max() noexcept
       { return __algorithm_t::max(); }
 
-      explicit
+      constexpr explicit
       barrier(ptrdiff_t __count, _CompletionF __completion = _CompletionF())
       : _M_b(__count, std::move(__completion))
       { }
diff --git a/libstdc++-v3/include/std/flat_map b/libstdc++-v3/include/std/flat_map
index 6593988..4bd4963 100644
--- a/libstdc++-v3/include/std/flat_map
+++ b/libstdc++-v3/include/std/flat_map
@@ -873,7 +873,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       [[nodiscard]]
       friend bool
       operator==(const _Derived& __x, const _Derived& __y)
-      { return std::equal(__x.begin(), __x.end(), __y.begin(), __y.end()); }
+      {
+	return __x._M_cont.keys == __y._M_cont.keys
+	  && __x._M_cont.values == __y._M_cont.values;
+      }
 
       template<typename _Up = value_type>
 	[[nodiscard]]
@@ -895,7 +898,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	{
 	  auto __guard = _M_make_clear_guard();
 	  auto __zv = views::zip(_M_cont.keys, _M_cont.values);
-	  auto __sr = ranges::remove_if(__zv, __pred);
+	  auto __sr = ranges::remove_if(__zv, __pred,
+					[](const auto& __e) {
+					  return const_reference(__e);
+					});
 	  auto __erased = __sr.size();
 	  erase(end() - __erased, end());
 	  __guard._M_disable();
diff --git a/libstdc++-v3/include/std/functional b/libstdc++-v3/include/std/functional
index 9a55b18..307bcb9 100644
--- a/libstdc++-v3/include/std/functional
+++ b/libstdc++-v3/include/std/functional
@@ -57,6 +57,7 @@
 #define __glibcxx_want_bind_back
 #define __glibcxx_want_constexpr_functional
 #define __glibcxx_want_copyable_function
+#define __glibcxx_want_function_ref
 #define __glibcxx_want_invoke
 #define __glibcxx_want_invoke_r
 #define __glibcxx_want_move_only_function
@@ -86,7 +87,7 @@
 # include <bits/ranges_cmp.h> // std::identity, ranges::equal_to etc.
 # include <compare>
 #endif
-#if defined(__glibcxx_move_only_function) || defined(__glibcxx_copyable_function)
+#if __glibcxx_move_only_function || __glibcxx_copyable_function || __glibcxx_function_ref
 # include <bits/funcwrap.h>
 #endif
 
diff --git a/libstdc++-v3/include/std/latch b/libstdc++-v3/include/std/latch
index dc147c2..9504df0 100644
--- a/libstdc++-v3/include/std/latch
+++ b/libstdc++-v3/include/std/latch
@@ -89,15 +89,33 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
     _GLIBCXX_ALWAYS_INLINE void
     wait() const noexcept
     {
-      auto const __pred = [this] { return this->try_wait(); };
-      std::__atomic_wait_address(&_M_counter, __pred);
+      auto const __vfn = [this] {
+	return __atomic_impl::load(&_M_counter, memory_order::acquire);
+      };
+      auto const __pred = [](__detail::__platform_wait_t __v) {
+	return __v == 0;
+      };
+      std::__atomic_wait_address(&_M_counter, __pred, __vfn);
     }
 
     _GLIBCXX_ALWAYS_INLINE void
     arrive_and_wait(ptrdiff_t __update = 1) noexcept
     {
-      count_down(__update);
-      wait();
+      // The standard specifies this functions as count_down(update); wait();
+      // but we combine those two calls into one and avoid the wait() if we
+      // know the counter reached zero.
+
+      __glibcxx_assert(__update >= 0 && __update <= max());
+      // Use acq_rel here because an omitted wait() would have used acquire:
+      auto const __old = __atomic_impl::fetch_sub(&_M_counter, __update,
+						  memory_order::acq_rel);
+      if (std::cmp_equal(__old, __update))
+	__atomic_impl::notify_all(&_M_counter);
+      else
+	{
+	  __glibcxx_assert(std::cmp_less(__update, __old));
+	  wait();
+	}
     }
 
   private:
diff --git a/libstdc++-v3/include/std/mdspan b/libstdc++-v3/include/std/mdspan
index 47cfa40..bcf2fa6 100644
--- a/libstdc++-v3/include/std/mdspan
+++ b/libstdc++-v3/include/std/mdspan
@@ -146,7 +146,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
       private:
 	using _S_storage = __array_traits<_IndexType, _S_rank_dynamic>::_Type;
-	[[no_unique_address]] _S_storage _M_dynamic_extents;
+	[[no_unique_address]] _S_storage _M_dynamic_extents{};
       };
 
     template<typename _OIndexType, typename _SIndexType>
diff --git a/libstdc++-v3/include/std/memory b/libstdc++-v3/include/std/memory
index 78a1250..d64e65c 100644
--- a/libstdc++-v3/include/std/memory
+++ b/libstdc++-v3/include/std/memory
@@ -97,6 +97,10 @@
 #  include <bits/out_ptr.h>
 #endif
 
+#if __cplusplus > 202302L
+#  include <bits/indirect.h>
+#endif
+
 #define __glibcxx_want_addressof_constexpr
 #define __glibcxx_want_allocator_traits_is_always_equal
 #define __glibcxx_want_assume_aligned
@@ -105,6 +109,7 @@
 #define __glibcxx_want_constexpr_dynamic_alloc
 #define __glibcxx_want_constexpr_memory
 #define __glibcxx_want_enable_shared_from_this
+#define __glibcxx_want_indirect
 #define __glibcxx_want_make_unique
 #define __glibcxx_want_out_ptr
 #define __glibcxx_want_parallel_algorithm
diff --git a/libstdc++-v3/include/std/semaphore b/libstdc++-v3/include/std/semaphore
index bec5ac3..ca1bffe 100644
--- a/libstdc++-v3/include/std/semaphore
+++ b/libstdc++-v3/include/std/semaphore
@@ -35,29 +35,29 @@
 
 #include <bits/requires_hosted.h> // concurrency
 
-#if __cplusplus > 201703L
-#include <bits/semaphore_base.h>
-
 #define __glibcxx_want_semaphore
 #include <bits/version.h>
 
-#ifdef __cpp_lib_semaphore // C++ >= 20 && hosted && (atomic_wait || posix_sem)
+#ifdef __cpp_lib_semaphore // C++ >= 20 && hosted && atomic_wait
+#include <bits/semaphore_base.h>
+
 namespace std _GLIBCXX_VISIBILITY(default)
 {
 _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
-  template<ptrdiff_t __least_max_value = __semaphore_impl::_S_max>
+  template<ptrdiff_t __least_max_value = __semaphore_base<true>::_S_max>
     class counting_semaphore
     {
       static_assert(__least_max_value >= 0);
-      static_assert(__least_max_value <= __semaphore_impl::_S_max);
 
-      __semaphore_impl _M_sem;
+      using _Impl = __semaphore_impl<__least_max_value>;
+      _Impl _M_sem;
 
     public:
-      explicit counting_semaphore(ptrdiff_t __desired) noexcept
-	: _M_sem(__desired)
-      { }
+      constexpr explicit
+      counting_semaphore(ptrdiff_t __desired) noexcept
+      : _M_sem(__desired)
+      { __glibcxx_assert(__desired >= 0 && __desired <= max()); }
 
       ~counting_semaphore() = default;
 
@@ -69,8 +69,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       { return __least_max_value; }
 
       void
-      release(ptrdiff_t __update = 1) noexcept(noexcept(_M_sem._M_release(1)))
-      { _M_sem._M_release(__update); }
+      release(ptrdiff_t __update = 1) noexcept
+      {
+	[[maybe_unused]] ptrdiff_t __old = _M_sem._M_release(__update);
+	__glibcxx_assert(__update >= 0 && __update <= max() - __old);
+      }
 
       void
       acquire() noexcept(noexcept(_M_sem._M_acquire()))
@@ -91,10 +94,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	{ return _M_sem._M_try_acquire_until(__atime); }
     };
 
+  /** @brief A binary semaphore
+   *
+   * @since C++20
+   */
   using binary_semaphore = std::counting_semaphore<1>;
 
 _GLIBCXX_END_NAMESPACE_VERSION
 } // namespace
-#endif // cpp_lib_atomic_wait || _GLIBCXX_HAVE_POSIX_SEMAPHORE
 #endif // __cpp_lib_semaphore
 #endif // _GLIBCXX_SEMAPHORE