Diffstat (limited to 'misc')
-rw-r--r--  misc/Makefile              |   3
-rw-r--r--  misc/atomic_wide_counter.c | 127
2 files changed, 129 insertions(+), 1 deletion(-)
diff --git a/misc/Makefile b/misc/Makefile
index 1083ba3..3b66cb9 100644
--- a/misc/Makefile
+++ b/misc/Makefile
@@ -73,7 +73,8 @@ routines := brk sbrk sstk ioctl \
   fgetxattr flistxattr fremovexattr fsetxattr getxattr \
   listxattr lgetxattr llistxattr lremovexattr lsetxattr \
   removexattr setxattr getauxval ifunc-impl-list makedev \
-  allocate_once fd_to_filename single_threaded unwind-link
+  allocate_once fd_to_filename single_threaded unwind-link \
+  atomic_wide_counter
 
 generated += tst-error1.mtrace tst-error1-mem.out \
   tst-allocate_once.mtrace tst-allocate_once-mem.out
diff --git a/misc/atomic_wide_counter.c b/misc/atomic_wide_counter.c
new file mode 100644
index 0000000..56d8981
--- /dev/null
+++ b/misc/atomic_wide_counter.c
@@ -0,0 +1,127 @@
+/* Monotonically increasing wide counters (at least 62 bits).
+   Copyright (C) 2016-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <atomic_wide_counter.h>
+
+#if !__HAVE_64B_ATOMICS
+
+/* Values we add or xor are less than or equal to 1<<31, so we only
+   have to make overflow-and-addition atomic wrt. concurrent load
+   operations and xor operations.  To do that, we split each counter
+   into two 32b values of which we reserve the MSB of each to
+   represent an overflow from the lower-order half to the higher-order
+   half.
+
+   In the common case, the state is (higher-order / lower-order half, and . is
+   basically concatenation of the bits):
+   0.h     / 0.l  = h.l
+
+   When we add a value of x that overflows (i.e., 0.l + x == 1.L), we run the
+   following steps S1-S4 (the values these represent are on the right-hand
+   side):
+   S1:  0.h     / 1.L == (h+1).L
+   S2:  1.(h+1) / 1.L == (h+1).L
+   S3:  1.(h+1) / 0.L == (h+1).L
+   S4:  0.(h+1) / 0.L == (h+1).L
+   If the MSB of the higher-order half is set, readers will ignore the
+   overflow bit in the lower-order half.
+
+   To get an atomic snapshot in load operations, we exploit that the
+   higher-order half is monotonically increasing; if we load a value V from
+   it, then read the lower-order half, and then read the higher-order half
+   again and see the same value V, we know that both halves have existed in
+   the sequence of values the full counter had.  This is similar to the
+   validated reads in the time-based STMs in GCC's libitm (e.g.,
+   method_ml_wt).
+
+   One benefit of this scheme is that it makes load operations
+   obstruction-free: unlike with a plain lock on the counter, readers can
+   almost always interpret a snapshot of the two halves.  Readers can be
+   forced to read a new snapshot when the read is concurrent with an overflow.
+   However, overflows will happen infrequently, so load operations are
+   practically lock-free.  */
+
+uint64_t
+__atomic_wide_counter_fetch_add_relaxed (__atomic_wide_counter *c,
+                                         unsigned int op)
+{
+  /* S1.  Note that this is an atomic read-modify-write, so it extends the
+     release sequence of the release MO store at S3.  */
+  unsigned int l = atomic_fetch_add_relaxed (&c->__value32.__low, op);
+  unsigned int h = atomic_load_relaxed (&c->__value32.__high);
+  uint64_t result = ((uint64_t) h << 31) | l;
+  l += op;
+  if ((l >> 31) > 0)
+    {
+      /* Overflow.  Need to increment higher-order half.  Note that all
+         add operations are ordered in happens-before.  */
+      h++;
+      /* S2.  Release MO to synchronize with the loads of the higher-order
+         half in the load operation.  See __condvar_load_64_relaxed.  */
+      atomic_store_release (&c->__value32.__high,
+                            h | ((unsigned int) 1 << 31));
+      l ^= (unsigned int) 1 << 31;
+      /* S3.  See __condvar_load_64_relaxed.  */
+      atomic_store_release (&c->__value32.__low, l);
+      /* S4.  Likewise.  */
+      atomic_store_release (&c->__value32.__high, h);
+    }
+  return result;
+}
+
+uint64_t
+__atomic_wide_counter_load_relaxed (__atomic_wide_counter *c)
+{
+  unsigned int h, l, h2;
+  do
+    {
+      /* This load and the second one below of the same location read from
+         the stores in the overflow handling of the add operation or from the
+         initializing stores (which is a simple special case because
+         initialization always completely happens before further use).
+         Because no two stores to the higher-order half write the same value,
+         the loop ensures that if we continue to use the snapshot, this load
+         and the second one read from the same store operation.  All candidate
+         store operations have release MO.
+         If we read from S2 in the first load, then we will see the value of
+         S1 on the next load (because we synchronize with S2), or a value
+         later in modification order.  We correctly ignore the lower-half's
+         overflow bit in this case.  If we read from S4, then we will see the
+         value of S3 in the next load (or a later value), which does not have
+         the overflow bit set anymore.  */
+      h = atomic_load_acquire (&c->__value32.__high);
+      /* This will read from the release sequence of S3 (i.e., either the S3
+         store or the read-modify-writes at S1 following S3 in modification
+         order).  Thus, the read synchronizes with S3, and the following load
+         of the higher-order half will read from the matching S2 (or a later
+         value).
+         Thus, if we read a lower-half value here that already overflowed and
+         belongs to an increased higher-order half value, we will see the
+         latter, and h and h2 will not be equal.  */
+      l = atomic_load_acquire (&c->__value32.__low);
+      /* See above.  */
+      h2 = atomic_load_relaxed (&c->__value32.__high);
+    }
+  while (h != h2);
+  if (((l >> 31) > 0) && ((h >> 31) > 0))
+    l ^= (unsigned int) 1 << 31;
+  return ((uint64_t) (h & ~((unsigned int) 1 << 31)) << 31) + l;
+}
+
+#endif /* !__HAVE_64B_ATOMICS */
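
As a worked illustration of the S1-S4 table in the comment above, the short standalone program below applies the reader-side rule from __atomic_wide_counter_load_relaxed to each intermediate state of one overflowing add. The decode helper, the state table, and the concrete values (h = 5, an add of 3 that carries out of the low half) are illustrative choices, not part of the patch; the point is that every intermediate state decodes to either the old counter value or the new one, so a reader that observes the protocol mid-flight still sees a consistent 62-bit value.

#include <stdint.h>
#include <stdio.h>

/* Reader-side reconstruction, mirroring the tail of
   __atomic_wide_counter_load_relaxed: drop the low half's overflow bit
   when the high half's MSB says the carry was already counted, then
   concatenate 31 payload bits from each half.  */
static uint64_t
decode (unsigned int high, unsigned int low)
{
  if ((low >> 31) > 0 && (high >> 31) > 0)
    low ^= 1u << 31;
  return ((uint64_t) (high & ~(1u << 31)) << 31) + low;
}

int
main (void)
{
  /* One overflowing add: h = 5, old low = 0x7ffffffe, op = 3, so the new
     low spills into bit 31 and must carry into the high half.  */
  unsigned int h = 5;
  unsigned int L = (0x7ffffffeu + 3u) & ~(1u << 31);  /* low payload after the carry */

  struct { const char *name; unsigned int high, low; } states[] =
    {
      { "before", h,                    0x7ffffffeu    },
      { "S1",     h,                    L | (1u << 31) },  /* low overflowed, bit 31 set */
      { "S2",     (h + 1) | (1u << 31), L | (1u << 31) },  /* high bumped, MSB flags it */
      { "S3",     (h + 1) | (1u << 31), L              },  /* low's overflow bit cleared */
      { "S4",     h + 1,                L              },  /* steady state again */
    };

  for (size_t i = 0; i < sizeof states / sizeof states[0]; i++)
    printf ("%-6s -> %llu\n", states[i].name,
            (unsigned long long) decode (states[i].high, states[i].low));
  return 0;
}

Every line after "before" should print the same number (the old value plus 3), matching the "(h+1).L" column of the S1-S4 table.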
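
The whole protocol can also be modeled outside glibc with C11 <stdatomic.h>. The sketch below is an assumption-laden rewrite, not the library code: the wide_counter struct and the wide_counter_fetch_add/wide_counter_load names are made up, glibc's internal atomic_* macros are replaced by the corresponding C11 generic functions with the same memory orders, and main is only a single-threaded sanity check that repeatedly overflows the low half; it does not exercise the concurrent interleavings the comments reason about.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Split 62-bit counter: 31 payload bits per half, bit 31 of each half
   reserved for the S1-S4 overflow protocol described in the patch.  */
struct wide_counter
{
  atomic_uint low;
  atomic_uint high;
};

static uint64_t
wide_counter_fetch_add (struct wide_counter *c, unsigned int op)
{
  /* S1: bump the low half; the result may spill into bit 31.  */
  unsigned int l = atomic_fetch_add_explicit (&c->low, op,
                                              memory_order_relaxed);
  unsigned int h = atomic_load_explicit (&c->high, memory_order_relaxed);
  uint64_t result = ((uint64_t) h << 31) | l;
  l += op;
  if ((l >> 31) > 0)
    {
      h++;
      /* S2: publish the carried high half with its MSB set so readers
         ignore the still-set overflow bit in the low half.  */
      atomic_store_explicit (&c->high, h | (1u << 31), memory_order_release);
      /* S3: clear the overflow bit in the low half.  */
      atomic_store_explicit (&c->low, l ^ (1u << 31), memory_order_release);
      /* S4: clear the MSB of the high half again.  */
      atomic_store_explicit (&c->high, h, memory_order_release);
    }
  return result;
}

static uint64_t
wide_counter_load (struct wide_counter *c)
{
  unsigned int h, l, h2;
  do
    {
      /* Validated read: retry until the high half is unchanged across the
         read of the low half, so both halves form one consistent snapshot.  */
      h = atomic_load_explicit (&c->high, memory_order_acquire);
      l = atomic_load_explicit (&c->low, memory_order_acquire);
      h2 = atomic_load_explicit (&c->high, memory_order_relaxed);
    }
  while (h != h2);
  if ((l >> 31) > 0 && (h >> 31) > 0)
    l ^= 1u << 31;
  return ((uint64_t) (h & ~(1u << 31)) << 31) + l;
}

int
main (void)
{
  struct wide_counter c = { 0, 0 };
  uint64_t expected = 0;
  /* Add just under 2^30 repeatedly so the low half overflows every few
     iterations; the loaded value must always track the plain sum.  */
  for (int i = 0; i < 8; i++)
    {
      wide_counter_fetch_add (&c, (1u << 30) - 1);
      expected += (1u << 30) - 1;
      uint64_t got = wide_counter_load (&c);
      printf ("%d: got %llu, expected %llu%s\n", i,
              (unsigned long long) got, (unsigned long long) expected,
              got == expected ? "" : "  <-- mismatch");
    }
  return 0;
}

Built with a C11 compiler (e.g. gcc -std=c11), this should print matching got/expected pairs across several low-half overflows; the concurrent correctness argument itself lives in the memory-order comments of the patch, not in this check.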