path: root/host/include
Diffstat (limited to 'host/include')
-rw-r--r--  host/include/aarch64/host/atomic128-cas.h        45
-rw-r--r--  host/include/aarch64/host/atomic128-cas.h.inc   102
-rw-r--r--  host/include/generic/host/atomic128-cas.h.inc    96
3 files changed, 198 insertions, 45 deletions
diff --git a/host/include/aarch64/host/atomic128-cas.h b/host/include/aarch64/host/atomic128-cas.h
deleted file mode 100644
index 991da4e..0000000
--- a/host/include/aarch64/host/atomic128-cas.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * SPDX-License-Identifier: GPL-2.0-or-later
- * Compare-and-swap for 128-bit atomic operations, AArch64 version.
- *
- * Copyright (C) 2018, 2023 Linaro, Ltd.
- *
- * See docs/devel/atomics.rst for discussion about the guarantees each
- * atomic primitive is meant to provide.
- */
-
-#ifndef AARCH64_ATOMIC128_CAS_H
-#define AARCH64_ATOMIC128_CAS_H
-
-/* Through gcc 10, aarch64 has no support for 128-bit atomics. */
-#if defined(CONFIG_ATOMIC128) || defined(CONFIG_CMPXCHG128)
-#include "host/include/generic/host/atomic128-cas.h.inc"
-#else
-static inline Int128 atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
-{
-    uint64_t cmpl = int128_getlo(cmp), cmph = int128_gethi(cmp);
-    uint64_t newl = int128_getlo(new), newh = int128_gethi(new);
-    uint64_t oldl, oldh;
-    uint32_t tmp;
-
-    asm("0: ldaxp %[oldl], %[oldh], %[mem]\n\t"
-        "cmp %[oldl], %[cmpl]\n\t"
-        "ccmp %[oldh], %[cmph], #0, eq\n\t"
-        "b.ne 1f\n\t"
-        "stlxp %w[tmp], %[newl], %[newh], %[mem]\n\t"
-        "cbnz %w[tmp], 0b\n"
-        "1:"
-        : [mem] "+m"(*ptr), [tmp] "=&r"(tmp),
-          [oldl] "=&r"(oldl), [oldh] "=&r"(oldh)
-        : [cmpl] "r"(cmpl), [cmph] "r"(cmph),
-          [newl] "r"(newl), [newh] "r"(newh)
-        : "memory", "cc");
-
-    return int128_make128(oldl, oldh);
-}
-
-# define CONFIG_CMPXCHG128 1
-# define HAVE_CMPXCHG128 1
-#endif
-
-#endif /* AARCH64_ATOMIC128_CAS_H */
diff --git a/host/include/aarch64/host/atomic128-cas.h.inc b/host/include/aarch64/host/atomic128-cas.h.inc
new file mode 100644
index 0000000..aec27df
--- /dev/null
+++ b/host/include/aarch64/host/atomic128-cas.h.inc
@@ -0,0 +1,102 @@
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ * Compare-and-swap for 128-bit atomic operations, AArch64 version.
+ *
+ * Copyright (C) 2018, 2023 Linaro, Ltd.
+ *
+ * See docs/devel/atomics.rst for discussion about the guarantees each
+ * atomic primitive is meant to provide.
+ */
+
+#ifndef AARCH64_ATOMIC128_CAS_H
+#define AARCH64_ATOMIC128_CAS_H
+
+/* Through gcc 10, aarch64 has no support for 128-bit atomics. */
+#if defined(CONFIG_ATOMIC128) || defined(CONFIG_CMPXCHG128)
+#include "host/include/generic/host/atomic128-cas.h.inc"
+#else
+static inline Int128 atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
+{
+    uint64_t cmpl = int128_getlo(cmp), cmph = int128_gethi(cmp);
+    uint64_t newl = int128_getlo(new), newh = int128_gethi(new);
+    uint64_t oldl, oldh;
+    uint32_t tmp;
+
+    asm("0: ldaxp %[oldl], %[oldh], %[mem]\n\t"
+        "cmp %[oldl], %[cmpl]\n\t"
+        "ccmp %[oldh], %[cmph], #0, eq\n\t"
+        "b.ne 1f\n\t"
+        "stlxp %w[tmp], %[newl], %[newh], %[mem]\n\t"
+        "cbnz %w[tmp], 0b\n"
+        "1:"
+        : [mem] "+m"(*ptr), [tmp] "=&r"(tmp),
+          [oldl] "=&r"(oldl), [oldh] "=&r"(oldh)
+        : [cmpl] "r"(cmpl), [cmph] "r"(cmph),
+          [newl] "r"(newl), [newh] "r"(newh)
+        : "memory", "cc");
+
+    return int128_make128(oldl, oldh);
+}
+
+static inline Int128 atomic16_xchg(Int128 *ptr, Int128 new)
+{
+    uint64_t newl = int128_getlo(new), newh = int128_gethi(new);
+    uint64_t oldl, oldh;
+    uint32_t tmp;
+
+    asm("0: ldaxp %[oldl], %[oldh], %[mem]\n\t"
+        "stlxp %w[tmp], %[newl], %[newh], %[mem]\n\t"
+        "cbnz %w[tmp], 0b"
+        : [mem] "+m"(*ptr), [tmp] "=&r"(tmp),
+          [oldl] "=&r"(oldl), [oldh] "=&r"(oldh)
+        : [newl] "r"(newl), [newh] "r"(newh)
+        : "memory");
+
+    return int128_make128(oldl, oldh);
+}
+
+static inline Int128 atomic16_fetch_and(Int128 *ptr, Int128 new)
+{
+    uint64_t newl = int128_getlo(new), newh = int128_gethi(new);
+    uint64_t oldl, oldh, tmpl, tmph;
+    uint32_t tmp;
+
+    asm("0: ldaxp %[oldl], %[oldh], %[mem]\n\t"
+        "and %[tmpl], %[oldl], %[newl]\n\t"
+        "and %[tmph], %[oldh], %[newh]\n\t"
+        "stlxp %w[tmp], %[tmpl], %[tmph], %[mem]\n\t"
+        "cbnz %w[tmp], 0b"
+        : [mem] "+m"(*ptr), [tmp] "=&r"(tmp),
+          [oldl] "=&r"(oldl), [oldh] "=&r"(oldh),
+          [tmpl] "=&r"(tmpl), [tmph] "=&r"(tmph)
+        : [newl] "r"(newl), [newh] "r"(newh)
+        : "memory");
+
+    return int128_make128(oldl, oldh);
+}
+
+static inline Int128 atomic16_fetch_or(Int128 *ptr, Int128 new)
+{
+    uint64_t newl = int128_getlo(new), newh = int128_gethi(new);
+    uint64_t oldl, oldh, tmpl, tmph;
+    uint32_t tmp;
+
+    asm("0: ldaxp %[oldl], %[oldh], %[mem]\n\t"
+        "orr %[tmpl], %[oldl], %[newl]\n\t"
+        "orr %[tmph], %[oldh], %[newh]\n\t"
+        "stlxp %w[tmp], %[tmpl], %[tmph], %[mem]\n\t"
+        "cbnz %w[tmp], 0b"
+        : [mem] "+m"(*ptr), [tmp] "=&r"(tmp),
+          [oldl] "=&r"(oldl), [oldh] "=&r"(oldh),
+          [tmpl] "=&r"(tmpl), [tmph] "=&r"(tmph)
+        : [newl] "r"(newl), [newh] "r"(newh)
+        : "memory");
+
+    return int128_make128(oldl, oldh);
+}
+
+# define CONFIG_CMPXCHG128 1
+# define HAVE_CMPXCHG128 1
+#endif
+
+#endif /* AARCH64_ATOMIC128_CAS_H */
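
Aside (illustrative, not part of the patch): the primitives above are meant to be combined into larger read-modify-write sequences by callers. A minimal sketch of such a caller, using only the int128_* helpers already referenced in this file; the helper name pair_add_low and the plain initial load are assumptions for illustration, not an existing QEMU API.

static inline Int128 pair_add_low(Int128 *ptr, uint64_t inc)
{
    /* Plain load as the initial guess; the cmpxchg loop corrects any race. */
    Int128 cmp, upd, old = *ptr;

    do {
        cmp = old;
        upd = int128_make128(int128_getlo(cmp) + inc, int128_gethi(cmp));
        old = atomic16_cmpxchg(ptr, cmp, upd);
    } while (int128_ne(old, cmp));

    return old;
}
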
diff --git a/host/include/generic/host/atomic128-cas.h.inc b/host/include/generic/host/atomic128-cas.h.inc
index 6b40cc2..990162c 100644
--- a/host/include/generic/host/atomic128-cas.h.inc
+++ b/host/include/generic/host/atomic128-cas.h.inc
@@ -23,6 +23,51 @@ atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
    r.i = qatomic_cmpxchg__nocheck(ptr_align, c.i, n.i);
    return r.s;
}
+
+/*
+ * Since we're looping anyway, use weak compare and swap.
+ * If the host supports weak, this will eliminate a second loop hidden
+ * within the atomic operation itself; otherwise the weak parameter is
+ * ignored.
+ */
+static inline Int128 ATTRIBUTE_ATOMIC128_OPT
+atomic16_xchg(Int128 *ptr, Int128 new)
+{
+    __int128_t *ptr_align = __builtin_assume_aligned(ptr, 16);
+    Int128 old = *ptr_align;
+
+    while (!__atomic_compare_exchange_n(ptr_align, &old, new, true,
+                                        __ATOMIC_SEQ_CST, 0)) {
+        continue;
+    }
+    return old;
+}
+
+static inline Int128 ATTRIBUTE_ATOMIC128_OPT
+atomic16_fetch_and(Int128 *ptr, Int128 val)
+{
+    __int128_t *ptr_align = __builtin_assume_aligned(ptr, 16);
+    Int128 old = *ptr_align;
+
+    while (!__atomic_compare_exchange_n(ptr_align, &old, old & val, true,
+                                        __ATOMIC_SEQ_CST, 0)) {
+        continue;
+    }
+    return old;
+}
+
+static inline Int128 ATTRIBUTE_ATOMIC128_OPT
+atomic16_fetch_or(Int128 *ptr, Int128 val)
+{
+    __int128_t *ptr_align = __builtin_assume_aligned(ptr, 16);
+    Int128 old = *ptr_align;
+
+    while (!__atomic_compare_exchange_n(ptr_align, &old, old | val, true,
+                                        __ATOMIC_SEQ_CST, 0)) {
+        continue;
+    }
+    return old;
+}
# define HAVE_CMPXCHG128 1
#elif defined(CONFIG_CMPXCHG128)
static inline Int128 ATTRIBUTE_ATOMIC128_OPT
@@ -36,6 +81,57 @@ atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
    r.i = __sync_val_compare_and_swap_16(ptr_align, c.i, n.i);
    return r.s;
}
+
+static inline Int128 ATTRIBUTE_ATOMIC128_OPT
+atomic16_xchg(Int128 *ptr, Int128 new)
+{
+    Int128Aligned *ptr_align = __builtin_assume_aligned(ptr, 16);
+    Int128Alias o, n;
+
+    n.s = new;
+    o.s = *ptr_align;
+    while (1) {
+        __int128 c = __sync_val_compare_and_swap_16(ptr_align, o.i, n.i);
+        if (c == o.i) {
+            return o.s;
+        }
+        o.i = c;
+    }
+}
+
+static inline Int128 ATTRIBUTE_ATOMIC128_OPT
+atomic16_fetch_and(Int128 *ptr, Int128 val)
+{
+    Int128Aligned *ptr_align = __builtin_assume_aligned(ptr, 16);
+    Int128Alias o, v;
+
+    v.s = val;
+    o.s = *ptr_align;
+    while (1) {
+        __int128 c = __sync_val_compare_and_swap_16(ptr_align, o.i, o.i & v.i);
+        if (c == o.i) {
+            return o.s;
+        }
+        o.i = c;
+    }
+}
+
+static inline Int128 ATTRIBUTE_ATOMIC128_OPT
+atomic16_fetch_or(Int128 *ptr, Int128 val)
+{
+    Int128Aligned *ptr_align = __builtin_assume_aligned(ptr, 16);
+    Int128Alias o, v;
+
+    v.s = val;
+    o.s = *ptr_align;
+    while (1) {
+        __int128 c = __sync_val_compare_and_swap_16(ptr_align, o.i, o.i | v.i);
+        if (c == o.i) {
+            return o.s;
+        }
+        o.i = c;
+    }
+}
# define HAVE_CMPXCHG128 1
#else
/* Fallback definition that must be optimized away, or error. */
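
Aside (illustrative, not part of the patch): the generic versions above pass weak=true to __atomic_compare_exchange_n because the caller already loops. With a strong compare-exchange, the builtin itself must retry on LL/SC hosts to hide spurious failures, so the caller's loop would nest around a second hidden loop; with a weak one, a spurious failure simply falls through to the caller's retry. The same shape on an ordinary uint64_t, with a hypothetical helper name:

static inline uint64_t fetch_add_weak(uint64_t *ptr, uint64_t inc)
{
    uint64_t old = __atomic_load_n(ptr, __ATOMIC_RELAXED);

    /* weak=true: a spurious failure just updates 'old' and retries here. */
    while (!__atomic_compare_exchange_n(ptr, &old, old + inc, true,
                                        __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) {
        continue;
    }
    return old;
}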