* [PATCH v1 0/3] perf bench: Add ticket spinlock benchmark
@ 2025-07-29 8:12 Yuzhuo Jing
2025-07-29 8:12 ` [PATCH v1 1/3] tools: Import atomic_fetch_{and,add,sub} Yuzhuo Jing
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: Yuzhuo Jing @ 2025-07-29 8:12 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
Ian Rogers, Adrian Hunter, Liang Kan, Paul Walmsley,
Palmer Dabbelt, Albert Ou, Alexandre Ghiti, Yuzhuo Jing,
Yuzhuo Jing, Guo Ren, Andrea Parri, Leonardo Bras, linux-kernel,
linux-perf-users, linux-riscv
This patch series adds a benchmark for the kernel's ticket spinlock
implementation.
This series depends on a preceding patch series that introduces the
'perf bench sync' benchmark infrastructure.
Link: https://lore.kernel.org/lkml/20250729022640.3134066-1-yuzhuo@google.com/
In a quick test on a 48C/96T x86 VM, the ticket spinlock performs better
with 2-6 threads, while qspinlock performs better with 1 thread or >=8
threads.
$ # set 't' variable, and then
$ ./perf bench sync qspinlock -t$t; sleep 1; ./perf bench sync ticket -t$t
'sync/qspinlock' benchmarks:
Lock-unlock latency of 1 threads: 8.5779 ns.
Lock-unlock latency of 2 threads: 187.1022 ns.
...
Lock-unlock latency of 6 threads: 1202.8312 ns.
...
Lock-unlock latency of 8 threads: 1541.566 ns.
Lock-unlock latency of 96 threads: 44140.8765 ns.
'sync/ticket' benchmarks:
Lock-unlock latency of 1 threads: 12.1888 ns.
Lock-unlock latency of 2 threads: 168.1132 ns.
...
Lock-unlock latency of 6 threads: 1033.2760 ns.
...
Lock-unlock latency of 8 threads: 1667.1647 ns.
Lock-unlock latency of 96 threads: 66915.8949 ns.
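For reference, the reported latency is the average cost of one
lock()/unlock() pair per thread.  A minimal sketch of the per-thread loop,
with illustrative names (the real loop lives in bench_sync_lock_generic()
from the 'perf bench sync' series this depends on):

    struct lock_ops {
            void (*lock)(void *data);
            void (*unlock)(void *data);
            void *data;                     /* the shared spinlock */
    };

    /* Each worker thread hammers the same lock; the reported number is
     * the elapsed time divided by the number of lock/unlock pairs. */
    static void worker_loop(struct lock_ops *ops, unsigned long iterations)
    {
            for (unsigned long i = 0; i < iterations; i++) {
                    ops->lock(ops->data);
                    ops->unlock(ops->data);
            }
    }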
Yuzhuo Jing (3):
tools: Import atomic_fetch_{and,add,sub}
perf bench: Import ticket_spinlock from kernel
perf bench: Add 'bench sync ticket' subcommand
tools/arch/x86/include/asm/atomic.h | 17 ++++
tools/arch/x86/include/asm/cmpxchg.h | 11 +++
tools/include/asm-generic/atomic-gcc.h | 51 ++++++++++
tools/perf/bench/bench.h | 1 +
tools/perf/bench/include/ticket_spinlock.h | 107 +++++++++++++++++++++
tools/perf/bench/sync.c | 17 ++++
tools/perf/builtin-bench.c | 1 +
tools/perf/check-headers.sh | 3 +
8 files changed, 208 insertions(+)
create mode 100644 tools/perf/bench/include/ticket_spinlock.h
--
2.50.1.487.gc89ff58d15-goog
* [PATCH v1 1/3] tools: Import atomic_fetch_{and,add,sub}
2025-07-29 8:12 [PATCH v1 0/3] perf bench: Add ticket spinlock benchmark Yuzhuo Jing
@ 2025-07-29 8:12 ` Yuzhuo Jing
2025-07-29 8:12 ` [PATCH v1 2/3] perf bench: Import ticket_spinlock from kernel Yuzhuo Jing
2025-07-29 8:12 ` [PATCH v1 3/3] perf bench: Add 'bench sync ticket' subcommand Yuzhuo Jing
2 siblings, 0 replies; 4+ messages in thread
From: Yuzhuo Jing @ 2025-07-29 8:12 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
Ian Rogers, Adrian Hunter, Liang Kan, Paul Walmsley,
Palmer Dabbelt, Albert Ou, Alexandre Ghiti, Yuzhuo Jing,
Yuzhuo Jing, Guo Ren, Andrea Parri, Leonardo Bras, linux-kernel,
linux-perf-users, linux-riscv
Import the function needed by the ticket spinlock implementation
(atomic_fetch_add).  In addition, import the operations that pair with it
(atomic_fetch_sub, atomic_fetch_and).
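For context, the ticket-lock fast path imported in patch 2/3 takes a
ticket with a single atomic_fetch_add() on the combined next/owner word;
a condensed sketch (mirroring ticket_spin_lock() from that patch):

    /* Upper 16 bits of lock->val hold the 'next' ticket, lower 16 bits
     * the ticket currently being served. */
    static inline void ticket_lock_sketch(arch_spinlock_t *lock)
    {
            u32 val = atomic_fetch_add(1 << 16, &lock->val); /* grab a ticket */
            u16 ticket = val >> 16;

            if (ticket == (u16)val)         /* already our turn */
                    return;                 /* uncontended fast path */

            /* otherwise wait until the owner half reaches our ticket */
            atomic_cond_read_acquire(&lock->val, ticket == (u16)VAL);
            smp_mb();
    }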
Signed-off-by: Yuzhuo Jing <yuzhuo@google.com>
---
tools/arch/x86/include/asm/atomic.h | 17 +++++++++
tools/arch/x86/include/asm/cmpxchg.h | 11 ++++++
tools/include/asm-generic/atomic-gcc.h | 51 ++++++++++++++++++++++++++
3 files changed, 79 insertions(+)
diff --git a/tools/arch/x86/include/asm/atomic.h b/tools/arch/x86/include/asm/atomic.h
index a55ffd4eb5f1..1fb7711ebbd7 100644
--- a/tools/arch/x86/include/asm/atomic.h
+++ b/tools/arch/x86/include/asm/atomic.h
@@ -66,6 +66,14 @@ static inline int atomic_dec_and_test(atomic_t *v)
GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e");
}
+static __always_inline int atomic_fetch_add(int i, atomic_t *v)
+{
+ return xadd(&v->counter, i);
+}
+#define atomic_fetch_add atomic_fetch_add
+
+#define atomic_fetch_sub(i, v) atomic_fetch_add(-(i), v)
+
static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new)
{
return cmpxchg(&v->counter, old, new);
@@ -85,6 +93,15 @@ static __always_inline int atomic_fetch_or(int i, atomic_t *v)
return val;
}
+static __always_inline int atomic_fetch_and(int i, atomic_t *v)
+{
+ int val = atomic_read(v);
+
+ do { } while (!atomic_try_cmpxchg(v, &val, val & i));
+
+ return val;
+}
+
static inline int test_and_set_bit(long nr, unsigned long *addr)
{
GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts), *addr, "Ir", nr, "%0", "c");
diff --git a/tools/arch/x86/include/asm/cmpxchg.h b/tools/arch/x86/include/asm/cmpxchg.h
index 5372da8b27fc..2d89f150badf 100644
--- a/tools/arch/x86/include/asm/cmpxchg.h
+++ b/tools/arch/x86/include/asm/cmpxchg.h
@@ -12,6 +12,8 @@ extern void __xchg_wrong_size(void)
__compiletime_error("Bad argument size for xchg");
extern void __cmpxchg_wrong_size(void)
__compiletime_error("Bad argument size for cmpxchg");
+extern void __xadd_wrong_size(void)
+ __compiletime_error("Bad argument size for xadd");
/*
* Constants for operation sizes. On 32-bit, the 64-bit size it set to
@@ -200,4 +202,13 @@ extern void __cmpxchg_wrong_size(void)
#define try_cmpxchg(ptr, pold, new) \
__try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr)))
+/*
+ * xadd() adds "inc" to "*ptr" and atomically returns the previous
+ * value of "*ptr".
+ *
+ * xadd() is locked when multiple CPUs are online
+ */
+#define __xadd(ptr, inc, lock) __xchg_op((ptr), (inc), xadd, lock)
+#define xadd(ptr, inc) __xadd((ptr), (inc), LOCK_PREFIX)
+
#endif /* TOOLS_ASM_X86_CMPXCHG_H */
diff --git a/tools/include/asm-generic/atomic-gcc.h b/tools/include/asm-generic/atomic-gcc.h
index 08b7b3b36873..cc146b82bb34 100644
--- a/tools/include/asm-generic/atomic-gcc.h
+++ b/tools/include/asm-generic/atomic-gcc.h
@@ -100,6 +100,23 @@ atomic_try_cmpxchg(atomic_t *v, int *old, int new)
return likely(r == o);
}
+/**
+ * atomic_fetch_and() - atomic bitwise AND with full ordering
+ * @i: int value
+ * @v: pointer to atomic_t
+ *
+ * Atomically updates @v to (@v & @i) with full ordering.
+ *
+ * Unsafe to use in noinstr code; use raw_atomic_fetch_and() there.
+ *
+ * Return: The original value of @v.
+ */
+static __always_inline int
+atomic_fetch_and(int i, atomic_t *v)
+{
+ return __sync_fetch_and_and(&v->counter, i);
+}
+
/**
* atomic_fetch_or() - atomic bitwise OR with full ordering
* @i: int value
@@ -117,6 +134,40 @@ atomic_fetch_or(int i, atomic_t *v)
return __sync_fetch_and_or(&v->counter, i);
}
+/**
+ * atomic_fetch_add() - atomic add with full ordering
+ * @i: int value to add
+ * @v: pointer to atomic_t
+ *
+ * Atomically updates @v to (@v + @i) with full ordering.
+ *
+ * Unsafe to use in noinstr code; use raw_atomic_fetch_add() there.
+ *
+ * Return: The original value of @v.
+ */
+static __always_inline int
+atomic_fetch_add(int i, atomic_t *v)
+{
+ return __sync_fetch_and_add(&v->counter, i);
+}
+
+/**
+ * atomic_fetch_sub() - atomic subtract with full ordering
+ * @i: int value to subtract
+ * @v: pointer to atomic_t
+ *
+ * Atomically updates @v to (@v - @i) with full ordering.
+ *
+ * Unsafe to use in noinstr code; use raw_atomic_fetch_sub() there.
+ *
+ * Return: The original value of @v.
+ */
+static __always_inline int
+atomic_fetch_sub(int i, atomic_t *v)
+{
+ return __sync_fetch_and_sub(&v->counter, i);
+}
+
static inline int test_and_set_bit(long nr, unsigned long *addr)
{
unsigned long mask = BIT_MASK(nr);
--
2.50.1.487.gc89ff58d15-goog
* [PATCH v1 2/3] perf bench: Import ticket_spinlock from kernel
2025-07-29 8:12 [PATCH v1 0/3] perf bench: Add ticket spinlock benchmark Yuzhuo Jing
2025-07-29 8:12 ` [PATCH v1 1/3] tools: Import atomic_fetch_{and,add,sub} Yuzhuo Jing
@ 2025-07-29 8:12 ` Yuzhuo Jing
2025-07-29 8:12 ` [PATCH v1 3/3] perf bench: Add 'bench sync ticket' subcommand Yuzhuo Jing
2 siblings, 0 replies; 4+ messages in thread
From: Yuzhuo Jing @ 2025-07-29 8:12 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
Ian Rogers, Adrian Hunter, Liang Kan, Paul Walmsley,
Palmer Dabbelt, Albert Ou, Alexandre Ghiti, Yuzhuo Jing,
Yuzhuo Jing, Guo Ren, Andrea Parri, Leonardo Bras, linux-kernel,
linux-perf-users, linux-riscv
Import the generic ticket_spinlock implementation from the kernel, and
update tools/perf/check-headers.sh to detect future changes to the kernel
copy of the file.
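The -I patterns in the new check_2 entry cover the lines where the tools
copy necessarily diverges from include/asm-generic/ticket_spinlock.h: the
#include block, the atomic_try_cmpxchg() call (the tools variant needs an
explicit int * cast), and the endianness test.  For the last one, an
illustrative side-by-side (the kernel line is assumed, not part of this
patch):

    /* kernel: */
    u16 *ptr = (u16 *)lock + IS_ENABLED(CONFIG_CPU_BIG_ENDIAN);

    /* tools copy (userspace has no CONFIG_CPU_BIG_ENDIAN): */
    u16 *ptr = (u16 *)lock + (__BYTE_ORDER == __BIG_ENDIAN);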
Signed-off-by: Yuzhuo Jing <yuzhuo@google.com>
---
tools/perf/bench/include/ticket_spinlock.h | 107 +++++++++++++++++++++
tools/perf/check-headers.sh                |   3 +
2 files changed, 110 insertions(+)
create mode 100644 tools/perf/bench/include/ticket_spinlock.h
diff --git a/tools/perf/bench/include/ticket_spinlock.h b/tools/perf/bench/include/ticket_spinlock.h
new file mode 100644
index 000000000000..1d063c99f7cb
--- /dev/null
+++ b/tools/perf/bench/include/ticket_spinlock.h
@@ -0,0 +1,107 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * 'Generic' ticket-lock implementation.
+ *
+ * It relies on atomic_fetch_add() having well defined forward progress
+ * guarantees under contention. If your architecture cannot provide this, stick
+ * to a test-and-set lock.
+ *
+ * It also relies on atomic_fetch_add() being safe vs smp_store_release() on a
+ * sub-word of the value. This is generally true for anything LL/SC although
+ * you'd be hard pressed to find anything useful in architecture specifications
+ * about this. If your architecture cannot do this you might be better off with
+ * a test-and-set.
+ *
+ * It further assumes atomic_*_release() + atomic_*_acquire() is RCpc and hence
+ * uses atomic_fetch_add() which is RCsc to create an RCsc hot path, along with
+ * a full fence after the spin to upgrade the otherwise-RCpc
+ * atomic_cond_read_acquire().
+ *
+ * The implementation uses smp_cond_load_acquire() to spin, so if the
+ * architecture has WFE like instructions to sleep instead of poll for word
+ * modifications be sure to implement that (see ARM64 for example).
+ *
+ */
+
+#ifndef __ASM_GENERIC_TICKET_SPINLOCK_H
+#define __ASM_GENERIC_TICKET_SPINLOCK_H
+
+#include <linux/atomic.h>
+#include <asm/barrier.h>
+#include <endian.h>
+#include "qspinlock_types.h"
+
+static __always_inline void ticket_spin_lock(arch_spinlock_t *lock)
+{
+ u32 val = atomic_fetch_add(1<<16, &lock->val);
+ u16 ticket = val >> 16;
+
+ if (ticket == (u16)val)
+ return;
+
+ /*
+ * atomic_cond_read_acquire() is RCpc, but rather than defining a
+ * custom cond_read_rcsc() here we just emit a full fence. We only
+ * need the prior reads before subsequent writes ordering from
+ * smb_mb(), but as atomic_cond_read_acquire() just emits reads and we
+ * have no outstanding writes due to the atomic_fetch_add() the extra
+ * orderings are free.
+ */
+ atomic_cond_read_acquire(&lock->val, ticket == (u16)VAL);
+ smp_mb();
+}
+
+static __always_inline bool ticket_spin_trylock(arch_spinlock_t *lock)
+{
+ u32 old = atomic_read(&lock->val);
+
+ if ((old >> 16) != (old & 0xffff))
+ return false;
+
+ return atomic_try_cmpxchg(&lock->val, (int *)&old, old + (1<<16)); /* SC, for RCsc */
+}
+
+static __always_inline void ticket_spin_unlock(arch_spinlock_t *lock)
+{
+ u16 *ptr = (u16 *)lock + (__BYTE_ORDER == __BIG_ENDIAN);
+ u32 val = atomic_read(&lock->val);
+
+ smp_store_release(ptr, (u16)val + 1);
+}
+
+static __always_inline int ticket_spin_value_unlocked(arch_spinlock_t lock)
+{
+ u32 val = lock.val.counter;
+
+ return ((val >> 16) == (val & 0xffff));
+}
+
+static __always_inline int ticket_spin_is_locked(arch_spinlock_t *lock)
+{
+ arch_spinlock_t val = READ_ONCE(*lock);
+
+ return !ticket_spin_value_unlocked(val);
+}
+
+static __always_inline int ticket_spin_is_contended(arch_spinlock_t *lock)
+{
+ u32 val = atomic_read(&lock->val);
+
+ return (s16)((val >> 16) - (val & 0xffff)) > 1;
+}
+
+#ifndef __no_arch_spinlock_redefine
+/*
+ * Remapping spinlock architecture specific functions to the corresponding
+ * ticket spinlock functions.
+ */
+#define arch_spin_is_locked(l) ticket_spin_is_locked(l)
+#define arch_spin_is_contended(l) ticket_spin_is_contended(l)
+#define arch_spin_value_unlocked(l) ticket_spin_value_unlocked(l)
+#define arch_spin_lock(l) ticket_spin_lock(l)
+#define arch_spin_trylock(l) ticket_spin_trylock(l)
+#define arch_spin_unlock(l) ticket_spin_unlock(l)
+#endif
+
+#endif /* __ASM_GENERIC_TICKET_SPINLOCK_H */
diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh
index b827b10e19c1..c9f76e3e3d66 100755
--- a/tools/perf/check-headers.sh
+++ b/tools/perf/check-headers.sh
@@ -239,6 +239,9 @@ check_2_sed tools/perf/bench/qspinlock.c kernel/locking/qspinlock.c "$qsl_sed"
"$qsl_common"' -I EXPORT_SYMBOL -I "^#define lockevent_" -I "^#define trace_" \
-I smp_processor_id -I atomic_try_cmpxchg_relaxed'
+check_2 tools/perf/bench/include/ticket_spinlock.h include/asm-generic/ticket_spinlock.h \
+ '-I "^#include" -I atomic_try_cmpxchg -I BIG_ENDIAN -B'
+
for i in "${BEAUTY_FILES[@]}"
do
beauty_check "$i" -B
--
2.50.1.487.gc89ff58d15-goog
* [PATCH v1 3/3] perf bench: Add 'bench sync ticket' subcommand
2025-07-29 8:12 [PATCH v1 0/3] perf bench: Add ticket spinlock benchmark Yuzhuo Jing
2025-07-29 8:12 ` [PATCH v1 1/3] tools: Import atomic_fetch_{and,add,sub} Yuzhuo Jing
2025-07-29 8:12 ` [PATCH v1 2/3] perf bench: Import ticket_spinlock from kernel Yuzhuo Jing
@ 2025-07-29 8:12 ` Yuzhuo Jing
2 siblings, 0 replies; 4+ messages in thread
From: Yuzhuo Jing @ 2025-07-29 8:12 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
Ian Rogers, Adrian Hunter, Liang Kan, Paul Walmsley,
Palmer Dabbelt, Albert Ou, Alexandre Ghiti, Yuzhuo Jing,
Yuzhuo Jing, Guo Ren, Andrea Parri, Leonardo Bras, linux-kernel,
linux-perf-users, linux-riscv
Benchmark the kernel ticket spinlock implementation in user space.
To resolve the arch_spin_* redefinition conflicts that arise from importing
qspinlock and ticket spinlock together, sync.c defines
__no_arch_spinlock_redefine, following the usage in
arch/riscv/include/asm/spinlock.h.
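Concretely, both imported headers would otherwise try to define the
arch_spin_*() wrappers; defining the guard first keeps the two usable side
by side, and the benchmark then calls the implementation-specific entry
points directly (condensed from the sync.c hunk below):

    #define __no_arch_spinlock_redefine
    #include "include/qspinlock.h"         /* queued_spin_lock()/unlock()  */
    #include "include/ticket_spinlock.h"   /* ticket_spin_lock()/unlock()  */

    /* Each lock type is then wired up through its own lock_ops instance,
     * e.g. .lock = (lock_fn)ticket_spin_lock for this benchmark. */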
Signed-off-by: Yuzhuo Jing <yuzhuo@google.com>
---
tools/perf/bench/bench.h | 1 +
tools/perf/bench/sync.c | 17 +++++++++++++++++
tools/perf/builtin-bench.c | 1 +
3 files changed, 19 insertions(+)
diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h
index dd6c8b6126d3..42c0696b05fb 100644
--- a/tools/perf/bench/bench.h
+++ b/tools/perf/bench/bench.h
@@ -23,6 +23,7 @@ int bench_sched_messaging(int argc, const char **argv);
int bench_sched_pipe(int argc, const char **argv);
int bench_sched_seccomp_notify(int argc, const char **argv);
int bench_sync_qspinlock(int argc, const char **argv);
+int bench_sync_ticket(int argc, const char **argv);
int bench_syscall_basic(int argc, const char **argv);
int bench_syscall_getpgid(int argc, const char **argv);
int bench_syscall_fork(int argc, const char **argv);
diff --git a/tools/perf/bench/sync.c b/tools/perf/bench/sync.c
index c85e9853c72a..581835451e5f 100644
--- a/tools/perf/bench/sync.c
+++ b/tools/perf/bench/sync.c
@@ -17,7 +17,9 @@
#include "bench.h"
#include "../util/tsc.h"
+#define __no_arch_spinlock_redefine
#include "include/qspinlock.h"
+#include "include/ticket_spinlock.h"
#define NS 1000000000ull
#define CACHELINE_SIZE 64
@@ -67,6 +69,7 @@ static const struct option options[] = {
static const char *const bench_sync_usage[] = {
"perf bench sync qspinlock <options>",
+ "perf bench sync ticket <options>",
NULL
};
@@ -106,6 +109,20 @@ int bench_sync_qspinlock(int argc, const char **argv)
return bench_sync_lock_generic(&ops, argc, argv);
}
+/*
+ * Benchmark of linux kernel ticket spinlock in user land.
+ */
+int bench_sync_ticket(int argc, const char **argv)
+{
+ arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
+ struct lock_ops ops = {
+ .lock = (lock_fn)ticket_spin_lock,
+ .unlock = (lock_fn)ticket_spin_unlock,
+ .data = &lock,
+ };
+ return bench_sync_lock_generic(&ops, argc, argv);
+}
+
/*
* A busy loop to acquire and release the given lock N times.
*/
diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c
index cfe6f6dc6ed4..8d945b846321 100644
--- a/tools/perf/builtin-bench.c
+++ b/tools/perf/builtin-bench.c
@@ -54,6 +54,7 @@ static struct bench sched_benchmarks[] = {
static struct bench sync_benchmarks[] = {
{ "qspinlock", "Benchmark for queued spinlock", bench_sync_qspinlock },
+ { "ticket", "Benchmark for ticket spinlock", bench_sync_ticket },
{ "all", "Run all synchronization benchmarks", NULL },
{ NULL, NULL, NULL }
};
--
2.50.1.487.gc89ff58d15-goog