All of lore.kernel.org
 help / color / mirror / Atom feed
From: Boqun Feng <boqun.feng@gmail.com>
To: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Joel Fernandes <joel@joelfernandes.org>,
	"Paul E. McKenney" <paulmck@kernel.org>,
	linux-kernel@vger.kernel.org, Nicholas Piggin <npiggin@gmail.com>,
	Michael Ellerman <mpe@ellerman.id.au>,
	Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	Sebastian Andrzej Siewior <bigeasy@linutronix.de>,
	Will Deacon <will@kernel.org>,
	Peter Zijlstra <peterz@infradead.org>,
	Alan Stern <stern@rowland.harvard.edu>,
	John Stultz <jstultz@google.com>,
	Neeraj Upadhyay <Neeraj.Upadhyay@amd.com>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	Frederic Weisbecker <frederic@kernel.org>,
	Josh Triplett <josh@joshtriplett.org>,
	Uladzislau Rezki <urezki@gmail.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Lai Jiangshan <jiangshanlai@gmail.com>,
	Zqiang <qiang.zhang1211@gmail.com>,
	Ingo Molnar <mingo@redhat.com>, Waiman Long <longman@redhat.com>,
	Mark Rutland <mark.rutland@arm.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	Vlastimil Babka <vbabka@suse.cz>,
	maged.michael@gmail.com,	Mateusz Guzik <mjguzik@gmail.com>,
	Jonas Oberhauser <jonas.oberhauser@huaweicloud.com>,
	rcu@vger.kernel.org, linux-mm@kvack.org, lkmm@lists.linux.dev
Subject: Re: [RFC PATCH v4 3/4] hazptr: Implement Hazard Pointers
Date: Thu, 18 Dec 2025 17:36:57 +0900	[thread overview]
Message-ID: <aUO9KRaP0FXpm_l9@tardis-2.local> (raw)
In-Reply-To: <20251218014531.3793471-4-mathieu.desnoyers@efficios.com>

On Wed, Dec 17, 2025 at 08:45:30PM -0500, Mathieu Desnoyers wrote:
[...]
> +static inline
> +struct hazptr_slot *hazptr_get_free_percpu_slot(void)
> +{
> +	struct hazptr_percpu_slots *percpu_slots = this_cpu_ptr(&hazptr_percpu_slots);
> +	unsigned int idx;
> +
> +	for (idx = 0; idx < NR_HAZPTR_PERCPU_SLOTS; idx++) {
> +		struct hazptr_slot *slot = &percpu_slots->slots[idx];
> +
> +		if (!READ_ONCE(slot->addr))
> +			return slot;
> +	}
> +	/* All slots are in use. */
> +	return NULL;
> +}
> +
> +static inline
> +bool hazptr_slot_is_backup(struct hazptr_ctx *ctx, struct hazptr_slot *slot)
> +{
> +	return slot == &ctx->backup_slot.slot;
> +}
> +
> +/*
> + * hazptr_acquire: Load pointer at address and protect with hazard pointer.
> + *
> + * Load @addr_p, and protect the loaded pointer with hazard pointer.
> + *
> + * Returns a non-NULL protected address if the loaded pointer is non-NULL.
> + * Returns NULL if the loaded pointer is NULL.
> + *
> + * On success the protected hazptr slot is stored in @ctx->slot.
> + */
> +static inline
> +void *hazptr_acquire(struct hazptr_ctx *ctx, void * const * addr_p)
> +{
> +	struct hazptr_slot *slot = NULL;
> +	void *addr, *addr2;
> +
> +	/*
> +	 * Load @addr_p to know which address should be protected.
> +	 */
> +	addr = READ_ONCE(*addr_p);
> +	for (;;) {
> +		if (!addr)
> +			return NULL;
> +		guard(preempt)();
> +		if (likely(!hazptr_slot_is_backup(ctx, slot))) {
> +			slot = hazptr_get_free_percpu_slot();

I need to continue share my concerns about this "allocating slot while
protecting" pattern. Here realistically, we will go over a few of the
per-CPU hazard pointer slots *every time* instead of directly using a
pre-allocated hazard pointer slot. Could you utilize this[1] to see a
comparison of the reader-side performance against RCU/SRCU?

> +			/*
> +			 * If all the per-CPU slots are already in use, fallback
> +			 * to the backup slot.
> +			 */
> +			if (unlikely(!slot))
> +				slot = hazptr_chain_backup_slot(ctx);
> +		}
> +		WRITE_ONCE(slot->addr, addr);	/* Store B */
> +
> +		/* Memory ordering: Store B before Load A. */
> +		smp_mb();
> +
> +		/*
> +		 * Re-load @addr_p after storing it to the hazard pointer slot.
> +		 */
> +		addr2 = READ_ONCE(*addr_p);	/* Load A */
> +		if (likely(ptr_eq(addr2, addr)))
> +			break;
> +		/*
> +		 * If @addr_p content has changed since the first load,
> +		 * release the hazard pointer and try again.
> +		 */
> +		WRITE_ONCE(slot->addr, NULL);
> +		if (!addr2) {
> +			if (hazptr_slot_is_backup(ctx, slot))
> +				hazptr_unchain_backup_slot(ctx);
> +			return NULL;
> +		}
> +		addr = addr2;
> +	}
> +	ctx->slot = slot;
> +	/*
> +	 * Use addr2 loaded from the second READ_ONCE() to preserve
> +	 * address dependency ordering.
> +	 */
> +	return addr2;
> +}
> +
> +/* Release the protected hazard pointer from @slot. */
> +static inline
> +void hazptr_release(struct hazptr_ctx *ctx, void *addr)
> +{
> +	struct hazptr_slot *slot;
> +
> +	if (!addr)
> +		return;
> +	slot = ctx->slot;
> +	WARN_ON_ONCE(slot->addr != addr);
> +	smp_store_release(&slot->addr, NULL);
> +	if (unlikely(hazptr_slot_is_backup(ctx, slot)))
> +		hazptr_unchain_backup_slot(ctx);
> +}
> +
> +void hazptr_init(void);
> +
> +#endif /* _LINUX_HAZPTR_H */
> diff --git a/init/main.c b/init/main.c
> index 07a3116811c5..858eaa87bde7 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -104,6 +104,7 @@
>  #include <linux/pidfs.h>
>  #include <linux/ptdump.h>
>  #include <linux/time_namespace.h>
> +#include <linux/hazptr.h>
>  #include <net/net_namespace.h>
>  
>  #include <asm/io.h>
> @@ -1002,6 +1003,7 @@ void start_kernel(void)
>  	workqueue_init_early();
>  
>  	rcu_init();
> +	hazptr_init();
>  	kvfree_rcu_init();
>  
>  	/* Trace events are available after this */
> diff --git a/kernel/Makefile b/kernel/Makefile
> index 9fe722305c9b..1178907fe0ea 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -7,7 +7,7 @@ obj-y     = fork.o exec_domain.o panic.o \
>  	    cpu.o exit.o softirq.o resource.o \
>  	    sysctl.o capability.o ptrace.o user.o \
>  	    signal.o sys.o umh.o workqueue.o pid.o task_work.o \
> -	    extable.o params.o \
> +	    extable.o params.o hazptr.o \
>  	    kthread.o sys_ni.o nsproxy.o nstree.o nscommon.o \
>  	    notifier.o ksysfs.o cred.o reboot.o \
>  	    async.o range.o smpboot.o ucount.o regset.o ksyms_common.o
> diff --git a/kernel/hazptr.c b/kernel/hazptr.c
> new file mode 100644
> index 000000000000..2ec288bc1132
> --- /dev/null
> +++ b/kernel/hazptr.c
> @@ -0,0 +1,150 @@
> +// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> +//
> +// SPDX-License-Identifier: LGPL-2.1-or-later
> +
> +/*
> + * hazptr: Hazard Pointers
> + */
> +
> +#include <linux/hazptr.h>
> +#include <linux/percpu.h>
> +#include <linux/spinlock.h>
> +#include <linux/list.h>
> +#include <linux/export.h>
> +
> +struct overflow_list {
> +	raw_spinlock_t lock;		/* Lock protecting overflow list and list generation. */
> +	struct list_head head;		/* Overflow list head. */
> +	uint64_t gen;			/* Overflow list generation. */
> +};
> +
> +static DEFINE_PER_CPU(struct overflow_list, percpu_overflow_list);
> +
> +DEFINE_PER_CPU(struct hazptr_percpu_slots, hazptr_percpu_slots);
> +EXPORT_PER_CPU_SYMBOL_GPL(hazptr_percpu_slots);
> +
> +/*
> + * Perform piecewise iteration on overflow list waiting until "addr" is
> + * not present. Raw spinlock is released and taken between each list
> + * item and busy loop iteration. The overflow list generation is checked
> + * each time the lock is taken to validate that the list has not changed
> + * before resuming iteration or busy wait. If the generation has
> + * changed, retry the entire list traversal.
> + */
> +static
> +void hazptr_synchronize_overflow_list(struct overflow_list *overflow_list, void *addr)
> +{
> +	struct hazptr_backup_slot *backup_slot;
> +	uint64_t snapshot_gen;
> +
> +	raw_spin_lock(&overflow_list->lock);
> +retry:
> +	snapshot_gen = overflow_list->gen;
> +	list_for_each_entry(backup_slot, &overflow_list->head, node) {
> +		/* Busy-wait if node is found. */
> +		while (smp_load_acquire(&backup_slot->slot.addr) == addr) { /* Load B */
> +			raw_spin_unlock(&overflow_list->lock);
> +			cpu_relax();

I think we should prioritize the scan thread solution [2] instead of
busy waiting hazrd pointer updaters, because when we have multiple
hazard pointer usages we would want to consolidate the scans from
updater side. If so, the whole ->gen can be avoided.

However this ->gen idea does seem ot resolve another issue for me, I'm
trying to make shazptr critical section preemptive by using a per-task
backup slot (if you recall, this is your idea from the hallway
discussions we had during LPC 2024), and currently I could not make it
work because the following sequeue:

1. CPU 0 already has one pointer protected.

2. CPU 1 begins the updater scan, and it scans the list of preempted
   hazard pointer readers, no reader.

3. CPU 0 does a context switch, it stores the current hazard pointer
   value to the current task's ->hazard_slot (let's say the task is task
   A), and add it to the list of preempted hazard pointer readers.

4. CPU 0 clears its percpu hazptr_slots for the next task (B).

5. CPU 1 continues the updater scan, and it scans the percpu slot of
   CPU 0, and finds no reader.

in this situation, updater will miss a reader. But if we add a
generation snapshotting at step 2 and generation increment at step 3, I
think it'll work.

IMO, if we make this work, it's better than the current backup slot
mechanism IMO, because we only need to acquire the lock if context
switch happens. I will look into the implementation of this and if I
could get it down, I will send it in my next version of shazptr. Mention
it here just to add this option into the discussion.

[1]: https://lore.kernel.org/lkml/20250625031101.12555-3-boqun.feng@gmail.com/
[2]: https://lore.kernel.org/lkml/20250625031101.12555-5-boqun.feng@gmail.com/

Regards,
Boqun

> +			raw_spin_lock(&overflow_list->lock);
> +			if (overflow_list->gen != snapshot_gen)
> +				goto retry;
> +		}
> +		raw_spin_unlock(&overflow_list->lock);
> +		/*
> +		 * Release raw spinlock, validate generation after
> +		 * re-acquiring the lock.
> +		 */
> +		raw_spin_lock(&overflow_list->lock);
> +		if (overflow_list->gen != snapshot_gen)
> +			goto retry;
> +	}
> +	raw_spin_unlock(&overflow_list->lock);
> +}
> +
> +static
> +void hazptr_synchronize_cpu_slots(int cpu, void *addr)
> +{
> +	struct hazptr_percpu_slots *percpu_slots = per_cpu_ptr(&hazptr_percpu_slots, cpu);
> +	unsigned int idx;
> +
> +	for (idx = 0; idx < NR_HAZPTR_PERCPU_SLOTS; idx++) {
> +		struct hazptr_slot *slot = &percpu_slots->slots[idx];
> +
> +		/* Busy-wait if node is found. */
> +		smp_cond_load_acquire(&slot->addr, VAL != addr); /* Load B */
> +	}
> +}
> +
> +/*
> + * hazptr_synchronize: Wait until @addr is released from all slots.
> + *
> + * Wait to observe that each slot contains a value that differs from
> + * @addr before returning.
> + * Should be called from preemptible context.
> + */
> +void hazptr_synchronize(void *addr)
> +{
> +	int cpu;
> +
> +	/*
> +	 * Busy-wait should only be done from preemptible context.
> +	 */
> +	lockdep_assert_preemption_enabled();
> +
> +	/*
> +	 * Store A precedes hazptr_scan(): it unpublishes addr (sets it to
> +	 * NULL or to a different value), and thus hides it from hazard
> +	 * pointer readers.
> +	 */
> +	if (!addr)
> +		return;
> +	/* Memory ordering: Store A before Load B. */
> +	smp_mb();
> +	/* Scan all CPUs slots. */
> +	for_each_possible_cpu(cpu) {
> +		/* Scan CPU slots. */
> +		hazptr_synchronize_cpu_slots(cpu, addr);
> +		/* Scan backup slots in percpu overflow list. */
> +		hazptr_synchronize_overflow_list(per_cpu_ptr(&percpu_overflow_list, cpu), addr);
> +	}
> +}
> +EXPORT_SYMBOL_GPL(hazptr_synchronize);
> +
> +struct hazptr_slot *hazptr_chain_backup_slot(struct hazptr_ctx *ctx)
> +{
> +	struct overflow_list *overflow_list = this_cpu_ptr(&percpu_overflow_list);
> +	struct hazptr_slot *slot = &ctx->backup_slot.slot;
> +
> +	slot->addr = NULL;
> +
> +	raw_spin_lock(&overflow_list->lock);
> +	overflow_list->gen++;
> +	list_add(&ctx->backup_slot.node, &overflow_list->head);
> +	ctx->backup_slot.cpu = smp_processor_id();
> +	raw_spin_unlock(&overflow_list->lock);
> +	return slot;
> +}
> +EXPORT_SYMBOL_GPL(hazptr_chain_backup_slot);
> +
> +void hazptr_unchain_backup_slot(struct hazptr_ctx *ctx)
> +{
> +	struct overflow_list *overflow_list = per_cpu_ptr(&percpu_overflow_list, ctx->backup_slot.cpu);
> +
> +	raw_spin_lock(&overflow_list->lock);
> +	overflow_list->gen++;
> +	list_del(&ctx->backup_slot.node);
> +	raw_spin_unlock(&overflow_list->lock);
> +}
> +EXPORT_SYMBOL_GPL(hazptr_unchain_backup_slot);
> +
> +void __init hazptr_init(void)
> +{
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct overflow_list *overflow_list = per_cpu_ptr(&percpu_overflow_list, cpu);
> +
> +		raw_spin_lock_init(&overflow_list->lock);
> +		INIT_LIST_HEAD(&overflow_list->head);
> +	}
> +}
> -- 
> 2.39.5
> 


  reply	other threads:[~2025-12-18 10:29 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-12-18  1:45 [RFC PATCH v4 0/4] Hazard Pointers Mathieu Desnoyers
2025-12-18  1:45 ` [RFC PATCH v4 1/4] compiler.h: Introduce ptr_eq() to preserve address dependency Mathieu Desnoyers
2025-12-18  9:03   ` David Laight
2025-12-18 13:51     ` Mathieu Desnoyers
2025-12-18 15:54       ` David Laight
2025-12-18 14:27     ` Gary Guo
2025-12-18 16:12       ` David Laight
2025-12-18  1:45 ` [RFC PATCH v4 2/4] Documentation: RCU: Refer to ptr_eq() Mathieu Desnoyers
2025-12-18  1:45 ` [RFC PATCH v4 3/4] hazptr: Implement Hazard Pointers Mathieu Desnoyers
2025-12-18  8:36   ` Boqun Feng [this message]
2025-12-18 17:35     ` Mathieu Desnoyers
2025-12-18 20:22       ` Boqun Feng
2025-12-18 23:36         ` Mathieu Desnoyers
2025-12-19  0:25           ` Boqun Feng
2025-12-19  6:06             ` Joel Fernandes
2025-12-19 15:14             ` Mathieu Desnoyers
2025-12-19 15:42               ` Joel Fernandes
2025-12-19 22:19                 ` Mathieu Desnoyers
2025-12-19 22:39                   ` Joel Fernandes
2025-12-21  9:59                     ` Boqun Feng
2025-12-19  0:43       ` Boqun Feng
2025-12-19 14:22         ` Mathieu Desnoyers
2026-01-08 16:34           ` Frederic Weisbecker
2026-01-08 16:45             ` Mathieu Desnoyers
2026-01-08 23:34               ` Joel Fernandes
2026-01-08 19:01             ` Paul E. McKenney
2025-12-19  1:22   ` Joel Fernandes
2025-12-18  1:45 ` [RFC PATCH v4 4/4] hazptr: Migrate per-CPU slots to backup slot on context switch Mathieu Desnoyers
2025-12-18 16:20   ` Mathieu Desnoyers
2025-12-18 22:16   ` Boqun Feng
2025-12-19  0:21     ` Mathieu Desnoyers
2026-02-23 16:54       ` Boqun Feng
2026-02-23 19:40         ` Mathieu Desnoyers
2025-12-18 10:33 ` [RFC PATCH v4 0/4] Hazard Pointers Joel Fernandes
2025-12-18 17:54   ` Mathieu Desnoyers

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aUO9KRaP0FXpm_l9@tardis-2.local \
    --to=boqun.feng@gmail.com \
    --cc=Neeraj.Upadhyay@amd.com \
    --cc=akpm@linux-foundation.org \
    --cc=bigeasy@linutronix.de \
    --cc=frederic@kernel.org \
    --cc=gregkh@linuxfoundation.org \
    --cc=jiangshanlai@gmail.com \
    --cc=joel@joelfernandes.org \
    --cc=jonas.oberhauser@huaweicloud.com \
    --cc=josh@joshtriplett.org \
    --cc=jstultz@google.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=lkmm@lists.linux.dev \
    --cc=longman@redhat.com \
    --cc=maged.michael@gmail.com \
    --cc=mark.rutland@arm.com \
    --cc=mathieu.desnoyers@efficios.com \
    --cc=mingo@redhat.com \
    --cc=mjguzik@gmail.com \
    --cc=mpe@ellerman.id.au \
    --cc=npiggin@gmail.com \
    --cc=paulmck@kernel.org \
    --cc=peterz@infradead.org \
    --cc=qiang.zhang1211@gmail.com \
    --cc=rcu@vger.kernel.org \
    --cc=rostedt@goodmis.org \
    --cc=stern@rowland.harvard.edu \
    --cc=tglx@linutronix.de \
    --cc=torvalds@linux-foundation.org \
    --cc=urezki@gmail.com \
    --cc=vbabka@suse.cz \
    --cc=will@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.