All of lore.kernel.org
 help / color / mirror / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: Breno Leitao <leitao@debian.org>
Cc: "Thomas Gleixner" <tglx@kernel.org>,
	"Ingo Molnar" <mingo@redhat.com>,
	"Darren Hart" <dvhart@infradead.org>,
	"Davidlohr Bueso" <dave@stgolabs.net>,
	"André Almeida" <andrealmeid@igalia.com>,
	linux-kernel@vger.kernel.org, puranjay@kernel.org,
	rmikey@meta.com, stuclar@meta.com, namhyung@kernel.org,
	kernel-team@meta.com
Subject: Re: [PATCH RFC] futex: avoid false sharing between hb->chain and the bucket lock
Date: Tue, 9 Jun 2026 22:11:17 +0200	[thread overview]
Message-ID: <20260609201117.GA187714@noisy.programming.kicks-ass.net> (raw)
In-Reply-To: <aigTu8PqXn8wyhiK@gmail.com>

On Tue, Jun 09, 2026 at 08:28:16AM -0700, Breno Leitao wrote:

> > I'm really not sure, it *doubles* the futex memory cost.
> 
> I think it's worth the trade. The global hash scales linearly with
> num_possible_cpus(), so the extra bytes track the same curve as the machines
> that actually need the fix
> 
> in simpler words, a box big enough to feel this contention has plenty of RAM
> headroom to absorb it.

You might not have heard, but RAM has gotten ludicrously expensive.

Anyway, how does something like the below work for you? It's a total
hack job, but it (sorta) builds and runs.


---
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index ff2a4fb2993f..8555c76077af 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -124,7 +124,7 @@ late_initcall(fail_futex_debugfs);
 #endif /* CONFIG_FAIL_FUTEX */
 
 static struct futex_hash_bucket *
-__futex_hash(union futex_key *key, struct futex_private_hash *fph);
+__futex_hash(union futex_key *key, struct futex_private_hash *fph, struct futex_private_hash **fph_p);
 
 #ifdef CONFIG_FUTEX_PRIVATE_HASH
 static bool futex_ref_get(struct futex_private_hash *fph);
@@ -183,14 +183,6 @@ __futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
 {
 	u32 hash;
 
-	if (!futex_key_is_private(key))
-		return NULL;
-
-	if (!fph)
-		fph = rcu_dereference(key->private.mm->futex_phash);
-	if (!fph || !fph->hash_mask)
-		return NULL;
-
 	hash = jhash2((void *)&key->private.address,
 		      sizeof(key->private.address) / 4,
 		      key->both.offset);
@@ -211,13 +203,12 @@ static void futex_rehash_private(struct futex_private_hash *old,
 
 		spin_lock(&hb_old->lock);
 		plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) {
-
 			plist_del(&this->list, &hb_old->chain);
 			futex_hb_waiters_dec(hb_old);
 
 			WARN_ON_ONCE(this->lock_ptr != &hb_old->lock);
 
-			hb_new = __futex_hash(&this->key, new);
+			hb_new = __futex_hash(&this->key, new, NULL);
 			futex_hb_waiters_inc(hb_new);
 			/*
 			 * The new pointer isn't published yet but an already
@@ -299,18 +290,17 @@ struct futex_private_hash *futex_private_hash(void)
 	goto again;
 }
 
-struct futex_hash_bucket *futex_hash(union futex_key *key)
+struct futex_bucket_ref futex_hash(union futex_key *key)
 {
-	struct futex_private_hash *fph;
+	struct futex_private_hash *fph = NULL;
 	struct futex_hash_bucket *hb;
 
 again:
 	scoped_guard(rcu) {
-		hb = __futex_hash(key, NULL);
-		fph = hb->priv;
+		hb = __futex_hash(key, NULL, &fph);
 
 		if (!fph || futex_private_hash_get(fph))
-			return hb;
+			return (struct futex_bucket_ref){ .hb = hb, .fph = fph };
 	}
 	futex_pivot_hash(key->private.mm);
 	goto again;
@@ -412,17 +402,19 @@ static int futex_mpol(struct mm_struct *mm, unsigned long addr)
  * global hash is returned.
  */
 static struct futex_hash_bucket *
-__futex_hash(union futex_key *key, struct futex_private_hash *fph)
+__futex_hash(union futex_key *key, struct futex_private_hash *fph, struct futex_private_hash **fph_p)
 {
 	int node = key->both.node;
 	u32 hash;
 
-	if (node == FUTEX_NO_NODE) {
-		struct futex_hash_bucket *hb;
-
-		hb = __futex_hash_private(key, fph);
-		if (hb)
-			return hb;
+	if (node == FUTEX_NO_NODE && futex_key_is_private(key)) {
+		if (!fph)
+			fph = rcu_dereference(key->private.mm->futex_phash);
+		if (fph && fph->hash_mask) {
+			if (fph_p)
+				*fph_p = fph;
+			return __futex_hash_private(key, fph);
+		}
 	}
 
 	hash = jhash2((u32 *)key,
@@ -1348,7 +1340,8 @@ static void exit_pi_state_list(struct task_struct *curr)
 		pi_state = list_entry(next, struct futex_pi_state, list);
 		key = pi_state->key;
 		if (1) {
-			CLASS(hb, hb)(&key);
+			CLASS(hb, hbr)(&key);
+			struct futex_hash_bucket *hb = hbr.hb;
 
 			/*
 			 * We can race against put_pi_state() removing itself from the
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 9f6bf6f585fc..37fc944edeb9 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -2,6 +2,7 @@
 #ifndef _FUTEX_H
 #define _FUTEX_H
 
+#include "linux/mm_types.h"
 #include <linux/futex.h>
 #include <linux/rtmutex.h>
 #include <linux/sched/wake_q.h>
@@ -222,7 +223,6 @@ extern struct hrtimer_sleeper *
 futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
 		  int flags, u64 range_ns);
 
-extern struct futex_hash_bucket *futex_hash(union futex_key *key);
 #ifdef CONFIG_FUTEX_PRIVATE_HASH
 extern void futex_hash_get(struct futex_hash_bucket *hb);
 extern void futex_hash_put(struct futex_hash_bucket *hb);
@@ -237,8 +237,15 @@ static inline struct futex_private_hash *futex_private_hash(void) { return NULL;
 static inline void futex_private_hash_put(struct futex_private_hash *fph) { }
 #endif
 
-DEFINE_CLASS(hb, struct futex_hash_bucket *,
-	     if (_T) futex_hash_put(_T),
+struct futex_bucket_ref {
+	struct futex_hash_bucket *hb;
+	struct futex_private_hash *fph;
+};
+
+extern struct futex_bucket_ref futex_hash(union futex_key *key);
+
+DEFINE_CLASS(hb, struct futex_bucket_ref,
+	     if (_T.fph) futex_private_hash_put(_T.fph),
 	     futex_hash(key), union futex_key *key);
 
 DEFINE_CLASS(private_hash, struct futex_private_hash *,
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index 643199fdbe62..5c227a4d963d 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -945,7 +945,8 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
 
 retry_private:
 	if (1) {
-		CLASS(hb, hb)(&q.key);
+		CLASS(hb, hbr)(&q.key);
+		struct futex_hash_bucket *hb = hbr.hb;
 
 		futex_q_lock(&q, hb);
 
@@ -1101,9 +1102,9 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
 		futex_unqueue_pi(&q);
 		spin_unlock(q.lock_ptr);
 		if (q.drop_hb_ref) {
-			CLASS(hb, hb)(&q.key);
+			CLASS(hb, hbr)(&q.key);
 			/* Additional reference from futex_unlock_pi() */
-			futex_hash_put(hb);
+			futex_hash_put(hbr.hb);
 		}
 		goto out;
 
@@ -1162,7 +1163,8 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 	if (ret)
 		return ret;
 
-	CLASS(hb, hb)(&key);
+	CLASS(hb, hbr)(&key);
+	struct futex_hash_bucket *hb = hbr.hb;
 	spin_lock(&hb->lock);
 retry_hb:
 
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index 1d99a84dc9ad..8ae99b7cb873 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -459,8 +459,10 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
 
 retry_private:
 	if (1) {
-		CLASS(hb, hb1)(&key1);
-		CLASS(hb, hb2)(&key2);
+		CLASS(hb, hbr1)(&key1);
+		CLASS(hb, hbr2)(&key2);
+		struct futex_hash_bucket *hb1 = hbr1.hb;
+		struct futex_hash_bucket *hb2 = hbr2.hb;
 
 		futex_hb_waiters_inc(hb2);
 		double_lock_hb(hb1, hb2);
@@ -838,7 +840,8 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 	switch (futex_requeue_pi_wakeup_sync(&q)) {
 	case Q_REQUEUE_PI_IGNORE:
 		{
-			CLASS(hb, hb)(&q.key);
+			CLASS(hb, hbr)(&q.key);
+			struct futex_hash_bucket *hb = hbr.hb;
 			/* The waiter is still on uaddr1 */
 			spin_lock(&hb->lock);
 			ret = handle_early_requeue_pi_wakeup(hb, &q, to);
@@ -909,9 +912,9 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 		BUG();
 	}
 	if (q.drop_hb_ref) {
-		CLASS(hb, hb)(&q.key);
+		CLASS(hb, hbr)(&q.key);
 		/* Additional reference from requeue_pi_wake_futex() */
-		futex_hash_put(hb);
+		futex_hash_put(hbr.hb);
 	}
 
 out:
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index ceed9d879059..8c8e3ae899cb 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -169,7 +169,8 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 	if ((flags & FLAGS_STRICT) && !nr_wake)
 		return 0;
 
-	CLASS(hb, hb)(&key);
+	CLASS(hb, hbr)(&key);
+	struct futex_hash_bucket *hb = hbr.hb;
 
 	/* Make sure we really have tasks to wakeup */
 	if (!futex_hb_waiters_pending(hb))
@@ -266,8 +267,10 @@ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
 
 retry_private:
 	if (1) {
-		CLASS(hb, hb1)(&key1);
-		CLASS(hb, hb2)(&key2);
+		CLASS(hb, hbr1)(&key1);
+		CLASS(hb, hbr2)(&key2);
+		struct futex_hash_bucket *hb1 = hbr1.hb;
+		struct futex_hash_bucket *hb2 = hbr2.hb;
 
 		double_lock_hb(hb1, hb2);
 		op_ret = futex_atomic_op_inuser(op, uaddr2);
@@ -446,7 +449,8 @@ int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
 		u32 val = vs[i].w.val;
 
 		if (1) {
-			CLASS(hb, hb)(&q->key);
+			CLASS(hb, hbr)(&q->key);
+			struct futex_hash_bucket *hb = hbr.hb;
 
 			futex_q_lock(q, hb);
 			ret = futex_get_value_locked(&uval, uaddr);
@@ -621,7 +625,8 @@ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
 
 retry_private:
 	if (1) {
-		CLASS(hb, hb)(&q->key);
+		CLASS(hb, hbr)(&q->key);
+		struct futex_hash_bucket *hb = hbr.hb;
 
 		futex_q_lock(q, hb);
 

  reply	other threads:[~2026-06-09 20:11 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-05 16:53 [PATCH RFC] futex: avoid false sharing between hb->chain and the bucket lock Breno Leitao
2026-06-09 10:46 ` Peter Zijlstra
2026-06-09 15:28   ` Breno Leitao
2026-06-09 20:11     ` Peter Zijlstra [this message]
2026-06-09 20:18       ` Peter Zijlstra
2026-06-09 20:16     ` Thomas Gleixner
2026-06-09 20:23       ` Peter Zijlstra
2026-06-09 20:25       ` Peter Zijlstra
2026-06-09 20:32         ` Thomas Gleixner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260609201117.GA187714@noisy.programming.kicks-ass.net \
    --to=peterz@infradead.org \
    --cc=andrealmeid@igalia.com \
    --cc=dave@stgolabs.net \
    --cc=dvhart@infradead.org \
    --cc=kernel-team@meta.com \
    --cc=leitao@debian.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@redhat.com \
    --cc=namhyung@kernel.org \
    --cc=puranjay@kernel.org \
    --cc=rmikey@meta.com \
    --cc=stuclar@meta.com \
    --cc=tglx@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.