Re: [PATCH v10 20/21] futex: Implement FUTEX2_NUMA

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

From: Shrikanth Hegde <sshegde@linux.ibm.com>
To: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: "Peter Zijlstra" <peterz@infradead.org>,
	"André Almeida" <andrealmeid@igalia.com>,
	"Darren Hart" <dvhart@infradead.org>,
	"Davidlohr Bueso" <dave@stgolabs.net>,
	"Ingo Molnar" <mingo@redhat.com>,
	"Juri Lelli" <juri.lelli@redhat.com>,
	"Thomas Gleixner" <tglx@linutronix.de>,
	"Valentin Schneider" <vschneid@redhat.com>,
	"Waiman Long" <longman@redhat.com>,
	linux-kernel@vger.kernel.org
Subject: Re: [PATCH v10 20/21] futex: Implement FUTEX2_NUMA
Date: Wed, 26 Mar 2025 01:22:19 +0530	[thread overview]
Message-ID: <fc3a6902-a845-4b11-ace8-514f10194288@linux.ibm.com> (raw)
In-Reply-To: <20250312151634.2183278-21-bigeasy@linutronix.de>



On 3/12/25 20:46, Sebastian Andrzej Siewior wrote:
> From: Peter Zijlstra <peterz@infradead.org>
> 
> Extend the futex2 interface to be numa aware.
> 
> When FUTEX2_NUMA is specified for a futex, the user value is extended
> to two words (of the same size). The first is the user value we all
> know, the second one will be the node to place this futex on.
> 
>    struct futex_numa_32 {
> 	u32 val;
> 	u32 node;
>    };
> 
> When node is set to ~0, WAIT will set it to the current node_id such
> that WAKE knows where to find it. If userspace corrupts the node value
> between WAIT and WAKE, the futex will not be found and no wakeup will
> happen.
> 
> When FUTEX2_NUMA is not set, the node is simply an extention of the
> hash, such that traditional futexes are still interleaved over the
> nodes.
> 
> This is done to avoid having to have a separate !numa hash-table.
> 
> [bigeasy: ensure to have at least hashsize of 4 in futex_init(), add
> pr_info() for size and allocation information.]
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> ---
>   include/linux/futex.h      |   3 ++
>   include/uapi/linux/futex.h |   8 +++
>   kernel/futex/core.c        | 100 ++++++++++++++++++++++++++++++-------
>   kernel/futex/futex.h       |  33 ++++++++++--
>   4 files changed, 124 insertions(+), 20 deletions(-)
> 
> diff --git a/include/linux/futex.h b/include/linux/futex.h
> index 7e14d2e9162d2..19c37afa0432a 100644
> --- a/include/linux/futex.h
> +++ b/include/linux/futex.h
> @@ -34,6 +34,7 @@ union futex_key {
>   		u64 i_seq;
>   		unsigned long pgoff;
>   		unsigned int offset;
> +		/* unsigned int node; */
>   	} shared;
>   	struct {
>   		union {
> @@ -42,11 +43,13 @@ union futex_key {
>   		};
>   		unsigned long address;
>   		unsigned int offset;
> +		/* unsigned int node; */
>   	} private;
>   	struct {
>   		u64 ptr;
>   		unsigned long word;
>   		unsigned int offset;
> +		unsigned int node;	/* NOT hashed! */
>   	} both;
>   };
>   
> diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
> index d2ee625ea1890..0435025beaae8 100644
> --- a/include/uapi/linux/futex.h
> +++ b/include/uapi/linux/futex.h
> @@ -74,6 +74,14 @@
>   /* do not use */
>   #define FUTEX_32		FUTEX2_SIZE_U32 /* historical accident :-( */
>   
> +
> +/*
> + * When FUTEX2_NUMA doubles the futex word, the second word is a node value.
> + * The special value -1 indicates no-node. This is the same value as
> + * NUMA_NO_NODE, except that value is not ABI, this is.
> + */
> +#define FUTEX_NO_NODE		(-1)
> +
>   /*
>    * Max numbers of elements in a futex_waitv array
>    */
> diff --git a/kernel/futex/core.c b/kernel/futex/core.c
> index bc7451287b2ce..b9da7dc6a900a 100644
> --- a/kernel/futex/core.c
> +++ b/kernel/futex/core.c
> @@ -36,6 +36,8 @@
>   #include <linux/pagemap.h>
>   #include <linux/debugfs.h>
>   #include <linux/plist.h>
> +#include <linux/gfp.h>
> +#include <linux/vmalloc.h>
>   #include <linux/memblock.h>
>   #include <linux/fault-inject.h>
>   #include <linux/slab.h>
> @@ -51,11 +53,14 @@
>    * reside in the same cacheline.
>    */
>   static struct {
> -	struct futex_hash_bucket *queues;
>   	unsigned long            hashmask;
> +	unsigned int		 hashshift;
> +	struct futex_hash_bucket *queues[MAX_NUMNODES];
>   } __futex_data __read_mostly __aligned(2*sizeof(long));
> -#define futex_queues   (__futex_data.queues)
> -#define futex_hashmask (__futex_data.hashmask)
> +
> +#define futex_hashmask	(__futex_data.hashmask)
> +#define futex_hashshift	(__futex_data.hashshift)
> +#define futex_queues	(__futex_data.queues)
>   
>   struct futex_private_hash {
>   	rcuref_t	users;
> @@ -326,15 +331,35 @@ __futex_hash(union futex_key *key, struct futex_private_hash *fph)
>   {
>   	struct futex_hash_bucket *hb;
>   	u32 hash;
> +	int node;
>   
>   	hb = __futex_hash_private(key, fph);
>   	if (hb)
>   		return hb;
>   
>   	hash = jhash2((u32 *)key,
> -		      offsetof(typeof(*key), both.offset) / 4,
> +		      offsetof(typeof(*key), both.offset) / sizeof(u32),
>   		      key->both.offset);
> -	return &futex_queues[hash & futex_hashmask];
> +	node = key->both.node;
> +
> +	if (node == FUTEX_NO_NODE) {
> +		/*
> +		 * In case of !FLAGS_NUMA, use some unused hash bits to pick a
> +		 * node -- this ensures regular futexes are interleaved across
> +		 * the nodes and avoids having to allocate multiple
> +		 * hash-tables.
> +		 *
> +		 * NOTE: this isn't perfectly uniform, but it is fast and
> +		 * handles sparse node masks.
> +		 */
> +		node = (hash >> futex_hashshift) % nr_node_ids;
> +		if (!node_possible(node)) {
> +			node = find_next_bit_wrap(node_possible_map.bits,
> +						  nr_node_ids, node);
> +		}
> +	}
> +
> +	return &futex_queues[node][hash & futex_hashmask];

IIUC, when one specifies the numa node it can't be private futex anymore?


>   }
>   
>   /**
> @@ -441,25 +466,49 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
>   	struct page *page;
>   	struct folio *folio;
>   	struct address_space *mapping;
> -	int err, ro = 0;
> +	int node, err, size, ro = 0;
>   	bool fshared;
>   
>   	fshared = flags & FLAGS_SHARED;
> +	size = futex_size(flags);
> +	if (flags & FLAGS_NUMA)
> +		size *= 2;
>   
>   	/*
>   	 * The futex address must be "naturally" aligned.
>   	 */
>   	key->both.offset = address % PAGE_SIZE;
> -	if (unlikely((address % sizeof(u32)) != 0))
> +	if (unlikely((address % size) != 0))
>   		return -EINVAL;
>   	address -= key->both.offset;
>   
> -	if (unlikely(!access_ok(uaddr, sizeof(u32))))
> +	if (unlikely(!access_ok(uaddr, size)))
>   		return -EFAULT;
>   
>   	if (unlikely(should_fail_futex(fshared)))
>   		return -EFAULT;
>   
> +	if (flags & FLAGS_NUMA) {
> +		u32 __user *naddr = uaddr + size / 2;
> +
> +		if (futex_get_value(&node, naddr))
> +			return -EFAULT;
> +
> +		if (node == FUTEX_NO_NODE) {
> +			node = numa_node_id();
> +			if (futex_put_value(node, naddr))
> +				return -EFAULT;
> +
> +		} else if (node >= MAX_NUMNODES || !node_possible(node)) {
> +			return -EINVAL;
> +		}
> +
> +		key->both.node = node;
> +
> +	} else {
> +		key->both.node = FUTEX_NO_NODE;
> +	}
> +
>   	/*
>   	 * PROCESS_PRIVATE futexes are fast.
>   	 * As the mm cannot disappear under us and the 'key' only needs
> @@ -1597,24 +1646,41 @@ int futex_hash_prctl(unsigned long arg2, unsigned long arg3)
>   static int __init futex_init(void)
>   {
>   	unsigned long hashsize, i;
> -	unsigned int futex_shift;
> +	unsigned int order, n;
> +	unsigned long size;
>   
>   #ifdef CONFIG_BASE_SMALL
>   	hashsize = 16;
>   #else
> -	hashsize = roundup_pow_of_two(256 * num_possible_cpus());
> +	hashsize = 256 * num_possible_cpus();
> +	hashsize /= num_possible_nodes();

Wouldn't it be better to use num_online_nodes? each node may get a bigger
hash bucket which means less collision no?


> +	hashsize = max(4, hashsize);
> +	hashsize = roundup_pow_of_two(hashsize);
>   #endif
> +	futex_hashshift = ilog2(hashsize);
> +	size = sizeof(struct futex_hash_bucket) * hashsize;
> +	order = get_order(size);
>   
> -	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
> -					       hashsize, 0, 0,
> -					       &futex_shift, NULL,
> -					       hashsize, hashsize);
> -	hashsize = 1UL << futex_shift;
> +	for_each_node(n) {
> +		struct futex_hash_bucket *table;
>   
> -	for (i = 0; i < hashsize; i++)
> -		futex_hash_bucket_init(&futex_queues[i], NULL);
> +		if (order > MAX_PAGE_ORDER)
> +			table = vmalloc_huge_node(size, GFP_KERNEL, n);
> +		else
> +			table = alloc_pages_exact_nid(n, size, GFP_KERNEL);
> +
> +		BUG_ON(!table);
> +
> +		for (i = 0; i < hashsize; i++)
> +			futex_hash_bucket_init(&table[i], NULL);
> +
> +		futex_queues[n] = table;
> +	}
>   
>   	futex_hashmask = hashsize - 1;
> +	pr_info("futex hash table entries: %lu (%lu bytes on %d NUMA nodes, total %lu KiB, %s).\n",
> +		hashsize, size, num_possible_nodes(), size * num_possible_nodes() / 1024,
> +		order > MAX_PAGE_ORDER ? "vmalloc" : "linear");
>   	return 0;
>   }
>   core_initcall(futex_init);
> diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
> index 8eba9982bcae1..11c870a92b5d0 100644
> --- a/kernel/futex/futex.h
> +++ b/kernel/futex/futex.h
> @@ -54,7 +54,7 @@ static inline unsigned int futex_to_flags(unsigned int op)
>   	return flags;
>   }
>   
> -#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE)
> +#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_NUMA | FUTEX2_PRIVATE)
>   
>   /* FUTEX2_ to FLAGS_ */
>   static inline unsigned int futex2_to_flags(unsigned int flags2)
> @@ -87,6 +87,19 @@ static inline bool futex_flags_valid(unsigned int flags)
>   	if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32)
>   		return false;
>   
> +	/*
> +	 * Must be able to represent both FUTEX_NO_NODE and every valid nodeid
> +	 * in a futex word.
> +	 */
> +	if (flags & FLAGS_NUMA) {
> +		int bits = 8 * futex_size(flags);
> +		u64 max = ~0ULL;
> +
> +		max >>= 64 - bits;
> +		if (nr_node_ids >= max)
> +			return false;
> +	}
> +
>   	return true;
>   }
>   
> @@ -290,7 +303,7 @@ static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32
>    * This looks a bit overkill, but generally just results in a couple
>    * of instructions.
>    */
> -static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from)
> +static __always_inline int futex_get_value(u32 *dest, u32 __user *from)
>   {
>   	u32 val;
>   
> @@ -307,12 +320,26 @@ static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from)
>   	return -EFAULT;
>   }
>   
> +static __always_inline int futex_put_value(u32 val, u32 __user *to)
> +{
> +	if (can_do_masked_user_access())
> +		to = masked_user_access_begin(to);
> +	else if (!user_read_access_begin(to, sizeof(*to)))
> +		return -EFAULT;
> +	unsafe_put_user(val, to, Efault);
> +	user_read_access_end();
> +	return 0;
> +Efault:
> +	user_read_access_end();
> +	return -EFAULT;
> +}
> +
>   static inline int futex_get_value_locked(u32 *dest, u32 __user *from)
>   {
>   	int ret;
>   
>   	pagefault_disable();
> -	ret = futex_read_inatomic(dest, from);
> +	ret = futex_get_value(dest, from);
>   	pagefault_enable();
>   
>   	return ret;

next prev parent reply	other threads:[~2025-03-25 19:52 UTC|newest]

Thread overview: 58+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-03-12 15:16 [PATCH v10 00/21] futex: Add support task local hash maps, FUTEX2_NUMA and FUTEX2_MPOL Sebastian Andrzej Siewior
2025-03-12 15:16 ` [PATCH v10 01/21] rcuref: Provide rcuref_is_dead() Sebastian Andrzej Siewior
2025-03-13  4:23   ` Joel Fernandes
2025-03-13  7:55     ` Sebastian Andrzej Siewior
2025-03-14 10:36   ` Peter Zijlstra
2025-03-12 15:16 ` [PATCH v10 02/21] futex: Move futex_queue() into futex_wait_setup() Sebastian Andrzej Siewior
2025-03-12 15:16 ` [PATCH v10 03/21] futex: Pull futex_hash() out of futex_q_lock() Sebastian Andrzej Siewior
2025-03-12 15:16 ` [PATCH v10 04/21] futex: Create hb scopes Sebastian Andrzej Siewior
2025-03-12 15:16 ` [PATCH v10 05/21] futex: Create futex_hash() get/put class Sebastian Andrzej Siewior
2025-03-12 15:16 ` [PATCH v10 06/21] futex: Create helper function to initialize a hash slot Sebastian Andrzej Siewior
2025-03-12 15:16 ` [PATCH v10 07/21] futex: Add basic infrastructure for local task local hash Sebastian Andrzej Siewior
2025-03-12 15:16 ` [PATCH v10 08/21] futex: Hash only the address for private futexes Sebastian Andrzej Siewior
2025-03-12 15:16 ` [PATCH v10 09/21] futex: Allow automatic allocation of process wide futex hash Sebastian Andrzej Siewior
2025-03-12 15:16 ` [PATCH v10 10/21] futex: Decrease the waiter count before the unlock operation Sebastian Andrzej Siewior
2025-03-12 15:16 ` [PATCH v10 11/21] futex: Introduce futex_q_lockptr_lock() Sebastian Andrzej Siewior
2025-03-12 15:16 ` [PATCH v10 12/21] futex: Acquire a hash reference in futex_wait_multiple_setup() Sebastian Andrzej Siewior
2025-03-12 15:16 ` [PATCH v10 13/21] futex: Allow to re-allocate the private local hash Sebastian Andrzej Siewior
2025-03-12 15:16 ` [PATCH v10 14/21] futex: Resize local futex hash table based on number of threads Sebastian Andrzej Siewior
2025-03-12 15:16 ` [PATCH v10 15/21] futex: s/hb_p/fph/ Sebastian Andrzej Siewior
2025-03-14 12:36   ` Peter Zijlstra
2025-03-14 13:10     ` Sebastian Andrzej Siewior
2025-03-12 15:16 ` [PATCH v10 16/21] futex: Remove superfluous state Sebastian Andrzej Siewior
2025-03-12 15:16 ` [PATCH v10 17/21] futex: Untangle and naming Sebastian Andrzej Siewior
2025-03-12 15:16 ` [PATCH v10 18/21] futex: Rework SET_SLOTS Sebastian Andrzej Siewior
2025-03-26 15:37   ` Sebastian Andrzej Siewior
2025-03-12 15:16 ` [PATCH v10 19/21] mm: Add vmalloc_huge_node() Sebastian Andrzej Siewior
2025-03-12 22:02   ` Andrew Morton
2025-03-13  7:59     ` Sebastian Andrzej Siewior
2025-03-13 22:08       ` Andrew Morton
2025-03-14  9:59         ` Sebastian Andrzej Siewior
2025-03-14 10:34           ` Andrew Morton
2025-03-12 15:16 ` [PATCH v10 20/21] futex: Implement FUTEX2_NUMA Sebastian Andrzej Siewior
2025-03-25 19:52   ` Shrikanth Hegde [this message]
2025-03-25 22:52     ` Peter Zijlstra
2025-03-25 22:56     ` Peter Zijlstra
2025-03-26 12:57       ` Shrikanth Hegde
2025-03-26 13:37         ` Peter Zijlstra
2025-03-26 15:06           ` Shrikanth Hegde
2025-03-26  8:03     ` Sebastian Andrzej Siewior
2025-03-12 15:16 ` [PATCH v10 21/21] futex: Implement FUTEX2_MPOL Sebastian Andrzej Siewior
2025-03-12 15:18 ` [PATCH v10 00/21] futex: Add support task local hash maps, FUTEX2_NUMA and FUTEX2_MPOL Sebastian Andrzej Siewior
2025-03-14 10:42   ` Peter Zijlstra
2025-03-14 10:58   ` Peter Zijlstra
2025-03-14 11:28     ` Sebastian Andrzej Siewior
2025-03-14 11:41       ` Peter Zijlstra
2025-03-14 12:00         ` Sebastian Andrzej Siewior
2025-03-14 12:30           ` Peter Zijlstra
2025-03-14 13:30             ` Sebastian Andrzej Siewior
2025-03-14 14:18               ` Peter Zijlstra
2025-03-14 14:40             ` Paul E. McKenney
2025-03-18 13:24 ` Shrikanth Hegde
2025-03-18 16:12   ` Davidlohr Bueso
2025-03-25 19:04   ` Shrikanth Hegde
2025-03-26  9:31     ` Sebastian Andrzej Siewior
2025-03-26 12:54       ` Shrikanth Hegde
2025-03-26 14:01         ` Sebastian Andrzej Siewior
2025-03-26  8:49   ` Sebastian Andrzej Siewior
2025-04-07 16:15   ` Sebastian Andrzej Siewior

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=fc3a6902-a845-4b11-ace8-514f10194288@linux.ibm.com \
    --to=sshegde@linux.ibm.com \
    --cc=andrealmeid@igalia.com \
    --cc=bigeasy@linutronix.de \
    --cc=dave@stgolabs.net \
    --cc=dvhart@infradead.org \
    --cc=juri.lelli@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=longman@redhat.com \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=tglx@linutronix.de \
    --cc=vschneid@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox