* [patch 1/8] futex: Move futex task related data into a struct
2026-03-16 17:12 [patch 0/8] futex: Address the robust futex unlock race for real Thomas Gleixner
@ 2026-03-16 17:12 ` Thomas Gleixner
2026-03-16 17:55 ` Mathieu Desnoyers
2026-03-17 2:24 ` André Almeida
2026-03-16 17:13 ` [patch 2/8] futex: Move futex related mm_struct " Thomas Gleixner
` (6 subsequent siblings)
7 siblings, 2 replies; 57+ messages in thread
From: Thomas Gleixner @ 2026-03-16 17:12 UTC (permalink / raw)
To: LKML
Cc: Mathieu Desnoyers, André Almeida, Sebastian Andrzej Siewior,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
Having all these members in task_struct along with the required #ifdeffery
is annoying, does not allow efficient initializing of the data with
memset() and makes extending it tedious.
Move it into a data structure and fix up all usage sites.
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
---
Documentation/locking/robust-futexes.rst | 8 ++--
include/linux/futex.h | 12 ++----
include/linux/futex_types.h | 34 +++++++++++++++++++
include/linux/sched.h | 16 ++-------
kernel/exit.c | 4 +-
kernel/futex/core.c | 55 +++++++++++++++----------------
kernel/futex/pi.c | 26 +++++++-------
kernel/futex/syscalls.c | 23 ++++--------
8 files changed, 97 insertions(+), 81 deletions(-)
--- a/Documentation/locking/robust-futexes.rst
+++ b/Documentation/locking/robust-futexes.rst
@@ -94,7 +94,7 @@ time, the kernel checks this user-space
locks to be cleaned up?
In the common case, at do_exit() time, there is no list registered, so
-the cost of robust futexes is just a simple current->robust_list != NULL
+the cost of robust futexes is just a current->futex.robust_list != NULL
comparison. If the thread has registered a list, then normally the list
is empty. If the thread/process crashed or terminated in some incorrect
way then the list might be non-empty: in this case the kernel carefully
@@ -178,9 +178,9 @@ The patch adds two new syscalls: one to
size_t __user *len_ptr);
List registration is very fast: the pointer is simply stored in
-current->robust_list. [Note that in the future, if robust futexes become
-widespread, we could extend sys_clone() to register a robust-list head
-for new threads, without the need of another syscall.]
+current->futex.robust_list. [Note that in the future, if robust futexes
+become widespread, we could extend sys_clone() to register a robust-list
+head for new threads, without the need of another syscall.]
So there is virtually zero overhead for tasks not using robust futexes,
and even for robust futex users, there is only one extra syscall per
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -64,14 +64,10 @@ enum {
static inline void futex_init_task(struct task_struct *tsk)
{
- tsk->robust_list = NULL;
-#ifdef CONFIG_COMPAT
- tsk->compat_robust_list = NULL;
-#endif
- INIT_LIST_HEAD(&tsk->pi_state_list);
- tsk->pi_state_cache = NULL;
- tsk->futex_state = FUTEX_STATE_OK;
- mutex_init(&tsk->futex_exit_mutex);
+ memset(&tsk->futex, 0, sizeof(tsk->futex));
+ INIT_LIST_HEAD(&tsk->futex.pi_state_list);
+ tsk->futex.state = FUTEX_STATE_OK;
+ mutex_init(&tsk->futex.exit_mutex);
}
void futex_exit_recursive(struct task_struct *tsk);
--- /dev/null
+++ b/include/linux/futex_types.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_FUTEX_TYPES_H
+#define _LINUX_FUTEX_TYPES_H
+
+#ifdef CONFIG_FUTEX
+#include <linux/mutex_types.h>
+#include <linux/types.h>
+
+struct compat_robust_list_head;
+struct futex_pi_state;
+struct robust_list_head;
+
+/**
+ * struct futex_ctrl - Futex related per task data
+ * @robust_list: User space registered robust list pointer
+ * @compat_robust_list: User space registered robust list pointer for compat tasks
+ * @exit_mutex: Mutex for serializing exit
+ * @state: Futex handling state to handle exit races correctly
+ */
+struct futex_ctrl {
+ struct robust_list_head __user *robust_list;
+#ifdef CONFIG_COMPAT
+ struct compat_robust_list_head __user *compat_robust_list;
+#endif
+ struct list_head pi_state_list;
+ struct futex_pi_state *pi_state_cache;
+ struct mutex exit_mutex;
+ unsigned int state;
+};
+#else
+struct futex_ctrl { };
+#endif /* !CONFIG_FUTEX */
+
+#endif /* _LINUX_FUTEX_TYPES_H */
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -16,6 +16,7 @@
#include <linux/cpumask_types.h>
#include <linux/cache.h>
+#include <linux/futex_types.h>
#include <linux/irqflags_types.h>
#include <linux/smp_types.h>
#include <linux/pid_types.h>
@@ -64,7 +65,6 @@ struct bpf_net_context;
struct capture_control;
struct cfs_rq;
struct fs_struct;
-struct futex_pi_state;
struct io_context;
struct io_uring_task;
struct mempolicy;
@@ -76,7 +76,6 @@ struct pid_namespace;
struct pipe_inode_info;
struct rcu_node;
struct reclaim_state;
-struct robust_list_head;
struct root_domain;
struct rq;
struct sched_attr;
@@ -1329,16 +1328,9 @@ struct task_struct {
u32 closid;
u32 rmid;
#endif
-#ifdef CONFIG_FUTEX
- struct robust_list_head __user *robust_list;
-#ifdef CONFIG_COMPAT
- struct compat_robust_list_head __user *compat_robust_list;
-#endif
- struct list_head pi_state_list;
- struct futex_pi_state *pi_state_cache;
- struct mutex futex_exit_mutex;
- unsigned int futex_state;
-#endif
+
+ struct futex_ctrl futex;
+
#ifdef CONFIG_PERF_EVENTS
u8 perf_recursion[PERF_NR_CONTEXTS];
struct perf_event_context *perf_event_ctxp;
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -989,8 +989,8 @@ void __noreturn do_exit(long code)
proc_exit_connector(tsk);
mpol_put_task_policy(tsk);
#ifdef CONFIG_FUTEX
- if (unlikely(current->pi_state_cache))
- kfree(current->pi_state_cache);
+ if (unlikely(current->futex.pi_state_cache))
+ kfree(current->futex.pi_state_cache);
#endif
/*
* Make sure we are holding no locks:
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -32,18 +32,19 @@
* "But they come in a choice of three flavours!"
*/
#include <linux/compat.h>
-#include <linux/jhash.h>
-#include <linux/pagemap.h>
#include <linux/debugfs.h>
-#include <linux/plist.h>
+#include <linux/fault-inject.h>
#include <linux/gfp.h>
-#include <linux/vmalloc.h>
+#include <linux/jhash.h>
#include <linux/memblock.h>
-#include <linux/fault-inject.h>
-#include <linux/slab.h>
-#include <linux/prctl.h>
#include <linux/mempolicy.h>
#include <linux/mmap_lock.h>
+#include <linux/pagemap.h>
+#include <linux/plist.h>
+#include <linux/prctl.h>
+#include <linux/rseq.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
#include "futex.h"
#include "../locking/rtmutex_common.h"
@@ -829,7 +830,7 @@ void wait_for_owner_exiting(int ret, str
if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
return;
- mutex_lock(&exiting->futex_exit_mutex);
+ mutex_lock(&exiting->futex.exit_mutex);
/*
* No point in doing state checking here. If the waiter got here
* while the task was in exec()->exec_futex_release() then it can
@@ -838,7 +839,7 @@ void wait_for_owner_exiting(int ret, str
* already. Highly unlikely and not a problem. Just one more round
* through the futex maze.
*/
- mutex_unlock(&exiting->futex_exit_mutex);
+ mutex_unlock(&exiting->futex.exit_mutex);
put_task_struct(exiting);
}
@@ -1048,7 +1049,7 @@ static int handle_futex_death(u32 __user
*
* In both cases the following conditions are met:
*
- * 1) task->robust_list->list_op_pending != NULL
+ * 1) task->futex.robust_list->list_op_pending != NULL
* @pending_op == true
* 2) The owner part of user space futex value == 0
* 3) Regular futex: @pi == false
@@ -1153,7 +1154,7 @@ static inline int fetch_robust_entry(str
*/
static void exit_robust_list(struct task_struct *curr)
{
- struct robust_list_head __user *head = curr->robust_list;
+ struct robust_list_head __user *head = curr->futex.robust_list;
struct robust_list __user *entry, *next_entry, *pending;
unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
unsigned int next_pi;
@@ -1247,7 +1248,7 @@ compat_fetch_robust_entry(compat_uptr_t
*/
static void compat_exit_robust_list(struct task_struct *curr)
{
- struct compat_robust_list_head __user *head = curr->compat_robust_list;
+ struct compat_robust_list_head __user *head = curr->futex.compat_robust_list;
struct robust_list __user *entry, *next_entry, *pending;
unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
unsigned int next_pi;
@@ -1323,7 +1324,7 @@ static void compat_exit_robust_list(stru
*/
static void exit_pi_state_list(struct task_struct *curr)
{
- struct list_head *next, *head = &curr->pi_state_list;
+ struct list_head *next, *head = &curr->futex.pi_state_list;
struct futex_pi_state *pi_state;
union futex_key key = FUTEX_KEY_INIT;
@@ -1407,19 +1408,19 @@ static inline void exit_pi_state_list(st
static void futex_cleanup(struct task_struct *tsk)
{
- if (unlikely(tsk->robust_list)) {
+ if (unlikely(tsk->futex.robust_list)) {
exit_robust_list(tsk);
- tsk->robust_list = NULL;
+ tsk->futex.robust_list = NULL;
}
#ifdef CONFIG_COMPAT
- if (unlikely(tsk->compat_robust_list)) {
+ if (unlikely(tsk->futex.compat_robust_list)) {
compat_exit_robust_list(tsk);
- tsk->compat_robust_list = NULL;
+ tsk->futex.compat_robust_list = NULL;
}
#endif
- if (unlikely(!list_empty(&tsk->pi_state_list)))
+ if (unlikely(!list_empty(&tsk->futex.pi_state_list)))
exit_pi_state_list(tsk);
}
@@ -1442,10 +1443,10 @@ static void futex_cleanup(struct task_st
*/
void futex_exit_recursive(struct task_struct *tsk)
{
- /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
- if (tsk->futex_state == FUTEX_STATE_EXITING)
- mutex_unlock(&tsk->futex_exit_mutex);
- tsk->futex_state = FUTEX_STATE_DEAD;
+ /* If the state is FUTEX_STATE_EXITING then futex.exit_mutex is held */
+ if (tsk->futex.state == FUTEX_STATE_EXITING)
+ mutex_unlock(&tsk->futex.exit_mutex);
+ tsk->futex.state = FUTEX_STATE_DEAD;
}
static void futex_cleanup_begin(struct task_struct *tsk)
@@ -1453,10 +1454,10 @@ static void futex_cleanup_begin(struct t
/*
* Prevent various race issues against a concurrent incoming waiter
* including live locks by forcing the waiter to block on
- * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
+ * tsk->futex.exit_mutex when it observes FUTEX_STATE_EXITING in
* attach_to_pi_owner().
*/
- mutex_lock(&tsk->futex_exit_mutex);
+ mutex_lock(&tsk->futex.exit_mutex);
/*
* Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
@@ -1470,7 +1471,7 @@ static void futex_cleanup_begin(struct t
* be observed in exit_pi_state_list().
*/
raw_spin_lock_irq(&tsk->pi_lock);
- tsk->futex_state = FUTEX_STATE_EXITING;
+ tsk->futex.state = FUTEX_STATE_EXITING;
raw_spin_unlock_irq(&tsk->pi_lock);
}
@@ -1480,12 +1481,12 @@ static void futex_cleanup_end(struct tas
* Lockless store. The only side effect is that an observer might
* take another loop until it becomes visible.
*/
- tsk->futex_state = state;
+ tsk->futex.state = state;
/*
* Drop the exit protection. This unblocks waiters which observed
* FUTEX_STATE_EXITING to reevaluate the state.
*/
- mutex_unlock(&tsk->futex_exit_mutex);
+ mutex_unlock(&tsk->futex.exit_mutex);
}
void futex_exec_release(struct task_struct *tsk)
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -14,7 +14,7 @@ int refill_pi_state_cache(void)
{
struct futex_pi_state *pi_state;
- if (likely(current->pi_state_cache))
+ if (likely(current->futex.pi_state_cache))
return 0;
pi_state = kzalloc_obj(*pi_state);
@@ -28,17 +28,17 @@ int refill_pi_state_cache(void)
refcount_set(&pi_state->refcount, 1);
pi_state->key = FUTEX_KEY_INIT;
- current->pi_state_cache = pi_state;
+ current->futex.pi_state_cache = pi_state;
return 0;
}
static struct futex_pi_state *alloc_pi_state(void)
{
- struct futex_pi_state *pi_state = current->pi_state_cache;
+ struct futex_pi_state *pi_state = current->futex.pi_state_cache;
WARN_ON(!pi_state);
- current->pi_state_cache = NULL;
+ current->futex.pi_state_cache = NULL;
return pi_state;
}
@@ -60,7 +60,7 @@ static void pi_state_update_owner(struct
if (new_owner) {
raw_spin_lock(&new_owner->pi_lock);
WARN_ON(!list_empty(&pi_state->list));
- list_add(&pi_state->list, &new_owner->pi_state_list);
+ list_add(&pi_state->list, &new_owner->futex.pi_state_list);
pi_state->owner = new_owner;
raw_spin_unlock(&new_owner->pi_lock);
}
@@ -96,7 +96,7 @@ void put_pi_state(struct futex_pi_state
raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
}
- if (current->pi_state_cache) {
+ if (current->futex.pi_state_cache) {
kfree(pi_state);
} else {
/*
@@ -106,7 +106,7 @@ void put_pi_state(struct futex_pi_state
*/
pi_state->owner = NULL;
refcount_set(&pi_state->refcount, 1);
- current->pi_state_cache = pi_state;
+ current->futex.pi_state_cache = pi_state;
}
}
@@ -179,7 +179,7 @@ void put_pi_state(struct futex_pi_state
*
* p->pi_lock:
*
- * p->pi_state_list -> pi_state->list, relation
+ * p->futex.pi_state_list -> pi_state->list, relation
* pi_mutex->owner -> pi_state->owner, relation
*
* pi_state->refcount:
@@ -327,7 +327,7 @@ static int handle_exit_race(u32 __user *
* If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
* caller that the alleged owner is busy.
*/
- if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
+ if (tsk && tsk->futex.state != FUTEX_STATE_DEAD)
return -EBUSY;
/*
@@ -346,8 +346,8 @@ static int handle_exit_race(u32 __user *
* *uaddr = 0xC0000000; tsk = get_task(PID);
* } if (!tsk->flags & PF_EXITING) {
* ... attach();
- * tsk->futex_state = } else {
- * FUTEX_STATE_DEAD; if (tsk->futex_state !=
+ * tsk->futex.state = } else {
+ * FUTEX_STATE_DEAD; if (tsk->futex.state !=
* FUTEX_STATE_DEAD)
* return -EAGAIN;
* return -ESRCH; <--- FAIL
@@ -395,7 +395,7 @@ static void __attach_to_pi_owner(struct
pi_state->key = *key;
WARN_ON(!list_empty(&pi_state->list));
- list_add(&pi_state->list, &p->pi_state_list);
+ list_add(&pi_state->list, &p->futex.pi_state_list);
/*
* Assignment without holding pi_state->pi_mutex.wait_lock is safe
* because there is no concurrency as the object is not published yet.
@@ -439,7 +439,7 @@ static int attach_to_pi_owner(u32 __user
* in futex_exit_release(), we do this protected by p->pi_lock:
*/
raw_spin_lock_irq(&p->pi_lock);
- if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
+ if (unlikely(p->futex.state != FUTEX_STATE_OK)) {
/*
* The task is on the way out. When the futex state is
* FUTEX_STATE_DEAD, we know that the task has finished
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -25,17 +25,13 @@
* @head: pointer to the list-head
* @len: length of the list-head, as userspace expects
*/
-SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
- size_t, len)
+SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, size_t, len)
{
- /*
- * The kernel knows only one size for now:
- */
+ /* The kernel knows only one size for now. */
if (unlikely(len != sizeof(*head)))
return -EINVAL;
- current->robust_list = head;
-
+ current->futex.robust_list = head;
return 0;
}
@@ -43,9 +39,9 @@ static inline void __user *futex_task_ro
{
#ifdef CONFIG_COMPAT
if (compat)
- return p->compat_robust_list;
+ return p->futex.compat_robust_list;
#endif
- return p->robust_list;
+ return p->futex.robust_list;
}
static void __user *futex_get_robust_list_common(int pid, bool compat)
@@ -467,15 +463,13 @@ SYSCALL_DEFINE4(futex_requeue,
}
#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE2(set_robust_list,
- struct compat_robust_list_head __user *, head,
- compat_size_t, len)
+COMPAT_SYSCALL_DEFINE2(set_robust_list, struct compat_robust_list_head __user *, head,
+ compat_size_t, len)
{
if (unlikely(len != sizeof(*head)))
return -EINVAL;
- current->compat_robust_list = head;
-
+ current->futex.compat_robust_list = head;
return 0;
}
@@ -515,4 +509,3 @@ SYSCALL_DEFINE6(futex_time32, u32 __user
return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
}
#endif /* CONFIG_COMPAT_32BIT_TIME */
-
^ permalink raw reply [flat|nested] 57+ messages in thread* Re: [patch 1/8] futex: Move futex task related data into a struct
2026-03-16 17:12 ` [patch 1/8] futex: Move futex task related data into a struct Thomas Gleixner
@ 2026-03-16 17:55 ` Mathieu Desnoyers
2026-03-17 2:24 ` André Almeida
1 sibling, 0 replies; 57+ messages in thread
From: Mathieu Desnoyers @ 2026-03-16 17:55 UTC (permalink / raw)
To: Thomas Gleixner, LKML
Cc: André Almeida, Sebastian Andrzej Siewior,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
On 2026-03-16 13:12, Thomas Gleixner wrote:
> Having all these members in task_struct along with the required #ifdeffery
> is annoying, does not allow efficient initializing of the data with
> memset() and makes extending it tedious.
>
> Move it into a data structure and fix up all usage sites.
>
> Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [patch 1/8] futex: Move futex task related data into a struct
2026-03-16 17:12 ` [patch 1/8] futex: Move futex task related data into a struct Thomas Gleixner
2026-03-16 17:55 ` Mathieu Desnoyers
@ 2026-03-17 2:24 ` André Almeida
2026-03-17 9:52 ` Thomas Gleixner
1 sibling, 1 reply; 57+ messages in thread
From: André Almeida @ 2026-03-17 2:24 UTC (permalink / raw)
To: Thomas Gleixner
Cc: Mathieu Desnoyers, Sebastian Andrzej Siewior, Carlos O'Donell,
Peter Zijlstra, Florian Weimer, Rich Felker, Torvald Riegel,
Darren Hart, Ingo Molnar, Davidlohr Bueso, Arnd Bergmann,
Liam R . Howlett, LKML
Em 16/03/2026 14:12, Thomas Gleixner escreveu:
> Having all these members in task_struct along with the required #ifdeffery
> is annoying, does not allow efficient initializing of the data with
> memset() and makes extending it tedious.
>
> Move it into a data structure and fix up all usage sites.
>
> Signed-off-by: Thomas Gleixner <tglx@kernel.org>
[...]
> +
> +#ifdef CONFIG_FUTEX
> +#include <linux/mutex_types.h>
> +#include <linux/types.h>
> +
> +struct compat_robust_list_head;
> +struct futex_pi_state;
> +struct robust_list_head;
> +
> +/**
> + * struct futex_ctrl - Futex related per task data
maybe futex_task_data to match futex_mm_data?
> + * @robust_list: User space registered robust list pointer
> + * @compat_robust_list: User space registered robust list pointer for compat tasks
to avoid kernel-doc warnings:
@pi_state_list: List of PI mutexes held by this task
@pi_state_cache: Copy of the PI state
> + * @exit_mutex: Mutex for serializing exit
> + * @state: Futex handling state to handle exit races correctly
> + */
> +struct futex_ctrl {
> + struct robust_list_head __user *robust_list;
> +#ifdef CONFIG_COMPAT
> + struct compat_robust_list_head __user *compat_robust_list;
> +#endif
> + struct list_head pi_state_list;
> + struct futex_pi_state *pi_state_cache;
> + struct mutex exit_mutex;
> + unsigned int state;
> +};
^ permalink raw reply [flat|nested] 57+ messages in thread* Re: [patch 1/8] futex: Move futex task related data into a struct
2026-03-17 2:24 ` André Almeida
@ 2026-03-17 9:52 ` Thomas Gleixner
0 siblings, 0 replies; 57+ messages in thread
From: Thomas Gleixner @ 2026-03-17 9:52 UTC (permalink / raw)
To: André Almeida
Cc: Mathieu Desnoyers, Sebastian Andrzej Siewior, Carlos O'Donell,
Peter Zijlstra, Florian Weimer, Rich Felker, Torvald Riegel,
Darren Hart, Ingo Molnar, Davidlohr Bueso, Arnd Bergmann,
Liam R . Howlett, LKML
On Mon, Mar 16 2026 at 23:24, André Almeida wrote:
> Em 16/03/2026 14:12, Thomas Gleixner escreveu:
>> +
>> +/**
>> + * struct futex_ctrl - Futex related per task data
>
> maybe futex_task_data to match futex_mm_data?
Sure.
>> + * @robust_list: User space registered robust list pointer
>> + * @compat_robust_list: User space registered robust list pointer for compat tasks
>
>
> to avoid kernel-doc warnings:
>
> @pi_state_list: List of PI mutexes holded by this task
> @pi_state_cache: Copy of the PI state
Oops.
^ permalink raw reply [flat|nested] 57+ messages in thread
* [patch 2/8] futex: Move futex related mm_struct data into a struct
2026-03-16 17:12 [patch 0/8] futex: Address the robust futex unlock race for real Thomas Gleixner
2026-03-16 17:12 ` [patch 1/8] futex: Move futex task related data into a struct Thomas Gleixner
@ 2026-03-16 17:13 ` Thomas Gleixner
2026-03-16 18:00 ` Mathieu Desnoyers
2026-03-16 17:13 ` [patch 3/8] futex: Provide UABI defines for robust list entry modifiers Thomas Gleixner
` (5 subsequent siblings)
7 siblings, 1 reply; 57+ messages in thread
From: Thomas Gleixner @ 2026-03-16 17:13 UTC (permalink / raw)
To: LKML
Cc: Mathieu Desnoyers, André Almeida, Sebastian Andrzej Siewior,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
Having all these members in mm_struct along with the required #ifdeffery is
annoying, does not allow efficient initializing of the data with
memset() and makes extending it tedious.
Move it into a data structure and fix up all usage sites.
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
---
include/linux/futex_types.h | 22 +++++++
include/linux/mm_types.h | 11 ---
kernel/futex/core.c | 123 ++++++++++++++++++++------------------------
3 files changed, 80 insertions(+), 76 deletions(-)
--- a/include/linux/futex_types.h
+++ b/include/linux/futex_types.h
@@ -31,4 +31,26 @@ struct futex_ctrl {
struct futex_ctrl { };
#endif /* !CONFIG_FUTEX */
+/**
+ * struct futex_mm_data - Futex related per MM data
+ * @phash_lock: Mutex to protect the private hash operations
+ * @phash: RCU managed pointer to the private hash
+ * @phash_new: Pointer to a newly allocated private hash
+ * @phash_batches: Batch state for RCU synchronization
+ * @phash_rcu: RCU head for call_rcu()
+ * @phash_atomic: Aggregate value for @phash_ref
+ * @phash_ref: Per CPU reference counter for a private hash
+ */
+struct futex_mm_data {
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+ struct mutex phash_lock;
+ struct futex_private_hash __rcu *phash;
+ struct futex_private_hash *phash_new;
+ unsigned long phash_batches;
+ struct rcu_head phash_rcu;
+ atomic_long_t phash_atomic;
+ unsigned int __percpu *phash_ref;
+#endif
+};
+
#endif /* _LINUX_FUTEX_TYPES_H */
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1221,16 +1221,7 @@ struct mm_struct {
*/
seqcount_t mm_lock_seq;
#endif
-#ifdef CONFIG_FUTEX_PRIVATE_HASH
- struct mutex futex_hash_lock;
- struct futex_private_hash __rcu *futex_phash;
- struct futex_private_hash *futex_phash_new;
- /* futex-ref */
- unsigned long futex_batches;
- struct rcu_head futex_rcu;
- atomic_long_t futex_atomic;
- unsigned int __percpu *futex_ref;
-#endif
+ struct futex_mm_data futex;
unsigned long hiwater_rss; /* High-watermark of RSS usage */
unsigned long hiwater_vm; /* High-water virtual memory usage */
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -188,13 +188,13 @@ static struct futex_hash_bucket *
return NULL;
if (!fph)
- fph = rcu_dereference(key->private.mm->futex_phash);
+ fph = rcu_dereference(key->private.mm->futex.phash);
if (!fph || !fph->hash_mask)
return NULL;
- hash = jhash2((void *)&key->private.address,
- sizeof(key->private.address) / 4,
+ hash = jhash2((void *)&key->private.address, sizeof(key->private.address) / 4,
key->both.offset);
+
return &fph->queues[hash & fph->hash_mask];
}
@@ -238,13 +238,12 @@ static bool __futex_pivot_hash(struct mm
{
struct futex_private_hash *fph;
- WARN_ON_ONCE(mm->futex_phash_new);
+ WARN_ON_ONCE(mm->futex.phash_new);
- fph = rcu_dereference_protected(mm->futex_phash,
- lockdep_is_held(&mm->futex_hash_lock));
+ fph = rcu_dereference_protected(mm->futex.phash, lockdep_is_held(&mm->futex.phash_lock));
if (fph) {
if (!futex_ref_is_dead(fph)) {
- mm->futex_phash_new = new;
+ mm->futex.phash_new = new;
return false;
}
@@ -252,8 +251,8 @@ static bool __futex_pivot_hash(struct mm
}
new->state = FR_PERCPU;
scoped_guard(rcu) {
- mm->futex_batches = get_state_synchronize_rcu();
- rcu_assign_pointer(mm->futex_phash, new);
+ mm->futex.phash_batches = get_state_synchronize_rcu();
+ rcu_assign_pointer(mm->futex.phash, new);
}
kvfree_rcu(fph, rcu);
return true;
@@ -261,12 +260,12 @@ static bool __futex_pivot_hash(struct mm
static void futex_pivot_hash(struct mm_struct *mm)
{
- scoped_guard(mutex, &mm->futex_hash_lock) {
+ scoped_guard(mutex, &mm->futex.phash_lock) {
struct futex_private_hash *fph;
- fph = mm->futex_phash_new;
+ fph = mm->futex.phash_new;
if (fph) {
- mm->futex_phash_new = NULL;
+ mm->futex.phash_new = NULL;
__futex_pivot_hash(mm, fph);
}
}
@@ -289,7 +288,7 @@ struct futex_private_hash *futex_private
scoped_guard(rcu) {
struct futex_private_hash *fph;
- fph = rcu_dereference(mm->futex_phash);
+ fph = rcu_dereference(mm->futex.phash);
if (!fph)
return NULL;
@@ -412,8 +411,7 @@ static int futex_mpol(struct mm_struct *
* private hash) is returned if existing. Otherwise a hash bucket from the
* global hash is returned.
*/
-static struct futex_hash_bucket *
-__futex_hash(union futex_key *key, struct futex_private_hash *fph)
+static struct futex_hash_bucket *__futex_hash(union futex_key *key, struct futex_private_hash *fph)
{
int node = key->both.node;
u32 hash;
@@ -426,8 +424,7 @@ static struct futex_hash_bucket *
return hb;
}
- hash = jhash2((u32 *)key,
- offsetof(typeof(*key), both.offset) / sizeof(u32),
+ hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / sizeof(u32),
key->both.offset);
if (node == FUTEX_NO_NODE) {
@@ -442,8 +439,7 @@ static struct futex_hash_bucket *
*/
node = (hash >> futex_hashshift) % nr_node_ids;
if (!node_possible(node)) {
- node = find_next_bit_wrap(node_possible_map.bits,
- nr_node_ids, node);
+ node = find_next_bit_wrap(node_possible_map.bits, nr_node_ids, node);
}
}
@@ -460,9 +456,8 @@ static struct futex_hash_bucket *
* Return: Initialized hrtimer_sleeper structure or NULL if no timeout
* value given
*/
-struct hrtimer_sleeper *
-futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
- int flags, u64 range_ns)
+struct hrtimer_sleeper *futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
+ int flags, u64 range_ns)
{
if (!time)
return NULL;
@@ -1551,17 +1546,17 @@ static void __futex_ref_atomic_begin(str
* otherwise it would be impossible for it to have reported success
* from futex_ref_is_dead().
*/
- WARN_ON_ONCE(atomic_long_read(&mm->futex_atomic) != 0);
+ WARN_ON_ONCE(atomic_long_read(&mm->futex.phash_atomic) != 0);
/*
* Set the atomic to the bias value such that futex_ref_{get,put}()
* will never observe 0. Will be fixed up in __futex_ref_atomic_end()
* when folding in the percpu count.
*/
- atomic_long_set(&mm->futex_atomic, LONG_MAX);
+ atomic_long_set(&mm->futex.phash_atomic, LONG_MAX);
smp_store_release(&fph->state, FR_ATOMIC);
- call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu);
+ call_rcu_hurry(&mm->futex.phash_rcu, futex_ref_rcu);
}
static void __futex_ref_atomic_end(struct futex_private_hash *fph)
@@ -1582,7 +1577,7 @@ static void __futex_ref_atomic_end(struc
* Therefore the per-cpu counter is now stable, sum and reset.
*/
for_each_possible_cpu(cpu) {
- unsigned int *ptr = per_cpu_ptr(mm->futex_ref, cpu);
+ unsigned int *ptr = per_cpu_ptr(mm->futex.phash_ref, cpu);
count += *ptr;
*ptr = 0;
}
@@ -1590,7 +1585,7 @@ static void __futex_ref_atomic_end(struc
/*
* Re-init for the next cycle.
*/
- this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */
+ this_cpu_inc(*mm->futex.phash_ref); /* 0 -> 1 */
/*
* Add actual count, subtract bias and initial refcount.
@@ -1598,7 +1593,7 @@ static void __futex_ref_atomic_end(struc
* The moment this atomic operation happens, futex_ref_is_dead() can
* become true.
*/
- ret = atomic_long_add_return(count - LONG_MAX - 1, &mm->futex_atomic);
+ ret = atomic_long_add_return(count - LONG_MAX - 1, &mm->futex.phash_atomic);
if (!ret)
wake_up_var(mm);
@@ -1608,8 +1603,8 @@ static void __futex_ref_atomic_end(struc
static void futex_ref_rcu(struct rcu_head *head)
{
- struct mm_struct *mm = container_of(head, struct mm_struct, futex_rcu);
- struct futex_private_hash *fph = rcu_dereference_raw(mm->futex_phash);
+ struct mm_struct *mm = container_of(head, struct mm_struct, futex.phash_rcu);
+ struct futex_private_hash *fph = rcu_dereference_raw(mm->futex.phash);
if (fph->state == FR_PERCPU) {
/*
@@ -1638,7 +1633,7 @@ static void futex_ref_drop(struct futex_
/*
* Can only transition the current fph;
*/
- WARN_ON_ONCE(rcu_dereference_raw(mm->futex_phash) != fph);
+ WARN_ON_ONCE(rcu_dereference_raw(mm->futex.phash) != fph);
/*
* We enqueue at least one RCU callback. Ensure mm stays if the task
* exits before the transition is completed.
@@ -1650,8 +1645,8 @@ static void futex_ref_drop(struct futex_
*
* futex_hash() __futex_pivot_hash()
* guard(rcu); guard(mm->futex_hash_lock);
- * fph = mm->futex_phash;
- * rcu_assign_pointer(&mm->futex_phash, new);
+ * fph = mm->futex.phash;
+ * rcu_assign_pointer(&mm->futex.phash, new);
* futex_hash_allocate()
* futex_ref_drop()
* fph->state = FR_ATOMIC;
@@ -1666,7 +1661,7 @@ static void futex_ref_drop(struct futex_
* There must be at least one full grace-period between publishing a
* new fph and trying to replace it.
*/
- if (poll_state_synchronize_rcu(mm->futex_batches)) {
+ if (poll_state_synchronize_rcu(mm->futex.phash_batches)) {
/*
* There was a grace-period, we can begin now.
*/
@@ -1674,7 +1669,7 @@ static void futex_ref_drop(struct futex_
return;
}
- call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu);
+ call_rcu_hurry(&mm->futex.phash_rcu, futex_ref_rcu);
}
static bool futex_ref_get(struct futex_private_hash *fph)
@@ -1684,11 +1679,11 @@ static bool futex_ref_get(struct futex_p
guard(preempt)();
if (READ_ONCE(fph->state) == FR_PERCPU) {
- __this_cpu_inc(*mm->futex_ref);
+ __this_cpu_inc(*mm->futex.phash_ref);
return true;
}
- return atomic_long_inc_not_zero(&mm->futex_atomic);
+ return atomic_long_inc_not_zero(&mm->futex.phash_atomic);
}
static bool futex_ref_put(struct futex_private_hash *fph)
@@ -1698,11 +1693,11 @@ static bool futex_ref_put(struct futex_p
guard(preempt)();
if (READ_ONCE(fph->state) == FR_PERCPU) {
- __this_cpu_dec(*mm->futex_ref);
+ __this_cpu_dec(*mm->futex.phash_ref);
return false;
}
- return atomic_long_dec_and_test(&mm->futex_atomic);
+ return atomic_long_dec_and_test(&mm->futex.phash_atomic);
}
static bool futex_ref_is_dead(struct futex_private_hash *fph)
@@ -1714,18 +1709,14 @@ static bool futex_ref_is_dead(struct fut
if (smp_load_acquire(&fph->state) == FR_PERCPU)
return false;
- return atomic_long_read(&mm->futex_atomic) == 0;
+ return atomic_long_read(&mm->futex.phash_atomic) == 0;
}
int futex_mm_init(struct mm_struct *mm)
{
- mutex_init(&mm->futex_hash_lock);
- RCU_INIT_POINTER(mm->futex_phash, NULL);
- mm->futex_phash_new = NULL;
- /* futex-ref */
- mm->futex_ref = NULL;
- atomic_long_set(&mm->futex_atomic, 0);
- mm->futex_batches = get_state_synchronize_rcu();
+ memset(&mm->futex, 0, sizeof(mm->futex));
+ mutex_init(&mm->futex.phash_lock);
+ mm->futex.phash_batches = get_state_synchronize_rcu();
return 0;
}
@@ -1733,9 +1724,9 @@ void futex_hash_free(struct mm_struct *m
{
struct futex_private_hash *fph;
- free_percpu(mm->futex_ref);
- kvfree(mm->futex_phash_new);
- fph = rcu_dereference_raw(mm->futex_phash);
+ free_percpu(mm->futex.phash_ref);
+ kvfree(mm->futex.phash_new);
+ fph = rcu_dereference_raw(mm->futex.phash);
if (fph)
kvfree(fph);
}
@@ -1746,10 +1737,10 @@ static bool futex_pivot_pending(struct m
guard(rcu)();
- if (!mm->futex_phash_new)
+ if (!mm->futex.phash_new)
return true;
- fph = rcu_dereference(mm->futex_phash);
+ fph = rcu_dereference(mm->futex.phash);
return futex_ref_is_dead(fph);
}
@@ -1791,7 +1782,7 @@ static int futex_hash_allocate(unsigned
* Once we've disabled the global hash there is no way back.
*/
scoped_guard(rcu) {
- fph = rcu_dereference(mm->futex_phash);
+ fph = rcu_dereference(mm->futex.phash);
if (fph && !fph->hash_mask) {
if (custom)
return -EBUSY;
@@ -1799,15 +1790,15 @@ static int futex_hash_allocate(unsigned
}
}
- if (!mm->futex_ref) {
+ if (!mm->futex.phash_ref) {
/*
* This will always be allocated by the first thread and
* therefore requires no locking.
*/
- mm->futex_ref = alloc_percpu(unsigned int);
- if (!mm->futex_ref)
+ mm->futex.phash_ref = alloc_percpu(unsigned int);
+ if (!mm->futex.phash_ref)
return -ENOMEM;
- this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */
+ this_cpu_inc(*mm->futex.phash_ref); /* 0 -> 1 */
}
fph = kvzalloc(struct_size(fph, queues, hash_slots),
@@ -1830,14 +1821,14 @@ static int futex_hash_allocate(unsigned
wait_var_event(mm, futex_pivot_pending(mm));
}
- scoped_guard(mutex, &mm->futex_hash_lock) {
+ scoped_guard(mutex, &mm->futex.phash_lock) {
struct futex_private_hash *free __free(kvfree) = NULL;
struct futex_private_hash *cur, *new;
- cur = rcu_dereference_protected(mm->futex_phash,
- lockdep_is_held(&mm->futex_hash_lock));
- new = mm->futex_phash_new;
- mm->futex_phash_new = NULL;
+ cur = rcu_dereference_protected(mm->futex.phash,
+ lockdep_is_held(&mm->futex.phash_lock));
+ new = mm->futex.phash_new;
+ mm->futex.phash_new = NULL;
if (fph) {
if (cur && !cur->hash_mask) {
@@ -1847,7 +1838,7 @@ static int futex_hash_allocate(unsigned
* the second one returns here.
*/
free = fph;
- mm->futex_phash_new = new;
+ mm->futex.phash_new = new;
return -EBUSY;
}
if (cur && !new) {
@@ -1877,7 +1868,7 @@ static int futex_hash_allocate(unsigned
if (new) {
/*
- * Will set mm->futex_phash_new on failure;
+ * Will set mm->futex.phash_new on failure;
* futex_private_hash_get() will try again.
*/
if (!__futex_pivot_hash(mm, new) && custom)
@@ -1900,7 +1891,7 @@ int futex_hash_allocate_default(void)
get_nr_threads(current),
num_online_cpus());
- fph = rcu_dereference(current->mm->futex_phash);
+ fph = rcu_dereference(current->mm->futex.phash);
if (fph) {
if (fph->custom)
return 0;
@@ -1927,7 +1918,7 @@ static int futex_hash_get_slots(void)
struct futex_private_hash *fph;
guard(rcu)();
- fph = rcu_dereference(current->mm->futex_phash);
+ fph = rcu_dereference(current->mm->futex.phash);
if (fph && fph->hash_mask)
return fph->hash_mask + 1;
return 0;
^ permalink raw reply [flat|nested] 57+ messages in thread* Re: [patch 2/8] futex: Move futex related mm_struct data into a struct
2026-03-16 17:13 ` [patch 2/8] futex: Move futex related mm_struct " Thomas Gleixner
@ 2026-03-16 18:00 ` Mathieu Desnoyers
0 siblings, 0 replies; 57+ messages in thread
From: Mathieu Desnoyers @ 2026-03-16 18:00 UTC (permalink / raw)
To: Thomas Gleixner, LKML
Cc: André Almeida, Sebastian Andrzej Siewior,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
On 2026-03-16 13:13, Thomas Gleixner wrote:
> Having all these members in mm_struct along with the required #ifdeffery is
> annoying, does not allow efficient initializing of the data with
> memset() and makes extending it tedious.
>
> Move it into a data structure and fix up all usage sites.
>
> Signed-off-by: Thomas Gleixner <tglx@kernel.org>
> ---
> include/linux/futex_types.h | 22 +++++++
> include/linux/mm_types.h | 11 ---
> kernel/futex/core.c | 123 ++++++++++++++++++++------------------------
> 3 files changed, 80 insertions(+), 76 deletions(-)
>
> --- a/include/linux/futex_types.h
> +++ b/include/linux/futex_types.h
> @@ -31,4 +31,26 @@ struct futex_ctrl {
> struct futex_ctrl { };
> #endif /* !CONFIG_FUTEX */
>
> +/**
> + * struct futex_mm_data - Futex related per MM data
> + * @phash_lock: Mutex to protect the private hash operations
> + * @phash: RCU managed pointer to the private hash
> + * @phash_new: Pointer to a newly allocated private hash
> + * @phash_batches: Batch state for RCU synchronization
> + * @phash_rcu: RCU head for call_rcu()
> + * @phash_atomic: Aggregate value for @phash_ref
> + * @phash_ref: Per CPU reference counter for a private hash
> + */
> +struct futex_mm_data {
> +#ifdef CONFIG_FUTEX_PRIVATE_HASH
> + struct mutex phash_lock;
> + struct futex_private_hash __rcu *phash;
> + struct futex_private_hash *phash_new;
> + unsigned long phash_batches;
> + struct rcu_head phash_rcu;
> + atomic_long_t phash_atomic;
> + unsigned int __percpu *phash_ref;
> +#endif
> +};
In the previous patch you use the approach of declaring an empty
structure within the #else, and here the content of the struct is
conditional. Preferably pick one approach ?
Other than this minor nit:
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
^ permalink raw reply [flat|nested] 57+ messages in thread
* [patch 3/8] futex: Provide UABI defines for robust list entry modifiers
2026-03-16 17:12 [patch 0/8] futex: Address the robust futex unlock race for real Thomas Gleixner
2026-03-16 17:12 ` [patch 1/8] futex: Move futex task related data into a struct Thomas Gleixner
2026-03-16 17:13 ` [patch 2/8] futex: Move futex related mm_struct " Thomas Gleixner
@ 2026-03-16 17:13 ` Thomas Gleixner
2026-03-16 18:02 ` Mathieu Desnoyers
2026-03-17 2:38 ` André Almeida
2026-03-16 17:13 ` [patch 4/8] futex: Add support for unlocking robust futexes Thomas Gleixner
` (4 subsequent siblings)
7 siblings, 2 replies; 57+ messages in thread
From: Thomas Gleixner @ 2026-03-16 17:13 UTC (permalink / raw)
To: LKML
Cc: Mathieu Desnoyers, André Almeida, Sebastian Andrzej Siewior,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
The marker for PI futexes in the robust list is a hardcoded 0x1 which lacks
any sensible form of documentation.
Provide proper defines for the bit and the mask and fix up the usage sites.
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
---
include/uapi/linux/futex.h | 4 +++
kernel/futex/core.c | 53 +++++++++++++++++++++------------------------
2 files changed, 29 insertions(+), 28 deletions(-)
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -177,6 +177,10 @@ struct robust_list_head {
*/
#define ROBUST_LIST_LIMIT 2048
+/* Modifiers for robust_list_head::list_op_pending */
+#define FUTEX_ROBUST_MOD_PI (0x1UL)
+#define FUTEX_ROBUST_MOD_MASK (FUTEX_ROBUST_MOD_PI)
+
/*
* bitset with all bits set for the FUTEX_xxx_BITSET OPs to request a
* match of any bit.
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -1009,8 +1009,9 @@ void futex_unqueue_pi(struct futex_q *q)
* dying task, and do notification if so:
*/
static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
- bool pi, bool pending_op)
+ unsigned int mod, bool pending_op)
{
+ bool pi = !!(mod & FUTEX_ROBUST_MOD_PI);
u32 uval, nval, mval;
pid_t owner;
int err;
@@ -1128,21 +1129,21 @@ static int handle_futex_death(u32 __user
*/
static inline int fetch_robust_entry(struct robust_list __user **entry,
struct robust_list __user * __user *head,
- unsigned int *pi)
+ unsigned int *mod)
{
unsigned long uentry;
if (get_user(uentry, (unsigned long __user *)head))
return -EFAULT;
- *entry = (void __user *)(uentry & ~1UL);
- *pi = uentry & 1;
+ *entry = (void __user *)(uentry & ~FUTEX_ROBUST_MOD_MASK);
+ *mod = uentry & FUTEX_ROBUST_MOD_MASK;
return 0;
}
/*
- * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * Walk curr->futex.robust_list (very carefully, it's a userspace list!)
* and mark any locks found there dead, and notify any waiters.
*
* We silently return on any sign of list-walking problem.
@@ -1150,9 +1151,8 @@ static inline int fetch_robust_entry(str
static void exit_robust_list(struct task_struct *curr)
{
struct robust_list_head __user *head = curr->futex.robust_list;
+ unsigned int limit = ROBUST_LIST_LIMIT, cur_mod, next_mod, pend_mod;
struct robust_list __user *entry, *next_entry, *pending;
- unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
- unsigned int next_pi;
unsigned long futex_offset;
int rc;
@@ -1160,7 +1160,7 @@ static void exit_robust_list(struct task
* Fetch the list head (which was registered earlier, via
* sys_set_robust_list()):
*/
- if (fetch_robust_entry(&entry, &head->list.next, &pi))
+ if (fetch_robust_entry(&entry, &head->list.next, &cur_mod))
return;
/*
* Fetch the relative futex offset:
@@ -1171,7 +1171,7 @@ static void exit_robust_list(struct task
* Fetch any possibly pending lock-add first, and handle it
* if it exists:
*/
- if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
+ if (fetch_robust_entry(&pending, &head->list_op_pending, &pend_mod))
return;
next_entry = NULL; /* avoid warning with gcc */
@@ -1180,20 +1180,20 @@ static void exit_robust_list(struct task
* Fetch the next entry in the list before calling
* handle_futex_death:
*/
- rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
+ rc = fetch_robust_entry(&next_entry, &entry->next, &next_mod);
/*
* A pending lock might already be on the list, so
* don't process it twice:
*/
if (entry != pending) {
if (handle_futex_death((void __user *)entry + futex_offset,
- curr, pi, HANDLE_DEATH_LIST))
+ curr, cur_mod, HANDLE_DEATH_LIST))
return;
}
if (rc)
return;
entry = next_entry;
- pi = next_pi;
+ cur_mod = next_mod;
/*
* Avoid excessively long or circular lists:
*/
@@ -1205,7 +1205,7 @@ static void exit_robust_list(struct task
if (pending) {
handle_futex_death((void __user *)pending + futex_offset,
- curr, pip, HANDLE_DEATH_PENDING);
+ curr, pend_mod, HANDLE_DEATH_PENDING);
}
}
@@ -1224,29 +1224,28 @@ static void __user *futex_uaddr(struct r
*/
static inline int
compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
- compat_uptr_t __user *head, unsigned int *pi)
+ compat_uptr_t __user *head, unsigned int *pflags)
{
if (get_user(*uentry, head))
return -EFAULT;
- *entry = compat_ptr((*uentry) & ~1);
- *pi = (unsigned int)(*uentry) & 1;
+ *entry = compat_ptr((*uentry) & ~FUTEX_ROBUST_MOD_MASK);
+ *pflags = (unsigned int)(*uentry) & FUTEX_ROBUST_MOD_MASK;
return 0;
}
/*
- * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * Walk curr->futex.robust_list (very carefully, it's a userspace list!)
* and mark any locks found there dead, and notify any waiters.
*
* We silently return on any sign of list-walking problem.
*/
static void compat_exit_robust_list(struct task_struct *curr)
{
- struct compat_robust_list_head __user *head = curr->futex.compat_robust_list;
+ struct compat_robust_list_head __user *head = current->futex.compat_robust_list;
+ unsigned int limit = ROBUST_LIST_LIMIT, cur_mod, next_mod, pend_mod;
struct robust_list __user *entry, *next_entry, *pending;
- unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
- unsigned int next_pi;
compat_uptr_t uentry, next_uentry, upending;
compat_long_t futex_offset;
int rc;
@@ -1255,7 +1254,7 @@ static void compat_exit_robust_list(stru
* Fetch the list head (which was registered earlier, via
* sys_set_robust_list()):
*/
- if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
+ if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &cur_mod))
return;
/*
* Fetch the relative futex offset:
@@ -1266,8 +1265,7 @@ static void compat_exit_robust_list(stru
* Fetch any possibly pending lock-add first, and handle it
* if it exists:
*/
- if (compat_fetch_robust_entry(&upending, &pending,
- &head->list_op_pending, &pip))
+ if (compat_fetch_robust_entry(&upending, &pending, &head->list_op_pending, &pend_mod))
return;
next_entry = NULL; /* avoid warning with gcc */
@@ -1277,7 +1275,7 @@ static void compat_exit_robust_list(stru
* handle_futex_death:
*/
rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
- (compat_uptr_t __user *)&entry->next, &next_pi);
+ (compat_uptr_t __user *)&entry->next, &next_mod);
/*
* A pending lock might already be on the list, so
* dont process it twice:
@@ -1285,15 +1283,14 @@ static void compat_exit_robust_list(stru
if (entry != pending) {
void __user *uaddr = futex_uaddr(entry, futex_offset);
- if (handle_futex_death(uaddr, curr, pi,
- HANDLE_DEATH_LIST))
+ if (handle_futex_death(uaddr, curr, cur_mod, HANDLE_DEATH_LIST))
return;
}
if (rc)
return;
uentry = next_uentry;
entry = next_entry;
- pi = next_pi;
+ cur_mod = next_mod;
/*
* Avoid excessively long or circular lists:
*/
@@ -1305,7 +1302,7 @@ static void compat_exit_robust_list(stru
if (pending) {
void __user *uaddr = futex_uaddr(pending, futex_offset);
- handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
+ handle_futex_death(uaddr, curr, pend_mod, HANDLE_DEATH_PENDING);
}
}
#endif
^ permalink raw reply [flat|nested] 57+ messages in thread* Re: [patch 3/8] futex: Provide UABI defines for robust list entry modifiers
2026-03-16 17:13 ` [patch 3/8] futex: Provide UABI defines for robust list entry modifiers Thomas Gleixner
@ 2026-03-16 18:02 ` Mathieu Desnoyers
2026-03-17 2:38 ` André Almeida
1 sibling, 0 replies; 57+ messages in thread
From: Mathieu Desnoyers @ 2026-03-16 18:02 UTC (permalink / raw)
To: Thomas Gleixner, LKML
Cc: André Almeida, Sebastian Andrzej Siewior,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
On 2026-03-16 13:13, Thomas Gleixner wrote:
> The marker for PI futexes in the robust list is a hardcoded 0x1 which lacks
> any sensible form of documentation.
lol, yes indeed :)
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [patch 3/8] futex: Provide UABI defines for robust list entry modifiers
2026-03-16 17:13 ` [patch 3/8] futex: Provide UABI defines for robust list entry modifiers Thomas Gleixner
2026-03-16 18:02 ` Mathieu Desnoyers
@ 2026-03-17 2:38 ` André Almeida
2026-03-17 9:53 ` Thomas Gleixner
1 sibling, 1 reply; 57+ messages in thread
From: André Almeida @ 2026-03-17 2:38 UTC (permalink / raw)
To: Thomas Gleixner
Cc: Mathieu Desnoyers, Sebastian Andrzej Siewior, Carlos O'Donell,
LKML, Peter Zijlstra, Florian Weimer, Rich Felker, Torvald Riegel,
Darren Hart, Ingo Molnar, Davidlohr Bueso, Arnd Bergmann,
Liam R . Howlett
Em 16/03/2026 14:13, Thomas Gleixner escreveu:
> The marker for PI futexes in the robust list is a hardcoded 0x1 which lacks
> any sensible form of documentation.
>
> Provide proper defines for the bit and the mask and fix up the usage sites.
>
Most of the diff is about a change that's not described here.
> Signed-off-by: Thomas Gleixner <tglx@kernel.org>
[...]
> static void compat_exit_robust_list(struct task_struct *curr)
> {
> - struct compat_robust_list_head __user *head = curr->futex.compat_robust_list;
> + struct compat_robust_list_head __user *head = current->futex.compat_robust_list;
It seems you accidentally changed from curr-> to current->
> + unsigned int limit = ROBUST_LIST_LIMIT, cur_mod, next_mod, pend_mod;
> struct robust_list __user *entry, *next_entry, *pending;
> - unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
> - unsigned int next_pi;
^ permalink raw reply [flat|nested] 57+ messages in thread* Re: [patch 3/8] futex: Provide UABI defines for robust list entry modifiers
2026-03-17 2:38 ` André Almeida
@ 2026-03-17 9:53 ` Thomas Gleixner
0 siblings, 0 replies; 57+ messages in thread
From: Thomas Gleixner @ 2026-03-17 9:53 UTC (permalink / raw)
To: André Almeida
Cc: Mathieu Desnoyers, Sebastian Andrzej Siewior, Carlos O'Donell,
LKML, Peter Zijlstra, Florian Weimer, Rich Felker, Torvald Riegel,
Darren Hart, Ingo Molnar, Davidlohr Bueso, Arnd Bergmann,
Liam R . Howlett
On Mon, Mar 16 2026 at 23:38, André Almeida wrote:
> Em 16/03/2026 14:13, Thomas Gleixner escreveu:
>> The marker for PI futexes in the robust list is a hardcoded 0x1 which lacks
>> any sensible form of documentation.
>>
>> Provide proper defines for the bit and the mask and fix up the usage sites.
>>
>
> Most of the diff is about a change that's not described here.
I'll add some more blurb to it.
>> static void compat_exit_robust_list(struct task_struct *curr)
>> {
>> - struct compat_robust_list_head __user *head = curr->futex.compat_robust_list;
>> + struct compat_robust_list_head __user *head = current->futex.compat_robust_list;
>
> It seems you accidentally changed from curr-> to current->
Copy & Pasta :)
^ permalink raw reply [flat|nested] 57+ messages in thread
* [patch 4/8] futex: Add support for unlocking robust futexes
2026-03-16 17:12 [patch 0/8] futex: Address the robust futex unlock race for real Thomas Gleixner
` (2 preceding siblings ...)
2026-03-16 17:13 ` [patch 3/8] futex: Provide UABI defines for robust list entry modifiers Thomas Gleixner
@ 2026-03-16 17:13 ` Thomas Gleixner
2026-03-16 18:24 ` Mathieu Desnoyers
2026-03-17 16:17 ` André Almeida
2026-03-16 17:13 ` [patch 5/8] futex: Add robust futex unlock IP range Thomas Gleixner
` (3 subsequent siblings)
7 siblings, 2 replies; 57+ messages in thread
From: Thomas Gleixner @ 2026-03-16 17:13 UTC (permalink / raw)
To: LKML
Cc: Mathieu Desnoyers, André Almeida, Sebastian Andrzej Siewior,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
Unlocking robust non-PI futexes happens in user space with the following
sequence:
1) robust_list_set_op_pending(mutex);
2) robust_list_remove(mutex);
lval = 0;
3) atomic_xchg(lock, lval);
4) if (lval & WAITERS)
5) sys_futex(WAKE,....);
6) robust_list_clear_op_pending();
That opens a window between #3 and #6 where the mutex could be acquired by
some other task which observes that it is the last user and:
A) unmaps the mutex memory
B) maps a different file, which ends up covering the same address
When the original task exits before reaching #6 then the kernel robust list
handling observes the pending op entry and tries to fix up user space.
In case that the newly mapped data contains the TID of the exiting thread
at the address of the mutex/futex the kernel will set the owner died bit in
that memory and therefore corrupting unrelated data.
PI futexes have a similar problem both for the non-contended user space
unlock and the in kernel unlock:
1) robust_list_set_op_pending(mutex);
2) robust_list_remove(mutex);
lval = gettid();
3) if (!atomic_try_cmpxchg(lock, lval, 0))
4) sys_futex(UNLOCK_PI,....);
5) robust_list_clear_op_pending();
Address the first part of the problem where the futexes have waiters and
need to enter the kernel anyway. Add a new FUTEX_ROBUST_UNLOCK flag, which
is valid for the sys_futex() FUTEX_UNLOCK_PI, FUTEX_WAKE, FUTEX_WAKE_BITSET
operations.
This deliberately omits FUTEX_WAKE_OP from this treatment as it's unclear
whether this is needed and there is no usage of it in glibc either to
investigate.
For the futex2 syscall family this needs to be implemented with a new
syscall.
The sys_futex() case [ab]uses the @uaddr2 argument to hand the pointer to
the kernel. This argument is only evaluated when the FUTEX_ROBUST_UNLOCK
bit is set and is therefore backward compatible.
The pointer has a modifier to indicate that it points to an u32 and not to
an u64. This is required for several reasons:
1) sys_futex() has no compat variant
2) The gaming emulators use both 64-bit and compat 32-bit robust
lists in the same 64-bit application
3) Having the pointer handed in spares the evaluation of the registered
robust lists to figure out whether the futex address is matching the
registered [compat_]robust_list_head::list_op_pending pointer.
As a consequence 32-bit applications have to set this bit unconditionally
so they can run on a 64-bit kernel in compat mode unmodified. 32-bit
kernels return an error code when the bit is not set. 64-bit kernels will
happily clear the full 64 bits if user space fails to set it.
In case of FUTEX_UNLOCK_PI this clears the robust list pending op when the
unlock succeeded. In case of errors, the user space value is still locked
by the caller and therefore the above cannot happen.
In case of FUTEX_WAKE* this does the unlock of the futex in the kernel and
clears the robust list pending op when the unlock was successful. If not,
the user space value is still locked and user space has to deal with the
returned error. That means that the unlocking of non-PI robust futexes has
to use the same try_cmpxchg() unlock scheme as PI futexes.
If the clearing of the pending list op fails (fault) then the kernel clears
the registered robust list pointer if it matches to prevent that exit()
will try to handle invalid data. That's a valid paranoid decision because
the robust list head sits usually in the TLS and if the TLS is no longer
accessible then the chance for fixing up the resulting mess is very close
to zero.
The problem of non-contended unlocks still exists and will be addressed
separately.
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
---
include/uapi/linux/futex.h | 20 ++++++++++++++
io_uring/futex.c | 2 -
kernel/futex/core.c | 61 +++++++++++++++++++++++++++++++++++++++++++--
kernel/futex/futex.h | 11 ++++++--
kernel/futex/pi.c | 15 +++++++++--
kernel/futex/syscalls.c | 13 +++++++--
kernel/futex/waitwake.c | 27 ++++++++++++++++++-
7 files changed, 136 insertions(+), 13 deletions(-)
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -25,7 +25,8 @@
#define FUTEX_PRIVATE_FLAG 128
#define FUTEX_CLOCK_REALTIME 256
-#define FUTEX_CMD_MASK ~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME)
+#define FUTEX_UNLOCK_ROBUST 512
+#define FUTEX_CMD_MASK ~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME | FUTEX_UNLOCK_ROBUST)
#define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
#define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
@@ -182,6 +183,23 @@ struct robust_list_head {
#define FUTEX_ROBUST_MOD_MASK (FUTEX_ROBUST_MOD_PI)
/*
+ * Modifier for FUTEX_ROBUST_UNLOCK uaddr2. Required to distinguish the storage
+ * size for the robust_list_head::list_pending_op. This solves two problems:
+ *
+ * 1) COMPAT tasks
+ *
+ * 2) The mixed mode magic gaming use case which has both 32-bit and 64-bit
+ * robust lists. Oh well....
+ *
+ * Long story short: 32-bit userspace must set this bit unconditionally to
+ * ensure that it can run on a 64-bit kernel in compat mode. If user space
+ * screws that up a 64-bit kernel will happily clear the full 64-bits. 32-bit
+ * kernels return an error code if the bit is not set.
+ */
+#define FUTEX_ROBUST_UNLOCK_MOD_32BIT (0x1UL)
+#define FUTEX_ROBUST_UNLOCK_MOD_MASK (FUTEX_ROBUST_UNLOCK_MOD_32BIT)
+
+/*
* bitset with all bits set for the FUTEX_xxx_BITSET OPs to request a
* match of any bit.
*/
--- a/io_uring/futex.c
+++ b/io_uring/futex.c
@@ -325,7 +325,7 @@ int io_futex_wake(struct io_kiocb *req,
* Strict flags - ensure that waking 0 futexes yields a 0 result.
* See commit 43adf8449510 ("futex: FLAGS_STRICT") for details.
*/
- ret = futex_wake(iof->uaddr, FLAGS_STRICT | iof->futex_flags,
+ ret = futex_wake(iof->uaddr, FLAGS_STRICT | iof->futex_flags, NULL,
iof->futex_val, iof->futex_mask);
if (ret < 0)
req_set_fail(req);
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -1063,7 +1063,7 @@ static int handle_futex_death(u32 __user
owner = uval & FUTEX_TID_MASK;
if (pending_op && !pi && !owner) {
- futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
+ futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, NULL, 1,
FUTEX_BITSET_MATCH_ANY);
return 0;
}
@@ -1117,7 +1117,7 @@ static int handle_futex_death(u32 __user
* PI futexes happens in exit_pi_state():
*/
if (!pi && (uval & FUTEX_WAITERS)) {
- futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
+ futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, NULL, 1,
FUTEX_BITSET_MATCH_ANY);
}
@@ -1209,6 +1209,27 @@ static void exit_robust_list(struct task
}
}
+static bool robust_list_clear_pending(unsigned long __user *pop)
+{
+ struct robust_list_head __user *head = current->futex.robust_list;
+
+ if (!put_user(0UL, pop))
+ return true;
+
+ /*
+ * Just give up. The robust list head is usually part of TLS, so the
+ * chance that this gets resolved is close to zero.
+ *
+ * If @pop_addr is the robust_list_head::list_op_pending pointer then
+ * clear the robust list head pointer to prevent further damage when the
+ * task exits. Better a few stale futexes than corrupted memory. But
+ * that's mostly an academic exercise.
+ */
+ if (pop == (unsigned long __user *)&head->list_op_pending)
+ current->futex.robust_list = NULL;
+ return false;
+}
+
#ifdef CONFIG_COMPAT
static void __user *futex_uaddr(struct robust_list __user *entry,
compat_long_t futex_offset)
@@ -1305,6 +1326,21 @@ static void compat_exit_robust_list(stru
handle_futex_death(uaddr, curr, pend_mod, HANDLE_DEATH_PENDING);
}
}
+
+static bool compat_robust_list_clear_pending(u32 __user *pop)
+{
+ struct compat_robust_list_head __user *head = current->futex.compat_robust_list;
+
+ if (!put_user(0U, pop))
+ return true;
+
+ /* See comment in robust_list_clear_pending(). */
+ if (pop == &head->list_op_pending)
+ current->futex.compat_robust_list = NULL;
+ return false;
+}
+#else
+static bool compat_robust_list_clear_pending(u32 __user *pop_addr) { return false; }
#endif
#ifdef CONFIG_FUTEX_PI
@@ -1398,6 +1434,27 @@ static void exit_pi_state_list(struct ta
static inline void exit_pi_state_list(struct task_struct *curr) { }
#endif
+static inline bool mask_pop_addr(void __user **pop)
+{
+ unsigned long addr = (unsigned long)*pop;
+
+ *pop = (void __user *) (addr & ~FUTEX_ROBUST_UNLOCK_MOD_MASK);
+ return !!(addr & FUTEX_ROBUST_UNLOCK_MOD_32BIT);
+}
+
+bool futex_robust_list_clear_pending(void __user *pop)
+{
+ bool size32bit = mask_pop_addr(&pop);
+
+ if (!IS_ENABLED(CONFIG_64BIT) && !size32bit)
+ return false;
+
+ if (IS_ENABLED(CONFIG_64BIT) && size32bit)
+ return compat_robust_list_clear_pending(pop);
+
+ return robust_list_clear_pending(pop);
+}
+
static void futex_cleanup(struct task_struct *tsk)
{
if (unlikely(tsk->futex.robust_list)) {
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -40,6 +40,7 @@
#define FLAGS_NUMA 0x0080
#define FLAGS_STRICT 0x0100
#define FLAGS_MPOL 0x0200
+#define FLAGS_UNLOCK_ROBUST 0x0400
/* FUTEX_ to FLAGS_ */
static inline unsigned int futex_to_flags(unsigned int op)
@@ -52,6 +53,9 @@ static inline unsigned int futex_to_flag
if (op & FUTEX_CLOCK_REALTIME)
flags |= FLAGS_CLOCKRT;
+ if (op & FUTEX_UNLOCK_ROBUST)
+ flags |= FLAGS_UNLOCK_ROBUST;
+
return flags;
}
@@ -438,13 +442,16 @@ extern int futex_unqueue_multiple(struct
extern int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
struct hrtimer_sleeper *to);
-extern int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset);
+extern int futex_wake(u32 __user *uaddr, unsigned int flags, void __user *pop,
+ int nr_wake, u32 bitset);
extern int futex_wake_op(u32 __user *uaddr1, unsigned int flags,
u32 __user *uaddr2, int nr_wake, int nr_wake2, int op);
-extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags);
+extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags, void __user *pop);
extern int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock);
+bool futex_robust_list_clear_pending(void __user *pop);
+
#endif /* _FUTEX_H */
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -1129,7 +1129,7 @@ int futex_lock_pi(u32 __user *uaddr, uns
* This is the in-kernel slowpath: we look up the PI state (if any),
* and do the rt-mutex unlock.
*/
-int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
+static int __futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
u32 curval, uval, vpid = task_pid_vnr(current);
union futex_key key = FUTEX_KEY_INIT;
@@ -1138,7 +1138,6 @@ int futex_unlock_pi(u32 __user *uaddr, u
if (!IS_ENABLED(CONFIG_FUTEX_PI))
return -ENOSYS;
-
retry:
if (get_user(uval, uaddr))
return -EFAULT;
@@ -1292,3 +1291,15 @@ int futex_unlock_pi(u32 __user *uaddr, u
return ret;
}
+int futex_unlock_pi(u32 __user *uaddr, unsigned int flags, void __user *pop)
+{
+ int ret = __futex_unlock_pi(uaddr, flags);
+
+ if (ret || !(flags & FLAGS_UNLOCK_ROBUST))
+ return ret;
+
+ if (!futex_robust_list_clear_pending(pop))
+ return -EFAULT;
+
+ return 0;
+}
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -118,6 +118,13 @@ long do_futex(u32 __user *uaddr, int op,
return -ENOSYS;
}
+ if (flags & FLAGS_UNLOCK_ROBUST) {
+ if (cmd != FUTEX_WAKE &&
+ cmd != FUTEX_WAKE_BITSET &&
+ cmd != FUTEX_UNLOCK_PI)
+ return -ENOSYS;
+ }
+
switch (cmd) {
case FUTEX_WAIT:
val3 = FUTEX_BITSET_MATCH_ANY;
@@ -128,7 +135,7 @@ long do_futex(u32 __user *uaddr, int op,
val3 = FUTEX_BITSET_MATCH_ANY;
fallthrough;
case FUTEX_WAKE_BITSET:
- return futex_wake(uaddr, flags, val, val3);
+ return futex_wake(uaddr, flags, uaddr2, val, val3);
case FUTEX_REQUEUE:
return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, NULL, 0);
case FUTEX_CMP_REQUEUE:
@@ -141,7 +148,7 @@ long do_futex(u32 __user *uaddr, int op,
case FUTEX_LOCK_PI2:
return futex_lock_pi(uaddr, flags, timeout, 0);
case FUTEX_UNLOCK_PI:
- return futex_unlock_pi(uaddr, flags);
+ return futex_unlock_pi(uaddr, flags, uaddr2);
case FUTEX_TRYLOCK_PI:
return futex_lock_pi(uaddr, flags, NULL, 1);
case FUTEX_WAIT_REQUEUE_PI:
@@ -375,7 +382,7 @@ SYSCALL_DEFINE4(futex_wake,
if (!futex_validate_input(flags, mask))
return -EINVAL;
- return futex_wake(uaddr, FLAGS_STRICT | flags, nr, mask);
+ return futex_wake(uaddr, FLAGS_STRICT | flags, NULL, nr, mask);
}
/*
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -150,12 +150,32 @@ void futex_wake_mark(struct wake_q_head
}
/*
+ * If requested, clear the robust list pending op and unlock the futex
+ */
+static bool futex_robust_unlock(u32 __user *uaddr, unsigned int flags, void __user *pop)
+{
+ if (!(flags & FLAGS_UNLOCK_ROBUST))
+ return true;
+
+ /* First unlock the futex. */
+ if (put_user(0U, uaddr))
+ return false;
+
+ /*
+ * Clear the pending list op now. If that fails, then the task is in
+ * deeper trouble as the robust list head is usually part of TLS. The
+ * chance of survival is close to zero.
+ */
+ return futex_robust_list_clear_pending(pop);
+}
+
+/*
* Wake up waiters matching bitset queued on this futex (uaddr).
*/
-int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
+int futex_wake(u32 __user *uaddr, unsigned int flags, void __user *pop, int nr_wake, u32 bitset)
{
- struct futex_q *this, *next;
union futex_key key = FUTEX_KEY_INIT;
+ struct futex_q *this, *next;
DEFINE_WAKE_Q(wake_q);
int ret;
@@ -166,6 +186,9 @@ int futex_wake(u32 __user *uaddr, unsign
if (unlikely(ret != 0))
return ret;
+ if (!futex_robust_unlock(uaddr, flags, pop))
+ return -EFAULT;
+
if ((flags & FLAGS_STRICT) && !nr_wake)
return 0;
^ permalink raw reply [flat|nested] 57+ messages in thread* Re: [patch 4/8] futex: Add support for unlocking robust futexes
2026-03-16 17:13 ` [patch 4/8] futex: Add support for unlocking robust futexes Thomas Gleixner
@ 2026-03-16 18:24 ` Mathieu Desnoyers
2026-03-17 16:17 ` André Almeida
1 sibling, 0 replies; 57+ messages in thread
From: Mathieu Desnoyers @ 2026-03-16 18:24 UTC (permalink / raw)
To: Thomas Gleixner, LKML
Cc: André Almeida, Sebastian Andrzej Siewior,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
On 2026-03-16 13:13, Thomas Gleixner wrote:
[...]
>
> PI futexes have a similar problem both for the non-contented user space
> unlock and the in kernel unlock:
>
> 1) robust_list_set_op_pending(mutex);
> 2) robust_list_remove(mutex);
>
> lval = gettid();
> 3) if (!atomic_try_cmpxchg(lock, lval, 0))
> 4) sys_futex(UNLOCK_PI,....);
> 5) robust_list_clear_op_pending();
>
We can document that the window in this case is between #3 and #5.
Other than this detail:
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [patch 4/8] futex: Add support for unlocking robust futexes
2026-03-16 17:13 ` [patch 4/8] futex: Add support for unlocking robust futexes Thomas Gleixner
2026-03-16 18:24 ` Mathieu Desnoyers
@ 2026-03-17 16:17 ` André Almeida
2026-03-17 20:46 ` Peter Zijlstra
1 sibling, 1 reply; 57+ messages in thread
From: André Almeida @ 2026-03-17 16:17 UTC (permalink / raw)
To: Thomas Gleixner
Cc: LKML, Mathieu Desnoyers, Sebastian Andrzej Siewior,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
Em 16/03/2026 14:13, Thomas Gleixner escreveu:
[...]
> --- a/kernel/futex/waitwake.c
> +++ b/kernel/futex/waitwake.c
> @@ -150,12 +150,32 @@ void futex_wake_mark(struct wake_q_head
> }
>
> /*
> + * If requested, clear the robust list pending op and unlock the futex
> + */
> +static bool futex_robust_unlock(u32 __user *uaddr, unsigned int flags, void __user *pop)
> +{
> + if (!(flags & FLAGS_UNLOCK_ROBUST))
> + return true;
> +
> + /* First unlock the futex. */
> + if (put_user(0U, uaddr))
> + return false;
> +
On glibc code, the futex unlock happens atomically:
atomic_exchange_release (&mutex->__data.__lock, 0)
Is it OK to do it non-atomically?
I couldn't find a race condition given that the only thread that should
be able to write to the futex address must be the lock owner anyways,
but I don't know why userspace does it atomically in the first place.
^ permalink raw reply [flat|nested] 57+ messages in thread* Re: [patch 4/8] futex: Add support for unlocking robust futexes
2026-03-17 16:17 ` André Almeida
@ 2026-03-17 20:46 ` Peter Zijlstra
2026-03-17 22:40 ` Thomas Gleixner
0 siblings, 1 reply; 57+ messages in thread
From: Peter Zijlstra @ 2026-03-17 20:46 UTC (permalink / raw)
To: André Almeida
Cc: Thomas Gleixner, LKML, Mathieu Desnoyers,
Sebastian Andrzej Siewior, Carlos O'Donell, Florian Weimer,
Rich Felker, Torvald Riegel, Darren Hart, Ingo Molnar,
Davidlohr Bueso, Arnd Bergmann, Liam R . Howlett
On Tue, Mar 17, 2026 at 01:17:28PM -0300, André Almeida wrote:
> Em 16/03/2026 14:13, Thomas Gleixner escreveu:
>
> [...]
>
> > --- a/kernel/futex/waitwake.c
> > +++ b/kernel/futex/waitwake.c
> > @@ -150,12 +150,32 @@ void futex_wake_mark(struct wake_q_head
> > }
> > /*
> > + * If requested, clear the robust list pending op and unlock the futex
> > + */
> > +static bool futex_robust_unlock(u32 __user *uaddr, unsigned int flags, void __user *pop)
> > +{
> > + if (!(flags & FLAGS_UNLOCK_ROBUST))
> > + return true;
> > +
> > + /* First unlock the futex. */
> > + if (put_user(0U, uaddr))
> > + return false;
> > +
>
> On glibc code, the futex unlock happens atomically:
>
> atomic_exchange_release (&mutex->__data.__lock, 0)
>
> Is OK to do it unatomically?
>
> I couldn't find a race condition given that the only thread that should be
> able to write to the futex address must be the lock owner anyways, but I
> don't know why userspace does it atomically in the first place.
So userspace could probably get away with doing:
atomic_store_explicit(&mutex->__data.__lock, 0, memory_order_release);
IOW a plain store-release. And yeah, I think the kernel probably should
do a store-release too. It doesn't matter on x86, but if we have a
weakly ordered architecture where the mode transition is also not
serializing, we could be having trouble.
^ permalink raw reply [flat|nested] 57+ messages in thread* Re: [patch 4/8] futex: Add support for unlocking robust futexes
2026-03-17 20:46 ` Peter Zijlstra
@ 2026-03-17 22:40 ` Thomas Gleixner
2026-03-18 8:02 ` Peter Zijlstra
0 siblings, 1 reply; 57+ messages in thread
From: Thomas Gleixner @ 2026-03-17 22:40 UTC (permalink / raw)
To: Peter Zijlstra, André Almeida
Cc: LKML, Mathieu Desnoyers, Sebastian Andrzej Siewior,
Carlos O'Donell, Florian Weimer, Rich Felker, Torvald Riegel,
Darren Hart, Ingo Molnar, Davidlohr Bueso, Arnd Bergmann,
Liam R . Howlett
On Tue, Mar 17 2026 at 21:46, Peter Zijlstra wrote:
> On Tue, Mar 17, 2026 at 01:17:28PM -0300, André Almeida wrote:
>> Em 16/03/2026 14:13, Thomas Gleixner escreveu:
>>
>> [...]
>>
>> > --- a/kernel/futex/waitwake.c
>> > +++ b/kernel/futex/waitwake.c
>> > @@ -150,12 +150,32 @@ void futex_wake_mark(struct wake_q_head
>> > }
>> > /*
>> > + * If requested, clear the robust list pending op and unlock the futex
>> > + */
>> > +static bool futex_robust_unlock(u32 __user *uaddr, unsigned int flags, void __user *pop)
>> > +{
>> > + if (!(flags & FLAGS_UNLOCK_ROBUST))
>> > + return true;
>> > +
>> > + /* First unlock the futex. */
>> > + if (put_user(0U, uaddr))
>> > + return false;
>> > +
>>
>> On glibc code, the futex unlock happens atomically:
>>
>> atomic_exchange_release (&mutex->__data.__lock, 0)
>>
>> Is OK to do it unatomically?
>>
>> I couldn't find a race condition given that the only thread that should be
>> able to write to the futex address must be the lock owner anyways, but I
>> don't know why userspace does it atomically in the first place.
>
> So userspace could probably get away with doing:
>
> atomic_store_explicit(&mutex->__data.__lock, 0, memory_order_release);
>
> IOW a plain store-release. And yeah, I think the kernel probably should
> do a store-release too. It doesn't matter on x86, but if we have a
> weakly ordered architecture where the mode transition is also not
> serializing, we could be having trouble.
No. There is a syscall in between and if that is not sufficient then the
architecture has more severe troubles than that store, no?
Thanks,
tglx
^ permalink raw reply [flat|nested] 57+ messages in thread* Re: [patch 4/8] futex: Add support for unlocking robust futexes
2026-03-17 22:40 ` Thomas Gleixner
@ 2026-03-18 8:02 ` Peter Zijlstra
2026-03-18 8:06 ` Florian Weimer
2026-03-18 14:47 ` Peter Zijlstra
0 siblings, 2 replies; 57+ messages in thread
From: Peter Zijlstra @ 2026-03-18 8:02 UTC (permalink / raw)
To: Thomas Gleixner
Cc: André Almeida, LKML, Mathieu Desnoyers,
Sebastian Andrzej Siewior, Carlos O'Donell, Florian Weimer,
Rich Felker, Torvald Riegel, Darren Hart, Ingo Molnar,
Davidlohr Bueso, Arnd Bergmann, Liam R . Howlett
On Tue, Mar 17, 2026 at 11:40:12PM +0100, Thomas Gleixner wrote:
> On Tue, Mar 17 2026 at 21:46, Peter Zijlstra wrote:
> > On Tue, Mar 17, 2026 at 01:17:28PM -0300, André Almeida wrote:
> >> Em 16/03/2026 14:13, Thomas Gleixner escreveu:
> >>
> >> [...]
> >>
> >> > --- a/kernel/futex/waitwake.c
> >> > +++ b/kernel/futex/waitwake.c
> >> > @@ -150,12 +150,32 @@ void futex_wake_mark(struct wake_q_head
> >> > }
> >> > /*
> >> > + * If requested, clear the robust list pending op and unlock the futex
> >> > + */
> >> > +static bool futex_robust_unlock(u32 __user *uaddr, unsigned int flags, void __user *pop)
> >> > +{
> >> > + if (!(flags & FLAGS_UNLOCK_ROBUST))
> >> > + return true;
> >> > +
> >> > + /* First unlock the futex. */
> >> > + if (put_user(0U, uaddr))
> >> > + return false;
> >> > +
> >>
> >> On glibc code, the futex unlock happens atomically:
> >>
> >> atomic_exchange_release (&mutex->__data.__lock, 0)
> >>
> >> Is OK to do it unatomically?
> >>
> >> I couldn't find a race condition given that the only thread that should be
> >> able to write to the futex address must be the lock owner anyways, but I
> >> don't know why userspace does it atomically in the first place.
> >
> > So userspace could probably get away with doing:
> >
> > atomic_store_explicit(&mutex->__data.__lock, 0, memory_order_release);
> >
> > IOW a plain store-release. And yeah, I think the kernel probably should
> > do a store-release too. It doesn't matter on x86, but if we have a
> > weakly ordered architecture where the mode transition is also not
> > serializing, we could be having trouble.
>
> No. There is a syscall in between and if that is not sufficient then the
> architecure has more severe troubles than that store, no?
So I think we once tried to determine if syscall could be considered to
imply memory ordering, and I think the take-away at the time was that we
could not assume so.
But it's been a long time, maybe I misremember.
^ permalink raw reply [flat|nested] 57+ messages in thread* Re: [patch 4/8] futex: Add support for unlocking robust futexes
2026-03-18 8:02 ` Peter Zijlstra
@ 2026-03-18 8:06 ` Florian Weimer
2026-03-18 14:47 ` Peter Zijlstra
1 sibling, 0 replies; 57+ messages in thread
From: Florian Weimer @ 2026-03-18 8:06 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Thomas Gleixner, André Almeida, LKML, Mathieu Desnoyers,
Sebastian Andrzej Siewior, Carlos O'Donell, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
* Peter Zijlstra:
>> No. There is a syscall in between and if that is not sufficient then the
>> architecure has more severe troubles than that store, no?
>
> So I think we once tried to determine if syscall could be considered to
> imply memory ordering, and I think the take-away at the time was that we
> could not assume so.
>
> But its been a long time, maybe I misremember.
I remember the same thing, and I think I saw test failures where system
calls were not a (strong) barrier on POWER.
However, some system calls better be barriers. For example, writing to
a pipe should synchronize with reading from the pipe and poll wakeup on
the read end. Likewise for sockets, I assume. As far as I know, POSIX
is silent on this topic, though.
Thanks,
Florian
^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [patch 4/8] futex: Add support for unlocking robust futexes
2026-03-18 8:02 ` Peter Zijlstra
2026-03-18 8:06 ` Florian Weimer
@ 2026-03-18 14:47 ` Peter Zijlstra
2026-03-18 16:03 ` Thomas Gleixner
1 sibling, 1 reply; 57+ messages in thread
From: Peter Zijlstra @ 2026-03-18 14:47 UTC (permalink / raw)
To: Thomas Gleixner
Cc: André Almeida, LKML, Mathieu Desnoyers,
Sebastian Andrzej Siewior, Carlos O'Donell, Florian Weimer,
Rich Felker, Torvald Riegel, Darren Hart, Ingo Molnar,
Davidlohr Bueso, Arnd Bergmann, Liam R . Howlett
On Wed, Mar 18, 2026 at 09:02:01AM +0100, Peter Zijlstra wrote:
> On Tue, Mar 17, 2026 at 11:40:12PM +0100, Thomas Gleixner wrote:
> > On Tue, Mar 17 2026 at 21:46, Peter Zijlstra wrote:
> > > On Tue, Mar 17, 2026 at 01:17:28PM -0300, André Almeida wrote:
> > >> Em 16/03/2026 14:13, Thomas Gleixner escreveu:
> > >>
> > >> [...]
> > >>
> > >> > --- a/kernel/futex/waitwake.c
> > >> > +++ b/kernel/futex/waitwake.c
> > >> > @@ -150,12 +150,32 @@ void futex_wake_mark(struct wake_q_head
> > >> > }
> > >> > /*
> > >> > + * If requested, clear the robust list pending op and unlock the futex
> > >> > + */
> > >> > +static bool futex_robust_unlock(u32 __user *uaddr, unsigned int flags, void __user *pop)
> > >> > +{
> > >> > + if (!(flags & FLAGS_UNLOCK_ROBUST))
> > >> > + return true;
> > >> > +
> > >> > + /* First unlock the futex. */
> > >> > + if (put_user(0U, uaddr))
> > >> > + return false;
> > >> > +
> > >>
> > >> On glibc code, the futex unlock happens atomically:
> > >>
> > >> atomic_exchange_release (&mutex->__data.__lock, 0)
> > >>
> > >> Is OK to do it unatomically?
> > >>
> > >> I couldn't find a race condition given that the only thread that should be
> > >> able to write to the futex address must be the lock owner anyways, but I
> > >> don't know why userspace does it atomically in the first place.
> > >
> > > So userspace could probably get away with doing:
> > >
> > > atomic_store_explicit(&mutex->__data.__lock, 0, memory_order_release);
> > >
> > > IOW a plain store-release. And yeah, I think the kernel probably should
> > > do a store-release too. It doesn't matter on x86, but if we have a
> > > weakly ordered architecture where the mode transition is also not
> > > serializing, we could be having trouble.
> >
> > No. There is a syscall in between and if that is not sufficient then the
> > architecure has more severe troubles than that store, no?
>
> So I think we once tried to determine if syscall could be considered to
> imply memory ordering, and I think the take-away at the time was that we
> could not assume so.
>
> But its been a long time, maybe I misremember.
Ah, it was for the sys_membarrier() thing. And yes, a syscall itself
does not imply memory ordering for all architectures. And since this
very much needs release semantics, it is best to be explicit about that.
^ permalink raw reply [flat|nested] 57+ messages in thread* Re: [patch 4/8] futex: Add support for unlocking robust futexes
2026-03-18 14:47 ` Peter Zijlstra
@ 2026-03-18 16:03 ` Thomas Gleixner
0 siblings, 0 replies; 57+ messages in thread
From: Thomas Gleixner @ 2026-03-18 16:03 UTC (permalink / raw)
To: Peter Zijlstra
Cc: André Almeida, LKML, Mathieu Desnoyers,
Sebastian Andrzej Siewior, Carlos O'Donell, Florian Weimer,
Rich Felker, Torvald Riegel, Darren Hart, Ingo Molnar,
Davidlohr Bueso, Arnd Bergmann, Liam R . Howlett
On Wed, Mar 18 2026 at 15:47, Peter Zijlstra wrote:
> On Wed, Mar 18, 2026 at 09:02:01AM +0100, Peter Zijlstra wrote:
>> So I think we once tried to determine if syscall could be considered to
>> imply memory ordering, and I think the take-away at the time was that we
>> could not assume so.
>>
>> But its been a long time, maybe I misremember.
>
> Ah, it was for the sys_membarrier() thing. And yes, a syscall itself
> does not imply memory ordering for all architectures. And since this
> very much needs release semantics, it is best to be explicit about that.
What a mess...
^ permalink raw reply [flat|nested] 57+ messages in thread
* [patch 5/8] futex: Add robust futex unlock IP range
2026-03-16 17:12 [patch 0/8] futex: Address the robust futex unlock race for real Thomas Gleixner
` (3 preceding siblings ...)
2026-03-16 17:13 ` [patch 4/8] futex: Add support for unlocking robust futexes Thomas Gleixner
@ 2026-03-16 17:13 ` Thomas Gleixner
2026-03-16 18:36 ` Mathieu Desnoyers
2026-03-17 19:19 ` André Almeida
2026-03-16 17:13 ` [patch 6/8] futex: Provide infrastructure to plug the non contended robust futex unlock race Thomas Gleixner
` (2 subsequent siblings)
7 siblings, 2 replies; 57+ messages in thread
From: Thomas Gleixner @ 2026-03-16 17:13 UTC (permalink / raw)
To: LKML
Cc: Mathieu Desnoyers, André Almeida, Sebastian Andrzej Siewior,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
There will be a VDSO function to unlock robust futexes in user space. The
unlock sequence is racy vs. clearing the list_pending_op pointer in the
task's robust list head. To plug this race the kernel needs to know the
instruction window. As the VDSO is per MM the addresses are stored in
mm_struct::futex.
Architectures which implement support for this have to update these
addresses when the VDSO is (re)mapped.
Arguably this could be resolved by chasing mm->context->vdso->image, but
that that's architecture specific and requires to touch quite some cache
lines. Having it in mm::futex reduces the cache line impact and avoids
having yet another set of architecture specific functionality.
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
---
include/linux/futex_types.h | 32 +++++++++++++++++++++++++-------
include/linux/mm_types.h | 1 +
init/Kconfig | 6 ++++++
3 files changed, 32 insertions(+), 7 deletions(-)
--- a/include/linux/futex_types.h
+++ b/include/linux/futex_types.h
@@ -33,13 +33,26 @@ struct futex_ctrl { };
/**
* struct futex_mm_data - Futex related per MM data
- * @phash_lock: Mutex to protect the private hash operations
- * @phash: RCU managed pointer to the private hash
- * @phash_new: Pointer to a newly allocated private hash
- * @phash_batches: Batch state for RCU synchronization
- * @phash_rcu: RCU head for call_rcu()
- * @phash_atomic: Aggregate value for @phash_ref
- * @phash_ref: Per CPU reference counter for a private hash
+ * @phash_lock: Mutex to protect the private hash operations
+ * @phash: RCU managed pointer to the private hash
+ * @phash_new: Pointer to a newly allocated private hash
+ * @phash_batches: Batch state for RCU synchronization
+ * @phash_rcu: RCU head for call_rcu()
+ * @phash_atomic: Aggregate value for @phash_ref
+ * @phash_ref: Per CPU reference counter for a private hash
+ *
+ * @unlock_cs_start_ip: The start IP of the robust futex unlock critical section
+ *
+ * @unlock_cs_success_ip: The IP of the robust futex unlock critical section which
+ * indicates that the unlock (cmpxchg) was successful
+ * Required to handle the compat size insanity for mixed mode
+ * game emulators.
+ *
+ * Not evaluated by the core code as that only
+ * evaluates the start/end range. Can therefore be 0 if the
+ * architecture does not care.
+ *
+ * @unlock_cs_end_ip: The end IP of the robust futex unlock critical section
*/
struct futex_mm_data {
#ifdef CONFIG_FUTEX_PRIVATE_HASH
@@ -51,6 +64,11 @@ struct futex_mm_data {
atomic_long_t phash_atomic;
unsigned int __percpu *phash_ref;
#endif
+#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
+ unsigned long unlock_cs_start_ip;
+ unsigned long unlock_cs_success_ip;
+ unsigned long unlock_cs_end_ip;
+#endif
};
#endif /* _LINUX_FUTEX_TYPES_H */
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -22,6 +22,7 @@
#include <linux/types.h>
#include <linux/rseq_types.h>
#include <linux/bitmap.h>
+#include <linux/futex_types.h>
#include <asm/mmu.h>
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1822,6 +1822,12 @@ config FUTEX_MPOL
depends on FUTEX && NUMA
default y
+config HAVE_FUTEX_ROBUST_UNLOCK
+ bool
+
+config FUTEX_ROBUST_UNLOCK
+ def_bool FUTEX && HAVE_GENERIC_VDSO && GENERIC_IRQ_ENTRY && RSEQ && HAVE_FUTEX_ROBUST_UNLOCK
+
config EPOLL
bool "Enable eventpoll support" if EXPERT
default y
^ permalink raw reply [flat|nested] 57+ messages in thread* Re: [patch 5/8] futex: Add robust futex unlock IP range
2026-03-16 17:13 ` [patch 5/8] futex: Add robust futex unlock IP range Thomas Gleixner
@ 2026-03-16 18:36 ` Mathieu Desnoyers
2026-03-17 19:19 ` André Almeida
1 sibling, 0 replies; 57+ messages in thread
From: Mathieu Desnoyers @ 2026-03-16 18:36 UTC (permalink / raw)
To: Thomas Gleixner, LKML
Cc: André Almeida, Sebastian Andrzej Siewior,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
On 2026-03-16 13:13, Thomas Gleixner wrote:
> There will be a VDSO function to unlock robust futexes in user space. The
> unlock sequence is racy vs. clearing the list_pending_op pointer in the
> tasks robust list head. To plug this race the kernel needs to know the
> instruction window. As the VDSO is per MM the addresses are stored in
> mm_struct::futex.
>
> Architectures which implement support for this have to update these
> addresses when the VDSO is (re)mapped.
>
> Arguably this could be resolved by chasing mm->context->vdso->image, but
> that that's architecture specific and requires to touch quite some cache
> lines. Having it in mm::futex reduces the cache line impact and avoids
> having yet another set of architecture specific functionality.
>
> Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [patch 5/8] futex: Add robust futex unlock IP range
2026-03-16 17:13 ` [patch 5/8] futex: Add robust futex unlock IP range Thomas Gleixner
2026-03-16 18:36 ` Mathieu Desnoyers
@ 2026-03-17 19:19 ` André Almeida
1 sibling, 0 replies; 57+ messages in thread
From: André Almeida @ 2026-03-17 19:19 UTC (permalink / raw)
To: Thomas Gleixner
Cc: Mathieu Desnoyers, Sebastian Andrzej Siewior, Carlos O'Donell,
LKML, Peter Zijlstra, Florian Weimer, Rich Felker, Torvald Riegel,
Darren Hart, Ingo Molnar, Davidlohr Bueso, Arnd Bergmann,
Liam R . Howlett
Em 16/03/2026 14:13, Thomas Gleixner escreveu:
> Arguably this could be resolved by chasing mm->context->vdso->image, but
> that that's architecture specific and requires to touch quite some cache
"that that's"
^ permalink raw reply [flat|nested] 57+ messages in thread
* [patch 6/8] futex: Provide infrastructure to plug the non contended robust futex unlock race
2026-03-16 17:12 [patch 0/8] futex: Address the robust futex unlock race for real Thomas Gleixner
` (4 preceding siblings ...)
2026-03-16 17:13 ` [patch 5/8] futex: Add robust futex unlock IP range Thomas Gleixner
@ 2026-03-16 17:13 ` Thomas Gleixner
2026-03-16 18:35 ` Mathieu Desnoyers
2026-03-16 17:13 ` [patch 7/8] x86/vdso: Prepare for robust futex unlock support Thomas Gleixner
2026-03-16 17:13 ` [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock() Thomas Gleixner
7 siblings, 1 reply; 57+ messages in thread
From: Thomas Gleixner @ 2026-03-16 17:13 UTC (permalink / raw)
To: LKML
Cc: Mathieu Desnoyers, André Almeida, Sebastian Andrzej Siewior,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
When the FUTEX_ROBUST_UNLOCK mechanism is used for unlocking (PI-)futexes,
then the unlock sequence in user space looks like this:
1) robust_list_set_op_pending(mutex);
2) robust_list_remove(mutex);
lval = gettid();
3) if (atomic_try_cmpxchg(&mutex->lock, lval, 0))
4) robust_list_clear_op_pending();
else
5) sys_futex(OP | FUTEX_ROBUST_UNLOCK, ....);
That still leaves a minimal race window between #3 and #4 where the mutex
could be acquired by some other task, which observes that it is the last
user and:
1) unmaps the mutex memory
2) maps a different file, which ends up covering the same address
When then the original task exits before reaching #6 then the kernel robust
list handling observes the pending op entry and tries to fix up user space.
In case that the newly mapped data contains the TID of the exiting thread
at the address of the mutex/futex the kernel will set the owner died bit in
that memory and therefore corrupt unrelated data.
On X86 this boils down to this simplified assembly sequence:
mov %esi,%eax // Load TID into EAX
xor %ecx,%ecx // Set ECX to 0
#3 lock cmpxchg %ecx,(%rdi) // Try the TID -> 0 transition
.Lstart:
jnz .Lend
#4 movq $0x0,(%rdx) // Clear list_op_pending
.Lend:
If the cmpxchg() succeeds and the task is interrupted before it can clear
list_op_pending in the robust list head (#4) and the task crashes in a
signal handler or gets killed then it ends up in do_exit() and subsequently
in the robust list handling, which then might run into the unmap/map issue
described above.
This is only relevant when user space was interrupted and a signal is
pending. The fix-up has to be done before signal delivery is attempted
because:
1) The signal might be fatal so get_signal() ends up in do_exit()
2) The signal handler might crash or the task is killed before returning
from the handler. At that point the instruction pointer in pt_regs is
no longer the instruction pointer of the initially interrupted unlock
sequence.
The right place to handle this is in __exit_to_user_mode_loop() before
invoking arch_do_signal_or_restart() as this covers obviously both
scenarios.
As this is only relevant when the task was interrupted in user space, this
is tied to RSEQ and the generic entry code as RSEQ keeps track of user
space interrupts unconditionally even if the task does not have a RSEQ
region installed. That makes the decision very lightweight:
if (current->rseq.user_irq && within(regs, unlock_ip_range))
futex_fixup_robust_unlock(regs);
futex_fixup_robust_unlock() then invokes an architecture-specific function
which evaluates the register content to decide whether the pending ops
pointer in the robust list head needs to be cleared.
Assuming the above unlock sequence, then on x86 this results in the trivial
evaluation of the zero flag:
return regs->eflags & X86_EFLAGS_ZF;
Other architectures might need to do more complex evaluations due to LLSC,
but the approach is valid in general. In case that COMPAT is enabled the
decision function is a bit more complex, but that's an implementation
detail.
The handling code also requires to retrieve the pending op pointer via an
architecture specific function to be able to perform the clearing.
The unlock sequence is going to be placed in the VDSO so that the kernel
can keep everything synchronized. The resulting code sequence for user
space is:
if (__vdso_futex_robust_try_unlock(lock, tid, &pending_op) != tid)
err = sys_futex($OP | FUTEX_ROBUST_UNLOCK,....);
Both the VDSO unlock and the kernel side unlock ensure that the pending_op
pointer is always cleared when the lock becomes unlocked.
The pending op pointer has the same modifier requirements as the @uaddr2
argument of sys_futex(FUTEX_ROBUST_UNLOCK) for the very same reasons. That
means VDSO implementations need to support the variable size case for the
pending op pointer as well if COMPAT is enabled.
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
---
include/linux/futex.h | 31 ++++++++++++++++++++++++++++++-
include/vdso/futex.h | 44 ++++++++++++++++++++++++++++++++++++++++++++
kernel/entry/common.c | 9 ++++++---
kernel/futex/core.c | 13 +++++++++++++
4 files changed, 93 insertions(+), 4 deletions(-)
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -110,7 +110,36 @@ static inline int futex_hash_allocate_de
}
static inline int futex_hash_free(struct mm_struct *mm) { return 0; }
static inline int futex_mm_init(struct mm_struct *mm) { return 0; }
+#endif /* !CONFIG_FUTEX */
-#endif
+#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
+#include <asm/futex_robust.h>
+
+void __futex_fixup_robust_unlock(struct pt_regs *regs);
+
+static inline bool futex_within_robust_unlock(struct pt_regs *regs)
+{
+ unsigned long ip = instruction_pointer(regs);
+
+ return ip >= current->mm->futex.unlock_cs_start_ip &&
+ ip < current->mm->futex.unlock_cs_end_ip;
+}
+
+static inline void futex_fixup_robust_unlock(struct pt_regs *regs)
+{
+ /*
+ * Avoid dereferencing current->mm if not returning from interrupt.
+ * current->rseq.event is going to be used anyway in the exit to user
+ * code, so bringing it in is not a big deal.
+ */
+ if (!current->rseq.event.user_irq)
+ return;
+
+ if (unlikely(futex_within_robust_unlock(regs)))
+ __futex_fixup_robust_unlock(regs);
+}
+#else /* CONFIG_FUTEX_ROBUST_UNLOCK */
+static inline void futex_fixup_robust_unlock(struct pt_regs *regs) {}
+#endif /* !CONFIG_FUTEX_ROBUST_UNLOCK */
#endif
--- /dev/null
+++ b/include/vdso/futex.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _VDSO_FUTEX_H
+#define _VDSO_FUTEX_H
+
+#include <linux/types.h>
+
+struct robust_list;
+
+/**
+ * __vdso_futex_robust_try_unlock - Try to unlock an uncontended robust futex
+ * @lock: Pointer to the futex lock object
+ * @tid: The TID of the calling task
+ * @op: Pointer to the task's robust_list_head::list_pending_op
+ *
+ * Return: The content of *@lock. On success this is the same as @tid.
+ *
+ * The function implements:
+ * if (atomic_try_cmpxchg(lock, &tid, 0))
+ * *op = NULL;
+ * return tid;
+ *
+ * There is a race between a successful unlock and clearing the pending op
+ * pointer in the robust list head. If the calling task is interrupted in the
+ * race window and has to handle a (fatal) signal on return to user space then
+ * the kernel handles the clearing of @pending_op before attempting to deliver
+ * the signal. That ensures that a task cannot exit with a potentially invalid
+ * pending op pointer.
+ *
+ * User space uses it in the following way:
+ *
+ * if (__vdso_futex_robust_try_unlock(lock, tid, &pending_op) != tid)
+ * err = sys_futex($OP | FUTEX_ROBUST_UNLOCK,....);
+ *
+ * If the unlock attempt fails due to the FUTEX_WAITERS bit set in the lock,
+ * then the syscall does the unlock, clears the pending op pointer and wakes the
+ * requested number of waiters.
+ *
+ * The @op pointer is intentionally void. It has the same requirements as the
+ * @uaddr2 argument for sys_futex(FUTEX_ROBUST_UNLOCK) operations. See the
+ * modifier and the related documentation in include/uapi/linux/futex.h
+ */
+uint32_t __vdso_futex_robust_try_unlock(uint32_t *lock, uint32_t tid, void *op);
+
+#endif
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -1,11 +1,12 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/irq-entry-common.h>
-#include <linux/resume_user_mode.h>
+#include <linux/futex.h>
#include <linux/highmem.h>
+#include <linux/irq-entry-common.h>
#include <linux/jump_label.h>
#include <linux/kmsan.h>
#include <linux/livepatch.h>
+#include <linux/resume_user_mode.h>
#include <linux/tick.h>
/* Workaround to allow gradual conversion of architecture code */
@@ -60,8 +61,10 @@ static __always_inline unsigned long __e
if (ti_work & _TIF_PATCH_PENDING)
klp_update_patch_state(current);
- if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
+ if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) {
+ futex_fixup_robust_unlock(regs);
arch_do_signal_or_restart(regs);
+ }
if (ti_work & _TIF_NOTIFY_RESUME)
resume_user_mode_work(regs);
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -1455,6 +1455,19 @@ bool futex_robust_list_clear_pending(voi
return robust_list_clear_pending(pop);
}
+#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
+void __futex_fixup_robust_unlock(struct pt_regs *regs)
+{
+ void __user *pop;
+
+ if (!arch_futex_needs_robust_unlock_fixup(regs))
+ return;
+
+ pop = arch_futex_robust_unlock_get_pop(regs);
+ futex_robust_list_clear_pending(pop);
+}
+#endif /* CONFIG_FUTEX_ROBUST_UNLOCK */
+
static void futex_cleanup(struct task_struct *tsk)
{
if (unlikely(tsk->futex.robust_list)) {
^ permalink raw reply [flat|nested] 57+ messages in thread* Re: [patch 6/8] futex: Provide infrastructure to plug the non contended robust futex unlock race
2026-03-16 17:13 ` [patch 6/8] futex: Provide infrastructure to plug the non contended robust futex unlock race Thomas Gleixner
@ 2026-03-16 18:35 ` Mathieu Desnoyers
2026-03-16 20:29 ` Thomas Gleixner
0 siblings, 1 reply; 57+ messages in thread
From: Mathieu Desnoyers @ 2026-03-16 18:35 UTC (permalink / raw)
To: Thomas Gleixner, LKML
Cc: André Almeida, Sebastian Andrzej Siewior,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
On 2026-03-16 13:13, Thomas Gleixner wrote:
> When the FUTEX_ROBUST_UNLOCK mechanism is used for unlocking (PI-)futexes,
> then the unlock sequence in user space looks like this:
>
> 1) robust_list_set_op_pending(mutex);
> 2) robust_list_remove(mutex);
>
> lval = gettid();
> 3) if (atomic_try_cmpxchg(&mutex->lock, lval, 0))
> 4) robust_list_clear_op_pending();
> else
> 5) sys_futex(OP | FUTEX_ROBUST_UNLOCK, ....);
>
> That still leaves a minimal race window between #3 and #4 where the mutex
> could be acquired by some other task, which observes that it is the last
> user and:
>
> 1) unmaps the mutex memory
> 2) maps a different file, which ends up covering the same address
>
> When then the original task exits before reaching #6 then the kernel robust
"... before reaching #4 ...".
[...]
>
> As this is only relevant when the task was interrupted in user space, this
> is tied to RSEQ and the generic entry code as RSEQ keeps track of user
> space interrupts unconditionally even if the task does not have a RSEQ
> region installed. That makes the decision very lightweight:
>
> if (current->rseq.user_irq && within(regs, unlock_ip_range))
> futex_fixup_robust_unlock(regs);
Nice trick to re-use the rseq infra, but where is the added dependency
on CONFIG_RSEQ=y ?
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [patch 6/8] futex: Provide infrastructure to plug the non contended robust futex unlock race
2026-03-16 18:35 ` Mathieu Desnoyers
@ 2026-03-16 20:29 ` Thomas Gleixner
2026-03-16 20:52 ` Mathieu Desnoyers
0 siblings, 1 reply; 57+ messages in thread
From: Thomas Gleixner @ 2026-03-16 20:29 UTC (permalink / raw)
To: Mathieu Desnoyers, LKML
Cc: André Almeida, Sebastian Andrzej Siewior,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
On Mon, Mar 16 2026 at 14:35, Mathieu Desnoyers wrote:
> On 2026-03-16 13:13, Thomas Gleixner wrote:
>> When the FUTEX_ROBUST_UNLOCK mechanism is used for unlocking (PI-)futexes,
>> then the unlock sequence in user space looks like this:
>>
>> 1) robust_list_set_op_pending(mutex);
>> 2) robust_list_remove(mutex);
>>
>> lval = gettid();
>> 3) if (atomic_try_cmpxchg(&mutex->lock, lval, 0))
>> 4) robust_list_clear_op_pending();
>> else
>> 5) sys_futex(OP | FUTEX_ROBUST_UNLOCK, ....);
>>
>> That still leaves a minimal race window between #3 and #4 where the mutex
>> could be acquired by some other task, which observes that it is the last
>> user and:
>>
>> 1) unmaps the mutex memory
>> 2) maps a different file, which ends up covering the same address
>>
>> When then the original task exits before reaching #6 then the kernel robust
>
> "... before reaching #4 ...".
>
> [...]
>
>>
>> As this is only relevant when the task was interrupted in user space, this
>> is tied to RSEQ and the generic entry code as RSEQ keeps track of user
>> space interrupts unconditionally even if the task does not have a RSEQ
>> region installed. That makes the decision very lightweight:
>>
>> if (current->rseq.user_irq && within(regs, unlock_ip_range))
>> futex_fixup_robust_unlock(regs);
>
> Nice trick to re-use the rseq infra, but where is the added dependency
> on CONFIG_RSEQ=y ?
Here:
+config FUTEX_ROBUST_UNLOCK
+ def_bool FUTEX && HAVE_GENERIC_VDSO && GENERIC_IRQ_ENTRY && RSEQ && HAVE_FUTEX_ROBUST_UNLOCK
^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [patch 6/8] futex: Provide infrastructure to plug the non contended robust futex unlock race
2026-03-16 20:29 ` Thomas Gleixner
@ 2026-03-16 20:52 ` Mathieu Desnoyers
0 siblings, 0 replies; 57+ messages in thread
From: Mathieu Desnoyers @ 2026-03-16 20:52 UTC (permalink / raw)
To: Thomas Gleixner, LKML
Cc: André Almeida, Sebastian Andrzej Siewior,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
On 2026-03-16 16:29, Thomas Gleixner wrote:
> On Mon, Mar 16 2026 at 14:35, Mathieu Desnoyers wrote:
[...]
>>
>> Nice trick to re-use the rseq infra, but where is the added dependency
>> on CONFIG_RSEQ=y ?
>
> Here:
>
> +config FUTEX_ROBUST_UNLOCK
> + def_bool FUTEX && HAVE_GENERIC_VDSO && GENERIC_IRQ_ENTRY && RSEQ && HAVE_FUTEX_ROBUST_UNLOCK
*blink*. Yes I missed it. Clear as day.
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
^ permalink raw reply [flat|nested] 57+ messages in thread
* [patch 7/8] x86/vdso: Prepare for robust futex unlock support
2026-03-16 17:12 [patch 0/8] futex: Address the robust futex unlock race for real Thomas Gleixner
` (5 preceding siblings ...)
2026-03-16 17:13 ` [patch 6/8] futex: Provide infrastructure to plug the non contended robust futex unlock race Thomas Gleixner
@ 2026-03-16 17:13 ` Thomas Gleixner
2026-03-16 17:13 ` [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock() Thomas Gleixner
7 siblings, 0 replies; 57+ messages in thread
From: Thomas Gleixner @ 2026-03-16 17:13 UTC (permalink / raw)
To: LKML
Cc: Mathieu Desnoyers, André Almeida, Sebastian Andrzej Siewior,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
There will be a VDSO function to unlock non-contended robust futexes in
user space. The unlock sequence is racy vs. clearing the list_pending_op
pointer in the task's robust list head. To plug this race the kernel needs
to know the instruction window so it can clear the pointer when the task is
interrupted within that race window.
Add the symbols to the vdso2c generator and use them in the VDSO VMA code
to update the critical section addresses in mm_struct::futex on (re)map().
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
---
arch/x86/entry/vdso/vma.c | 20 ++++++++++++++++++++
arch/x86/include/asm/vdso.h | 3 +++
arch/x86/tools/vdso2c.c | 17 ++++++++++-------
3 files changed, 33 insertions(+), 7 deletions(-)
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -73,6 +73,23 @@ static void vdso_fix_landing(const struc
regs->ip = new_vma->vm_start + ipoffset;
}
+#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
+static void vdso_futex_robust_unlock_update_ips(void)
+{
+ const struct vdso_image *image = current->mm->context.vdso_image;
+ unsigned long vdso = (unsigned long) current->mm->context.vdso;
+
+ current->mm->futex.unlock_cs_start_ip =
+ vdso + image->sym___vdso_futex_robust_try_unlock_cs_start;
+ current->mm->futex.unlock_cs_success_ip =
+ vdso + image->sym___vdso_futex_robust_try_unlock_cs_success;
+ current->mm->futex.unlock_cs_end_ip =
+ vdso + image->sym___vdso_futex_robust_try_unlock_cs_end;
+}
+#else
+static inline void vdso_futex_robust_unlock_update_ips(void) { }
+#endif
+
static int vdso_mremap(const struct vm_special_mapping *sm,
struct vm_area_struct *new_vma)
{
@@ -80,6 +97,7 @@ static int vdso_mremap(const struct vm_s
vdso_fix_landing(image, new_vma);
current->mm->context.vdso = (void __user *)new_vma->vm_start;
+ vdso_futex_robust_unlock_update_ips();
return 0;
}
@@ -189,6 +207,8 @@ static int map_vdso(const struct vdso_im
current->mm->context.vdso = (void __user *)text_start;
current->mm->context.vdso_image = image;
+ vdso_futex_robust_unlock_update_ips();
+
up_fail:
mmap_write_unlock(mm);
return ret;
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -25,6 +25,9 @@ struct vdso_image {
long sym_int80_landing_pad;
long sym_vdso32_sigreturn_landing_pad;
long sym_vdso32_rt_sigreturn_landing_pad;
+ long sym___vdso_futex_robust_try_unlock_cs_start;
+ long sym___vdso_futex_robust_try_unlock_cs_success;
+ long sym___vdso_futex_robust_try_unlock_cs_end;
};
extern const struct vdso_image vdso64_image;
--- a/arch/x86/tools/vdso2c.c
+++ b/arch/x86/tools/vdso2c.c
@@ -75,13 +75,16 @@ struct vdso_sym {
};
struct vdso_sym required_syms[] = {
- {"VDSO32_NOTE_MASK", true},
- {"__kernel_vsyscall", true},
- {"__kernel_sigreturn", true},
- {"__kernel_rt_sigreturn", true},
- {"int80_landing_pad", true},
- {"vdso32_rt_sigreturn_landing_pad", true},
- {"vdso32_sigreturn_landing_pad", true},
+ {"VDSO32_NOTE_MASK", true},
+ {"__kernel_vsyscall", true},
+ {"__kernel_sigreturn", true},
+ {"__kernel_rt_sigreturn", true},
+ {"int80_landing_pad", true},
+ {"vdso32_rt_sigreturn_landing_pad", true},
+ {"vdso32_sigreturn_landing_pad", true},
+ {"__vdso_futex_robust_try_unlock_cs_start", true},
+ {"__vdso_futex_robust_try_unlock_cs_success", true},
+ {"__vdso_futex_robust_try_unlock_cs_end", true},
};
__attribute__((format(printf, 1, 2))) __attribute__((noreturn))
^ permalink raw reply [flat|nested] 57+ messages in thread* [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-16 17:12 [patch 0/8] futex: Address the robust futex unlock race for real Thomas Gleixner
` (6 preceding siblings ...)
2026-03-16 17:13 ` [patch 7/8] x86/vdso: Prepare for robust futex unlock support Thomas Gleixner
@ 2026-03-16 17:13 ` Thomas Gleixner
2026-03-16 19:19 ` Mathieu Desnoyers
` (3 more replies)
7 siblings, 4 replies; 57+ messages in thread
From: Thomas Gleixner @ 2026-03-16 17:13 UTC (permalink / raw)
To: LKML
Cc: Mathieu Desnoyers, André Almeida, Sebastian Andrzej Siewior,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
When the FUTEX_ROBUST_UNLOCK mechanism is used for unlocking (PI-)futexes,
then the unlock sequence in userspace looks like this:
1) robust_list_set_op_pending(mutex);
2) robust_list_remove(mutex);
lval = gettid();
3) if (atomic_try_cmpxchg(&mutex->lock, lval, 0))
4) robust_list_clear_op_pending();
else
5) sys_futex(OP,...FUTEX_ROBUST_UNLOCK);
That still leaves a minimal race window between #3 and #4 where the mutex
could be acquired by some other task which observes that it is the last
user and:
1) unmaps the mutex memory
2) maps a different file, which ends up covering the same address
When then the original task exits before reaching #6 then the kernel robust
list handling observes the pending op entry and tries to fix up user space.
In case that the newly mapped data contains the TID of the exiting thread
at the address of the mutex/futex the kernel will set the owner died bit in
that memory and therefore corrupt unrelated data.
Provide a VDSO function which exposes the critical section window in the
VDSO symbol table. The resulting addresses are updated in the task's mm
when the VDSO is (re)map()'ed.
The core code detects when a task was interrupted within the critical
section and is about to deliver a signal. It then invokes an architecture
specific function which determines whether the pending op pointer has to be
cleared or not. The assembly sequence for the non COMPAT case is:
mov %esi,%eax // Load TID into EAX
xor %ecx,%ecx // Set ECX to 0
lock cmpxchg %ecx,(%rdi) // Try the TID -> 0 transition
.Lstart:
jnz .Lend
movq $0x0,(%rdx) // Clear list_op_pending
.Lend:
ret
So the decision can be simply based on the ZF state in regs->flags.
If COMPAT is enabled then the try_unlock() function needs to take the size
bit in the OP pointer into account, which makes it slightly more complex:
mov %esi,%eax // Load TID into EAX
mov %rdx,%rsi // Get the op pointer
xor %ecx,%ecx // Set ECX to 0
and $0xfffffffffffffffe,%rsi // Clear the size bit
lock cmpxchg %ecx,(%rdi) // Try the TID -> 0 transition
.Lstart:
jnz .Lend
.Lsuccess:
testl $0x1,(%rdx) // Test the size bit
jz .Lop64 // Not set: 64-bit
movl $0x0,(%rsi) // Clear 32-bit
jmp .Lend
.Lop64:
movq $0x0,(%rsi) // Clear 64-bit
.Lend:
ret
The decision function has to check whether regs->ip is in the success
portion as the size bit test obviously modifies ZF too. If it is before
.Lsuccess then ZF contains the cmpxchg() result. If it's at of after
.Lsuccess then the pointer has to be cleared.
The original pointer with the size bit is preserved in RDX so the fixup can
utilize the existing clearing mechanism, which is used by sys_futex().
Arguably this could be avoided by providing separate functions and making
the IP range for the quick check in the exit to user path cover the whole
text section which contains the two functions. But that's not a win at all
because:
1) User space needs to handle the two variants instead of just
relying on a bit which can be saved in the mutex at
initialization time.
2) The fixup decision function has then to evaluate which code path is
used. That just adds more symbols and range checking for no real
value.
The unlock function is inspired by an idea from Mathieu Desnoyers.
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Link: https://lore.kernel.org/20260311185409.1988269-1-mathieu.desnoyers@efficios.com
---
arch/x86/Kconfig | 1
arch/x86/entry/vdso/common/vfutex.c | 72 +++++++++++++++++++++++++++++++
arch/x86/entry/vdso/vdso32/Makefile | 5 +-
arch/x86/entry/vdso/vdso32/vdso32.lds.S | 6 ++
arch/x86/entry/vdso/vdso32/vfutex.c | 1
arch/x86/entry/vdso/vdso64/Makefile | 7 +--
arch/x86/entry/vdso/vdso64/vdso64.lds.S | 6 ++
arch/x86/entry/vdso/vdso64/vdsox32.lds.S | 6 ++
arch/x86/entry/vdso/vdso64/vfutex.c | 1
arch/x86/include/asm/futex_robust.h | 44 ++++++++++++++++++
10 files changed, 144 insertions(+), 5 deletions(-)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -237,6 +237,7 @@ config X86
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select HAVE_EISA if X86_32
select HAVE_EXIT_THREAD
+ select HAVE_FUTEX_ROBUST_UNLOCK
select HAVE_GENERIC_TIF_BITS
select HAVE_GUP_FAST
select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE
--- /dev/null
+++ b/arch/x86/entry/vdso/common/vfutex.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <vdso/futex.h>
+
+/*
+ * Compat enabled kernels have to take the size bit into account to support the
+ * mixed size use case of gaming emulators. Contrary to the kernel robust unlock
+ * mechanism all of this does not test for the 32-bit modifier in 32-bit VDSOs
+ * and in compat disabled kernels. User space can keep the pieces.
+ */
+#if defined(CONFIG_X86_64) && !defined(BUILD_VDSO32_64)
+
+#ifdef CONFIG_COMPAT
+
+# define ASM_CLEAR_PTR \
+ " testl $1, (%[pop]) \n" \
+ " jz .Lop64 \n" \
+ " movl $0, (%[pad]) \n" \
+ " jmp __vdso_futex_robust_try_unlock_cs_end \n" \
+ ".Lop64: \n" \
+ " movq $0, (%[pad]) \n"
+
+# define ASM_PAD_CONSTRAINT ,[pad] "S" (((unsigned long)pop) & ~0x1UL)
+
+#else /* CONFIG_COMPAT */
+
+# define ASM_CLEAR_PTR \
+ " movq $0, (%[pop]) \n"
+
+# define ASM_PAD_CONSTRAINT
+
+#endif /* !CONFIG_COMPAT */
+
+#else /* CONFIG_X86_64 && !BUILD_VDSO32_64 */
+
+# define ASM_CLEAR_PTR \
+ " movl $0, (%[pad]) \n"
+
+# define ASM_PAD_CONSTRAINT ,[pad] "S" (((unsigned long)pop) & ~0x1UL)
+
+#endif /* !CONFIG_X86_64 || BUILD_VDSO32_64 */
+
+uint32_t __vdso_futex_robust_try_unlock(uint32_t *lock, uint32_t tid, void *pop)
+{
+ asm volatile (
+ ".global __vdso_futex_robust_try_unlock_cs_start \n"
+ ".global __vdso_futex_robust_try_unlock_cs_success \n"
+ ".global __vdso_futex_robust_try_unlock_cs_end \n"
+ " \n"
+ " lock cmpxchgl %[val], (%[ptr]) \n"
+ " \n"
+ "__vdso_futex_robust_try_unlock_cs_start: \n"
+ " \n"
+ " jnz __vdso_futex_robust_try_unlock_cs_end \n"
+ " \n"
+ "__vdso_futex_robust_try_unlock_cs_success: \n"
+ " \n"
+ ASM_CLEAR_PTR
+ " \n"
+ "__vdso_futex_robust_try_unlock_cs_end: \n"
+ : [tid] "+a" (tid)
+ : [ptr] "D" (lock),
+ [pop] "d" (pop),
+ [val] "r" (0)
+ ASM_PAD_CONSTRAINT
+ : "memory"
+ );
+
+ return tid;
+}
+
+uint32_t futex_robust_try_unlock(uint32_t *, uint32_t, void **)
+ __attribute__((weak, alias("__vdso_futex_robust_try_unlock")));
--- a/arch/x86/entry/vdso/vdso32/Makefile
+++ b/arch/x86/entry/vdso/vdso32/Makefile
@@ -7,8 +7,9 @@
vdsos-y := 32
# Files to link into the vDSO:
-vobjs-y := note.o vclock_gettime.o vgetcpu.o
-vobjs-y += system_call.o sigreturn.o
+vobjs-y := note.o vclock_gettime.o vgetcpu.o
+vobjs-y += system_call.o sigreturn.o
+vobjs-$(CONFIG_FUTEX_ROBUST_UNLOCK) += vfutex.o
# Compilation flags
flags-y := -DBUILD_VDSO32 -m32 -mregparm=0
--- a/arch/x86/entry/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
@@ -30,6 +30,12 @@ VERSION
__vdso_clock_gettime64;
__vdso_clock_getres_time64;
__vdso_getcpu;
+#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
+ __vdso_futex_robust_try_unlock;
+ __vdso_futex_robust_try_unlock_cs_start;
+ __vdso_futex_robust_try_unlock_cs_success;
+ __vdso_futex_robust_try_unlock_cs_end;
+#endif
};
LINUX_2.5 {
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/vfutex.c
@@ -0,0 +1 @@
+#include "common/vfutex.c"
--- a/arch/x86/entry/vdso/vdso64/Makefile
+++ b/arch/x86/entry/vdso/vdso64/Makefile
@@ -8,9 +8,10 @@ vdsos-y := 64
vdsos-$(CONFIG_X86_X32_ABI) += x32
# Files to link into the vDSO:
-vobjs-y := note.o vclock_gettime.o vgetcpu.o
-vobjs-y += vgetrandom.o vgetrandom-chacha.o
-vobjs-$(CONFIG_X86_SGX) += vsgx.o
+vobjs-y := note.o vclock_gettime.o vgetcpu.o
+vobjs-y += vgetrandom.o vgetrandom-chacha.o
+vobjs-$(CONFIG_X86_SGX) += vsgx.o
+vobjs-$(CONFIG_FUTEX_ROBUST_UNLOCK) += vfutex.o
# Compilation flags
flags-y := -DBUILD_VDSO64 -m64 -mcmodel=small
--- a/arch/x86/entry/vdso/vdso64/vdso64.lds.S
+++ b/arch/x86/entry/vdso/vdso64/vdso64.lds.S
@@ -32,6 +32,12 @@ VERSION {
#endif
getrandom;
__vdso_getrandom;
+#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
+ __vdso_futex_robust_try_unlock;
+ __vdso_futex_robust_try_unlock_cs_start;
+ __vdso_futex_robust_try_unlock_cs_success;
+ __vdso_futex_robust_try_unlock_cs_end;
+#endif
local: *;
};
}
--- a/arch/x86/entry/vdso/vdso64/vdsox32.lds.S
+++ b/arch/x86/entry/vdso/vdso64/vdsox32.lds.S
@@ -22,6 +22,12 @@ VERSION {
__vdso_getcpu;
__vdso_time;
__vdso_clock_getres;
+#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
+ __vdso_futex_robust_try_unlock;
+ __vdso_futex_robust_try_unlock_cs_start;
+ __vdso_futex_robust_try_unlock_cs_success;
+ __vdso_futex_robust_try_unlock_cs_end;
+#endif
local: *;
};
}
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso64/vfutex.c
@@ -0,0 +1 @@
+#include "common/vfutex.c"
--- /dev/null
+++ b/arch/x86/include/asm/futex_robust.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_FUTEX_ROBUST_H
+#define _ASM_X86_FUTEX_ROBUST_H
+
+#include <asm/ptrace.h>
+
+static __always_inline bool x86_futex_needs_robust_unlock_fixup(struct pt_regs *regs)
+{
+ /*
+ * This is tricky in the compat case as it has to take the size check
+ * into account. See the ASM magic in the VDSO vfutex code. If compat is
+ * disabled or this is a 32-bit kernel then ZF is authoritive no matter
+ * what.
+ */
+ if (!IS_ENABLED(CONFIG_X86_64) || !IS_ENABLED(CONFIG_IA32_EMULATION))
+ return !!(regs->flags & X86_EFLAGS_ZF);
+
+ /*
+ * For the compat case, the core code already established that regs->ip
+ * is >= cs_start and < cs_end. Now check whether it is at the
+ * conditional jump which checks the cmpxchg() or if it succeeded and
+ * does the size check, which obviously modifies ZF too.
+ */
+ if (regs->ip >= current->mm->futex.unlock_cs_success_ip)
+ return true;
+ /*
+ * It's at the jnz right after the cmpxchg(). ZF tells whether this
+ * succeeded or not.
+ */
+ return !!(regs->flags & X86_EFLAGS_ZF);
+}
+
+#define arch_futex_needs_robust_unlock_fixup(regs) \
+ x86_futex_needs_robust_unlock_fixup(regs)
+
+static __always_inline void __user *x86_futex_robust_unlock_get_pop(struct pt_regs *regs)
+{
+ return (void __user *)regs->dx;
+}
+
+#define arch_futex_robust_unlock_get_pop(regs) \
+ x86_futex_robust_unlock_get_pop(regs)
+
+#endif /* _ASM_X86_FUTEX_ROBUST_H */
^ permalink raw reply [flat|nested] 57+ messages in thread* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-16 17:13 ` [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock() Thomas Gleixner
@ 2026-03-16 19:19 ` Mathieu Desnoyers
2026-03-16 21:02 ` Thomas Gleixner
` (2 more replies)
2026-03-17 7:25 ` Thomas Weißschuh
` (2 subsequent siblings)
3 siblings, 3 replies; 57+ messages in thread
From: Mathieu Desnoyers @ 2026-03-16 19:19 UTC (permalink / raw)
To: Thomas Gleixner, LKML
Cc: André Almeida, Sebastian Andrzej Siewior,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
On 2026-03-16 13:13, Thomas Gleixner wrote:
> When the FUTEX_ROBUST_UNLOCK mechanism is used for unlocking (PI-)futexes,
> then the unlock sequence in userspace looks like this:
>
> 1) robust_list_set_op_pending(mutex);
> 2) robust_list_remove(mutex);
>
> lval = gettid();
> 3) if (atomic_try_cmpxchg(&mutex->lock, lval, 0))
> 4) robust_list_clear_op_pending();
> else
> 5) sys_futex(OP,...FUTEX_ROBUST_UNLOCK);
[...]
>
> When then the original task exits before reaching #6 then the kernel robust
> list handling observes the pending op entry and tries to fix up user space.
There is no #6.
[...]
> +
> +uint32_t __vdso_futex_robust_try_unlock(uint32_t *lock, uint32_t tid, void *pop)
I'm not sure I see the link between "list_pending_op" and @pop ?
> +{
> + asm volatile (
> + ".global __vdso_futex_robust_try_unlock_cs_start \n"
> + ".global __vdso_futex_robust_try_unlock_cs_success \n"
> + ".global __vdso_futex_robust_try_unlock_cs_end \n"
Those .global are fragile: they depend on making sure the compiler does
not emit those symbols more than once per compile unit (due to
optimizations).
I understand that you want to skip the "iteration on a section" within
the kernel fast path, and I agree with that intent, but I think we can
achieve that goal with more robustness by:
- emitting those as triplets within a dedicated section,
- validating that the section only contains a single triplet within the
vdso2c script.
This would fail immediately in the vsdo2c script if the compiler does
not do as expected rather than silently fail to cover part of the
emitted code range.
[...]
> + : [tid] "+a" (tid)
Could use a few comments:
- "tid" sits in eax.
> + : [ptr] "D" (lock),
- "lock" sits in edi/rdi.
> + [pop] "d" (pop),
This constraint puts the unmasked "pop" pointer into edx/rdx.
> + [val] "r" (0)
> + ASM_PAD_CONSTRAINT
The masked "pop" pointer sits in esi/rsi.
[...]
> + * disabled or this is a 32-bit kernel then ZF is authoritive no matter
authoritative
> +
> +static __always_inline void __user *x86_futex_robust_unlock_get_pop(struct pt_regs *regs)
> +{
> + return (void __user *)regs->dx;
When userspace is compat 32-bit, with a 64-bit kernel, are we sure that
the 32 upper bits are cleared ? If not can we rely on
compat_robust_list_clear_pending to ignore those top bits in
put_user(0U, pop) ?
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
^ permalink raw reply [flat|nested] 57+ messages in thread* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-16 19:19 ` Mathieu Desnoyers
@ 2026-03-16 21:02 ` Thomas Gleixner
2026-03-16 22:35 ` Mathieu Desnoyers
2026-03-16 21:14 ` Thomas Gleixner
2026-03-16 21:29 ` Thomas Gleixner
2 siblings, 1 reply; 57+ messages in thread
From: Thomas Gleixner @ 2026-03-16 21:02 UTC (permalink / raw)
To: Mathieu Desnoyers, LKML
Cc: André Almeida, Sebastian Andrzej Siewior,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
On Mon, Mar 16 2026 at 15:19, Mathieu Desnoyers wrote:
> On 2026-03-16 13:13, Thomas Gleixner wrote:
>> +
>> +static __always_inline void __user *x86_futex_robust_unlock_get_pop(struct pt_regs *regs)
>> +{
>> + return (void __user *)regs->dx;
>
> When userspace is compat 32-bit, with a 64-bit kernel, are we sure that
> the 32 upper bits are cleared ? If not can we rely on
> compat_robust_list_clear_pending to ignore those top bits in
> put_user(0U, pop) ?
Which compat version are you talking about?
1) A 32-bit application which truly runs as compat
2) A 64-bit application which uses both variants and invokes the
64-bit VDSO from a 32-bit program segment
#1 is inherently safe. The 32-bit application uses the compat 32-bit VDSO
which only accesses the lower half of the registers. So the mov $ptr,
%edx results in zero extending the 32-bit value. From the SDM:
"32-bit operands generate a 32-bit result, zero-extended to a
64-bit result in the destination general-purpose register."
The exception/interrupt entry switches into 64-bit mode, but due to
the above the upper 32 bit are 0. So it's safe to just blindly use
regs->dx.
Otherwise it would be pretty impossible to run 32-bit user space on a
64-bit kernel.
#2 can really be assumed to be safe as there must be some magic
translation in the emulation code which handles the different calling
conventions.
That's not any different when 32-bit code which runs in the context
of a 64-bit application invokes a syscall or a library function.
If that goes wrong, then it's not a kernel problem because the
application explicitly tells the kernel to corrupt its own memory.
The golden rule of UNIX applies here as always:
Do what user space asked for unless it results in a boundary
violation which can't be achieved by user space itself.
IOW, let user space shoot itself in the foot when it desires to
do so.
Thanks,
tglx
^ permalink raw reply [flat|nested] 57+ messages in thread* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-16 21:02 ` Thomas Gleixner
@ 2026-03-16 22:35 ` Mathieu Desnoyers
0 siblings, 0 replies; 57+ messages in thread
From: Mathieu Desnoyers @ 2026-03-16 22:35 UTC (permalink / raw)
To: Thomas Gleixner, LKML
Cc: André Almeida, Sebastian Andrzej Siewior,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
On 2026-03-16 17:02, Thomas Gleixner wrote:
> On Mon, Mar 16 2026 at 15:19, Mathieu Desnoyers wrote:
>> On 2026-03-16 13:13, Thomas Gleixner wrote:
>>> +
>>> +static __always_inline void __user *x86_futex_robust_unlock_get_pop(struct pt_regs *regs)
>>> +{
>>> + return (void __user *)regs->dx;
>>
>> When userspace is compat 32-bit, with a 64-bit kernel, are we sure that
>> the 32 upper bits are cleared ? If not can we rely on
>> compat_robust_list_clear_pending to ignore those top bits in
>> put_user(0U, pop) ?
>
> Which compat version are you talking about?
>
> 1) A 32-bit application which truly runs as compat
>
> 2) A 64-bit application which uses both variants and invokes the
> 64-bit VDSO from a 32-bit program segment
>
> #1 is inherently safe. The 32-bit application uses the compat 32-bit VDSO
> which only accesses the lower half of the registers. So the mov $ptr,
> %edx results in zero extending the 32-bit value. From the SDM:
>
> "32-bit operands generate a 32-bit result, zero-extended to a
> 64-bit result in the destination general-purpose register."
Ah, very well, this is the important piece I was missing.
>
> The exception/interrupt entry switches into 64-bit mode, but due to
> the above the upper 32 bit are 0. So it's safe to just blindly use
> regs->dx.
OK.
[...]
> #2 can really be assumed to be safe as there must be some magic
> translation in the emulation code which handles the different calling
> conventions.
[...]
Sounds good,
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-16 19:19 ` Mathieu Desnoyers
2026-03-16 21:02 ` Thomas Gleixner
@ 2026-03-16 21:14 ` Thomas Gleixner
2026-03-16 21:29 ` Thomas Gleixner
2 siblings, 0 replies; 57+ messages in thread
From: Thomas Gleixner @ 2026-03-16 21:14 UTC (permalink / raw)
To: Mathieu Desnoyers, LKML
Cc: André Almeida, Sebastian Andrzej Siewior,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
On Mon, Mar 16 2026 at 15:19, Mathieu Desnoyers wrote:
>> +uint32_t __vdso_futex_robust_try_unlock(uint32_t *lock, uint32_t tid, void *pop)
>
> I'm not sure I see the link between "list_pending_op" and @pop ?
What so hard to understand about that? The function prototype in
include/vdso/futex.h is extensively documented.
> [...]
>> + : [tid] "+a" (tid)
>
> Could use a few comments:
>
> - "tid" sits in eax.
If someone needs a comment to understand the constraint, then that
person definitely should not touch the code. I'm all for documentation
and comments, but documenting the obvious is not useful at all. See:
https://www.kernel.org/doc/html/latest/process/maintainer-tip.html#comment-style
I'm amazed that you complain about these obvious details and not about
the actual lack of a general comment which explains the actual inner
workings of that ASM maze. That would be actually useful for an
inexperienced reader. Interesting preference.
Thanks,
tglx
^ permalink raw reply [flat|nested] 57+ messages in thread* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-16 19:19 ` Mathieu Desnoyers
2026-03-16 21:02 ` Thomas Gleixner
2026-03-16 21:14 ` Thomas Gleixner
@ 2026-03-16 21:29 ` Thomas Gleixner
2 siblings, 0 replies; 57+ messages in thread
From: Thomas Gleixner @ 2026-03-16 21:29 UTC (permalink / raw)
To: Mathieu Desnoyers, LKML
Cc: André Almeida, Sebastian Andrzej Siewior,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
On Mon, Mar 16 2026 at 15:19, Mathieu Desnoyers wrote:
> On 2026-03-16 13:13, Thomas Gleixner wrote:
>> +{
>> + asm volatile (
>> + ".global __vdso_futex_robust_try_unlock_cs_start \n"
>> + ".global __vdso_futex_robust_try_unlock_cs_success \n"
>> + ".global __vdso_futex_robust_try_unlock_cs_end \n"
>
> Those .global are fragile: they depend on making sure the compiler does
> not emit those symbols more than once per compile unit (due to
> optimizations).
This is a single global function which contains the unique ASM code with
those unique symbols. The unique compilation unit containing it ends up
in the VDSO "library".
Q: Which optimizations would cause the compiler to emit them more than
once?
A: None.
If that happens then the compiler is seriously broken and the resulting
VDSO trainwreck is the least of your worries.
That would be equivalent to a single global C function in a unique
compilation unit being emitted more than once.
So what is fragile about that?
Thanks,
tglx
^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-16 17:13 ` [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock() Thomas Gleixner
2026-03-16 19:19 ` Mathieu Desnoyers
@ 2026-03-17 7:25 ` Thomas Weißschuh
2026-03-17 9:51 ` Thomas Gleixner
2026-03-17 8:28 ` Florian Weimer
2026-03-17 15:33 ` Uros Bizjak
3 siblings, 1 reply; 57+ messages in thread
From: Thomas Weißschuh @ 2026-03-17 7:25 UTC (permalink / raw)
To: Thomas Gleixner
Cc: LKML, Mathieu Desnoyers, André Almeida,
Sebastian Andrzej Siewior, Carlos O'Donell, Peter Zijlstra,
Florian Weimer, Rich Felker, Torvald Riegel, Darren Hart,
Ingo Molnar, Davidlohr Bueso, Arnd Bergmann, Liam R . Howlett
On Mon, Mar 16, 2026 at 06:13:34PM +0100, Thomas Gleixner wrote:
(...)
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -237,6 +237,7 @@ config X86
> select HAVE_EFFICIENT_UNALIGNED_ACCESS
> select HAVE_EISA if X86_32
> select HAVE_EXIT_THREAD
> + select HAVE_FUTEX_ROBUST_UNLOCK
> select HAVE_GENERIC_TIF_BITS
> select HAVE_GUP_FAST
> select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE
> --- /dev/null
> +++ b/arch/x86/entry/vdso/common/vfutex.c
> @@ -0,0 +1,72 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +#include <vdso/futex.h>
> +
> +/*
> + * Compat enabled kernels have to take the size bit into account to support the
> + * mixed size use case of gaming emulators. Contrary to the kernel robust unlock
> + * mechanism all of this does not test for the 32-bit modifier in 32-bit VDSOs
> + * and in compat disabled kernels. User space can keep the pieces.
> + */
> +#if defined(CONFIG_X86_64) && !defined(BUILD_VDSO32_64)
#ifndef __x86_64__ ?
> +
> +#ifdef CONFIG_COMPAT
> +
> +# define ASM_CLEAR_PTR \
> + " testl $1, (%[pop]) \n" \
> + " jz .Lop64 \n" \
> + " movl $0, (%[pad]) \n" \
> + " jmp __vdso_futex_robust_try_unlock_cs_end \n" \
> + ".Lop64: \n" \
> + " movq $0, (%[pad]) \n"
> +
> +# define ASM_PAD_CONSTRAINT ,[pad] "S" (((unsigned long)pop) & ~0x1UL)
> +
> +#else /* CONFIG_COMPAT */
> +
> +# define ASM_CLEAR_PTR \
> + " movq $0, (%[pop]) \n"
> +
> +# define ASM_PAD_CONSTRAINT
> +
> +#endif /* !CONFIG_COMPAT */
> +
> +#else /* CONFIG_X86_64 && !BUILD_VDSO32_64 */
> +
> +# define ASM_CLEAR_PTR \
> + " movl $0, (%[pad]) \n"
> +
> +# define ASM_PAD_CONSTRAINT ,[pad] "S" (((unsigned long)pop) & ~0x1UL)
> +
> +#endif /* !CONFIG_X86_64 || BUILD_VDSO32_64 */
> +
> +uint32_t __vdso_futex_robust_try_unlock(uint32_t *lock, uint32_t tid, void *pop)
While uint32_t is originally a userspace type, in the kernel it also pulls in
other internal types which are problematic in the vDSO. __u32 from
uapi/linux/types.h avoid this issue.
(...)
> --- a/arch/x86/entry/vdso/vdso64/vdsox32.lds.S
> +++ b/arch/x86/entry/vdso/vdso64/vdsox32.lds.S
> @@ -22,6 +22,12 @@ VERSION {
> __vdso_getcpu;
> __vdso_time;
> __vdso_clock_getres;
> +#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
> + __vdso_futex_robust_try_unlock;
> + __vdso_futex_robust_try_unlock_cs_start;
> + __vdso_futex_robust_try_unlock_cs_success;
> + __vdso_futex_robust_try_unlock_cs_end;
These three symbols are not meant to be used from outside the vDSO
implementation, so they don't need to be exported by the linkerscripts.
> +#endif
> local: *;
> };
> }
(...)
^ permalink raw reply [flat|nested] 57+ messages in thread* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-17 7:25 ` Thomas Weißschuh
@ 2026-03-17 9:51 ` Thomas Gleixner
2026-03-17 11:17 ` Thomas Weißschuh
0 siblings, 1 reply; 57+ messages in thread
From: Thomas Gleixner @ 2026-03-17 9:51 UTC (permalink / raw)
To: Thomas Weißschuh
Cc: LKML, Mathieu Desnoyers, André Almeida,
Sebastian Andrzej Siewior, Carlos O'Donell, Peter Zijlstra,
Florian Weimer, Rich Felker, Torvald Riegel, Darren Hart,
Ingo Molnar, Davidlohr Bueso, Arnd Bergmann, Liam R . Howlett
On Tue, Mar 17 2026 at 08:25, Thomas Weißschuh wrote:
> On Mon, Mar 16, 2026 at 06:13:34PM +0100, Thomas Gleixner wrote:
>> +/*
>> + * Compat enabled kernels have to take the size bit into account to support the
>> + * mixed size use case of gaming emulators. Contrary to the kernel robust unlock
>> + * mechanism all of this does not test for the 32-bit modifier in 32-bit VDSOs
>> + * and in compat disabled kernels. User space can keep the pieces.
>> + */
>> +#if defined(CONFIG_X86_64) && !defined(BUILD_VDSO32_64)
>
> #ifndef __x86_64__ ?
#ifdef :)
Just had to double check and convince myself that __x86_64__ is set when
building for the X86_X32 ABI. Seems to work.
>> +uint32_t __vdso_futex_robust_try_unlock(uint32_t *lock, uint32_t tid, void *pop)
>
> While uint32_t is originally a userspace type, in the kernel it also pulls in
> other internal types which are problematic in the vDSO. __u32 from
> uapi/linux/types.h avoid this issue.
Sure.
>> --- a/arch/x86/entry/vdso/vdso64/vdsox32.lds.S
>> +++ b/arch/x86/entry/vdso/vdso64/vdsox32.lds.S
>> @@ -22,6 +22,12 @@ VERSION {
>> __vdso_getcpu;
>> __vdso_time;
>> __vdso_clock_getres;
>> +#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
>> + __vdso_futex_robust_try_unlock;
>
>> + __vdso_futex_robust_try_unlock_cs_start;
>> + __vdso_futex_robust_try_unlock_cs_success;
>> + __vdso_futex_robust_try_unlock_cs_end;
>
> These three symbols are not meant to be used from outside the vDSO
> implementation, so they don't need to be exported by the linkerscripts.
You are right in principle and I had it differently in the first place
until I realized that exposing them is useful for debugging and
validation purposes.
As I pointed out in the cover letter you can use GDB to actually verify
the fixup magic. That makes it obviously possible to write a user space
selftest without requiring to decode the internals of the VDSO.
Due to my pretty limited userspace DSO knowledge that was the best I
came up with. If you have a better idea, please let me know.
Thanks,
tglx
^ permalink raw reply [flat|nested] 57+ messages in thread* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-17 9:51 ` Thomas Gleixner
@ 2026-03-17 11:17 ` Thomas Weißschuh
2026-03-18 16:17 ` Thomas Gleixner
0 siblings, 1 reply; 57+ messages in thread
From: Thomas Weißschuh @ 2026-03-17 11:17 UTC (permalink / raw)
To: Thomas Gleixner
Cc: LKML, Mathieu Desnoyers, André Almeida,
Sebastian Andrzej Siewior, Carlos O'Donell, Peter Zijlstra,
Florian Weimer, Rich Felker, Torvald Riegel, Darren Hart,
Ingo Molnar, Davidlohr Bueso, Arnd Bergmann, Liam R . Howlett
On Tue, Mar 17, 2026 at 10:51:47AM +0100, Thomas Gleixner wrote:
> On Tue, Mar 17 2026 at 08:25, Thomas Weißschuh wrote:
> > On Mon, Mar 16, 2026 at 06:13:34PM +0100, Thomas Gleixner wrote:
> >> +/*
> >> + * Compat enabled kernels have to take the size bit into account to support the
> >> + * mixed size use case of gaming emulators. Contrary to the kernel robust unlock
> >> + * mechanism all of this does not test for the 32-bit modifier in 32-bit VDSOs
> >> + * and in compat disabled kernels. User space can keep the pieces.
> >> + */
> >> +#if defined(CONFIG_X86_64) && !defined(BUILD_VDSO32_64)
> >
> > #ifndef __x86_64__ ?
>
> #ifdef :)
Indeed :-)
> Just had to double check and convince myself that __x86_64__ is set when
> building for the X86_X32 ABI. Seems to work.
Afaik this is mandated by the x32 ABI. Together with __ILP32__.
In any case it doesn't matter as the x32 vDSO is not actually built but
instead is a copy of the x86_64 one with its elf type patched around.
(...)
> >> --- a/arch/x86/entry/vdso/vdso64/vdsox32.lds.S
> >> +++ b/arch/x86/entry/vdso/vdso64/vdsox32.lds.S
> >> @@ -22,6 +22,12 @@ VERSION {
> >> __vdso_getcpu;
> >> __vdso_time;
> >> __vdso_clock_getres;
> >> +#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
> >> + __vdso_futex_robust_try_unlock;
> >
> >> + __vdso_futex_robust_try_unlock_cs_start;
> >> + __vdso_futex_robust_try_unlock_cs_success;
> >> + __vdso_futex_robust_try_unlock_cs_end;
> >
> > These three symbols are not meant to be used from outside the vDSO
> > implementation, so they don't need to be exported by the linkerscripts.
>
> You are right in principle and I had it differently in the first place
> until I realized that exposing them is useful for debugging and
> validation purposes.
>
> As I pointed out in the cover letter you can use GDB to actually verify
> the fixup magic. That makes it obviously possible to write a user space
> selftest without requiring to decode the internals of the VDSO.
>
> Due to my pretty limited userspace DSO knowledge that was the best I
> came up with. If you have a better idea, please let me know.
I would have expected GDB to be able to use the separate vDSO debugging
symbols to find these symbols. So far I was not able to make it work,
but I blame my limited GDB knowledge.
Or move the symbols into a dedicated version to make clear that this is
not a stable interface.
Thomas
^ permalink raw reply [flat|nested] 57+ messages in thread* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-17 11:17 ` Thomas Weißschuh
@ 2026-03-18 16:17 ` Thomas Gleixner
2026-03-19 7:41 ` Thomas Weißschuh
0 siblings, 1 reply; 57+ messages in thread
From: Thomas Gleixner @ 2026-03-18 16:17 UTC (permalink / raw)
To: Thomas Weißschuh
Cc: LKML, Mathieu Desnoyers, André Almeida,
Sebastian Andrzej Siewior, Carlos O'Donell, Peter Zijlstra,
Florian Weimer, Rich Felker, Torvald Riegel, Darren Hart,
Ingo Molnar, Davidlohr Bueso, Arnd Bergmann, Liam R . Howlett
On Tue, Mar 17 2026 at 12:17, Thomas Weißschuh wrote:
>> Due to my pretty limited userspace DSO knowledge that was the best I
>> came up with. If you have a better idea, please let me know.
>
> I would have expected GDB to be able to use the separate vDSO debugging
> symbols to find these symbols. So far I was not able to make it work,
> but I blame my limited GDB knowledge.
I got it "working" by manually loading vdso64.so.dbg at the right
offset, which only took about 10 attempts to get it right. Then you can
use actual local symbols.
vdso2c picks them up correctly too.
^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-18 16:17 ` Thomas Gleixner
@ 2026-03-19 7:41 ` Thomas Weißschuh
2026-03-19 8:53 ` Florian Weimer
2026-03-19 10:36 ` Sebastian Andrzej Siewior
0 siblings, 2 replies; 57+ messages in thread
From: Thomas Weißschuh @ 2026-03-19 7:41 UTC (permalink / raw)
To: Thomas Gleixner
Cc: LKML, Mathieu Desnoyers, André Almeida,
Sebastian Andrzej Siewior, Carlos O'Donell, Peter Zijlstra,
Florian Weimer, Rich Felker, Torvald Riegel, Darren Hart,
Ingo Molnar, Davidlohr Bueso, Arnd Bergmann, Liam R . Howlett
On Wed, Mar 18, 2026 at 05:17:38PM +0100, Thomas Gleixner wrote:
> On Tue, Mar 17 2026 at 12:17, Thomas Weißschuh wrote:
> >> Due to my pretty limited userspace DSO knowledge that was the best I
> >> came up with. If you have a better idea, please let me know.
> >
> > I would have expected GDB to be able to use the separate vDSO debugging
> > symbols to find these symbols. So far I was not able to make it work,
> > but I blame my limited GDB knowledge.
>
> I got it "working" by manually loading vdso64.so.dbg at the right
> offset, which only took about 10 attempts to get it right. Then you can
> use actual local symbols.
>
> vdso2c picks them up correctly too.
What also works is to have GDB look up the debug symbols through their
debug ids. At this point the load address of the vDSO is already known.
$ make vdso_install INSTALL_MOD_PATH=$SOME_DIRECTORY
$ gdb -ex "set debug-file-directory $SOME_DIRECTORY/lib/modules/$(uname -r)/vdso" $BINARY
Depending on the distribution the vDSO from the kernel package might already
be set up to be found automatically.
Maybe we could add a helper to scripts/gdb/ which uses $(vdso-install-y)
to either populate a debug-file-directory automatically or hook into the GDB
lookup process to avoid these manual steps.
Thomas
^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-19 7:41 ` Thomas Weißschuh
@ 2026-03-19 8:53 ` Florian Weimer
2026-03-19 9:04 ` Thomas Weißschuh
2026-03-19 9:08 ` Peter Zijlstra
2026-03-19 10:36 ` Sebastian Andrzej Siewior
1 sibling, 2 replies; 57+ messages in thread
From: Florian Weimer @ 2026-03-19 8:53 UTC (permalink / raw)
To: Thomas Weißschuh
Cc: Thomas Gleixner, LKML, Mathieu Desnoyers, André Almeida,
Sebastian Andrzej Siewior, Carlos O'Donell, Peter Zijlstra,
Rich Felker, Torvald Riegel, Darren Hart, Ingo Molnar,
Davidlohr Bueso, Arnd Bergmann, Liam R . Howlett
* Thomas Weißschuh:
> On Wed, Mar 18, 2026 at 05:17:38PM +0100, Thomas Gleixner wrote:
>> On Tue, Mar 17 2026 at 12:17, Thomas Weißschuh wrote:
>> >> Due to my pretty limited userspace DSO knowledge that was the best I
>> >> came up with. If you have a better idea, please let me know.
>> >
>> > I would have expected GDB to be able to use the separate vDSO debugging
>> > symbols to find these symbols. So far I was not able to make it work,
>> > but I blame my limited GDB knowledge.
>>
>> I got it "working" by manually loading vdso64.so.dbg at the right
>> offset, which only took about 10 attempts to get it right. Then you can
>> use actual local symbols.
>>
>> vdso2c picks them up correctly too.
>
> What also works is to have GDB look up the debug symbols through their
> debug ids. At this point the load address of the vDSO is already known.
>
> $ make vdso_install INSTALL_MOD_PATH=$SOME_DIRECTORY
> $ gdb -ex "set debug-file-directory $SOME_DIRECTORY/lib/modules/$(uname -r)/vdso" $BINARY
>
> Depending on the distribution the vDSO from the kernel package might already
> be set up to be found automatically.
>
> Maybe we could add a helper to scripts/gdb/ which uses $(vdso-install-y)
> to either populate a debug-file-directory automatically or hook into the GDB
> lookup process to avoid these manual steps.
If the /lib/modules/$(uname -r)/vdso/vdsoNN.so path is standard, we
can use it in the link map, at the expense of an additional system call
during process startup. It isn't just a performance cost. We've seen
seccomp filters that kill the process on uname calls.
Thanks,
Florian
^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-19 8:53 ` Florian Weimer
@ 2026-03-19 9:04 ` Thomas Weißschuh
2026-03-19 9:08 ` Peter Zijlstra
1 sibling, 0 replies; 57+ messages in thread
From: Thomas Weißschuh @ 2026-03-19 9:04 UTC (permalink / raw)
To: Florian Weimer
Cc: Thomas Gleixner, LKML, Mathieu Desnoyers, André Almeida,
Sebastian Andrzej Siewior, Carlos O'Donell, Peter Zijlstra,
Rich Felker, Torvald Riegel, Darren Hart, Ingo Molnar,
Davidlohr Bueso, Arnd Bergmann, Liam R . Howlett
On Thu, Mar 19, 2026 at 09:53:53AM +0100, Florian Weimer wrote:
> * Thomas Weißschuh:
>
> > On Wed, Mar 18, 2026 at 05:17:38PM +0100, Thomas Gleixner wrote:
> >> On Tue, Mar 17 2026 at 12:17, Thomas Weißschuh wrote:
> >> >> Due to my pretty limited userspace DSO knowledge that was the best I
> >> >> came up with. If you have a better idea, please let me know.
> >> >
> >> > I would have expected GDB to be able to use the separate vDSO debugging
> >> > symbols to find these symbols. So far I was not able to make it work,
> >> > but I blame my limited GDB knowledge.
> >>
> >> I got it "working" by manually loading vdso64.so.dbg at the right
> >> offset, which only took about 10 attempts to get it right. Then you can
> >> use actual local symbols.
> >>
> >> vdso2c picks them up correctly too.
> >
> > What also works is to have GDB look up the debug symbols through their
> > debug ids. At this point the load address of the vDSO is already known.
> >
> > $ make vdso_install INSTALL_MOD_PATH=$SOME_DIRECTORY
> > $ gdb -ex "set debug-file-directory $SOME_DIRECTORY/lib/modules/$(uname -r)/vdso" $BINARY
> >
> > Depending on the distribution the vDSO from the kernel package might already
> > be set up to be found automatically.
> >
> > Maybe we could add a helper to scripts/gdb/ which uses $(vdso-install-y)
> > to either populate a debug-file-directory automatically or hook into the GDB
> > lookup process to avoid these manual steps.
>
> If the the /lib/modules/$(uname -r)/vdso/vdsoNN.so path is standard, we
> can use it in the link map, at the expense of an additional system call
> during process startup. It isn't just a performance cost. We've seen
> seccomp filters that kill the process on uname calls.
It's not a standard, only what 'make vdso_install' ends up doing and some
package managers are copying. Also the file is not guaranteed to be there,
as it is only part of the kernel debug packages, if packaged at all.
So I don't think it makes sense to integrate it into low-level system
components. For debugging, the distros can already make the file available
through /usr/lib/debug/ so it works out-of-the box. (As done by Debian)
Thomas
^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-19 8:53 ` Florian Weimer
2026-03-19 9:04 ` Thomas Weißschuh
@ 2026-03-19 9:08 ` Peter Zijlstra
2026-03-19 23:31 ` Thomas Gleixner
1 sibling, 1 reply; 57+ messages in thread
From: Peter Zijlstra @ 2026-03-19 9:08 UTC (permalink / raw)
To: Florian Weimer
Cc: Thomas Weißschuh, Thomas Gleixner, LKML, Mathieu Desnoyers,
André Almeida, Sebastian Andrzej Siewior,
Carlos O'Donell, Rich Felker, Torvald Riegel, Darren Hart,
Ingo Molnar, Davidlohr Bueso, Arnd Bergmann, Liam R . Howlett
On Thu, Mar 19, 2026 at 09:53:53AM +0100, Florian Weimer wrote:
> If the the /lib/modules/$(uname -r)/vdso/vdsoNN.so path is standard, we
> can use it in the link map, at the expense of an additional system call
> during process startup. It isn't just a performance cost. We've seen
> seccomp filters that kill the process on uname calls.
I've always thought it would be a good idea to expose vdsoNN.so somewhere
in our virtual filesystem, either /proc or /sys or whatever.
^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-19 9:08 ` Peter Zijlstra
@ 2026-03-19 23:31 ` Thomas Gleixner
0 siblings, 0 replies; 57+ messages in thread
From: Thomas Gleixner @ 2026-03-19 23:31 UTC (permalink / raw)
To: Peter Zijlstra, Florian Weimer
Cc: Thomas Weißschuh, LKML, Mathieu Desnoyers,
André Almeida, Sebastian Andrzej Siewior,
Carlos O'Donell, Rich Felker, Torvald Riegel, Darren Hart,
Ingo Molnar, Davidlohr Bueso, Arnd Bergmann, Liam R . Howlett
On Thu, Mar 19 2026 at 10:08, Peter Zijlstra wrote:
> On Thu, Mar 19, 2026 at 09:53:53AM +0100, Florian Weimer wrote:
>> If the the /lib/modules/$(uname -r)/vdso/vdsoNN.so path is standard, we
>> can use it in the link map, at the expense of an additional system call
>> during process startup. It isn't just a performance cost. We've seen
>> seccomp filters that kill the process on uname calls.
>
> I've always though it would be a good idea to expose vdsoNN.so somewhere
> in our virtual filesystem, either /proc or /sys or whatever.
Yes, exposing this through a virtual fs would be really sensible. It's
not a lot of data.
VDSO debug info is ~40K extra per vdsoNN.so and vdsoNN.so.dbg compresses
to ~25k with zstd. Converting that whole thing to a binary blob and baking
it into the kernel is not rocket science.
Thanks,
tglx
^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-19 7:41 ` Thomas Weißschuh
2026-03-19 8:53 ` Florian Weimer
@ 2026-03-19 10:36 ` Sebastian Andrzej Siewior
2026-03-19 10:49 ` Thomas Weißschuh
1 sibling, 1 reply; 57+ messages in thread
From: Sebastian Andrzej Siewior @ 2026-03-19 10:36 UTC (permalink / raw)
To: Thomas Weißschuh
Cc: Thomas Gleixner, LKML, Mathieu Desnoyers, André Almeida,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
On 2026-03-19 08:41:47 [+0100], Thomas Weißschuh wrote:
> > vdso2c picks them up correctly too.
>
> What also works is to have GDB look up the debug symbols through their
> debug ids. At this point the load address of the vDSO is already known.
>
> $ make vdso_install INSTALL_MOD_PATH=$SOME_DIRECTORY
> $ gdb -ex "set debug-file-directory $SOME_DIRECTORY/lib/modules/$(uname -r)/vdso" $BINARY
>
> Depending on the distribution the vDSO from the kernel package might already
> be set up to be found automatically.
>
> Maybe we could add a helper to scripts/gdb/ which uses $(vdso-install-y)
> to either populate a debug-file-directory automatically or hook into the GDB
> lookup process to avoid these manual steps.
Is this a complete vdso.so as mapped in process or just the debug
symbols or both?
Looking at my Debian thingy this seems to be there as of
https://packages.debian.org/sid/amd64/linux-image-6.19.8+deb14-amd64-dbg/filelist
| /usr/lib/debug/lib/modules/6.19.8+deb14-amd64/vdso/vdso32.so
| /usr/lib/debug/lib/modules/6.19.8+deb14-amd64/vdso/vdso64.so
| /usr/lib/debug/lib/modules/6.19.8+deb14-amd64/vdso/vdsox32.so
| /usr/lib/debug/lib/modules/6.19.8+deb14-amd64/vmlinux
or do we talk about other things? Usually there is -dbgsym with the
stripped out debug symbols under /usr/lib/debug/.build-id/ but the
kernel seems different.
> Thomas
Sebastian
^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-19 10:36 ` Sebastian Andrzej Siewior
@ 2026-03-19 10:49 ` Thomas Weißschuh
2026-03-19 10:55 ` Sebastian Andrzej Siewior
0 siblings, 1 reply; 57+ messages in thread
From: Thomas Weißschuh @ 2026-03-19 10:49 UTC (permalink / raw)
To: Sebastian Andrzej Siewior
Cc: Thomas Gleixner, LKML, Mathieu Desnoyers, André Almeida,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
On Thu, Mar 19, 2026 at 11:36:04AM +0100, Sebastian Andrzej Siewior wrote:
> On 2026-03-19 08:41:47 [+0100], Thomas Weißschuh wrote:
> > > vdso2c picks them up correctly too.
> >
> > What also works is to have GDB look up the debug symbols through their
> > debug ids. At this point the load address of the vDSO is already known.
> >
> > $ make vdso_install INSTALL_MOD_PATH=$SOME_DIRECTORY
> > $ gdb -ex "set debug-file-directory $SOME_DIRECTORY/lib/modules/$(uname -r)/vdso" $BINARY
> >
> > Depending on the distribution the vDSO from the kernel package might already
> > be set up to be found automatically.
> >
> > Maybe we could add a helper to scripts/gdb/ which uses $(vdso-install-y)
> > to either populate a debug-file-directory automatically or hook into the GDB
> > lookup process to avoid these manual steps.
>
> Is this a complete vdso.so as mapped in process or just the debug
> symbols or both?
$(vdso-install-y) references full vDSO images including executable code and
debug symbols. The one mapped into userspace is stripped.
> Looking at my Debian thingy this seems to be there as of
> https://packages.debian.org/sid/amd64/linux-image-6.19.8+deb14-amd64-dbg/filelist
>
> | /usr/lib/debug/lib/modules/6.19.8+deb14-amd64/vdso/vdso32.so
> | /usr/lib/debug/lib/modules/6.19.8+deb14-amd64/vdso/vdso64.so
> | /usr/lib/debug/lib/modules/6.19.8+deb14-amd64/vdso/vdsox32.so
> | /usr/lib/debug/lib/modules/6.19.8+deb14-amd64/vmlinux
>
> or do we talk about other things? Usually there is -dbgsym with the
> stripped out debug symbols under /usr/lib/debug/.build-id/ but the
> kernel seems different.
$ dpkg -L linux-image-6.12.74+deb13+1-amd64-dbg | grep -e vdso/vdso -e /usr/lib/debug/.build-id/
/usr/lib/debug/lib/modules/6.12.74+deb13+1-amd64/vdso/vdso32.so
/usr/lib/debug/lib/modules/6.12.74+deb13+1-amd64/vdso/vdso64.so
/usr/lib/debug/lib/modules/6.12.74+deb13+1-amd64/vdso/vdsox32.so
/usr/lib/debug/.build-id/4a/bb1230e4abe0e2d856e1a304b392831ab7a8e1.debug
/usr/lib/debug/.build-id/5f/a3a3ed11e017bcc765ade0997821383a7d4df8.debug
/usr/lib/debug/.build-id/6e/d4f6c60913c24e158bbdfd680b8a1c1b07d8a4.debug
$ readlink /usr/lib/debug/.build-id/4a/bb1230e4abe0e2d856e1a304b392831ab7a8e1.debug
../../lib/modules/6.12.74+deb13+1-amd64/vdso/vdso32.so
This looks as expected to me. It doesn't help for development kernels, though.
Also kbuild 'make deb-pkg' does *not* package these, see also [0].
One thing to note is that Debian does not follow the 'make vdso_install' layout.
Not that this would be a requirement or anything.
[0] https://lore.kernel.org/lkml/20260318-kbuild-pacman-vdso-install-v1-1-48ceb31c0e80@weissschuh.net/
Thomas
^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-19 10:49 ` Thomas Weißschuh
@ 2026-03-19 10:55 ` Sebastian Andrzej Siewior
0 siblings, 0 replies; 57+ messages in thread
From: Sebastian Andrzej Siewior @ 2026-03-19 10:55 UTC (permalink / raw)
To: Thomas Weißschuh
Cc: Thomas Gleixner, LKML, Mathieu Desnoyers, André Almeida,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
On 2026-03-19 11:49:37 [+0100], Thomas Weißschuh wrote:
> $ readlink /usr/lib/debug/.build-id/4a/bb1230e4abe0e2d856e1a304b392831ab7a8e1.debug
> ../../lib/modules/6.12.74+deb13+1-amd64/vdso/vdso32.so
>
> This looks as expected to me. It doesn't help for development kernels, though.
Good to know.
> Also kbuild 'make deb-pkg' does *not* package these, see also [0].
This is not used by Debian kernel building package but by people
building a Debian package. If it is useful maybe add it.
> One thing to note is that Debian does not follow the 'make vdso_install' layout.
> Not that this would be a requirement or anything.
Okay. Well, if it needs to change or anything I know who to poke.
> Thomas
Sebastian
^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-16 17:13 ` [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock() Thomas Gleixner
2026-03-16 19:19 ` Mathieu Desnoyers
2026-03-17 7:25 ` Thomas Weißschuh
@ 2026-03-17 8:28 ` Florian Weimer
2026-03-17 9:36 ` Thomas Gleixner
2026-03-17 15:33 ` Uros Bizjak
3 siblings, 1 reply; 57+ messages in thread
From: Florian Weimer @ 2026-03-17 8:28 UTC (permalink / raw)
To: Thomas Gleixner
Cc: LKML, Mathieu Desnoyers, André Almeida,
Sebastian Andrzej Siewior, Carlos O'Donell, Peter Zijlstra,
Rich Felker, Torvald Riegel, Darren Hart, Ingo Molnar,
Davidlohr Bueso, Arnd Bergmann, Liam R . Howlett
* Thomas Gleixner:
> Arguably this could be avoided by providing separate functions and making
> the IP range for the quick check in the exit to user path cover the whole
> text section which contains the two functions. But that's not a win at all
> because:
>
> 1) User space needs to handle the two variants instead of just
> relying on a bit which can be saved in the mutex at
> initialization time.
I'm pretty sure that on the user-space side, we wouldn't have
cross-word-size operations (e.g., 64-bit code working on both 64-bit and
32-bit robust mutexes). Certainly not within libcs. The other point
about complexity is of course still valid.
Thanks,
Florian
^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-17 8:28 ` Florian Weimer
@ 2026-03-17 9:36 ` Thomas Gleixner
2026-03-17 10:37 ` Florian Weimer
0 siblings, 1 reply; 57+ messages in thread
From: Thomas Gleixner @ 2026-03-17 9:36 UTC (permalink / raw)
To: Florian Weimer
Cc: LKML, Mathieu Desnoyers, André Almeida,
Sebastian Andrzej Siewior, Carlos O'Donell, Peter Zijlstra,
Rich Felker, Torvald Riegel, Darren Hart, Ingo Molnar,
Davidlohr Bueso, Arnd Bergmann, Liam R . Howlett
On Tue, Mar 17 2026 at 09:28, Florian Weimer wrote:
> * Thomas Gleixner:
>
>> Arguably this could be avoided by providing separate functions and making
>> the IP range for the quick check in the exit to user path cover the whole
>> text section which contains the two functions. But that's not a win at all
>> because:
>>
>> 1) User space needs to handle the two variants instead of just
>> relying on a bit which can be saved in the mutex at
>> initialization time.
>
> I'm pretty sure that on the user-space side, we wouldn't have
> cross-word-size operations (e.g., 64-bit code working on both 64-bit and
> 32-bit robust mutexes). Certainly not within libcs. The other point
> about complexity is of course still valid.
Right, I know that no libc implementation supports such an insanity, but
the kernel unfortunately allows to do so and it's used in the wild :(
So we have to deal with it somehow and the size modifier was the most
straight forward solution I could come up with. I'm all ears if someone
has a better idea.
That said, do you see any issue from libc size versus extending the
WAKE/UNLOCK_PI functionality with that UNLOCK_ROBUST functionality?
I did some basic performance tests in the meanwhile with an open coded
mutex implementation. I can't observe any significant difference between
doing the unlock in user space or letting the kernel do it, but that
needs of course more scrutiny.
Thanks,
tglx
^ permalink raw reply [flat|nested] 57+ messages in thread* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-17 9:36 ` Thomas Gleixner
@ 2026-03-17 10:37 ` Florian Weimer
2026-03-17 22:32 ` Thomas Gleixner
0 siblings, 1 reply; 57+ messages in thread
From: Florian Weimer @ 2026-03-17 10:37 UTC (permalink / raw)
To: Thomas Gleixner
Cc: LKML, Mathieu Desnoyers, André Almeida,
Sebastian Andrzej Siewior, Carlos O'Donell, Peter Zijlstra,
Rich Felker, Torvald Riegel, Darren Hart, Ingo Molnar,
Davidlohr Bueso, Arnd Bergmann, Liam R . Howlett
* Thomas Gleixner:
> On Tue, Mar 17 2026 at 09:28, Florian Weimer wrote:
>> * Thomas Gleixner:
>>
>>> Arguably this could be avoided by providing separate functions and making
>>> the IP range for the quick check in the exit to user path cover the whole
>>> text section which contains the two functions. But that's not a win at all
>>> because:
>>>
>>> 1) User space needs to handle the two variants instead of just
>>> relying on a bit which can be saved in the mutex at
>>> initialization time.
>>
>> I'm pretty sure that on the user-space side, we wouldn't have
>> cross-word-size operations (e.g., 64-bit code working on both 64-bit and
>> 32-bit robust mutexes). Certainly not within libcs. The other point
>> about complexity is of course still valid.
>
> Right, I know that no libc implementation supports such an insanity, but
> the kernel unfortunately allows to do so and it's used in the wild :(
>
> So we have to deal with it somehow and the size modifier was the most
> straight forward solution I could come up with. I'm all ears if someone
> has a better idea.
Maybe a separate futex op? And the vDSO would have the futex call,
mangle uaddr2 as required for the shared code section that handles both
ops?
As far as I can tell at this point, the current proposal should work.
We'd probably start with using the syscall-based unlock.
Thanks,
Florian
^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-17 10:37 ` Florian Weimer
@ 2026-03-17 22:32 ` Thomas Gleixner
2026-03-18 22:08 ` Thomas Gleixner
0 siblings, 1 reply; 57+ messages in thread
From: Thomas Gleixner @ 2026-03-17 22:32 UTC (permalink / raw)
To: Florian Weimer
Cc: LKML, Mathieu Desnoyers, André Almeida,
Sebastian Andrzej Siewior, Carlos O'Donell, Peter Zijlstra,
Rich Felker, Torvald Riegel, Darren Hart, Ingo Molnar,
Davidlohr Bueso, Arnd Bergmann, Liam R . Howlett
On Tue, Mar 17 2026 at 11:37, Florian Weimer wrote:
> * Thomas Gleixner:
>> Right, I know that no libc implementation supports such an insanity, but
>> the kernel unfortunately allows to do so and it's used in the wild :(
>>
>> So we have to deal with it somehow and the size modifier was the most
>> straight forward solution I could come up with. I'm all ears if someone
>> has a better idea.
>
> Maybe a separate futex op? And the vDSO would have the futex call,
> mangle uaddr2 as required for the shared code section that handles both
> ops?
>
> As far as I can tell at this point, the current proposal should work.
> We'd probably start with using the syscall-based unlock.
Something like the below compiled but untested delta diff which includes
also the other unrelated feedback fixups?
Thanks,
tglx
---
diff --git a/arch/x86/entry/vdso/common/vfutex.c b/arch/x86/entry/vdso/common/vfutex.c
index 19d8ef130b63..491ed141622d 100644
--- a/arch/x86/entry/vdso/common/vfutex.c
+++ b/arch/x86/entry/vdso/common/vfutex.c
@@ -1,72 +1,218 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <vdso/futex.h>
+/*
+ * Assembly template for the try unlock functions. The basic functionality for
+ * 64-bit is:
+ *
+ * At the call site:
+ * mov &lock, %rdi Store the lock pointer in RDI
+ * mov &pop, %rdx Store the pending op pointer in RDX
+ * mov TID, %esi Store the thread's TID in ESI
+ *
+ * 64-bit unlock function:
+ * mov esi, %eax Move the TID into EAX
+ * xor %ecx, %ecx Clear ECX
+ * lock_cmpxchgl %ecx, (%rdi) Attempt the TID -> 0 transition
+ * .Lcs_start: Start of the critical section
+ * jnz .Lcs_end If cmpxchgl failed jump to the end
+ * .Lcs_success: Start of the success section
+ * movq $0, (%rdx) Set the pending op pointer to 0
+ * .Lcs_end: End of the critical section
+ *
+ * For COMPAT enabled 64-bit kernels this is a bit more complex because the size
+ * of the @pop pointer has to be determined in the success section:
+ *
+ * At the 64-bit call site:
+ * mov &lock, %rdi Store the lock pointer in RDI
+ * mov &pop, %rdx Store the pending op pointer in RDX
+ * mov TID, %esi Store the thread's TID in ESI
+ *
+ * At the 32-bit call site:
+ * mov &lock, %edi Store the lock pointer in EDI
+ * mov &pop, %edx Store the pending op pointer in EDX
+ * mov TID, %esi Store the thread's TID in ESI
+ *
+ * The 32-bit entry point:
+ * or $0x1, %edx Mark the op pointer 32-bit
+ *
+ * Common unlock function:
+ * mov esi, %eax Move the TID into EAX
+ * xor %ecx, %ecx Clear ECX
+ * mov %rdx, %rsi Store the op pointer in RSI
+ * and ~0x1, %rsi Clear the size bit in RSI
+ * lock_cmpxchgl %ecx, (%rdi) Attempt the TID -> 0 transition
+ * .Lcs_start: Start of the critical section
+ * jnz .Lcs_end If cmpxchgl failed jump to the end
+ * .Lcs_success: Start of the success section
+ * test $0x1, %rdx Test the 32-bit size bit in the original pointer
+ * jz .Lop64 If not set, clear 64-bit
+ * movl $0, (%rsi) Set the 32-bit pending op pointer to 0
+ * jmp .Lcs_end Leave the critical section
+ * .Lop64: movq $0, (%rsi) Set the 64-bit pending op pointer to 0
+ * .Lcs_end: End of the critical section
+ *
+ * The 32-bit VDSO needs to set the 32-bit size bit as well to keep the code
+ * compatible for the kernel side fixup function, but it does not require the
+ * size evaluation in the success path.
+ *
+ * At the 32-bit call site:
+ * mov &lock, %edi Store the lock pointer in EDI
+ * mov &pop, %edx Store the pending op pointer in EDX
+ * mov TID, %esi Store the thread's TID in ESI
+ *
+ * The 32-bit entry point does:
+ * or $0x1, %edx Mark the op pointer 32-bit
+ *
+ * 32-bit unlock function:
+ * mov esi, %eax Move the TID into EAX
+ * xor %ecx, %ecx Clear ECX
+ * mov %edx, %esi Store the op pointer in ESI
+ * and ~0x1, %esi Clear the size bit in ESI
+ * lock_cmpxchgl %ecx, (%edi) Attempt the TID -> 0 transition
+ * .Lcs_start: Start of the critical section
+ * jnz .Lcs_end If cmpxchgl failed jump to the end
+ * .Lcs_success: Start of the success section
+ * movl $0, (%esi) Set the 32-bit pending op pointer to 0
+ * .Lcs_end: End of the critical section
+ *
+ * The pointer modification makes sure that the unlock function can determine
+ * the pending op pointer size correctly and clear either 32 or 64 bit.
+ *
+ * The intermediate storage of the unmangled pointer (bit 0 cleared) in [ER]SI
+ * makes sure that the store hits the right address.
+ *
+ * The mangled pointer (bit 0 set for 32-bit) stays in [ER]DX so that the kernel
+ * side fixup function can determine the storage size correctly and always
+ * retrieve regs->rdx without any extra knowledge of the actual code path taken
+ * or checking the compat mode of the task.
+ *
+ * The .Lcs_success label is technically not required for a pure 64-bit and the
+ * 32-bit VDSO but is kept there for simplicity. In those cases the ZF flag in
+ * regs->eflags is authoritative for the whole critical section and no further
+ * evaluation is required.
+ *
+ * In the 64-bit compat case the .Lcs_success label is required because the
+ * pointer size check modifies the ZF flag, which means it is only valid for the
+ * case where .Lcs_start <= regs->ip < .Lcs_success, which is obviously the
+ * same as .Lcs_start == regs->ip for x86.
+ *
+ * That's still a valuable distinction for clarity to keep the ASM template the
+ * same for all cases. This is also a template for other architectures which
+ * might have different requirements even for the non COMPAT case.
+ *
+ * That means in the 64-bit compat case the decision to do the fixup is:
+ *
+ * if (regs->ip >= .Lcs_start && regs->ip < .Lcs_success)
+ * return (regs->eflags & ZF);
+ * return regs->ip < .Lcs_end;
+ *
+ * As the initial critical section check in the return to user space code
+ * already established that:
+ *
+ * .Lcs_start <= regs->ip < .Lcs_end
+ *
+ * that decision can be simplified to:
+ *
+ * return regs->ip >= .Lcs_success || regs->eflags & ZF;
+ *
+ */
+#define robust_try_unlock_asm(__tid, __lock, __pop) \
+ asm volatile ( \
+ ".global __kernel_futex_robust_try_unlock_cs_start \n" \
+ ".global __kernel_futex_robust_try_unlock_cs_success \n" \
+ ".global __kernel_futex_robust_try_unlock_cs_end \n" \
+ " \n" \
+ " lock cmpxchgl %[val], (%[ptr]) \n" \
+ " \n" \
+ "__kernel_futex_robust_try_unlock_cs_start: \n" \
+ " \n" \
+ " jnz __kernel_futex_robust_try_unlock_cs_end \n" \
+ " \n" \
+ "__kernel_futex_robust_try_unlock_cs_success: \n" \
+ " \n" \
+ ASM_CLEAR_PTR \
+ " \n" \
+ "__kernel_futex_robust_try_unlock_cs_end: \n" \
+ : [tid] "+a" (__tid) \
+ : [ptr] "D" (__lock), \
+ [pop] "d" (__pop), \
+ [val] "r" (0) \
+ ASM_PAD_CONSTRAINT(__pop) \
+ : "memory" \
+ )
+
/*
* Compat enabled kernels have to take the size bit into account to support the
* mixed size use case of gaming emulators. Contrary to the kernel robust unlock
* mechanism all of this does not test for the 32-bit modifier in 32-bit VDSOs
* and in compat disabled kernels. User space can keep the pieces.
*/
-#if defined(CONFIG_X86_64) && !defined(BUILD_VDSO32_64)
-
+#ifdef __x86_64__
#ifdef CONFIG_COMPAT
# define ASM_CLEAR_PTR \
" testl $1, (%[pop]) \n" \
" jz .Lop64 \n" \
" movl $0, (%[pad]) \n" \
- " jmp __vdso_futex_robust_try_unlock_cs_end \n" \
+ " jmp __kernel_futex_robust_try_unlock_cs_end \n" \
".Lop64: \n" \
" movq $0, (%[pad]) \n"
-# define ASM_PAD_CONSTRAINT ,[pad] "S" (((unsigned long)pop) & ~0x1UL)
+# define ASM_PAD_CONSTRAINT(__pop) ,[pad] "S" (((unsigned long)__pop) & ~0x1UL)
+
+__u32 noinline __vdso_futex_robust_try_unlock_64(__u32 *lock, __u32 tid, __u64 *pop)
+{
+ robust_try_unlock_asm(lock, tid, pop);
+ return tid;
+}
+
+__u32 noinline __vdso_futex_robust_try_unlock_32(__u32 *lock, __u32 tid, __u32 *pop)
+{
+ __u64 pop_addr = ((u64) pop) | FUTEX_ROBUST_UNLOCK_MOD_32BIT;
+
+ return __vdso_futex_robust_try_unlock_64(lock, tid, (__u64 *)pop_addr);
+}
+
+__u32 futex_robust_try_unlock_64(__u32 *, __u32, __u64 *)
+ __attribute__((weak, alias("__vdso_futex_robust_try_unlock_64")));
+
+__u32 futex_robust_try_unlock_32(__u32 *, __u32, __u32 *)
+ __attribute__((weak, alias("__vdso_futex_robust_try_unlock_32")));
#else /* CONFIG_COMPAT */
# define ASM_CLEAR_PTR \
" movq $0, (%[pop]) \n"
-# define ASM_PAD_CONSTRAINT
+# define ASM_PAD_CONSTRAINT(__pop)
+
+__u32 noinline __vdso_futex_robust_try_unlock_64(__u32 *lock, __u32 tid, __u64 *pop)
+{
+ robust_try_unlock_asm(lock, tid, pop);
+ return tid;
+}
+
+__u32 futex_robust_try_unlock_64(__u32 *, __u32, __u64 *)
+ __attribute__((weak, alias("__vdso_futex_robust_try_unlock_64")));
#endif /* !CONFIG_COMPAT */
-#else /* CONFIG_X86_64 && !BUILD_VDSO32_64 */
+#else /* __x86_64__ */
# define ASM_CLEAR_PTR \
" movl $0, (%[pad]) \n"
-# define ASM_PAD_CONSTRAINT ,[pad] "S" (((unsigned long)pop) & ~0x1UL)
-
-#endif /* !CONFIG_X86_64 || BUILD_VDSO32_64 */
+# define ASM_PAD_CONSTRAINT(__pop) ,[pad] "S" (((unsigned long)__pop) & ~0x1UL)
-uint32_t __vdso_futex_robust_try_unlock(uint32_t *lock, uint32_t tid, void *pop)
+__u32 noinline __vdso_futex_robust_try_unlock_32(__u32 *lock, __u32 tid, __u32 *pop)
{
- asm volatile (
- ".global __vdso_futex_robust_try_unlock_cs_start \n"
- ".global __vdso_futex_robust_try_unlock_cs_success \n"
- ".global __vdso_futex_robust_try_unlock_cs_end \n"
- " \n"
- " lock cmpxchgl %[val], (%[ptr]) \n"
- " \n"
- "__vdso_futex_robust_try_unlock_cs_start: \n"
- " \n"
- " jnz __vdso_futex_robust_try_unlock_cs_end \n"
- " \n"
- "__vdso_futex_robust_try_unlock_cs_success: \n"
- " \n"
- ASM_CLEAR_PTR
- " \n"
- "__vdso_futex_robust_try_unlock_cs_end: \n"
- : [tid] "+a" (tid)
- : [ptr] "D" (lock),
- [pop] "d" (pop),
- [val] "r" (0)
- ASM_PAD_CONSTRAINT
- : "memory"
- );
+ __u32 pop_addr = ((u32) pop) | FUTEX_ROBUST_UNLOCK_MOD_32BIT;
+ robust_try_unlock_asm(lock, tid, (__u32 *)pop_addr);
return tid;
}
-uint32_t futex_robust_try_unlock(uint32_t *, uint32_t, void **)
- __attribute__((weak, alias("__vdso_futex_robust_try_unlock")));
+__u32 futex_robust_try_unlock_32(__u32 *, __u32, __u32 *)
+ __attribute__((weak, alias("__vdso_futex_robust_try_unlock_32")));
+#endif /* !__x86_64__ */
diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
index b027d2f98bd0..cb7b8de8009c 100644
--- a/arch/x86/entry/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
@@ -31,10 +31,10 @@ VERSION
__vdso_clock_getres_time64;
__vdso_getcpu;
#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
- __vdso_futex_robust_try_unlock;
- __vdso_futex_robust_try_unlock_cs_start;
- __vdso_futex_robust_try_unlock_cs_success;
- __vdso_futex_robust_try_unlock_cs_end;
+ __vdso_futex_robust_try_unlock_32;
+ __kernel_futex_robust_try_unlock_cs_start;
+ __kernel_futex_robust_try_unlock_cs_success;
+ __kernel_futex_robust_try_unlock_cs_end;
#endif
};
diff --git a/arch/x86/entry/vdso/vdso64/vdso64.lds.S b/arch/x86/entry/vdso/vdso64/vdso64.lds.S
index e5c0ca9664e1..6dd36ae2ab79 100644
--- a/arch/x86/entry/vdso/vdso64/vdso64.lds.S
+++ b/arch/x86/entry/vdso/vdso64/vdso64.lds.S
@@ -33,10 +33,11 @@ VERSION {
getrandom;
__vdso_getrandom;
#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
- __vdso_futex_robust_try_unlock;
- __vdso_futex_robust_try_unlock_cs_start;
- __vdso_futex_robust_try_unlock_cs_success;
- __vdso_futex_robust_try_unlock_cs_end;
+ __vdso_futex_robust_try_unlock_64;
+ __vdso_futex_robust_try_unlock_32;
+ __kernel_futex_robust_try_unlock_cs_start;
+ __kernel_futex_robust_try_unlock_cs_success;
+ __kernel_futex_robust_try_unlock_cs_end;
#endif
local: *;
};
diff --git a/arch/x86/entry/vdso/vdso64/vdsox32.lds.S b/arch/x86/entry/vdso/vdso64/vdsox32.lds.S
index 4409d97e7ef6..a456f184c937 100644
--- a/arch/x86/entry/vdso/vdso64/vdsox32.lds.S
+++ b/arch/x86/entry/vdso/vdso64/vdsox32.lds.S
@@ -23,7 +23,8 @@ VERSION {
__vdso_time;
__vdso_clock_getres;
#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
- __vdso_futex_robust_try_unlock;
+ __vdso_futex_robust_try_unlock_64;
+ __vdso_futex_robust_try_unlock_32;
__vdso_futex_robust_try_unlock_cs_start;
__vdso_futex_robust_try_unlock_cs_success;
__vdso_futex_robust_try_unlock_cs_end;
diff --git a/include/linux/futex_types.h b/include/linux/futex_types.h
index 223f469789c5..a96293050bf4 100644
--- a/include/linux/futex_types.h
+++ b/include/linux/futex_types.h
@@ -11,13 +11,15 @@ struct futex_pi_state;
struct robust_list_head;
/**
- * struct futex_ctrl - Futex related per task data
+ * struct futex_sched_data - Futex related per task data
* @robust_list: User space registered robust list pointer
* @compat_robust_list: User space registered robust list pointer for compat tasks
+ * @pi_state_list: List head for Priority Inheritance (PI) state management
+ * @pi_state_cache: Pointer to cache one PI state object per task
* @exit_mutex: Mutex for serializing exit
* @state: Futex handling state to handle exit races correctly
*/
-struct futex_ctrl {
+struct futex_sched_data {
struct robust_list_head __user *robust_list;
#ifdef CONFIG_COMPAT
struct compat_robust_list_head __user *compat_robust_list;
@@ -27,9 +29,6 @@ struct futex_ctrl {
struct mutex exit_mutex;
unsigned int state;
};
-#else
-struct futex_ctrl { };
-#endif /* !CONFIG_FUTEX */
/**
* struct futex_mm_data - Futex related per MM data
@@ -71,4 +70,9 @@ struct futex_mm_data {
#endif
};
+#else
+struct futex_sched_data { };
+struct futex_mm_data { };
+#endif /* !CONFIG_FUTEX */
+
#endif /* _LINUX_FUTEX_TYPES_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 266d4859e322..a5d5c0ec3c64 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1329,7 +1329,7 @@ struct task_struct {
u32 rmid;
#endif
- struct futex_ctrl futex;
+ struct futex_sched_data futex;
#ifdef CONFIG_PERF_EVENTS
u8 perf_recursion[PERF_NR_CONTEXTS];
diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index ab9d89748595..e447eaea63f4 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -26,6 +26,7 @@
#define FUTEX_PRIVATE_FLAG 128
#define FUTEX_CLOCK_REALTIME 256
#define FUTEX_UNLOCK_ROBUST 512
+#define FUTEX_ROBUST_LIST32 1024
#define FUTEX_CMD_MASK ~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME | FUTEX_UNLOCK_ROBUST)
#define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
@@ -182,23 +183,6 @@ struct robust_list_head {
#define FUTEX_ROBUST_MOD_PI (0x1UL)
#define FUTEX_ROBUST_MOD_MASK (FUTEX_ROBUST_MOD_PI)
-/*
- * Modifier for FUTEX_ROBUST_UNLOCK uaddr2. Required to distinguish the storage
- * size for the robust_list_head::list_pending_op. This solves two problems:
- *
- * 1) COMPAT tasks
- *
- * 2) The mixed mode magic gaming use case which has both 32-bit and 64-bit
- * robust lists. Oh well....
- *
- * Long story short: 32-bit userspace must set this bit unconditionally to
- * ensure that it can run on a 64-bit kernel in compat mode. If user space
- * screws that up a 64-bit kernel will happily clear the full 64-bits. 32-bit
- * kernels return an error code if the bit is not set.
- */
-#define FUTEX_ROBUST_UNLOCK_MOD_32BIT (0x1UL)
-#define FUTEX_ROBUST_UNLOCK_MOD_MASK (FUTEX_ROBUST_UNLOCK_MOD_32BIT)
-
/*
* bitset with all bits set for the FUTEX_xxx_BITSET OPs to request a
* match of any bit.
diff --git a/include/vdso/futex.h b/include/vdso/futex.h
index 8061bfcb6b92..a768c00b0ada 100644
--- a/include/vdso/futex.h
+++ b/include/vdso/futex.h
@@ -2,12 +2,11 @@
#ifndef _VDSO_FUTEX_H
#define _VDSO_FUTEX_H
-#include <linux/types.h>
-
-struct robust_list;
+#include <uapi/linux/types.h>
/**
- * __vdso_futex_robust_try_unlock - Try to unlock an uncontended robust futex
+ * __vdso_futex_robust_try_unlock_64 - Try to unlock an uncontended robust futex
+ * with a 64-bit op pointer
* @lock: Pointer to the futex lock object
* @tid: The TID of the calling task
* @op: Pointer to the task's robust_list_head::list_pending_op
@@ -39,6 +38,23 @@ struct robust_list;
* @uaddr2 argument for sys_futex(FUTEX_ROBUST_UNLOCK) operations. See the
* modifier and the related documentation in include/uapi/linux/futex.h
*/
-uint32_t __vdso_futex_robust_try_unlock(uint32_t *lock, uint32_t tid, void *op);
+__u32 __vdso_futex_robust_try_unlock_64(__u32 *lock, __u32 tid, __u64 *op);
+
+/**
+ * __vdso_futex_robust_try_unlock_32 - Try to unlock an uncontended robust futex
+ * with a 32-bit op pointer
+ * @lock: Pointer to the futex lock object
+ * @tid: The TID of the calling task
+ * @op: Pointer to the task's robust_list_head::list_pending_op
+ *
+ * Return: The content of *@lock. On success this is the same as @tid.
+ *
+ * Same as __vdso_futex_robust_try_unlock_64() just with a 32-bit @op pointer.
+ */
+__u32 __vdso_futex_robust_try_unlock_32(__u32 *lock, __u32 tid, __u32 *op);
+
+/* Modifier to convey the size of the op pointer */
+#define FUTEX_ROBUST_UNLOCK_MOD_32BIT (0x1UL)
+#define FUTEX_ROBUST_UNLOCK_MOD_MASK (FUTEX_ROBUST_UNLOCK_MOD_32BIT)
#endif
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 7957edd46b89..39041cf94522 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -46,6 +46,8 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
+#include <vdso/futex.h>
+
#include "futex.h"
#include "../locking/rtmutex_common.h"
@@ -1434,17 +1436,9 @@ static void exit_pi_state_list(struct task_struct *curr)
static inline void exit_pi_state_list(struct task_struct *curr) { }
#endif
-static inline bool mask_pop_addr(void __user **pop)
-{
- unsigned long addr = (unsigned long)*pop;
-
- *pop = (void __user *) (addr & ~FUTEX_ROBUST_UNLOCK_MOD_MASK);
- return !!(addr & FUTEX_ROBUST_UNLOCK_MOD_32BIT);
-}
-
-bool futex_robust_list_clear_pending(void __user *pop)
+bool futex_robust_list_clear_pending(void __user *pop, unsigned int flags)
{
- bool size32bit = mask_pop_addr(&pop);
+ bool size32bit = !!(flags & FLAGS_ROBUST_LIST32);
if (!IS_ENABLED(CONFIG_64BIT) && !size32bit)
return false;
@@ -1456,15 +1450,28 @@ bool futex_robust_list_clear_pending(void __user *pop)
}
#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
+static inline bool mask_pop_addr(void __user **pop)
+{
+ unsigned long addr = (unsigned long)*pop;
+
+ *pop = (void __user *) (addr & ~FUTEX_ROBUST_UNLOCK_MOD_MASK);
+ return !!(addr & FUTEX_ROBUST_UNLOCK_MOD_32BIT);
+}
+
void __futex_fixup_robust_unlock(struct pt_regs *regs)
{
+ unsigned int flags = 0;
void __user *pop;
if (!arch_futex_needs_robust_unlock_fixup(regs))
return;
pop = arch_futex_robust_unlock_get_pop(regs);
- futex_robust_list_clear_pending(pop);
+
+ if (mask_pop_addr(&pop))
+ flags = FUTEX_ROBUST_UNLOCK_MOD_32BIT;
+
+ futex_robust_list_clear_pending(pop, flags);
}
#endif /* CONFIG_FUTEX_ROBUST_UNLOCK */
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index b1aaa90f1779..31a5bae8b470 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -41,6 +41,7 @@
#define FLAGS_STRICT 0x0100
#define FLAGS_MPOL 0x0200
#define FLAGS_UNLOCK_ROBUST 0x0400
+#define FLAGS_ROBUST_LIST32 0x0800
/* FUTEX_ to FLAGS_ */
static inline unsigned int futex_to_flags(unsigned int op)
@@ -56,6 +57,9 @@ static inline unsigned int futex_to_flags(unsigned int op)
if (op & FUTEX_UNLOCK_ROBUST)
flags |= FLAGS_UNLOCK_ROBUST;
+ if (op & FUTEX_ROBUST_LIST32)
+ flags |= FLAGS_ROBUST_LIST32;
+
return flags;
}
@@ -452,6 +456,6 @@ extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags, void __user *p
extern int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock);
-bool futex_robust_list_clear_pending(void __user *pop);
+bool futex_robust_list_clear_pending(void __user *pop, unsigned int flags);
#endif /* _FUTEX_H */
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index b8c76b6242e4..05ca360a7a30 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -1298,7 +1298,7 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags, void __user *pop)
if (ret || !(flags & FLAGS_UNLOCK_ROBUST))
return ret;
- if (!futex_robust_list_clear_pending(pop))
+ if (!futex_robust_list_clear_pending(pop, flags))
return -EFAULT;
return 0;
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index 45effcf42961..233f38b1f52e 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -166,7 +166,7 @@ static bool futex_robust_unlock(u32 __user *uaddr, unsigned int flags, void __us
* deeper trouble as the robust list head is usually part of TLS. The
* chance of survival is close to zero.
*/
- return futex_robust_list_clear_pending(pop);
+ return futex_robust_list_clear_pending(pop, flags);
}
/*
^ permalink raw reply related [flat|nested] 57+ messages in thread* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-17 22:32 ` Thomas Gleixner
@ 2026-03-18 22:08 ` Thomas Gleixner
2026-03-18 22:10 ` Peter Zijlstra
2026-03-19 2:05 ` André Almeida
0 siblings, 2 replies; 57+ messages in thread
From: Thomas Gleixner @ 2026-03-18 22:08 UTC (permalink / raw)
To: Florian Weimer
Cc: LKML, Mathieu Desnoyers, André Almeida,
Sebastian Andrzej Siewior, Carlos O'Donell, Peter Zijlstra,
Rich Felker, Torvald Riegel, Darren Hart, Ingo Molnar,
Davidlohr Bueso, Arnd Bergmann, Liam R . Howlett
On Tue, Mar 17 2026 at 23:32, Thomas Gleixner wrote:
> On Tue, Mar 17 2026 at 11:37, Florian Weimer wrote:
> Something like the below compiled but untested delta diff which includes
> also the other unrelated feedback fixups?
The more I look into it, the more I regret that we allowed mixed mode in
the first place. Which way I turn it around the code becomes more
horrible than it really should be.
If I understand it correctly then the only real world use case is the
x86 emulator for ARM64. That made me think about possible options to
keep the insanity restricted.
1) Delegate the problem to the ARM64 people :)
2) Make that mixed size mode depend on a config option
3) Require that such a use case issues a prctl to switch into that
special case mode.
or a combination of those.
Andre?
^ permalink raw reply [flat|nested] 57+ messages in thread* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-18 22:08 ` Thomas Gleixner
@ 2026-03-18 22:10 ` Peter Zijlstra
2026-03-19 2:05 ` André Almeida
1 sibling, 0 replies; 57+ messages in thread
From: Peter Zijlstra @ 2026-03-18 22:10 UTC (permalink / raw)
To: Thomas Gleixner
Cc: Florian Weimer, LKML, Mathieu Desnoyers, André Almeida,
Sebastian Andrzej Siewior, Carlos O'Donell, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
On Wed, Mar 18, 2026 at 11:08:26PM +0100, Thomas Gleixner wrote:
> On Tue, Mar 17 2026 at 23:32, Thomas Gleixner wrote:
> > On Tue, Mar 17 2026 at 11:37, Florian Weimer wrote:
> > Something like the below compiled but untested delta diff which includes
> > also the other unrelated feedback fixups?
>
> The more I look into it, the more I regret that we allowed mixed mode in
> the first place. Which way I turn it around the code becomes more
> horrible than it really should be.
>
> If I understand it correctly then the only real world use case is the
> x86 emulator for ARM64. That made me think about possible options to
> keep the insanity restricted.
>
> 1) Delegate the problem to the ARM64 people :)
>
> 2) Make that mixed size mode depend on a config option
>
> 3) Require that such a use case issues a prctl to switch into that
> special case mode.
>
> or a combination of those.
>
> Andre?
Well, he has this patch set that creates multiple lists, which would
nicely solve things for FEX I reckon.
^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-18 22:08 ` Thomas Gleixner
2026-03-18 22:10 ` Peter Zijlstra
@ 2026-03-19 2:05 ` André Almeida
2026-03-19 7:10 ` Thomas Gleixner
1 sibling, 1 reply; 57+ messages in thread
From: André Almeida @ 2026-03-19 2:05 UTC (permalink / raw)
To: Thomas Gleixner
Cc: LKML, Mathieu Desnoyers, Sebastian Andrzej Siewior,
Florian Weimer, Carlos O'Donell, Peter Zijlstra, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
Em 18/03/2026 19:08, Thomas Gleixner escreveu:
>
> 2) Make that mixed size mode depend on a config option
>
> 3) Require that such a use case issues a prctl to switch into that
> special case mode.
>
> or a combination of those.
>
> Andre?
>
Those two last options work for me, if it helps to make the code more
readable. However, I think that QEMU might be interested in those
features as well :) I'm going to ping them
^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-19 2:05 ` André Almeida
@ 2026-03-19 7:10 ` Thomas Gleixner
0 siblings, 0 replies; 57+ messages in thread
From: Thomas Gleixner @ 2026-03-19 7:10 UTC (permalink / raw)
To: André Almeida
Cc: LKML, Mathieu Desnoyers, Sebastian Andrzej Siewior,
Florian Weimer, Carlos O'Donell, Peter Zijlstra, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
On Wed, Mar 18 2026 at 23:05, André Almeida wrote:
> Em 18/03/2026 19:08, Thomas Gleixner escreveu:
>>
>> 2) Make that mixed size mode depend on a config option
>>
>> 3) Require that such a use case issues a prctl to switch into that
>> special case mode.
>>
>> or a combination of those.
>>
>> Andre?
>>
>
> Those two last options works for me, if it helps to make the code more
> readable. However, I think that QEMU might be interested in those
> features as well :) I'm going to ping them
I already came up with something. It makes the fixup range larger as it
has to cover two functions and then pick the right one.
So the range check becomes:
if (likely(!ip_within(regs, mm->futex.cs_start, mm->futex.cs_end)))
return;
if (likely(!mm->futex.cs_multi)) {
fixup(regs, NULL);
return;
}
csr = mm->futex.cs_ranges;
for (range = 0; range < mm->futex.cs_multi; range++, csr++) {
if (ip_within(regs, csr->cs_start, csr->cs_end)) {
fixup(regs, csr);
return;
}
}
Or something daft like that.
That makes the multi CS range check generic and still optimizes for the
single entry case. The ASM functions become minimal w/o extra pointer
size conditionals.
Thanks,
tglx
^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-16 17:13 ` [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock() Thomas Gleixner
` (2 preceding siblings ...)
2026-03-17 8:28 ` Florian Weimer
@ 2026-03-17 15:33 ` Uros Bizjak
2026-03-18 8:21 ` Thomas Gleixner
3 siblings, 1 reply; 57+ messages in thread
From: Uros Bizjak @ 2026-03-17 15:33 UTC (permalink / raw)
To: Thomas Gleixner, LKML
Cc: Mathieu Desnoyers, André Almeida, Sebastian Andrzej Siewior,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
On 3/16/26 18:13, Thomas Gleixner wrote:
> When the FUTEX_ROBUST_UNLOCK mechanism is used for unlocking (PI-)futexes,
> then the unlock sequence in userspace looks like this:
>
> 1) robust_list_set_op_pending(mutex);
> 2) robust_list_remove(mutex);
>
> lval = gettid();
> 3) if (atomic_try_cmpxchg(&mutex->lock, lval, 0))
> 4) robust_list_clear_op_pending();
> else
> 5) sys_futex(OP,...FUTEX_ROBUST_UNLOCK);
>
> That still leaves a minimal race window between #3 and #4 where the mutex
> could be acquired by some other task which observes that it is the last
> user and:
>
> 1) unmaps the mutex memory
> 2) maps a different file, which ends up covering the same address
>
> When then the original task exits before reaching #6 then the kernel robust
> list handling observes the pending op entry and tries to fix up user space.
>
> In case that the newly mapped data contains the TID of the exiting thread
> at the address of the mutex/futex the kernel will set the owner died bit in
> that memory and therefore corrupt unrelated data.
>
> Provide a VDSO function which exposes the critical section window in the
> VDSO symbol table. The resulting addresses are updated in the task's mm
> when the VDSO is (re)map()'ed.
>
> The core code detects when a task was interrupted within the critical
> section and is about to deliver a signal. It then invokes an architecture
> specific function which determines whether the pending op pointer has to be
> cleared or not. The assembly sequence for the non COMPAT case is:
>
> mov %esi,%eax // Load TID into EAX
> xor %ecx,%ecx // Set ECX to 0
> lock cmpxchg %ecx,(%rdi) // Try the TID -> 0 transition
> .Lstart:
> jnz .Lend
> movq $0x0,(%rdx) // Clear list_op_pending
> .Lend:
> ret
>
> So the decision can be simply based on the ZF state in regs->flags.
>
> If COMPAT is enabled then the try_unlock() function needs to take the size
> bit in the OP pointer into account, which makes it slightly more complex:
>
> mov %esi,%eax // Load TID into EAX
> mov %rdx,%rsi // Get the op pointer
> xor %ecx,%ecx // Set ECX to 0
> and $0xfffffffffffffffe,%rsi // Clear the size bit
> lock cmpxchg %ecx,(%rdi) // Try the TID -> 0 transition
> .Lstart:
> jnz .Lend
> .Lsuccess:
> testl $0x1,(%rdx) // Test the size bit
> jz .Lop64 // Not set: 64-bit
> movl $0x0,(%rsi) // Clear 32-bit
> jmp .Lend
> .Lop64:
> movq $0x0,(%rsi) // Clear 64-bit
> .Lend:
> ret
>
> The decision function has to check whether regs->ip is in the success
> portion as the size bit test obviously modifies ZF too. If it is before
> .Lsuccess then ZF contains the cmpxchg() result. If it's at or after
> .Lsuccess then the pointer has to be cleared.
>
> The original pointer with the size bit is preserved in RDX so the fixup can
> utilize the existing clearing mechanism, which is used by sys_futex().
>
> Arguably this could be avoided by providing separate functions and making
> the IP range for the quick check in the exit to user path cover the whole
> text section which contains the two functions. But that's not a win at all
> because:
>
> 1) User space needs to handle the two variants instead of just
> relying on a bit which can be saved in the mutex at
> initialization time.
>
> 2) The fixup decision function has then to evaluate which code path is
> used. That just adds more symbols and range checking for no real
> value.
>
> The unlock function is inspired by an idea from Mathieu Desnoyers.
>
> Signed-off-by: Thomas Gleixner <tglx@kernel.org>
> Link: https://lore.kernel.org/20260311185409.1988269-1-mathieu.desnoyers@efficios.com
> ---
> arch/x86/Kconfig | 1
> arch/x86/entry/vdso/common/vfutex.c | 72 +++++++++++++++++++++++++++++++
> arch/x86/entry/vdso/vdso32/Makefile | 5 +-
> arch/x86/entry/vdso/vdso32/vdso32.lds.S | 6 ++
> arch/x86/entry/vdso/vdso32/vfutex.c | 1
> arch/x86/entry/vdso/vdso64/Makefile | 7 +--
> arch/x86/entry/vdso/vdso64/vdso64.lds.S | 6 ++
> arch/x86/entry/vdso/vdso64/vdsox32.lds.S | 6 ++
> arch/x86/entry/vdso/vdso64/vfutex.c | 1
> arch/x86/include/asm/futex_robust.h | 44 ++++++++++++++++++
> 10 files changed, 144 insertions(+), 5 deletions(-)
>
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -237,6 +237,7 @@ config X86
> select HAVE_EFFICIENT_UNALIGNED_ACCESS
> select HAVE_EISA if X86_32
> select HAVE_EXIT_THREAD
> + select HAVE_FUTEX_ROBUST_UNLOCK
> select HAVE_GENERIC_TIF_BITS
> select HAVE_GUP_FAST
> select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE
> --- /dev/null
> +++ b/arch/x86/entry/vdso/common/vfutex.c
> @@ -0,0 +1,72 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +#include <vdso/futex.h>
> +
> +/*
> + * Compat enabled kernels have to take the size bit into account to support the
> + * mixed size use case of gaming emulators. Contrary to the kernel robust unlock
> + * mechanism all of this does not test for the 32-bit modifier in 32-bit VDSOs
> + * and in compat disabled kernels. User space can keep the pieces.
> + */
> +#if defined(CONFIG_X86_64) && !defined(BUILD_VDSO32_64)
> +
> +#ifdef CONFIG_COMPAT
The following asm template can be substantially improved.
> +# define ASM_CLEAR_PTR \
> + " testl $1, (%[pop]) \n" \
Please use byte-wide instruction, TESTB with address operand modifier,
"%a[pop]" instead of "(%[pop])":
testb $1, %a[pop]
> + " jz .Lop64 \n" \
> + " movl $0, (%[pad]) \n" \
Here you can reuse zero-valued operand "val" and use address operand
modifier. Please note %k modifier.
movl %k[val], %a[pad]
> + " jmp __vdso_futex_robust_try_unlock_cs_end \n" \
> + ".Lop64: \n" \
> + " movq $0, (%[pad]) \n"
Again, zero-valued operand "val" and address op modifier can be used here:
movq %[val], %a[pad]
> +
> +# define ASM_PAD_CONSTRAINT ,[pad] "S" (((unsigned long)pop) & ~0x1UL)
> +
> +#else /* CONFIG_COMPAT */
> +
> +# define ASM_CLEAR_PTR \
> + " movq $0, (%[pop]) \n"
movq %[val], %a[pop]
> +
> +# define ASM_PAD_CONSTRAINT
> +
> +#endif /* !CONFIG_COMPAT */
> +
> +#else /* CONFIG_X86_64 && !BUILD_VDSO32_64 */
> +
> +# define ASM_CLEAR_PTR \
> + " movl $0, (%[pad]) \n"
movl %[val], %a[pad]
> +
> +# define ASM_PAD_CONSTRAINT ,[pad] "S" (((unsigned long)pop) & ~0x1UL)
> +
> +#endif /* !CONFIG_X86_64 || BUILD_VDSO32_64 */
> +
> +uint32_t __vdso_futex_robust_try_unlock(uint32_t *lock, uint32_t tid, void *pop)
> +{
> + asm volatile (
> + ".global __vdso_futex_robust_try_unlock_cs_start \n"
> + ".global __vdso_futex_robust_try_unlock_cs_success \n"
> + ".global __vdso_futex_robust_try_unlock_cs_end \n"
> + " \n"
> + " lock cmpxchgl %[val], (%[ptr]) \n"
> + " \n"
> + "__vdso_futex_robust_try_unlock_cs_start: \n"
> + " \n"
> + " jnz __vdso_futex_robust_try_unlock_cs_end \n"
> + " \n"
> + "__vdso_futex_robust_try_unlock_cs_success: \n"
> + " \n"
> + ASM_CLEAR_PTR
> + " \n"
> + "__vdso_futex_robust_try_unlock_cs_end: \n"
> + : [tid] "+a" (tid)
You need earlyclobber here "+&a", because not all input arguments are
read before this argument is written.
> + : [ptr] "D" (lock),
> + [pop] "d" (pop),
> + [val] "r" (0)
[val] "r" (0UL), so the correct register width will be used. I'd name
this operand [zero], because 0 lives here, and it will be reused in
several places.
Uros.
> + ASM_PAD_CONSTRAINT
> + : "memory"
> + );
> +
> + return tid;
> +}
> +
> +uint32_t futex_robust_try_unlock(uint32_t *, uint32_t, void **)
> + __attribute__((weak, alias("__vdso_futex_robust_try_unlock")));
> --- a/arch/x86/entry/vdso/vdso32/Makefile
> +++ b/arch/x86/entry/vdso/vdso32/Makefile
> @@ -7,8 +7,9 @@
> vdsos-y := 32
>
> # Files to link into the vDSO:
> -vobjs-y := note.o vclock_gettime.o vgetcpu.o
> -vobjs-y += system_call.o sigreturn.o
> +vobjs-y := note.o vclock_gettime.o vgetcpu.o
> +vobjs-y += system_call.o sigreturn.o
> +vobjs-$(CONFIG_FUTEX_ROBUST_UNLOCK) += vfutex.o
>
> # Compilation flags
> flags-y := -DBUILD_VDSO32 -m32 -mregparm=0
> --- a/arch/x86/entry/vdso/vdso32/vdso32.lds.S
> +++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
> @@ -30,6 +30,12 @@ VERSION
> __vdso_clock_gettime64;
> __vdso_clock_getres_time64;
> __vdso_getcpu;
> +#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
> + __vdso_futex_robust_try_unlock;
> + __vdso_futex_robust_try_unlock_cs_start;
> + __vdso_futex_robust_try_unlock_cs_success;
> + __vdso_futex_robust_try_unlock_cs_end;
> +#endif
> };
>
> LINUX_2.5 {
> --- /dev/null
> +++ b/arch/x86/entry/vdso/vdso32/vfutex.c
> @@ -0,0 +1 @@
> +#include "common/vfutex.c"
> --- a/arch/x86/entry/vdso/vdso64/Makefile
> +++ b/arch/x86/entry/vdso/vdso64/Makefile
> @@ -8,9 +8,10 @@ vdsos-y := 64
> vdsos-$(CONFIG_X86_X32_ABI) += x32
>
> # Files to link into the vDSO:
> -vobjs-y := note.o vclock_gettime.o vgetcpu.o
> -vobjs-y += vgetrandom.o vgetrandom-chacha.o
> -vobjs-$(CONFIG_X86_SGX) += vsgx.o
> +vobjs-y := note.o vclock_gettime.o vgetcpu.o
> +vobjs-y += vgetrandom.o vgetrandom-chacha.o
> +vobjs-$(CONFIG_X86_SGX) += vsgx.o
> +vobjs-$(CONFIG_FUTEX_ROBUST_UNLOCK) += vfutex.o
>
> # Compilation flags
> flags-y := -DBUILD_VDSO64 -m64 -mcmodel=small
> --- a/arch/x86/entry/vdso/vdso64/vdso64.lds.S
> +++ b/arch/x86/entry/vdso/vdso64/vdso64.lds.S
> @@ -32,6 +32,12 @@ VERSION {
> #endif
> getrandom;
> __vdso_getrandom;
> +#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
> + __vdso_futex_robust_try_unlock;
> + __vdso_futex_robust_try_unlock_cs_start;
> + __vdso_futex_robust_try_unlock_cs_success;
> + __vdso_futex_robust_try_unlock_cs_end;
> +#endif
> local: *;
> };
> }
> --- a/arch/x86/entry/vdso/vdso64/vdsox32.lds.S
> +++ b/arch/x86/entry/vdso/vdso64/vdsox32.lds.S
> @@ -22,6 +22,12 @@ VERSION {
> __vdso_getcpu;
> __vdso_time;
> __vdso_clock_getres;
> +#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
> + __vdso_futex_robust_try_unlock;
> + __vdso_futex_robust_try_unlock_cs_start;
> + __vdso_futex_robust_try_unlock_cs_success;
> + __vdso_futex_robust_try_unlock_cs_end;
> +#endif
> local: *;
> };
> }
> --- /dev/null
> +++ b/arch/x86/entry/vdso/vdso64/vfutex.c
> @@ -0,0 +1 @@
> +#include "common/vfutex.c"
> --- /dev/null
> +++ b/arch/x86/include/asm/futex_robust.h
> @@ -0,0 +1,44 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_X86_FUTEX_ROBUST_H
> +#define _ASM_X86_FUTEX_ROBUST_H
> +
> +#include <asm/ptrace.h>
> +
> +static __always_inline bool x86_futex_needs_robust_unlock_fixup(struct pt_regs *regs)
> +{
> + /*
> + * This is tricky in the compat case as it has to take the size check
> + * into account. See the ASM magic in the VDSO vfutex code. If compat is
> + * disabled or this is a 32-bit kernel then ZF is authoritative no matter
> + * what.
> + */
> + if (!IS_ENABLED(CONFIG_X86_64) || !IS_ENABLED(CONFIG_IA32_EMULATION))
> + return !!(regs->flags & X86_EFLAGS_ZF);
> +
> + /*
> + * For the compat case, the core code already established that regs->ip
> + * is >= cs_start and < cs_end. Now check whether it is at the
> + * conditional jump which checks the cmpxchg() or if it succeeded and
> + * does the size check, which obviously modifies ZF too.
> + */
> + if (regs->ip >= current->mm->futex.unlock_cs_success_ip)
> + return true;
> + /*
> + * It's at the jnz right after the cmpxchg(). ZF tells whether this
> + * succeeded or not.
> + */
> + return !!(regs->flags & X86_EFLAGS_ZF);
> +}
> +
> +#define arch_futex_needs_robust_unlock_fixup(regs) \
> + x86_futex_needs_robust_unlock_fixup(regs)
> +
> +static __always_inline void __user *x86_futex_robust_unlock_get_pop(struct pt_regs *regs)
> +{
> + return (void __user *)regs->dx;
> +}
> +
> +#define arch_futex_robust_unlock_get_pop(regs) \
> + x86_futex_robust_unlock_get_pop(regs)
> +
> +#endif /* _ASM_X86_FUTEX_ROBUST_H */
>
^ permalink raw reply [flat|nested] 57+ messages in thread

* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-17 15:33 ` Uros Bizjak
@ 2026-03-18 8:21 ` Thomas Gleixner
2026-03-18 8:32 ` Uros Bizjak
0 siblings, 1 reply; 57+ messages in thread
From: Thomas Gleixner @ 2026-03-18 8:21 UTC (permalink / raw)
To: Uros Bizjak, LKML
Cc: Mathieu Desnoyers, André Almeida, Sebastian Andrzej Siewior,
Carlos O'Donell, Peter Zijlstra, Florian Weimer, Rich Felker,
Torvald Riegel, Darren Hart, Ingo Molnar, Davidlohr Bueso,
Arnd Bergmann, Liam R . Howlett
On Tue, Mar 17 2026 at 16:33, Uros Bizjak wrote:
> On 3/16/26 18:13, Thomas Gleixner wrote:
>> +#ifdef CONFIG_COMPAT
>
> The following asm template can be substantially improved.
In theory.
>> +# define ASM_CLEAR_PTR \
>> + " testl $1, (%[pop]) \n" \
>
> Please use byte-wide instruction, TESTB with address operand modifier,
> "%a[pop]" instead of "(%[pop])":
>
> testb $1, %a[pop]
New fangled %a syntax seems to work. Didn't know about that. Though I'm
not convinced that this is an improvement. At least not for someone who
is used to read/write plain old school ASM for several decades.
>> + " jz .Lop64 \n" \
>> + " movl $0, (%[pad]) \n" \
>
> Here you can reuse zero-valued operand "val" and use address operand
> modifier. Please note %k modifier.
>
> movl %k[val], %a[pad]
...
> movq %[val], %a[pad]
But this one does not and _cannot_ work.
Error: incorrect register `%rcx' used with `l' suffix
Which is obvious because of the initialization:
[val] "r" (0UL)
which makes it explicitly %rcx. If you change the initialization back to
[val] "r" (0) // or (0U)
the failure unsurprisingly becomes
Error: incorrect register `%ecx' used with `q' suffix
So much for the theory....
<SNIP>
>> + ASM_PAD_CONSTRAINT
tons of useless quoted text
>> +#endif /* _ASM_X86_FUTEX_ROBUST_H */
</SNIP>
Can you please trim your replies properly?
Thanks,
tglx
^ permalink raw reply [flat|nested] 57+ messages in thread* Re: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
2026-03-18 8:21 ` Thomas Gleixner
@ 2026-03-18 8:32 ` Uros Bizjak
0 siblings, 0 replies; 57+ messages in thread
From: Uros Bizjak @ 2026-03-18 8:32 UTC (permalink / raw)
To: Thomas Gleixner
Cc: LKML, Mathieu Desnoyers, André Almeida,
Sebastian Andrzej Siewior, Carlos O'Donell, Peter Zijlstra,
Florian Weimer, Rich Felker, Torvald Riegel, Darren Hart,
Ingo Molnar, Davidlohr Bueso, Arnd Bergmann, Liam R . Howlett
On Wed, Mar 18, 2026 at 9:22 AM Thomas Gleixner <tglx@kernel.org> wrote:
>
> On Tue, Mar 17 2026 at 16:33, Uros Bizjak wrote:
> > On 3/16/26 18:13, Thomas Gleixner wrote:
> >> +#ifdef CONFIG_COMPAT
> >
> > The following asm template can be substantially improved.
>
> In theory.
>
> >> +# define ASM_CLEAR_PTR \
> >> + " testl $1, (%[pop]) \n" \
> >
> > Please use byte-wide instruction, TESTB with address operand modifier,
> > "%a[pop]" instead of "(%[pop])":
> >
> > testb $1, %a[pop]
>
> New fangled %a syntax seems to work. Didn't know about that. Though I'm
> not convinced that this is an improvement. At least not for someone who
> is used to read/write plain old school ASM for several decades.
>
> >> + " jz .Lop64 \n" \
> >> + " movl $0, (%[pad]) \n" \
> >
> > Here you can reuse zero-valued operand "val" and use address operand
> > modifier. Please note %k modifier.
> >
> > movl %k[val], %a[pad]
> ...
> > movq %[val], %a[pad]
>
> But this one does not and _cannot_ work.
>
> Error: incorrect register `%rcx' used with `l' suffix
Ah, I missed:
" lock cmpxchgl %[val], (%[ptr]) \n"
Please use %k[val] here and it will work. %k forces %ecx.
> Which is obvious because of the initalization:
>
> [val] "r" (0UL)
You have to force 0 to %rcx for x86_64 when movq is used. The
resulting code (xorl %ecx, %ecx) is the same, but the compiler knows
what type of value is created here.
Uros.
^ permalink raw reply [flat|nested] 57+ messages in thread