From: Thomas Gleixner <tglx@kernel.org>
To: LKML <linux-kernel@vger.kernel.org>
Cc: "Mathieu Desnoyers" <mathieu.desnoyers@efficios.com>,
"André Almeida" <andrealmeid@igalia.com>,
"Sebastian Andrzej Siewior" <bigeasy@linutronix.de>,
"Carlos O'Donell" <carlos@redhat.com>,
"Peter Zijlstra" <peterz@infradead.org>,
"Florian Weimer" <fweimer@redhat.com>,
"Rich Felker" <dalias@aerifal.cx>,
"Torvald Riegel" <triegel@redhat.com>,
"Darren Hart" <dvhart@infradead.org>,
"Ingo Molnar" <mingo@kernel.org>,
"Davidlohr Bueso" <dave@stgolabs.net>,
"Arnd Bergmann" <arnd@arndb.de>,
"Liam R . Howlett" <Liam.Howlett@oracle.com>
Subject: [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()
Date: Mon, 16 Mar 2026 18:13:34 +0100 [thread overview]
Message-ID: <20260316164951.484640267@kernel.org> (raw)
In-Reply-To: 20260316162316.356674433@kernel.org
When the FUTEX_ROBUST_UNLOCK mechanism is used for unlocking (PI-)futexes,
then the unlock sequence in userspace looks like this:
1) robust_list_set_op_pending(mutex);
2) robust_list_remove(mutex);
lval = gettid();
3) if (atomic_try_cmpxchg(&mutex->lock, lval, 0))
4) robust_list_clear_op_pending();
else
5) sys_futex(OP,...FUTEX_ROBUST_UNLOCK);
That still leaves a minimal race window between #3 and #4 where the mutex
could be acquired by some other task which observes that it is the last
user and:
1) unmaps the mutex memory
2) maps a different file, which ends up covering the same address
When the original task exits before reaching #4, the kernel robust
list handling observes the pending op entry and tries to fix up user space.
If the newly mapped data contains the TID of the exiting thread
at the address of the mutex/futex the kernel will set the owner died bit in
that memory and therefore corrupt unrelated data.
Provide a VDSO function which exposes the critical section window in the
VDSO symbol table. The resulting addresses are updated in the task's mm
when the VDSO is (re)map()'ed.
The core code detects when a task was interrupted within the critical
section and is about to deliver a signal. It then invokes an architecture
specific function which determines whether the pending op pointer has to be
cleared or not. The assembly sequence for the non COMPAT case is:
mov %esi,%eax // Load TID into EAX
xor %ecx,%ecx // Set ECX to 0
lock cmpxchg %ecx,(%rdi) // Try the TID -> 0 transition
.Lstart:
jnz .Lend
movq $0x0,(%rdx) // Clear list_op_pending
.Lend:
ret
So the decision can be simply based on the ZF state in regs->flags.
If COMPAT is enabled then the try_unlock() function needs to take the size
bit in the OP pointer into account, which makes it slightly more complex:
mov %esi,%eax // Load TID into EAX
mov %rdx,%rsi // Get the op pointer
xor %ecx,%ecx // Set ECX to 0
and $0xfffffffffffffffe,%rsi // Clear the size bit
lock cmpxchg %ecx,(%rdi) // Try the TID -> 0 transition
.Lstart:
jnz .Lend
.Lsuccess:
testl $0x1,(%rdx) // Test the size bit
jz .Lop64 // Not set: 64-bit
movl $0x0,(%rsi) // Clear 32-bit
jmp .Lend
.Lop64:
movq $0x0,(%rsi) // Clear 64-bit
.Lend:
ret
The decision function has to check whether regs->ip is in the success
portion as the size bit test obviously modifies ZF too. If it is before
.Lsuccess then ZF contains the cmpxchg() result. If it's at or after
.Lsuccess then the pointer has to be cleared.
The original pointer with the size bit is preserved in RDX so the fixup can
utilize the existing clearing mechanism, which is used by sys_futex().
Arguably this could be avoided by providing separate functions and making
the IP range for the quick check in the exit to user path cover the whole
text section which contains the two functions. But that's not a win at all
because:
1) User space needs to handle the two variants instead of just
relying on a bit which can be saved in the mutex at
initialization time.
2) The fixup decision function has then to evaluate which code path is
used. That just adds more symbols and range checking for no real
value.
The unlock function is inspired by an idea from Mathieu Desnoyers.
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Link: https://lore.kernel.org/20260311185409.1988269-1-mathieu.desnoyers@efficios.com
---
arch/x86/Kconfig | 1
arch/x86/entry/vdso/common/vfutex.c | 72 +++++++++++++++++++++++++++++++
arch/x86/entry/vdso/vdso32/Makefile | 5 +-
arch/x86/entry/vdso/vdso32/vdso32.lds.S | 6 ++
arch/x86/entry/vdso/vdso32/vfutex.c | 1
arch/x86/entry/vdso/vdso64/Makefile | 7 +--
arch/x86/entry/vdso/vdso64/vdso64.lds.S | 6 ++
arch/x86/entry/vdso/vdso64/vdsox32.lds.S | 6 ++
arch/x86/entry/vdso/vdso64/vfutex.c | 1
arch/x86/include/asm/futex_robust.h | 44 ++++++++++++++++++
10 files changed, 144 insertions(+), 5 deletions(-)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -237,6 +237,7 @@ config X86
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select HAVE_EISA if X86_32
select HAVE_EXIT_THREAD
+ select HAVE_FUTEX_ROBUST_UNLOCK
select HAVE_GENERIC_TIF_BITS
select HAVE_GUP_FAST
select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE
--- /dev/null
+++ b/arch/x86/entry/vdso/common/vfutex.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <vdso/futex.h>
+
+/*
+ * Compat enabled kernels have to take the size bit into account to support the
+ * mixed size use case of gaming emulators. Contrary to the kernel robust unlock
+ * mechanism all of this does not test for the 32-bit modifier in 32-bit VDSOs
+ * and in compat disabled kernels. User space can keep the pieces.
+ */
+#if defined(CONFIG_X86_64) && !defined(BUILD_VDSO32_64)
+
+#ifdef CONFIG_COMPAT
+
+# define ASM_CLEAR_PTR \
+ " testl $1, (%[pop]) \n" \
+ " jz .Lop64 \n" \
+ " movl $0, (%[pad]) \n" \
+ " jmp __vdso_futex_robust_try_unlock_cs_end \n" \
+ ".Lop64: \n" \
+ " movq $0, (%[pad]) \n"
+
+# define ASM_PAD_CONSTRAINT ,[pad] "S" (((unsigned long)pop) & ~0x1UL)
+
+#else /* CONFIG_COMPAT */
+
+# define ASM_CLEAR_PTR \
+ " movq $0, (%[pop]) \n"
+
+# define ASM_PAD_CONSTRAINT
+
+#endif /* !CONFIG_COMPAT */
+
+#else /* CONFIG_X86_64 && !BUILD_VDSO32_64 */
+
+# define ASM_CLEAR_PTR \
+ " movl $0, (%[pad]) \n"
+
+# define ASM_PAD_CONSTRAINT ,[pad] "S" (((unsigned long)pop) & ~0x1UL)
+
+#endif /* !CONFIG_X86_64 || BUILD_VDSO32_64 */
+
+uint32_t __vdso_futex_robust_try_unlock(uint32_t *lock, uint32_t tid, void *pop)
+{
+ asm volatile (
+ ".global __vdso_futex_robust_try_unlock_cs_start \n"
+ ".global __vdso_futex_robust_try_unlock_cs_success \n"
+ ".global __vdso_futex_robust_try_unlock_cs_end \n"
+ " \n"
+ " lock cmpxchgl %[val], (%[ptr]) \n"
+ " \n"
+ "__vdso_futex_robust_try_unlock_cs_start: \n"
+ " \n"
+ " jnz __vdso_futex_robust_try_unlock_cs_end \n"
+ " \n"
+ "__vdso_futex_robust_try_unlock_cs_success: \n"
+ " \n"
+ ASM_CLEAR_PTR
+ " \n"
+ "__vdso_futex_robust_try_unlock_cs_end: \n"
+ : [tid] "+a" (tid)
+ : [ptr] "D" (lock),
+ [pop] "d" (pop),
+ [val] "r" (0)
+ ASM_PAD_CONSTRAINT
+ : "memory"
+ );
+
+ return tid;
+}
+
+uint32_t futex_robust_try_unlock(uint32_t *, uint32_t, void **)
+ __attribute__((weak, alias("__vdso_futex_robust_try_unlock")));
--- a/arch/x86/entry/vdso/vdso32/Makefile
+++ b/arch/x86/entry/vdso/vdso32/Makefile
@@ -7,8 +7,9 @@
vdsos-y := 32
# Files to link into the vDSO:
-vobjs-y := note.o vclock_gettime.o vgetcpu.o
-vobjs-y += system_call.o sigreturn.o
+vobjs-y := note.o vclock_gettime.o vgetcpu.o
+vobjs-y += system_call.o sigreturn.o
+vobjs-$(CONFIG_FUTEX_ROBUST_UNLOCK) += vfutex.o
# Compilation flags
flags-y := -DBUILD_VDSO32 -m32 -mregparm=0
--- a/arch/x86/entry/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
@@ -30,6 +30,12 @@ VERSION
__vdso_clock_gettime64;
__vdso_clock_getres_time64;
__vdso_getcpu;
+#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
+ __vdso_futex_robust_try_unlock;
+ __vdso_futex_robust_try_unlock_cs_start;
+ __vdso_futex_robust_try_unlock_cs_success;
+ __vdso_futex_robust_try_unlock_cs_end;
+#endif
};
LINUX_2.5 {
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/vfutex.c
@@ -0,0 +1 @@
+#include "common/vfutex.c"
--- a/arch/x86/entry/vdso/vdso64/Makefile
+++ b/arch/x86/entry/vdso/vdso64/Makefile
@@ -8,9 +8,10 @@ vdsos-y := 64
vdsos-$(CONFIG_X86_X32_ABI) += x32
# Files to link into the vDSO:
-vobjs-y := note.o vclock_gettime.o vgetcpu.o
-vobjs-y += vgetrandom.o vgetrandom-chacha.o
-vobjs-$(CONFIG_X86_SGX) += vsgx.o
+vobjs-y := note.o vclock_gettime.o vgetcpu.o
+vobjs-y += vgetrandom.o vgetrandom-chacha.o
+vobjs-$(CONFIG_X86_SGX) += vsgx.o
+vobjs-$(CONFIG_FUTEX_ROBUST_UNLOCK) += vfutex.o
# Compilation flags
flags-y := -DBUILD_VDSO64 -m64 -mcmodel=small
--- a/arch/x86/entry/vdso/vdso64/vdso64.lds.S
+++ b/arch/x86/entry/vdso/vdso64/vdso64.lds.S
@@ -32,6 +32,12 @@ VERSION {
#endif
getrandom;
__vdso_getrandom;
+#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
+ __vdso_futex_robust_try_unlock;
+ __vdso_futex_robust_try_unlock_cs_start;
+ __vdso_futex_robust_try_unlock_cs_success;
+ __vdso_futex_robust_try_unlock_cs_end;
+#endif
local: *;
};
}
--- a/arch/x86/entry/vdso/vdso64/vdsox32.lds.S
+++ b/arch/x86/entry/vdso/vdso64/vdsox32.lds.S
@@ -22,6 +22,12 @@ VERSION {
__vdso_getcpu;
__vdso_time;
__vdso_clock_getres;
+#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
+ __vdso_futex_robust_try_unlock;
+ __vdso_futex_robust_try_unlock_cs_start;
+ __vdso_futex_robust_try_unlock_cs_success;
+ __vdso_futex_robust_try_unlock_cs_end;
+#endif
local: *;
};
}
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso64/vfutex.c
@@ -0,0 +1 @@
+#include "common/vfutex.c"
--- /dev/null
+++ b/arch/x86/include/asm/futex_robust.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_FUTEX_ROBUST_H
+#define _ASM_X86_FUTEX_ROBUST_H
+
+#include <asm/ptrace.h>
+
+static __always_inline bool x86_futex_needs_robust_unlock_fixup(struct pt_regs *regs)
+{
+ /*
+ * This is tricky in the compat case as it has to take the size check
+ * into account. See the ASM magic in the VDSO vfutex code. If compat is
+ * disabled or this is a 32-bit kernel then ZF is authoritative no matter
+ * what.
+ */
+ if (!IS_ENABLED(CONFIG_X86_64) || !IS_ENABLED(CONFIG_IA32_EMULATION))
+ return !!(regs->flags & X86_EFLAGS_ZF);
+
+ /*
+ * For the compat case, the core code already established that regs->ip
+ * is >= cs_start and < cs_end. Now check whether it is at the
+ * conditional jump which checks the cmpxchg() or if it succeeded and
+ * does the size check, which obviously modifies ZF too.
+ */
+ if (regs->ip >= current->mm->futex.unlock_cs_success_ip)
+ return true;
+ /*
+ * It's at the jnz right after the cmpxchg(). ZF tells whether this
+ * succeeded or not.
+ */
+ return !!(regs->flags & X86_EFLAGS_ZF);
+}
+
+#define arch_futex_needs_robust_unlock_fixup(regs) \
+ x86_futex_needs_robust_unlock_fixup(regs)
+
+static __always_inline void __user *x86_futex_robust_unlock_get_pop(struct pt_regs *regs)
+{
+ return (void __user *)regs->dx;
+}
+
+#define arch_futex_robust_unlock_get_pop(regs) \
+ x86_futex_robust_unlock_get_pop(regs)
+
+#endif /* _ASM_X86_FUTEX_ROBUST_H */
next prev parent reply other threads:[~2026-03-16 17:13 UTC|newest]
Thread overview: 57+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-16 17:12 [patch 0/8] futex: Address the robust futex unlock race for real Thomas Gleixner
2026-03-16 17:12 ` [patch 1/8] futex: Move futex task related data into a struct Thomas Gleixner
2026-03-16 17:55 ` Mathieu Desnoyers
2026-03-17 2:24 ` André Almeida
2026-03-17 9:52 ` Thomas Gleixner
2026-03-16 17:13 ` [patch 2/8] futex: Move futex related mm_struct " Thomas Gleixner
2026-03-16 18:00 ` Mathieu Desnoyers
2026-03-16 17:13 ` [patch 3/8] futex: Provide UABI defines for robust list entry modifiers Thomas Gleixner
2026-03-16 18:02 ` Mathieu Desnoyers
2026-03-17 2:38 ` André Almeida
2026-03-17 9:53 ` Thomas Gleixner
2026-03-16 17:13 ` [patch 4/8] futex: Add support for unlocking robust futexes Thomas Gleixner
2026-03-16 18:24 ` Mathieu Desnoyers
2026-03-17 16:17 ` André Almeida
2026-03-17 20:46 ` Peter Zijlstra
2026-03-17 22:40 ` Thomas Gleixner
2026-03-18 8:02 ` Peter Zijlstra
2026-03-18 8:06 ` Florian Weimer
2026-03-18 14:47 ` Peter Zijlstra
2026-03-18 16:03 ` Thomas Gleixner
2026-03-16 17:13 ` [patch 5/8] futex: Add robust futex unlock IP range Thomas Gleixner
2026-03-16 18:36 ` Mathieu Desnoyers
2026-03-17 19:19 ` André Almeida
2026-03-16 17:13 ` [patch 6/8] futex: Provide infrastructure to plug the non contended robust futex unlock race Thomas Gleixner
2026-03-16 18:35 ` Mathieu Desnoyers
2026-03-16 20:29 ` Thomas Gleixner
2026-03-16 20:52 ` Mathieu Desnoyers
2026-03-16 17:13 ` [patch 7/8] x86/vdso: Prepare for robust futex unlock support Thomas Gleixner
2026-03-16 17:13 ` Thomas Gleixner [this message]
2026-03-16 19:19 ` [patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock() Mathieu Desnoyers
2026-03-16 21:02 ` Thomas Gleixner
2026-03-16 22:35 ` Mathieu Desnoyers
2026-03-16 21:14 ` Thomas Gleixner
2026-03-16 21:29 ` Thomas Gleixner
2026-03-17 7:25 ` Thomas Weißschuh
2026-03-17 9:51 ` Thomas Gleixner
2026-03-17 11:17 ` Thomas Weißschuh
2026-03-18 16:17 ` Thomas Gleixner
2026-03-19 7:41 ` Thomas Weißschuh
2026-03-19 8:53 ` Florian Weimer
2026-03-19 9:04 ` Thomas Weißschuh
2026-03-19 9:08 ` Peter Zijlstra
2026-03-19 23:31 ` Thomas Gleixner
2026-03-19 10:36 ` Sebastian Andrzej Siewior
2026-03-19 10:49 ` Thomas Weißschuh
2026-03-19 10:55 ` Sebastian Andrzej Siewior
2026-03-17 8:28 ` Florian Weimer
2026-03-17 9:36 ` Thomas Gleixner
2026-03-17 10:37 ` Florian Weimer
2026-03-17 22:32 ` Thomas Gleixner
2026-03-18 22:08 ` Thomas Gleixner
2026-03-18 22:10 ` Peter Zijlstra
2026-03-19 2:05 ` André Almeida
2026-03-19 7:10 ` Thomas Gleixner
2026-03-17 15:33 ` Uros Bizjak
2026-03-18 8:21 ` Thomas Gleixner
2026-03-18 8:32 ` Uros Bizjak
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260316164951.484640267@kernel.org \
--to=tglx@kernel.org \
--cc=Liam.Howlett@oracle.com \
--cc=andrealmeid@igalia.com \
--cc=arnd@arndb.de \
--cc=bigeasy@linutronix.de \
--cc=carlos@redhat.com \
--cc=dalias@aerifal.cx \
--cc=dave@stgolabs.net \
--cc=dvhart@infradead.org \
--cc=fweimer@redhat.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mathieu.desnoyers@efficios.com \
--cc=mingo@kernel.org \
--cc=peterz@infradead.org \
--cc=triegel@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox