All of lore.kernel.org
 help / color / mirror / Atom feed
* [syzbot] [kernel?] general protection fault in try_to_wake_up (3)
@ 2025-09-02 20:54 syzbot
  2025-09-02 21:46 ` Peter Zijlstra
  0 siblings, 1 reply; 12+ messages in thread
From: syzbot @ 2025-09-02 20:54 UTC (permalink / raw)
  To: andrealmeid, dave, dvhart, linux-kernel, mingo, peterz,
	syzkaller-bugs, tglx

Hello,

syzbot found the following issue on:

HEAD commit:    5c3b3264e585 Merge tag 'x86_urgent_for_v6.17_rc4' of git:/..
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=12e1ae34580000
kernel config:  https://syzkaller.appspot.com/x/.config?x=bd9738e00c1bbfb4
dashboard link: https://syzkaller.appspot.com/bug?extid=034246a838a10d181e78
compiler:       Debian clang version 20.1.8 (++20250708063551+0c9f909b7976-1~exp1~20250708183702.136), Debian LLD 20.1.8
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=10f6a1f0580000

Downloadable assets:
disk image: https://storage.googleapis.com/syzbot-assets/37953b384dff/disk-5c3b3264.raw.xz
vmlinux: https://storage.googleapis.com/syzbot-assets/df5cc1c4e51d/vmlinux-5c3b3264.xz
kernel image: https://storage.googleapis.com/syzbot-assets/2ed6195eae9f/bzImage-5c3b3264.xz

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+034246a838a10d181e78@syzkaller.appspotmail.com

Oops: general protection fault, probably for non-canonical address 0xdffffc000000014b: 0000 [#1] SMP KASAN PTI
KASAN: null-ptr-deref in range [0x0000000000000a58-0x0000000000000a5f]
CPU: 1 UID: 0 PID: 6293 Comm: syz.0.60 Not tainted syzkaller #0 PREEMPT_{RT,(full)} 
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/12/2025
RIP: 0010:kasan_byte_accessible+0x12/0x30 mm/kasan/generic.c:199
Code: 0f 1f 84 00 00 00 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 66 0f 1f 00 48 c1 ef 03 48 b8 00 00 00 00 00 fc ff df <0f> b6 04 07 3c 08 0f 92 c0 e9 d0 9f dc 08 cc 66 66 66 66 66 66 2e
RSP: 0018:ffffc9000157f7e0 EFLAGS: 00010006
RAX: dffffc0000000000 RBX: ffffffff8af9dfe7 RCX: e1dbfc1ee2ae4a00
RDX: 0000000000000000 RSI: ffffffff8af9dfe7 RDI: 000000000000014b
RBP: ffffffff81908477 R08: 0000000000000001 R09: 0000000000000000
R10: dffffc0000000000 R11: fffffbfff1e3a947 R12: 0000000000000000
R13: 0000000000000a58 R14: 0000000000000a58 R15: 0000000000000001
FS:  00007ff6ed61d6c0(0000) GS:ffff8881269c2000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007ff6ed61cf40 CR3: 0000000027554000 CR4: 00000000003526f0
Call Trace:
 <TASK>
 __kasan_check_byte+0x12/0x40 mm/kasan/common.c:567
 kasan_check_byte include/linux/kasan.h:399 [inline]
 lock_acquire+0x8d/0x360 kernel/locking/lockdep.c:5842
 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
 _raw_spin_lock_irqsave+0xa7/0xf0 kernel/locking/spinlock.c:162
 class_raw_spinlock_irqsave_constructor include/linux/spinlock.h:557 [inline]
 try_to_wake_up+0x67/0x12b0 kernel/sched/core.c:4216
 requeue_pi_wake_futex+0x24b/0x2f0 kernel/futex/requeue.c:249
 futex_proxy_trylock_atomic kernel/futex/requeue.c:340 [inline]
 futex_requeue+0x135f/0x1870 kernel/futex/requeue.c:498
 do_futex+0x362/0x420 kernel/futex/syscalls.c:-1
 __do_sys_futex kernel/futex/syscalls.c:179 [inline]
 __se_sys_futex+0x36f/0x400 kernel/futex/syscalls.c:160
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7ff6edfcebe9
Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007ff6ed61d038 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca
RAX: ffffffffffffffda RBX: 00007ff6ee206090 RCX: 00007ff6edfcebe9
RDX: 0000000000000001 RSI: 000000000000000c RDI: 000020000000cffc
RBP: 00007ff6ee051e19 R08: 0000200000048000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007ff6ee206128 R14: 00007ff6ee206090 R15: 00007ffd53c7a368
 </TASK>
Modules linked in:
---[ end trace 0000000000000000 ]---
RIP: 0010:kasan_byte_accessible+0x12/0x30 mm/kasan/generic.c:199
Code: 0f 1f 84 00 00 00 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 66 0f 1f 00 48 c1 ef 03 48 b8 00 00 00 00 00 fc ff df <0f> b6 04 07 3c 08 0f 92 c0 e9 d0 9f dc 08 cc 66 66 66 66 66 66 2e
RSP: 0018:ffffc9000157f7e0 EFLAGS: 00010006
RAX: dffffc0000000000 RBX: ffffffff8af9dfe7 RCX: e1dbfc1ee2ae4a00
RDX: 0000000000000000 RSI: ffffffff8af9dfe7 RDI: 000000000000014b
RBP: ffffffff81908477 R08: 0000000000000001 R09: 0000000000000000
R10: dffffc0000000000 R11: fffffbfff1e3a947 R12: 0000000000000000
R13: 0000000000000a58 R14: 0000000000000a58 R15: 0000000000000001
FS:  00007ff6ed61d6c0(0000) GS:ffff8881269c2000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007ff6ed61cf40 CR3: 0000000027554000 CR4: 00000000003526f0
----------------
Code disassembly (best guess):
   0:	0f 1f 84 00 00 00 00 	nopl   0x0(%rax,%rax,1)
   7:	00
   8:	90                   	nop
   9:	90                   	nop
   a:	90                   	nop
   b:	90                   	nop
   c:	90                   	nop
   d:	90                   	nop
   e:	90                   	nop
   f:	90                   	nop
  10:	90                   	nop
  11:	90                   	nop
  12:	90                   	nop
  13:	90                   	nop
  14:	90                   	nop
  15:	90                   	nop
  16:	90                   	nop
  17:	90                   	nop
  18:	66 0f 1f 00          	nopw   (%rax)
  1c:	48 c1 ef 03          	shr    $0x3,%rdi
  20:	48 b8 00 00 00 00 00 	movabs $0xdffffc0000000000,%rax
  27:	fc ff df
* 2a:	0f b6 04 07          	movzbl (%rdi,%rax,1),%eax <-- trapping instruction
  2e:	3c 08                	cmp    $0x8,%al
  30:	0f 92 c0             	setb   %al
  33:	e9 d0 9f dc 08       	jmp    0x8dca008
  38:	cc                   	int3
  39:	66                   	data16
  3a:	66                   	data16
  3b:	66                   	data16
  3c:	66                   	data16
  3d:	66                   	data16
  3e:	66                   	data16
  3f:	2e                   	cs


---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.

If the report is already addressed, let syzbot know by replying with:
#syz fix: exact-commit-title

If you want syzbot to run the reproducer, reply with:
#syz test: git://repo/address.git branch-or-commit-hash
If you attach or paste a git patch, syzbot will apply it before testing.

If you want to overwrite report's subsystems, reply with:
#syz set subsystems: new-subsystem
(See the list of subsystem names on the web dashboard)

If the report is a duplicate of another one, reply with:
#syz dup: exact-subject-of-another-report

If you want to undo deduplication, reply with:
#syz undup

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [syzbot] [kernel?] general protection fault in try_to_wake_up (3)
  2025-09-02 20:54 [syzbot] [kernel?] general protection fault in try_to_wake_up (3) syzbot
@ 2025-09-02 21:46 ` Peter Zijlstra
  2025-09-03 13:07   ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 12+ messages in thread
From: Peter Zijlstra @ 2025-09-02 21:46 UTC (permalink / raw)
  To: syzbot
  Cc: andrealmeid, dave, dvhart, linux-kernel, mingo, syzkaller-bugs,
	tglx, Sebastian Andrzej Siewior

On Tue, Sep 02, 2025 at 01:54:33PM -0700, syzbot wrote:
> Hello,
> 
> syzbot found the following issue on:
> 
> HEAD commit:    5c3b3264e585 Merge tag 'x86_urgent_for_v6.17_rc4' of git:/..
> git tree:       upstream
> console output: https://syzkaller.appspot.com/x/log.txt?x=12e1ae34580000
> kernel config:  https://syzkaller.appspot.com/x/.config?x=bd9738e00c1bbfb4
> dashboard link: https://syzkaller.appspot.com/bug?extid=034246a838a10d181e78
> compiler:       Debian clang version 20.1.8 (++20250708063551+0c9f909b7976-1~exp1~20250708183702.136), Debian LLD 20.1.8
> syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=10f6a1f0580000
> 
> Downloadable assets:
> disk image: https://storage.googleapis.com/syzbot-assets/37953b384dff/disk-5c3b3264.raw.xz
> vmlinux: https://storage.googleapis.com/syzbot-assets/df5cc1c4e51d/vmlinux-5c3b3264.xz
> kernel image: https://storage.googleapis.com/syzbot-assets/2ed6195eae9f/bzImage-5c3b3264.xz
> 
> IMPORTANT: if you fix the issue, please add the following tag to the commit:
> Reported-by: syzbot+034246a838a10d181e78@syzkaller.appspotmail.com
> 
> Oops: general protection fault, probably for non-canonical address 0xdffffc000000014b: 0000 [#1] SMP KASAN PTI
> KASAN: null-ptr-deref in range [0x0000000000000a58-0x0000000000000a5f]

When I build the provided .config with clang-20, that a58 offset is
exactly task_struct::pi_lock::lockdep_map, which nicely corresponds with
the below stacktrace, and seems to suggest someone did:
try_to_wake_up(NULL).

> CPU: 1 UID: 0 PID: 6293 Comm: syz.0.60 Not tainted syzkaller #0 PREEMPT_{RT,(full)} 
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/12/2025
> RIP: 0010:kasan_byte_accessible+0x12/0x30 mm/kasan/generic.c:199
> Code: 0f 1f 84 00 00 00 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 66 0f 1f 00 48 c1 ef 03 48 b8 00 00 00 00 00 fc ff df <0f> b6 04 07 3c 08 0f 92 c0 e9 d0 9f dc 08 cc 66 66 66 66 66 66 2e
> RSP: 0018:ffffc9000157f7e0 EFLAGS: 00010006
> RAX: dffffc0000000000 RBX: ffffffff8af9dfe7 RCX: e1dbfc1ee2ae4a00
> RDX: 0000000000000000 RSI: ffffffff8af9dfe7 RDI: 000000000000014b
> RBP: ffffffff81908477 R08: 0000000000000001 R09: 0000000000000000
> R10: dffffc0000000000 R11: fffffbfff1e3a947 R12: 0000000000000000
> R13: 0000000000000a58 R14: 0000000000000a58 R15: 0000000000000001
> FS:  00007ff6ed61d6c0(0000) GS:ffff8881269c2000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00007ff6ed61cf40 CR3: 0000000027554000 CR4: 00000000003526f0
> Call Trace:
>  <TASK>
>  __kasan_check_byte+0x12/0x40 mm/kasan/common.c:567
>  kasan_check_byte include/linux/kasan.h:399 [inline]
>  lock_acquire+0x8d/0x360 kernel/locking/lockdep.c:5842
>  __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
>  _raw_spin_lock_irqsave+0xa7/0xf0 kernel/locking/spinlock.c:162
>  class_raw_spinlock_irqsave_constructor include/linux/spinlock.h:557 [inline]
>  try_to_wake_up+0x67/0x12b0 kernel/sched/core.c:4216
>  requeue_pi_wake_futex+0x24b/0x2f0 kernel/futex/requeue.c:249

Trouble is, we've not changed the requeue bits in a fair while... So I'm
somewhat confused on how this happens now ?!

>  futex_proxy_trylock_atomic kernel/futex/requeue.c:340 [inline]
>  futex_requeue+0x135f/0x1870 kernel/futex/requeue.c:498
>  do_futex+0x362/0x420 kernel/futex/syscalls.c:-1
>  __do_sys_futex kernel/futex/syscalls.c:179 [inline]
>  __se_sys_futex+0x36f/0x400 kernel/futex/syscalls.c:160
>  do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
>  do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94
>  entry_SYSCALL_64_after_hwframe+0x77/0x7f

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [syzbot] [kernel?] general protection fault in try_to_wake_up (3)
  2025-09-02 21:46 ` Peter Zijlstra
@ 2025-09-03 13:07   ` Sebastian Andrzej Siewior
  2025-09-03 18:51     ` Jens Axboe
  0 siblings, 1 reply; 12+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-09-03 13:07 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: syzbot, andrealmeid, dave, dvhart, linux-kernel, mingo,
	syzkaller-bugs, tglx, Jens Axboe

+Jens

On 2025-09-02 23:46:28 [+0200], Peter Zijlstra wrote:
> When I build the provided .config with clang-20, that a58 offset is
> exactly task_struct::pi_lock::lockdep_map, which nicely corresponds with
> the below stacktrace, and seems to suggest someone did:
> try_to_wake_up(NULL).

correct.

> >  try_to_wake_up+0x67/0x12b0 kernel/sched/core.c:4216
> >  requeue_pi_wake_futex+0x24b/0x2f0 kernel/futex/requeue.c:249
> 
> Trouble is, we've not changed the requeue bits in a fair while... So I'm
> somewhat confused on how this happens now ?!

This means syzkaller managed to invoke futex_wait_setup(…, NULL) in
order to get futex_q::task assigned to NULL. All users use current
except for io_futex_wait().

The syz-reproducer lists only:
| timer_create(0x0, &(0x7f0000000080)={0x0, 0x11, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000000))
| timer_settime(0x0, 0x0, &(0x7f0000000240)={{0x0, 0x8}, {0x0, 0x9}}, 0x0)
| futex(&(0x7f000000cffc), 0x80000000000b, 0x0, 0x0, &(0x7f0000048000), 0x0)
| futex(&(0x7f000000cffc), 0xc, 0x1, 0x0, &(0x7f0000048000), 0x0)

and that is probably why it can't come up with a C reproducer.
The whole log has (filtered) the following lines:

| io_uring_setup(0x85a, &(0x7f0000000180)={0x0, 0x58b9, 0x1, 0x2, 0x383})
| syz_io_uring_setup(0x88f, &(0x7f0000000300)={0x0, 0xaedf, 0x0, 0x0, 0x25d}, &(0x7f0000000140)=<r0=>0x0, &(0x7f0000000280)=<r1=>0x0)
| syz_memcpy_off$IO_URING_METADATA_GENERIC(r0, 0x4, &(0x7f0000000080)=0xfffffffc, 0x0, 0x4)
| syz_io_uring_submit(r0, r1, &(0x7f00000001c0)=@IORING_OP_RECVMSG={0xa, 0x8, 0x1, r2, 0x0, &(0x7f0000000440)={0x0, 0x0, 0x0}, 0x0, 0x40000020, 0x1, {0x2}})

This should explain how the waiter got NULL. There is no private
flag so that is how they interact with each other.
Do we want this:

diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index c716a66f86929..0c98256ebdcb7 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -312,6 +312,8 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
 	if (!top_waiter->rt_waiter || top_waiter->pi_state)
 		return -EINVAL;
 
+	if (!top_waiter->task)
+		-EINVAL;
 	/* Ensure we requeue to the expected futex. */
 	if (!futex_match(top_waiter->requeue_pi_key, key2))
 		return -EINVAL;

?

Sebastian

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [syzbot] [kernel?] general protection fault in try_to_wake_up (3)
  2025-09-03 13:07   ` Sebastian Andrzej Siewior
@ 2025-09-03 18:51     ` Jens Axboe
  2025-09-04 16:28       ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 12+ messages in thread
From: Jens Axboe @ 2025-09-03 18:51 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior, Peter Zijlstra
  Cc: syzbot, andrealmeid, dave, dvhart, linux-kernel, mingo,
	syzkaller-bugs, tglx

On 9/3/25 7:07 AM, Sebastian Andrzej Siewior wrote:
> +Jens
> 
> On 2025-09-02 23:46:28 [+0200], Peter Zijlstra wrote:
>> When I build the provided .config with clang-20, that a58 offset is
>> exactly task_struct::pi_lock::lockdep_map, which nicely corresponds with
>> the below stacktrace, and seems to suggest someone did:
>> try_to_wake_up(NULL).
> 
> correct.
> 
>>>  try_to_wake_up+0x67/0x12b0 kernel/sched/core.c:4216
>>>  requeue_pi_wake_futex+0x24b/0x2f0 kernel/futex/requeue.c:249
>>
>> Trouble is, we've not changed the requeue bits in a fair while... So I'm
>> somewhat confused on how this happens now ?!
> 
> This means syzkaller managed to invoke futex_wait_setup(…, NULL) in
> order to get futex_q::task assigned to NULL. All users use current
> except for io_futex_wait().
> 
> The syz-reproducer lists only:
> | timer_create(0x0, &(0x7f0000000080)={0x0, 0x11, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000000))
> | timer_settime(0x0, 0x0, &(0x7f0000000240)={{0x0, 0x8}, {0x0, 0x9}}, 0x0)
> | futex(&(0x7f000000cffc), 0x80000000000b, 0x0, 0x0, &(0x7f0000048000), 0x0)
> | futex(&(0x7f000000cffc), 0xc, 0x1, 0x0, &(0x7f0000048000), 0x0)
> 
> and that is probably why it can't come up with C-reproducer.
> The whole log has (filtered) the following lines:
> 
> | io_uring_setup(0x85a, &(0x7f0000000180)={0x0, 0x58b9, 0x1, 0x2, 0x383})
> | syz_io_uring_setup(0x88f, &(0x7f0000000300)={0x0, 0xaedf, 0x0, 0x0, 0x25d}, &(0x7f0000000140)=<r0=>0x0, &(0x7f0000000280)=<r1=>0x0)
> | syz_memcpy_off$IO_URING_METADATA_GENERIC(r0, 0x4, &(0x7f0000000080)=0xfffffffc, 0x0, 0x4)
> | syz_io_uring_submit(r0, r1, &(0x7f00000001c0)=@IORING_OP_RECVMSG={0xa, 0x8, 0x1, r2, 0x0, &(0x7f0000000440)={0x0, 0x0, 0x0}, 0x0, 0x40000020, 0x1, {0x2}})
> 
> This should explain the how the waiter got NULL. There is no private
> flag so that is how they interact with each other.
> Do we want this:
> 
> diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
> index c716a66f86929..0c98256ebdcb7 100644
> --- a/kernel/futex/requeue.c
> +++ b/kernel/futex/requeue.c
> @@ -312,6 +312,8 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
>  	if (!top_waiter->rt_waiter || top_waiter->pi_state)
>  		return -EINVAL;
>  
> +	if (!top_waiter->task)
> +		-EINVAL;
>  	/* Ensure we requeue to the expected futex. */
>  	if (!futex_match(top_waiter->requeue_pi_key, key2))
>  		return -EINVAL;
> 
> ?
> 
> Sebastian

Yep that looks reasonable to me. And agree that this futex must've been
setup on the io_uring side, which is why you end up with ->task == NULL.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [syzbot] [kernel?] general protection fault in try_to_wake_up (3)
  2025-09-03 18:51     ` Jens Axboe
@ 2025-09-04 16:28       ` Sebastian Andrzej Siewior
  2025-09-09 18:56         ` Thomas Gleixner
  2025-09-09 19:27         ` Jens Axboe
  0 siblings, 2 replies; 12+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-09-04 16:28 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Peter Zijlstra, syzbot, andrealmeid, dave, dvhart, linux-kernel,
	mingo, syzkaller-bugs, tglx

On 2025-09-03 12:51:09 [-0600], Jens Axboe wrote:
> > The syz-reproducer lists only:
> > | timer_create(0x0, &(0x7f0000000080)={0x0, 0x11, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000000))
> > | timer_settime(0x0, 0x0, &(0x7f0000000240)={{0x0, 0x8}, {0x0, 0x9}}, 0x0)
> > | futex(&(0x7f000000cffc), 0x80000000000b, 0x0, 0x0, &(0x7f0000048000), 0x0)
> > | futex(&(0x7f000000cffc), 0xc, 0x1, 0x0, &(0x7f0000048000), 0x0)
> > 
> > and that is probably why it can't come up with C-reproducer.
> > The whole log has (filtered) the following lines:
> > 
> > | io_uring_setup(0x85a, &(0x7f0000000180)={0x0, 0x58b9, 0x1, 0x2, 0x383})
> > | syz_io_uring_setup(0x88f, &(0x7f0000000300)={0x0, 0xaedf, 0x0, 0x0, 0x25d}, &(0x7f0000000140)=<r0=>0x0, &(0x7f0000000280)=<r1=>0x0)
> > | syz_memcpy_off$IO_URING_METADATA_GENERIC(r0, 0x4, &(0x7f0000000080)=0xfffffffc, 0x0, 0x4)
> > | syz_io_uring_submit(r0, r1, &(0x7f00000001c0)=@IORING_OP_RECVMSG={0xa, 0x8, 0x1, r2, 0x0, &(0x7f0000000440)={0x0, 0x0, 0x0}, 0x0, 0x40000020, 0x1, {0x2}})
> > 
> > This should explain the how the waiter got NULL. There is no private
> > flag so that is how they interact with each other.
> > Do we want this:
> > 
> > diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
> > index c716a66f86929..0c98256ebdcb7 100644
> > --- a/kernel/futex/requeue.c
> > +++ b/kernel/futex/requeue.c
> > @@ -312,6 +312,8 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
> >  	if (!top_waiter->rt_waiter || top_waiter->pi_state)
> >  		return -EINVAL;

I've been poking at this today and I have one problem with my
explanation:
The io_uring code initializes its futex_q with futex_q_init. At this
point futex_q::rt_waiter is set to NULL and never set to something else.
We should bail out here instead of going further.
Only the PI bits set rt_waiter. Only io_uring sets task to NULL.
I'm hopeless, this makes no sense.

> > +	if (!top_waiter->task)
> > +		-EINVAL;
> >  	/* Ensure we requeue to the expected futex. */
> >  	if (!futex_match(top_waiter->requeue_pi_key, key2))
> >  		return -EINVAL;
> > 

Sebastian

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [syzbot] [kernel?] general protection fault in try_to_wake_up (3)
  2025-09-04 16:28       ` Sebastian Andrzej Siewior
@ 2025-09-09 18:56         ` Thomas Gleixner
  2025-09-09 19:27         ` Jens Axboe
  1 sibling, 0 replies; 12+ messages in thread
From: Thomas Gleixner @ 2025-09-09 18:56 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior, Jens Axboe
  Cc: Peter Zijlstra, syzbot, andrealmeid, dave, dvhart, linux-kernel,
	mingo, syzkaller-bugs

On Thu, Sep 04 2025 at 18:28, Sebastian Andrzej Siewior wrote:
> On 2025-09-03 12:51:09 [-0600], Jens Axboe wrote:
>> > The syz-reproducer lists only:
>> > | timer_create(0x0, &(0x7f0000000080)={0x0, 0x11, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000000))
>> > | timer_settime(0x0, 0x0, &(0x7f0000000240)={{0x0, 0x8}, {0x0, 0x9}}, 0x0)
>> > | futex(&(0x7f000000cffc), 0x80000000000b, 0x0, 0x0, &(0x7f0000048000), 0x0)
>> > | futex(&(0x7f000000cffc), 0xc, 0x1, 0x0, &(0x7f0000048000), 0x0)
>> > 
>> > and that is probably why it can't come up with C-reproducer.
>> > The whole log has (filtered) the following lines:
>> > 
>> > | io_uring_setup(0x85a, &(0x7f0000000180)={0x0, 0x58b9, 0x1, 0x2, 0x383})
>> > | syz_io_uring_setup(0x88f, &(0x7f0000000300)={0x0, 0xaedf, 0x0, 0x0, 0x25d}, &(0x7f0000000140)=<r0=>0x0, &(0x7f0000000280)=<r1=>0x0)
>> > | syz_memcpy_off$IO_URING_METADATA_GENERIC(r0, 0x4, &(0x7f0000000080)=0xfffffffc, 0x0, 0x4)
>> > | syz_io_uring_submit(r0, r1, &(0x7f00000001c0)=@IORING_OP_RECVMSG={0xa, 0x8, 0x1, r2, 0x0, &(0x7f0000000440)={0x0, 0x0, 0x0}, 0x0, 0x40000020, 0x1, {0x2}})
>> > 
>> > This should explain the how the waiter got NULL. There is no private
>> > flag so that is how they interact with each other.

I'm not really seeing how they overlap though and it actually reproduces
occasionally without any of the other syz programs which are showing up
in that bisect log.

The problem is that it's hard to reproduce here. I've only seen it three
times within several hours.

So I thought I'd try and run qemu without -enable-kvm to change the timing,
but that does not even boot at all. It reliably dies at random places
during boot, but always with an 'Oops: int3:':

[   64.184144][    C1] Oops: int3: 0000 [#1] SMP KASAN NOPTI
[   64.185081][    C1] CPU: 1 UID: 0 PID: 994 Comm: kworker/u10:3 Not tainted syzkaller #0 PREEMPT_{RT,(full)} 
[   64.185369][    C1] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
[   64.185681][    C1] Workqueue: events_unbound call_usermodehelper_exec_work
[   64.187063][    C1] RIP: 0010:kmem_cache_alloc_node_noprof+0x90/0x330
[   64.187445][    C1] Code: ff 2e 2e 2e 31 c0 4c 89 f7 44 89 ee e8 39 3c 0b 00 45 31 ed 4d 85 f6 0f 84 02 01 00 00 85 c0 0f 85 fa 00 00 00 89 5c 24 04 0f <1f> 44 00 00 48 c7 44 24 10 00 00 00 00 65 48 8b 05 73 fd e2 0f 49
[   64.187574][    C1] RSP: 0018:ffffc900056ff538 EFLAGS: 00000246
[   64.187743][    C1] RAX: 0000000000000000 RBX: 00000000ffffffff RCX: cc07f7dd94535100
[   64.187866][    C1] RDX: ffff888046a63900 RSI: 0000000000000cc0 RDI: ffff888040414500
[   64.187968][    C1] RBP: 0000000000000cc0 R08: 0000000000000000 R09: ffffffff82107f5d
[   64.188066][    C1] R10: dffffc0000000000 R11: ffffed1008d4c721 R12: 0000000000000000
[   64.188165][    C1] R13: 0000000000000000 R14: ffff888040414500 R15: ffffffff8182bd72
[   64.188303][    C1] FS:  0000000000000000(0000) GS:ffff8880ecec2000(0000) knlGS:0000000000000000
[   64.188414][    C1] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   64.188501][    C1] CR2: 0000000000000000 CR3: 000000000d7a6000 CR4: 00000000000006f0
[   64.188711][    C1] Call Trace:
[   64.188929][    C1]  <TASK>
[   64.189286][    C1]  dup_task_struct+0x52/0x860
[   64.189569][    C1]  copy_process+0x545/0x3ae0
[   64.190275][    C1]  kernel_clone+0x224/0x7c0
[   64.190691][    C1]  user_mode_thread+0xdd/0x140
[   64.191352][    C1]  call_usermodehelper_exec_work+0x5c/0x230
[   64.191873][    C1]  worker_thread+0x8a0/0xda0

[   36.676800][    C1] Oops: int3: 0000 [#1] SMP KASAN NOPTI
[   36.677774][    C1] CPU: 1 UID: 0 PID: 1 Comm: swapper/0 Not tainted syzkaller #0 PREEMPT_{RT,(full)} 
[   36.678176][    C1] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
[   36.678477][    C1] RIP: 0010:kmem_cache_alloc_noprof+0x83/0x310
[   36.679809][    C1] Code: 00 e8 81 75 7e ff 2e 2e 2e 31 c0 4c 89 f7 89 ee e8 92 47 0b 00 45 31 ed 4d 85 f6 0f 84 fe 00 00 00 85 c0 0f 85 f6 00 00 00 0f <1f> 44 00 00 48 c7 44 24 08 00 00 00 00 65 48 8b 05 d0 08 e3 0f 49
[   36.680000][    C1] RSP: 0018:ffffc9000012f190 EFLAGS: 00000246
[   36.680185][    C1] RAX: 0000000000000000 RBX: 0000000000000dc0 RCX: 3fbc2ecf0c9c3500
[   36.680309][    C1] RDX: ffff88801b698000 RSI: 0000000000000dc0 RDI: ffff888040ad2000
[   36.680411][    C1] RBP: 0000000000000dc0 R08: 0000000000000000 R09: ffffffff82107f5d
[   36.680511][    C1] R10: dffffc0000000000 R11: ffffed1008d3f94c R12: 1ffff92000025e48
[   36.680611][    C1] R13: 0000000000000000 R14: ffff888040ad2000 R15: ffffffff8252e407
[   36.680749][    C1] FS:  0000000000000000(0000) GS:ffff8880ecec2000(0000) knlGS:0000000000000000
[   36.680862][    C1] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   36.680948][    C1] CR2: 0000000000000000 CR3: 000000000d7a6000 CR4: 00000000000006f0
[   36.681175][    C1] Call Trace:
[   36.681398][    C1]  <TASK>
[   36.681784][    C1]  __kernfs_new_node+0xd7/0x690
[   36.681877][    C1]  kernfs_new_node+0x102/0x210
[   36.681877][    C1]  kernfs_create_dir_ns+0x44/0x130
[   36.681877][    C1]  sysfs_create_dir_ns+0x123/0x280
[   36.681877][    C1]  ? __pfx_rt_mutex_slowunlock+0x10/0x10
[   36.681877][    C1]  ? __pfx_sysfs_create_dir_ns+0x10/0x10
[   36.681877][    C1]  ? rt_spin_unlock+0x65/0x80
[   36.681877][    C1]  kobject_add_internal+0x5a5/0xb50
[   36.681877][    C1]  kobject_add+0x155/0x220

Both are decoded to:

arch_static_branch at arch/x86/include/asm/jump_label.h:36
(inlined by) kfence_alloc at include/linux/kfence.h:121
(inlined by) slab_alloc_node at mm/slub.c:4213

which is kfence_allocation_key. Decoding the code shows:

  21:	85 c0                	test   %eax,%eax
  23:	0f 85 f6 00 00 00    	jne    0x11f
  29:*	0f 1f 44 00 00       	nopl   0x0(%rax,%rax,1)		<-- trapping instruction
  2e:	48 c7 44 24 08 00 00 	movq   $0x0,0x8(%rsp)

which is clearly an intact NOP sequence. So with qemu plain the static
branch patching seems to be unhappy....

Oh well....




^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [syzbot] [kernel?] general protection fault in try_to_wake_up (3)
  2025-09-04 16:28       ` Sebastian Andrzej Siewior
  2025-09-09 18:56         ` Thomas Gleixner
@ 2025-09-09 19:27         ` Jens Axboe
  2025-09-09 20:43           ` Thomas Gleixner
  1 sibling, 1 reply; 12+ messages in thread
From: Jens Axboe @ 2025-09-09 19:27 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Peter Zijlstra, syzbot, andrealmeid, dave, dvhart, linux-kernel,
	mingo, syzkaller-bugs, tglx

On 9/4/25 10:28 AM, Sebastian Andrzej Siewior wrote:
> On 2025-09-03 12:51:09 [-0600], Jens Axboe wrote:
>>> The syz-reproducer lists only:
>>> | timer_create(0x0, &(0x7f0000000080)={0x0, 0x11, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000000))
>>> | timer_settime(0x0, 0x0, &(0x7f0000000240)={{0x0, 0x8}, {0x0, 0x9}}, 0x0)
>>> | futex(&(0x7f000000cffc), 0x80000000000b, 0x0, 0x0, &(0x7f0000048000), 0x0)
>>> | futex(&(0x7f000000cffc), 0xc, 0x1, 0x0, &(0x7f0000048000), 0x0)
>>>
>>> and that is probably why it can't come up with C-reproducer.
>>> The whole log has (filtered) the following lines:
>>>
>>> | io_uring_setup(0x85a, &(0x7f0000000180)={0x0, 0x58b9, 0x1, 0x2, 0x383})
>>> | syz_io_uring_setup(0x88f, &(0x7f0000000300)={0x0, 0xaedf, 0x0, 0x0, 0x25d}, &(0x7f0000000140)=<r0=>0x0, &(0x7f0000000280)=<r1=>0x0)
>>> | syz_memcpy_off$IO_URING_METADATA_GENERIC(r0, 0x4, &(0x7f0000000080)=0xfffffffc, 0x0, 0x4)
>>> | syz_io_uring_submit(r0, r1, &(0x7f00000001c0)=@IORING_OP_RECVMSG={0xa, 0x8, 0x1, r2, 0x0, &(0x7f0000000440)={0x0, 0x0, 0x0}, 0x0, 0x40000020, 0x1, {0x2}})
>>>
>>> This should explain the how the waiter got NULL. There is no private
>>> flag so that is how they interact with each other.
>>> Do we want this:
>>>
>>> diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
>>> index c716a66f86929..0c98256ebdcb7 100644
>>> --- a/kernel/futex/requeue.c
>>> +++ b/kernel/futex/requeue.c
>>> @@ -312,6 +312,8 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
>>>  	if (!top_waiter->rt_waiter || top_waiter->pi_state)
>>>  		return -EINVAL;
> 
> I've been poking at this today and I have one problem with my
> explanation:
> The io_uring code initializes its futex_q with futex_q_init. At this
> point futex_q::rt_waiter is set to NULL and never set to something else.
> We should bail out here instead of going further.
> Only the PI bits set rt_waiter. Only io_uring sets task to NULL.
> I'm hopeless, this makes no sense.

Was on the road, and now back at least for a day or two... So I took a
gander at this one too. One thing that puzzles me is the io_uring traces
in that syzbot log - if they are to be trusted, it only ever submits an
io_uring RECVMSG request? IOW, no futex usage on the io_uring side at
all?

Going to try and download the disk image and kernel and see if I can
actually run this locally.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [syzbot] [kernel?] general protection fault in try_to_wake_up (3)
  2025-09-09 19:27         ` Jens Axboe
@ 2025-09-09 20:43           ` Thomas Gleixner
  2025-09-09 20:46             ` Jens Axboe
  2025-09-10 10:42             ` [PATCH] futex: Prevent use-after-free during requeue-PI Sebastian Andrzej Siewior
  0 siblings, 2 replies; 12+ messages in thread
From: Thomas Gleixner @ 2025-09-09 20:43 UTC (permalink / raw)
  To: Jens Axboe, Sebastian Andrzej Siewior
  Cc: Peter Zijlstra, syzbot, andrealmeid, dave, dvhart, linux-kernel,
	mingo, syzkaller-bugs

On Tue, Sep 09 2025 at 13:27, Jens Axboe wrote:
> On 9/4/25 10:28 AM, Sebastian Andrzej Siewior wrote:
> Was on the road, and now back at least for a day or two... So I took a
> gander at this one too. One thing that puzzles me is the io_uring traces
> in that syzbot log - if they are to be trusted, it only ever submits an
> io_uring RECVMSG request? IOW, no futex usage on the io_uring side at
> all?
>
> Going to try and download the disk image and kernel and see if I can
> actually run this locally.

It has nothing to do with IO/URING. It reproduces cleanly with the
provided repro.syz. It just takes ages. The bisect log, which contains
the IO/URING muck is irrelevant.

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [syzbot] [kernel?] general protection fault in try_to_wake_up (3)
  2025-09-09 20:43           ` Thomas Gleixner
@ 2025-09-09 20:46             ` Jens Axboe
  2025-09-10 10:42             ` [PATCH] futex: Prevent use-after-free during requeue-PI Sebastian Andrzej Siewior
  1 sibling, 0 replies; 12+ messages in thread
From: Jens Axboe @ 2025-09-09 20:46 UTC (permalink / raw)
  To: Thomas Gleixner, Sebastian Andrzej Siewior
  Cc: Peter Zijlstra, syzbot, andrealmeid, dave, dvhart, linux-kernel,
	mingo, syzkaller-bugs

On 9/9/25 2:43 PM, Thomas Gleixner wrote:
> On Tue, Sep 09 2025 at 13:27, Jens Axboe wrote:
>> On 9/4/25 10:28 AM, Sebastian Andrzej Siewior wrote:
>> Was on the road, and now back at least for a day or two... So I took a
>> gander at this one too. One thing that puzzles me is the io_uring traces
>> in that syzbot log - if they are to be trusted, it only ever submits an
>> io_uring RECVMSG request? IOW, no futex usage on the io_uring side at
>> all?
>>
>> Going to try and download the disk image and kernel and see if I can
>> actually run this locally.
> 
> It has nothing to do with IO/URING. It reproduces cleanly with the
> provided repro.syz. It just takes ages. The bisect log, which contains
> the IO/URING muck is irrelevant.

OK thanks for confirming!

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] futex: Prevent use-after-free during requeue-PI
  2025-09-09 20:43           ` Thomas Gleixner
  2025-09-09 20:46             ` Jens Axboe
@ 2025-09-10 10:42             ` Sebastian Andrzej Siewior
  2025-09-10 10:48               ` Sebastian Andrzej Siewior
  2025-09-20 15:43               ` [tip: locking/urgent] " tip-bot2 for Sebastian Andrzej Siewior
  1 sibling, 2 replies; 12+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-09-10 10:42 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Jens Axboe, Peter Zijlstra, syzbot, andrealmeid, dave, dvhart,
	linux-kernel, mingo, syzkaller-bugs

syzbot managed to trigger the following race:

   T1                               T2

 futex_wait_requeue_pi()
   futex_do_wait()
     schedule()
                               futex_requeue()
                                 futex_proxy_trylock_atomic()
                                   futex_requeue_pi_prepare()
                                   requeue_pi_wake_futex()
                                     futex_requeue_pi_complete()
                                      /* preempt */

         * timeout/ signal wakes T1 *

   futex_requeue_pi_wakeup_sync() // Q_REQUEUE_PI_LOCKED
   futex_hash_put()
  // back to userland, on stack futex_q is garbage

                                      /* back */
                                     wake_up_state(q->task, TASK_NORMAL);

In this scenario futex_wait_requeue_pi() is able to leave without using
futex_q::lock_ptr for synchronization.
This can be prevented by reading futex_q::task before updating the
futex_q::requeue_state. A reference on the task_struct is not needed
because requeue_pi_wake_futex() is invoked with a spinlock_t held which
implies an RCU read section. Even if T1 terminates immediately after, the
task_struct will remain valid during T2's wake_up_state().
A READ_ONCE on futex_q::task before futex_requeue_pi_complete() is
enough because it ensures that the variable is read before the state is
updated.

Read futex_q::task before updating the requeue state, use it for the
following wakeup.

Fixes: 07d91ef510fb1 ("futex: Prevent requeue_pi() lock nesting issue on RT")
Reported-by: syzbot+034246a838a10d181e78@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/68b75989.050a0220.3db4df.01dd.GAE@google.com/
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 kernel/futex/requeue.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index c716a66f86929..d818b4d47f1ba 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -230,8 +230,9 @@ static inline
 void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
 			   struct futex_hash_bucket *hb)
 {
-	q->key = *key;
+	struct task_struct *task;
 
+	q->key = *key;
 	__futex_unqueue(q);
 
 	WARN_ON(!q->rt_waiter);
@@ -243,10 +244,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
 	futex_hash_get(hb);
 	q->drop_hb_ref = true;
 	q->lock_ptr = &hb->lock;
+	task = READ_ONCE(q->task);
 
 	/* Signal locked state to the waiter */
 	futex_requeue_pi_complete(q, 1);
-	wake_up_state(q->task, TASK_NORMAL);
+	wake_up_state(task, TASK_NORMAL);
 }
 
 /**
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH] futex: Prevent use-after-free during requeue-PI
  2025-09-10 10:42             ` [PATCH] futex: Prevent use-after-free during requeue-PI Sebastian Andrzej Siewior
@ 2025-09-10 10:48               ` Sebastian Andrzej Siewior
  2025-09-20 15:43               ` [tip: locking/urgent] " tip-bot2 for Sebastian Andrzej Siewior
  1 sibling, 0 replies; 12+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-09-10 10:48 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Jens Axboe, Peter Zijlstra, syzbot, andrealmeid, dave, dvhart,
	linux-kernel, mingo, syzkaller-bugs

On 2025-09-10 12:42:45 [+0200], To Thomas Gleixner wrote:
> --- a/kernel/futex/requeue.c
> +++ b/kernel/futex/requeue.c
> @@ -243,10 +244,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
>  	futex_hash_get(hb);
>  	q->drop_hb_ref = true;
>  	q->lock_ptr = &hb->lock;
> +	task = READ_ONCE(q->task);
>  
>  	/* Signal locked state to the waiter */
>  	futex_requeue_pi_complete(q, 1);

Once understood, adding an mdelay(500) here greatly improves the chances
of triggering the race.
futex_requeue_pi_complete() uses atomic_try_cmpxchg() which has full
ordering. This means that the q->drop_hb_ref assignment earlier is
visible to the other thread after that cmpxchg, correct?

> -	wake_up_state(q->task, TASK_NORMAL);
> +	wake_up_state(task, TASK_NORMAL);
>  }

Sebastian

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [tip: locking/urgent] futex: Prevent use-after-free during requeue-PI
  2025-09-10 10:42             ` [PATCH] futex: Prevent use-after-free during requeue-PI Sebastian Andrzej Siewior
  2025-09-10 10:48               ` Sebastian Andrzej Siewior
@ 2025-09-20 15:43               ` tip-bot2 for Sebastian Andrzej Siewior
  1 sibling, 0 replies; 12+ messages in thread
From: tip-bot2 for Sebastian Andrzej Siewior @ 2025-09-20 15:43 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: syzbot+034246a838a10d181e78, Sebastian Andrzej Siewior,
	Thomas Gleixner, x86, linux-kernel

The following commit has been merged into the locking/urgent branch of tip:

Commit-ID:     b549113738e8c751b613118032a724b772aa83f2
Gitweb:        https://git.kernel.org/tip/b549113738e8c751b613118032a724b772aa83f2
Author:        Sebastian Andrzej Siewior <bigeasy@linutronix.de>
AuthorDate:    Wed, 10 Sep 2025 12:42:43 +02:00
Committer:     Thomas Gleixner <tglx@linutronix.de>
CommitterDate: Sat, 20 Sep 2025 17:40:42 +02:00

futex: Prevent use-after-free during requeue-PI

syzbot managed to trigger the following race:

   T1                               T2

 futex_wait_requeue_pi()
   futex_do_wait()
     schedule()
                               futex_requeue()
                                 futex_proxy_trylock_atomic()
                                   futex_requeue_pi_prepare()
                                   requeue_pi_wake_futex()
                                     futex_requeue_pi_complete()
                                      /* preempt */

         * timeout/ signal wakes T1 *

   futex_requeue_pi_wakeup_sync() // Q_REQUEUE_PI_LOCKED
   futex_hash_put()
  // back to userland, on stack futex_q is garbage

                                      /* back */
                                     wake_up_state(q->task, TASK_NORMAL);

In this scenario futex_wait_requeue_pi() is able to leave without using
futex_q::lock_ptr for synchronization.

This can be prevented by reading futex_q::task before updating the
futex_q::requeue_state. A reference on the task_struct is not needed
because requeue_pi_wake_futex() is invoked with a spinlock_t held which
implies an RCU read section.

Even if T1 terminates immediately after, the task_struct will remain valid
during T2's wake_up_state().  A READ_ONCE on futex_q::task before
futex_requeue_pi_complete() is enough because it ensures that the variable
is read before the state is updated.

Read futex_q::task before updating the requeue state, use it for the
following wakeup.

Fixes: 07d91ef510fb1 ("futex: Prevent requeue_pi() lock nesting issue on RT")
Reported-by: syzbot+034246a838a10d181e78@syzkaller.appspotmail.com
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Closes: https://lore.kernel.org/all/68b75989.050a0220.3db4df.01dd.GAE@google.com/
---
 kernel/futex/requeue.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index c716a66..d818b4d 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -230,8 +230,9 @@ static inline
 void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
 			   struct futex_hash_bucket *hb)
 {
-	q->key = *key;
+	struct task_struct *task;
 
+	q->key = *key;
 	__futex_unqueue(q);
 
 	WARN_ON(!q->rt_waiter);
@@ -243,10 +244,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
 	futex_hash_get(hb);
 	q->drop_hb_ref = true;
 	q->lock_ptr = &hb->lock;
+	task = READ_ONCE(q->task);
 
 	/* Signal locked state to the waiter */
 	futex_requeue_pi_complete(q, 1);
-	wake_up_state(q->task, TASK_NORMAL);
+	wake_up_state(task, TASK_NORMAL);
 }
 
 /**

^ permalink raw reply related	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2025-09-20 15:43 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-09-02 20:54 [syzbot] [kernel?] general protection fault in try_to_wake_up (3) syzbot
2025-09-02 21:46 ` Peter Zijlstra
2025-09-03 13:07   ` Sebastian Andrzej Siewior
2025-09-03 18:51     ` Jens Axboe
2025-09-04 16:28       ` Sebastian Andrzej Siewior
2025-09-09 18:56         ` Thomas Gleixner
2025-09-09 19:27         ` Jens Axboe
2025-09-09 20:43           ` Thomas Gleixner
2025-09-09 20:46             ` Jens Axboe
2025-09-10 10:42             ` [PATCH] futex: Prevent use-after-free during requeue-PI Sebastian Andrzej Siewior
2025-09-10 10:48               ` Sebastian Andrzej Siewior
2025-09-20 15:43               ` [tip: locking/urgent] " tip-bot2 for Sebastian Andrzej Siewior

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.