* [patch 01/11] posix-timers: Initialise timer before adding it to the hash table
2025-02-24 10:15 [patch 00/11] posix-timers: Rework the global hash table and provide a sane mechanism for CRIU Thomas Gleixner
@ 2025-02-24 10:15 ` Thomas Gleixner
2025-02-24 10:15 ` [patch 02/11] posix-timers: Add cond_resched() to posix_timer_add() search loop Thomas Gleixner
` (9 subsequent siblings)
10 siblings, 0 replies; 19+ messages in thread
From: Thomas Gleixner @ 2025-02-24 10:15 UTC (permalink / raw)
To: LKML
Cc: Anna-Maria Behnsen, Frederic Weisbecker, Benjamin Segall,
Eric Dumazet, Andrey Vagin, Pavel Tikhomirov, Peter Zijlstra
From: Eric Dumazet <edumazet@google.com>
A timer is only valid in the hashtable when both timer::it_signal and
timer::it_id are set to their final values, but timers are added without
those values being set.
The timer ID is allocated when the timer is added to the hash in invalid
state. The ID is taken from a monotonically increasing per process counter
which wraps around after reaching INT_MAX. The hash insertion validates
that there is no timer with the allocated ID in the hash table which
belongs to the same process. That opens a mostly theoretical race condition:
If other threads of the same process manage to create/delete timers in
rapid succession before the newly created timer is fully initialized and
wrap around to the timer ID which was handed out, then a duplicate timer ID
will be inserted into the hash table.
Prevent this by:
1) Setting timer::it_id before inserting the timer into the hashtable.
2) Storing the signal pointer in timer::it_signal with bit 0 set before
inserting it into the hashtable.
Bit 0 acts as an invalid bit, which means that the regular lookup for
sys_timer_*() will fail the comparison with the signal pointer.
But the lookup on insertion masks out bit 0 and can therefore detect a
timer which is not yet valid, but allocated in the hash table. Bit 0
in the pointer is cleared once the initialization of the timer
completed.
[ tglx: Fold ID and signal initialization into one patch and massage change
log and comments. ]
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20250219125522.2535263-3-edumazet@google.com
---
kernel/time/posix-timers.c | 56 +++++++++++++++++++++++++++++++++------------
1 file changed, 42 insertions(+), 14 deletions(-)
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -72,13 +72,13 @@ static int hash(struct signal_struct *si
return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable));
}
-static struct k_itimer *__posix_timers_find(struct hlist_head *head,
- struct signal_struct *sig,
- timer_t id)
+static struct k_itimer *posix_timer_by_id(timer_t id)
{
+ struct signal_struct *sig = current->signal;
+ struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)];
struct k_itimer *timer;
- hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&hash_lock)) {
+ hlist_for_each_entry_rcu(timer, head, t_hash) {
/* timer->it_signal can be set concurrently */
if ((READ_ONCE(timer->it_signal) == sig) && (timer->it_id == id))
return timer;
@@ -86,12 +86,26 @@ static struct k_itimer *__posix_timers_f
return NULL;
}
-static struct k_itimer *posix_timer_by_id(timer_t id)
+static inline struct signal_struct *posix_sig_owner(const struct k_itimer *timer)
{
- struct signal_struct *sig = current->signal;
- struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)];
+ unsigned long val = (unsigned long)timer->it_signal;
+
+ /*
+ * Mask out bit 0, which acts as invalid marker to prevent
+ * posix_timer_by_id() detecting it as valid.
+ */
+ return (struct signal_struct *)(val & ~1UL);
+}
+
+static bool posix_timer_hashed(struct hlist_head *head, struct signal_struct *sig, timer_t id)
+{
+ struct k_itimer *timer;
- return __posix_timers_find(head, sig, id);
+ hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&hash_lock)) {
+ if ((posix_sig_owner(timer) == sig) && (timer->it_id == id))
+ return true;
+ }
+ return false;
}
static int posix_timer_add(struct k_itimer *timer)
@@ -112,7 +126,19 @@ static int posix_timer_add(struct k_itim
sig->next_posix_timer_id = (id + 1) & INT_MAX;
head = &posix_timers_hashtable[hash(sig, id)];
- if (!__posix_timers_find(head, sig, id)) {
+ if (!posix_timer_hashed(head, sig, id)) {
+ /*
+ * Set the timer ID and the signal pointer to make
+ * it identifiable in the hash table. The signal
+ * pointer has bit 0 set to indicate that it is not
+ * yet fully initialized. posix_timer_hashed()
+ * masks this bit out, but the syscall lookup fails
+ * to match due to it being set. This guarantees
+ * that there can't be duplicate timer IDs handed
+ * out.
+ */
+ timer->it_id = (timer_t)id;
+ timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL);
hlist_add_head_rcu(&timer->t_hash, head);
spin_unlock(&hash_lock);
return id;
@@ -406,8 +432,7 @@ static int do_timer_create(clockid_t whi
/*
* Add the timer to the hash table. The timer is not yet valid
- * because new_timer::it_signal is still NULL. The timer id is also
- * not yet visible to user space.
+ * after insertion, but has a unique ID allocated.
*/
new_timer_id = posix_timer_add(new_timer);
if (new_timer_id < 0) {
@@ -415,7 +440,6 @@ static int do_timer_create(clockid_t whi
return new_timer_id;
}
- new_timer->it_id = (timer_t) new_timer_id;
new_timer->it_clock = which_clock;
new_timer->kclock = kc;
new_timer->it_overrun = -1LL;
@@ -453,7 +477,7 @@ static int do_timer_create(clockid_t whi
}
/*
* After succesful copy out, the timer ID is visible to user space
- * now but not yet valid because new_timer::signal is still NULL.
+ * now but not yet valid because new_timer::signal low order bit is 1.
*
* Complete the initialization with the clock specific create
* callback.
@@ -463,7 +487,11 @@ static int do_timer_create(clockid_t whi
goto out;
spin_lock_irq(¤t->sighand->siglock);
- /* This makes the timer valid in the hash table */
+ /*
+ * new_timer::it_signal contains the signal pointer with bit 0 set,
+ * which makes it invalid for syscall operations. Store the
+ * unmodified signal pointer to make it valid.
+ */
WRITE_ONCE(new_timer->it_signal, current->signal);
hlist_add_head(&new_timer->list, ¤t->signal->posix_timers);
spin_unlock_irq(¤t->sighand->siglock);
^ permalink raw reply [flat|nested] 19+ messages in thread* [patch 02/11] posix-timers: Add cond_resched() to posix_timer_add() search loop
2025-02-24 10:15 [patch 00/11] posix-timers: Rework the global hash table and provide a sane mechanism for CRIU Thomas Gleixner
2025-02-24 10:15 ` [patch 01/11] posix-timers: Initialise timer before adding it to the hash table Thomas Gleixner
@ 2025-02-24 10:15 ` Thomas Gleixner
2025-02-24 10:15 ` [patch 03/11] posix-timers: Cleanup includes Thomas Gleixner
` (8 subsequent siblings)
10 siblings, 0 replies; 19+ messages in thread
From: Thomas Gleixner @ 2025-02-24 10:15 UTC (permalink / raw)
To: LKML
Cc: Anna-Maria Behnsen, Frederic Weisbecker, Benjamin Segall,
Eric Dumazet, Andrey Vagin, Pavel Tikhomirov, Peter Zijlstra
From: Eric Dumazet <edumazet@google.com>
With a large number of POSIX timers the search for a valid ID might cause a
soft lockup on PREEMPT_NONE/VOLUNTARY kernels.
Add cond_resched() to the loop to prevent that.
[ tglx: Split out from Eric's series ]
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20250214135911.2037402-2-edumazet@google.com
---
kernel/time/posix-timers.c | 1 +
1 file changed, 1 insertion(+)
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -144,6 +144,7 @@ static int posix_timer_add(struct k_itim
return id;
}
spin_unlock(&hash_lock);
+ cond_resched();
}
/* POSIX return code when no timer ID could be allocated */
return -EAGAIN;
^ permalink raw reply [flat|nested] 19+ messages in thread* [patch 03/11] posix-timers: Cleanup includes
2025-02-24 10:15 [patch 00/11] posix-timers: Rework the global hash table and provide a sane mechanism for CRIU Thomas Gleixner
2025-02-24 10:15 ` [patch 01/11] posix-timers: Initialise timer before adding it to the hash table Thomas Gleixner
2025-02-24 10:15 ` [patch 02/11] posix-timers: Add cond_resched() to posix_timer_add() search loop Thomas Gleixner
@ 2025-02-24 10:15 ` Thomas Gleixner
2025-02-24 10:15 ` [patch 04/11] posix-timers: Remove pointless unlock_timer() wrapper Thomas Gleixner
` (7 subsequent siblings)
10 siblings, 0 replies; 19+ messages in thread
From: Thomas Gleixner @ 2025-02-24 10:15 UTC (permalink / raw)
To: LKML
Cc: Anna-Maria Behnsen, Frederic Weisbecker, Benjamin Segall,
Eric Dumazet, Andrey Vagin, Pavel Tikhomirov, Peter Zijlstra
Remove pointless includes and sort the remaining ones alphabetically.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/time/posix-timers.c | 26 ++++++++++----------------
1 file changed, 10 insertions(+), 16 deletions(-)
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -9,28 +9,22 @@
*
* These are all the functions necessary to implement POSIX clocks & timers
*/
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/mutex.h>
-#include <linux/sched/task.h>
-
-#include <linux/uaccess.h>
-#include <linux/list.h>
-#include <linux/init.h>
+#include <linux/compat.h>
#include <linux/compiler.h>
#include <linux/hash.h>
+#include <linux/hashtable.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/nospec.h>
#include <linux/posix-clock.h>
#include <linux/posix-timers.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
#include <linux/syscalls.h>
-#include <linux/wait.h>
-#include <linux/workqueue.h>
-#include <linux/export.h>
-#include <linux/hashtable.h>
-#include <linux/compat.h>
-#include <linux/nospec.h>
+#include <linux/time.h>
#include <linux/time_namespace.h>
+#include <linux/uaccess.h>
#include "timekeeping.h"
#include "posix-timers.h"
^ permalink raw reply [flat|nested] 19+ messages in thread* [patch 04/11] posix-timers: Remove pointless unlock_timer() wrapper
2025-02-24 10:15 [patch 00/11] posix-timers: Rework the global hash table and provide a sane mechanism for CRIU Thomas Gleixner
` (2 preceding siblings ...)
2025-02-24 10:15 ` [patch 03/11] posix-timers: Cleanup includes Thomas Gleixner
@ 2025-02-24 10:15 ` Thomas Gleixner
2025-02-24 16:21 ` Peter Zijlstra
2025-02-24 10:15 ` [patch 05/11] posix-timers: Rework timer removal Thomas Gleixner
` (6 subsequent siblings)
10 siblings, 1 reply; 19+ messages in thread
From: Thomas Gleixner @ 2025-02-24 10:15 UTC (permalink / raw)
To: LKML
Cc: Anna-Maria Behnsen, Frederic Weisbecker, Benjamin Segall,
Eric Dumazet, Andrey Vagin, Pavel Tikhomirov, Peter Zijlstra
It's just a wrapper around spin_unlock_irqrestore() with zero value.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/time/posix-timers.c | 15 +++++----------
1 file changed, 5 insertions(+), 10 deletions(-)
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -144,11 +144,6 @@ static int posix_timer_add(struct k_itim
return -EAGAIN;
}
-static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
-{
- spin_unlock_irqrestore(&timr->it_lock, flags);
-}
-
static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_real_ts64(tp);
@@ -691,7 +686,7 @@ static int do_timer_gettime(timer_t time
else
kc->timer_get(timr, setting);
- unlock_timer(timr, flags);
+ spin_unlock_irqrestore(&timr->it_lock, flags);
return ret;
}
@@ -755,7 +750,7 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_
return -EINVAL;
overrun = timer_overrun_to_int(timr);
- unlock_timer(timr, flags);
+ spin_unlock_irqrestore(&timr->it_lock, flags);
return overrun;
}
@@ -822,7 +817,7 @@ static struct k_itimer *timer_wait_runni
/* Prevent kfree(timer) after dropping the lock */
rcu_read_lock();
- unlock_timer(timer, *flags);
+ spin_unlock_irqrestore(&timer->it_lock, *flags);
/*
* kc->timer_wait_running() might drop RCU lock. So @timer
@@ -928,7 +923,7 @@ static int do_timer_settime(timer_t time
timr = timer_wait_running(timr, &flags);
goto retry;
}
- unlock_timer(timr, flags);
+ spin_unlock_irqrestore(&timr->it_lock, flags);
return error;
}
@@ -1046,7 +1041,7 @@ SYSCALL_DEFINE1(timer_delete, timer_t, t
WRITE_ONCE(timer->it_signal, NULL);
spin_unlock(¤t->sighand->siglock);
- unlock_timer(timer, flags);
+ spin_unlock_irqrestore(&timer->it_lock, flags);
posix_timer_unhash_and_free(timer);
return 0;
}
^ permalink raw reply [flat|nested] 19+ messages in thread* Re: [patch 04/11] posix-timers: Remove pointless unlock_timer() wrapper
2025-02-24 10:15 ` [patch 04/11] posix-timers: Remove pointless unlock_timer() wrapper Thomas Gleixner
@ 2025-02-24 16:21 ` Peter Zijlstra
2025-02-24 18:43 ` Thomas Gleixner
0 siblings, 1 reply; 19+ messages in thread
From: Peter Zijlstra @ 2025-02-24 16:21 UTC (permalink / raw)
To: Thomas Gleixner
Cc: LKML, Anna-Maria Behnsen, Frederic Weisbecker, Benjamin Segall,
Eric Dumazet, Andrey Vagin, Pavel Tikhomirov
On Mon, Feb 24, 2025 at 11:15:28AM +0100, Thomas Gleixner wrote:
> It's just a wrapper around spin_unlock_irqrestore() with zero value.
Well, I disagree... the value is that it matches lock_timer(). Both in
naming and in argument types.
Anyway, I started tinkering with the code, it's more hacky than I'd like
it to be, but what do you think?
---
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -74,6 +74,29 @@ static struct k_itimer *__lock_timer(tim
__timr; \
})
+static inline void unlock_timer(struct k_itimer *timer, unsigned long flags)
+{
+ spin_unlock_irqrestore(&timer->it_lock, flags);
+}
+
+__DEFINE_CLASS_IS_CONDITIONAL(lock_timer, true);
+__DEFINE_UNLOCK_GUARD(lock_timer, struct k_itimer,
+ unlock_timer(_T->lock, _T->flags),
+ unsigned long flags);
+static inline class_lock_timer_t class_lock_timer_constructor(timer_t id)
+{
+ class_lock_timer_t _t = {
+ .lock = __lock_timer(id, &_t.flags),
+ };
+ return _t;
+}
+
+#define scoped_guard_end(_name) do { \
+ class_##_name##_t *_T = &(scope); \
+ class_##_name##_destructor(_T); \
+ _T->lock = NULL; \
+} while (0)
+
static int hash(struct signal_struct *sig, unsigned int nr)
{
return hash_32(hash32_ptr(sig) ^ nr, timer_hashbits);
@@ -327,14 +350,13 @@ bool posixtimer_deliver_signal(struct ke
* Release siglock to ensure proper locking order versus
* timr::it_lock. Keep interrupts disabled.
*/
- spin_unlock(¤t->sighand->siglock);
+ guard(spinlock)(¤t->sighand->siglock);
ret = __posixtimer_deliver_signal(info, timr);
/* Drop the reference which was acquired when the signal was queued */
posixtimer_putref(timr);
- spin_lock(¤t->sighand->siglock);
return ret;
}
@@ -717,24 +739,20 @@ void common_timer_get(struct k_itimer *t
static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting)
{
- const struct k_clock *kc;
- struct k_itimer *timr;
- unsigned long flags;
- int ret = 0;
-
- timr = lock_timer(timer_id, &flags);
- if (!timr)
- return -EINVAL;
+ scoped_guard (lock_timer, timer_id) {
+ struct k_itimer *timr = __guard_ptr(lock_timer)(&scope);
+ const struct k_clock *kc;
+
+ memset(setting, 0, sizeof(*setting));
+ kc = timr->kclock;
+ if (WARN_ON_ONCE(!kc || !kc->timer_get))
+ return -EINVAL;
- memset(setting, 0, sizeof(*setting));
- kc = timr->kclock;
- if (WARN_ON_ONCE(!kc || !kc->timer_get))
- ret = -EINVAL;
- else
kc->timer_get(timr, setting);
+ return 0;
+ }
- spin_unlock_irqrestore(&timr->it_lock, flags);
- return ret;
+ return -EINVAL;
}
/* Get the time remaining on a POSIX.1b interval timer. */
@@ -788,18 +806,12 @@ SYSCALL_DEFINE2(timer_gettime32, timer_t
*/
SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
{
- struct k_itimer *timr;
- unsigned long flags;
- int overrun;
-
- timr = lock_timer(timer_id, &flags);
- if (!timr)
- return -EINVAL;
-
- overrun = timer_overrun_to_int(timr);
- spin_unlock_irqrestore(&timr->it_lock, flags);
+ scoped_guard (lock_timer, timer_id) {
+ struct k_itimer *timr = __guard_ptr(lock_timer)(&scope);
- return overrun;
+ return timer_overrun_to_int(timr);
+ }
+ return -EINVAL;
}
static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
@@ -855,15 +867,9 @@ static void common_timer_wait_running(st
* when the task which tries to delete or disarm the timer has preempted
* the task which runs the expiry in task work context.
*/
-static struct k_itimer *timer_wait_running(struct k_itimer *timer, unsigned long *flags,
- bool delete)
+static void timer_wait_running(struct k_itimer *timer)
{
const struct k_clock *kc = READ_ONCE(timer->kclock);
- timer_t timer_id = READ_ONCE(timer->it_id);
-
- /* Prevent kfree(timer) after dropping the lock */
- rcu_read_lock();
- spin_unlock_irqrestore(&timer->it_lock, *flags);
/*
* kc->timer_wait_running() might drop RCU lock. So @timer cannot
@@ -872,27 +878,6 @@ static struct k_itimer *timer_wait_runni
*/
if (!WARN_ON_ONCE(!kc->timer_wait_running))
kc->timer_wait_running(timer);
-
- rcu_read_unlock();
-
- /*
- * On deletion the timer has been marked invalid before
- * timer_delete_hook() has been invoked. That means that the
- * current task is the only one which has access to the timer and
- * even after dropping timer::it_lock above, no other thread can
- * have accessed the timer.
- */
- if (delete) {
- spin_lock_irqsave(&timer->it_lock, *flags);
- return timer;
- }
-
- /*
- * If invoked from timer_set() the timer could have been deleted
- * after dropping the lock. So in that case the timer needs to be
- * looked up and validated.
- */
- return lock_timer(timer_id, flags);
}
/*
@@ -952,8 +937,6 @@ static int do_timer_settime(timer_t time
struct itimerspec64 *old_spec64)
{
const struct k_clock *kc;
- struct k_itimer *timr;
- unsigned long flags;
int error;
if (!timespec64_valid(&new_spec64->it_interval) ||
@@ -963,33 +946,35 @@ static int do_timer_settime(timer_t time
if (old_spec64)
memset(old_spec64, 0, sizeof(*old_spec64));
- timr = lock_timer(timer_id, &flags);
retry:
- if (!timr)
- return -EINVAL;
+ scoped_guard (lock_timer, timer_id) {
+ struct k_itimer *timr = __guard_ptr(lock_timer)(&scope);
- if (old_spec64)
- old_spec64->it_interval = ktime_to_timespec64(timr->it_interval);
+ if (old_spec64)
+ old_spec64->it_interval = ktime_to_timespec64(timr->it_interval);
- /* Prevent signal delivery and rearming. */
- timr->it_signal_seq++;
+ /* Prevent signal delivery and rearming. */
+ timr->it_signal_seq++;
+
+ kc = timr->kclock;
+ if (WARN_ON_ONCE(!kc || !kc->timer_set))
+ return -EINVAL;
- kc = timr->kclock;
- if (WARN_ON_ONCE(!kc || !kc->timer_set))
- error = -EINVAL;
- else
error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64);
+ if (error == TIMER_RETRY) {
+ // We already got the old time...
+ old_spec64 = NULL;
+ /* Unlocks and relocks the timer if it still exists */
+
+ guard(rcu)();
+ scoped_guard_end(lock_timer);
+ timer_wait_running(timr);
+ goto retry;
+ }
- if (error == TIMER_RETRY) {
- // We already got the old time...
- old_spec64 = NULL;
- /* Unlocks and relocks the timer if it still exists */
- timr = timer_wait_running(timr, &flags, false);
- goto retry;
+ return error;
}
- spin_unlock_irqrestore(&timr->it_lock, flags);
-
- return error;
+ return -EINVAL;
}
/* Set a POSIX.1b interval timer */
@@ -1072,18 +1057,8 @@ static inline int timer_delete_hook(stru
return kc->timer_del(timer);
}
-static int posix_timer_delete(struct k_itimer *timer, timer_t id)
+static void posix_timer_invalidate(struct k_itimer *timer, unsigned long flags)
{
- unsigned long flags;
-
- if (!timer) {
- timer = lock_timer(id, &flags);
- if (!timer)
- return -EINVAL;
- } else {
- spin_lock_irqsave(&timer->it_lock, flags);
- }
-
/*
* Invalidate the timer, remove it from the linked list and remove
* it from the ignored list if pending.
@@ -1115,19 +1090,23 @@ static int posix_timer_delete(struct k_i
* delete possible after unlocking the timer as the timer
* has been marked invalid above.
*/
- timer_wait_running(timer, &flags, true);
+ guard(rcu)();
+ spin_unlock_irqrestore(&timer->it_lock, flags);
+ timer_wait_running(timer);
+ spin_lock_irqsave(&timer->it_lock, flags);
}
-
- spin_unlock_irqrestore(&timer->it_lock, flags);
- /* Remove it from the hash, which frees up the timer ID */
- posix_timer_unhash_and_free(timer);
- return 0;
}
/* Delete a POSIX.1b interval timer. */
SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
{
- return posix_timer_delete(NULL, timer_id);
+ scoped_guard (lock_timer, timer_id) {
+ posix_timer_invalidate(scope.lock, scope.flags);
+ scoped_guard_end(lock_timer);
+ posix_timer_unhash_and_free(scope.lock);
+ return 0;
+ }
+ return -EINVAL;
}
/*
@@ -1143,13 +1122,17 @@ void exit_itimers(struct task_struct *ts
return;
/* Protect against concurrent read via /proc/$PID/timers */
- spin_lock_irq(&tsk->sighand->siglock);
- hlist_move_list(&tsk->signal->posix_timers, &timers);
- spin_unlock_irq(&tsk->sighand->siglock);
+ scoped_guard (spinlock_irq, &tsk->sighand->siglock)
+ hlist_move_list(&tsk->signal->posix_timers, &timers);
/* The timers are not longer accessible via tsk::signal */
while (!hlist_empty(&timers)) {
- posix_timer_delete(hlist_entry(timers.first, struct k_itimer, list), 0);
+ struct k_itimer *timer = hlist_entry(timers.first, struct k_itimer, list);
+
+ scoped_guard (spinlock_irqsave, &timer->it_lock)
+ posix_timer_invalidate(timer, scope.flags);
+
+ posix_timer_unhash_and_free(timer);
cond_resched();
}
^ permalink raw reply [flat|nested] 19+ messages in thread* Re: [patch 04/11] posix-timers: Remove pointless unlock_timer() wrapper
2025-02-24 16:21 ` Peter Zijlstra
@ 2025-02-24 18:43 ` Thomas Gleixner
2025-02-24 21:55 ` Peter Zijlstra
0 siblings, 1 reply; 19+ messages in thread
From: Thomas Gleixner @ 2025-02-24 18:43 UTC (permalink / raw)
To: Peter Zijlstra
Cc: LKML, Anna-Maria Behnsen, Frederic Weisbecker, Benjamin Segall,
Eric Dumazet, Andrey Vagin, Pavel Tikhomirov
On Mon, Feb 24 2025 at 17:21, Peter Zijlstra wrote:
> On Mon, Feb 24, 2025 at 11:15:28AM +0100, Thomas Gleixner wrote:
>> It's just a wrapper around spin_unlock_irqrestore() with zero value.
>
> Well, I disagree... the value is that it matches lock_timer(). Both in
> naming and in argument types.
Sure, but it's not used consistently as we have places where
lock_timer() is not involved.
> @@ -327,14 +350,13 @@ bool posixtimer_deliver_signal(struct ke
> * Release siglock to ensure proper locking order versus
> * timr::it_lock. Keep interrupts disabled.
> */
> - spin_unlock(¤t->sighand->siglock);
> + guard(spinlock)(¤t->sighand->siglock);
How is that equivalent?
This is an unlock/lock pair because __posix_timer_deliver_signal() takes
timr->it_lock and now you introduced the ABBA which the sighand::siglock
unlock carefully avoids :)
>
> ret = __posixtimer_deliver_signal(info, timr);
>
> /* Drop the reference which was acquired when the signal was queued */
> posixtimer_putref(timr);
>
> - spin_lock(¤t->sighand->siglock);
> return ret;
> }
>
> @@ -717,24 +739,20 @@ void common_timer_get(struct k_itimer *t
>
> static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting)
> {
> - const struct k_clock *kc;
> - struct k_itimer *timr;
> - unsigned long flags;
> - int ret = 0;
> -
> - timr = lock_timer(timer_id, &flags);
> - if (!timr)
> - return -EINVAL;
> + scoped_guard (lock_timer, timer_id) {
> + struct k_itimer *timr = __guard_ptr(lock_timer)(&scope);
> + const struct k_clock *kc;
> +
> + memset(setting, 0, sizeof(*setting));
> + kc = timr->kclock;
> + if (WARN_ON_ONCE(!kc || !kc->timer_get))
> + return -EINVAL;
>
> - memset(setting, 0, sizeof(*setting));
> - kc = timr->kclock;
> - if (WARN_ON_ONCE(!kc || !kc->timer_get))
> - ret = -EINVAL;
> - else
> kc->timer_get(timr, setting);
> + return 0;
> + }
>
> - spin_unlock_irqrestore(&timr->it_lock, flags);
> - return ret;
> + return -EINVAL;
So the resulting code is:
scoped_guard (lock_timer, timer_id) {
struct k_itimer *timr = __guard_ptr(lock_timer)(&scope);
const struct k_clock *kc;
memset(setting, 0, sizeof(*setting));
kc = timr->kclock;
if (WARN_ON_ONCE(!kc || !kc->timer_get))
return -EINVAL;
return 0;
}
return -EINVAL;
I had to go and stare at the guard/class muck 10 times to convince
myself, that this actually works. This really wants to express the
condition of the scoped_guard() somehow, e.g. scoped_cond_guard() or
such.
> /* Delete a POSIX.1b interval timer. */
> SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
> {
> - return posix_timer_delete(NULL, timer_id);
> + scoped_guard (lock_timer, timer_id) {
> + posix_timer_invalidate(scope.lock, scope.flags);
> + scoped_guard_end(lock_timer);
> + posix_timer_unhash_and_free(scope.lock);
Not sure whether it's a good idea to free the scope.lock and not
scope.timer :)
Thanks,
tglx
^ permalink raw reply [flat|nested] 19+ messages in thread* Re: [patch 04/11] posix-timers: Remove pointless unlock_timer() wrapper
2025-02-24 18:43 ` Thomas Gleixner
@ 2025-02-24 21:55 ` Peter Zijlstra
0 siblings, 0 replies; 19+ messages in thread
From: Peter Zijlstra @ 2025-02-24 21:55 UTC (permalink / raw)
To: Thomas Gleixner
Cc: LKML, Anna-Maria Behnsen, Frederic Weisbecker, Benjamin Segall,
Eric Dumazet, Andrey Vagin, Pavel Tikhomirov
On Mon, Feb 24, 2025 at 07:43:26PM +0100, Thomas Gleixner wrote:
> On Mon, Feb 24 2025 at 17:21, Peter Zijlstra wrote:
> > On Mon, Feb 24, 2025 at 11:15:28AM +0100, Thomas Gleixner wrote:
> >> It's just a wrapper around spin_unlock_irqrestore() with zero value.
> >
> > Well, I disagree... the value is that it matches lock_timer(). Both in
> > naming and in argument types.
>
> Sure, but it's not used consistently as we have places where
> lock_timer() is not involved.
>
> > @@ -327,14 +350,13 @@ bool posixtimer_deliver_signal(struct ke
> > * Release siglock to ensure proper locking order versus
> > * timr::it_lock. Keep interrupts disabled.
> > */
> > - spin_unlock(¤t->sighand->siglock);
> > + guard(spinlock)(¤t->sighand->siglock);
>
> How is that equivalent?
I R idiot :-)
> So the resulting code is:
>
> scoped_guard (lock_timer, timer_id) {
> struct k_itimer *timr = __guard_ptr(lock_timer)(&scope);
> const struct k_clock *kc;
>
> memset(setting, 0, sizeof(*setting));
> kc = timr->kclock;
> if (WARN_ON_ONCE(!kc || !kc->timer_get))
> return -EINVAL;
>
> return 0;
> }
> return -EINVAL;
>
> I had to go and stare at the guard/class muck 10 times to convince
> myself, that this actually works. This really wants to express the
> condition of the scoped_guard() somehow, e.g. scoped_cond_guard() or
> such.
Right, so the alternative form is something like:
scoped_cond_guard (lock_timer, return -EINVAL, timer_id) {
struct k_itimer *timr = __guard_ptr(lock_timer)(&scope);
const struct k_clock *kc;
memset(setting, 0, sizeof(*setting));
kc = timr->kclock;
if (WARN_ON_ONCE(!kc || !kc->timer_get))
return -EINVAL;
}
return 0;
Is that really so much better?
> > /* Delete a POSIX.1b interval timer. */
> > SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
> > {
> > - return posix_timer_delete(NULL, timer_id);
> > + scoped_guard (lock_timer, timer_id) {
> > + posix_timer_invalidate(scope.lock, scope.flags);
> > + scoped_guard_end(lock_timer);
> > + posix_timer_unhash_and_free(scope.lock);
>
> Not sure whether it's a good idea to free the scope.lock and not
> scope.timer :)
There is no scope.timer, the way this works is that the main pointer is
.lock, per the __DEFINE_UNLOCK_GUARD() helper.
I said there were rough edges :-/
Anyway, should I continue poking at this to see if I can clean it up /
extract more useful helpers.
Or shall I just let it be.
^ permalink raw reply [flat|nested] 19+ messages in thread
* [patch 05/11] posix-timers: Rework timer removal
2025-02-24 10:15 [patch 00/11] posix-timers: Rework the global hash table and provide a sane mechanism for CRIU Thomas Gleixner
` (3 preceding siblings ...)
2025-02-24 10:15 ` [patch 04/11] posix-timers: Remove pointless unlock_timer() wrapper Thomas Gleixner
@ 2025-02-24 10:15 ` Thomas Gleixner
2025-02-24 10:15 ` [patch 06/11] posix-timers: Make signal_struct::next_posix_timer_id an atomic_t Thomas Gleixner
` (5 subsequent siblings)
10 siblings, 0 replies; 19+ messages in thread
From: Thomas Gleixner @ 2025-02-24 10:15 UTC (permalink / raw)
To: LKML
Cc: Anna-Maria Behnsen, Frederic Weisbecker, Benjamin Segall,
Eric Dumazet, Andrey Vagin, Pavel Tikhomirov, Peter Zijlstra
sys_timer_delete() and the do_exit() cleanup function itimer_delete() are
doing the same thing, but have needlessly different implementations instead
of sharing the code.
The other oddity of timer deletion is the fact that the timer is not
invalidated before the actual deletion happens, which allows concurrent
lookups to succeed.
That's wrong because a timer which is in the process of being deleted
should not be visible and any actions like signal queueing, delivery and
rearming should not happen once the task, which invoked timer_delete(), has
the timer locked.
Rework the code so that:
1) The signal queueing and delivery code ignore timers which are marked
invalid
2) The deletion implementation between sys_timer_delete() and
itimer_delete() is shared
3) The timer is invalidated and removed from the linked lists before
the deletion callback of the relevant clock is invoked.
That requires reworking timer_wait_running() as it does a lookup of
the timer when relocking it at the end. In case of deletion this
lookup would fail due to the preceding invalidation and the wait loop
would terminate prematurely.
But due to the preceding invalidation the timer cannot be accessed by
other tasks anymore, so there is no way that the timer has been freed
after the timer lock has been dropped.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/posix-timers.h | 7 +
kernel/signal.c | 2
kernel/time/posix-timers.c | 156 +++++++++++++++++++------------------------
3 files changed, 80 insertions(+), 85 deletions(-)
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -240,6 +240,13 @@ static inline void posixtimer_sigqueue_p
posixtimer_putref(tmr);
}
+
+static inline bool posixtimer_valid(const struct k_itimer *timer)
+{
+ unsigned long val = (unsigned long)timer->it_signal;
+
+ return !(val & 0x1UL);
+}
#else /* CONFIG_POSIX_TIMERS */
static inline void posixtimer_sigqueue_getref(struct sigqueue *q) { }
static inline void posixtimer_sigqueue_putref(struct sigqueue *q) { }
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2092,7 +2092,7 @@ static inline void posixtimer_sig_ignore
* from a non-periodic timer, then just drop the reference
* count. Otherwise queue it on the ignored list.
*/
- if (tmr->it_signal && tmr->it_sig_periodic)
+ if (posixtimer_valid(tmr) && tmr->it_sig_periodic)
hlist_add_head(&tmr->ignored_list, &tsk->signal->ignored_posix_timers);
else
posixtimer_putref(tmr);
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -275,7 +275,7 @@ static bool __posixtimer_deliver_signal(
* since the signal was queued. In either case, don't rearm and
* drop the signal.
*/
- if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!timr->it_signal))
+ if (timr->it_signal_seq != timr->it_sigqueue_seq || !posixtimer_valid(timr))
return false;
if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING))
@@ -320,6 +320,9 @@ void posix_timer_queue_signal(struct k_i
{
lockdep_assert_held(&timr->it_lock);
+ if (!posixtimer_valid(timr))
+ return;
+
timr->it_status = timr->it_interval ? POSIX_TIMER_REQUEUE_PENDING : POSIX_TIMER_DISARMED;
posixtimer_send_sigqueue(timr);
}
@@ -540,11 +543,11 @@ static struct k_itimer *__lock_timer(tim
* The hash lookup and the timers are RCU protected.
*
* Timers are added to the hash in invalid state where
- * timr::it_signal == NULL. timer::it_signal is only set after the
- * rest of the initialization succeeded.
+ * timr::it_signal is marked invalid. timer::it_signal is only set
+ * after the rest of the initialization succeeded.
*
* Timer destruction happens in steps:
- * 1) Set timr::it_signal to NULL with timr::it_lock held
+ * 1) Set timr::it_signal marked invalid with timr::it_lock held
* 2) Release timr::it_lock
* 3) Remove from the hash under hash_lock
* 4) Put the reference count.
@@ -561,8 +564,8 @@ static struct k_itimer *__lock_timer(tim
*
* The lookup validates locklessly that timr::it_signal ==
* current::it_signal and timr::it_id == @timer_id. timr::it_id
- * can't change, but timr::it_signal becomes NULL during
- * destruction.
+ * can't change, but timr::it_signal can become invalid during
+ * destruction, which makes the locked check fail.
*/
rcu_read_lock();
timr = posix_timer_by_id(timer_id);
@@ -809,8 +812,8 @@ static void common_timer_wait_running(st
* when the task which tries to delete or disarm the timer has preempted
* the task which runs the expiry in task work context.
*/
-static struct k_itimer *timer_wait_running(struct k_itimer *timer,
- unsigned long *flags)
+static struct k_itimer *timer_wait_running(struct k_itimer *timer, unsigned long *flags,
+ bool delete)
{
const struct k_clock *kc = READ_ONCE(timer->kclock);
timer_t timer_id = READ_ONCE(timer->it_id);
@@ -820,14 +823,32 @@ static struct k_itimer *timer_wait_runni
spin_unlock_irqrestore(&timer->it_lock, *flags);
/*
- * kc->timer_wait_running() might drop RCU lock. So @timer
- * cannot be touched anymore after the function returns!
+ * kc->timer_wait_running() might drop RCU lock. So @timer cannot
+ * be touched anymore after the function returns, except when
+ * @delete is true!
*/
if (!WARN_ON_ONCE(!kc->timer_wait_running))
kc->timer_wait_running(timer);
rcu_read_unlock();
- /* Relock the timer. It might be not longer hashed. */
+
+ /*
+ * On deletion the timer has been marked invalid before
+ * timer_delete_hook() has been invoked. That means that the
+ * current task is the only one which has access to the timer and
+ * even after dropping timer::it_lock above, no other thread can
+ * have accessed the timer.
+ */
+ if (delete) {
+ spin_lock_irqsave(&timer->it_lock, *flags);
+ return timer;
+ }
+
+ /*
+ * If invoked from timer_set() the timer could have been deleted
+ * after dropping the lock. So in that case the timer needs to be
+ * looked up and validated.
+ */
return lock_timer(timer_id, flags);
}
@@ -920,7 +941,7 @@ static int do_timer_settime(timer_t time
// We already got the old time...
old_spec64 = NULL;
/* Unlocks and relocks the timer if it still exists */
- timr = timer_wait_running(timr, &flags);
+ timr = timer_wait_running(timr, &flags, false);
goto retry;
}
spin_unlock_irqrestore(&timr->it_lock, flags);
@@ -1008,95 +1029,62 @@ static inline int timer_delete_hook(stru
return kc->timer_del(timer);
}
-/* Delete a POSIX.1b interval timer. */
-SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
+static int posix_timer_delete(struct k_itimer *timer, timer_t id)
{
- struct k_itimer *timer;
unsigned long flags;
- timer = lock_timer(timer_id, &flags);
-
-retry_delete:
- if (!timer)
- return -EINVAL;
-
- if (unlikely(timer_delete_hook(timer) == TIMER_RETRY)) {
- /* Unlocks and relocks the timer if it still exists */
- timer = timer_wait_running(timer, &flags);
- goto retry_delete;
+ if (!timer) {
+ timer = lock_timer(id, &flags);
+ if (!timer)
+ return -EINVAL;
+ } else {
+ spin_lock_irqsave(&timer->it_lock, flags);
}
- spin_lock(&current->sighand->siglock);
- hlist_del(&timer->list);
- posix_timer_cleanup_ignored(timer);
/*
- * A concurrent lookup could check timer::it_signal lockless. It
- * will reevaluate with timer::it_lock held and observe the NULL.
+ * Invalidate the timer, remove it from the linked list and remove
+ * it from the ignored list if pending.
*
- * It must be written with siglock held so that the signal code
- * observes timer->it_signal == NULL in do_sigaction(SIG_IGN),
+ * The invalidation must be written with siglock held so that the
+ * signal code observes timer->it_valid == false in do_sigaction(),
* which prevents it from moving a pending signal of a deleted
* timer to the ignore list.
+ *
+ * The invalidation also prevents signal queueing, signal delivery
+ * and therefore rearming from the signal delivery path.
+ *
+ * A concurrent lookup can still find the timer in the hash, but it
+ * will check timer::it_signal with timer::it_lock held and observe
+ * bit 0 set, which invalidates it. That also prevents the timer ID
+ * from being handed out before this timer is completely gone.
*/
- WRITE_ONCE(timer->it_signal, NULL);
- spin_unlock(&current->sighand->siglock);
-
- spin_unlock_irqrestore(&timer->it_lock, flags);
- posix_timer_unhash_and_free(timer);
- return 0;
-}
+ scoped_guard(spinlock, &current->sighand->siglock) {
+ unsigned long sig = (unsigned long)timer->it_signal | 1UL;
-/*
- * Delete a timer if it is armed, remove it from the hash and schedule it
- * for RCU freeing.
- */
-static void itimer_delete(struct k_itimer *timer)
-{
- unsigned long flags;
-
- /*
- * irqsave is required to make timer_wait_running() work.
- */
- spin_lock_irqsave(&timer->it_lock, flags);
+ WRITE_ONCE(timer->it_signal, (struct signal_struct *)sig);
+ hlist_del(&timer->list);
+ posix_timer_cleanup_ignored(timer);
+ }
-retry_delete:
- /*
- * Even if the timer is not longer accessible from other tasks
- * it still might be armed and queued in the underlying timer
- * mechanism. Worse, that timer mechanism might run the expiry
- * function concurrently.
- */
- if (timer_delete_hook(timer) == TIMER_RETRY) {
+ while (timer_delete_hook(timer) == TIMER_RETRY) {
/*
- * Timer is expired concurrently, prevent livelocks
- * and pointless spinning on RT.
- *
- * timer_wait_running() drops timer::it_lock, which opens
- * the possibility for another task to delete the timer.
- *
- * That's not possible here because this is invoked from
- * do_exit() only for the last thread of the thread group.
- * So no other task can access and delete that timer.
+ * Unlocks and relocks the timer. There is no concurrent
+ * delete possible after unlocking the timer as the timer
+ * has been marked invalid above.
*/
- if (WARN_ON_ONCE(timer_wait_running(timer, &flags) != timer))
- return;
-
- goto retry_delete;
+ timer_wait_running(timer, &flags, true);
}
- hlist_del(&timer->list);
-
- posix_timer_cleanup_ignored(timer);
-
- /*
- * Setting timer::it_signal to NULL is technically not required
- * here as nothing can access the timer anymore legitimately via
- * the hash table. Set it to NULL nevertheless so that all deletion
- * paths are consistent.
- */
- WRITE_ONCE(timer->it_signal, NULL);
spin_unlock_irqrestore(&timer->it_lock, flags);
+ /* Remove it from the hash, which frees up the timer ID */
posix_timer_unhash_and_free(timer);
+ return 0;
+}
+
+/* Delete a POSIX.1b interval timer. */
+SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
+{
+ return posix_timer_delete(NULL, timer_id);
}
/*
@@ -1118,7 +1106,7 @@ void exit_itimers(struct task_struct *ts
/* The timers are not longer accessible via tsk::signal */
while (!hlist_empty(&timers)) {
- itimer_delete(hlist_entry(timers.first, struct k_itimer, list));
+ posix_timer_delete(hlist_entry(timers.first, struct k_itimer, list), 0);
cond_resched();
}
^ permalink raw reply [flat|nested] 19+ messages in thread* [patch 06/11] posix-timers: Make signal_struct::next_posix_timer_id an atomic_t
2025-02-24 10:15 [patch 00/11] posix-timers: Rework the global hash table and provide a sane mechanism for CRIU Thomas Gleixner
` (4 preceding siblings ...)
2025-02-24 10:15 ` [patch 05/11] posix-timers: Rework timer removal Thomas Gleixner
@ 2025-02-24 10:15 ` Thomas Gleixner
2025-02-24 13:20 ` Peter Zijlstra
2025-02-24 10:15 ` [patch 07/11] posix-timers: Improve hash table performance Thomas Gleixner
` (4 subsequent siblings)
10 siblings, 1 reply; 19+ messages in thread
From: Thomas Gleixner @ 2025-02-24 10:15 UTC (permalink / raw)
To: LKML
Cc: Anna-Maria Behnsen, Frederic Weisbecker, Benjamin Segall,
Eric Dumazet, Andrey Vagin, Pavel Tikhomirov, Peter Zijlstra
From: Eric Dumazet <edumazet@google.com>
The global hash_lock protecting the posix timer hash table can be heavily
contended especially when there is an extensive linear search for a timer
ID.
Timer IDs are handed out by monotonically increasing next_posix_timer_id
and then validating that there is no timer with the same ID in the hash
table. Both operations happen with the global hash lock held.
To reduce the hash lock contention the hash will be reworked to a scaled
hash with per bucket locks, which requires to handle the ID counter
lockless.
Prepare for this by making next_posix_timer_id an atomic_t, which can be
used lockless with atomic_inc_return().
[ tglx: Adopted from Eric's series, massaged change log and simplified it ]
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20250219125522.2535263-2-edumazet@google.com
---
include/linux/sched/signal.h | 2 +-
kernel/time/posix-timers.c | 14 +++++---------
2 files changed, 6 insertions(+), 10 deletions(-)
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -136,7 +136,7 @@ struct signal_struct {
#ifdef CONFIG_POSIX_TIMERS
/* POSIX.1b Interval Timers */
- unsigned int next_posix_timer_id;
+ atomic_t next_posix_timer_id;
struct hlist_head posix_timers;
struct hlist_head ignored_posix_timers;
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -105,21 +105,17 @@ static bool posix_timer_hashed(struct hl
static int posix_timer_add(struct k_itimer *timer)
{
struct signal_struct *sig = current->signal;
- struct hlist_head *head;
- unsigned int cnt, id;
/*
* FIXME: Replace this by a per signal struct xarray once there is
* a plan to handle the resulting CRIU regression gracefully.
*/
- for (cnt = 0; cnt <= INT_MAX; cnt++) {
- spin_lock(&hash_lock);
- id = sig->next_posix_timer_id;
-
- /* Write the next ID back. Clamp it to the positive space */
- sig->next_posix_timer_id = (id + 1) & INT_MAX;
+ for (unsigned int cnt = 0; cnt <= INT_MAX; cnt++) {
+ /* Get the next timer ID and clamp it to positive space */
+ unsigned int id = (atomic_inc_return(&sig->next_posix_timer_id) - 1) & INT_MAX;
+ struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)];
- head = &posix_timers_hashtable[hash(sig, id)];
+ spin_lock(&hash_lock);
if (!posix_timer_hashed(head, sig, id)) {
/*
* Set the timer ID and the signal pointer to make
^ permalink raw reply [flat|nested] 19+ messages in thread* Re: [patch 06/11] posix-timers: Make signal_struct::next_posix_timer_id an atomic_t
2025-02-24 10:15 ` [patch 06/11] posix-timers: Make signal_struct::next_posix_timer_id an atomic_t Thomas Gleixner
@ 2025-02-24 13:20 ` Peter Zijlstra
2025-02-24 13:34 ` Eric Dumazet
2025-02-24 19:38 ` Thomas Gleixner
0 siblings, 2 replies; 19+ messages in thread
From: Peter Zijlstra @ 2025-02-24 13:20 UTC (permalink / raw)
To: Thomas Gleixner
Cc: LKML, Anna-Maria Behnsen, Frederic Weisbecker, Benjamin Segall,
Eric Dumazet, Andrey Vagin, Pavel Tikhomirov
On Mon, Feb 24, 2025 at 11:15:32AM +0100, Thomas Gleixner wrote:
> + unsigned int id = (atomic_inc_return(&sig->next_posix_timer_id) - 1) & INT_MAX;
How about:
unsigned int id = atomic_fetch_inc(&sig->next_posix_timer_id) & INT_MAX;
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [patch 06/11] posix-timers: Make signal_struct::next_posix_timer_id an atomic_t
2025-02-24 13:20 ` Peter Zijlstra
@ 2025-02-24 13:34 ` Eric Dumazet
2025-02-24 19:38 ` Thomas Gleixner
1 sibling, 0 replies; 19+ messages in thread
From: Eric Dumazet @ 2025-02-24 13:34 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Thomas Gleixner, LKML, Anna-Maria Behnsen, Frederic Weisbecker,
Benjamin Segall, Andrey Vagin, Pavel Tikhomirov
On Mon, Feb 24, 2025 at 2:20 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Feb 24, 2025 at 11:15:32AM +0100, Thomas Gleixner wrote:
>
> > + unsigned int id = (atomic_inc_return(&sig->next_posix_timer_id) - 1) & INT_MAX;
>
> How about:
>
> unsigned int id = atomic_fetch_inc(&sig->next_posix_timer_id) & INT_MAX;
Nice, it seems a few calls in net trees could use this as well.
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [patch 06/11] posix-timers: Make signal_struct::next_posix_timer_id an atomic_t
2025-02-24 13:20 ` Peter Zijlstra
2025-02-24 13:34 ` Eric Dumazet
@ 2025-02-24 19:38 ` Thomas Gleixner
1 sibling, 0 replies; 19+ messages in thread
From: Thomas Gleixner @ 2025-02-24 19:38 UTC (permalink / raw)
To: Peter Zijlstra
Cc: LKML, Anna-Maria Behnsen, Frederic Weisbecker, Benjamin Segall,
Eric Dumazet, Andrey Vagin, Pavel Tikhomirov
On Mon, Feb 24 2025 at 14:20, Peter Zijlstra wrote:
> On Mon, Feb 24, 2025 at 11:15:32AM +0100, Thomas Gleixner wrote:
>
>> + unsigned int id = (atomic_inc_return(&sig->next_posix_timer_id) - 1) & INT_MAX;
>
> How about:
>
> unsigned int id = atomic_fetch_inc(&sig->next_posix_timer_id) & INT_MAX;
Duh. Yes.
^ permalink raw reply [flat|nested] 19+ messages in thread
* [patch 07/11] posix-timers: Improve hash table performance
2025-02-24 10:15 [patch 00/11] posix-timers: Rework the global hash table and provide a sane mechanism for CRIU Thomas Gleixner
` (5 preceding siblings ...)
2025-02-24 10:15 ` [patch 06/11] posix-timers: Make signal_struct::next_posix_timer_id an atomic_t Thomas Gleixner
@ 2025-02-24 10:15 ` Thomas Gleixner
2025-02-24 19:45 ` Thomas Gleixner
2025-02-24 10:15 ` [patch 08/11] posix-timers: Make per process list RCU safe Thomas Gleixner
` (3 subsequent siblings)
10 siblings, 1 reply; 19+ messages in thread
From: Thomas Gleixner @ 2025-02-24 10:15 UTC (permalink / raw)
To: LKML
Cc: Anna-Maria Behnsen, Frederic Weisbecker, Benjamin Segall,
Eric Dumazet, Andrey Vagin, Pavel Tikhomirov, Peter Zijlstra
Eric and Ben reported a significant performance bottleneck on the global
hash, which is used to store posix timers for lookup.
Eric tried to do a lockless validation of a new timer ID before trying to
insert the timer, but that does not solve the problem.
For the non-contended case this is a pointless exercise and for the
contended case this extra lookup just creates enough interleaving that all
tasks can make progress.
There are actually two real solutions to the problem:
1) Provide a per process (signal struct) xarray storage
2) Implement a smarter hash like the one in the futex code
#1 works perfectly fine for most cases, but the fact that CRIU enforced a
linear increasing timer ID to restore timers makes this problematic.
It's easy enough to create a sparse timer ID space, which amounts very
fast to a large chunk of memory consumed for the xarray. 2048 timers with
an ID offset of 512 consume more than one megabyte of memory for the
xarray storage.
#2 The main advantage of the futex hash is that it uses per hash bucket
locks instead of a global hash lock. Aside of that it is scaled
according to the number of CPUs at boot time.
Experiments with artificial benchmarks have shown that a scaled hash with
per bucket locks comes pretty close to the xarray performance and in some
scenarios it performs better.
Test 1:
A single process creates 20000 timers and afterwards invokes
timer_getoverrun(2) on each of them:
mainline Eric newhash xarray
create 23 ms 23 ms 9 ms 8 ms
getoverrun 14 ms 14 ms 5 ms 4 ms
Test 2:
A single process creates 50000 timers and afterwards invokes
timer_getoverrun(2) on each of them:
mainline Eric newhash xarray
create 98 ms 219 ms 20 ms 18 ms
getoverrun 62 ms 62 ms 10 ms 9 ms
Test 3:
A single process creates 100000 timers and afterwards invokes
timer_getoverrun(2) on each of them:
mainline Eric newhash xarray
create 313 ms 750 ms 48 ms 33 ms
getoverrun 261 ms 260 ms 20 ms 14 ms
Eric's changes create quite some overhead in the create() path due to the
double list walk, as the main issue according to perf is the list walk
itself. With 100k timers each hash bucket contains ~200 timers, which in
the worst case need to be all inspected. The same problem applies for
getoverrun() where the lookup has to walk through the hash buckets to find
the timer it is looking for.
The scaled hash obviously reduces hash collisions and lock contention
significantly. This becomes more prominent with concurrency.
Test 4:
A process creates 63 threads and all threads wait on a barrier before
each instance creates 20000 timers and afterwards invokes
timer_getoverrun(2) on each of them. The threads are pinned on
separate CPUs to achieve maximum concurrency. The numbers are the
average times per thread:
mainline Eric newhash xarray
create 180239 ms 38599 ms 579 ms 813 ms
getoverrun 2645 ms 2642 ms 32 ms 7 ms
Test 5:
A process forks 63 times and all forks wait on a barrier before each
instance creates 20000 timers and afterwards invokes
timer_getoverrun(2) on each of them. The processes are pinned on
separate CPUs to achieve maximum concurrency. The numbers are the
average times per process:
mainline eric newhash xarray
create 157253 ms 40008 ms 83 ms 60 ms
getoverrun 2611 ms 2614 ms 40 ms 4 ms
So clearly the reduction of lock contention with Eric's changes makes a
significant difference for the create() loop, but it does not mitigate the
problem of long list walks, which is clearly visible on the getoverrun()
side because that is purely dominated by the lookup itself. Once the timer
is found, the syscall just reads from the timer structure with no other
locks or code paths involved and returns.
The reason for the difference between the thread and the fork case for the
new hash and the xarray is that both suffer from contention on
sighand::siglock and the xarray suffers additionally from contention on the
xarray lock on insertion.
The only case where the reworked hash slightly outperforms the xarray is a
tight loop which creates and deletes timers.
Test 4:
A process creates 63 threads and all threads wait on a barrier before
each instance runs a loop which creates and deletes a timer 100000
times in a row. The threads are pinned on separate CPUs to achieve
maximum concurrency. The numbers are the average times per thread:
mainline Eric newhash xarray
loop 5917 ms 5897 ms 5473 ms 7846 ms
Test 5:
A process forks 63 times and all forks wait on a barrier before each
instance runs a loop which creates and deletes a timer 100000
times in a row. The processes are pinned on separate CPUs to achieve
maximum concurrency. The numbers are the average times per process:
mainline Eric newhash xarray
loop 5137 ms 7828 ms 891 ms 872 ms
In both tests there is not much contention on the hash, but the ucount
accounting for the signal and in the thread case the sighand::siglock
contention (plus the xarray locking) contribute dominantly to the overhead.
As the memory consumption of the xarray in the sparse ID case is
significant, the scaled hash with per bucket locks seems to be the better
overall option. While the xarray has faster lookup times for a large number
of timers, the actual syscall usage, which requires the lookup is not an
extreme hotpath. Most applications utilize signal delivery and all syscalls
except timer_getoverrun(2) are all but cheap.
So implement a scaled hash with per bucket locks, which offers the best
tradeoff between performance and memory consumption.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/time/posix-timers.c | 101 ++++++++++++++++++++++++++++++---------------
1 file changed, 69 insertions(+), 32 deletions(-)
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -12,10 +12,10 @@
#include <linux/compat.h>
#include <linux/compiler.h>
#include <linux/hash.h>
-#include <linux/hashtable.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/list.h>
+#include <linux/memblock.h>
#include <linux/nospec.h>
#include <linux/posix-clock.h>
#include <linux/posix-timers.h>
@@ -40,8 +40,18 @@ static struct kmem_cache *posix_timers_c
* This allows checkpoint/restore to reconstruct the exact timer IDs for
* a process.
*/
-static DEFINE_HASHTABLE(posix_timers_hashtable, 9);
-static DEFINE_SPINLOCK(hash_lock);
+struct timer_hash_bucket {
+ spinlock_t lock;
+ struct hlist_head head;
+};
+
+static struct {
+ struct timer_hash_bucket *buckets;
+ unsigned long bits;
+} __timer_data __ro_after_init __aligned(2*sizeof(long));
+
+#define timer_buckets (__timer_data.buckets)
+#define timer_hashbits (__timer_data.bits)
static const struct k_clock * const posix_clocks[];
static const struct k_clock *clockid_to_kclock(const clockid_t id);
@@ -63,16 +73,16 @@ static struct k_itimer *__lock_timer(tim
static int hash(struct signal_struct *sig, unsigned int nr)
{
- return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable));
+ return hash_32(hash32_ptr(sig) ^ nr, timer_hashbits);
}
static struct k_itimer *posix_timer_by_id(timer_t id)
{
struct signal_struct *sig = current->signal;
- struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)];
+ struct timer_hash_bucket *bucket = &timer_buckets[hash(sig, id)];
struct k_itimer *timer;
- hlist_for_each_entry_rcu(timer, head, t_hash) {
+ hlist_for_each_entry_rcu(timer, &bucket->head, t_hash) {
/* timer->it_signal can be set concurrently */
if ((READ_ONCE(timer->it_signal) == sig) && (timer->it_id == id))
return timer;
@@ -91,11 +101,13 @@ static inline struct signal_struct *posi
return (struct signal_struct *)(val & ~1UL);
}
-static bool posix_timer_hashed(struct hlist_head *head, struct signal_struct *sig, timer_t id)
+static bool posix_timer_hashed(struct timer_hash_bucket *bucket, struct signal_struct *sig,
+ timer_t id)
{
+ struct hlist_head *head = &bucket->head;
struct k_itimer *timer;
- hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&hash_lock)) {
+ hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&bucket->lock)) {
if ((posix_sig_owner(timer) == sig) && (timer->it_id == id))
return true;
}
@@ -106,34 +118,34 @@ static int posix_timer_add(struct k_itim
{
struct signal_struct *sig = current->signal;
- /*
- * FIXME: Replace this by a per signal struct xarray once there is
- * a plan to handle the resulting CRIU regression gracefully.
- */
for (unsigned int cnt = 0; cnt <= INT_MAX; cnt++) {
/* Get the next timer ID and clamp it to positive space */
unsigned int id = (atomic_inc_return(&sig->next_posix_timer_id) - 1) & INT_MAX;
- struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)];
+ struct timer_hash_bucket *bucket = &timer_buckets[hash(sig, id)];
- spin_lock(&hash_lock);
- if (!posix_timer_hashed(head, sig, id)) {
+ scoped_guard (spinlock, &bucket->lock) {
/*
- * Set the timer ID and the signal pointer to make
- * it identifiable in the hash table. The signal
- * pointer has bit 0 set to indicate that it is not
- * yet fully initialized. posix_timer_hashed()
- * masks this bit out, but the syscall lookup fails
- * to match due to it being set. This guarantees
- * that there can't be duplicate timer IDs handed
- * out.
+ * Validate under the lock as this could have raced
+ * against another thread ending up with the same
+ * ID, which is highly unlikely, but possible.
*/
- timer->it_id = (timer_t)id;
- timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL);
- hlist_add_head_rcu(&timer->t_hash, head);
- spin_unlock(&hash_lock);
- return id;
+ if (!posix_timer_hashed(bucket, sig, id)) {
+ /*
+ * Set the timer ID and the signal pointer to make
+ * it identifiable in the hash table. The signal
+ * pointer has bit 0 set to indicate that it is not
+ * yet fully initialized. posix_timer_hashed()
+ * masks this bit out, but the syscall lookup fails
+ * to match due to it being set. This guarantees
+ * that there can't be duplicate timer IDs handed
+ * out.
+ */
+ timer->it_id = (timer_t)id;
+ timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL);
+ hlist_add_head_rcu(&timer->t_hash, &bucket->head);
+ return id;
+ }
}
- spin_unlock(&hash_lock);
cond_resched();
}
/* POSIX return code when no timer ID could be allocated */
@@ -388,9 +400,11 @@ void posixtimer_free_timer(struct k_itim
static void posix_timer_unhash_and_free(struct k_itimer *tmr)
{
- spin_lock(&hash_lock);
- hlist_del_rcu(&tmr->t_hash);
- spin_unlock(&hash_lock);
+ unsigned int idx = hash(posix_sig_owner(tmr), tmr->it_id);
+ struct timer_hash_bucket *bucket = &timer_buckets[idx];
+
+ scoped_guard (spinlock, &bucket->lock)
+ hlist_del_rcu(&tmr->t_hash);
posixtimer_putref(tmr);
}
@@ -1549,3 +1563,26 @@ static const struct k_clock *clockid_to_
return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))];
}
+
+static int __init posixtimer_init(void)
+{
+ unsigned long i, size;
+ unsigned int shift;
+
+ if (IS_ENABLED(CONFIG_BASE_SMALL))
+ size = 512;
+ else
+ size = roundup_pow_of_two(512 * num_possible_cpus());
+
+ timer_buckets = alloc_large_system_hash("posixtimers", sizeof(*timer_buckets),
+ size, 0, 0, &shift, NULL, size, size);
+ size = 1UL << shift;
+ timer_hashbits = ilog2(size);
+
+ for (i = 0; i < size; i++) {
+ spin_lock_init(&timer_buckets[i].lock);
+ INIT_HLIST_HEAD(&timer_buckets[i].head);
+ }
+ return 0;
+}
+core_initcall(posixtimer_init);
^ permalink raw reply [flat|nested] 19+ messages in thread* Re: [patch 07/11] posix-timers: Improve hash table performance
2025-02-24 10:15 ` [patch 07/11] posix-timers: Improve hash table performance Thomas Gleixner
@ 2025-02-24 19:45 ` Thomas Gleixner
0 siblings, 0 replies; 19+ messages in thread
From: Thomas Gleixner @ 2025-02-24 19:45 UTC (permalink / raw)
To: LKML
Cc: Anna-Maria Behnsen, Frederic Weisbecker, Benjamin Segall,
Eric Dumazet, Andrey Vagin, Pavel Tikhomirov, Peter Zijlstra
On Mon, Feb 24 2025 at 11:15, Thomas Gleixner wrote:
There are two more long hanging fruits:
1) The hashing is suboptimal and can simply be improved by using
jhash32(), which gives a way better distribution in the
pathological test case with 1.2M timers
2) Avoid false sharing
struct k_itimer has the hlist_node which is used for lookup in
the hash bucket and the timer lock in the same cache line.
That's obviously bad, if one CPU fiddles with a timer and the
other is walking the hash bucket on which that timer is queued.
That can be avoided by restructuring struct k_itimer, so that
the read mostly (only modified during setup and teardown)
fields are in the first cache line and the lock and the rest of
the fields which get written to are in cacheline 2-N.
Combo patch below.
Thanks,
tglx
---
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -179,23 +179,26 @@ static inline void posix_cputimers_init_
* @rcu: RCU head for freeing the timer.
*/
struct k_itimer {
- struct hlist_node list;
- struct hlist_node ignored_list;
+ /* 1st cacheline contains read-mostly fields */
struct hlist_node t_hash;
- spinlock_t it_lock;
- const struct k_clock *kclock;
- clockid_t it_clock;
+ struct hlist_node list;
timer_t it_id;
+ clockid_t it_clock;
+ int it_sigev_notify;
+ enum pid_type it_pid_type;
+ struct signal_struct *it_signal;
+ const struct k_clock *kclock;
+
+ /* 2nd cacheline and above contain fields which are modified regularly */
+ spinlock_t it_lock;
int it_status;
bool it_sig_periodic;
s64 it_overrun;
s64 it_overrun_last;
unsigned int it_signal_seq;
unsigned int it_sigqueue_seq;
- int it_sigev_notify;
- enum pid_type it_pid_type;
ktime_t it_interval;
- struct signal_struct *it_signal;
+ struct hlist_node ignored_list;
union {
struct pid *it_pid;
struct task_struct *it_process;
@@ -212,7 +215,7 @@ struct k_itimer {
} alarm;
} it;
struct rcu_head rcu;
-};
+} ____cacheline_aligned_in_smp;
void run_posix_cpu_timers(void);
void posix_cpu_timers_exit(struct task_struct *task);
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -11,8 +11,8 @@
*/
#include <linux/compat.h>
#include <linux/compiler.h>
-#include <linux/hash.h>
#include <linux/init.h>
+#include <linux/jhash.h>
#include <linux/interrupt.h>
#include <linux/list.h>
#include <linux/memblock.h>
@@ -48,11 +48,11 @@ struct timer_hash_bucket {
static struct {
struct timer_hash_bucket *buckets;
- unsigned long bits;
+ unsigned long mask;
} __timer_data __ro_after_init __aligned(2*sizeof(long));
#define timer_buckets (__timer_data.buckets)
-#define timer_hashbits (__timer_data.bits)
+#define timer_hashmask (__timer_data.mask)
static const struct k_clock * const posix_clocks[];
static const struct k_clock *clockid_to_kclock(const clockid_t id);
@@ -74,15 +74,15 @@ static struct k_itimer *__lock_timer(tim
__timr; \
})
-static int hash(struct signal_struct *sig, unsigned int nr)
+static struct timer_hash_bucket *hash_bucket(struct signal_struct *sig, unsigned int nr)
{
- return hash_32(hash32_ptr(sig) ^ nr, timer_hashbits);
+ return &timer_buckets[jhash2((u32 *)&sig, sizeof(sig) / sizeof(u32), nr) & timer_hashmask];
}
static struct k_itimer *posix_timer_by_id(timer_t id)
{
struct signal_struct *sig = current->signal;
- struct timer_hash_bucket *bucket = &timer_buckets[hash(sig, id)];
+ struct timer_hash_bucket *bucket = hash_bucket(sig, id);
struct k_itimer *timer;
hlist_for_each_entry_rcu(timer, &bucket->head, t_hash) {
@@ -119,7 +119,7 @@ static bool posix_timer_hashed(struct ti
static bool posix_timer_add_at(struct k_itimer *timer, struct signal_struct *sig, unsigned int id)
{
- struct timer_hash_bucket *bucket = &timer_buckets[hash(sig, id)];
+ struct timer_hash_bucket *bucket = hash_bucket(sig, id);
scoped_guard (spinlock, &bucket->lock) {
/*
@@ -260,9 +260,9 @@ static int posix_get_hrtimer_res(clockid
static __init int init_posix_timers(void)
{
- posix_timers_cache = kmem_cache_create("posix_timers_cache",
- sizeof(struct k_itimer), 0,
- SLAB_PANIC | SLAB_ACCOUNT, NULL);
+ posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof(struct k_itimer),
+ __alignof__(struct k_itimer),
+ SLAB_PANIC | SLAB_ACCOUNT, NULL);
return 0;
}
__initcall(init_posix_timers);
@@ -424,8 +424,7 @@ void posixtimer_free_timer(struct k_itim
static void posix_timer_unhash_and_free(struct k_itimer *tmr)
{
- unsigned int idx = hash(posix_sig_owner(tmr), tmr->it_id);
- struct timer_hash_bucket *bucket = &timer_buckets[idx];
+ struct timer_hash_bucket *bucket = hash_bucket(posix_sig_owner(tmr), tmr->it_id);
scoped_guard (spinlock, &bucket->lock)
hlist_del_rcu(&tmr->t_hash);
@@ -1611,7 +1610,7 @@ static int __init posixtimer_init(void)
timer_buckets = alloc_large_system_hash("posixtimers", sizeof(*timer_buckets),
size, 0, 0, &shift, NULL, size, size);
size = 1UL << shift;
- timer_hashbits = ilog2(size);
+ timer_hashmask = size - 1;
for (i = 0; i < size; i++) {
spin_lock_init(&timer_buckets[i].lock);
^ permalink raw reply [flat|nested] 19+ messages in thread
* [patch 08/11] posix-timers: Make per process list RCU safe
2025-02-24 10:15 [patch 00/11] posix-timers: Rework the global hash table and provide a sane mechanism for CRIU Thomas Gleixner
` (6 preceding siblings ...)
2025-02-24 10:15 ` [patch 07/11] posix-timers: Improve hash table performance Thomas Gleixner
@ 2025-02-24 10:15 ` Thomas Gleixner
2025-02-24 10:15 ` [patch 09/11] posix-timers: Dont iterate /proc/$PID/timers with sighand::siglock held Thomas Gleixner
` (2 subsequent siblings)
10 siblings, 0 replies; 19+ messages in thread
From: Thomas Gleixner @ 2025-02-24 10:15 UTC (permalink / raw)
To: LKML
Cc: Anna-Maria Behnsen, Frederic Weisbecker, Benjamin Segall,
Eric Dumazet, Andrey Vagin, Pavel Tikhomirov, Peter Zijlstra
Preparatory change to remove the sighand locking from the /proc/$PID/timers
iterator.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/time/posix-timers.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -496,7 +496,7 @@ static int do_timer_create(clockid_t whi
* unmodified signal pointer to make it valid.
*/
WRITE_ONCE(new_timer->it_signal, current->signal);
- hlist_add_head(&new_timer->list, &current->signal->posix_timers);
+ hlist_add_head_rcu(&new_timer->list, ¤t->signal->posix_timers);
spin_unlock_irq(&current->sighand->siglock);
/*
* After unlocking sighand::siglock @new_timer is subject to
@@ -1072,7 +1072,7 @@ static int posix_timer_delete(struct k_i
unsigned long sig = (unsigned long)timer->it_signal | 1UL;
WRITE_ONCE(timer->it_signal, (struct signal_struct *)sig);
- hlist_del(&timer->list);
+ hlist_del_rcu(&timer->list);
posix_timer_cleanup_ignored(timer);
}
^ permalink raw reply [flat|nested] 19+ messages in thread* [patch 09/11] posix-timers: Dont iterate /proc/$PID/timers with sighand::siglock held
2025-02-24 10:15 [patch 00/11] posix-timers: Rework the global hash table and provide a sane mechanism for CRIU Thomas Gleixner
` (7 preceding siblings ...)
2025-02-24 10:15 ` [patch 08/11] posix-timers: Make per process list RCU safe Thomas Gleixner
@ 2025-02-24 10:15 ` Thomas Gleixner
2025-02-24 10:15 ` [patch 10/11] posix-timers: Provide a mechanism to allocate a given timer ID Thomas Gleixner
2025-02-24 10:15 ` [patch 11/11] selftests/timers/posix-timers: Add a test for exact allocation mode Thomas Gleixner
10 siblings, 0 replies; 19+ messages in thread
From: Thomas Gleixner @ 2025-02-24 10:15 UTC (permalink / raw)
To: LKML
Cc: Anna-Maria Behnsen, Frederic Weisbecker, Benjamin Segall,
Eric Dumazet, Andrey Vagin, Pavel Tikhomirov, Peter Zijlstra
The readout of /proc/$PID/timers holds sighand::siglock with interrupts
disabled. That is required to protect against concurrent modifications of
the task::signal::posix_timers list because the list is not RCU safe.
With the conversion of the timer storage to RCU protected hlist, this is
no longer required.
The only requirement is to protect the returned entry against a concurrent
free, which is trivial as the timers are RCU protected.
Removing the trylock of sighand::siglock is benign because the life time of
task_struct::signal is bound to the life time of the task_struct itself.
There are two scenarios where this matters:
1) The process is alive and not about to be checkpointed
2) The process is stopped via ptrace for checkpointing
#1 is a racy snapshot of the armed timers and nothing can rely on it. It's
not more than debug information and it has been that way before because
sighand lock is dropped when the buffer is full and the restart of
the iteration might find a completely different set of timers.
The task and therefore task::signal cannot be freed as timers_start()
acquired a reference count via get_pid_task().
#2 the process is stopped for checkpointing so nothing can delete or create
timers at this point. Neither can the process exit during the traversal.
If CRIU fails to observe an exit in progress prior to the dissemination
of the timers, then there are more severe problems to solve in the CRIU
mechanics as they can't rely on posix timers being enabled in the first
place.
Therefore replace the lock acquisition with rcu_read_lock() and switch the
timer storage traversal over to seq_hlist_*_rcu().
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
fs/proc/base.c | 48 ++++++++++++++++++++----------------------------
1 file changed, 20 insertions(+), 28 deletions(-)
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2497,11 +2497,9 @@ static const struct file_operations proc
#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
struct timers_private {
- struct pid *pid;
- struct task_struct *task;
- struct sighand_struct *sighand;
- struct pid_namespace *ns;
- unsigned long flags;
+ struct pid *pid;
+ struct task_struct *task;
+ struct pid_namespace *ns;
};
static void *timers_start(struct seq_file *m, loff_t *pos)
@@ -2512,54 +2510,48 @@ static void *timers_start(struct seq_fil
if (!tp->task)
return ERR_PTR(-ESRCH);
- tp->sighand = lock_task_sighand(tp->task, &tp->flags);
- if (!tp->sighand)
- return ERR_PTR(-ESRCH);
-
- return seq_hlist_start(&tp->task->signal->posix_timers, *pos);
+ rcu_read_lock();
+ return seq_hlist_start_rcu(&tp->task->signal->posix_timers, *pos);
}
static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
{
struct timers_private *tp = m->private;
- return seq_hlist_next(v, &tp->task->signal->posix_timers, pos);
+
+ return seq_hlist_next_rcu(v, &tp->task->signal->posix_timers, pos);
}
static void timers_stop(struct seq_file *m, void *v)
{
struct timers_private *tp = m->private;
- if (tp->sighand) {
- unlock_task_sighand(tp->task, &tp->flags);
- tp->sighand = NULL;
- }
-
if (tp->task) {
put_task_struct(tp->task);
tp->task = NULL;
+ rcu_read_unlock();
}
}
static int show_timer(struct seq_file *m, void *v)
{
- struct k_itimer *timer;
- struct timers_private *tp = m->private;
- int notify;
static const char * const nstr[] = {
- [SIGEV_SIGNAL] = "signal",
- [SIGEV_NONE] = "none",
- [SIGEV_THREAD] = "thread",
+ [SIGEV_SIGNAL] = "signal",
+ [SIGEV_NONE] = "none",
+ [SIGEV_THREAD] = "thread",
};
- timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list);
- notify = timer->it_sigev_notify;
+ struct k_itimer *timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list);
+ struct timers_private *tp = m->private;
+ int notify = timer->it_sigev_notify;
+
+ guard(spinlock_irq)(&timer->it_lock);
+ if (!posixtimer_valid(timer))
+ return 0;
seq_printf(m, "ID: %d\n", timer->it_id);
- seq_printf(m, "signal: %d/%px\n",
- timer->sigq.info.si_signo,
+ seq_printf(m, "signal: %d/%px\n", timer->sigq.info.si_signo,
timer->sigq.info.si_value.sival_ptr);
- seq_printf(m, "notify: %s/%s.%d\n",
- nstr[notify & ~SIGEV_THREAD_ID],
+ seq_printf(m, "notify: %s/%s.%d\n", nstr[notify & ~SIGEV_THREAD_ID],
(notify & SIGEV_THREAD_ID) ? "tid" : "pid",
pid_nr_ns(timer->it_pid, tp->ns));
seq_printf(m, "ClockID: %d\n", timer->it_clock);
^ permalink raw reply [flat|nested] 19+ messages in thread* [patch 10/11] posix-timers: Provide a mechanism to allocate a given timer ID
2025-02-24 10:15 [patch 00/11] posix-timers: Rework the global hash table and provide a sane mechanism for CRIU Thomas Gleixner
` (8 preceding siblings ...)
2025-02-24 10:15 ` [patch 09/11] posix-timers: Dont iterate /proc/$PID/timers with sighand::siglock held Thomas Gleixner
@ 2025-02-24 10:15 ` Thomas Gleixner
2025-02-24 10:15 ` [patch 11/11] selftests/timers/posix-timers: Add a test for exact allocation mode Thomas Gleixner
10 siblings, 0 replies; 19+ messages in thread
From: Thomas Gleixner @ 2025-02-24 10:15 UTC (permalink / raw)
To: LKML
Cc: Anna-Maria Behnsen, Frederic Weisbecker, Benjamin Segall,
Eric Dumazet, Andrey Vagin, Pavel Tikhomirov, Peter Zijlstra
Checkpoint/Restore in Userspace (CRIU) requires to reconstruct posix timers
with the same timer ID on restore. It uses sys_timer_create() and relies on
the monotonic increasing timer ID provided by this syscall. It creates and
deletes timers until the desired ID is reached. This can loop for a long
time, when the checkpointed process had a very sparse timer ID range.
It has been debated to implement a new syscall to allow the creation of
timers with a given timer ID, but that's tedious due to the 32/64bit compat
issues of sigevent_t and of dubious value.
The restore mechanism of CRIU creates the timers in a state where all
threads of the restored process are held on a barrier and cannot issue
syscalls. That means the restorer task has exclusive control.
This allows to address this issue with a prctl() so that the restorer
thread can do:
if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_ON))
goto linear_mode;
create_timers_with_explicit_ids();
prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_OFF);
This is backwards compatible because the prctl() fails on older kernels and
CRIU can fall back to the linear timer ID mechanism. CRIU versions which do
not know about the prctl() just work as before.
Implement the prctl() and modify timer_create() so that it copies the
requested timer ID from userspace by utilizing the existing timer_t
pointer, which is used to copy out the allocated timer ID on success.
If the prctl() is disabled, which it is by default, timer_create() works as
before and does not try to read from the userspace pointer.
There is no problem when a broken or rogue user space application enables
the prctl(). If the user space pointer does not contain a valid ID, then
timer_create() fails. If the data is not initialized, but contains a
random valid ID, timer_create() will create that random timer ID or fail if
the ID is already given out.
As CRIU must use the raw syscall to avoid manipulating the internal state
of the restored process, this has no library dependencies and can be
adopted by CRIU right away.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/posix-timers.h | 2 +
include/linux/sched/signal.h | 1
include/uapi/linux/prctl.h | 10 +++++
kernel/sys.c | 5 ++
kernel/time/posix-timers.c | 86 ++++++++++++++++++++++++++++++-------------
5 files changed, 78 insertions(+), 26 deletions(-)
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -114,6 +114,7 @@ bool posixtimer_init_sigqueue(struct sig
void posixtimer_send_sigqueue(struct k_itimer *tmr);
bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq);
void posixtimer_free_timer(struct k_itimer *timer);
+long posixtimer_create_prctl(unsigned long ctrl);
/* Init task static initializer */
#define INIT_CPU_TIMERBASE(b) { \
@@ -140,6 +141,7 @@ static inline void posixtimer_rearm_itim
static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info,
struct sigqueue *timer_sigq) { return false; }
static inline void posixtimer_free_timer(struct k_itimer *timer) { }
+static inline long posixtimer_create_prctl(unsigned long ctrl) { return -EINVAL; }
#endif
#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -136,6 +136,7 @@ struct signal_struct {
#ifdef CONFIG_POSIX_TIMERS
/* POSIX.1b Interval Timers */
+ unsigned int timer_create_restore_ids:1;
atomic_t next_posix_timer_id;
struct hlist_head posix_timers;
struct hlist_head ignored_posix_timers;
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -353,4 +353,14 @@ struct prctl_mm_map {
*/
#define PR_LOCK_SHADOW_STACK_STATUS 76
+/*
+ * Controls the mode of timer_create() for CRIU restore operations.
+ * Enabling this allows CRIU to restore timers with explicit IDs.
+ *
+ * Don't use for normal operations as the result might be undefined.
+ */
+#define PR_TIMER_CREATE_RESTORE_IDS 77
+# define PR_TIMER_CREATE_RESTORE_IDS_OFF 0
+# define PR_TIMER_CREATE_RESTORE_IDS_ON 1
+
#endif /* _LINUX_PRCTL_H */
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2811,6 +2811,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsi
return -EINVAL;
error = arch_lock_shadow_stack_status(me, arg2);
break;
+ case PR_TIMER_CREATE_RESTORE_IDS:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = posixtimer_create_prctl(arg2);
+ break;
default:
trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5);
error = -EINVAL;
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -19,6 +19,7 @@
#include <linux/nospec.h>
#include <linux/posix-clock.h>
#include <linux/posix-timers.h>
+#include <linux/prctl.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
@@ -57,6 +58,8 @@ static const struct k_clock * const posi
static const struct k_clock *clockid_to_kclock(const clockid_t id);
static const struct k_clock clock_realtime, clock_monotonic;
+#define TIMER_ANY_ID INT_MIN
+
/* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */
#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \
~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
@@ -114,38 +117,49 @@ static bool posix_timer_hashed(struct ti
return false;
}
-static int posix_timer_add(struct k_itimer *timer)
+static bool posix_timer_add_at(struct k_itimer *timer, struct signal_struct *sig, unsigned int id)
+{
+ struct timer_hash_bucket *bucket = &timer_buckets[hash(sig, id)];
+
+ scoped_guard (spinlock, &bucket->lock) {
+ /*
+ * Validate under the lock as this could have raced against
+ * another thread ending up with the same ID, which is
+ * highly unlikely, but possible.
+ */
+ if (!posix_timer_hashed(bucket, sig, id)) {
+ /*
+ * Set the timer ID and the signal pointer to make
+ * it identifiable in the hash table. The signal
+ * pointer has bit 0 set to indicate that it is not
+ * yet fully initialized. posix_timer_hashed()
+ * masks this bit out, but the syscall lookup fails
+ * to match due to it being set. This guarantees
+ * that there can't be duplicate timer IDs handed
+ * out.
+ */
+ timer->it_id = (timer_t)id;
+ timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL);
+ hlist_add_head_rcu(&timer->t_hash, &bucket->head);
+ return true;
+ }
+ }
+ return false;
+}
+
+static int posix_timer_add(struct k_itimer *timer, int req_id)
{
struct signal_struct *sig = current->signal;
+ if (unlikely(req_id != TIMER_ANY_ID))
+ return posix_timer_add_at(timer, sig, req_id) ? req_id : -EBUSY;
+
for (unsigned int cnt = 0; cnt <= INT_MAX; cnt++) {
/* Get the next timer ID and clamp it to positive space */
unsigned int id = (atomic_inc_return(&sig->next_posix_timer_id) - 1) & INT_MAX;
- struct timer_hash_bucket *bucket = &timer_buckets[hash(sig, id)];
- scoped_guard (spinlock, &bucket->lock) {
- /*
- * Validate under the lock as this could have raced
- * against another thread ending up with the same
- * ID, which is highly unlikely, but possible.
- */
- if (!posix_timer_hashed(bucket, sig, id)) {
- /*
- * Set the timer ID and the signal pointer to make
- * it identifiable in the hash table. The signal
- * pointer has bit 0 set to indicate that it is not
- * yet fully initialized. posix_timer_hashed()
- * masks this bit out, but the syscall lookup fails
- * to match due to it being set. This guarantees
- * that there can't be duplicate timer IDs handed
- * out.
- */
- timer->it_id = (timer_t)id;
- timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL);
- hlist_add_head_rcu(&timer->t_hash, &bucket->head);
- return id;
- }
- }
+ if (posix_timer_add_at(timer, sig, id))
+ return id;
cond_resched();
}
/* POSIX return code when no timer ID could be allocated */
@@ -351,6 +365,16 @@ static enum hrtimer_restart posix_timer_
return HRTIMER_NORESTART;
}
+long posixtimer_create_prctl(unsigned long ctrl)
+{
+ if (ctrl > PR_TIMER_CREATE_RESTORE_IDS_ON)
+ return -EINVAL;
+
+ guard(spinlock_irq)(&current->sighand->siglock);
+ current->signal->timer_create_restore_ids = ctrl == PR_TIMER_CREATE_RESTORE_IDS_ON;
+ return 0;
+}
+
static struct pid *good_sigevent(sigevent_t * event)
{
struct pid *pid = task_tgid(current);
@@ -419,6 +443,7 @@ static int do_timer_create(clockid_t whi
timer_t __user *created_timer_id)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
+ timer_t req_id = TIMER_ANY_ID;
struct k_itimer *new_timer;
int error, new_timer_id;
@@ -433,11 +458,20 @@ static int do_timer_create(clockid_t whi
spin_lock_init(&new_timer->it_lock);
+ /* Special case for CRIU to restore timers with a given timer ID. */
+ if (unlikely(current->signal->timer_create_restore_ids)) {
+ if (copy_from_user(&req_id, created_timer_id, sizeof(req_id)))
+ return -EFAULT;
+ /* Valid IDs are 0..INT_MAX */
+ if ((unsigned int)req_id > INT_MAX)
+ return -EINVAL;
+ }
+
/*
* Add the timer to the hash table. The timer is not yet valid
* after insertion, but has a unique ID allocated.
*/
- new_timer_id = posix_timer_add(new_timer);
+ new_timer_id = posix_timer_add(new_timer, req_id);
if (new_timer_id < 0) {
posixtimer_free_timer(new_timer);
return new_timer_id;
^ permalink raw reply [flat|nested] 19+ messages in thread* [patch 11/11] selftests/timers/posix-timers: Add a test for exact allocation mode
2025-02-24 10:15 [patch 00/11] posix-timers: Rework the global hash table and provide a sane mechanism for CRIU Thomas Gleixner
` (9 preceding siblings ...)
2025-02-24 10:15 ` [patch 10/11] posix-timers: Provide a mechanism to allocate a given timer ID Thomas Gleixner
@ 2025-02-24 10:15 ` Thomas Gleixner
10 siblings, 0 replies; 19+ messages in thread
From: Thomas Gleixner @ 2025-02-24 10:15 UTC (permalink / raw)
To: LKML
Cc: Anna-Maria Behnsen, Frederic Weisbecker, Benjamin Segall,
Eric Dumazet, Andrey Vagin, Pavel Tikhomirov, Peter Zijlstra
The exact timer ID allocation mode is used by CRIU to restore timers with a
given ID. Add a test case for it.
It's skipped on older kernels when the prctl() fails.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
tools/testing/selftests/timers/posix_timers.c | 68 +++++++++++++++++++++++++-
1 file changed, 67 insertions(+), 1 deletion(-)
--- a/tools/testing/selftests/timers/posix_timers.c
+++ b/tools/testing/selftests/timers/posix_timers.c
@@ -7,6 +7,7 @@
* Kernel loop code stolen from Steven Rostedt <srostedt@redhat.com>
*/
#define _GNU_SOURCE
+#include <sys/prctl.h>
#include <sys/time.h>
#include <sys/types.h>
#include <stdio.h>
@@ -599,14 +600,79 @@ static void check_overrun(int which, con
"check_overrun %s\n", name);
}
+#include <sys/syscall.h>
+
+static int do_timer_create(int *id)
+{
+ return syscall(__NR_timer_create, CLOCK_MONOTONIC, NULL, id);
+}
+
+static int do_timer_delete(int id)
+{
+ return syscall(__NR_timer_delete, id);
+}
+
+static void check_timer_create_exact(void)
+{
+ int id, rid;
+
+ /* Allocate a timer for comparison after switch back from exact mode */
+ if (do_timer_create(&id) < 0)
+ fatal_error(NULL, "timer_create()");
+
+ if (do_timer_delete(id))
+ fatal_error(NULL, "timer_delete()");
+
+ if (prctl(77, 1, 0, 0, 0)) {
+ switch (errno) {
+ case EINVAL:
+ ksft_test_result_skip("check timer create exact, not supported\n");
+ return;
+ default:
+ ksft_test_result_skip("check timer create exact, errno = %d\n", errno);
+ return;
+ }
+ }
+
+ rid = id + 8;
+ if (do_timer_create(&rid) < 0)
+ fatal_error(NULL, "timer_create()");
+
+ if (do_timer_delete(rid))
+ fatal_error(NULL, "timer_delete()");
+
+ if (prctl(77, 0, 0, 0, 0))
+ fatal_error(NULL, "prctl()");
+
+ if (rid != id + 8) {
+ ksft_test_result_fail("check timer create exact %d != %d\n", rid, id + 8);
+ return;
+ }
+
+ /* Validate that it went back to normal mode */
+ if (do_timer_create(&rid) < 0)
+ fatal_error(NULL, "timer_create()");
+
+ if (do_timer_delete(rid))
+ fatal_error(NULL, "timer_delete()");
+
+ /* Same ID if linear mode is off, next ID if enabled */
+ if (rid == id || rid == id + 1)
+ ksft_test_result_pass("check timer create exact\n");
+ else
+ ksft_test_result_fail("check timer create exact. Disabling failed.\n");
+}
+
int main(int argc, char **argv)
{
ksft_print_header();
- ksft_set_plan(18);
+ ksft_set_plan(19);
ksft_print_msg("Testing posix timers. False negative may happen on CPU execution \n");
ksft_print_msg("based timers if other threads run on the CPU...\n");
+ check_timer_create_exact();
+
check_itimer(ITIMER_VIRTUAL, "ITIMER_VIRTUAL");
check_itimer(ITIMER_PROF, "ITIMER_PROF");
check_itimer(ITIMER_REAL, "ITIMER_REAL");
^ permalink raw reply [flat|nested] 19+ messages in thread