From: Peter Zijlstra <peterz@infradead.org>
To: John Stultz <jstultz@google.com>
Cc: LKML <linux-kernel@vger.kernel.org>,
Joel Fernandes <joelaf@google.com>,
Qais Yousef <qyousef@layalina.io>, Ingo Molnar <mingo@redhat.com>,
Juri Lelli <juri.lelli@redhat.com>,
Vincent Guittot <vincent.guittot@linaro.org>,
Dietmar Eggemann <dietmar.eggemann@arm.com>,
Valentin Schneider <vschneid@redhat.com>,
Steven Rostedt <rostedt@goodmis.org>,
Ben Segall <bsegall@google.com>,
Zimuzo Ezeozue <zezeozue@google.com>,
Mel Gorman <mgorman@suse.de>, Will Deacon <will@kernel.org>,
Waiman Long <longman@redhat.com>,
Boqun Feng <boqun.feng@gmail.com>,
"Paul E. McKenney" <paulmck@kernel.org>,
Metin Kaya <Metin.Kaya@arm.com>,
Xuewen Yan <xuewen.yan94@gmail.com>,
K Prateek Nayak <kprateek.nayak@amd.com>,
Thomas Gleixner <tglx@linutronix.de>,
Daniel Lezcano <daniel.lezcano@linaro.org>,
kernel-team@android.com, Connor O'Brien <connoro@google.com>
Subject: Re: [RFC][PATCH v14 2/7] locking/mutex: Rework task_struct::blocked_on
Date: Sat, 14 Dec 2024 00:22:14 +0100 [thread overview]
Message-ID: <20241213232214.GA17501@noisy.programming.kicks-ass.net> (raw)
In-Reply-To: <20241125195204.2374458-3-jstultz@google.com>
On Mon, Nov 25, 2024 at 11:51:56AM -0800, John Stultz wrote:
> Also add a blocked_on_state value so we can distinguish when a
> task is blocked_on a mutex, but is either blocked, waking up, or
> runnable (such that it can try to acquire the lock its blocked
> on).
>
> This avoids some of the subtle & racy games where the blocked_on
> state gets cleared, only to have it re-added by the
> mutex_lock_slowpath call when it tries to acquire the lock on
> wakeup
If you can remember those sublte cases, I'm sure our future selves
would've loved it if you wrote a comment to go with these states :-)
> +enum blocked_on_state {
> + BO_RUNNABLE,
> + BO_BLOCKED,
> + BO_WAKING,
> +};
> +
> struct task_struct {
> #ifdef CONFIG_THREAD_INFO_IN_TASK
> /*
> @@ -1195,10 +1202,9 @@ struct task_struct {
> struct rt_mutex_waiter *pi_blocked_on;
> #endif
>
> -#ifdef CONFIG_DEBUG_MUTEXES
> - /* Mutex deadlock detection: */
> - struct mutex_waiter *blocked_on;
> -#endif
> + enum blocked_on_state blocked_on_state;
> + struct mutex *blocked_on; /* lock we're blocked on */
> + raw_spinlock_t blocked_lock;
>
> #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
> int non_block_count;
> @@ -2118,6 +2124,56 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock);
> __cond_resched_rwlock_write(lock); \
> })
>
> +static inline void __set_blocked_on_runnable(struct task_struct *p)
> +{
> + lockdep_assert_held(&p->blocked_lock);
> +
> + if (p->blocked_on_state == BO_WAKING)
> + p->blocked_on_state = BO_RUNNABLE;
> +}
> +
> +static inline void set_blocked_on_runnable(struct task_struct *p)
> +{
> + unsigned long flags;
> +
> + if (!sched_proxy_exec())
> + return;
> +
> + raw_spin_lock_irqsave(&p->blocked_lock, flags);
Do we want to make this:
guard(raw_spinlock_irqsave)(&p->blocked_lock);
?
> + __set_blocked_on_runnable(p);
> + raw_spin_unlock_irqrestore(&p->blocked_lock, flags);
> +}
> +
> +static inline void set_blocked_on_waking(struct task_struct *p)
consistent naming would be __set_blocked_on_waking()
> +{
> + lockdep_assert_held(&p->blocked_lock);
> +
> + if (p->blocked_on_state == BO_BLOCKED)
> + p->blocked_on_state = BO_WAKING;
> +}
> +
> +static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m)
__set_task_blocked_on()
> +{
> + lockdep_assert_held(&p->blocked_lock);
> +
> + /*
> + * Check we are clearing values to NULL or setting NULL
> + * to values to ensure we don't overwrite existing mutex
> + * values or clear already cleared values
> + */
> + WARN_ON((!m && !p->blocked_on) || (m && p->blocked_on));
> +
> + p->blocked_on = m;
> + p->blocked_on_state = m ? BO_BLOCKED : BO_RUNNABLE;
> +}
> +
> +static inline struct mutex *get_task_blocked_on(struct task_struct *p)
__get_task_blocked_on()
> +{
> + lockdep_assert_held(&p->blocked_lock);
> +
> + return p->blocked_on;
> +}
That gets us the __ prefix if the caller is assumed to have taken
blocked_lock.
> static __always_inline bool need_resched(void)
> {
> return unlikely(tif_need_resched());
> @@ -2157,8 +2213,6 @@ extern bool sched_task_on_rq(struct task_struct *p);
> extern unsigned long get_wchan(struct task_struct *p);
> extern struct task_struct *cpu_curr_snapshot(int cpu);
>
> -#include <linux/spinlock.h>
> -
> /*
> * In order to reduce various lock holder preemption latencies provide an
> * interface to see if a vCPU is currently running or not.
> diff --git a/init/init_task.c b/init/init_task.c
> index e557f622bd906..7e29d86153d9f 100644
> --- a/init/init_task.c
> +++ b/init/init_task.c
> @@ -140,6 +140,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
> .journal_info = NULL,
> INIT_CPU_TIMERS(init_task)
> .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock),
> + .blocked_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.blocked_lock),
> .timer_slack_ns = 50000, /* 50 usec default slack */
> .thread_pid = &init_struct_pid,
> .thread_node = LIST_HEAD_INIT(init_signals.thread_head),
> diff --git a/kernel/fork.c b/kernel/fork.c
> index f253e81d0c28e..160bead843afb 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -2231,6 +2231,7 @@ __latent_entropy struct task_struct *copy_process(
> ftrace_graph_init_task(p);
>
> rt_mutex_init_task(p);
> + raw_spin_lock_init(&p->blocked_lock);
>
> lockdep_assert_irqs_enabled();
> #ifdef CONFIG_PROVE_LOCKING
> @@ -2329,9 +2330,8 @@ __latent_entropy struct task_struct *copy_process(
> lockdep_init_task(p);
> #endif
>
> -#ifdef CONFIG_DEBUG_MUTEXES
> + p->blocked_on_state = BO_RUNNABLE;
> p->blocked_on = NULL; /* not blocked yet */
> -#endif
> #ifdef CONFIG_BCACHE
> p->sequential_io = 0;
> p->sequential_io_avg = 0;
> diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
> index 6e6f6071cfa27..1d8cff71f65e1 100644
> --- a/kernel/locking/mutex-debug.c
> +++ b/kernel/locking/mutex-debug.c
> @@ -53,17 +53,18 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
> {
> lockdep_assert_held(&lock->wait_lock);
>
> - /* Mark the current thread as blocked on the lock: */
> - task->blocked_on = waiter;
> + /* Current thread can't be already blocked (since it's executing!) */
> + DEBUG_LOCKS_WARN_ON(get_task_blocked_on(task));
> }
>
> void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
> struct task_struct *task)
> {
> + struct mutex *blocked_on = get_task_blocked_on(task);
> +
> DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
> DEBUG_LOCKS_WARN_ON(waiter->task != task);
> - DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter);
> - task->blocked_on = NULL;
> + DEBUG_LOCKS_WARN_ON(blocked_on && blocked_on != lock);
>
> INIT_LIST_HEAD(&waiter->list);
> waiter->task = NULL;
> diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
> index 3302e52f0c967..8f5d3fe6c1029 100644
> --- a/kernel/locking/mutex.c
> +++ b/kernel/locking/mutex.c
> @@ -597,6 +597,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
> }
>
> raw_spin_lock_irqsave(&lock->wait_lock, flags);
> + raw_spin_lock(¤t->blocked_lock);
> /*
> * After waiting to acquire the wait_lock, try again.
> */
> @@ -627,6 +628,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
> goto err_early_kill;
> }
>
> + set_task_blocked_on(current, lock);
> set_current_state(state);
blocked_on_state mirrors task-state
> trace_contention_begin(lock, LCB_F_MUTEX);
> for (;;) {
> @@ -639,7 +641,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
> * the handoff.
> */
> if (__mutex_trylock(lock))
> - goto acquired;
> + break; /* acquired */;
>
> /*
> * Check for signals and kill conditions while holding
> @@ -657,6 +659,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
> goto err;
> }
>
> + raw_spin_unlock(¤t->blocked_lock);
> raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
> /* Make sure we do wakeups before calling schedule */
> wake_up_q(&wake_q);
> @@ -666,6 +669,13 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
>
> first = __mutex_waiter_is_first(lock, &waiter);
>
> + raw_spin_lock_irqsave(&lock->wait_lock, flags);
> + raw_spin_lock(¤t->blocked_lock);
> +
> + /*
> + * Re-set blocked_on_state as unlock path set it to WAKING/RUNNABLE
> + */
> + current->blocked_on_state = BO_BLOCKED;
> set_current_state(state);
And blocked_on_state again mirrors taks-state
> /*
> * Here we order against unlock; we must either see it change
> @@ -676,16 +686,26 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
> break;
>
> if (first) {
> + bool opt_acquired;
> +
> + /*
> + * mutex_optimistic_spin() can schedule, so we need to
> + * release these locks before calling it.
> + */
> + current->blocked_on_state = BO_RUNNABLE;
> + raw_spin_unlock(¤t->blocked_lock);
> + raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
> trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
> - if (mutex_optimistic_spin(lock, ww_ctx, &waiter))
> + opt_acquired = mutex_optimistic_spin(lock, ww_ctx, &waiter);
I'm confused -- didn't I kill optimistic spinning for proxy? IIRC it
fundamentally doesn't make sense since we do a hand-off to the donor
thread.
> + raw_spin_lock_irqsave(&lock->wait_lock, flags);
> + raw_spin_lock(¤t->blocked_lock);
> + current->blocked_on_state = BO_BLOCKED;
> + if (opt_acquired)
> break;
> trace_contention_begin(lock, LCB_F_MUTEX);
> }
> -
> - raw_spin_lock_irqsave(&lock->wait_lock, flags);
> }
> - raw_spin_lock_irqsave(&lock->wait_lock, flags);
> -acquired:
> + set_task_blocked_on(current, NULL);
> __set_current_state(TASK_RUNNING);
And again..
>
> if (ww_ctx) {
> @@ -710,16 +730,20 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
> if (ww_ctx)
> ww_mutex_lock_acquired(ww, ww_ctx);
>
> + raw_spin_unlock(¤t->blocked_lock);
> raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
> wake_up_q(&wake_q);
> preempt_enable();
> return 0;
>
> err:
> + set_task_blocked_on(current, NULL);
> __set_current_state(TASK_RUNNING);
and one again..
> __mutex_remove_waiter(lock, &waiter);
> err_early_kill:
> + WARN_ON(get_task_blocked_on(current));
> trace_contention_end(lock, ret);
> + raw_spin_unlock(¤t->blocked_lock);
> raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
> debug_mutex_free_waiter(&waiter);
> mutex_release(&lock->dep_map, ip);
> @@ -928,8 +952,12 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
>
> next = waiter->task;
>
> + raw_spin_lock(&next->blocked_lock);
> debug_mutex_wake_waiter(lock, waiter);
> + WARN_ON(get_task_blocked_on(next) != lock);
> + set_blocked_on_waking(next);
and more..
> wake_q_add(&wake_q, next);
> + raw_spin_unlock(&next->blocked_lock);
> }
>
> if (owner & MUTEX_FLAG_HANDOFF)
> diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
> index 37f025a096c9d..d9ff2022eef6f 100644
> --- a/kernel/locking/ww_mutex.h
> +++ b/kernel/locking/ww_mutex.h
> @@ -281,10 +281,21 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
> return false;
>
> if (waiter->ww_ctx->acquired > 0 && __ww_ctx_less(waiter->ww_ctx, ww_ctx)) {
> + /* nested as we should hold current->blocked_lock already */
> + raw_spin_lock_nested(&waiter->task->blocked_lock, SINGLE_DEPTH_NESTING);
> #ifndef WW_RT
> debug_mutex_wake_waiter(lock, waiter);
> + /*
> + * When waking up the task to die, be sure to set the
> + * blocked_on_state to WAKING. Otherwise we can see
> + * circular blocked_on relationships that can't
> + * resolve.
> + */
> + WARN_ON(get_task_blocked_on(waiter->task) != lock);
> #endif
> + set_blocked_on_waking(waiter->task);
> wake_q_add(wake_q, waiter->task);
> + raw_spin_unlock(&waiter->task->blocked_lock);
> }
>
> return true;
> @@ -331,9 +342,18 @@ static bool __ww_mutex_wound(struct MUTEX *lock,
> * it's wounded in __ww_mutex_check_kill() or has a
> * wakeup pending to re-read the wounded state.
> */
> - if (owner != current)
> + if (owner != current) {
> + /* nested as we should hold current->blocked_lock already */
> + raw_spin_lock_nested(&owner->blocked_lock, SINGLE_DEPTH_NESTING);
> + /*
> + * When waking up the task to wound, be sure to set the
> + * blocked_on_state flag. Otherwise we can see circular
> + * blocked_on relationships that can't resolve.
> + */
> + set_blocked_on_waking(owner);
> wake_q_add(wake_q, owner);
> -
> + raw_spin_unlock(&owner->blocked_lock);
> + }
> return true;
> }
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index d712e177d3b75..f8714050b6d0d 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4350,6 +4350,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
> ttwu_queue(p, cpu, wake_flags);
> }
> out:
> + set_blocked_on_runnable(p);
> if (success)
> ttwu_stat(p, task_cpu(p), wake_flags);
>
All in all I'm properly confused by this patch... please write
more/better comments changelog.
next prev parent reply other threads:[~2024-12-13 23:22 UTC|newest]
Thread overview: 23+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-11-25 19:51 [RFC][PATCH v14 0/7] Single CPU Proxy Execution (v14) John Stultz
2024-11-25 19:51 ` [RFC][PATCH v14 1/7] sched: Add CONFIG_SCHED_PROXY_EXEC & boot argument to enable/disable John Stultz
2024-11-25 19:51 ` [RFC][PATCH v14 2/7] locking/mutex: Rework task_struct::blocked_on John Stultz
2024-12-13 23:22 ` Peter Zijlstra [this message]
2024-12-14 3:39 ` John Stultz
2024-12-16 16:54 ` Peter Zijlstra
2024-12-16 17:07 ` Peter Zijlstra
2024-12-17 5:01 ` John Stultz
2024-12-17 8:39 ` Peter Zijlstra
2024-12-17 8:46 ` Peter Zijlstra
2024-12-17 9:19 ` Peter Zijlstra
2024-11-25 19:51 ` [RFC][PATCH v14 3/7] sched: Fix runtime accounting w/ split exec & sched contexts John Stultz
2024-12-13 23:37 ` Peter Zijlstra
2024-12-14 0:09 ` Peter Zijlstra
2024-12-17 6:09 ` John Stultz
2024-12-17 8:48 ` Peter Zijlstra
2024-11-25 19:51 ` [RFC][PATCH v14 4/7] sched: Fix psi_dequeue for Proxy Execution John Stultz
2024-11-25 19:51 ` [RFC][PATCH v14 5/7] sched: Add an initial sketch of the find_proxy_task() function John Stultz
2024-12-14 0:05 ` Peter Zijlstra
2024-12-17 5:42 ` John Stultz
2024-12-17 8:52 ` Peter Zijlstra
2024-11-25 19:52 ` [RFC][PATCH v14 6/7] sched: Fix proxy/current (push,pull)ability John Stultz
2024-11-25 19:52 ` [RFC][PATCH v14 7/7] sched: Start blocked_on chain processing in find_proxy_task() John Stultz
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20241213232214.GA17501@noisy.programming.kicks-ass.net \
--to=peterz@infradead.org \
--cc=Metin.Kaya@arm.com \
--cc=boqun.feng@gmail.com \
--cc=bsegall@google.com \
--cc=connoro@google.com \
--cc=daniel.lezcano@linaro.org \
--cc=dietmar.eggemann@arm.com \
--cc=joelaf@google.com \
--cc=jstultz@google.com \
--cc=juri.lelli@redhat.com \
--cc=kernel-team@android.com \
--cc=kprateek.nayak@amd.com \
--cc=linux-kernel@vger.kernel.org \
--cc=longman@redhat.com \
--cc=mgorman@suse.de \
--cc=mingo@redhat.com \
--cc=paulmck@kernel.org \
--cc=qyousef@layalina.io \
--cc=rostedt@goodmis.org \
--cc=tglx@linutronix.de \
--cc=vincent.guittot@linaro.org \
--cc=vschneid@redhat.com \
--cc=will@kernel.org \
--cc=xuewen.yan94@gmail.com \
--cc=zezeozue@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.