From: Andrea Righi <arighi@nvidia.com>
To: Cheng-Yang Chou <yphbchou0911@gmail.com>
Cc: sched-ext@lists.linux.dev, Tejun Heo <tj@kernel.org>,
David Vernet <void@manifault.com>,
Changwoo Min <changwoo@igalia.com>,
"Paul E . McKenney" <paulmck@kernel.org>,
Joel Fernandes <joelagnelf@nvidia.com>,
rcu@vger.kernel.org, Ching-Chun Huang <jserv@ccns.ncku.edu.tw>,
Chia-Ping Tsai <chia7712@gmail.com>
Subject: Re: [PATCH 2/2] sched_ext, rcu: Upgrade RCU stall paths to report cpumask of stalled CPUs
Date: Tue, 9 Jun 2026 10:06:58 +0200 [thread overview]
Message-ID: <aifJookTje68036x@gpd4> (raw)
In-Reply-To: <20260531152646.1206799-3-yphbchou0911@gmail.com>
Hi Cheng-Yang,
On Sun, May 31, 2026 at 11:25:27PM +0800, Cheng-Yang Chou wrote:
> scx_rcu_cpu_stall() previously recorded the detector CPU rather than the
> stalled one, and the expedited grace period path had no stalled CPU to
> report at all.
>
> Thread a cpumask through panic_on_rcu_stall() and scx_rcu_cpu_stall()
> to capture all stalled CPUs. Report cpumask_first() as exit_cpu and the
> full CPU list in the exit message. Task-only stalls yield exit_cpu = -1.
>
> Store the stall mask in scx_sched rather than scx_exit_info, keeping the
> BPF-visible struct unchanged. scx_dump_state() reads sch->stall_cpus
> directly and dumps all stalled CPUs first to avoid losing them to
> truncation.
>
> Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
Looks good from a sched_ext perspective.
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Thanks,
-Andrea
> ---
> include/linux/sched/ext.h | 4 +-
> kernel/rcu/tree.c | 3 ++
> kernel/rcu/tree_exp.h | 5 ++-
> kernel/rcu/tree_stall.h | 13 +++++--
> kernel/sched/ext.c | 73 ++++++++++++++++++++++++++++++++-----
> kernel/sched/ext_internal.h | 1 +
> 6 files changed, 83 insertions(+), 16 deletions(-)
>
> diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
> index 20b2343aa344..75cb8b119fb7 100644
> --- a/include/linux/sched/ext.h
> +++ b/include/linux/sched/ext.h
> @@ -263,7 +263,7 @@ void sched_ext_dead(struct task_struct *p);
> void print_scx_info(const char *log_lvl, struct task_struct *p);
> void scx_softlockup(u32 dur_s);
> bool scx_hardlockup(int cpu);
> -bool scx_rcu_cpu_stall(void);
> +bool scx_rcu_cpu_stall(const struct cpumask *stalled_mask);
>
> #else /* !CONFIG_SCHED_CLASS_EXT */
>
> @@ -271,7 +271,7 @@ static inline void sched_ext_dead(struct task_struct *p) {}
> static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
> static inline void scx_softlockup(u32 dur_s) {}
> static inline bool scx_hardlockup(int cpu) { return false; }
> -static inline bool scx_rcu_cpu_stall(void) { return false; }
> +static inline bool scx_rcu_cpu_stall(const struct cpumask *stalled_mask) { return false; }
>
> #endif /* CONFIG_SCHED_CLASS_EXT */
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 55df6d37145e..03c9651be5c0 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -4871,6 +4871,9 @@ static void __init rcu_dump_rcu_node_tree(void)
>
> struct workqueue_struct *rcu_gp_wq;
>
> +static struct cpumask rcu_stall_cpumask;
> +static struct cpumask rcu_exp_stall_cpumask;
> +
> void __init rcu_init(void)
> {
> int cpu = smp_processor_id();
> diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
> index 82cada459e5d..46b6907f1b09 100644
> --- a/kernel/rcu/tree_exp.h
> +++ b/kernel/rcu/tree_exp.h
> @@ -578,6 +578,7 @@ static void synchronize_rcu_expedited_stall(unsigned long jiffies_start, unsigne
> if (!(READ_ONCE(rnp->expmask) & mask))
> continue;
> ndetected++;
> + cpumask_set_cpu(cpu, &rcu_exp_stall_cpumask);
> rdp = per_cpu_ptr(&rcu_data, cpu);
> pr_cont(" %d-%c%c%c%c", cpu,
> "O."[!!cpu_online(cpu)],
> @@ -665,6 +666,8 @@ static void synchronize_rcu_expedited_wait(void)
> if (rcu_stall_is_suppressed())
> continue;
>
> + cpumask_clear(&rcu_exp_stall_cpumask);
> +
> nbcon_cpu_emergency_enter();
>
> j = jiffies;
> @@ -675,7 +678,7 @@ static void synchronize_rcu_expedited_wait(void)
>
> nbcon_cpu_emergency_exit();
>
> - panic_on_rcu_stall();
> + panic_on_rcu_stall(&rcu_exp_stall_cpumask);
> }
> }
>
> diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
> index b67532cb8770..d0c4f193f17e 100644
> --- a/kernel/rcu/tree_stall.h
> +++ b/kernel/rcu/tree_stall.h
> @@ -159,7 +159,7 @@ static int __init check_cpu_stall_init(void)
> early_initcall(check_cpu_stall_init);
>
> /* If so specified via sysctl, panic, yielding cleaner stall-warning output. */
> -static void panic_on_rcu_stall(void)
> +static void panic_on_rcu_stall(const struct cpumask *stalled_mask)
> {
> static int cpu_stall;
>
> @@ -167,7 +167,7 @@ static void panic_on_rcu_stall(void)
> * Attempt to kick out the BPF scheduler if it's installed and defer
> * the panic to give the system a chance to recover.
> */
> - if (scx_rcu_cpu_stall())
> + if (scx_rcu_cpu_stall(stalled_mask))
> return;
>
> if (++cpu_stall < sysctl_max_rcu_stall_to_panic)
> @@ -645,6 +645,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
> if (rcu_stall_is_suppressed())
> return;
>
> + cpumask_clear(&rcu_stall_cpumask);
> +
> nbcon_cpu_emergency_enter();
>
> /*
> @@ -660,6 +662,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
> for_each_leaf_node_possible_cpu(rnp, cpu)
> if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
> print_cpu_stall_info(cpu);
> + cpumask_set_cpu(cpu, &rcu_stall_cpumask);
> ndetected++;
> }
> }
> @@ -701,7 +704,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
>
> nbcon_cpu_emergency_exit();
>
> - panic_on_rcu_stall();
> + panic_on_rcu_stall(&rcu_stall_cpumask);
>
> rcu_force_quiescent_state(); /* Kick them all. */
> }
> @@ -754,7 +757,9 @@ static void print_cpu_stall(unsigned long gp_seq, unsigned long gps)
>
> nbcon_cpu_emergency_exit();
>
> - panic_on_rcu_stall();
> + cpumask_clear(&rcu_stall_cpumask);
> + cpumask_set_cpu(smp_processor_id(), &rcu_stall_cpumask);
> + panic_on_rcu_stall(&rcu_stall_cpumask);
>
> /*
> * Attempt to revive the RCU machinery by forcing a context switch.
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 0c37b5fd58b0..28009d08762b 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -4966,6 +4966,8 @@ static const struct attribute_group scx_global_attr_group = {
>
> static void free_pnode(struct scx_sched_pnode *pnode);
> static void free_exit_info(struct scx_exit_info *ei);
> +static const char *scx_exit_reason(enum scx_exit_kind kind);
> +static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind);
>
> static s32 scx_set_cmask_scratch_alloc(struct scx_sched *sch)
> {
> @@ -5022,6 +5024,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
> timer_shutdown_sync(&sch->bypass_lb_timer);
> free_cpumask_var(sch->bypass_lb_donee_cpumask);
> free_cpumask_var(sch->bypass_lb_resched_cpumask);
> + free_cpumask_var(sch->stall_cpus);
>
> #ifdef CONFIG_EXT_SUB_SCHED
> kfree(sch->cgrp_path);
> @@ -5251,9 +5254,46 @@ static __printf(2, 3) bool handle_lockup(int exit_cpu, const char *fmt, ...)
> * resolve the reported RCU stall. %false if sched_ext is not enabled or someone
> * else already initiated abort.
> */
> -bool scx_rcu_cpu_stall(void)
> +bool scx_rcu_cpu_stall(const struct cpumask *stalled_mask)
> {
> - return handle_lockup(-1, "RCU CPU stall detected!");
> + struct scx_sched *sch;
> + struct scx_exit_info *ei;
> + int exit_cpu;
> +
> + guard(rcu)();
> +
> + sch = rcu_dereference(scx_root);
> + if (unlikely(!sch))
> + return false;
> +
> + switch (scx_enable_state()) {
> + case SCX_ENABLING:
> + case SCX_ENABLED:
> + break;
> + default:
> + return false;
> + }
> +
> + exit_cpu = cpumask_empty(stalled_mask) ? -1 : (int)cpumask_first(stalled_mask);
> + ei = sch->exit_info;
> +
> + guard(preempt)();
> +
> + if (!scx_claim_exit(sch, SCX_EXIT_ERROR))
> + return false;
> +
> +#ifdef CONFIG_STACKTRACE
> + ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1);
> +#endif
> + scnprintf(ei->msg, SCX_EXIT_MSG_LEN, "RCU CPU stall on CPUs (%*pbl)",
> + cpumask_pr_args(stalled_mask));
> + ei->kind = SCX_EXIT_ERROR;
> + ei->reason = scx_exit_reason(SCX_EXIT_ERROR);
> + ei->exit_cpu = exit_cpu;
> + cpumask_copy(sch->stall_cpus, stalled_mask);
> +
> + irq_work_queue(&sch->disable_irq_work);
> + return true;
> }
>
> /**
> @@ -6672,14 +6712,23 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei,
> dump_line(&s, "----------");
>
> /*
> - * Dump the exit CPU first so it isn't lost to dump truncation, then
> - * walk the rest in order, skipping the one already dumped.
> + * Dump stalled CPUs first so they aren't lost to dump truncation, then
> + * walk the rest in order. Fall back to exit_cpu if no stall mask set.
> */
> - if (ei->exit_cpu >= 0)
> - scx_dump_cpu(sch, &s, &dctx, ei->exit_cpu, dump_all_tasks);
> - for_each_possible_cpu(cpu) {
> - if (cpu != ei->exit_cpu)
> + if (!cpumask_empty(sch->stall_cpus)) {
> + for_each_cpu(cpu, sch->stall_cpus)
> scx_dump_cpu(sch, &s, &dctx, cpu, dump_all_tasks);
> + for_each_possible_cpu(cpu) {
> + if (!cpumask_test_cpu(cpu, sch->stall_cpus))
> + scx_dump_cpu(sch, &s, &dctx, cpu, dump_all_tasks);
> + }
> + } else {
> + if (ei->exit_cpu >= 0)
> + scx_dump_cpu(sch, &s, &dctx, ei->exit_cpu, dump_all_tasks);
> + for_each_possible_cpu(cpu) {
> + if (cpu != ei->exit_cpu)
> + scx_dump_cpu(sch, &s, &dctx, cpu, dump_all_tasks);
> + }
> }
>
> dump_newline(&s);
> @@ -6916,6 +6965,10 @@ static struct scx_sched *scx_alloc_and_add_sched(struct scx_enable_cmd *cmd,
> ret = -ENOMEM;
> goto err_free_lb_cpumask;
> }
> + if (!zalloc_cpumask_var(&sch->stall_cpus, GFP_KERNEL)) {
> + ret = -ENOMEM;
> + goto err_free_lb_resched_cpumask;
> + }
> /*
> * Copy ops through the right union view. For cid-form the source is
> * struct sched_ext_ops_cid which lacks the trailing cpu_acquire/
> @@ -6994,8 +7047,10 @@ static struct scx_sched *scx_alloc_and_add_sched(struct scx_enable_cmd *cmd,
> #ifdef CONFIG_EXT_SUB_SCHED
> err_free_lb_resched:
> RCU_INIT_POINTER(ops->priv, NULL);
> - free_cpumask_var(sch->bypass_lb_resched_cpumask);
> + free_cpumask_var(sch->stall_cpus);
> #endif
> +err_free_lb_resched_cpumask:
> + free_cpumask_var(sch->bypass_lb_resched_cpumask);
> err_free_lb_cpumask:
> free_cpumask_var(sch->bypass_lb_donee_cpumask);
> err_stop_helper:
> diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
> index 2f15a4d3c534..f48dfda3facb 100644
> --- a/kernel/sched/ext_internal.h
> +++ b/kernel/sched/ext_internal.h
> @@ -1199,6 +1199,7 @@ struct scx_sched {
> struct timer_list bypass_lb_timer;
> cpumask_var_t bypass_lb_donee_cpumask;
> cpumask_var_t bypass_lb_resched_cpumask;
> + cpumask_var_t stall_cpus;
> struct rcu_work rcu_work;
>
> /* all ancestors including self */
> --
> 2.48.1
>
next prev parent reply other threads:[~2026-06-09 8:07 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-31 15:25 [PATCH v6 sched_ext/for-7.2 0/2] sched_ext: Follow-up fixes for exit_cpu accuracy Cheng-Yang Chou
2026-05-31 15:25 ` [PATCH 1/2] sched_ext: Fix exit_cpu accuracy for lockup paths Cheng-Yang Chou
2026-06-09 5:10 ` Andrea Righi
2026-05-31 15:25 ` [PATCH 2/2] sched_ext, rcu: Upgrade RCU stall paths to report cpumask of stalled CPUs Cheng-Yang Chou
2026-06-04 17:57 ` Paul E. McKenney
2026-06-05 14:33 ` Cheng-Yang Chou
2026-06-09 8:06 ` Andrea Righi [this message]
-- strict thread matches above, loose matches on Subject: below --
2026-05-21 16:16 [PATCH v5 sched_ext/for-7.2 0/2] sched_ext: Follow-up fixes for exit_cpu accuracy Cheng-Yang Chou
2026-05-21 16:16 ` [PATCH 2/2] sched_ext, rcu: Upgrade RCU stall paths to report cpumask of stalled CPUs Cheng-Yang Chou
2026-05-21 17:05 ` sashiko-bot
2026-05-27 23:19 ` Paul E. McKenney
2026-05-29 15:51 ` Cheng-Yang Chou
2026-05-19 17:17 [PATCH v4 sched_ext/for-7.2 0/2] sched_ext: Follow-up fixes for exit_cpu accuracy Cheng-Yang Chou
2026-05-19 17:17 ` [PATCH 2/2] sched_ext, rcu: Upgrade RCU stall paths to report cpumask of stalled CPUs Cheng-Yang Chou
2026-05-19 17:52 ` sashiko-bot
2026-05-19 18:22 ` Cheng-Yang Chou
2026-05-19 23:48 ` Paul E. McKenney
2026-05-20 14:56 ` Cheng-Yang Chou
2026-05-20 16:35 ` Paul E. McKenney
2026-05-21 7:00 ` Cheng-Yang Chou
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=aifJookTje68036x@gpd4 \
--to=arighi@nvidia.com \
--cc=changwoo@igalia.com \
--cc=chia7712@gmail.com \
--cc=joelagnelf@nvidia.com \
--cc=jserv@ccns.ncku.edu.tw \
--cc=paulmck@kernel.org \
--cc=rcu@vger.kernel.org \
--cc=sched-ext@lists.linux.dev \
--cc=tj@kernel.org \
--cc=void@manifault.com \
--cc=yphbchou0911@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.