From: Tejun Heo <tj@kernel.org>
To: linux-kernel@vger.kernel.org, sched-ext@lists.linux.dev
Cc: void@manifault.com, andrea.righi@linux.dev, changwoo@igalia.com,
emil@etsalapatis.com, Tejun Heo <tj@kernel.org>
Subject: [PATCH 26/34] sched_ext: Make watchdog sub-sched aware
Date: Wed, 21 Jan 2026 13:11:32 -1000
Message-ID: <20260121231140.832332-27-tj@kernel.org>
In-Reply-To: <20260121231140.832332-1-tj@kernel.org>
Currently, the watchdog checks all tasks as if they are all on scx_root.
Move scx_watchdog_timeout inside scx_sched and make check_rq_for_timeouts()
use the timeout from the scx_sched associated with each task.
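The per-sched timeout continues to come from the existing ops->timeout_ms
field. For illustration only, a hypothetical BPF-side sketch (assuming the
usual scx BPF boilerplate; "stall_demo" and the elided callbacks are made up,
not part of this patch) of a scheduler opting into a 5 second watchdog:

	/*
	 * Hypothetical sketch, not a working scheduler: only .timeout_ms
	 * is relevant here. On enable it becomes sch->watchdog_timeout via
	 * msecs_to_jiffies(); tasks runnable longer than this without
	 * running trigger an SCX_EXIT_ERROR_STALL for this sched only.
	 */
	SEC(".struct_ops.link")
	struct sched_ext_ops stall_demo_ops = {
		.timeout_ms	= 5000,
		.name		= "stall_demo",
	};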
Add refresh_watchdog(), which sets the shared timer's interval to half of the
shortest watchdog timeout across all scheds and arms or disarms the timer as
necessary. Every scx_sched instance gets equivalent or better detection
latency while sharing the same timer.
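As a minimal model of the interval selection (plain userspace C, not the
kernel code; the array below stands in for the RCU-protected scx_sched_all
list):

	#include <limits.h>
	#include <stdio.h>

	/*
	 * Model of refresh_watchdog()'s interval selection: half of the
	 * shortest per-sched timeout, clamped to at least one jiffy.
	 * ULONG_MAX means no scheduler is loaded and the timer is disarmed.
	 */
	static unsigned long pick_interval(const unsigned long *t, int n)
	{
		unsigned long intv = ULONG_MAX;
		int i;

		for (i = 0; i < n; i++) {
			unsigned long half = t[i] / 2;

			if (half < 1)
				half = 1;
			if (half < intv)
				intv = half;
		}
		return intv;
	}

	int main(void)
	{
		unsigned long timeouts[] = { 30000, 5000, 12000 }; /* jiffies */

		/* prints 2500: half of the shortest timeout (5000) */
		printf("%lu\n", pick_interval(timeouts, 3));
		return 0;
	}

Enabling or disabling a scheduler simply re-runs this computation via
refresh_watchdog(), so the interval tracks the current set of scheds.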
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/ext.c | 74 ++++++++++++++++++++++++-------------
kernel/sched/ext_internal.h | 7 ++++
2 files changed, 56 insertions(+), 25 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 9c474a6dc2cc..e34896a57780 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -59,11 +59,10 @@ static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);
/*
- * The maximum amount of time in jiffies that a task may be runnable without
- * being scheduled on a CPU. If this timeout is exceeded, it will trigger
- * scx_error().
+ * Watchdog interval. All scx_sched's share a single watchdog timer and the
+ * interval is half of the shortest sch->watchdog_timeout.
*/
-static unsigned long scx_watchdog_timeout;
+static unsigned long scx_watchdog_interval;
/*
* The last time the delayed work was run. This delayed work relies on
@@ -2912,10 +2911,11 @@ static bool check_rq_for_timeouts(struct rq *rq)
goto out_unlock;
list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) {
+ struct scx_sched *sch = scx_task_sched(p);
unsigned long last_runnable = p->scx.runnable_at;
if (unlikely(time_after(jiffies,
- last_runnable + scx_watchdog_timeout))) {
+ last_runnable + sch->watchdog_timeout))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
@@ -2932,6 +2932,7 @@ static bool check_rq_for_timeouts(struct rq *rq)
static void scx_watchdog_workfn(struct work_struct *work)
{
+ unsigned long intv;
int cpu;
WRITE_ONCE(scx_watchdog_timestamp, jiffies);
@@ -2942,28 +2943,31 @@ static void scx_watchdog_workfn(struct work_struct *work)
cond_resched();
}
- queue_delayed_work(system_unbound_wq, to_delayed_work(work),
- scx_watchdog_timeout / 2);
+
+ intv = READ_ONCE(scx_watchdog_interval);
+ if (intv < ULONG_MAX)
+ queue_delayed_work(system_unbound_wq, to_delayed_work(work),
+ intv);
}
void scx_tick(struct rq *rq)
{
- struct scx_sched *sch;
+ struct scx_sched *root;
unsigned long last_check;
if (!scx_enabled())
return;
- sch = rcu_dereference_bh(scx_root);
- if (unlikely(!sch))
+ root = rcu_dereference_bh(scx_root);
+ if (unlikely(!root))
return;
last_check = READ_ONCE(scx_watchdog_timestamp);
if (unlikely(time_after(jiffies,
- last_check + READ_ONCE(scx_watchdog_timeout)))) {
+ last_check + READ_ONCE(root->watchdog_timeout)))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_check);
- scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
+ scx_exit(root, SCX_EXIT_ERROR_STALL, 0,
"watchdog failed to check in for %u.%03us",
dur_ms / 1000, dur_ms % 1000);
}
@@ -4564,6 +4568,26 @@ static void free_kick_syncs(void)
}
}
+static void refresh_watchdog(void)
+{
+ struct scx_sched *sch;
+ unsigned long intv = ULONG_MAX;
+
+	/* take the shortest timeout and use half of it as the interval */
+ rcu_read_lock();
+ list_for_each_entry_rcu(sch, &scx_sched_all, all)
+ intv = max(min(intv, sch->watchdog_timeout / 2), 1);
+ rcu_read_unlock();
+
+ WRITE_ONCE(scx_watchdog_timestamp, jiffies);
+ WRITE_ONCE(scx_watchdog_interval, intv);
+
+ if (intv < ULONG_MAX)
+ mod_delayed_work(system_unbound_wq, &scx_watchdog_work, intv);
+ else
+ cancel_delayed_work_sync(&scx_watchdog_work);
+}
+
#ifdef CONFIG_EXT_SUB_SCHED
static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);
@@ -4602,6 +4626,8 @@ static void scx_sub_disable(struct scx_sched *sch)
list_del_rcu(&sch->all);
raw_spin_unlock_irq(&scx_sched_lock);
+ refresh_watchdog();
+
mutex_unlock(&scx_enable_mutex);
/*
@@ -4736,12 +4762,12 @@ static void scx_root_disable(struct scx_sched *sch)
if (sch->ops.exit)
SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei);
- cancel_delayed_work_sync(&scx_watchdog_work);
-
raw_spin_lock_irq(&scx_sched_lock);
list_del_rcu(&sch->all);
raw_spin_unlock_irq(&scx_sched_lock);
+ refresh_watchdog();
+
/*
* scx_root clearing must be inside cpus_read_lock(). See
* handle_hotplug().
@@ -5266,6 +5292,11 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
sch->ancestors[level] = sch;
sch->level = level;
+ if (ops->timeout_ms)
+ sch->watchdog_timeout = msecs_to_jiffies(ops->timeout_ms);
+ else
+ sch->watchdog_timeout = SCX_WATCHDOG_MAX_TIMEOUT;
+
sch->slice_dfl = SCX_SLICE_DFL;
atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
init_irq_work(&sch->error_irq_work, scx_error_irq_workfn);
@@ -5393,7 +5424,6 @@ static s32 scx_root_enable(struct sched_ext_ops *ops, struct bpf_link *link)
struct scx_sched *sch;
struct scx_task_iter sti;
struct task_struct *p;
- unsigned long timeout;
s32 i, cpu, ret;
if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
@@ -5451,6 +5481,8 @@ static s32 scx_root_enable(struct sched_ext_ops *ops, struct bpf_link *link)
list_add_tail_rcu(&sch->all, &scx_sched_all);
raw_spin_unlock_irq(&scx_sched_lock);
+ refresh_watchdog();
+
scx_idle_enable(ops);
if (sch->ops.init) {
@@ -5481,16 +5513,6 @@ static s32 scx_root_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (ret)
goto err_disable;
- if (ops->timeout_ms)
- timeout = msecs_to_jiffies(ops->timeout_ms);
- else
- timeout = SCX_WATCHDOG_MAX_TIMEOUT;
-
- WRITE_ONCE(scx_watchdog_timeout, timeout);
- WRITE_ONCE(scx_watchdog_timestamp, jiffies);
- queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
- scx_watchdog_timeout / 2);
-
/*
* Once __scx_enabled is set, %current can be switched to SCX anytime.
* This can lead to stalls as some BPF schedulers (e.g. userspace
@@ -5708,6 +5730,8 @@ static s32 scx_sub_enable(struct sched_ext_ops *ops, struct bpf_link *link)
list_add_tail_rcu(&sch->all, &scx_sched_all);
raw_spin_unlock_irq(&scx_sched_lock);
+ refresh_watchdog();
+
if (sch->level >= SCX_SUB_MAX_DEPTH) {
scx_error(sch, "max nesting depth %d violated",
SCX_SUB_MAX_DEPTH);
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 93ab6e0b0f74..6ad1c00a72ba 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1019,6 +1019,13 @@ struct scx_sched {
bool sub_attached;
#endif /* CONFIG_EXT_SUB_SCHED */
+ /*
+ * The maximum amount of time in jiffies that a task may be runnable
+ * without being scheduled on a CPU. If this timeout is exceeded, it
+ * will trigger scx_error().
+ */
+ unsigned long watchdog_timeout;
+
atomic_t exit_kind;
struct scx_exit_info *exit_info;
--
2.52.0