From: Tejun Heo <tj@kernel.org>
To: David Vernet <void@manifault.com>,
Andrea Righi <andrea.righi@linux.dev>,
Changwoo Min <changwoo@igalia.com>
Cc: Dan Schatzberg <schatzberg.dan@gmail.com>,
Emil Tsalapatis <etsal@meta.com>,
sched-ext@lists.linux.dev, linux-kernel@vger.kernel.org,
Tejun Heo <tj@kernel.org>, Emil Tsalapatis <emil@etsalapatis.com>,
Andrea Righi <arighi@nvidia.com>
Subject: [PATCH 04/13] sched_ext: Simplify breather mechanism with scx_aborting flag
Date: Tue, 11 Nov 2025 09:18:07 -1000 [thread overview]
Message-ID: <20251111191816.862797-5-tj@kernel.org> (raw)
In-Reply-To: <20251111191816.862797-1-tj@kernel.org>
The breather mechanism was introduced in 62dcbab8b0ef ("sched_ext: Avoid
live-locking bypass mode switching") and e32c260195e6 ("sched_ext: Enable the
ops breather and eject BPF scheduler on softlockup") to prevent live-locks by
injecting delays when CPUs are trapped in dispatch paths.
Currently, it uses scx_breather_depth (atomic_t) and scx_in_softlockup
(unsigned long) with separate increment/decrement and cleanup operations. The
breather is only activated when aborting, so tie it directly to the exit
mechanism. Replace both variables with a single scx_aborting flag, which is set
when exit is claimed and cleared after bypass is enabled. Introduce scx_claim_exit() to
consolidate exit_kind claiming and breather enablement. This eliminates
scx_clear_softlockup() and simplifies scx_softlockup() and scx_bypass().
The breather mechanism will be replaced by a different abort mechanism in a
future patch. This simplification prepares for that change.
Reviewed-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Acked-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/ext.c | 54 +++++++++++++++++++++-------------------------
1 file changed, 25 insertions(+), 29 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 747391a3f6e3..5da699cacde1 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -33,9 +33,8 @@ static DEFINE_MUTEX(scx_enable_mutex);
DEFINE_STATIC_KEY_FALSE(__scx_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED);
-static unsigned long scx_in_softlockup;
-static atomic_t scx_breather_depth = ATOMIC_INIT(0);
static int scx_bypass_depth;
+static bool scx_aborting;
static bool scx_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
@@ -1831,7 +1830,7 @@ static void scx_breather(struct rq *rq)
lockdep_assert_rq_held(rq);
- if (likely(!atomic_read(&scx_breather_depth)))
+ if (likely(!READ_ONCE(scx_aborting)))
return;
raw_spin_rq_unlock(rq);
@@ -1840,9 +1839,9 @@ static void scx_breather(struct rq *rq)
do {
int cnt = 1024;
- while (atomic_read(&scx_breather_depth) && --cnt)
+ while (READ_ONCE(scx_aborting) && --cnt)
cpu_relax();
- } while (atomic_read(&scx_breather_depth) &&
+ } while (READ_ONCE(scx_aborting) &&
time_before64(ktime_get_ns(), until));
raw_spin_rq_lock(rq);
@@ -3741,30 +3740,14 @@ void scx_softlockup(u32 dur_s)
goto out_unlock;
}
- /* allow only one instance, cleared at the end of scx_bypass() */
- if (test_and_set_bit(0, &scx_in_softlockup))
- goto out_unlock;
-
printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n",
smp_processor_id(), dur_s, scx_root->ops.name);
- /*
- * Some CPUs may be trapped in the dispatch paths. Enable breather
- * immediately; otherwise, we might even be able to get to scx_bypass().
- */
- atomic_inc(&scx_breather_depth);
-
scx_error(sch, "soft lockup - CPU#%d stuck for %us", smp_processor_id(), dur_s);
out_unlock:
rcu_read_unlock();
}
-static void scx_clear_softlockup(void)
-{
- if (test_and_clear_bit(0, &scx_in_softlockup))
- atomic_dec(&scx_breather_depth);
-}
-
/**
* scx_bypass - [Un]bypass scx_ops and guarantee forward progress
* @bypass: true for bypass, false for unbypass
@@ -3827,8 +3810,6 @@ static void scx_bypass(bool bypass)
ktime_get_ns() - bypass_timestamp);
}
- atomic_inc(&scx_breather_depth);
-
/*
* No task property is changing. We just need to make sure all currently
* queued tasks are re-queued according to the new scx_rq_bypassing()
@@ -3884,10 +3865,8 @@ static void scx_bypass(bool bypass)
raw_spin_rq_unlock(rq);
}
- atomic_dec(&scx_breather_depth);
unlock:
raw_spin_unlock_irqrestore(&bypass_lock, flags);
- scx_clear_softlockup();
}
static void free_exit_info(struct scx_exit_info *ei)
@@ -3982,6 +3961,7 @@ static void scx_disable_workfn(struct kthread_work *work)
/* guarantee forward progress by bypassing scx_ops */
scx_bypass(true);
+ WRITE_ONCE(scx_aborting, false);
switch (scx_set_enable_state(SCX_DISABLING)) {
case SCX_DISABLING:
@@ -4104,9 +4084,24 @@ static void scx_disable_workfn(struct kthread_work *work)
scx_bypass(false);
}
-static void scx_disable(enum scx_exit_kind kind)
+static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind)
{
int none = SCX_EXIT_NONE;
+
+ if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind))
+ return false;
+
+ /*
+ * Some CPUs may be trapped in the dispatch paths. Enable breather
+ * immediately; otherwise, we might not even be able to get to
+ * scx_bypass().
+ */
+ WRITE_ONCE(scx_aborting, true);
+ return true;
+}
+
+static void scx_disable(enum scx_exit_kind kind)
+{
struct scx_sched *sch;
if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
@@ -4115,7 +4110,7 @@ static void scx_disable(enum scx_exit_kind kind)
rcu_read_lock();
sch = rcu_dereference(scx_root);
if (sch) {
- atomic_try_cmpxchg(&sch->exit_kind, &none, kind);
+ scx_claim_exit(sch, kind);
kthread_queue_work(sch->helper, &sch->disable_work);
}
rcu_read_unlock();
@@ -4436,9 +4431,8 @@ static void scx_vexit(struct scx_sched *sch,
const char *fmt, va_list args)
{
struct scx_exit_info *ei = sch->exit_info;
- int none = SCX_EXIT_NONE;
- if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind))
+ if (!scx_claim_exit(sch, kind))
return;
ei->exit_code = exit_code;
@@ -4654,6 +4648,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
*/
WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);
WARN_ON_ONCE(scx_root);
+ if (WARN_ON_ONCE(READ_ONCE(scx_aborting)))
+ WRITE_ONCE(scx_aborting, false);
atomic_long_set(&scx_nr_rejected, 0);
--
2.51.2
next prev parent reply other threads:[~2025-11-11 19:18 UTC|newest]
Thread overview: 25+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-11-11 19:18 [PATCHSET v3 sched_ext/for-6.19] sched_ext: Improve bypass mode scalability Tejun Heo
2025-11-11 19:18 ` [PATCH 01/13] sched_ext: Use shorter slice in bypass mode Tejun Heo
2025-11-11 19:18 ` [PATCH 02/13] sched_ext: Refactor do_enqueue_task() local and global DSQ paths Tejun Heo
2025-11-11 19:18 ` [PATCH 03/13] sched_ext: Use per-CPU DSQs instead of per-node global DSQs in bypass mode Tejun Heo
2025-11-11 19:18 ` Tejun Heo [this message]
2025-11-11 19:18 ` [PATCH 05/13] sched_ext: Exit dispatch and move operations immediately when aborting Tejun Heo
2025-11-11 19:18 ` [PATCH 06/13] sched_ext: Make scx_exit() and scx_vexit() return bool Tejun Heo
2025-11-11 19:18 ` [PATCH 07/13] sched_ext: Refactor lockup handlers into handle_lockup() Tejun Heo
2025-11-11 19:18 ` [PATCH 08/13] sched_ext: Make handle_lockup() propagate scx_verror() result Tejun Heo
2025-11-11 19:18 ` [PATCH 09/13] sched_ext: Hook up hardlockup detector Tejun Heo
2025-11-11 19:19 ` Tejun Heo
2025-11-13 22:33 ` Doug Anderson
2025-11-14 1:25 ` Tejun Heo
2025-11-14 1:33 ` [PATCH sched_ext/for-6.19] sched_ext: Pass locked CPU parameter to scx_hardlockup() and add docs Tejun Heo
2025-11-14 2:00 ` Emil Tsalapatis
2025-11-14 7:32 ` Andrea Righi
2025-11-14 19:24 ` Doug Anderson
2025-11-14 21:15 ` Tejun Heo
2025-11-14 21:19 ` Tejun Heo
2025-11-11 19:18 ` [PATCH 10/13] sched_ext: Add scx_cpu0 example scheduler Tejun Heo
2025-11-11 19:18 ` [PATCH 11/13] sched_ext: Factor out scx_dsq_list_node cursor initialization into INIT_DSQ_LIST_CURSOR Tejun Heo
2025-11-11 19:18 ` [PATCH 12/13] sched_ext: Factor out abbreviated dispatch dequeue into dispatch_dequeue_locked() Tejun Heo
2025-11-11 19:18 ` [PATCH 13/13] sched_ext: Implement load balancer for bypass mode Tejun Heo
2025-11-11 19:30 ` Emil Tsalapatis
2025-11-12 16:49 ` [PATCHSET v3 sched_ext/for-6.19] sched_ext: Improve bypass mode scalability Tejun Heo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20251111191816.862797-5-tj@kernel.org \
--to=tj@kernel.org \
--cc=andrea.righi@linux.dev \
--cc=arighi@nvidia.com \
--cc=changwoo@igalia.com \
--cc=emil@etsalapatis.com \
--cc=etsal@meta.com \
--cc=linux-kernel@vger.kernel.org \
--cc=schatzberg.dan@gmail.com \
--cc=sched-ext@lists.linux.dev \
--cc=void@manifault.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.