From: Tejun Heo <tj@kernel.org>
To: linux-kernel@vger.kernel.org, sched-ext@lists.linux.dev
Cc: void@manifault.com, arighi@nvidia.com, changwoo@igalia.com,
emil@etsalapatis.com, Tejun Heo <tj@kernel.org>
Subject: [PATCH 22/34] sched_ext: Separate bypass dispatch enabling from bypass depth tracking
Date: Wed, 4 Mar 2026 12:01:07 -1000 [thread overview]
Message-ID: <20260304220119.4095551-23-tj@kernel.org> (raw)
In-Reply-To: <20260304220119.4095551-1-tj@kernel.org>
The bypass_depth field tracks nesting of bypass operations but is also used
to determine whether the bypass dispatch path should be active. With
hierarchical scheduling, child schedulers may need to activate their parent's
bypass dispatch path without affecting the parent's bypass_depth, requiring
separation of these concerns.
Add bypass_dsp_enable_depth and bypass_dsp_claim to independently control
bypass dispatch path activation. The new enable_bypass_dsp() and
disable_bypass_dsp() functions manage this state with proper claim semantics
to prevent races. The bypass dispatch path now only activates when
bypass_dsp_enabled() returns true, which checks the new enable_depth counter.
The disable operation is carefully ordered after all tasks are moved out of
bypass DSQs to ensure they are drained before the dispatch path is disabled.
During scheduler teardown, disable_bypass_dsp() is called explicitly to ensure
cleanup even if bypass mode was never entered normally.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/ext.c | 74 ++++++++++++++++++++++++++++++++-----
kernel/sched/ext_internal.h | 5 +++
2 files changed, 69 insertions(+), 10 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 1570b6a8158c..6b07d97b0af6 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -357,6 +357,26 @@ static struct scx_dispatch_q *bypass_dsq(struct scx_sched *sch, s32 cpu)
return &per_cpu_ptr(sch->pcpu, cpu)->bypass_dsq;
}
+/**
+ * bypass_dsp_enabled - Check if bypass dispatch path is enabled
+ * @sch: scheduler to check
+ *
+ * When a descendant scheduler enters bypass mode, bypassed tasks are scheduled
+ * by the nearest non-bypassing ancestor, or the root scheduler if all ancestors
+ * are bypassing. In the former case, the ancestor is not itself bypassing but
+ * its bypass DSQs will be populated with bypassed tasks from descendants. Thus,
+ * the ancestor's bypass dispatch path must be active even though its own
+ * bypass_depth remains zero.
+ *
+ * This function checks bypass_dsp_enable_depth which is managed separately from
+ * bypass_depth to enable this decoupling. See enable_bypass_dsp() and
+ * disable_bypass_dsp().
+ */
+static bool bypass_dsp_enabled(struct scx_sched *sch)
+{
+ return unlikely(atomic_read(&sch->bypass_dsp_enable_depth));
+}
+
/*
* scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
* ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
@@ -2400,7 +2420,7 @@ static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
if (consume_global_dsq(sch, rq))
return true;
- if (scx_bypassing(sch, cpu))
+ if (bypass_dsp_enabled(sch) && scx_bypassing(sch, cpu))
return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu));
if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
@@ -4397,7 +4417,7 @@ static void scx_bypass_lb_timerfn(struct timer_list *timer)
int node;
u32 intv_us;
- if (!READ_ONCE(sch->bypass_depth))
+ if (!bypass_dsp_enabled(sch))
return;
for_each_node_with_cpus(node)
@@ -4438,6 +4458,42 @@ static bool dec_bypass_depth(struct scx_sched *sch)
return true;
}
+static void enable_bypass_dsp(struct scx_sched *sch)
+{
+ u32 intv_us = READ_ONCE(scx_bypass_lb_intv_us);
+ s32 ret;
+
+ /*
+ * @sch->bypass_depth transitioning from 0 to 1 triggers enabling.
+ * Shouldn't stagger.
+ */
+ if (WARN_ON_ONCE(test_and_set_bit(0, &sch->bypass_dsp_claim)))
+ return;
+
+ /*
+ * The LB timer will stop running if bypass_dsp_enable_depth is 0. Increment
+ * before starting the LB timer.
+ */
+ ret = atomic_inc_return(&sch->bypass_dsp_enable_depth);
+ WARN_ON_ONCE(ret <= 0);
+
+ if (intv_us && !timer_pending(&sch->bypass_lb_timer))
+ mod_timer(&sch->bypass_lb_timer,
+ jiffies + usecs_to_jiffies(intv_us));
+}
+
+/* may be called without holding scx_bypass_lock */
+static void disable_bypass_dsp(struct scx_sched *sch)
+{
+ s32 ret;
+
+ if (!test_and_clear_bit(0, &sch->bypass_dsp_claim))
+ return;
+
+ ret = atomic_dec_return(&sch->bypass_dsp_enable_depth);
+ WARN_ON_ONCE(ret < 0);
+}
+
/**
* scx_bypass - [Un]bypass scx_ops and guarantee forward progress
* @sch: sched to bypass
@@ -4479,17 +4535,10 @@ static void scx_bypass(struct scx_sched *sch, bool bypass)
raw_spin_lock_irqsave(&scx_bypass_lock, flags);
if (bypass) {
- u32 intv_us;
-
if (!inc_bypass_depth(sch))
goto unlock;
- intv_us = READ_ONCE(scx_bypass_lb_intv_us);
- if (intv_us && !timer_pending(&sch->bypass_lb_timer)) {
- sch->bypass_lb_timer.expires =
- jiffies + usecs_to_jiffies(intv_us);
- add_timer_global(&sch->bypass_lb_timer);
- }
+ enable_bypass_dsp(sch);
} else {
if (!dec_bypass_depth(sch))
goto unlock;
@@ -4571,6 +4620,9 @@ static void scx_bypass(struct scx_sched *sch, bool bypass)
raw_spin_rq_unlock(rq);
}
+ /* disarming must come after moving all tasks out of the bypass DSQs */
+ if (!bypass)
+ disable_bypass_dsp(sch);
unlock:
raw_spin_unlock_irqrestore(&scx_bypass_lock, flags);
}
@@ -4672,6 +4724,8 @@ static void scx_sub_disable(struct scx_sched *sch)
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);
+ disable_bypass_dsp(sch);
+
raw_spin_lock_irq(&scx_sched_lock);
list_del_init(&sch->sibling);
list_del_rcu(&sch->all);
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index c0358ff544b8..fd2671340019 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -961,6 +961,11 @@ struct scx_sched {
u64 slice_dfl;
u64 bypass_timestamp;
s32 bypass_depth;
+
+ /* bypass dispatch path enable state, see bypass_dsp_enabled() */
+ unsigned long bypass_dsp_claim;
+ atomic_t bypass_dsp_enable_depth;
+
bool aborting;
s32 level;
--
2.53.0
next prev parent reply other threads:[~2026-03-04 22:01 UTC|newest]
Thread overview: 50+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-04 22:00 [PATCHSET v3 sched_ext/for-7.1] sched_ext: Implement cgroup sub-scheduler support Tejun Heo
2026-03-04 22:00 ` [PATCH 01/34] sched_ext: Implement cgroup subtree iteration for scx_task_iter Tejun Heo
2026-03-04 22:00 ` [PATCH 02/34] sched_ext: Add @kargs to scx_fork() Tejun Heo
2026-03-04 22:00 ` [PATCH 03/34] sched/core: Swap the order between sched_post_fork() and cgroup_post_fork() Tejun Heo
2026-03-06 4:17 ` Tejun Heo
2026-03-06 8:44 ` Peter Zijlstra
2026-03-04 22:00 ` [PATCH 04/34] cgroup: Expose some cgroup helpers Tejun Heo
2026-03-06 4:18 ` Tejun Heo
2026-03-04 22:00 ` [PATCH 05/34] sched_ext: Update p->scx.disallow warning in scx_init_task() Tejun Heo
2026-03-04 22:00 ` [PATCH 06/34] sched_ext: Reorganize enable/disable path for multi-scheduler support Tejun Heo
2026-03-04 22:00 ` [PATCH 07/34] sched_ext: Introduce cgroup sub-sched support Tejun Heo
2026-03-04 22:00 ` [PATCH 08/34] sched_ext: Introduce scx_task_sched[_rcu]() Tejun Heo
2026-03-04 22:00 ` [PATCH 09/34] sched_ext: Introduce scx_prog_sched() Tejun Heo
2026-03-04 22:00 ` [PATCH 10/34] sched_ext: Enforce scheduling authority in dispatch and select_cpu operations Tejun Heo
2026-03-04 22:00 ` [PATCH 11/34] sched_ext: Enforce scheduler ownership when updating slice and dsq_vtime Tejun Heo
2026-03-04 22:00 ` [PATCH 12/34] sched_ext: scx_dsq_move() should validate the task belongs to the right scheduler Tejun Heo
2026-03-04 22:00 ` [PATCH 13/34] sched_ext: Refactor task init/exit helpers Tejun Heo
2026-03-04 22:00 ` [PATCH 14/34] sched_ext: Make scx_prio_less() handle multiple schedulers Tejun Heo
2026-03-04 22:01 ` [PATCH 15/34] sched_ext: Move default slice to per-scheduler field Tejun Heo
2026-03-04 22:01 ` [PATCH 16/34] sched_ext: Move aborting flag " Tejun Heo
2026-03-04 22:01 ` [PATCH 17/34] sched_ext: Move bypass_dsq into scx_sched_pcpu Tejun Heo
2026-03-04 22:01 ` [PATCH 18/34] sched_ext: Move bypass state into scx_sched Tejun Heo
2026-03-04 22:01 ` [PATCH 19/34] sched_ext: Prepare bypass mode for hierarchical operation Tejun Heo
2026-03-04 22:01 ` [PATCH 20/34] sched_ext: Factor out scx_dispatch_sched() Tejun Heo
2026-03-04 22:01 ` [PATCH 21/34] sched_ext: When calling ops.dispatch() @prev must be on the same scx_sched Tejun Heo
2026-03-04 22:01 ` Tejun Heo [this message]
2026-03-04 22:01 ` [PATCH 23/34] sched_ext: Implement hierarchical bypass mode Tejun Heo
2026-03-06 7:03 ` Andrea Righi
2026-03-06 7:23 ` Andrea Righi
2026-03-06 17:39 ` [PATCH v2 " Tejun Heo
2026-03-04 22:01 ` [PATCH 24/34] sched_ext: Dispatch from all scx_sched instances Tejun Heo
2026-03-04 22:01 ` [PATCH 25/34] sched_ext: Move scx_dsp_ctx and scx_dsp_max_batch into scx_sched Tejun Heo
2026-03-04 22:01 ` [PATCH 26/34] sched_ext: Make watchdog sub-sched aware Tejun Heo
2026-03-04 22:01 ` [PATCH 27/34] sched_ext: Convert scx_dump_state() spinlock to raw spinlock Tejun Heo
2026-03-04 22:01 ` [PATCH 28/34] sched_ext: Support dumping multiple schedulers and add scheduler identification Tejun Heo
2026-03-04 22:01 ` [PATCH 29/34] sched_ext: Implement cgroup sub-sched enabling and disabling Tejun Heo
2026-03-06 9:41 ` Cheng-Yang Chou
2026-03-06 17:39 ` [PATCH v2 " Tejun Heo
2026-03-04 22:01 ` [PATCH 30/34] sched_ext: Add scx_sched back pointer to scx_sched_pcpu Tejun Heo
2026-03-04 22:01 ` [PATCH 31/34] sched_ext: Make scx_bpf_reenqueue_local() sub-sched aware Tejun Heo
2026-03-04 22:01 ` [PATCH 32/34] sched_ext: Factor out scx_link_sched() and scx_unlink_sched() Tejun Heo
2026-03-04 22:01 ` [PATCH 33/34] sched_ext: Add rhashtable lookup for sub-schedulers Tejun Heo
2026-03-04 22:01 ` [PATCH 34/34] sched_ext: Add basic building blocks for nested sub-scheduler dispatching Tejun Heo
2026-03-06 4:09 ` [PATCHSET v3 sched_ext/for-7.1] sched_ext: Implement cgroup sub-scheduler support Tejun Heo
2026-03-06 4:17 ` Tejun Heo
2026-03-06 7:29 ` Andrea Righi
2026-03-06 18:14 ` Tejun Heo
-- strict thread matches above, loose matches on Subject: below --
2026-02-25 5:01 [PATCHSET v2 " Tejun Heo
2026-02-25 5:01 ` [PATCH 22/34] sched_ext: Separate bypass dispatch enabling from bypass depth tracking Tejun Heo
2026-02-25 5:00 [PATCHSET v2 sched_ext/for-7.1] sched_ext: Implement cgroup sub-scheduler support Tejun Heo
2026-02-25 5:00 ` [PATCH 22/34] sched_ext: Separate bypass dispatch enabling from bypass depth tracking Tejun Heo
2026-01-21 23:11 [PATCHSET v1 sched_ext/for-6.20] sched_ext: Implement cgroup sub-scheduler support Tejun Heo
2026-01-21 23:11 ` [PATCH 22/34] sched_ext: Separate bypass dispatch enabling from bypass depth tracking Tejun Heo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260304220119.4095551-23-tj@kernel.org \
--to=tj@kernel.org \
--cc=arighi@nvidia.com \
--cc=changwoo@igalia.com \
--cc=emil@etsalapatis.com \
--cc=linux-kernel@vger.kernel.org \
--cc=sched-ext@lists.linux.dev \
--cc=void@manifault.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox