From: Tejun Heo <tj@kernel.org>
To: linux-kernel@vger.kernel.org, sched-ext@lists.linux.dev
Cc: void@manifault.com, arighi@nvidia.com, changwoo@igalia.com,
	emil@etsalapatis.com, hannes@cmpxchg.org, mkoutny@suse.com,
	cgroups@vger.kernel.org, Tejun Heo <tj@kernel.org>
Subject: [PATCH 17/34] sched_ext: Move bypass_dsq into scx_sched_pcpu
Date: Tue, 24 Feb 2026 19:01:35 -1000
Message-ID: <20260225050152.1070601-18-tj@kernel.org>
In-Reply-To: <20260225050152.1070601-1-tj@kernel.org>

To support bypass mode for sub-schedulers, move bypass_dsq from struct scx_rq
to struct scx_sched_pcpu and add a bypass_dsq() accessor. Move bypass_dsq
initialization from init_sched_ext_class() to scx_alloc_and_add_sched() so
that each scheduler instance initializes its own per-CPU bypass DSQs.
bypass_lb_cpu() now takes a CPU number instead of an rq pointer. All callers
are updated. No behavior change as all tasks currently use the root
scheduler.
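
For illustration, here is a minimal sketch of the per-CPU accessor pattern
this patch adopts, abridged from the diff below (allocation, error handling,
and unrelated fields are omitted):

	/* Each scheduler instance owns one bypass DSQ per possible CPU. */
	struct scx_sched_pcpu {
		/* ... existing per-CPU state, e.g. event_stats ... */
		struct scx_dispatch_q	bypass_dsq;
	};

	static struct scx_dispatch_q *bypass_dsq(struct scx_sched *sch, s32 cpu)
	{
		return &per_cpu_ptr(sch->pcpu, cpu)->bypass_dsq;
	}

	/* Enqueue and balance paths resolve the DSQ through the owning
	 * scheduler, e.g. in do_enqueue_task():
	 */
	dsq = bypass_dsq(sch, task_cpu(p));

Keying the bypass DSQ to the scx_sched rather than to the rq is what allows
a future sub-scheduler to own its own set of bypass queues instead of
sharing the root scheduler's.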

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c          | 52 +++++++++++++++++++------------------
 kernel/sched/ext_internal.h |  2 ++
 kernel/sched/sched.h        |  1 -
 3 files changed, 29 insertions(+), 26 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index ef11a537acc5..a68f97befcf1 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -359,6 +359,11 @@ static const struct sched_class *scx_setscheduler_class(struct task_struct *p)
 	return __setscheduler_class(p->policy, p->prio);
 }
 
+static struct scx_dispatch_q *bypass_dsq(struct scx_sched *sch, s32 cpu)
+{
+	return &per_cpu_ptr(sch->pcpu, cpu)->bypass_dsq;
+}
+
 /*
  * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
  * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
@@ -1628,7 +1633,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 	dsq = find_global_dsq(sch, p);
 	goto enqueue;
 bypass:
-	dsq = &task_rq(p)->scx.bypass_dsq;
+	dsq = bypass_dsq(sch, task_cpu(p));
 	goto enqueue;
 
 enqueue:
@@ -2439,7 +2444,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 		goto has_tasks;
 
 	if (scx_rq_bypassing(rq)) {
-		if (consume_dispatch_q(sch, rq, &rq->scx.bypass_dsq))
+		if (consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu_of(rq))))
 			goto has_tasks;
 		else
 			goto no_tasks;
@@ -4192,11 +4197,12 @@ bool scx_hardlockup(int cpu)
 	return true;
 }
 
-static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
+static u32 bypass_lb_cpu(struct scx_sched *sch, s32 donor,
 			 struct cpumask *donee_mask, struct cpumask *resched_mask,
 			 u32 nr_donor_target, u32 nr_donee_target)
 {
-	struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq;
+	struct rq *donor_rq = cpu_rq(donor);
+	struct scx_dispatch_q *donor_dsq = bypass_dsq(sch, donor);
 	struct task_struct *p, *n;
 	struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, 0, 0);
 	s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target;
@@ -4212,7 +4218,7 @@ static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
 	if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us))
 		return 0;
 
-	raw_spin_rq_lock_irq(rq);
+	raw_spin_rq_lock_irq(donor_rq);
 	raw_spin_lock(&donor_dsq->lock);
 	list_add(&cursor.node, &donor_dsq->list);
 resume:
@@ -4220,7 +4226,6 @@ static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
 	n = nldsq_next_task(donor_dsq, n, false);
 
 	while ((p = n)) {
-		struct rq *donee_rq;
 		struct scx_dispatch_q *donee_dsq;
 		int donee;
 
@@ -4236,14 +4241,13 @@ static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
 		if (donee >= nr_cpu_ids)
 			continue;
 
-		donee_rq = cpu_rq(donee);
-		donee_dsq = &donee_rq->scx.bypass_dsq;
+		donee_dsq = bypass_dsq(sch, donee);
 
 		/*
 		 * $p's rq is not locked but $p's DSQ lock protects its
 		 * scheduling properties making this test safe.
 		 */
-		if (!task_can_run_on_remote_rq(sch, p, donee_rq, false))
+		if (!task_can_run_on_remote_rq(sch, p, cpu_rq(donee), false))
 			continue;
 
 		/*
@@ -4258,7 +4262,7 @@ static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
 		 * between bypass DSQs.
 		 */
 		dispatch_dequeue_locked(p, donor_dsq);
-		dispatch_enqueue(sch, donee_rq, donee_dsq, p, SCX_ENQ_NESTED);
+		dispatch_enqueue(sch, cpu_rq(donee), donee_dsq, p, SCX_ENQ_NESTED);
 
 		/*
 		 * $donee might have been idle and need to be woken up. No need
@@ -4273,9 +4277,9 @@ static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
 		if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) {
 			list_move_tail(&cursor.node, &n->scx.dsq_list.node);
 			raw_spin_unlock(&donor_dsq->lock);
-			raw_spin_rq_unlock_irq(rq);
+			raw_spin_rq_unlock_irq(donor_rq);
 			cpu_relax();
-			raw_spin_rq_lock_irq(rq);
+			raw_spin_rq_lock_irq(donor_rq);
 			raw_spin_lock(&donor_dsq->lock);
 			goto resume;
 		}
@@ -4283,7 +4287,7 @@ static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
 
 	list_del_init(&cursor.node);
 	raw_spin_unlock(&donor_dsq->lock);
-	raw_spin_rq_unlock_irq(rq);
+	raw_spin_rq_unlock_irq(donor_rq);
 
 	return nr_balanced;
 }
@@ -4301,7 +4305,7 @@ static void bypass_lb_node(struct scx_sched *sch, int node)
 
 	/* count the target tasks and CPUs */
 	for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
-		u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr);
+		u32 nr = READ_ONCE(bypass_dsq(sch, cpu)->nr);
 
 		nr_tasks += nr;
 		nr_cpus++;
@@ -4323,24 +4327,21 @@ static void bypass_lb_node(struct scx_sched *sch, int node)
 
 	cpumask_clear(donee_mask);
 	for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
-		if (READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr) < nr_target)
+		if (READ_ONCE(bypass_dsq(sch, cpu)->nr) < nr_target)
 			cpumask_set_cpu(cpu, donee_mask);
 	}
 
 	/* iterate !donee CPUs and see if they should be offloaded */
 	cpumask_clear(resched_mask);
 	for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
-		struct rq *rq = cpu_rq(cpu);
-		struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq;
-
 		if (cpumask_empty(donee_mask))
 			break;
 		if (cpumask_test_cpu(cpu, donee_mask))
 			continue;
-		if (READ_ONCE(donor_dsq->nr) <= nr_donor_target)
+		if (READ_ONCE(bypass_dsq(sch, cpu)->nr) <= nr_donor_target)
 			continue;
 
-		nr_balanced += bypass_lb_cpu(sch, rq, donee_mask, resched_mask,
+		nr_balanced += bypass_lb_cpu(sch, cpu, donee_mask, resched_mask,
 					     nr_donor_target, nr_target);
 	}
 
@@ -4348,7 +4349,7 @@ static void bypass_lb_node(struct scx_sched *sch, int node)
 		resched_cpu(cpu);
 
 	for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
-		u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr);
+		u32 nr = READ_ONCE(bypass_dsq(sch, cpu)->nr);
 
 		after_min = min(nr, after_min);
 		after_max = max(nr, after_max);
@@ -5243,7 +5244,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 {
 	struct scx_sched *sch;
 	s32 level = parent ? parent->level + 1 : 0;
-	int node, ret;
+	s32 node, cpu, ret;
 
 	sch = kzalloc_flex(*sch, ancestors, level);
 	if (!sch)
@@ -5284,6 +5285,9 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 		goto err_free_gdsqs;
 	}
 
+	for_each_possible_cpu(cpu)
+		init_dsq(bypass_dsq(sch, cpu), SCX_DSQ_BYPASS, sch);
+
 	sch->helper = kthread_run_worker(0, "sched_ext_helper");
 	if (IS_ERR(sch->helper)) {
 		ret = PTR_ERR(sch->helper);
@@ -5463,7 +5467,6 @@ static s32 scx_root_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 		struct rq *rq = cpu_rq(cpu);
 
 		rq->scx.local_dsq.sched = sch;
-		rq->scx.bypass_dsq.sched = sch;
 		rq->scx.cpuperf_target = SCX_CPUPERF_ONE;
 	}
 
@@ -6399,9 +6402,8 @@ void __init init_sched_ext_class(void)
 		struct rq *rq = cpu_rq(cpu);
 		int  n = cpu_to_node(cpu);
 
-		/* local/bypass dsq's sch will be set during scx_root_enable() */
+		/* local_dsq's sch will be set during scx_root_enable() */
 		init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL);
-		init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS, NULL);
 
 		INIT_LIST_HEAD(&rq->scx.runnable_list);
 		INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 96c8e75f2930..2fccd65067b8 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -932,6 +932,8 @@ struct scx_sched_pcpu {
 	 * constructed when requested by scx_bpf_events().
 	 */
 	struct scx_event_stats	event_stats;
+
+	struct scx_dispatch_q	bypass_dsq;
 };
 
 struct scx_sched {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b82fb70a9d54..d9921d743467 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -810,7 +810,6 @@ struct scx_rq {
 	struct balance_callback	deferred_bal_cb;
 	struct irq_work		deferred_irq_work;
 	struct irq_work		kick_cpus_irq_work;
-	struct scx_dispatch_q	bypass_dsq;
 };
 #endif /* CONFIG_SCHED_CLASS_EXT */
 
-- 
2.53.0


Thread overview: 39+ messages
2026-02-25  5:01 [PATCHSET v2 sched_ext/for-7.1] sched_ext: Implement cgroup sub-scheduler support Tejun Heo
2026-02-25  5:01 ` [PATCH 01/34] sched_ext: Implement cgroup subtree iteration for scx_task_iter Tejun Heo
2026-02-25  5:01 ` [PATCH 02/34] sched_ext: Add @kargs to scx_fork() Tejun Heo
2026-02-25  5:01 ` [PATCH 03/34] sched/core: Swap the order between sched_post_fork() and cgroup_post_fork() Tejun Heo
2026-02-25  5:01 ` [PATCH 04/34] cgroup: Expose some cgroup helpers Tejun Heo
2026-02-25  5:01 ` [PATCH 05/34] sched_ext: Update p->scx.disallow warning in scx_init_task() Tejun Heo
2026-02-25  5:01 ` [PATCH 06/34] sched_ext: Reorganize enable/disable path for multi-scheduler support Tejun Heo
2026-02-25  5:01 ` [PATCH 07/34] sched_ext: Introduce cgroup sub-sched support Tejun Heo
2026-02-25  5:01 ` [PATCH 08/34] sched_ext: Introduce scx_task_sched[_rcu]() Tejun Heo
2026-02-25  5:01 ` [PATCH 09/34] sched_ext: Introduce scx_prog_sched() Tejun Heo
2026-02-25  5:01 ` [PATCH 10/34] sched_ext: Enforce scheduling authority in dispatch and select_cpu operations Tejun Heo
2026-02-25  5:01 ` [PATCH 11/34] sched_ext: Enforce scheduler ownership when updating slice and dsq_vtime Tejun Heo
2026-02-25  5:01 ` [PATCH 12/34] sched_ext: scx_dsq_move() should validate the task belongs to the right scheduler Tejun Heo
2026-02-25  5:01 ` [PATCH 13/34] sched_ext: Refactor task init/exit helpers Tejun Heo
2026-02-25  5:01 ` [PATCH 14/34] sched_ext: Make scx_prio_less() handle multiple schedulers Tejun Heo
2026-02-25  5:01 ` [PATCH 15/34] sched_ext: Move default slice to per-scheduler field Tejun Heo
2026-02-25  5:01 ` [PATCH 16/34] sched_ext: Move aborting flag " Tejun Heo
2026-02-25  5:01 ` [PATCH 17/34] sched_ext: Move bypass_dsq into scx_sched_pcpu Tejun Heo [this message]
2026-02-25  5:01 ` [PATCH 18/34] sched_ext: Move bypass state into scx_sched Tejun Heo
2026-02-25  5:01 ` [PATCH 19/34] sched_ext: Prepare bypass mode for hierarchical operation Tejun Heo
2026-02-25  5:01 ` [PATCH 20/34] sched_ext: Factor out scx_dispatch_sched() Tejun Heo
2026-02-25  5:01 ` [PATCH 21/34] sched_ext: When calling ops.dispatch() @prev must be on the same scx_sched Tejun Heo
2026-02-25  5:01 ` [PATCH 22/34] sched_ext: Separate bypass dispatch enabling from bypass depth tracking Tejun Heo
2026-02-25  5:01 ` [PATCH 23/34] sched_ext: Implement hierarchical bypass mode Tejun Heo
2026-02-25  5:01 ` [PATCH 24/34] sched_ext: Dispatch from all scx_sched instances Tejun Heo
2026-02-25  5:01 ` [PATCH 25/34] sched_ext: Move scx_dsp_ctx and scx_dsp_max_batch into scx_sched Tejun Heo
2026-02-25  5:01 ` [PATCH 26/34] sched_ext: Make watchdog sub-sched aware Tejun Heo
2026-02-25  5:01 ` [PATCH 27/34] sched_ext: Convert scx_dump_state() spinlock to raw spinlock Tejun Heo
2026-02-25  5:01 ` [PATCH 28/34] sched_ext: Support dumping multiple schedulers and add scheduler identification Tejun Heo
2026-02-25  5:01 ` [PATCH 29/34] sched_ext: Implement cgroup sub-sched enabling and disabling Tejun Heo
2026-02-25  5:01 ` [PATCH 30/34] sched_ext: Add scx_sched back pointer to scx_sched_pcpu Tejun Heo
2026-02-25  5:01 ` [PATCH 31/34] sched_ext: Make scx_bpf_reenqueue_local() sub-sched aware Tejun Heo
2026-02-25  5:01 ` [PATCH 32/34] sched_ext: Factor out scx_link_sched() and scx_unlink_sched() Tejun Heo
2026-02-25  5:01 ` [PATCH 33/34] sched_ext: Add rhashtable lookup for sub-schedulers Tejun Heo
2026-02-25  5:01 ` [PATCH 34/34] sched_ext: Add basic building blocks for nested sub-scheduler dispatching Tejun Heo
2026-02-25  5:18 ` [PATCHSET v2 sched_ext/for-7.1] sched_ext: Implement cgroup sub-scheduler support Tejun Heo
