From: Peter Zijlstra <peterz@infradead.org>
To: tj@kernel.org
Cc: linux-kernel@vger.kernel.org, peterz@infradead.org,
mingo@kernel.org, juri.lelli@redhat.com,
vincent.guittot@linaro.org, dietmar.eggemann@arm.com,
rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de,
vschneid@redhat.com, longman@redhat.com, hannes@cmpxchg.org,
mkoutny@suse.com, void@manifault.com, arighi@nvidia.com,
changwoo@igalia.com, cgroups@vger.kernel.org,
sched-ext@lists.linux.dev, liuwenfang@honor.com,
tglx@linutronix.de
Subject: [RFC][PATCH 3/3] sched/ext: Fold balance_scx() into pick_task_scx()
Date: Mon, 06 Oct 2025 12:46:55 +0200 [thread overview]
Message-ID: <20251006105453.769281377@infradead.org> (raw)
In-Reply-To: 20251006104652.630431579@infradead.org
With pick_task() having an rf argument, it is possible to do the
lock-break there, get rid of the weird balance/pick_task hack.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/core.c | 13 --------
kernel/sched/ext.c | 78 +++++++--------------------------------------------
kernel/sched/sched.h | 1
3 files changed, 12 insertions(+), 80 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5845,19 +5845,6 @@ static void prev_balance(struct rq *rq,
const struct sched_class *start_class = prev->sched_class;
const struct sched_class *class;
-#ifdef CONFIG_SCHED_CLASS_EXT
- /*
- * SCX requires a balance() call before every pick_task() including when
- * waking up from SCHED_IDLE. If @start_class is below SCX, start from
- * SCX instead. Also, set a flag to detect missing balance() call.
- */
- if (scx_enabled()) {
- rq->scx.flags |= SCX_RQ_BAL_PENDING;
- if (sched_class_above(&ext_sched_class, start_class))
- start_class = &ext_sched_class;
- }
-#endif
-
/*
* We must do the balancing pass before put_prev_task(), such
* that when we release the rq->lock the task is in the same
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2013,7 +2013,7 @@ static int balance_one(struct rq *rq, st
lockdep_assert_rq_held(rq);
rq->scx.flags |= SCX_RQ_IN_BALANCE;
- rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
+ rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) &&
unlikely(rq->scx.cpu_released)) {
@@ -2119,40 +2119,6 @@ static int balance_one(struct rq *rq, st
return true;
}
-static int balance_scx(struct rq *rq, struct task_struct *prev,
- struct rq_flags *rf)
-{
- int ret;
-
- rq_unpin_lock(rq, rf);
-
- ret = balance_one(rq, prev);
-
-#ifdef CONFIG_SCHED_SMT
- /*
- * When core-sched is enabled, this ops.balance() call will be followed
- * by pick_task_scx() on this CPU and the SMT siblings. Balance the
- * siblings too.
- */
- if (sched_core_enabled(rq)) {
- const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
- int scpu;
-
- for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) {
- struct rq *srq = cpu_rq(scpu);
- struct task_struct *sprev = srq->curr;
-
- WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq));
- update_rq_clock(srq);
- balance_one(srq, sprev);
- }
- }
-#endif
- rq_repin_lock(rq, rf);
-
- return ret;
-}
-
static void process_ddsp_deferred_locals(struct rq *rq)
{
struct task_struct *p;
@@ -2335,38 +2301,19 @@ static struct task_struct *first_local_t
static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
{
struct task_struct *prev = rq->curr;
+ bool keep_prev, kick_idle = false;
struct task_struct *p;
- bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
- bool kick_idle = false;
- /*
- * WORKAROUND:
- *
- * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just
- * have gone through balance_scx(). Unfortunately, there currently is a
- * bug where fair could say yes on balance() but no on pick_task(),
- * which then ends up calling pick_task_scx() without preceding
- * balance_scx().
- *
- * Keep running @prev if possible and avoid stalling from entering idle
- * without balancing.
- *
- * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE()
- * if pick_task_scx() is called without preceding balance_scx().
- */
- if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) {
- if (prev->scx.flags & SCX_TASK_QUEUED) {
- keep_prev = true;
- } else {
- keep_prev = false;
- kick_idle = true;
- }
- } else if (unlikely(keep_prev &&
- prev->sched_class != &ext_sched_class)) {
- /*
- * Can happen while enabling as SCX_RQ_BAL_PENDING assertion is
- * conditional on scx_enabled() and may have been skipped.
- */
+ rq->queue_mask = 0;
+ rq_unpin_lock(rq, rf);
+ balance_one(rq, prev);
+ rq_repin_lock(rq, rf);
+ if (rq->queue_mask & ~((ext_sched_class.queue_mask << 1)-1))
+ return RETRY_TASK;
+
+ keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
+ if (unlikely(keep_prev &&
+ prev->sched_class != &ext_sched_class)) {
WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED);
keep_prev = false;
}
@@ -3243,7 +3190,6 @@ DEFINE_SCHED_CLASS(ext) = {
.wakeup_preempt = wakeup_preempt_scx,
- .balance = balance_scx,
.pick_task = pick_task_scx,
.put_prev_task = put_prev_task_scx,
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -779,7 +779,6 @@ enum scx_rq_flags {
*/
SCX_RQ_ONLINE = 1 << 0,
SCX_RQ_CAN_STOP_TICK = 1 << 1,
- SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */
SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */
SCX_RQ_BYPASSING = 1 << 4,
SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */
next prev parent reply other threads:[~2025-10-06 10:57 UTC|newest]
Thread overview: 20+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-10-06 10:46 [RFC][PATCH 0/3] sched/ext: Cleanup pick_task_scx() Peter Zijlstra
2025-10-06 10:46 ` [RFC][PATCH 1/3] sched: Detect per-class runqueue changes Peter Zijlstra
2025-10-07 10:08 ` Juri Lelli
2025-10-07 10:16 ` Peter Zijlstra
2025-10-07 10:26 ` Juri Lelli
2025-10-06 10:46 ` [RFC][PATCH 2/3] sched: Add support to pick functions to take rf Peter Zijlstra
2025-10-08 13:16 ` Vincent Guittot
2025-10-08 13:58 ` Peter Zijlstra
2025-10-08 15:22 ` Vincent Guittot
2025-10-08 20:34 ` Tejun Heo
2025-10-09 7:17 ` Vincent Guittot
2025-10-13 11:04 ` Peter Zijlstra
2025-10-13 11:09 ` Peter Zijlstra
2025-10-13 13:06 ` Vincent Guittot
2025-10-13 17:20 ` Tejun Heo
2025-10-06 10:46 ` Peter Zijlstra [this message]
2025-10-07 12:40 ` [RFC][PATCH 0/3] sched/ext: Cleanup pick_task_scx() Christian Loehle
2025-10-07 21:48 ` Tejun Heo
2025-10-08 9:11 ` Peter Zijlstra
2025-10-08 20:21 ` Tejun Heo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20251006105453.769281377@infradead.org \
--to=peterz@infradead.org \
--cc=arighi@nvidia.com \
--cc=bsegall@google.com \
--cc=cgroups@vger.kernel.org \
--cc=changwoo@igalia.com \
--cc=dietmar.eggemann@arm.com \
--cc=hannes@cmpxchg.org \
--cc=juri.lelli@redhat.com \
--cc=linux-kernel@vger.kernel.org \
--cc=liuwenfang@honor.com \
--cc=longman@redhat.com \
--cc=mgorman@suse.de \
--cc=mingo@kernel.org \
--cc=mkoutny@suse.com \
--cc=rostedt@goodmis.org \
--cc=sched-ext@lists.linux.dev \
--cc=tglx@linutronix.de \
--cc=tj@kernel.org \
--cc=vincent.guittot@linaro.org \
--cc=void@manifault.com \
--cc=vschneid@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox