public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 sched_ext/for-7.1] sched_ext: Invalidate dispatch decisions on CPU affinity changes
@ 2026-03-19  8:35 Andrea Righi
  2026-03-19 10:31 ` Kuba Piecuch
  2026-03-19 15:18 ` Kuba Piecuch
  0 siblings, 2 replies; 8+ messages in thread
From: Andrea Righi @ 2026-03-19  8:35 UTC (permalink / raw)
  To: Tejun Heo, David Vernet, Changwoo Min
  Cc: Kuba Piecuch, Emil Tsalapatis, Christian Loehle, Daniel Hodges,
	sched-ext, linux-kernel

A BPF scheduler may rely on p->cpus_ptr from ops.dispatch() to select a
target CPU. However, task affinity can change between the dispatch
decision and its finalization in finish_dispatch(). When this happens,
the scheduler may attempt to dispatch a task to a CPU that is no longer
allowed, resulting in fatal errors such as:

 EXIT: runtime error (SCX_DSQ_LOCAL[_ON] target CPU 10 not allowed for stress-ng-race-[13565])

This race exists because ops.dispatch() runs without holding the task's
run queue lock, allowing a concurrent set_cpus_allowed() to update
p->cpus_ptr while the BPF scheduler is still using it. The dispatch is
then finalized using stale affinity information.

Example timeline:

  CPU0                                      CPU1
  ----                                      ----
                                            task_rq_lock(p)
  if (cpumask_test_cpu(cpu, p->cpus_ptr))
                                            set_cpus_allowed_scx(p, new_mask)
                                            task_rq_unlock(p)
      scx_bpf_dsq_insert(p,
              SCX_DSQ_LOCAL_ON | cpu, 0)

With commit ebf1ccff79c4 ("sched_ext: Fix ops.dequeue() semantics"), BPF
schedulers can avoid the affinity race by tracking task state and
handling %SCX_DEQ_SCHED_CHANGE in ops.dequeue(): when a task is dequeued
due to a property change, the scheduler can update the task state and
skip the direct dispatch from ops.dispatch() for non-queued tasks.

However, schedulers that do not implement task state tracking and
dispatch directly to a local DSQ from ops.dispatch() may trigger the
scx_error() condition when the kernel validates the destination in
dispatch_to_local_dsq().

Improve this by shooting down in-flight dispatches from the dequeue path
in the sched_ext core, instead of using the global DSQ as a fallback.
When a QUEUED task is dequeued, increment the runqueue's ops_qseq before
transitioning the task's ops_state to NONE. A finish_dispatch() that
runs after the transition sees NONE and drops the dispatch; one that
runs later, after the task has been re-enqueued (with the new qseq),
sees a qseq mismatch and also drops. Either way the stale dispatch is
discarded and the task is already, or will be, handled by the scheduler
again.

Since this change removes the global DSQ fallback, also drop
%SCX_ENQ_GDSQ_FALLBACK, which is now unused.

This allows reducing boilerplate in BPF schedulers for task state
tracking and simplifies their implementation.

Cc: Christian Loehle <christian.loehle@arm.com>
Cc: Kuba Piecuch <jpiecuch@google.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
Changes in v2:
 - Rework the patch based on the new ops.dequeue() semantics
 - Drop SCX_ENQ_GDSQ_FALLBACK
 - Link to v1: https://lore.kernel.org/all/20260203230639.1259869-1-arighi@nvidia.com

 kernel/sched/ext.c          | 55 +++++++++++++++++++++++++++++--------
 kernel/sched/ext_internal.h |  1 -
 2 files changed, 43 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 94548ee9ad858..8c199c548b27e 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1382,10 +1382,8 @@ static void dsq_inc_nr(struct scx_dispatch_q *dsq, struct task_struct *p, u64 en
 	 * e.g. SAVE/RESTORE cycles and slice extensions.
 	 */
 	if (enq_flags & SCX_ENQ_IMMED) {
-		if (unlikely(dsq->id != SCX_DSQ_LOCAL)) {
-			WARN_ON_ONCE(!(enq_flags & SCX_ENQ_GDSQ_FALLBACK));
+		if (unlikely(dsq->id != SCX_DSQ_LOCAL))
 			return;
-		}
 		p->scx.flags |= SCX_TASK_IMMED;
 	}
 
@@ -2043,6 +2041,13 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
 		 */
 		BUG();
 	case SCX_OPSS_QUEUED:
+		/*
+		 * Invalidate any in-flight dispatches for this task. The
+		 * task is leaving the runqueue, so any dispatch decision
+		 * made while it was queued is stale.
+		 */
+		rq->scx.ops_qseq++;
+
 		/* A queued task must always be in BPF scheduler's custody */
 		WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_IN_CUSTODY));
 		if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
@@ -2390,8 +2395,10 @@ static bool consume_remote_task(struct rq *this_rq,
  * will change. As @p's task_rq is locked, this function doesn't need to use the
  * holding_cpu mechanism.
  *
- * On return, @src_dsq is unlocked and only @p's new task_rq, which is the
- * return value, is locked.
+ * On success, @src_dsq is unlocked and only @p's new task_rq, which is the
+ * return value, is locked. On failure (affinity change invalidated the
+ * move), returns NULL with @src_dsq still locked and task remaining in
+ * @src_dsq.
  */
 static struct rq *move_task_between_dsqs(struct scx_sched *sch,
 					 struct task_struct *p, u64 enq_flags,
@@ -2408,9 +2415,11 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch,
 		dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
 		if (src_rq != dst_rq &&
 		    unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
-			dst_dsq = find_global_dsq(sch, task_cpu(p));
-			dst_rq = src_rq;
-			enq_flags |= SCX_ENQ_GDSQ_FALLBACK;
+			/*
+			 * Affinity changed after dispatch: abort the move,
+			 * task stays on src_dsq.
+			 */
+			return NULL;
 		}
 	} else {
 		/* no need to migrate if destination is a non-local DSQ */
@@ -2537,9 +2546,26 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
 	}
 
 	if (src_rq != dst_rq &&
-	    unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
-		dispatch_enqueue(sch, rq, find_global_dsq(sch, task_cpu(p)), p,
-				 enq_flags | SCX_ENQ_CLEAR_OPSS | SCX_ENQ_GDSQ_FALLBACK);
+	    unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, false))) {
+		/*
+		 * Affinity changed after dispatch decision and the task
+		 * can't run anymore on the destination rq.
+		 *
+		 * Drop the dispatch, the task will be re-enqueued. Set the
+		 * task back to QUEUED so dequeue (if waiting) can proceed
+		 * using current qseq from the task's rq.
+		 */
+		if (src_rq != rq) {
+			raw_spin_rq_unlock(rq);
+			raw_spin_rq_lock(src_rq);
+		}
+		atomic_long_set_release(&p->scx.ops_state,
+			       SCX_OPSS_QUEUED |
+			       (src_rq->scx.ops_qseq << SCX_OPSS_QSEQ_SHIFT));
+		if (src_rq != rq) {
+			raw_spin_rq_unlock(src_rq);
+			raw_spin_rq_lock(rq);
+		}
 		return;
 	}
 
@@ -8112,7 +8138,12 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
 
 	/* execute move */
 	locked_rq = move_task_between_dsqs(sch, p, enq_flags, src_dsq, dst_dsq);
-	dispatched = true;
+	if (locked_rq) {
+		dispatched = true;
+	} else {
+		raw_spin_unlock(&src_dsq->lock);
+		locked_rq = src_rq;
+	}
 out:
 	if (in_balance) {
 		if (this_rq != locked_rq) {
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index b4f36d8b9c1dd..49cef302b26bd 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1145,7 +1145,6 @@ enum scx_enq_flags {
 	SCX_ENQ_CLEAR_OPSS	= 1LLU << 56,
 	SCX_ENQ_DSQ_PRIQ	= 1LLU << 57,
 	SCX_ENQ_NESTED		= 1LLU << 58,
-	SCX_ENQ_GDSQ_FALLBACK	= 1LLU << 59,	/* fell back to global DSQ */
 };
 
 enum scx_deq_flags {
-- 
2.53.0


^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2026-03-23 23:13 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-03-19  8:35 [PATCH v2 sched_ext/for-7.1] sched_ext: Invalidate dispatch decisions on CPU affinity changes Andrea Righi
2026-03-19 10:31 ` Kuba Piecuch
2026-03-19 13:54   ` Kuba Piecuch
2026-03-19 21:09   ` Andrea Righi
2026-03-20  9:18     ` Kuba Piecuch
2026-03-23 23:13       ` Tejun Heo
2026-03-19 15:18 ` Kuba Piecuch
2026-03-19 19:01   ` Andrea Righi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox