All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] sched_ext: Add cookie API for early qseq capture
@ 2026-05-06  7:59 Cheng-Yang Chou
  2026-05-06 10:08 ` Kuba Piecuch
  2026-05-06 10:58 ` sashiko-bot
  0 siblings, 2 replies; 4+ messages in thread
From: Cheng-Yang Chou @ 2026-05-06  7:59 UTC (permalink / raw)
  To: sched-ext, Tejun Heo, David Vernet, Andrea Righi, Changwoo Min
  Cc: Ching-Chun Huang, Chia-Ping Tsai, yphbchou0911, Kuba Piecuch

scx_bpf_dsq_insert() captures qseq at insert time. Any BPF-side
pre-dispatch checks (e.g. CPU affinity validation) performed before
the insert are outside the qseq protection window: a dequeue/re-enqueue
race occurring between the check and the insert goes undetected by
finish_dispatch(), which sees a matching qseq and proceeds with stale
assumptions.

Introduce two new kfuncs to extend the qseq protection window:

- scx_bpf_task_get_cookie(p)
    Reads @p's current qseq from ops_state and returns it as an opaque
    u64 cookie. The BPF scheduler should call this before performing
    pre-dispatch validity checks. The cookie may be stored in BPF maps
    to support cross-CPU dispatch patterns.

- scx_bpf_dsq_insert_with_cookie(p, dsq_id, enq_flags, cookie)
    Like scx_bpf_dsq_insert() with slice=0, but uses the cookie's qseq
    instead of re-reading ops_state at insert time. If @p was dequeued
    and re-enqueued between get_cookie() and here, qseq will have
    changed and finish_dispatch() will silently discard the stale
    dispatch. Use scx_bpf_task_set_slice() to set a non-default slice.

To support explicit qseq passing, refactor scx_dsq_insert_commit() to
take qseq as a parameter; all existing callers capture ops_state at
their call site, preserving the original behavior.

This mechanism is intended for schedulers that do not implement
properly synchronized dequeue. A scheduler whose ops.dequeue()
synchronizes atomically with the dispatch path does not need cookies.

Suggested-by: Tejun Heo <tj@kernel.org>
Suggested-by: Kuba Piecuch <jpiecuch@google.com>
Suggested-by: Andrea Righi <arighi@nvidia.com>
Reported-by: Andrea Righi <arighi@nvidia.com>
Link: https://lore.kernel.org/r/20260203230639.1259869-1-arighi@nvidia.com/
Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
---
As discussed in [1].
[1]: https://lore.kernel.org/r/20260319083518.94673-1-arighi@nvidia.com/

 kernel/sched/ext.c | 65 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 61 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index c80f0efd42c0..49577cca3104 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -8303,7 +8303,7 @@ static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p
 }
 
 static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p,
-				  u64 dsq_id, u64 enq_flags)
+				  u64 dsq_id, u64 enq_flags, unsigned long qseq)
 {
 	struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
 	struct task_struct *ddsp_task;
@@ -8321,7 +8321,7 @@ static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p,
 
 	dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){
 		.task = p,
-		.qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK,
+		.qseq = qseq,
 		.dsq_id = dsq_id,
 		.enq_flags = enq_flags,
 	};
@@ -8388,7 +8388,8 @@ __bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id,
 	else
 		p->scx.slice = p->scx.slice ?: 1;
 
-	scx_dsq_insert_commit(sch, p, dsq_id, enq_flags);
+	scx_dsq_insert_commit(sch, p, dsq_id, enq_flags,
+			      atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK);
 
 	return true;
 }
@@ -8416,7 +8417,8 @@ static bool scx_dsq_insert_vtime(struct scx_sched *sch, struct task_struct *p,
 
 	p->scx.dsq_vtime = vtime;
 
-	scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+	scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ,
+			      atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK);
 
 	return true;
 }
@@ -8505,13 +8507,67 @@ __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
 	scx_dsq_insert_vtime(sch, p, dsq_id, slice, vtime, enq_flags);
 }
 
+/**
+ * scx_bpf_task_get_cookie - Get an opaque dispatch cookie for a task
+ * @p: task_struct to read cookie from
+ *
+ * Returns an opaque u64 cookie encoding @p's current qseq. Call this
+ * before pre-dispatch validity checks and pass the result to
+ * scx_bpf_dsq_insert_with_cookie() to extend the qseq protection window.
+ *
+ * Only for schedulers that do not implement properly synchronized dequeue.
+ */
+__bpf_kfunc u64 scx_bpf_task_get_cookie(struct task_struct *p)
+{
+	return atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK;
+}
+
+/**
+ * scx_bpf_dsq_insert_with_cookie - Insert a task using an early-captured cookie
+ * @p: task_struct to insert
+ * @dsq_id: DSQ to insert into
+ * @enq_flags: SCX_ENQ_*
+ * @cookie: cookie from scx_bpf_task_get_cookie()
+ * @aux: implicit BPF argument
+ *
+ * Like scx_bpf_dsq_insert() with slice=0, but uses @cookie's qseq instead
+ * of re-reading ops_state at insert time. A stale cookie causes
+ * finish_dispatch() to silently discard the dispatch. Use
+ * scx_bpf_task_set_slice() to set a non-default slice.
+ *
+ * Returns %true on success, %false on failure.
+ */
+__bpf_kfunc bool scx_bpf_dsq_insert_with_cookie(struct task_struct *p,
+						 u64 dsq_id, u64 enq_flags,
+						 u64 cookie,
+						 const struct bpf_prog_aux *aux)
+{
+	struct scx_sched *sch;
+
+	guard(rcu)();
+	sch = scx_prog_sched(aux);
+	if (unlikely(!sch))
+		return false;
+
+	if (!scx_dsq_insert_preamble(sch, p, dsq_id, &enq_flags))
+		return false;
+
+	p->scx.slice = p->scx.slice ?: 1;
+
+	scx_dsq_insert_commit(sch, p, dsq_id, enq_flags, (unsigned long)cookie);
+
+	return true;
+}
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch)
 BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_IMPLICIT_ARGS | KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_insert___v2, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_insert_with_cookie, KF_IMPLICIT_ARGS | KF_RCU)
 BTF_ID_FLAGS(func, __scx_bpf_dsq_insert_vtime, KF_IMPLICIT_ARGS | KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_task_get_cookie, KF_RCU)
 BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch)
 
 static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
@@ -10181,6 +10237,7 @@ BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
 BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_task_cid, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_task_get_cookie, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_IMPLICIT_ARGS | KF_RET_NULL)
 BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED)
-- 
2.48.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2026-05-06 12:39 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2026-05-06  7:59 [PATCH] sched_ext: Add cookie API for early qseq capture Cheng-Yang Chou
2026-05-06 10:08 ` Kuba Piecuch
2026-05-06 12:39   ` Cheng-Yang Chou
2026-05-06 10:58 ` sashiko-bot

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.