[PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-01-21 12:25 [PATCHSET v2 sched_ext/for-6.20] sched_ext: Fix " Andrea Righi
@ 2026-01-21 12:25 ` Andrea Righi
  0 siblings, 0 replies; 33+ messages in thread
From: Andrea Righi @ 2026-01-21 12:25 UTC (permalink / raw)
  To: Tejun Heo, David Vernet, Changwoo Min
  Cc: Emil Tsalapatis, Daniel Hodges, sched-ext, linux-kernel

Add a new kselftest to validate that:
 - every enqueue is followed by proper dequeue,
 - dispatch dequeues happen exactly once per enqueue,
 - async dequeues (SCX_DEQ_ASYNC) only happen for property changes,
 - no duplicate enqueues without proper dequeue.

Test scenarios:
 - direct dispatch to local DSQ (SCX_DSQ_LOCAL),
 - dispatch to user DSQ,
 - explicit property changes via sched_setaffinity().

This validates that the new ops.dequeue() semantics work correctly for
all task lifecycle scenarios.

Cc: Tejun Heo <tj@kernel.org>
Cc: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
 tools/testing/selftests/sched_ext/Makefile    |   1 +
 .../testing/selftests/sched_ext/dequeue.bpf.c | 209 ++++++++++++++++++
 tools/testing/selftests/sched_ext/dequeue.c   | 182 +++++++++++++++
 3 files changed, 392 insertions(+)
 create mode 100644 tools/testing/selftests/sched_ext/dequeue.bpf.c
 create mode 100644 tools/testing/selftests/sched_ext/dequeue.c

diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
index 2c601a7eaff5f..2815a875bde2f 100644
--- a/tools/testing/selftests/sched_ext/Makefile
+++ b/tools/testing/selftests/sched_ext/Makefile
@@ -161,6 +161,7 @@ all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubs
 
 auto-test-targets :=			\
 	create_dsq			\
+	dequeue				\
 	enq_last_no_enq_fails		\
 	ddsp_bogus_dsq_fail		\
 	ddsp_vtimelocal_fail		\
diff --git a/tools/testing/selftests/sched_ext/dequeue.bpf.c b/tools/testing/selftests/sched_ext/dequeue.bpf.c
new file mode 100644
index 0000000000000..f7c4643a5e8de
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/dequeue.bpf.c
@@ -0,0 +1,209 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A scheduler that validates ops.dequeue() is called correctly:
+ * - For tasks on BPF data structures (not yet dispatched)
+ * - For tasks already on DSQs (local or shared)
+ * - That every ops.enqueue() is followed by ops.dequeue()
+ *
+ * Copyright (c) 2025 NVIDIA Corporation.
+ */
+
+#include <scx/common.bpf.h>
+
+#define SHARED_DSQ	0
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+/*
+ * Counters to track the lifecycle of tasks:
+ * - enqueue_cnt: Number of times ops.enqueue() was called
+ * - dequeue_cnt: Number of times ops.dequeue() was called (any type)
+ * - dispatch_dequeue_cnt: Number of regular dispatch dequeues (no flag)
+ * - async_dequeue_cnt: Number of async dequeues (SCX_DEQ_ASYNC)
+ */
+u64 enqueue_cnt, dequeue_cnt, dispatch_dequeue_cnt, async_dequeue_cnt;
+
+/*
+ * Test scenarios:
+ * - 0: Dispatch to local DSQ
+ * - 1: Dispatch to shared DSQ
+ */
+u32 test_scenario;
+
+/*
+ * Per-task state to track lifecycle and validate workflow semantics.
+ * State transitions:
+ *   NONE -> ENQUEUED (on enqueue)
+ *   ENQUEUED -> DISPATCHED (on dispatch dequeue)
+ *   DISPATCHED -> NONE (on async dequeue or re-enqueue)
+ *   ENQUEUED -> NONE (on async dequeue before dispatch)
+ */
+enum task_state {
+	TASK_NONE = 0,      /* Task is outside scheduler control */
+	TASK_ENQUEUED,      /* ops.enqueue() called, waiting for dequeue */
+	TASK_DISPATCHED,    /* Dispatch dequeue received, can get async or re-enqueue */
+};
+
+struct task_ctx {
+	enum task_state state; /* Current state in the workflow */
+	u64 enqueue_seq;       /* Sequence number for debugging */
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct task_ctx);
+} task_ctx_stor SEC(".maps");
+
+static struct task_ctx *try_lookup_task_ctx(struct task_struct *p)
+{
+	return bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+}
+
+s32 BPF_STRUCT_OPS(dequeue_select_cpu, struct task_struct *p,
+		   s32 prev_cpu, u64 wake_flags)
+{
+	/* Always bounce to ops.enqueue() */
+	return prev_cpu;
+}
+
+void BPF_STRUCT_OPS(dequeue_enqueue, struct task_struct *p, u64 enq_flags)
+{
+	struct task_ctx *tctx;
+
+	__sync_fetch_and_add(&enqueue_cnt, 1);
+
+	tctx = try_lookup_task_ctx(p);
+	if (!tctx)
+		return;
+
+	/*
+	 * Validate state transition: enqueue is only valid from NONE or
+	 * DISPATCHED states. Getting enqueue while in ENQUEUED state
+	 * indicates a missing dequeue.
+	 */
+	if (tctx->state == TASK_ENQUEUED)
+		scx_bpf_error("%d (%s): enqueue while in ENQUEUED state (seq %llu)",
+			      p->pid, p->comm, tctx->enqueue_seq);
+
+	/* Transition to ENQUEUED state */
+	tctx->state = TASK_ENQUEUED;
+	tctx->enqueue_seq++;
+
+	switch (test_scenario) {
+	case 0:
+		/* Scenario 0: Direct dispatch to the local DSQ */
+		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
+		break;
+
+	case 1:
+		/* Scenario 1: Dispatch to shared DSQ */
+		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
+		break;
+	}
+}
+
+void BPF_STRUCT_OPS(dequeue_dequeue, struct task_struct *p, u64 deq_flags)
+{
+	struct task_ctx *tctx;
+
+	__sync_fetch_and_add(&dequeue_cnt, 1);
+
+	tctx = try_lookup_task_ctx(p);
+	if (!tctx)
+		return;
+
+	/*
+	 * Validate state: dequeue should only happen from ENQUEUED or
+	 * DISPATCHED states. Getting dequeue from NONE indicates a bug.
+	 */
+	if (tctx->state == TASK_NONE)
+		scx_bpf_error("%d (%s): dequeue from NONE state (seq %llu)",
+			      p->pid, p->comm, tctx->enqueue_seq);
+
+	if (deq_flags & SCX_DEQ_ASYNC) {
+		/*
+		 * Async dequeue: property change interrupting the workflow.
+		 * Valid from both ENQUEUED and DISPATCHED states.
+		 * Transitions task back to NONE state.
+		 */
+		__sync_fetch_and_add(&async_dequeue_cnt, 1);
+
+		/* Validate state transition */
+		if (tctx->state != TASK_ENQUEUED && tctx->state != TASK_DISPATCHED)
+			scx_bpf_error("%d (%s): async dequeue from invalid state %d (seq %llu)",
+				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);
+
+		/* Transition back to NONE - task outside scheduler control */
+		tctx->state = TASK_NONE;
+	} else {
+		/*
+		 * Regular dispatch dequeue: normal workflow step.
+		 * Valid only from ENQUEUED state (after enqueue, before dispatch dequeue).
+		 * Transitions to DISPATCHED state.
+		 */
+		__sync_fetch_and_add(&dispatch_dequeue_cnt, 1);
+
+		/* Validate: dispatch dequeue should NOT have SCX_DEQ_ASYNC flag */
+		if (deq_flags & SCX_DEQ_ASYNC)
+			scx_bpf_error("%d (%s): SCX_DEQ_ASYNC in dispatch dequeue (seq %llu)",
+				      p->pid, p->comm, tctx->enqueue_seq);
+
+		/* Must be in ENQUEUED state */
+		if (tctx->state != TASK_ENQUEUED)
+			scx_bpf_error("%d (%s): dispatch dequeue from state %d (seq %llu)",
+				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);
+
+		/* Transition to DISPATCHED - normal cycle completed dispatch */
+		tctx->state = TASK_DISPATCHED;
+	}
+}
+
+void BPF_STRUCT_OPS(dequeue_dispatch, s32 cpu, struct task_struct *prev)
+{
+	scx_bpf_dsq_move_to_local(SHARED_DSQ);
+}
+
+s32 BPF_STRUCT_OPS(dequeue_init_task, struct task_struct *p,
+		   struct scx_init_task_args *args)
+{
+	struct task_ctx *tctx;
+
+	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0,
+				   BPF_LOCAL_STORAGE_GET_F_CREATE);
+	if (!tctx)
+		return -ENOMEM;
+
+	return 0;
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(dequeue_init)
+{
+	s32 ret;
+
+	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+void BPF_STRUCT_OPS(dequeue_exit, struct scx_exit_info *ei)
+{
+	UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops dequeue_ops = {
+	.select_cpu		= (void *)dequeue_select_cpu,
+	.enqueue		= (void *)dequeue_enqueue,
+	.dequeue		= (void *)dequeue_dequeue,
+	.dispatch		= (void *)dequeue_dispatch,
+	.init_task		= (void *)dequeue_init_task,
+	.init			= (void *)dequeue_init,
+	.exit			= (void *)dequeue_exit,
+	.name			= "dequeue_test",
+};
diff --git a/tools/testing/selftests/sched_ext/dequeue.c b/tools/testing/selftests/sched_ext/dequeue.c
new file mode 100644
index 0000000000000..07de94957d366
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/dequeue.c
@@ -0,0 +1,182 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2025 NVIDIA Corporation.
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <unistd.h>
+#include <signal.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <sched.h>
+#include <pthread.h>
+#include "scx_test.h"
+#include "dequeue.bpf.skel.h"
+
+#define NUM_WORKERS 8
+
+/*
+ * Worker function that creates enqueue/dequeue events. It alternates
+ * between CPU work, sleeping, and affinity changes to trigger dequeues.
+ */
+static void worker_fn(int id)
+{
+	cpu_set_t cpuset;
+	int i;
+	volatile int sum = 0;
+
+	for (i = 0; i < 1000; i++) {
+		int j;
+
+		/* Do some work to trigger scheduling events */
+		for (j = 0; j < 10000; j++)
+			sum += j;
+
+		/* Change affinity to trigger dequeue */
+		if (i % 10 == 0) {
+			CPU_ZERO(&cpuset);
+			/* Rotate through the first 4 CPUs */
+			CPU_SET(i % 4, &cpuset);
+			sched_setaffinity(0, sizeof(cpuset), &cpuset);
+		}
+
+		/* Do additional work */
+		for (j = 0; j < 10000; j++)
+			sum += j;
+
+		/* Sleep to trigger dequeue */
+		usleep(1000 + (id * 100));
+	}
+
+	exit(0);
+}
+
+static enum scx_test_status run_scenario(struct dequeue *skel, u32 scenario,
+					 const char *scenario_name)
+{
+	struct bpf_link *link;
+	pid_t pids[NUM_WORKERS];
+	int i, status;
+	u64 enq_start, deq_start, dispatch_deq_start, async_deq_start;
+	u64 enq_delta, deq_delta, dispatch_deq_delta, async_deq_delta;
+
+	/* Set the test scenario */
+	skel->bss->test_scenario = scenario;
+
+	/* Record starting counts */
+	enq_start = skel->bss->enqueue_cnt;
+	deq_start = skel->bss->dequeue_cnt;
+	dispatch_deq_start = skel->bss->dispatch_dequeue_cnt;
+	async_deq_start = skel->bss->async_dequeue_cnt;
+
+	link = bpf_map__attach_struct_ops(skel->maps.dequeue_ops);
+	SCX_FAIL_IF(!link, "Failed to attach struct_ops for scenario %s", scenario_name);
+
+	/* Fork worker processes to generate enqueue/dequeue events */
+	for (i = 0; i < NUM_WORKERS; i++) {
+		pids[i] = fork();
+		SCX_FAIL_IF(pids[i] < 0, "Failed to fork worker %d", i);
+
+		if (pids[i] == 0) {
+			worker_fn(i);
+			/* Should not reach here */
+			exit(1);
+		}
+	}
+
+	/* Wait for all workers to complete */
+	for (i = 0; i < NUM_WORKERS; i++) {
+		SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i],
+			    "Failed to wait for worker %d", i);
+		SCX_FAIL_IF(status != 0, "Worker %d exited with status %d", i, status);
+	}
+
+	bpf_link__destroy(link);
+
+	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG));
+
+	/* Calculate deltas */
+	enq_delta = skel->bss->enqueue_cnt - enq_start;
+	deq_delta = skel->bss->dequeue_cnt - deq_start;
+	dispatch_deq_delta = skel->bss->dispatch_dequeue_cnt - dispatch_deq_start;
+	async_deq_delta = skel->bss->async_dequeue_cnt - async_deq_start;
+
+	printf("%s:\n", scenario_name);
+	printf("  enqueues: %lu\n", (unsigned long)enq_delta);
+	printf("  dequeues: %lu (dispatch: %lu, async: %lu)\n",
+	       (unsigned long)deq_delta,
+	       (unsigned long)dispatch_deq_delta,
+	       (unsigned long)async_deq_delta);
+
+	/*
+	 * Validate that we got enqueue and dequeue events.
+	 * The BPF code does strict state machine validation with scx_bpf_error()
+	 * to ensure the workflow semantics are correct. If we reach here without
+	 * errors, the semantics are validated correctly.
+	 */
+	SCX_GT(enq_delta, 0);
+	SCX_GT(deq_delta, 0);
+
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status setup(void **ctx)
+{
+	struct dequeue *skel;
+
+	skel = dequeue__open();
+	SCX_FAIL_IF(!skel, "Failed to open skel");
+	SCX_ENUM_INIT(skel);
+	SCX_FAIL_IF(dequeue__load(skel), "Failed to load skel");
+
+	*ctx = skel;
+
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+	struct dequeue *skel = ctx;
+	enum scx_test_status status;
+
+	status = run_scenario(skel, 0, "Local DSQ");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 1, "User DSQ");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	printf("\n=== Summary ===\n");
+	printf("Total enqueues: %lu\n", (unsigned long)skel->bss->enqueue_cnt);
+	printf("Total dequeues: %lu\n", (unsigned long)skel->bss->dequeue_cnt);
+	printf("  Dispatch dequeues: %lu (no flag, normal workflow)\n",
+	       (unsigned long)skel->bss->dispatch_dequeue_cnt);
+	printf("  Async dequeues: %lu (SCX_DEQ_ASYNC flag, property changes)\n",
+	       (unsigned long)skel->bss->async_dequeue_cnt);
+	printf("\nAll scenarios passed - no state machine violations detected\n");
+	printf("-> Validated: Correct state transitions (NONE -> ENQUEUED -> DISPATCHED)\n");
+	printf("-> Validated: Dispatch dequeues have no flags (normal workflow)\n");
+	printf("-> Validated: Async dequeues have SCX_DEQ_ASYNC flag (interruptions)\n");
+	printf("-> Validated: No duplicate enqueues or invalid state transitions\n");
+
+	return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+	struct dequeue *skel = ctx;
+
+	dequeue__destroy(skel);
+}
+
+struct scx_test dequeue_test = {
+	.name = "dequeue",
+	.description = "Verify that ops.enqueue() is balanced with ops.dequeue()",
+	.setup = setup,
+	.run = run,
+	.cleanup = cleanup,
+};
+
+REGISTER_SCX_TEST(&dequeue_test)
-- 
2.52.0


^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-01-26  8:41 [PATCHSET v3 sched_ext/for-6.20] sched_ext: Fix " Andrea Righi
@ 2026-01-26  8:41 ` Andrea Righi
  2026-01-27 16:53   ` Emil Tsalapatis
  0 siblings, 1 reply; 33+ messages in thread
From: Andrea Righi @ 2026-01-26  8:41 UTC (permalink / raw)
  To: Tejun Heo, David Vernet, Changwoo Min
  Cc: Kuba Piecuch, Christian Loehle, Daniel Hodges, sched-ext,
	linux-kernel, Emil Tsalapatis

Add a new kselftest to validate that:
 - every enqueue is followed by proper dequeue,
 - dispatch dequeues happen exactly once per enqueue,
 - property change dequeues (%SCX_DEQ_SCHED_CHANGE) only happen for
   property changes,
 - no duplicate enqueues without proper dequeue.

Test scenarios:
 - direct dispatch to local DSQ (%SCX_DSQ_LOCAL),
 - dispatch to user DSQ,
 - explicit property changes via sched_setaffinity().

This validates that the new ops.dequeue() semantics work correctly for
all task lifecycle scenarios.

Cc: Tejun Heo <tj@kernel.org>
Cc: Emil Tsalapatis <emil@etsalapatis.com>
Cc: Kuba Piecuch <jpiecuch@google.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
 tools/testing/selftests/sched_ext/Makefile    |   1 +
 .../testing/selftests/sched_ext/dequeue.bpf.c | 209 ++++++++++++++++++
 tools/testing/selftests/sched_ext/dequeue.c   | 182 +++++++++++++++
 3 files changed, 392 insertions(+)
 create mode 100644 tools/testing/selftests/sched_ext/dequeue.bpf.c
 create mode 100644 tools/testing/selftests/sched_ext/dequeue.c

diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
index 5fe45f9c5f8fd..764e91edabf93 100644
--- a/tools/testing/selftests/sched_ext/Makefile
+++ b/tools/testing/selftests/sched_ext/Makefile
@@ -161,6 +161,7 @@ all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubs
 
 auto-test-targets :=			\
 	create_dsq			\
+	dequeue				\
 	enq_last_no_enq_fails		\
 	ddsp_bogus_dsq_fail		\
 	ddsp_vtimelocal_fail		\
diff --git a/tools/testing/selftests/sched_ext/dequeue.bpf.c b/tools/testing/selftests/sched_ext/dequeue.bpf.c
new file mode 100644
index 0000000000000..8b2f792cf7d8b
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/dequeue.bpf.c
@@ -0,0 +1,209 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A scheduler that validates ops.dequeue() is called correctly:
+ * - For tasks on BPF data structures (not yet dispatched)
+ * - For tasks already on DSQs (local or shared)
+ * - That every ops.enqueue() is followed by ops.dequeue()
+ *
+ * Copyright (c) 2025 NVIDIA Corporation.
+ */
+
+#include <scx/common.bpf.h>
+
+#define SHARED_DSQ	0
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+/*
+ * Counters to track the lifecycle of tasks:
+ * - enqueue_cnt: Number of times ops.enqueue() was called
+ * - dequeue_cnt: Number of times ops.dequeue() was called (any type)
+ * - dispatch_dequeue_cnt: Number of regular dispatch dequeues (no flag)
+ * - change_dequeue_cnt: Number of property change dequeues (%SCX_DEQ_SCHED_CHANGE)
+ */
+u64 enqueue_cnt, dequeue_cnt, dispatch_dequeue_cnt, change_dequeue_cnt;
+
+/*
+ * Test scenarios:
+ * - 0: Dispatch to local DSQ
+ * - 1: Dispatch to shared DSQ
+ */
+u32 test_scenario;
+
+/*
+ * Per-task state to track lifecycle and validate workflow semantics.
+ * State transitions:
+ *   NONE -> ENQUEUED (on enqueue)
+ *   ENQUEUED -> DISPATCHED (on dispatch dequeue)
+ *   DISPATCHED -> NONE (on property change dequeue or re-enqueue)
+ *   ENQUEUED -> NONE (on property change dequeue before dispatch)
+ */
+enum task_state {
+	TASK_NONE = 0,      /* Task is outside scheduler control */
+	TASK_ENQUEUED,      /* ops.enqueue() called, waiting for dequeue */
+	TASK_DISPATCHED,    /* Dispatch dequeue received, can get property change or re-enqueue */
+};
+
+struct task_ctx {
+	enum task_state state; /* Current state in the workflow */
+	u64 enqueue_seq;       /* Sequence number for debugging */
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct task_ctx);
+} task_ctx_stor SEC(".maps");
+
+static struct task_ctx *try_lookup_task_ctx(struct task_struct *p)
+{
+	return bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+}
+
+s32 BPF_STRUCT_OPS(dequeue_select_cpu, struct task_struct *p,
+		   s32 prev_cpu, u64 wake_flags)
+{
+	/* Always bounce to ops.enqueue() */
+	return prev_cpu;
+}
+
+void BPF_STRUCT_OPS(dequeue_enqueue, struct task_struct *p, u64 enq_flags)
+{
+	struct task_ctx *tctx;
+
+	__sync_fetch_and_add(&enqueue_cnt, 1);
+
+	tctx = try_lookup_task_ctx(p);
+	if (!tctx)
+		return;
+
+	/*
+	 * Validate state transition: enqueue is only valid from NONE or
+	 * DISPATCHED states. Getting enqueue while in ENQUEUED state
+	 * indicates a missing dequeue.
+	 */
+	if (tctx->state == TASK_ENQUEUED)
+		scx_bpf_error("%d (%s): enqueue while in ENQUEUED state (seq %llu)",
+			      p->pid, p->comm, tctx->enqueue_seq);
+
+	/* Transition to ENQUEUED state */
+	tctx->state = TASK_ENQUEUED;
+	tctx->enqueue_seq++;
+
+	switch (test_scenario) {
+	case 0:
+		/* Scenario 0: Direct dispatch to the local DSQ */
+		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
+		break;
+
+	case 1:
+		/* Scenario 1: Dispatch to shared DSQ */
+		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
+		break;
+	}
+}
+
+void BPF_STRUCT_OPS(dequeue_dequeue, struct task_struct *p, u64 deq_flags)
+{
+	struct task_ctx *tctx;
+
+	__sync_fetch_and_add(&dequeue_cnt, 1);
+
+	tctx = try_lookup_task_ctx(p);
+	if (!tctx)
+		return;
+
+	/*
+	 * Validate state: dequeue should only happen from ENQUEUED or
+	 * DISPATCHED states. Getting dequeue from NONE indicates a bug.
+	 */
+	if (tctx->state == TASK_NONE)
+		scx_bpf_error("%d (%s): dequeue from NONE state (seq %llu)",
+			      p->pid, p->comm, tctx->enqueue_seq);
+
+	if (deq_flags & SCX_DEQ_SCHED_CHANGE) {
+		/*
+		 * Async dequeue: property change interrupting the workflow.
+		 * Valid from both ENQUEUED and DISPATCHED states.
+		 * Transitions task back to NONE state.
+		 */
+		__sync_fetch_and_add(&change_dequeue_cnt, 1);
+
+		/* Validate state transition */
+		if (tctx->state != TASK_ENQUEUED && tctx->state != TASK_DISPATCHED)
+			scx_bpf_error("%d (%s): property change dequeue from invalid state %d (seq %llu)",
+				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);
+
+		/* Transition back to NONE - task outside scheduler control */
+		tctx->state = TASK_NONE;
+	} else {
+		/*
+		 * Regular dispatch dequeue: normal workflow step.
+		 * Valid only from ENQUEUED state (after enqueue, before dispatch dequeue).
+		 * Transitions to DISPATCHED state.
+		 */
+		__sync_fetch_and_add(&dispatch_dequeue_cnt, 1);
+
+		/* Validate: dispatch dequeue should NOT have %SCX_DEQ_SCHED_CHANGE flag */
+		if (deq_flags & SCX_DEQ_SCHED_CHANGE)
+			scx_bpf_error("%d (%s): SCX_DEQ_SCHED_CHANGE in dispatch dequeue (seq %llu)",
+				      p->pid, p->comm, tctx->enqueue_seq);
+
+		/* Must be in ENQUEUED state */
+		if (tctx->state != TASK_ENQUEUED)
+			scx_bpf_error("%d (%s): dispatch dequeue from state %d (seq %llu)",
+				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);
+
+		/* Transition to DISPATCHED - normal cycle completed dispatch */
+		tctx->state = TASK_DISPATCHED;
+	}
+}
+
+void BPF_STRUCT_OPS(dequeue_dispatch, s32 cpu, struct task_struct *prev)
+{
+	scx_bpf_dsq_move_to_local(SHARED_DSQ);
+}
+
+s32 BPF_STRUCT_OPS(dequeue_init_task, struct task_struct *p,
+		   struct scx_init_task_args *args)
+{
+	struct task_ctx *tctx;
+
+	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0,
+				   BPF_LOCAL_STORAGE_GET_F_CREATE);
+	if (!tctx)
+		return -ENOMEM;
+
+	return 0;
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(dequeue_init)
+{
+	s32 ret;
+
+	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+void BPF_STRUCT_OPS(dequeue_exit, struct scx_exit_info *ei)
+{
+	UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops dequeue_ops = {
+	.select_cpu		= (void *)dequeue_select_cpu,
+	.enqueue		= (void *)dequeue_enqueue,
+	.dequeue		= (void *)dequeue_dequeue,
+	.dispatch		= (void *)dequeue_dispatch,
+	.init_task		= (void *)dequeue_init_task,
+	.init			= (void *)dequeue_init,
+	.exit			= (void *)dequeue_exit,
+	.name			= "dequeue_test",
+};
diff --git a/tools/testing/selftests/sched_ext/dequeue.c b/tools/testing/selftests/sched_ext/dequeue.c
new file mode 100644
index 0000000000000..6861257d79b47
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/dequeue.c
@@ -0,0 +1,182 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2025 NVIDIA Corporation.
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <unistd.h>
+#include <signal.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <sched.h>
+#include <pthread.h>
+#include "scx_test.h"
+#include "dequeue.bpf.skel.h"
+
+#define NUM_WORKERS 8
+
+/*
+ * Worker function that creates enqueue/dequeue events. It alternates
+ * between CPU work, sleeping, and affinity changes to trigger dequeues.
+ */
+static void worker_fn(int id)
+{
+	cpu_set_t cpuset;
+	int i;
+	volatile int sum = 0;
+
+	for (i = 0; i < 1000; i++) {
+		int j;
+
+		/* Do some work to trigger scheduling events */
+		for (j = 0; j < 10000; j++)
+			sum += j;
+
+		/* Change affinity to trigger dequeue */
+		if (i % 10 == 0) {
+			CPU_ZERO(&cpuset);
+			/* Rotate through the first 4 CPUs */
+			CPU_SET(i % 4, &cpuset);
+			sched_setaffinity(0, sizeof(cpuset), &cpuset);
+		}
+
+		/* Do additional work */
+		for (j = 0; j < 10000; j++)
+			sum += j;
+
+		/* Sleep to trigger dequeue */
+		usleep(1000 + (id * 100));
+	}
+
+	exit(0);
+}
+
+static enum scx_test_status run_scenario(struct dequeue *skel, u32 scenario,
+					 const char *scenario_name)
+{
+	struct bpf_link *link;
+	pid_t pids[NUM_WORKERS];
+	int i, status;
+	u64 enq_start, deq_start, dispatch_deq_start, change_deq_start;
+	u64 enq_delta, deq_delta, dispatch_deq_delta, change_deq_delta;
+
+	/* Set the test scenario */
+	skel->bss->test_scenario = scenario;
+
+	/* Record starting counts */
+	enq_start = skel->bss->enqueue_cnt;
+	deq_start = skel->bss->dequeue_cnt;
+	dispatch_deq_start = skel->bss->dispatch_dequeue_cnt;
+	change_deq_start = skel->bss->change_dequeue_cnt;
+
+	link = bpf_map__attach_struct_ops(skel->maps.dequeue_ops);
+	SCX_FAIL_IF(!link, "Failed to attach struct_ops for scenario %s", scenario_name);
+
+	/* Fork worker processes to generate enqueue/dequeue events */
+	for (i = 0; i < NUM_WORKERS; i++) {
+		pids[i] = fork();
+		SCX_FAIL_IF(pids[i] < 0, "Failed to fork worker %d", i);
+
+		if (pids[i] == 0) {
+			worker_fn(i);
+			/* Should not reach here */
+			exit(1);
+		}
+	}
+
+	/* Wait for all workers to complete */
+	for (i = 0; i < NUM_WORKERS; i++) {
+		SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i],
+			    "Failed to wait for worker %d", i);
+		SCX_FAIL_IF(status != 0, "Worker %d exited with status %d", i, status);
+	}
+
+	bpf_link__destroy(link);
+
+	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG));
+
+	/* Calculate deltas */
+	enq_delta = skel->bss->enqueue_cnt - enq_start;
+	deq_delta = skel->bss->dequeue_cnt - deq_start;
+	dispatch_deq_delta = skel->bss->dispatch_dequeue_cnt - dispatch_deq_start;
+	change_deq_delta = skel->bss->change_dequeue_cnt - change_deq_start;
+
+	printf("%s:\n", scenario_name);
+	printf("  enqueues: %lu\n", (unsigned long)enq_delta);
+	printf("  dequeues: %lu (dispatch: %lu, property_change: %lu)\n",
+	       (unsigned long)deq_delta,
+	       (unsigned long)dispatch_deq_delta,
+	       (unsigned long)change_deq_delta);
+
+	/*
+	 * Validate that we got enqueue and dequeue events.
+	 * The BPF code does strict state machine validation with scx_bpf_error()
+	 * to ensure the workflow semantics are correct. If we reach here without
+	 * errors, the semantics are validated correctly.
+	 */
+	SCX_GT(enq_delta, 0);
+	SCX_GT(deq_delta, 0);
+
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status setup(void **ctx)
+{
+	struct dequeue *skel;
+
+	skel = dequeue__open();
+	SCX_FAIL_IF(!skel, "Failed to open skel");
+	SCX_ENUM_INIT(skel);
+	SCX_FAIL_IF(dequeue__load(skel), "Failed to load skel");
+
+	*ctx = skel;
+
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+	struct dequeue *skel = ctx;
+	enum scx_test_status status;
+
+	status = run_scenario(skel, 0, "Local DSQ");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 1, "User DSQ");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	printf("\n=== Summary ===\n");
+	printf("Total enqueues: %lu\n", (unsigned long)skel->bss->enqueue_cnt);
+	printf("Total dequeues: %lu\n", (unsigned long)skel->bss->dequeue_cnt);
+	printf("  Dispatch dequeues: %lu (no flag, normal workflow)\n",
+	       (unsigned long)skel->bss->dispatch_dequeue_cnt);
+	printf("  Property change dequeues: %lu (SCX_DEQ_SCHED_CHANGE flag)\n",
+	       (unsigned long)skel->bss->change_dequeue_cnt);
+	printf("\nAll scenarios passed - no state machine violations detected\n");
+	printf("-> Validated: Correct state transitions (NONE -> ENQUEUED -> DISPATCHED)\n");
+	printf("-> Validated: Dispatch dequeues have no flags (normal workflow)\n");
+	printf("-> Validated: Async dequeues have SCX_DEQ_SCHED_CHANGE flag (interruptions)\n");
+	printf("-> Validated: No duplicate enqueues or invalid state transitions\n");
+
+	return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+	struct dequeue *skel = ctx;
+
+	dequeue__destroy(skel);
+}
+
+struct scx_test dequeue_test = {
+	.name = "dequeue",
+	.description = "Verify that ops.enqueue() is balanced with ops.dequeue()",
+	.setup = setup,
+	.run = run,
+	.cleanup = cleanup,
+};
+
+REGISTER_SCX_TEST(&dequeue_test)
-- 
2.52.0


^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-01-26  8:41 ` [PATCH 2/2] selftests/sched_ext: Add test to validate " Andrea Righi
@ 2026-01-27 16:53   ` Emil Tsalapatis
  0 siblings, 0 replies; 33+ messages in thread
From: Emil Tsalapatis @ 2026-01-27 16:53 UTC (permalink / raw)
  To: Andrea Righi, Tejun Heo, David Vernet, Changwoo Min
  Cc: Kuba Piecuch, Christian Loehle, Daniel Hodges, sched-ext,
	linux-kernel

On Mon Jan 26, 2026 at 3:41 AM EST, Andrea Righi wrote:
> Add a new kselftest to validate that:
>  - every enqueue is followed by proper dequeue,
>  - dispatch dequeues happen exactly once per enqueue,
>  - property change dequeues (%SCX_DEQ_SCHED_CHANGE) only happen for
>    property changes,
>  - no duplicate enqueues without proper dequeue.
>
> Test scenarios:
>  - direct dispatch to local DSQ (%SCX_DSQ_LOCAL),
>  - dispatch to user DSQ,
>  - explicit property changes via sched_setaffinity().
>
> This validates that the new ops.dequeue() semantics work correctly for
> all task lifecycle scenarios.
>
> Cc: Tejun Heo <tj@kernel.org>
> Cc: Emil Tsalapatis <emil@etsalapatis.com>
> Cc: Kuba Piecuch <jpiecuch@google.com>
> Signed-off-by: Andrea Righi <arighi@nvidia.com>

Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>

> ---
>  tools/testing/selftests/sched_ext/Makefile    |   1 +
>  .../testing/selftests/sched_ext/dequeue.bpf.c | 209 ++++++++++++++++++
>  tools/testing/selftests/sched_ext/dequeue.c   | 182 +++++++++++++++
>  3 files changed, 392 insertions(+)
>  create mode 100644 tools/testing/selftests/sched_ext/dequeue.bpf.c
>  create mode 100644 tools/testing/selftests/sched_ext/dequeue.c
>
> diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
> index 5fe45f9c5f8fd..764e91edabf93 100644
> --- a/tools/testing/selftests/sched_ext/Makefile
> +++ b/tools/testing/selftests/sched_ext/Makefile
> @@ -161,6 +161,7 @@ all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubs
>  
>  auto-test-targets :=			\
>  	create_dsq			\
> +	dequeue				\
>  	enq_last_no_enq_fails		\
>  	ddsp_bogus_dsq_fail		\
>  	ddsp_vtimelocal_fail		\
> diff --git a/tools/testing/selftests/sched_ext/dequeue.bpf.c b/tools/testing/selftests/sched_ext/dequeue.bpf.c
> new file mode 100644
> index 0000000000000..8b2f792cf7d8b
> --- /dev/null
> +++ b/tools/testing/selftests/sched_ext/dequeue.bpf.c
> @@ -0,0 +1,209 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * A scheduler that validates ops.dequeue() is called correctly:
> + * - For tasks on BPF data structures (not yet dispatched)
> + * - For tasks already on DSQs (local or shared)
> + * - That every ops.enqueue() is followed by ops.dequeue()
> + *
> + * Copyright (c) 2025 NVIDIA Corporation.
> + */
> +
> +#include <scx/common.bpf.h>
> +
> +#define SHARED_DSQ	0
> +
> +char _license[] SEC("license") = "GPL";
> +
> +UEI_DEFINE(uei);
> +
> +/*
> + * Counters to track the lifecycle of tasks:
> + * - enqueue_cnt: Number of times ops.enqueue() was called
> + * - dequeue_cnt: Number of times ops.dequeue() was called (any type)
> + * - dispatch_dequeue_cnt: Number of regular dispatch dequeues (no flag)
> + * - change_dequeue_cnt: Number of property change dequeues (%SCX_DEQ_SCHED_CHANGE)
> + */
> +u64 enqueue_cnt, dequeue_cnt, dispatch_dequeue_cnt, change_dequeue_cnt;
> +
> +/*
> + * Test scenarios:
> + * - 0: Dispatch to local DSQ
> + * - 1: Dispatch to shared DSQ
> + */
> +u32 test_scenario;
> +
> +/*
> + * Per-task state to track lifecycle and validate workflow semantics.
> + * State transitions:
> + *   NONE -> ENQUEUED (on enqueue)
> + *   ENQUEUED -> DISPATCHED (on dispatch dequeue)
> + *   DISPATCHED -> NONE (on property change dequeue or re-enqueue)
> + *   ENQUEUED -> NONE (on property change dequeue before dispatch)
> + */
> +enum task_state {
> +	TASK_NONE = 0,      /* Task is outside scheduler control */
> +	TASK_ENQUEUED,      /* ops.enqueue() called, waiting for dequeue */
> +	TASK_DISPATCHED,    /* Dispatch dequeue received, can get property change or re-enqueue */
> +};
> +
> +struct task_ctx {
> +	enum task_state state; /* Current state in the workflow */
> +	u64 enqueue_seq;       /* Sequence number for debugging */
> +};
> +
> +struct {
> +	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
> +	__uint(map_flags, BPF_F_NO_PREALLOC);
> +	__type(key, int);
> +	__type(value, struct task_ctx);
> +} task_ctx_stor SEC(".maps");
> +
> +static struct task_ctx *try_lookup_task_ctx(struct task_struct *p)
> +{
> +	return bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
> +}
> +
> +s32 BPF_STRUCT_OPS(dequeue_select_cpu, struct task_struct *p,
> +		   s32 prev_cpu, u64 wake_flags)
> +{
> +	/* Always bounce to ops.enqueue() */
> +	return prev_cpu;
> +}
> +
> +void BPF_STRUCT_OPS(dequeue_enqueue, struct task_struct *p, u64 enq_flags)
> +{
> +	struct task_ctx *tctx;
> +
> +	__sync_fetch_and_add(&enqueue_cnt, 1);
> +
> +	tctx = try_lookup_task_ctx(p);
> +	if (!tctx)
> +		return;
> +
> +	/*
> +	 * Validate state transition: enqueue is only valid from NONE or
> +	 * DISPATCHED states. Getting enqueue while in ENQUEUED state
> +	 * indicates a missing dequeue.
> +	 */
> +	if (tctx->state == TASK_ENQUEUED)
> +		scx_bpf_error("%d (%s): enqueue while in ENQUEUED state (seq %llu)",
> +			      p->pid, p->comm, tctx->enqueue_seq);
> +
> +	/* Transition to ENQUEUED state */
> +	tctx->state = TASK_ENQUEUED;
> +	tctx->enqueue_seq++;
> +
> +	switch (test_scenario) {
> +	case 0:
> +		/* Scenario 0: Direct dispatch to the local DSQ */
> +		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
> +		break;
> +
> +	case 1:
> +		/* Scenario 1: Dispatch to shared DSQ */
> +		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
> +		break;
> +	}
> +}
> +
> +void BPF_STRUCT_OPS(dequeue_dequeue, struct task_struct *p, u64 deq_flags)
> +{
> +	struct task_ctx *tctx;
> +
> +	__sync_fetch_and_add(&dequeue_cnt, 1);
> +
> +	tctx = try_lookup_task_ctx(p);
> +	if (!tctx)
> +		return;
> +
> +	/*
> +	 * Validate state: dequeue should only happen from ENQUEUED or
> +	 * DISPATCHED states. Getting dequeue from NONE indicates a bug.
> +	 */
> +	if (tctx->state == TASK_NONE)
> +		scx_bpf_error("%d (%s): dequeue from NONE state (seq %llu)",
> +			      p->pid, p->comm, tctx->enqueue_seq);
> +
> +	if (deq_flags & SCX_DEQ_SCHED_CHANGE) {
> +		/*
> +		 * Async dequeue: property change interrupting the workflow.
> +		 * Valid from both ENQUEUED and DISPATCHED states.
> +		 * Transitions task back to NONE state.
> +		 */
> +		__sync_fetch_and_add(&change_dequeue_cnt, 1);
> +
> +		/* Validate state transition */
> +		if (tctx->state != TASK_ENQUEUED && tctx->state != TASK_DISPATCHED)
> +			scx_bpf_error("%d (%s): property change dequeue from invalid state %d (seq %llu)",
> +				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);
> +
> +		/* Transition back to NONE - task outside scheduler control */
> +		tctx->state = TASK_NONE;
> +	} else {
> +		/*
> +		 * Regular dispatch dequeue: normal workflow step.
> +		 * Valid only from ENQUEUED state (after enqueue, before dispatch dequeue).
> +		 * Transitions to DISPATCHED state.
> +		 */
> +		__sync_fetch_and_add(&dispatch_dequeue_cnt, 1);
> +
> +		/* Validate: dispatch dequeue should NOT have %SCX_DEQ_SCHED_CHANGE flag */
> +		if (deq_flags & SCX_DEQ_SCHED_CHANGE)
> +			scx_bpf_error("%d (%s): SCX_DEQ_SCHED_CHANGE in dispatch dequeue (seq %llu)",
> +				      p->pid, p->comm, tctx->enqueue_seq);
> +
> +		/* Must be in ENQUEUED state */
> +		if (tctx->state != TASK_ENQUEUED)
> +			scx_bpf_error("%d (%s): dispatch dequeue from state %d (seq %llu)",
> +				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);
> +
> +		/* Transition to DISPATCHED - normal cycle completed dispatch */
> +		tctx->state = TASK_DISPATCHED;
> +	}
> +}
> +
> +void BPF_STRUCT_OPS(dequeue_dispatch, s32 cpu, struct task_struct *prev)
> +{
> +	scx_bpf_dsq_move_to_local(SHARED_DSQ);
> +}
> +
> +s32 BPF_STRUCT_OPS(dequeue_init_task, struct task_struct *p,
> +		   struct scx_init_task_args *args)
> +{
> +	struct task_ctx *tctx;
> +
> +	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0,
> +				   BPF_LOCAL_STORAGE_GET_F_CREATE);
> +	if (!tctx)
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +
> +s32 BPF_STRUCT_OPS_SLEEPABLE(dequeue_init)
> +{
> +	s32 ret;
> +
> +	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
> +	if (ret)
> +		return ret;
> +
> +	return 0;
> +}
> +
> +void BPF_STRUCT_OPS(dequeue_exit, struct scx_exit_info *ei)
> +{
> +	UEI_RECORD(uei, ei);
> +}
> +
> +SEC(".struct_ops.link")
> +struct sched_ext_ops dequeue_ops = {
> +	.select_cpu		= (void *)dequeue_select_cpu,
> +	.enqueue		= (void *)dequeue_enqueue,
> +	.dequeue		= (void *)dequeue_dequeue,
> +	.dispatch		= (void *)dequeue_dispatch,
> +	.init_task		= (void *)dequeue_init_task,
> +	.init			= (void *)dequeue_init,
> +	.exit			= (void *)dequeue_exit,
> +	.name			= "dequeue_test",
> +};
> diff --git a/tools/testing/selftests/sched_ext/dequeue.c b/tools/testing/selftests/sched_ext/dequeue.c
> new file mode 100644
> index 0000000000000..6861257d79b47
> --- /dev/null
> +++ b/tools/testing/selftests/sched_ext/dequeue.c
> @@ -0,0 +1,182 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2025 NVIDIA Corporation.
> + */
> +#define _GNU_SOURCE
> +#include <stdio.h>
> +#include <unistd.h>
> +#include <signal.h>
> +#include <bpf/bpf.h>
> +#include <scx/common.h>
> +#include <sys/wait.h>
> +#include <sched.h>
> +#include <pthread.h>
> +#include "scx_test.h"
> +#include "dequeue.bpf.skel.h"
> +
> +#define NUM_WORKERS 8
> +
> +/*
> + * Worker function that creates enqueue/dequeue events. It alternates
> + * between CPU work, sleeping, and affinity changes to trigger dequeues.
> + */
> +static void worker_fn(int id)
> +{
> +	cpu_set_t cpuset;
> +	int i;
> +	volatile int sum = 0;
> +
> +	for (i = 0; i < 1000; i++) {
> +		int j;
> +
> +		/* Do some work to trigger scheduling events */
> +		for (j = 0; j < 10000; j++)
> +			sum += j;
> +
> +		/* Change affinity to trigger dequeue */
> +		if (i % 10 == 0) {
> +			CPU_ZERO(&cpuset);
> +			/* Rotate through the first 4 CPUs */
> +			CPU_SET(i % 4, &cpuset);
> +			sched_setaffinity(0, sizeof(cpuset), &cpuset);
> +		}
> +
> +		/* Do additional work */
> +		for (j = 0; j < 10000; j++)
> +			sum += j;
> +
> +		/* Sleep to trigger dequeue */
> +		usleep(1000 + (id * 100));
> +	}
> +
> +	exit(0);
> +}
> +
> +static enum scx_test_status run_scenario(struct dequeue *skel, u32 scenario,
> +					 const char *scenario_name)
> +{
> +	struct bpf_link *link;
> +	pid_t pids[NUM_WORKERS];
> +	int i, status;
> +	u64 enq_start, deq_start, dispatch_deq_start, change_deq_start;
> +	u64 enq_delta, deq_delta, dispatch_deq_delta, change_deq_delta;
> +
> +	/* Set the test scenario */
> +	skel->bss->test_scenario = scenario;
> +
> +	/* Record starting counts */
> +	enq_start = skel->bss->enqueue_cnt;
> +	deq_start = skel->bss->dequeue_cnt;
> +	dispatch_deq_start = skel->bss->dispatch_dequeue_cnt;
> +	change_deq_start = skel->bss->change_dequeue_cnt;
> +
> +	link = bpf_map__attach_struct_ops(skel->maps.dequeue_ops);
> +	SCX_FAIL_IF(!link, "Failed to attach struct_ops for scenario %s", scenario_name);
> +
> +	/* Fork worker processes to generate enqueue/dequeue events */
> +	for (i = 0; i < NUM_WORKERS; i++) {
> +		pids[i] = fork();
> +		SCX_FAIL_IF(pids[i] < 0, "Failed to fork worker %d", i);
> +
> +		if (pids[i] == 0) {
> +			worker_fn(i);
> +			/* Should not reach here */
> +			exit(1);
> +		}
> +	}
> +
> +	/* Wait for all workers to complete */
> +	for (i = 0; i < NUM_WORKERS; i++) {
> +		SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i],
> +			    "Failed to wait for worker %d", i);
> +		SCX_FAIL_IF(status != 0, "Worker %d exited with status %d", i, status);
> +	}
> +
> +	bpf_link__destroy(link);
> +
> +	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG));
> +
> +	/* Calculate deltas */
> +	enq_delta = skel->bss->enqueue_cnt - enq_start;
> +	deq_delta = skel->bss->dequeue_cnt - deq_start;
> +	dispatch_deq_delta = skel->bss->dispatch_dequeue_cnt - dispatch_deq_start;
> +	change_deq_delta = skel->bss->change_dequeue_cnt - change_deq_start;
> +
> +	printf("%s:\n", scenario_name);
> +	printf("  enqueues: %lu\n", (unsigned long)enq_delta);
> +	printf("  dequeues: %lu (dispatch: %lu, property_change: %lu)\n",
> +	       (unsigned long)deq_delta,
> +	       (unsigned long)dispatch_deq_delta,
> +	       (unsigned long)change_deq_delta);
> +
> +	/*
> +	 * Validate that we got enqueue and dequeue events.
> +	 * The BPF code does strict state machine validation with scx_bpf_error()
> +	 * to ensure the workflow semantics are correct. If we reach here without
> +	 * errors, the semantics are validated correctly.
> +	 */
> +	SCX_GT(enq_delta, 0);
> +	SCX_GT(deq_delta, 0);
> +
> +	return SCX_TEST_PASS;
> +}
> +
> +static enum scx_test_status setup(void **ctx)
> +{
> +	struct dequeue *skel;
> +
> +	skel = dequeue__open();
> +	SCX_FAIL_IF(!skel, "Failed to open skel");
> +	SCX_ENUM_INIT(skel);
> +	SCX_FAIL_IF(dequeue__load(skel), "Failed to load skel");
> +
> +	*ctx = skel;
> +
> +	return SCX_TEST_PASS;
> +}
> +
> +static enum scx_test_status run(void *ctx)
> +{
> +	struct dequeue *skel = ctx;
> +	enum scx_test_status status;
> +
> +	status = run_scenario(skel, 0, "Local DSQ");
> +	if (status != SCX_TEST_PASS)
> +		return status;
> +
> +	status = run_scenario(skel, 1, "User DSQ");
> +	if (status != SCX_TEST_PASS)
> +		return status;
> +
> +	printf("\n=== Summary ===\n");
> +	printf("Total enqueues: %lu\n", (unsigned long)skel->bss->enqueue_cnt);
> +	printf("Total dequeues: %lu\n", (unsigned long)skel->bss->dequeue_cnt);
> +	printf("  Dispatch dequeues: %lu (no flag, normal workflow)\n",
> +	       (unsigned long)skel->bss->dispatch_dequeue_cnt);
> +	printf("  Property change dequeues: %lu (SCX_DEQ_SCHED_CHANGE flag)\n",
> +	       (unsigned long)skel->bss->change_dequeue_cnt);
> +	printf("\nAll scenarios passed - no state machine violations detected\n");
> +	printf("-> Validated: Correct state transitions (NONE -> ENQUEUED -> DISPATCHED)\n");
> +	printf("-> Validated: Dispatch dequeues have no flags (normal workflow)\n");
> +	printf("-> Validated: Async dequeues have SCX_DEQ_SCHED_CHANGE flag (interruptions)\n");
> +	printf("-> Validated: No duplicate enqueues or invalid state transitions\n");
> +
> +	return SCX_TEST_PASS;
> +}
> +
> +static void cleanup(void *ctx)
> +{
> +	struct dequeue *skel = ctx;
> +
> +	dequeue__destroy(skel);
> +}
> +
> +struct scx_test dequeue_test = {
> +	.name = "dequeue",
> +	.description = "Verify that ops.enqueue() is balanced with ops.dequeue()",
> +	.setup = setup,
> +	.run = run,
> +	.cleanup = cleanup,
> +};
> +
> +REGISTER_SCX_TEST(&dequeue_test)


^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-02-01  9:08 [PATCHSET v4 sched_ext/for-6.20] sched_ext: Fix " Andrea Righi
@ 2026-02-01  9:08 ` Andrea Righi
  0 siblings, 0 replies; 33+ messages in thread
From: Andrea Righi @ 2026-02-01  9:08 UTC (permalink / raw)
  To: Tejun Heo, David Vernet, Changwoo Min
  Cc: Kuba Piecuch, Emil Tsalapatis, Christian Loehle, Daniel Hodges,
	sched-ext, linux-kernel

Add a new kselftest to validate the following scenarios:
 - scenario 0 (Local DSQ): tasks don't increment enqueue counters since
   they never enter BPF scheduler custody (no tracking or validation,
   expects 0 enqueues and 0 dequeues),
 - scenario 1 (User DSQ): full enqueue/dequeue lifecycle tracking with
   state machine validation (expects 1:1 enqueue/dequeue pairing).

The test validates that:
 - local DSQ dispatch don't trigger ops.dequeue(),
 - non-local DSQ dispatch has exact 1:1 ops.enqueue/dequeue() pairing,
 - dispatch dequeues have no flags (normal workflow),
 - property change dequeues have %SCX_DEQ_SCHED_CHANGE flag,
 - no duplicate enqueues or invalid state transitions are happening.

This validates that the new ops.dequeue() semantics work correctly for
all task lifecycle scenarios.

Cc: Tejun Heo <tj@kernel.org>
Cc: Emil Tsalapatis <emil@etsalapatis.com>
Cc: Kuba Piecuch <jpiecuch@google.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
 tools/testing/selftests/sched_ext/Makefile    |   1 +
 .../testing/selftests/sched_ext/dequeue.bpf.c | 238 ++++++++++++++++++
 tools/testing/selftests/sched_ext/dequeue.c   | 201 +++++++++++++++
 3 files changed, 440 insertions(+)
 create mode 100644 tools/testing/selftests/sched_ext/dequeue.bpf.c
 create mode 100644 tools/testing/selftests/sched_ext/dequeue.c

diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
index 5fe45f9c5f8fd..764e91edabf93 100644
--- a/tools/testing/selftests/sched_ext/Makefile
+++ b/tools/testing/selftests/sched_ext/Makefile
@@ -161,6 +161,7 @@ all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubs
 
 auto-test-targets :=			\
 	create_dsq			\
+	dequeue				\
 	enq_last_no_enq_fails		\
 	ddsp_bogus_dsq_fail		\
 	ddsp_vtimelocal_fail		\
diff --git a/tools/testing/selftests/sched_ext/dequeue.bpf.c b/tools/testing/selftests/sched_ext/dequeue.bpf.c
new file mode 100644
index 0000000000000..092956becb554
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/dequeue.bpf.c
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A scheduler that validates ops.dequeue() is called correctly:
+ * - for tasks on BPF data structures (not yet dispatched)
+ * - for tasks dispatched to non-local DSQs (global or user DSQs)
+ * - That every ops.enqueue() is followed by ops.dequeue() except for tasks
+ *   directly dispatched to local DSQs, which bypass the BPF scheduler
+ *   entirely
+ *
+ * Copyright (c) 2026 NVIDIA Corporation.
+ */
+
+#include <scx/common.bpf.h>
+
+#define SHARED_DSQ	0
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+/*
+ * Counters to track the lifecycle of tasks:
+ * - enqueue_cnt: Number of times ops.enqueue() was called
+ * - dequeue_cnt: Number of times ops.dequeue() was called (any type)
+ * - dispatch_dequeue_cnt: Number of regular dispatch dequeues (no flag)
+ * - change_dequeue_cnt: Number of property change dequeues (%SCX_DEQ_SCHED_CHANGE)
+ */
+u64 enqueue_cnt, dequeue_cnt, dispatch_dequeue_cnt, change_dequeue_cnt;
+
+/*
+ * Test scenarios:
+ * - 0: Dispatch to local DSQ (bypasses BPF scheduler, no dequeue callbacks)
+ * - 1: Dispatch to shared DSQ (enters BPF scheduler, dequeue callbacks expected)
+ */
+u32 test_scenario;
+
+/*
+ * Per-task state to track lifecycle and validate workflow semantics.
+ * State transitions:
+ *   NONE -> ENQUEUED (on enqueue)
+ *   ENQUEUED -> DISPATCHED (on dispatch dequeue)
+ *   DISPATCHED -> NONE (on property change dequeue or re-enqueue)
+ *   ENQUEUED -> NONE (on property change dequeue before dispatch)
+ */
+enum task_state {
+	TASK_NONE = 0,      /* Task is outside scheduler control */
+	TASK_ENQUEUED,      /* ops.enqueue() called, waiting for dequeue */
+	TASK_DISPATCHED,    /* Dispatch dequeue received, can get property change or re-enqueue */
+};
+
+struct task_ctx {
+	enum task_state state; /* Current state in the workflow */
+	u64 enqueue_seq;       /* Sequence number for debugging */
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct task_ctx);
+} task_ctx_stor SEC(".maps");
+
+static struct task_ctx *try_lookup_task_ctx(struct task_struct *p)
+{
+	return bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+}
+
+s32 BPF_STRUCT_OPS(dequeue_select_cpu, struct task_struct *p,
+		   s32 prev_cpu, u64 wake_flags)
+{
+	/* Always bounce to ops.enqueue() */
+	return prev_cpu;
+}
+
+void BPF_STRUCT_OPS(dequeue_enqueue, struct task_struct *p, u64 enq_flags)
+{
+	struct task_ctx *tctx;
+
+	tctx = try_lookup_task_ctx(p);
+	if (!tctx)
+		return;
+
+	switch (test_scenario) {
+	case 0:
+		/*
+		 * Scenario 0: Direct dispatch to the local DSQ.
+		 *
+		 * Task bypasses BPF scheduler entirely - no enqueue
+		 * tracking, no dequeue callbacks. Don't increment counters
+		 * or validate state since the task never enters BPF
+		 * scheduler management.
+		 */
+		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
+		break;
+
+	case 1:
+		/*
+		 * Scenario 1: Dispatch to shared DSQ.
+		 *
+		 * Task enters BPF scheduler management - track
+		 * enqueue/dequeue lifecycle and validate state
+		 * transitions.
+		 */
+		__sync_fetch_and_add(&enqueue_cnt, 1);
+
+		/*
+		 * Validate state transition: enqueue is only valid from
+		 * NONE or DISPATCHED states. Getting enqueue while in
+		 * ENQUEUED state indicates a missing dequeue.
+		 */
+		if (tctx->state == TASK_ENQUEUED)
+			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
+				      p->pid, p->comm, tctx->enqueue_seq);
+
+		/* Transition to ENQUEUED state */
+		tctx->state = TASK_ENQUEUED;
+		tctx->enqueue_seq++;
+
+		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
+		break;
+	}
+}
+
+void BPF_STRUCT_OPS(dequeue_dequeue, struct task_struct *p, u64 deq_flags)
+{
+	struct task_ctx *tctx;
+
+	__sync_fetch_and_add(&dequeue_cnt, 1);
+
+	tctx = try_lookup_task_ctx(p);
+	if (!tctx)
+		return;
+
+	/*
+	 * For scenario 0 (local DSQ), ops.dequeue() should never be called
+	 * because tasks bypass the BPF scheduler entirely. If we get here,
+	 * it's a kernel bug. We don't track enqueues for scenario 0, so
+	 * tctx->enqueue_seq will be 0.
+	 */
+	if (test_scenario == 0) {
+		scx_bpf_error("%d (%s): dequeue called for local DSQ scenario - kernel bug!",
+			      p->pid, p->comm);
+		return;
+	}
+
+	/*
+	 * Validate state: dequeue should only happen from ENQUEUED or
+	 * DISPATCHED states. Getting dequeue from NONE indicates a bug.
+	 */
+	if (tctx->state == TASK_NONE) {
+		scx_bpf_error("%d (%s): dequeue from NONE state seq=%llu",
+			      p->pid, p->comm, tctx->enqueue_seq);
+		return;
+	}
+
+	if (deq_flags & SCX_DEQ_SCHED_CHANGE) {
+		/*
+		 * Property change interrupting the workflow. Valid from
+		 * both ENQUEUED and DISPATCHED states. Transitions task
+		 * back to NONE state.
+		 */
+		__sync_fetch_and_add(&change_dequeue_cnt, 1);
+
+		/* Validate state transition */
+		if (tctx->state != TASK_ENQUEUED && tctx->state != TASK_DISPATCHED)
+			scx_bpf_error("%d (%s): invalid property change dequeue state=%d seq=%llu",
+				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);
+
+		/* Transition back to NONE: task outside scheduler control */
+		tctx->state = TASK_NONE;
+	} else {
+		/*
+		 * Regular dispatch dequeue: normal workflow step. Valid
+		 * only from ENQUEUED state (after enqueue, before dispatch
+		 * dequeue). Transitions to DISPATCHED state.
+		 */
+		__sync_fetch_and_add(&dispatch_dequeue_cnt, 1);
+
+		/* Dispatch dequeue should not have %SCX_DEQ_SCHED_CHANGE flag */
+		if (deq_flags & SCX_DEQ_SCHED_CHANGE)
+			scx_bpf_error("%d (%s): SCX_DEQ_SCHED_CHANGE in dispatch dequeue seq=%llu",
+				      p->pid, p->comm, tctx->enqueue_seq);
+
+		/* Must be in ENQUEUED state */
+		if (tctx->state != TASK_ENQUEUED)
+			scx_bpf_error("%d (%s): dispatch dequeue from state %d seq=%llu",
+				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);
+
+		/* Transition to DISPATCHED: normal cycle completed dispatch */
+		tctx->state = TASK_DISPATCHED;
+	}
+}
+
+void BPF_STRUCT_OPS(dequeue_dispatch, s32 cpu, struct task_struct *prev)
+{
+	scx_bpf_dsq_move_to_local(SHARED_DSQ);
+}
+
+s32 BPF_STRUCT_OPS(dequeue_init_task, struct task_struct *p,
+		   struct scx_init_task_args *args)
+{
+	struct task_ctx *tctx;
+
+	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0,
+				   BPF_LOCAL_STORAGE_GET_F_CREATE);
+	if (!tctx)
+		return -ENOMEM;
+
+	return 0;
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(dequeue_init)
+{
+	s32 ret;
+
+	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+void BPF_STRUCT_OPS(dequeue_exit, struct scx_exit_info *ei)
+{
+	UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops dequeue_ops = {
+	.select_cpu		= (void *)dequeue_select_cpu,
+	.enqueue		= (void *)dequeue_enqueue,
+	.dequeue		= (void *)dequeue_dequeue,
+	.dispatch		= (void *)dequeue_dispatch,
+	.init_task		= (void *)dequeue_init_task,
+	.init			= (void *)dequeue_init,
+	.exit			= (void *)dequeue_exit,
+	.name			= "dequeue_test",
+};
diff --git a/tools/testing/selftests/sched_ext/dequeue.c b/tools/testing/selftests/sched_ext/dequeue.c
new file mode 100644
index 0000000000000..805970834612b
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/dequeue.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2025 NVIDIA Corporation.
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <unistd.h>
+#include <signal.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <sched.h>
+#include <pthread.h>
+#include "scx_test.h"
+#include "dequeue.bpf.skel.h"
+
+#define NUM_WORKERS 8
+
+/*
+ * Worker function that creates enqueue/dequeue events. It alternates
+ * between CPU work, sleeping, and affinity changes to trigger dequeues.
+ */
+static void worker_fn(int id)
+{
+	cpu_set_t cpuset;
+	int i;
+	volatile int sum = 0;
+
+	for (i = 0; i < 1000; i++) {
+		int j;
+
+		/* Do some work to trigger scheduling events */
+		for (j = 0; j < 10000; j++)
+			sum += j;
+
+		/* Change affinity to trigger dequeue */
+		if (i % 10 == 0) {
+			CPU_ZERO(&cpuset);
+			/* Rotate through the first 4 CPUs */
+			CPU_SET(i % 4, &cpuset);
+			sched_setaffinity(0, sizeof(cpuset), &cpuset);
+		}
+
+		/* Do additional work */
+		for (j = 0; j < 10000; j++)
+			sum += j;
+
+		/* Sleep to trigger dequeue */
+		usleep(1000 + (id * 100));
+	}
+
+	exit(0);
+}
+
+static enum scx_test_status run_scenario(struct dequeue *skel, u32 scenario,
+					 const char *scenario_name)
+{
+	struct bpf_link *link;
+	pid_t pids[NUM_WORKERS];
+	int i, status;
+	u64 enq_start, deq_start, dispatch_deq_start, change_deq_start;
+	u64 enq_delta, deq_delta, dispatch_deq_delta, change_deq_delta;
+
+	/* Set the test scenario */
+	skel->bss->test_scenario = scenario;
+
+	/* Record starting counts */
+	enq_start = skel->bss->enqueue_cnt;
+	deq_start = skel->bss->dequeue_cnt;
+	dispatch_deq_start = skel->bss->dispatch_dequeue_cnt;
+	change_deq_start = skel->bss->change_dequeue_cnt;
+
+	link = bpf_map__attach_struct_ops(skel->maps.dequeue_ops);
+	SCX_FAIL_IF(!link, "Failed to attach struct_ops for scenario %s", scenario_name);
+
+	/* Fork worker processes to generate enqueue/dequeue events */
+	for (i = 0; i < NUM_WORKERS; i++) {
+		pids[i] = fork();
+		SCX_FAIL_IF(pids[i] < 0, "Failed to fork worker %d", i);
+
+		if (pids[i] == 0) {
+			worker_fn(i);
+			/* Should not reach here */
+			exit(1);
+		}
+	}
+
+	/* Wait for all workers to complete */
+	for (i = 0; i < NUM_WORKERS; i++) {
+		SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i],
+			    "Failed to wait for worker %d", i);
+		SCX_FAIL_IF(status != 0, "Worker %d exited with status %d", i, status);
+	}
+
+	bpf_link__destroy(link);
+
+	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG));
+
+	/* Calculate deltas */
+	enq_delta = skel->bss->enqueue_cnt - enq_start;
+	deq_delta = skel->bss->dequeue_cnt - deq_start;
+	dispatch_deq_delta = skel->bss->dispatch_dequeue_cnt - dispatch_deq_start;
+	change_deq_delta = skel->bss->change_dequeue_cnt - change_deq_start;
+
+	printf("%s:\n", scenario_name);
+	printf("  enqueues: %lu\n", (unsigned long)enq_delta);
+	printf("  dequeues: %lu (dispatch: %lu, property_change: %lu)\n",
+	       (unsigned long)deq_delta,
+	       (unsigned long)dispatch_deq_delta,
+	       (unsigned long)change_deq_delta);
+
+	/*
+	 * Validate enqueue/dequeue lifecycle tracking.
+	 *
+	 * For scenario 0 (Local DSQ), both enqueues and dequeues should be
+	 * 0 because tasks bypass the BPF scheduler entirely: they never
+	 * enter BPF scheduler's custody. For scenario 1 (ser DSQ) , we
+	 * expect both enqueues and dequeues.
+	 *
+	 * The BPF code does strict state machine validation with
+	 * scx_bpf_error() to ensure the workflow semantics are correct. If
+	 * we reach here without errors, the semantics are validated
+	 * correctly.
+	 */
+	if (scenario == 0) {
+		/* Local DSQ: tasks bypass BPF scheduler completely */
+		SCX_EQ(enq_delta, 0);
+		SCX_EQ(deq_delta, 0);
+		SCX_EQ(dispatch_deq_delta, 0);
+		SCX_EQ(change_deq_delta, 0);
+	} else {
+		/* Non-local DSQ: tasks enter BPF scheduler's custody */
+		SCX_GT(enq_delta, 0);
+		SCX_GT(deq_delta, 0);
+		/* Validate 1:1 enqueue/dequeue pairing */
+		SCX_EQ(enq_delta, deq_delta);
+	}
+
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status setup(void **ctx)
+{
+	struct dequeue *skel;
+
+	skel = dequeue__open();
+	SCX_FAIL_IF(!skel, "Failed to open skel");
+	SCX_ENUM_INIT(skel);
+	SCX_FAIL_IF(dequeue__load(skel), "Failed to load skel");
+
+	*ctx = skel;
+
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+	struct dequeue *skel = ctx;
+	enum scx_test_status status;
+
+	status = run_scenario(skel, 0, "Local DSQ (direct dispatch)");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 1, "User DSQ");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	printf("\n=== Summary ===\n");
+	printf("Total enqueues: %lu\n", (unsigned long)skel->bss->enqueue_cnt);
+	printf("Total dequeues: %lu\n", (unsigned long)skel->bss->dequeue_cnt);
+	printf("  Dispatch dequeues: %lu (no flag, normal workflow)\n",
+	       (unsigned long)skel->bss->dispatch_dequeue_cnt);
+	printf("  Property change dequeues: %lu (SCX_DEQ_SCHED_CHANGE flag)\n",
+	       (unsigned long)skel->bss->change_dequeue_cnt);
+	printf("\nAll scenarios passed - no state machine violations detected\n");
+	printf("-> Validated: Local DSQ dispatch bypasses BPF scheduler (no dequeue callbacks)\n");
+	printf("-> Validated: Non-local DSQ dispatch triggers dequeue callbacks\n");
+	printf("-> Validated: Dispatch dequeues have no flags (normal workflow)\n");
+	printf("-> Validated: Async dequeues have SCX_DEQ_SCHED_CHANGE flag (interruptions)\n");
+	printf("-> Validated: No duplicate enqueues or invalid state transitions\n");
+
+	return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+	struct dequeue *skel = ctx;
+
+	dequeue__destroy(skel);
+}
+
+struct scx_test dequeue_test = {
+	.name = "dequeue",
+	.description = "Verify ops.dequeue() semantics for local and non-local DSQ dispatch",
+	.setup = setup,
+	.run = run,
+	.cleanup = cleanup,
+};
+
+REGISTER_SCX_TEST(&dequeue_test)
-- 
2.52.0


^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-02-04 16:05 [PATCHSET v5] sched_ext: Fix " Andrea Righi
@ 2026-02-04 16:05 ` Andrea Righi
  0 siblings, 0 replies; 33+ messages in thread
From: Andrea Righi @ 2026-02-04 16:05 UTC (permalink / raw)
  To: Tejun Heo, David Vernet, Changwoo Min
  Cc: Kuba Piecuch, Emil Tsalapatis, Christian Loehle, Daniel Hodges,
	sched-ext, linux-kernel

Add a new kselftest to validate the following scenarios:
 - scenario 0 (Local DSQ): tasks dispatched to local DSQs bypass BPF
   scheduler entirely, they never enter BPF custody, so no
   ops.dequeue() should be called,
 - scenario 1 (Global DSQ): tasks dispatched to SCX_DSQ_GLOBAL also
   bypass BPF scheduler, like local DSQs, no ops.dequeue() should be
   called,
 - scenario 2 (User DSQ): tasks enter BPF scheduler custody with full
   enqueue/dequeue lifecycle tracking and state machine validation
   (expects 1:1 enqueue/dequeue pairing).

The test validates that:
 - terminal DSQ dispatch (local, global) don't trigger ops.dequeue(),
 - user DSQ dispatch has exact 1:1 ops.enqueue()/dequeue() pairing,
 - dispatch dequeues have no flags (normal workflow),
 - property change dequeues have the %SCX_DEQ_SCHED_CHANGE flag set,
 - no duplicate enqueues or invalid state transitions are happening.

This validates that the new ops.dequeue() semantics work correctly for
all task lifecycle scenarios, including the distinction between terminal
DSQs (where BPF scheduler is done with the task) and user DSQs (where
BPF scheduler manages the task lifecycle).

Cc: Tejun Heo <tj@kernel.org>
Cc: Emil Tsalapatis <emil@etsalapatis.com>
Cc: Kuba Piecuch <jpiecuch@google.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
 tools/testing/selftests/sched_ext/Makefile    |   1 +
 .../testing/selftests/sched_ext/dequeue.bpf.c | 269 ++++++++++++++++++
 tools/testing/selftests/sched_ext/dequeue.c   | 207 ++++++++++++++
 3 files changed, 477 insertions(+)
 create mode 100644 tools/testing/selftests/sched_ext/dequeue.bpf.c
 create mode 100644 tools/testing/selftests/sched_ext/dequeue.c

diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
index 5fe45f9c5f8fd..764e91edabf93 100644
--- a/tools/testing/selftests/sched_ext/Makefile
+++ b/tools/testing/selftests/sched_ext/Makefile
@@ -161,6 +161,7 @@ all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubs
 
 auto-test-targets :=			\
 	create_dsq			\
+	dequeue				\
 	enq_last_no_enq_fails		\
 	ddsp_bogus_dsq_fail		\
 	ddsp_vtimelocal_fail		\
diff --git a/tools/testing/selftests/sched_ext/dequeue.bpf.c b/tools/testing/selftests/sched_ext/dequeue.bpf.c
new file mode 100644
index 0000000000000..5d736ffadb4c8
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/dequeue.bpf.c
@@ -0,0 +1,269 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A scheduler that validates ops.dequeue() is called correctly:
+ * - Tasks dispatched to terminal DSQs (local, global) bypass the BPF
+ *   scheduler entirely: no ops.dequeue() should be called
+ * - Tasks dispatched to user DSQs enter BPF custody: ops.dequeue() must be
+ *   called when they leave custody
+ * - Every ops.enqueue() for non-terminal DSQs is followed by exactly one
+ *   ops.dequeue() (validate 1:1 pairing and state machine)
+ *
+ * Copyright (c) 2026 NVIDIA Corporation.
+ */
+
+#include <scx/common.bpf.h>
+
+#define SHARED_DSQ	0
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+/*
+ * Counters to track the lifecycle of tasks:
+ * - enqueue_cnt: Number of times ops.enqueue() was called
+ * - dequeue_cnt: Number of times ops.dequeue() was called (any type)
+ * - dispatch_dequeue_cnt: Number of regular dispatch dequeues (no flag)
+ * - change_dequeue_cnt: Number of property change dequeues
+ */
+u64 enqueue_cnt, dequeue_cnt, dispatch_dequeue_cnt, change_dequeue_cnt;
+
+/*
+ * Test scenarios:
+ * 0) Dispatch to local DSQ (terminal DSQ, bypasses BPF scheduler, no
+ *    dequeue callbacks)
+ * 1) Dispatch to global DSQ (terminal DSQ, bypasses BPF scheduler, no
+ *    dequeue callbacks)
+ * 2) Dispatch to shared user DSQ (enters BPF scheduler, dequeue callbacks
+ *    expected)
+ */
+u32 test_scenario;
+
+/*
+ * Per-task state to track lifecycle and validate workflow semantics.
+ * State transitions:
+ *   NONE -> ENQUEUED (on enqueue)
+ *   ENQUEUED -> DISPATCHED (on dispatch dequeue)
+ *   DISPATCHED -> NONE (on property change dequeue or re-enqueue)
+ *   ENQUEUED -> NONE (on property change dequeue before dispatch)
+ */
+enum task_state {
+	TASK_NONE = 0,
+	TASK_ENQUEUED,
+	TASK_DISPATCHED,
+};
+
+struct task_ctx {
+	enum task_state state; /* Current state in the workflow */
+	u64 enqueue_seq;       /* Sequence number for debugging */
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct task_ctx);
+} task_ctx_stor SEC(".maps");
+
+static struct task_ctx *try_lookup_task_ctx(struct task_struct *p)
+{
+	return bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+}
+
+s32 BPF_STRUCT_OPS(dequeue_select_cpu, struct task_struct *p,
+		   s32 prev_cpu, u64 wake_flags)
+{
+	/* Always bounce to ops.enqueue() */
+	return prev_cpu;
+}
+
+void BPF_STRUCT_OPS(dequeue_enqueue, struct task_struct *p, u64 enq_flags)
+{
+	struct task_ctx *tctx;
+
+	tctx = try_lookup_task_ctx(p);
+	if (!tctx)
+		return;
+
+	switch (test_scenario) {
+	case 0:
+		/*
+		 * Scenario 0: Direct dispatch to the local DSQ.
+		 *
+		 * Task bypasses BPF scheduler entirely: no enqueue
+		 * tracking, no dequeue callbacks. Don't increment counters
+		 * or validate state since the task never enters BPF
+		 * scheduler management.
+		 */
+		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
+		break;
+
+	case 1:
+		/*
+		 * Scenario 1: Direct dispatch to the global DSQ.
+		 *
+		 * Like scenario 0, task bypasses BPF scheduler entirely.
+		 * SCX_DSQ_GLOBAL is a terminal DSQ, tasks dispatched to it
+		 * leave BPF custody immediately, so no dequeue callbacks
+		 * should be triggered.
+		 */
+		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+		break;
+
+	case 2:
+		/*
+		 * Scenario 2: Dispatch to shared user DSQ.
+		 *
+		 * Task enters BPF scheduler management: track
+		 * enqueue/dequeue lifecycle and validate state
+		 * transitions.
+		 */
+		__sync_fetch_and_add(&enqueue_cnt, 1);
+
+		/*
+		 * Validate state transition: enqueue is only valid from
+		 * NONE or DISPATCHED states. Getting enqueue while in
+		 * ENQUEUED state indicates a missing dequeue.
+		 */
+		if (tctx->state == TASK_ENQUEUED)
+			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
+				      p->pid, p->comm, tctx->enqueue_seq);
+
+		/* Transition to ENQUEUED state */
+		tctx->state = TASK_ENQUEUED;
+		tctx->enqueue_seq++;
+
+		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
+		break;
+	}
+}
+
+void BPF_STRUCT_OPS(dequeue_dequeue, struct task_struct *p, u64 deq_flags)
+{
+	struct task_ctx *tctx;
+
+	__sync_fetch_and_add(&dequeue_cnt, 1);
+
+	tctx = try_lookup_task_ctx(p);
+	if (!tctx)
+		return;
+
+	/*
+	 * For scenarios 0 and 1 (terminal DSQs: local and global),
+	 * ops.dequeue() should never be called because tasks bypass the
+	 * BPF scheduler entirely. If we get here, it's a kernel bug. We
+	 * don't track enqueues for these scenarios, so tctx->enqueue_seq
+	 * will be 0.
+	 */
+	if (test_scenario == 0) {
+		scx_bpf_error("%d (%s): dequeue called for local DSQ scenario - kernel bug!",
+			      p->pid, p->comm);
+		return;
+	}
+	if (test_scenario == 1) {
+		scx_bpf_error("%d (%s): dequeue called for global DSQ scenario - kernel bug!",
+			      p->pid, p->comm);
+		return;
+	}
+
+	/*
+	 * Validate state: dequeue should only happen from ENQUEUED or
+	 * DISPATCHED states. Getting dequeue from NONE indicates a bug.
+	 */
+	if (tctx->state == TASK_NONE) {
+		scx_bpf_error("%d (%s): dequeue from NONE state seq=%llu",
+			      p->pid, p->comm, tctx->enqueue_seq);
+		return;
+	}
+
+	if (deq_flags & SCX_DEQ_SCHED_CHANGE) {
+		/*
+		 * Property change interrupting the workflow. Valid from
+		 * both ENQUEUED and DISPATCHED states. Transitions task
+		 * back to NONE state.
+		 */
+		__sync_fetch_and_add(&change_dequeue_cnt, 1);
+
+		/* Validate state transition */
+		if (tctx->state != TASK_ENQUEUED && tctx->state != TASK_DISPATCHED)
+			scx_bpf_error("%d (%s): invalid property change dequeue state=%d seq=%llu",
+				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);
+
+		/* Transition back to NONE: task outside scheduler control */
+		tctx->state = TASK_NONE;
+	} else {
+		/*
+		 * Regular dispatch dequeue: normal workflow step. Valid
+		 * only from ENQUEUED state (after enqueue, before dispatch
+		 * dequeue). Transitions to DISPATCHED state.
+		 */
+		__sync_fetch_and_add(&dispatch_dequeue_cnt, 1);
+
+		/*
+		 * Dispatch dequeue should not have %SCX_DEQ_SCHED_CHANGE
+		 * flag.
+		 */
+		if (deq_flags & SCX_DEQ_SCHED_CHANGE)
+			scx_bpf_error("%d (%s): SCX_DEQ_SCHED_CHANGE in dispatch dequeue seq=%llu",
+				      p->pid, p->comm, tctx->enqueue_seq);
+
+		/*
+		 * Must be in ENQUEUED state.
+		 */
+		if (tctx->state != TASK_ENQUEUED)
+			scx_bpf_error("%d (%s): dispatch dequeue from state %d seq=%llu",
+				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);
+
+		/*
+		 * Transition to DISPATCHED: normal cycle completed
+		 * dispatch.
+		 */
+		tctx->state = TASK_DISPATCHED;
+	}
+}
+
+void BPF_STRUCT_OPS(dequeue_dispatch, s32 cpu, struct task_struct *prev)
+{
+	scx_bpf_dsq_move_to_local(SHARED_DSQ);
+}
+
+s32 BPF_STRUCT_OPS(dequeue_init_task, struct task_struct *p,
+		   struct scx_init_task_args *args)
+{
+	struct task_ctx *tctx;
+
+	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0,
+				   BPF_LOCAL_STORAGE_GET_F_CREATE);
+	if (!tctx)
+		return -ENOMEM;
+
+	return 0;
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(dequeue_init)
+{
+	s32 ret;
+
+	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+void BPF_STRUCT_OPS(dequeue_exit, struct scx_exit_info *ei)
+{
+	UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops dequeue_ops = {
+	.select_cpu		= (void *)dequeue_select_cpu,
+	.enqueue		= (void *)dequeue_enqueue,
+	.dequeue		= (void *)dequeue_dequeue,
+	.dispatch		= (void *)dequeue_dispatch,
+	.init_task		= (void *)dequeue_init_task,
+	.init			= (void *)dequeue_init,
+	.exit			= (void *)dequeue_exit,
+	.name			= "dequeue_test",
+};
diff --git a/tools/testing/selftests/sched_ext/dequeue.c b/tools/testing/selftests/sched_ext/dequeue.c
new file mode 100644
index 0000000000000..0ce59299a1b37
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/dequeue.c
@@ -0,0 +1,207 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2025 NVIDIA Corporation.
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <unistd.h>
+#include <signal.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <sched.h>
+#include <pthread.h>
+#include "scx_test.h"
+#include "dequeue.bpf.skel.h"
+
+#define NUM_WORKERS 8
+
+/*
+ * Worker function that creates enqueue/dequeue events. It alternates
+ * between CPU work, sleeping, and affinity changes to trigger dequeues.
+ */
+static void worker_fn(int id)
+{
+	cpu_set_t cpuset;
+	int i;
+	volatile int sum = 0;
+
+	for (i = 0; i < 1000; i++) {
+		int j;
+
+		/* Do some work to trigger scheduling events */
+		for (j = 0; j < 10000; j++)
+			sum += j;
+
+		/* Change affinity to trigger dequeue */
+		if (i % 10 == 0) {
+			CPU_ZERO(&cpuset);
+			/* Rotate through the first 4 CPUs */
+			CPU_SET(i % 4, &cpuset);
+			sched_setaffinity(0, sizeof(cpuset), &cpuset);
+		}
+
+		/* Do additional work */
+		for (j = 0; j < 10000; j++)
+			sum += j;
+
+		/* Sleep to trigger dequeue */
+		usleep(1000 + (id * 100));
+	}
+
+	exit(0);
+}
+
+static enum scx_test_status run_scenario(struct dequeue *skel, u32 scenario,
+					 const char *scenario_name)
+{
+	struct bpf_link *link;
+	pid_t pids[NUM_WORKERS];
+	int i, status;
+	u64 enq_start, deq_start, dispatch_deq_start, change_deq_start;
+	u64 enq_delta, deq_delta, dispatch_deq_delta, change_deq_delta;
+
+	/* Set the test scenario */
+	skel->bss->test_scenario = scenario;
+
+	/* Record starting counts */
+	enq_start = skel->bss->enqueue_cnt;
+	deq_start = skel->bss->dequeue_cnt;
+	dispatch_deq_start = skel->bss->dispatch_dequeue_cnt;
+	change_deq_start = skel->bss->change_dequeue_cnt;
+
+	link = bpf_map__attach_struct_ops(skel->maps.dequeue_ops);
+	SCX_FAIL_IF(!link, "Failed to attach struct_ops for scenario %s", scenario_name);
+
+	/* Fork worker processes to generate enqueue/dequeue events */
+	for (i = 0; i < NUM_WORKERS; i++) {
+		pids[i] = fork();
+		SCX_FAIL_IF(pids[i] < 0, "Failed to fork worker %d", i);
+
+		if (pids[i] == 0) {
+			worker_fn(i);
+			/* Should not reach here */
+			exit(1);
+		}
+	}
+
+	/* Wait for all workers to complete */
+	for (i = 0; i < NUM_WORKERS; i++) {
+		SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i],
+			    "Failed to wait for worker %d", i);
+		SCX_FAIL_IF(status != 0, "Worker %d exited with status %d", i, status);
+	}
+
+	bpf_link__destroy(link);
+
+	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG));
+
+	/* Calculate deltas */
+	enq_delta = skel->bss->enqueue_cnt - enq_start;
+	deq_delta = skel->bss->dequeue_cnt - deq_start;
+	dispatch_deq_delta = skel->bss->dispatch_dequeue_cnt - dispatch_deq_start;
+	change_deq_delta = skel->bss->change_dequeue_cnt - change_deq_start;
+
+	printf("%s:\n", scenario_name);
+	printf("  enqueues: %lu\n", (unsigned long)enq_delta);
+	printf("  dequeues: %lu (dispatch: %lu, property_change: %lu)\n",
+	       (unsigned long)deq_delta,
+	       (unsigned long)dispatch_deq_delta,
+	       (unsigned long)change_deq_delta);
+
+	/*
+	 * Validate enqueue/dequeue lifecycle tracking.
+	 *
+	 * For scenarios 0 and 1 (local and global DSQs), both enqueues and
+	 * dequeues should be 0 because tasks bypass the BPF scheduler
+	 * entirely: tasks never enter BPF scheduler's custody.
+	 *
+	 * For scenario 2 (user DSQ), we expect both enqueues and dequeues.
+	 *
+	 * The BPF code does strict state machine validation with
+	 * scx_bpf_error() to ensure the workflow semantics are correct. If
+	 * we reach this point without errors, the semantics are validated
+	 * correctly.
+	 */
+	if (scenario == 0 || scenario == 1) {
+		/* Terminal DSQs: tasks bypass BPF scheduler completely */
+		SCX_EQ(enq_delta, 0);
+		SCX_EQ(deq_delta, 0);
+		SCX_EQ(dispatch_deq_delta, 0);
+		SCX_EQ(change_deq_delta, 0);
+	} else {
+		/* User DSQ: tasks enter BPF scheduler's custody */
+		SCX_GT(enq_delta, 0);
+		SCX_GT(deq_delta, 0);
+		/* Validate 1:1 enqueue/dequeue pairing */
+		SCX_EQ(enq_delta, deq_delta);
+	}
+
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status setup(void **ctx)
+{
+	struct dequeue *skel;
+
+	skel = dequeue__open();
+	SCX_FAIL_IF(!skel, "Failed to open skel");
+	SCX_ENUM_INIT(skel);
+	SCX_FAIL_IF(dequeue__load(skel), "Failed to load skel");
+
+	*ctx = skel;
+
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+	struct dequeue *skel = ctx;
+	enum scx_test_status status;
+
+	status = run_scenario(skel, 0, "Local DSQ (terminal, direct dispatch)");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 1, "Global DSQ (terminal, SCX_DSQ_GLOBAL)");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 2, "User DSQ (non-terminal, BPF custody)");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	printf("\n=== Summary ===\n");
+	printf("Total enqueues: %lu\n", (unsigned long)skel->bss->enqueue_cnt);
+	printf("Total dequeues: %lu\n", (unsigned long)skel->bss->dequeue_cnt);
+	printf("  Dispatch dequeues: %lu (no flag, normal workflow)\n",
+	       (unsigned long)skel->bss->dispatch_dequeue_cnt);
+	printf("  Property change dequeues: %lu (SCX_DEQ_SCHED_CHANGE flag)\n",
+	       (unsigned long)skel->bss->change_dequeue_cnt);
+	printf("\nAll scenarios passed - no state machine violations detected\n");
+	printf("-> Validated: Local DSQ dispatch bypasses BPF scheduler\n");
+	printf("-> Validated: Global DSQ dispatch bypasses BPF scheduler\n");
+	printf("-> Validated: User DSQ dispatch triggers dequeue callbacks\n");
+	printf("-> Validated: Dispatch dequeues have no flags (normal workflow)\n");
+	printf("-> Validated: Property change dequeues have SCX_DEQ_SCHED_CHANGE flag\n");
+	printf("-> Validated: No duplicate enqueues or invalid state transitions\n");
+
+	return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+	struct dequeue *skel = ctx;
+
+	dequeue__destroy(skel);
+}
+
+struct scx_test dequeue_test = {
+	.name = "dequeue",
+	.description = "Verify ops.dequeue() semantics",
+	.setup = setup,
+	.run = run,
+	.cleanup = cleanup,
+};
+
+REGISTER_SCX_TEST(&dequeue_test)
-- 
2.53.0


^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-02-05 15:32 [PATCHSET v6] sched_ext: Fix " Andrea Righi
@ 2026-02-05 15:32 ` Andrea Righi
  0 siblings, 0 replies; 33+ messages in thread
From: Andrea Righi @ 2026-02-05 15:32 UTC (permalink / raw)
  To: Tejun Heo, David Vernet, Changwoo Min
  Cc: Kuba Piecuch, Emil Tsalapatis, Christian Loehle, Daniel Hodges,
	sched-ext, linux-kernel

Add a new kselftest to validate that the new ops.dequeue() semantics
work correctly for all task lifecycle scenarios, including the
distinction between terminal DSQs (where BPF scheduler is done with the
task) and user DSQs (where BPF scheduler manages the task lifecycle),
regardless of which callback performs the dispatch.

The test validates 6 scenarios:
 - from ops.enqueue():
   - scenario 0 (local DSQ): tasks dispatched to local DSQs bypass BPF
     scheduler entirely, they never enter BPF custody, so no
     ops.dequeue() should be called,
    - scenario 1 (global DSQ): tasks dispatched to SCX_DSQ_GLOBAL also
      bypass BPF scheduler, like local DSQs, no ops.dequeue() should be
      called,
    - scenario 2 (user DSQ): tasks enter BPF scheduler custody with full
      enqueue/dequeue lifecycle tracking and state machine validation
      (expects 1:1 enqueue/dequeue pairing).

 - from ops.select_cpu():
   - scenario 3 (local DSQ): identical behavior to scenario 0,
   - scenario 4 (global DSQ): identical behavior to scenario 1,
   - scenario 5 (user DSQ): identical behavior to scenario 2.

This verifies that:
 - terminal DSQ dispatch (local, global) don't trigger ops.dequeue(),
 - user DSQ dispatch has exact 1:1 ops.enqueue()/dequeue() pairing,
 - dispatch dequeues have no flags (normal workflow),
 - property change dequeues have the %SCX_DEQ_SCHED_CHANGE flag set,
 - no duplicate enqueues or invalid state transitions are happening,
 - ops.enqueue() and ops.select_cpu() dispatch paths behave identically.

Cc: Tejun Heo <tj@kernel.org>
Cc: Emil Tsalapatis <emil@etsalapatis.com>
Cc: Kuba Piecuch <jpiecuch@google.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
 tools/testing/selftests/sched_ext/Makefile    |   1 +
 .../testing/selftests/sched_ext/dequeue.bpf.c | 334 ++++++++++++++++++
 tools/testing/selftests/sched_ext/dequeue.c   | 222 ++++++++++++
 3 files changed, 557 insertions(+)
 create mode 100644 tools/testing/selftests/sched_ext/dequeue.bpf.c
 create mode 100644 tools/testing/selftests/sched_ext/dequeue.c

diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
index 5fe45f9c5f8fd..764e91edabf93 100644
--- a/tools/testing/selftests/sched_ext/Makefile
+++ b/tools/testing/selftests/sched_ext/Makefile
@@ -161,6 +161,7 @@ all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubs
 
 auto-test-targets :=			\
 	create_dsq			\
+	dequeue				\
 	enq_last_no_enq_fails		\
 	ddsp_bogus_dsq_fail		\
 	ddsp_vtimelocal_fail		\
diff --git a/tools/testing/selftests/sched_ext/dequeue.bpf.c b/tools/testing/selftests/sched_ext/dequeue.bpf.c
new file mode 100644
index 0000000000000..9b1950737a014
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/dequeue.bpf.c
@@ -0,0 +1,334 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A scheduler that validates ops.dequeue() is called correctly:
+ * - Tasks dispatched to terminal DSQs (local, global) bypass the BPF
+ *   scheduler entirely: no ops.dequeue() should be called
+ * - Tasks dispatched to user DSQs enter BPF custody: ops.dequeue() must be
+ *   called when they leave custody
+ * - Every ops.enqueue() for non-terminal DSQs is followed by exactly one
+ *   ops.dequeue() (validate 1:1 pairing and state machine)
+ *
+ * Copyright (c) 2026 NVIDIA Corporation.
+ */
+
+#include <scx/common.bpf.h>
+
+#define SHARED_DSQ	0
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+/*
+ * Counters to track the lifecycle of tasks:
+ * - enqueue_cnt: Number of times ops.enqueue() was called
+ * - dequeue_cnt: Number of times ops.dequeue() was called (any type)
+ * - dispatch_dequeue_cnt: Number of regular dispatch dequeues (no flag)
+ * - change_dequeue_cnt: Number of property change dequeues
+ */
+u64 enqueue_cnt, dequeue_cnt, dispatch_dequeue_cnt, change_dequeue_cnt;
+
+/*
+ * Test scenarios:
+ * 0) Dispatch to local DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
+ *    scheduler, no dequeue callbacks)
+ * 1) Dispatch to global DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
+ *    scheduler, no dequeue callbacks)
+ * 2) Dispatch to shared user DSQ from ops.enqueue() (enters BPF scheduler,
+ *    dequeue callbacks expected)
+ * 3) Dispatch to local DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
+ *    scheduler, no dequeue callbacks)
+ * 4) Dispatch to global DSQ from ops.select_cpu() (terminal DSQ, bypasses
+ *    BPF scheduler, no dequeue callbacks)
+ * 5) Dispatch to shared user DSQ from ops.select_cpu() (enters BPF scheduler,
+ *    dequeue callbacks expected)
+ */
+u32 test_scenario;
+
+/*
+ * Per-task state to track lifecycle and validate workflow semantics.
+ * State transitions:
+ *   NONE -> ENQUEUED (on enqueue)
+ *   ENQUEUED -> DISPATCHED (on dispatch dequeue)
+ *   DISPATCHED -> NONE (on property change dequeue or re-enqueue)
+ *   ENQUEUED -> NONE (on property change dequeue before dispatch)
+ */
+enum task_state {
+	TASK_NONE = 0,
+	TASK_ENQUEUED,
+	TASK_DISPATCHED,
+};
+
+struct task_ctx {
+	enum task_state state; /* Current state in the workflow */
+	u64 enqueue_seq;       /* Sequence number for debugging */
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct task_ctx);
+} task_ctx_stor SEC(".maps");
+
+static struct task_ctx *try_lookup_task_ctx(struct task_struct *p)
+{
+	return bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+}
+
+s32 BPF_STRUCT_OPS(dequeue_select_cpu, struct task_struct *p,
+		   s32 prev_cpu, u64 wake_flags)
+{
+	struct task_ctx *tctx;
+
+	tctx = try_lookup_task_ctx(p);
+	if (!tctx)
+		return prev_cpu;
+
+	switch (test_scenario) {
+	case 3:
+		/*
+		 * Scenario 3: Direct dispatch to local DSQ from select_cpu.
+		 *
+		 * Task bypasses BPF scheduler entirely: no enqueue
+		 * tracking, no dequeue callbacks. Behavior should be
+		 * identical to scenario 0.
+		 */
+		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
+		return prev_cpu;
+
+	case 4:
+		/*
+		 * Scenario 4: Direct dispatch to global DSQ from select_cpu.
+		 *
+		 * Like scenario 3, task bypasses BPF scheduler entirely.
+		 * Behavior should be identical to scenario 1.
+		 */
+		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
+		return prev_cpu;
+
+	case 5:
+		/*
+		 * Scenario 5: Dispatch to shared user DSQ from select_cpu.
+		 *
+		 * Task enters BPF scheduler management: track
+		 * enqueue/dequeue lifecycle and validate state transitions.
+		 * Behavior should be identical to scenario 2.
+		 */
+		__sync_fetch_and_add(&enqueue_cnt, 1);
+
+		/*
+		 * Validate state transition: enqueue is only valid from
+		 * NONE or DISPATCHED states. Getting enqueue while in
+		 * ENQUEUED state indicates a missing dequeue.
+		 */
+		if (tctx->state == TASK_ENQUEUED)
+			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
+				      p->pid, p->comm, tctx->enqueue_seq);
+
+		/* Transition to ENQUEUED state */
+		tctx->state = TASK_ENQUEUED;
+		tctx->enqueue_seq++;
+
+		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, 0);
+		return prev_cpu;
+
+	default:
+		/* For scenarios 0-2, bounce to ops.enqueue() */
+		return prev_cpu;
+	}
+}
+
+void BPF_STRUCT_OPS(dequeue_enqueue, struct task_struct *p, u64 enq_flags)
+{
+	struct task_ctx *tctx;
+
+	tctx = try_lookup_task_ctx(p);
+	if (!tctx)
+		return;
+
+	switch (test_scenario) {
+	case 0:
+		/*
+		 * Scenario 0: Direct dispatch to the local DSQ.
+		 *
+		 * Task bypasses BPF scheduler entirely: no enqueue
+		 * tracking, no dequeue callbacks. Don't increment counters
+		 * or validate state since the task never enters BPF
+		 * scheduler management.
+		 */
+		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
+		break;
+
+	case 1:
+		/*
+		 * Scenario 1: Direct dispatch to the global DSQ.
+		 *
+		 * Like scenario 0, task bypasses BPF scheduler entirely.
+		 * SCX_DSQ_GLOBAL is a terminal DSQ, tasks dispatched to it
+		 * leave BPF custody immediately, so no dequeue callbacks
+		 * should be triggered.
+		 */
+		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+		break;
+
+	case 2:
+		/*
+		 * Scenario 2: Dispatch to shared user DSQ.
+		 *
+		 * Task enters BPF scheduler management: track
+		 * enqueue/dequeue lifecycle and validate state
+		 * transitions.
+		 */
+		__sync_fetch_and_add(&enqueue_cnt, 1);
+
+		/*
+		 * Validate state transition: enqueue is only valid from
+		 * NONE or DISPATCHED states. Getting enqueue while in
+		 * ENQUEUED state indicates a missing dequeue.
+		 */
+		if (tctx->state == TASK_ENQUEUED)
+			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
+				      p->pid, p->comm, tctx->enqueue_seq);
+
+		/* Transition to ENQUEUED state */
+		tctx->state = TASK_ENQUEUED;
+		tctx->enqueue_seq++;
+
+		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
+		break;
+	default:
+		/* For scenarios 3-5 dispatch to the global DSQ */
+		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+	}
+}
+
+void BPF_STRUCT_OPS(dequeue_dequeue, struct task_struct *p, u64 deq_flags)
+{
+	struct task_ctx *tctx;
+
+	__sync_fetch_and_add(&dequeue_cnt, 1);
+
+	tctx = try_lookup_task_ctx(p);
+	if (!tctx)
+		return;
+
+	/*
+	 * For scenarios 0, 1, 3, and 4 (terminal DSQs: local and global),
+	 * ops.dequeue() should never be called because tasks bypass the
+	 * BPF scheduler entirely. If we get here, it's a kernel bug. We
+	 * don't track enqueues for these scenarios, so tctx->enqueue_seq
+	 * will be 0.
+	 */
+	if (test_scenario == 0 || test_scenario == 3) {
+		scx_bpf_error("%d (%s): dequeue called for local DSQ scenario - kernel bug!",
+			      p->pid, p->comm);
+		return;
+	}
+	if (test_scenario == 1 || test_scenario == 4) {
+		scx_bpf_error("%d (%s): dequeue called for global DSQ scenario - kernel bug!",
+			      p->pid, p->comm);
+		return;
+	}
+
+	/*
+	 * Validate state: dequeue should only happen from ENQUEUED or
+	 * DISPATCHED states. Getting dequeue from NONE indicates a bug.
+	 */
+	if (tctx->state == TASK_NONE) {
+		scx_bpf_error("%d (%s): dequeue from NONE state seq=%llu",
+			      p->pid, p->comm, tctx->enqueue_seq);
+		return;
+	}
+
+	if (deq_flags & SCX_DEQ_SCHED_CHANGE) {
+		/*
+		 * Property change interrupting the workflow. Valid from
+		 * both ENQUEUED and DISPATCHED states. Transitions task
+		 * back to NONE state.
+		 */
+		__sync_fetch_and_add(&change_dequeue_cnt, 1);
+
+		/* Validate state transition */
+		if (tctx->state != TASK_ENQUEUED && tctx->state != TASK_DISPATCHED)
+			scx_bpf_error("%d (%s): invalid property change dequeue state=%d seq=%llu",
+				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);
+
+		/* Transition back to NONE: task outside scheduler control */
+		tctx->state = TASK_NONE;
+	} else {
+		/*
+		 * Regular dispatch dequeue: normal workflow step. Valid
+		 * only from ENQUEUED state (after enqueue, before dispatch
+		 * dequeue). Transitions to DISPATCHED state.
+		 */
+		__sync_fetch_and_add(&dispatch_dequeue_cnt, 1);
+
+		/*
+		 * Dispatch dequeue should not have %SCX_DEQ_SCHED_CHANGE
+		 * flag.
+		 */
+		if (deq_flags & SCX_DEQ_SCHED_CHANGE)
+			scx_bpf_error("%d (%s): SCX_DEQ_SCHED_CHANGE in dispatch dequeue seq=%llu",
+				      p->pid, p->comm, tctx->enqueue_seq);
+
+		/*
+		 * Must be in ENQUEUED state.
+		 */
+		if (tctx->state != TASK_ENQUEUED)
+			scx_bpf_error("%d (%s): dispatch dequeue from state %d seq=%llu",
+				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);
+
+		/*
+		 * Transition to DISPATCHED: normal cycle completed
+		 * dispatch.
+		 */
+		tctx->state = TASK_DISPATCHED;
+	}
+}
+
+void BPF_STRUCT_OPS(dequeue_dispatch, s32 cpu, struct task_struct *prev)
+{
+	scx_bpf_dsq_move_to_local(SHARED_DSQ);
+}
+
+s32 BPF_STRUCT_OPS(dequeue_init_task, struct task_struct *p,
+		   struct scx_init_task_args *args)
+{
+	struct task_ctx *tctx;
+
+	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0,
+				   BPF_LOCAL_STORAGE_GET_F_CREATE);
+	if (!tctx)
+		return -ENOMEM;
+
+	return 0;
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(dequeue_init)
+{
+	s32 ret;
+
+	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+void BPF_STRUCT_OPS(dequeue_exit, struct scx_exit_info *ei)
+{
+	UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops dequeue_ops = {
+	.select_cpu		= (void *)dequeue_select_cpu,
+	.enqueue		= (void *)dequeue_enqueue,
+	.dequeue		= (void *)dequeue_dequeue,
+	.dispatch		= (void *)dequeue_dispatch,
+	.init_task		= (void *)dequeue_init_task,
+	.init			= (void *)dequeue_init,
+	.exit			= (void *)dequeue_exit,
+	.name			= "dequeue_test",
+};
diff --git a/tools/testing/selftests/sched_ext/dequeue.c b/tools/testing/selftests/sched_ext/dequeue.c
new file mode 100644
index 0000000000000..02655b9525ffe
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/dequeue.c
@@ -0,0 +1,222 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2025 NVIDIA Corporation.
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <unistd.h>
+#include <signal.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <sched.h>
+#include <pthread.h>
+#include "scx_test.h"
+#include "dequeue.bpf.skel.h"
+
+#define NUM_WORKERS 8
+
+/*
+ * Worker function that creates enqueue/dequeue events. It alternates
+ * between CPU work, sleeping, and affinity changes to trigger dequeues.
+ */
+static void worker_fn(int id)
+{
+	cpu_set_t cpuset;
+	int i;
+	volatile int sum = 0;
+
+	for (i = 0; i < 1000; i++) {
+		int j;
+
+		/* Do some work to trigger scheduling events */
+		for (j = 0; j < 10000; j++)
+			sum += j;
+
+		/* Change affinity to trigger dequeue */
+		if (i % 10 == 0) {
+			CPU_ZERO(&cpuset);
+			/* Rotate through the first 4 CPUs */
+			CPU_SET(i % 4, &cpuset);
+			sched_setaffinity(0, sizeof(cpuset), &cpuset);
+		}
+
+		/* Do additional work */
+		for (j = 0; j < 10000; j++)
+			sum += j;
+
+		/* Sleep to trigger dequeue */
+		usleep(1000 + (id * 100));
+	}
+
+	exit(0);
+}
+
+static enum scx_test_status run_scenario(struct dequeue *skel, u32 scenario,
+					 const char *scenario_name)
+{
+	struct bpf_link *link;
+	pid_t pids[NUM_WORKERS];
+	int i, status;
+	u64 enq_start, deq_start, dispatch_deq_start, change_deq_start;
+	u64 enq_delta, deq_delta, dispatch_deq_delta, change_deq_delta;
+
+	/* Set the test scenario */
+	skel->bss->test_scenario = scenario;
+
+	/* Record starting counts */
+	enq_start = skel->bss->enqueue_cnt;
+	deq_start = skel->bss->dequeue_cnt;
+	dispatch_deq_start = skel->bss->dispatch_dequeue_cnt;
+	change_deq_start = skel->bss->change_dequeue_cnt;
+
+	link = bpf_map__attach_struct_ops(skel->maps.dequeue_ops);
+	SCX_FAIL_IF(!link, "Failed to attach struct_ops for scenario %s", scenario_name);
+
+	/* Fork worker processes to generate enqueue/dequeue events */
+	for (i = 0; i < NUM_WORKERS; i++) {
+		pids[i] = fork();
+		SCX_FAIL_IF(pids[i] < 0, "Failed to fork worker %d", i);
+
+		if (pids[i] == 0) {
+			worker_fn(i);
+			/* Should not reach here */
+			exit(1);
+		}
+	}
+
+	/* Wait for all workers to complete */
+	for (i = 0; i < NUM_WORKERS; i++) {
+		SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i],
+			    "Failed to wait for worker %d", i);
+		SCX_FAIL_IF(status != 0, "Worker %d exited with status %d", i, status);
+	}
+
+	bpf_link__destroy(link);
+
+	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG));
+
+	/* Calculate deltas */
+	enq_delta = skel->bss->enqueue_cnt - enq_start;
+	deq_delta = skel->bss->dequeue_cnt - deq_start;
+	dispatch_deq_delta = skel->bss->dispatch_dequeue_cnt - dispatch_deq_start;
+	change_deq_delta = skel->bss->change_dequeue_cnt - change_deq_start;
+
+	printf("%s:\n", scenario_name);
+	printf("  enqueues: %lu\n", (unsigned long)enq_delta);
+	printf("  dequeues: %lu (dispatch: %lu, property_change: %lu)\n",
+	       (unsigned long)deq_delta,
+	       (unsigned long)dispatch_deq_delta,
+	       (unsigned long)change_deq_delta);
+
+	/*
+	 * Validate enqueue/dequeue lifecycle tracking.
+	 *
+	 * For scenarios 0, 1, 3, 4 (local and global DSQs from both
+	 * ops.enqueue() and ops.select_cpu()), both enqueues and dequeues
+	 * should be 0 because tasks bypass the BPF scheduler entirely: tasks
+	 * never enter BPF scheduler's custody.
+	 *
+	 * For scenarios 2 and 5 (user DSQ from both ops.enqueue() and
+	 * ops.select_cpu()), we expect both enqueues and dequeues.
+	 *
+	 * The BPF code does strict state machine validation with
+	 * scx_bpf_error() to ensure the workflow semantics are correct. If
+	 * we reach this point without errors, the semantics are validated
+	 * correctly.
+	 */
+	if (scenario == 0 || scenario == 1 || scenario == 3 || scenario == 4) {
+		/* Terminal DSQs: tasks bypass BPF scheduler completely */
+		SCX_EQ(enq_delta, 0);
+		SCX_EQ(deq_delta, 0);
+		SCX_EQ(dispatch_deq_delta, 0);
+		SCX_EQ(change_deq_delta, 0);
+	} else {
+		/* User DSQ: tasks enter BPF scheduler's custody */
+		SCX_GT(enq_delta, 0);
+		SCX_GT(deq_delta, 0);
+		/* Validate 1:1 enqueue/dequeue pairing */
+		SCX_EQ(enq_delta, deq_delta);
+	}
+
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status setup(void **ctx)
+{
+	struct dequeue *skel;
+
+	skel = dequeue__open();
+	SCX_FAIL_IF(!skel, "Failed to open skel");
+	SCX_ENUM_INIT(skel);
+	SCX_FAIL_IF(dequeue__load(skel), "Failed to load skel");
+
+	*ctx = skel;
+
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+	struct dequeue *skel = ctx;
+	enum scx_test_status status;
+
+	status = run_scenario(skel, 0, "Scenario 0: Local DSQ from ops.enqueue()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 1, "Scenario 1: Global DSQ from ops.enqueue()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 2, "Scenario 2: User DSQ from ops.enqueue()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 3, "Scenario 3: Local DSQ from ops.select_cpu()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 4, "Scenario 4: Global DSQ from ops.select_cpu()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 5, "Scenario 5: User DSQ from ops.select_cpu()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	printf("\n=== Summary ===\n");
+	printf("Total enqueues: %lu\n", (unsigned long)skel->bss->enqueue_cnt);
+	printf("Total dequeues: %lu\n", (unsigned long)skel->bss->dequeue_cnt);
+	printf("  Dispatch dequeues: %lu (no flag, normal workflow)\n",
+	       (unsigned long)skel->bss->dispatch_dequeue_cnt);
+	printf("  Property change dequeues: %lu (SCX_DEQ_SCHED_CHANGE flag)\n",
+	       (unsigned long)skel->bss->change_dequeue_cnt);
+	printf("\nAll scenarios passed - no state machine violations detected\n");
+	printf("-> Validated: Local DSQ dispatch bypasses BPF scheduler (both paths)\n");
+	printf("-> Validated: Global DSQ dispatch bypasses BPF scheduler (both paths)\n");
+	printf("-> Validated: User DSQ dispatch triggers dequeue callbacks (both paths)\n");
+	printf("-> Validated: ops.enqueue() and ops.select_cpu() behave identically\n");
+	printf("-> Validated: Dispatch dequeues have no flags (normal workflow)\n");
+	printf("-> Validated: Property change dequeues have SCX_DEQ_SCHED_CHANGE flag\n");
+	printf("-> Validated: No duplicate enqueues or invalid state transitions\n");
+
+	return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+	struct dequeue *skel = ctx;
+
+	dequeue__destroy(skel);
+}
+
+struct scx_test dequeue_test = {
+	.name = "dequeue",
+	.description = "Verify ops.dequeue() semantics",
+	.setup = setup,
+	.run = run,
+	.cleanup = cleanup,
+};
+
+REGISTER_SCX_TEST(&dequeue_test)
-- 
2.53.0


^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCHSET v7] sched_ext: Fix ops.dequeue() semantics
@ 2026-02-06 13:54 Andrea Righi
  2026-02-06 13:54 ` [PATCH 1/2] " Andrea Righi
  2026-02-06 13:54 ` [PATCH 2/2] selftests/sched_ext: Add test to validate " Andrea Righi
  0 siblings, 2 replies; 33+ messages in thread
From: Andrea Righi @ 2026-02-06 13:54 UTC (permalink / raw)
  To: Tejun Heo, David Vernet, Changwoo Min
  Cc: Kuba Piecuch, Emil Tsalapatis, Christian Loehle, Daniel Hodges,
	sched-ext, linux-kernel

The callback ops.dequeue() is provided to let BPF schedulers observe when a
task leaves the scheduler, either because it is dispatched or due to a task
property change. However, this callback is currently unreliable and not
invoked systematically, which can result in missed ops.dequeue() events.

In particular, once a task is removed from the scheduler (whether for
dispatch or due to a property change) the BPF scheduler loses visibility of
the task and the sched_ext core may not always trigger ops.dequeue().

This breaks accurate accounting (i.e., per-DSQ queued runtime sums) and
prevents reliable tracking of task lifecycle transitions.

This patch set fixes the semantics of ops.dequeue(), by guaranteeing that
each task entering the BPF scheduler's custody triggers exactly one
ops.dequeue() call when it leaves that custody, whether the exit is due to
a dispatch (regular or via a core scheduling pick) or to a scheduling
property change (e.g.  sched_setaffinity(), sched_setscheduler(),
set_user_nice(), NUMA balancing, etc.).

To identify property change dequeues a new ops.dequeue() flag is
introduced: %SCX_DEQ_SCHED_CHANGE.

Together, these changes allow BPF schedulers to reliably track task
ownership and maintain accurate accounting.

Changes in v7:
 - Handle tasks stored to BPF internal data structures (trigger
   ops.dequeue())
 - Add a new kselftest scenario to verify ops.dequeue() behavior with tasks
   stored to internal BPF data structures
 - Link to v6:
   https://lore.kernel.org/all/20260205153304.1996142-1-arighi@nvidia.com

Changes in v6:
 - Rename SCX_TASK_OPS_ENQUEUED -> SCX_TASK_NEED_DSQ
 - Use SCX_DSQ_FLAG_BUILTIN in is_terminal_dsq() to check for all builtin
   DSQs (local, global, bypass)
 - centralize ops.dequeue() logic in dispatch_enqueue()
 - Remove "Property Change Notifications for Running Tasks" section from
   the documentation
 - The kselftest now validates the right behavior both from ops.enqueue()
   and ops.select_cpu()
 - Link to v5: https://lore.kernel.org/all/20260204160710.1475802-1-arighi@nvidia.com

Changes in v5:
 - Introduce the concept of "terminal DSQ" (when a task is dispatched to a
   terminal DSQ, the task leaves the BPF scheduler's custody)
 - Consider SCX_DSQ_GLOBAL as a terminal DSQ
 - Link to v4: https://lore.kernel.org/all/20260201091318.178710-1-arighi@nvidia.com

Changes in v4:
 - Introduce the concept of "BPF scheduler custody"
 - Do not trigger ops.dequeue() for direct dispatches to local DSQs
 - Trigger ops.dequeue() only once; after the task leaves BPF scheduler
   custody, further dequeue events are not reported.
 - Link to v3: https://lore.kernel.org/all/20260126084258.3798129-1-arighi@nvidia.com

Changes in v3:
 - Rename SCX_DEQ_ASYNC to SCX_DEQ_SCHED_CHANGE
 - Handle core-sched dequeues (Kuba)
 - Link to v2: https://lore.kernel.org/all/20260121123118.964704-1-arighi@nvidia.com

Changes in v2:
 - Distinguish between "dispatch" dequeues and "property change" dequeues
   (flag SCX_DEQ_ASYNC)
 - Link to v1: https://lore.kernel.org/all/20251219224450.2537941-1-arighi@nvidia.com

Andrea Righi (2):
      sched_ext: Fix ops.dequeue() semantics
      selftests/sched_ext: Add test to validate ops.dequeue() semantics

 Documentation/scheduler/sched-ext.rst           |  58 ++++
 include/linux/sched/ext.h                       |   1 +
 kernel/sched/ext.c                              | 157 ++++++++-
 kernel/sched/ext_internal.h                     |   7 +
 tools/sched_ext/include/scx/enum_defs.autogen.h |   1 +
 tools/sched_ext/include/scx/enums.autogen.bpf.h |   2 +
 tools/sched_ext/include/scx/enums.autogen.h     |   1 +
 tools/testing/selftests/sched_ext/Makefile      |   1 +
 tools/testing/selftests/sched_ext/dequeue.bpf.c | 403 ++++++++++++++++++++++++
 tools/testing/selftests/sched_ext/dequeue.c     | 258 +++++++++++++++
 10 files changed, 875 insertions(+), 14 deletions(-)
 create mode 100644 tools/testing/selftests/sched_ext/dequeue.bpf.c
 create mode 100644 tools/testing/selftests/sched_ext/dequeue.c

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 1/2] sched_ext: Fix ops.dequeue() semantics
  2026-02-06 13:54 [PATCHSET v7] sched_ext: Fix ops.dequeue() semantics Andrea Righi
@ 2026-02-06 13:54 ` Andrea Righi
  2026-02-06 20:35   ` Emil Tsalapatis
  2026-02-06 13:54 ` [PATCH 2/2] selftests/sched_ext: Add test to validate " Andrea Righi
  1 sibling, 1 reply; 33+ messages in thread
From: Andrea Righi @ 2026-02-06 13:54 UTC (permalink / raw)
  To: Tejun Heo, David Vernet, Changwoo Min
  Cc: Kuba Piecuch, Emil Tsalapatis, Christian Loehle, Daniel Hodges,
	sched-ext, linux-kernel

Currently, ops.dequeue() is only invoked when the sched_ext core knows
that a task resides in BPF-managed data structures, which causes it to
miss scheduling property change events. In addition, ops.dequeue()
callbacks are completely skipped when tasks are dispatched to non-local
DSQs from ops.select_cpu(). As a result, BPF schedulers cannot reliably
track task state.

Fix this by guaranteeing that each task entering the BPF scheduler's
custody triggers exactly one ops.dequeue() call when it leaves that
custody, whether the exit is due to a dispatch (regular or via a core
scheduling pick) or to a scheduling property change (e.g.
sched_setaffinity(), sched_setscheduler(), set_user_nice(), NUMA
balancing, etc.).

BPF scheduler custody concept: a task is considered to be in the BPF
scheduler's custody when the scheduler is responsible for managing its
lifecycle. This includes tasks dispatched to user-created DSQs or stored
in the BPF scheduler's internal data structures. Custody ends when the
task is dispatched to a terminal DSQ (such as the local DSQ or
%SCX_DSQ_GLOBAL), selected by core scheduling, or removed due to a
property change.

Tasks directly dispatched to terminal DSQs bypass the BPF scheduler
entirely and are never in its custody. Terminal DSQs include:
 - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON): per-CPU queues
   where tasks go directly to execution.
 - Global DSQ (%SCX_DSQ_GLOBAL): the built-in fallback queue where the
   BPF scheduler is considered "done" with the task.

As a result, ops.dequeue() is not invoked for tasks directly dispatched
to terminal DSQs.

To identify dequeues triggered by scheduling property changes, introduce
the new ops.dequeue() flag %SCX_DEQ_SCHED_CHANGE: when this flag is set,
the dequeue was caused by a scheduling property change.

New ops.dequeue() semantics:
 - ops.dequeue() is invoked exactly once when the task leaves the BPF
   scheduler's custody, in one of the following cases:
   a) regular dispatch: a task dispatched to a user DSQ or stored in
      internal BPF data structures is moved to a terminal DSQ
      (ops.dequeue() called without any special flags set),
   b) core scheduling dispatch: core-sched picks task before dispatch
      (ops.dequeue() called with %SCX_DEQ_CORE_SCHED_EXEC flag set),
   c) property change: task properties modified before dispatch,
      (ops.dequeue() called with %SCX_DEQ_SCHED_CHANGE flag set).

This allows BPF schedulers to:
 - reliably track task ownership and lifecycle,
 - maintain accurate accounting of managed tasks,
 - update internal state when tasks change properties.

Cc: Tejun Heo <tj@kernel.org>
Cc: Emil Tsalapatis <emil@etsalapatis.com>
Cc: Kuba Piecuch <jpiecuch@google.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
 Documentation/scheduler/sched-ext.rst         |  58 +++++++
 include/linux/sched/ext.h                     |   1 +
 kernel/sched/ext.c                            | 157 ++++++++++++++++--
 kernel/sched/ext_internal.h                   |   7 +
 .../sched_ext/include/scx/enum_defs.autogen.h |   1 +
 .../sched_ext/include/scx/enums.autogen.bpf.h |   2 +
 tools/sched_ext/include/scx/enums.autogen.h   |   1 +
 7 files changed, 213 insertions(+), 14 deletions(-)

diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst
index 404fe6126a769..fe8c59b0c1477 100644
--- a/Documentation/scheduler/sched-ext.rst
+++ b/Documentation/scheduler/sched-ext.rst
@@ -252,6 +252,62 @@ The following briefly shows how a waking task is scheduled and executed.
 
    * Queue the task on the BPF side.
 
+   **Task State Tracking and ops.dequeue() Semantics**
+
+   A task is in the "BPF scheduler's custody" when the BPF scheduler is
+   responsible for managing its lifecycle. That includes tasks dispatched
+   to user-created DSQs or stored in the BPF scheduler's internal data
+   structures. Once ``ops.select_cpu()`` or ``ops.enqueue()`` is called,
+   the task may or may not enter custody depending on what the scheduler
+   does:
+
+   * **Directly dispatched to terminal DSQs** (``SCX_DSQ_LOCAL``,
+     ``SCX_DSQ_LOCAL_ON | cpu``, or ``SCX_DSQ_GLOBAL``): The BPF scheduler
+     is done with the task - it either goes straight to a CPU's local run
+     queue or to the global DSQ as a fallback. The task never enters (or
+     exits) BPF custody, and ``ops.dequeue()`` will not be called.
+
+   * **Dispatch to user-created DSQs** (custom DSQs): the task enters the
+     BPF scheduler's custody. When the task later leaves BPF custody
+     (dispatched to a terminal DSQ, picked by core-sched, or dequeued for
+     sleep/property changes), ``ops.dequeue()`` will be called exactly once.
+
+   * **Queued on BPF side** (e.g., internal queues, no DSQ): The task is in
+     BPF custody. ``ops.dequeue()`` will be called when it leaves (e.g.
+     when ``ops.dispatch()`` moves it to a terminal DSQ, or on property
+     change / sleep).
+
+   **NOTE**: this concept is valid also with the ``ops.select_cpu()``
+   direct dispatch optimization. Even though it skips ``ops.enqueue()``
+   invocation, if the task is dispatched to a user-created DSQ or internal
+   BPF structure, it enters BPF custody and will get ``ops.dequeue()`` when
+   it leaves. If dispatched to a terminal DSQ, the BPF scheduler is done
+   with it immediately. This provides the performance benefit of avoiding
+   the ``ops.enqueue()`` roundtrip while maintaining correct state
+   tracking.
+
+   The dequeue can happen for different reasons, distinguished by flags:
+
+   1. **Regular dispatch**: when a task in BPF custody is dispatched to a
+      terminal DSQ from ``ops.dispatch()`` (leaving BPF custody for
+      execution), ``ops.dequeue()`` is triggered without any special flags.
+
+   2. **Core scheduling pick**: when ``CONFIG_SCHED_CORE`` is enabled and
+      core scheduling picks a task for execution while it's still in BPF
+      custody, ``ops.dequeue()`` is called with the
+      ``SCX_DEQ_CORE_SCHED_EXEC`` flag.
+
+   3. **Scheduling property change**: when a task property changes (via
+      operations like ``sched_setaffinity()``, ``sched_setscheduler()``,
+      priority changes, CPU migrations, etc.) while the task is still in
+      BPF custody, ``ops.dequeue()`` is called with the
+      ``SCX_DEQ_SCHED_CHANGE`` flag set in ``deq_flags``.
+
+   **Important**: Once a task has left BPF custody (e.g. after being
+   dispatched to a terminal DSQ), property changes will not trigger
+   ``ops.dequeue()``, since the task is no longer being managed by the BPF
+   scheduler.
+
 3. When a CPU is ready to schedule, it first looks at its local DSQ. If
    empty, it then looks at the global DSQ. If there still isn't a task to
    run, ``ops.dispatch()`` is invoked which can use the following two
@@ -319,6 +375,8 @@ by a sched_ext scheduler:
                 /* Any usable CPU becomes available */
 
                 ops.dispatch(); /* Task is moved to a local DSQ */
+
+                ops.dequeue(); /* Exiting BPF scheduler */
             }
             ops.running();      /* Task starts running on its assigned CPU */
             while (task->scx.slice > 0 && task is runnable)
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index bcb962d5ee7d8..c48f818eee9b8 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -84,6 +84,7 @@ struct scx_dispatch_q {
 /* scx_entity.flags */
 enum scx_ent_flags {
 	SCX_TASK_QUEUED		= 1 << 0, /* on ext runqueue */
+	SCX_TASK_NEED_DEQ	= 1 << 1, /* in BPF custody, needs ops.dequeue() when leaving */
 	SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
 	SCX_TASK_DEQD_FOR_SLEEP	= 1 << 3, /* last dequeue was for SLEEP */
 
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 0bb8fa927e9e9..d17fd9141adf4 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -925,6 +925,27 @@ static void touch_core_sched(struct rq *rq, struct task_struct *p)
 #endif
 }
 
+/**
+ * is_terminal_dsq - Check if a DSQ is terminal for ops.dequeue() purposes
+ * @dsq_id: DSQ ID to check
+ *
+ * Returns true if @dsq_id is a terminal/builtin DSQ where the BPF
+ * scheduler is considered "done" with the task.
+ *
+ * Builtin DSQs include:
+ *  - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON): per-CPU queues
+ *    where tasks go directly to execution,
+ *  - Global DSQ (%SCX_DSQ_GLOBAL): built-in fallback queue,
+ *  - Bypass DSQ: used during bypass mode.
+ *
+ * Tasks dispatched to builtin DSQs exit BPF scheduler custody and do not
+ * trigger ops.dequeue() when they are later consumed.
+ */
+static inline bool is_terminal_dsq(u64 dsq_id)
+{
+	return dsq_id & SCX_DSQ_FLAG_BUILTIN;
+}
+
 /**
  * touch_core_sched_dispatch - Update core-sched timestamp on dispatch
  * @rq: rq to read clock from, must be locked
@@ -1008,7 +1029,8 @@ static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p
 		resched_curr(rq);
 }
 
-static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
+static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq,
+			     struct scx_dispatch_q *dsq,
 			     struct task_struct *p, u64 enq_flags)
 {
 	bool is_local = dsq->id == SCX_DSQ_LOCAL;
@@ -1103,6 +1125,27 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
 	dsq_mod_nr(dsq, 1);
 	p->scx.dsq = dsq;
 
+	/*
+	 * Handle ops.dequeue() and custody tracking.
+	 *
+	 * Builtin DSQs (local, global, bypass) are terminal: the BPF
+	 * scheduler is done with the task. If it was in BPF custody, call
+	 * ops.dequeue() and clear the flag.
+	 *
+	 * User DSQs: Task is in BPF scheduler's custody. Set the flag so
+	 * ops.dequeue() will be called when it leaves.
+	 */
+	if (SCX_HAS_OP(sch, dequeue)) {
+		if (is_terminal_dsq(dsq->id)) {
+			if (p->scx.flags & SCX_TASK_NEED_DEQ)
+				SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue,
+						 rq, p, 0);
+			p->scx.flags &= ~SCX_TASK_NEED_DEQ;
+		} else {
+			p->scx.flags |= SCX_TASK_NEED_DEQ;
+		}
+	}
+
 	/*
 	 * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the
 	 * direct dispatch path, but we clear them here because the direct
@@ -1323,7 +1366,7 @@ static void direct_dispatch(struct scx_sched *sch, struct task_struct *p,
 		return;
 	}
 
-	dispatch_enqueue(sch, dsq, p,
+	dispatch_enqueue(sch, rq, dsq, p,
 			 p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
 }
 
@@ -1407,13 +1450,22 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 	 * dequeue may be waiting. The store_release matches their load_acquire.
 	 */
 	atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq);
+
+	/*
+	 * Task is now in BPF scheduler's custody (queued on BPF internal
+	 * structures). Set %SCX_TASK_NEED_DEQ so ops.dequeue() is called
+	 * when it leaves custody (e.g. dispatched to a terminal DSQ or on
+	 * property change).
+	 */
+	if (SCX_HAS_OP(sch, dequeue))
+		p->scx.flags |= SCX_TASK_NEED_DEQ;
 	return;
 
 direct:
 	direct_dispatch(sch, p, enq_flags);
 	return;
 local_norefill:
-	dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags);
+	dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, enq_flags);
 	return;
 local:
 	dsq = &rq->scx.local_dsq;
@@ -1433,7 +1485,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 	 */
 	touch_core_sched(rq, p);
 	refill_task_slice_dfl(sch, p);
-	dispatch_enqueue(sch, dsq, p, enq_flags);
+	dispatch_enqueue(sch, rq, dsq, p, enq_flags);
 }
 
 static bool task_runnable(const struct task_struct *p)
@@ -1511,6 +1563,22 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
 		__scx_add_event(sch, SCX_EV_SELECT_CPU_FALLBACK, 1);
 }
 
+/*
+ * Call ops.dequeue() for a task leaving BPF custody. Adds %SCX_DEQ_SCHED_CHANGE
+ * when the dequeue is due to a property change (not sleep or core-sched pick).
+ */
+static void call_task_dequeue(struct scx_sched *sch, struct rq *rq,
+			      struct task_struct *p, u64 deq_flags)
+{
+	u64 flags = deq_flags;
+
+	if (!(deq_flags & (DEQUEUE_SLEEP | SCX_DEQ_CORE_SCHED_EXEC)))
+		flags |= SCX_DEQ_SCHED_CHANGE;
+
+	SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq, p, flags);
+	p->scx.flags &= ~SCX_TASK_NEED_DEQ;
+}
+
 static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
 {
 	struct scx_sched *sch = scx_root;
@@ -1524,6 +1592,24 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
 
 	switch (opss & SCX_OPSS_STATE_MASK) {
 	case SCX_OPSS_NONE:
+		/*
+		 * Task is not in BPF data structures (either dispatched to
+		 * a DSQ or running). Only call ops.dequeue() if the task
+		 * is still in BPF scheduler's custody (%SCX_TASK_NEED_DEQ
+		 * is set).
+		 *
+		 * If the task has already been dispatched to a terminal
+		 * DSQ (local DSQ or %SCX_DSQ_GLOBAL), it has left the BPF
+		 * scheduler's custody and the flag will be clear, so we
+		 * skip ops.dequeue().
+		 *
+		 * If this is a property change (not sleep/core-sched) and
+		 * the task is still in BPF custody, set the
+		 * %SCX_DEQ_SCHED_CHANGE flag.
+		 */
+		if (SCX_HAS_OP(sch, dequeue) &&
+		    (p->scx.flags & SCX_TASK_NEED_DEQ))
+			call_task_dequeue(sch, rq, p, deq_flags);
 		break;
 	case SCX_OPSS_QUEUEING:
 		/*
@@ -1532,9 +1618,14 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
 		 */
 		BUG();
 	case SCX_OPSS_QUEUED:
-		if (SCX_HAS_OP(sch, dequeue))
-			SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
-					 p, deq_flags);
+		/*
+		 * Task is still on the BPF scheduler (not dispatched yet).
+		 * Call ops.dequeue() to notify it is leaving BPF custody.
+		 */
+		if (SCX_HAS_OP(sch, dequeue)) {
+			WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_NEED_DEQ));
+			call_task_dequeue(sch, rq, p, deq_flags);
+		}
 
 		if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
 					    SCX_OPSS_NONE))
@@ -1631,6 +1722,7 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
 					 struct scx_dispatch_q *src_dsq,
 					 struct rq *dst_rq)
 {
+	struct scx_sched *sch = scx_root;
 	struct scx_dispatch_q *dst_dsq = &dst_rq->scx.local_dsq;
 
 	/* @dsq is locked and @p is on @dst_rq */
@@ -1639,6 +1731,15 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
 
 	WARN_ON_ONCE(p->scx.holding_cpu >= 0);
 
+	/*
+	 * Task is moving from a non-local DSQ to a local (terminal) DSQ.
+	 * Call ops.dequeue() if the task was in BPF custody.
+	 */
+	if (SCX_HAS_OP(sch, dequeue) && (p->scx.flags & SCX_TASK_NEED_DEQ)) {
+		SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, dst_rq, p, 0);
+		p->scx.flags &= ~SCX_TASK_NEED_DEQ;
+	}
+
 	if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
 		list_add(&p->scx.dsq_list.node, &dst_dsq->list);
 	else
@@ -1879,7 +1980,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch,
 		dispatch_dequeue_locked(p, src_dsq);
 		raw_spin_unlock(&src_dsq->lock);
 
-		dispatch_enqueue(sch, dst_dsq, p, enq_flags);
+		dispatch_enqueue(sch, dst_rq, dst_dsq, p, enq_flags);
 	}
 
 	return dst_rq;
@@ -1969,14 +2070,14 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
 	 * If dispatching to @rq that @p is already on, no lock dancing needed.
 	 */
 	if (rq == src_rq && rq == dst_rq) {
-		dispatch_enqueue(sch, dst_dsq, p,
+		dispatch_enqueue(sch, rq, dst_dsq, p,
 				 enq_flags | SCX_ENQ_CLEAR_OPSS);
 		return;
 	}
 
 	if (src_rq != dst_rq &&
 	    unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
-		dispatch_enqueue(sch, find_global_dsq(sch, p), p,
+		dispatch_enqueue(sch, rq, find_global_dsq(sch, p), p,
 				 enq_flags | SCX_ENQ_CLEAR_OPSS);
 		return;
 	}
@@ -2014,9 +2115,21 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
 		 */
 		if (src_rq == dst_rq) {
 			p->scx.holding_cpu = -1;
-			dispatch_enqueue(sch, &dst_rq->scx.local_dsq, p,
+			dispatch_enqueue(sch, dst_rq, &dst_rq->scx.local_dsq, p,
 					 enq_flags);
 		} else {
+			/*
+			 * Moving to a remote local DSQ. dispatch_enqueue() is
+			 * not used (we go through deactivate/activate), so
+			 * call ops.dequeue() here if the task was in BPF
+			 * custody.
+			 */
+			if (SCX_HAS_OP(sch, dequeue) &&
+			    (p->scx.flags & SCX_TASK_NEED_DEQ)) {
+				SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue,
+						 src_rq, p, 0);
+				p->scx.flags &= ~SCX_TASK_NEED_DEQ;
+			}
 			move_remote_task_to_local_dsq(p, enq_flags,
 						      src_rq, dst_rq);
 			/* task has been moved to dst_rq, which is now locked */
@@ -2113,7 +2226,7 @@ static void finish_dispatch(struct scx_sched *sch, struct rq *rq,
 	if (dsq->id == SCX_DSQ_LOCAL)
 		dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags);
 	else
-		dispatch_enqueue(sch, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
+		dispatch_enqueue(sch, rq, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
 }
 
 static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq)
@@ -2414,7 +2527,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
 		 * DSQ.
 		 */
 		if (p->scx.slice && !scx_rq_bypassing(rq)) {
-			dispatch_enqueue(sch, &rq->scx.local_dsq, p,
+			dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p,
 					 SCX_ENQ_HEAD);
 			goto switch_class;
 		}
@@ -2898,6 +3011,14 @@ static void scx_enable_task(struct task_struct *p)
 
 	lockdep_assert_rq_held(rq);
 
+	/*
+	 * Verify the task is not in BPF scheduler's custody. If flag
+	 * transitions are consistent, the flag should always be clear
+	 * here.
+	 */
+	if (SCX_HAS_OP(sch, dequeue))
+		WARN_ON_ONCE(p->scx.flags & SCX_TASK_NEED_DEQ);
+
 	/*
 	 * Set the weight before calling ops.enable() so that the scheduler
 	 * doesn't see a stale value if they inspect the task struct.
@@ -2929,6 +3050,14 @@ static void scx_disable_task(struct task_struct *p)
 	if (SCX_HAS_OP(sch, disable))
 		SCX_CALL_OP_TASK(sch, SCX_KF_REST, disable, rq, p);
 	scx_set_task_state(p, SCX_TASK_READY);
+
+	/*
+	 * Verify the task is not in BPF scheduler's custody. If flag
+	 * transitions are consistent, the flag should always be clear
+	 * here.
+	 */
+	if (SCX_HAS_OP(sch, dequeue))
+		WARN_ON_ONCE(p->scx.flags & SCX_TASK_NEED_DEQ);
 }
 
 static void scx_exit_task(struct task_struct *p)
@@ -3919,7 +4048,7 @@ static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
 		 * between bypass DSQs.
 		 */
 		dispatch_dequeue_locked(p, donor_dsq);
-		dispatch_enqueue(sch, donee_dsq, p, SCX_ENQ_NESTED);
+		dispatch_enqueue(sch, donee_rq, donee_dsq, p, SCX_ENQ_NESTED);
 
 		/*
 		 * $donee might have been idle and need to be woken up. No need
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 386c677e4c9a0..befa9a5d6e53f 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -982,6 +982,13 @@ enum scx_deq_flags {
 	 * it hasn't been dispatched yet. Dequeue from the BPF side.
 	 */
 	SCX_DEQ_CORE_SCHED_EXEC	= 1LLU << 32,
+
+	/*
+	 * The task is being dequeued due to a property change (e.g.,
+	 * sched_setaffinity(), sched_setscheduler(), set_user_nice(),
+	 * etc.).
+	 */
+	SCX_DEQ_SCHED_CHANGE	= 1LLU << 33,
 };
 
 enum scx_pick_idle_cpu_flags {
diff --git a/tools/sched_ext/include/scx/enum_defs.autogen.h b/tools/sched_ext/include/scx/enum_defs.autogen.h
index c2c33df9292c2..dcc945304760f 100644
--- a/tools/sched_ext/include/scx/enum_defs.autogen.h
+++ b/tools/sched_ext/include/scx/enum_defs.autogen.h
@@ -21,6 +21,7 @@
 #define HAVE_SCX_CPU_PREEMPT_UNKNOWN
 #define HAVE_SCX_DEQ_SLEEP
 #define HAVE_SCX_DEQ_CORE_SCHED_EXEC
+#define HAVE_SCX_DEQ_SCHED_CHANGE
 #define HAVE_SCX_DSQ_FLAG_BUILTIN
 #define HAVE_SCX_DSQ_FLAG_LOCAL_ON
 #define HAVE_SCX_DSQ_INVALID
diff --git a/tools/sched_ext/include/scx/enums.autogen.bpf.h b/tools/sched_ext/include/scx/enums.autogen.bpf.h
index 2f8002bcc19ad..5da50f9376844 100644
--- a/tools/sched_ext/include/scx/enums.autogen.bpf.h
+++ b/tools/sched_ext/include/scx/enums.autogen.bpf.h
@@ -127,3 +127,5 @@ const volatile u64 __SCX_ENQ_CLEAR_OPSS __weak;
 const volatile u64 __SCX_ENQ_DSQ_PRIQ __weak;
 #define SCX_ENQ_DSQ_PRIQ __SCX_ENQ_DSQ_PRIQ
 
+const volatile u64 __SCX_DEQ_SCHED_CHANGE __weak;
+#define SCX_DEQ_SCHED_CHANGE __SCX_DEQ_SCHED_CHANGE
diff --git a/tools/sched_ext/include/scx/enums.autogen.h b/tools/sched_ext/include/scx/enums.autogen.h
index fedec938584be..fc9a7a4d9dea5 100644
--- a/tools/sched_ext/include/scx/enums.autogen.h
+++ b/tools/sched_ext/include/scx/enums.autogen.h
@@ -46,4 +46,5 @@
 	SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_LAST); \
 	SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_CLEAR_OPSS); \
 	SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_DSQ_PRIQ); \
+	SCX_ENUM_SET(skel, scx_deq_flags, SCX_DEQ_SCHED_CHANGE); \
 } while (0)
-- 
2.53.0


^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-02-06 13:54 [PATCHSET v7] sched_ext: Fix ops.dequeue() semantics Andrea Righi
  2026-02-06 13:54 ` [PATCH 1/2] " Andrea Righi
@ 2026-02-06 13:54 ` Andrea Righi
  2026-02-06 20:10   ` Emil Tsalapatis
  1 sibling, 1 reply; 33+ messages in thread
From: Andrea Righi @ 2026-02-06 13:54 UTC (permalink / raw)
  To: Tejun Heo, David Vernet, Changwoo Min
  Cc: Kuba Piecuch, Emil Tsalapatis, Christian Loehle, Daniel Hodges,
	sched-ext, linux-kernel

Add a new kselftest to validate that the new ops.dequeue() semantics
work correctly for all task lifecycle scenarios, including the
distinction between terminal DSQs (where BPF scheduler is done with the
task), user DSQs (where BPF scheduler manages the task lifecycle) and
BPF data structures, regardless of which event performs the dispatch.

The test validates the following scenarios:

 - From ops.select_cpu():
     - scenario 0 (local DSQ): tasks dispatched to the local DSQ bypass
       the BPF scheduler entirely; they never enter BPF custody, so
       ops.dequeue() is not called,
     - scenario 1 (global DSQ): tasks dispatched to SCX_DSQ_GLOBAL also
       bypass the BPF scheduler, like the local DSQ; ops.dequeue() is
       not called,
     - scenario 2 (user DSQ): tasks enter BPF scheduler custody with full
       enqueue/dequeue lifecycle tracking and state machine validation
       (expects 1:1 enqueue/dequeue pairing).

   - From ops.enqueue():
     - scenario 3 (local DSQ): same behavior as scenario 0,
     - scenario 4 (global DSQ): same behavior as scenario 1,
     - scenario 5 (user DSQ): same behavior as scenario 2,
     - scenario 6 (BPF internal queue): tasks are stored in a BPF queue
       in ops.enqueue() and consumed in ops.dispatch(); they remain in
       BPF custody until dispatch, with full lifecycle tracking and 1:1
       enqueue/dequeue validation.

This verifies that:
 - terminal DSQ dispatch (local, global) don't trigger ops.dequeue(),
 - user DSQ / internal BPF data structure dispatch has exact 1:1
   ops.enqueue()/dequeue() pairing,
 - dispatch dequeues have no flags (normal workflow),
 - property change dequeues have the %SCX_DEQ_SCHED_CHANGE flag set,
 - no duplicate enqueues or invalid state transitions are happening,
 - ops.enqueue() and ops.select_cpu() dispatch paths behave identically.

Cc: Tejun Heo <tj@kernel.org>
Cc: Emil Tsalapatis <emil@etsalapatis.com>
Cc: Kuba Piecuch <jpiecuch@google.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
 tools/testing/selftests/sched_ext/Makefile    |   1 +
 .../testing/selftests/sched_ext/dequeue.bpf.c | 403 ++++++++++++++++++
 tools/testing/selftests/sched_ext/dequeue.c   | 258 +++++++++++
 3 files changed, 662 insertions(+)
 create mode 100644 tools/testing/selftests/sched_ext/dequeue.bpf.c
 create mode 100644 tools/testing/selftests/sched_ext/dequeue.c

diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
index 5fe45f9c5f8fd..764e91edabf93 100644
--- a/tools/testing/selftests/sched_ext/Makefile
+++ b/tools/testing/selftests/sched_ext/Makefile
@@ -161,6 +161,7 @@ all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubs
 
 auto-test-targets :=			\
 	create_dsq			\
+	dequeue				\
 	enq_last_no_enq_fails		\
 	ddsp_bogus_dsq_fail		\
 	ddsp_vtimelocal_fail		\
diff --git a/tools/testing/selftests/sched_ext/dequeue.bpf.c b/tools/testing/selftests/sched_ext/dequeue.bpf.c
new file mode 100644
index 0000000000000..4ba657ba1bff5
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/dequeue.bpf.c
@@ -0,0 +1,403 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A scheduler that validates ops.dequeue() is called correctly:
+ * - Tasks dispatched to terminal DSQs (local, global) bypass the BPF
+ *   scheduler entirely: no ops.dequeue() should be called
+ * - Tasks dispatched to user DSQs enter BPF custody: ops.dequeue() must be
+ *   called when they leave custody
+ * - Every ops.enqueue() for non-terminal DSQs is followed by exactly one
+ *   ops.dequeue() (validate 1:1 pairing and state machine)
+ *
+ * Copyright (c) 2026 NVIDIA Corporation.
+ */
+
+#include <scx/common.bpf.h>
+
+#define SHARED_DSQ	0
+
+/*
+ * Scenario 6: BPF internal queue. Tasks are stored here from ops.enqueue()
+ * and consumed from ops.dispatch(), validating that tasks not on a user DSQ
+ * (only on BPF internal structures) still get ops.dequeue() when they leave.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_QUEUE);
+	__uint(max_entries, 4096);
+	__type(value, s32);
+} global_queue SEC(".maps");
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+/*
+ * Counters to track the lifecycle of tasks:
+ * - enqueue_cnt: Number of times ops.enqueue() was called
+ * - dequeue_cnt: Number of times ops.dequeue() was called (any type)
+ * - dispatch_dequeue_cnt: Number of regular dispatch dequeues (no flag)
+ * - change_dequeue_cnt: Number of property change dequeues
+ */
+u64 enqueue_cnt, dequeue_cnt, dispatch_dequeue_cnt, change_dequeue_cnt;
+
+/*
+ * Test scenarios (0-2: ops.select_cpu(), 3-6: ops.enqueue()):
+ * 0) Dispatch to local DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
+ *    scheduler, no dequeue callbacks)
+ * 1) Dispatch to global DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
+ *    scheduler, no dequeue callbacks)
+ * 2) Dispatch to shared user DSQ from ops.select_cpu() (enters BPF scheduler,
+ *    dequeue callbacks expected)
+ * 3) Dispatch to local DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
+ *    scheduler, no dequeue callbacks)
+ * 4) Dispatch to global DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
+ *    scheduler, no dequeue callbacks)
+ * 5) Dispatch to shared user DSQ from ops.enqueue() (enters BPF scheduler,
+ *    dequeue callbacks expected)
+ * 6) BPF internal queue: store task PIDs in ops.enqueue(), consume in
+ *    ops.dispatch() and dispatch to local DSQ (validates dequeue for tasks
+ *    in BPF custody but not on a user DSQ)
+ */
+u32 test_scenario;
+
+/*
+ * Per-task state to track lifecycle and validate workflow semantics.
+ * State transitions:
+ *   NONE -> ENQUEUED (on enqueue)
+ *   ENQUEUED -> DISPATCHED (on dispatch dequeue)
+ *   DISPATCHED -> NONE (on property change dequeue or re-enqueue)
+ *   ENQUEUED -> NONE (on property change dequeue before dispatch)
+ */
+enum task_state {
+	TASK_NONE = 0,
+	TASK_ENQUEUED,
+	TASK_DISPATCHED,
+};
+
+struct task_ctx {
+	enum task_state state; /* Current state in the workflow */
+	u64 enqueue_seq;       /* Sequence number for debugging */
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct task_ctx);
+} task_ctx_stor SEC(".maps");
+
+static struct task_ctx *try_lookup_task_ctx(struct task_struct *p)
+{
+	return bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+}
+
+s32 BPF_STRUCT_OPS(dequeue_select_cpu, struct task_struct *p,
+		   s32 prev_cpu, u64 wake_flags)
+{
+	struct task_ctx *tctx;
+
+	tctx = try_lookup_task_ctx(p);
+	if (!tctx)
+		return prev_cpu;
+
+	switch (test_scenario) {
+	case 0:
+		/*
+		 * Scenario 0: Direct dispatch to local DSQ from select_cpu.
+		 *
+		 * Task bypasses BPF scheduler entirely: no enqueue
+		 * tracking, no dequeue callbacks. Behavior should be
+		 * identical to scenario 3.
+		 */
+		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
+		return prev_cpu;
+
+	case 1:
+		/*
+		 * Scenario 1: Direct dispatch to global DSQ from select_cpu.
+		 *
+		 * Like scenario 0, task bypasses BPF scheduler entirely.
+		 * Behavior should be identical to scenario 4.
+		 */
+		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
+		return prev_cpu;
+
+	case 2:
+		/*
+		 * Scenario 2: Dispatch to shared user DSQ from select_cpu.
+		 *
+		 * Task enters BPF scheduler management: track
+		 * enqueue/dequeue lifecycle and validate state transitions.
+		 * Behavior should be identical to scenario 5.
+		 */
+		__sync_fetch_and_add(&enqueue_cnt, 1);
+
+		/*
+		 * Validate state transition: enqueue is only valid from
+		 * NONE or DISPATCHED states. Getting enqueue while in
+		 * ENQUEUED state indicates a missing dequeue.
+		 */
+		if (tctx->state == TASK_ENQUEUED)
+			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
+				      p->pid, p->comm, tctx->enqueue_seq);
+
+		/* Transition to ENQUEUED state */
+		tctx->state = TASK_ENQUEUED;
+		tctx->enqueue_seq++;
+
+		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, 0);
+		return prev_cpu;
+
+	default:
+		/*
+		 * Force all tasks through ops.enqueue().
+		 */
+		return prev_cpu;
+	}
+}
+
+void BPF_STRUCT_OPS(dequeue_enqueue, struct task_struct *p, u64 enq_flags)
+{
+	struct task_ctx *tctx;
+
+	tctx = try_lookup_task_ctx(p);
+	if (!tctx)
+		return;
+
+	switch (test_scenario) {
+	case 3:
+		/*
+		 * Scenario 3: Direct dispatch to the local DSQ.
+		 *
+		 * Task bypasses BPF scheduler entirely: no enqueue
+		 * tracking, no dequeue callbacks. Don't increment counters
+		 * or validate state since the task never enters BPF
+		 * scheduler management.
+		 */
+		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
+		break;
+
+	case 4:
+		/*
+		 * Scenario 4: Direct dispatch to the global DSQ.
+		 *
+		 * Like scenario 3, task bypasses BPF scheduler entirely.
+		 * SCX_DSQ_GLOBAL is a terminal DSQ, tasks dispatched to it
+		 * leave BPF custody immediately, so no dequeue callbacks
+		 * should be triggered.
+		 */
+		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+		break;
+
+	case 5:
+		/*
+		 * Scenario 5: Dispatch to shared user DSQ.
+		 *
+		 * Task enters BPF scheduler management: track
+		 * enqueue/dequeue lifecycle and validate state
+		 * transitions.
+		 */
+		__sync_fetch_and_add(&enqueue_cnt, 1);
+
+		/*
+		 * Validate state transition: enqueue is only valid from
+		 * NONE or DISPATCHED states. Getting enqueue while in
+		 * ENQUEUED state indicates a missing dequeue (or stale state
+		 * from a previous scenario when the scheduler was unregistered
+		 * with tasks still on a DSQ). Reset and proceed to avoid false
+		 * positives across scenario switches.
+		 */
+		if (tctx->state == TASK_ENQUEUED)
+			tctx->state = TASK_NONE;
+
+		/* Transition to ENQUEUED state */
+		tctx->state = TASK_ENQUEUED;
+		tctx->enqueue_seq++;
+
+		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
+		break;
+
+	case 6:
+		/*
+		 * Scenario 6: Store task in BPF internal queue. Task enters
+		 * BPF custody (kernel sets SCX_TASK_NEED_DEQ). When
+		 * ops.dispatch() later pops and inserts to local DSQ,
+		 * ops.dequeue() must be called.
+		 *
+		 * If the queue is full, fallback to local DSQ. The task still
+		 * goes through QUEUED in the kernel and gets ops.dequeue()
+		 * when moved to the terminal DSQ, so we track it the same.
+		 *
+		 * If state is already ENQUEUED (e.g. task was on a DSQ when
+		 * the scheduler was unregistered in a previous scenario),
+		 * reset to NONE and proceed to avoid false positives.
+		 */
+		{
+			s32 pid = p->pid;
+
+			if (tctx->state == TASK_ENQUEUED)
+				tctx->state = TASK_NONE;
+
+			tctx->state = TASK_ENQUEUED;
+			tctx->enqueue_seq++;
+
+			/* Queue full: fallback to the global DSQ */
+			if (bpf_map_push_elem(&global_queue, &pid, 0))
+				scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+			else
+				__sync_fetch_and_add(&enqueue_cnt, 1);
+
+			scx_bpf_kick_cpu(scx_bpf_task_cpu(p), SCX_KICK_IDLE);
+		}
+		break;
+
+	default:
+		/* For scenarios 0-2 dispatch to the global DSQ */
+		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+	}
+}
+
+void BPF_STRUCT_OPS(dequeue_dequeue, struct task_struct *p, u64 deq_flags)
+{
+	struct task_ctx *tctx;
+
+	__sync_fetch_and_add(&dequeue_cnt, 1);
+
+	tctx = try_lookup_task_ctx(p);
+	if (!tctx)
+		return;
+
+	/*
+	 * For scenarios 0, 1, 3, and 4 (terminal DSQs: local and global),
+	 * ops.dequeue() should never be called because tasks bypass the
+	 * BPF scheduler entirely. If we get here, it's a kernel bug.
+	 */
+	if (test_scenario == 0 || test_scenario == 3) {
+		scx_bpf_error("%d (%s): dequeue called for local DSQ scenario - kernel bug!",
+			      p->pid, p->comm);
+		return;
+	}
+	if (test_scenario == 1 || test_scenario == 4) {
+		scx_bpf_error("%d (%s): dequeue called for global DSQ scenario - kernel bug!",
+			      p->pid, p->comm);
+		return;
+	}
+
+	/*
+	 * Validate state: dequeue should only happen from ENQUEUED or
+	 * DISPATCHED states. Getting dequeue from NONE indicates a bug.
+	 */
+	if (tctx->state == TASK_NONE) {
+		scx_bpf_error("%d (%s): dequeue from NONE state seq=%llu",
+			      p->pid, p->comm, tctx->enqueue_seq);
+		return;
+	}
+
+	if (deq_flags & SCX_DEQ_SCHED_CHANGE) {
+		/*
+		 * Property change interrupting the workflow. Valid from
+		 * both ENQUEUED and DISPATCHED states. Transitions task
+		 * back to NONE state.
+		 */
+		__sync_fetch_and_add(&change_dequeue_cnt, 1);
+
+		/* Validate state transition */
+		if (tctx->state != TASK_ENQUEUED && tctx->state != TASK_DISPATCHED)
+			scx_bpf_error("%d (%s): invalid property change dequeue state=%d seq=%llu",
+				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);
+
+		/* Transition back to NONE: task outside scheduler control */
+		tctx->state = TASK_NONE;
+	} else {
+		/*
+		 * Regular dispatch dequeue: normal workflow step. Valid
+		 * only from ENQUEUED state (after enqueue, before dispatch
+		 * dequeue). Transitions to DISPATCHED state.
+		 */
+		__sync_fetch_and_add(&dispatch_dequeue_cnt, 1);
+
+		/*
+		 * Dispatch dequeue should not have %SCX_DEQ_SCHED_CHANGE
+		 * flag.
+		 */
+		if (deq_flags & SCX_DEQ_SCHED_CHANGE)
+			scx_bpf_error("%d (%s): SCX_DEQ_SCHED_CHANGE in dispatch dequeue seq=%llu",
+				      p->pid, p->comm, tctx->enqueue_seq);
+
+		/*
+		 * Must be in ENQUEUED state.
+		 */
+		if (tctx->state != TASK_ENQUEUED)
+			scx_bpf_error("%d (%s): dispatch dequeue from state %d seq=%llu",
+				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);
+
+		/*
+		 * Transition to DISPATCHED: normal cycle completed
+		 * dispatch.
+		 */
+		tctx->state = TASK_DISPATCHED;
+	}
+}
+
+void BPF_STRUCT_OPS(dequeue_dispatch, s32 cpu, struct task_struct *prev)
+{
+	if (test_scenario == 6) {
+		s32 pid;
+		struct task_struct *p;
+
+		if (bpf_map_pop_elem(&global_queue, &pid))
+			return;
+
+		p = bpf_task_from_pid(pid);
+		if (!p)
+			return;
+
+		if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
+			cpu = scx_bpf_task_cpu(p);
+
+		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0);
+		bpf_task_release(p);
+	} else {
+		scx_bpf_dsq_move_to_local(SHARED_DSQ);
+	}
+}
+
+s32 BPF_STRUCT_OPS(dequeue_init_task, struct task_struct *p,
+		   struct scx_init_task_args *args)
+{
+	struct task_ctx *tctx;
+
+	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0,
+				   BPF_LOCAL_STORAGE_GET_F_CREATE);
+	if (!tctx)
+		return -ENOMEM;
+
+	return 0;
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(dequeue_init)
+{
+	s32 ret;
+
+	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+void BPF_STRUCT_OPS(dequeue_exit, struct scx_exit_info *ei)
+{
+	UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops dequeue_ops = {
+	.select_cpu		= (void *)dequeue_select_cpu,
+	.enqueue		= (void *)dequeue_enqueue,
+	.dequeue		= (void *)dequeue_dequeue,
+	.dispatch		= (void *)dequeue_dispatch,
+	.init_task		= (void *)dequeue_init_task,
+	.init			= (void *)dequeue_init,
+	.exit			= (void *)dequeue_exit,
+	.name			= "dequeue_test",
+};
diff --git a/tools/testing/selftests/sched_ext/dequeue.c b/tools/testing/selftests/sched_ext/dequeue.c
new file mode 100644
index 0000000000000..b0add2a516ab8
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/dequeue.c
@@ -0,0 +1,258 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2025 NVIDIA Corporation.
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <unistd.h>
+#include <signal.h>
+#include <time.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <sched.h>
+#include <pthread.h>
+#include "scx_test.h"
+#include "dequeue.bpf.skel.h"
+
+#define NUM_WORKERS 8
+#define AFFINITY_HAMMER_MS 50   /* How long affinity hammer runs (scenario 6) */
+
+/*
+ * Worker function that creates enqueue/dequeue events via CPU work and
+ * sleeping. Property-change dequeues are triggered by the affinity hammer
+ * thread (external sched_setaffinity on worker PIDs).
+ */
+static void worker_fn(int id)
+{
+	int i;
+	volatile int sum = 0;
+
+	for (i = 0; i < 1000; i++) {
+		int j;
+
+		/* Do some work to trigger scheduling events */
+		for (j = 0; j < 10000; j++)
+			sum += j;
+
+		/* Sleep to trigger dequeue */
+		usleep(1000 + (id * 100));
+	}
+
+	exit(0);
+}
+
+/*
+ * For scenario 6, tasks sit in the BPF queue until dispatch consumes them.
+ * Property-change dequeues only happen when a task gets a property change
+ * while still in the queue (SCX_OPSS_QUEUED), not after it has been
+ * dispatched. This thread changes workers' affinity from outside so that
+ * some changes hit tasks while they are still in the queue.
+ */
+static void *affinity_hammer_fn(void *arg)
+{
+	pid_t *pids = arg;
+	cpu_set_t cpuset;
+	int i, n = NUM_WORKERS;
+	struct timespec ts = { .tv_sec = 0, .tv_nsec = 1000000 }; /* 1ms */
+
+	for (i = 0; i < (AFFINITY_HAMMER_MS * 1000 / 100); i++) {
+		int w = i % n;
+		int cpu = (i / n) % 4;
+
+		CPU_ZERO(&cpuset);
+		CPU_SET(cpu, &cpuset);
+		sched_setaffinity(pids[w], sizeof(cpuset), &cpuset);
+		nanosleep(&ts, NULL);
+	}
+	return NULL;
+}
+
+static enum scx_test_status run_scenario(struct dequeue *skel, u32 scenario,
+					 const char *scenario_name)
+{
+	struct bpf_link *link;
+	pid_t pids[NUM_WORKERS];
+	pthread_t hammer;
+
+	int i, status;
+	u64 enq_start, deq_start, dispatch_deq_start, change_deq_start;
+	u64 enq_delta, deq_delta, dispatch_deq_delta, change_deq_delta;
+
+	/* Set the test scenario */
+	skel->bss->test_scenario = scenario;
+
+	/* Record starting counts */
+	enq_start = skel->bss->enqueue_cnt;
+	deq_start = skel->bss->dequeue_cnt;
+	dispatch_deq_start = skel->bss->dispatch_dequeue_cnt;
+	change_deq_start = skel->bss->change_dequeue_cnt;
+
+	link = bpf_map__attach_struct_ops(skel->maps.dequeue_ops);
+	SCX_FAIL_IF(!link, "Failed to attach struct_ops for scenario %s", scenario_name);
+
+	/* Fork worker processes to generate enqueue/dequeue events */
+	for (i = 0; i < NUM_WORKERS; i++) {
+		pids[i] = fork();
+		SCX_FAIL_IF(pids[i] < 0, "Failed to fork worker %d", i);
+
+		if (pids[i] == 0) {
+			worker_fn(i);
+			/* Should not reach here */
+			exit(1);
+		}
+	}
+
+	/*
+	 * Run an "affinity hammer" so that some property changes hit tasks
+	 * while they are still in BPF custody (e.g. in user DSQ or BPF queue),
+	 * triggering SCX_DEQ_SCHED_CHANGE dequeues in scenarios 2, 5 and 6.
+	 */
+	SCX_FAIL_IF(pthread_create(&hammer, NULL, affinity_hammer_fn, pids) != 0,
+		    "Failed to create affinity hammer thread");
+
+	/* Wait for all workers to complete */
+	for (i = 0; i < NUM_WORKERS; i++) {
+		SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i],
+			    "Failed to wait for worker %d", i);
+		SCX_FAIL_IF(status != 0, "Worker %d exited with status %d", i, status);
+	}
+
+	pthread_join(hammer, NULL);
+
+	bpf_link__destroy(link);
+
+	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG));
+
+	/* Calculate deltas */
+	enq_delta = skel->bss->enqueue_cnt - enq_start;
+	deq_delta = skel->bss->dequeue_cnt - deq_start;
+	dispatch_deq_delta = skel->bss->dispatch_dequeue_cnt - dispatch_deq_start;
+	change_deq_delta = skel->bss->change_dequeue_cnt - change_deq_start;
+
+	printf("%s:\n", scenario_name);
+	printf("  enqueues: %lu\n", (unsigned long)enq_delta);
+	printf("  dequeues: %lu (dispatch: %lu, property_change: %lu)\n",
+	       (unsigned long)deq_delta,
+	       (unsigned long)dispatch_deq_delta,
+	       (unsigned long)change_deq_delta);
+
+	/*
+	 * Validate enqueue/dequeue lifecycle tracking.
+	 *
+	 * For scenarios 0, 1, 3, 4 (local and global DSQs from
+	 * ops.select_cpu() and ops.enqueue()), both enqueues and dequeues
+	 * should be 0 because tasks bypass the BPF scheduler entirely: tasks
+	 * never enter BPF scheduler's custody.
+	 *
+	 * For scenarios 2, 5, and 6 (user DSQ or BPF internal queue from
+	 * ops.select_cpu() / ops.enqueue()), we expect both enqueues and
+	 * dequeues.
+	 *
+	 * The BPF code does strict state machine validation with
+	 * scx_bpf_error() to ensure the workflow semantics are correct. If
+	 * we reach this point without errors, the semantics are validated
+	 * correctly.
+	 */
+	if (scenario == 0 || scenario == 1 || scenario == 3 || scenario == 4) {
+		/* Terminal DSQs: tasks bypass BPF scheduler completely */
+		SCX_EQ(enq_delta, 0);
+		SCX_EQ(deq_delta, 0);
+		SCX_EQ(dispatch_deq_delta, 0);
+		SCX_EQ(change_deq_delta, 0);
+	} else {
+		/* User DSQ: tasks enter BPF scheduler's custody */
+		SCX_GT(enq_delta, 0);
+		SCX_GT(deq_delta, 0);
+		/* Validate 1:1 enqueue/dequeue pairing */
+		SCX_EQ(enq_delta, deq_delta);
+	}
+
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status setup(void **ctx)
+{
+	struct dequeue *skel;
+
+	skel = dequeue__open();
+	SCX_FAIL_IF(!skel, "Failed to open skel");
+	SCX_ENUM_INIT(skel);
+	SCX_FAIL_IF(dequeue__load(skel), "Failed to load skel");
+
+	*ctx = skel;
+
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+	struct dequeue *skel = ctx;
+	enum scx_test_status status;
+
+	/* Scenarios 0-2: ops.select_cpu(), 3-6: ops.enqueue() */
+	status = run_scenario(skel, 0, "Scenario 0: Local DSQ from ops.select_cpu()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 1, "Scenario 1: Global DSQ from ops.select_cpu()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 2, "Scenario 2: User DSQ from ops.select_cpu()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 3, "Scenario 3: Local DSQ from ops.enqueue()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 4, "Scenario 4: Global DSQ from ops.enqueue()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 5, "Scenario 5: User DSQ from ops.enqueue()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 6,
+			     "Scenario 6: BPF internal queue from ops.enqueue(), consumed in ops.dispatch()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	printf("\n=== Summary ===\n");
+	printf("Total enqueues: %lu\n", (unsigned long)skel->bss->enqueue_cnt);
+	printf("Total dequeues: %lu\n", (unsigned long)skel->bss->dequeue_cnt);
+	printf("  Dispatch dequeues: %lu (no flag, normal workflow)\n",
+	       (unsigned long)skel->bss->dispatch_dequeue_cnt);
+	printf("  Property change dequeues: %lu (SCX_DEQ_SCHED_CHANGE flag)\n",
+	       (unsigned long)skel->bss->change_dequeue_cnt);
+	printf("\nAll scenarios passed - no state machine violations detected\n");
+	printf("-> Validated: Local DSQ dispatch bypasses BPF scheduler (both paths)\n");
+	printf("-> Validated: Global DSQ dispatch bypasses BPF scheduler (both paths)\n");
+	printf("-> Validated: User DSQ dispatch triggers dequeue callbacks (both paths)\n");
+	printf("-> Validated: BPF internal queue triggers dequeue when leaving custody\n");
+	printf("-> Validated: ops.enqueue() and ops.select_cpu() behave identically\n");
+	printf("-> Validated: Dispatch dequeues have no flags (normal workflow)\n");
+	printf("-> Validated: Property change dequeues have SCX_DEQ_SCHED_CHANGE flag\n");
+	printf("-> Validated: No duplicate enqueues or invalid state transitions\n");
+
+	return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+	struct dequeue *skel = ctx;
+
+	dequeue__destroy(skel);
+}
+
+struct scx_test dequeue_test = {
+	.name = "dequeue",
+	.description = "Verify ops.dequeue() semantics",
+	.setup = setup,
+	.run = run,
+	.cleanup = cleanup,
+};
+
+REGISTER_SCX_TEST(&dequeue_test)
-- 
2.53.0


^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-02-06 13:54 ` [PATCH 2/2] selftests/sched_ext: Add test to validate " Andrea Righi
@ 2026-02-06 20:10   ` Emil Tsalapatis
  2026-02-07  9:16     ` Andrea Righi
  0 siblings, 1 reply; 33+ messages in thread
From: Emil Tsalapatis @ 2026-02-06 20:10 UTC (permalink / raw)
  To: Andrea Righi, Tejun Heo, David Vernet, Changwoo Min
  Cc: Kuba Piecuch, Christian Loehle, Daniel Hodges, sched-ext,
	linux-kernel

On Fri Feb 6, 2026 at 8:54 AM EST, Andrea Righi wrote:

Hi Andrea,

> Add a new kselftest to validate that the new ops.dequeue() semantics
> work correctly for all task lifecycle scenarios, including the
> distinction between terminal DSQs (where BPF scheduler is done with the
> task), user DSQs (where BPF scheduler manages the task lifecycle) and
> BPF data structures, regardless of which event performs the dispatch.
>
> The test validates the following scenarios:
>
>  - From ops.select_cpu():
>      - scenario 0 (local DSQ): tasks dispatched to the local DSQ bypass
>        the BPF scheduler entirely; they never enter BPF custody, so
>        ops.dequeue() is not called,
>      - scenario 1 (global DSQ): tasks dispatched to SCX_DSQ_GLOBAL also
>        bypass the BPF scheduler, like the local DSQ; ops.dequeue() is
>        not called,
>      - scenario 2 (user DSQ): tasks enter BPF scheduler custody with full
>        enqueue/dequeue lifecycle tracking and state machine validation
>        (expects 1:1 enqueue/dequeue pairing).

Could you add a note here about why there's no equivalent to scenario 6?
The differentiating factor between that and scenario 2 (nonterminal queue) is 
that scx_dsq_insert_commit() is called regardless of whether the queue is terminal.
And this makes sense since for non-DSQ queues the BPF scheduler can do its
own tracking of enqueue/dequeue (plus it does not make too much sense to
do BPF-internal enqueueing in select_cpu).

What do you think? If the above makes sense, maybe we should spell it out 
in the documentation too. Maybe also add it makes no sense to enqueue
in an internal BPF structure from select_cpu - the task is not yet
enqueued, and would have to go through enqueue anyway.

>
>    - From ops.enqueue():
>      - scenario 3 (local DSQ): same behavior as scenario 0,
>      - scenario 4 (global DSQ): same behavior as scenario 1,
>      - scenario 5 (user DSQ): same behavior as scenario 2,
>      - scenario 6 (BPF internal queue): tasks are stored in a BPF queue
>        in ops.enqueue() and consumed in ops.dispatch(); they remain in
>        BPF custody until dispatch, with full lifecycle tracking and 1:1
>        enqueue/dequeue validation.
>
> This verifies that:
>  - terminal DSQ dispatch (local, global) don't trigger ops.dequeue(),
>  - user DSQ / internal BPF data structure dispatch has exact 1:1
>    ops.enqueue()/dequeue() pairing,
>  - dispatch dequeues have no flags (normal workflow),
>  - property change dequeues have the %SCX_DEQ_SCHED_CHANGE flag set,
>  - no duplicate enqueues or invalid state transitions are happening,
>  - ops.enqueue() and ops.select_cpu() dispatch paths behave identically.
>
> Cc: Tejun Heo <tj@kernel.org>
> Cc: Emil Tsalapatis <emil@etsalapatis.com>
> Cc: Kuba Piecuch <jpiecuch@google.com>
> Signed-off-by: Andrea Righi <arighi@nvidia.com>
> ---
>  tools/testing/selftests/sched_ext/Makefile    |   1 +
>  .../testing/selftests/sched_ext/dequeue.bpf.c | 403 ++++++++++++++++++
>  tools/testing/selftests/sched_ext/dequeue.c   | 258 +++++++++++
>  3 files changed, 662 insertions(+)
>  create mode 100644 tools/testing/selftests/sched_ext/dequeue.bpf.c
>  create mode 100644 tools/testing/selftests/sched_ext/dequeue.c
>
> diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
> index 5fe45f9c5f8fd..764e91edabf93 100644
> --- a/tools/testing/selftests/sched_ext/Makefile
> +++ b/tools/testing/selftests/sched_ext/Makefile
> @@ -161,6 +161,7 @@ all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubs
>  
>  auto-test-targets :=			\
>  	create_dsq			\
> +	dequeue				\
>  	enq_last_no_enq_fails		\
>  	ddsp_bogus_dsq_fail		\
>  	ddsp_vtimelocal_fail		\
> diff --git a/tools/testing/selftests/sched_ext/dequeue.bpf.c b/tools/testing/selftests/sched_ext/dequeue.bpf.c
> new file mode 100644
> index 0000000000000..4ba657ba1bff5
> --- /dev/null
> +++ b/tools/testing/selftests/sched_ext/dequeue.bpf.c
> @@ -0,0 +1,403 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * A scheduler that validates ops.dequeue() is called correctly:
> + * - Tasks dispatched to terminal DSQs (local, global) bypass the BPF
> + *   scheduler entirely: no ops.dequeue() should be called
> + * - Tasks dispatched to user DSQs enter BPF custody: ops.dequeue() must be
> + *   called when they leave custody
> + * - Every ops.enqueue() for non-terminal DSQs is followed by exactly one
> + *   ops.dequeue() (validate 1:1 pairing and state machine)
> + *
> + * Copyright (c) 2026 NVIDIA Corporation.
> + */
> +
> +#include <scx/common.bpf.h>
> +
> +#define SHARED_DSQ	0
> +
> +/*
> + * Scenario 6: BPF internal queue. Tasks are stored here from ops.enqueue()
> + * and consumed from ops.dispatch(), validating that tasks not on a user DSQ
> + * (only on BPF internal structures) still get ops.dequeue() when they leave.
> + */
> +struct {
> +	__uint(type, BPF_MAP_TYPE_QUEUE);
> +	__uint(max_entries, 4096);

Nit: Can we make this larger? I don't think there's any downsides. I know
there's a mitigation for if the queue gets full, please see nit below.

> +	__type(value, s32);
> +} global_queue SEC(".maps");
> +
> +char _license[] SEC("license") = "GPL";
> +
> +UEI_DEFINE(uei);
> +
> +/*
> + * Counters to track the lifecycle of tasks:
> + * - enqueue_cnt: Number of times ops.enqueue() was called
> + * - dequeue_cnt: Number of times ops.dequeue() was called (any type)
> + * - dispatch_dequeue_cnt: Number of regular dispatch dequeues (no flag)
> + * - change_dequeue_cnt: Number of property change dequeues
> + */
> +u64 enqueue_cnt, dequeue_cnt, dispatch_dequeue_cnt, change_dequeue_cnt;
> +
> +/*
> + * Test scenarios (0-2: ops.select_cpu(), 3-6: ops.enqueue()):
> + * 0) Dispatch to local DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
> + *    scheduler, no dequeue callbacks)
> + * 1) Dispatch to global DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
> + *    scheduler, no dequeue callbacks)
> + * 2) Dispatch to shared user DSQ from ops.select_cpu() (enters BPF scheduler,
> + *    dequeue callbacks expected)
> + * 3) Dispatch to local DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
> + *    scheduler, no dequeue callbacks)
> + * 4) Dispatch to global DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
> + *    scheduler, no dequeue callbacks)
> + * 5) Dispatch to shared user DSQ from ops.enqueue() (enters BPF scheduler,
> + *    dequeue callbacks expected)
> + * 6) BPF internal queue: store task PIDs in ops.enqueue(), consume in
> + *    ops.dispatch() and dispatch to local DSQ (validates dequeue for tasks
> + *    in BPF custody but not on a user DSQ)
> + */
> +u32 test_scenario;
> +
> +/*
> + * Per-task state to track lifecycle and validate workflow semantics.
> + * State transitions:
> + *   NONE -> ENQUEUED (on enqueue)
> + *   ENQUEUED -> DISPATCHED (on dispatch dequeue)
> + *   DISPATCHED -> NONE (on property change dequeue or re-enqueue)
> + *   ENQUEUED -> NONE (on property change dequeue before dispatch)
> + */
> +enum task_state {
> +	TASK_NONE = 0,
> +	TASK_ENQUEUED,
> +	TASK_DISPATCHED,
> +};
> +
> +struct task_ctx {
> +	enum task_state state; /* Current state in the workflow */
> +	u64 enqueue_seq;       /* Sequence number for debugging */
> +};
> +
> +struct {
> +	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
> +	__uint(map_flags, BPF_F_NO_PREALLOC);
> +	__type(key, int);
> +	__type(value, struct task_ctx);
> +} task_ctx_stor SEC(".maps");
> +
> +static struct task_ctx *try_lookup_task_ctx(struct task_struct *p)
> +{
> +	return bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
> +}
> +
> +s32 BPF_STRUCT_OPS(dequeue_select_cpu, struct task_struct *p,
> +		   s32 prev_cpu, u64 wake_flags)
> +{
> +	struct task_ctx *tctx;
> +
> +	tctx = try_lookup_task_ctx(p);
> +	if (!tctx)
> +		return prev_cpu;
> +
> +	switch (test_scenario) {
> +	case 0:
> +		/*
> +		 * Scenario 0: Direct dispatch to local DSQ from select_cpu.
> +		 *
> +		 * Task bypasses BPF scheduler entirely: no enqueue
> +		 * tracking, no dequeue callbacks. Behavior should be
> +		 * identical to scenario 3.
> +		 */
> +		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
> +		return prev_cpu;
> +
> +	case 1:
> +		/*
> +		 * Scenario 1: Direct dispatch to global DSQ from select_cpu.
> +		 *
> +		 * Like scenario 0, task bypasses BPF scheduler entirely.
> +		 * Behavior should be identical to scenario 4.
> +		 */
> +		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
> +		return prev_cpu;
> +
> +	case 2:
> +		/*
> +		 * Scenario 2: Dispatch to shared user DSQ from select_cpu.
> +		 *
> +		 * Task enters BPF scheduler management: track
> +		 * enqueue/dequeue lifecycle and validate state transitions.
> +		 * Behavior should be identical to scenario 5.
> +		 */
> +		__sync_fetch_and_add(&enqueue_cnt, 1);
> +
> +		/*
> +		 * Validate state transition: enqueue is only valid from
> +		 * NONE or DISPATCHED states. Getting enqueue while in
> +		 * ENQUEUED state indicates a missing dequeue.
> +		 */
> +		if (tctx->state == TASK_ENQUEUED)
> +			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
> +				      p->pid, p->comm, tctx->enqueue_seq);
> +
> +		/* Transition to ENQUEUED state */
> +		tctx->state = TASK_ENQUEUED;
> +		tctx->enqueue_seq++;
> +
> +		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, 0);
> +		return prev_cpu;
> +
> +	default:
> +		/*
> +		 * Force all tasks through ops.enqueue().
> +		 */
> +		return prev_cpu;
> +	}
> +}
> +
> +void BPF_STRUCT_OPS(dequeue_enqueue, struct task_struct *p, u64 enq_flags)
> +{
> +	struct task_ctx *tctx;
> +
> +	tctx = try_lookup_task_ctx(p);
> +	if (!tctx)
> +		return;
> +
> +	switch (test_scenario) {
> +	case 3:
> +		/*
> +		 * Scenario 3: Direct dispatch to the local DSQ.
> +		 *
> +		 * Task bypasses BPF scheduler entirely: no enqueue
> +		 * tracking, no dequeue callbacks. Don't increment counters
> +		 * or validate state since the task never enters BPF
> +		 * scheduler management.
> +		 */
> +		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
> +		break;
> +
> +	case 4:
> +		/*
> +		 * Scenario 4: Direct dispatch to the global DSQ.
> +		 *
> +		 * Like scenario 3, task bypasses BPF scheduler entirely.
> +		 * SCX_DSQ_GLOBAL is a terminal DSQ, tasks dispatched to it
> +		 * leave BPF custody immediately, so no dequeue callbacks
> +		 * should be triggered.
> +		 */
> +		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
> +		break;
> +
> +	case 5:
> +		/*
> +		 * Scenario 5: Dispatch to shared user DSQ.
> +		 *
> +		 * Task enters BPF scheduler management: track
> +		 * enqueue/dequeue lifecycle and validate state
> +		 * transitions.
> +		 */
> +		__sync_fetch_and_add(&enqueue_cnt, 1);
> +
> +		/*
> +		 * Validate state transition: enqueue is only valid from
> +		 * NONE or DISPATCHED states. Getting enqueue while in
> +		 * ENQUEUED state indicates a missing dequeue (or stale state
> +		 * from a previous scenario when the scheduler was unregistered
> +		 * with tasks still on a DSQ). Reset and proceed to avoid false
> +		 * positives across scenario switches.
> +		 */
> +		if (tctx->state == TASK_ENQUEUED)
> +			tctx->state = TASK_NONE;
> +
> +		/* Transition to ENQUEUED state */
> +		tctx->state = TASK_ENQUEUED;
> +		tctx->enqueue_seq++;
> +
> +		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
> +		break;
> +
> +	case 6:
> +		/*
> +		 * Scenario 6: Store task in BPF internal queue. Task enters
> +		 * BPF custody (kernel sets SCX_TASK_NEED_DEQ). When
> +		 * ops.dispatch() later pops and inserts to local DSQ,
> +		 * ops.dequeue() must be called.
> +		 *
> +		 * If the queue is full, fallback to local DSQ. The task still
> +		 * goes through QUEUED in the kernel and gets ops.dequeue()
> +		 * when moved to the terminal DSQ, so we track it the same.
> +		 *
> +		 * If state is already ENQUEUED (e.g. task was on a DSQ when
> +		 * the scheduler was unregistered in a previous scenario),
> +		 * reset to NONE and proceed to avoid false positives.
> +		 */
> +		{
> +			s32 pid = p->pid;
> +
> +			if (tctx->state == TASK_ENQUEUED)
> +				tctx->state = TASK_NONE;
> +
> +			tctx->state = TASK_ENQUEUED;
> +			tctx->enqueue_seq++;
> +
> +			/* Queue full: fallback to the global DSQ */
Nit: Can we remove this fallback? This silently changes the behavior of
the test, and even though it makes sense to avoid overflowing the queue,
it causes the test to succeed even if for some reason the
bpf_map_push_elem fails. Why not just bump the queue number to a
reasonably large number amount instead?

> +			if (bpf_map_push_elem(&global_queue, &pid, 0))
> +				scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
> +			else
> +				__sync_fetch_and_add(&enqueue_cnt, 1);
> +
> +			scx_bpf_kick_cpu(scx_bpf_task_cpu(p), SCX_KICK_IDLE);
> +		}
> +		break;
> +
> +	default:
> +		/* For scenarios 0-2 dispatch to the global DSQ */
> +		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
> +	}
> +}
> +
> +void BPF_STRUCT_OPS(dequeue_dequeue, struct task_struct *p, u64 deq_flags)
> +{
> +	struct task_ctx *tctx;
> +
> +	__sync_fetch_and_add(&dequeue_cnt, 1);
> +
> +	tctx = try_lookup_task_ctx(p);
> +	if (!tctx)
> +		return;
> +
> +	/*
> +	 * For scenarios 0, 1, 3, and 4 (terminal DSQs: local and global),
> +	 * ops.dequeue() should never be called because tasks bypass the
> +	 * BPF scheduler entirely. If we get here, it's a kernel bug.
> +	 */
> +	if (test_scenario == 0 || test_scenario == 3) {
> +		scx_bpf_error("%d (%s): dequeue called for local DSQ scenario - kernel bug!",
> +			      p->pid, p->comm);
> +		return;
> +	}
> +	if (test_scenario == 1 || test_scenario == 4) {
> +		scx_bpf_error("%d (%s): dequeue called for global DSQ scenario - kernel bug!",
> +			      p->pid, p->comm);
> +		return;
> +	}
> +
> +	/*
> +	 * Validate state: dequeue should only happen from ENQUEUED or
> +	 * DISPATCHED states. Getting dequeue from NONE indicates a bug.
> +	 */
> +	if (tctx->state == TASK_NONE) {
> +		scx_bpf_error("%d (%s): dequeue from NONE state seq=%llu",
> +			      p->pid, p->comm, tctx->enqueue_seq);
> +		return;
> +	}
> +
> +	if (deq_flags & SCX_DEQ_SCHED_CHANGE) {
> +		/*
> +		 * Property change interrupting the workflow. Valid from
> +		 * both ENQUEUED and DISPATCHED states. Transitions task
> +		 * back to NONE state.
> +		 */
> +		__sync_fetch_and_add(&change_dequeue_cnt, 1);
> +
> +		/* Validate state transition */
> +		if (tctx->state != TASK_ENQUEUED && tctx->state != TASK_DISPATCHED)
> +			scx_bpf_error("%d (%s): invalid property change dequeue state=%d seq=%llu",
> +				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);
> +
> +		/* Transition back to NONE: task outside scheduler control */
> +		tctx->state = TASK_NONE;
> +	} else {
> +		/*
> +		 * Regular dispatch dequeue: normal workflow step. Valid
> +		 * only from ENQUEUED state (after enqueue, before dispatch
> +		 * dequeue). Transitions to DISPATCHED state.
> +		 */
> +		__sync_fetch_and_add(&dispatch_dequeue_cnt, 1);
> +
> +		/*
> +		 * Dispatch dequeue should not have %SCX_DEQ_SCHED_CHANGE
> +		 * flag.
> +		 */
> +		if (deq_flags & SCX_DEQ_SCHED_CHANGE)
> +			scx_bpf_error("%d (%s): SCX_DEQ_SCHED_CHANGE in dispatch dequeue seq=%llu",
> +				      p->pid, p->comm, tctx->enqueue_seq);
> +
> +		/*
> +		 * Must be in ENQUEUED state.
> +		 */
> +		if (tctx->state != TASK_ENQUEUED)
> +			scx_bpf_error("%d (%s): dispatch dequeue from state %d seq=%llu",
> +				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);
> +
> +		/*
> +		 * Transition to DISPATCHED: normal cycle completed
> +		 * dispatch.
> +		 */
> +		tctx->state = TASK_DISPATCHED;
> +	}
> +}
> +
> +void BPF_STRUCT_OPS(dequeue_dispatch, s32 cpu, struct task_struct *prev)
> +{
> +	if (test_scenario == 6) {
> +		s32 pid;
> +		struct task_struct *p;
> +
> +		if (bpf_map_pop_elem(&global_queue, &pid))
> +			return;
> +
> +		p = bpf_task_from_pid(pid);
> +		if (!p)
> +			return;
> +
> +		if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
> +			cpu = scx_bpf_task_cpu(p);
> +
> +		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0);
> +		bpf_task_release(p);
> +	} else {
> +		scx_bpf_dsq_move_to_local(SHARED_DSQ);
> +	}
> +}
> +
> +s32 BPF_STRUCT_OPS(dequeue_init_task, struct task_struct *p,
> +		   struct scx_init_task_args *args)
> +{
> +	struct task_ctx *tctx;
> +
> +	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0,
> +				   BPF_LOCAL_STORAGE_GET_F_CREATE);
> +	if (!tctx)
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +
> +s32 BPF_STRUCT_OPS_SLEEPABLE(dequeue_init)
> +{
> +	s32 ret;
> +
> +	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
> +	if (ret)
> +		return ret;
> +
> +	return 0;
> +}
> +
> +void BPF_STRUCT_OPS(dequeue_exit, struct scx_exit_info *ei)
> +{
> +	UEI_RECORD(uei, ei);
> +}
> +
> +SEC(".struct_ops.link")
> +struct sched_ext_ops dequeue_ops = {
> +	.select_cpu		= (void *)dequeue_select_cpu,
> +	.enqueue		= (void *)dequeue_enqueue,
> +	.dequeue		= (void *)dequeue_dequeue,
> +	.dispatch		= (void *)dequeue_dispatch,
> +	.init_task		= (void *)dequeue_init_task,
> +	.init			= (void *)dequeue_init,
> +	.exit			= (void *)dequeue_exit,
> +	.name			= "dequeue_test",
> +};
> diff --git a/tools/testing/selftests/sched_ext/dequeue.c b/tools/testing/selftests/sched_ext/dequeue.c
> new file mode 100644
> index 0000000000000..b0add2a516ab8
> --- /dev/null
> +++ b/tools/testing/selftests/sched_ext/dequeue.c
> @@ -0,0 +1,258 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2025 NVIDIA Corporation.
> + */
> +#define _GNU_SOURCE
> +#include <stdio.h>
> +#include <unistd.h>
> +#include <signal.h>
> +#include <time.h>
> +#include <bpf/bpf.h>
> +#include <scx/common.h>
> +#include <sys/wait.h>
> +#include <sched.h>
> +#include <pthread.h>
> +#include "scx_test.h"
> +#include "dequeue.bpf.skel.h"
> +
> +#define NUM_WORKERS 8
> +#define AFFINITY_HAMMER_MS 50   /* How long affinity hammer runs (scenario 6) */
> +
> +/*
> + * Worker function that creates enqueue/dequeue events via CPU work and
> + * sleeping. Property-change dequeues are triggered by the affinity hammer
> + * thread (external sched_setaffinity on worker PIDs).
> + */
> +static void worker_fn(int id)
> +{
> +	int i;
> +	volatile int sum = 0;
> +
> +	for (i = 0; i < 1000; i++) {
> +		int j;
> +
> +		/* Do some work to trigger scheduling events */
> +		for (j = 0; j < 10000; j++)
> +			sum += j;
> +
> +		/* Sleep to trigger dequeue */
> +		usleep(1000 + (id * 100));
> +	}
> +
> +	exit(0);
> +}
> +
> +/*
> + * For scenario 6, tasks sit in the BPF queue until dispatch consumes them.
> + * Property-change dequeues only happen when a task gets a property change
> + * while still in the queue (SCX_OPSS_QUEUED), not after it has been
> + * dispatched. This thread changes workers' affinity from outside so that
> + * some changes hit tasks while they are still in the queue.
> + */
> +static void *affinity_hammer_fn(void *arg)
> +{
> +	pid_t *pids = arg;
> +	cpu_set_t cpuset;
> +	int i, n = NUM_WORKERS;
> +	struct timespec ts = { .tv_sec = 0, .tv_nsec = 1000000 }; /* 1ms */
> +
> +	for (i = 0; i < (AFFINITY_HAMMER_MS * 1000 / 100); i++) {
> +		int w = i % n;
> +		int cpu = (i / n) % 4;
> +
> +		CPU_ZERO(&cpuset);
> +		CPU_SET(cpu, &cpuset);
> +		sched_setaffinity(pids[w], sizeof(cpuset), &cpuset);
> +		nanosleep(&ts, NULL);
> +	}
> +	return NULL;
> +}
> +
> +static enum scx_test_status run_scenario(struct dequeue *skel, u32 scenario,
> +					 const char *scenario_name)
> +{
> +	struct bpf_link *link;
> +	pid_t pids[NUM_WORKERS];
> +	pthread_t hammer;
> +
> +	int i, status;
> +	u64 enq_start, deq_start, dispatch_deq_start, change_deq_start;
> +	u64 enq_delta, deq_delta, dispatch_deq_delta, change_deq_delta;
> +
> +	/* Set the test scenario */
> +	skel->bss->test_scenario = scenario;
> +
> +	/* Record starting counts */
> +	enq_start = skel->bss->enqueue_cnt;
> +	deq_start = skel->bss->dequeue_cnt;
> +	dispatch_deq_start = skel->bss->dispatch_dequeue_cnt;
> +	change_deq_start = skel->bss->change_dequeue_cnt;
> +
> +	link = bpf_map__attach_struct_ops(skel->maps.dequeue_ops);
> +	SCX_FAIL_IF(!link, "Failed to attach struct_ops for scenario %s", scenario_name);
> +
> +	/* Fork worker processes to generate enqueue/dequeue events */
> +	for (i = 0; i < NUM_WORKERS; i++) {
> +		pids[i] = fork();
> +		SCX_FAIL_IF(pids[i] < 0, "Failed to fork worker %d", i);
> +
> +		if (pids[i] == 0) {
> +			worker_fn(i);
> +			/* Should not reach here */
> +			exit(1);
> +		}
> +	}
> +
> +	/*
> +	 * Run an "affinity hammer" so that some property changes hit tasks
> +	 * while they are still in BPF custody (e.g. in user DSQ or BPF queue),
> +	 * triggering SCX_DEQ_SCHED_CHANGE dequeues in scenarios 2, 5 and 6.
> +	 */
> +	SCX_FAIL_IF(pthread_create(&hammer, NULL, affinity_hammer_fn, pids) != 0,
> +		    "Failed to create affinity hammer thread");
> +
> +	/* Wait for all workers to complete */
> +	for (i = 0; i < NUM_WORKERS; i++) {
> +		SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i],
> +			    "Failed to wait for worker %d", i);
> +		SCX_FAIL_IF(status != 0, "Worker %d exited with status %d", i, status);
> +	}
> +
> +	pthread_join(hammer, NULL);
> +
> +	bpf_link__destroy(link);
> +
> +	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG));
> +
> +	/* Calculate deltas */
> +	enq_delta = skel->bss->enqueue_cnt - enq_start;
> +	deq_delta = skel->bss->dequeue_cnt - deq_start;
> +	dispatch_deq_delta = skel->bss->dispatch_dequeue_cnt - dispatch_deq_start;
> +	change_deq_delta = skel->bss->change_dequeue_cnt - change_deq_start;
> +
> +	printf("%s:\n", scenario_name);
> +	printf("  enqueues: %lu\n", (unsigned long)enq_delta);
> +	printf("  dequeues: %lu (dispatch: %lu, property_change: %lu)\n",
> +	       (unsigned long)deq_delta,
> +	       (unsigned long)dispatch_deq_delta,
> +	       (unsigned long)change_deq_delta);
> +
> +	/*
> +	 * Validate enqueue/dequeue lifecycle tracking.
> +	 *
> +	 * For scenarios 0, 1, 3, 4 (local and global DSQs from
> +	 * ops.select_cpu() and ops.enqueue()), both enqueues and dequeues
> +	 * should be 0 because tasks bypass the BPF scheduler entirely: tasks
> +	 * never enter BPF scheduler's custody.
> +	 *
> +	 * For scenarios 2, 5, and 6 (user DSQ or BPF internal queue from
> +	 * ops.select_cpu() / ops.enqueue()), we expect both enqueues and
> +	 * dequeues.
> +	 *
> +	 * The BPF code does strict state machine validation with
> +	 * scx_bpf_error() to ensure the workflow semantics are correct. If
> +	 * we reach this point without errors, the semantics are validated
> +	 * correctly.
> +	 */
> +	if (scenario == 0 || scenario == 1 || scenario == 3 || scenario == 4) {
> +		/* Terminal DSQs: tasks bypass BPF scheduler completely */
> +		SCX_EQ(enq_delta, 0);
> +		SCX_EQ(deq_delta, 0);
> +		SCX_EQ(dispatch_deq_delta, 0);
> +		SCX_EQ(change_deq_delta, 0);
> +	} else {
> +		/* User DSQ: tasks enter BPF scheduler's custody */
> +		SCX_GT(enq_delta, 0);
> +		SCX_GT(deq_delta, 0);
> +		/* Validate 1:1 enqueue/dequeue pairing */
> +		SCX_EQ(enq_delta, deq_delta);
> +	}
> +
> +	return SCX_TEST_PASS;
> +}
> +
> +static enum scx_test_status setup(void **ctx)
> +{
> +	struct dequeue *skel;
> +
> +	skel = dequeue__open();
> +	SCX_FAIL_IF(!skel, "Failed to open skel");
> +	SCX_ENUM_INIT(skel);
> +	SCX_FAIL_IF(dequeue__load(skel), "Failed to load skel");
> +
> +	*ctx = skel;
> +
> +	return SCX_TEST_PASS;
> +}
> +
> +static enum scx_test_status run(void *ctx)
> +{
> +	struct dequeue *skel = ctx;
> +	enum scx_test_status status;
> +
> +	/* Scenarios 0-2: ops.select_cpu(), 3-6: ops.enqueue() */
> +	status = run_scenario(skel, 0, "Scenario 0: Local DSQ from ops.select_cpu()");
> +	if (status != SCX_TEST_PASS)
> +		return status;
> +
> +	status = run_scenario(skel, 1, "Scenario 1: Global DSQ from ops.select_cpu()");
> +	if (status != SCX_TEST_PASS)
> +		return status;
> +
> +	status = run_scenario(skel, 2, "Scenario 2: User DSQ from ops.select_cpu()");
> +	if (status != SCX_TEST_PASS)
> +		return status;
> +
> +	status = run_scenario(skel, 3, "Scenario 3: Local DSQ from ops.enqueue()");
> +	if (status != SCX_TEST_PASS)
> +		return status;
> +
> +	status = run_scenario(skel, 4, "Scenario 4: Global DSQ from ops.enqueue()");
> +	if (status != SCX_TEST_PASS)
> +		return status;
> +
> +	status = run_scenario(skel, 5, "Scenario 5: User DSQ from ops.enqueue()");
> +	if (status != SCX_TEST_PASS)
> +		return status;
> +
> +	status = run_scenario(skel, 6,
> +			     "Scenario 6: BPF internal queue from ops.enqueue(), consumed in ops.dispatch()");
> +	if (status != SCX_TEST_PASS)
> +		return status;
> +
> +	printf("\n=== Summary ===\n");
> +	printf("Total enqueues: %lu\n", (unsigned long)skel->bss->enqueue_cnt);
> +	printf("Total dequeues: %lu\n", (unsigned long)skel->bss->dequeue_cnt);
> +	printf("  Dispatch dequeues: %lu (no flag, normal workflow)\n",
> +	       (unsigned long)skel->bss->dispatch_dequeue_cnt);
> +	printf("  Property change dequeues: %lu (SCX_DEQ_SCHED_CHANGE flag)\n",
> +	       (unsigned long)skel->bss->change_dequeue_cnt);
> +	printf("\nAll scenarios passed - no state machine violations detected\n");
> +	printf("-> Validated: Local DSQ dispatch bypasses BPF scheduler (both paths)\n");
> +	printf("-> Validated: Global DSQ dispatch bypasses BPF scheduler (both paths)\n");
> +	printf("-> Validated: User DSQ dispatch triggers dequeue callbacks (both paths)\n");
> +	printf("-> Validated: BPF internal queue triggers dequeue when leaving custody\n");
> +	printf("-> Validated: ops.enqueue() and ops.select_cpu() behave identically\n");
> +	printf("-> Validated: Dispatch dequeues have no flags (normal workflow)\n");
> +	printf("-> Validated: Property change dequeues have SCX_DEQ_SCHED_CHANGE flag\n");
> +	printf("-> Validated: No duplicate enqueues or invalid state transitions\n");
> +
> +	return SCX_TEST_PASS;
> +}
> +
> +static void cleanup(void *ctx)
> +{
> +	struct dequeue *skel = ctx;
> +
> +	dequeue__destroy(skel);
> +}
> +
> +struct scx_test dequeue_test = {
> +	.name = "dequeue",
> +	.description = "Verify ops.dequeue() semantics",
> +	.setup = setup,
> +	.run = run,
> +	.cleanup = cleanup,
> +};
> +
> +REGISTER_SCX_TEST(&dequeue_test)


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] sched_ext: Fix ops.dequeue() semantics
  2026-02-06 13:54 ` [PATCH 1/2] " Andrea Righi
@ 2026-02-06 20:35   ` Emil Tsalapatis
  2026-02-07  9:26     ` Andrea Righi
  0 siblings, 1 reply; 33+ messages in thread
From: Emil Tsalapatis @ 2026-02-06 20:35 UTC (permalink / raw)
  To: Andrea Righi, Tejun Heo, David Vernet, Changwoo Min
  Cc: Kuba Piecuch, Christian Loehle, Daniel Hodges, sched-ext,
	linux-kernel

On Fri Feb 6, 2026 at 8:54 AM EST, Andrea Righi wrote:
> Currently, ops.dequeue() is only invoked when the sched_ext core knows
> that a task resides in BPF-managed data structures, which causes it to
> miss scheduling property change events. In addition, ops.dequeue()
> callbacks are completely skipped when tasks are dispatched to non-local
> DSQs from ops.select_cpu(). As a result, BPF schedulers cannot reliably
> track task state.
>
> Fix this by guaranteeing that each task entering the BPF scheduler's
> custody triggers exactly one ops.dequeue() call when it leaves that
> custody, whether the exit is due to a dispatch (regular or via a core
> scheduling pick) or to a scheduling property change (e.g.
> sched_setaffinity(), sched_setscheduler(), set_user_nice(), NUMA
> balancing, etc.).
>
> BPF scheduler custody concept: a task is considered to be in the BPF
> scheduler's custody when the scheduler is responsible for managing its
> lifecycle. This includes tasks dispatched to user-created DSQs or stored
> in the BPF scheduler's internal data structures. Custody ends when the
> task is dispatched to a terminal DSQ (such as the local DSQ or
> %SCX_DSQ_GLOBAL), selected by core scheduling, or removed due to a
> property change.
>
> Tasks directly dispatched to terminal DSQs bypass the BPF scheduler
> entirely and are never in its custody. Terminal DSQs include:
>  - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON): per-CPU queues
>    where tasks go directly to execution.
>  - Global DSQ (%SCX_DSQ_GLOBAL): the built-in fallback queue where the
>    BPF scheduler is considered "done" with the task.
>
> As a result, ops.dequeue() is not invoked for tasks directly dispatched
> to terminal DSQs.
>
> To identify dequeues triggered by scheduling property changes, introduce
> the new ops.dequeue() flag %SCX_DEQ_SCHED_CHANGE: when this flag is set,
> the dequeue was caused by a scheduling property change.
>
> New ops.dequeue() semantics:
>  - ops.dequeue() is invoked exactly once when the task leaves the BPF
>    scheduler's custody, in one of the following cases:
>    a) regular dispatch: a task dispatched to a user DSQ or stored in
>       internal BPF data structures is moved to a terminal DSQ
>       (ops.dequeue() called without any special flags set),
>    b) core scheduling dispatch: core-sched picks task before dispatch
>       (ops.dequeue() called with %SCX_DEQ_CORE_SCHED_EXEC flag set),
>    c) property change: task properties modified before dispatch,
>       (ops.dequeue() called with %SCX_DEQ_SCHED_CHANGE flag set).
>
> This allows BPF schedulers to:
>  - reliably track task ownership and lifecycle,
>  - maintain accurate accounting of managed tasks,
>  - update internal state when tasks change properties.
>
> Cc: Tejun Heo <tj@kernel.org>
> Cc: Emil Tsalapatis <emil@etsalapatis.com>
> Cc: Kuba Piecuch <jpiecuch@google.com>
> Signed-off-by: Andrea Righi <arighi@nvidia.com>
> ---

Hi Andrea,

>  Documentation/scheduler/sched-ext.rst         |  58 +++++++
>  include/linux/sched/ext.h                     |   1 +
>  kernel/sched/ext.c                            | 157 ++++++++++++++++--
>  kernel/sched/ext_internal.h                   |   7 +
>  .../sched_ext/include/scx/enum_defs.autogen.h |   1 +
>  .../sched_ext/include/scx/enums.autogen.bpf.h |   2 +
>  tools/sched_ext/include/scx/enums.autogen.h   |   1 +
>  7 files changed, 213 insertions(+), 14 deletions(-)
>
> diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst
> index 404fe6126a769..fe8c59b0c1477 100644
> --- a/Documentation/scheduler/sched-ext.rst
> +++ b/Documentation/scheduler/sched-ext.rst
> @@ -252,6 +252,62 @@ The following briefly shows how a waking task is scheduled and executed.
>  
>     * Queue the task on the BPF side.
>  
> +   **Task State Tracking and ops.dequeue() Semantics**
> +
> +   A task is in the "BPF scheduler's custody" when the BPF scheduler is
> +   responsible for managing its lifecycle. That includes tasks dispatched
> +   to user-created DSQs or stored in the BPF scheduler's internal data
> +   structures. Once ``ops.select_cpu()`` or ``ops.enqueue()`` is called,
> +   the task may or may not enter custody depending on what the scheduler
> +   does:
> +
> +   * **Directly dispatched to terminal DSQs** (``SCX_DSQ_LOCAL``,
> +     ``SCX_DSQ_LOCAL_ON | cpu``, or ``SCX_DSQ_GLOBAL``): The BPF scheduler
> +     is done with the task - it either goes straight to a CPU's local run
> +     queue or to the global DSQ as a fallback. The task never enters (or
> +     exits) BPF custody, and ``ops.dequeue()`` will not be called.
> +
> +   * **Dispatch to user-created DSQs** (custom DSQs): the task enters the
> +     BPF scheduler's custody. When the task later leaves BPF custody
> +     (dispatched to a terminal DSQ, picked by core-sched, or dequeued for
> +     sleep/property changes), ``ops.dequeue()`` will be called exactly once.
> +
> +   * **Queued on BPF side** (e.g., internal queues, no DSQ): The task is in
> +     BPF custody. ``ops.dequeue()`` will be called when it leaves (e.g.
> +     when ``ops.dispatch()`` moves it to a terminal DSQ, or on property
> +     change / sleep).
> +
> +   **NOTE**: this concept is valid also with the ``ops.select_cpu()``
> +   direct dispatch optimization. Even though it skips ``ops.enqueue()``
> +   invocation, if the task is dispatched to a user-created DSQ or internal
> +   BPF structure, it enters BPF custody and will get ``ops.dequeue()`` when
> +   it leaves. If dispatched to a terminal DSQ, the BPF scheduler is done
> +   with it immediately. This provides the performance benefit of avoiding
> +   the ``ops.enqueue()`` roundtrip while maintaining correct state
> +   tracking.
> +
> +   The dequeue can happen for different reasons, distinguished by flags:
> +
> +   1. **Regular dispatch**: when a task in BPF custody is dispatched to a
> +      terminal DSQ from ``ops.dispatch()`` (leaving BPF custody for
> +      execution), ``ops.dequeue()`` is triggered without any special flags.
> +
> +   2. **Core scheduling pick**: when ``CONFIG_SCHED_CORE`` is enabled and
> +      core scheduling picks a task for execution while it's still in BPF
> +      custody, ``ops.dequeue()`` is called with the
> +      ``SCX_DEQ_CORE_SCHED_EXEC`` flag.
> +
> +   3. **Scheduling property change**: when a task property changes (via
> +      operations like ``sched_setaffinity()``, ``sched_setscheduler()``,
> +      priority changes, CPU migrations, etc.) while the task is still in
> +      BPF custody, ``ops.dequeue()`` is called with the
> +      ``SCX_DEQ_SCHED_CHANGE`` flag set in ``deq_flags``.
> +
> +   **Important**: Once a task has left BPF custody (e.g. after being
> +   dispatched to a terminal DSQ), property changes will not trigger
> +   ``ops.dequeue()``, since the task is no longer being managed by the BPF
> +   scheduler.
> +
>  3. When a CPU is ready to schedule, it first looks at its local DSQ. If
>     empty, it then looks at the global DSQ. If there still isn't a task to
>     run, ``ops.dispatch()`` is invoked which can use the following two
> @@ -319,6 +375,8 @@ by a sched_ext scheduler:
>                  /* Any usable CPU becomes available */
>  
>                  ops.dispatch(); /* Task is moved to a local DSQ */
> +
> +                ops.dequeue(); /* Exiting BPF scheduler */
>              }
>              ops.running();      /* Task starts running on its assigned CPU */
>              while (task->scx.slice > 0 && task is runnable)
> diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
> index bcb962d5ee7d8..c48f818eee9b8 100644
> --- a/include/linux/sched/ext.h
> +++ b/include/linux/sched/ext.h
> @@ -84,6 +84,7 @@ struct scx_dispatch_q {
>  /* scx_entity.flags */
>  enum scx_ent_flags {
>  	SCX_TASK_QUEUED		= 1 << 0, /* on ext runqueue */
> +	SCX_TASK_NEED_DEQ	= 1 << 1, /* in BPF custody, needs ops.dequeue() when leaving */

Can we make this "SCX_TASK_IN_BPF"? Since we've now defined what it means to be
in BPF custody vs the core scx scheduler (terminal DSQs) this is a more
general property that can be useful to check in the future. An example:
We can now assert that a task's BPF state is consistent with its actual 
kernel state when using BPF-based data structures to manage tasks.

>  	SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
>  	SCX_TASK_DEQD_FOR_SLEEP	= 1 << 3, /* last dequeue was for SLEEP */
>  
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 0bb8fa927e9e9..d17fd9141adf4 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -925,6 +925,27 @@ static void touch_core_sched(struct rq *rq, struct task_struct *p)
>  #endif
>  }
>  
> +/**
> + * is_terminal_dsq - Check if a DSQ is terminal for ops.dequeue() purposes
> + * @dsq_id: DSQ ID to check
> + *
> + * Returns true if @dsq_id is a terminal/builtin DSQ where the BPF
> + * scheduler is considered "done" with the task.
> + *
> + * Builtin DSQs include:
> + *  - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON): per-CPU queues
> + *    where tasks go directly to execution,
> + *  - Global DSQ (%SCX_DSQ_GLOBAL): built-in fallback queue,
> + *  - Bypass DSQ: used during bypass mode.
> + *
> + * Tasks dispatched to builtin DSQs exit BPF scheduler custody and do not
> + * trigger ops.dequeue() when they are later consumed.
> + */
> +static inline bool is_terminal_dsq(u64 dsq_id)
> +{
> +	return dsq_id & SCX_DSQ_FLAG_BUILTIN;
> +}
> +
>  /**
>   * touch_core_sched_dispatch - Update core-sched timestamp on dispatch
>   * @rq: rq to read clock from, must be locked
> @@ -1008,7 +1029,8 @@ static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p
>  		resched_curr(rq);
>  }
>  
> -static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
> +static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq,
> +			     struct scx_dispatch_q *dsq,
>  			     struct task_struct *p, u64 enq_flags)
>  {
>  	bool is_local = dsq->id == SCX_DSQ_LOCAL;
> @@ -1103,6 +1125,27 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
>  	dsq_mod_nr(dsq, 1);
>  	p->scx.dsq = dsq;
>  
> +	/*
> +	 * Handle ops.dequeue() and custody tracking.
> +	 *
> +	 * Builtin DSQs (local, global, bypass) are terminal: the BPF
> +	 * scheduler is done with the task. If it was in BPF custody, call
> +	 * ops.dequeue() and clear the flag.
> +	 *
> +	 * User DSQs: Task is in BPF scheduler's custody. Set the flag so
> +	 * ops.dequeue() will be called when it leaves.
> +	 */
> +	if (SCX_HAS_OP(sch, dequeue)) {
> +		if (is_terminal_dsq(dsq->id)) {
> +			if (p->scx.flags & SCX_TASK_NEED_DEQ)
> +				SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue,
> +						 rq, p, 0);
> +			p->scx.flags &= ~SCX_TASK_NEED_DEQ;
> +		} else {
> +			p->scx.flags |= SCX_TASK_NEED_DEQ;
> +		}
> +	}
> +
>  	/*
>  	 * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the
>  	 * direct dispatch path, but we clear them here because the direct
> @@ -1323,7 +1366,7 @@ static void direct_dispatch(struct scx_sched *sch, struct task_struct *p,
>  		return;
>  	}
>  
> -	dispatch_enqueue(sch, dsq, p,
> +	dispatch_enqueue(sch, rq, dsq, p,
>  			 p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
>  }
>  
> @@ -1407,13 +1450,22 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
>  	 * dequeue may be waiting. The store_release matches their load_acquire.
>  	 */
>  	atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq);
> +
> +	/*
> +	 * Task is now in BPF scheduler's custody (queued on BPF internal
> +	 * structures). Set %SCX_TASK_NEED_DEQ so ops.dequeue() is called
> +	 * when it leaves custody (e.g. dispatched to a terminal DSQ or on
> +	 * property change).
> +	 */
> +	if (SCX_HAS_OP(sch, dequeue))

Related to the rename: Can we remove the guards and track the flag
regardless of whether ops.dequeue() is present?

There is no reason not to track whether a task is in BPF or the core, 
and it is a property that's independent of whether we implement ops.dequeue(). 
This also simplifies the code since we now just guard the actual ops.dequeue()
call.

> +		p->scx.flags |= SCX_TASK_NEED_DEQ;
>  	return;
>  
>  direct:
>  	direct_dispatch(sch, p, enq_flags);
>  	return;
>  local_norefill:
> -	dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags);
> +	dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, enq_flags);
>  	return;
>  local:
>  	dsq = &rq->scx.local_dsq;
> @@ -1433,7 +1485,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
>  	 */
>  	touch_core_sched(rq, p);
>  	refill_task_slice_dfl(sch, p);
> -	dispatch_enqueue(sch, dsq, p, enq_flags);
> +	dispatch_enqueue(sch, rq, dsq, p, enq_flags);
>  }
>  
>  static bool task_runnable(const struct task_struct *p)
> @@ -1511,6 +1563,22 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
>  		__scx_add_event(sch, SCX_EV_SELECT_CPU_FALLBACK, 1);
>  }
>  
> +/*
> + * Call ops.dequeue() for a task leaving BPF custody. Adds %SCX_DEQ_SCHED_CHANGE
> + * when the dequeue is due to a property change (not sleep or core-sched pick).
> + */
> +static void call_task_dequeue(struct scx_sched *sch, struct rq *rq,
> +			      struct task_struct *p, u64 deq_flags)
> +{
> +	u64 flags = deq_flags;
> +
> +	if (!(deq_flags & (DEQUEUE_SLEEP | SCX_DEQ_CORE_SCHED_EXEC)))
> +		flags |= SCX_DEQ_SCHED_CHANGE;
> +
> +	SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq, p, flags);
> +	p->scx.flags &= ~SCX_TASK_NEED_DEQ;
> +}
> +
>  static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
>  {
>  	struct scx_sched *sch = scx_root;
> @@ -1524,6 +1592,24 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
>  
>  	switch (opss & SCX_OPSS_STATE_MASK) {
>  	case SCX_OPSS_NONE:
> +		/*
> +		 * Task is not in BPF data structures (either dispatched to
> +		 * a DSQ or running). Only call ops.dequeue() if the task
> +		 * is still in BPF scheduler's custody (%SCX_TASK_NEED_DEQ
> +		 * is set).
> +		 *
> +		 * If the task has already been dispatched to a terminal
> +		 * DSQ (local DSQ or %SCX_DSQ_GLOBAL), it has left the BPF
> +		 * scheduler's custody and the flag will be clear, so we
> +		 * skip ops.dequeue().
> +		 *
> +		 * If this is a property change (not sleep/core-sched) and
> +		 * the task is still in BPF custody, set the
> +		 * %SCX_DEQ_SCHED_CHANGE flag.
> +		 */
> +		if (SCX_HAS_OP(sch, dequeue) &&
> +		    (p->scx.flags & SCX_TASK_NEED_DEQ))
> +			call_task_dequeue(sch, rq, p, deq_flags);
>  		break;
>  	case SCX_OPSS_QUEUEING:
>  		/*
> @@ -1532,9 +1618,14 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
>  		 */
>  		BUG();
>  	case SCX_OPSS_QUEUED:
> -		if (SCX_HAS_OP(sch, dequeue))
> -			SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
> -					 p, deq_flags);
> +		/*
> +		 * Task is still on the BPF scheduler (not dispatched yet).
> +		 * Call ops.dequeue() to notify it is leaving BPF custody.
> +		 */
> +		if (SCX_HAS_OP(sch, dequeue)) {
> +			WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_NEED_DEQ));
> +			call_task_dequeue(sch, rq, p, deq_flags);
> +		}
>  
>  		if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
>  					    SCX_OPSS_NONE))
> @@ -1631,6 +1722,7 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
>  					 struct scx_dispatch_q *src_dsq,
>  					 struct rq *dst_rq)
>  {
> +	struct scx_sched *sch = scx_root;
>  	struct scx_dispatch_q *dst_dsq = &dst_rq->scx.local_dsq;
>  
>  	/* @dsq is locked and @p is on @dst_rq */
> @@ -1639,6 +1731,15 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
>  
>  	WARN_ON_ONCE(p->scx.holding_cpu >= 0);
>  
> +	/*
> +	 * Task is moving from a non-local DSQ to a local (terminal) DSQ.
> +	 * Call ops.dequeue() if the task was in BPF custody.
> +	 */
> +	if (SCX_HAS_OP(sch, dequeue) && (p->scx.flags & SCX_TASK_NEED_DEQ)) {
> +		SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, dst_rq, p, 0);
> +		p->scx.flags &= ~SCX_TASK_NEED_DEQ;
> +	}
> +
>  	if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
>  		list_add(&p->scx.dsq_list.node, &dst_dsq->list);
>  	else
> @@ -1879,7 +1980,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch,
>  		dispatch_dequeue_locked(p, src_dsq);
>  		raw_spin_unlock(&src_dsq->lock);
>  
> -		dispatch_enqueue(sch, dst_dsq, p, enq_flags);
> +		dispatch_enqueue(sch, dst_rq, dst_dsq, p, enq_flags);
>  	}
>  
>  	return dst_rq;
> @@ -1969,14 +2070,14 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
>  	 * If dispatching to @rq that @p is already on, no lock dancing needed.
>  	 */
>  	if (rq == src_rq && rq == dst_rq) {
> -		dispatch_enqueue(sch, dst_dsq, p,
> +		dispatch_enqueue(sch, rq, dst_dsq, p,
>  				 enq_flags | SCX_ENQ_CLEAR_OPSS);
>  		return;
>  	}
>  
>  	if (src_rq != dst_rq &&
>  	    unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
> -		dispatch_enqueue(sch, find_global_dsq(sch, p), p,
> +		dispatch_enqueue(sch, rq, find_global_dsq(sch, p), p,
>  				 enq_flags | SCX_ENQ_CLEAR_OPSS);
>  		return;
>  	}
> @@ -2014,9 +2115,21 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
>  		 */
>  		if (src_rq == dst_rq) {
>  			p->scx.holding_cpu = -1;
> -			dispatch_enqueue(sch, &dst_rq->scx.local_dsq, p,
> +			dispatch_enqueue(sch, dst_rq, &dst_rq->scx.local_dsq, p,
>  					 enq_flags);
>  		} else {
> +			/*
> +			 * Moving to a remote local DSQ. dispatch_enqueue() is
> +			 * not used (we go through deactivate/activate), so
> +			 * call ops.dequeue() here if the task was in BPF
> +			 * custody.
> +			 */
> +			if (SCX_HAS_OP(sch, dequeue) &&
> +			    (p->scx.flags & SCX_TASK_NEED_DEQ)) {
> +				SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue,
> +						 src_rq, p, 0);
> +				p->scx.flags &= ~SCX_TASK_NEED_DEQ;
> +			}
>  			move_remote_task_to_local_dsq(p, enq_flags,
>  						      src_rq, dst_rq);
>  			/* task has been moved to dst_rq, which is now locked */
> @@ -2113,7 +2226,7 @@ static void finish_dispatch(struct scx_sched *sch, struct rq *rq,
>  	if (dsq->id == SCX_DSQ_LOCAL)
>  		dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags);
>  	else
> -		dispatch_enqueue(sch, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
> +		dispatch_enqueue(sch, rq, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
>  }
>  
>  static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq)
> @@ -2414,7 +2527,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
>  		 * DSQ.
>  		 */
>  		if (p->scx.slice && !scx_rq_bypassing(rq)) {
> -			dispatch_enqueue(sch, &rq->scx.local_dsq, p,
> +			dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p,
>  					 SCX_ENQ_HEAD);
>  			goto switch_class;
>  		}
> @@ -2898,6 +3011,14 @@ static void scx_enable_task(struct task_struct *p)
>  
>  	lockdep_assert_rq_held(rq);
>  
> +	/*
> +	 * Verify the task is not in BPF scheduler's custody. If flag
> +	 * transitions are consistent, the flag should always be clear
> +	 * here.
> +	 */
> +	if (SCX_HAS_OP(sch, dequeue))
> +		WARN_ON_ONCE(p->scx.flags & SCX_TASK_NEED_DEQ);
> +
>  	/*
>  	 * Set the weight before calling ops.enable() so that the scheduler
>  	 * doesn't see a stale value if they inspect the task struct.
> @@ -2929,6 +3050,14 @@ static void scx_disable_task(struct task_struct *p)
>  	if (SCX_HAS_OP(sch, disable))
>  		SCX_CALL_OP_TASK(sch, SCX_KF_REST, disable, rq, p);
>  	scx_set_task_state(p, SCX_TASK_READY);
> +
> +	/*
> +	 * Verify the task is not in BPF scheduler's custody. If flag
> +	 * transitions are consistent, the flag should always be clear
> +	 * here.
> +	 */
> +	if (SCX_HAS_OP(sch, dequeue))
> +		WARN_ON_ONCE(p->scx.flags & SCX_TASK_NEED_DEQ);
>  }
>  
>  static void scx_exit_task(struct task_struct *p)
> @@ -3919,7 +4048,7 @@ static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
>  		 * between bypass DSQs.
>  		 */
>  		dispatch_dequeue_locked(p, donor_dsq);
> -		dispatch_enqueue(sch, donee_dsq, p, SCX_ENQ_NESTED);
> +		dispatch_enqueue(sch, donee_rq, donee_dsq, p, SCX_ENQ_NESTED);
>  
>  		/*
>  		 * $donee might have been idle and need to be woken up. No need
> diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
> index 386c677e4c9a0..befa9a5d6e53f 100644
> --- a/kernel/sched/ext_internal.h
> +++ b/kernel/sched/ext_internal.h
> @@ -982,6 +982,13 @@ enum scx_deq_flags {
>  	 * it hasn't been dispatched yet. Dequeue from the BPF side.
>  	 */
>  	SCX_DEQ_CORE_SCHED_EXEC	= 1LLU << 32,
> +
> +	/*
> +	 * The task is being dequeued due to a property change (e.g.,
> +	 * sched_setaffinity(), sched_setscheduler(), set_user_nice(),
> +	 * etc.).
> +	 */
> +	SCX_DEQ_SCHED_CHANGE	= 1LLU << 33,
>  };
>  
>  enum scx_pick_idle_cpu_flags {
> diff --git a/tools/sched_ext/include/scx/enum_defs.autogen.h b/tools/sched_ext/include/scx/enum_defs.autogen.h
> index c2c33df9292c2..dcc945304760f 100644
> --- a/tools/sched_ext/include/scx/enum_defs.autogen.h
> +++ b/tools/sched_ext/include/scx/enum_defs.autogen.h
> @@ -21,6 +21,7 @@
>  #define HAVE_SCX_CPU_PREEMPT_UNKNOWN
>  #define HAVE_SCX_DEQ_SLEEP
>  #define HAVE_SCX_DEQ_CORE_SCHED_EXEC
> +#define HAVE_SCX_DEQ_SCHED_CHANGE
>  #define HAVE_SCX_DSQ_FLAG_BUILTIN
>  #define HAVE_SCX_DSQ_FLAG_LOCAL_ON
>  #define HAVE_SCX_DSQ_INVALID
> diff --git a/tools/sched_ext/include/scx/enums.autogen.bpf.h b/tools/sched_ext/include/scx/enums.autogen.bpf.h
> index 2f8002bcc19ad..5da50f9376844 100644
> --- a/tools/sched_ext/include/scx/enums.autogen.bpf.h
> +++ b/tools/sched_ext/include/scx/enums.autogen.bpf.h
> @@ -127,3 +127,5 @@ const volatile u64 __SCX_ENQ_CLEAR_OPSS __weak;
>  const volatile u64 __SCX_ENQ_DSQ_PRIQ __weak;
>  #define SCX_ENQ_DSQ_PRIQ __SCX_ENQ_DSQ_PRIQ
>  
> +const volatile u64 __SCX_DEQ_SCHED_CHANGE __weak;
> +#define SCX_DEQ_SCHED_CHANGE __SCX_DEQ_SCHED_CHANGE
> diff --git a/tools/sched_ext/include/scx/enums.autogen.h b/tools/sched_ext/include/scx/enums.autogen.h
> index fedec938584be..fc9a7a4d9dea5 100644
> --- a/tools/sched_ext/include/scx/enums.autogen.h
> +++ b/tools/sched_ext/include/scx/enums.autogen.h
> @@ -46,4 +46,5 @@
>  	SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_LAST); \
>  	SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_CLEAR_OPSS); \
>  	SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_DSQ_PRIQ); \
> +	SCX_ENUM_SET(skel, scx_deq_flags, SCX_DEQ_SCHED_CHANGE); \
>  } while (0)


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-02-06 20:10   ` Emil Tsalapatis
@ 2026-02-07  9:16     ` Andrea Righi
  2026-02-08  5:11       ` Emil Tsalapatis
  0 siblings, 1 reply; 33+ messages in thread
From: Andrea Righi @ 2026-02-07  9:16 UTC (permalink / raw)
  To: Emil Tsalapatis
  Cc: Tejun Heo, David Vernet, Changwoo Min, Kuba Piecuch,
	Christian Loehle, Daniel Hodges, sched-ext, linux-kernel

Hi Emil,

On Fri, Feb 06, 2026 at 03:10:55PM -0500, Emil Tsalapatis wrote:
> On Fri Feb 6, 2026 at 8:54 AM EST, Andrea Righi wrote:
> 
> Hi Andrea,
> 
> > Add a new kselftest to validate that the new ops.dequeue() semantics
> > work correctly for all task lifecycle scenarios, including the
> > distinction between terminal DSQs (where BPF scheduler is done with the
> > task), user DSQs (where BPF scheduler manages the task lifecycle) and
> > BPF data structures, regardless of which event performs the dispatch.
> >
> > The test validates the following scenarios:
> >
> >  - From ops.select_cpu():
> >      - scenario 0 (local DSQ): tasks dispatched to the local DSQ bypass
> >        the BPF scheduler entirely; they never enter BPF custody, so
> >        ops.dequeue() is not called,
> >      - scenario 1 (global DSQ): tasks dispatched to SCX_DSQ_GLOBAL also
> >        bypass the BPF scheduler, like the local DSQ; ops.dequeue() is
> >        not called,
> >      - scenario 2 (user DSQ): tasks enter BPF scheduler custody with full
> >        enqueue/dequeue lifecycle tracking and state machine validation
> >        (expects 1:1 enqueue/dequeue pairing).
> 
> Could you add a note here about why there's no equivalent to scenario 6?
> The differentiating factor between that and scenario 2 (nonterminal queue) is 
> that scx_dsq_insert_commit() is called regardless of whether the queue is terminal.
> And this makes sense since for non-DSQ queues the BPF scheduler can do its
> own tracking of enqueue/dequeue (plus it does not make too much sense to
> do BPF-internal enqueueing in select_cpu).
> 
> What do you think? If the above makes sense, maybe we should spell it out 
> in the documentation too. Maybe also add it makes no sense to enqueue
> in an internal BPF structure from select_cpu - the task is not yet
> enqueued, and would have to go through enqueue anyway.

Oh, I just didn't think about it, we can definitely add to ops.select_cpu()
a scenario equivalent to scenario 6 (push task to the BPF queue).

From a practical standpoint the benefits are questionable, but in the scope
of the kselftest I think it makes sense to better validate the entire state
machine in all cases. I'll add this scenario as well.

> 
> >
> >    - From ops.enqueue():
> >      - scenario 3 (local DSQ): same behavior as scenario 0,
> >      - scenario 4 (global DSQ): same behavior as scenario 1,
> >      - scenario 5 (user DSQ): same behavior as scenario 2,
> >      - scenario 6 (BPF internal queue): tasks are stored in a BPF queue
> >        in ops.enqueue() and consumed in ops.dispatch(); they remain in
> >        BPF custody until dispatch, with full lifecycle tracking and 1:1
> >        enqueue/dequeue validation.
> >
> > This verifies that:
> >  - terminal DSQ dispatch (local, global) don't trigger ops.dequeue(),
> >  - user DSQ / internal BPF data structure dispatch has exact 1:1
> >    ops.enqueue()/dequeue() pairing,
> >  - dispatch dequeues have no flags (normal workflow),
> >  - property change dequeues have the %SCX_DEQ_SCHED_CHANGE flag set,
> >  - no duplicate enqueues or invalid state transitions are happening,
> >  - ops.enqueue() and ops.select_cpu() dispatch paths behave identically.
> >
> > Cc: Tejun Heo <tj@kernel.org>
> > Cc: Emil Tsalapatis <emil@etsalapatis.com>
> > Cc: Kuba Piecuch <jpiecuch@google.com>
> > Signed-off-by: Andrea Righi <arighi@nvidia.com>
> > ---
> >  tools/testing/selftests/sched_ext/Makefile    |   1 +
> >  .../testing/selftests/sched_ext/dequeue.bpf.c | 403 ++++++++++++++++++
> >  tools/testing/selftests/sched_ext/dequeue.c   | 258 +++++++++++
> >  3 files changed, 662 insertions(+)
> >  create mode 100644 tools/testing/selftests/sched_ext/dequeue.bpf.c
> >  create mode 100644 tools/testing/selftests/sched_ext/dequeue.c
> >
> > diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
> > index 5fe45f9c5f8fd..764e91edabf93 100644
> > --- a/tools/testing/selftests/sched_ext/Makefile
> > +++ b/tools/testing/selftests/sched_ext/Makefile
> > @@ -161,6 +161,7 @@ all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubs
> >  
> >  auto-test-targets :=			\
> >  	create_dsq			\
> > +	dequeue				\
> >  	enq_last_no_enq_fails		\
> >  	ddsp_bogus_dsq_fail		\
> >  	ddsp_vtimelocal_fail		\
> > diff --git a/tools/testing/selftests/sched_ext/dequeue.bpf.c b/tools/testing/selftests/sched_ext/dequeue.bpf.c
> > new file mode 100644
> > index 0000000000000..4ba657ba1bff5
> > --- /dev/null
> > +++ b/tools/testing/selftests/sched_ext/dequeue.bpf.c
> > @@ -0,0 +1,403 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * A scheduler that validates ops.dequeue() is called correctly:
> > + * - Tasks dispatched to terminal DSQs (local, global) bypass the BPF
> > + *   scheduler entirely: no ops.dequeue() should be called
> > + * - Tasks dispatched to user DSQs enter BPF custody: ops.dequeue() must be
> > + *   called when they leave custody
> > + * - Every ops.enqueue() for non-terminal DSQs is followed by exactly one
> > + *   ops.dequeue() (validate 1:1 pairing and state machine)
> > + *
> > + * Copyright (c) 2026 NVIDIA Corporation.
> > + */
> > +
> > +#include <scx/common.bpf.h>
> > +
> > +#define SHARED_DSQ	0
> > +
> > +/*
> > + * Scenario 6: BPF internal queue. Tasks are stored here from ops.enqueue()
> > + * and consumed from ops.dispatch(), validating that tasks not on a user DSQ
> > + * (only on BPF internal structures) still get ops.dequeue() when they leave.
> > + */
> > +struct {
> > +	__uint(type, BPF_MAP_TYPE_QUEUE);
> > +	__uint(max_entries, 4096);
> 
> Nit: Can we make this larger? I don't think there's any downsides. I know
> there's a mitigation for if the queue gets full, please see nit below.

Sure, like 32768?

Or we can keep it like this so we can potentially test also the fallback
path sometimes (mixed BPF queue dispatches + built-in DSQ dispatches).

> 
> > +	__type(value, s32);
> > +} global_queue SEC(".maps");
> > +
> > +char _license[] SEC("license") = "GPL";
> > +
> > +UEI_DEFINE(uei);
> > +
> > +/*
> > + * Counters to track the lifecycle of tasks:
> > + * - enqueue_cnt: Number of times ops.enqueue() was called
> > + * - dequeue_cnt: Number of times ops.dequeue() was called (any type)
> > + * - dispatch_dequeue_cnt: Number of regular dispatch dequeues (no flag)
> > + * - change_dequeue_cnt: Number of property change dequeues
> > + */
> > +u64 enqueue_cnt, dequeue_cnt, dispatch_dequeue_cnt, change_dequeue_cnt;
> > +
> > +/*
> > + * Test scenarios (0-2: ops.select_cpu(), 3-6: ops.enqueue()):
> > + * 0) Dispatch to local DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
> > + *    scheduler, no dequeue callbacks)
> > + * 1) Dispatch to global DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
> > + *    scheduler, no dequeue callbacks)
> > + * 2) Dispatch to shared user DSQ from ops.select_cpu() (enters BPF scheduler,
> > + *    dequeue callbacks expected)
> > + * 3) Dispatch to local DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
> > + *    scheduler, no dequeue callbacks)
> > + * 4) Dispatch to global DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
> > + *    scheduler, no dequeue callbacks)
> > + * 5) Dispatch to shared user DSQ from ops.enqueue() (enters BPF scheduler,
> > + *    dequeue callbacks expected)
> > + * 6) BPF internal queue: store task PIDs in ops.enqueue(), consume in
> > + *    ops.dispatch() and dispatch to local DSQ (validates dequeue for tasks
> > + *    in BPF custody but not on a user DSQ)
> > + */
> > +u32 test_scenario;
> > +
> > +/*
> > + * Per-task state to track lifecycle and validate workflow semantics.
> > + * State transitions:
> > + *   NONE -> ENQUEUED (on enqueue)
> > + *   ENQUEUED -> DISPATCHED (on dispatch dequeue)
> > + *   DISPATCHED -> NONE (on property change dequeue or re-enqueue)
> > + *   ENQUEUED -> NONE (on property change dequeue before dispatch)
> > + */
> > +enum task_state {
> > +	TASK_NONE = 0,
> > +	TASK_ENQUEUED,
> > +	TASK_DISPATCHED,
> > +};
> > +
> > +struct task_ctx {
> > +	enum task_state state; /* Current state in the workflow */
> > +	u64 enqueue_seq;       /* Sequence number for debugging */
> > +};
> > +
> > +struct {
> > +	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
> > +	__uint(map_flags, BPF_F_NO_PREALLOC);
> > +	__type(key, int);
> > +	__type(value, struct task_ctx);
> > +} task_ctx_stor SEC(".maps");
> > +
> > +static struct task_ctx *try_lookup_task_ctx(struct task_struct *p)
> > +{
> > +	return bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
> > +}
> > +
> > +s32 BPF_STRUCT_OPS(dequeue_select_cpu, struct task_struct *p,
> > +		   s32 prev_cpu, u64 wake_flags)
> > +{
> > +	struct task_ctx *tctx;
> > +
> > +	tctx = try_lookup_task_ctx(p);
> > +	if (!tctx)
> > +		return prev_cpu;
> > +
> > +	switch (test_scenario) {
> > +	case 0:
> > +		/*
> > +		 * Scenario 0: Direct dispatch to local DSQ from select_cpu.
> > +		 *
> > +		 * Task bypasses BPF scheduler entirely: no enqueue
> > +		 * tracking, no dequeue callbacks. Behavior should be
> > +		 * identical to scenario 3.
> > +		 */
> > +		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
> > +		return prev_cpu;
> > +
> > +	case 1:
> > +		/*
> > +		 * Scenario 1: Direct dispatch to global DSQ from select_cpu.
> > +		 *
> > +		 * Like scenario 0, task bypasses BPF scheduler entirely.
> > +		 * Behavior should be identical to scenario 4.
> > +		 */
> > +		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
> > +		return prev_cpu;
> > +
> > +	case 2:
> > +		/*
> > +		 * Scenario 2: Dispatch to shared user DSQ from select_cpu.
> > +		 *
> > +		 * Task enters BPF scheduler management: track
> > +		 * enqueue/dequeue lifecycle and validate state transitions.
> > +		 * Behavior should be identical to scenario 5.
> > +		 */
> > +		__sync_fetch_and_add(&enqueue_cnt, 1);
> > +
> > +		/*
> > +		 * Validate state transition: enqueue is only valid from
> > +		 * NONE or DISPATCHED states. Getting enqueue while in
> > +		 * ENQUEUED state indicates a missing dequeue.
> > +		 */
> > +		if (tctx->state == TASK_ENQUEUED)
> > +			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
> > +				      p->pid, p->comm, tctx->enqueue_seq);
> > +
> > +		/* Transition to ENQUEUED state */
> > +		tctx->state = TASK_ENQUEUED;
> > +		tctx->enqueue_seq++;
> > +
> > +		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, 0);
> > +		return prev_cpu;
> > +
> > +	default:
> > +		/*
> > +		 * Force all tasks through ops.enqueue().
> > +		 */
> > +		return prev_cpu;
> > +	}
> > +}
> > +
> > +void BPF_STRUCT_OPS(dequeue_enqueue, struct task_struct *p, u64 enq_flags)
> > +{
> > +	struct task_ctx *tctx;
> > +
> > +	tctx = try_lookup_task_ctx(p);
> > +	if (!tctx)
> > +		return;
> > +
> > +	switch (test_scenario) {
> > +	case 3:
> > +		/*
> > +		 * Scenario 3: Direct dispatch to the local DSQ.
> > +		 *
> > +		 * Task bypasses BPF scheduler entirely: no enqueue
> > +		 * tracking, no dequeue callbacks. Don't increment counters
> > +		 * or validate state since the task never enters BPF
> > +		 * scheduler management.
> > +		 */
> > +		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
> > +		break;
> > +
> > +	case 4:
> > +		/*
> > +		 * Scenario 4: Direct dispatch to the global DSQ.
> > +		 *
> > +		 * Like scenario 3, task bypasses BPF scheduler entirely.
> > +		 * SCX_DSQ_GLOBAL is a terminal DSQ, tasks dispatched to it
> > +		 * leave BPF custody immediately, so no dequeue callbacks
> > +		 * should be triggered.
> > +		 */
> > +		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
> > +		break;
> > +
> > +	case 5:
> > +		/*
> > +		 * Scenario 5: Dispatch to shared user DSQ.
> > +		 *
> > +		 * Task enters BPF scheduler management: track
> > +		 * enqueue/dequeue lifecycle and validate state
> > +		 * transitions.
> > +		 */
> > +		__sync_fetch_and_add(&enqueue_cnt, 1);
> > +
> > +		/*
> > +		 * Validate state transition: enqueue is only valid from
> > +		 * NONE or DISPATCHED states. Getting enqueue while in
> > +		 * ENQUEUED state indicates a missing dequeue (or stale state
> > +		 * from a previous scenario when the scheduler was unregistered
> > +		 * with tasks still on a DSQ). Reset and proceed to avoid false
> > +		 * positives across scenario switches.
> > +		 */
> > +		if (tctx->state == TASK_ENQUEUED)
> > +			tctx->state = TASK_NONE;
> > +
> > +		/* Transition to ENQUEUED state */
> > +		tctx->state = TASK_ENQUEUED;
> > +		tctx->enqueue_seq++;
> > +
> > +		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
> > +		break;
> > +
> > +	case 6:
> > +		/*
> > +		 * Scenario 6: Store task in BPF internal queue. Task enters
> > +		 * BPF custody (kernel sets SCX_TASK_NEED_DEQ). When
> > +		 * ops.dispatch() later pops and inserts to local DSQ,
> > +		 * ops.dequeue() must be called.
> > +		 *
> > +		 * If the queue is full, fallback to local DSQ. The task still
> > +		 * goes through QUEUED in the kernel and gets ops.dequeue()
> > +		 * when moved to the terminal DSQ, so we track it the same.
> > +		 *
> > +		 * If state is already ENQUEUED (e.g. task was on a DSQ when
> > +		 * the scheduler was unregistered in a previous scenario),
> > +		 * reset to NONE and proceed to avoid false positives.
> > +		 */
> > +		{
> > +			s32 pid = p->pid;
> > +
> > +			if (tctx->state == TASK_ENQUEUED)
> > +				tctx->state = TASK_NONE;
> > +
> > +			tctx->state = TASK_ENQUEUED;
> > +			tctx->enqueue_seq++;
> > +
> > +			/* Queue full: fallback to the global DSQ */
> Nit: Can we remove this fallback? This silently changes the behavior of
> the test, and even though it makes sense to avoid overflowing the queue,
> it causes the test to succeed even if for some reason the
> bpf_map_push_elem fails. Why not just bump the queue number to a
> reasonably large number amount instead?

Hm... but if for any reason we overflow the queue we'd get a false positive
error: task is ignored, we trigger a stall and it looks like something is
wrong in ops.dequeue(). WDYT?

Thanks,
-Andrea

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] sched_ext: Fix ops.dequeue() semantics
  2026-02-06 20:35   ` Emil Tsalapatis
@ 2026-02-07  9:26     ` Andrea Righi
  2026-02-09 17:28       ` Tejun Heo
  0 siblings, 1 reply; 33+ messages in thread
From: Andrea Righi @ 2026-02-07  9:26 UTC (permalink / raw)
  To: Emil Tsalapatis
  Cc: Tejun Heo, David Vernet, Changwoo Min, Kuba Piecuch,
	Christian Loehle, Daniel Hodges, sched-ext, linux-kernel

Hi Emil,

On Fri, Feb 06, 2026 at 03:35:34PM -0500, Emil Tsalapatis wrote:
> On Fri Feb 6, 2026 at 8:54 AM EST, Andrea Righi wrote:
...
> > diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
> > index bcb962d5ee7d8..c48f818eee9b8 100644
> > --- a/include/linux/sched/ext.h
> > +++ b/include/linux/sched/ext.h
> > @@ -84,6 +84,7 @@ struct scx_dispatch_q {
> >  /* scx_entity.flags */
> >  enum scx_ent_flags {
> >  	SCX_TASK_QUEUED		= 1 << 0, /* on ext runqueue */
> > +	SCX_TASK_NEED_DEQ	= 1 << 1, /* in BPF custody, needs ops.dequeue() when leaving */
> 
> Can we make this "SCX_TASK_IN_BPF"? Since we've now defined what it means to be
> in BPF custody vs the core scx scheduler (terminal DSQs) this is a more
> general property that can be useful to check in the future. An example:
> We can now assert that a task's BPF state is consistent with its actual 
> kernel state when using BPF-based data structures to manage tasks.

Ack. I like SCX_TASK_IN_BPF and I also like the idea of resuing the flag
for other purposes. It can be helpful for debugging as well.

> 
> >  	SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
> >  	SCX_TASK_DEQD_FOR_SLEEP	= 1 << 3, /* last dequeue was for SLEEP */
> >  
> > diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> > index 0bb8fa927e9e9..d17fd9141adf4 100644
> > --- a/kernel/sched/ext.c
> > +++ b/kernel/sched/ext.c
> > @@ -925,6 +925,27 @@ static void touch_core_sched(struct rq *rq, struct task_struct *p)
> >  #endif
> >  }
> >  
> > +/**
> > + * is_terminal_dsq - Check if a DSQ is terminal for ops.dequeue() purposes
> > + * @dsq_id: DSQ ID to check
> > + *
> > + * Returns true if @dsq_id is a terminal/builtin DSQ where the BPF
> > + * scheduler is considered "done" with the task.
> > + *
> > + * Builtin DSQs include:
> > + *  - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON): per-CPU queues
> > + *    where tasks go directly to execution,
> > + *  - Global DSQ (%SCX_DSQ_GLOBAL): built-in fallback queue,
> > + *  - Bypass DSQ: used during bypass mode.
> > + *
> > + * Tasks dispatched to builtin DSQs exit BPF scheduler custody and do not
> > + * trigger ops.dequeue() when they are later consumed.
> > + */
> > +static inline bool is_terminal_dsq(u64 dsq_id)
> > +{
> > +	return dsq_id & SCX_DSQ_FLAG_BUILTIN;
> > +}
> > +
> >  /**
> >   * touch_core_sched_dispatch - Update core-sched timestamp on dispatch
> >   * @rq: rq to read clock from, must be locked
> > @@ -1008,7 +1029,8 @@ static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p
> >  		resched_curr(rq);
> >  }
> >  
> > -static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
> > +static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq,
> > +			     struct scx_dispatch_q *dsq,
> >  			     struct task_struct *p, u64 enq_flags)
> >  {
> >  	bool is_local = dsq->id == SCX_DSQ_LOCAL;
> > @@ -1103,6 +1125,27 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
> >  	dsq_mod_nr(dsq, 1);
> >  	p->scx.dsq = dsq;
> >  
> > +	/*
> > +	 * Handle ops.dequeue() and custody tracking.
> > +	 *
> > +	 * Builtin DSQs (local, global, bypass) are terminal: the BPF
> > +	 * scheduler is done with the task. If it was in BPF custody, call
> > +	 * ops.dequeue() and clear the flag.
> > +	 *
> > +	 * User DSQs: Task is in BPF scheduler's custody. Set the flag so
> > +	 * ops.dequeue() will be called when it leaves.
> > +	 */
> > +	if (SCX_HAS_OP(sch, dequeue)) {
> > +		if (is_terminal_dsq(dsq->id)) {
> > +			if (p->scx.flags & SCX_TASK_NEED_DEQ)
> > +				SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue,
> > +						 rq, p, 0);
> > +			p->scx.flags &= ~SCX_TASK_NEED_DEQ;
> > +		} else {
> > +			p->scx.flags |= SCX_TASK_NEED_DEQ;
> > +		}
> > +	}
> > +
> >  	/*
> >  	 * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the
> >  	 * direct dispatch path, but we clear them here because the direct
> > @@ -1323,7 +1366,7 @@ static void direct_dispatch(struct scx_sched *sch, struct task_struct *p,
> >  		return;
> >  	}
> >  
> > -	dispatch_enqueue(sch, dsq, p,
> > +	dispatch_enqueue(sch, rq, dsq, p,
> >  			 p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
> >  }
> >  
> > @@ -1407,13 +1450,22 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
> >  	 * dequeue may be waiting. The store_release matches their load_acquire.
> >  	 */
> >  	atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq);
> > +
> > +	/*
> > +	 * Task is now in BPF scheduler's custody (queued on BPF internal
> > +	 * structures). Set %SCX_TASK_NEED_DEQ so ops.dequeue() is called
> > +	 * when it leaves custody (e.g. dispatched to a terminal DSQ or on
> > +	 * property change).
> > +	 */
> > +	if (SCX_HAS_OP(sch, dequeue))
> 
> Related to the rename: Can we remove the guards and track the flag
> regardless of whether ops.dequeue() is present?
> 
> There is no reason not to track whether a task is in BPF or the core, 
> and it is a property that's independent of whether we implement ops.dequeue(). 
> This also simplifies the code since we now just guard the actual ops.dequeue()
> call.

I was concerned about introducing overhead, with the guard we can save a
few memory writes to p->scx.flags. But I don't have numbers and probably
the overhead is negligible.

Also, if we have a working ops.dequeue(), I guess more schedulers will
start implementing an ops.dequeue() callback, so the guard itself may
actually become the extra overhead.

So, I guess we can remove the guard and just set/clear the flag even
without an ops.dequeue() callback...

Thanks,
-Andrea

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-02-07  9:16     ` Andrea Righi
@ 2026-02-08  5:11       ` Emil Tsalapatis
  2026-02-08  9:02         ` Andrea Righi
  0 siblings, 1 reply; 33+ messages in thread
From: Emil Tsalapatis @ 2026-02-08  5:11 UTC (permalink / raw)
  To: Andrea Righi
  Cc: Tejun Heo, David Vernet, Changwoo Min, Kuba Piecuch,
	Christian Loehle, Daniel Hodges, sched-ext, linux-kernel

On Sat Feb 7, 2026 at 4:16 AM EST, Andrea Righi wrote:
> Hi Emil,
>

Hi Andrea,

> On Fri, Feb 06, 2026 at 03:10:55PM -0500, Emil Tsalapatis wrote:
>> On Fri Feb 6, 2026 at 8:54 AM EST, Andrea Righi wrote:
>> 
>> Hi Andrea,
>> 
>> > Add a new kselftest to validate that the new ops.dequeue() semantics
>> > work correctly for all task lifecycle scenarios, including the
>> > distinction between terminal DSQs (where BPF scheduler is done with the
>> > task), user DSQs (where BPF scheduler manages the task lifecycle) and
>> > BPF data structures, regardless of which event performs the dispatch.
>> >
>> > The test validates the following scenarios:
>> >
>> >  - From ops.select_cpu():
>> >      - scenario 0 (local DSQ): tasks dispatched to the local DSQ bypass
>> >        the BPF scheduler entirely; they never enter BPF custody, so
>> >        ops.dequeue() is not called,
>> >      - scenario 1 (global DSQ): tasks dispatched to SCX_DSQ_GLOBAL also
>> >        bypass the BPF scheduler, like the local DSQ; ops.dequeue() is
>> >        not called,
>> >      - scenario 2 (user DSQ): tasks enter BPF scheduler custody with full
>> >        enqueue/dequeue lifecycle tracking and state machine validation
>> >        (expects 1:1 enqueue/dequeue pairing).
>> 
>> Could you add a note here about why there's no equivalent to scenario 6?
>> The differentiating factor between that and scenario 2 (nonterminal queue) is 
>> that scx_dsq_insert_commit() is called regardless of whether the queue is terminal.
>> And this makes sense since for non-DSQ queues the BPF scheduler can do its
>> own tracking of enqueue/dequeue (plus it does not make too much sense to
>> do BPF-internal enqueueing in select_cpu).
>> 
>> What do you think? If the above makes sense, maybe we should spell it out 
>> in the documentation too. Maybe also add it makes no sense to enqueue
>> in an internal BPF structure from select_cpu - the task is not yet
>> enqueued, and would have to go through enqueue anyway.
>
> Oh, I just didn't think about it, we can definitely add to ops.select_cpu()
> a scenario equivalent to scenario 6 (push task to the BPF queue).
>
> From a practical standpoint the benefits are questionable, but in the scope
> of the kselftest I think it makes sense to better validate the entire state
> machine in all cases. I'll add this scenario as well.
>

That makes sense! Let's add it for completeness. Even if it doesn't make
sense right now that may change in the future. For example, if we end
up finding a good reason to add the task into an internal structure from
.select_cpu(), we may allow the task to be explicitly marked as being in
the BPF scheduler's custody from a kfunc. Right now we can't do that
from select_cpu() unless we direct dispatch IIUC.

>> 
>> >
>> >    - From ops.enqueue():
>> >      - scenario 3 (local DSQ): same behavior as scenario 0,
>> >      - scenario 4 (global DSQ): same behavior as scenario 1,
>> >      - scenario 5 (user DSQ): same behavior as scenario 2,
>> >      - scenario 6 (BPF internal queue): tasks are stored in a BPF queue
>> >        in ops.enqueue() and consumed in ops.dispatch(); they remain in
>> >        BPF custody until dispatch, with full lifecycle tracking and 1:1
>> >        enqueue/dequeue validation.
>> >
>> > This verifies that:
>> >  - terminal DSQ dispatch (local, global) don't trigger ops.dequeue(),
>> >  - user DSQ / internal BPF data structure dispatch has exact 1:1
>> >    ops.enqueue()/dequeue() pairing,
>> >  - dispatch dequeues have no flags (normal workflow),
>> >  - property change dequeues have the %SCX_DEQ_SCHED_CHANGE flag set,
>> >  - no duplicate enqueues or invalid state transitions are happening,
>> >  - ops.enqueue() and ops.select_cpu() dispatch paths behave identically.
>> >
>> > Cc: Tejun Heo <tj@kernel.org>
>> > Cc: Emil Tsalapatis <emil@etsalapatis.com>
>> > Cc: Kuba Piecuch <jpiecuch@google.com>
>> > Signed-off-by: Andrea Righi <arighi@nvidia.com>
>> > ---
>> >  tools/testing/selftests/sched_ext/Makefile    |   1 +
>> >  .../testing/selftests/sched_ext/dequeue.bpf.c | 403 ++++++++++++++++++
>> >  tools/testing/selftests/sched_ext/dequeue.c   | 258 +++++++++++
>> >  3 files changed, 662 insertions(+)
>> >  create mode 100644 tools/testing/selftests/sched_ext/dequeue.bpf.c
>> >  create mode 100644 tools/testing/selftests/sched_ext/dequeue.c
>> >
>> > diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
>> > index 5fe45f9c5f8fd..764e91edabf93 100644
>> > --- a/tools/testing/selftests/sched_ext/Makefile
>> > +++ b/tools/testing/selftests/sched_ext/Makefile
>> > @@ -161,6 +161,7 @@ all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubs
>> >  
>> >  auto-test-targets :=			\
>> >  	create_dsq			\
>> > +	dequeue				\
>> >  	enq_last_no_enq_fails		\
>> >  	ddsp_bogus_dsq_fail		\
>> >  	ddsp_vtimelocal_fail		\
>> > diff --git a/tools/testing/selftests/sched_ext/dequeue.bpf.c b/tools/testing/selftests/sched_ext/dequeue.bpf.c
>> > new file mode 100644
>> > index 0000000000000..4ba657ba1bff5
>> > --- /dev/null
>> > +++ b/tools/testing/selftests/sched_ext/dequeue.bpf.c
>> > @@ -0,0 +1,403 @@
>> > +// SPDX-License-Identifier: GPL-2.0
>> > +/*
>> > + * A scheduler that validates ops.dequeue() is called correctly:
>> > + * - Tasks dispatched to terminal DSQs (local, global) bypass the BPF
>> > + *   scheduler entirely: no ops.dequeue() should be called
>> > + * - Tasks dispatched to user DSQs enter BPF custody: ops.dequeue() must be
>> > + *   called when they leave custody
>> > + * - Every ops.enqueue() for non-terminal DSQs is followed by exactly one
>> > + *   ops.dequeue() (validate 1:1 pairing and state machine)
>> > + *
>> > + * Copyright (c) 2026 NVIDIA Corporation.
>> > + */
>> > +
>> > +#include <scx/common.bpf.h>
>> > +
>> > +#define SHARED_DSQ	0
>> > +
>> > +/*
>> > + * Scenario 6: BPF internal queue. Tasks are stored here from ops.enqueue()
>> > + * and consumed from ops.dispatch(), validating that tasks not on a user DSQ
>> > + * (only on BPF internal structures) still get ops.dequeue() when they leave.
>> > + */
>> > +struct {
>> > +	__uint(type, BPF_MAP_TYPE_QUEUE);
>> > +	__uint(max_entries, 4096);
>> 
>> Nit: Can we make this larger? I don't think there's any downsides. I know
>> there's a mitigation for if the queue gets full, please see nit below.
>
> Sure, like 32768?
>
> Or we can keep it like this so we can potentially test also the fallback
> path sometimes (mixed BPF queue dispatches + built-in DSQ dispatches).
>

32K makes sense. If we keep the fallback, maybe we can just add a
WARN_ON_ONCE() equivalent that it is being triggered so that we make
sure we don't trigger it every single time (e.g. because the BPF queue
is misbehaving)?

>> 
>> > +	__type(value, s32);
>> > +} global_queue SEC(".maps");
>> > +
>> > +char _license[] SEC("license") = "GPL";
>> > +
>> > +UEI_DEFINE(uei);
>> > +
>> > +/*
>> > + * Counters to track the lifecycle of tasks:
>> > + * - enqueue_cnt: Number of times ops.enqueue() was called
>> > + * - dequeue_cnt: Number of times ops.dequeue() was called (any type)
>> > + * - dispatch_dequeue_cnt: Number of regular dispatch dequeues (no flag)
>> > + * - change_dequeue_cnt: Number of property change dequeues
>> > + */
>> > +u64 enqueue_cnt, dequeue_cnt, dispatch_dequeue_cnt, change_dequeue_cnt;
>> > +
>> > +/*
>> > + * Test scenarios (0-2: ops.select_cpu(), 3-6: ops.enqueue()):
>> > + * 0) Dispatch to local DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
>> > + *    scheduler, no dequeue callbacks)
>> > + * 1) Dispatch to global DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
>> > + *    scheduler, no dequeue callbacks)
>> > + * 2) Dispatch to shared user DSQ from ops.select_cpu() (enters BPF scheduler,
>> > + *    dequeue callbacks expected)
>> > + * 3) Dispatch to local DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
>> > + *    scheduler, no dequeue callbacks)
>> > + * 4) Dispatch to global DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
>> > + *    scheduler, no dequeue callbacks)
>> > + * 5) Dispatch to shared user DSQ from ops.enqueue() (enters BPF scheduler,
>> > + *    dequeue callbacks expected)
>> > + * 6) BPF internal queue: store task PIDs in ops.enqueue(), consume in
>> > + *    ops.dispatch() and dispatch to local DSQ (validates dequeue for tasks
>> > + *    in BPF custody but not on a user DSQ)
>> > + */
>> > +u32 test_scenario;
>> > +
>> > +/*
>> > + * Per-task state to track lifecycle and validate workflow semantics.
>> > + * State transitions:
>> > + *   NONE -> ENQUEUED (on enqueue)
>> > + *   ENQUEUED -> DISPATCHED (on dispatch dequeue)
>> > + *   DISPATCHED -> NONE (on property change dequeue or re-enqueue)
>> > + *   ENQUEUED -> NONE (on property change dequeue before dispatch)
>> > + */
>> > +enum task_state {
>> > +	TASK_NONE = 0,
>> > +	TASK_ENQUEUED,
>> > +	TASK_DISPATCHED,
>> > +};
>> > +
>> > +struct task_ctx {
>> > +	enum task_state state; /* Current state in the workflow */
>> > +	u64 enqueue_seq;       /* Sequence number for debugging */
>> > +};
>> > +
>> > +struct {
>> > +	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
>> > +	__uint(map_flags, BPF_F_NO_PREALLOC);
>> > +	__type(key, int);
>> > +	__type(value, struct task_ctx);
>> > +} task_ctx_stor SEC(".maps");
>> > +
>> > +static struct task_ctx *try_lookup_task_ctx(struct task_struct *p)
>> > +{
>> > +	return bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
>> > +}
>> > +
>> > +s32 BPF_STRUCT_OPS(dequeue_select_cpu, struct task_struct *p,
>> > +		   s32 prev_cpu, u64 wake_flags)
>> > +{
>> > +	struct task_ctx *tctx;
>> > +
>> > +	tctx = try_lookup_task_ctx(p);
>> > +	if (!tctx)
>> > +		return prev_cpu;
>> > +
>> > +	switch (test_scenario) {
>> > +	case 0:
>> > +		/*
>> > +		 * Scenario 0: Direct dispatch to local DSQ from select_cpu.
>> > +		 *
>> > +		 * Task bypasses BPF scheduler entirely: no enqueue
>> > +		 * tracking, no dequeue callbacks. Behavior should be
>> > +		 * identical to scenario 3.
>> > +		 */
>> > +		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
>> > +		return prev_cpu;
>> > +
>> > +	case 1:
>> > +		/*
>> > +		 * Scenario 1: Direct dispatch to global DSQ from select_cpu.
>> > +		 *
>> > +		 * Like scenario 0, task bypasses BPF scheduler entirely.
>> > +		 * Behavior should be identical to scenario 4.
>> > +		 */
>> > +		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
>> > +		return prev_cpu;
>> > +
>> > +	case 2:
>> > +		/*
>> > +		 * Scenario 2: Dispatch to shared user DSQ from select_cpu.
>> > +		 *
>> > +		 * Task enters BPF scheduler management: track
>> > +		 * enqueue/dequeue lifecycle and validate state transitions.
>> > +		 * Behavior should be identical to scenario 5.
>> > +		 */
>> > +		__sync_fetch_and_add(&enqueue_cnt, 1);
>> > +
>> > +		/*
>> > +		 * Validate state transition: enqueue is only valid from
>> > +		 * NONE or DISPATCHED states. Getting enqueue while in
>> > +		 * ENQUEUED state indicates a missing dequeue.
>> > +		 */
>> > +		if (tctx->state == TASK_ENQUEUED)
>> > +			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
>> > +				      p->pid, p->comm, tctx->enqueue_seq);
>> > +
>> > +		/* Transition to ENQUEUED state */
>> > +		tctx->state = TASK_ENQUEUED;
>> > +		tctx->enqueue_seq++;
>> > +
>> > +		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, 0);
>> > +		return prev_cpu;
>> > +
>> > +	default:
>> > +		/*
>> > +		 * Force all tasks through ops.enqueue().
>> > +		 */
>> > +		return prev_cpu;
>> > +	}
>> > +}
>> > +
>> > +void BPF_STRUCT_OPS(dequeue_enqueue, struct task_struct *p, u64 enq_flags)
>> > +{
>> > +	struct task_ctx *tctx;
>> > +
>> > +	tctx = try_lookup_task_ctx(p);
>> > +	if (!tctx)
>> > +		return;
>> > +
>> > +	switch (test_scenario) {
>> > +	case 3:
>> > +		/*
>> > +		 * Scenario 3: Direct dispatch to the local DSQ.
>> > +		 *
>> > +		 * Task bypasses BPF scheduler entirely: no enqueue
>> > +		 * tracking, no dequeue callbacks. Don't increment counters
>> > +		 * or validate state since the task never enters BPF
>> > +		 * scheduler management.
>> > +		 */
>> > +		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
>> > +		break;
>> > +
>> > +	case 4:
>> > +		/*
>> > +		 * Scenario 4: Direct dispatch to the global DSQ.
>> > +		 *
>> > +		 * Like scenario 3, task bypasses BPF scheduler entirely.
>> > +		 * SCX_DSQ_GLOBAL is a terminal DSQ, tasks dispatched to it
>> > +		 * leave BPF custody immediately, so no dequeue callbacks
>> > +		 * should be triggered.
>> > +		 */
>> > +		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
>> > +		break;
>> > +
>> > +	case 5:
>> > +		/*
>> > +		 * Scenario 5: Dispatch to shared user DSQ.
>> > +		 *
>> > +		 * Task enters BPF scheduler management: track
>> > +		 * enqueue/dequeue lifecycle and validate state
>> > +		 * transitions.
>> > +		 */
>> > +		__sync_fetch_and_add(&enqueue_cnt, 1);
>> > +
>> > +		/*
>> > +		 * Validate state transition: enqueue is only valid from
>> > +		 * NONE or DISPATCHED states. Getting enqueue while in
>> > +		 * ENQUEUED state indicates a missing dequeue (or stale state
>> > +		 * from a previous scenario when the scheduler was unregistered
>> > +		 * with tasks still on a DSQ). Reset and proceed to avoid false
>> > +		 * positives across scenario switches.
>> > +		 */
>> > +		if (tctx->state == TASK_ENQUEUED)
>> > +			tctx->state = TASK_NONE;
>> > +
>> > +		/* Transition to ENQUEUED state */
>> > +		tctx->state = TASK_ENQUEUED;
>> > +		tctx->enqueue_seq++;
>> > +
>> > +		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
>> > +		break;
>> > +
>> > +	case 6:
>> > +		/*
>> > +		 * Scenario 6: Store task in BPF internal queue. Task enters
>> > +		 * BPF custody (kernel sets SCX_TASK_NEED_DEQ). When
>> > +		 * ops.dispatch() later pops and inserts to local DSQ,
>> > +		 * ops.dequeue() must be called.
>> > +		 *
>> > +		 * If the queue is full, fallback to local DSQ. The task still
>> > +		 * goes through QUEUED in the kernel and gets ops.dequeue()
>> > +		 * when moved to the terminal DSQ, so we track it the same.
>> > +		 *
>> > +		 * If state is already ENQUEUED (e.g. task was on a DSQ when
>> > +		 * the scheduler was unregistered in a previous scenario),
>> > +		 * reset to NONE and proceed to avoid false positives.
>> > +		 */
>> > +		{
>> > +			s32 pid = p->pid;
>> > +
>> > +			if (tctx->state == TASK_ENQUEUED)
>> > +				tctx->state = TASK_NONE;
>> > +
>> > +			tctx->state = TASK_ENQUEUED;
>> > +			tctx->enqueue_seq++;
>> > +
>> > +			/* Queue full: fallback to the global DSQ */
>> Nit: Can we remove this fallback? This silently changes the behavior of
>> the test, and even though it makes sense to avoid overflowing the queue,
>> it causes the test to succeed even if for some reason the
>> bpf_map_push_elem fails. Why not just bump the queue number to a
>> reasonably large number amount instead?
>
> Hm... but if for any reason we overflow the queue we'd get a false positive
> error: task is ignored, we trigger a stall and it looks like something is
> wrong in ops.dequeue(). WDYT?
>

I agree, but if we bump the queue size to a large number the probability
of that is nonexistent: I think these test make sense to run in CI-like
environments where there's few processes anyway, so if a queue is large
enough there will not be enough tasks to overflow it anyway. This is an
assumption we make for dsp_local_on, too. Maybe we can keep the fallback
but warn when it's used (see above)?

> Thanks,
> -Andrea


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-02-08  5:11       ` Emil Tsalapatis
@ 2026-02-08  9:02         ` Andrea Righi
  2026-02-08 10:26           ` Andrea Righi
  0 siblings, 1 reply; 33+ messages in thread
From: Andrea Righi @ 2026-02-08  9:02 UTC (permalink / raw)
  To: Emil Tsalapatis
  Cc: Tejun Heo, David Vernet, Changwoo Min, Kuba Piecuch,
	Christian Loehle, Daniel Hodges, sched-ext, linux-kernel

On Sun, Feb 08, 2026 at 12:11:11AM -0500, Emil Tsalapatis wrote:
> On Sat Feb 7, 2026 at 4:16 AM EST, Andrea Righi wrote:
> > Hi Emil,
> >
> 
> Hi Andrea,
> 
> > On Fri, Feb 06, 2026 at 03:10:55PM -0500, Emil Tsalapatis wrote:
> >> On Fri Feb 6, 2026 at 8:54 AM EST, Andrea Righi wrote:
> >> 
> >> Hi Andrea,
> >> 
> >> > Add a new kselftest to validate that the new ops.dequeue() semantics
> >> > work correctly for all task lifecycle scenarios, including the
> >> > distinction between terminal DSQs (where BPF scheduler is done with the
> >> > task), user DSQs (where BPF scheduler manages the task lifecycle) and
> >> > BPF data structures, regardless of which event performs the dispatch.
> >> >
> >> > The test validates the following scenarios:
> >> >
> >> >  - From ops.select_cpu():
> >> >      - scenario 0 (local DSQ): tasks dispatched to the local DSQ bypass
> >> >        the BPF scheduler entirely; they never enter BPF custody, so
> >> >        ops.dequeue() is not called,
> >> >      - scenario 1 (global DSQ): tasks dispatched to SCX_DSQ_GLOBAL also
> >> >        bypass the BPF scheduler, like the local DSQ; ops.dequeue() is
> >> >        not called,
> >> >      - scenario 2 (user DSQ): tasks enter BPF scheduler custody with full
> >> >        enqueue/dequeue lifecycle tracking and state machine validation
> >> >        (expects 1:1 enqueue/dequeue pairing).
> >> 
> >> Could you add a note here about why there's no equivalent to scenario 6?
> >> The differentiating factor between that and scenario 2 (nonterminal queue) is 
> >> that scx_dsq_insert_commit() is called regardless of whether the queue is terminal.
> >> And this makes sense since for non-DSQ queues the BPF scheduler can do its
> >> own tracking of enqueue/dequeue (plus it does not make too much sense to
> >> do BPF-internal enqueueing in select_cpu).
> >> 
> >> What do you think? If the above makes sense, maybe we should spell it out 
> >> in the documentation too. Maybe also add it makes no sense to enqueue
> >> in an internal BPF structure from select_cpu - the task is not yet
> >> enqueued, and would have to go through enqueue anyway.
> >
> > Oh, I just didn't think about it, we can definitely add to ops.select_cpu()
> > a scenario equivalent to scenario 6 (push task to the BPF queue).
> >
> > From a practical standpoint the benefits are questionable, but in the scope
> > of the kselftest I think it makes sense to better validate the entire state
> > machine in all cases. I'll add this scenario as well.
> >
> 
> That makes sense! Let's add it for completeness. Even if it doesn't make
> sense right now that may change in the future. For example, if we end
> up finding a good reason to add the task into an internal structure from
> .select_cpu(), we may allow the task to be explicitly marked as being in
> the BPF scheduler's custody from a kfunc. Right now we can't do that
> from select_cpu() unless we direct dispatch IIUC.

Ok, I'll send a new patch later with the new scenario included. It should
work already (if done properly in the test case), I think we don't need to
change anything in the kernel.

We may need
https://lore.kernel.org/all/20260203230639.1259869-1-arighi@nvidia.com to
avoid hitting potential affinity issues, but that's a separate thing.

I was also considerinng to include this in the series (which I have a v2),
but it's not strictly related to ops.dequeue() and it actually depends on
this ops.dequeue() change, so I think it makes sense to fix this later as a
separate patch.

> 
> >> 
> >> >
> >> >    - From ops.enqueue():
> >> >      - scenario 3 (local DSQ): same behavior as scenario 0,
> >> >      - scenario 4 (global DSQ): same behavior as scenario 1,
> >> >      - scenario 5 (user DSQ): same behavior as scenario 2,
> >> >      - scenario 6 (BPF internal queue): tasks are stored in a BPF queue
> >> >        in ops.enqueue() and consumed in ops.dispatch(); they remain in
> >> >        BPF custody until dispatch, with full lifecycle tracking and 1:1
> >> >        enqueue/dequeue validation.
> >> >
> >> > This verifies that:
> >> >  - terminal DSQ dispatch (local, global) don't trigger ops.dequeue(),
> >> >  - user DSQ / internal BPF data structure dispatch has exact 1:1
> >> >    ops.enqueue()/dequeue() pairing,
> >> >  - dispatch dequeues have no flags (normal workflow),
> >> >  - property change dequeues have the %SCX_DEQ_SCHED_CHANGE flag set,
> >> >  - no duplicate enqueues or invalid state transitions are happening,
> >> >  - ops.enqueue() and ops.select_cpu() dispatch paths behave identically.
> >> >
> >> > Cc: Tejun Heo <tj@kernel.org>
> >> > Cc: Emil Tsalapatis <emil@etsalapatis.com>
> >> > Cc: Kuba Piecuch <jpiecuch@google.com>
> >> > Signed-off-by: Andrea Righi <arighi@nvidia.com>
> >> > ---
> >> >  tools/testing/selftests/sched_ext/Makefile    |   1 +
> >> >  .../testing/selftests/sched_ext/dequeue.bpf.c | 403 ++++++++++++++++++
> >> >  tools/testing/selftests/sched_ext/dequeue.c   | 258 +++++++++++
> >> >  3 files changed, 662 insertions(+)
> >> >  create mode 100644 tools/testing/selftests/sched_ext/dequeue.bpf.c
> >> >  create mode 100644 tools/testing/selftests/sched_ext/dequeue.c
> >> >
> >> > diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
> >> > index 5fe45f9c5f8fd..764e91edabf93 100644
> >> > --- a/tools/testing/selftests/sched_ext/Makefile
> >> > +++ b/tools/testing/selftests/sched_ext/Makefile
> >> > @@ -161,6 +161,7 @@ all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubs
> >> >  
> >> >  auto-test-targets :=			\
> >> >  	create_dsq			\
> >> > +	dequeue				\
> >> >  	enq_last_no_enq_fails		\
> >> >  	ddsp_bogus_dsq_fail		\
> >> >  	ddsp_vtimelocal_fail		\
> >> > diff --git a/tools/testing/selftests/sched_ext/dequeue.bpf.c b/tools/testing/selftests/sched_ext/dequeue.bpf.c
> >> > new file mode 100644
> >> > index 0000000000000..4ba657ba1bff5
> >> > --- /dev/null
> >> > +++ b/tools/testing/selftests/sched_ext/dequeue.bpf.c
> >> > @@ -0,0 +1,403 @@
> >> > +// SPDX-License-Identifier: GPL-2.0
> >> > +/*
> >> > + * A scheduler that validates ops.dequeue() is called correctly:
> >> > + * - Tasks dispatched to terminal DSQs (local, global) bypass the BPF
> >> > + *   scheduler entirely: no ops.dequeue() should be called
> >> > + * - Tasks dispatched to user DSQs enter BPF custody: ops.dequeue() must be
> >> > + *   called when they leave custody
> >> > + * - Every ops.enqueue() for non-terminal DSQs is followed by exactly one
> >> > + *   ops.dequeue() (validate 1:1 pairing and state machine)
> >> > + *
> >> > + * Copyright (c) 2026 NVIDIA Corporation.
> >> > + */
> >> > +
> >> > +#include <scx/common.bpf.h>
> >> > +
> >> > +#define SHARED_DSQ	0
> >> > +
> >> > +/*
> >> > + * Scenario 6: BPF internal queue. Tasks are stored here from ops.enqueue()
> >> > + * and consumed from ops.dispatch(), validating that tasks not on a user DSQ
> >> > + * (only on BPF internal structures) still get ops.dequeue() when they leave.
> >> > + */
> >> > +struct {
> >> > +	__uint(type, BPF_MAP_TYPE_QUEUE);
> >> > +	__uint(max_entries, 4096);
> >> 
> >> Nit: Can we make this larger? I don't think there's any downsides. I know
> >> there's a mitigation for if the queue gets full, please see nit below.
> >
> > Sure, like 32768?
> >
> > Or we can keep it like this so we can potentially test also the fallback
> > path sometimes (mixed BPF queue dispatches + built-in DSQ dispatches).
> >
> 
> 32K makes sense. If we keep the fallback, maybe we can just add a
> WARN_ON_ONCE() equivalent that it is being triggered so that we make
> sure we don't trigger it every single time (e.g. because the BPF queue
> is misbehaving)?

Maybe we can add a bpf_queue_full_cnt counter, similarly to the other task
lifecycle counters and print it to stdout as the result of the test.

In this way, the fallback won't be considered a critical failure and we
have this metric included in the output if we need this information.

> 
> >> 
> >> > +	__type(value, s32);
> >> > +} global_queue SEC(".maps");
> >> > +
> >> > +char _license[] SEC("license") = "GPL";
> >> > +
> >> > +UEI_DEFINE(uei);
> >> > +
> >> > +/*
> >> > + * Counters to track the lifecycle of tasks:
> >> > + * - enqueue_cnt: Number of times ops.enqueue() was called
> >> > + * - dequeue_cnt: Number of times ops.dequeue() was called (any type)
> >> > + * - dispatch_dequeue_cnt: Number of regular dispatch dequeues (no flag)
> >> > + * - change_dequeue_cnt: Number of property change dequeues
> >> > + */
> >> > +u64 enqueue_cnt, dequeue_cnt, dispatch_dequeue_cnt, change_dequeue_cnt;
> >> > +
> >> > +/*
> >> > + * Test scenarios (0-2: ops.select_cpu(), 3-6: ops.enqueue()):
> >> > + * 0) Dispatch to local DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
> >> > + *    scheduler, no dequeue callbacks)
> >> > + * 1) Dispatch to global DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
> >> > + *    scheduler, no dequeue callbacks)
> >> > + * 2) Dispatch to shared user DSQ from ops.select_cpu() (enters BPF scheduler,
> >> > + *    dequeue callbacks expected)
> >> > + * 3) Dispatch to local DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
> >> > + *    scheduler, no dequeue callbacks)
> >> > + * 4) Dispatch to global DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
> >> > + *    scheduler, no dequeue callbacks)
> >> > + * 5) Dispatch to shared user DSQ from ops.enqueue() (enters BPF scheduler,
> >> > + *    dequeue callbacks expected)
> >> > + * 6) BPF internal queue: store task PIDs in ops.enqueue(), consume in
> >> > + *    ops.dispatch() and dispatch to local DSQ (validates dequeue for tasks
> >> > + *    in BPF custody but not on a user DSQ)
> >> > + */
> >> > +u32 test_scenario;
> >> > +
> >> > +/*
> >> > + * Per-task state to track lifecycle and validate workflow semantics.
> >> > + * State transitions:
> >> > + *   NONE -> ENQUEUED (on enqueue)
> >> > + *   ENQUEUED -> DISPATCHED (on dispatch dequeue)
> >> > + *   DISPATCHED -> NONE (on property change dequeue or re-enqueue)
> >> > + *   ENQUEUED -> NONE (on property change dequeue before dispatch)
> >> > + */
> >> > +enum task_state {
> >> > +	TASK_NONE = 0,
> >> > +	TASK_ENQUEUED,
> >> > +	TASK_DISPATCHED,
> >> > +};
> >> > +
> >> > +struct task_ctx {
> >> > +	enum task_state state; /* Current state in the workflow */
> >> > +	u64 enqueue_seq;       /* Sequence number for debugging */
> >> > +};
> >> > +
> >> > +struct {
> >> > +	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
> >> > +	__uint(map_flags, BPF_F_NO_PREALLOC);
> >> > +	__type(key, int);
> >> > +	__type(value, struct task_ctx);
> >> > +} task_ctx_stor SEC(".maps");
> >> > +
> >> > +static struct task_ctx *try_lookup_task_ctx(struct task_struct *p)
> >> > +{
> >> > +	return bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
> >> > +}
> >> > +
> >> > +s32 BPF_STRUCT_OPS(dequeue_select_cpu, struct task_struct *p,
> >> > +		   s32 prev_cpu, u64 wake_flags)
> >> > +{
> >> > +	struct task_ctx *tctx;
> >> > +
> >> > +	tctx = try_lookup_task_ctx(p);
> >> > +	if (!tctx)
> >> > +		return prev_cpu;
> >> > +
> >> > +	switch (test_scenario) {
> >> > +	case 0:
> >> > +		/*
> >> > +		 * Scenario 0: Direct dispatch to local DSQ from select_cpu.
> >> > +		 *
> >> > +		 * Task bypasses BPF scheduler entirely: no enqueue
> >> > +		 * tracking, no dequeue callbacks. Behavior should be
> >> > +		 * identical to scenario 3.
> >> > +		 */
> >> > +		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
> >> > +		return prev_cpu;
> >> > +
> >> > +	case 1:
> >> > +		/*
> >> > +		 * Scenario 1: Direct dispatch to global DSQ from select_cpu.
> >> > +		 *
> >> > +		 * Like scenario 0, task bypasses BPF scheduler entirely.
> >> > +		 * Behavior should be identical to scenario 4.
> >> > +		 */
> >> > +		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
> >> > +		return prev_cpu;
> >> > +
> >> > +	case 2:
> >> > +		/*
> >> > +		 * Scenario 2: Dispatch to shared user DSQ from select_cpu.
> >> > +		 *
> >> > +		 * Task enters BPF scheduler management: track
> >> > +		 * enqueue/dequeue lifecycle and validate state transitions.
> >> > +		 * Behavior should be identical to scenario 5.
> >> > +		 */
> >> > +		__sync_fetch_and_add(&enqueue_cnt, 1);
> >> > +
> >> > +		/*
> >> > +		 * Validate state transition: enqueue is only valid from
> >> > +		 * NONE or DISPATCHED states. Getting enqueue while in
> >> > +		 * ENQUEUED state indicates a missing dequeue.
> >> > +		 */
> >> > +		if (tctx->state == TASK_ENQUEUED)
> >> > +			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
> >> > +				      p->pid, p->comm, tctx->enqueue_seq);
> >> > +
> >> > +		/* Transition to ENQUEUED state */
> >> > +		tctx->state = TASK_ENQUEUED;
> >> > +		tctx->enqueue_seq++;
> >> > +
> >> > +		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, 0);
> >> > +		return prev_cpu;
> >> > +
> >> > +	default:
> >> > +		/*
> >> > +		 * Force all tasks through ops.enqueue().
> >> > +		 */
> >> > +		return prev_cpu;
> >> > +	}
> >> > +}
> >> > +
> >> > +void BPF_STRUCT_OPS(dequeue_enqueue, struct task_struct *p, u64 enq_flags)
> >> > +{
> >> > +	struct task_ctx *tctx;
> >> > +
> >> > +	tctx = try_lookup_task_ctx(p);
> >> > +	if (!tctx)
> >> > +		return;
> >> > +
> >> > +	switch (test_scenario) {
> >> > +	case 3:
> >> > +		/*
> >> > +		 * Scenario 3: Direct dispatch to the local DSQ.
> >> > +		 *
> >> > +		 * Task bypasses BPF scheduler entirely: no enqueue
> >> > +		 * tracking, no dequeue callbacks. Don't increment counters
> >> > +		 * or validate state since the task never enters BPF
> >> > +		 * scheduler management.
> >> > +		 */
> >> > +		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
> >> > +		break;
> >> > +
> >> > +	case 4:
> >> > +		/*
> >> > +		 * Scenario 4: Direct dispatch to the global DSQ.
> >> > +		 *
> >> > +		 * Like scenario 3, task bypasses BPF scheduler entirely.
> >> > +		 * SCX_DSQ_GLOBAL is a terminal DSQ, tasks dispatched to it
> >> > +		 * leave BPF custody immediately, so no dequeue callbacks
> >> > +		 * should be triggered.
> >> > +		 */
> >> > +		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
> >> > +		break;
> >> > +
> >> > +	case 5:
> >> > +		/*
> >> > +		 * Scenario 5: Dispatch to shared user DSQ.
> >> > +		 *
> >> > +		 * Task enters BPF scheduler management: track
> >> > +		 * enqueue/dequeue lifecycle and validate state
> >> > +		 * transitions.
> >> > +		 */
> >> > +		__sync_fetch_and_add(&enqueue_cnt, 1);
> >> > +
> >> > +		/*
> >> > +		 * Validate state transition: enqueue is only valid from
> >> > +		 * NONE or DISPATCHED states. Getting enqueue while in
> >> > +		 * ENQUEUED state indicates a missing dequeue (or stale state
> >> > +		 * from a previous scenario when the scheduler was unregistered
> >> > +		 * with tasks still on a DSQ). Reset and proceed to avoid false
> >> > +		 * positives across scenario switches.
> >> > +		 */
> >> > +		if (tctx->state == TASK_ENQUEUED)
> >> > +			tctx->state = TASK_NONE;
> >> > +
> >> > +		/* Transition to ENQUEUED state */
> >> > +		tctx->state = TASK_ENQUEUED;
> >> > +		tctx->enqueue_seq++;
> >> > +
> >> > +		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
> >> > +		break;
> >> > +
> >> > +	case 6:
> >> > +		/*
> >> > +		 * Scenario 6: Store task in BPF internal queue. Task enters
> >> > +		 * BPF custody (kernel sets SCX_TASK_NEED_DEQ). When
> >> > +		 * ops.dispatch() later pops and inserts to local DSQ,
> >> > +		 * ops.dequeue() must be called.
> >> > +		 *
> >> > +		 * If the queue is full, fallback to local DSQ. The task still
> >> > +		 * goes through QUEUED in the kernel and gets ops.dequeue()
> >> > +		 * when moved to the terminal DSQ, so we track it the same.
> >> > +		 *
> >> > +		 * If state is already ENQUEUED (e.g. task was on a DSQ when
> >> > +		 * the scheduler was unregistered in a previous scenario),
> >> > +		 * reset to NONE and proceed to avoid false positives.
> >> > +		 */
> >> > +		{
> >> > +			s32 pid = p->pid;
> >> > +
> >> > +			if (tctx->state == TASK_ENQUEUED)
> >> > +				tctx->state = TASK_NONE;
> >> > +
> >> > +			tctx->state = TASK_ENQUEUED;
> >> > +			tctx->enqueue_seq++;
> >> > +
> >> > +			/* Queue full: fallback to the global DSQ */
> >> Nit: Can we remove this fallback? This silently changes the behavior of
> >> the test, and even though it makes sense to avoid overflowing the queue,
> >> it causes the test to succeed even if for some reason the
> >> bpf_map_push_elem fails. Why not just bump the queue number to a
> >> reasonably large number amount instead?
> >
> > Hm... but if for any reason we overflow the queue we'd get a false positive
> > error: task is ignored, we trigger a stall and it looks like something is
> > wrong in ops.dequeue(). WDYT?
> >
> 
> I agree, but if we bump the queue size to a large number the probability
> of that is nonexistent: I think these test make sense to run in CI-like
> environments where there's few processes anyway, so if a queue is large
> enough there will not be enough tasks to overflow it anyway. This is an
> assumption we make for dsp_local_on, too. Maybe we can keep the fallback
> but warn when it's used (see above)?

I think keeping the fallback + report the counter of "queue full" events is
the best option.

Thanks,
-Andrea

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-02-08  9:02         ` Andrea Righi
@ 2026-02-08 10:26           ` Andrea Righi
  2026-02-08 13:55             ` Andrea Righi
  0 siblings, 1 reply; 33+ messages in thread
From: Andrea Righi @ 2026-02-08 10:26 UTC (permalink / raw)
  To: Emil Tsalapatis
  Cc: Tejun Heo, David Vernet, Changwoo Min, Kuba Piecuch,
	Christian Loehle, Daniel Hodges, sched-ext, linux-kernel

On Sun, Feb 08, 2026 at 10:02:41AM +0100, Andrea Righi wrote:
...
> > >> >  - From ops.select_cpu():
> > >> >      - scenario 0 (local DSQ): tasks dispatched to the local DSQ bypass
> > >> >        the BPF scheduler entirely; they never enter BPF custody, so
> > >> >        ops.dequeue() is not called,
> > >> >      - scenario 1 (global DSQ): tasks dispatched to SCX_DSQ_GLOBAL also
> > >> >        bypass the BPF scheduler, like the local DSQ; ops.dequeue() is
> > >> >        not called,
> > >> >      - scenario 2 (user DSQ): tasks enter BPF scheduler custody with full
> > >> >        enqueue/dequeue lifecycle tracking and state machine validation
> > >> >        (expects 1:1 enqueue/dequeue pairing).
> > >> 
> > >> Could you add a note here about why there's no equivalent to scenario 6?
> > >> The differentiating factor between that and scenario 2 (nonterminal queue) is 
> > >> that scx_dsq_insert_commit() is called regardless of whether the queue is terminal.
> > >> And this makes sense since for non-DSQ queues the BPF scheduler can do its
> > >> own tracking of enqueue/dequeue (plus it does not make too much sense to
> > >> do BPF-internal enqueueing in select_cpu).
> > >> 
> > >> What do you think? If the above makes sense, maybe we should spell it out 
> > >> in the documentation too. Maybe also add it makes no sense to enqueue
> > >> in an internal BPF structure from select_cpu - the task is not yet
> > >> enqueued, and would have to go through enqueue anyway.
> > >
> > > Oh, I just didn't think about it, we can definitely add to ops.select_cpu()
> > > a scenario equivalent to scenario 6 (push task to the BPF queue).
> > >
> > > From a practical standpoint the benefits are questionable, but in the scope
> > > of the kselftest I think it makes sense to better validate the entire state
> > > machine in all cases. I'll add this scenario as well.
> > >
> > 
> > That makes sense! Let's add it for completeness. Even if it doesn't make
> > sense right now that may change in the future. For example, if we end
> > up finding a good reason to add the task into an internal structure from
> > .select_cpu(), we may allow the task to be explicitly marked as being in
> > the BPF scheduler's custody from a kfunc. Right now we can't do that
> > from select_cpu() unless we direct dispatch IIUC.
> 
> Ok, I'll send a new patch later with the new scenario included. It should
> work already (if done properly in the test case), I think we don't need to
> change anything in the kernel.

Actually I take that back. The internal BPF queue from ops.select_cpu()
scenario is a bit tricky, because when we return from ops.select_cpu()
without p->scx.ddsp_dsq_id being set, we don't know if the scheduler added
the task to an internal BPF queue or simply did nothing.

We need to add some special logic here, preferably without introducing
overhead just to handle this particular (really uncommon) case. I'll take a
look.

Thanks,
-Andrea

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-02-08 10:26           ` Andrea Righi
@ 2026-02-08 13:55             ` Andrea Righi
  2026-02-08 17:59               ` Emil Tsalapatis
  0 siblings, 1 reply; 33+ messages in thread
From: Andrea Righi @ 2026-02-08 13:55 UTC (permalink / raw)
  To: Emil Tsalapatis
  Cc: Tejun Heo, David Vernet, Changwoo Min, Kuba Piecuch,
	Christian Loehle, Daniel Hodges, sched-ext, linux-kernel

On Sun, Feb 08, 2026 at 11:26:13AM +0100, Andrea Righi wrote:
> On Sun, Feb 08, 2026 at 10:02:41AM +0100, Andrea Righi wrote:
> ...
> > > >> >  - From ops.select_cpu():
> > > >> >      - scenario 0 (local DSQ): tasks dispatched to the local DSQ bypass
> > > >> >        the BPF scheduler entirely; they never enter BPF custody, so
> > > >> >        ops.dequeue() is not called,
> > > >> >      - scenario 1 (global DSQ): tasks dispatched to SCX_DSQ_GLOBAL also
> > > >> >        bypass the BPF scheduler, like the local DSQ; ops.dequeue() is
> > > >> >        not called,
> > > >> >      - scenario 2 (user DSQ): tasks enter BPF scheduler custody with full
> > > >> >        enqueue/dequeue lifecycle tracking and state machine validation
> > > >> >        (expects 1:1 enqueue/dequeue pairing).
> > > >> 
> > > >> Could you add a note here about why there's no equivalent to scenario 6?
> > > >> The differentiating factor between that and scenario 2 (nonterminal queue) is 
> > > >> that scx_dsq_insert_commit() is called regardless of whether the queue is terminal.
> > > >> And this makes sense since for non-DSQ queues the BPF scheduler can do its
> > > >> own tracking of enqueue/dequeue (plus it does not make too much sense to
> > > >> do BPF-internal enqueueing in select_cpu).
> > > >> 
> > > >> What do you think? If the above makes sense, maybe we should spell it out 
> > > >> in the documentation too. Maybe also add it makes no sense to enqueue
> > > >> in an internal BPF structure from select_cpu - the task is not yet
> > > >> enqueued, and would have to go through enqueue anyway.
> > > >
> > > > Oh, I just didn't think about it, we can definitely add to ops.select_cpu()
> > > > a scenario equivalent to scenario 6 (push task to the BPF queue).
> > > >
> > > > From a practical standpoint the benefits are questionable, but in the scope
> > > > of the kselftest I think it makes sense to better validate the entire state
> > > > machine in all cases. I'll add this scenario as well.
> > > >
> > > 
> > > That makes sense! Let's add it for completeness. Even if it doesn't make
> > > sense right now that may change in the future. For example, if we end
> > > up finding a good reason to add the task into an internal structure from
> > > .select_cpu(), we may allow the task to be explicitly marked as being in
> > > the BPF scheduler's custody from a kfunc. Right now we can't do that
> > > from select_cpu() unless we direct dispatch IIUC.
> > 
> > Ok, I'll send a new patch later with the new scenario included. It should
> > work already (if done properly in the test case), I think we don't need to
> > change anything in the kernel.
> 
> Actually I take that back. The internal BPF queue from ops.select_cpu()
> scenario is a bit tricky, because when we return from ops.select_cpu()
> without p->scx.ddsp_dsq_id being set, we don't know if the scheduler added
> the task to an internal BPF queue or simply did nothing.
> 
> We need to add some special logic here, preferably without introducing
> overhead just to handle this particular (really uncommon) case. I'll take a
> look.

The more I think about this, the more it feels wrong to consider a task as
being "in BPF scheduler custody" if it is stored in a BPF internal data
structure from ops.select_cpu().

At the point where ops.select_cpu() runs, the task has not yet entered the
BPF scheduler's queues. While it is technically possible to stash the task
in some BPF-managed structure from there, doing so should not imply full
scheduler custody.

In particular, we should not trigger ops.dequeue(), because the task has
not reached the "enqueue" stage of its lifecycle. ops.select_cpu() is
effectively a pre-enqueue hook, primarily intended as a fast path to bypass
the scheduler altogether. As such, triggering ops.dequeue() in this case
would not make sense IMHO.

I think it would make more sense to document this behavior explicitly and
leave the kselftest as is.

Thoughts?

Thanks,
-Andrea

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-02-08 13:55             ` Andrea Righi
@ 2026-02-08 17:59               ` Emil Tsalapatis
  2026-02-08 20:08                 ` Andrea Righi
  0 siblings, 1 reply; 33+ messages in thread
From: Emil Tsalapatis @ 2026-02-08 17:59 UTC (permalink / raw)
  To: Andrea Righi
  Cc: Tejun Heo, David Vernet, Changwoo Min, Kuba Piecuch,
	Christian Loehle, Daniel Hodges, sched-ext, linux-kernel

On Sun Feb 8, 2026 at 8:55 AM EST, Andrea Righi wrote:
> On Sun, Feb 08, 2026 at 11:26:13AM +0100, Andrea Righi wrote:
>> On Sun, Feb 08, 2026 at 10:02:41AM +0100, Andrea Righi wrote:
>> ...
>> > > >> >  - From ops.select_cpu():
>> > > >> >      - scenario 0 (local DSQ): tasks dispatched to the local DSQ bypass
>> > > >> >        the BPF scheduler entirely; they never enter BPF custody, so
>> > > >> >        ops.dequeue() is not called,
>> > > >> >      - scenario 1 (global DSQ): tasks dispatched to SCX_DSQ_GLOBAL also
>> > > >> >        bypass the BPF scheduler, like the local DSQ; ops.dequeue() is
>> > > >> >        not called,
>> > > >> >      - scenario 2 (user DSQ): tasks enter BPF scheduler custody with full
>> > > >> >        enqueue/dequeue lifecycle tracking and state machine validation
>> > > >> >        (expects 1:1 enqueue/dequeue pairing).
>> > > >> 
>> > > >> Could you add a note here about why there's no equivalent to scenario 6?
>> > > >> The differentiating factor between that and scenario 2 (nonterminal queue) is 
>> > > >> that scx_dsq_insert_commit() is called regardless of whether the queue is terminal.
>> > > >> And this makes sense since for non-DSQ queues the BPF scheduler can do its
>> > > >> own tracking of enqueue/dequeue (plus it does not make too much sense to
>> > > >> do BPF-internal enqueueing in select_cpu).
>> > > >> 
>> > > >> What do you think? If the above makes sense, maybe we should spell it out 
>> > > >> in the documentation too. Maybe also add it makes no sense to enqueue
>> > > >> in an internal BPF structure from select_cpu - the task is not yet
>> > > >> enqueued, and would have to go through enqueue anyway.
>> > > >
>> > > > Oh, I just didn't think about it, we can definitely add to ops.select_cpu()
>> > > > a scenario equivalent to scenario 6 (push task to the BPF queue).
>> > > >
>> > > > From a practical standpoint the benefits are questionable, but in the scope
>> > > > of the kselftest I think it makes sense to better validate the entire state
>> > > > machine in all cases. I'll add this scenario as well.
>> > > >
>> > > 
>> > > That makes sense! Let's add it for completeness. Even if it doesn't make
>> > > sense right now that may change in the future. For example, if we end
>> > > up finding a good reason to add the task into an internal structure from
>> > > .select_cpu(), we may allow the task to be explicitly marked as being in
>> > > the BPF scheduler's custody from a kfunc. Right now we can't do that
>> > > from select_cpu() unless we direct dispatch IIUC.
>> > 
>> > Ok, I'll send a new patch later with the new scenario included. It should
>> > work already (if done properly in the test case), I think we don't need to
>> > change anything in the kernel.
>> 
>> Actually I take that back. The internal BPF queue from ops.select_cpu()
>> scenario is a bit tricky, because when we return from ops.select_cpu()
>> without p->scx.ddsp_dsq_id being set, we don't know if the scheduler added
>> the task to an internal BPF queue or simply did nothing.
>> 
>> We need to add some special logic here, preferably without introducing
>> overhead just to handle this particular (really uncommon) case. I'll take a
>> look.
>
> The more I think about this, the more it feels wrong to consider a task as
> being "in BPF scheduler custody" if it is stored in a BPF internal data
> structure from ops.select_cpu().
>
> At the point where ops.select_cpu() runs, the task has not yet entered the
> BPF scheduler's queues. While it is technically possible to stash the task
> in some BPF-managed structure from there, doing so should not imply full
> scheduler custody.
>
> In particular, we should not trigger ops.dequeue(), because the task has
> not reached the "enqueue" stage of its lifecycle. ops.select_cpu() is
> effectively a pre-enqueue hook, primarily intended as a fast path to bypass
> the scheduler altogether. As such, triggering ops.dequeue() in this case
> would not make sense IMHO.
>
> I think it would make more sense to document this behavior explicitly and
> leave the kselftest as is.
>
> Thoughts?

I am going back and forth on this but I think the problem is that the enqueue() 
and dequeue() BPF callbacks we have are not actually symmetrical? 

1) ops.enqueue() is "sched-ext specific work for the scheduler core's enqueue
method". This is independent on whether the task ends up in BPF custody or not.
It could be in a terminal DSQ, a non-terminal DSQ, or a BPF data structure.

2) ops.dequeue() is "remove task from BPF custody". E.g., it is used by the
BPF scheduler to signal whether it should keep a task within its
internal tracking structures.

So the edge case of ops.select_cpu() placing the task in BPF custody is
currently valid. The way I see it, we have two choices in terms of
semantics:

1) ops.dequeue() must be the equivalent of ops.enqueue(). If the BPF
scheduler writer decides to place a task into BPF custody during the
ops.select_cpu() that's on them. ops.select_cpu() is supposed to be a
pure function providing a hint, anyway. Using it to place a task into
BPF is a bit of an abuse even if allowed.

2) We interpret ops.dequeue() to mean "dequeue from the BPF scheduler".
In that case we allow the edge case and interpret ops.dequeue() as "the
function that must be called to clear the NEEDS_DEQ/IN_BPF flag", not as
the complement of ops.enqueue(). In most cases both will be true, and in
the cases where not then it's up to the scheduler writer to understand
the nuance.

I think while 2) is cleaner, it is more involved and honestly kinda
speculative. However, I think it's fair game since once we settle on
the semantics it will be more difficult to change them. Which one do you 
think makes more sense?

>
> Thanks,
> -Andrea


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-02-08 17:59               ` Emil Tsalapatis
@ 2026-02-08 20:08                 ` Andrea Righi
  2026-02-09 10:20                   ` Andrea Righi
  0 siblings, 1 reply; 33+ messages in thread
From: Andrea Righi @ 2026-02-08 20:08 UTC (permalink / raw)
  To: Emil Tsalapatis
  Cc: Tejun Heo, David Vernet, Changwoo Min, Kuba Piecuch,
	Christian Loehle, Daniel Hodges, sched-ext, linux-kernel

On Sun, Feb 08, 2026 at 12:59:36PM -0500, Emil Tsalapatis wrote:
> On Sun Feb 8, 2026 at 8:55 AM EST, Andrea Righi wrote:
> > On Sun, Feb 08, 2026 at 11:26:13AM +0100, Andrea Righi wrote:
> >> On Sun, Feb 08, 2026 at 10:02:41AM +0100, Andrea Righi wrote:
> >> ...
> >> > > >> >  - From ops.select_cpu():
> >> > > >> >      - scenario 0 (local DSQ): tasks dispatched to the local DSQ bypass
> >> > > >> >        the BPF scheduler entirely; they never enter BPF custody, so
> >> > > >> >        ops.dequeue() is not called,
> >> > > >> >      - scenario 1 (global DSQ): tasks dispatched to SCX_DSQ_GLOBAL also
> >> > > >> >        bypass the BPF scheduler, like the local DSQ; ops.dequeue() is
> >> > > >> >        not called,
> >> > > >> >      - scenario 2 (user DSQ): tasks enter BPF scheduler custody with full
> >> > > >> >        enqueue/dequeue lifecycle tracking and state machine validation
> >> > > >> >        (expects 1:1 enqueue/dequeue pairing).
> >> > > >> 
> >> > > >> Could you add a note here about why there's no equivalent to scenario 6?
> >> > > >> The differentiating factor between that and scenario 2 (nonterminal queue) is 
> >> > > >> that scx_dsq_insert_commit() is called regardless of whether the queue is terminal.
> >> > > >> And this makes sense since for non-DSQ queues the BPF scheduler can do its
> >> > > >> own tracking of enqueue/dequeue (plus it does not make too much sense to
> >> > > >> do BPF-internal enqueueing in select_cpu).
> >> > > >> 
> >> > > >> What do you think? If the above makes sense, maybe we should spell it out 
> >> > > >> in the documentation too. Maybe also add it makes no sense to enqueue
> >> > > >> in an internal BPF structure from select_cpu - the task is not yet
> >> > > >> enqueued, and would have to go through enqueue anyway.
> >> > > >
> >> > > > Oh, I just didn't think about it, we can definitely add to ops.select_cpu()
> >> > > > a scenario equivalent to scenario 6 (push task to the BPF queue).
> >> > > >
> >> > > > From a practical standpoint the benefits are questionable, but in the scope
> >> > > > of the kselftest I think it makes sense to better validate the entire state
> >> > > > machine in all cases. I'll add this scenario as well.
> >> > > >
> >> > > 
> >> > > That makes sense! Let's add it for completeness. Even if it doesn't make
> >> > > sense right now that may change in the future. For example, if we end
> >> > > up finding a good reason to add the task into an internal structure from
> >> > > .select_cpu(), we may allow the task to be explicitly marked as being in
> >> > > the BPF scheduler's custody from a kfunc. Right now we can't do that
> >> > > from select_cpu() unless we direct dispatch IIUC.
> >> > 
> >> > Ok, I'll send a new patch later with the new scenario included. It should
> >> > work already (if done properly in the test case), I think we don't need to
> >> > change anything in the kernel.
> >> 
> >> Actually I take that back. The internal BPF queue from ops.select_cpu()
> >> scenario is a bit tricky, because when we return from ops.select_cpu()
> >> without p->scx.ddsp_dsq_id being set, we don't know if the scheduler added
> >> the task to an internal BPF queue or simply did nothing.
> >> 
> >> We need to add some special logic here, preferably without introducing
> >> overhead just to handle this particular (really uncommon) case. I'll take a
> >> look.
> >
> > The more I think about this, the more it feels wrong to consider a task as
> > being "in BPF scheduler custody" if it is stored in a BPF internal data
> > structure from ops.select_cpu().
> >
> > At the point where ops.select_cpu() runs, the task has not yet entered the
> > BPF scheduler's queues. While it is technically possible to stash the task
> > in some BPF-managed structure from there, doing so should not imply full
> > scheduler custody.
> >
> > In particular, we should not trigger ops.dequeue(), because the task has
> > not reached the "enqueue" stage of its lifecycle. ops.select_cpu() is
> > effectively a pre-enqueue hook, primarily intended as a fast path to bypass
> > the scheduler altogether. As such, triggering ops.dequeue() in this case
> > would not make sense IMHO.
> >
> > I think it would make more sense to document this behavior explicitly and
> > leave the kselftest as is.
> >
> > Thoughts?
> 
> I am going back and forth on this but I think the problem is that the enqueue() 
> and dequeue() BPF callbacks we have are not actually symmetrical? 
> 
> 1) ops.enqueue() is "sched-ext specific work for the scheduler core's enqueue
> method". This is independent on whether the task ends up in BPF custody or not.
> It could be in a terminal DSQ, a non-terminal DSQ, or a BPF data structure.
> 
> 2) ops.dequeue() is "remove task from BPF custody". E.g., it is used by the
> BPF scheduler to signal whether it should keep a task within its
> internal tracking structures.
> 
> So the edge case of ops.select_cpu() placing the task in BPF custody is
> currently valid. The way I see it, we have two choices in terms of
> semantics:
> 
> 1) ops.dequeue() must be the equivalent of ops.enqueue(). If the BPF
> scheduler writer decides to place a task into BPF custody during the
> ops.select_cpu() that's on them. ops.select_cpu() is supposed to be a
> pure function providing a hint, anyway. Using it to place a task into
> BPF is a bit of an abuse even if allowed.
> 
> 2) We interpret ops.dequeue() to mean "dequeue from the BPF scheduler".
> In that case we allow the edge case and interpret ops.dequeue() as "the
> function that must be called to clear the NEEDS_DEQ/IN_BPF flag", not as
> the complement of ops.enqueue(). In most cases both will be true, and in
> the cases where not then it's up to the scheduler writer to understand
> the nuance.
> 
> I think while 2) is cleaner, it is more involved and honestly kinda
> speculative. However, I think it's fair game since once we settle on
> the semantics it will be more difficult to change them. Which one do you 
> think makes more sense?

Yeah, I'm also going back and forth on this.

Honestly from a pure theoretical perspective, option (1) feels cleaner to
me: when ops.select_cpu() runs, the task has not entered the BPF scheduler
yet. If we trigger ops.dequeue() in this case, we end up with tasks that
are "leaving" the scheduler without ever having entered it, which feels
like a violation of the lifecycle model.

However, from a practical perspective, it's probably more convenient to
trigger ops.dequeue() also for tasks that are stored in BPF data structures
or user DSQs from ops.select_cpu() as well. If we don't allow that, we
can't just silently ignore the behavior and it's also pretty hard to
reliably detect and trigger an error for this kind of "abuse" at runtime.
That means it could easily turn into a source of subtle bugs in the future,
and I don't think documentation alone would be sufficient to prevent that
(the "don't do that" rules are always fragile).

Therefore, at the moment I'm more inclined to go with option (2), as it
provides better robustness and gives schedulers more flexibility.

-Andrea

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-02-08 20:08                 ` Andrea Righi
@ 2026-02-09 10:20                   ` Andrea Righi
  2026-02-09 15:00                     ` Emil Tsalapatis
  0 siblings, 1 reply; 33+ messages in thread
From: Andrea Righi @ 2026-02-09 10:20 UTC (permalink / raw)
  To: Emil Tsalapatis
  Cc: Tejun Heo, David Vernet, Changwoo Min, Kuba Piecuch,
	Christian Loehle, Daniel Hodges, sched-ext, linux-kernel

On Sun, Feb 08, 2026 at 09:08:38PM +0100, Andrea Righi wrote:
> On Sun, Feb 08, 2026 at 12:59:36PM -0500, Emil Tsalapatis wrote:
> > On Sun Feb 8, 2026 at 8:55 AM EST, Andrea Righi wrote:
> > > On Sun, Feb 08, 2026 at 11:26:13AM +0100, Andrea Righi wrote:
> > >> On Sun, Feb 08, 2026 at 10:02:41AM +0100, Andrea Righi wrote:
> > >> ...
> > >> > > >> >  - From ops.select_cpu():
> > >> > > >> >      - scenario 0 (local DSQ): tasks dispatched to the local DSQ bypass
> > >> > > >> >        the BPF scheduler entirely; they never enter BPF custody, so
> > >> > > >> >        ops.dequeue() is not called,
> > >> > > >> >      - scenario 1 (global DSQ): tasks dispatched to SCX_DSQ_GLOBAL also
> > >> > > >> >        bypass the BPF scheduler, like the local DSQ; ops.dequeue() is
> > >> > > >> >        not called,
> > >> > > >> >      - scenario 2 (user DSQ): tasks enter BPF scheduler custody with full
> > >> > > >> >        enqueue/dequeue lifecycle tracking and state machine validation
> > >> > > >> >        (expects 1:1 enqueue/dequeue pairing).
> > >> > > >> 
> > >> > > >> Could you add a note here about why there's no equivalent to scenario 6?
> > >> > > >> The differentiating factor between that and scenario 2 (nonterminal queue) is 
> > >> > > >> that scx_dsq_insert_commit() is called regardless of whether the queue is terminal.
> > >> > > >> And this makes sense since for non-DSQ queues the BPF scheduler can do its
> > >> > > >> own tracking of enqueue/dequeue (plus it does not make too much sense to
> > >> > > >> do BPF-internal enqueueing in select_cpu).
> > >> > > >> 
> > >> > > >> What do you think? If the above makes sense, maybe we should spell it out 
> > >> > > >> in the documentation too. Maybe also add it makes no sense to enqueue
> > >> > > >> in an internal BPF structure from select_cpu - the task is not yet
> > >> > > >> enqueued, and would have to go through enqueue anyway.
> > >> > > >
> > >> > > > Oh, I just didn't think about it, we can definitely add to ops.select_cpu()
> > >> > > > a scenario equivalent to scenario 6 (push task to the BPF queue).
> > >> > > >
> > >> > > > From a practical standpoint the benefits are questionable, but in the scope
> > >> > > > of the kselftest I think it makes sense to better validate the entire state
> > >> > > > machine in all cases. I'll add this scenario as well.
> > >> > > >
> > >> > > 
> > >> > > That makes sense! Let's add it for completeness. Even if it doesn't make
> > >> > > sense right now that may change in the future. For example, if we end
> > >> > > up finding a good reason to add the task into an internal structure from
> > >> > > .select_cpu(), we may allow the task to be explicitly marked as being in
> > >> > > the BPF scheduler's custody from a kfunc. Right now we can't do that
> > >> > > from select_cpu() unless we direct dispatch IIUC.
> > >> > 
> > >> > Ok, I'll send a new patch later with the new scenario included. It should
> > >> > work already (if done properly in the test case), I think we don't need to
> > >> > change anything in the kernel.
> > >> 
> > >> Actually I take that back. The internal BPF queue from ops.select_cpu()
> > >> scenario is a bit tricky, because when we return from ops.select_cpu()
> > >> without p->scx.ddsp_dsq_id being set, we don't know if the scheduler added
> > >> the task to an internal BPF queue or simply did nothing.
> > >> 
> > >> We need to add some special logic here, preferably without introducing
> > >> overhead just to handle this particular (really uncommon) case. I'll take a
> > >> look.
> > >
> > > The more I think about this, the more it feels wrong to consider a task as
> > > being "in BPF scheduler custody" if it is stored in a BPF internal data
> > > structure from ops.select_cpu().
> > >
> > > At the point where ops.select_cpu() runs, the task has not yet entered the
> > > BPF scheduler's queues. While it is technically possible to stash the task
> > > in some BPF-managed structure from there, doing so should not imply full
> > > scheduler custody.
> > >
> > > In particular, we should not trigger ops.dequeue(), because the task has
> > > not reached the "enqueue" stage of its lifecycle. ops.select_cpu() is
> > > effectively a pre-enqueue hook, primarily intended as a fast path to bypass
> > > the scheduler altogether. As such, triggering ops.dequeue() in this case
> > > would not make sense IMHO.
> > >
> > > I think it would make more sense to document this behavior explicitly and
> > > leave the kselftest as is.
> > >
> > > Thoughts?
> > 
> > I am going back and forth on this but I think the problem is that the enqueue() 
> > and dequeue() BPF callbacks we have are not actually symmetrical? 
> > 
> > 1) ops.enqueue() is "sched-ext specific work for the scheduler core's enqueue
> > method". This is independent on whether the task ends up in BPF custody or not.
> > It could be in a terminal DSQ, a non-terminal DSQ, or a BPF data structure.
> > 
> > 2) ops.dequeue() is "remove task from BPF custody". E.g., it is used by the
> > BPF scheduler to signal whether it should keep a task within its
> > internal tracking structures.
> > 
> > So the edge case of ops.select_cpu() placing the task in BPF custody is
> > currently valid. The way I see it, we have two choices in terms of
> > semantics:
> > 
> > 1) ops.dequeue() must be the equivalent of ops.enqueue(). If the BPF
> > scheduler writer decides to place a task into BPF custody during the
> > ops.select_cpu() that's on them. ops.select_cpu() is supposed to be a
> > pure function providing a hint, anyway. Using it to place a task into
> > BPF is a bit of an abuse even if allowed.
> > 
> > 2) We interpret ops.dequeue() to mean "dequeue from the BPF scheduler".
> > In that case we allow the edge case and interpret ops.dequeue() as "the
> > function that must be called to clear the NEEDS_DEQ/IN_BPF flag", not as
> > the complement of ops.enqueue(). In most cases both will be true, and in
> > the cases where not then it's up to the scheduler writer to understand
> > the nuance.
> > 
> > I think while 2) is cleaner, it is more involved and honestly kinda
> > speculative. However, I think it's fair game since once we settle on
> > the semantics it will be more difficult to change them. Which one do you 
> > think makes more sense?
> 
> Yeah, I'm also going back and forth on this.
> 
> Honestly from a pure theoretical perspective, option (1) feels cleaner to
> me: when ops.select_cpu() runs, the task has not entered the BPF scheduler
> yet. If we trigger ops.dequeue() in this case, we end up with tasks that
> are "leaving" the scheduler without ever having entered it, which feels
> like a violation of the lifecycle model.
> 
> However, from a practical perspective, it's probably more convenient to
> trigger ops.dequeue() also for tasks that are stored in BPF data structures
> or user DSQs from ops.select_cpu() as well. If we don't allow that, we
> can't just silently ignore the behavior and it's also pretty hard to
> reliably detect and trigger an error for this kind of "abuse" at runtime.
> That means it could easily turn into a source of subtle bugs in the future,
> and I don't think documentation alone would be sufficient to prevent that
> (the "don't do that" rules are always fragile).
> 
> Therefore, at the moment I'm more inclined to go with option (2), as it
> provides better robustness and gives schedulers more flexibility.

I'm running into a number of headaches and corner cases if we go with
option (2)... One of them is the following.

Assume we push tasks into a BPF queue from ops.select_cpu() and pop them
from ops.dispatch(). The following scenario can happen:

  CPU0                                         CPU1
  ----                                         ----
  ops.select_cpu()
    bpf_map_push_elem(&queue, &pid, 0)
                                               ops.dispatch()
					         bpf_map_pop_elem(&queue, &pid)
						 scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | dst_cpu)
						   ==> ops.dequeue() is not triggered!
    p->scx.flags |= SCX_TASK_IN_BPF

To fix this, we would need to always set SCX_TASK_IN_BPF before calling
ops.select_cpu(), and then clear it again if the task is directly
dispatched to a terminal DSQ from ops.select_cpu().

However, doing so introduces further problems. In particular, we may end up
triggering spurious ops.dequeue() callbacks, which means we would then need
to distinguish whether a task entered BPF custody via ops.select_cpu() or
via ops.enqueue(), and handle the two cases differently. Which is also racy
and leads to additional locking and complexity.

At that point, it starts to feel like we're over-complicating the design to
support a scenario that is both uncommon and of questionable practical
value.

Given that, I'd suggest proceeding incrementally: for now, we go with
option (1), which looks doable without major changes and it probably fixes
the ops.dequeue() semantics for the majority of use cases (which is already
a significant improvement over the current state). Once that is in place,
we can revisit the "store tasks in internal BPF data structures from
ops.select_cpu()" scenario and see if it's worth supporting it in a cleaner
way. WDYT?

Thanks,
-Andrea

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-02-09 10:20                   ` Andrea Righi
@ 2026-02-09 15:00                     ` Emil Tsalapatis
  2026-02-09 15:43                       ` Andrea Righi
  0 siblings, 1 reply; 33+ messages in thread
From: Emil Tsalapatis @ 2026-02-09 15:00 UTC (permalink / raw)
  To: Andrea Righi
  Cc: Tejun Heo, David Vernet, Changwoo Min, Kuba Piecuch,
	Christian Loehle, Daniel Hodges, sched-ext, linux-kernel

On Mon Feb 9, 2026 at 5:20 AM EST, Andrea Righi wrote:
> On Sun, Feb 08, 2026 at 09:08:38PM +0100, Andrea Righi wrote:
>> On Sun, Feb 08, 2026 at 12:59:36PM -0500, Emil Tsalapatis wrote:
>> > On Sun Feb 8, 2026 at 8:55 AM EST, Andrea Righi wrote:
>> > > On Sun, Feb 08, 2026 at 11:26:13AM +0100, Andrea Righi wrote:
>> > >> On Sun, Feb 08, 2026 at 10:02:41AM +0100, Andrea Righi wrote:
>> > >> ...
>> > >> > > >> >  - From ops.select_cpu():
>> > >> > > >> >      - scenario 0 (local DSQ): tasks dispatched to the local DSQ bypass
>> > >> > > >> >        the BPF scheduler entirely; they never enter BPF custody, so
>> > >> > > >> >        ops.dequeue() is not called,
>> > >> > > >> >      - scenario 1 (global DSQ): tasks dispatched to SCX_DSQ_GLOBAL also
>> > >> > > >> >        bypass the BPF scheduler, like the local DSQ; ops.dequeue() is
>> > >> > > >> >        not called,
>> > >> > > >> >      - scenario 2 (user DSQ): tasks enter BPF scheduler custody with full
>> > >> > > >> >        enqueue/dequeue lifecycle tracking and state machine validation
>> > >> > > >> >        (expects 1:1 enqueue/dequeue pairing).
>> > >> > > >> 
>> > >> > > >> Could you add a note here about why there's no equivalent to scenario 6?
>> > >> > > >> The differentiating factor between that and scenario 2 (nonterminal queue) is 
>> > >> > > >> that scx_dsq_insert_commit() is called regardless of whether the queue is terminal.
>> > >> > > >> And this makes sense since for non-DSQ queues the BPF scheduler can do its
>> > >> > > >> own tracking of enqueue/dequeue (plus it does not make too much sense to
>> > >> > > >> do BPF-internal enqueueing in select_cpu).
>> > >> > > >> 
>> > >> > > >> What do you think? If the above makes sense, maybe we should spell it out 
>> > >> > > >> in the documentation too. Maybe also add it makes no sense to enqueue
>> > >> > > >> in an internal BPF structure from select_cpu - the task is not yet
>> > >> > > >> enqueued, and would have to go through enqueue anyway.
>> > >> > > >
>> > >> > > > Oh, I just didn't think about it, we can definitely add to ops.select_cpu()
>> > >> > > > a scenario equivalent to scenario 6 (push task to the BPF queue).
>> > >> > > >
>> > >> > > > From a practical standpoint the benefits are questionable, but in the scope
>> > >> > > > of the kselftest I think it makes sense to better validate the entire state
>> > >> > > > machine in all cases. I'll add this scenario as well.
>> > >> > > >
>> > >> > > 
>> > >> > > That makes sense! Let's add it for completeness. Even if it doesn't make
>> > >> > > sense right now that may change in the future. For example, if we end
>> > >> > > up finding a good reason to add the task into an internal structure from
>> > >> > > .select_cpu(), we may allow the task to be explicitly marked as being in
>> > >> > > the BPF scheduler's custody from a kfunc. Right now we can't do that
>> > >> > > from select_cpu() unless we direct dispatch IIUC.
>> > >> > 
>> > >> > Ok, I'll send a new patch later with the new scenario included. It should
>> > >> > work already (if done properly in the test case), I think we don't need to
>> > >> > change anything in the kernel.
>> > >> 
>> > >> Actually I take that back. The internal BPF queue from ops.select_cpu()
>> > >> scenario is a bit tricky, because when we return from ops.select_cpu()
>> > >> without p->scx.ddsp_dsq_id being set, we don't know if the scheduler added
>> > >> the task to an internal BPF queue or simply did nothing.
>> > >> 
>> > >> We need to add some special logic here, preferably without introducing
>> > >> overhead just to handle this particular (really uncommon) case. I'll take a
>> > >> look.
>> > >
>> > > The more I think about this, the more it feels wrong to consider a task as
>> > > being "in BPF scheduler custody" if it is stored in a BPF internal data
>> > > structure from ops.select_cpu().
>> > >
>> > > At the point where ops.select_cpu() runs, the task has not yet entered the
>> > > BPF scheduler's queues. While it is technically possible to stash the task
>> > > in some BPF-managed structure from there, doing so should not imply full
>> > > scheduler custody.
>> > >
>> > > In particular, we should not trigger ops.dequeue(), because the task has
>> > > not reached the "enqueue" stage of its lifecycle. ops.select_cpu() is
>> > > effectively a pre-enqueue hook, primarily intended as a fast path to bypass
>> > > the scheduler altogether. As such, triggering ops.dequeue() in this case
>> > > would not make sense IMHO.
>> > >
>> > > I think it would make more sense to document this behavior explicitly and
>> > > leave the kselftest as is.
>> > >
>> > > Thoughts?
>> > 
>> > I am going back and forth on this but I think the problem is that the enqueue() 
>> > and dequeue() BPF callbacks we have are not actually symmetrical? 
>> > 
>> > 1) ops.enqueue() is "sched-ext specific work for the scheduler core's enqueue
>> > method". This is independent on whether the task ends up in BPF custody or not.
>> > It could be in a terminal DSQ, a non-terminal DSQ, or a BPF data structure.
>> > 
>> > 2) ops.dequeue() is "remove task from BPF custody". E.g., it is used by the
>> > BPF scheduler to signal whether it should keep a task within its
>> > internal tracking structures.
>> > 
>> > So the edge case of ops.select_cpu() placing the task in BPF custody is
>> > currently valid. The way I see it, we have two choices in terms of
>> > semantics:
>> > 
>> > 1) ops.dequeue() must be the equivalent of ops.enqueue(). If the BPF
>> > scheduler writer decides to place a task into BPF custody during the
>> > ops.select_cpu() that's on them. ops.select_cpu() is supposed to be a
>> > pure function providing a hint, anyway. Using it to place a task into
>> > BPF is a bit of an abuse even if allowed.
>> > 
>> > 2) We interpret ops.dequeue() to mean "dequeue from the BPF scheduler".
>> > In that case we allow the edge case and interpret ops.dequeue() as "the
>> > function that must be called to clear the NEEDS_DEQ/IN_BPF flag", not as
>> > the complement of ops.enqueue(). In most cases both will be true, and in
>> > the cases where not then it's up to the scheduler writer to understand
>> > the nuance.
>> > 
>> > I think while 2) is cleaner, it is more involved and honestly kinda
>> > speculative. However, I think it's fair game since once we settle on
>> > the semantics it will be more difficult to change them. Which one do you 
>> > think makes more sense?
>> 
>> Yeah, I'm also going back and forth on this.
>> 
>> Honestly from a pure theoretical perspective, option (1) feels cleaner to
>> me: when ops.select_cpu() runs, the task has not entered the BPF scheduler
>> yet. If we trigger ops.dequeue() in this case, we end up with tasks that
>> are "leaving" the scheduler without ever having entered it, which feels
>> like a violation of the lifecycle model.
>> 
>> However, from a practical perspective, it's probably more convenient to
>> trigger ops.dequeue() also for tasks that are stored in BPF data structures
>> or user DSQs from ops.select_cpu() as well. If we don't allow that, we
>> can't just silently ignore the behavior and it's also pretty hard to
>> reliably detect and trigger an error for this kind of "abuse" at runtime.
>> That means it could easily turn into a source of subtle bugs in the future,
>> and I don't think documentation alone would be sufficient to prevent that
>> (the "don't do that" rules are always fragile).
>> 
>> Therefore, at the moment I'm more inclined to go with option (2), as it
>> provides better robustness and gives schedulers more flexibility.
>
> I'm running into a number of headaches and corner cases if we go with
> option (2)... One of them is the following.
>
> Assume we push tasks into a BPF queue from ops.select_cpu() and pop them
> from ops.dispatch(). The following scenario can happen:
>
>   CPU0                                         CPU1
>   ----                                         ----
>   ops.select_cpu()
>     bpf_map_push_elem(&queue, &pid, 0)
>                                                ops.dispatch()
> 					         bpf_map_pop_elem(&queue, &pid)
> 						 scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | dst_cpu)
> 						   ==> ops.dequeue() is not triggered!
>     p->scx.flags |= SCX_TASK_IN_BPF
>
> To fix this, we would need to always set SCX_TASK_IN_BPF before calling
> ops.select_cpu(), and then clear it again if the task is directly
> dispatched to a terminal DSQ from ops.select_cpu().
>
> However, doing so introduces further problems. In particular, we may end up
> triggering spurious ops.dequeue() callbacks, which means we would then need
> to distinguish whether a task entered BPF custody via ops.select_cpu() or
> via ops.enqueue(), and handle the two cases differently. Which is also racy
> and leads to additional locking and complexity.
>
> At that point, it starts to feel like we're over-complicating the design to
> support a scenario that is both uncommon and of questionable practical
> value.
>
> Given that, I'd suggest proceeding incrementally: for now, we go with
> option (1), which looks doable without major changes and it probably fixes
> the ops.dequeue() semantics for the majority of use cases (which is already
> a significant improvement over the current state). Once that is in place,
> we can revisit the "store tasks in internal BPF data structures from
> ops.select_cpu()" scenario and see if it's worth supporting it in a cleaner
> way. WDYT?
>

I agree with going with option 1. 

For the select_cpu() edge case, how about introducing an explicit 
kfunc scx_place_in_bpf_custody() later? Placing a task in BPF custody 
during select_cpu() is already pretty niche, so we can assume the 
scheduler writer knows what they're doing. In that case, let's let 
_them_ decide when in select_cpu() the task is considered "in BPF". 
They can also do their own locking to avoid races with locking on 
the task context. This keeps the state machine clean for the average
scheduler while still handling the edge case. DYT that would work?


> Thanks,
> -Andrea


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-02-09 15:00                     ` Emil Tsalapatis
@ 2026-02-09 15:43                       ` Andrea Righi
  2026-02-09 17:23                         ` Tejun Heo
  0 siblings, 1 reply; 33+ messages in thread
From: Andrea Righi @ 2026-02-09 15:43 UTC (permalink / raw)
  To: Emil Tsalapatis
  Cc: Tejun Heo, David Vernet, Changwoo Min, Kuba Piecuch,
	Christian Loehle, Daniel Hodges, sched-ext, linux-kernel

On Mon, Feb 09, 2026 at 10:00:40AM -0500, Emil Tsalapatis wrote:
> On Mon Feb 9, 2026 at 5:20 AM EST, Andrea Righi wrote:
> > On Sun, Feb 08, 2026 at 09:08:38PM +0100, Andrea Righi wrote:
> >> On Sun, Feb 08, 2026 at 12:59:36PM -0500, Emil Tsalapatis wrote:
> >> > On Sun Feb 8, 2026 at 8:55 AM EST, Andrea Righi wrote:
> >> > > On Sun, Feb 08, 2026 at 11:26:13AM +0100, Andrea Righi wrote:
> >> > >> On Sun, Feb 08, 2026 at 10:02:41AM +0100, Andrea Righi wrote:
> >> > >> ...
> >> > >> > > >> >  - From ops.select_cpu():
> >> > >> > > >> >      - scenario 0 (local DSQ): tasks dispatched to the local DSQ bypass
> >> > >> > > >> >        the BPF scheduler entirely; they never enter BPF custody, so
> >> > >> > > >> >        ops.dequeue() is not called,
> >> > >> > > >> >      - scenario 1 (global DSQ): tasks dispatched to SCX_DSQ_GLOBAL also
> >> > >> > > >> >        bypass the BPF scheduler, like the local DSQ; ops.dequeue() is
> >> > >> > > >> >        not called,
> >> > >> > > >> >      - scenario 2 (user DSQ): tasks enter BPF scheduler custody with full
> >> > >> > > >> >        enqueue/dequeue lifecycle tracking and state machine validation
> >> > >> > > >> >        (expects 1:1 enqueue/dequeue pairing).
> >> > >> > > >> 
> >> > >> > > >> Could you add a note here about why there's no equivalent to scenario 6?
> >> > >> > > >> The differentiating factor between that and scenario 2 (nonterminal queue) is 
> >> > >> > > >> that scx_dsq_insert_commit() is called regardless of whether the queue is terminal.
> >> > >> > > >> And this makes sense since for non-DSQ queues the BPF scheduler can do its
> >> > >> > > >> own tracking of enqueue/dequeue (plus it does not make too much sense to
> >> > >> > > >> do BPF-internal enqueueing in select_cpu).
> >> > >> > > >> 
> >> > >> > > >> What do you think? If the above makes sense, maybe we should spell it out 
> >> > >> > > >> in the documentation too. Maybe also add it makes no sense to enqueue
> >> > >> > > >> in an internal BPF structure from select_cpu - the task is not yet
> >> > >> > > >> enqueued, and would have to go through enqueue anyway.
> >> > >> > > >
> >> > >> > > > Oh, I just didn't think about it, we can definitely add to ops.select_cpu()
> >> > >> > > > a scenario equivalent to scenario 6 (push task to the BPF queue).
> >> > >> > > >
> >> > >> > > > From a practical standpoint the benefits are questionable, but in the scope
> >> > >> > > > of the kselftest I think it makes sense to better validate the entire state
> >> > >> > > > machine in all cases. I'll add this scenario as well.
> >> > >> > > >
> >> > >> > > 
> >> > >> > > That makes sense! Let's add it for completeness. Even if it doesn't make
> >> > >> > > sense right now that may change in the future. For example, if we end
> >> > >> > > up finding a good reason to add the task into an internal structure from
> >> > >> > > .select_cpu(), we may allow the task to be explicitly marked as being in
> >> > >> > > the BPF scheduler's custody from a kfunc. Right now we can't do that
> >> > >> > > from select_cpu() unless we direct dispatch IIUC.
> >> > >> > 
> >> > >> > Ok, I'll send a new patch later with the new scenario included. It should
> >> > >> > work already (if done properly in the test case), I think we don't need to
> >> > >> > change anything in the kernel.
> >> > >> 
> >> > >> Actually I take that back. The internal BPF queue from ops.select_cpu()
> >> > >> scenario is a bit tricky, because when we return from ops.select_cpu()
> >> > >> without p->scx.ddsp_dsq_id being set, we don't know if the scheduler added
> >> > >> the task to an internal BPF queue or simply did nothing.
> >> > >> 
> >> > >> We need to add some special logic here, preferably without introducing
> >> > >> overhead just to handle this particular (really uncommon) case. I'll take a
> >> > >> look.
> >> > >
> >> > > The more I think about this, the more it feels wrong to consider a task as
> >> > > being "in BPF scheduler custody" if it is stored in a BPF internal data
> >> > > structure from ops.select_cpu().
> >> > >
> >> > > At the point where ops.select_cpu() runs, the task has not yet entered the
> >> > > BPF scheduler's queues. While it is technically possible to stash the task
> >> > > in some BPF-managed structure from there, doing so should not imply full
> >> > > scheduler custody.
> >> > >
> >> > > In particular, we should not trigger ops.dequeue(), because the task has
> >> > > not reached the "enqueue" stage of its lifecycle. ops.select_cpu() is
> >> > > effectively a pre-enqueue hook, primarily intended as a fast path to bypass
> >> > > the scheduler altogether. As such, triggering ops.dequeue() in this case
> >> > > would not make sense IMHO.
> >> > >
> >> > > I think it would make more sense to document this behavior explicitly and
> >> > > leave the kselftest as is.
> >> > >
> >> > > Thoughts?
> >> > 
> >> > I am going back and forth on this but I think the problem is that the enqueue() 
> >> > and dequeue() BPF callbacks we have are not actually symmetrical? 
> >> > 
> >> > 1) ops.enqueue() is "sched-ext specific work for the scheduler core's enqueue
> >> > method". This is independent on whether the task ends up in BPF custody or not.
> >> > It could be in a terminal DSQ, a non-terminal DSQ, or a BPF data structure.
> >> > 
> >> > 2) ops.dequeue() is "remove task from BPF custody". E.g., it is used by the
> >> > BPF scheduler to signal whether it should keep a task within its
> >> > internal tracking structures.
> >> > 
> >> > So the edge case of ops.select_cpu() placing the task in BPF custody is
> >> > currently valid. The way I see it, we have two choices in terms of
> >> > semantics:
> >> > 
> >> > 1) ops.dequeue() must be the equivalent of ops.enqueue(). If the BPF
> >> > scheduler writer decides to place a task into BPF custody during the
> >> > ops.select_cpu() that's on them. ops.select_cpu() is supposed to be a
> >> > pure function providing a hint, anyway. Using it to place a task into
> >> > BPF is a bit of an abuse even if allowed.
> >> > 
> >> > 2) We interpret ops.dequeue() to mean "dequeue from the BPF scheduler".
> >> > In that case we allow the edge case and interpret ops.dequeue() as "the
> >> > function that must be called to clear the NEEDS_DEQ/IN_BPF flag", not as
> >> > the complement of ops.enqueue(). In most cases both will be true, and in
> >> > the cases where not then it's up to the scheduler writer to understand
> >> > the nuance.
> >> > 
> >> > I think while 2) is cleaner, it is more involved and honestly kinda
> >> > speculative. However, I think it's fair game since once we settle on
> >> > the semantics it will be more difficult to change them. Which one do you 
> >> > think makes more sense?
> >> 
> >> Yeah, I'm also going back and forth on this.
> >> 
> >> Honestly from a pure theoretical perspective, option (1) feels cleaner to
> >> me: when ops.select_cpu() runs, the task has not entered the BPF scheduler
> >> yet. If we trigger ops.dequeue() in this case, we end up with tasks that
> >> are "leaving" the scheduler without ever having entered it, which feels
> >> like a violation of the lifecycle model.
> >> 
> >> However, from a practical perspective, it's probably more convenient to
> >> trigger ops.dequeue() also for tasks that are stored in BPF data structures
> >> or user DSQs from ops.select_cpu() as well. If we don't allow that, we
> >> can't just silently ignore the behavior and it's also pretty hard to
> >> reliably detect and trigger an error for this kind of "abuse" at runtime.
> >> That means it could easily turn into a source of subtle bugs in the future,
> >> and I don't think documentation alone would be sufficient to prevent that
> >> (the "don't do that" rules are always fragile).
> >> 
> >> Therefore, at the moment I'm more inclined to go with option (2), as it
> >> provides better robustness and gives schedulers more flexibility.
> >
> > I'm running into a number of headaches and corner cases if we go with
> > option (2)... One of them is the following.
> >
> > Assume we push tasks into a BPF queue from ops.select_cpu() and pop them
> > from ops.dispatch(). The following scenario can happen:
> >
> >   CPU0                                         CPU1
> >   ----                                         ----
> >   ops.select_cpu()
> >     bpf_map_push_elem(&queue, &pid, 0)
> >                                                ops.dispatch()
> > 					         bpf_map_pop_elem(&queue, &pid)
> > 						 scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | dst_cpu)
> > 						   ==> ops.dequeue() is not triggered!
> >     p->scx.flags |= SCX_TASK_IN_BPF
> >
> > To fix this, we would need to always set SCX_TASK_IN_BPF before calling
> > ops.select_cpu(), and then clear it again if the task is directly
> > dispatched to a terminal DSQ from ops.select_cpu().
> >
> > However, doing so introduces further problems. In particular, we may end up
> > triggering spurious ops.dequeue() callbacks, which means we would then need
> > to distinguish whether a task entered BPF custody via ops.select_cpu() or
> > via ops.enqueue(), and handle the two cases differently. Which is also racy
> > and leads to additional locking and complexity.
> >
> > At that point, it starts to feel like we're over-complicating the design to
> > support a scenario that is both uncommon and of questionable practical
> > value.
> >
> > Given that, I'd suggest proceeding incrementally: for now, we go with
> > option (1), which looks doable without major changes and it probably fixes
> > the ops.dequeue() semantics for the majority of use cases (which is already
> > a significant improvement over the current state). Once that is in place,
> > we can revisit the "store tasks in internal BPF data structures from
> > ops.select_cpu()" scenario and see if it's worth supporting it in a cleaner
> > way. WDYT?
> >
> 
> I agree with going with option 1. 
> 
> For the select_cpu() edge case, how about introducing an explicit 
> kfunc scx_place_in_bpf_custody() later? Placing a task in BPF custody 
> during select_cpu() is already pretty niche, so we can assume the 
> scheduler writer knows what they're doing. In that case, let's let 
> _them_ decide when in select_cpu() the task is considered "in BPF". 
> They can also do their own locking to avoid races with locking on 
> the task context. This keeps the state machine clean for the average
> scheduler while still handling the edge case. DYT that would work?

Yeah, I was also considering introducing dedicated kfuncs so that the BPF
scheduler can explicitly manage the "in BPF custody" state, decoupling the
notion of BPF custody from ops.enqueue(). With such interface, a scheduler
could do something like:

ops.select_cpu()
{
        s32 pid = p->pid;

        scx_bpf_enter_custody(p);
        if (!bpf_map_push_elem(&bpf_queue, &pid, 0)) {
                set_task_state(TASK_ENQUEUED);
        } else {
                scx_bpf_exit_custody(p);
                set_task_state(TASK_NONE);
        }

        return prev_cpu;
}

On the implementation side, entering / leaving BPF custody is essentially
setting / clearing SCX_TASK_IN_BPF, with the scheduler taking full
responsibility for ensuring the flag is managed consistently: you set the
flag => ops.dequeue() is called when the task leaves custody, you clear the
flag => fallback to the default custody behavior.

But I think this is something to explore in the future, for now I'd go with
the easier way first. :)

Thanks,
-Andrea

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-02-09 15:43                       ` Andrea Righi
@ 2026-02-09 17:23                         ` Tejun Heo
  2026-02-09 19:17                           ` Andrea Righi
  0 siblings, 1 reply; 33+ messages in thread
From: Tejun Heo @ 2026-02-09 17:23 UTC (permalink / raw)
  To: Andrea Righi
  Cc: Emil Tsalapatis, David Vernet, Changwoo Min, Kuba Piecuch,
	Christian Loehle, Daniel Hodges, sched-ext, linux-kernel

Hello,

On Mon, Feb 09, 2026 at 04:43:20PM +0100, Andrea Righi wrote:
> > I agree with going with option 1. 

I think this is the only way. The only reason this is a bit murky is because
we allow direct dispatching from ops.select_cpu() but if you look at how
that's implemented it doesn't really bypass enqueue path. The task still has
to enter the enqueue path (as that's when the rq lock is grabbed and task
state can be updated) while already knowing what to do in the enqueue path.
I don't think it make sense to consider a task to be in the BPF sched's
custody before it has passed through enqueue. Note that you can't even set
the flag - the flag field is protected by the task's rq lock.

> > For the select_cpu() edge case, how about introducing an explicit 
> > kfunc scx_place_in_bpf_custody() later? Placing a task in BPF custody 
> > during select_cpu() is already pretty niche, so we can assume the 
> > scheduler writer knows what they're doing. In that case, let's let 
> > _them_ decide when in select_cpu() the task is considered "in BPF". 
> > They can also do their own locking to avoid races with locking on 
> > the task context. This keeps the state machine clean for the average
> > scheduler while still handling the edge case. DYT that would work?
> 
> Yeah, I was also considering introducing dedicated kfuncs so that the BPF
> scheduler can explicitly manage the "in BPF custody" state, decoupling the
> notion of BPF custody from ops.enqueue(). With such interface, a scheduler
> could do something like:
> 
> ops.select_cpu()
> {
>         s32 pid = p->pid;
> 
>         scx_bpf_enter_custody(p);
>         if (!bpf_map_push_elem(&bpf_queue, &pid, 0)) {
>                 set_task_state(TASK_ENQUEUED);
>         } else {
>                 scx_bpf_exit_custody(p);
>                 set_task_state(TASK_NONE);
>         }
> 
>         return prev_cpu;
> }
> 
> On the implementation side, entering / leaving BPF custody is essentially
> setting / clearing SCX_TASK_IN_BPF, with the scheduler taking full
> responsibility for ensuring the flag is managed consistently: you set the
> flag => ops.dequeue() is called when the task leaves custody, you clear the
> flag => fallback to the default custody behavior.
> 
> But I think this is something to explore in the future, for now I'd go with
> the easier way first. :)

We should just not do it.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] sched_ext: Fix ops.dequeue() semantics
  2026-02-07  9:26     ` Andrea Righi
@ 2026-02-09 17:28       ` Tejun Heo
  2026-02-09 19:06         ` Andrea Righi
  0 siblings, 1 reply; 33+ messages in thread
From: Tejun Heo @ 2026-02-09 17:28 UTC (permalink / raw)
  To: Andrea Righi
  Cc: Emil Tsalapatis, David Vernet, Changwoo Min, Kuba Piecuch,
	Christian Loehle, Daniel Hodges, sched-ext, linux-kernel

On Sat, Feb 07, 2026 at 10:26:17AM +0100, Andrea Righi wrote:
> Hi Emil,
> 
> On Fri, Feb 06, 2026 at 03:35:34PM -0500, Emil Tsalapatis wrote:
> > On Fri Feb 6, 2026 at 8:54 AM EST, Andrea Righi wrote:
> ...
> > > diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
> > > index bcb962d5ee7d8..c48f818eee9b8 100644
> > > --- a/include/linux/sched/ext.h
> > > +++ b/include/linux/sched/ext.h
> > > @@ -84,6 +84,7 @@ struct scx_dispatch_q {
> > >  /* scx_entity.flags */
> > >  enum scx_ent_flags {
> > >  	SCX_TASK_QUEUED		= 1 << 0, /* on ext runqueue */
> > > +	SCX_TASK_NEED_DEQ	= 1 << 1, /* in BPF custody, needs ops.dequeue() when leaving */
> > 
> > Can we make this "SCX_TASK_IN_BPF"? Since we've now defined what it means to be
> > in BPF custody vs the core scx scheduler (terminal DSQs) this is a more
> > general property that can be useful to check in the future. An example:
> > We can now assert that a task's BPF state is consistent with its actual 
> > kernel state when using BPF-based data structures to manage tasks.
> 
> Ack. I like SCX_TASK_IN_BPF and I also like the idea of resuing the flag
> for other purposes. It can be helpful for debugging as well.

One problem with the name is that when a task is in the BPF scheduler's
custody, it can be still be on the kernel side in a DSQ or can be on the BPF
side on a BPF data structure. This is currently distinguished by SCX_OPSS
state (queued on the ops side or not). We do say things like "the task is in
BPF" to note that the task is not on a DSQ but in BPF proper, so I think
SCX_TASK_IN_BPF can become confusing.

I don't know what the right name is. When we write it out, we say "in BPF
sched's custody" where "BPF sched" means the whole SCX scheduler. Maybe just
SCX_TASK_IN_CUSTODY?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] sched_ext: Fix ops.dequeue() semantics
  2026-02-09 17:28       ` Tejun Heo
@ 2026-02-09 19:06         ` Andrea Righi
  0 siblings, 0 replies; 33+ messages in thread
From: Andrea Righi @ 2026-02-09 19:06 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Emil Tsalapatis, David Vernet, Changwoo Min, Kuba Piecuch,
	Christian Loehle, Daniel Hodges, sched-ext, linux-kernel

On Mon, Feb 09, 2026 at 07:28:50AM -1000, Tejun Heo wrote:
> On Sat, Feb 07, 2026 at 10:26:17AM +0100, Andrea Righi wrote:
> > Hi Emil,
> > 
> > On Fri, Feb 06, 2026 at 03:35:34PM -0500, Emil Tsalapatis wrote:
> > > On Fri Feb 6, 2026 at 8:54 AM EST, Andrea Righi wrote:
> > ...
> > > > diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
> > > > index bcb962d5ee7d8..c48f818eee9b8 100644
> > > > --- a/include/linux/sched/ext.h
> > > > +++ b/include/linux/sched/ext.h
> > > > @@ -84,6 +84,7 @@ struct scx_dispatch_q {
> > > >  /* scx_entity.flags */
> > > >  enum scx_ent_flags {
> > > >  	SCX_TASK_QUEUED		= 1 << 0, /* on ext runqueue */
> > > > +	SCX_TASK_NEED_DEQ	= 1 << 1, /* in BPF custody, needs ops.dequeue() when leaving */
> > > 
> > > Can we make this "SCX_TASK_IN_BPF"? Since we've now defined what it means to be
> > > in BPF custody vs the core scx scheduler (terminal DSQs) this is a more
> > > general property that can be useful to check in the future. An example:
> > > We can now assert that a task's BPF state is consistent with its actual 
> > > kernel state when using BPF-based data structures to manage tasks.
> > 
> > Ack. I like SCX_TASK_IN_BPF and I also like the idea of resuing the flag
> > for other purposes. It can be helpful for debugging as well.
> 
> One problem with the name is that when a task is in the BPF scheduler's
> custody, it can be still be on the kernel side in a DSQ or can be on the BPF
> side on a BPF data structure. This is currently distinguished by SCX_OPSS
> state (queued on the ops side or not). We do say things like "the task is in
> BPF" to note that the task is not on a DSQ but in BPF proper, so I think
> SCX_TASK_IN_BPF can become confusing.
> 
> I don't know what the right name is. When we write it out, we say "in BPF
> sched's custody" where "BPF sched" means the whole SCX scheduler. Maybe just
> SCX_TASK_IN_CUSTODY?

Yeah, I agree that the "task in BPF" concept is a bit too overloaded. I
think SCX_TASK_IN_CUSTODY is clear enough and it doesn't overlap with the
"in BPF" concept. I'll rename the flag to SCX_TASK_IN_CUSTODY.

Thanks,
-Andrea

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-02-09 17:23                         ` Tejun Heo
@ 2026-02-09 19:17                           ` Andrea Righi
  2026-02-09 20:10                             ` Tejun Heo
  0 siblings, 1 reply; 33+ messages in thread
From: Andrea Righi @ 2026-02-09 19:17 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Emil Tsalapatis, David Vernet, Changwoo Min, Kuba Piecuch,
	Christian Loehle, Daniel Hodges, sched-ext, linux-kernel

On Mon, Feb 09, 2026 at 07:23:28AM -1000, Tejun Heo wrote:
> Hello,
> 
> On Mon, Feb 09, 2026 at 04:43:20PM +0100, Andrea Righi wrote:
> > > I agree with going with option 1. 
> 
> I think this is the only way. The only reason this is a bit murky is because
> we allow direct dispatching from ops.select_cpu() but if you look at how
> that's implemented it doesn't really bypass enqueue path. The task still has
> to enter the enqueue path (as that's when the rq lock is grabbed and task
> state can be updated) while already knowing what to do in the enqueue path.
> I don't think it make sense to consider a task to be in the BPF sched's
> custody before it has passed through enqueue. Note that you can't even set
> the flag - the flag field is protected by the task's rq lock.

Agreed. And just to be clear, for the purpose of triggering ops.dequeue(),
**all** direct dispatches from ops.select_cpu() should be consistently
ignored, including dispatches to user DSQs. I'll update this behavior in
the next version, because this one treats direct dispatches to user DSQs
from ops.select_cpu() as if the task is in the scheduler's custody, which
shouldn't be the case for consistency.

> 
> > > For the select_cpu() edge case, how about introducing an explicit 
> > > kfunc scx_place_in_bpf_custody() later? Placing a task in BPF custody 
> > > during select_cpu() is already pretty niche, so we can assume the 
> > > scheduler writer knows what they're doing. In that case, let's let 
> > > _them_ decide when in select_cpu() the task is considered "in BPF". 
> > > They can also do their own locking to avoid races with locking on 
> > > the task context. This keeps the state machine clean for the average
> > > scheduler while still handling the edge case. DYT that would work?
> > 
> > Yeah, I was also considering introducing dedicated kfuncs so that the BPF
> > scheduler can explicitly manage the "in BPF custody" state, decoupling the
> > notion of BPF custody from ops.enqueue(). With such interface, a scheduler
> > could do something like:
> > 
> > ops.select_cpu()
> > {
> >         s32 pid = p->pid;
> > 
> >         scx_bpf_enter_custody(p);
> >         if (!bpf_map_push_elem(&bpf_queue, &pid, 0)) {
> >                 set_task_state(TASK_ENQUEUED);
> >         } else {
> >                 scx_bpf_exit_custody(p);
> >                 set_task_state(TASK_NONE);
> >         }
> > 
> >         return prev_cpu;
> > }
> > 
> > On the implementation side, entering / leaving BPF custody is essentially
> > setting / clearing SCX_TASK_IN_BPF, with the scheduler taking full
> > responsibility for ensuring the flag is managed consistently: you set the
> > flag => ops.dequeue() is called when the task leaves custody, you clear the
> > flag => fallback to the default custody behavior.
> > 
> > But I think this is something to explore in the future, for now I'd go with
> > the easier way first. :)
> 
> We should just not do it.

Ack.

Thanks,
-Andrea

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-02-09 19:17                           ` Andrea Righi
@ 2026-02-09 20:10                             ` Tejun Heo
  2026-02-09 22:22                               ` Andrea Righi
  0 siblings, 1 reply; 33+ messages in thread
From: Tejun Heo @ 2026-02-09 20:10 UTC (permalink / raw)
  To: Andrea Righi
  Cc: Emil Tsalapatis, David Vernet, Changwoo Min, Kuba Piecuch,
	Christian Loehle, Daniel Hodges, sched-ext, linux-kernel

Hello,

On Mon, Feb 09, 2026 at 08:17:24PM +0100, Andrea Righi wrote:
> Agreed. And just to be clear, for the purpose of triggering ops.dequeue(),
> **all** direct dispatches from ops.select_cpu() should be consistently
> ignored, including dispatches to user DSQs. I'll update this behavior in
> the next version, because this one treats direct dispatches to user DSQs
> from ops.select_cpu() as if the task is in the scheduler's custody, which
> shouldn't be the case for consistency.

I'm not sure about that. ops.select_cpu() doing direct dispatch is just a
shortcut and should be treated like the same operation being done at the
head of ops.enqueue(). That's what's happening semantically and I think we
should stick with what's happening underneath - ie. make ops.select_cpu()'s
shortcut the special case, not whether tasks in a user DSQ get ops.dequeue()
or not.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-02-09 20:10                             ` Tejun Heo
@ 2026-02-09 22:22                               ` Andrea Righi
  2026-02-10  0:42                                 ` Tejun Heo
  0 siblings, 1 reply; 33+ messages in thread
From: Andrea Righi @ 2026-02-09 22:22 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Emil Tsalapatis, David Vernet, Changwoo Min, Kuba Piecuch,
	Christian Loehle, Daniel Hodges, sched-ext, linux-kernel

On Mon, Feb 09, 2026 at 10:10:30AM -1000, Tejun Heo wrote:
> Hello,
> 
> On Mon, Feb 09, 2026 at 08:17:24PM +0100, Andrea Righi wrote:
> > Agreed. And just to be clear, for the purpose of triggering ops.dequeue(),
> > **all** direct dispatches from ops.select_cpu() should be consistently
> > ignored, including dispatches to user DSQs. I'll update this behavior in
> > the next version, because this one treats direct dispatches to user DSQs
> > from ops.select_cpu() as if the task is in the scheduler's custody, which
> > shouldn't be the case for consistency.
> 
> I'm not sure about that. ops.select_cpu() doing direct dispatch is just a
> shortcut and should be treated like the same operation being done at the
> head of ops.enqueue(). That's what's happening semantically and I think we
> should stick with what's happening underneath - ie. make ops.select_cpu()'s
> shortcut the special case, not whether tasks in a user DSQ get ops.dequeue()
> or not.

Ok, what you're saying is that a direct dispatch from ops.select_cpu() is
just a shortcut for work that would otherwise happen at the head of
ops.enqueue().

So, while ops.select_cpu() itself is not "being in scheduler custody", the
semantic operation of dispatching a task is still the scheduler taking
control of the task. As a result, a dispatch to a user DSQ from
ops.select_cpu() should be treated the same as a dispatch to a user DSQ
from ops.enqueue() for the purpose of triggering ops.dequeue(). The fact
that this happens in ops.select_cpu() rather than ops.enqueue() is an
implementation detail, not a semantic boundary.

Under this interpretation, storing a task in BPF internal data structures
from ops.select_cpu() should not trigger ops.dequeue(), since the task has
not been put under scheduler control yet. However, dispatching a task to a
user DSQ, regardless of whether it happens from ops.select_cpu() or
ops.enqueue(), does explicitly place the task in the scheduler's custody.

If this matches what you mean, then I agree with this approach.

Thanks,
-Andrea

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-02-09 22:22                               ` Andrea Righi
@ 2026-02-10  0:42                                 ` Tejun Heo
  2026-02-10  7:29                                   ` Andrea Righi
  0 siblings, 1 reply; 33+ messages in thread
From: Tejun Heo @ 2026-02-10  0:42 UTC (permalink / raw)
  To: Andrea Righi
  Cc: Emil Tsalapatis, David Vernet, Changwoo Min, Kuba Piecuch,
	Christian Loehle, Daniel Hodges, sched-ext, linux-kernel

Hello, Andrea.

On Mon, Feb 09, 2026 at 11:22:19PM +0100, Andrea Righi wrote:
...
> Ok, what you're saying is that a direct dispatch from ops.select_cpu() is
> just a shortcut for work that would otherwise happen at the head of
> ops.enqueue().
>
> So, while ops.select_cpu() itself is not "being in scheduler custody", the
> semantic operation of dispatching a task is still the scheduler taking
> control of the task. As a result, a dispatch to a user DSQ from
> ops.select_cpu() should be treated the same as a dispatch to a user DSQ
> from ops.enqueue() for the purpose of triggering ops.dequeue(). The fact
> that this happens in ops.select_cpu() rather than ops.enqueue() is an
> implementation detail, not a semantic boundary.

Yes.

> Under this interpretation, storing a task in BPF internal data structures
> from ops.select_cpu() should not trigger ops.dequeue(), since the task has
> not been put under scheduler control yet. However, dispatching a task to a

Also, ops.select_cpu() putting the task in a BPF struct doesn't affect
what's happening in the enqueue path. ops.enqueue() will still be invoked
and the task will be transferred to BPF side iff ops.enqueue() does not
perform a direct dispatch. Imagine the following (unlikely but possible)
scenario:

   CPU A                                   CPU B

   ops.select_cpu() puts task in a BPF
     data structure
                                           ops.dispatch() sees the task, dequeues it and
                                           dispatches it to CPU B's local DSQ.

                                           finish_dispatch() runs but the task is still
                                           SCX_OPSS_NONE and dispatch attempt is ignored.

   ops.enqueue() runs and returns without
   doing anything. Task transitions to
   SCX_OPSS_QUEUED.

Afterwards, the kernel considers the task to be owned by BPF but the BPF
side thinks the task has already been dispatched. It just doesn't make much
sense to do BPF enqueue operation from ops.select_cpu(). The only reason it
works for direct dispatch is because the kernel defers the operation to the
enqueue time behind the scene.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-02-10  0:42                                 ` Tejun Heo
@ 2026-02-10  7:29                                   ` Andrea Righi
  0 siblings, 0 replies; 33+ messages in thread
From: Andrea Righi @ 2026-02-10  7:29 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Emil Tsalapatis, David Vernet, Changwoo Min, Kuba Piecuch,
	Christian Loehle, Daniel Hodges, sched-ext, linux-kernel

On Mon, Feb 09, 2026 at 02:42:04PM -1000, Tejun Heo wrote:
> Hello, Andrea.
> 
> On Mon, Feb 09, 2026 at 11:22:19PM +0100, Andrea Righi wrote:
> ...
> > Ok, what you're saying is that a direct dispatch from ops.select_cpu() is
> > just a shortcut for work that would otherwise happen at the head of
> > ops.enqueue().
> >
> > So, while ops.select_cpu() itself is not "being in scheduler custody", the
> > semantic operation of dispatching a task is still the scheduler taking
> > control of the task. As a result, a dispatch to a user DSQ from
> > ops.select_cpu() should be treated the same as a dispatch to a user DSQ
> > from ops.enqueue() for the purpose of triggering ops.dequeue(). The fact
> > that this happens in ops.select_cpu() rather than ops.enqueue() is an
> > implementation detail, not a semantic boundary.
> 
> Yes.
> 
> > Under this interpretation, storing a task in BPF internal data structures
> > from ops.select_cpu() should not trigger ops.dequeue(), since the task has
> > not been put under scheduler control yet. However, dispatching a task to a
> 
> Also, ops.select_cpu() putting the task in a BPF struct doesn't affect
> what's happening in the enqueue path. ops.enqueue() will still be invoked
> and the task will be transferred to BPF side iff ops.enqueue() does not
> perform a direct dispatch. Imagine the following (unlikely but possible)
> scenario:
> 
>    CPU A                                   CPU B
> 
>    ops.select_cpu() puts task in a BPF
>      data structure
>                                            ops.dispatch() sees the task, dequeues it and
>                                            dispatches it to CPU B's local DSQ.
> 
>                                            finish_dispatch() runs but the task is still
>                                            SCX_OPSS_NONE and dispatch attempt is ignored.
> 
>    ops.enqueue() runs and returns without
>    doing anything. Task transitions to
>    SCX_OPSS_QUEUED.
> 
> Afterwards, the kernel considers the task to be owned by BPF but the BPF
> side thinks the task has already been dispatched. It just doesn't make much
> sense to do BPF enqueue operation from ops.select_cpu(). The only reason it
> works for direct dispatch is because the kernel defers the operation to the
> enqueue time behind the scene.

Makes sense. Storing a task in a BPF internal data structure from
ops.select_cpu() doesn't prevent ops.enqueue() from being called and it can
introduce racy behavior. So, putting a task in a BPF queue at that point is
just extra overhead and provides no real benefit.

In other words, the only thing that makes sense for ops.select_cpu() is
direct dispatch, attempting any other form of enqueue from there is
pointless because the ops.enqueue() path will be invoked anyway.

We should probably document this behavior to make it explicit.

Thanks,
-Andrea

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-02-10 21:26 [PATCHSET v8] sched_ext: Fix " Andrea Righi
@ 2026-02-10 21:26 ` Andrea Righi
  2026-02-12 17:15   ` Christian Loehle
  0 siblings, 1 reply; 33+ messages in thread
From: Andrea Righi @ 2026-02-10 21:26 UTC (permalink / raw)
  To: Tejun Heo, David Vernet, Changwoo Min
  Cc: Kuba Piecuch, Emil Tsalapatis, Christian Loehle, Daniel Hodges,
	sched-ext, linux-kernel

Add a new kselftest to validate that the new ops.dequeue() semantics
work correctly for all task lifecycle scenarios, including the
distinction between terminal DSQs (where BPF scheduler is done with the
task), user DSQs (where BPF scheduler manages the task lifecycle) and
BPF data structures, regardless of which event performs the dispatch.

The test validates the following scenarios:

 - From ops.select_cpu():
     - scenario 0 (local DSQ): tasks dispatched to the local DSQ bypass
       the BPF scheduler entirely; they never enter BPF custody, so
       ops.dequeue() is not called,
     - scenario 1 (global DSQ): tasks dispatched to SCX_DSQ_GLOBAL also
       bypass the BPF scheduler, like the local DSQ; ops.dequeue() is
       not called,
     - scenario 2 (user DSQ): tasks dispatched to user DSQs from
       ops.select_cpu(): tasks enter BPF scheduler's custody with full
       enqueue/dequeue lifecycle tracking and state machine validation,
       expects 1:1 enqueue/dequeue pairing,

   - From ops.enqueue():
     - scenario 3 (local DSQ): same behavior as scenario 0,
     - scenario 4 (global DSQ): same behavior as scenario 1,
     - scenario 5 (user DSQ): same behavior as scenario 2,
     - scenario 6 (BPF internal queue): tasks are stored in a BPF queue
       from ops.enqueue() and consumed from ops.dispatch(); similarly to
       scenario 5, tasks enter BPF scheduler's custody with full
       lifecycle tracking and 1:1 enqueue/dequeue validation.

This verifies that:
 - terminal DSQ dispatch (local, global) don't trigger ops.dequeue(),
 - tasks dispatched to user DSQs, either from ops.select_cpu() or
   ops.enqueue(), enter BPF scheduler's custody and have exact 1:1
   enqueue/dequeue pairing,
 - tasks stored to internal BPF data structures from ops.enqueue() enter
   BPF scheduler's custody and have exact 1:1 enqueue/dequeue pairing,
 - dispatch dequeues have no flags (normal workflow),
 - property change dequeues have the %SCX_DEQ_SCHED_CHANGE flag set,
 - no duplicate enqueues or invalid state transitions are happening.

Cc: Tejun Heo <tj@kernel.org>
Cc: Emil Tsalapatis <emil@etsalapatis.com>
Cc: Kuba Piecuch <jpiecuch@google.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
 tools/testing/selftests/sched_ext/Makefile    |   1 +
 .../testing/selftests/sched_ext/dequeue.bpf.c | 368 ++++++++++++++++++
 tools/testing/selftests/sched_ext/dequeue.c   | 265 +++++++++++++
 3 files changed, 634 insertions(+)
 create mode 100644 tools/testing/selftests/sched_ext/dequeue.bpf.c
 create mode 100644 tools/testing/selftests/sched_ext/dequeue.c

diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
index 5fe45f9c5f8fd..764e91edabf93 100644
--- a/tools/testing/selftests/sched_ext/Makefile
+++ b/tools/testing/selftests/sched_ext/Makefile
@@ -161,6 +161,7 @@ all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubs
 
 auto-test-targets :=			\
 	create_dsq			\
+	dequeue				\
 	enq_last_no_enq_fails		\
 	ddsp_bogus_dsq_fail		\
 	ddsp_vtimelocal_fail		\
diff --git a/tools/testing/selftests/sched_ext/dequeue.bpf.c b/tools/testing/selftests/sched_ext/dequeue.bpf.c
new file mode 100644
index 0000000000000..d9d12f14cd673
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/dequeue.bpf.c
@@ -0,0 +1,368 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A scheduler that validates ops.dequeue() is called correctly:
+ * - Tasks dispatched to terminal DSQs (local, global) bypass the BPF
+ *   scheduler entirely: no ops.dequeue() should be called
+ * - Tasks dispatched to user DSQs from ops.enqueue() enter BPF custody:
+ *   ops.dequeue() must be called when they leave custody
+ * - Every ops.enqueue() dispatch to non-terminal DSQs is followed by
+ *   exactly one ops.dequeue() (validate 1:1 pairing and state machine)
+ *
+ * Copyright (c) 2026 NVIDIA Corporation.
+ */
+
+#include <scx/common.bpf.h>
+
+#define SHARED_DSQ	0
+
+/*
+ * BPF internal queue.
+ *
+ * Tasks are stored here and consumed from ops.dispatch(), validating that
+ * tasks on BPF internal structures still get ops.dequeue() when they
+ * leave.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_QUEUE);
+	__uint(max_entries, 32768);
+	__type(value, s32);
+} global_queue SEC(".maps");
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+/*
+ * Counters to track the lifecycle of tasks:
+ * - enqueue_cnt: Number of times ops.enqueue() was called
+ * - dequeue_cnt: Number of times ops.dequeue() was called (any type)
+ * - dispatch_dequeue_cnt: Number of regular dispatch dequeues (no flag)
+ * - change_dequeue_cnt: Number of property change dequeues
+ * - bpf_queue_full: Number of times the BPF internal queue was full
+ */
+u64 enqueue_cnt, dequeue_cnt, dispatch_dequeue_cnt, change_dequeue_cnt, bpf_queue_full;
+
+/*
+ * Test scenarios:
+ * 0) Dispatch to local DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
+ *    scheduler, no dequeue callbacks)
+ * 1) Dispatch to global DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
+ *    scheduler, no dequeue callbacks)
+ * 2) Dispatch to shared user DSQ from ops.select_cpu() (enters BPF scheduler,
+ *    dequeue callbacks expected)
+ * 3) Dispatch to local DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
+ *    scheduler, no dequeue callbacks)
+ * 4) Dispatch to global DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
+ *    scheduler, no dequeue callbacks)
+ * 5) Dispatch to shared user DSQ from ops.enqueue() (enters BPF scheduler,
+ *    dequeue callbacks expected)
+ * 6) BPF internal queue from ops.enqueue(): store task PIDs in ops.enqueue(),
+ *    consume in ops.dispatch() and dispatch to local DSQ (validates dequeue
+ *    for tasks stored in internal BPF data structures)
+ */
+u32 test_scenario;
+
+/*
+ * Per-task state to track lifecycle and validate workflow semantics.
+ * State transitions:
+ *   NONE -> ENQUEUED (on enqueue)
+ *   ENQUEUED -> DISPATCHED (on dispatch dequeue)
+ *   DISPATCHED -> NONE (on property change dequeue or re-enqueue)
+ *   ENQUEUED -> NONE (on property change dequeue before dispatch)
+ */
+enum task_state {
+	TASK_NONE = 0,
+	TASK_ENQUEUED,
+	TASK_DISPATCHED,
+};
+
+struct task_ctx {
+	enum task_state state; /* Current state in the workflow */
+	u64 enqueue_seq;       /* Sequence number for debugging */
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct task_ctx);
+} task_ctx_stor SEC(".maps");
+
+static struct task_ctx *try_lookup_task_ctx(struct task_struct *p)
+{
+	return bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+}
+
+s32 BPF_STRUCT_OPS(dequeue_select_cpu, struct task_struct *p,
+		   s32 prev_cpu, u64 wake_flags)
+{
+	struct task_ctx *tctx;
+	s32 pid = p->pid;
+
+	tctx = try_lookup_task_ctx(p);
+	if (!tctx)
+		return prev_cpu;
+
+	switch (test_scenario) {
+	case 0:
+		/*
+		 * Direct dispatch to the local DSQ.
+		 *
+		 * Task bypasses BPF scheduler entirely: no enqueue
+		 * tracking, no ops.dequeue() callbacks.
+		 */
+		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
+		tctx->state = TASK_DISPATCHED;
+		break;
+	case 1:
+		/*
+		 * Direct dispatch to the global DSQ.
+		 *
+		 * Task bypasses BPF scheduler entirely: no enqueue
+		 * tracking, no ops.dequeue() callbacks.
+		 */
+		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
+		tctx->state = TASK_DISPATCHED;
+		break;
+	case 2:
+		/*
+		 * Dispatch to shared a user DSQ.
+		 *
+		 * Task enters BPF scheduler management: track
+		 * enqueue/dequeue lifecycle and validate state
+		 * transitions.
+		 */
+		if (tctx->state == TASK_ENQUEUED)
+			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
+				      p->pid, p->comm, tctx->enqueue_seq);
+
+		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, 0);
+
+		__sync_fetch_and_add(&enqueue_cnt, 1);
+
+		tctx->state = TASK_ENQUEUED;
+		tctx->enqueue_seq++;
+		break;
+	}
+
+	return prev_cpu;
+}
+
+void BPF_STRUCT_OPS(dequeue_enqueue, struct task_struct *p, u64 enq_flags)
+{
+	struct task_ctx *tctx;
+	s32 pid = p->pid;
+
+	tctx = try_lookup_task_ctx(p);
+	if (!tctx)
+		return;
+
+	switch (test_scenario) {
+	case 3:
+		/*
+		 * Direct dispatch to the local DSQ.
+		 *
+		 * Task bypasses BPF scheduler entirely: no enqueue
+		 * tracking, no ops.dequeue() callbacks.
+		 */
+		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
+		break;
+	case 4:
+		/*
+		 * Direct dispatch to the global DSQ.
+		 *
+		 * Task bypasses BPF scheduler entirely: no enqueue
+		 * tracking, no ops.dequeue() callbacks.
+		 */
+		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+		break;
+	case 5:
+		/*
+		 * Dispatch to shared user DSQ.
+		 *
+		 * Task enters BPF scheduler management: track
+		 * enqueue/dequeue lifecycle and validate state
+		 * transitions.
+		 */
+		if (tctx->state == TASK_ENQUEUED)
+			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
+				      p->pid, p->comm, tctx->enqueue_seq);
+
+		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
+
+		__sync_fetch_and_add(&enqueue_cnt, 1);
+
+		tctx->state = TASK_ENQUEUED;
+		tctx->enqueue_seq++;
+		break;
+	case 6:
+		/*
+		 * Store task in BPF internal queue.
+		 *
+		 * Task enters BPF scheduler management: track
+		 * enqueue/dequeue lifecycle and validate state
+		 * transitions.
+		 */
+		if (tctx->state == TASK_ENQUEUED)
+			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
+				      p->pid, p->comm, tctx->enqueue_seq);
+
+		if (bpf_map_push_elem(&global_queue, &pid, 0)) {
+			scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+			__sync_fetch_and_add(&bpf_queue_full, 1);
+
+			tctx->state = TASK_DISPATCHED;
+		} else {
+			__sync_fetch_and_add(&enqueue_cnt, 1);
+
+			tctx->state = TASK_ENQUEUED;
+			tctx->enqueue_seq++;
+		}
+		break;
+	default:
+		/* For all other scenarios, dispatch to the global DSQ */
+		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+		tctx->state = TASK_DISPATCHED;
+		break;
+	}
+
+	scx_bpf_kick_cpu(scx_bpf_task_cpu(p), SCX_KICK_IDLE);
+}
+
+void BPF_STRUCT_OPS(dequeue_dequeue, struct task_struct *p, u64 deq_flags)
+{
+	struct task_ctx *tctx;
+
+	__sync_fetch_and_add(&dequeue_cnt, 1);
+
+	tctx = try_lookup_task_ctx(p);
+	if (!tctx)
+		return;
+
+	/*
+	 * For scenarios 0, 1, 3, and 4 (terminal DSQs: local and global),
+	 * ops.dequeue() should never be called because tasks bypass the
+	 * BPF scheduler entirely. If we get here, it's a kernel bug.
+	 */
+	if (test_scenario == 0 || test_scenario == 3) {
+		scx_bpf_error("%d (%s): dequeue called for local DSQ scenario",
+			      p->pid, p->comm);
+		return;
+	}
+
+	if (test_scenario == 1 || test_scenario == 4) {
+		scx_bpf_error("%d (%s): dequeue called for global DSQ scenario",
+			      p->pid, p->comm);
+		return;
+	}
+
+	if (deq_flags & SCX_DEQ_SCHED_CHANGE) {
+		/*
+		 * Property change interrupting the workflow. Valid from
+		 * both ENQUEUED and DISPATCHED states. Transitions task
+		 * back to NONE state.
+		 */
+		__sync_fetch_and_add(&change_dequeue_cnt, 1);
+
+		/* Validate state transition */
+		if (tctx->state != TASK_ENQUEUED && tctx->state != TASK_DISPATCHED)
+			scx_bpf_error("%d (%s): invalid property change dequeue state=%d seq=%llu",
+				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);
+
+		/* Transition back to NONE: task outside scheduler control */
+		tctx->state = TASK_NONE;
+	} else {
+		/*
+		 * Regular dispatch dequeue: normal workflow step. Valid
+		 * only from ENQUEUED state (after enqueue, before dispatch
+		 * dequeue). Transitions to DISPATCHED state.
+		 */
+		__sync_fetch_and_add(&dispatch_dequeue_cnt, 1);
+
+		/*
+		 * Dispatch dequeue should not have %SCX_DEQ_SCHED_CHANGE
+		 * flag.
+		 */
+		if (deq_flags & SCX_DEQ_SCHED_CHANGE)
+			scx_bpf_error("%d (%s): SCX_DEQ_SCHED_CHANGE in dispatch dequeue seq=%llu",
+				      p->pid, p->comm, tctx->enqueue_seq);
+
+		/*
+		 * Must be in ENQUEUED state.
+		 */
+		if (tctx->state != TASK_ENQUEUED)
+			scx_bpf_error("%d (%s): dispatch dequeue from state %d seq=%llu",
+				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);
+
+		/*
+		 * Transition to DISPATCHED: normal cycle completed
+		 * dispatch.
+		 */
+		tctx->state = TASK_DISPATCHED;
+	}
+}
+
+void BPF_STRUCT_OPS(dequeue_dispatch, s32 cpu, struct task_struct *prev)
+{
+	if (test_scenario == 6) {
+		struct task_struct *p;
+		s32 pid;
+
+		if (bpf_map_pop_elem(&global_queue, &pid))
+			return;
+
+		p = bpf_task_from_pid(pid);
+		if (!p)
+			return;
+
+		if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
+			cpu = scx_bpf_task_cpu(p);
+
+		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0);
+		bpf_task_release(p);
+	} else {
+		scx_bpf_dsq_move_to_local(SHARED_DSQ);
+	}
+}
+
+s32 BPF_STRUCT_OPS(dequeue_init_task, struct task_struct *p,
+		   struct scx_init_task_args *args)
+{
+	struct task_ctx *tctx;
+
+	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0,
+				   BPF_LOCAL_STORAGE_GET_F_CREATE);
+	if (!tctx)
+		return -ENOMEM;
+
+	return 0;
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(dequeue_init)
+{
+	s32 ret;
+
+	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+void BPF_STRUCT_OPS(dequeue_exit, struct scx_exit_info *ei)
+{
+	UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops dequeue_ops = {
+	.select_cpu		= (void *)dequeue_select_cpu,
+	.enqueue		= (void *)dequeue_enqueue,
+	.dequeue		= (void *)dequeue_dequeue,
+	.dispatch		= (void *)dequeue_dispatch,
+	.init_task		= (void *)dequeue_init_task,
+	.init			= (void *)dequeue_init,
+	.exit			= (void *)dequeue_exit,
+	.timeout_ms		= 5000,
+	.name			= "dequeue_test",
+};
diff --git a/tools/testing/selftests/sched_ext/dequeue.c b/tools/testing/selftests/sched_ext/dequeue.c
new file mode 100644
index 0000000000000..8bc9d263aa05c
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/dequeue.c
@@ -0,0 +1,265 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2025 NVIDIA Corporation.
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <unistd.h>
+#include <signal.h>
+#include <time.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <sched.h>
+#include <pthread.h>
+#include "scx_test.h"
+#include "dequeue.bpf.skel.h"
+
+#define NUM_WORKERS 8
+#define AFFINITY_HAMMER_MS 50
+
+/*
+ * Worker function that creates enqueue/dequeue events via CPU work and
+ * sleeping. Property-change dequeues are triggered by the affinity hammer
+ * thread (external sched_setaffinity on worker PIDs).
+ */
+static void worker_fn(int id)
+{
+	int i;
+	volatile int sum = 0;
+
+	for (i = 0; i < 1000; i++) {
+		int j;
+
+		/* Do some work to trigger scheduling events */
+		for (j = 0; j < 10000; j++)
+			sum += j;
+
+		/* Sleep to trigger dequeue */
+		usleep(1000 + (id * 100));
+	}
+
+	exit(0);
+}
+
+/*
+ * Property-change dequeues only happen when a task gets a property change
+ * while still in the queue. This thread changes workers' affinity from
+ * outside so that some changes hit tasks while they are still in the
+ * queue.
+ */
+static void *affinity_hammer_fn(void *arg)
+{
+	pid_t *pids = arg;
+	cpu_set_t cpuset;
+	int i, n = NUM_WORKERS;
+	struct timespec ts = { .tv_sec = 0, .tv_nsec = 1000000 }; /* 1ms */
+
+	for (i = 0; i < (AFFINITY_HAMMER_MS * 1000 / 100); i++) {
+		int w = i % n;
+		int cpu = (i / n) % 4;
+
+		CPU_ZERO(&cpuset);
+		CPU_SET(cpu, &cpuset);
+		sched_setaffinity(pids[w], sizeof(cpuset), &cpuset);
+		nanosleep(&ts, NULL);
+	}
+
+	return NULL;
+}
+
+static enum scx_test_status run_scenario(struct dequeue *skel, u32 scenario,
+					 const char *scenario_name)
+{
+	struct bpf_link *link;
+	pid_t pids[NUM_WORKERS];
+	pthread_t hammer;
+
+	int i, status;
+	u64 enq_start, deq_start,
+	    dispatch_deq_start, change_deq_start, bpf_queue_full_start;
+	u64 enq_delta, deq_delta,
+	    dispatch_deq_delta, change_deq_delta, bpf_queue_full_delta;
+
+	/* Set the test scenario */
+	skel->bss->test_scenario = scenario;
+
+	/* Record starting counts */
+	enq_start = skel->bss->enqueue_cnt;
+	deq_start = skel->bss->dequeue_cnt;
+	dispatch_deq_start = skel->bss->dispatch_dequeue_cnt;
+	change_deq_start = skel->bss->change_dequeue_cnt;
+	bpf_queue_full_start = skel->bss->bpf_queue_full;
+
+	link = bpf_map__attach_struct_ops(skel->maps.dequeue_ops);
+	SCX_FAIL_IF(!link, "Failed to attach struct_ops for scenario %s", scenario_name);
+
+	/* Fork worker processes to generate enqueue/dequeue events */
+	for (i = 0; i < NUM_WORKERS; i++) {
+		pids[i] = fork();
+		SCX_FAIL_IF(pids[i] < 0, "Failed to fork worker %d", i);
+
+		if (pids[i] == 0) {
+			worker_fn(i);
+			/* Should not reach here */
+			exit(1);
+		}
+	}
+
+	/*
+	 * Run an "affinity hammer" so that some property changes hit tasks
+	 * while they are still in BPF custody (e.g. in user DSQ or BPF queue),
+	 * triggering SCX_DEQ_SCHED_CHANGE dequeues in scenarios 2, 3, 6 and 7.
+	 */
+	SCX_FAIL_IF(pthread_create(&hammer, NULL, affinity_hammer_fn, pids) != 0,
+		    "Failed to create affinity hammer thread");
+	pthread_join(hammer, NULL);
+
+	/* Wait for all workers to complete */
+	for (i = 0; i < NUM_WORKERS; i++) {
+		SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i],
+			    "Failed to wait for worker %d", i);
+		SCX_FAIL_IF(status != 0, "Worker %d exited with status %d", i, status);
+	}
+
+	bpf_link__destroy(link);
+
+	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG));
+
+	/* Calculate deltas */
+	enq_delta = skel->bss->enqueue_cnt - enq_start;
+	deq_delta = skel->bss->dequeue_cnt - deq_start;
+	dispatch_deq_delta = skel->bss->dispatch_dequeue_cnt - dispatch_deq_start;
+	change_deq_delta = skel->bss->change_dequeue_cnt - change_deq_start;
+	bpf_queue_full_delta = skel->bss->bpf_queue_full - bpf_queue_full_start;
+
+	printf("%s:\n", scenario_name);
+	printf("  enqueues: %lu\n", (unsigned long)enq_delta);
+	printf("  dequeues: %lu (dispatch: %lu, property_change: %lu)\n",
+	       (unsigned long)deq_delta,
+	       (unsigned long)dispatch_deq_delta,
+	       (unsigned long)change_deq_delta);
+	printf("  BPF queue full: %lu\n", (unsigned long)bpf_queue_full_delta);
+
+	/*
+	 * Validate enqueue/dequeue lifecycle tracking.
+	 *
+	 * For scenarios 0, 1, 3, 4 (local and global DSQs from
+	 * ops.select_cpu() and ops.enqueue()), both enqueues and dequeues
+	 * should be 0 because tasks bypass the BPF scheduler entirely:
+	 * tasks never enter BPF scheduler's custody.
+	 *
+	 * For scenarios 2, 5, 6 (user DSQ or BPF internal queue) we expect
+	 * both enqueues and dequeues.
+	 *
+	 * The BPF code does strict state machine validation with
+	 * scx_bpf_error() to ensure the workflow semantics are correct.
+	 *
+	 * If we reach this point without errors, the semantics are
+	 * validated correctly.
+	 */
+	if (scenario == 0 || scenario == 1 ||
+	    scenario == 3 || scenario == 4) {
+		/* Tasks bypass BPF scheduler completely */
+		SCX_EQ(enq_delta, 0);
+		SCX_EQ(deq_delta, 0);
+		SCX_EQ(dispatch_deq_delta, 0);
+		SCX_EQ(change_deq_delta, 0);
+	} else {
+		/*
+		 * User DSQ from ops.enqueue() or ops.select_cpu(): tasks
+		 * enter BPF scheduler's custody.
+		 *
+		 * Also validate 1:1 enqueue/dequeue pairing.
+		 */
+		SCX_GT(enq_delta, 0);
+		SCX_GT(deq_delta, 0);
+		SCX_EQ(enq_delta, deq_delta);
+	}
+
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status setup(void **ctx)
+{
+	struct dequeue *skel;
+
+	skel = dequeue__open();
+	SCX_FAIL_IF(!skel, "Failed to open skel");
+	SCX_ENUM_INIT(skel);
+	SCX_FAIL_IF(dequeue__load(skel), "Failed to load skel");
+
+	*ctx = skel;
+
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+	struct dequeue *skel = ctx;
+	enum scx_test_status status;
+
+	status = run_scenario(skel, 0, "Scenario 0: Local DSQ from ops.select_cpu()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 1, "Scenario 1: Global DSQ from ops.select_cpu()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 2, "Scenario 2: User DSQ from ops.select_cpu()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 3, "Scenario 3: Local DSQ from ops.enqueue()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 4, "Scenario 4: Global DSQ from ops.enqueue()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 5, "Scenario 5: User DSQ from ops.enqueue()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 6, "Scenario 6: BPF queue from ops.enqueue()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	printf("\n=== Summary ===\n");
+	printf("Total enqueues: %lu\n", (unsigned long)skel->bss->enqueue_cnt);
+	printf("Total dequeues: %lu\n", (unsigned long)skel->bss->dequeue_cnt);
+	printf("  Dispatch dequeues: %lu (no flag, normal workflow)\n",
+	       (unsigned long)skel->bss->dispatch_dequeue_cnt);
+	printf("  Property change dequeues: %lu (SCX_DEQ_SCHED_CHANGE flag)\n",
+	       (unsigned long)skel->bss->change_dequeue_cnt);
+	printf("  BPF queue full: %lu\n",
+	       (unsigned long)skel->bss->bpf_queue_full);
+	printf("\nAll scenarios passed - no state machine violations detected\n");
+	printf("-> Validated: Local DSQ dispatch bypasses BPF scheduler\n");
+	printf("-> Validated: Global DSQ dispatch bypasses BPF scheduler\n");
+	printf("-> Validated: User DSQ dispatch triggers ops.dequeue() callbacks\n");
+	printf("-> Validated: Dispatch dequeues have no flags (normal workflow)\n");
+	printf("-> Validated: Property change dequeues have SCX_DEQ_SCHED_CHANGE flag\n");
+	printf("-> Validated: No duplicate enqueues or invalid state transitions\n");
+
+	return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+	struct dequeue *skel = ctx;
+
+	dequeue__destroy(skel);
+}
+
+struct scx_test dequeue_test = {
+	.name = "dequeue",
+	.description = "Verify ops.dequeue() semantics",
+	.setup = setup,
+	.run = run,
+	.cleanup = cleanup,
+};
+
+REGISTER_SCX_TEST(&dequeue_test)
-- 
2.53.0


^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-02-10 21:26 ` [PATCH 2/2] selftests/sched_ext: Add test to validate " Andrea Righi
@ 2026-02-12 17:15   ` Christian Loehle
  2026-02-12 18:25     ` Andrea Righi
  0 siblings, 1 reply; 33+ messages in thread
From: Christian Loehle @ 2026-02-12 17:15 UTC (permalink / raw)
  To: Andrea Righi, Tejun Heo, David Vernet, Changwoo Min
  Cc: Kuba Piecuch, Emil Tsalapatis, Daniel Hodges, sched-ext,
	linux-kernel

On 2/10/26 21:26, Andrea Righi wrote:
> Add a new kselftest to validate that the new ops.dequeue() semantics
> work correctly for all task lifecycle scenarios, including the
> distinction between terminal DSQs (where BPF scheduler is done with the
> task), user DSQs (where BPF scheduler manages the task lifecycle) and
> BPF data structures, regardless of which event performs the dispatch.
> 
> The test validates the following scenarios:
> 
>  - From ops.select_cpu():
>      - scenario 0 (local DSQ): tasks dispatched to the local DSQ bypass
>        the BPF scheduler entirely; they never enter BPF custody, so
>        ops.dequeue() is not called,
>      - scenario 1 (global DSQ): tasks dispatched to SCX_DSQ_GLOBAL also
>        bypass the BPF scheduler, like the local DSQ; ops.dequeue() is
>        not called,
>      - scenario 2 (user DSQ): tasks dispatched to user DSQs from
>        ops.select_cpu(): tasks enter BPF scheduler's custody with full
>        enqueue/dequeue lifecycle tracking and state machine validation,
>        expects 1:1 enqueue/dequeue pairing,
> 
>    - From ops.enqueue():
>      - scenario 3 (local DSQ): same behavior as scenario 0,
>      - scenario 4 (global DSQ): same behavior as scenario 1,
>      - scenario 5 (user DSQ): same behavior as scenario 2,
>      - scenario 6 (BPF internal queue): tasks are stored in a BPF queue
>        from ops.enqueue() and consumed from ops.dispatch(); similarly to
>        scenario 5, tasks enter BPF scheduler's custody with full
>        lifecycle tracking and 1:1 enqueue/dequeue validation.
> 
> This verifies that:
>  - terminal DSQ dispatch (local, global) don't trigger ops.dequeue(),
>  - tasks dispatched to user DSQs, either from ops.select_cpu() or
>    ops.enqueue(), enter BPF scheduler's custody and have exact 1:1
>    enqueue/dequeue pairing,
>  - tasks stored to internal BPF data structures from ops.enqueue() enter
>    BPF scheduler's custody and have exact 1:1 enqueue/dequeue pairing,
>  - dispatch dequeues have no flags (normal workflow),
>  - property change dequeues have the %SCX_DEQ_SCHED_CHANGE flag set,
>  - no duplicate enqueues or invalid state transitions are happening.
> 
> Cc: Tejun Heo <tj@kernel.org>
> Cc: Emil Tsalapatis <emil@etsalapatis.com>
> Cc: Kuba Piecuch <jpiecuch@google.com>
> Signed-off-by: Andrea Righi <arighi@nvidia.com>
> ---
>  tools/testing/selftests/sched_ext/Makefile    |   1 +
>  .../testing/selftests/sched_ext/dequeue.bpf.c | 368 ++++++++++++++++++
>  tools/testing/selftests/sched_ext/dequeue.c   | 265 +++++++++++++
>  3 files changed, 634 insertions(+)
>  create mode 100644 tools/testing/selftests/sched_ext/dequeue.bpf.c
>  create mode 100644 tools/testing/selftests/sched_ext/dequeue.c
> 
> diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
> index 5fe45f9c5f8fd..764e91edabf93 100644
> --- a/tools/testing/selftests/sched_ext/Makefile
> +++ b/tools/testing/selftests/sched_ext/Makefile
> @@ -161,6 +161,7 @@ all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubs
>  
>  auto-test-targets :=			\
>  	create_dsq			\
> +	dequeue				\
>  	enq_last_no_enq_fails		\
>  	ddsp_bogus_dsq_fail		\
>  	ddsp_vtimelocal_fail		\
> diff --git a/tools/testing/selftests/sched_ext/dequeue.bpf.c b/tools/testing/selftests/sched_ext/dequeue.bpf.c
> new file mode 100644
> index 0000000000000..d9d12f14cd673
> --- /dev/null
> +++ b/tools/testing/selftests/sched_ext/dequeue.bpf.c
> @@ -0,0 +1,368 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * A scheduler that validates ops.dequeue() is called correctly:
> + * - Tasks dispatched to terminal DSQs (local, global) bypass the BPF
> + *   scheduler entirely: no ops.dequeue() should be called
> + * - Tasks dispatched to user DSQs from ops.enqueue() enter BPF custody:
> + *   ops.dequeue() must be called when they leave custody
> + * - Every ops.enqueue() dispatch to non-terminal DSQs is followed by
> + *   exactly one ops.dequeue() (validate 1:1 pairing and state machine)
> + *
> + * Copyright (c) 2026 NVIDIA Corporation.
> + */
> +
> +#include <scx/common.bpf.h>
> +
> +#define SHARED_DSQ	0
> +
> +/*
> + * BPF internal queue.
> + *
> + * Tasks are stored here and consumed from ops.dispatch(), validating that
> + * tasks on BPF internal structures still get ops.dequeue() when they
> + * leave.
> + */
> +struct {
> +	__uint(type, BPF_MAP_TYPE_QUEUE);
> +	__uint(max_entries, 32768);
> +	__type(value, s32);
> +} global_queue SEC(".maps");
> +
> +char _license[] SEC("license") = "GPL";
> +
> +UEI_DEFINE(uei);
> +
> +/*
> + * Counters to track the lifecycle of tasks:
> + * - enqueue_cnt: Number of times ops.enqueue() was called
> + * - dequeue_cnt: Number of times ops.dequeue() was called (any type)
> + * - dispatch_dequeue_cnt: Number of regular dispatch dequeues (no flag)
> + * - change_dequeue_cnt: Number of property change dequeues
> + * - bpf_queue_full: Number of times the BPF internal queue was full
> + */
> +u64 enqueue_cnt, dequeue_cnt, dispatch_dequeue_cnt, change_dequeue_cnt, bpf_queue_full;
> +
> +/*
> + * Test scenarios:
> + * 0) Dispatch to local DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
> + *    scheduler, no dequeue callbacks)
> + * 1) Dispatch to global DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
> + *    scheduler, no dequeue callbacks)
> + * 2) Dispatch to shared user DSQ from ops.select_cpu() (enters BPF scheduler,
> + *    dequeue callbacks expected)
> + * 3) Dispatch to local DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
> + *    scheduler, no dequeue callbacks)
> + * 4) Dispatch to global DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
> + *    scheduler, no dequeue callbacks)
> + * 5) Dispatch to shared user DSQ from ops.enqueue() (enters BPF scheduler,
> + *    dequeue callbacks expected)
> + * 6) BPF internal queue from ops.enqueue(): store task PIDs in ops.enqueue(),
> + *    consume in ops.dispatch() and dispatch to local DSQ (validates dequeue
> + *    for tasks stored in internal BPF data structures)
> + */
> +u32 test_scenario;
> +
> +/*
> + * Per-task state to track lifecycle and validate workflow semantics.
> + * State transitions:
> + *   NONE -> ENQUEUED (on enqueue)
> + *   ENQUEUED -> DISPATCHED (on dispatch dequeue)
> + *   DISPATCHED -> NONE (on property change dequeue or re-enqueue)
> + *   ENQUEUED -> NONE (on property change dequeue before dispatch)
> + */
> +enum task_state {
> +	TASK_NONE = 0,
> +	TASK_ENQUEUED,
> +	TASK_DISPATCHED,
> +};
> +
> +struct task_ctx {
> +	enum task_state state; /* Current state in the workflow */
> +	u64 enqueue_seq;       /* Sequence number for debugging */
> +};
> +
> +struct {
> +	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
> +	__uint(map_flags, BPF_F_NO_PREALLOC);
> +	__type(key, int);
> +	__type(value, struct task_ctx);
> +} task_ctx_stor SEC(".maps");
> +
> +static struct task_ctx *try_lookup_task_ctx(struct task_struct *p)
> +{
> +	return bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
> +}
> +
> +s32 BPF_STRUCT_OPS(dequeue_select_cpu, struct task_struct *p,
> +		   s32 prev_cpu, u64 wake_flags)
> +{
> +	struct task_ctx *tctx;
> +	s32 pid = p->pid;
> +> +	tctx = try_lookup_task_ctx(p);
> +	if (!tctx)
> +		return prev_cpu;
> +
> +	switch (test_scenario) {
> +	case 0:
> +		/*
> +		 * Direct dispatch to the local DSQ.
> +		 *
> +		 * Task bypasses BPF scheduler entirely: no enqueue
> +		 * tracking, no ops.dequeue() callbacks.
> +		 */
> +		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
> +		tctx->state = TASK_DISPATCHED;
> +		break;
> +	case 1:
> +		/*
> +		 * Direct dispatch to the global DSQ.
> +		 *
> +		 * Task bypasses BPF scheduler entirely: no enqueue
> +		 * tracking, no ops.dequeue() callbacks.
> +		 */
> +		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
> +		tctx->state = TASK_DISPATCHED;
> +		break;
> +	case 2:
> +		/*
> +		 * Dispatch to shared a user DSQ.
> +		 *
> +		 * Task enters BPF scheduler management: track
> +		 * enqueue/dequeue lifecycle and validate state
> +		 * transitions.
> +		 */
> +		if (tctx->state == TASK_ENQUEUED)
> +			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
> +				      p->pid, p->comm, tctx->enqueue_seq);
> +
> +		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, 0);
> +
> +		__sync_fetch_and_add(&enqueue_cnt, 1);
> +
> +		tctx->state = TASK_ENQUEUED;
> +		tctx->enqueue_seq++;
> +		break;
> +	}
> +
> +	return prev_cpu;
> +}
> +
> +void BPF_STRUCT_OPS(dequeue_enqueue, struct task_struct *p, u64 enq_flags)
> +{
> +	struct task_ctx *tctx;
> +	s32 pid = p->pid;

unused

> +
> +	tctx = try_lookup_task_ctx(p);
> +	if (!tctx)
> +		return;
> +
> +	switch (test_scenario) {
> +	case 3:
> +		/*
> +		 * Direct dispatch to the local DSQ.
> +		 *
> +		 * Task bypasses BPF scheduler entirely: no enqueue
> +		 * tracking, no ops.dequeue() callbacks.
> +		 */
> +		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
> +		break;
> +	case 4:
> +		/*
> +		 * Direct dispatch to the global DSQ.
> +		 *
> +		 * Task bypasses BPF scheduler entirely: no enqueue
> +		 * tracking, no ops.dequeue() callbacks.
> +		 */
> +		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
> +		break;
> +	case 5:
> +		/*
> +		 * Dispatch to shared user DSQ.
> +		 *
> +		 * Task enters BPF scheduler management: track
> +		 * enqueue/dequeue lifecycle and validate state
> +		 * transitions.
> +		 */
> +		if (tctx->state == TASK_ENQUEUED)
> +			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
> +				      p->pid, p->comm, tctx->enqueue_seq);
> +
> +		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
> +
> +		__sync_fetch_and_add(&enqueue_cnt, 1);
> +
> +		tctx->state = TASK_ENQUEUED;
> +		tctx->enqueue_seq++;
> +		break;
> +	case 6:
> +		/*
> +		 * Store task in BPF internal queue.
> +		 *
> +		 * Task enters BPF scheduler management: track
> +		 * enqueue/dequeue lifecycle and validate state
> +		 * transitions.
> +		 */
> +		if (tctx->state == TASK_ENQUEUED)
> +			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
> +				      p->pid, p->comm, tctx->enqueue_seq);
> +
> +		if (bpf_map_push_elem(&global_queue, &pid, 0)) {
> +			scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
> +			__sync_fetch_and_add(&bpf_queue_full, 1);
> +
> +			tctx->state = TASK_DISPATCHED;
> +		} else {
> +			__sync_fetch_and_add(&enqueue_cnt, 1);
> +
> +			tctx->state = TASK_ENQUEUED;
> +			tctx->enqueue_seq++;
> +		}
> +		break;
> +	default:
> +		/* For all other scenarios, dispatch to the global DSQ */
> +		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
> +		tctx->state = TASK_DISPATCHED;
> +		break;
> +	}
> +
> +	scx_bpf_kick_cpu(scx_bpf_task_cpu(p), SCX_KICK_IDLE);
> +}
> +
> +void BPF_STRUCT_OPS(dequeue_dequeue, struct task_struct *p, u64 deq_flags)
> +{
> +	struct task_ctx *tctx;
> +
> +	__sync_fetch_and_add(&dequeue_cnt, 1);
> +
> +	tctx = try_lookup_task_ctx(p);
> +	if (!tctx)
> +		return;
> +
> +	/*
> +	 * For scenarios 0, 1, 3, and 4 (terminal DSQs: local and global),
> +	 * ops.dequeue() should never be called because tasks bypass the
> +	 * BPF scheduler entirely. If we get here, it's a kernel bug.
> +	 */
> +	if (test_scenario == 0 || test_scenario == 3) {
> +		scx_bpf_error("%d (%s): dequeue called for local DSQ scenario",
> +			      p->pid, p->comm);
> +		return;
> +	}
> +
> +	if (test_scenario == 1 || test_scenario == 4) {
> +		scx_bpf_error("%d (%s): dequeue called for global DSQ scenario",
> +			      p->pid, p->comm);
> +		return;
> +	}
> +
> +	if (deq_flags & SCX_DEQ_SCHED_CHANGE) {
> +		/*
> +		 * Property change interrupting the workflow. Valid from
> +		 * both ENQUEUED and DISPATCHED states. Transitions task
> +		 * back to NONE state.
> +		 */
> +		__sync_fetch_and_add(&change_dequeue_cnt, 1);
> +
> +		/* Validate state transition */
> +		if (tctx->state != TASK_ENQUEUED && tctx->state != TASK_DISPATCHED)
> +			scx_bpf_error("%d (%s): invalid property change dequeue state=%d seq=%llu",
> +				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);
> +
> +		/* Transition back to NONE: task outside scheduler control */
> +		tctx->state = TASK_NONE;
> +	} else {
> +		/*
> +		 * Regular dispatch dequeue: normal workflow step. Valid
> +		 * only from ENQUEUED state (after enqueue, before dispatch
> +		 * dequeue). Transitions to DISPATCHED state.
> +		 */
> +		__sync_fetch_and_add(&dispatch_dequeue_cnt, 1);
> +
> +		/*
> +		 * Dispatch dequeue should not have %SCX_DEQ_SCHED_CHANGE
> +		 * flag.
> +		 */
> +		if (deq_flags & SCX_DEQ_SCHED_CHANGE)
> +			scx_bpf_error("%d (%s): SCX_DEQ_SCHED_CHANGE in dispatch dequeue seq=%llu",
> +				      p->pid, p->comm, tctx->enqueue_seq);
> +
> +		/*
> +		 * Must be in ENQUEUED state.
> +		 */
> +		if (tctx->state != TASK_ENQUEUED)
> +			scx_bpf_error("%d (%s): dispatch dequeue from state %d seq=%llu",
> +				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);
> +
> +		/*
> +		 * Transition to DISPATCHED: normal cycle completed
> +		 * dispatch.
> +		 */
> +		tctx->state = TASK_DISPATCHED;
> +	}
> +}
> +
> +void BPF_STRUCT_OPS(dequeue_dispatch, s32 cpu, struct task_struct *prev)
> +{
> +	if (test_scenario == 6) {
> +		struct task_struct *p;
> +		s32 pid;
> +
> +		if (bpf_map_pop_elem(&global_queue, &pid))
> +			return;
> +
> +		p = bpf_task_from_pid(pid);
> +		if (!p)
> +			return;
> +
> +		if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
> +			cpu = scx_bpf_task_cpu(p);
> +
> +		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0);
> +		bpf_task_release(p);
> +	} else {
> +		scx_bpf_dsq_move_to_local(SHARED_DSQ);
> +	}
> +}
> +
> +s32 BPF_STRUCT_OPS(dequeue_init_task, struct task_struct *p,
> +		   struct scx_init_task_args *args)
> +{
> +	struct task_ctx *tctx;
> +
> +	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0,
> +				   BPF_LOCAL_STORAGE_GET_F_CREATE);
> +	if (!tctx)
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +
> +s32 BPF_STRUCT_OPS_SLEEPABLE(dequeue_init)
> +{
> +	s32 ret;
> +
> +	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
> +	if (ret)
> +		return ret;
> +
> +	return 0;
> +}
> +
> +void BPF_STRUCT_OPS(dequeue_exit, struct scx_exit_info *ei)
> +{
> +	UEI_RECORD(uei, ei);
> +}
> +
> +SEC(".struct_ops.link")
> +struct sched_ext_ops dequeue_ops = {
> +	.select_cpu		= (void *)dequeue_select_cpu,
> +	.enqueue		= (void *)dequeue_enqueue,
> +	.dequeue		= (void *)dequeue_dequeue,
> +	.dispatch		= (void *)dequeue_dispatch,
> +	.init_task		= (void *)dequeue_init_task,
> +	.init			= (void *)dequeue_init,
> +	.exit			= (void *)dequeue_exit,
> +	.timeout_ms		= 5000,
> +	.name			= "dequeue_test",
> +};
> diff --git a/tools/testing/selftests/sched_ext/dequeue.c b/tools/testing/selftests/sched_ext/dequeue.c
> new file mode 100644
> index 0000000000000..8bc9d263aa05c
> --- /dev/null
> +++ b/tools/testing/selftests/sched_ext/dequeue.c
> @@ -0,0 +1,265 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2025 NVIDIA Corporation.
> + */
> +#define _GNU_SOURCE
> +#include <stdio.h>
> +#include <unistd.h>
> +#include <signal.h>
> +#include <time.h>
> +#include <bpf/bpf.h>
> +#include <scx/common.h>
> +#include <sys/wait.h>
> +#include <sched.h>
> +#include <pthread.h>
> +#include "scx_test.h"
> +#include "dequeue.bpf.skel.h"
> +
> +#define NUM_WORKERS 8
> +#define AFFINITY_HAMMER_MS 50
> +
> +/*
> + * Worker function that creates enqueue/dequeue events via CPU work and
> + * sleeping. Property-change dequeues are triggered by the affinity hammer
> + * thread (external sched_setaffinity on worker PIDs).
> + */
> +static void worker_fn(int id)
> +{
> +	int i;
> +	volatile int sum = 0;
> +
> +	for (i = 0; i < 1000; i++) {
> +		int j;
> +
> +		/* Do some work to trigger scheduling events */
> +		for (j = 0; j < 10000; j++)
> +			sum += j;
> +
> +		/* Sleep to trigger dequeue */
> +		usleep(1000 + (id * 100));
> +	}
> +
> +	exit(0);
> +}
> +
> +/*
> + * Property-change dequeues only happen when a task gets a property change
> + * while still in the queue. This thread changes workers' affinity from
> + * outside so that some changes hit tasks while they are still in the
> + * queue.
> + */
> +static void *affinity_hammer_fn(void *arg)
> +{
> +	pid_t *pids = arg;
> +	cpu_set_t cpuset;
> +	int i, n = NUM_WORKERS;
> +	struct timespec ts = { .tv_sec = 0, .tv_nsec = 1000000 }; /* 1ms */
> +
> +	for (i = 0; i < (AFFINITY_HAMMER_MS * 1000 / 100); i++) {
> +		int w = i % n;
> +		int cpu = (i / n) % 4;
> +
> +		CPU_ZERO(&cpuset);
> +		CPU_SET(cpu, &cpuset);
> +		sched_setaffinity(pids[w], sizeof(cpuset), &cpuset);
> +		nanosleep(&ts, NULL);
> +	}
> +
> +	return NULL;
> +}
> +
> +static enum scx_test_status run_scenario(struct dequeue *skel, u32 scenario,
> +					 const char *scenario_name)
> +{
> +	struct bpf_link *link;
> +	pid_t pids[NUM_WORKERS];
> +	pthread_t hammer;
> +
> +	int i, status;
> +	u64 enq_start, deq_start,
> +	    dispatch_deq_start, change_deq_start, bpf_queue_full_start;
> +	u64 enq_delta, deq_delta,
> +	    dispatch_deq_delta, change_deq_delta, bpf_queue_full_delta;
> +
> +	/* Set the test scenario */
> +	skel->bss->test_scenario = scenario;
> +
> +	/* Record starting counts */
> +	enq_start = skel->bss->enqueue_cnt;
> +	deq_start = skel->bss->dequeue_cnt;
> +	dispatch_deq_start = skel->bss->dispatch_dequeue_cnt;
> +	change_deq_start = skel->bss->change_dequeue_cnt;
> +	bpf_queue_full_start = skel->bss->bpf_queue_full;
> +
> +	link = bpf_map__attach_struct_ops(skel->maps.dequeue_ops);
> +	SCX_FAIL_IF(!link, "Failed to attach struct_ops for scenario %s", scenario_name);
> +
> +	/* Fork worker processes to generate enqueue/dequeue events */
> +	for (i = 0; i < NUM_WORKERS; i++) {
> +		pids[i] = fork();
> +		SCX_FAIL_IF(pids[i] < 0, "Failed to fork worker %d", i);
> +
> +		if (pids[i] == 0) {
> +			worker_fn(i);
> +			/* Should not reach here */
> +			exit(1);
> +		}
> +	}
> +
> +	/*
> +	 * Run an "affinity hammer" so that some property changes hit tasks
> +	 * while they are still in BPF custody (e.g. in user DSQ or BPF queue),
> +	 * triggering SCX_DEQ_SCHED_CHANGE dequeues in scenarios 2, 3, 6 and 7.

Not true for 3, right?

> +	 */
> +	SCX_FAIL_IF(pthread_create(&hammer, NULL, affinity_hammer_fn, pids) != 0,
> +		    "Failed to create affinity hammer thread");
> +	pthread_join(hammer, NULL);
> +
> +	/* Wait for all workers to complete */
> +	for (i = 0; i < NUM_WORKERS; i++) {
> +		SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i],
> +			    "Failed to wait for worker %d", i);
> +		SCX_FAIL_IF(status != 0, "Worker %d exited with status %d", i, status);
> +	}
> +
> +	bpf_link__destroy(link);
> +
> +	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG));
> +
> +	/* Calculate deltas */
> +	enq_delta = skel->bss->enqueue_cnt - enq_start;
> +	deq_delta = skel->bss->dequeue_cnt - deq_start;
> +	dispatch_deq_delta = skel->bss->dispatch_dequeue_cnt - dispatch_deq_start;
> +	change_deq_delta = skel->bss->change_dequeue_cnt - change_deq_start;
> +	bpf_queue_full_delta = skel->bss->bpf_queue_full - bpf_queue_full_start;
> +
> +	printf("%s:\n", scenario_name);
> +	printf("  enqueues: %lu\n", (unsigned long)enq_delta);
> +	printf("  dequeues: %lu (dispatch: %lu, property_change: %lu)\n",
> +	       (unsigned long)deq_delta,
> +	       (unsigned long)dispatch_deq_delta,
> +	       (unsigned long)change_deq_delta);
> +	printf("  BPF queue full: %lu\n", (unsigned long)bpf_queue_full_delta);
> +
> +	/*
> +	 * Validate enqueue/dequeue lifecycle tracking.
> +	 *
> +	 * For scenarios 0, 1, 3, 4 (local and global DSQs from
> +	 * ops.select_cpu() and ops.enqueue()), both enqueues and dequeues
> +	 * should be 0 because tasks bypass the BPF scheduler entirely:
> +	 * tasks never enter BPF scheduler's custody.
> +	 *
> +	 * For scenarios 2, 5, 6 (user DSQ or BPF internal queue) we expect
> +	 * both enqueues and dequeues.
> +	 *
> +	 * The BPF code does strict state machine validation with
> +	 * scx_bpf_error() to ensure the workflow semantics are correct.
> +	 *
> +	 * If we reach this point without errors, the semantics are
> +	 * validated correctly.
> +	 */
> +	if (scenario == 0 || scenario == 1 ||
> +	    scenario == 3 || scenario == 4) {
> +		/* Tasks bypass BPF scheduler completely */
> +		SCX_EQ(enq_delta, 0);
> +		SCX_EQ(deq_delta, 0);
> +		SCX_EQ(dispatch_deq_delta, 0);
> +		SCX_EQ(change_deq_delta, 0);
> +	} else {
> +		/*
> +		 * User DSQ from ops.enqueue() or ops.select_cpu(): tasks
> +		 * enter BPF scheduler's custody.
> +		 *
> +		 * Also validate 1:1 enqueue/dequeue pairing.
> +		 */
> +		SCX_GT(enq_delta, 0);
> +		SCX_GT(deq_delta, 0);
> +		SCX_EQ(enq_delta, deq_delta);
> +	}
> +
> +	return SCX_TEST_PASS;
> +}
> +
> +static enum scx_test_status setup(void **ctx)
> +{
> +	struct dequeue *skel;
> +
> +	skel = dequeue__open();
> +	SCX_FAIL_IF(!skel, "Failed to open skel");
> +	SCX_ENUM_INIT(skel);
> +	SCX_FAIL_IF(dequeue__load(skel), "Failed to load skel");
> +
> +	*ctx = skel;
> +
> +	return SCX_TEST_PASS;
> +}
> +
> +static enum scx_test_status run(void *ctx)
> +{
> +	struct dequeue *skel = ctx;
> +	enum scx_test_status status;
> +
> +	status = run_scenario(skel, 0, "Scenario 0: Local DSQ from ops.select_cpu()");
> +	if (status != SCX_TEST_PASS)
> +		return status;
> +
> +	status = run_scenario(skel, 1, "Scenario 1: Global DSQ from ops.select_cpu()");
> +	if (status != SCX_TEST_PASS)
> +		return status;
> +
> +	status = run_scenario(skel, 2, "Scenario 2: User DSQ from ops.select_cpu()");
> +	if (status != SCX_TEST_PASS)
> +		return status;
> +
> +	status = run_scenario(skel, 3, "Scenario 3: Local DSQ from ops.enqueue()");
> +	if (status != SCX_TEST_PASS)
> +		return status;
> +
> +	status = run_scenario(skel, 4, "Scenario 4: Global DSQ from ops.enqueue()");
> +	if (status != SCX_TEST_PASS)
> +		return status;
> +
> +	status = run_scenario(skel, 5, "Scenario 5: User DSQ from ops.enqueue()");
> +	if (status != SCX_TEST_PASS)
> +		return status;
> +
> +	status = run_scenario(skel, 6, "Scenario 6: BPF queue from ops.enqueue()");
> +	if (status != SCX_TEST_PASS)
> +		return status;
> +
> +	printf("\n=== Summary ===\n");
> +	printf("Total enqueues: %lu\n", (unsigned long)skel->bss->enqueue_cnt);
> +	printf("Total dequeues: %lu\n", (unsigned long)skel->bss->dequeue_cnt);
> +	printf("  Dispatch dequeues: %lu (no flag, normal workflow)\n",
> +	       (unsigned long)skel->bss->dispatch_dequeue_cnt);
> +	printf("  Property change dequeues: %lu (SCX_DEQ_SCHED_CHANGE flag)\n",
> +	       (unsigned long)skel->bss->change_dequeue_cnt);
> +	printf("  BPF queue full: %lu\n",
> +	       (unsigned long)skel->bss->bpf_queue_full);
> +	printf("\nAll scenarios passed - no state machine violations detected\n");
> +	printf("-> Validated: Local DSQ dispatch bypasses BPF scheduler\n");
> +	printf("-> Validated: Global DSQ dispatch bypasses BPF scheduler\n");
> +	printf("-> Validated: User DSQ dispatch triggers ops.dequeue() callbacks\n");
> +	printf("-> Validated: Dispatch dequeues have no flags (normal workflow)\n");
> +	printf("-> Validated: Property change dequeues have SCX_DEQ_SCHED_CHANGE flag\n");
> +	printf("-> Validated: No duplicate enqueues or invalid state transitions\n");
> +
> +	return SCX_TEST_PASS;
> +}
> +
> +static void cleanup(void *ctx)
> +{
> +	struct dequeue *skel = ctx;
> +
> +	dequeue__destroy(skel);
> +}
> +
> +struct scx_test dequeue_test = {
> +	.name = "dequeue",
> +	.description = "Verify ops.dequeue() semantics",
> +	.setup = setup,
> +	.run = run,
> +	.cleanup = cleanup,
> +};
> +
> +REGISTER_SCX_TEST(&dequeue_test)


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] selftests/sched_ext: Add test to validate ops.dequeue() semantics
  2026-02-12 17:15   ` Christian Loehle
@ 2026-02-12 18:25     ` Andrea Righi
  0 siblings, 0 replies; 33+ messages in thread
From: Andrea Righi @ 2026-02-12 18:25 UTC (permalink / raw)
  To: Christian Loehle
  Cc: Tejun Heo, David Vernet, Changwoo Min, Kuba Piecuch,
	Emil Tsalapatis, Daniel Hodges, sched-ext, linux-kernel

On Thu, Feb 12, 2026 at 05:15:28PM +0000, Christian Loehle wrote:
> On 2/10/26 21:26, Andrea Righi wrote:
...
> > +void BPF_STRUCT_OPS(dequeue_enqueue, struct task_struct *p, u64 enq_flags)
> > +{
> > +	struct task_ctx *tctx;
> > +	s32 pid = p->pid;
> 
> unused

This one is used, but the one in dequeue_select_cpu() is not. I'll remove
that. :)

> > +static enum scx_test_status run_scenario(struct dequeue *skel, u32 scenario,
> > +					 const char *scenario_name)
> > +{
> > +	struct bpf_link *link;
> > +	pid_t pids[NUM_WORKERS];
> > +	pthread_t hammer;
> > +
> > +	int i, status;
> > +	u64 enq_start, deq_start,
> > +	    dispatch_deq_start, change_deq_start, bpf_queue_full_start;
> > +	u64 enq_delta, deq_delta,
> > +	    dispatch_deq_delta, change_deq_delta, bpf_queue_full_delta;
> > +
> > +	/* Set the test scenario */
> > +	skel->bss->test_scenario = scenario;
> > +
> > +	/* Record starting counts */
> > +	enq_start = skel->bss->enqueue_cnt;
> > +	deq_start = skel->bss->dequeue_cnt;
> > +	dispatch_deq_start = skel->bss->dispatch_dequeue_cnt;
> > +	change_deq_start = skel->bss->change_dequeue_cnt;
> > +	bpf_queue_full_start = skel->bss->bpf_queue_full;
> > +
> > +	link = bpf_map__attach_struct_ops(skel->maps.dequeue_ops);
> > +	SCX_FAIL_IF(!link, "Failed to attach struct_ops for scenario %s", scenario_name);
> > +
> > +	/* Fork worker processes to generate enqueue/dequeue events */
> > +	for (i = 0; i < NUM_WORKERS; i++) {
> > +		pids[i] = fork();
> > +		SCX_FAIL_IF(pids[i] < 0, "Failed to fork worker %d", i);
> > +
> > +		if (pids[i] == 0) {
> > +			worker_fn(i);
> > +			/* Should not reach here */
> > +			exit(1);
> > +		}
> > +	}
> > +
> > +	/*
> > +	 * Run an "affinity hammer" so that some property changes hit tasks
> > +	 * while they are still in BPF custody (e.g. in user DSQ or BPF queue),
> > +	 * triggering SCX_DEQ_SCHED_CHANGE dequeues in scenarios 2, 3, 6 and 7.
> 
> Not true for 3, right?

Oh yes, this selftest has been changed so many times that I was sure I
forgot to update some comments (also, scenario 7 doesn't exist anymore).

Thanks!
-Andrea

^ permalink raw reply	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2026-02-12 18:25 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-02-06 13:54 [PATCHSET v7] sched_ext: Fix ops.dequeue() semantics Andrea Righi
2026-02-06 13:54 ` [PATCH 1/2] " Andrea Righi
2026-02-06 20:35   ` Emil Tsalapatis
2026-02-07  9:26     ` Andrea Righi
2026-02-09 17:28       ` Tejun Heo
2026-02-09 19:06         ` Andrea Righi
2026-02-06 13:54 ` [PATCH 2/2] selftests/sched_ext: Add test to validate " Andrea Righi
2026-02-06 20:10   ` Emil Tsalapatis
2026-02-07  9:16     ` Andrea Righi
2026-02-08  5:11       ` Emil Tsalapatis
2026-02-08  9:02         ` Andrea Righi
2026-02-08 10:26           ` Andrea Righi
2026-02-08 13:55             ` Andrea Righi
2026-02-08 17:59               ` Emil Tsalapatis
2026-02-08 20:08                 ` Andrea Righi
2026-02-09 10:20                   ` Andrea Righi
2026-02-09 15:00                     ` Emil Tsalapatis
2026-02-09 15:43                       ` Andrea Righi
2026-02-09 17:23                         ` Tejun Heo
2026-02-09 19:17                           ` Andrea Righi
2026-02-09 20:10                             ` Tejun Heo
2026-02-09 22:22                               ` Andrea Righi
2026-02-10  0:42                                 ` Tejun Heo
2026-02-10  7:29                                   ` Andrea Righi
  -- strict thread matches above, loose matches on Subject: below --
2026-02-10 21:26 [PATCHSET v8] sched_ext: Fix " Andrea Righi
2026-02-10 21:26 ` [PATCH 2/2] selftests/sched_ext: Add test to validate " Andrea Righi
2026-02-12 17:15   ` Christian Loehle
2026-02-12 18:25     ` Andrea Righi
2026-02-05 15:32 [PATCHSET v6] sched_ext: Fix " Andrea Righi
2026-02-05 15:32 ` [PATCH 2/2] selftests/sched_ext: Add test to validate " Andrea Righi
2026-02-04 16:05 [PATCHSET v5] sched_ext: Fix " Andrea Righi
2026-02-04 16:05 ` [PATCH 2/2] selftests/sched_ext: Add test to validate " Andrea Righi
2026-02-01  9:08 [PATCHSET v4 sched_ext/for-6.20] sched_ext: Fix " Andrea Righi
2026-02-01  9:08 ` [PATCH 2/2] selftests/sched_ext: Add test to validate " Andrea Righi
2026-01-26  8:41 [PATCHSET v3 sched_ext/for-6.20] sched_ext: Fix " Andrea Righi
2026-01-26  8:41 ` [PATCH 2/2] selftests/sched_ext: Add test to validate " Andrea Righi
2026-01-27 16:53   ` Emil Tsalapatis
2026-01-21 12:25 [PATCHSET v2 sched_ext/for-6.20] sched_ext: Fix " Andrea Righi
2026-01-21 12:25 ` [PATCH 2/2] selftests/sched_ext: Add test to validate " Andrea Righi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox