[PATCH sched_ext/for-6.11 1/2] sched_ext: Implement DSQ iterator

public inbox for bpf@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH sched_ext/for-6.11 1/2] sched_ext: Implement DSQ iterator
@ 2024-06-28  0:20 Tejun Heo
  2024-06-28  0:24 ` [PATCH sched_ext/for-6.11 2/2] sched_ext: Implement scx_bpf_consume_task() Tejun Heo
  2024-06-28  1:11 ` [PATCH sched_ext/for-6.11 1/2] sched_ext: Implement DSQ iterator Alexei Starovoitov
  0 siblings, 2 replies; 12+ messages in thread
From: Tejun Heo @ 2024-06-28  0:20 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: linux-kernel, bpf, David Vernet

DSQs are very opaque in the consumption path. The BPF scheduler has no way
of knowing which tasks are being considered and which is picked. This patch
adds BPF DSQ iterator.

- Allows iterating tasks queued on a DSQ in the dispatch order or reverse
  from anywhere using bpf_for_each(scx_dsq) or calling the iterator kfuncs
  directly.

- Has ordering guarantee where only tasks which were already queued when the
  iteration started are visible and consumable during the iteration.

scx_qmap is updated to implement periodic dumping of the shared DSQ.

v2: - scx_bpf_consume_task() is separated out into a separate patch.

    - DSQ seq and iter flags don't need to be u64. Use u32.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: bpf@vger.kernel.org
---
Hello, Alexei.

These two patches implement inline iterator for a task queue data structure
that's used by sched_ext. The first one implements the iterator itself. It's
pretty straightforward and seems to work fine. The second one implements a
kfunc which consumes a task while iterating. This one is a bit nasty
unfortunately. I'll continue on the second patch.

Thanks.

 include/linux/sched/ext.h                |    4 
 kernel/sched/ext.c                       |  182 ++++++++++++++++++++++++++++++-
 tools/sched_ext/include/scx/common.bpf.h |    3 
 tools/sched_ext/scx_qmap.bpf.c           |   25 ++++
 tools/sched_ext/scx_qmap.c               |    8 +
 5 files changed, 218 insertions(+), 4 deletions(-)

--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -61,6 +61,7 @@ struct scx_dispatch_q {
 	struct list_head	list;	/* tasks in dispatch order */
 	struct rb_root		priq;	/* used to order by p->scx.dsq_vtime */
 	u32			nr;
+	u32			seq;	/* used by BPF iter */
 	u64			id;
 	struct rhash_head	hash_node;
 	struct llist_node	free_node;
@@ -94,6 +95,8 @@ enum scx_task_state {
 /* scx_entity.dsq_flags */
 enum scx_ent_dsq_flags {
 	SCX_TASK_DSQ_ON_PRIQ	= 1 << 0, /* task is queued on the priority queue of a dsq */
+
+	SCX_TASK_DSQ_CURSOR	= 1 << 31, /* iteration cursor, not a task */
 };
 
 /*
@@ -134,6 +137,7 @@ struct scx_dsq_node {
 struct sched_ext_entity {
 	struct scx_dispatch_q	*dsq;
 	struct scx_dsq_node	dsq_node;	/* protected by dsq lock */
+	u32			dsq_seq;
 	u32			flags;		/* protected by rq lock */
 	u32			weight;
 	s32			sticky_cpu;
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1066,6 +1066,72 @@ static __always_inline bool scx_kf_allow
 	return true;
 }
 
+/**
+ * nldsq_next_task - Iterate to the next task in a non-local DSQ
+ * @dsq: user dsq being interated
+ * @cur: current position, %NULL to start iteration
+ * @rev: walk backwards
+ *
+ * Returns %NULL when iteration is finished.
+ */
+static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq,
+					   struct task_struct *cur, bool rev)
+{
+	struct list_head *list_node;
+	struct scx_dsq_node *dsq_node;
+
+	lockdep_assert_held(&dsq->lock);
+
+	if (cur)
+		list_node = &cur->scx.dsq_node.list;
+	else
+		list_node = &dsq->list;
+
+	/* find the next task, need to skip BPF iteration cursors */
+	do {
+		if (rev)
+			list_node = list_node->prev;
+		else
+			list_node = list_node->next;
+
+		if (list_node == &dsq->list)
+			return NULL;
+
+		dsq_node = container_of(list_node, struct scx_dsq_node, list);
+	} while (dsq_node->flags & SCX_TASK_DSQ_CURSOR);
+
+	return container_of(dsq_node, struct task_struct, scx.dsq_node);
+}
+
+#define nldsq_for_each_task(p, dsq)						\
+	for ((p) = nldsq_next_task((dsq), NULL, false); (p);			\
+	     (p) = nldsq_next_task((dsq), (p), false))
+
+
+/*
+ * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse]
+ * dispatch order. BPF-visible iterator is opaque and larger to allow future
+ * changes without breaking backward compatibility. Can be used with
+ * bpf_for_each(). See bpf_iter_scx_dsq_*().
+ */
+enum scx_dsq_iter_flags {
+	/* iterate in the reverse dispatch order */
+	SCX_DSQ_ITER_REV		= 1U << 0,
+
+	__SCX_DSQ_ITER_ALL_FLAGS	= SCX_DSQ_ITER_REV,
+};
+
+struct bpf_iter_scx_dsq_kern {
+	struct scx_dsq_node		cursor;
+	struct scx_dispatch_q		*dsq;
+	u32				dsq_seq;
+	u32				flags;
+} __attribute__((aligned(8)));
+
+struct bpf_iter_scx_dsq {
+	u64				__opaque[12];
+} __attribute__((aligned(8)));
+
 
 /*
  * SCX task iterator.
@@ -1415,7 +1481,7 @@ static void dispatch_enqueue(struct scx_
 		 * tested easily when adding the first task.
 		 */
 		if (unlikely(RB_EMPTY_ROOT(&dsq->priq) &&
-			     !list_empty(&dsq->list)))
+			     nldsq_next_task(dsq, NULL, false)))
 			scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks",
 				      dsq->id);
 
@@ -1447,6 +1513,10 @@ static void dispatch_enqueue(struct scx_
 			list_add_tail(&p->scx.dsq_node.list, &dsq->list);
 	}
 
+	/* seq records the order tasks are queued, used by BPF DSQ iterator */
+	dsq->seq++;
+	p->scx.dsq_seq = dsq->seq;
+
 	dsq_mod_nr(dsq, 1);
 	p->scx.dsq = dsq;
 
@@ -2109,7 +2179,7 @@ retry:
 
 	raw_spin_lock(&dsq->lock);
 
-	list_for_each_entry(p, &dsq->list, scx.dsq_node.list) {
+	nldsq_for_each_task(p, dsq) {
 		struct rq *task_rq = task_rq(p);
 
 		if (rq == task_rq) {
@@ -5697,6 +5767,111 @@ __bpf_kfunc void scx_bpf_destroy_dsq(u64
 	destroy_dsq(dsq_id);
 }
 
+/**
+ * bpf_iter_scx_dsq_new - Create a DSQ iterator
+ * @it: iterator to initialize
+ * @dsq_id: DSQ to iterate
+ * @flags: %SCX_DSQ_ITER_*
+ *
+ * Initialize BPF iterator @it which can be used with bpf_for_each() to walk
+ * tasks in the DSQ specified by @dsq_id. Iteration using @it only includes
+ * tasks which are already queued when this function is invoked.
+ */
+__bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
+				     u64 flags)
+{
+	struct bpf_iter_scx_dsq_kern *kit = (void *)it;
+
+	BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) >
+		     sizeof(struct bpf_iter_scx_dsq));
+	BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) !=
+		     __alignof__(struct bpf_iter_scx_dsq));
+
+	if (flags & ~__SCX_DSQ_ITER_ALL_FLAGS)
+		return -EINVAL;
+
+	kit->dsq = find_non_local_dsq(dsq_id);
+	if (!kit->dsq)
+		return -ENOENT;
+
+	INIT_LIST_HEAD(&kit->cursor.list);
+	RB_CLEAR_NODE(&kit->cursor.priq);
+	kit->cursor.flags = SCX_TASK_DSQ_CURSOR;
+	kit->dsq_seq = READ_ONCE(kit->dsq->seq);
+	kit->flags = flags;
+
+	return 0;
+}
+
+/**
+ * bpf_iter_scx_dsq_next - Progress a DSQ iterator
+ * @it: iterator to progress
+ *
+ * Return the next task. See bpf_iter_scx_dsq_new().
+ */
+__bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it)
+{
+	struct bpf_iter_scx_dsq_kern *kit = (void *)it;
+	bool rev = kit->flags & SCX_DSQ_ITER_REV;
+	struct task_struct *p;
+	unsigned long flags;
+
+	if (!kit->dsq)
+		return NULL;
+
+	raw_spin_lock_irqsave(&kit->dsq->lock, flags);
+
+	if (list_empty(&kit->cursor.list))
+		p = NULL;
+	else
+		p = container_of(&kit->cursor, struct task_struct, scx.dsq_node);
+
+	/*
+	 * Only tasks which were queued before the iteration started are
+	 * visible. This bounds BPF iterations and guarantees that vtime never
+	 * jumps in the other direction while iterating.
+	 */
+	do {
+		p = nldsq_next_task(kit->dsq, p, rev);
+	} while (p && unlikely((s32)(p->scx.dsq_seq - kit->dsq_seq) > 0));
+
+	if (p) {
+		if (rev)
+			list_move_tail(&kit->cursor.list, &p->scx.dsq_node.list);
+		else
+			list_move(&kit->cursor.list, &p->scx.dsq_node.list);
+	} else {
+		list_del_init(&kit->cursor.list);
+	}
+
+	raw_spin_unlock_irqrestore(&kit->dsq->lock, flags);
+
+	return p;
+}
+
+/**
+ * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator
+ * @it: iterator to destroy
+ *
+ * Undo scx_iter_scx_dsq_new().
+ */
+__bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it)
+{
+	struct bpf_iter_scx_dsq_kern *kit = (void *)it;
+
+	if (!kit->dsq)
+		return;
+
+	if (!list_empty(&kit->cursor.list)) {
+		unsigned long flags;
+
+		raw_spin_lock_irqsave(&kit->dsq->lock, flags);
+		list_del_init(&kit->cursor.list);
+		raw_spin_unlock_irqrestore(&kit->dsq->lock, flags);
+	}
+	kit->dsq = NULL;
+}
+
 __bpf_kfunc_end_defs();
 
 static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
@@ -6118,6 +6293,9 @@ BTF_KFUNCS_START(scx_kfunc_ids_any)
 BTF_ID_FLAGS(func, scx_bpf_kick_cpu)
 BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
 BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
+BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -39,6 +39,9 @@ u32 scx_bpf_reenqueue_local(void) __ksym
 void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
 s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
 void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
+int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, bool rev) __ksym __weak;
+struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak;
+void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak;
 void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak;
 void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym;
 void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym __weak;
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -36,6 +36,7 @@ const volatile u32 stall_user_nth;
 const volatile u32 stall_kernel_nth;
 const volatile u32 dsp_inf_loop_after;
 const volatile u32 dsp_batch;
+const volatile bool print_shared_dsq;
 const volatile s32 disallow_tgid;
 const volatile bool suppress_dump;
 
@@ -604,10 +605,34 @@ out:
 	scx_bpf_put_cpumask(online);
 }
 
+/*
+ * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
+ * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to
+ * see meaningful dumps in the trace pipe.
+ */
+static void dump_shared_dsq(void)
+{
+	struct task_struct *p;
+	s32 nr;
+
+	if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
+		return;
+
+	bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);
+
+	bpf_rcu_read_lock();
+	bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV)
+		bpf_printk("%s[%d]", p->comm, p->pid);
+	bpf_rcu_read_unlock();
+}
+
 static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
 {
 	monitor_cpuperf();
 
+	if (print_shared_dsq)
+		dump_shared_dsq();
+
 	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
 	return 0;
 }
--- a/tools/sched_ext/scx_qmap.c
+++ b/tools/sched_ext/scx_qmap.c
@@ -20,7 +20,7 @@ const char help_fmt[] =
 "See the top-level comment in .bpf.c for more details.\n"
 "\n"
 "Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n"
-"       [-d PID] [-D LEN] [-p] [-v]\n"
+"       [-P] [-d PID] [-D LEN] [-p] [-v]\n"
 "\n"
 "  -s SLICE_US   Override slice duration\n"
 "  -e COUNT      Trigger scx_bpf_error() after COUNT enqueues\n"
@@ -28,6 +28,7 @@ const char help_fmt[] =
 "  -T COUNT      Stall every COUNT'th kernel thread\n"
 "  -l COUNT      Trigger dispatch infinite looping after COUNT dispatches\n"
 "  -b COUNT      Dispatch upto COUNT tasks together\n"
+"  -P            Print out DSQ content to trace_pipe every second, use with -b\n"
 "  -d PID        Disallow a process from switching into SCHED_EXT (-1 for self)\n"
 "  -D LEN        Set scx_exit_info.dump buffer length\n"
 "  -S            Suppress qmap-specific debug dump\n"
@@ -62,7 +63,7 @@ int main(int argc, char **argv)
 
 	skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
 
-	while ((opt = getopt(argc, argv, "s:e:t:T:l:b:d:D:Spvh")) != -1) {
+	while ((opt = getopt(argc, argv, "s:e:t:T:l:b:Pd:D:Spvh")) != -1) {
 		switch (opt) {
 		case 's':
 			skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@@ -82,6 +83,9 @@ int main(int argc, char **argv)
 		case 'b':
 			skel->rodata->dsp_batch = strtoul(optarg, NULL, 0);
 			break;
+		case 'P':
+			skel->rodata->print_shared_dsq = true;
+			break;
 		case 'd':
 			skel->rodata->disallow_tgid = strtol(optarg, NULL, 0);
 			if (skel->rodata->disallow_tgid < 0)

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH sched_ext/for-6.11 2/2] sched_ext: Implement scx_bpf_consume_task()
  2024-06-28  0:20 [PATCH sched_ext/for-6.11 1/2] sched_ext: Implement DSQ iterator Tejun Heo
@ 2024-06-28  0:24 ` Tejun Heo
  2024-06-28  1:34   ` Alexei Starovoitov
  2024-06-28  1:11 ` [PATCH sched_ext/for-6.11 1/2] sched_ext: Implement DSQ iterator Alexei Starovoitov
  1 sibling, 1 reply; 12+ messages in thread
From: Tejun Heo @ 2024-06-28  0:24 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: linux-kernel, bpf, David Vernet

Implement scx_bpf_consume_task() which allows consuming arbitrary tasks on
the DSQ in any order while iterating in the dispatch path.

scx_qmap is updated to implement periodic dumping of the shared DSQ and a
rather silly prioritization mechanism to demonstrate the use of DSQ
iteration and selective consumption.

Note that it does a bit of nastry dance to pass in the pointer to the
iterator to __scx_bpf_consume_task(). This is to work around the current
limitation in the BPF verifier where it doesn't allow the memory area used
for an iterator to be passed into kfuncs. This may be too nasty and might
require a different approach.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: bpf@vger.kernel.org
---
Hello, again.

(continuing from the previous patch) so, the problem is that I need to
distinguish the tasks which have left a queue and then get requeued while an
iteration is in progress. The iterator itself already does this - it
remembers a sequence number when iteration starts and ignores tasks which
are queued afterwards.

As a task can get removed and requeued anytime, I need
scx_bpf_consume_task() to do the same testing, so I want to pass in the
iterator pointer into scx_bpf_consume_task() so that it can read the
sequence number stored in the iterator. However, BPF doesn't allow this, so
I'm doing the weird self pointer probe read thing, to obtain it, which is
quite nasty.

What do you think?

Thanks.

 kernel/sched/ext.c                       |   89 +++++++++++++++++++++++++++++--
 tools/sched_ext/include/scx/common.bpf.h |   16 +++++
 tools/sched_ext/scx_qmap.bpf.c           |   34 ++++++++++-
 tools/sched_ext/scx_qmap.c               |   14 +++-
 4 files changed, 142 insertions(+), 11 deletions(-)

--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1122,6 +1122,12 @@ enum scx_dsq_iter_flags {
 };
 
 struct bpf_iter_scx_dsq_kern {
+	/*
+	 * Must be the first field. Used to work around BPF restriction and pass
+	 * in the iterator pointer to scx_bpf_consume_task().
+	 */
+	struct bpf_iter_scx_dsq_kern	*self;
+
 	struct scx_dsq_node		cursor;
 	struct scx_dispatch_q		*dsq;
 	u32				dsq_seq;
@@ -1518,7 +1524,7 @@ static void dispatch_enqueue(struct scx_
 	p->scx.dsq_seq = dsq->seq;
 
 	dsq_mod_nr(dsq, 1);
-	p->scx.dsq = dsq;
+	WRITE_ONCE(p->scx.dsq, dsq);
 
 	/*
 	 * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the
@@ -1611,7 +1617,7 @@ static void dispatch_dequeue(struct rq *
 		WARN_ON_ONCE(task_linked_on_dsq(p));
 		p->scx.holding_cpu = -1;
 	}
-	p->scx.dsq = NULL;
+	WRITE_ONCE(p->scx.dsq, NULL);
 
 	if (!is_local)
 		raw_spin_unlock(&dsq->lock);
@@ -2107,7 +2113,7 @@ static void consume_local_task(struct rq
 	list_add_tail(&p->scx.dsq_node.list, &rq->scx.local_dsq.list);
 	dsq_mod_nr(dsq, -1);
 	dsq_mod_nr(&rq->scx.local_dsq, 1);
-	p->scx.dsq = &rq->scx.local_dsq;
+	WRITE_ONCE(p->scx.dsq, &rq->scx.local_dsq);
 	raw_spin_unlock(&dsq->lock);
 }
 
@@ -5585,12 +5591,88 @@ __bpf_kfunc bool scx_bpf_consume(u64 dsq
 	}
 }
 
+/**
+ * __scx_bpf_consume_task - Transfer a task from DSQ iteration to the local DSQ
+ * @it: DSQ iterator in progress
+ * @p: task to consume
+ *
+ * Transfer @p which is on the DSQ currently iterated by @it to the current
+ * CPU's local DSQ. For the transfer to be successful, @p must still be on the
+ * DSQ and have been queued before the DSQ iteration started. This function
+ * doesn't care whether @p was obtained from the DSQ iteration. @p just has to
+ * be on the DSQ and have been queued before the iteration started.
+ *
+ * Returns %true if @p has been consumed, %false if @p had already been consumed
+ * or dequeued.
+ */
+__bpf_kfunc bool __scx_bpf_consume_task(unsigned long it, struct task_struct *p)
+{
+	struct bpf_iter_scx_dsq_kern *kit = (void *)it;
+	struct scx_dispatch_q *dsq, *kit_dsq;
+	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+	struct rq *task_rq;
+	u64 kit_dsq_seq;
+
+	/* can't trust @kit, carefully fetch the values we need */
+	if (get_kernel_nofault(kit_dsq, &kit->dsq) ||
+	    get_kernel_nofault(kit_dsq_seq, &kit->dsq_seq)) {
+		scx_ops_error("invalid @it 0x%lx", it);
+		return false;
+	}
+
+	/*
+	 * @kit can't be trusted and we can only get the DSQ from @p. As we
+	 * don't know @p's rq is locked, use READ_ONCE() to access the field.
+	 * Derefing is safe as DSQs are RCU protected.
+	 */
+	dsq = READ_ONCE(p->scx.dsq);
+
+	if (unlikely(!dsq || dsq != kit_dsq))
+		return false;
+
+	if (unlikely(dsq->id == SCX_DSQ_LOCAL)) {
+		scx_ops_error("local DSQ not allowed");
+		return false;
+	}
+
+	if (!scx_kf_allowed(SCX_KF_DISPATCH))
+		return false;
+
+	flush_dispatch_buf(dspc->rq, dspc->rf);
+
+	raw_spin_lock(&dsq->lock);
+
+	/*
+	 * Did someone else get to it? @p could have already left $dsq, got
+	 * re-enqueud, or be in the process of being consumed by someone else.
+	 */
+	if (unlikely(p->scx.dsq != dsq ||
+		     time_after64(p->scx.dsq_seq, kit_dsq_seq) ||
+		     p->scx.holding_cpu >= 0))
+		goto out_unlock;
+
+	task_rq = task_rq(p);
+
+	if (dspc->rq == task_rq) {
+		consume_local_task(dspc->rq, dsq, p);
+		return true;
+	}
+
+	if (task_can_run_on_remote_rq(p, dspc->rq))
+		return consume_remote_task(dspc->rq, dspc->rf, dsq, p, task_rq);
+
+out_unlock:
+	raw_spin_unlock(&dsq->lock);
+	return false;
+}
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
 BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
 BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
 BTF_ID_FLAGS(func, scx_bpf_consume)
+BTF_ID_FLAGS(func, __scx_bpf_consume_task)
 BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
 
 static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
@@ -5797,6 +5879,7 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(str
 	INIT_LIST_HEAD(&kit->cursor.list);
 	RB_CLEAR_NODE(&kit->cursor.priq);
 	kit->cursor.flags = SCX_TASK_DSQ_CURSOR;
+	kit->self = kit;
 	kit->dsq_seq = READ_ONCE(kit->dsq->seq);
 	kit->flags = flags;
 
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -35,6 +35,7 @@ void scx_bpf_dispatch_vtime(struct task_
 u32 scx_bpf_dispatch_nr_slots(void) __ksym;
 void scx_bpf_dispatch_cancel(void) __ksym;
 bool scx_bpf_consume(u64 dsq_id) __ksym;
+bool __scx_bpf_consume_task(unsigned long it, struct task_struct *p) __ksym __weak;
 u32 scx_bpf_reenqueue_local(void) __ksym;
 void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
 s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
@@ -61,6 +62,21 @@ s32 scx_bpf_pick_any_cpu(const cpumask_t
 bool scx_bpf_task_running(const struct task_struct *p) __ksym;
 s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
 
+/*
+ * Use the following as @it when calling scx_bpf_consume_task() from whitin
+ * bpf_for_each() loops.
+ */
+#define BPF_FOR_EACH_ITER	(&___it)
+
+/* hopefully temporary wrapper to work around BPF restriction */
+static inline bool scx_bpf_consume_task(struct bpf_iter_scx_dsq *it,
+					struct task_struct *p)
+{
+	unsigned long ptr;
+	bpf_probe_read_kernel(&ptr, sizeof(ptr), it);
+	return __scx_bpf_consume_task(ptr, p);
+}
+
 static inline __attribute__((format(printf, 1, 2)))
 void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {}
 
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -23,6 +23,7 @@
  * Copyright (c) 2022 David Vernet <dvernet@meta.com>
  */
 #include <scx/common.bpf.h>
+#include <string.h>
 
 enum consts {
 	ONE_SEC_IN_NS		= 1000000000,
@@ -37,6 +38,7 @@ const volatile u32 stall_kernel_nth;
 const volatile u32 dsp_inf_loop_after;
 const volatile u32 dsp_batch;
 const volatile bool print_shared_dsq;
+const volatile u64 exp_cgid;
 const volatile s32 disallow_tgid;
 const volatile bool suppress_dump;
 
@@ -121,7 +123,7 @@ struct {
 
 /* Statistics */
 u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued;
-u64 nr_core_sched_execed;
+u64 nr_core_sched_execed, nr_expedited;
 u32 cpuperf_min, cpuperf_avg, cpuperf_max;
 u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
 
@@ -260,6 +262,32 @@ static void update_core_sched_head_seq(s
 		scx_bpf_error("task_ctx lookup failed");
 }
 
+static bool consume_shared_dsq(void)
+{
+	struct task_struct *p;
+	bool consumed;
+
+	if (!exp_cgid)
+		return scx_bpf_consume(SHARED_DSQ);
+
+	/*
+	 * To demonstrate the use of scx_bpf_consume_task(), implement silly
+	 * selective priority boosting mechanism by scanning SHARED_DSQ looking
+	 * for matching comms and consume them first. This makes difference only
+	 * when dsp_batch is larger than 1.
+	 */
+	consumed = false;
+	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
+		if (p->cgroups->dfl_cgrp->kn->id == exp_cgid &&
+		    scx_bpf_consume_task(BPF_FOR_EACH_ITER, p)) {
+			consumed = true;
+			__sync_fetch_and_add(&nr_expedited, 1);
+		}
+	}
+
+	return consumed || scx_bpf_consume(SHARED_DSQ);
+}
+
 void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 {
 	struct task_struct *p;
@@ -268,7 +296,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 c
 	void *fifo;
 	s32 i, pid;
 
-	if (scx_bpf_consume(SHARED_DSQ))
+	if (consume_shared_dsq())
 		return;
 
 	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
@@ -319,7 +347,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 c
 			batch--;
 			cpuc->dsp_cnt--;
 			if (!batch || !scx_bpf_dispatch_nr_slots()) {
-				scx_bpf_consume(SHARED_DSQ);
+				consume_shared_dsq();
 				return;
 			}
 			if (!cpuc->dsp_cnt)
--- a/tools/sched_ext/scx_qmap.c
+++ b/tools/sched_ext/scx_qmap.c
@@ -20,7 +20,7 @@ const char help_fmt[] =
 "See the top-level comment in .bpf.c for more details.\n"
 "\n"
 "Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n"
-"       [-P] [-d PID] [-D LEN] [-p] [-v]\n"
+"       [-P] [-E PREFIX] [-d PID] [-D LEN] [-p] [-v]\n"
 "\n"
 "  -s SLICE_US   Override slice duration\n"
 "  -e COUNT      Trigger scx_bpf_error() after COUNT enqueues\n"
@@ -29,10 +29,11 @@ const char help_fmt[] =
 "  -l COUNT      Trigger dispatch infinite looping after COUNT dispatches\n"
 "  -b COUNT      Dispatch upto COUNT tasks together\n"
 "  -P            Print out DSQ content to trace_pipe every second, use with -b\n"
+"  -E CGID       Expedite consumption of threads in a cgroup, use with -b\n"
 "  -d PID        Disallow a process from switching into SCHED_EXT (-1 for self)\n"
 "  -D LEN        Set scx_exit_info.dump buffer length\n"
 "  -S            Suppress qmap-specific debug dump\n"
-"  -p            Switch only tasks on SCHED_EXT policy instead of all\n"
+"  -p            Switch only tasks on SCHED_EXT policy intead of all\n"
 "  -v            Print libbpf debug messages\n"
 "  -h            Display this help and exit\n";
 
@@ -63,7 +64,7 @@ int main(int argc, char **argv)
 
 	skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
 
-	while ((opt = getopt(argc, argv, "s:e:t:T:l:b:Pd:D:Spvh")) != -1) {
+	while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PE:d:D:Spvh")) != -1) {
 		switch (opt) {
 		case 's':
 			skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@@ -86,6 +87,9 @@ int main(int argc, char **argv)
 		case 'P':
 			skel->rodata->print_shared_dsq = true;
 			break;
+		case 'E':
+			skel->rodata->exp_cgid = strtoull(optarg, NULL, 0);
+			break;
 		case 'd':
 			skel->rodata->disallow_tgid = strtol(optarg, NULL, 0);
 			if (skel->rodata->disallow_tgid < 0)
@@ -116,10 +120,10 @@ int main(int argc, char **argv)
 		long nr_enqueued = skel->bss->nr_enqueued;
 		long nr_dispatched = skel->bss->nr_dispatched;
 
-		printf("stats  : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64" core=%"PRIu64"\n",
+		printf("stats  : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64" core=%"PRIu64" exp=%"PRIu64"\n",
 		       nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched,
 		       skel->bss->nr_reenqueued, skel->bss->nr_dequeued,
-		       skel->bss->nr_core_sched_execed);
+		       skel->bss->nr_core_sched_execed, skel->bss->nr_expedited);
 		if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur"))
 			printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n",
 			       skel->bss->cpuperf_min,

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH sched_ext/for-6.11 1/2] sched_ext: Implement DSQ iterator
  2024-06-28  0:20 [PATCH sched_ext/for-6.11 1/2] sched_ext: Implement DSQ iterator Tejun Heo
  2024-06-28  0:24 ` [PATCH sched_ext/for-6.11 2/2] sched_ext: Implement scx_bpf_consume_task() Tejun Heo
@ 2024-06-28  1:11 ` Alexei Starovoitov
  2024-06-28 22:14   ` Tejun Heo
  1 sibling, 1 reply; 12+ messages in thread
From: Alexei Starovoitov @ 2024-06-28  1:11 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Alexei Starovoitov, LKML, bpf, David Vernet

On Thu, Jun 27, 2024 at 5:20 PM Tejun Heo <tj@kernel.org> wrote:
>
> DSQs are very opaque in the consumption path. The BPF scheduler has no way
> of knowing which tasks are being considered and which is picked. This patch
> adds BPF DSQ iterator.
>
> - Allows iterating tasks queued on a DSQ in the dispatch order or reverse
>   from anywhere using bpf_for_each(scx_dsq) or calling the iterator kfuncs
>   directly.
>
> - Has ordering guarantee where only tasks which were already queued when the
>   iteration started are visible and consumable during the iteration.
>
> scx_qmap is updated to implement periodic dumping of the shared DSQ.
>
> v2: - scx_bpf_consume_task() is separated out into a separate patch.
>
>     - DSQ seq and iter flags don't need to be u64. Use u32.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Reviewed-by: David Vernet <dvernet@meta.com>
> Cc: Alexei Starovoitov <ast@kernel.org>
> Cc: bpf@vger.kernel.org
> ---
> Hello, Alexei.
>
> These two patches implement inline iterator for a task queue data structure
> that's used by sched_ext. The first one implements the iterator itself. It's
> pretty straightforward and seems to work fine. The second one implements a
> kfunc which consumes a task while iterating. This one is a bit nasty
> unfortunately. I'll continue on the second patch.
>
> Thanks.
>
>  include/linux/sched/ext.h                |    4
>  kernel/sched/ext.c                       |  182 ++++++++++++++++++++++++++++++-
>  tools/sched_ext/include/scx/common.bpf.h |    3
>  tools/sched_ext/scx_qmap.bpf.c           |   25 ++++
>  tools/sched_ext/scx_qmap.c               |    8 +
>  5 files changed, 218 insertions(+), 4 deletions(-)
>
> --- a/include/linux/sched/ext.h
> +++ b/include/linux/sched/ext.h
> @@ -61,6 +61,7 @@ struct scx_dispatch_q {
>         struct list_head        list;   /* tasks in dispatch order */
>         struct rb_root          priq;   /* used to order by p->scx.dsq_vtime */
>         u32                     nr;
> +       u32                     seq;    /* used by BPF iter */
>         u64                     id;
>         struct rhash_head       hash_node;
>         struct llist_node       free_node;
> @@ -94,6 +95,8 @@ enum scx_task_state {
>  /* scx_entity.dsq_flags */
>  enum scx_ent_dsq_flags {
>         SCX_TASK_DSQ_ON_PRIQ    = 1 << 0, /* task is queued on the priority queue of a dsq */
> +
> +       SCX_TASK_DSQ_CURSOR     = 1 << 31, /* iteration cursor, not a task */
>  };
>
>  /*
> @@ -134,6 +137,7 @@ struct scx_dsq_node {
>  struct sched_ext_entity {
>         struct scx_dispatch_q   *dsq;
>         struct scx_dsq_node     dsq_node;       /* protected by dsq lock */
> +       u32                     dsq_seq;
>         u32                     flags;          /* protected by rq lock */
>         u32                     weight;
>         s32                     sticky_cpu;
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -1066,6 +1066,72 @@ static __always_inline bool scx_kf_allow
>         return true;
>  }
>
> +/**
> + * nldsq_next_task - Iterate to the next task in a non-local DSQ
> + * @dsq: user dsq being interated
> + * @cur: current position, %NULL to start iteration
> + * @rev: walk backwards
> + *
> + * Returns %NULL when iteration is finished.
> + */
> +static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq,
> +                                          struct task_struct *cur, bool rev)
> +{
> +       struct list_head *list_node;
> +       struct scx_dsq_node *dsq_node;
> +
> +       lockdep_assert_held(&dsq->lock);
> +
> +       if (cur)
> +               list_node = &cur->scx.dsq_node.list;
> +       else
> +               list_node = &dsq->list;
> +
> +       /* find the next task, need to skip BPF iteration cursors */
> +       do {
> +               if (rev)
> +                       list_node = list_node->prev;
> +               else
> +                       list_node = list_node->next;
> +
> +               if (list_node == &dsq->list)
> +                       return NULL;
> +
> +               dsq_node = container_of(list_node, struct scx_dsq_node, list);
> +       } while (dsq_node->flags & SCX_TASK_DSQ_CURSOR);
> +
> +       return container_of(dsq_node, struct task_struct, scx.dsq_node);
> +}
> +
> +#define nldsq_for_each_task(p, dsq)                                            \
> +       for ((p) = nldsq_next_task((dsq), NULL, false); (p);                    \
> +            (p) = nldsq_next_task((dsq), (p), false))
> +
> +
> +/*
> + * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse]
> + * dispatch order. BPF-visible iterator is opaque and larger to allow future
> + * changes without breaking backward compatibility. Can be used with
> + * bpf_for_each(). See bpf_iter_scx_dsq_*().
> + */
> +enum scx_dsq_iter_flags {
> +       /* iterate in the reverse dispatch order */
> +       SCX_DSQ_ITER_REV                = 1U << 0,
> +
> +       __SCX_DSQ_ITER_ALL_FLAGS        = SCX_DSQ_ITER_REV,
> +};
> +
> +struct bpf_iter_scx_dsq_kern {
> +       struct scx_dsq_node             cursor;
> +       struct scx_dispatch_q           *dsq;
> +       u32                             dsq_seq;
> +       u32                             flags;
> +} __attribute__((aligned(8)));
> +
> +struct bpf_iter_scx_dsq {
> +       u64                             __opaque[12];
> +} __attribute__((aligned(8)));

I think this is a bit too much to put on the prog stack.
Folks are working on increasing this limit and moving
the stack into "divided stack", so it won't be an issue eventually,
but let's find a way to reduce it.
It seems to me scx_dsq_node has a bunch of fields,
but if I'm reading the code correctly this patch is
only using cursor.list part of it ?

Another alternative is to use bpf_mem_alloc() like we do
in bpf_iter_css_task and others?

> +
>
>  /*
>   * SCX task iterator.
> @@ -1415,7 +1481,7 @@ static void dispatch_enqueue(struct scx_
>                  * tested easily when adding the first task.
>                  */
>                 if (unlikely(RB_EMPTY_ROOT(&dsq->priq) &&
> -                            !list_empty(&dsq->list)))
> +                            nldsq_next_task(dsq, NULL, false)))
>                         scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks",
>                                       dsq->id);
>
> @@ -1447,6 +1513,10 @@ static void dispatch_enqueue(struct scx_
>                         list_add_tail(&p->scx.dsq_node.list, &dsq->list);
>         }
>
> +       /* seq records the order tasks are queued, used by BPF DSQ iterator */
> +       dsq->seq++;
> +       p->scx.dsq_seq = dsq->seq;
> +
>         dsq_mod_nr(dsq, 1);
>         p->scx.dsq = dsq;
>
> @@ -2109,7 +2179,7 @@ retry:
>
>         raw_spin_lock(&dsq->lock);
>
> -       list_for_each_entry(p, &dsq->list, scx.dsq_node.list) {
> +       nldsq_for_each_task(p, dsq) {
>                 struct rq *task_rq = task_rq(p);
>
>                 if (rq == task_rq) {
> @@ -5697,6 +5767,111 @@ __bpf_kfunc void scx_bpf_destroy_dsq(u64
>         destroy_dsq(dsq_id);
>  }
>
> +/**
> + * bpf_iter_scx_dsq_new - Create a DSQ iterator
> + * @it: iterator to initialize
> + * @dsq_id: DSQ to iterate
> + * @flags: %SCX_DSQ_ITER_*
> + *
> + * Initialize BPF iterator @it which can be used with bpf_for_each() to walk
> + * tasks in the DSQ specified by @dsq_id. Iteration using @it only includes
> + * tasks which are already queued when this function is invoked.
> + */
> +__bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
> +                                    u64 flags)
> +{
> +       struct bpf_iter_scx_dsq_kern *kit = (void *)it;
> +
> +       BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) >
> +                    sizeof(struct bpf_iter_scx_dsq));
> +       BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) !=
> +                    __alignof__(struct bpf_iter_scx_dsq));
> +
> +       if (flags & ~__SCX_DSQ_ITER_ALL_FLAGS)
> +               return -EINVAL;
> +
> +       kit->dsq = find_non_local_dsq(dsq_id);
> +       if (!kit->dsq)
> +               return -ENOENT;
> +
> +       INIT_LIST_HEAD(&kit->cursor.list);
> +       RB_CLEAR_NODE(&kit->cursor.priq);
> +       kit->cursor.flags = SCX_TASK_DSQ_CURSOR;

Are these two assignments really necessary?
Something inside nldsq_next_task() is using that?


> +       kit->dsq_seq = READ_ONCE(kit->dsq->seq);
> +       kit->flags = flags;
> +
> +       return 0;
> +}

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH sched_ext/for-6.11 2/2] sched_ext: Implement scx_bpf_consume_task()
  2024-06-28  0:24 ` [PATCH sched_ext/for-6.11 2/2] sched_ext: Implement scx_bpf_consume_task() Tejun Heo
@ 2024-06-28  1:34   ` Alexei Starovoitov
  2024-06-28 21:57     ` Tejun Heo
  0 siblings, 1 reply; 12+ messages in thread
From: Alexei Starovoitov @ 2024-06-28  1:34 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Alexei Starovoitov, LKML, bpf, David Vernet

On Thu, Jun 27, 2024 at 5:24 PM Tejun Heo <tj@kernel.org> wrote:
>
> Implement scx_bpf_consume_task() which allows consuming arbitrary tasks on
> the DSQ in any order while iterating in the dispatch path.
>
> scx_qmap is updated to implement periodic dumping of the shared DSQ and a
> rather silly prioritization mechanism to demonstrate the use of DSQ
> iteration and selective consumption.
>
> Note that it does a bit of nastry dance to pass in the pointer to the
> iterator to __scx_bpf_consume_task(). This is to work around the current
> limitation in the BPF verifier where it doesn't allow the memory area used
> for an iterator to be passed into kfuncs. This may be too nasty and might
> require a different approach.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Reviewed-by: David Vernet <dvernet@meta.com>
> Cc: Alexei Starovoitov <ast@kernel.org>
> Cc: bpf@vger.kernel.org
> ---
> Hello, again.
>
> (continuing from the previous patch) so, the problem is that I need to
> distinguish the tasks which have left a queue and then get requeued while an
> iteration is in progress. The iterator itself already does this - it
> remembers a sequence number when iteration starts and ignores tasks which
> are queued afterwards.
>
> As a task can get removed and requeued anytime, I need
> scx_bpf_consume_task() to do the same testing, so I want to pass in the
> iterator pointer into scx_bpf_consume_task() so that it can read the
> sequence number stored in the iterator. However, BPF doesn't allow this, so
> I'm doing the weird self pointer probe read thing, to obtain it, which is
> quite nasty.
>
> What do you think?
>
> Thanks.
>
>  kernel/sched/ext.c                       |   89 +++++++++++++++++++++++++++++--
>  tools/sched_ext/include/scx/common.bpf.h |   16 +++++
>  tools/sched_ext/scx_qmap.bpf.c           |   34 ++++++++++-
>  tools/sched_ext/scx_qmap.c               |   14 +++-
>  4 files changed, 142 insertions(+), 11 deletions(-)
>
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -1122,6 +1122,12 @@ enum scx_dsq_iter_flags {
>  };
>
>  struct bpf_iter_scx_dsq_kern {
> +       /*
> +        * Must be the first field. Used to work around BPF restriction and pass
> +        * in the iterator pointer to scx_bpf_consume_task().
> +        */
> +       struct bpf_iter_scx_dsq_kern    *self;
> +
>         struct scx_dsq_node             cursor;
>         struct scx_dispatch_q           *dsq;
>         u32                             dsq_seq;
> @@ -1518,7 +1524,7 @@ static void dispatch_enqueue(struct scx_
>         p->scx.dsq_seq = dsq->seq;
>
>         dsq_mod_nr(dsq, 1);
> -       p->scx.dsq = dsq;
> +       WRITE_ONCE(p->scx.dsq, dsq);
>
>         /*
>          * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the
> @@ -1611,7 +1617,7 @@ static void dispatch_dequeue(struct rq *
>                 WARN_ON_ONCE(task_linked_on_dsq(p));
>                 p->scx.holding_cpu = -1;
>         }
> -       p->scx.dsq = NULL;
> +       WRITE_ONCE(p->scx.dsq, NULL);
>
>         if (!is_local)
>                 raw_spin_unlock(&dsq->lock);
> @@ -2107,7 +2113,7 @@ static void consume_local_task(struct rq
>         list_add_tail(&p->scx.dsq_node.list, &rq->scx.local_dsq.list);
>         dsq_mod_nr(dsq, -1);
>         dsq_mod_nr(&rq->scx.local_dsq, 1);
> -       p->scx.dsq = &rq->scx.local_dsq;
> +       WRITE_ONCE(p->scx.dsq, &rq->scx.local_dsq);
>         raw_spin_unlock(&dsq->lock);
>  }
>
> @@ -5585,12 +5591,88 @@ __bpf_kfunc bool scx_bpf_consume(u64 dsq
>         }
>  }
>
> +/**
> + * __scx_bpf_consume_task - Transfer a task from DSQ iteration to the local DSQ
> + * @it: DSQ iterator in progress
> + * @p: task to consume
> + *
> + * Transfer @p which is on the DSQ currently iterated by @it to the current
> + * CPU's local DSQ. For the transfer to be successful, @p must still be on the
> + * DSQ and have been queued before the DSQ iteration started. This function
> + * doesn't care whether @p was obtained from the DSQ iteration. @p just has to
> + * be on the DSQ and have been queued before the iteration started.
> + *
> + * Returns %true if @p has been consumed, %false if @p had already been consumed
> + * or dequeued.
> + */
> +__bpf_kfunc bool __scx_bpf_consume_task(unsigned long it, struct task_struct *p)
> +{
> +       struct bpf_iter_scx_dsq_kern *kit = (void *)it;
> +       struct scx_dispatch_q *dsq, *kit_dsq;
> +       struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
> +       struct rq *task_rq;
> +       u64 kit_dsq_seq;
> +
> +       /* can't trust @kit, carefully fetch the values we need */
> +       if (get_kernel_nofault(kit_dsq, &kit->dsq) ||
> +           get_kernel_nofault(kit_dsq_seq, &kit->dsq_seq)) {
> +               scx_ops_error("invalid @it 0x%lx", it);
> +               return false;
> +       }

With scx_bpf_consume_task() it's only a compile time protection from bugs.
Since kfunc doesn't dereference any field in kit_dsq it won't crash
immediately, but let's figure out how to make it work properly.

Since kit_dsq and kit_dsq_seq are pretty much anything in this implementation
can they be passed as two scalars instead ?
I guess not, since tricking dsq != kit_dsq and
time_after64(..,kit_dsq_seq) can lead to real issues ?

Can some of it be mitigated by passing dsq into kfunc that
was used to init the iter ?
Then kfunc will read dsq->seq from it instead of kit->dsq_seq ?

> +
> +       /*
> +        * @kit can't be trusted and we can only get the DSQ from @p. As we
> +        * don't know @p's rq is locked, use READ_ONCE() to access the field.
> +        * Derefing is safe as DSQs are RCU protected.
> +        */
> +       dsq = READ_ONCE(p->scx.dsq);
> +
> +       if (unlikely(!dsq || dsq != kit_dsq))
> +               return false;
> +
> +       if (unlikely(dsq->id == SCX_DSQ_LOCAL)) {
> +               scx_ops_error("local DSQ not allowed");
> +               return false;
> +       }
> +
> +       if (!scx_kf_allowed(SCX_KF_DISPATCH))
> +               return false;
> +
> +       flush_dispatch_buf(dspc->rq, dspc->rf);
> +
> +       raw_spin_lock(&dsq->lock);
> +
> +       /*
> +        * Did someone else get to it? @p could have already left $dsq, got
> +        * re-enqueud, or be in the process of being consumed by someone else.
> +        */
> +       if (unlikely(p->scx.dsq != dsq ||
> +                    time_after64(p->scx.dsq_seq, kit_dsq_seq) ||

In the previous patch you do:
(s32)(p->scx.dsq_seq - kit->dsq_seq) > 0
and here
time_after64().
Close enough, but 32 vs 64 and equality difference?

> +                    p->scx.holding_cpu >= 0))
> +               goto out_unlock;
> +
> +       task_rq = task_rq(p);
> +
> +       if (dspc->rq == task_rq) {
> +               consume_local_task(dspc->rq, dsq, p);
> +               return true;
> +       }
> +
> +       if (task_can_run_on_remote_rq(p, dspc->rq))
> +               return consume_remote_task(dspc->rq, dspc->rf, dsq, p, task_rq);
> +
> +out_unlock:
> +       raw_spin_unlock(&dsq->lock);
> +       return false;
> +}
> +
>  __bpf_kfunc_end_defs();

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH sched_ext/for-6.11 2/2] sched_ext: Implement scx_bpf_consume_task()
  2024-06-28  1:34   ` Alexei Starovoitov
@ 2024-06-28 21:57     ` Tejun Heo
  2024-06-28 22:34       ` Andrii Nakryiko
  0 siblings, 1 reply; 12+ messages in thread
From: Tejun Heo @ 2024-06-28 21:57 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: Alexei Starovoitov, LKML, bpf, David Vernet

Hello, Alexei.

On Thu, Jun 27, 2024 at 06:34:14PM -0700, Alexei Starovoitov wrote:
...
> > +__bpf_kfunc bool __scx_bpf_consume_task(unsigned long it, struct task_struct *p)
> > +{
> > +       struct bpf_iter_scx_dsq_kern *kit = (void *)it;
> > +       struct scx_dispatch_q *dsq, *kit_dsq;
> > +       struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
> > +       struct rq *task_rq;
> > +       u64 kit_dsq_seq;
> > +
> > +       /* can't trust @kit, carefully fetch the values we need */
> > +       if (get_kernel_nofault(kit_dsq, &kit->dsq) ||
> > +           get_kernel_nofault(kit_dsq_seq, &kit->dsq_seq)) {
> > +               scx_ops_error("invalid @it 0x%lx", it);
> > +               return false;
> > +       }
> 
> With scx_bpf_consume_task() it's only a compile time protection from bugs.
> Since kfunc doesn't dereference any field in kit_dsq it won't crash
> immediately, but let's figure out how to make it work properly.
> 
> Since kit_dsq and kit_dsq_seq are pretty much anything in this implementation
> can they be passed as two scalars instead ?
> I guess not, since tricking dsq != kit_dsq and
> time_after64(..,kit_dsq_seq) can lead to real issues ?

That actually should be okay. It can lead to real but not crashing issues.
The system integrity is going to be fine no matter what the passed in seq
value is. It can just lead to confusing behaviors from the BPF scheduler's
POV, so it's fine to put the onus on the BPF scheduler.

> Can some of it be mitigated by passing dsq into kfunc that
> was used to init the iter ?
> Then kfunc will read dsq->seq from it instead of kit->dsq_seq ?

I don't quite follow this part. bpf_iter_scx_dsq_new() takes @dsq_id. The
function looks up the matching DSQ and then the iterator remembers the
current dsq->seq which serves as the threshold (tasks queued afterwards are
ignored). ie. The value needs to be copied at that point to guarantee that
iteration ignores tasks that are queued after the iteration started.

> > +       /*
> > +        * Did someone else get to it? @p could have already left $dsq, got
> > +        * re-enqueud, or be in the process of being consumed by someone else.
> > +        */
> > +       if (unlikely(p->scx.dsq != dsq ||
> > +                    time_after64(p->scx.dsq_seq, kit_dsq_seq) ||
> 
> In the previous patch you do:
> (s32)(p->scx.dsq_seq - kit->dsq_seq) > 0
> and here
> time_after64().
> Close enough, but 32 vs 64 and equality difference?

Sorry about the sloppiness. It was originally u64 and then I forgot to
update here after changing them to u32. I'll add a helper for the comparison
and update both sites.

Going back to the sequence number barrier, it's a sort of scoping and one
way to solve it is adding an explicit helper to fetch the target DSQ's
sequence number and then pass it to the consume_task function. ie. sth like:

	barrier_seq = scx_bpf_dsq_seq(dsq_id);
	bpf_for_each(scx_dsq, p, dsq_id, 0) {
		...
		scx_bpf_consume_task(p, dsq_id, barrier_seq);
	}

This should work but it's not as neat in that it now involves three dsq_id
-> DSQ lookups. Also, there's extra subtlety arising from @barrier_seq being
different from the barrier seq that the scx_dsq iterator would be using.

As a DSQ iteration needs to have its own barrier sequence, maybe the answer
is to require passing it in as an explicit parameter. ie.:

	barrier_seq = scx_bpf_dsq_seq(dsq_id);
	bpf_for_each(scx_dsq, p, dsq_id, barrier_seq, 0) {
		...
		scx_bpf_consume_task(p, dsq_id, barrier_seq);
	}

There still are three dsq_id lookups but at least there is just one sequence
number in play. It is more cumbersome tho compared to the current interface:

	bpf_for_each(scx_dsq, p, dsq_id, 0) {
		...
		scx_bpf_consume_task(BPF_FOR_EACH_ITER, p);
	}

What do you think?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH sched_ext/for-6.11 1/2] sched_ext: Implement DSQ iterator
  2024-06-28  1:11 ` [PATCH sched_ext/for-6.11 1/2] sched_ext: Implement DSQ iterator Alexei Starovoitov
@ 2024-06-28 22:14   ` Tejun Heo
  0 siblings, 0 replies; 12+ messages in thread
From: Tejun Heo @ 2024-06-28 22:14 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: Alexei Starovoitov, LKML, bpf, David Vernet

Hello,

On Thu, Jun 27, 2024 at 06:11:48PM -0700, Alexei Starovoitov wrote:
> > +struct bpf_iter_scx_dsq_kern {
> > +       struct scx_dsq_node             cursor;
> > +       struct scx_dispatch_q           *dsq;
> > +       u32                             dsq_seq;
> > +       u32                             flags;
> > +} __attribute__((aligned(8)));
> > +
> > +struct bpf_iter_scx_dsq {
> > +       u64                             __opaque[12];
> > +} __attribute__((aligned(8)));
> 
> I think this is a bit too much to put on the prog stack.
> Folks are working on increasing this limit and moving
> the stack into "divided stack", so it won't be an issue eventually,
> but let's find a way to reduce it.

Yeah, it is kinda big. Do you have some idea on where the boundary between
okay and too big would fall on?

> It seems to me scx_dsq_node has a bunch of fields,
> but if I'm reading the code correctly this patch is
> only using cursor.list part of it ?

Great point. Cursors used to have to go on the rbtrees too but that's no
longer the case, so I should be able to drop the rbnode which should help
reducing the size substantially. I'll see what I can do.

> Another alternative is to use bpf_mem_alloc() like we do
> in bpf_iter_css_task and others?

This might be okay but given that this can be used pretty frequently (e.g.
every scheduling event) and it *seems* possible to reduce its size
substantially, I'd like to keep it on stack if possible.

> > +__bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
> > +                                    u64 flags)
> > +{
> > +       struct bpf_iter_scx_dsq_kern *kit = (void *)it;
> > +
> > +       BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) >
> > +                    sizeof(struct bpf_iter_scx_dsq));
> > +       BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) !=
> > +                    __alignof__(struct bpf_iter_scx_dsq));
> > +
> > +       if (flags & ~__SCX_DSQ_ITER_ALL_FLAGS)
> > +               return -EINVAL;
> > +
> > +       kit->dsq = find_non_local_dsq(dsq_id);
> > +       if (!kit->dsq)
> > +               return -ENOENT;
> > +
> > +       INIT_LIST_HEAD(&kit->cursor.list);
> > +       RB_CLEAR_NODE(&kit->cursor.priq);
> > +       kit->cursor.flags = SCX_TASK_DSQ_CURSOR;
> 
> Are these two assignments really necessary?
> Something inside nldsq_next_task() is using that?
> 
> > +       kit->dsq_seq = READ_ONCE(kit->dsq->seq);
> > +       kit->flags = flags;

I'm a bit confused whether you're referring to the statements above or
below, but AFAICS, they're all used except for kit->cursor.priq.

- SCX_TASK_DSQ_CURSOR assignment is what tells nldsq_next_task() that the
  node is a cursor, not a real task, and thus should be skipped for internal
  iterations.

- kit->dsq_seq is used by bpf_iter_scx_dsq_next() to ignore tasks that are
  queued after the iteration has started. This, among other things,
  guarantees that p->scx.dsq_vtime increases monotonically throughout
  iteration.

- kit->flags carries SCX_DSQ_ITER_REV which tells bpf_iter_scx_dsq_next()
  the direction of the iteration.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH sched_ext/for-6.11 2/2] sched_ext: Implement scx_bpf_consume_task()
  2024-06-28 21:57     ` Tejun Heo
@ 2024-06-28 22:34       ` Andrii Nakryiko
  2024-06-28 23:04         ` Tejun Heo
  0 siblings, 1 reply; 12+ messages in thread
From: Andrii Nakryiko @ 2024-06-28 22:34 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Alexei Starovoitov, Alexei Starovoitov, LKML, bpf, David Vernet

On Fri, Jun 28, 2024 at 3:08 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello, Alexei.
>
> On Thu, Jun 27, 2024 at 06:34:14PM -0700, Alexei Starovoitov wrote:
> ...
> > > +__bpf_kfunc bool __scx_bpf_consume_task(unsigned long it, struct task_struct *p)
> > > +{
> > > +       struct bpf_iter_scx_dsq_kern *kit = (void *)it;
> > > +       struct scx_dispatch_q *dsq, *kit_dsq;
> > > +       struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
> > > +       struct rq *task_rq;
> > > +       u64 kit_dsq_seq;
> > > +
> > > +       /* can't trust @kit, carefully fetch the values we need */
> > > +       if (get_kernel_nofault(kit_dsq, &kit->dsq) ||
> > > +           get_kernel_nofault(kit_dsq_seq, &kit->dsq_seq)) {
> > > +               scx_ops_error("invalid @it 0x%lx", it);
> > > +               return false;
> > > +       }
> >
> > With scx_bpf_consume_task() it's only a compile time protection from bugs.
> > Since kfunc doesn't dereference any field in kit_dsq it won't crash
> > immediately, but let's figure out how to make it work properly.
> >
> > Since kit_dsq and kit_dsq_seq are pretty much anything in this implementation
> > can they be passed as two scalars instead ?
> > I guess not, since tricking dsq != kit_dsq and
> > time_after64(..,kit_dsq_seq) can lead to real issues ?
>
> That actually should be okay. It can lead to real but not crashing issues.
> The system integrity is going to be fine no matter what the passed in seq
> value is. It can just lead to confusing behaviors from the BPF scheduler's
> POV, so it's fine to put the onus on the BPF scheduler.
>
> > Can some of it be mitigated by passing dsq into kfunc that
> > was used to init the iter ?
> > Then kfunc will read dsq->seq from it instead of kit->dsq_seq ?
>
> I don't quite follow this part. bpf_iter_scx_dsq_new() takes @dsq_id. The
> function looks up the matching DSQ and then the iterator remembers the
> current dsq->seq which serves as the threshold (tasks queued afterwards are
> ignored). ie. The value needs to be copied at that point to guarantee that
> iteration ignores tasks that are queued after the iteration started.
>
> > > +       /*
> > > +        * Did someone else get to it? @p could have already left $dsq, got
> > > +        * re-enqueud, or be in the process of being consumed by someone else.
> > > +        */
> > > +       if (unlikely(p->scx.dsq != dsq ||
> > > +                    time_after64(p->scx.dsq_seq, kit_dsq_seq) ||
> >
> > In the previous patch you do:
> > (s32)(p->scx.dsq_seq - kit->dsq_seq) > 0
> > and here
> > time_after64().
> > Close enough, but 32 vs 64 and equality difference?
>
> Sorry about the sloppiness. It was originally u64 and then I forgot to
> update here after changing them to u32. I'll add a helper for the comparison
> and update both sites.
>
> Going back to the sequence number barrier, it's a sort of scoping and one
> way to solve it is adding an explicit helper to fetch the target DSQ's
> sequence number and then pass it to the consume_task function. ie. sth like:
>
>         barrier_seq = scx_bpf_dsq_seq(dsq_id);
>         bpf_for_each(scx_dsq, p, dsq_id, 0) {
>                 ...
>                 scx_bpf_consume_task(p, dsq_id, barrier_seq);
>         }
>
> This should work but it's not as neat in that it now involves three dsq_id
> -> DSQ lookups. Also, there's extra subtlety arising from @barrier_seq being
> different from the barrier seq that the scx_dsq iterator would be using.

maybe a stupid question, but why scx_dsq iterator cannot accept
scx_dispatch_q pointer directly instead of dsq_id and then doing
lookup? I.e., what if you had a kfunc to do dsq_id -> scx_dispatch_q
lookup (returning PTR_TRUSTED instance), and then you can pass that to
iterator, you can pass that to scx_bpf_consume_task() kfunc.

Technically, you can also have another kfunc accepting scx_dispatch_q
and returning current "timestamp" as some special type (TRUSTED and
all), which will be passed into consume_task() as well.

Is this too explicit in terms of types?

>
> As a DSQ iteration needs to have its own barrier sequence, maybe the answer
> is to require passing it in as an explicit parameter. ie.:
>
>         barrier_seq = scx_bpf_dsq_seq(dsq_id);
>         bpf_for_each(scx_dsq, p, dsq_id, barrier_seq, 0) {
>                 ...
>                 scx_bpf_consume_task(p, dsq_id, barrier_seq);
>         }
>
> There still are three dsq_id lookups but at least there is just one sequence
> number in play. It is more cumbersome tho compared to the current interface:
>
>         bpf_for_each(scx_dsq, p, dsq_id, 0) {
>                 ...
>                 scx_bpf_consume_task(BPF_FOR_EACH_ITER, p);
>         }
>
> What do you think?
>
> Thanks.
>
> --
> tejun
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH sched_ext/for-6.11 2/2] sched_ext: Implement scx_bpf_consume_task()
  2024-06-28 22:34       ` Andrii Nakryiko
@ 2024-06-28 23:04         ` Tejun Heo
  2024-06-28 23:12           ` Tejun Heo
  0 siblings, 1 reply; 12+ messages in thread
From: Tejun Heo @ 2024-06-28 23:04 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Alexei Starovoitov, Alexei Starovoitov, LKML, bpf, David Vernet

Hello, Andrii.

On Fri, Jun 28, 2024 at 03:34:01PM -0700, Andrii Nakryiko wrote:
> > This should work but it's not as neat in that it now involves three dsq_id
> > -> DSQ lookups. Also, there's extra subtlety arising from @barrier_seq being
> > different from the barrier seq that the scx_dsq iterator would be using.
> 
> maybe a stupid question, but why scx_dsq iterator cannot accept
> scx_dispatch_q pointer directly instead of dsq_id and then doing
> lookup? I.e., what if you had a kfunc to do dsq_id -> scx_dispatch_q
> lookup (returning PTR_TRUSTED instance), and then you can pass that to
> iterator, you can pass that to scx_bpf_consume_task() kfunc.

Not a stupid question at all. It's just that all the existing interface is
based on IDs. This is partly because there's not much the BPF code can do
with the DSQ data structure and partly because DSQs are usually not accessed
multiple times in sequence (ie. if the BPF code isn't going to look it up
and hold it persistently, it's going to have to look it up each time
anyway).

The multiple lookups aren't the end of the world. They're all on a resizing
hashtable, so lookups should be pretty low cost. It's just a little bit sad
to look at.

> Technically, you can also have another kfunc accepting scx_dispatch_q
> and returning current "timestamp" as some special type (TRUSTED and
> all), which will be passed into consume_task() as well.
> 
> Is this too explicit in terms of types?

That would work fine too and maybe we can make the iter init function to
accept NULL pointer to start its own scope so that users who don't want to
use consume_task() can skip the extra step.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH sched_ext/for-6.11 2/2] sched_ext: Implement scx_bpf_consume_task()
  2024-06-28 23:04         ` Tejun Heo
@ 2024-06-28 23:12           ` Tejun Heo
  2024-06-28 23:56             ` Andrii Nakryiko
  0 siblings, 1 reply; 12+ messages in thread
From: Tejun Heo @ 2024-06-28 23:12 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Alexei Starovoitov, Alexei Starovoitov, LKML, bpf, David Vernet

Hello, again.

On Fri, Jun 28, 2024 at 01:04:04PM -1000, Tejun Heo wrote:
...
> Not a stupid question at all. It's just that all the existing interface is
> based on IDs. This is partly because there's not much the BPF code can do
> with the DSQ data structure and partly because DSQs are usually not accessed
> multiple times in sequence (ie. if the BPF code isn't going to look it up
> and hold it persistently, it's going to have to look it up each time
> anyway).
> 
> The multiple lookups aren't the end of the world. They're all on a resizing
> hashtable, so lookups should be pretty low cost. It's just a little bit sad
> to look at.

Just a bit of addition and a question. scx_bpf_consume_task() is maybe named
too generically and I have a hard time imagining it being useful outside
iteration loop. So, it does work out kinda neatly if we can tie the whole
thing (DSQ lookup, barrier seq) to the iterator.

The reason why this becomes nasty is because I can't pass the pointer to the
iterator to a kfunc, so maybe allowing that can be a solution here too?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH sched_ext/for-6.11 2/2] sched_ext: Implement scx_bpf_consume_task()
  2024-06-28 23:12           ` Tejun Heo
@ 2024-06-28 23:56             ` Andrii Nakryiko
  2024-06-29  1:35               ` Tejun Heo
  0 siblings, 1 reply; 12+ messages in thread
From: Andrii Nakryiko @ 2024-06-28 23:56 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Alexei Starovoitov, Alexei Starovoitov, LKML, bpf, David Vernet

On Fri, Jun 28, 2024 at 4:13 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello, again.
>
> On Fri, Jun 28, 2024 at 01:04:04PM -1000, Tejun Heo wrote:
> ...
> > Not a stupid question at all. It's just that all the existing interface is
> > based on IDs. This is partly because there's not much the BPF code can do
> > with the DSQ data structure and partly because DSQs are usually not accessed
> > multiple times in sequence (ie. if the BPF code isn't going to look it up
> > and hold it persistently, it's going to have to look it up each time
> > anyway).
> >
> > The multiple lookups aren't the end of the world. They're all on a resizing
> > hashtable, so lookups should be pretty low cost. It's just a little bit sad
> > to look at.
>
> Just a bit of addition and a question. scx_bpf_consume_task() is maybe named
> too generically and I have a hard time imagining it being useful outside
> iteration loop. So, it does work out kinda neatly if we can tie the whole
> thing (DSQ lookup, barrier seq) to the iterator.
>
> The reason why this becomes nasty is because I can't pass the pointer to the
> iterator to a kfunc, so maybe allowing that can be a solution here too?
>

Sure, if that's the best way to go about this.


> Thanks.
>
> --
> tejun

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH sched_ext/for-6.11 2/2] sched_ext: Implement scx_bpf_consume_task()
  2024-06-28 23:56             ` Andrii Nakryiko
@ 2024-06-29  1:35               ` Tejun Heo
  2024-07-02  0:55                 ` Andrii Nakryiko
  0 siblings, 1 reply; 12+ messages in thread
From: Tejun Heo @ 2024-06-29  1:35 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Alexei Starovoitov, Alexei Starovoitov, LKML, bpf, David Vernet

Hello, Andrii.

On Fri, Jun 28, 2024 at 04:56:55PM -0700, Andrii Nakryiko wrote:
> > Just a bit of addition and a question. scx_bpf_consume_task() is maybe named
> > too generically and I have a hard time imagining it being useful outside
> > iteration loop. So, it does work out kinda neatly if we can tie the whole
> > thing (DSQ lookup, barrier seq) to the iterator.
> >
> > The reason why this becomes nasty is because I can't pass the pointer to the
> > iterator to a kfunc, so maybe allowing that can be a solution here too?
> 
> Sure, if that's the best way to go about this.

If we decide to go this way, how difficult would it be to change the
verifier to allow this?

BTW, as none of the practical schedulers use consume_task() yet, I can skip
this for now. I'll post an updated patches for the iterator itself. We can
decide what to do with consume_task() later.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH sched_ext/for-6.11 2/2] sched_ext: Implement scx_bpf_consume_task()
  2024-06-29  1:35               ` Tejun Heo
@ 2024-07-02  0:55                 ` Andrii Nakryiko
  0 siblings, 0 replies; 12+ messages in thread
From: Andrii Nakryiko @ 2024-07-02  0:55 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Alexei Starovoitov, Alexei Starovoitov, LKML, bpf, David Vernet

On Fri, Jun 28, 2024 at 6:35 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello, Andrii.
>
> On Fri, Jun 28, 2024 at 04:56:55PM -0700, Andrii Nakryiko wrote:
> > > Just a bit of addition and a question. scx_bpf_consume_task() is maybe named
> > > too generically and I have a hard time imagining it being useful outside
> > > iteration loop. So, it does work out kinda neatly if we can tie the whole
> > > thing (DSQ lookup, barrier seq) to the iterator.
> > >
> > > The reason why this becomes nasty is because I can't pass the pointer to the
> > > iterator to a kfunc, so maybe allowing that can be a solution here too?
> >
> > Sure, if that's the best way to go about this.
>
> If we decide to go this way, how difficult would it be to change the
> verifier to allow this?

Shouldn't be too difficult, but we'll know for sure when we start
implementing this, of course.

>
> BTW, as none of the practical schedulers use consume_task() yet, I can skip
> this for now. I'll post an updated patches for the iterator itself. We can
> decide what to do with consume_task() later.
>

Sounds good.

> Thanks.
>
> --
> tejun

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2024-07-02  0:55 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-06-28  0:20 [PATCH sched_ext/for-6.11 1/2] sched_ext: Implement DSQ iterator Tejun Heo
2024-06-28  0:24 ` [PATCH sched_ext/for-6.11 2/2] sched_ext: Implement scx_bpf_consume_task() Tejun Heo
2024-06-28  1:34   ` Alexei Starovoitov
2024-06-28 21:57     ` Tejun Heo
2024-06-28 22:34       ` Andrii Nakryiko
2024-06-28 23:04         ` Tejun Heo
2024-06-28 23:12           ` Tejun Heo
2024-06-28 23:56             ` Andrii Nakryiko
2024-06-29  1:35               ` Tejun Heo
2024-07-02  0:55                 ` Andrii Nakryiko
2024-06-28  1:11 ` [PATCH sched_ext/for-6.11 1/2] sched_ext: Implement DSQ iterator Alexei Starovoitov
2024-06-28 22:14   ` Tejun Heo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox