Re: [PATCH 12/36] sched_ext: Implement BPF extensible scheduler class

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Andrea Righi <andrea.righi@canonical.com>
To: Tejun Heo <tj@kernel.org>
Cc: torvalds@linux-foundation.org, mingo@redhat.com,
	peterz@infradead.org, juri.lelli@redhat.com,
	vincent.guittot@linaro.org, dietmar.eggemann@arm.com,
	rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de,
	bristot@redhat.com, vschneid@redhat.com, ast@kernel.org,
	daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org,
	joshdon@google.com, brho@google.com, pjt@google.com,
	derkling@google.com, haoluo@google.com, dvernet@meta.com,
	dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com,
	changwoo@igalia.com, himadrics@inria.fr, memxor@gmail.com,
	linux-kernel@vger.kernel.org, bpf@vger.kernel.org,
	kernel-team@meta.com
Subject: Re: [PATCH 12/36] sched_ext: Implement BPF extensible scheduler class
Date: Thu, 23 Nov 2023 09:07:35 +0100	[thread overview]
Message-ID: <ZV8IR/w4IaxJ2vPA@gpd> (raw)
In-Reply-To: <20231111024835.2164816-13-tj@kernel.org>

On Fri, Nov 10, 2023 at 04:47:38PM -1000, Tejun Heo wrote:
...
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3961,6 +3961,15 @@ bool cpus_share_resources(int this_cpu, int that_cpu)
>  
>  static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
>  {
> +	/*
> +	 * The BPF scheduler may depend on select_task_rq() being invoked during
> +	 * wakeups. In addition, @p may end up executing on a different CPU
> +	 * regardless of what happens in the wakeup path making the ttwu_queue
> +	 * optimization less meaningful. Skip if on SCX.
> +	 */
> +	if (task_on_scx(p))
> +		return false;
> +
>  	/*
>  	 * Do not complicate things with the async wake_list while the CPU is
>  	 * in hotplug state.
> @@ -4531,6 +4540,18 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
>  	p->rt.on_rq		= 0;
>  	p->rt.on_list		= 0;
>  
> +#ifdef CONFIG_SCHED_CLASS_EXT
> +	p->scx.dsq		= NULL;
> +	INIT_LIST_HEAD(&p->scx.dsq_node);
> +	p->scx.flags		= 0;
> +	p->scx.weight		= 0;
> +	p->scx.sticky_cpu	= -1;
> +	p->scx.holding_cpu	= -1;
> +	p->scx.kf_mask		= 0;
> +	atomic64_set(&p->scx.ops_state, 0);

We probably need atomic_long_set() here or in 32-bit arches (such as
armhf) we get this:

kernel/sched/core.c:4564:22: error: passing argument 1 of ‘atomic64_set’ from incompatible pointer type [-Werror=incompatible-pointer-types]
 4564 |         atomic64_set(&p->scx.ops_state, 0);
      |                      ^~~~~~~~~~~~~~~~~
      |                      |
      |                      atomic_long_t * {aka atomic_t *}

> +	p->scx.slice		= SCX_SLICE_DFL;
> +#endif
> +
>  #ifdef CONFIG_PREEMPT_NOTIFIERS
>  	INIT_HLIST_HEAD(&p->preempt_notifiers);
>  #endif
> @@ -4779,6 +4800,10 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
>  		goto out_cancel;
>  	} else if (rt_prio(p->prio)) {
>  		p->sched_class = &rt_sched_class;
> +#ifdef CONFIG_SCHED_CLASS_EXT
> +	} else if (task_should_scx(p)) {
> +		p->sched_class = &ext_sched_class;
> +#endif
>  	} else {
>  		p->sched_class = &fair_sched_class;
>  	}
> @@ -7059,6 +7084,10 @@ void __setscheduler_prio(struct task_struct *p, int prio)
>  		p->sched_class = &dl_sched_class;
>  	else if (rt_prio(prio))
>  		p->sched_class = &rt_sched_class;
> +#ifdef CONFIG_SCHED_CLASS_EXT
> +	else if (task_should_scx(p))
> +		p->sched_class = &ext_sched_class;
> +#endif
>  	else
>  		p->sched_class = &fair_sched_class;
>  
> @@ -9055,6 +9084,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
>  	case SCHED_NORMAL:
>  	case SCHED_BATCH:
>  	case SCHED_IDLE:
> +	case SCHED_EXT:
>  		ret = 0;
>  		break;
>  	}
> @@ -9082,6 +9112,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
>  	case SCHED_NORMAL:
>  	case SCHED_BATCH:
>  	case SCHED_IDLE:
> +	case SCHED_EXT:
>  		ret = 0;
>  	}
>  	return ret;
> @@ -9918,6 +9949,10 @@ void __init sched_init(void)
>  	BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class));
>  	BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class));
>  	BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class));
> +#ifdef CONFIG_SCHED_CLASS_EXT
> +	BUG_ON(!sched_class_above(&fair_sched_class, &ext_sched_class));
> +	BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class));
> +#endif
>  
>  	wait_bit_init();
>  
> @@ -12047,3 +12082,38 @@ void sched_mm_cid_fork(struct task_struct *t)
>  	t->mm_cid_active = 1;
>  }
>  #endif
> +
> +#ifdef CONFIG_SCHED_CLASS_EXT
> +void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
> +			    struct sched_enq_and_set_ctx *ctx)
> +{
> +	struct rq *rq = task_rq(p);
> +
> +	lockdep_assert_rq_held(rq);
> +
> +	*ctx = (struct sched_enq_and_set_ctx){
> +		.p = p,
> +		.queue_flags = queue_flags,
> +		.queued = task_on_rq_queued(p),
> +		.running = task_current(rq, p),
> +	};
> +
> +	update_rq_clock(rq);
> +	if (ctx->queued)
> +		dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK);
> +	if (ctx->running)
> +		put_prev_task(rq, p);
> +}
> +
> +void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
> +{
> +	struct rq *rq = task_rq(ctx->p);
> +
> +	lockdep_assert_rq_held(rq);
> +
> +	if (ctx->queued)
> +		enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK);
> +	if (ctx->running)
> +		set_next_task(rq, ctx->p);
> +}
> +#endif	/* CONFIG_SCHED_CLASS_EXT */
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 4580a450700e..6587a45ffe96 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -374,6 +374,9 @@ static __init int sched_init_debug(void)
>  
>  	debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
>  
> +#ifdef CONFIG_SCHED_CLASS_EXT
> +	debugfs_create_file("ext", 0444, debugfs_sched, NULL, &sched_ext_fops);
> +#endif
>  	return 0;
>  }
>  late_initcall(sched_init_debug);
> @@ -1085,6 +1088,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
>  		P(dl.runtime);
>  		P(dl.deadline);
>  	}
> +#ifdef CONFIG_SCHED_CLASS_EXT
> +	__PS("ext.enabled", task_on_scx(p));
> +#endif
>  #undef PN_SCHEDSTAT
>  #undef P_SCHEDSTAT
>  
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> new file mode 100644
> index 000000000000..7b78f77d2293
> --- /dev/null
> +++ b/kernel/sched/ext.c
> @@ -0,0 +1,3158 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
> + * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
> + * Copyright (c) 2022 David Vernet <dvernet@meta.com>
> + */
> +#define SCX_OP_IDX(op)		(offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
> +
> +enum scx_internal_consts {
> +	SCX_NR_ONLINE_OPS	= SCX_OP_IDX(init),
> +	SCX_DSP_DFL_MAX_BATCH	= 32,
> +};
> +
> +enum scx_ops_enable_state {
> +	SCX_OPS_PREPPING,
> +	SCX_OPS_ENABLING,
> +	SCX_OPS_ENABLED,
> +	SCX_OPS_DISABLING,
> +	SCX_OPS_DISABLED,
> +};
> +
> +/*
> + * sched_ext_entity->ops_state
> + *
> + * Used to track the task ownership between the SCX core and the BPF scheduler.
> + * State transitions look as follows:
> + *
> + * NONE -> QUEUEING -> QUEUED -> DISPATCHING
> + *   ^              |                 |
> + *   |              v                 v
> + *   \-------------------------------/
> + *
> + * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
> + * sites for explanations on the conditions being waited upon and why they are
> + * safe. Transitions out of them into NONE or QUEUED must store_release and the
> + * waiters should load_acquire.
> + *
> + * Tracking scx_ops_state enables sched_ext core to reliably determine whether
> + * any given task can be dispatched by the BPF scheduler at all times and thus
> + * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
> + * to try to dispatch any task anytime regardless of its state as the SCX core
> + * can safely reject invalid dispatches.
> + */
> +enum scx_ops_state {
> +	SCX_OPSS_NONE,		/* owned by the SCX core */
> +	SCX_OPSS_QUEUEING,	/* in transit to the BPF scheduler */
> +	SCX_OPSS_QUEUED,	/* owned by the BPF scheduler */
> +	SCX_OPSS_DISPATCHING,	/* in transit back to the SCX core */
> +
> +	/*
> +	 * QSEQ brands each QUEUED instance so that, when dispatch races
> +	 * dequeue/requeue, the dispatcher can tell whether it still has a claim
> +	 * on the task being dispatched.
> +	 *
> +	 * As some 32bit archs can't do 64bit store_release/load_acquire,
> +	 * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on
> +	 * 32bit machines. The dispatch race window QSEQ protects is very narrow
> +	 * and runs with IRQ disabled. 30 bits should be sufficient.
> +	 */
> +	SCX_OPSS_QSEQ_SHIFT	= 2,
> +};
> +
> +/* Use macros to ensure that the type is unsigned long for the masks */
> +#define SCX_OPSS_STATE_MASK	((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
> +#define SCX_OPSS_QSEQ_MASK	(~SCX_OPSS_STATE_MASK)
> +
> +/*
> + * During exit, a task may schedule after losing its PIDs. When disabling the
> + * BPF scheduler, we need to be able to iterate tasks in every state to
> + * guarantee system safety. Maintain a dedicated task list which contains every
> + * task between its fork and eventual free.
> + */
> +static DEFINE_SPINLOCK(scx_tasks_lock);
> +static LIST_HEAD(scx_tasks);
> +
> +/* ops enable/disable */
> +static struct kthread_worker *scx_ops_helper;
> +static DEFINE_MUTEX(scx_ops_enable_mutex);
> +DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
> +DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
> +static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
> +static struct sched_ext_ops scx_ops;
> +static bool scx_warned_zero_slice;
> +
> +static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
> +static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
> +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
> +
> +struct static_key_false scx_has_op[SCX_NR_ONLINE_OPS] =
> +	{ [0 ... SCX_NR_ONLINE_OPS-1] = STATIC_KEY_FALSE_INIT };
> +
> +static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE);
> +static struct scx_exit_info scx_exit_info;
> +
> +/* idle tracking */
> +#ifdef CONFIG_SMP
> +#ifdef CONFIG_CPUMASK_OFFSTACK
> +#define CL_ALIGNED_IF_ONSTACK
> +#else
> +#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp
> +#endif
> +
> +static struct {
> +	cpumask_var_t cpu;
> +	cpumask_var_t smt;
> +} idle_masks CL_ALIGNED_IF_ONSTACK;
> +
> +#endif	/* CONFIG_SMP */
> +
> +/*
> + * Direct dispatch marker.
> + *
> + * Non-NULL values are used for direct dispatch from enqueue path. A valid
> + * pointer points to the task currently being enqueued. An ERR_PTR value is used
> + * to indicate that direct dispatch has already happened.
> + */
> +static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);
> +
> +/* dispatch queues */
> +static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global;
> +
> +static const struct rhashtable_params dsq_hash_params = {
> +	.key_len		= 8,
> +	.key_offset		= offsetof(struct scx_dispatch_q, id),
> +	.head_offset		= offsetof(struct scx_dispatch_q, hash_node),
> +};
> +
> +static struct rhashtable dsq_hash;
> +static LLIST_HEAD(dsqs_to_free);
> +
> +/* dispatch buf */
> +struct scx_dsp_buf_ent {
> +	struct task_struct	*task;
> +	unsigned long		qseq;
> +	u64			dsq_id;
> +	u64			enq_flags;
> +};
> +
> +static u32 scx_dsp_max_batch;
> +static struct scx_dsp_buf_ent __percpu *scx_dsp_buf;
> +
> +struct scx_dsp_ctx {
> +	struct rq		*rq;
> +	struct rq_flags		*rf;
> +	u32			buf_cursor;
> +	u32			nr_tasks;
> +};
> +
> +static DEFINE_PER_CPU(struct scx_dsp_ctx, scx_dsp_ctx);
> +
> +void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
> +		      u64 enq_flags);
> +__printf(2, 3) static void scx_ops_error_kind(enum scx_exit_kind kind,
> +					      const char *fmt, ...);
> +#define scx_ops_error(fmt, args...)						\
> +	scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args)
> +
> +struct scx_task_iter {
> +	struct sched_ext_entity		cursor;
> +	struct task_struct		*locked;
> +	struct rq			*rq;
> +	struct rq_flags			rf;
> +};
> +
> +#define SCX_HAS_OP(op)	static_branch_likely(&scx_has_op[SCX_OP_IDX(op)])
> +
> +/* if the highest set bit is N, return a mask with bits [N+1, 31] set */
> +static u32 higher_bits(u32 flags)
> +{
> +	return ~((1 << fls(flags)) - 1);
> +}
> +
> +/* return the mask with only the highest bit set */
> +static u32 highest_bit(u32 flags)
> +{
> +	int bit = fls(flags);
> +	return bit ? 1 << (bit - 1) : 0;
> +}
> +
> +/*
> + * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
> + * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
> + * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check
> + * whether it's running from an allowed context.
> + *
> + * @mask is constant, always inline to cull the mask calculations.
> + */
> +static __always_inline void scx_kf_allow(u32 mask)
> +{
> +	/* nesting is allowed only in increasing scx_kf_mask order */
> +	WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask,
> +		  "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n",
> +		  current->scx.kf_mask, mask);
> +	current->scx.kf_mask |= mask;
> +}
> +
> +static void scx_kf_disallow(u32 mask)
> +{
> +	current->scx.kf_mask &= ~mask;
> +}
> +
> +#define SCX_CALL_OP(mask, op, args...)						\
> +do {										\
> +	if (mask) {								\
> +		scx_kf_allow(mask);						\
> +		scx_ops.op(args);						\
> +		scx_kf_disallow(mask);						\
> +	} else {								\
> +		scx_ops.op(args);						\
> +	}									\
> +} while (0)
> +
> +#define SCX_CALL_OP_RET(mask, op, args...)					\
> +({										\
> +	__typeof__(scx_ops.op(args)) __ret;					\
> +	if (mask) {								\
> +		scx_kf_allow(mask);						\
> +		__ret = scx_ops.op(args);					\
> +		scx_kf_disallow(mask);						\
> +	} else {								\
> +		__ret = scx_ops.op(args);					\
> +	}									\
> +	__ret;									\
> +})
> +
> +/* @mask is constant, always inline to cull unnecessary branches */
> +static __always_inline bool scx_kf_allowed(u32 mask)
> +{
> +	if (unlikely(!(current->scx.kf_mask & mask))) {
> +		scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x",
> +			      mask, current->scx.kf_mask);
> +		return false;
> +	}
> +
> +	if (unlikely((mask & (SCX_KF_INIT | SCX_KF_SLEEPABLE)) &&
> +		     in_interrupt())) {
> +		scx_ops_error("sleepable kfunc called from non-sleepable context");
> +		return false;
> +	}
> +
> +	/*
> +	 * Enforce nesting boundaries. e.g. A kfunc which can be called from
> +	 * DISPATCH must not be called if we're running DEQUEUE which is nested
> +	 * inside ops.dispatch(). We don't need to check the SCX_KF_SLEEPABLE
> +	 * boundary thanks to the above in_interrupt() check.
> +	 */
> +	if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
> +		     (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
> +		scx_ops_error("dispatch kfunc called from a nested operation");
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +/**
> + * scx_task_iter_init - Initialize a task iterator
> + * @iter: iterator to init
> + *
> + * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized,
> + * @iter must eventually be exited with scx_task_iter_exit().
> + *
> + * scx_tasks_lock may be released between this and the first next() call or
> + * between any two next() calls. If scx_tasks_lock is released between two
> + * next() calls, the caller is responsible for ensuring that the task being
> + * iterated remains accessible either through RCU read lock or obtaining a
> + * reference count.
> + *
> + * All tasks which existed when the iteration started are guaranteed to be
> + * visited as long as they still exist.
> + */
> +static void scx_task_iter_init(struct scx_task_iter *iter)
> +{
> +	lockdep_assert_held(&scx_tasks_lock);
> +
> +	iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
> +	list_add(&iter->cursor.tasks_node, &scx_tasks);
> +	iter->locked = NULL;
> +}
> +
> +/**
> + * scx_task_iter_exit - Exit a task iterator
> + * @iter: iterator to exit
> + *
> + * Exit a previously initialized @iter. Must be called with scx_tasks_lock held.
> + * If the iterator holds a task's rq lock, that rq lock is released. See
> + * scx_task_iter_init() for details.
> + */
> +static void scx_task_iter_exit(struct scx_task_iter *iter)
> +{
> +	struct list_head *cursor = &iter->cursor.tasks_node;
> +
> +	lockdep_assert_held(&scx_tasks_lock);
> +
> +	if (iter->locked) {
> +		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
> +		iter->locked = NULL;
> +	}
> +
> +	if (list_empty(cursor))
> +		return;
> +
> +	list_del_init(cursor);
> +}
> +
> +/**
> + * scx_task_iter_next - Next task
> + * @iter: iterator to walk
> + *
> + * Visit the next task. See scx_task_iter_init() for details.
> + */
> +static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
> +{
> +	struct list_head *cursor = &iter->cursor.tasks_node;
> +	struct sched_ext_entity *pos;
> +
> +	lockdep_assert_held(&scx_tasks_lock);
> +
> +	list_for_each_entry(pos, cursor, tasks_node) {
> +		if (&pos->tasks_node == &scx_tasks)
> +			return NULL;
> +		if (!(pos->flags & SCX_TASK_CURSOR)) {
> +			list_move(cursor, &pos->tasks_node);
> +			return container_of(pos, struct task_struct, scx);
> +		}
> +	}
> +
> +	/* can't happen, should always terminate at scx_tasks above */
> +	BUG();
> +}
> +
> +/**
> + * scx_task_iter_next_filtered - Next non-idle task
> + * @iter: iterator to walk
> + *
> + * Visit the next non-idle task. See scx_task_iter_init() for details.
> + */
> +static struct task_struct *
> +scx_task_iter_next_filtered(struct scx_task_iter *iter)
> +{
> +	struct task_struct *p;
> +
> +	while ((p = scx_task_iter_next(iter))) {
> +		/*
> +		 * is_idle_task() tests %PF_IDLE which may not be set for CPUs
> +		 * which haven't yet been onlined. Test sched_class directly.
> +		 */
> +		if (p->sched_class != &idle_sched_class)
> +			return p;
> +	}
> +	return NULL;
> +}
> +
> +/**
> + * scx_task_iter_next_filtered_locked - Next non-idle task with its rq locked
> + * @iter: iterator to walk
> + *
> + * Visit the next non-idle task with its rq lock held. See scx_task_iter_init()
> + * for details.
> + */
> +static struct task_struct *
> +scx_task_iter_next_filtered_locked(struct scx_task_iter *iter)
> +{
> +	struct task_struct *p;
> +
> +	if (iter->locked) {
> +		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
> +		iter->locked = NULL;
> +	}
> +
> +	p = scx_task_iter_next_filtered(iter);
> +	if (!p)
> +		return NULL;
> +
> +	iter->rq = task_rq_lock(p, &iter->rf);
> +	iter->locked = p;
> +	return p;
> +}
> +
> +static enum scx_ops_enable_state scx_ops_enable_state(void)
> +{
> +	return atomic_read(&scx_ops_enable_state_var);
> +}
> +
> +static enum scx_ops_enable_state
> +scx_ops_set_enable_state(enum scx_ops_enable_state to)
> +{
> +	return atomic_xchg(&scx_ops_enable_state_var, to);
> +}
> +
> +static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to,
> +					enum scx_ops_enable_state from)
> +{
> +	int from_v = from;
> +
> +	return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to);
> +}
> +
> +static bool scx_ops_disabling(void)
> +{
> +	return unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING);
> +}
> +
> +/**
> + * wait_ops_state - Busy-wait the specified ops state to end
> + * @p: target task
> + * @opss: state to wait the end of
> + *
> + * Busy-wait for @p to transition out of @opss. This can only be used when the
> + * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. This function also
> + * has load_acquire semantics to ensure that the caller can see the updates made
> + * in the enqueueing and dispatching paths.
> + */
> +static void wait_ops_state(struct task_struct *p, unsigned long opss)
> +{
> +	do {
> +		cpu_relax();
> +	} while (atomic_long_read_acquire(&p->scx.ops_state) == opss);
> +}
> +
> +/**
> + * ops_cpu_valid - Verify a cpu number
> + * @cpu: cpu number which came from a BPF ops
> + *
> + * @cpu is a cpu number which came from the BPF scheduler and can be any value.
> + * Verify that it is in range and one of the possible cpus.
> + */
> +static bool ops_cpu_valid(s32 cpu)
> +{
> +	return likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu));
> +}
> +
> +/**
> + * ops_sanitize_err - Sanitize a -errno value
> + * @ops_name: operation to blame on failure
> + * @err: -errno value to sanitize
> + *
> + * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return
> + * -%EPROTO. This is necessary because returning a rogue -errno up the chain can
> + * cause misbehaviors. For an example, a large negative return from
> + * ops.prep_enable() triggers an oops when passed up the call chain because the
> + * value fails IS_ERR() test after being encoded with ERR_PTR() and then is
> + * handled as a pointer.
> + */
> +static int ops_sanitize_err(const char *ops_name, s32 err)
> +{
> +	if (err < 0 && err >= -MAX_ERRNO)
> +		return err;
> +
> +	scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err);
> +	return -EPROTO;
> +}
> +
> +static void update_curr_scx(struct rq *rq)
> +{
> +	struct task_struct *curr = rq->curr;
> +	u64 now = rq_clock_task(rq);
> +	u64 delta_exec;
> +
> +	if (time_before_eq64(now, curr->se.exec_start))
> +		return;
> +
> +	delta_exec = now - curr->se.exec_start;
> +	curr->se.exec_start = now;
> +	curr->se.sum_exec_runtime += delta_exec;
> +	account_group_exec_runtime(curr, delta_exec);
> +	cgroup_account_cputime(curr, delta_exec);
> +
> +	curr->scx.slice -= min(curr->scx.slice, delta_exec);
> +}
> +
> +static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
> +			     u64 enq_flags)
> +{
> +	bool is_local = dsq->id == SCX_DSQ_LOCAL;
> +
> +	WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node));
> +
> +	if (!is_local) {
> +		raw_spin_lock(&dsq->lock);
> +		if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
> +			scx_ops_error("attempting to dispatch to a destroyed dsq");
> +			/* fall back to the global dsq */
> +			raw_spin_unlock(&dsq->lock);
> +			dsq = &scx_dsq_global;
> +			raw_spin_lock(&dsq->lock);
> +		}
> +	}
> +
> +	if (enq_flags & SCX_ENQ_HEAD)
> +		list_add(&p->scx.dsq_node, &dsq->fifo);
> +	else
> +		list_add_tail(&p->scx.dsq_node, &dsq->fifo);
> +	dsq->nr++;
> +	p->scx.dsq = dsq;
> +
> +	/*
> +	 * We're transitioning out of QUEUEING or DISPATCHING. store_release to
> +	 * match waiters' load_acquire.
> +	 */
> +	if (enq_flags & SCX_ENQ_CLEAR_OPSS)
> +		atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
> +
> +	if (is_local) {
> +		struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
> +
> +		if (sched_class_above(&ext_sched_class, rq->curr->sched_class))
> +			resched_curr(rq);
> +	} else {
> +		raw_spin_unlock(&dsq->lock);
> +	}
> +}
> +
> +static void dispatch_dequeue(struct scx_rq *scx_rq, struct task_struct *p)
> +{
> +	struct scx_dispatch_q *dsq = p->scx.dsq;
> +	bool is_local = dsq == &scx_rq->local_dsq;
> +
> +	if (!dsq) {
> +		WARN_ON_ONCE(!list_empty(&p->scx.dsq_node));
> +		/*
> +		 * When dispatching directly from the BPF scheduler to a local
> +		 * DSQ, the task isn't associated with any DSQ but
> +		 * @p->scx.holding_cpu may be set under the protection of
> +		 * %SCX_OPSS_DISPATCHING.
> +		 */
> +		if (p->scx.holding_cpu >= 0)
> +			p->scx.holding_cpu = -1;
> +		return;
> +	}
> +
> +	if (!is_local)
> +		raw_spin_lock(&dsq->lock);
> +
> +	/*
> +	 * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_node
> +	 * can't change underneath us.
> +	*/
> +	if (p->scx.holding_cpu < 0) {
> +		/* @p must still be on @dsq, dequeue */
> +		WARN_ON_ONCE(list_empty(&p->scx.dsq_node));
> +		list_del_init(&p->scx.dsq_node);
> +		dsq->nr--;
> +	} else {
> +		/*
> +		 * We're racing against dispatch_to_local_dsq() which already
> +		 * removed @p from @dsq and set @p->scx.holding_cpu. Clear the
> +		 * holding_cpu which tells dispatch_to_local_dsq() that it lost
> +		 * the race.
> +		 */
> +		WARN_ON_ONCE(!list_empty(&p->scx.dsq_node));
> +		p->scx.holding_cpu = -1;
> +	}
> +	p->scx.dsq = NULL;
> +
> +	if (!is_local)
> +		raw_spin_unlock(&dsq->lock);
> +}
> +
> +static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id)
> +{
> +	lockdep_assert(rcu_read_lock_any_held());
> +
> +	if (dsq_id == SCX_DSQ_GLOBAL)
> +		return &scx_dsq_global;
> +	else
> +		return rhashtable_lookup_fast(&dsq_hash, &dsq_id,
> +					      dsq_hash_params);
> +}
> +
> +static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
> +						    struct task_struct *p)
> +{
> +	struct scx_dispatch_q *dsq;
> +
> +	if (dsq_id == SCX_DSQ_LOCAL)
> +		return &rq->scx.local_dsq;
> +
> +	dsq = find_non_local_dsq(dsq_id);
> +	if (unlikely(!dsq)) {
> +		scx_ops_error("non-existent DSQ 0x%llx for %s[%d]",
> +			      dsq_id, p->comm, p->pid);
> +		return &scx_dsq_global;
> +	}
> +
> +	return dsq;
> +}
> +
> +static void direct_dispatch(struct task_struct *ddsp_task, struct task_struct *p,
> +			    u64 dsq_id, u64 enq_flags)
> +{
> +	struct scx_dispatch_q *dsq;
> +
> +	/* @p must match the task which is being enqueued */
> +	if (unlikely(p != ddsp_task)) {
> +		if (IS_ERR(ddsp_task))
> +			scx_ops_error("%s[%d] already direct-dispatched",
> +				      p->comm, p->pid);
> +		else
> +			scx_ops_error("enqueueing %s[%d] but trying to direct-dispatch %s[%d]",
> +				      ddsp_task->comm, ddsp_task->pid,
> +				      p->comm, p->pid);
> +		return;
> +	}
> +
> +	/*
> +	 * %SCX_DSQ_LOCAL_ON is not supported during direct dispatch because
> +	 * dispatching to the local DSQ of a different CPU requires unlocking
> +	 * the current rq which isn't allowed in the enqueue path. Use
> +	 * ops.select_cpu() to be on the target CPU and then %SCX_DSQ_LOCAL.
> +	 */
> +	if (unlikely((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON)) {
> +		scx_ops_error("SCX_DSQ_LOCAL_ON can't be used for direct-dispatch");
> +		return;
> +	}
> +
> +	dsq = find_dsq_for_dispatch(task_rq(p), dsq_id, p);
> +	dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
> +
> +	/*
> +	 * Mark that dispatch already happened by spoiling direct_dispatch_task
> +	 * with a non-NULL value which can never match a valid task pointer.
> +	 */
> +	__this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH));
> +}
> +
> +static bool test_rq_online(struct rq *rq)
> +{
> +#ifdef CONFIG_SMP
> +	return rq->online;
> +#else
> +	return true;
> +#endif
> +}
> +
> +static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
> +			    int sticky_cpu)
> +{
> +	struct task_struct **ddsp_taskp;
> +	unsigned long qseq;
> +
> +	WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED));
> +
> +	if (p->scx.flags & SCX_TASK_ENQ_LOCAL) {
> +		enq_flags |= SCX_ENQ_LOCAL;
> +		p->scx.flags &= ~SCX_TASK_ENQ_LOCAL;
> +	}
> +
> +	/* rq migration */
> +	if (sticky_cpu == cpu_of(rq))
> +		goto local_norefill;
> +
> +	/*
> +	 * If !rq->online, we already told the BPF scheduler that the CPU is
> +	 * offline. We're just trying to on/offline the CPU. Don't bother the
> +	 * BPF scheduler.
> +	 */
> +	if (unlikely(!test_rq_online(rq)))
> +		goto local;
> +
> +	/* see %SCX_OPS_ENQ_EXITING */
> +	if (!static_branch_unlikely(&scx_ops_enq_exiting) &&
> +	    unlikely(p->flags & PF_EXITING))
> +		goto local;
> +
> +	/* see %SCX_OPS_ENQ_LAST */
> +	if (!static_branch_unlikely(&scx_ops_enq_last) &&
> +	    (enq_flags & SCX_ENQ_LAST))
> +		goto local;
> +
> +	if (!SCX_HAS_OP(enqueue)) {
> +		if (enq_flags & SCX_ENQ_LOCAL)
> +			goto local;
> +		else
> +			goto global;
> +	}
> +
> +	/* DSQ bypass didn't trigger, enqueue on the BPF scheduler */
> +	qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT;
> +
> +	WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
> +	atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq);
> +
> +	ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
> +	WARN_ON_ONCE(*ddsp_taskp);
> +	*ddsp_taskp = p;
> +
> +	SCX_CALL_OP(SCX_KF_ENQUEUE, enqueue, p, enq_flags);
> +
> +	/*
> +	 * If not directly dispatched, QUEUEING isn't clear yet and dispatch or
> +	 * dequeue may be waiting. The store_release matches their load_acquire.
> +	 */
> +	if (*ddsp_taskp == p)
> +		atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq);
> +	*ddsp_taskp = NULL;
> +	return;
> +
> +local:
> +	p->scx.slice = SCX_SLICE_DFL;
> +local_norefill:
> +	dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags);
> +	return;
> +
> +global:
> +	p->scx.slice = SCX_SLICE_DFL;
> +	dispatch_enqueue(&scx_dsq_global, p, enq_flags);
> +}
> +
> +static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
> +{
> +	int sticky_cpu = p->scx.sticky_cpu;
> +
> +	enq_flags |= rq->scx.extra_enq_flags;
> +
> +	if (sticky_cpu >= 0)
> +		p->scx.sticky_cpu = -1;
> +
> +	/*
> +	 * Restoring a running task will be immediately followed by
> +	 * set_next_task_scx() which expects the task to not be on the BPF
> +	 * scheduler as tasks can only start running through local DSQs. Force
> +	 * direct-dispatch into the local DSQ by setting the sticky_cpu.
> +	 */
> +	if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p))
> +		sticky_cpu = cpu_of(rq);
> +
> +	if (p->scx.flags & SCX_TASK_QUEUED)
> +		return;
> +
> +	p->scx.flags |= SCX_TASK_QUEUED;
> +	rq->scx.nr_running++;
> +	add_nr_running(rq, 1);
> +
> +	do_enqueue_task(rq, p, enq_flags, sticky_cpu);
> +}
> +
> +static void ops_dequeue(struct task_struct *p, u64 deq_flags)
> +{
> +	unsigned long opss;
> +
> +	/* acquire ensures that we see the preceding updates on QUEUED */
> +	opss = atomic_long_read_acquire(&p->scx.ops_state);
> +
> +	switch (opss & SCX_OPSS_STATE_MASK) {
> +	case SCX_OPSS_NONE:
> +		break;
> +	case SCX_OPSS_QUEUEING:
> +		/*
> +		 * QUEUEING is started and finished while holding @p's rq lock.
> +		 * As we're holding the rq lock now, we shouldn't see QUEUEING.
> +		 */
> +		BUG();
> +	case SCX_OPSS_QUEUED:
> +		if (SCX_HAS_OP(dequeue))
> +			SCX_CALL_OP(SCX_KF_REST, dequeue, p, deq_flags);
> +
> +		if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
> +					    SCX_OPSS_NONE))
> +			break;
> +		fallthrough;
> +	case SCX_OPSS_DISPATCHING:
> +		/*
> +		 * If @p is being dispatched from the BPF scheduler to a DSQ,
> +		 * wait for the transfer to complete so that @p doesn't get
> +		 * added to its DSQ after dequeueing is complete.
> +		 *
> +		 * As we're waiting on DISPATCHING with the rq locked, the
> +		 * dispatching side shouldn't try to lock the rq while
> +		 * DISPATCHING is set. See dispatch_to_local_dsq().
> +		 *
> +		 * DISPATCHING shouldn't have qseq set and control can reach
> +		 * here with NONE @opss from the above QUEUED case block.
> +		 * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss.
> +		 */
> +		wait_ops_state(p, SCX_OPSS_DISPATCHING);
> +		BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
> +		break;
> +	}
> +}
> +
> +static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags)
> +{
> +	struct scx_rq *scx_rq = &rq->scx;
> +
> +	if (!(p->scx.flags & SCX_TASK_QUEUED))
> +		return;
> +
> +	ops_dequeue(p, deq_flags);
> +
> +	if (deq_flags & SCX_DEQ_SLEEP)
> +		p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
> +	else
> +		p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP;
> +
> +	p->scx.flags &= ~SCX_TASK_QUEUED;
> +	scx_rq->nr_running--;
> +	sub_nr_running(rq, 1);
> +
> +	dispatch_dequeue(scx_rq, p);
> +}
> +
> +static void yield_task_scx(struct rq *rq)
> +{
> +	struct task_struct *p = rq->curr;
> +
> +	if (SCX_HAS_OP(yield))
> +		SCX_CALL_OP_RET(SCX_KF_REST, yield, p, NULL);
> +	else
> +		p->scx.slice = 0;
> +}
> +
> +static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
> +{
> +	struct task_struct *from = rq->curr;
> +
> +	if (SCX_HAS_OP(yield))
> +		return SCX_CALL_OP_RET(SCX_KF_REST, yield, from, to);
> +	else
> +		return false;
> +}
> +
> +#ifdef CONFIG_SMP
> +/**
> + * move_task_to_local_dsq - Move a task from a different rq to a local DSQ
> + * @rq: rq to move the task into, currently locked
> + * @p: task to move
> + * @enq_flags: %SCX_ENQ_*
> + *
> + * Move @p which is currently on a different rq to @rq's local DSQ. The caller
> + * must:
> + *
> + * 1. Start with exclusive access to @p either through its DSQ lock or
> + *    %SCX_OPSS_DISPATCHING flag.
> + *
> + * 2. Set @p->scx.holding_cpu to raw_smp_processor_id().
> + *
> + * 3. Remember task_rq(@p). Release the exclusive access so that we don't
> + *    deadlock with dequeue.
> + *
> + * 4. Lock @rq and the task_rq from #3.
> + *
> + * 5. Call this function.
> + *
> + * Returns %true if @p was successfully moved. %false after racing dequeue and
> + * losing.
> + */
> +static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p,
> +				   u64 enq_flags)
> +{
> +	struct rq *task_rq;
> +
> +	lockdep_assert_rq_held(rq);
> +
> +	/*
> +	 * If dequeue got to @p while we were trying to lock both rq's, it'd
> +	 * have cleared @p->scx.holding_cpu to -1. While other cpus may have
> +	 * updated it to different values afterwards, as this operation can't be
> +	 * preempted or recurse, @p->scx.holding_cpu can never become
> +	 * raw_smp_processor_id() again before we're done. Thus, we can tell
> +	 * whether we lost to dequeue by testing whether @p->scx.holding_cpu is
> +	 * still raw_smp_processor_id().
> +	 *
> +	 * See dispatch_dequeue() for the counterpart.
> +	 */
> +	if (unlikely(p->scx.holding_cpu != raw_smp_processor_id()))
> +		return false;
> +
> +	/* @p->rq couldn't have changed if we're still the holding cpu */
> +	task_rq = task_rq(p);
> +	lockdep_assert_rq_held(task_rq);
> +
> +	WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(rq), p->cpus_ptr));
> +	deactivate_task(task_rq, p, 0);
> +	set_task_cpu(p, cpu_of(rq));
> +	p->scx.sticky_cpu = cpu_of(rq);
> +
> +	/*
> +	 * We want to pass scx-specific enq_flags but activate_task() will
> +	 * truncate the upper 32 bit. As we own @rq, we can pass them through
> +	 * @rq->scx.extra_enq_flags instead.
> +	 */
> +	WARN_ON_ONCE(rq->scx.extra_enq_flags);
> +	rq->scx.extra_enq_flags = enq_flags;
> +	activate_task(rq, p, 0);
> +	rq->scx.extra_enq_flags = 0;
> +
> +	return true;
> +}
> +
> +/**
> + * dispatch_to_local_dsq_lock - Ensure source and desitnation rq's are locked
> + * @rq: current rq which is locked
> + * @rf: rq_flags to use when unlocking @rq
> + * @src_rq: rq to move task from
> + * @dst_rq: rq to move task to
> + *
> + * We're holding @rq lock and trying to dispatch a task from @src_rq to
> + * @dst_rq's local DSQ and thus need to lock both @src_rq and @dst_rq. Whether
> + * @rq stays locked isn't important as long as the state is restored after
> + * dispatch_to_local_dsq_unlock().
> + */
> +static void dispatch_to_local_dsq_lock(struct rq *rq, struct rq_flags *rf,
> +				       struct rq *src_rq, struct rq *dst_rq)
> +{
> +	rq_unpin_lock(rq, rf);
> +
> +	if (src_rq == dst_rq) {
> +		raw_spin_rq_unlock(rq);
> +		raw_spin_rq_lock(dst_rq);
> +	} else if (rq == src_rq) {
> +		double_lock_balance(rq, dst_rq);
> +		rq_repin_lock(rq, rf);
> +	} else if (rq == dst_rq) {
> +		double_lock_balance(rq, src_rq);
> +		rq_repin_lock(rq, rf);
> +	} else {
> +		raw_spin_rq_unlock(rq);
> +		double_rq_lock(src_rq, dst_rq);
> +	}
> +}
> +
> +/**
> + * dispatch_to_local_dsq_unlock - Undo dispatch_to_local_dsq_lock()
> + * @rq: current rq which is locked
> + * @rf: rq_flags to use when unlocking @rq
> + * @src_rq: rq to move task from
> + * @dst_rq: rq to move task to
> + *
> + * Unlock @src_rq and @dst_rq and ensure that @rq is locked on return.
> + */
> +static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq_flags *rf,
> +					 struct rq *src_rq, struct rq *dst_rq)
> +{
> +	if (src_rq == dst_rq) {
> +		raw_spin_rq_unlock(dst_rq);
> +		raw_spin_rq_lock(rq);
> +		rq_repin_lock(rq, rf);
> +	} else if (rq == src_rq) {
> +		double_unlock_balance(rq, dst_rq);
> +	} else if (rq == dst_rq) {
> +		double_unlock_balance(rq, src_rq);
> +	} else {
> +		double_rq_unlock(src_rq, dst_rq);
> +		raw_spin_rq_lock(rq);
> +		rq_repin_lock(rq, rf);
> +	}
> +}
> +#endif	/* CONFIG_SMP */
> +
> +
> +static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf,
> +			       struct scx_dispatch_q *dsq)
> +{
> +	struct scx_rq *scx_rq = &rq->scx;
> +	struct task_struct *p;
> +	struct rq *task_rq;
> +	bool moved = false;
> +retry:
> +	if (list_empty(&dsq->fifo))
> +		return false;
> +
> +	raw_spin_lock(&dsq->lock);
> +	list_for_each_entry(p, &dsq->fifo, scx.dsq_node) {
> +		task_rq = task_rq(p);
> +		if (rq == task_rq)
> +			goto this_rq;
> +		if (likely(test_rq_online(rq)) && !is_migration_disabled(p) &&
> +		    cpumask_test_cpu(cpu_of(rq), p->cpus_ptr))
> +			goto remote_rq;
> +	}
> +	raw_spin_unlock(&dsq->lock);
> +	return false;
> +
> +this_rq:
> +	/* @dsq is locked and @p is on this rq */
> +	WARN_ON_ONCE(p->scx.holding_cpu >= 0);
> +	list_move_tail(&p->scx.dsq_node, &scx_rq->local_dsq.fifo);
> +	dsq->nr--;
> +	scx_rq->local_dsq.nr++;
> +	p->scx.dsq = &scx_rq->local_dsq;
> +	raw_spin_unlock(&dsq->lock);
> +	return true;
> +
> +remote_rq:
> +#ifdef CONFIG_SMP
> +	/*
> +	 * @dsq is locked and @p is on a remote rq. @p is currently protected by
> +	 * @dsq->lock. We want to pull @p to @rq but may deadlock if we grab
> +	 * @task_rq while holding @dsq and @rq locks. As dequeue can't drop the
> +	 * rq lock or fail, do a little dancing from our side. See
> +	 * move_task_to_local_dsq().
> +	 */
> +	WARN_ON_ONCE(p->scx.holding_cpu >= 0);
> +	list_del_init(&p->scx.dsq_node);
> +	dsq->nr--;
> +	p->scx.holding_cpu = raw_smp_processor_id();
> +	raw_spin_unlock(&dsq->lock);
> +
> +	rq_unpin_lock(rq, rf);
> +	double_lock_balance(rq, task_rq);
> +	rq_repin_lock(rq, rf);
> +
> +	moved = move_task_to_local_dsq(rq, p, 0);
> +
> +	double_unlock_balance(rq, task_rq);
> +#endif /* CONFIG_SMP */
> +	if (likely(moved))
> +		return true;
> +	goto retry;
> +}
> +
> +enum dispatch_to_local_dsq_ret {
> +	DTL_DISPATCHED,		/* successfully dispatched */
> +	DTL_LOST,		/* lost race to dequeue */
> +	DTL_NOT_LOCAL,		/* destination is not a local DSQ */
> +	DTL_INVALID,		/* invalid local dsq_id */
> +};
> +
> +/**
> + * dispatch_to_local_dsq - Dispatch a task to a local dsq
> + * @rq: current rq which is locked
> + * @rf: rq_flags to use when unlocking @rq
> + * @dsq_id: destination dsq ID
> + * @p: task to dispatch
> + * @enq_flags: %SCX_ENQ_*
> + *
> + * We're holding @rq lock and want to dispatch @p to the local DSQ identified by
> + * @dsq_id. This function performs all the synchronization dancing needed
> + * because local DSQs are protected with rq locks.
> + *
> + * The caller must have exclusive ownership of @p (e.g. through
> + * %SCX_OPSS_DISPATCHING).
> + */
> +static enum dispatch_to_local_dsq_ret
> +dispatch_to_local_dsq(struct rq *rq, struct rq_flags *rf, u64 dsq_id,
> +		      struct task_struct *p, u64 enq_flags)
> +{
> +	struct rq *src_rq = task_rq(p);
> +	struct rq *dst_rq;
> +
> +	/*
> +	 * We're synchronized against dequeue through DISPATCHING. As @p can't
> +	 * be dequeued, its task_rq and cpus_allowed are stable too.
> +	 */
> +	if (dsq_id == SCX_DSQ_LOCAL) {
> +		dst_rq = rq;
> +	} else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
> +		s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
> +
> +		if (!ops_cpu_valid(cpu)) {
> +			scx_ops_error("invalid cpu %d in SCX_DSQ_LOCAL_ON verdict for %s[%d]",
> +				      cpu, p->comm, p->pid);
> +			return DTL_INVALID;
> +		}
> +		dst_rq = cpu_rq(cpu);
> +	} else {
> +		return DTL_NOT_LOCAL;
> +	}
> +
> +	/* if dispatching to @rq that @p is already on, no lock dancing needed */
> +	if (rq == src_rq && rq == dst_rq) {
> +		dispatch_enqueue(&dst_rq->scx.local_dsq, p,
> +				 enq_flags | SCX_ENQ_CLEAR_OPSS);
> +		return DTL_DISPATCHED;
> +	}
> +
> +#ifdef CONFIG_SMP
> +	if (cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) {
> +		struct rq *locked_dst_rq = dst_rq;
> +		bool dsp;
> +
> +		/*
> +		 * @p is on a possibly remote @src_rq which we need to lock to
> +		 * move the task. If dequeue is in progress, it'd be locking
> +		 * @src_rq and waiting on DISPATCHING, so we can't grab @src_rq
> +		 * lock while holding DISPATCHING.
> +		 *
> +		 * As DISPATCHING guarantees that @p is wholly ours, we can
> +		 * pretend that we're moving from a DSQ and use the same
> +		 * mechanism - mark the task under transfer with holding_cpu,
> +		 * release DISPATCHING and then follow the same protocol.
> +		 */
> +		p->scx.holding_cpu = raw_smp_processor_id();
> +
> +		/* store_release ensures that dequeue sees the above */
> +		atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
> +
> +		dispatch_to_local_dsq_lock(rq, rf, src_rq, locked_dst_rq);
> +
> +		/*
> +		 * We don't require the BPF scheduler to avoid dispatching to
> +		 * offline CPUs mostly for convenience but also because CPUs can
> +		 * go offline between scx_bpf_dispatch() calls and here. If @p
> +		 * is destined to an offline CPU, queue it on its current CPU
> +		 * instead, which should always be safe. As this is an allowed
> +		 * behavior, don't trigger an ops error.
> +		 */
> +		if (unlikely(!test_rq_online(dst_rq)))
> +			dst_rq = src_rq;
> +
> +		if (src_rq == dst_rq) {
> +			/*
> +			 * As @p is staying on the same rq, there's no need to
> +			 * go through the full deactivate/activate cycle.
> +			 * Optimize by abbreviating the operations in
> +			 * move_task_to_local_dsq().
> +			 */
> +			dsp = p->scx.holding_cpu == raw_smp_processor_id();
> +			if (likely(dsp)) {
> +				p->scx.holding_cpu = -1;
> +				dispatch_enqueue(&dst_rq->scx.local_dsq, p,
> +						 enq_flags);
> +			}
> +		} else {
> +			dsp = move_task_to_local_dsq(dst_rq, p, enq_flags);
> +		}
> +
> +		/* if the destination CPU is idle, wake it up */
> +		if (dsp && p->sched_class > dst_rq->curr->sched_class)
> +			resched_curr(dst_rq);
> +
> +		dispatch_to_local_dsq_unlock(rq, rf, src_rq, locked_dst_rq);
> +
> +		return dsp ? DTL_DISPATCHED : DTL_LOST;
> +	}
> +#endif /* CONFIG_SMP */
> +
> +	scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]",
> +		      cpu_of(dst_rq), p->comm, p->pid);
> +	return DTL_INVALID;
> +}
> +
> +/**
> + * finish_dispatch - Asynchronously finish dispatching a task
> + * @rq: current rq which is locked
> + * @rf: rq_flags to use when unlocking @rq
> + * @p: task to finish dispatching
> + * @qseq_at_dispatch: qseq when @p started getting dispatched
> + * @dsq_id: destination DSQ ID
> + * @enq_flags: %SCX_ENQ_*
> + *
> + * Dispatching to local DSQs may need to wait for queueing to complete or
> + * require rq lock dancing. As we don't wanna do either while inside
> + * ops.dispatch() to avoid locking order inversion, we split dispatching into
> + * two parts. scx_bpf_dispatch() which is called by ops.dispatch() records the
> + * task and its qseq. Once ops.dispatch() returns, this function is called to
> + * finish up.
> + *
> + * There is no guarantee that @p is still valid for dispatching or even that it
> + * was valid in the first place. Make sure that the task is still owned by the
> + * BPF scheduler and claim the ownership before dispatching.
> + */
> +static void finish_dispatch(struct rq *rq, struct rq_flags *rf,
> +			    struct task_struct *p,
> +			    unsigned long qseq_at_dispatch,
> +			    u64 dsq_id, u64 enq_flags)
> +{
> +	struct scx_dispatch_q *dsq;
> +	unsigned long opss;
> +
> +retry:
> +	/*
> +	 * No need for _acquire here. @p is accessed only after a successful
> +	 * try_cmpxchg to DISPATCHING.
> +	 */
> +	opss = atomic_long_read(&p->scx.ops_state);
> +
> +	switch (opss & SCX_OPSS_STATE_MASK) {
> +	case SCX_OPSS_DISPATCHING:
> +	case SCX_OPSS_NONE:
> +		/* someone else already got to it */
> +		return;
> +	case SCX_OPSS_QUEUED:
> +		/*
> +		 * If qseq doesn't match, @p has gone through at least one
> +		 * dispatch/dequeue and re-enqueue cycle between
> +		 * scx_bpf_dispatch() and here and we have no claim on it.
> +		 */
> +		if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch)
> +			return;
> +
> +		/*
> +		 * While we know @p is accessible, we don't yet have a claim on
> +		 * it - the BPF scheduler is allowed to dispatch tasks
> +		 * spuriously and there can be a racing dequeue attempt. Let's
> +		 * claim @p by atomically transitioning it from QUEUED to
> +		 * DISPATCHING.
> +		 */
> +		if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
> +						   SCX_OPSS_DISPATCHING)))
> +			break;
> +		goto retry;
> +	case SCX_OPSS_QUEUEING:
> +		/*
> +		 * do_enqueue_task() is in the process of transferring the task
> +		 * to the BPF scheduler while holding @p's rq lock. As we aren't
> +		 * holding any kernel or BPF resource that the enqueue path may
> +		 * depend upon, it's safe to wait.
> +		 */
> +		wait_ops_state(p, opss);
> +		goto retry;
> +	}
> +
> +	BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED));
> +
> +	switch (dispatch_to_local_dsq(rq, rf, dsq_id, p, enq_flags)) {
> +	case DTL_DISPATCHED:
> +		break;
> +	case DTL_LOST:
> +		break;
> +	case DTL_INVALID:
> +		dsq_id = SCX_DSQ_GLOBAL;
> +		fallthrough;
> +	case DTL_NOT_LOCAL:
> +		dsq = find_dsq_for_dispatch(cpu_rq(raw_smp_processor_id()),
> +					    dsq_id, p);
> +		dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
> +		break;
> +	}
> +}
> +
> +static void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf)
> +{
> +	struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx);
> +	u32 u;
> +
> +	for (u = 0; u < dspc->buf_cursor; u++) {
> +		struct scx_dsp_buf_ent *ent = &this_cpu_ptr(scx_dsp_buf)[u];
> +
> +		finish_dispatch(rq, rf, ent->task, ent->qseq, ent->dsq_id,
> +				ent->enq_flags);
> +	}
> +
> +	dspc->nr_tasks += dspc->buf_cursor;
> +	dspc->buf_cursor = 0;
> +}
> +
> +static int balance_scx(struct rq *rq, struct task_struct *prev,
> +		       struct rq_flags *rf)
> +{
> +	struct scx_rq *scx_rq = &rq->scx;
> +	struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx);
> +	bool prev_on_scx = prev->sched_class == &ext_sched_class;
> +
> +	lockdep_assert_rq_held(rq);
> +
> +	if (prev_on_scx) {
> +		WARN_ON_ONCE(prev->scx.flags & SCX_TASK_BAL_KEEP);
> +		update_curr_scx(rq);
> +
> +		/*
> +		 * If @prev is runnable & has slice left, it has priority and
> +		 * fetching more just increases latency for the fetched tasks.
> +		 * Tell put_prev_task_scx() to put @prev on local_dsq.
> +		 *
> +		 * See scx_ops_disable_workfn() for the explanation on the
> +		 * disabling() test.
> +		 */
> +		if ((prev->scx.flags & SCX_TASK_QUEUED) &&
> +		    prev->scx.slice && !scx_ops_disabling()) {
> +			prev->scx.flags |= SCX_TASK_BAL_KEEP;
> +			return 1;
> +		}
> +	}
> +
> +	/* if there already are tasks to run, nothing to do */
> +	if (scx_rq->local_dsq.nr)
> +		return 1;
> +
> +	if (consume_dispatch_q(rq, rf, &scx_dsq_global))
> +		return 1;
> +
> +	if (!SCX_HAS_OP(dispatch))
> +		return 0;
> +
> +	dspc->rq = rq;
> +	dspc->rf = rf;
> +
> +	/*
> +	 * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock,
> +	 * the local DSQ might still end up empty after a successful
> +	 * ops.dispatch(). If the local DSQ is empty even after ops.dispatch()
> +	 * produced some tasks, retry. The BPF scheduler may depend on this
> +	 * looping behavior to simplify its implementation.
> +	 */
> +	do {
> +		dspc->nr_tasks = 0;
> +
> +		SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq),
> +			    prev_on_scx ? prev : NULL);
> +
> +		flush_dispatch_buf(rq, rf);
> +
> +		if (scx_rq->local_dsq.nr)
> +			return 1;
> +		if (consume_dispatch_q(rq, rf, &scx_dsq_global))
> +			return 1;
> +	} while (dspc->nr_tasks);
> +
> +	return 0;
> +}
> +
> +static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
> +{
> +	if (p->scx.flags & SCX_TASK_QUEUED) {
> +		WARN_ON_ONCE(atomic64_read(&p->scx.ops_state) != SCX_OPSS_NONE);

Ditto. Even if this line is replaced later by
"[PATCH 31/36] sched_ext: Implement core-sched support"

> +		dispatch_dequeue(&rq->scx, p);
> +	}
> +
> +	p->se.exec_start = rq_clock_task(rq);
> +}

Thanks,
-Andrea

next prev parent reply	other threads:[~2023-11-23  8:08 UTC|newest]

Thread overview: 54+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-11-11  2:47 [PATCHSET v5] sched: Implement BPF extensible scheduler class Tejun Heo
2023-11-11  2:47 ` [PATCH 01/36] cgroup: Implement cgroup_show_cftypes() Tejun Heo
2023-11-11  2:47 ` [PATCH 02/36] sched: Restructure sched_class order sanity checks in sched_init() Tejun Heo
2023-11-11  2:47 ` [PATCH 03/36] sched: Allow sched_cgroup_fork() to fail and introduce sched_cancel_fork() Tejun Heo
2023-11-11  2:47 ` [PATCH 04/36] sched: Add sched_class->reweight_task() Tejun Heo
2023-11-11  2:47 ` [PATCH 05/36] sched: Add sched_class->switching_to() and expose check_class_changing/changed() Tejun Heo
2023-11-11  2:47 ` [PATCH 06/36] sched: Factor out cgroup weight conversion functions Tejun Heo
2023-11-11  2:47 ` [PATCH 07/36] sched: Expose css_tg() and __setscheduler_prio() Tejun Heo
2023-11-11  2:47 ` [PATCH 08/36] sched: Enumerate CPU cgroup file types Tejun Heo
2023-11-11  2:47 ` [PATCH 09/36] sched: Add @reason to sched_class->rq_{on|off}line() Tejun Heo
2023-11-11  2:47 ` [PATCH 10/36] sched: Add normal_policy() Tejun Heo
2023-11-11  2:47 ` [PATCH 11/36] sched_ext: Add boilerplate for extensible scheduler class Tejun Heo
2023-11-11  2:47 ` [PATCH 12/36] sched_ext: Implement BPF " Tejun Heo
2023-11-13 13:34   ` Changwoo Min
2023-11-14 19:07     ` Tejun Heo
2023-11-13 20:04   ` Andrea Righi
2023-11-14 19:07     ` Tejun Heo
2023-11-23  8:07   ` Andrea Righi [this message]
2023-11-25 19:59     ` Tejun Heo
2023-11-26  9:05       ` Andrea Righi
2023-12-07  2:04   ` [PATCH] scx: set p->scx.ops_state using atomic_long_set_release Changwoo Min
2023-12-08  0:16     ` Tejun Heo
2024-03-23  2:37   ` [PATCH 12/36] sched_ext: Implement BPF extensible scheduler class Joel Fernandes
2024-03-23 22:12     ` Tejun Heo
2024-04-25 21:28       ` Joel Fernandes
2024-04-26 16:57         ` Barret Rhoden
2024-04-26 21:58         ` Tejun Heo
2023-11-11  2:47 ` [PATCH 13/36] sched_ext: Add scx_simple and scx_example_qmap example schedulers Tejun Heo
2023-11-12  4:17   ` kernel test robot
2023-11-12 18:06     ` Tejun Heo
2023-11-11  2:47 ` [PATCH 14/36] sched_ext: Add sysrq-S which disables the BPF scheduler Tejun Heo
2023-11-11  2:47 ` [PATCH 15/36] sched_ext: Implement runnable task stall watchdog Tejun Heo
2023-11-11  2:47 ` [PATCH 16/36] sched_ext: Allow BPF schedulers to disallow specific tasks from joining SCHED_EXT Tejun Heo
2023-11-11  2:47 ` [PATCH 17/36] sched_ext: Allow BPF schedulers to switch all eligible tasks into sched_ext Tejun Heo
2023-11-11  2:47 ` [PATCH 18/36] sched_ext: Print sched_ext info when dumping stack Tejun Heo
2023-11-14 19:23   ` [PATCH v2 " Tejun Heo
2023-11-11  2:47 ` [PATCH 19/36] sched_ext: Implement scx_bpf_kick_cpu() and task preemption support Tejun Heo
2023-11-11  2:47 ` [PATCH 20/36] sched_ext: Add a central scheduler which makes all scheduling decisions on one CPU Tejun Heo
2023-11-11  2:47 ` [PATCH 21/36] sched_ext: Make watchdog handle ops.dispatch() looping stall Tejun Heo
2023-11-11  2:47 ` [PATCH 22/36] sched_ext: Add task state tracking operations Tejun Heo
2023-11-11  2:47 ` [PATCH 23/36] sched_ext: Implement tickless support Tejun Heo
2023-11-11  2:47 ` [PATCH 24/36] sched_ext: Track tasks that are subjects of the in-flight SCX operation Tejun Heo
2023-11-11  2:47 ` [PATCH 25/36] sched_ext: Add cgroup support Tejun Heo
2023-11-11  2:47 ` [PATCH 26/36] sched_ext: Add a cgroup-based core-scheduling scheduler Tejun Heo
2023-11-11  2:47 ` [PATCH 27/36] sched_ext: Add a cgroup scheduler which uses flattened hierarchy Tejun Heo
2023-11-11  2:47 ` [PATCH 28/36] sched_ext: Implement SCX_KICK_WAIT Tejun Heo
2023-11-11  2:47 ` [PATCH 29/36] sched_ext: Implement sched_ext_ops.cpu_acquire/release() Tejun Heo
2023-11-11  2:47 ` [PATCH 30/36] sched_ext: Implement sched_ext_ops.cpu_online/offline() Tejun Heo
2023-11-11  2:47 ` [PATCH 31/36] sched_ext: Implement core-sched support Tejun Heo
2023-11-11  2:47 ` [PATCH 32/36] sched_ext: Add vtime-ordered priority queue to dispatch_q's Tejun Heo
2023-11-11  2:47 ` [PATCH 33/36] sched_ext: Documentation: scheduler: Document extensible scheduler class Tejun Heo
2023-11-11  2:48 ` [PATCH 34/36] sched_ext: Add a basic, userland vruntime scheduler Tejun Heo
2023-11-11  2:48 ` [PATCH 35/36] sched_ext: Add scx_rusty, a rust userspace hybrid scheduler Tejun Heo
2023-11-11  2:48 ` [PATCH 36/36] sched_ext: Add scx_layered, a highly configurable multi-layer scheduler Tejun Heo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ZV8IR/w4IaxJ2vPA@gpd \
    --to=andrea.righi@canonical.com \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=brho@google.com \
    --cc=bristot@redhat.com \
    --cc=bsegall@google.com \
    --cc=changwoo@igalia.com \
    --cc=daniel@iogearbox.net \
    --cc=derkling@google.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=dschatzberg@meta.com \
    --cc=dskarlat@cs.cmu.edu \
    --cc=dvernet@meta.com \
    --cc=haoluo@google.com \
    --cc=himadrics@inria.fr \
    --cc=joshdon@google.com \
    --cc=juri.lelli@redhat.com \
    --cc=kernel-team@meta.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=martin.lau@kernel.org \
    --cc=memxor@gmail.com \
    --cc=mgorman@suse.de \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=pjt@google.com \
    --cc=riel@surriel.com \
    --cc=rostedt@goodmis.org \
    --cc=tj@kernel.org \
    --cc=torvalds@linux-foundation.org \
    --cc=vincent.guittot@linaro.org \
    --cc=vschneid@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.