Re: [PATCH sched_ext/for-7.2] sched_ext: add p->scx.tid and SCX_OPS_TID_TO_TASK lookup

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Andrea Righi <arighi@nvidia.com>
To: Tejun Heo <tj@kernel.org>
Cc: David Vernet <void@manifault.com>,
	Changwoo Min <changwoo@igalia.com>,
	sched-ext@lists.linux.dev, Emil Tsalapatis <emil@etsalapatis.com>,
	linux-kernel@vger.kernel.org
Subject: Re: [PATCH sched_ext/for-7.2] sched_ext: add p->scx.tid and SCX_OPS_TID_TO_TASK lookup
Date: Sun, 19 Apr 2026 19:02:47 +0200	[thread overview]
Message-ID: <aeUKt4Mv7NPZIrLW@gpd4> (raw)
In-Reply-To: <a19c8c74d7c767bc9865b75d6de7c723@kernel.org>

Hi Tejun,

On Sun, Apr 19, 2026 at 06:18:46AM -1000, Tejun Heo wrote:
> BPF schedulers that can't hold task_struct pointers (arena-backed ones in
> particular) key tasks by pid. During exit, pid is released before the
> task finishes passing through scheduler callbacks, so a dying task
> becomes invisible to the BPF side mid-schedule. scx_qmap hits this: an
> exiting task's dispatch callback can't recover its queue entry, stalling
> dispatch until SCX_EXIT_ERROR_STALL.
> 
> Add a unique non-zero u64 p->scx.tid assigned at fork that survives the
> full task lifetime including exit. scx_bpf_tid_to_task() looks up the
> task; unlike bpf_task_from_pid(), it handles exiting tasks.
> 
> The lookup costs an rhashtable insert/remove under scx_tasks_lock, so
> root schedulers opt in via SCX_OPS_TID_TO_TASK. Sub-schedulers that set
> the flag to declare a dependency are rejected at attach if root didn't
> opt in.
> 
> scx_qmap converted: keys tasks by tid and enables SCX_OPS_ENQ_EXITING.
> Pre-patch it stalls within seconds under a non-leader-exec workload;
> with the patch it runs cleanly.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
>  include/linux/sched/ext.h                |    9 +
>  kernel/sched/ext.c                       |  144 +++++++++++++++++++++++++++++--
>  kernel/sched/ext_internal.h              |   20 +++-
>  tools/sched_ext/include/scx/common.bpf.h |    1 
>  tools/sched_ext/scx_qmap.bpf.c           |   13 +-
>  5 files changed, 170 insertions(+), 17 deletions(-)
> 
> --- a/include/linux/sched/ext.h
> +++ b/include/linux/sched/ext.h
> @@ -203,6 +203,15 @@ struct sched_ext_entity {
>  	u64			core_sched_at;	/* see scx_prio_less() */
>  #endif
>  
> +	/*
> +	 * Unique non-zero task ID assigned at fork. Persists across exec and
> +	 * is never reused. Lets BPF schedulers identify tasks without storing
> +	 * kernel pointers - arena-backed schedulers being one example. See
> +	 * scx_bpf_tid_to_task().
> +	 */
> +	u64			tid;
> +	struct rhash_head	tid_hash_node;	/* see SCX_OPS_TID_TO_TASK */
> +
>  	/* BPF scheduler modifiable fields */
>  
>  	/*
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -38,6 +38,15 @@ static const struct rhashtable_params sc
>  static struct rhashtable scx_sched_hash;
>  #endif
>  
> +/* see SCX_OPS_TID_TO_TASK */
> +static const struct rhashtable_params scx_tid_hash_params = {
> +	.key_len		= sizeof_field(struct sched_ext_entity, tid),
> +	.key_offset		= offsetof(struct sched_ext_entity, tid),
> +	.head_offset		= offsetof(struct sched_ext_entity, tid_hash_node),
> +	.insecure_elasticity	= true,	/* inserted/removed under scx_tasks_lock */
> +};
> +static struct rhashtable scx_tid_hash;
> +
>  /*
>   * During exit, a task may schedule after losing its PIDs. When disabling the
>   * BPF scheduler, we need to be able to iterate tasks in every state to
> @@ -58,10 +67,25 @@ static cpumask_var_t scx_bypass_lb_resch
>  static bool scx_init_task_enabled;
>  static bool scx_switching_all;
>  DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
> +static DEFINE_STATIC_KEY_FALSE(__scx_tid_to_task_enabled);
> +
> +/*
> + * True once SCX_OPS_TID_TO_TASK has been negotiated with the root scheduler
> + * and the tid->task table is live. Wraps the static key so callers don't
> + * take the address, and hints "likely enabled" for the common case where
> + * the feature is in use.
> + */
> +static inline bool scx_tid_to_task_enabled(void)
> +{
> +	return static_branch_likely(&__scx_tid_to_task_enabled);
> +}
>  
>  static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
>  static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
>  
> +/* Global cursor for the per-CPU tid allocator. Starts at 1; tid 0 is reserved. */
> +static atomic64_t scx_tid_cursor = ATOMIC64_INIT(1);
> +
>  #ifdef CONFIG_EXT_SUB_SCHED
>  /*
>   * The sub sched being enabled. Used by scx_disable_and_exit_task() to exit
> @@ -111,6 +135,17 @@ struct scx_kick_syncs {
>  static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs);
>  
>  /*
> + * Per-CPU buffered allocator state for p->scx.tid. Each CPU pulls a chunk of
> + * SCX_TID_CHUNK ids from scx_tid_cursor and hands them out locally without
> + * further synchronization. See scx_alloc_tid().
> + */
> +struct scx_tid_alloc {
> +	u64	next;
> +	u64	end;
> +};
> +static DEFINE_PER_CPU(struct scx_tid_alloc, scx_tid_alloc);
> +
> +/*
>   * Direct dispatch marker.
>   *
>   * Non-NULL values are used for direct dispatch from enqueue path. A valid
> @@ -3665,6 +3700,21 @@ void init_scx_entity(struct sched_ext_en
>  	scx->slice = SCX_SLICE_DFL;
>  }
>  
> +/* See scx_tid_alloc / scx_tid_cursor. */
> +static u64 scx_alloc_tid(void)
> +{
> +	struct scx_tid_alloc *ta;
> +
> +	guard(preempt)();
> +	ta = this_cpu_ptr(&scx_tid_alloc);
> +
> +	if (unlikely(ta->next >= ta->end)) {
> +		ta->next = atomic64_fetch_add(SCX_TID_CHUNK, &scx_tid_cursor);
> +		ta->end = ta->next + SCX_TID_CHUNK;
> +	}
> +	return ta->next++;
> +}
> +
>  void scx_pre_fork(struct task_struct *p)
>  {
>  	/*
> @@ -3682,6 +3732,8 @@ int scx_fork(struct task_struct *p, stru
>  
>  	percpu_rwsem_assert_held(&scx_fork_rwsem);
>  
> +	p->scx.tid = scx_alloc_tid();
> +
>  	if (scx_init_task_enabled) {
>  #ifdef CONFIG_EXT_SUB_SCHED
>  		struct scx_sched *sch = kargs->cset->dfl_cgrp->scx_sched;
> @@ -3717,9 +3769,13 @@ void scx_post_fork(struct task_struct *p
>  		}
>  	}
>  
> -	raw_spin_lock_irq(&scx_tasks_lock);
> -	list_add_tail(&p->scx.tasks_node, &scx_tasks);
> -	raw_spin_unlock_irq(&scx_tasks_lock);
> +	scoped_guard(raw_spinlock_irq, &scx_tasks_lock) {
> +		list_add_tail(&p->scx.tasks_node, &scx_tasks);
> +		if (scx_tid_to_task_enabled())
> +			rhashtable_lookup_insert_fast(&scx_tid_hash,
> +						      &p->scx.tid_hash_node,
> +						      scx_tid_hash_params);
> +	}
>  
>  	percpu_up_read(&scx_fork_rwsem);
>  }
> @@ -3770,17 +3826,19 @@ static bool task_dead_and_done(struct ta
>  
>  void sched_ext_dead(struct task_struct *p)
>  {
> -	unsigned long flags;
> -
>  	/*
>  	 * By the time control reaches here, @p has %TASK_DEAD set, switched out
>  	 * for the last time and then dropped the rq lock - task_dead_and_done()
>  	 * should be returning %true nullifying the straggling sched_class ops.
>  	 * Remove from scx_tasks and exit @p.
>  	 */
> -	raw_spin_lock_irqsave(&scx_tasks_lock, flags);
> -	list_del_init(&p->scx.tasks_node);
> -	raw_spin_unlock_irqrestore(&scx_tasks_lock, flags);
> +	scoped_guard(raw_spinlock_irqsave, &scx_tasks_lock) {
> +		list_del_init(&p->scx.tasks_node);
> +		if (scx_tid_to_task_enabled())
> +			rhashtable_remove_fast(&scx_tid_hash,
> +					       &p->scx.tid_hash_node,
> +					       scx_tid_hash_params);
> +	}
>  
>  	/*
>  	 * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY ->
> @@ -5794,9 +5852,13 @@ static void scx_root_disable(struct scx_
>  
>  	/* no task is on scx, turn off all the switches and flush in-progress calls */
>  	static_branch_disable(&__scx_enabled);
> +	if (sch->ops.flags & SCX_OPS_TID_TO_TASK)
> +		static_branch_disable(&__scx_tid_to_task_enabled);
>  	bitmap_zero(sch->has_op, SCX_OPI_END);
>  	scx_idle_disable();
>  	synchronize_rcu();
> +	if (sch->ops.flags & SCX_OPS_TID_TO_TASK)
> +		rhashtable_free_and_destroy(&scx_tid_hash, NULL, NULL);

IIUC we don't unlink per-element nodes here, but we just free the whole bucket
storage, right? So, nodes may still be chained in task_struct for live tasks
(leaving a potential stale state).

I'm wondering if we should have a teardown function, called before disabling
SCX_OPS_TID_TO_TASK and destroying scx_tid_hash, to explicitly remove all the
scx_tid_hash entries via rhashtable_remove_fast().

Essentially the order of operation should be:
 1. disable all tasks on scx,
 2. drain all tid hash entries,
 3. allow forks again, turn off static keys, synchronize_rcu(),
    rhashtable_free_and_destroy()

Am I missing something?

Thanks,
-Andrea

next prev parent reply	other threads:[~2026-04-19 17:02 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-19 16:18 [PATCH sched_ext/for-7.2] sched_ext: add p->scx.tid and SCX_OPS_TID_TO_TASK lookup Tejun Heo
2026-04-19 17:02 ` Andrea Righi [this message]
2026-04-19 18:12   ` Tejun Heo
2026-04-19 18:31     ` Andrea Righi
2026-04-19 17:23 ` Cheng-Yang Chou
2026-04-19 18:10 ` Cheng-Yang Chou

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aeUKt4Mv7NPZIrLW@gpd4 \
    --to=arighi@nvidia.com \
    --cc=changwoo@igalia.com \
    --cc=emil@etsalapatis.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=sched-ext@lists.linux.dev \
    --cc=tj@kernel.org \
    --cc=void@manifault.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.