From: Andrea Righi <arighi@nvidia.com>
To: Tejun Heo <tj@kernel.org>
Cc: David Vernet <void@manifault.com>,
	Changwoo Min <changwoo@igalia.com>,
	sched-ext@lists.linux.dev, Emil Tsalapatis <emil@etsalapatis.com>,
	linux-kernel@vger.kernel.org
Subject: Re: [PATCH sched_ext/for-7.2] sched_ext: add p->scx.tid and SCX_OPS_TID_TO_TASK lookup
Date: Sun, 19 Apr 2026 19:02:47 +0200
Message-ID: <aeUKt4Mv7NPZIrLW@gpd4>
In-Reply-To: <a19c8c74d7c767bc9865b75d6de7c723@kernel.org>

Hi Tejun,

On Sun, Apr 19, 2026 at 06:18:46AM -1000, Tejun Heo wrote:
> BPF schedulers that can't hold task_struct pointers (arena-backed ones in
> particular) key tasks by pid. During exit, pid is released before the
> task finishes passing through scheduler callbacks, so a dying task
> becomes invisible to the BPF side mid-schedule. scx_qmap hits this: an
> exiting task's dispatch callback can't recover its queue entry, stalling
> dispatch until SCX_EXIT_ERROR_STALL.
> 
> Add a unique non-zero u64 p->scx.tid assigned at fork that survives the
> full task lifetime including exit. scx_bpf_tid_to_task() looks up the
> task; unlike bpf_task_from_pid(), it handles exiting tasks.
> 
> The lookup costs an rhashtable insert/remove under scx_tasks_lock, so
> root schedulers opt in via SCX_OPS_TID_TO_TASK. Sub-schedulers that set
> the flag to declare a dependency are rejected at attach if root didn't
> opt in.
> 
> scx_qmap converted: keys tasks by tid and enables SCX_OPS_ENQ_EXITING.
> Pre-patch it stalls within seconds under a non-leader-exec workload;
> with the patch it runs cleanly.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
>  include/linux/sched/ext.h                |    9 +
>  kernel/sched/ext.c                       |  144 +++++++++++++++++++++++++++++--
>  kernel/sched/ext_internal.h              |   20 +++-
>  tools/sched_ext/include/scx/common.bpf.h |    1 
>  tools/sched_ext/scx_qmap.bpf.c           |   13 +-
>  5 files changed, 170 insertions(+), 17 deletions(-)
> 
> --- a/include/linux/sched/ext.h
> +++ b/include/linux/sched/ext.h
> @@ -203,6 +203,15 @@ struct sched_ext_entity {
>  	u64			core_sched_at;	/* see scx_prio_less() */
>  #endif
>  
> +	/*
> +	 * Unique non-zero task ID assigned at fork. Persists across exec and
> +	 * is never reused. Lets BPF schedulers identify tasks without storing
> +	 * kernel pointers - arena-backed schedulers being one example. See
> +	 * scx_bpf_tid_to_task().
> +	 */
> +	u64			tid;
> +	struct rhash_head	tid_hash_node;	/* see SCX_OPS_TID_TO_TASK */
> +
>  	/* BPF scheduler modifiable fields */
>  
>  	/*
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -38,6 +38,15 @@ static const struct rhashtable_params sc
>  static struct rhashtable scx_sched_hash;
>  #endif
>  
> +/* see SCX_OPS_TID_TO_TASK */
> +static const struct rhashtable_params scx_tid_hash_params = {
> +	.key_len		= sizeof_field(struct sched_ext_entity, tid),
> +	.key_offset		= offsetof(struct sched_ext_entity, tid),
> +	.head_offset		= offsetof(struct sched_ext_entity, tid_hash_node),
> +	.insecure_elasticity	= true,	/* inserted/removed under scx_tasks_lock */
> +};
> +static struct rhashtable scx_tid_hash;
> +
>  /*
>   * During exit, a task may schedule after losing its PIDs. When disabling the
>   * BPF scheduler, we need to be able to iterate tasks in every state to
> @@ -58,10 +67,25 @@ static cpumask_var_t scx_bypass_lb_resch
>  static bool scx_init_task_enabled;
>  static bool scx_switching_all;
>  DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
> +static DEFINE_STATIC_KEY_FALSE(__scx_tid_to_task_enabled);
> +
> +/*
> + * True once SCX_OPS_TID_TO_TASK has been negotiated with the root scheduler
> + * and the tid->task table is live. Wraps the static key so callers don't
> + * take the address, and hints "likely enabled" for the common case where
> + * the feature is in use.
> + */
> +static inline bool scx_tid_to_task_enabled(void)
> +{
> +	return static_branch_likely(&__scx_tid_to_task_enabled);
> +}
>  
>  static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
>  static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
>  
> +/* Global cursor for the per-CPU tid allocator. Starts at 1; tid 0 is reserved. */
> +static atomic64_t scx_tid_cursor = ATOMIC64_INIT(1);
> +
>  #ifdef CONFIG_EXT_SUB_SCHED
>  /*
>   * The sub sched being enabled. Used by scx_disable_and_exit_task() to exit
> @@ -111,6 +135,17 @@ struct scx_kick_syncs {
>  static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs);
>  
>  /*
> + * Per-CPU buffered allocator state for p->scx.tid. Each CPU pulls a chunk of
> + * SCX_TID_CHUNK ids from scx_tid_cursor and hands them out locally without
> + * further synchronization. See scx_alloc_tid().
> + */
> +struct scx_tid_alloc {
> +	u64	next;
> +	u64	end;
> +};
> +static DEFINE_PER_CPU(struct scx_tid_alloc, scx_tid_alloc);
> +
> +/*
>   * Direct dispatch marker.
>   *
>   * Non-NULL values are used for direct dispatch from enqueue path. A valid
> @@ -3665,6 +3700,21 @@ void init_scx_entity(struct sched_ext_en
>  	scx->slice = SCX_SLICE_DFL;
>  }
>  
> +/* See scx_tid_alloc / scx_tid_cursor. */
> +static u64 scx_alloc_tid(void)
> +{
> +	struct scx_tid_alloc *ta;
> +
> +	guard(preempt)();
> +	ta = this_cpu_ptr(&scx_tid_alloc);
> +
> +	if (unlikely(ta->next >= ta->end)) {
> +		ta->next = atomic64_fetch_add(SCX_TID_CHUNK, &scx_tid_cursor);
> +		ta->end = ta->next + SCX_TID_CHUNK;
> +	}
> +	return ta->next++;
> +}
> +
>  void scx_pre_fork(struct task_struct *p)
>  {
>  	/*
> @@ -3682,6 +3732,8 @@ int scx_fork(struct task_struct *p, stru
>  
>  	percpu_rwsem_assert_held(&scx_fork_rwsem);
>  
> +	p->scx.tid = scx_alloc_tid();
> +
>  	if (scx_init_task_enabled) {
>  #ifdef CONFIG_EXT_SUB_SCHED
>  		struct scx_sched *sch = kargs->cset->dfl_cgrp->scx_sched;
> @@ -3717,9 +3769,13 @@ void scx_post_fork(struct task_struct *p
>  		}
>  	}
>  
> -	raw_spin_lock_irq(&scx_tasks_lock);
> -	list_add_tail(&p->scx.tasks_node, &scx_tasks);
> -	raw_spin_unlock_irq(&scx_tasks_lock);
> +	scoped_guard(raw_spinlock_irq, &scx_tasks_lock) {
> +		list_add_tail(&p->scx.tasks_node, &scx_tasks);
> +		if (scx_tid_to_task_enabled())
> +			rhashtable_lookup_insert_fast(&scx_tid_hash,
> +						      &p->scx.tid_hash_node,
> +						      scx_tid_hash_params);
> +	}
>  
>  	percpu_up_read(&scx_fork_rwsem);
>  }
> @@ -3770,17 +3826,19 @@ static bool task_dead_and_done(struct ta
>  
>  void sched_ext_dead(struct task_struct *p)
>  {
> -	unsigned long flags;
> -
>  	/*
>  	 * By the time control reaches here, @p has %TASK_DEAD set, switched out
>  	 * for the last time and then dropped the rq lock - task_dead_and_done()
>  	 * should be returning %true nullifying the straggling sched_class ops.
>  	 * Remove from scx_tasks and exit @p.
>  	 */
> -	raw_spin_lock_irqsave(&scx_tasks_lock, flags);
> -	list_del_init(&p->scx.tasks_node);
> -	raw_spin_unlock_irqrestore(&scx_tasks_lock, flags);
> +	scoped_guard(raw_spinlock_irqsave, &scx_tasks_lock) {
> +		list_del_init(&p->scx.tasks_node);
> +		if (scx_tid_to_task_enabled())
> +			rhashtable_remove_fast(&scx_tid_hash,
> +					       &p->scx.tid_hash_node,
> +					       scx_tid_hash_params);
> +	}
>  
>  	/*
>  	 * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY ->
> @@ -5794,9 +5852,13 @@ static void scx_root_disable(struct scx_
>  
>  	/* no task is on scx, turn off all the switches and flush in-progress calls */
>  	static_branch_disable(&__scx_enabled);
> +	if (sch->ops.flags & SCX_OPS_TID_TO_TASK)
> +		static_branch_disable(&__scx_tid_to_task_enabled);
>  	bitmap_zero(sch->has_op, SCX_OPI_END);
>  	scx_idle_disable();
>  	synchronize_rcu();
> +	if (sch->ops.flags & SCX_OPS_TID_TO_TASK)
> +		rhashtable_free_and_destroy(&scx_tid_hash, NULL, NULL);

IIUC we don't unlink the per-element nodes here, we just free the whole
bucket storage, right? So for live tasks the tid_hash_node embedded in
task_struct may still look linked, leaving potentially stale state behind.

I'm wondering if we should have a teardown function, called before disabling
SCX_OPS_TID_TO_TASK and destroying scx_tid_hash, to explicitly remove all the
scx_tid_hash entries via rhashtable_remove_fast().

Essentially the order of operations should be:
 1. disable all tasks on scx,
 2. drain all tid hash entries (rough sketch below),
 3. allow forks again, turn off static keys, synchronize_rcu(),
    rhashtable_free_and_destroy()
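
For step 2, something along these lines (completely untested, just to
illustrate the idea; scx_tid_hash_drain() is a made-up name):

	/* untested sketch: unlink every live task's tid entry before the table is destroyed */
	static void scx_tid_hash_drain(void)
	{
		struct task_struct *p;

		/* forks are still blocked at this point, so nothing new is being inserted */
		scoped_guard(raw_spinlock_irq, &scx_tasks_lock) {
			list_for_each_entry(p, &scx_tasks, scx.tasks_node)
				rhashtable_remove_fast(&scx_tid_hash,
						       &p->scx.tid_hash_node,
						       scx_tid_hash_params);
		}
	}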

Am I missing something?

Thanks,
-Andrea

Thread overview: 6+ messages
2026-04-19 16:18 [PATCH sched_ext/for-7.2] sched_ext: add p->scx.tid and SCX_OPS_TID_TO_TASK lookup Tejun Heo
2026-04-19 17:02 ` Andrea Righi [this message]
2026-04-19 18:12   ` Tejun Heo
2026-04-19 18:31     ` Andrea Righi
2026-04-19 17:23 ` Cheng-Yang Chou
2026-04-19 18:10 ` Cheng-Yang Chou
