All of lore.kernel.org
 help / color / mirror / Atom feed
From: Andrea Righi <arighi@nvidia.com>
To: Tejun Heo <tj@kernel.org>
Cc: Phil Auld <pauld@redhat.com>, David Vernet <void@manifault.com>,
	Changwoo Min <changwoo@igalia.com>,
	sched-ext@lists.linux.dev
Subject: Re: [PATCH v2] sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc()
Date: Thu, 9 Oct 2025 00:24:16 +0200	[thread overview]
Message-ID: <aObkkKf_UCHySRGz@gpd4> (raw)
In-Reply-To: <aObcIMFpUoMy7vJI@slm.duckdns.org>

Hi Tejun,

On Wed, Oct 08, 2025 at 11:48:16AM -1000, Tejun Heo wrote:
> On systems with >4096 CPUs, scx_kick_cpus_pnt_seqs allocation fails during
> boot because it exceeds the 32,768 byte percpu allocator limit.
> 
> Restructure to use DEFINE_PER_CPU() for the per-CPU pointers, with each CPU
> pointing to its own kvzalloc'd array. Move allocation from boot time to
> scx_enable() and free in scx_disable(), so the O(nr_cpu_ids^2) memory is only
> consumed when sched_ext is active.
> 
> Use RCU to guard against racing with free. Arrays are freed via call_rcu()
> and kick_cpus_irq_workfn() uses rcu_dereference_bh() with a NULL check.
> 
> While at it, rename to scx_kick_pseqs for brevity and update comments to
> clarify these are pick_task sequence numbers.
> 
> Reported-by: Phil Auld <pauld@redhat.com>
> Link: http://lkml.kernel.org/r/20251007133523.GA93086@pauld.westford.csb
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
>  kernel/sched/ext.c | 88 ++++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 78 insertions(+), 10 deletions(-)
> 
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 2b0e88206d07..217c80d0105c 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -67,8 +67,19 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
>  
>  static struct delayed_work scx_watchdog_work;
>  
> -/* for %SCX_KICK_WAIT */
> -static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
> +/*
> + * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of pick_task sequence
> + * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu
> + * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated
> + * lazily when enabling and freed when disabling to avoid waste when sched_ext
> + * isn't active.
> + */
> +struct scx_kick_pseqs {
> +	struct rcu_head		rcu;
> +	unsigned long		seqs[];
> +};
> +
> +static DEFINE_PER_CPU(struct scx_kick_pseqs __rcu *, scx_kick_pseqs);
>  
>  /*
>   * Direct dispatch marker.
> @@ -3850,6 +3861,25 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
>  	}
>  }
>  
> +static void free_kick_pseqs_rcu(struct rcu_head *rcu)
> +{
> +	struct scx_kick_pseqs *pseqs = container_of(rcu, struct scx_kick_pseqs, rcu);
> +
> +	kvfree(pseqs);
> +}
> +
> +static void free_kick_pseqs(void)
> +{
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu);
> +
> +		call_rcu(&(*pseqs)->rcu, free_kick_pseqs_rcu);
> +		RCU_INIT_POINTER(*pseqs, NULL);

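Is this safe? With call_rcu() queued before the per-CPU pointer is cleared,
a reader could still pick up the old pointer after the free has already been
scheduled, and *pseqs may also be NULL here. I think we should replace the
pointer first and then schedule the free via call_rcu(), like:

	old = rcu_replace_pointer(*pseqs, NULL, true);
	if (old)
		call_rcu(&old->rcu, free_kick_pseqs_rcu);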

> +	}
> +}
> +
>  static void scx_disable_workfn(struct kthread_work *work)
>  {
>  	struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
> @@ -3986,6 +4016,7 @@ static void scx_disable_workfn(struct kthread_work *work)
>  	free_percpu(scx_dsp_ctx);
>  	scx_dsp_ctx = NULL;
>  	scx_dsp_max_batch = 0;
> +	free_kick_pseqs();
>  
>  	mutex_unlock(&scx_enable_mutex);
>  
> @@ -4348,6 +4379,34 @@ static void scx_vexit(struct scx_sched *sch,
>  	irq_work_queue(&sch->error_irq_work);
>  }
>  
> +static int alloc_kick_pseqs(void)
> +{
> +	int cpu;
> +
> +	/*
> +	 * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size
> +	 * can exceed percpu allocator limits on large machines.
> +	 */
> +	for_each_possible_cpu(cpu) {
> +		struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu);
> +		struct scx_kick_pseqs *new_pseqs;
> +
> +

nit: extra newline.

> +		WARN_ON_ONCE(rcu_access_pointer(*pseqs));
> +
> +		new_pseqs = kvzalloc_node(sizeof(unsigned long) * nr_cpu_ids,
> +					  GFP_KERNEL, cpu_to_node(cpu));

Don't we need to allocate space for the struct itself (the rcu_head that
precedes seqs[]) as well, and not just the array? This should be something
like:

	new_pseqs = kvzalloc_node(struct_size(new_pseqs, seqs, nr_cpu_ids),
				  GFP_KERNEL, cpu_to_node(cpu));

> +		if (!new_pseqs) {
> +			free_kick_pseqs();
> +			return -ENOMEM;
> +		}
> +
> +		rcu_assign_pointer(*pseqs, new_pseqs);
> +	}
> +
> +	return 0;
> +}
> +
>  static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
>  {
>  	struct scx_sched *sch;
> @@ -4490,15 +4549,19 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>  
>  	mutex_lock(&scx_enable_mutex);
>  
> +	ret = alloc_kick_pseqs();
> +	if (ret)
> +		goto err_unlock;
> +
>  	if (scx_enable_state() != SCX_DISABLED) {
>  		ret = -EBUSY;
> -		goto err_unlock;
> +		goto err_free_pseqs;
>  	}
>  
>  	sch = scx_alloc_and_add_sched(ops);
>  	if (IS_ERR(sch)) {
>  		ret = PTR_ERR(sch);
> -		goto err_unlock;
> +		goto err_free_pseqs;
>  	}
>  
>  	/*
> @@ -4701,6 +4764,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>  
>  	return 0;
>  
> +err_free_pseqs:
> +	free_kick_pseqs();
>  err_unlock:
>  	mutex_unlock(&scx_enable_mutex);
>  	return ret;
> @@ -5082,10 +5147,18 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
>  {
>  	struct rq *this_rq = this_rq();
>  	struct scx_rq *this_scx = &this_rq->scx;
> -	unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs);
> +	struct scx_kick_pseqs __rcu *pseqs_pcpu = __this_cpu_read(scx_kick_pseqs);
>  	bool should_wait = false;
> +	unsigned long *pseqs;
>  	s32 cpu;
>  
> +	if (unlikely(!pseqs_pcpu)) {
> +		pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_pseqs");
> +		return;
> +	}
> +
> +	pseqs = rcu_dereference_bh(pseqs_pcpu)->seqs;
> +
>  	for_each_cpu(cpu, this_scx->cpus_to_kick) {
>  		should_wait |= kick_one_cpu(cpu, this_rq, pseqs);
>  		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
> @@ -5208,11 +5281,6 @@ void __init init_sched_ext_class(void)
>  
>  	scx_idle_init_masks();
>  
> -	scx_kick_cpus_pnt_seqs =
> -		__alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
> -			       __alignof__(scx_kick_cpus_pnt_seqs[0]));
> -	BUG_ON(!scx_kick_cpus_pnt_seqs);
> -
>  	for_each_possible_cpu(cpu) {
>  		struct rq *rq = cpu_rq(cpu);
>  		int  n = cpu_to_node(cpu);
> -- 
> 2.51.0
> 

Thanks,
-Andrea

  reply	other threads:[~2025-10-08 22:24 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-10-07 13:35 sched_ext and large cpu counts Phil Auld
2025-10-08  2:37 ` Tejun Heo
2025-10-08  6:10   ` Andrea Righi
2025-10-08 20:53     ` Tejun Heo
2025-10-08 21:48       ` [PATCH v2] sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc() Tejun Heo
2025-10-08 22:24         ` Andrea Righi [this message]
2025-10-08 23:36           ` Tejun Heo
2025-10-08 23:38             ` Tejun Heo
2025-10-08 23:43             ` [PATCH v3] " Tejun Heo
2025-10-09  6:43               ` Andrea Righi
2025-10-09 12:06               ` Phil Auld
2025-10-10 13:02                 ` Phil Auld
2025-10-09 13:58               ` Emil Tsalapatis
2025-10-13 18:44               ` Tejun Heo
2025-10-13 20:13                 ` Andrea Righi
2025-10-08 11:23   ` sched_ext and large cpu counts Phil Auld

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aObkkKf_UCHySRGz@gpd4 \
    --to=arighi@nvidia.com \
    --cc=changwoo@igalia.com \
    --cc=pauld@redhat.com \
    --cc=sched-ext@lists.linux.dev \
    --cc=tj@kernel.org \
    --cc=void@manifault.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.