All of lore.kernel.org
 help / color / mirror / Atom feed
From: Andrea Righi <arighi@nvidia.com>
To: Tejun Heo <tj@kernel.org>
Cc: Phil Auld <pauld@redhat.com>, David Vernet <void@manifault.com>,
	Changwoo Min <changwoo@igalia.com>,
	sched-ext@lists.linux.dev
Subject: Re: sched_ext and large cpu counts
Date: Wed, 8 Oct 2025 08:10:31 +0200	[thread overview]
Message-ID: <aOYAV2ZQvMt5aZxu@gpd4> (raw)
In-Reply-To: <d1c0eda5206b01fd670e148bd5ea0d56@kernel.org>

Hi Tejun and Phil,

On Tue, Oct 07, 2025 at 04:37:24PM -1000, Tejun Heo wrote:
> Hello,
> 
> Can you please see whether the following patch resolves the problem?
> 
> Thanks.
> 
> --
> tejun
> 
> ----- 8< -----
> From 4d7f7d24e90fba47bb08ddbeb8668123b4bbab1b Mon Sep 17 00:00:00 2001
> From: Tejun Heo <tj@kernel.org>
> Date: Tue, 7 Oct 2025 16:23:43 -1000
> Subject: [PATCH] sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using
>  kvzalloc()
> 
> On systems with >4096 CPUs, scx_kick_cpus_pnt_seqs allocation fails during
> boot because it exceeds the 32,768 byte percpu allocator limit. The allocation
> size is sizeof(unsigned long) * nr_cpu_ids, which becomes 33,792 bytes with
> 4224 CPUs.
> 
> Restructure scx_kick_cpus_pnt_seqs to use DEFINE_PER_CPU() for the per-CPU
> pointers, with each CPU pointing to its own kvzalloc'd array. This avoids
> percpu allocator size limits. Additionally, move allocation from boot time to
> scx_enable() and free in scx_disable(), so the O(nr_cpu_ids^2) memory is only
> consumed when sched_ext is active.
> 
> Reported-by: Phil Auld <pauld@redhat.com>
> Link: http://lkml.kernel.org/r/20251007133523.GA93086@pauld.westford.csb
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
>  kernel/sched/ext.c | 59 ++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 49 insertions(+), 10 deletions(-)
> 
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 2b0e88206d07..042fc73fb141 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -67,8 +67,13 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
>  
>  static struct delayed_work scx_watchdog_work;
>  
> -/* for %SCX_KICK_WAIT */
> -static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
> +/*
> + * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of sequence numbers.
> + * The arrays are allocated with kvzalloc() as size can exceed percpu allocator
> + * limits on large machines. O(nr_cpu_ids^2) allocation, allocated lazily when
> + * enabling and freed when disabling to avoid waste when sched_ext isn't active.
> + */
> +static DEFINE_PER_CPU(unsigned long *, scx_kick_cpus_pnt_seqs);
>  
>  /*
>   * Direct dispatch marker.
> @@ -3850,6 +3855,16 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
>  	}
>  }
>  
> +static void free_kick_cpus_pnt_seqs(void)
> +{
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu) {
> +		kvfree(per_cpu(scx_kick_cpus_pnt_seqs, cpu));
> +		per_cpu(scx_kick_cpus_pnt_seqs, cpu) = NULL;
> +	}
> +}
> +
>  static void scx_disable_workfn(struct kthread_work *work)
>  {
>  	struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
> @@ -3986,6 +4001,7 @@ static void scx_disable_workfn(struct kthread_work *work)
>  	free_percpu(scx_dsp_ctx);
>  	scx_dsp_ctx = NULL;
>  	scx_dsp_max_batch = 0;
> +	free_kick_cpus_pnt_seqs();
>  
>  	mutex_unlock(&scx_enable_mutex);
>  
> @@ -4348,6 +4364,28 @@ static void scx_vexit(struct scx_sched *sch,
>  	irq_work_queue(&sch->error_irq_work);
>  }
>  
> +static int alloc_kick_cpus_pnt_seqs(void)
> +{
> +	int cpu;
> +
> +	/*
> +	 * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size
> +	 * can exceed percpu allocator limits on large machines.
> +	 */
> +	for_each_possible_cpu(cpu) {
> +		WARN_ON_ONCE(per_cpu(scx_kick_cpus_pnt_seqs, cpu));
> +		per_cpu(scx_kick_cpus_pnt_seqs, cpu) =
> +			kvzalloc_node(sizeof(unsigned long) * nr_cpu_ids,
> +				      GFP_KERNEL, cpu_to_node(cpu));
> +		if (!per_cpu(scx_kick_cpus_pnt_seqs, cpu)) {
> +			free_kick_cpus_pnt_seqs();
> +			return -ENOMEM;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
>  {
>  	struct scx_sched *sch;
> @@ -4490,15 +4528,19 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>  
>  	mutex_lock(&scx_enable_mutex);
>  
> +	ret = alloc_kick_cpus_pnt_seqs();
> +	if (ret)
> +		goto err_unlock;
> +
>  	if (scx_enable_state() != SCX_DISABLED) {
>  		ret = -EBUSY;
> -		goto err_unlock;
> +		goto err_free_pseqs;
>  	}
>  
>  	sch = scx_alloc_and_add_sched(ops);
>  	if (IS_ERR(sch)) {
>  		ret = PTR_ERR(sch);
> -		goto err_unlock;
> +		goto err_free_pseqs;
>  	}
>  
>  	/*
> @@ -4701,6 +4743,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>  
>  	return 0;
>  
> +err_free_pseqs:
> +	free_kick_cpus_pnt_seqs();
>  err_unlock:
>  	mutex_unlock(&scx_enable_mutex);
>  	return ret;
> @@ -5082,7 +5126,7 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
>  {
>  	struct rq *this_rq = this_rq();
>  	struct scx_rq *this_scx = &this_rq->scx;
> -	unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs);
> +	unsigned long *pseqs = __this_cpu_read(scx_kick_cpus_pnt_seqs);
>  	bool should_wait = false;
>  	s32 cpu;

Should we add:

	if (WARN_ON_ONCE(!pseqs))
		return;

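I'm not sure whether we can race with scx_disable_workfn() here: if this
irq_work runs after free_kick_cpus_pnt_seqs() has freed the arrays and reset
the per-CPU pointers to NULL, pseqs would be NULL.
As a test, though, the patch looks fine as it is.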

Thanks!
-Andrea

>  
> @@ -5208,11 +5252,6 @@ void __init init_sched_ext_class(void)
>  
>  	scx_idle_init_masks();
>  
> -	scx_kick_cpus_pnt_seqs =
> -		__alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
> -			       __alignof__(scx_kick_cpus_pnt_seqs[0]));
> -	BUG_ON(!scx_kick_cpus_pnt_seqs);
> -
>  	for_each_possible_cpu(cpu) {
>  		struct rq *rq = cpu_rq(cpu);
>  		int  n = cpu_to_node(cpu);
> -- 
> 2.51.0
> 

  reply	other threads:[~2025-10-08  6:10 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-10-07 13:35 sched_ext and large cpu counts Phil Auld
2025-10-08  2:37 ` Tejun Heo
2025-10-08  6:10   ` Andrea Righi [this message]
2025-10-08 20:53     ` Tejun Heo
2025-10-08 21:48       ` [PATCH v2] sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc() Tejun Heo
2025-10-08 22:24         ` Andrea Righi
2025-10-08 23:36           ` Tejun Heo
2025-10-08 23:38             ` Tejun Heo
2025-10-08 23:43             ` [PATCH v3] " Tejun Heo
2025-10-09  6:43               ` Andrea Righi
2025-10-09 12:06               ` Phil Auld
2025-10-10 13:02                 ` Phil Auld
2025-10-09 13:58               ` Emil Tsalapatis
2025-10-13 18:44               ` Tejun Heo
2025-10-13 20:13                 ` Andrea Righi
2025-10-08 11:23   ` sched_ext and large cpu counts Phil Auld

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aOYAV2ZQvMt5aZxu@gpd4 \
    --to=arighi@nvidia.com \
    --cc=changwoo@igalia.com \
    --cc=pauld@redhat.com \
    --cc=sched-ext@lists.linux.dev \
    --cc=tj@kernel.org \
    --cc=void@manifault.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.