From: Andrea Righi <arighi@nvidia.com>
To: Tejun Heo <tj@kernel.org>
Cc: Phil Auld <pauld@redhat.com>, David Vernet <void@manifault.com>,
Changwoo Min <changwoo@igalia.com>,
sched-ext@lists.linux.dev
Subject: Re: [PATCH v3] sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc()
Date: Thu, 9 Oct 2025 08:43:27 +0200 [thread overview]
Message-ID: <aOdZj3ERqEeNJ3OP@gpd4> (raw)
In-Reply-To: <aOb3Hv0uPv9G7N33@slm.duckdns.org>
Hi Tejun,
On Wed, Oct 08, 2025 at 01:43:26PM -1000, Tejun Heo wrote:
> On systems with >4096 CPUs, scx_kick_cpus_pnt_seqs allocation fails during
> boot because it exceeds the 32,768 byte percpu allocator limit.
>
> Restructure to use DEFINE_PER_CPU() for the per-CPU pointers, with each CPU
> pointing to its own kvzalloc'd array. Move allocation from boot time to
> scx_enable() and free in scx_disable(), so the O(nr_cpu_ids^2) memory is only
> consumed when sched_ext is active.
>
> Use RCU to guard against racing with free. Arrays are freed via call_rcu()
> and kick_cpus_irq_workfn() uses rcu_dereference_bh() with a NULL check.
>
> While at it, rename to scx_kick_pseqs for brevity and update comments to
> clarify these are pick_task sequence numbers.
>
> v2: RCU protect scx_kick_seqs to manage kick_cpus_irq_workfn() racing
> against disable as per Andrea.
>
> v3: Fix bugs noticed by Andrea.
>
> Reported-by: Phil Auld <pauld@redhat.com>
> Link: http://lkml.kernel.org/r/20251007133523.GA93086@pauld.westford.csb
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Cc: Andrea Righi <arighi@nvidia.com>
Looks good now!
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Thanks,
-Andrea
> ---
> kernel/sched/ext.c | 89 ++++++++++++++++++++++++++++++++++++++++------
> 1 file changed, 79 insertions(+), 10 deletions(-)
>
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 2b0e88206d07..01010c3378b0 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -67,8 +67,19 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
>
> static struct delayed_work scx_watchdog_work;
>
> -/* for %SCX_KICK_WAIT */
> -static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
> +/*
> + * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of pick_task sequence
> + * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu
> + * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated
> + * lazily when enabling and freed when disabling to avoid waste when sched_ext
> + * isn't active.
> + */
> +struct scx_kick_pseqs {
> + struct rcu_head rcu;
> + unsigned long seqs[];
> +};
> +
> +static DEFINE_PER_CPU(struct scx_kick_pseqs __rcu *, scx_kick_pseqs);
>
> /*
> * Direct dispatch marker.
> @@ -3850,6 +3861,27 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
> }
> }
>
> +static void free_kick_pseqs_rcu(struct rcu_head *rcu)
> +{
> + struct scx_kick_pseqs *pseqs = container_of(rcu, struct scx_kick_pseqs, rcu);
> +
> + kvfree(pseqs);
> +}
> +
> +static void free_kick_pseqs(void)
> +{
> + int cpu;
> +
> + for_each_possible_cpu(cpu) {
> + struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu);
> + struct scx_kick_pseqs *to_free;
> +
> + to_free = rcu_replace_pointer(*pseqs, NULL, true);
> + if (to_free)
> + call_rcu(&to_free->rcu, free_kick_pseqs_rcu);
> + }
> +}
> +
> static void scx_disable_workfn(struct kthread_work *work)
> {
> struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
> @@ -3986,6 +4018,7 @@ static void scx_disable_workfn(struct kthread_work *work)
> free_percpu(scx_dsp_ctx);
> scx_dsp_ctx = NULL;
> scx_dsp_max_batch = 0;
> + free_kick_pseqs();
>
> mutex_unlock(&scx_enable_mutex);
>
> @@ -4348,6 +4381,33 @@ static void scx_vexit(struct scx_sched *sch,
> irq_work_queue(&sch->error_irq_work);
> }
>
> +static int alloc_kick_pseqs(void)
> +{
> + int cpu;
> +
> + /*
> + * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size
> + * can exceed percpu allocator limits on large machines.
> + */
> + for_each_possible_cpu(cpu) {
> + struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu);
> + struct scx_kick_pseqs *new_pseqs;
> +
> + WARN_ON_ONCE(rcu_access_pointer(*pseqs));
> +
> + new_pseqs = kvzalloc_node(struct_size(new_pseqs, seqs, nr_cpu_ids),
> + GFP_KERNEL, cpu_to_node(cpu));
> + if (!new_pseqs) {
> + free_kick_pseqs();
> + return -ENOMEM;
> + }
> +
> + rcu_assign_pointer(*pseqs, new_pseqs);
> + }
> +
> + return 0;
> +}
> +
> static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
> {
> struct scx_sched *sch;
> @@ -4490,15 +4550,19 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>
> mutex_lock(&scx_enable_mutex);
>
> + ret = alloc_kick_pseqs();
> + if (ret)
> + goto err_unlock;
> +
> if (scx_enable_state() != SCX_DISABLED) {
> ret = -EBUSY;
> - goto err_unlock;
> + goto err_free_pseqs;
> }
>
> sch = scx_alloc_and_add_sched(ops);
> if (IS_ERR(sch)) {
> ret = PTR_ERR(sch);
> - goto err_unlock;
> + goto err_free_pseqs;
> }
>
> /*
> @@ -4701,6 +4765,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>
> return 0;
>
> +err_free_pseqs:
> + free_kick_pseqs();
> err_unlock:
> mutex_unlock(&scx_enable_mutex);
> return ret;
> @@ -5082,10 +5148,18 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
> {
> struct rq *this_rq = this_rq();
> struct scx_rq *this_scx = &this_rq->scx;
> - unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs);
> + struct scx_kick_pseqs __rcu *pseqs_pcpu = __this_cpu_read(scx_kick_pseqs);
> bool should_wait = false;
> + unsigned long *pseqs;
> s32 cpu;
>
> + if (unlikely(!pseqs_pcpu)) {
> + pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_pseqs");
> + return;
> + }
> +
> + pseqs = rcu_dereference_bh(pseqs_pcpu)->seqs;
> +
> for_each_cpu(cpu, this_scx->cpus_to_kick) {
> should_wait |= kick_one_cpu(cpu, this_rq, pseqs);
> cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
> @@ -5208,11 +5282,6 @@ void __init init_sched_ext_class(void)
>
> scx_idle_init_masks();
>
> - scx_kick_cpus_pnt_seqs =
> - __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
> - __alignof__(scx_kick_cpus_pnt_seqs[0]));
> - BUG_ON(!scx_kick_cpus_pnt_seqs);
> -
> for_each_possible_cpu(cpu) {
> struct rq *rq = cpu_rq(cpu);
> int n = cpu_to_node(cpu);
> --
> 2.51.0
>
next prev parent reply other threads:[~2025-10-09 6:43 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-10-07 13:35 sched_ext and large cpu counts Phil Auld
2025-10-08 2:37 ` Tejun Heo
2025-10-08 6:10 ` Andrea Righi
2025-10-08 20:53 ` Tejun Heo
2025-10-08 21:48 ` [PATCH v2] sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc() Tejun Heo
2025-10-08 22:24 ` Andrea Righi
2025-10-08 23:36 ` Tejun Heo
2025-10-08 23:38 ` Tejun Heo
2025-10-08 23:43 ` [PATCH v3] " Tejun Heo
2025-10-09 6:43 ` Andrea Righi [this message]
2025-10-09 12:06 ` Phil Auld
2025-10-10 13:02 ` Phil Auld
2025-10-09 13:58 ` Emil Tsalapatis
2025-10-13 18:44 ` Tejun Heo
2025-10-13 20:13 ` Andrea Righi
2025-10-08 11:23 ` sched_ext and large cpu counts Phil Auld
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=aOdZj3ERqEeNJ3OP@gpd4 \
--to=arighi@nvidia.com \
--cc=changwoo@igalia.com \
--cc=pauld@redhat.com \
--cc=sched-ext@lists.linux.dev \
--cc=tj@kernel.org \
--cc=void@manifault.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.