* sched_ext and large cpu counts
@ 2025-10-07 13:35 Phil Auld
2025-10-08 2:37 ` Tejun Heo
0 siblings, 1 reply; 16+ messages in thread
From: Phil Auld @ 2025-10-07 13:35 UTC (permalink / raw)
To: Andrea Righi; +Cc: Tejun Heo, David Vernet, Changwoo Min, sched-ext, pauld
Hi Andrea (and other sched_ext folks),
I've got some partners with systems that have more than 4096 CPUs. On those
systems sched_ext crashes at boot due to:
init_sched_ext_class() {
...
scx_kick_cpus_pnt_seqs =
__alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
__alignof__(scx_kick_cpus_pnt_seqs[0]));
BUG_ON(!scx_kick_cpus_pnt_seqs);
...
4096 * 8 bytes is 32768, which is the max you can percpu-allocate. Anything more
and __alloc_percpu() fails and WARNs:
[ 0.000000] illegal size (33792) or align (8) for percpu allocation
[ 0.000000] WARNING: CPU: 0 PID: 0 at mm/percpu.c:1779 pcpu_alloc_noprof+0x715/0x820
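For reference, the failing size is sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
and the percpu allocator caps a single allocation at PCPU_MIN_UNIT_SIZE
(32 << 10, i.e. 32768 bytes on typical configs). A minimal userspace sketch of
the arithmetic, assuming an 8-byte unsigned long and taking the CPU count
implied by the "illegal size (33792)" message above (33792 / 8 = 4224):

#include <stdio.h>

#define PCPU_MIN_UNIT_SIZE (32 << 10)	/* mirrors the kernel's cap; illustrative only */

int main(void)
{
	size_t nr_cpu_ids = 4224;	/* assumed from the warning above */
	size_t size = sizeof(unsigned long) * nr_cpu_ids;

	printf("size=%zu limit=%d -> %s\n", size, PCPU_MIN_UNIT_SIZE,
	       size > PCPU_MIN_UNIT_SIZE ? "WARN + NULL" : "fits");
	return 0;
}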
I started looking into changing that to a static allocation, which would have to
be based on NR_CPUS (8192 in our case). Because it's N^2, that starts to be a lot
of space.
While looking at how it's used, I had a different question. The comment says:
* We busy-wait here to guarantee that no other task can
* be scheduled on our core before the target CPU has
* entered the resched path.
But pnt_seq is actually only updated if we enter the resched path AND switch
classes. That seems more restrictive than the comment requires, no?
Any ideas on how to do this in a different way?
I'd rather not have to turn off the CONFIG but 512 MB is a lot of space to
allocate for this.
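(Worked out: a static NR_CPUS x NR_CPUS array of unsigned longs would be
8192 * 8192 * 8 bytes = 512 MiB.)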
Thanks for taking a look and any suggestions.
Cheers,
Phil
--
^ permalink raw reply [flat|nested] 16+ messages in thread* Re: sched_ext and large cpu counts 2025-10-07 13:35 sched_ext and large cpu counts Phil Auld @ 2025-10-08 2:37 ` Tejun Heo 2025-10-08 6:10 ` Andrea Righi 2025-10-08 11:23 ` sched_ext and large cpu counts Phil Auld 0 siblings, 2 replies; 16+ messages in thread From: Tejun Heo @ 2025-10-08 2:37 UTC (permalink / raw) To: Phil Auld; +Cc: Andrea Righi, David Vernet, Changwoo Min, sched-ext Hello, Can you please see whether the following patch resolves the problem? Thanks. -- tejun ----- 8< ----- From 4d7f7d24e90fba47bb08ddbeb8668123b4bbab1b Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 7 Oct 2025 16:23:43 -1000 Subject: [PATCH] sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc() On systems with >4096 CPUs, scx_kick_cpus_pnt_seqs allocation fails during boot because it exceeds the 32,768 byte percpu allocator limit. The allocation size is sizeof(unsigned long) * nr_cpu_ids, which becomes 33,792 bytes with 4224 CPUs. Restructure scx_kick_cpus_pnt_seqs to use DEFINE_PER_CPU() for the per-CPU pointers, with each CPU pointing to its own kvzalloc'd array. This avoids percpu allocator size limits. Additionally, move allocation from boot time to scx_enable() and free in scx_disable(), so the O(nr_cpu_ids^2) memory is only consumed when sched_ext is active. Reported-by: Phil Auld <pauld@redhat.com> Link: http://lkml.kernel.org/r/20251007133523.GA93086@pauld.westford.csb Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/sched/ext.c | 59 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 10 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 2b0e88206d07..042fc73fb141 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -67,8 +67,13 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; static struct delayed_work scx_watchdog_work; -/* for %SCX_KICK_WAIT */ -static unsigned long __percpu *scx_kick_cpus_pnt_seqs; +/* + * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of sequence numbers. + * The arrays are allocated with kvzalloc() as size can exceed percpu allocator + * limits on large machines. O(nr_cpu_ids^2) allocation, allocated lazily when + * enabling and freed when disabling to avoid waste when sched_ext isn't active. + */ +static DEFINE_PER_CPU(unsigned long *, scx_kick_cpus_pnt_seqs); /* * Direct dispatch marker. @@ -3850,6 +3855,16 @@ static const char *scx_exit_reason(enum scx_exit_kind kind) } } +static void free_kick_cpus_pnt_seqs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + kvfree(per_cpu(scx_kick_cpus_pnt_seqs, cpu)); + per_cpu(scx_kick_cpus_pnt_seqs, cpu) = NULL; + } +} + static void scx_disable_workfn(struct kthread_work *work) { struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); @@ -3986,6 +4001,7 @@ static void scx_disable_workfn(struct kthread_work *work) free_percpu(scx_dsp_ctx); scx_dsp_ctx = NULL; scx_dsp_max_batch = 0; + free_kick_cpus_pnt_seqs(); mutex_unlock(&scx_enable_mutex); @@ -4348,6 +4364,28 @@ static void scx_vexit(struct scx_sched *sch, irq_work_queue(&sch->error_irq_work); } +static int alloc_kick_cpus_pnt_seqs(void) +{ + int cpu; + + /* + * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size + * can exceed percpu allocator limits on large machines. 
+ */ + for_each_possible_cpu(cpu) { + WARN_ON_ONCE(per_cpu(scx_kick_cpus_pnt_seqs, cpu)); + per_cpu(scx_kick_cpus_pnt_seqs, cpu) = + kvzalloc_node(sizeof(unsigned long) * nr_cpu_ids, + GFP_KERNEL, cpu_to_node(cpu)); + if (!per_cpu(scx_kick_cpus_pnt_seqs, cpu)) { + free_kick_cpus_pnt_seqs(); + return -ENOMEM; + } + } + + return 0; +} + static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) { struct scx_sched *sch; @@ -4490,15 +4528,19 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) mutex_lock(&scx_enable_mutex); + ret = alloc_kick_cpus_pnt_seqs(); + if (ret) + goto err_unlock; + if (scx_enable_state() != SCX_DISABLED) { ret = -EBUSY; - goto err_unlock; + goto err_free_pseqs; } sch = scx_alloc_and_add_sched(ops); if (IS_ERR(sch)) { ret = PTR_ERR(sch); - goto err_unlock; + goto err_free_pseqs; } /* @@ -4701,6 +4743,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) return 0; +err_free_pseqs: + free_kick_cpus_pnt_seqs(); err_unlock: mutex_unlock(&scx_enable_mutex); return ret; @@ -5082,7 +5126,7 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) { struct rq *this_rq = this_rq(); struct scx_rq *this_scx = &this_rq->scx; - unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); + unsigned long *pseqs = __this_cpu_read(scx_kick_cpus_pnt_seqs); bool should_wait = false; s32 cpu; @@ -5208,11 +5252,6 @@ void __init init_sched_ext_class(void) scx_idle_init_masks(); - scx_kick_cpus_pnt_seqs = - __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, - __alignof__(scx_kick_cpus_pnt_seqs[0])); - BUG_ON(!scx_kick_cpus_pnt_seqs); - for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); int n = cpu_to_node(cpu); -- 2.51.0 ^ permalink raw reply related [flat|nested] 16+ messages in thread
* Re: sched_ext and large cpu counts 2025-10-08 2:37 ` Tejun Heo @ 2025-10-08 6:10 ` Andrea Righi 2025-10-08 20:53 ` Tejun Heo 2025-10-08 11:23 ` sched_ext and large cpu counts Phil Auld 1 sibling, 1 reply; 16+ messages in thread From: Andrea Righi @ 2025-10-08 6:10 UTC (permalink / raw) To: Tejun Heo; +Cc: Phil Auld, David Vernet, Changwoo Min, sched-ext Hi Tejun and Phil, On Tue, Oct 07, 2025 at 04:37:24PM -1000, Tejun Heo wrote: > Hello, > > Can you please see whether the following patch resolves the problem? > > Thanks. > > -- > tejun > > ----- 8< ----- > From 4d7f7d24e90fba47bb08ddbeb8668123b4bbab1b Mon Sep 17 00:00:00 2001 > From: Tejun Heo <tj@kernel.org> > Date: Tue, 7 Oct 2025 16:23:43 -1000 > Subject: [PATCH] sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using > kvzalloc() > > On systems with >4096 CPUs, scx_kick_cpus_pnt_seqs allocation fails during > boot because it exceeds the 32,768 byte percpu allocator limit. The allocation > size is sizeof(unsigned long) * nr_cpu_ids, which becomes 33,792 bytes with > 4224 CPUs. > > Restructure scx_kick_cpus_pnt_seqs to use DEFINE_PER_CPU() for the per-CPU > pointers, with each CPU pointing to its own kvzalloc'd array. This avoids > percpu allocator size limits. Additionally, move allocation from boot time to > scx_enable() and free in scx_disable(), so the O(nr_cpu_ids^2) memory is only > consumed when sched_ext is active. > > Reported-by: Phil Auld <pauld@redhat.com> > Link: http://lkml.kernel.org/r/20251007133523.GA93086@pauld.westford.csb > Signed-off-by: Tejun Heo <tj@kernel.org> > --- > kernel/sched/ext.c | 59 ++++++++++++++++++++++++++++++++++++++-------- > 1 file changed, 49 insertions(+), 10 deletions(-) > > diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c > index 2b0e88206d07..042fc73fb141 100644 > --- a/kernel/sched/ext.c > +++ b/kernel/sched/ext.c > @@ -67,8 +67,13 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; > > static struct delayed_work scx_watchdog_work; > > -/* for %SCX_KICK_WAIT */ > -static unsigned long __percpu *scx_kick_cpus_pnt_seqs; > +/* > + * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of sequence numbers. > + * The arrays are allocated with kvzalloc() as size can exceed percpu allocator > + * limits on large machines. O(nr_cpu_ids^2) allocation, allocated lazily when > + * enabling and freed when disabling to avoid waste when sched_ext isn't active. > + */ > +static DEFINE_PER_CPU(unsigned long *, scx_kick_cpus_pnt_seqs); > > /* > * Direct dispatch marker. > @@ -3850,6 +3855,16 @@ static const char *scx_exit_reason(enum scx_exit_kind kind) > } > } > > +static void free_kick_cpus_pnt_seqs(void) > +{ > + int cpu; > + > + for_each_possible_cpu(cpu) { > + kvfree(per_cpu(scx_kick_cpus_pnt_seqs, cpu)); > + per_cpu(scx_kick_cpus_pnt_seqs, cpu) = NULL; > + } > +} > + > static void scx_disable_workfn(struct kthread_work *work) > { > struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); > @@ -3986,6 +4001,7 @@ static void scx_disable_workfn(struct kthread_work *work) > free_percpu(scx_dsp_ctx); > scx_dsp_ctx = NULL; > scx_dsp_max_batch = 0; > + free_kick_cpus_pnt_seqs(); > > mutex_unlock(&scx_enable_mutex); > > @@ -4348,6 +4364,28 @@ static void scx_vexit(struct scx_sched *sch, > irq_work_queue(&sch->error_irq_work); > } > > +static int alloc_kick_cpus_pnt_seqs(void) > +{ > + int cpu; > + > + /* > + * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size > + * can exceed percpu allocator limits on large machines. 
> + */ > + for_each_possible_cpu(cpu) { > + WARN_ON_ONCE(per_cpu(scx_kick_cpus_pnt_seqs, cpu)); > + per_cpu(scx_kick_cpus_pnt_seqs, cpu) = > + kvzalloc_node(sizeof(unsigned long) * nr_cpu_ids, > + GFP_KERNEL, cpu_to_node(cpu)); > + if (!per_cpu(scx_kick_cpus_pnt_seqs, cpu)) { > + free_kick_cpus_pnt_seqs(); > + return -ENOMEM; > + } > + } > + > + return 0; > +} > + > static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) > { > struct scx_sched *sch; > @@ -4490,15 +4528,19 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) > > mutex_lock(&scx_enable_mutex); > > + ret = alloc_kick_cpus_pnt_seqs(); > + if (ret) > + goto err_unlock; > + > if (scx_enable_state() != SCX_DISABLED) { > ret = -EBUSY; > - goto err_unlock; > + goto err_free_pseqs; > } > > sch = scx_alloc_and_add_sched(ops); > if (IS_ERR(sch)) { > ret = PTR_ERR(sch); > - goto err_unlock; > + goto err_free_pseqs; > } > > /* > @@ -4701,6 +4743,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) > > return 0; > > +err_free_pseqs: > + free_kick_cpus_pnt_seqs(); > err_unlock: > mutex_unlock(&scx_enable_mutex); > return ret; > @@ -5082,7 +5126,7 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) > { > struct rq *this_rq = this_rq(); > struct scx_rq *this_scx = &this_rq->scx; > - unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); > + unsigned long *pseqs = __this_cpu_read(scx_kick_cpus_pnt_seqs); > bool should_wait = false; > s32 cpu; Should we add: if (WARN_ON_ONCE(!pseqs)) return; Not sure if we can race with scx_disable_workfn() here. But for a test the patch looks fine as it is. Thanks! -Andrea > > @@ -5208,11 +5252,6 @@ void __init init_sched_ext_class(void) > > scx_idle_init_masks(); > > - scx_kick_cpus_pnt_seqs = > - __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, > - __alignof__(scx_kick_cpus_pnt_seqs[0])); > - BUG_ON(!scx_kick_cpus_pnt_seqs); > - > for_each_possible_cpu(cpu) { > struct rq *rq = cpu_rq(cpu); > int n = cpu_to_node(cpu); > -- > 2.51.0 > ^ permalink raw reply [flat|nested] 16+ messages in thread
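To spell out the race Andrea is pointing at, here is a sketch of one possible
interleaving. It assumes an irq_work can still be pending while the scheduler
is being disabled, which is not spelled out in the thread itself:

	CPU 0 (kick path)                  CPU 1 (disable path)
	--------------------------------   --------------------------------
	irq_work queued for kick_cpus
	                                   scx_disable_workfn()
	                                     free_kick_pseqs()  /* frees arrays */
	kick_cpus_irq_workfn()
	  pseqs = per-CPU pointer          /* now dangling -> use-after-free */

Deferring the free with call_rcu() and re-checking the pointer for NULL in the
reader closes this window, which is the approach the v2 and v3 patches below
take.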
* Re: sched_ext and large cpu counts 2025-10-08 6:10 ` Andrea Righi @ 2025-10-08 20:53 ` Tejun Heo 2025-10-08 21:48 ` [PATCH v2] sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc() Tejun Heo 0 siblings, 1 reply; 16+ messages in thread From: Tejun Heo @ 2025-10-08 20:53 UTC (permalink / raw) To: Andrea Righi; +Cc: Phil Auld, David Vernet, Changwoo Min, sched-ext On Wed, Oct 08, 2025 at 08:10:31AM +0200, Andrea Righi wrote: > > @@ -5082,7 +5126,7 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) > > { > > struct rq *this_rq = this_rq(); > > struct scx_rq *this_scx = &this_rq->scx; > > - unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); > > + unsigned long *pseqs = __this_cpu_read(scx_kick_cpus_pnt_seqs); > > bool should_wait = false; > > s32 cpu; > > Should we add: > > if (WARN_ON_ONCE(!pseqs)) > return; > > Not sure if we can race with scx_disable_workfn() here. > But for a test the patch looks fine as it is. Oh yeah, I think it can. We should probably RCU free the arrays and add a NULL check here. Thanks. -- tejun ^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH v2] sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc() 2025-10-08 20:53 ` Tejun Heo @ 2025-10-08 21:48 ` Tejun Heo 2025-10-08 22:24 ` Andrea Righi 0 siblings, 1 reply; 16+ messages in thread From: Tejun Heo @ 2025-10-08 21:48 UTC (permalink / raw) To: Andrea Righi; +Cc: Phil Auld, David Vernet, Changwoo Min, sched-ext On systems with >4096 CPUs, scx_kick_cpus_pnt_seqs allocation fails during boot because it exceeds the 32,768 byte percpu allocator limit. Restructure to use DEFINE_PER_CPU() for the per-CPU pointers, with each CPU pointing to its own kvzalloc'd array. Move allocation from boot time to scx_enable() and free in scx_disable(), so the O(nr_cpu_ids^2) memory is only consumed when sched_ext is active. Use RCU to guard against racing with free. Arrays are freed via call_rcu() and kick_cpus_irq_workfn() uses rcu_dereference_bh() with a NULL check. While at it, rename to scx_kick_pseqs for brevity and update comments to clarify these are pick_task sequence numbers. Reported-by: Phil Auld <pauld@redhat.com> Link: http://lkml.kernel.org/r/20251007133523.GA93086@pauld.westford.csb Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/sched/ext.c | 88 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 78 insertions(+), 10 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 2b0e88206d07..217c80d0105c 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -67,8 +67,19 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; static struct delayed_work scx_watchdog_work; -/* for %SCX_KICK_WAIT */ -static unsigned long __percpu *scx_kick_cpus_pnt_seqs; +/* + * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of pick_task sequence + * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu + * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated + * lazily when enabling and freed when disabling to avoid waste when sched_ext + * isn't active. + */ +struct scx_kick_pseqs { + struct rcu_head rcu; + unsigned long seqs[]; +}; + +static DEFINE_PER_CPU(struct scx_kick_pseqs __rcu *, scx_kick_pseqs); /* * Direct dispatch marker. @@ -3850,6 +3861,25 @@ static const char *scx_exit_reason(enum scx_exit_kind kind) } } +static void free_kick_pseqs_rcu(struct rcu_head *rcu) +{ + struct scx_kick_pseqs *pseqs = container_of(rcu, struct scx_kick_pseqs, rcu); + + kvfree(pseqs); +} + +static void free_kick_pseqs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu); + + call_rcu(&(*pseqs)->rcu, free_kick_pseqs_rcu); + RCU_INIT_POINTER(*pseqs, NULL); + } +} + static void scx_disable_workfn(struct kthread_work *work) { struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); @@ -3986,6 +4016,7 @@ static void scx_disable_workfn(struct kthread_work *work) free_percpu(scx_dsp_ctx); scx_dsp_ctx = NULL; scx_dsp_max_batch = 0; + free_kick_pseqs(); mutex_unlock(&scx_enable_mutex); @@ -4348,6 +4379,34 @@ static void scx_vexit(struct scx_sched *sch, irq_work_queue(&sch->error_irq_work); } +static int alloc_kick_pseqs(void) +{ + int cpu; + + /* + * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size + * can exceed percpu allocator limits on large machines. 
+ */ + for_each_possible_cpu(cpu) { + struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu); + struct scx_kick_pseqs *new_pseqs; + + + WARN_ON_ONCE(rcu_access_pointer(*pseqs)); + + new_pseqs = kvzalloc_node(sizeof(unsigned long) * nr_cpu_ids, + GFP_KERNEL, cpu_to_node(cpu)); + if (!new_pseqs) { + free_kick_pseqs(); + return -ENOMEM; + } + + rcu_assign_pointer(*pseqs, new_pseqs); + } + + return 0; +} + static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) { struct scx_sched *sch; @@ -4490,15 +4549,19 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) mutex_lock(&scx_enable_mutex); + ret = alloc_kick_pseqs(); + if (ret) + goto err_unlock; + if (scx_enable_state() != SCX_DISABLED) { ret = -EBUSY; - goto err_unlock; + goto err_free_pseqs; } sch = scx_alloc_and_add_sched(ops); if (IS_ERR(sch)) { ret = PTR_ERR(sch); - goto err_unlock; + goto err_free_pseqs; } /* @@ -4701,6 +4764,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) return 0; +err_free_pseqs: + free_kick_pseqs(); err_unlock: mutex_unlock(&scx_enable_mutex); return ret; @@ -5082,10 +5147,18 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) { struct rq *this_rq = this_rq(); struct scx_rq *this_scx = &this_rq->scx; - unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); + struct scx_kick_pseqs __rcu *pseqs_pcpu = __this_cpu_read(scx_kick_pseqs); bool should_wait = false; + unsigned long *pseqs; s32 cpu; + if (unlikely(!pseqs_pcpu)) { + pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_pseqs"); + return; + } + + pseqs = rcu_dereference_bh(pseqs_pcpu)->seqs; + for_each_cpu(cpu, this_scx->cpus_to_kick) { should_wait |= kick_one_cpu(cpu, this_rq, pseqs); cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); @@ -5208,11 +5281,6 @@ void __init init_sched_ext_class(void) scx_idle_init_masks(); - scx_kick_cpus_pnt_seqs = - __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, - __alignof__(scx_kick_cpus_pnt_seqs[0])); - BUG_ON(!scx_kick_cpus_pnt_seqs); - for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); int n = cpu_to_node(cpu); -- 2.51.0 ^ permalink raw reply related [flat|nested] 16+ messages in thread
* Re: [PATCH v2] sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc() 2025-10-08 21:48 ` [PATCH v2] sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc() Tejun Heo @ 2025-10-08 22:24 ` Andrea Righi 2025-10-08 23:36 ` Tejun Heo 0 siblings, 1 reply; 16+ messages in thread From: Andrea Righi @ 2025-10-08 22:24 UTC (permalink / raw) To: Tejun Heo; +Cc: Phil Auld, David Vernet, Changwoo Min, sched-ext Hi Tejun, On Wed, Oct 08, 2025 at 11:48:16AM -1000, Tejun Heo wrote: > On systems with >4096 CPUs, scx_kick_cpus_pnt_seqs allocation fails during > boot because it exceeds the 32,768 byte percpu allocator limit. > > Restructure to use DEFINE_PER_CPU() for the per-CPU pointers, with each CPU > pointing to its own kvzalloc'd array. Move allocation from boot time to > scx_enable() and free in scx_disable(), so the O(nr_cpu_ids^2) memory is only > consumed when sched_ext is active. > > Use RCU to guard against racing with free. Arrays are freed via call_rcu() > and kick_cpus_irq_workfn() uses rcu_dereference_bh() with a NULL check. > > While at it, rename to scx_kick_pseqs for brevity and update comments to > clarify these are pick_task sequence numbers. > > Reported-by: Phil Auld <pauld@redhat.com> > Link: http://lkml.kernel.org/r/20251007133523.GA93086@pauld.westford.csb > Signed-off-by: Tejun Heo <tj@kernel.org> > --- > kernel/sched/ext.c | 88 ++++++++++++++++++++++++++++++++++++++++------ > 1 file changed, 78 insertions(+), 10 deletions(-) > > diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c > index 2b0e88206d07..217c80d0105c 100644 > --- a/kernel/sched/ext.c > +++ b/kernel/sched/ext.c > @@ -67,8 +67,19 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; > > static struct delayed_work scx_watchdog_work; > > -/* for %SCX_KICK_WAIT */ > -static unsigned long __percpu *scx_kick_cpus_pnt_seqs; > +/* > + * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of pick_task sequence > + * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu > + * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated > + * lazily when enabling and freed when disabling to avoid waste when sched_ext > + * isn't active. > + */ > +struct scx_kick_pseqs { > + struct rcu_head rcu; > + unsigned long seqs[]; > +}; > + > +static DEFINE_PER_CPU(struct scx_kick_pseqs __rcu *, scx_kick_pseqs); > > /* > * Direct dispatch marker. > @@ -3850,6 +3861,25 @@ static const char *scx_exit_reason(enum scx_exit_kind kind) > } > } > > +static void free_kick_pseqs_rcu(struct rcu_head *rcu) > +{ > + struct scx_kick_pseqs *pseqs = container_of(rcu, struct scx_kick_pseqs, rcu); > + > + kvfree(pseqs); > +} > + > +static void free_kick_pseqs(void) > +{ > + int cpu; > + > + for_each_possible_cpu(cpu) { > + struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu); > + > + call_rcu(&(*pseqs)->rcu, free_kick_pseqs_rcu); > + RCU_INIT_POINTER(*pseqs, NULL); Is this safe? 
I think we should replace the pointer first and then schedule the free via call_rcu(), like: old = rcu_replace_pointer(*pseqs, NULL, true); if (old) call_rcu(&old->rcu, free_kick_pseqs_rcu); > + } > +} > + > static void scx_disable_workfn(struct kthread_work *work) > { > struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); > @@ -3986,6 +4016,7 @@ static void scx_disable_workfn(struct kthread_work *work) > free_percpu(scx_dsp_ctx); > scx_dsp_ctx = NULL; > scx_dsp_max_batch = 0; > + free_kick_pseqs(); > > mutex_unlock(&scx_enable_mutex); > > @@ -4348,6 +4379,34 @@ static void scx_vexit(struct scx_sched *sch, > irq_work_queue(&sch->error_irq_work); > } > > +static int alloc_kick_pseqs(void) > +{ > + int cpu; > + > + /* > + * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size > + * can exceed percpu allocator limits on large machines. > + */ > + for_each_possible_cpu(cpu) { > + struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu); > + struct scx_kick_pseqs *new_pseqs; > + > + nit: extra newline. > + WARN_ON_ONCE(rcu_access_pointer(*pseqs)); > + > + new_pseqs = kvzalloc_node(sizeof(unsigned long) * nr_cpu_ids, > + GFP_KERNEL, cpu_to_node(cpu)); Don't we need to allocate the struct as well? This should be something like: new_pseqs = kvzalloc_node(struct_size(new_pseqs, seqs, nr_cpu_ids), GFP_KERNEL, cpu_to_node(cpu)); > + if (!new_pseqs) { > + free_kick_pseqs(); > + return -ENOMEM; > + } > + > + rcu_assign_pointer(*pseqs, new_pseqs); > + } > + > + return 0; > +} > + > static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) > { > struct scx_sched *sch; > @@ -4490,15 +4549,19 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) > > mutex_lock(&scx_enable_mutex); > > + ret = alloc_kick_pseqs(); > + if (ret) > + goto err_unlock; > + > if (scx_enable_state() != SCX_DISABLED) { > ret = -EBUSY; > - goto err_unlock; > + goto err_free_pseqs; > } > > sch = scx_alloc_and_add_sched(ops); > if (IS_ERR(sch)) { > ret = PTR_ERR(sch); > - goto err_unlock; > + goto err_free_pseqs; > } > > /* > @@ -4701,6 +4764,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) > > return 0; > > +err_free_pseqs: > + free_kick_pseqs(); > err_unlock: > mutex_unlock(&scx_enable_mutex); > return ret; > @@ -5082,10 +5147,18 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) > { > struct rq *this_rq = this_rq(); > struct scx_rq *this_scx = &this_rq->scx; > - unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); > + struct scx_kick_pseqs __rcu *pseqs_pcpu = __this_cpu_read(scx_kick_pseqs); > bool should_wait = false; > + unsigned long *pseqs; > s32 cpu; > > + if (unlikely(!pseqs_pcpu)) { > + pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_pseqs"); > + return; > + } > + > + pseqs = rcu_dereference_bh(pseqs_pcpu)->seqs; > + > for_each_cpu(cpu, this_scx->cpus_to_kick) { > should_wait |= kick_one_cpu(cpu, this_rq, pseqs); > cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); > @@ -5208,11 +5281,6 @@ void __init init_sched_ext_class(void) > > scx_idle_init_masks(); > > - scx_kick_cpus_pnt_seqs = > - __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, > - __alignof__(scx_kick_cpus_pnt_seqs[0])); > - BUG_ON(!scx_kick_cpus_pnt_seqs); > - > for_each_possible_cpu(cpu) { > struct rq *rq = cpu_rq(cpu); > int n = cpu_to_node(cpu); > -- > 2.51.0 > Thanks, -Andrea ^ permalink raw reply [flat|nested] 16+ messages in thread
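A side note on the struct_size() point above, since it is easy to miss: with a
flexible array member the allocation must cover the struct header as well as
the array payload. A sketch of what the two size expressions compute (struct
layout taken from the patch; exact padding is architecture-dependent):

	struct scx_kick_pseqs {
		struct rcu_head rcu;
		unsigned long seqs[];
	};

	/*
	 * sizeof(unsigned long) * nr_cpu_ids
	 *	covers only the seqs[] payload -- short by
	 *	sizeof(struct scx_kick_pseqs), i.e. the embedded rcu_head.
	 *
	 * struct_size(new_pseqs, seqs, nr_cpu_ids)
	 *	== sizeof(struct scx_kick_pseqs)
	 *	   + nr_cpu_ids * sizeof(unsigned long),
	 *	with overflow checking: it saturates to SIZE_MAX rather than
	 *	wrapping, so a bogus count fails the allocation instead of
	 *	silently under-allocating.
	 */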
* Re: [PATCH v2] sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc()
  2025-10-08 22:24           ` Andrea Righi
@ 2025-10-08 23:36             ` Tejun Heo
  2025-10-08 23:38               ` Tejun Heo
  2025-10-08 23:43               ` [PATCH v3] " Tejun Heo
  0 siblings, 2 replies; 16+ messages in thread
From: Tejun Heo @ 2025-10-08 23:36 UTC (permalink / raw)
  To: Andrea Righi; +Cc: Phil Auld, David Vernet, Changwoo Min, sched-ext

Hello,

On Thu, Oct 09, 2025 at 12:24:16AM +0200, Andrea Righi wrote:
> > +static void free_kick_pseqs(void)
> > +{
> > +	int cpu;
> > +
> > +	for_each_possible_cpu(cpu) {
> > +		struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu);
> > +
> > +		call_rcu(&(*pseqs)->rcu, free_kick_pseqs_rcu);
> > +		RCU_INIT_POINTER(*pseqs, NULL);
>
> Is this safe? I think we should replace the pointer first and then schedule
> the free via call_rcu(), like:
>
>	old = rcu_replace_pointer(*pseqs, NULL, true);
>	if (old)
>		call_rcu(&old->rcu, free_kick_pseqs_rcu);

*pseqs is the dereferenced per-cpu pointer and we're freeing the struct
that's pointed by it. Whether the struct is freed or not doesn't really
affect the pointer storage. Let's say the struct gets freed between
call_rcu() and RCU_INIT_POINTER(). Then, *pseqs is just a dangling pointer
in between until it gets cleared.

> > +		WARN_ON_ONCE(rcu_access_pointer(*pseqs));
> > +
> > +		new_pseqs = kvzalloc_node(sizeof(unsigned long) * nr_cpu_ids,
> > +					  GFP_KERNEL, cpu_to_node(cpu));
>
> Don't we need to allocate the struct as well? This should be something
> like:
>
>	new_pseqs = kvzalloc_node(struct_size(new_pseqs, seqs, nr_cpu_ids),
>				  GFP_KERNEL, cpu_to_node(cpu));

Yes, will post the fixed version.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 16+ messages in thread
* Re: [PATCH v2] sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc() 2025-10-08 23:36 ` Tejun Heo @ 2025-10-08 23:38 ` Tejun Heo 2025-10-08 23:43 ` [PATCH v3] " Tejun Heo 1 sibling, 0 replies; 16+ messages in thread From: Tejun Heo @ 2025-10-08 23:38 UTC (permalink / raw) To: Andrea Righi; +Cc: Phil Auld, David Vernet, Changwoo Min, sched-ext Hello, again. On Wed, Oct 08, 2025 at 01:36:27PM -1000, Tejun Heo wrote: > On Thu, Oct 09, 2025 at 12:24:16AM +0200, Andrea Righi wrote: > > > +static void free_kick_pseqs(void) > > > +{ > > > + int cpu; > > > + > > > + for_each_possible_cpu(cpu) { > > > + struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu); > > > + > > > + call_rcu(&(*pseqs)->rcu, free_kick_pseqs_rcu); > > > + RCU_INIT_POINTER(*pseqs, NULL); > > > > Is this safe? I think we should replace the pointer first and then schedule > > the free via call_rcu(), like: > > > > old = rcu_replace_pointer(*pseqs, NULL, true); > > if (old) > > call_rcu(&old->rcu, free_kick_pseqs_rcu); > > *pseqs is the deferenced per-cpu pointer and we're freeding the struct > that's pointed by it. Whether the struct is freed or not doesn't really > affect the pointer storage. Let's say the struct gets freed between > call_rcu() and RCU_INIT_POINTER(). Then, *pseqs is just a dangling pointer > inbetween until it gets cleared. Wait, you're right. The problem is that the readers can end up fetching the pointer afterwards. Will fix it. Thanks. -- tejun ^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH v3] sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc() 2025-10-08 23:36 ` Tejun Heo 2025-10-08 23:38 ` Tejun Heo @ 2025-10-08 23:43 ` Tejun Heo 2025-10-09 6:43 ` Andrea Righi ` (3 more replies) 1 sibling, 4 replies; 16+ messages in thread From: Tejun Heo @ 2025-10-08 23:43 UTC (permalink / raw) To: Andrea Righi; +Cc: Phil Auld, David Vernet, Changwoo Min, sched-ext On systems with >4096 CPUs, scx_kick_cpus_pnt_seqs allocation fails during boot because it exceeds the 32,768 byte percpu allocator limit. Restructure to use DEFINE_PER_CPU() for the per-CPU pointers, with each CPU pointing to its own kvzalloc'd array. Move allocation from boot time to scx_enable() and free in scx_disable(), so the O(nr_cpu_ids^2) memory is only consumed when sched_ext is active. Use RCU to guard against racing with free. Arrays are freed via call_rcu() and kick_cpus_irq_workfn() uses rcu_dereference_bh() with a NULL check. While at it, rename to scx_kick_pseqs for brevity and update comments to clarify these are pick_task sequence numbers. v2: RCU protect scx_kick_seqs to manage kick_cpus_irq_workfn() racing against disable as per Andrea. v3: Fix bugs notcied by Andrea. Reported-by: Phil Auld <pauld@redhat.com> Link: http://lkml.kernel.org/r/20251007133523.GA93086@pauld.westford.csb Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Andrea Righi <arighi@nvidia.com> --- kernel/sched/ext.c | 89 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 79 insertions(+), 10 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 2b0e88206d07..01010c3378b0 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -67,8 +67,19 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; static struct delayed_work scx_watchdog_work; -/* for %SCX_KICK_WAIT */ -static unsigned long __percpu *scx_kick_cpus_pnt_seqs; +/* + * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of pick_task sequence + * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu + * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated + * lazily when enabling and freed when disabling to avoid waste when sched_ext + * isn't active. + */ +struct scx_kick_pseqs { + struct rcu_head rcu; + unsigned long seqs[]; +}; + +static DEFINE_PER_CPU(struct scx_kick_pseqs __rcu *, scx_kick_pseqs); /* * Direct dispatch marker. @@ -3850,6 +3861,27 @@ static const char *scx_exit_reason(enum scx_exit_kind kind) } } +static void free_kick_pseqs_rcu(struct rcu_head *rcu) +{ + struct scx_kick_pseqs *pseqs = container_of(rcu, struct scx_kick_pseqs, rcu); + + kvfree(pseqs); +} + +static void free_kick_pseqs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu); + struct scx_kick_pseqs *to_free; + + to_free = rcu_replace_pointer(*pseqs, NULL, true); + if (to_free) + call_rcu(&to_free->rcu, free_kick_pseqs_rcu); + } +} + static void scx_disable_workfn(struct kthread_work *work) { struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); @@ -3986,6 +4018,7 @@ static void scx_disable_workfn(struct kthread_work *work) free_percpu(scx_dsp_ctx); scx_dsp_ctx = NULL; scx_dsp_max_batch = 0; + free_kick_pseqs(); mutex_unlock(&scx_enable_mutex); @@ -4348,6 +4381,33 @@ static void scx_vexit(struct scx_sched *sch, irq_work_queue(&sch->error_irq_work); } +static int alloc_kick_pseqs(void) +{ + int cpu; + + /* + * Allocate per-CPU arrays sized by nr_cpu_ids. 
Use kvzalloc as size + * can exceed percpu allocator limits on large machines. + */ + for_each_possible_cpu(cpu) { + struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu); + struct scx_kick_pseqs *new_pseqs; + + WARN_ON_ONCE(rcu_access_pointer(*pseqs)); + + new_pseqs = kvzalloc_node(struct_size(new_pseqs, seqs, nr_cpu_ids), + GFP_KERNEL, cpu_to_node(cpu)); + if (!new_pseqs) { + free_kick_pseqs(); + return -ENOMEM; + } + + rcu_assign_pointer(*pseqs, new_pseqs); + } + + return 0; +} + static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) { struct scx_sched *sch; @@ -4490,15 +4550,19 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) mutex_lock(&scx_enable_mutex); + ret = alloc_kick_pseqs(); + if (ret) + goto err_unlock; + if (scx_enable_state() != SCX_DISABLED) { ret = -EBUSY; - goto err_unlock; + goto err_free_pseqs; } sch = scx_alloc_and_add_sched(ops); if (IS_ERR(sch)) { ret = PTR_ERR(sch); - goto err_unlock; + goto err_free_pseqs; } /* @@ -4701,6 +4765,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) return 0; +err_free_pseqs: + free_kick_pseqs(); err_unlock: mutex_unlock(&scx_enable_mutex); return ret; @@ -5082,10 +5148,18 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) { struct rq *this_rq = this_rq(); struct scx_rq *this_scx = &this_rq->scx; - unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); + struct scx_kick_pseqs __rcu *pseqs_pcpu = __this_cpu_read(scx_kick_pseqs); bool should_wait = false; + unsigned long *pseqs; s32 cpu; + if (unlikely(!pseqs_pcpu)) { + pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_pseqs"); + return; + } + + pseqs = rcu_dereference_bh(pseqs_pcpu)->seqs; + for_each_cpu(cpu, this_scx->cpus_to_kick) { should_wait |= kick_one_cpu(cpu, this_rq, pseqs); cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); @@ -5208,11 +5282,6 @@ void __init init_sched_ext_class(void) scx_idle_init_masks(); - scx_kick_cpus_pnt_seqs = - __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, - __alignof__(scx_kick_cpus_pnt_seqs[0])); - BUG_ON(!scx_kick_cpus_pnt_seqs); - for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); int n = cpu_to_node(cpu); -- 2.51.0 ^ permalink raw reply related [flat|nested] 16+ messages in thread
* Re: [PATCH v3] sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc() 2025-10-08 23:43 ` [PATCH v3] " Tejun Heo @ 2025-10-09 6:43 ` Andrea Righi 2025-10-09 12:06 ` Phil Auld ` (2 subsequent siblings) 3 siblings, 0 replies; 16+ messages in thread From: Andrea Righi @ 2025-10-09 6:43 UTC (permalink / raw) To: Tejun Heo; +Cc: Phil Auld, David Vernet, Changwoo Min, sched-ext Hi Tejun, On Wed, Oct 08, 2025 at 01:43:26PM -1000, Tejun Heo wrote: > On systems with >4096 CPUs, scx_kick_cpus_pnt_seqs allocation fails during > boot because it exceeds the 32,768 byte percpu allocator limit. > > Restructure to use DEFINE_PER_CPU() for the per-CPU pointers, with each CPU > pointing to its own kvzalloc'd array. Move allocation from boot time to > scx_enable() and free in scx_disable(), so the O(nr_cpu_ids^2) memory is only > consumed when sched_ext is active. > > Use RCU to guard against racing with free. Arrays are freed via call_rcu() > and kick_cpus_irq_workfn() uses rcu_dereference_bh() with a NULL check. > > While at it, rename to scx_kick_pseqs for brevity and update comments to > clarify these are pick_task sequence numbers. > > v2: RCU protect scx_kick_seqs to manage kick_cpus_irq_workfn() racing > against disable as per Andrea. > > v3: Fix bugs notcied by Andrea. > > Reported-by: Phil Auld <pauld@redhat.com> > Link: http://lkml.kernel.org/r/20251007133523.GA93086@pauld.westford.csb > Signed-off-by: Tejun Heo <tj@kernel.org> > Cc: Andrea Righi <arighi@nvidia.com> Looks good now! Reviewed-by: Andrea Righi <arighi@nvidia.com> Thanks, -Andrea > --- > kernel/sched/ext.c | 89 ++++++++++++++++++++++++++++++++++++++++------ > 1 file changed, 79 insertions(+), 10 deletions(-) > > diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c > index 2b0e88206d07..01010c3378b0 100644 > --- a/kernel/sched/ext.c > +++ b/kernel/sched/ext.c > @@ -67,8 +67,19 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; > > static struct delayed_work scx_watchdog_work; > > -/* for %SCX_KICK_WAIT */ > -static unsigned long __percpu *scx_kick_cpus_pnt_seqs; > +/* > + * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of pick_task sequence > + * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu > + * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated > + * lazily when enabling and freed when disabling to avoid waste when sched_ext > + * isn't active. > + */ > +struct scx_kick_pseqs { > + struct rcu_head rcu; > + unsigned long seqs[]; > +}; > + > +static DEFINE_PER_CPU(struct scx_kick_pseqs __rcu *, scx_kick_pseqs); > > /* > * Direct dispatch marker. 
> @@ -3850,6 +3861,27 @@ static const char *scx_exit_reason(enum scx_exit_kind kind) > } > } > > +static void free_kick_pseqs_rcu(struct rcu_head *rcu) > +{ > + struct scx_kick_pseqs *pseqs = container_of(rcu, struct scx_kick_pseqs, rcu); > + > + kvfree(pseqs); > +} > + > +static void free_kick_pseqs(void) > +{ > + int cpu; > + > + for_each_possible_cpu(cpu) { > + struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu); > + struct scx_kick_pseqs *to_free; > + > + to_free = rcu_replace_pointer(*pseqs, NULL, true); > + if (to_free) > + call_rcu(&to_free->rcu, free_kick_pseqs_rcu); > + } > +} > + > static void scx_disable_workfn(struct kthread_work *work) > { > struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); > @@ -3986,6 +4018,7 @@ static void scx_disable_workfn(struct kthread_work *work) > free_percpu(scx_dsp_ctx); > scx_dsp_ctx = NULL; > scx_dsp_max_batch = 0; > + free_kick_pseqs(); > > mutex_unlock(&scx_enable_mutex); > > @@ -4348,6 +4381,33 @@ static void scx_vexit(struct scx_sched *sch, > irq_work_queue(&sch->error_irq_work); > } > > +static int alloc_kick_pseqs(void) > +{ > + int cpu; > + > + /* > + * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size > + * can exceed percpu allocator limits on large machines. > + */ > + for_each_possible_cpu(cpu) { > + struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu); > + struct scx_kick_pseqs *new_pseqs; > + > + WARN_ON_ONCE(rcu_access_pointer(*pseqs)); > + > + new_pseqs = kvzalloc_node(struct_size(new_pseqs, seqs, nr_cpu_ids), > + GFP_KERNEL, cpu_to_node(cpu)); > + if (!new_pseqs) { > + free_kick_pseqs(); > + return -ENOMEM; > + } > + > + rcu_assign_pointer(*pseqs, new_pseqs); > + } > + > + return 0; > +} > + > static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) > { > struct scx_sched *sch; > @@ -4490,15 +4550,19 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) > > mutex_lock(&scx_enable_mutex); > > + ret = alloc_kick_pseqs(); > + if (ret) > + goto err_unlock; > + > if (scx_enable_state() != SCX_DISABLED) { > ret = -EBUSY; > - goto err_unlock; > + goto err_free_pseqs; > } > > sch = scx_alloc_and_add_sched(ops); > if (IS_ERR(sch)) { > ret = PTR_ERR(sch); > - goto err_unlock; > + goto err_free_pseqs; > } > > /* > @@ -4701,6 +4765,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) > > return 0; > > +err_free_pseqs: > + free_kick_pseqs(); > err_unlock: > mutex_unlock(&scx_enable_mutex); > return ret; > @@ -5082,10 +5148,18 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) > { > struct rq *this_rq = this_rq(); > struct scx_rq *this_scx = &this_rq->scx; > - unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); > + struct scx_kick_pseqs __rcu *pseqs_pcpu = __this_cpu_read(scx_kick_pseqs); > bool should_wait = false; > + unsigned long *pseqs; > s32 cpu; > > + if (unlikely(!pseqs_pcpu)) { > + pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_pseqs"); > + return; > + } > + > + pseqs = rcu_dereference_bh(pseqs_pcpu)->seqs; > + > for_each_cpu(cpu, this_scx->cpus_to_kick) { > should_wait |= kick_one_cpu(cpu, this_rq, pseqs); > cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); > @@ -5208,11 +5282,6 @@ void __init init_sched_ext_class(void) > > scx_idle_init_masks(); > > - scx_kick_cpus_pnt_seqs = > - __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, > - __alignof__(scx_kick_cpus_pnt_seqs[0])); > - BUG_ON(!scx_kick_cpus_pnt_seqs); > - > for_each_possible_cpu(cpu) { > struct rq 
*rq = cpu_rq(cpu); > int n = cpu_to_node(cpu); > -- > 2.51.0 > ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH v3] sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc() 2025-10-08 23:43 ` [PATCH v3] " Tejun Heo 2025-10-09 6:43 ` Andrea Righi @ 2025-10-09 12:06 ` Phil Auld 2025-10-10 13:02 ` Phil Auld 2025-10-09 13:58 ` Emil Tsalapatis 2025-10-13 18:44 ` Tejun Heo 3 siblings, 1 reply; 16+ messages in thread From: Phil Auld @ 2025-10-09 12:06 UTC (permalink / raw) To: Tejun Heo; +Cc: Andrea Righi, David Vernet, Changwoo Min, sched-ext Hi Tejun, On Wed, Oct 08, 2025 at 01:43:26PM -1000 Tejun Heo wrote: > On systems with >4096 CPUs, scx_kick_cpus_pnt_seqs allocation fails during > boot because it exceeds the 32,768 byte percpu allocator limit. > > Restructure to use DEFINE_PER_CPU() for the per-CPU pointers, with each CPU > pointing to its own kvzalloc'd array. Move allocation from boot time to > scx_enable() and free in scx_disable(), so the O(nr_cpu_ids^2) memory is only > consumed when sched_ext is active. > > Use RCU to guard against racing with free. Arrays are freed via call_rcu() > and kick_cpus_irq_workfn() uses rcu_dereference_bh() with a NULL check. > > While at it, rename to scx_kick_pseqs for brevity and update comments to > clarify these are pick_task sequence numbers. > > v2: RCU protect scx_kick_seqs to manage kick_cpus_irq_workfn() racing > against disable as per Andrea. > > v3: Fix bugs notcied by Andrea. This looks good to me. The original patch of course did solve the boot issue. They aren't using scx so there were no other issues seen. I've kicked off a build with this version and will try to get that booted on the large system as well. It should solve that problem just as well as the original one obviously. Thanks for the quick turn around. I'm not going to offer a Tested tag since it was only boot tested (with no scx use) on the large system. And all that really tests is the 5 lines removed from init_sched_ext_class... Reviewed-by: Phil Auld <pauld@redhat.com> Cheers, Phil > > Reported-by: Phil Auld <pauld@redhat.com> > Link: http://lkml.kernel.org/r/20251007133523.GA93086@pauld.westford.csb > Signed-off-by: Tejun Heo <tj@kernel.org> > Cc: Andrea Righi <arighi@nvidia.com> > --- > kernel/sched/ext.c | 89 ++++++++++++++++++++++++++++++++++++++++------ > 1 file changed, 79 insertions(+), 10 deletions(-) > > diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c > index 2b0e88206d07..01010c3378b0 100644 > --- a/kernel/sched/ext.c > +++ b/kernel/sched/ext.c > @@ -67,8 +67,19 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; > > static struct delayed_work scx_watchdog_work; > > -/* for %SCX_KICK_WAIT */ > -static unsigned long __percpu *scx_kick_cpus_pnt_seqs; > +/* > + * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of pick_task sequence > + * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu > + * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated > + * lazily when enabling and freed when disabling to avoid waste when sched_ext > + * isn't active. > + */ > +struct scx_kick_pseqs { > + struct rcu_head rcu; > + unsigned long seqs[]; > +}; > + > +static DEFINE_PER_CPU(struct scx_kick_pseqs __rcu *, scx_kick_pseqs); > > /* > * Direct dispatch marker. 
> @@ -3850,6 +3861,27 @@ static const char *scx_exit_reason(enum scx_exit_kind kind) > } > } > > +static void free_kick_pseqs_rcu(struct rcu_head *rcu) > +{ > + struct scx_kick_pseqs *pseqs = container_of(rcu, struct scx_kick_pseqs, rcu); > + > + kvfree(pseqs); > +} > + > +static void free_kick_pseqs(void) > +{ > + int cpu; > + > + for_each_possible_cpu(cpu) { > + struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu); > + struct scx_kick_pseqs *to_free; > + > + to_free = rcu_replace_pointer(*pseqs, NULL, true); > + if (to_free) > + call_rcu(&to_free->rcu, free_kick_pseqs_rcu); > + } > +} > + > static void scx_disable_workfn(struct kthread_work *work) > { > struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); > @@ -3986,6 +4018,7 @@ static void scx_disable_workfn(struct kthread_work *work) > free_percpu(scx_dsp_ctx); > scx_dsp_ctx = NULL; > scx_dsp_max_batch = 0; > + free_kick_pseqs(); > > mutex_unlock(&scx_enable_mutex); > > @@ -4348,6 +4381,33 @@ static void scx_vexit(struct scx_sched *sch, > irq_work_queue(&sch->error_irq_work); > } > > +static int alloc_kick_pseqs(void) > +{ > + int cpu; > + > + /* > + * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size > + * can exceed percpu allocator limits on large machines. > + */ > + for_each_possible_cpu(cpu) { > + struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu); > + struct scx_kick_pseqs *new_pseqs; > + > + WARN_ON_ONCE(rcu_access_pointer(*pseqs)); > + > + new_pseqs = kvzalloc_node(struct_size(new_pseqs, seqs, nr_cpu_ids), > + GFP_KERNEL, cpu_to_node(cpu)); > + if (!new_pseqs) { > + free_kick_pseqs(); > + return -ENOMEM; > + } > + > + rcu_assign_pointer(*pseqs, new_pseqs); > + } > + > + return 0; > +} > + > static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) > { > struct scx_sched *sch; > @@ -4490,15 +4550,19 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) > > mutex_lock(&scx_enable_mutex); > > + ret = alloc_kick_pseqs(); > + if (ret) > + goto err_unlock; > + > if (scx_enable_state() != SCX_DISABLED) { > ret = -EBUSY; > - goto err_unlock; > + goto err_free_pseqs; > } > > sch = scx_alloc_and_add_sched(ops); > if (IS_ERR(sch)) { > ret = PTR_ERR(sch); > - goto err_unlock; > + goto err_free_pseqs; > } > > /* > @@ -4701,6 +4765,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) > > return 0; > > +err_free_pseqs: > + free_kick_pseqs(); > err_unlock: > mutex_unlock(&scx_enable_mutex); > return ret; > @@ -5082,10 +5148,18 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) > { > struct rq *this_rq = this_rq(); > struct scx_rq *this_scx = &this_rq->scx; > - unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); > + struct scx_kick_pseqs __rcu *pseqs_pcpu = __this_cpu_read(scx_kick_pseqs); > bool should_wait = false; > + unsigned long *pseqs; > s32 cpu; > > + if (unlikely(!pseqs_pcpu)) { > + pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_pseqs"); > + return; > + } > + > + pseqs = rcu_dereference_bh(pseqs_pcpu)->seqs; > + > for_each_cpu(cpu, this_scx->cpus_to_kick) { > should_wait |= kick_one_cpu(cpu, this_rq, pseqs); > cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); > @@ -5208,11 +5282,6 @@ void __init init_sched_ext_class(void) > > scx_idle_init_masks(); > > - scx_kick_cpus_pnt_seqs = > - __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, > - __alignof__(scx_kick_cpus_pnt_seqs[0])); > - BUG_ON(!scx_kick_cpus_pnt_seqs); > - > for_each_possible_cpu(cpu) { > struct rq 
*rq = cpu_rq(cpu); > int n = cpu_to_node(cpu); > -- > 2.51.0 > -- ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH v3] sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc() 2025-10-09 12:06 ` Phil Auld @ 2025-10-10 13:02 ` Phil Auld 0 siblings, 0 replies; 16+ messages in thread From: Phil Auld @ 2025-10-10 13:02 UTC (permalink / raw) To: Tejun Heo; +Cc: Andrea Righi, David Vernet, Changwoo Min, sched-ext On Thu, Oct 09, 2025 at 08:06:40AM -0400 Phil Auld wrote: > Hi Tejun, > > On Wed, Oct 08, 2025 at 01:43:26PM -1000 Tejun Heo wrote: > > On systems with >4096 CPUs, scx_kick_cpus_pnt_seqs allocation fails during > > boot because it exceeds the 32,768 byte percpu allocator limit. > > > > Restructure to use DEFINE_PER_CPU() for the per-CPU pointers, with each CPU > > pointing to its own kvzalloc'd array. Move allocation from boot time to > > scx_enable() and free in scx_disable(), so the O(nr_cpu_ids^2) memory is only > > consumed when sched_ext is active. > > > > Use RCU to guard against racing with free. Arrays are freed via call_rcu() > > and kick_cpus_irq_workfn() uses rcu_dereference_bh() with a NULL check. > > > > While at it, rename to scx_kick_pseqs for brevity and update comments to > > clarify these are pick_task sequence numbers. > > > > v2: RCU protect scx_kick_seqs to manage kick_cpus_irq_workfn() racing > > against disable as per Andrea. > > > > v3: Fix bugs notcied by Andrea. > > This looks good to me. The original patch of course did solve > the boot issue. They aren't using scx so there were no other issues > seen. I've kicked off a build with this version and will try to > get that booted on the large system as well. It should solve that > problem just as well as the original one obviously. > Just to follow up: as expected this also boots on the 5220 cpu system. Cheers, Phil > Thanks for the quick turn around. > > I'm not going to offer a Tested tag since it was only boot tested > (with no scx use) on the large system. And all that really tests > is the 5 lines removed from init_sched_ext_class... > > Reviewed-by: Phil Auld <pauld@redhat.com> > > > Cheers, > Phil > > > > > Reported-by: Phil Auld <pauld@redhat.com> > > Link: http://lkml.kernel.org/r/20251007133523.GA93086@pauld.westford.csb > > Signed-off-by: Tejun Heo <tj@kernel.org> > > Cc: Andrea Righi <arighi@nvidia.com> > > --- > > kernel/sched/ext.c | 89 ++++++++++++++++++++++++++++++++++++++++------ > > 1 file changed, 79 insertions(+), 10 deletions(-) > > > > diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c > > index 2b0e88206d07..01010c3378b0 100644 > > --- a/kernel/sched/ext.c > > +++ b/kernel/sched/ext.c > > @@ -67,8 +67,19 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; > > > > static struct delayed_work scx_watchdog_work; > > > > -/* for %SCX_KICK_WAIT */ > > -static unsigned long __percpu *scx_kick_cpus_pnt_seqs; > > +/* > > + * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of pick_task sequence > > + * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu > > + * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated > > + * lazily when enabling and freed when disabling to avoid waste when sched_ext > > + * isn't active. > > + */ > > +struct scx_kick_pseqs { > > + struct rcu_head rcu; > > + unsigned long seqs[]; > > +}; > > + > > +static DEFINE_PER_CPU(struct scx_kick_pseqs __rcu *, scx_kick_pseqs); > > > > /* > > * Direct dispatch marker. 
> > @@ -3850,6 +3861,27 @@ static const char *scx_exit_reason(enum scx_exit_kind kind) > > } > > } > > > > +static void free_kick_pseqs_rcu(struct rcu_head *rcu) > > +{ > > + struct scx_kick_pseqs *pseqs = container_of(rcu, struct scx_kick_pseqs, rcu); > > + > > + kvfree(pseqs); > > +} > > + > > +static void free_kick_pseqs(void) > > +{ > > + int cpu; > > + > > + for_each_possible_cpu(cpu) { > > + struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu); > > + struct scx_kick_pseqs *to_free; > > + > > + to_free = rcu_replace_pointer(*pseqs, NULL, true); > > + if (to_free) > > + call_rcu(&to_free->rcu, free_kick_pseqs_rcu); > > + } > > +} > > + > > static void scx_disable_workfn(struct kthread_work *work) > > { > > struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); > > @@ -3986,6 +4018,7 @@ static void scx_disable_workfn(struct kthread_work *work) > > free_percpu(scx_dsp_ctx); > > scx_dsp_ctx = NULL; > > scx_dsp_max_batch = 0; > > + free_kick_pseqs(); > > > > mutex_unlock(&scx_enable_mutex); > > > > @@ -4348,6 +4381,33 @@ static void scx_vexit(struct scx_sched *sch, > > irq_work_queue(&sch->error_irq_work); > > } > > > > +static int alloc_kick_pseqs(void) > > +{ > > + int cpu; > > + > > + /* > > + * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size > > + * can exceed percpu allocator limits on large machines. > > + */ > > + for_each_possible_cpu(cpu) { > > + struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu); > > + struct scx_kick_pseqs *new_pseqs; > > + > > + WARN_ON_ONCE(rcu_access_pointer(*pseqs)); > > + > > + new_pseqs = kvzalloc_node(struct_size(new_pseqs, seqs, nr_cpu_ids), > > + GFP_KERNEL, cpu_to_node(cpu)); > > + if (!new_pseqs) { > > + free_kick_pseqs(); > > + return -ENOMEM; > > + } > > + > > + rcu_assign_pointer(*pseqs, new_pseqs); > > + } > > + > > + return 0; > > +} > > + > > static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) > > { > > struct scx_sched *sch; > > @@ -4490,15 +4550,19 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) > > > > mutex_lock(&scx_enable_mutex); > > > > + ret = alloc_kick_pseqs(); > > + if (ret) > > + goto err_unlock; > > + > > if (scx_enable_state() != SCX_DISABLED) { > > ret = -EBUSY; > > - goto err_unlock; > > + goto err_free_pseqs; > > } > > > > sch = scx_alloc_and_add_sched(ops); > > if (IS_ERR(sch)) { > > ret = PTR_ERR(sch); > > - goto err_unlock; > > + goto err_free_pseqs; > > } > > > > /* > > @@ -4701,6 +4765,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) > > > > return 0; > > > > +err_free_pseqs: > > + free_kick_pseqs(); > > err_unlock: > > mutex_unlock(&scx_enable_mutex); > > return ret; > > @@ -5082,10 +5148,18 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) > > { > > struct rq *this_rq = this_rq(); > > struct scx_rq *this_scx = &this_rq->scx; > > - unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); > > + struct scx_kick_pseqs __rcu *pseqs_pcpu = __this_cpu_read(scx_kick_pseqs); > > bool should_wait = false; > > + unsigned long *pseqs; > > s32 cpu; > > > > + if (unlikely(!pseqs_pcpu)) { > > + pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_pseqs"); > > + return; > > + } > > + > > + pseqs = rcu_dereference_bh(pseqs_pcpu)->seqs; > > + > > for_each_cpu(cpu, this_scx->cpus_to_kick) { > > should_wait |= kick_one_cpu(cpu, this_rq, pseqs); > > cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); > > @@ -5208,11 +5282,6 @@ void __init init_sched_ext_class(void) > > > > 
scx_idle_init_masks(); > > > > - scx_kick_cpus_pnt_seqs = > > - __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, > > - __alignof__(scx_kick_cpus_pnt_seqs[0])); > > - BUG_ON(!scx_kick_cpus_pnt_seqs); > > - > > for_each_possible_cpu(cpu) { > > struct rq *rq = cpu_rq(cpu); > > int n = cpu_to_node(cpu); > > -- > > 2.51.0 > > > > -- -- ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH v3] sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc()
  2025-10-08 23:43           ` [PATCH v3] " Tejun Heo
  2025-10-09  6:43             ` Andrea Righi
  2025-10-09 12:06             ` Phil Auld
@ 2025-10-09 13:58             ` Emil Tsalapatis
  2025-10-13 18:44             ` Tejun Heo
  3 siblings, 0 replies; 16+ messages in thread
From: Emil Tsalapatis @ 2025-10-09 13:58 UTC (permalink / raw)
To: Tejun Heo
Cc: Andrea Righi, Phil Auld, David Vernet, Changwoo Min, sched-ext

On Wed, Oct 8, 2025 at 7:43 PM Tejun Heo <tj@kernel.org> wrote:
>
> On systems with >4096 CPUs, scx_kick_cpus_pnt_seqs allocation fails during
> boot because it exceeds the 32,768 byte percpu allocator limit.
>
> Restructure to use DEFINE_PER_CPU() for the per-CPU pointers, with each CPU
> pointing to its own kvzalloc'd array. Move allocation from boot time to
> scx_enable() and free in scx_disable(), so the O(nr_cpu_ids^2) memory is only
> consumed when sched_ext is active.
>
> Use RCU to guard against racing with free. Arrays are freed via call_rcu()
> and kick_cpus_irq_workfn() uses rcu_dereference_bh() with a NULL check.
>
> While at it, rename to scx_kick_pseqs for brevity and update comments to
> clarify these are pick_task sequence numbers.
>
> v2: RCU protect scx_kick_pseqs to manage kick_cpus_irq_workfn() racing
>     against disable as per Andrea.
>
> v3: Fix bugs noticed by Andrea.
>
> Reported-by: Phil Auld <pauld@redhat.com>
> Link: http://lkml.kernel.org/r/20251007133523.GA93086@pauld.westford.csb
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Cc: Andrea Righi <arighi@nvidia.com>
> ---

Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>

>  kernel/sched/ext.c | 89 ++++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 79 insertions(+), 10 deletions(-)
>
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 2b0e88206d07..01010c3378b0 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -67,8 +67,19 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
>
>  static struct delayed_work scx_watchdog_work;
>
> -/* for %SCX_KICK_WAIT */
> -static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
> +/*
> + * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of pick_task sequence
> + * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu
> + * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated
> + * lazily when enabling and freed when disabling to avoid waste when sched_ext
> + * isn't active.
> + */
> +struct scx_kick_pseqs {
> +	struct rcu_head		rcu;
> +	unsigned long		seqs[];
> +};
> +
> +static DEFINE_PER_CPU(struct scx_kick_pseqs __rcu *, scx_kick_pseqs);
>
>  /*
>   * Direct dispatch marker.
> @@ -3850,6 +3861,27 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
>  	}
>  }
>
> +static void free_kick_pseqs_rcu(struct rcu_head *rcu)
> +{
> +	struct scx_kick_pseqs *pseqs = container_of(rcu, struct scx_kick_pseqs, rcu);
> +
> +	kvfree(pseqs);
> +}
> +
> +static void free_kick_pseqs(void)
> +{
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu);
> +		struct scx_kick_pseqs *to_free;
> +
> +		to_free = rcu_replace_pointer(*pseqs, NULL, true);
> +		if (to_free)
> +			call_rcu(&to_free->rcu, free_kick_pseqs_rcu);
> +	}
> +}
> +
>  static void scx_disable_workfn(struct kthread_work *work)
>  {
>  	struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
> @@ -3986,6 +4018,7 @@ static void scx_disable_workfn(struct kthread_work *work)
>  	free_percpu(scx_dsp_ctx);
>  	scx_dsp_ctx = NULL;
>  	scx_dsp_max_batch = 0;
> +	free_kick_pseqs();
>
>  	mutex_unlock(&scx_enable_mutex);
>
> @@ -4348,6 +4381,33 @@ static void scx_vexit(struct scx_sched *sch,
>  	irq_work_queue(&sch->error_irq_work);
>  }
>
> +static int alloc_kick_pseqs(void)
> +{
> +	int cpu;
> +
> +	/*
> +	 * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size
> +	 * can exceed percpu allocator limits on large machines.
> +	 */
> +	for_each_possible_cpu(cpu) {
> +		struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu);
> +		struct scx_kick_pseqs *new_pseqs;
> +
> +		WARN_ON_ONCE(rcu_access_pointer(*pseqs));
> +
> +		new_pseqs = kvzalloc_node(struct_size(new_pseqs, seqs, nr_cpu_ids),
> +					  GFP_KERNEL, cpu_to_node(cpu));
> +		if (!new_pseqs) {
> +			free_kick_pseqs();
> +			return -ENOMEM;
> +		}
> +
> +		rcu_assign_pointer(*pseqs, new_pseqs);
> +	}
> +
> +	return 0;
> +}
> +
>  static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
>  {
>  	struct scx_sched *sch;
> @@ -4490,15 +4550,19 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>
>  	mutex_lock(&scx_enable_mutex);
>
> +	ret = alloc_kick_pseqs();
> +	if (ret)
> +		goto err_unlock;
> +
>  	if (scx_enable_state() != SCX_DISABLED) {
>  		ret = -EBUSY;
> -		goto err_unlock;
> +		goto err_free_pseqs;
>  	}
>
>  	sch = scx_alloc_and_add_sched(ops);
>  	if (IS_ERR(sch)) {
>  		ret = PTR_ERR(sch);
> -		goto err_unlock;
> +		goto err_free_pseqs;
>  	}
>
>  	/*
> @@ -4701,6 +4765,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>
>  	return 0;
>
> +err_free_pseqs:
> +	free_kick_pseqs();
>  err_unlock:
>  	mutex_unlock(&scx_enable_mutex);
>  	return ret;
> @@ -5082,10 +5148,18 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
>  {
>  	struct rq *this_rq = this_rq();
>  	struct scx_rq *this_scx = &this_rq->scx;
> -	unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs);
> +	struct scx_kick_pseqs __rcu *pseqs_pcpu = __this_cpu_read(scx_kick_pseqs);
>  	bool should_wait = false;
> +	unsigned long *pseqs;
>  	s32 cpu;
>
> +	if (unlikely(!pseqs_pcpu)) {
> +		pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_pseqs");
> +		return;
> +	}
> +
> +	pseqs = rcu_dereference_bh(pseqs_pcpu)->seqs;
> +
>  	for_each_cpu(cpu, this_scx->cpus_to_kick) {
>  		should_wait |= kick_one_cpu(cpu, this_rq, pseqs);
>  		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
> @@ -5208,11 +5282,6 @@ void __init init_sched_ext_class(void)
>
>  	scx_idle_init_masks();
>
> -	scx_kick_cpus_pnt_seqs =
> -		__alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
> -			       __alignof__(scx_kick_cpus_pnt_seqs[0]));
> -	BUG_ON(!scx_kick_cpus_pnt_seqs);
> -
>  	for_each_possible_cpu(cpu) {
>  		struct rq *rq = cpu_rq(cpu);
>  		int n = cpu_to_node(cpu);
> --
> 2.51.0
>

^ permalink raw reply	[flat|nested] 16+ messages in thread
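[Sidebar: the figures in the changelog are easy to verify by hand. A quick standalone calculation; the 4224 and 8192 CPU counts are the ones quoted in this thread, and 32,768 bytes is the percpu allocator's per-allocation cap that trips the WARN.]

	/* sizes.c - the arithmetic behind the boot failure and the
	 * rejected static NR_CPUS alternative */
	#include <stdio.h>

	int main(void)
	{
		unsigned long nr_cpu_ids = 4224;	/* possible CPUs seen at boot */
		unsigned long nr_cpus = 8192;		/* compile-time NR_CPUS */

		/* one pick_task sequence array: one unsigned long per CPU */
		printf("percpu alloc: %lu bytes (cap: 32768)\n",
		       nr_cpu_ids * sizeof(unsigned long));	/* -> 33792 */

		/* static NR_CPUS x NR_CPUS matrix: a full array per CPU */
		printf("static matrix: %lu MiB\n",
		       nr_cpus * nr_cpus * sizeof(unsigned long) >> 20);  /* -> 512 */
		return 0;
	}

The first number is why the __alloc_percpu() call WARNs past 4096 CPUs; the second is why a static array was never a serious option.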
* Re: [PATCH v3] sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc()
  2025-10-08 23:43           ` [PATCH v3] " Tejun Heo
                              ` (2 preceding siblings ...)
  2025-10-09 13:58             ` Emil Tsalapatis
@ 2025-10-13 18:44             ` Tejun Heo
  2025-10-13 20:13               ` Andrea Righi
  3 siblings, 1 reply; 16+ messages in thread
From: Tejun Heo @ 2025-10-13 18:44 UTC (permalink / raw)
To: Phil Auld, Andrea Righi, Emil Tsalapatis
Cc: David Vernet, Changwoo Min, sched-ext, linux-kernel

> Tejun Heo (1):
>       sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc()

Applied to sched_ext/for-6.18-fixes.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 16+ messages in thread
* Re: [PATCH v3] sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc()
  2025-10-13 18:44             ` Tejun Heo
@ 2025-10-13 20:13               ` Andrea Righi
  0 siblings, 0 replies; 16+ messages in thread
From: Andrea Righi @ 2025-10-13 20:13 UTC (permalink / raw)
To: Tejun Heo
Cc: Phil Auld, Emil Tsalapatis, David Vernet, Changwoo Min, sched-ext,
	linux-kernel

On Mon, Oct 13, 2025 at 08:44:26AM -1000, Tejun Heo wrote:
> > Tejun Heo (1):
> >       sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc()
> 
> Applied to sched_ext/for-6.18-fixes.

Meh, too late. I just noticed a bug with this. I'll send a follow-up patch.

-Andrea

> 
> Thanks.
> 
> -- 
> tejun

^ permalink raw reply	[flat|nested] 16+ messages in thread
* Re: sched_ext and large cpu counts
  2025-10-08  2:37 ` Tejun Heo
  2025-10-08  6:10   ` Andrea Righi
@ 2025-10-08 11:23   ` Phil Auld
  1 sibling, 0 replies; 16+ messages in thread
From: Phil Auld @ 2025-10-08 11:23 UTC (permalink / raw)
To: Tejun Heo; +Cc: Andrea Righi, David Vernet, Changwoo Min, sched-ext

On Tue, Oct 07, 2025 at 04:37:24PM -1000 Tejun Heo wrote:
> Hello,
> 
> Can you please see whether the following patch resolves the problem?
> 

Yes, thanks, I'll do that. It may take a little time since they have not
given me such a machine. I'll build it and pass it on to the partner to
test it. May take a few days to turn around.

Cheers,
Phil

> Thanks.
> 
> -- 
> tejun
> 
> ----- 8< -----
> From 4d7f7d24e90fba47bb08ddbeb8668123b4bbab1b Mon Sep 17 00:00:00 2001
> From: Tejun Heo <tj@kernel.org>
> Date: Tue, 7 Oct 2025 16:23:43 -1000
> Subject: [PATCH] sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using
>  kvzalloc()
> 
> On systems with >4096 CPUs, scx_kick_cpus_pnt_seqs allocation fails during
> boot because it exceeds the 32,768 byte percpu allocator limit. The allocation
> size is sizeof(unsigned long) * nr_cpu_ids, which becomes 33,792 bytes with
> 4224 CPUs.
> 
> Restructure scx_kick_cpus_pnt_seqs to use DEFINE_PER_CPU() for the per-CPU
> pointers, with each CPU pointing to its own kvzalloc'd array. This avoids
> percpu allocator size limits. Additionally, move allocation from boot time to
> scx_enable() and free in scx_disable(), so the O(nr_cpu_ids^2) memory is only
> consumed when sched_ext is active.
> 
> Reported-by: Phil Auld <pauld@redhat.com>
> Link: http://lkml.kernel.org/r/20251007133523.GA93086@pauld.westford.csb
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
>  kernel/sched/ext.c | 59 ++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 49 insertions(+), 10 deletions(-)
> 
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 2b0e88206d07..042fc73fb141 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -67,8 +67,13 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
> 
>  static struct delayed_work scx_watchdog_work;
> 
> -/* for %SCX_KICK_WAIT */
> -static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
> +/*
> + * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of sequence numbers.
> + * The arrays are allocated with kvzalloc() as size can exceed percpu allocator
> + * limits on large machines. O(nr_cpu_ids^2) allocation, allocated lazily when
> + * enabling and freed when disabling to avoid waste when sched_ext isn't active.
> + */
> +static DEFINE_PER_CPU(unsigned long *, scx_kick_cpus_pnt_seqs);
> 
>  /*
>   * Direct dispatch marker.
> @@ -3850,6 +3855,16 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
>  	}
>  }
> 
> +static void free_kick_cpus_pnt_seqs(void)
> +{
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu) {
> +		kvfree(per_cpu(scx_kick_cpus_pnt_seqs, cpu));
> +		per_cpu(scx_kick_cpus_pnt_seqs, cpu) = NULL;
> +	}
> +}
> +
>  static void scx_disable_workfn(struct kthread_work *work)
>  {
>  	struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
> @@ -3986,6 +4001,7 @@ static void scx_disable_workfn(struct kthread_work *work)
>  	free_percpu(scx_dsp_ctx);
>  	scx_dsp_ctx = NULL;
>  	scx_dsp_max_batch = 0;
> +	free_kick_cpus_pnt_seqs();
> 
>  	mutex_unlock(&scx_enable_mutex);
> 
> @@ -4348,6 +4364,28 @@ static void scx_vexit(struct scx_sched *sch,
>  	irq_work_queue(&sch->error_irq_work);
>  }
> 
> +static int alloc_kick_cpus_pnt_seqs(void)
> +{
> +	int cpu;
> +
> +	/*
> +	 * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size
> +	 * can exceed percpu allocator limits on large machines.
> +	 */
> +	for_each_possible_cpu(cpu) {
> +		WARN_ON_ONCE(per_cpu(scx_kick_cpus_pnt_seqs, cpu));
> +		per_cpu(scx_kick_cpus_pnt_seqs, cpu) =
> +			kvzalloc_node(sizeof(unsigned long) * nr_cpu_ids,
> +				      GFP_KERNEL, cpu_to_node(cpu));
> +		if (!per_cpu(scx_kick_cpus_pnt_seqs, cpu)) {
> +			free_kick_cpus_pnt_seqs();
> +			return -ENOMEM;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
>  {
>  	struct scx_sched *sch;
> @@ -4490,15 +4528,19 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
> 
>  	mutex_lock(&scx_enable_mutex);
> 
> +	ret = alloc_kick_cpus_pnt_seqs();
> +	if (ret)
> +		goto err_unlock;
> +
>  	if (scx_enable_state() != SCX_DISABLED) {
>  		ret = -EBUSY;
> -		goto err_unlock;
> +		goto err_free_pseqs;
>  	}
> 
>  	sch = scx_alloc_and_add_sched(ops);
>  	if (IS_ERR(sch)) {
>  		ret = PTR_ERR(sch);
> -		goto err_unlock;
> +		goto err_free_pseqs;
>  	}
> 
>  	/*
> @@ -4701,6 +4743,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
> 
>  	return 0;
> 
> +err_free_pseqs:
> +	free_kick_cpus_pnt_seqs();
>  err_unlock:
>  	mutex_unlock(&scx_enable_mutex);
>  	return ret;
> @@ -5082,7 +5126,7 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
>  {
>  	struct rq *this_rq = this_rq();
>  	struct scx_rq *this_scx = &this_rq->scx;
> -	unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs);
> +	unsigned long *pseqs = __this_cpu_read(scx_kick_cpus_pnt_seqs);
>  	bool should_wait = false;
>  	s32 cpu;
> 
> @@ -5208,11 +5252,6 @@ void __init init_sched_ext_class(void)
> 
>  	scx_idle_init_masks();
> 
> -	scx_kick_cpus_pnt_seqs =
> -		__alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
> -			       __alignof__(scx_kick_cpus_pnt_seqs[0]));
> -	BUG_ON(!scx_kick_cpus_pnt_seqs);
> -
>  	for_each_possible_cpu(cpu) {
>  		struct rq *rq = cpu_rq(cpu);
>  		int n = cpu_to_node(cpu);
> -- 
> 2.51.0
> 

-- 

^ permalink raw reply	[flat|nested] 16+ messages in thread
end of thread, other threads:[~2025-10-13 20:13 UTC | newest]

Thread overview: 16+ messages

2025-10-07 13:35 sched_ext and large cpu counts Phil Auld
2025-10-08  2:37 ` Tejun Heo
2025-10-08  6:10   ` Andrea Righi
2025-10-08 20:53     ` Tejun Heo
2025-10-08 21:48       ` [PATCH v2] sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc() Tejun Heo
2025-10-08 22:24         ` Andrea Righi
2025-10-08 23:36           ` Tejun Heo
2025-10-08 23:38             ` Tejun Heo
2025-10-08 23:43           ` [PATCH v3] " Tejun Heo
2025-10-09  6:43             ` Andrea Righi
2025-10-09 12:06             ` Phil Auld
2025-10-10 13:02               ` Phil Auld
2025-10-09 13:58             ` Emil Tsalapatis
2025-10-13 18:44             ` Tejun Heo
2025-10-13 20:13               ` Andrea Righi
2025-10-08 11:23 ` sched_ext and large cpu counts Phil Auld