Sched_ext development
 help / color / mirror / Atom feed
* [PATCH sched_ext/for-7.2] selftests/sched_ext: Fix select_cpu_dfl link leak on early return
@ 2026-05-08  6:55 Cheng-Yang Chou
  2026-05-08  8:08 ` Andrea Righi
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Cheng-Yang Chou @ 2026-05-08  6:55 UTC (permalink / raw)
  To: sched-ext, Tejun Heo, David Vernet, Andrea Righi, Changwoo Min
  Cc: Ching-Chun Huang, Chia-Ping Tsai, yphbchou0911

If run() exits early via SCX_EQ/SCX_ASSERT (which calls return
directly), bpf_link__destroy() is never reached and the BPF
scheduler stays loaded. All subsequent tests then fail to attach
because SCX is not in the DISABLED state.

Move bpf_link into a context struct so cleanup() always destroys
it, regardless of how run() exits. Also skip waitpid() for children
where fork() returned -1, avoiding waitpid(-1,...) accidentally
reaping an unrelated child and triggering the early return path.

Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
---
I was hitting this error when testing the defer SCX bandwidth patchset,
though I think it shouldn't hit the bug under regular testing. Tested
this on the for-next branch and everything still looks fine.

 .../selftests/sched_ext/select_cpu_dfl.c      | 54 +++++++++++++------
 1 file changed, 38 insertions(+), 16 deletions(-)

diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.c
index 5b6e045e1109..7e342c0cec65 100644
--- a/tools/testing/selftests/sched_ext/select_cpu_dfl.c
+++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.c
@@ -6,6 +6,7 @@
  */
 #include <bpf/bpf.h>
 #include <scx/common.h>
+#include <stdlib.h>
 #include <sys/wait.h>
 #include <unistd.h>
 #include "select_cpu_dfl.bpf.skel.h"
@@ -13,29 +14,44 @@
 
 #define NUM_CHILDREN 1028
 
+struct select_cpu_dfl_ctx {
+	struct select_cpu_dfl	*skel;
+	struct bpf_link		*link;
+};
+
 static enum scx_test_status setup(void **ctx)
 {
-	struct select_cpu_dfl *skel;
+	struct select_cpu_dfl_ctx *tctx;
+
+	tctx = malloc(sizeof(*tctx));
+	SCX_FAIL_IF(!tctx, "Failed to allocate test context");
+	tctx->link = NULL;
 
-	skel = select_cpu_dfl__open();
-	SCX_FAIL_IF(!skel, "Failed to open");
-	SCX_ENUM_INIT(skel);
-	SCX_FAIL_IF(select_cpu_dfl__load(skel), "Failed to load skel");
+	tctx->skel = select_cpu_dfl__open();
+	if (!tctx->skel) {
+		free(tctx);
+		SCX_FAIL("Failed to open");
+	}
+	SCX_ENUM_INIT(tctx->skel);
+	if (select_cpu_dfl__load(tctx->skel)) {
+		select_cpu_dfl__destroy(tctx->skel);
+		free(tctx);
+		SCX_FAIL("Failed to load skel");
+	}
 
-	*ctx = skel;
+	*ctx = tctx;
 
 	return SCX_TEST_PASS;
 }
 
 static enum scx_test_status run(void *ctx)
 {
-	struct select_cpu_dfl *skel = ctx;
-	struct bpf_link *link;
+	struct select_cpu_dfl_ctx *tctx = ctx;
 	pid_t pids[NUM_CHILDREN];
-	int i, status;
+	int i, status, nforked = 0;
 
-	link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_ops);
-	SCX_FAIL_IF(!link, "Failed to attach scheduler");
+	tctx->link = bpf_map__attach_struct_ops(tctx->skel->maps.select_cpu_dfl_ops);
+	SCX_FAIL_IF(!tctx->link, "Failed to attach scheduler");
 
 	for (i = 0; i < NUM_CHILDREN; i++) {
 		pids[i] = fork();
@@ -43,25 +59,31 @@ static enum scx_test_status run(void *ctx)
 			sleep(1);
 			exit(0);
 		}
+		if (pids[i] > 0)
+			nforked++;
 	}
 
 	for (i = 0; i < NUM_CHILDREN; i++) {
+		if (pids[i] <= 0)
+			continue;
 		SCX_EQ(waitpid(pids[i], &status, 0), pids[i]);
 		SCX_EQ(status, 0);
 	}
 
-	SCX_ASSERT(!skel->bss->saw_local);
-
-	bpf_link__destroy(link);
+	SCX_GT(nforked, 0);
+	SCX_ASSERT(!tctx->skel->bss->saw_local);
 
 	return SCX_TEST_PASS;
 }
 
 static void cleanup(void *ctx)
 {
-	struct select_cpu_dfl *skel = ctx;
+	struct select_cpu_dfl_ctx *tctx = ctx;
 
-	select_cpu_dfl__destroy(skel);
+	if (tctx->link)
+		bpf_link__destroy(tctx->link);
+	select_cpu_dfl__destroy(tctx->skel);
+	free(tctx);
 }
 
 struct scx_test select_cpu_dfl = {
-- 
2.48.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH sched_ext/for-7.2] selftests/sched_ext: Fix select_cpu_dfl link leak on early return
  2026-05-08  6:55 [PATCH sched_ext/for-7.2] selftests/sched_ext: Fix select_cpu_dfl link leak on early return Cheng-Yang Chou
@ 2026-05-08  8:08 ` Andrea Righi
  2026-05-08  9:54   ` Cheng-Yang Chou
  2026-05-08 15:45 ` Tejun Heo
  2026-05-08 18:20 ` sashiko-bot
  2 siblings, 1 reply; 5+ messages in thread
From: Andrea Righi @ 2026-05-08  8:08 UTC (permalink / raw)
  To: Cheng-Yang Chou
  Cc: sched-ext, Tejun Heo, David Vernet, Changwoo Min,
	Ching-Chun Huang, Chia-Ping Tsai

Hi Cheng-Yang,

On Fri, May 08, 2026 at 02:55:12PM +0800, Cheng-Yang Chou wrote:
> If run() exits early via SCX_EQ/SCX_ASSERT (which calls return
> directly), bpf_link__destroy() is never reached and the BPF
> scheduler stays loaded. All subsequent tests then fail to attach
> because SCX is not in the DISABLED state.
> 
> Move bpf_link into a context struct so cleanup() always destroys
> it, regardless of how run() exits. Also skip waitpid() for children
> where fork() returned -1, avoiding waitpid(-1,...) accidentally
> reaping an unrelated child and triggering the early return path.
> 
> Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>

Looks good to me.

Reviewed-by: Andrea Righi <arighi@nvidia.com>

I think other selftests may need a similar change to prevent potential cascading
failures.

Thanks,
-Andrea

> ---
> I was hitting this error when testing the defer SCX bandwidth patchset,
> though I think it shouldn't hit the bug under regular testing. Tested
> this on the for-next branch and everything still looks fine.
> 
>  .../selftests/sched_ext/select_cpu_dfl.c      | 54 +++++++++++++------
>  1 file changed, 38 insertions(+), 16 deletions(-)
> 
> diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.c
> index 5b6e045e1109..7e342c0cec65 100644
> --- a/tools/testing/selftests/sched_ext/select_cpu_dfl.c
> +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.c
> @@ -6,6 +6,7 @@
>   */
>  #include <bpf/bpf.h>
>  #include <scx/common.h>
> +#include <stdlib.h>
>  #include <sys/wait.h>
>  #include <unistd.h>
>  #include "select_cpu_dfl.bpf.skel.h"
> @@ -13,29 +14,44 @@
>  
>  #define NUM_CHILDREN 1028
>  
> +struct select_cpu_dfl_ctx {
> +	struct select_cpu_dfl	*skel;
> +	struct bpf_link		*link;
> +};
> +
>  static enum scx_test_status setup(void **ctx)
>  {
> -	struct select_cpu_dfl *skel;
> +	struct select_cpu_dfl_ctx *tctx;
> +
> +	tctx = malloc(sizeof(*tctx));
> +	SCX_FAIL_IF(!tctx, "Failed to allocate test context");
> +	tctx->link = NULL;
>  
> -	skel = select_cpu_dfl__open();
> -	SCX_FAIL_IF(!skel, "Failed to open");
> -	SCX_ENUM_INIT(skel);
> -	SCX_FAIL_IF(select_cpu_dfl__load(skel), "Failed to load skel");
> +	tctx->skel = select_cpu_dfl__open();
> +	if (!tctx->skel) {
> +		free(tctx);
> +		SCX_FAIL("Failed to open");
> +	}
> +	SCX_ENUM_INIT(tctx->skel);
> +	if (select_cpu_dfl__load(tctx->skel)) {
> +		select_cpu_dfl__destroy(tctx->skel);
> +		free(tctx);
> +		SCX_FAIL("Failed to load skel");
> +	}
>  
> -	*ctx = skel;
> +	*ctx = tctx;
>  
>  	return SCX_TEST_PASS;
>  }
>  
>  static enum scx_test_status run(void *ctx)
>  {
> -	struct select_cpu_dfl *skel = ctx;
> -	struct bpf_link *link;
> +	struct select_cpu_dfl_ctx *tctx = ctx;
>  	pid_t pids[NUM_CHILDREN];
> -	int i, status;
> +	int i, status, nforked = 0;
>  
> -	link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_ops);
> -	SCX_FAIL_IF(!link, "Failed to attach scheduler");
> +	tctx->link = bpf_map__attach_struct_ops(tctx->skel->maps.select_cpu_dfl_ops);
> +	SCX_FAIL_IF(!tctx->link, "Failed to attach scheduler");
>  
>  	for (i = 0; i < NUM_CHILDREN; i++) {
>  		pids[i] = fork();
> @@ -43,25 +59,31 @@ static enum scx_test_status run(void *ctx)
>  			sleep(1);
>  			exit(0);
>  		}
> +		if (pids[i] > 0)
> +			nforked++;
>  	}
>  
>  	for (i = 0; i < NUM_CHILDREN; i++) {
> +		if (pids[i] <= 0)
> +			continue;
>  		SCX_EQ(waitpid(pids[i], &status, 0), pids[i]);
>  		SCX_EQ(status, 0);
>  	}
>  
> -	SCX_ASSERT(!skel->bss->saw_local);
> -
> -	bpf_link__destroy(link);
> +	SCX_GT(nforked, 0);
> +	SCX_ASSERT(!tctx->skel->bss->saw_local);
>  
>  	return SCX_TEST_PASS;
>  }
>  
>  static void cleanup(void *ctx)
>  {
> -	struct select_cpu_dfl *skel = ctx;
> +	struct select_cpu_dfl_ctx *tctx = ctx;
>  
> -	select_cpu_dfl__destroy(skel);
> +	if (tctx->link)
> +		bpf_link__destroy(tctx->link);
> +	select_cpu_dfl__destroy(tctx->skel);
> +	free(tctx);
>  }
>  
>  struct scx_test select_cpu_dfl = {
> -- 
> 2.48.1
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH sched_ext/for-7.2] selftests/sched_ext: Fix select_cpu_dfl link leak on early return
  2026-05-08  8:08 ` Andrea Righi
@ 2026-05-08  9:54   ` Cheng-Yang Chou
  0 siblings, 0 replies; 5+ messages in thread
From: Cheng-Yang Chou @ 2026-05-08  9:54 UTC (permalink / raw)
  To: Andrea Righi
  Cc: sched-ext, Tejun Heo, David Vernet, Changwoo Min,
	Ching-Chun Huang, Chia-Ping Tsai

Hi Andrea,

On Fri, May 08, 2026 at 10:08:47AM +0200, Andrea Righi wrote:
> Hi Cheng-Yang,
> 
> On Fri, May 08, 2026 at 02:55:12PM +0800, Cheng-Yang Chou wrote:
> > If run() exits early via SCX_EQ/SCX_ASSERT (which calls return
> > directly), bpf_link__destroy() is never reached and the BPF
> > scheduler stays loaded. All subsequent tests then fail to attach
> > because SCX is not in the DISABLED state.
> > 
> > Move bpf_link into a context struct so cleanup() always destroys
> > it, regardless of how run() exits. Also skip waitpid() for children
> > where fork() returned -1, avoiding waitpid(-1,...) accidentally
> > reaping an unrelated child and triggering the early return path.
> > 
> > Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
> 
> Looks good to me.
> 
> Reviewed-by: Andrea Righi <arighi@nvidia.com>
> 
> I think other selftests may need a similar change to prevent potential cascading
> failures.

Thanks for the review!

Sure, I'll scan through the other selftests to see if the same
cleanup can be applied. Thanks!

> 
> Thanks,
> -Andrea
> 
> > ---
> > I was hitting this error when testing the defer SCX bandwidth patchset,
> > though I think it shouldn't hit the bug under regular testing. Tested
> > this on the for-next branch and everything still looks fine.
> > 
> >  .../selftests/sched_ext/select_cpu_dfl.c      | 54 +++++++++++++------
> >  1 file changed, 38 insertions(+), 16 deletions(-)
> > 
> > diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.c
> > index 5b6e045e1109..7e342c0cec65 100644
> > --- a/tools/testing/selftests/sched_ext/select_cpu_dfl.c
> > +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.c
> > @@ -6,6 +6,7 @@
> >   */
> >  #include <bpf/bpf.h>
> >  #include <scx/common.h>
> > +#include <stdlib.h>
> >  #include <sys/wait.h>
> >  #include <unistd.h>
> >  #include "select_cpu_dfl.bpf.skel.h"
> > @@ -13,29 +14,44 @@
> >  
> >  #define NUM_CHILDREN 1028
> >  
> > +struct select_cpu_dfl_ctx {
> > +	struct select_cpu_dfl	*skel;
> > +	struct bpf_link		*link;
> > +};
> > +
> >  static enum scx_test_status setup(void **ctx)
> >  {
> > -	struct select_cpu_dfl *skel;
> > +	struct select_cpu_dfl_ctx *tctx;
> > +
> > +	tctx = malloc(sizeof(*tctx));
> > +	SCX_FAIL_IF(!tctx, "Failed to allocate test context");
> > +	tctx->link = NULL;
> >  
> > -	skel = select_cpu_dfl__open();
> > -	SCX_FAIL_IF(!skel, "Failed to open");
> > -	SCX_ENUM_INIT(skel);
> > -	SCX_FAIL_IF(select_cpu_dfl__load(skel), "Failed to load skel");
> > +	tctx->skel = select_cpu_dfl__open();
> > +	if (!tctx->skel) {
> > +		free(tctx);
> > +		SCX_FAIL("Failed to open");
> > +	}
> > +	SCX_ENUM_INIT(tctx->skel);
> > +	if (select_cpu_dfl__load(tctx->skel)) {
> > +		select_cpu_dfl__destroy(tctx->skel);
> > +		free(tctx);
> > +		SCX_FAIL("Failed to load skel");
> > +	}
> >  
> > -	*ctx = skel;
> > +	*ctx = tctx;
> >  
> >  	return SCX_TEST_PASS;
> >  }
> >  
> >  static enum scx_test_status run(void *ctx)
> >  {
> > -	struct select_cpu_dfl *skel = ctx;
> > -	struct bpf_link *link;
> > +	struct select_cpu_dfl_ctx *tctx = ctx;
> >  	pid_t pids[NUM_CHILDREN];
> > -	int i, status;
> > +	int i, status, nforked = 0;
> >  
> > -	link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_ops);
> > -	SCX_FAIL_IF(!link, "Failed to attach scheduler");
> > +	tctx->link = bpf_map__attach_struct_ops(tctx->skel->maps.select_cpu_dfl_ops);
> > +	SCX_FAIL_IF(!tctx->link, "Failed to attach scheduler");
> >  
> >  	for (i = 0; i < NUM_CHILDREN; i++) {
> >  		pids[i] = fork();
> > @@ -43,25 +59,31 @@ static enum scx_test_status run(void *ctx)
> >  			sleep(1);
> >  			exit(0);
> >  		}
> > +		if (pids[i] > 0)
> > +			nforked++;
> >  	}
> >  
> >  	for (i = 0; i < NUM_CHILDREN; i++) {
> > +		if (pids[i] <= 0)
> > +			continue;
> >  		SCX_EQ(waitpid(pids[i], &status, 0), pids[i]);
> >  		SCX_EQ(status, 0);
> >  	}
> >  
> > -	SCX_ASSERT(!skel->bss->saw_local);
> > -
> > -	bpf_link__destroy(link);
> > +	SCX_GT(nforked, 0);
> > +	SCX_ASSERT(!tctx->skel->bss->saw_local);
> >  
> >  	return SCX_TEST_PASS;
> >  }
> >  
> >  static void cleanup(void *ctx)
> >  {
> > -	struct select_cpu_dfl *skel = ctx;
> > +	struct select_cpu_dfl_ctx *tctx = ctx;
> >  
> > -	select_cpu_dfl__destroy(skel);
> > +	if (tctx->link)
> > +		bpf_link__destroy(tctx->link);
> > +	select_cpu_dfl__destroy(tctx->skel);
> > +	free(tctx);
> >  }
> >  
> >  struct scx_test select_cpu_dfl = {
> > -- 
> > 2.48.1
> > 

-- 
Cheers,
Cheng-Yang

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH sched_ext/for-7.2] selftests/sched_ext: Fix select_cpu_dfl link leak on early return
  2026-05-08  6:55 [PATCH sched_ext/for-7.2] selftests/sched_ext: Fix select_cpu_dfl link leak on early return Cheng-Yang Chou
  2026-05-08  8:08 ` Andrea Righi
@ 2026-05-08 15:45 ` Tejun Heo
  2026-05-08 18:20 ` sashiko-bot
  2 siblings, 0 replies; 5+ messages in thread
From: Tejun Heo @ 2026-05-08 15:45 UTC (permalink / raw)
  To: Cheng-Yang Chou
  Cc: sched-ext, David Vernet, Andrea Righi, Changwoo Min,
	Ching-Chun Huang, Chia-Ping Tsai, Emil Tsalapatis, linux-kernel

Hello,

On Fri, May 08, 2026 at 02:55:12PM +0800, Cheng-Yang Chou wrote:
> selftests/sched_ext: Fix select_cpu_dfl link leak on early return

Applied to sched_ext/for-7.2.

Thanks.

--
tejun

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH sched_ext/for-7.2] selftests/sched_ext: Fix select_cpu_dfl link leak on early return
  2026-05-08  6:55 [PATCH sched_ext/for-7.2] selftests/sched_ext: Fix select_cpu_dfl link leak on early return Cheng-Yang Chou
  2026-05-08  8:08 ` Andrea Righi
  2026-05-08 15:45 ` Tejun Heo
@ 2026-05-08 18:20 ` sashiko-bot
  2 siblings, 0 replies; 5+ messages in thread
From: sashiko-bot @ 2026-05-08 18:20 UTC (permalink / raw)
  To: Cheng-Yang Chou; +Cc: sched-ext

Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [Medium] Child processes are leaked as zombies if an early return occurs during the `waitpid()` loop, bypassing the cleanup guarantee the patch aims to provide.
--

commit 804f29ba6f027f7b0b2d5d4230d76424d3576212
Author: Cheng-Yang Chou <yphbchou0911@gmail.com>

selftests/sched_ext: Fix select_cpu_dfl link leak on early return

This commit attempts to prevent a BPF link leak caused by early returns in
the run() function by moving the skeleton and link into a context struct
that is safely destroyed in cleanup().

> diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.c
> index 5b6e045e1109b..7e342c0cec653 100644
> --- a/tools/testing/selftests/sched_ext/select_cpu_dfl.c
> +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.c
[ ... ]
> @@ -43,25 +59,31 @@ static enum scx_test_status run(void *ctx)
>  			sleep(1);
>  			exit(0);
>  		}
> +		if (pids[i] > 0)
> +			nforked++;
>  	}
>  
>  	for (i = 0; i < NUM_CHILDREN; i++) {
> +		if (pids[i] <= 0)
> +			continue;
>  		SCX_EQ(waitpid(pids[i], &status, 0), pids[i]);
>  		SCX_EQ(status, 0);

Does this leave child processes as zombies if an early return occurs
during the wait loop?

If SCX_EQ() triggers an immediate return before the loop finishes, any
remaining successfully forked child processes will be abandoned. Since the
pids array remains stack-allocated in run() and is not part of the context
struct, cleanup() has no way to find and wait on them.

Should the child pid tracking also be moved into the context struct
so that cleanup() can reap or kill any remaining children?

>  	}
>  
> -	SCX_ASSERT(!skel->bss->saw_local);
> -
> -	bpf_link__destroy(link);
> +	SCX_GT(nforked, 0);
> +	SCX_ASSERT(!tctx->skel->bss->saw_local);
>  
>  	return SCX_TEST_PASS;
>  }

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260508065514.1723906-1-yphbchou0911@gmail.com?part=1

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2026-05-08 18:20 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-05-08  6:55 [PATCH sched_ext/for-7.2] selftests/sched_ext: Fix select_cpu_dfl link leak on early return Cheng-Yang Chou
2026-05-08  8:08 ` Andrea Righi
2026-05-08  9:54   ` Cheng-Yang Chou
2026-05-08 15:45 ` Tejun Heo
2026-05-08 18:20 ` sashiko-bot

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox