[PATCH v2] sched_ext: Sync with scx upstream

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v2] sched_ext: Sync with scx upstream
@ 2025-07-23 13:02 Cheng-Yang Chou
  2025-07-25 12:06 ` Andrea Righi
  0 siblings, 1 reply; 2+ messages in thread
From: Cheng-Yang Chou @ 2025-07-23 13:02 UTC (permalink / raw)
  To: sched-ext; +Cc: tj, void, arighi, changwoo, jserv, yphbchou0911

Sync via ./sync-to-kernel.sh /path/to/kernel/tree, as suggested by
the upstream scx repository.

Suggested-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
---
 tools/sched_ext/include/scx/common.bpf.h     | 102 +++++++++++++++++--
 tools/sched_ext/include/scx/common.h         |   5 +-
 tools/sched_ext/include/scx/compat.bpf.h     |   5 +
 tools/sched_ext/include/scx/user_exit_info.h |  49 +--------
 tools/sched_ext/scx_central.bpf.c            |   2 +-
 tools/sched_ext/scx_central.c                |   1 +
 tools/sched_ext/scx_flatcg.bpf.c             |   2 +-
 tools/sched_ext/scx_flatcg.c                 |   2 +
 tools/sched_ext/scx_qmap.bpf.c               |  23 -----
 tools/sched_ext/scx_simple.c                 |   2 +
 10 files changed, 110 insertions(+), 83 deletions(-)

Changes in v2:
 - Squash all changes into one patch
 - Link to v1: https://lore.kernel.org/all/20250723120746.52847-1-yphbchou0911@gmail.com/

diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index d4e21558e982..86abdb3c3142 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -24,14 +24,26 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 #include <asm-generic/errno.h>
-#include "user_exit_info.h"
+#include "user_exit_info.bpf.h"
 #include "enum_defs.autogen.h"
 
+#define PF_IDLE				0x00000002	/* I am an IDLE thread */
+#define PF_IO_WORKER			0x00000010	/* Task is an IO worker */
 #define PF_WQ_WORKER			0x00000020	/* I'm a workqueue worker */
+#define PF_KCOMPACTD			0x00010000      /* I am kcompactd */
+#define PF_KSWAPD			0x00020000      /* I am kswapd */
 #define PF_KTHREAD			0x00200000	/* I am a kernel thread */
 #define PF_EXITING			0x00000004
 #define CLOCK_MONOTONIC			1
 
+#ifndef NR_CPUS
+#define NR_CPUS 1024
+#endif
+
+#ifndef NUMA_NO_NODE
+#define	NUMA_NO_NODE	(-1)
+#endif
+
 extern int LINUX_KERNEL_VERSION __kconfig;
 extern const char CONFIG_CC_VERSION_TEXT[64] __kconfig __weak;
 extern const char CONFIG_LOCALVERSION[64] __kconfig __weak;
@@ -107,6 +119,9 @@ void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __
 static inline __attribute__((format(printf, 1, 2)))
 void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {}
 
+#define SCX_STRINGIFY(x) #x
+#define SCX_TOSTRING(x) SCX_STRINGIFY(x)
+
 /*
  * Helper macro for initializing the fmt and variadic argument inputs to both
  * bstr exit kfuncs. Callers to this function should use ___fmt and ___param to
@@ -141,13 +156,15 @@ void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {}
  * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments
  * instead of an array of u64. Invoking this macro will cause the scheduler to
  * exit in an erroneous state, with diagnostic information being passed to the
- * user.
+ * user. It prepends the file and line number to aid debugging.
  */
 #define scx_bpf_error(fmt, args...)						\
 ({										\
-	scx_bpf_bstr_preamble(fmt, args)					\
+	scx_bpf_bstr_preamble(							\
+		__FILE__ ":" SCX_TOSTRING(__LINE__) ": " fmt, ##args)		\
 	scx_bpf_error_bstr(___fmt, ___param, sizeof(___param));			\
-	___scx_bpf_bstr_format_checker(fmt, ##args);				\
+	___scx_bpf_bstr_format_checker(						\
+		__FILE__ ":" SCX_TOSTRING(__LINE__) ": " fmt, ##args);		\
 })
 
 /*
@@ -229,6 +246,7 @@ BPF_PROG(name, ##args)
  * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of
  * `MEMBER_VPTR(ptr, ->member)`.
  */
+#ifndef MEMBER_VPTR
 #define MEMBER_VPTR(base, member) (typeof((base) member) *)			\
 ({										\
 	u64 __base = (u64)&(base);						\
@@ -245,6 +263,7 @@ BPF_PROG(name, ##args)
 		  [max]"i"(sizeof(base) - sizeof((base) member)));		\
 	__addr;									\
 })
+#endif /* MEMBER_VPTR */
 
 /**
  * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element
@@ -260,6 +279,7 @@ BPF_PROG(name, ##args)
  * size of the array to compute the max, which will result in rejection by
  * the verifier.
  */
+#ifndef ARRAY_ELEM_PTR
 #define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *)				\
 ({										\
 	u64 __base = (u64)arr;							\
@@ -274,7 +294,7 @@ BPF_PROG(name, ##args)
 		  [max]"r"(sizeof(arr[0]) * ((n) - 1)));			\
 	__addr;									\
 })
-
+#endif /* ARRAY_ELEM_PTR */
 
 /*
  * BPF declarations and helpers
@@ -438,8 +458,27 @@ static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask)
  */
 static inline bool is_migration_disabled(const struct task_struct *p)
 {
-	if (bpf_core_field_exists(p->migration_disabled))
-		return p->migration_disabled;
+	/*
+	 * Testing p->migration_disabled in BPF code is tricky because
+	 * migration is _always_ disabled while BPF code is running.
+	 * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) of BPF
+	 * code execution disable and re-enable migration of the current
+	 * task, respectively. So, the _current_ task of a sched_ext op is
+	 * always migration-disabled. Moreover, p->migration_disabled could be
+	 * two or greater when sched_ext ops BPF code (e.g., ops.tick) is
+	 * executed in the middle of other BPF code execution.
+	 *
+	 * Therefore, we should decide that the _current_ task is
+	 * migration-disabled only when its migration_disabled count is greater
+	 * than one. In other words, when p->migration_disabled == 1, there is
+	 * an ambiguity, so we should check whether @p is the current task.
+	 */
+	if (bpf_core_field_exists(p->migration_disabled)) {
+		if (p->migration_disabled == 1)
+			return bpf_get_current_task_btf() != p;
+		else
+			return p->migration_disabled;
+	}
 	return false;
 }
 
@@ -476,7 +515,7 @@ static inline s64 time_delta(u64 after, u64 before)
  */
 static inline bool time_after(u64 a, u64 b)
 {
-	 return (s64)(b - a) < 0;
+	return (s64)(b - a) < 0;
 }
 
 /**
@@ -500,7 +539,7 @@ static inline bool time_before(u64 a, u64 b)
  */
 static inline bool time_after_eq(u64 a, u64 b)
 {
-	 return (s64)(a - b) >= 0;
+	return (s64)(a - b) >= 0;
 }
 
 /**
@@ -547,9 +586,15 @@ static inline bool time_in_range_open(u64 a, u64 b, u64 c)
  */
 
 /* useful compiler attributes */
+#ifndef likely
 #define likely(x) __builtin_expect(!!(x), 1)
+#endif
+#ifndef unlikely
 #define unlikely(x) __builtin_expect(!!(x), 0)
+#endif
+#ifndef __maybe_unused
 #define __maybe_unused __attribute__((__unused__))
+#endif
 
 /*
  * READ/WRITE_ONCE() are from kernel (include/asm-generic/rwonce.h). They
@@ -632,6 +677,26 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 	__u.__val;								\
 })
 
+/*
+ * __calc_avg - Calculate exponential weighted moving average (EWMA) with
+ * @old and @new values. @decay represents how large the @old value remains.
+ * With a larger @decay value, the moving average changes slowly, exhibiting
+ * fewer fluctuations.
+ */
+#define __calc_avg(old, new, decay) ({						\
+	typeof(decay) thr = 1 << (decay);					\
+	typeof(old) ret;							\
+	if (((old) < thr) || ((new) < thr)) {					\
+		if (((old) == 1) && ((new) == 0))				\
+			ret = 0;						\
+		else								\
+			ret = ((old) - ((old) >> 1)) + ((new) >> 1);		\
+	} else {								\
+		ret = ((old) - ((old) >> (decay))) + ((new) >> (decay));	\
+	}									\
+	ret;									\
+})
+
 /*
  * log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value.
  * @v: The value for which we're computing the base 2 logarithm.
@@ -662,6 +727,25 @@ static inline u32 log2_u64(u64 v)
                 return log2_u32(v) + 1;
 }
 
+/*
+ * __sqrt_u64 - Calculate the square root of value @x using Newton's method.
+ */
+static inline u64 __sqrt_u64(u64 x)
+{
+	if (x == 0 || x == 1)
+		return x;
+
+	u64 r = ((1ULL << 32) > x) ? x : (1ULL << 32);
+
+	for (int i = 0; i < 8; ++i) {
+		u64 q = x / r;
+		if (r <= q)
+			break;
+		r = (r + q) >> 1;
+	}
+	return r;
+}
+
 /*
  * Return a value proportionally scaled to the task's weight.
  */
diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h
index 1dc76bd84296..b3c6372bcf81 100644
--- a/tools/sched_ext/include/scx/common.h
+++ b/tools/sched_ext/include/scx/common.h
@@ -75,8 +75,9 @@ typedef int64_t s64;
 #include "enums.h"
 
 /* not available when building kernel tools/sched_ext */
-#if __has_include(<lib/sdt_task.h>)
-#include <lib/sdt_task.h>
+#if __has_include(<lib/sdt_task_defs.h>)
+#include "bpf_arena_common.h"
+#include <lib/sdt_task_defs.h>
 #endif
 
 #endif	/* __SCHED_EXT_COMMON_H */
diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
index 9252e1a00556..36e0cd2fd4ed 100644
--- a/tools/sched_ext/include/scx/compat.bpf.h
+++ b/tools/sched_ext/include/scx/compat.bpf.h
@@ -38,6 +38,7 @@ void scx_bpf_dispatch_from_dsq_set_slice___compat(struct bpf_iter_scx_dsq *it__i
 void scx_bpf_dispatch_from_dsq_set_vtime___compat(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
 bool scx_bpf_dispatch_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
 bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
+int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak;
 
 #define scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags)				\
 	(bpf_ksym_exists(scx_bpf_dsq_insert) ?					\
@@ -82,6 +83,10 @@ bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter,
 	  scx_bpf_dispatch_vtime_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \
 	  false))
 
+#define __COMPAT_bpf_cpumask_populate(cpumask, src, size__sz)		\
+	(bpf_ksym_exists(bpf_cpumask_populate) ?			\
+	 (bpf_cpumask_populate(cpumask, src, size__sz)) : -EOPNOTSUPP)
+
 #define scx_bpf_dispatch(p, dsq_id, slice, enq_flags)				\
 	_Static_assert(false, "scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()")
 
diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h
index 66f856640ee7..399697fa372f 100644
--- a/tools/sched_ext/include/scx/user_exit_info.h
+++ b/tools/sched_ext/include/scx/user_exit_info.h
@@ -10,55 +10,11 @@
 #ifndef __USER_EXIT_INFO_H
 #define __USER_EXIT_INFO_H
 
-#ifdef LSP
-#define __bpf__
-#include "../vmlinux.h"
-#endif
-
-enum uei_sizes {
-	UEI_REASON_LEN		= 128,
-	UEI_MSG_LEN		= 1024,
-	UEI_DUMP_DFL_LEN	= 32768,
-};
-
-struct user_exit_info {
-	int		kind;
-	s64		exit_code;
-	char		reason[UEI_REASON_LEN];
-	char		msg[UEI_MSG_LEN];
-};
-
-#ifdef __bpf__
-
-#ifndef LSP
-#include "vmlinux.h"
-#endif
-#include <bpf/bpf_core_read.h>
-
-#define UEI_DEFINE(__name)							\
-	char RESIZABLE_ARRAY(data, __name##_dump);				\
-	const volatile u32 __name##_dump_len;					\
-	struct user_exit_info __name SEC(".data")
-
-#define UEI_RECORD(__uei_name, __ei) ({						\
-	bpf_probe_read_kernel_str(__uei_name.reason,				\
-				  sizeof(__uei_name.reason), (__ei)->reason);	\
-	bpf_probe_read_kernel_str(__uei_name.msg,				\
-				  sizeof(__uei_name.msg), (__ei)->msg);		\
-	bpf_probe_read_kernel_str(__uei_name##_dump,				\
-				  __uei_name##_dump_len, (__ei)->dump);		\
-	if (bpf_core_field_exists((__ei)->exit_code))				\
-		__uei_name.exit_code = (__ei)->exit_code;			\
-	/* use __sync to force memory barrier */				\
-	__sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind,		\
-				    (__ei)->kind);				\
-})
-
-#else	/* !__bpf__ */
-
 #include <stdio.h>
 #include <stdbool.h>
 
+#include "user_exit_info_common.h"
+
 /* no need to call the following explicitly if SCX_OPS_LOAD() is used */
 #define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({					\
 	u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN;	\
@@ -114,5 +70,4 @@ enum uei_ecode_mask {
 
 #define UEI_ECODE_RESTART(__ecode)	(UEI_ECODE_SYS_ACT((__ecode)) == SCX_ECODE_ACT_RESTART)
 
-#endif	/* __bpf__ */
 #endif	/* __USER_EXIT_INFO_H */
diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c
index 50bc1737c167..55df8b798865 100644
--- a/tools/sched_ext/scx_central.bpf.c
+++ b/tools/sched_ext/scx_central.bpf.c
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
- * A central FIFO sched_ext scheduler which demonstrates the followings:
+ * A central FIFO sched_ext scheduler which demonstrates the following:
  *
  * a. Making all scheduling decisions from one CPU:
  *
diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c
index 6ba6e610eeaa..55931a4cd71c 100644
--- a/tools/sched_ext/scx_central.c
+++ b/tools/sched_ext/scx_central.c
@@ -61,6 +61,7 @@ int main(int argc, char **argv)
 	skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus();
 	skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
 
+	assert(skel->rodata->nr_cpu_ids > 0);
 	assert(skel->rodata->nr_cpu_ids <= INT32_MAX);
 
 	while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) {
diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c
index fdc7170639e6..2c720e3ecad5 100644
--- a/tools/sched_ext/scx_flatcg.bpf.c
+++ b/tools/sched_ext/scx_flatcg.bpf.c
@@ -950,5 +950,5 @@ SCX_OPS_DEFINE(flatcg_ops,
 	       .cgroup_move		= (void *)fcg_cgroup_move,
 	       .init			= (void *)fcg_init,
 	       .exit			= (void *)fcg_exit,
-	       .flags			= SCX_OPS_ENQ_EXITING,
+	       .flags			= SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING,
 	       .name			= "flatcg");
diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c
index 6dd423eeb4ff..cd85eb401179 100644
--- a/tools/sched_ext/scx_flatcg.c
+++ b/tools/sched_ext/scx_flatcg.c
@@ -6,6 +6,7 @@
  */
 #include <stdio.h>
 #include <signal.h>
+#include <assert.h>
 #include <unistd.h>
 #include <libgen.h>
 #include <limits.h>
@@ -137,6 +138,7 @@ int main(int argc, char **argv)
 	skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg);
 
 	skel->rodata->nr_cpus = libbpf_num_possible_cpus();
+	assert(skel->rodata->nr_cpus > 0);
 	skel->rodata->cgrp_slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
 
 	while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) {
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 69d877501cb7..c3cd9a17d48e 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -615,26 +615,6 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc
 		     taskc->force_local, taskc->core_sched_seq);
 }
 
-s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args)
-{
-	bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu",
-		   cgrp->kn->id, args->weight, args->bw_period_us,
-		   args->bw_quota_us, args->bw_burst_us);
-	return 0;
-}
-
-void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
-{
-	bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight);
-}
-
-void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
-		    u64 period_us, u64 quota_us, u64 burst_us)
-{
-	bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu", cgrp->kn->id,
-		   period_us, quota_us, burst_us);
-}
-
 /*
  * Print out the online and possible CPU map using bpf_printk() as a
  * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
@@ -860,9 +840,6 @@ SCX_OPS_DEFINE(qmap_ops,
 	       .dump			= (void *)qmap_dump,
 	       .dump_cpu		= (void *)qmap_dump_cpu,
 	       .dump_task		= (void *)qmap_dump_task,
-	       .cgroup_init		= (void *)qmap_cgroup_init,
-	       .cgroup_set_weight	= (void *)qmap_cgroup_set_weight,
-	       .cgroup_set_bandwidth	= (void *)qmap_cgroup_set_bandwidth,
 	       .cpu_online		= (void *)qmap_cpu_online,
 	       .cpu_offline		= (void *)qmap_cpu_offline,
 	       .init			= (void *)qmap_init,
diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c
index 76d83199545c..06d4b13bf76b 100644
--- a/tools/sched_ext/scx_simple.c
+++ b/tools/sched_ext/scx_simple.c
@@ -7,6 +7,7 @@
 #include <stdio.h>
 #include <unistd.h>
 #include <signal.h>
+#include <assert.h>
 #include <libgen.h>
 #include <bpf/bpf.h>
 #include <scx/common.h>
@@ -41,6 +42,7 @@ static void sigint_handler(int simple)
 static void read_stats(struct scx_simple *skel, __u64 *stats)
 {
 	int nr_cpus = libbpf_num_possible_cpus();
+	assert(nr_cpus > 0);
 	__u64 cnts[2][nr_cpus];
 	__u32 idx;
 
-- 
2.48.1


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH v2] sched_ext: Sync with scx upstream
  2025-07-23 13:02 [PATCH v2] sched_ext: Sync with scx upstream Cheng-Yang Chou
@ 2025-07-25 12:06 ` Andrea Righi
  0 siblings, 0 replies; 2+ messages in thread
From: Andrea Righi @ 2025-07-25 12:06 UTC (permalink / raw)
  To: Cheng-Yang Chou; +Cc: sched-ext, tj, void, changwoo, jserv

On Wed, Jul 23, 2025 at 09:02:43PM +0800, Cheng-Yang Chou wrote:
> Sync via ./sync-to-kernel.sh /path/to/kernel/tree, as suggested by
> the upstream scx repository.
> 
> Suggested-by: Andrea Righi <arighi@nvidia.com>
> Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>

This breaks the build of the sched_ext kselftests:

$ cd tools/testing/selftests/sched_ext
$ make -j$(nproc)
...
In file included from /home/arighi/src/linux/tools/sched_ext/include/scx/common.h:73,
                 from scx_test.h:12,
                 from runner.c:12:
/home/arighi/src/linux/tools/sched_ext/include/scx/user_exit_info.h:16:10: fatal error: user_exit_info_common.h: No such file or directory
   16 | #include "user_exit_info_common.h"
      |          ^~~~~~~~~~~~~~~~~~~~~~~~~
compilation terminated.
make: *** [Makefile:190: /home/arighi/src/linux/tools/testing/selftests/sched_ext/build/obj/sched_ext/runner.o] Error 1
make: *** Waiting for unfinished jobs....

The problem is that we're missing the following files from the scx repo:
 - scheds/include/scx/user_exit_info.bpf.h
 - scheds/include/scx/user_exit_info_common.h

They need to be copied under tools/sched_ext/include/scx.
Can you add these files as well?

Ideally, it'd be nice to also fix sync-to-kernel.sh in the scx repo if you
have time; otherwise I'll take a look.

Thanks,
-Andrea

> ---
>  tools/sched_ext/include/scx/common.bpf.h     | 102 +++++++++++++++++--
>  tools/sched_ext/include/scx/common.h         |   5 +-
>  tools/sched_ext/include/scx/compat.bpf.h     |   5 +
>  tools/sched_ext/include/scx/user_exit_info.h |  49 +--------
>  tools/sched_ext/scx_central.bpf.c            |   2 +-
>  tools/sched_ext/scx_central.c                |   1 +
>  tools/sched_ext/scx_flatcg.bpf.c             |   2 +-
>  tools/sched_ext/scx_flatcg.c                 |   2 +
>  tools/sched_ext/scx_qmap.bpf.c               |  23 -----
>  tools/sched_ext/scx_simple.c                 |   2 +
>  10 files changed, 110 insertions(+), 83 deletions(-)
> 
> Changes in v2:
>  - Squash all changes into one patch
>  - Link to v1: https://lore.kernel.org/all/20250723120746.52847-1-yphbchou0911@gmail.com/
> 
> diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
> index d4e21558e982..86abdb3c3142 100644
> --- a/tools/sched_ext/include/scx/common.bpf.h
> +++ b/tools/sched_ext/include/scx/common.bpf.h
> @@ -24,14 +24,26 @@
>  #include <bpf/bpf_helpers.h>
>  #include <bpf/bpf_tracing.h>
>  #include <asm-generic/errno.h>
> -#include "user_exit_info.h"
> +#include "user_exit_info.bpf.h"
>  #include "enum_defs.autogen.h"
>  
> +#define PF_IDLE				0x00000002	/* I am an IDLE thread */
> +#define PF_IO_WORKER			0x00000010	/* Task is an IO worker */
>  #define PF_WQ_WORKER			0x00000020	/* I'm a workqueue worker */
> +#define PF_KCOMPACTD			0x00010000      /* I am kcompactd */
> +#define PF_KSWAPD			0x00020000      /* I am kswapd */
>  #define PF_KTHREAD			0x00200000	/* I am a kernel thread */
>  #define PF_EXITING			0x00000004
>  #define CLOCK_MONOTONIC			1
>  
> +#ifndef NR_CPUS
> +#define NR_CPUS 1024
> +#endif
> +
> +#ifndef NUMA_NO_NODE
> +#define	NUMA_NO_NODE	(-1)
> +#endif
> +
>  extern int LINUX_KERNEL_VERSION __kconfig;
>  extern const char CONFIG_CC_VERSION_TEXT[64] __kconfig __weak;
>  extern const char CONFIG_LOCALVERSION[64] __kconfig __weak;
> @@ -107,6 +119,9 @@ void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __
>  static inline __attribute__((format(printf, 1, 2)))
>  void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {}
>  
> +#define SCX_STRINGIFY(x) #x
> +#define SCX_TOSTRING(x) SCX_STRINGIFY(x)
> +
>  /*
>   * Helper macro for initializing the fmt and variadic argument inputs to both
>   * bstr exit kfuncs. Callers to this function should use ___fmt and ___param to
> @@ -141,13 +156,15 @@ void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {}
>   * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments
>   * instead of an array of u64. Invoking this macro will cause the scheduler to
>   * exit in an erroneous state, with diagnostic information being passed to the
> - * user.
> + * user. It prepends the file and line number to aid debugging.
>   */
>  #define scx_bpf_error(fmt, args...)						\
>  ({										\
> -	scx_bpf_bstr_preamble(fmt, args)					\
> +	scx_bpf_bstr_preamble(							\
> +		__FILE__ ":" SCX_TOSTRING(__LINE__) ": " fmt, ##args)		\
>  	scx_bpf_error_bstr(___fmt, ___param, sizeof(___param));			\
> -	___scx_bpf_bstr_format_checker(fmt, ##args);				\
> +	___scx_bpf_bstr_format_checker(						\
> +		__FILE__ ":" SCX_TOSTRING(__LINE__) ": " fmt, ##args);		\
>  })
>  
>  /*
> @@ -229,6 +246,7 @@ BPF_PROG(name, ##args)
>   * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of
>   * `MEMBER_VPTR(ptr, ->member)`.
>   */
> +#ifndef MEMBER_VPTR
>  #define MEMBER_VPTR(base, member) (typeof((base) member) *)			\
>  ({										\
>  	u64 __base = (u64)&(base);						\
> @@ -245,6 +263,7 @@ BPF_PROG(name, ##args)
>  		  [max]"i"(sizeof(base) - sizeof((base) member)));		\
>  	__addr;									\
>  })
> +#endif /* MEMBER_VPTR */
>  
>  /**
>   * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element
> @@ -260,6 +279,7 @@ BPF_PROG(name, ##args)
>   * size of the array to compute the max, which will result in rejection by
>   * the verifier.
>   */
> +#ifndef ARRAY_ELEM_PTR
>  #define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *)				\
>  ({										\
>  	u64 __base = (u64)arr;							\
> @@ -274,7 +294,7 @@ BPF_PROG(name, ##args)
>  		  [max]"r"(sizeof(arr[0]) * ((n) - 1)));			\
>  	__addr;									\
>  })
> -
> +#endif /* ARRAY_ELEM_PTR */
>  
>  /*
>   * BPF declarations and helpers
> @@ -438,8 +458,27 @@ static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask)
>   */
>  static inline bool is_migration_disabled(const struct task_struct *p)
>  {
> -	if (bpf_core_field_exists(p->migration_disabled))
> -		return p->migration_disabled;
> +	/*
> +	 * Testing p->migration_disabled in a BPF code is tricky because the
> +	 * migration is _always_ disabled while running the BPF code.
> +	 * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) for BPF
> +	 * code execution disable and re-enable the migration of the current
> +	 * task, respectively. So, the _current_ task of the sched_ext ops is
> +	 * always migration-disabled. Moreover, p->migration_disabled could be
> +	 * two or greater when a sched_ext ops BPF code (e.g., ops.tick) is
> +	 * executed in the middle of the other BPF code execution.
> +	 *
> +	 * Therefore, we should decide that the _current_ task is
> +	 * migration-disabled only when its migration_disabled count is greater
> +	 * than one. In other words, when  p->migration_disabled == 1, there is
> +	 * an ambiguity, so we should check if @p is the current task or not.
> +	 */
> +	if (bpf_core_field_exists(p->migration_disabled)) {
> +		if (p->migration_disabled == 1)
> +			return bpf_get_current_task_btf() != p;
> +		else
> +			return p->migration_disabled;
> +	}
>  	return false;
>  }
>  
> @@ -476,7 +515,7 @@ static inline s64 time_delta(u64 after, u64 before)
>   */
>  static inline bool time_after(u64 a, u64 b)
>  {
> -	 return (s64)(b - a) < 0;
> +	return (s64)(b - a) < 0;
>  }
>  
>  /**
> @@ -500,7 +539,7 @@ static inline bool time_before(u64 a, u64 b)
>   */
>  static inline bool time_after_eq(u64 a, u64 b)
>  {
> -	 return (s64)(a - b) >= 0;
> +	return (s64)(a - b) >= 0;
>  }
>  
>  /**
> @@ -547,9 +586,15 @@ static inline bool time_in_range_open(u64 a, u64 b, u64 c)
>   */
>  
>  /* useful compiler attributes */
> +#ifndef likely
>  #define likely(x) __builtin_expect(!!(x), 1)
> +#endif
> +#ifndef unlikely
>  #define unlikely(x) __builtin_expect(!!(x), 0)
> +#endif
> +#ifndef __maybe_unused
>  #define __maybe_unused __attribute__((__unused__))
> +#endif
>  
>  /*
>   * READ/WRITE_ONCE() are from kernel (include/asm-generic/rwonce.h). They
> @@ -632,6 +677,26 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
>  	__u.__val;								\
>  })
>  
> +/*
> + * __calc_avg - Calculate exponential weighted moving average (EWMA) with
> + * @old and @new values. @decay represents how large the @old value remains.
> + * With a larger @decay value, the moving average changes slowly, exhibiting
> + * fewer fluctuations.
> + */
> +#define __calc_avg(old, new, decay) ({						\
> +	typeof(decay) thr = 1 << (decay);					\
> +	typeof(old) ret;							\
> +	if (((old) < thr) || ((new) < thr)) {					\
> +		if (((old) == 1) && ((new) == 0))				\
> +			ret = 0;						\
> +		else								\
> +			ret = ((old) - ((old) >> 1)) + ((new) >> 1);		\
> +	} else {								\
> +		ret = ((old) - ((old) >> (decay))) + ((new) >> (decay));	\
> +	}									\
> +	ret;									\
> +})
> +
>  /*
>   * log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value.
>   * @v: The value for which we're computing the base 2 logarithm.
> @@ -662,6 +727,25 @@ static inline u32 log2_u64(u64 v)
>                  return log2_u32(v) + 1;
>  }
>  
> +/*
> + * __sqrt_u64 - Calculate the square root of value @x using Newton's method.
> + */
> +static inline u64 __sqrt_u64(u64 x)
> +{
> +	if (x == 0 || x == 1)
> +		return x;
> +
> +	u64 r = ((1ULL << 32) > x) ? x : (1ULL << 32);
> +
> +	for (int i = 0; i < 8; ++i) {
> +		u64 q = x / r;
> +		if (r <= q)
> +			break;
> +		r = (r + q) >> 1;
> +	}
> +	return r;
> +}
> +
>  /*
>   * Return a value proportionally scaled to the task's weight.
>   */
> diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h
> index 1dc76bd84296..b3c6372bcf81 100644
> --- a/tools/sched_ext/include/scx/common.h
> +++ b/tools/sched_ext/include/scx/common.h
> @@ -75,8 +75,9 @@ typedef int64_t s64;
>  #include "enums.h"
>  
>  /* not available when building kernel tools/sched_ext */
> -#if __has_include(<lib/sdt_task.h>)
> -#include <lib/sdt_task.h>
> +#if __has_include(<lib/sdt_task_defs.h>)
> +#include "bpf_arena_common.h"
> +#include <lib/sdt_task_defs.h>
>  #endif
>  
>  #endif	/* __SCHED_EXT_COMMON_H */
> diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
> index 9252e1a00556..36e0cd2fd4ed 100644
> --- a/tools/sched_ext/include/scx/compat.bpf.h
> +++ b/tools/sched_ext/include/scx/compat.bpf.h
> @@ -38,6 +38,7 @@ void scx_bpf_dispatch_from_dsq_set_slice___compat(struct bpf_iter_scx_dsq *it__i
>  void scx_bpf_dispatch_from_dsq_set_vtime___compat(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
>  bool scx_bpf_dispatch_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
>  bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
> +int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak;
>  
>  #define scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags)				\
>  	(bpf_ksym_exists(scx_bpf_dsq_insert) ?					\
> @@ -82,6 +83,10 @@ bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter,
>  	  scx_bpf_dispatch_vtime_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \
>  	  false))
>  
> +#define __COMPAT_bpf_cpumask_populate(cpumask, src, size__sz)		\
> +	(bpf_ksym_exists(bpf_cpumask_populate) ?			\
> +	 (bpf_cpumask_populate(cpumask, src, size__sz)) : -EOPNOTSUPP)
> +
>  #define scx_bpf_dispatch(p, dsq_id, slice, enq_flags)				\
>  	_Static_assert(false, "scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()")
>  
> diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h
> index 66f856640ee7..399697fa372f 100644
> --- a/tools/sched_ext/include/scx/user_exit_info.h
> +++ b/tools/sched_ext/include/scx/user_exit_info.h
> @@ -10,55 +10,11 @@
>  #ifndef __USER_EXIT_INFO_H
>  #define __USER_EXIT_INFO_H
>  
> -#ifdef LSP
> -#define __bpf__
> -#include "../vmlinux.h"
> -#endif
> -
> -enum uei_sizes {
> -	UEI_REASON_LEN		= 128,
> -	UEI_MSG_LEN		= 1024,
> -	UEI_DUMP_DFL_LEN	= 32768,
> -};
> -
> -struct user_exit_info {
> -	int		kind;
> -	s64		exit_code;
> -	char		reason[UEI_REASON_LEN];
> -	char		msg[UEI_MSG_LEN];
> -};
> -
> -#ifdef __bpf__
> -
> -#ifndef LSP
> -#include "vmlinux.h"
> -#endif
> -#include <bpf/bpf_core_read.h>
> -
> -#define UEI_DEFINE(__name)							\
> -	char RESIZABLE_ARRAY(data, __name##_dump);				\
> -	const volatile u32 __name##_dump_len;					\
> -	struct user_exit_info __name SEC(".data")
> -
> -#define UEI_RECORD(__uei_name, __ei) ({						\
> -	bpf_probe_read_kernel_str(__uei_name.reason,				\
> -				  sizeof(__uei_name.reason), (__ei)->reason);	\
> -	bpf_probe_read_kernel_str(__uei_name.msg,				\
> -				  sizeof(__uei_name.msg), (__ei)->msg);		\
> -	bpf_probe_read_kernel_str(__uei_name##_dump,				\
> -				  __uei_name##_dump_len, (__ei)->dump);		\
> -	if (bpf_core_field_exists((__ei)->exit_code))				\
> -		__uei_name.exit_code = (__ei)->exit_code;			\
> -	/* use __sync to force memory barrier */				\
> -	__sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind,		\
> -				    (__ei)->kind);				\
> -})
> -
> -#else	/* !__bpf__ */
> -
>  #include <stdio.h>
>  #include <stdbool.h>
>  
> +#include "user_exit_info_common.h"
> +
>  /* no need to call the following explicitly if SCX_OPS_LOAD() is used */
>  #define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({					\
>  	u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN;	\
> @@ -114,5 +70,4 @@ enum uei_ecode_mask {
>  
>  #define UEI_ECODE_RESTART(__ecode)	(UEI_ECODE_SYS_ACT((__ecode)) == SCX_ECODE_ACT_RESTART)
>  
> -#endif	/* __bpf__ */
>  #endif	/* __USER_EXIT_INFO_H */
> diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c
> index 50bc1737c167..55df8b798865 100644
> --- a/tools/sched_ext/scx_central.bpf.c
> +++ b/tools/sched_ext/scx_central.bpf.c
> @@ -1,6 +1,6 @@
>  /* SPDX-License-Identifier: GPL-2.0 */
>  /*
> - * A central FIFO sched_ext scheduler which demonstrates the followings:
> + * A central FIFO sched_ext scheduler which demonstrates the following:
>   *
>   * a. Making all scheduling decisions from one CPU:
>   *
> diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c
> index 6ba6e610eeaa..55931a4cd71c 100644
> --- a/tools/sched_ext/scx_central.c
> +++ b/tools/sched_ext/scx_central.c
> @@ -61,6 +61,7 @@ int main(int argc, char **argv)
>  	skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus();
>  	skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
>  
> +	assert(skel->rodata->nr_cpu_ids > 0);
>  	assert(skel->rodata->nr_cpu_ids <= INT32_MAX);
>  
>  	while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) {
> diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c
> index fdc7170639e6..2c720e3ecad5 100644
> --- a/tools/sched_ext/scx_flatcg.bpf.c
> +++ b/tools/sched_ext/scx_flatcg.bpf.c
> @@ -950,5 +950,5 @@ SCX_OPS_DEFINE(flatcg_ops,
>  	       .cgroup_move		= (void *)fcg_cgroup_move,
>  	       .init			= (void *)fcg_init,
>  	       .exit			= (void *)fcg_exit,
> -	       .flags			= SCX_OPS_ENQ_EXITING,
> +	       .flags			= SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING,
>  	       .name			= "flatcg");
> diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c
> index 6dd423eeb4ff..cd85eb401179 100644
> --- a/tools/sched_ext/scx_flatcg.c
> +++ b/tools/sched_ext/scx_flatcg.c
> @@ -6,6 +6,7 @@
>   */
>  #include <stdio.h>
>  #include <signal.h>
> +#include <assert.h>
>  #include <unistd.h>
>  #include <libgen.h>
>  #include <limits.h>
> @@ -137,6 +138,7 @@ int main(int argc, char **argv)
>  	skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg);
>  
>  	skel->rodata->nr_cpus = libbpf_num_possible_cpus();
> +	assert(skel->rodata->nr_cpus > 0);
>  	skel->rodata->cgrp_slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
>  
>  	while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) {
> diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
> index 69d877501cb7..c3cd9a17d48e 100644
> --- a/tools/sched_ext/scx_qmap.bpf.c
> +++ b/tools/sched_ext/scx_qmap.bpf.c
> @@ -615,26 +615,6 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc
>  		     taskc->force_local, taskc->core_sched_seq);
>  }
>  
> -s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args)
> -{
> -	bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu",
> -		   cgrp->kn->id, args->weight, args->bw_period_us,
> -		   args->bw_quota_us, args->bw_burst_us);
> -	return 0;
> -}
> -
> -void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
> -{
> -	bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight);
> -}
> -
> -void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
> -		    u64 period_us, u64 quota_us, u64 burst_us)
> -{
> -	bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu", cgrp->kn->id,
> -		   period_us, quota_us, burst_us);
> -}
> -
>  /*
>   * Print out the online and possible CPU map using bpf_printk() as a
>   * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
> @@ -860,9 +840,6 @@ SCX_OPS_DEFINE(qmap_ops,
>  	       .dump			= (void *)qmap_dump,
>  	       .dump_cpu		= (void *)qmap_dump_cpu,
>  	       .dump_task		= (void *)qmap_dump_task,
> -	       .cgroup_init		= (void *)qmap_cgroup_init,
> -	       .cgroup_set_weight	= (void *)qmap_cgroup_set_weight,
> -	       .cgroup_set_bandwidth	= (void *)qmap_cgroup_set_bandwidth,
>  	       .cpu_online		= (void *)qmap_cpu_online,
>  	       .cpu_offline		= (void *)qmap_cpu_offline,
>  	       .init			= (void *)qmap_init,
> diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c
> index 76d83199545c..06d4b13bf76b 100644
> --- a/tools/sched_ext/scx_simple.c
> +++ b/tools/sched_ext/scx_simple.c
> @@ -7,6 +7,7 @@
>  #include <stdio.h>
>  #include <unistd.h>
>  #include <signal.h>
> +#include <assert.h>
>  #include <libgen.h>
>  #include <bpf/bpf.h>
>  #include <scx/common.h>
> @@ -41,6 +42,7 @@ static void sigint_handler(int simple)
>  static void read_stats(struct scx_simple *skel, __u64 *stats)
>  {
>  	int nr_cpus = libbpf_num_possible_cpus();
> +	assert(nr_cpus > 0);
>  	__u64 cnts[2][nr_cpus];
>  	__u32 idx;
>  
> -- 
> 2.48.1
> 

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2025-07-25 12:06 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-07-23 13:02 [PATCH v2] sched_ext: Sync with scx upstream Cheng-Yang Chou
2025-07-25 12:06 ` Andrea Righi

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.