From: Tejun Heo <tj@kernel.org>
To: torvalds@linux-foundation.org, mingo@redhat.com,
peterz@infradead.org, juri.lelli@redhat.com,
vincent.guittot@linaro.org, dietmar.eggemann@arm.com,
rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de,
bristot@redhat.com, vschneid@redhat.com, ast@kernel.org,
daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org,
joshdon@google.com, brho@google.com, pjt@google.com,
derkling@google.com, haoluo@google.com, dvernet@meta.com,
dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com
Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org,
kernel-team@meta.com, Tejun Heo <tj@kernel.org>
Subject: [PATCH 17/34] sched_ext: Allow BPF schedulers to switch all eligible tasks into sched_ext
Date: Mon, 10 Jul 2023 15:13:35 -1000 [thread overview]
Message-ID: <20230711011412.100319-18-tj@kernel.org> (raw)
In-Reply-To: <20230711011412.100319-1-tj@kernel.org>
Currently, to use sched_ext, each task has to be put into sched_ext using
sched_setscheduler(2). However, some BPF schedulers and use cases might
prefer to service all eligible tasks.
This patch adds a new kfunc helper, scx_bpf_switch_all(), that BPF
schedulers can call from ops.init() to switch all SCHED_NORMAL, SCHED_BATCH
and SCHED_IDLE tasks into sched_ext. This has the benefit that the scheduler
swaps are transparent to the users and applications. As we know that CFS is
not being used when scx_bpf_switch_all() is used, we can also disable hot
path entry points with static_branches.
Both the simple and qmap example schedulers are updated to switch all tasks
by default to ease testing. A '-p' option is added which enables the original
behavior of switching only tasks which are explicitly on SCHED_EXT.
v2: In the example schedulers, switch all tasks by default.
Signed-off-by: Tejun Heo <tj@kernel.org>
Suggested-by: Barret Rhoden <brho@google.com>
Reviewed-by: David Vernet <dvernet@meta.com>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
---
kernel/sched/core.c | 8 +++---
kernel/sched/ext.c | 43 ++++++++++++++++++++++++++++++++
kernel/sched/ext.h | 5 ++++
tools/sched_ext/scx_common.bpf.h | 1 +
tools/sched_ext/scx_qmap.bpf.c | 9 +++++++
tools/sched_ext/scx_qmap.c | 8 ++++--
tools/sched_ext/scx_simple.bpf.c | 10 ++++++++
tools/sched_ext/scx_simple.c | 8 ++++--
8 files changed, 85 insertions(+), 7 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c8528cbfeb57..c976a36dd642 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1226,7 +1226,7 @@ bool sched_can_stop_tick(struct rq *rq)
* if there's more than one we need the tick for involuntary
* preemption.
*/
- if (rq->nr_running > 1)
+ if (!scx_switched_all() && rq->nr_running > 1)
return false;
return true;
@@ -5690,8 +5690,10 @@ void scheduler_tick(void)
perf_event_task_tick();
#ifdef CONFIG_SMP
- rq->idle_balance = idle_cpu(cpu);
- trigger_load_balance(rq);
+ if (!scx_switched_all()) {
+ rq->idle_balance = idle_cpu(cpu);
+ trigger_load_balance(rq);
+ }
#endif
}
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index d5ef8809e05f..6cb3412cee9f 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -73,6 +73,10 @@ static DEFINE_MUTEX(scx_ops_enable_mutex);
DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
+static bool scx_switch_all_req;
+static bool scx_switching_all;
+DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
+
static struct sched_ext_ops scx_ops;
static bool scx_warned_zero_slice;
@@ -2056,6 +2060,8 @@ bool task_should_scx(struct task_struct *p)
{
if (!scx_enabled() || scx_ops_disabling())
return false;
+ if (READ_ONCE(scx_switching_all))
+ return true;
return p->policy == SCHED_EXT;
}
@@ -2183,6 +2189,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
*/
mutex_lock(&scx_ops_enable_mutex);
+ static_branch_disable(&__scx_switched_all);
+ WRITE_ONCE(scx_switching_all, false);
+
/* avoid racing against fork */
cpus_read_lock();
percpu_down_write(&scx_fork_rwsem);
@@ -2366,6 +2375,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
*/
cpus_read_lock();
+ scx_switch_all_req = false;
if (scx_ops.init) {
ret = SCX_CALL_OP_RET(SCX_KF_INIT, init);
if (ret) {
@@ -2481,6 +2491,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
* transitions here are synchronized against sched_ext_free() through
* scx_tasks_lock.
*/
+ WRITE_ONCE(scx_switching_all, scx_switch_all_req);
+
scx_task_iter_init(&sti);
while ((p = scx_task_iter_next_filtered_locked(&sti))) {
if (READ_ONCE(p->__state) != TASK_DEAD) {
@@ -2512,6 +2524,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
goto err_disable;
}
+ if (scx_switch_all_req)
+ static_branch_enable_cpuslocked(&__scx_switched_all);
+
cpus_read_unlock();
mutex_unlock(&scx_ops_enable_mutex);
@@ -2546,6 +2561,9 @@ static int scx_debug_show(struct seq_file *m, void *v)
mutex_lock(&scx_ops_enable_mutex);
seq_printf(m, "%-30s: %s\n", "ops", scx_ops.name);
seq_printf(m, "%-30s: %ld\n", "enabled", scx_enabled());
+ seq_printf(m, "%-30s: %d\n", "switching_all",
+ READ_ONCE(scx_switching_all));
+ seq_printf(m, "%-30s: %ld\n", "switched_all", scx_switched_all());
seq_printf(m, "%-30s: %s\n", "enable_state",
scx_ops_enable_state_str[scx_ops_enable_state()]);
seq_printf(m, "%-30s: %llu\n", "nr_rejected",
@@ -2797,6 +2815,29 @@ __diag_push();
__diag_ignore_all("-Wmissing-prototypes",
"Global functions as their definitions will be in vmlinux BTF");
+/**
+ * scx_bpf_switch_all - Switch all tasks into SCX
+ *
+ * Switch all existing and future non-dl/rt tasks to SCX. This can only be
+ * called from ops.init(), and actual switching is performed asynchronously.
+ */
+void scx_bpf_switch_all(void)
+{
+ if (!scx_kf_allowed(SCX_KF_INIT))
+ return;
+
+ scx_switch_all_req = true;
+}
+
+BTF_SET8_START(scx_kfunc_ids_init)
+BTF_ID_FLAGS(func, scx_bpf_switch_all)
+BTF_SET8_END(scx_kfunc_ids_init)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_init = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_init,
+};
+
/**
* scx_bpf_create_dsq - Create a custom DSQ
* @dsq_id: DSQ to create
@@ -3292,6 +3333,8 @@ static int __init register_ext_kfuncs(void)
* check using scx_kf_allowed().
*/
if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &scx_kfunc_set_init)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
&scx_kfunc_set_sleepable)) ||
(ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
&scx_kfunc_set_enqueue_dispatch)) ||
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 444a917d27b1..c32f14bb981c 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -75,7 +75,9 @@ extern unsigned long scx_watchdog_timeout;
extern unsigned long scx_watchdog_timestamp;
DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled);
+DECLARE_STATIC_KEY_FALSE(__scx_switched_all);
#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled)
+#define scx_switched_all() static_branch_unlikely(&__scx_switched_all)
static inline bool task_on_scx(struct task_struct *p)
{
@@ -115,6 +117,8 @@ static inline void scx_notify_sched_tick(void)
static inline const struct sched_class *next_active_class(const struct sched_class *class)
{
class++;
+ if (scx_switched_all() && class == &fair_sched_class)
+ class++;
if (!scx_enabled() && class == &ext_sched_class)
class++;
return class;
@@ -137,6 +141,7 @@ static inline const struct sched_class *next_active_class(const struct sched_cla
#else /* CONFIG_SCHED_CLASS_EXT */
#define scx_enabled() false
+#define scx_switched_all() false
static inline bool task_on_scx(struct task_struct *p) { return false; }
static inline void scx_pre_fork(struct task_struct *p) {}
diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/scx_common.bpf.h
index 8b8502afcce3..0e6d7d3e2d27 100644
--- a/tools/sched_ext/scx_common.bpf.h
+++ b/tools/sched_ext/scx_common.bpf.h
@@ -53,6 +53,7 @@ void ___scx_bpf_error_format_checker(const char *fmt, ...) {}
___scx_bpf_error_format_checker(fmt, ##args); \
})
+void scx_bpf_switch_all(void) __ksym;
s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
bool scx_bpf_consume(u64 dsq_id) __ksym;
u32 scx_bpf_dispatch_nr_slots(void) __ksym;
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index d0bc67095062..da43f962ab4e 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -25,6 +25,7 @@
char _license[] SEC("license") = "GPL";
const volatile u64 slice_ns = SCX_SLICE_DFL;
+const volatile bool switch_partial;
const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth;
const volatile s32 disallow_tgid;
@@ -239,6 +240,13 @@ s32 BPF_STRUCT_OPS(qmap_prep_enable, struct task_struct *p,
return -ENOMEM;
}
+s32 BPF_STRUCT_OPS(qmap_init)
+{
+ if (!switch_partial)
+ scx_bpf_switch_all();
+ return 0;
+}
+
void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
{
uei_record(&uei, ei);
@@ -251,6 +259,7 @@ struct sched_ext_ops qmap_ops = {
.dequeue = (void *)qmap_dequeue,
.dispatch = (void *)qmap_dispatch,
.prep_enable = (void *)qmap_prep_enable,
+ .init = (void *)qmap_init,
.exit = (void *)qmap_exit,
.timeout_ms = 5000U,
.name = "qmap",
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
index 5f50f889ea18..3444e3597b19 100644
--- a/tools/sched_ext/scx_qmap.c
+++ b/tools/sched_ext/scx_qmap.c
@@ -20,13 +20,14 @@ const char help_fmt[] =
"\n"
"See the top-level comment in .bpf.c for more details.\n"
"\n"
-"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-d PID]\n"
+"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-d PID] [-p]\n"
"\n"
" -s SLICE_US Override slice duration\n"
" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n"
" -t COUNT Stall every COUNT'th user thread\n"
" -T COUNT Stall every COUNT'th kernel thread\n"
" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n"
+" -p Switch only tasks on SCHED_EXT policy instead of all\n"
" -h Display this help and exit\n";
static volatile int exit_req;
@@ -50,7 +51,7 @@ int main(int argc, char **argv)
skel = scx_qmap__open();
assert(skel);
- while ((opt = getopt(argc, argv, "s:e:t:T:d:h")) != -1) {
+ while ((opt = getopt(argc, argv, "s:e:t:T:d:ph")) != -1) {
switch (opt) {
case 's':
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@@ -69,6 +70,9 @@ int main(int argc, char **argv)
if (skel->rodata->disallow_tgid < 0)
skel->rodata->disallow_tgid = getpid();
break;
+ case 'p':
+ skel->rodata->switch_partial = true;
+ break;
default:
fprintf(stderr, help_fmt, basename(argv[0]));
return opt != 'h';
diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c
index 9326124a32fa..6302a4ea9ea5 100644
--- a/tools/sched_ext/scx_simple.bpf.c
+++ b/tools/sched_ext/scx_simple.bpf.c
@@ -15,6 +15,8 @@
char _license[] SEC("license") = "GPL";
+const volatile bool switch_partial;
+
struct user_exit_info uei;
struct {
@@ -43,6 +45,13 @@ void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags)
scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}
+s32 BPF_STRUCT_OPS(simple_init)
+{
+ if (!switch_partial)
+ scx_bpf_switch_all();
+ return 0;
+}
+
void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei)
{
uei_record(&uei, ei);
@@ -51,6 +60,7 @@ void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei)
SEC(".struct_ops.link")
struct sched_ext_ops simple_ops = {
.enqueue = (void *)simple_enqueue,
+ .init = (void *)simple_init,
.exit = (void *)simple_exit,
.name = "simple",
};
diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c
index 9ba38ba1e71f..1e507c0a35cd 100644
--- a/tools/sched_ext/scx_simple.c
+++ b/tools/sched_ext/scx_simple.c
@@ -19,8 +19,9 @@ const char help_fmt[] =
"\n"
"See the top-level comment in .bpf.c for more details.\n"
"\n"
-"Usage: %s\n"
+"Usage: %s [-p]\n"
"\n"
+" -p Switch only tasks on SCHED_EXT policy instead of all\n"
" -h Display this help and exit\n";
static volatile int exit_req;
@@ -64,8 +65,11 @@ int main(int argc, char **argv)
skel = scx_simple__open();
assert(skel);
- while ((opt = getopt(argc, argv, "h")) != -1) {
+ while ((opt = getopt(argc, argv, "ph")) != -1) {
switch (opt) {
+ case 'p':
+ skel->rodata->switch_partial = true;
+ break;
default:
fprintf(stderr, help_fmt, basename(argv[0]));
return opt != 'h';
--
2.41.0
next prev parent reply other threads:[~2023-07-11 1:16 UTC|newest]
Thread overview: 54+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-07-11 1:13 [PATCHSET v4] sched: Implement BPF extensible scheduler class Tejun Heo
2023-07-11 1:13 ` [PATCH 01/34] cgroup: Implement cgroup_show_cftypes() Tejun Heo
2023-07-11 1:13 ` [PATCH 02/34] sched: Restructure sched_class order sanity checks in sched_init() Tejun Heo
2023-07-11 1:13 ` [PATCH 03/34] sched: Allow sched_cgroup_fork() to fail and introduce sched_cancel_fork() Tejun Heo
2023-07-11 1:13 ` [PATCH 04/34] sched: Add sched_class->reweight_task() Tejun Heo
2023-07-11 1:13 ` [PATCH 05/34] sched: Add sched_class->switching_to() and expose check_class_changing/changed() Tejun Heo
2023-07-11 1:13 ` [PATCH 06/34] sched: Factor out cgroup weight conversion functions Tejun Heo
2023-07-11 1:13 ` [PATCH 07/34] sched: Expose css_tg() and __setscheduler_prio() Tejun Heo
2023-07-11 1:13 ` [PATCH 08/34] sched: Enumerate CPU cgroup file types Tejun Heo
2023-07-11 1:13 ` [PATCH 09/34] sched: Add @reason to sched_class->rq_{on|off}line() Tejun Heo
2023-07-11 1:13 ` [PATCH 10/34] sched: Add normal_policy() Tejun Heo
2023-07-11 1:13 ` [PATCH 11/34] sched_ext: Add boilerplate for extensible scheduler class Tejun Heo
2023-07-11 1:13 ` [PATCH 12/34] sched_ext: Implement BPF " Tejun Heo
2023-07-11 9:21 ` Andrea Righi
2023-07-11 21:45 ` Tejun Heo
2023-08-16 11:45 ` Vishal Chourasia
2023-08-16 19:20 ` Tejun Heo
2023-07-11 1:13 ` [PATCH 13/34] sched_ext: Add scx_simple and scx_example_qmap example schedulers Tejun Heo
2023-07-11 1:13 ` [PATCH 14/34] sched_ext: Add sysrq-S which disables the BPF scheduler Tejun Heo
2023-07-11 1:13 ` [PATCH 15/34] sched_ext: Implement runnable task stall watchdog Tejun Heo
2023-07-11 1:13 ` [PATCH 16/34] sched_ext: Allow BPF schedulers to disallow specific tasks from joining SCHED_EXT Tejun Heo
2023-07-11 1:13 ` Tejun Heo [this message]
2023-07-11 1:13 ` [PATCH 18/34] sched_ext: Implement scx_bpf_kick_cpu() and task preemption support Tejun Heo
2023-07-11 1:13 ` [PATCH 19/34] sched_ext: Add a central scheduler which makes all scheduling decisions on one CPU Tejun Heo
2023-07-11 1:13 ` [PATCH 20/34] sched_ext: Make watchdog handle ops.dispatch() looping stall Tejun Heo
2023-07-11 1:13 ` [PATCH 21/34] sched_ext: Add task state tracking operations Tejun Heo
2023-07-11 1:13 ` [PATCH 22/34] sched_ext: Implement tickless support Tejun Heo
2023-07-11 1:13 ` [PATCH 23/34] sched_ext: Track tasks that are subjects of the in-flight SCX operation Tejun Heo
2023-07-11 1:13 ` [PATCH 24/34] sched_ext: Add cgroup support Tejun Heo
2023-07-11 1:13 ` [PATCH 25/34] sched_ext: Add a cgroup-based core-scheduling scheduler Tejun Heo
2023-07-11 1:13 ` [PATCH 26/34] sched_ext: Add a cgroup scheduler which uses flattened hierarchy Tejun Heo
2023-07-11 1:13 ` [PATCH 27/34] sched_ext: Implement SCX_KICK_WAIT Tejun Heo
2023-07-13 13:45 ` Andrea Righi
2023-07-13 18:32 ` Linus Torvalds
2023-07-13 19:48 ` Tejun Heo
2023-07-11 1:13 ` [PATCH 28/34] sched_ext: Implement sched_ext_ops.cpu_acquire/release() Tejun Heo
2023-07-11 1:13 ` [PATCH 29/34] sched_ext: Implement sched_ext_ops.cpu_online/offline() Tejun Heo
2023-07-11 1:13 ` [PATCH 30/34] sched_ext: Implement core-sched support Tejun Heo
2023-07-11 1:13 ` [PATCH 31/34] sched_ext: Add vtime-ordered priority queue to dispatch_q's Tejun Heo
2023-07-11 1:13 ` [PATCH 32/34] sched_ext: Documentation: scheduler: Document extensible scheduler class Tejun Heo
2023-07-11 1:13 ` [PATCH 33/34] sched_ext: Add a basic, userland vruntime scheduler Tejun Heo
2023-07-11 1:13 ` [PATCH 34/34] sched_ext: Add a rust userspace hybrid example scheduler Tejun Heo
2023-07-21 18:37 ` [PATCHSET v4] sched: Implement BPF extensible scheduler class Tejun Heo
2023-07-24 15:11 ` Barret Rhoden
2023-07-26 9:17 ` Peter Zijlstra
2023-07-28 0:12 ` Tejun Heo
2023-08-04 0:08 ` Tejun Heo
2023-08-11 1:16 ` Tejun Heo
2023-08-17 12:44 ` Mel Gorman
2023-08-24 21:31 ` Tejun Heo
2023-09-19 17:56 ` Tejun Heo
2023-09-26 9:20 ` Mel Gorman
2023-10-10 22:09 ` Tejun Heo
2023-08-25 0:26 ` Josh Don
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230711011412.100319-18-tj@kernel.org \
--to=tj@kernel.org \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=brho@google.com \
--cc=bristot@redhat.com \
--cc=bsegall@google.com \
--cc=daniel@iogearbox.net \
--cc=derkling@google.com \
--cc=dietmar.eggemann@arm.com \
--cc=dschatzberg@meta.com \
--cc=dskarlat@cs.cmu.edu \
--cc=dvernet@meta.com \
--cc=haoluo@google.com \
--cc=joshdon@google.com \
--cc=juri.lelli@redhat.com \
--cc=kernel-team@meta.com \
--cc=linux-kernel@vger.kernel.org \
--cc=martin.lau@kernel.org \
--cc=mgorman@suse.de \
--cc=mingo@redhat.com \
--cc=peterz@infradead.org \
--cc=pjt@google.com \
--cc=riel@surriel.com \
--cc=rostedt@goodmis.org \
--cc=torvalds@linux-foundation.org \
--cc=vincent.guittot@linaro.org \
--cc=vschneid@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox