public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Tejun Heo <tj@kernel.org>
To: torvalds@linux-foundation.org, mingo@redhat.com,
	peterz@infradead.org, juri.lelli@redhat.com,
	vincent.guittot@linaro.org, dietmar.eggemann@arm.com,
	rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de,
	bristot@redhat.com, vschneid@redhat.com, ast@kernel.org,
	daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org,
	joshdon@google.com, brho@google.com, pjt@google.com,
	derkling@google.com, haoluo@google.com, dvernet@meta.com,
	dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com
Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org,
	kernel-team@meta.com, Tejun Heo <tj@kernel.org>
Subject: [PATCH 18/30] sched_ext: Allow BPF schedulers to switch all eligible tasks into sched_ext
Date: Fri, 27 Jan 2023 14:16:27 -1000	[thread overview]
Message-ID: <20230128001639.3510083-19-tj@kernel.org> (raw)
In-Reply-To: <20230128001639.3510083-1-tj@kernel.org>

Currently, to use sched_ext, each task has to be put into sched_ext using
sched_setscheduler(2). However, some BPF schedulers and use cases might
prefer to service all eligible tasks.

This patch adds a new kfunc helper, scx_bpf_switch_all(), that BPF
schedulers can call from ops.init() to switch all SCHED_NORMAL, SCHED_BATCH
and SCHED_IDLE tasks into sched_ext. This has the benefit that the scheduler
swaps are transparent to the users and applications. As we know that CFS is
not being used when scx_bpf_switch_all() is used, we can also disable hot
path entry points with static_branches.

Both the dummy and qmap example schedulers are updated with the '-a' option
which enables the switch_all behavior.

Signed-off-by: Tejun Heo <tj@kernel.org>
Suggested-by: Barret Rhoden <brho@google.com>
Reviewed-by: David Vernet <dvernet@meta.com>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
---
 kernel/sched/core.c                     |  8 +++--
 kernel/sched/ext.c                      | 45 +++++++++++++++++++++++++
 kernel/sched/ext.h                      |  5 +++
 tools/sched_ext/scx_common.bpf.h        |  1 +
 tools/sched_ext/scx_example_dummy.bpf.c | 11 ++++++
 tools/sched_ext/scx_example_dummy.c     |  8 +++--
 tools/sched_ext/scx_example_qmap.bpf.c  |  9 +++++
 tools/sched_ext/scx_example_qmap.c      |  7 ++--
 8 files changed, 87 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9e566e72c3f2..5b68b822312b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1204,7 +1204,7 @@ bool sched_can_stop_tick(struct rq *rq)
 	 * if there's more than one we need the tick for involuntary
 	 * preemption.
 	 */
-	if (rq->nr_running > 1)
+	if (!scx_switched_all() && rq->nr_running > 1)
 		return false;
 
 	return true;
@@ -5590,8 +5590,10 @@ void scheduler_tick(void)
 	perf_event_task_tick();
 
 #ifdef CONFIG_SMP
-	rq->idle_balance = idle_cpu(cpu);
-	trigger_load_balance(rq);
+	if (!scx_switched_all()) {
+		rq->idle_balance = idle_cpu(cpu);
+		trigger_load_balance(rq);
+	}
 #endif
 }
 
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index b9d55c25cec9..63f0a3cf2d53 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -73,6 +73,10 @@ static DEFINE_MUTEX(scx_ops_enable_mutex);
 DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
 static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
+static bool scx_switch_all_req;
+static bool scx_switching_all;
+DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
+
 static struct sched_ext_ops scx_ops;
 static bool scx_warned_zero_slice;
 
@@ -1966,6 +1970,8 @@ bool task_on_scx(struct task_struct *p)
 {
 	if (!scx_enabled() || scx_ops_disabling())
 		return false;
+	if (READ_ONCE(scx_switching_all))
+		return true;
 	return p->policy == SCHED_EXT;
 }
 
@@ -2092,6 +2098,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 	 */
 	mutex_lock(&scx_ops_enable_mutex);
 
+	static_branch_disable(&__scx_switched_all);
+	WRITE_ONCE(scx_switching_all, false);
+
 	/* avoid racing against fork */
 	cpus_read_lock();
 	percpu_down_write(&scx_fork_rwsem);
@@ -2276,6 +2285,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	 */
 	cpus_read_lock();
 
+	scx_switch_all_req = false;
 	if (scx_ops.init) {
 		ret = SCX_CALL_OP_RET(SCX_KF_INIT | SCX_KF_SLEEPABLE, init);
 		if (ret) {
@@ -2391,6 +2401,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	 * transitions here are synchronized against sched_ext_free() through
 	 * scx_tasks_lock.
 	 */
+	WRITE_ONCE(scx_switching_all, scx_switch_all_req);
+
 	scx_task_iter_init(&sti);
 	while ((p = scx_task_iter_next_filtered_locked(&sti))) {
 		if (READ_ONCE(p->__state) != TASK_DEAD) {
@@ -2422,6 +2434,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 		goto err_disable_unlock;
 	}
 
+	if (scx_switch_all_req)
+		static_branch_enable_cpuslocked(&__scx_switched_all);
+
 	cpus_read_unlock();
 	mutex_unlock(&scx_ops_enable_mutex);
 
@@ -2456,6 +2471,9 @@ static int scx_debug_show(struct seq_file *m, void *v)
 	mutex_lock(&scx_ops_enable_mutex);
 	seq_printf(m, "%-30s: %s\n", "ops", scx_ops.name);
 	seq_printf(m, "%-30s: %ld\n", "enabled", scx_enabled());
+	seq_printf(m, "%-30s: %d\n", "switching_all",
+		   READ_ONCE(scx_switching_all));
+	seq_printf(m, "%-30s: %ld\n", "switched_all", scx_switched_all());
 	seq_printf(m, "%-30s: %s\n", "enable_state",
 		   scx_ops_enable_state_str[scx_ops_enable_state()]);
 	seq_printf(m, "%-30s: %llu\n", "nr_rejected",
@@ -2694,6 +2712,31 @@ __diag_push();
 __diag_ignore_all("-Wmissing-prototypes",
 		  "Global functions as their definitions will be in vmlinux BTF");
 
+/**
+ * scx_bpf_switch_all - Switch all tasks into SCX
+ * @into_scx: switch direction
+ *
+ * If @into_scx is %true, all existing and future non-dl/rt tasks are switched
+ * to SCX. If %false, only tasks which have %SCHED_EXT explicitly set are put on
+ * SCX. The actual switching is asynchronous. Can be called from ops.init().
+ */
+void scx_bpf_switch_all(void)
+{
+	if (!scx_kf_allowed(SCX_KF_INIT))
+		return;
+
+	scx_switch_all_req = true;
+}
+
+BTF_SET8_START(scx_kfunc_ids_init)
+BTF_ID_FLAGS(func, scx_bpf_switch_all)
+BTF_SET8_END(scx_kfunc_ids_init)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_init = {
+	.owner			= THIS_MODULE,
+	.set			= &scx_kfunc_ids_init,
+};
+
 /**
  * scx_bpf_create_dsq - Create a custom DSQ
  * @dsq_id: DSQ to create
@@ -3131,6 +3174,8 @@ static int __init register_ext_kfuncs(void)
 	 * check using scx_kf_allowed().
 	 */
 	if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+					     &scx_kfunc_set_init)) ||
+	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
 					     &scx_kfunc_set_sleepable)) ||
 	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
 					     &scx_kfunc_set_enqueue_dispatch)) ||
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 76c94babd19e..a4fe649e649d 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -60,7 +60,9 @@ extern unsigned long scx_watchdog_timeout;
 extern unsigned long scx_watchdog_timestamp;
 
 DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled);
+DECLARE_STATIC_KEY_FALSE(__scx_switched_all);
 #define scx_enabled()		static_branch_unlikely(&__scx_ops_enabled)
+#define scx_switched_all()	static_branch_unlikely(&__scx_switched_all)
 
 bool task_on_scx(struct task_struct *p);
 void scx_pre_fork(struct task_struct *p);
@@ -95,6 +97,8 @@ static inline void scx_notify_sched_tick(void)
 static inline const struct sched_class *next_active_class(const struct sched_class *class)
 {
 	class++;
+	if (scx_switched_all() && class == &fair_sched_class)
+		class++;
 	if (!scx_enabled() && class == &ext_sched_class)
 		class++;
 	return class;
@@ -117,6 +121,7 @@ static inline const struct sched_class *next_active_class(const struct sched_cla
 #else	/* CONFIG_SCHED_CLASS_EXT */
 
 #define scx_enabled()		false
+#define scx_switched_all()	false
 
 static inline void scx_pre_fork(struct task_struct *p) {}
 static inline int scx_fork(struct task_struct *p) { return 0; }
diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/scx_common.bpf.h
index b40a4fc6a159..fec19e8d0681 100644
--- a/tools/sched_ext/scx_common.bpf.h
+++ b/tools/sched_ext/scx_common.bpf.h
@@ -66,6 +66,7 @@ bool bpf_cpumask_and(struct bpf_cpumask *dst, const struct cpumask *src1,
 		     const struct cpumask *src2) __ksym;
 u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym;
 
+void scx_bpf_switch_all(void) __ksym;
 s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
 bool scx_bpf_consume(u64 dsq_id) __ksym;
 u32 scx_bpf_dispatch_nr_slots(void) __ksym;
diff --git a/tools/sched_ext/scx_example_dummy.bpf.c b/tools/sched_ext/scx_example_dummy.bpf.c
index ac7b490b5a39..28251373d1c3 100644
--- a/tools/sched_ext/scx_example_dummy.bpf.c
+++ b/tools/sched_ext/scx_example_dummy.bpf.c
@@ -7,6 +7,7 @@
  *
  * - Statistics tracking how many are queued to local and global dsq's.
  * - Termination notification for userspace.
+ * - Support for switch_all.
  *
  * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
  * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
@@ -16,6 +17,8 @@
 
 char _license[] SEC("license") = "GPL";
 
+const volatile bool switch_all;
+
 struct user_exit_info uei;
 
 struct {
@@ -32,6 +35,13 @@ static void stat_inc(u32 idx)
 		(*cnt_p)++;
 }
 
+s32 BPF_STRUCT_OPS(dummy_init)
+{
+	if (switch_all)
+		scx_bpf_switch_all();
+	return 0;
+}
+
 void BPF_STRUCT_OPS(dummy_enqueue, struct task_struct *p, u64 enq_flags)
 {
 	if (enq_flags & SCX_ENQ_LOCAL) {
@@ -51,6 +61,7 @@ void BPF_STRUCT_OPS(dummy_exit, struct scx_exit_info *ei)
 SEC(".struct_ops")
 struct sched_ext_ops dummy_ops = {
 	.enqueue		= (void *)dummy_enqueue,
+	.init			= (void *)dummy_init,
 	.exit			= (void *)dummy_exit,
 	.name			= "dummy",
 };
diff --git a/tools/sched_ext/scx_example_dummy.c b/tools/sched_ext/scx_example_dummy.c
index 72881c881830..9229973e8698 100644
--- a/tools/sched_ext/scx_example_dummy.c
+++ b/tools/sched_ext/scx_example_dummy.c
@@ -19,8 +19,9 @@ const char help_fmt[] =
 "\n"
 "See the top-level comment in .bpf.c for more details.\n"
 "\n"
-"Usage: %s\n"
+"Usage: %s [-a]\n"
 "\n"
+"  -a            Switch all tasks\n"
 "  -h            Display this help and exit\n";
 
 static volatile int exit_req;
@@ -64,8 +65,11 @@ int main(int argc, char **argv)
 	skel = scx_example_dummy__open();
 	assert(skel);
 
-	while ((opt = getopt(argc, argv, "h")) != -1) {
+	while ((opt = getopt(argc, argv, "ah")) != -1) {
 		switch (opt) {
+		case 'a':
+			skel->rodata->switch_all = true;
+			break;
 		default:
 			fprintf(stderr, help_fmt, basename(argv[0]));
 			return opt != 'h';
diff --git a/tools/sched_ext/scx_example_qmap.bpf.c b/tools/sched_ext/scx_example_qmap.bpf.c
index 46bc16ed301f..ec8c4ee7ef16 100644
--- a/tools/sched_ext/scx_example_qmap.bpf.c
+++ b/tools/sched_ext/scx_example_qmap.bpf.c
@@ -22,6 +22,7 @@
 char _license[] SEC("license") = "GPL";
 
 const volatile u64 slice_ns = SCX_SLICE_DFL;
+const volatile bool switch_all;
 const volatile u32 stall_user_nth;
 const volatile u32 stall_kernel_nth;
 const volatile s32 disallow_tgid;
@@ -236,6 +237,13 @@ s32 BPF_STRUCT_OPS(qmap_prep_enable, struct task_struct *p,
 		return -ENOMEM;
 }
 
+s32 BPF_STRUCT_OPS(qmap_init)
+{
+	if (switch_all)
+		scx_bpf_switch_all();
+	return 0;
+}
+
 void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
 {
 	uei_record(&uei, ei);
@@ -248,6 +256,7 @@ struct sched_ext_ops qmap_ops = {
 	.dequeue		= (void *)qmap_dequeue,
 	.dispatch		= (void *)qmap_dispatch,
 	.prep_enable		= (void *)qmap_prep_enable,
+	.init			= (void *)qmap_init,
 	.exit			= (void *)qmap_exit,
 	.timeout_ms		= 5000U,
 	.name			= "qmap",
diff --git a/tools/sched_ext/scx_example_qmap.c b/tools/sched_ext/scx_example_qmap.c
index dff9323dfd20..30633122e6d5 100644
--- a/tools/sched_ext/scx_example_qmap.c
+++ b/tools/sched_ext/scx_example_qmap.c
@@ -20,7 +20,7 @@ const char help_fmt[] =
 "\n"
 "See the top-level comment in .bpf.c for more details.\n"
 "\n"
-"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-d PID]\n"
+"Usage: %s [-a] [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-d PID]\n"
 "\n"
 "  -s SLICE_US   Override slice duration\n"
 "  -e COUNT      Trigger scx_bpf_error() after COUNT enqueues\n"
@@ -50,8 +50,11 @@ int main(int argc, char **argv)
 	skel = scx_example_qmap__open();
 	assert(skel);
 
-	while ((opt = getopt(argc, argv, "hs:e:t:T:d:")) != -1) {
+	while ((opt = getopt(argc, argv, "ahs:e:t:T:d:")) != -1) {
 		switch (opt) {
+		case 'a':
+			skel->rodata->switch_all = true;
+			break;
 		case 's':
 			skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
 			break;
-- 
2.39.1


  parent reply	other threads:[~2023-01-28  0:19 UTC|newest]

Thread overview: 40+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-01-28  0:16 [PATCHSET v2] sched: Implement BPF extensible scheduler class Tejun Heo
2023-01-28  0:16 ` [PATCH 01/30] cgroup: Implement cgroup_show_cftypes() Tejun Heo
2023-01-28  0:16 ` [PATCH 02/30] sched: Encapsulate task attribute change sequence into a helper macro Tejun Heo
2023-01-28  0:16 ` [PATCH 03/30] sched: Restructure sched_class order sanity checks in sched_init() Tejun Heo
2023-01-28  0:16 ` [PATCH 04/30] sched: Allow sched_cgroup_fork() to fail and introduce sched_cancel_fork() Tejun Heo
2023-01-28  0:16 ` [PATCH 05/30] sched: Add sched_class->reweight_task() Tejun Heo
2023-01-28  0:16 ` [PATCH 06/30] sched: Add sched_class->switching_to() and expose check_class_changing/changed() Tejun Heo
2023-01-28  0:16 ` [PATCH 07/30] sched: Factor out cgroup weight conversion functions Tejun Heo
2023-01-28  0:16 ` [PATCH 08/30] sched: Expose css_tg(), __setscheduler_prio() and SCHED_CHANGE_BLOCK() Tejun Heo
2023-01-28 17:24   ` kernel test robot
2023-01-28  0:16 ` [PATCH 09/30] sched: Enumerate CPU cgroup file types Tejun Heo
2023-01-28  0:16 ` [PATCH 10/30] sched: Add @reason to sched_class->rq_{on|off}line() Tejun Heo
2023-01-28  0:16 ` [PATCH 11/30] sched: Add normal_policy() Tejun Heo
2023-01-28  0:16 ` [PATCH 12/30] sched_ext: Add boilerplate for extensible scheduler class Tejun Heo
2023-01-28  0:16 ` [PATCH 13/30] sched_ext: Implement BPF " Tejun Heo
2023-01-28  0:16 ` [PATCH 14/30] sched_ext: Add scx_example_dummy and scx_example_qmap example schedulers Tejun Heo
2023-01-28  0:16 ` [PATCH 15/30] sched_ext: Add sysrq-S which disables the BPF scheduler Tejun Heo
2023-01-28  0:16 ` [PATCH 16/30] sched_ext: Implement runnable task stall watchdog Tejun Heo
2023-01-28  0:16 ` [PATCH 17/30] sched_ext: Allow BPF schedulers to disallow specific tasks from joining SCHED_EXT Tejun Heo
2023-01-28  0:16 ` Tejun Heo [this message]
2023-01-28  0:16 ` [PATCH 19/30] sched_ext: Implement scx_bpf_kick_cpu() and task preemption support Tejun Heo
2023-01-28  0:16 ` [PATCH 20/30] sched_ext: Make watchdog handle ops.dispatch() looping stall Tejun Heo
2023-01-28  0:16 ` [PATCH 21/30] sched_ext: Add task state tracking operations Tejun Heo
2023-01-28  0:16 ` [PATCH 22/30] sched_ext: Implement tickless support Tejun Heo
2023-01-28  0:16 ` [PATCH 23/30] sched_ext: Add cgroup support Tejun Heo
2023-01-28 18:05   ` kernel test robot
2023-01-30 23:41     ` Tejun Heo
2023-01-28  0:16 ` [PATCH 24/30] sched_ext: Implement SCX_KICK_WAIT Tejun Heo
2023-01-28  0:16 ` [PATCH 25/30] sched_ext: Implement sched_ext_ops.cpu_acquire/release() Tejun Heo
2023-01-28  0:16 ` [PATCH 26/30] sched_ext: Implement sched_ext_ops.cpu_online/offline() Tejun Heo
2023-01-28  0:16 ` [PATCH 27/30] sched_ext: Implement core-sched support Tejun Heo
2023-01-28 19:07   ` kernel test robot
2023-01-30 21:38   ` Josh Don
2023-01-31  0:26     ` Tejun Heo
2023-01-31  0:36       ` Tejun Heo
2023-01-31  1:45       ` Josh Don
2023-01-28  0:16 ` [PATCH 28/30] sched_ext: Documentation: scheduler: Document extensible scheduler class Tejun Heo
2023-01-28  0:16 ` [PATCH 29/30] sched_ext: Add a basic, userland vruntime scheduler Tejun Heo
2023-01-28  0:16 ` [PATCH 30/30] sched_ext: Add a rust userspace hybrid example scheduler Tejun Heo
2023-02-08 21:55 ` [PATCHSET v2] sched: Implement BPF extensible scheduler class Tejun Heo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230128001639.3510083-19-tj@kernel.org \
    --to=tj@kernel.org \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=brho@google.com \
    --cc=bristot@redhat.com \
    --cc=bsegall@google.com \
    --cc=daniel@iogearbox.net \
    --cc=derkling@google.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=dschatzberg@meta.com \
    --cc=dskarlat@cs.cmu.edu \
    --cc=dvernet@meta.com \
    --cc=haoluo@google.com \
    --cc=joshdon@google.com \
    --cc=juri.lelli@redhat.com \
    --cc=kernel-team@meta.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=martin.lau@kernel.org \
    --cc=mgorman@suse.de \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=pjt@google.com \
    --cc=riel@surriel.com \
    --cc=rostedt@goodmis.org \
    --cc=torvalds@linux-foundation.org \
    --cc=vincent.guittot@linaro.org \
    --cc=vschneid@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox