All of lore.kernel.org
 help / color / mirror / Atom feed
From: Tejun Heo <tj@kernel.org>
To: torvalds@linux-foundation.org, mingo@redhat.com,
	peterz@infradead.org, juri.lelli@redhat.com,
	vincent.guittot@linaro.org, dietmar.eggemann@arm.com,
	rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de,
	bristot@redhat.com, vschneid@redhat.com, ast@kernel.org,
	daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org,
	joshdon@google.com, brho@google.com, pjt@google.com,
	derkling@google.com, haoluo@google.com, dvernet@meta.com,
	dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com
Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org,
	kernel-team@meta.com, Tejun Heo <tj@kernel.org>
Subject: [PATCH 18/30] sched_ext: Allow BPF schedulers to switch all eligible tasks into sched_ext
Date: Fri, 27 Jan 2023 14:16:27 -1000	[thread overview]
Message-ID: <20230128001639.3510083-19-tj@kernel.org> (raw)
In-Reply-To: <20230128001639.3510083-1-tj@kernel.org>

Currently, to use sched_ext, each task has to be put into sched_ext using
sched_setscheduler(2). However, some BPF schedulers and use cases might
prefer to service all eligible tasks.

This patch adds a new kfunc helper, scx_bpf_switch_all(), that BPF
schedulers can call from ops.init() to switch all SCHED_NORMAL, SCHED_BATCH
and SCHED_IDLE tasks into sched_ext. This has the benefit that the scheduler
swaps are transparent to the users and applications. As we know that CFS is
not being used when scx_bpf_switch_all() is used, we can also disable hot
path entry points with static_branches.

Both the dummy and qmap example schedulers are updated with the '-a' option
which enables the switch_all behavior.

Signed-off-by: Tejun Heo <tj@kernel.org>
Suggested-by: Barret Rhoden <brho@google.com>
Reviewed-by: David Vernet <dvernet@meta.com>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
---
 kernel/sched/core.c                     |  8 +++--
 kernel/sched/ext.c                      | 45 +++++++++++++++++++++++++
 kernel/sched/ext.h                      |  5 +++
 tools/sched_ext/scx_common.bpf.h        |  1 +
 tools/sched_ext/scx_example_dummy.bpf.c | 11 ++++++
 tools/sched_ext/scx_example_dummy.c     |  8 +++--
 tools/sched_ext/scx_example_qmap.bpf.c  |  9 +++++
 tools/sched_ext/scx_example_qmap.c      |  7 ++--
 8 files changed, 87 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9e566e72c3f2..5b68b822312b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1204,7 +1204,7 @@ bool sched_can_stop_tick(struct rq *rq)
 	 * if there's more than one we need the tick for involuntary
 	 * preemption.
 	 */
-	if (rq->nr_running > 1)
+	if (!scx_switched_all() && rq->nr_running > 1)
 		return false;
 
 	return true;
@@ -5590,8 +5590,10 @@ void scheduler_tick(void)
 	perf_event_task_tick();
 
 #ifdef CONFIG_SMP
-	rq->idle_balance = idle_cpu(cpu);
-	trigger_load_balance(rq);
+	if (!scx_switched_all()) {
+		rq->idle_balance = idle_cpu(cpu);
+		trigger_load_balance(rq);
+	}
 #endif
 }
 
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index b9d55c25cec9..63f0a3cf2d53 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -73,6 +73,10 @@ static DEFINE_MUTEX(scx_ops_enable_mutex);
 DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
 static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
+static bool scx_switch_all_req;
+static bool scx_switching_all;
+DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
+
 static struct sched_ext_ops scx_ops;
 static bool scx_warned_zero_slice;
 
@@ -1966,6 +1970,8 @@ bool task_on_scx(struct task_struct *p)
 {
 	if (!scx_enabled() || scx_ops_disabling())
 		return false;
+	if (READ_ONCE(scx_switching_all))
+		return true;
 	return p->policy == SCHED_EXT;
 }
 
@@ -2092,6 +2098,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 	 */
 	mutex_lock(&scx_ops_enable_mutex);
 
+	static_branch_disable(&__scx_switched_all);
+	WRITE_ONCE(scx_switching_all, false);
+
 	/* avoid racing against fork */
 	cpus_read_lock();
 	percpu_down_write(&scx_fork_rwsem);
@@ -2276,6 +2285,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	 */
 	cpus_read_lock();
 
+	scx_switch_all_req = false;
 	if (scx_ops.init) {
 		ret = SCX_CALL_OP_RET(SCX_KF_INIT | SCX_KF_SLEEPABLE, init);
 		if (ret) {
@@ -2391,6 +2401,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	 * transitions here are synchronized against sched_ext_free() through
 	 * scx_tasks_lock.
 	 */
+	WRITE_ONCE(scx_switching_all, scx_switch_all_req);
+
 	scx_task_iter_init(&sti);
 	while ((p = scx_task_iter_next_filtered_locked(&sti))) {
 		if (READ_ONCE(p->__state) != TASK_DEAD) {
@@ -2422,6 +2434,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 		goto err_disable_unlock;
 	}
 
+	if (scx_switch_all_req)
+		static_branch_enable_cpuslocked(&__scx_switched_all);
+
 	cpus_read_unlock();
 	mutex_unlock(&scx_ops_enable_mutex);
 
@@ -2456,6 +2471,9 @@ static int scx_debug_show(struct seq_file *m, void *v)
 	mutex_lock(&scx_ops_enable_mutex);
 	seq_printf(m, "%-30s: %s\n", "ops", scx_ops.name);
 	seq_printf(m, "%-30s: %ld\n", "enabled", scx_enabled());
+	seq_printf(m, "%-30s: %d\n", "switching_all",
+		   READ_ONCE(scx_switching_all));
+	seq_printf(m, "%-30s: %ld\n", "switched_all", scx_switched_all());
 	seq_printf(m, "%-30s: %s\n", "enable_state",
 		   scx_ops_enable_state_str[scx_ops_enable_state()]);
 	seq_printf(m, "%-30s: %llu\n", "nr_rejected",
@@ -2694,6 +2712,31 @@ __diag_push();
 __diag_ignore_all("-Wmissing-prototypes",
 		  "Global functions as their definitions will be in vmlinux BTF");
 
+/**
+ * scx_bpf_switch_all - Switch all tasks into SCX
+ * @into_scx: switch direction
+ *
+ * If @into_scx is %true, all existing and future non-dl/rt tasks are switched
+ * to SCX. If %false, only tasks which have %SCHED_EXT explicitly set are put on
+ * SCX. The actual switching is asynchronous. Can be called from ops.init().
+ */
+void scx_bpf_switch_all(void)
+{
+	if (!scx_kf_allowed(SCX_KF_INIT))
+		return;
+
+	scx_switch_all_req = true;
+}
+
+BTF_SET8_START(scx_kfunc_ids_init)
+BTF_ID_FLAGS(func, scx_bpf_switch_all)
+BTF_SET8_END(scx_kfunc_ids_init)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_init = {
+	.owner			= THIS_MODULE,
+	.set			= &scx_kfunc_ids_init,
+};
+
 /**
  * scx_bpf_create_dsq - Create a custom DSQ
  * @dsq_id: DSQ to create
@@ -3131,6 +3174,8 @@ static int __init register_ext_kfuncs(void)
 	 * check using scx_kf_allowed().
 	 */
 	if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+					     &scx_kfunc_set_init)) ||
+	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
 					     &scx_kfunc_set_sleepable)) ||
 	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
 					     &scx_kfunc_set_enqueue_dispatch)) ||
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 76c94babd19e..a4fe649e649d 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -60,7 +60,9 @@ extern unsigned long scx_watchdog_timeout;
 extern unsigned long scx_watchdog_timestamp;
 
 DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled);
+DECLARE_STATIC_KEY_FALSE(__scx_switched_all);
 #define scx_enabled()		static_branch_unlikely(&__scx_ops_enabled)
+#define scx_switched_all()	static_branch_unlikely(&__scx_switched_all)
 
 bool task_on_scx(struct task_struct *p);
 void scx_pre_fork(struct task_struct *p);
@@ -95,6 +97,8 @@ static inline void scx_notify_sched_tick(void)
 static inline const struct sched_class *next_active_class(const struct sched_class *class)
 {
 	class++;
+	if (scx_switched_all() && class == &fair_sched_class)
+		class++;
 	if (!scx_enabled() && class == &ext_sched_class)
 		class++;
 	return class;
@@ -117,6 +121,7 @@ static inline const struct sched_class *next_active_class(const struct sched_cla
 #else	/* CONFIG_SCHED_CLASS_EXT */
 
 #define scx_enabled()		false
+#define scx_switched_all()	false
 
 static inline void scx_pre_fork(struct task_struct *p) {}
 static inline int scx_fork(struct task_struct *p) { return 0; }
diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/scx_common.bpf.h
index b40a4fc6a159..fec19e8d0681 100644
--- a/tools/sched_ext/scx_common.bpf.h
+++ b/tools/sched_ext/scx_common.bpf.h
@@ -66,6 +66,7 @@ bool bpf_cpumask_and(struct bpf_cpumask *dst, const struct cpumask *src1,
 		     const struct cpumask *src2) __ksym;
 u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym;
 
+void scx_bpf_switch_all(void) __ksym;
 s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
 bool scx_bpf_consume(u64 dsq_id) __ksym;
 u32 scx_bpf_dispatch_nr_slots(void) __ksym;
diff --git a/tools/sched_ext/scx_example_dummy.bpf.c b/tools/sched_ext/scx_example_dummy.bpf.c
index ac7b490b5a39..28251373d1c3 100644
--- a/tools/sched_ext/scx_example_dummy.bpf.c
+++ b/tools/sched_ext/scx_example_dummy.bpf.c
@@ -7,6 +7,7 @@
  *
  * - Statistics tracking how many are queued to local and global dsq's.
  * - Termination notification for userspace.
+ * - Support for switch_all.
  *
  * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
  * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
@@ -16,6 +17,8 @@
 
 char _license[] SEC("license") = "GPL";
 
+const volatile bool switch_all;
+
 struct user_exit_info uei;
 
 struct {
@@ -32,6 +35,13 @@ static void stat_inc(u32 idx)
 		(*cnt_p)++;
 }
 
+s32 BPF_STRUCT_OPS(dummy_init)
+{
+	if (switch_all)
+		scx_bpf_switch_all();
+	return 0;
+}
+
 void BPF_STRUCT_OPS(dummy_enqueue, struct task_struct *p, u64 enq_flags)
 {
 	if (enq_flags & SCX_ENQ_LOCAL) {
@@ -51,6 +61,7 @@ void BPF_STRUCT_OPS(dummy_exit, struct scx_exit_info *ei)
 SEC(".struct_ops")
 struct sched_ext_ops dummy_ops = {
 	.enqueue		= (void *)dummy_enqueue,
+	.init			= (void *)dummy_init,
 	.exit			= (void *)dummy_exit,
 	.name			= "dummy",
 };
diff --git a/tools/sched_ext/scx_example_dummy.c b/tools/sched_ext/scx_example_dummy.c
index 72881c881830..9229973e8698 100644
--- a/tools/sched_ext/scx_example_dummy.c
+++ b/tools/sched_ext/scx_example_dummy.c
@@ -19,8 +19,9 @@ const char help_fmt[] =
 "\n"
 "See the top-level comment in .bpf.c for more details.\n"
 "\n"
-"Usage: %s\n"
+"Usage: %s [-a]\n"
 "\n"
+"  -a            Switch all tasks\n"
 "  -h            Display this help and exit\n";
 
 static volatile int exit_req;
@@ -64,8 +65,11 @@ int main(int argc, char **argv)
 	skel = scx_example_dummy__open();
 	assert(skel);
 
-	while ((opt = getopt(argc, argv, "h")) != -1) {
+	while ((opt = getopt(argc, argv, "ah")) != -1) {
 		switch (opt) {
+		case 'a':
+			skel->rodata->switch_all = true;
+			break;
 		default:
 			fprintf(stderr, help_fmt, basename(argv[0]));
 			return opt != 'h';
diff --git a/tools/sched_ext/scx_example_qmap.bpf.c b/tools/sched_ext/scx_example_qmap.bpf.c
index 46bc16ed301f..ec8c4ee7ef16 100644
--- a/tools/sched_ext/scx_example_qmap.bpf.c
+++ b/tools/sched_ext/scx_example_qmap.bpf.c
@@ -22,6 +22,7 @@
 char _license[] SEC("license") = "GPL";
 
 const volatile u64 slice_ns = SCX_SLICE_DFL;
+const volatile bool switch_all;
 const volatile u32 stall_user_nth;
 const volatile u32 stall_kernel_nth;
 const volatile s32 disallow_tgid;
@@ -236,6 +237,13 @@ s32 BPF_STRUCT_OPS(qmap_prep_enable, struct task_struct *p,
 		return -ENOMEM;
 }
 
+s32 BPF_STRUCT_OPS(qmap_init)
+{
+	if (switch_all)
+		scx_bpf_switch_all();
+	return 0;
+}
+
 void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
 {
 	uei_record(&uei, ei);
@@ -248,6 +256,7 @@ struct sched_ext_ops qmap_ops = {
 	.dequeue		= (void *)qmap_dequeue,
 	.dispatch		= (void *)qmap_dispatch,
 	.prep_enable		= (void *)qmap_prep_enable,
+	.init			= (void *)qmap_init,
 	.exit			= (void *)qmap_exit,
 	.timeout_ms		= 5000U,
 	.name			= "qmap",
diff --git a/tools/sched_ext/scx_example_qmap.c b/tools/sched_ext/scx_example_qmap.c
index dff9323dfd20..30633122e6d5 100644
--- a/tools/sched_ext/scx_example_qmap.c
+++ b/tools/sched_ext/scx_example_qmap.c
@@ -20,7 +20,7 @@ const char help_fmt[] =
 "\n"
 "See the top-level comment in .bpf.c for more details.\n"
 "\n"
-"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-d PID]\n"
+"Usage: %s [-a] [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-d PID]\n"
 "\n"
 "  -s SLICE_US   Override slice duration\n"
 "  -e COUNT      Trigger scx_bpf_error() after COUNT enqueues\n"
@@ -50,8 +50,11 @@ int main(int argc, char **argv)
 	skel = scx_example_qmap__open();
 	assert(skel);
 
-	while ((opt = getopt(argc, argv, "hs:e:t:T:d:")) != -1) {
+	while ((opt = getopt(argc, argv, "ahs:e:t:T:d:")) != -1) {
 		switch (opt) {
+		case 'a':
+			skel->rodata->switch_all = true;
+			break;
 		case 's':
 			skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
 			break;
-- 
2.39.1


  parent reply	other threads:[~2023-01-28  0:19 UTC|newest]

Thread overview: 40+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-01-28  0:16 [PATCHSET v2] sched: Implement BPF extensible scheduler class Tejun Heo
2023-01-28  0:16 ` [PATCH 01/30] cgroup: Implement cgroup_show_cftypes() Tejun Heo
2023-01-28  0:16 ` [PATCH 02/30] sched: Encapsulate task attribute change sequence into a helper macro Tejun Heo
2023-01-28  0:16 ` [PATCH 03/30] sched: Restructure sched_class order sanity checks in sched_init() Tejun Heo
2023-01-28  0:16 ` [PATCH 04/30] sched: Allow sched_cgroup_fork() to fail and introduce sched_cancel_fork() Tejun Heo
2023-01-28  0:16 ` [PATCH 05/30] sched: Add sched_class->reweight_task() Tejun Heo
2023-01-28  0:16 ` [PATCH 06/30] sched: Add sched_class->switching_to() and expose check_class_changing/changed() Tejun Heo
2023-01-28  0:16 ` [PATCH 07/30] sched: Factor out cgroup weight conversion functions Tejun Heo
2023-01-28  0:16 ` [PATCH 08/30] sched: Expose css_tg(), __setscheduler_prio() and SCHED_CHANGE_BLOCK() Tejun Heo
2023-01-28 17:24   ` kernel test robot
2023-01-28  0:16 ` [PATCH 09/30] sched: Enumerate CPU cgroup file types Tejun Heo
2023-01-28  0:16 ` [PATCH 10/30] sched: Add @reason to sched_class->rq_{on|off}line() Tejun Heo
2023-01-28  0:16 ` [PATCH 11/30] sched: Add normal_policy() Tejun Heo
2023-01-28  0:16 ` [PATCH 12/30] sched_ext: Add boilerplate for extensible scheduler class Tejun Heo
2023-01-28  0:16 ` [PATCH 13/30] sched_ext: Implement BPF " Tejun Heo
2023-01-28  0:16 ` [PATCH 14/30] sched_ext: Add scx_example_dummy and scx_example_qmap example schedulers Tejun Heo
2023-01-28  0:16 ` [PATCH 15/30] sched_ext: Add sysrq-S which disables the BPF scheduler Tejun Heo
2023-01-28  0:16 ` [PATCH 16/30] sched_ext: Implement runnable task stall watchdog Tejun Heo
2023-01-28  0:16 ` [PATCH 17/30] sched_ext: Allow BPF schedulers to disallow specific tasks from joining SCHED_EXT Tejun Heo
2023-01-28  0:16 ` Tejun Heo [this message]
2023-01-28  0:16 ` [PATCH 19/30] sched_ext: Implement scx_bpf_kick_cpu() and task preemption support Tejun Heo
2023-01-28  0:16 ` [PATCH 20/30] sched_ext: Make watchdog handle ops.dispatch() looping stall Tejun Heo
2023-01-28  0:16 ` [PATCH 21/30] sched_ext: Add task state tracking operations Tejun Heo
2023-01-28  0:16 ` [PATCH 22/30] sched_ext: Implement tickless support Tejun Heo
2023-01-28  0:16 ` [PATCH 23/30] sched_ext: Add cgroup support Tejun Heo
2023-01-28 18:05   ` kernel test robot
2023-01-30 23:41     ` Tejun Heo
2023-01-28  0:16 ` [PATCH 24/30] sched_ext: Implement SCX_KICK_WAIT Tejun Heo
2023-01-28  0:16 ` [PATCH 25/30] sched_ext: Implement sched_ext_ops.cpu_acquire/release() Tejun Heo
2023-01-28  0:16 ` [PATCH 26/30] sched_ext: Implement sched_ext_ops.cpu_online/offline() Tejun Heo
2023-01-28  0:16 ` [PATCH 27/30] sched_ext: Implement core-sched support Tejun Heo
2023-01-28 19:07   ` kernel test robot
2023-01-30 21:38   ` Josh Don
2023-01-31  0:26     ` Tejun Heo
2023-01-31  0:36       ` Tejun Heo
2023-01-31  1:45       ` Josh Don
2023-01-28  0:16 ` [PATCH 28/30] sched_ext: Documentation: scheduler: Document extensible scheduler class Tejun Heo
2023-01-28  0:16 ` [PATCH 29/30] sched_ext: Add a basic, userland vruntime scheduler Tejun Heo
2023-01-28  0:16 ` [PATCH 30/30] sched_ext: Add a rust userspace hybrid example scheduler Tejun Heo
2023-02-08 21:55 ` [PATCHSET v2] sched: Implement BPF extensible scheduler class Tejun Heo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230128001639.3510083-19-tj@kernel.org \
    --to=tj@kernel.org \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=brho@google.com \
    --cc=bristot@redhat.com \
    --cc=bsegall@google.com \
    --cc=daniel@iogearbox.net \
    --cc=derkling@google.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=dschatzberg@meta.com \
    --cc=dskarlat@cs.cmu.edu \
    --cc=dvernet@meta.com \
    --cc=haoluo@google.com \
    --cc=joshdon@google.com \
    --cc=juri.lelli@redhat.com \
    --cc=kernel-team@meta.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=martin.lau@kernel.org \
    --cc=mgorman@suse.de \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=pjt@google.com \
    --cc=riel@surriel.com \
    --cc=rostedt@goodmis.org \
    --cc=torvalds@linux-foundation.org \
    --cc=vincent.guittot@linaro.org \
    --cc=vschneid@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.