From: Tejun Heo <tj@kernel.org>
To: torvalds@linux-foundation.org, mingo@redhat.com,
peterz@infradead.org, juri.lelli@redhat.com,
vincent.guittot@linaro.org, dietmar.eggemann@arm.com,
rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de,
bristot@redhat.com, vschneid@redhat.com, ast@kernel.org,
daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org,
joshdon@google.com, brho@google.com, pjt@google.com,
derkling@google.com, haoluo@google.com, dvernet@meta.com,
dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com
Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org,
kernel-team@meta.com, Tejun Heo <tj@kernel.org>
Subject: [PATCH 19/31] sched_ext: Allow BPF schedulers to disallow specific tasks from joining SCHED_EXT
Date: Tue, 29 Nov 2022 22:23:01 -1000 [thread overview]
Message-ID: <20221130082313.3241517-20-tj@kernel.org> (raw)
In-Reply-To: <20221130082313.3241517-1-tj@kernel.org>
BPF schedulers might not want to schedule certain tasks - e.g. kernel
threads. This patch adds p->scx.disallow which can be set by BPF schedulers
in such cases. The field can be changed anytime and setting it in
ops.prep_enable() guarantees that the task can never be scheduled by
sched_ext.
scx_example_qmap is updated with the -d option to disallow a specific PID:
# echo $$
1092
# egrep '(policy)|(ext\.enabled)' /proc/self/sched
policy : 0
ext.enabled : 0
# ./set-scx 1092
# egrep '(policy)|(ext\.enabled)' /proc/self/sched
policy : 7
ext.enabled : 0
Run "scx_example_qmap -d 1092" in another terminal.
# grep rejected /sys/kernel/debug/sched/ext
nr_rejected : 1
# egrep '(policy)|(ext\.enabled)' /proc/self/sched
policy : 0
ext.enabled : 0
# ./set-scx 1092
setparam failed for 1092 (Permission denied)
Signed-off-by: Tejun Heo <tj@kernel.org>
Suggested-by: Barret Rhoden <brho@google.com>
Reviewed-by: David Vernet <dvernet@meta.com>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
---
include/linux/sched/ext.h | 12 ++++++++
kernel/sched/core.c | 4 +++
kernel/sched/ext.c | 38 ++++++++++++++++++++++++++
kernel/sched/ext.h | 3 ++
tools/sched_ext/scx_example_qmap.bpf.c | 4 +++
tools/sched_ext/scx_example_qmap.c | 8 +++++-
6 files changed, 68 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 1a57945abea0..82dcbecfcfb9 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -397,6 +397,18 @@ struct sched_ext_entity {
*/
u64 slice;
+ /*
+ * If set, reject future sched_setscheduler(2) calls updating the policy
+ * to %SCHED_EXT with -%EACCES.
+ *
+ * If set from ops.prep_enable() and the task's policy is already
+ * %SCHED_EXT, which can happen while the BPF scheduler is being loaded
+ * or by inheriting the parent's policy during fork, the task's policy is
+ * rejected and forcefully reverted to %SCHED_NORMAL. The number of such
+ * events is reported through /sys/kernel/debug/sched/ext::nr_rejected.
+ */
+ bool disallow; /* reject switching into SCX */
+
/* cold fields */
struct list_head tasks_node;
};
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 39d9ccb64f40..3404277fed30 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7552,6 +7552,10 @@ static int __sched_setscheduler(struct task_struct *p,
goto unlock;
}
+ retval = scx_check_setscheduler(p, policy);
+ if (retval)
+ goto unlock;
+
/*
* If not changing anything there's no need to proceed further,
* but store a possible modification of reset_on_fork.
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 030175f2b1d6..ddd5aa4a8bca 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1509,6 +1509,8 @@ static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg)
WARN_ON_ONCE(p->scx.flags & SCX_TASK_OPS_PREPPED);
+ p->scx.disallow = false;
+
if (SCX_HAS_OP(prep_enable)) {
struct scx_enable_args args = { };
@@ -1519,6 +1521,27 @@ static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg)
}
}
+ if (p->scx.disallow) {
+ struct rq *rq;
+ struct rq_flags rf;
+
+ rq = task_rq_lock(p, &rf);
+
+ /*
+ * We're either in fork or load path and @p->policy will be
+ * applied right after. Reverting @p->policy here and rejecting
+ * %SCHED_EXT transitions from scx_check_setscheduler()
+ * guarantees that if ops.prep_enable() sets @p->scx.disallow, @p
+ * can never be in SCX.
+ */
+ if (p->policy == SCHED_EXT) {
+ p->policy = SCHED_NORMAL;
+ atomic64_inc(&scx_nr_rejected);
+ }
+
+ task_rq_unlock(rq, p, &rf);
+ }
+
p->scx.flags |= (SCX_TASK_OPS_PREPPED | SCX_TASK_WATCHDOG_RESET);
return 0;
}
@@ -1664,6 +1687,18 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
static void check_preempt_curr_scx(struct rq *rq, struct task_struct *p,int wake_flags) {}
static void switched_to_scx(struct rq *rq, struct task_struct *p) {}
+int scx_check_setscheduler(struct task_struct *p, int policy)
+{
+ lockdep_assert_rq_held(task_rq(p));
+
+ /* if disallow, reject transitioning into SCX */
+ if (scx_enabled() && READ_ONCE(p->scx.disallow) &&
+ p->policy != policy && policy == SCHED_EXT)
+ return -EACCES;
+
+ return 0;
+}
+
/*
* Omitted operations:
*
@@ -2367,6 +2402,9 @@ static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log,
if (off >= offsetof(struct task_struct, scx.slice) &&
off + size <= offsetofend(struct task_struct, scx.slice))
return SCALAR_VALUE;
+ if (off >= offsetof(struct task_struct, scx.disallow) &&
+ off + size <= offsetofend(struct task_struct, scx.disallow))
+ return SCALAR_VALUE;
}
if (atype == BPF_READ)
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index bda1d9c11486..0743a0536560 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -79,6 +79,7 @@ int scx_fork(struct task_struct *p);
void scx_post_fork(struct task_struct *p);
void scx_cancel_fork(struct task_struct *p);
int balance_scx(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
+int scx_check_setscheduler(struct task_struct *p, int policy);
void init_sched_ext_class(void);
__printf(2, 3) void scx_ops_error_type(enum scx_exit_type type,
@@ -136,6 +137,8 @@ static inline void scx_post_fork(struct task_struct *p) {}
static inline void scx_cancel_fork(struct task_struct *p) {}
static inline int balance_scx(struct rq *rq, struct task_struct *prev,
struct rq_flags *rf) { return 0; }
+static inline int scx_check_setscheduler(struct task_struct *p,
+ int policy) { return 0; }
static inline void init_sched_ext_class(void) {}
static inline void scx_notify_sched_tick(void) {}
diff --git a/tools/sched_ext/scx_example_qmap.bpf.c b/tools/sched_ext/scx_example_qmap.bpf.c
index 9e0b6519c8a4..b6febc5dadbf 100644
--- a/tools/sched_ext/scx_example_qmap.bpf.c
+++ b/tools/sched_ext/scx_example_qmap.bpf.c
@@ -24,6 +24,7 @@ char _license[] SEC("license") = "GPL";
const volatile u64 slice_ns = SCX_SLICE_DFL;
const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth;
+const volatile s32 disallow_tgid;
u32 test_error_cnt;
@@ -208,6 +209,9 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
s32 BPF_STRUCT_OPS(qmap_prep_enable, struct task_struct *p,
struct scx_enable_args *args)
{
+ if (p->tgid == disallow_tgid)
+ p->scx.disallow = true;
+
/*
* @p is new. Let's ensure that its task_ctx is available. We can sleep
* in this function and the following will automatically use GFP_KERNEL.
diff --git a/tools/sched_ext/scx_example_qmap.c b/tools/sched_ext/scx_example_qmap.c
index 34c764c38e19..99cc7169bd90 100644
--- a/tools/sched_ext/scx_example_qmap.c
+++ b/tools/sched_ext/scx_example_qmap.c
@@ -20,12 +20,13 @@ const char help_fmt[] =
"\n"
"See the top-level comment in .bpf.c for more details.\n"
"\n"
-"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT]\n"
+"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-d PID]\n"
"\n"
" -s SLICE_US Override slice duration\n"
" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n"
" -t COUNT Stall every COUNT'th user thread\n"
" -T COUNT Stall every COUNT'th kernel thread\n"
+" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n"
" -h Display this help and exit\n";
static volatile int exit_req;
@@ -63,6 +64,11 @@ int main(int argc, char **argv)
case 'T':
skel->rodata->stall_kernel_nth = strtoul(optarg, NULL, 0);
break;
+ case 'd':
+ skel->rodata->disallow_tgid = strtol(optarg, NULL, 0);
+ if (skel->rodata->disallow_tgid < 0)
+ skel->rodata->disallow_tgid = getpid();
+ break;
default:
fprintf(stderr, help_fmt, basename(argv[0]));
return opt != 'h';
--
2.38.1
next prev parent reply other threads:[~2022-11-30 8:26 UTC|newest]
Thread overview: 92+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-11-30 8:22 [PATCHSET RFC] sched: Implement BPF extensible scheduler class Tejun Heo
2022-11-30 8:22 ` [PATCH 01/31] rhashtable: Allow rhashtable to be used from irq-safe contexts Tejun Heo
2022-11-30 16:35 ` Linus Torvalds
2022-11-30 17:00 ` Tejun Heo
2022-12-06 21:36 ` [PATCH v2 " Tejun Heo
2022-12-09 10:50 ` patchwork-bot+netdevbpf
2022-11-30 8:22 ` [PATCH 02/31] cgroup: Implement cgroup_show_cftypes() Tejun Heo
2022-11-30 8:22 ` [PATCH 03/31] BPF: Add @prog to bpf_struct_ops->check_member() Tejun Heo
2022-11-30 8:22 ` [PATCH 04/31] sched: Allow sched_cgroup_fork() to fail and introduce sched_cancel_fork() Tejun Heo
2022-12-12 11:13 ` Peter Zijlstra
2022-12-12 18:03 ` Tejun Heo
2022-12-12 20:07 ` Peter Zijlstra
2022-12-12 20:12 ` Tejun Heo
2022-11-30 8:22 ` [PATCH 05/31] sched: Add sched_class->reweight_task() Tejun Heo
2022-12-12 11:22 ` Peter Zijlstra
2022-12-12 17:34 ` Tejun Heo
2022-12-12 20:11 ` Peter Zijlstra
2022-12-12 20:15 ` Tejun Heo
2022-11-30 8:22 ` [PATCH 06/31] sched: Add sched_class->switching_to() and expose check_class_changing/changed() Tejun Heo
2022-12-12 11:28 ` Peter Zijlstra
2022-12-12 17:59 ` Tejun Heo
2022-11-30 8:22 ` [PATCH 07/31] sched: Factor out cgroup weight conversion functions Tejun Heo
2022-11-30 8:22 ` [PATCH 08/31] sched: Expose css_tg() and __setscheduler_prio() in kernel/sched/sched.h Tejun Heo
2022-12-12 11:49 ` Peter Zijlstra
2022-12-12 17:47 ` Tejun Heo
2022-11-30 8:22 ` [PATCH 09/31] sched: Enumerate CPU cgroup file types Tejun Heo
2022-11-30 8:22 ` [PATCH 10/31] sched: Add @reason to sched_class->rq_{on|off}line() Tejun Heo
2022-12-12 11:57 ` Peter Zijlstra
2022-12-12 18:06 ` Tejun Heo
2022-11-30 8:22 ` [PATCH 11/31] sched: Add @reason to sched_move_task() Tejun Heo
2022-12-12 12:00 ` Peter Zijlstra
2022-12-12 17:54 ` Tejun Heo
2022-11-30 8:22 ` [PATCH 12/31] sched: Add normal_policy() Tejun Heo
2022-11-30 8:22 ` [PATCH 13/31] sched_ext: Add boilerplate for extensible scheduler class Tejun Heo
2022-11-30 8:22 ` [PATCH 14/31] sched_ext: Implement BPF " Tejun Heo
2022-12-02 17:08 ` Barret Rhoden
2022-12-02 18:01 ` Tejun Heo
2022-12-06 21:42 ` Tejun Heo
2022-12-06 21:44 ` Tejun Heo
2022-12-11 22:33 ` Julia Lawall
2022-12-12 2:15 ` Tejun Heo
2022-12-12 6:03 ` Julia Lawall
2022-12-12 6:08 ` Tejun Heo
2022-12-12 12:31 ` Peter Zijlstra
2022-12-12 20:03 ` Tejun Heo
2022-12-12 12:53 ` Peter Zijlstra
2022-12-12 21:33 ` Tejun Heo
2022-12-13 10:55 ` Peter Zijlstra
2022-12-13 18:12 ` Tejun Heo
2022-12-13 18:40 ` Rik van Riel
2022-12-13 23:20 ` Josh Don
2022-12-13 10:57 ` Peter Zijlstra
2022-12-13 17:32 ` Tejun Heo
2022-11-30 8:22 ` [PATCH 15/31] sched_ext: [TEMPORARY] Add temporary workaround kfunc helpers Tejun Heo
2022-11-30 8:22 ` [PATCH 16/31] sched_ext: Add scx_example_dummy and scx_example_qmap example schedulers Tejun Heo
2022-11-30 8:22 ` [PATCH 17/31] sched_ext: Add sysrq-S which disables the BPF scheduler Tejun Heo
2022-11-30 8:23 ` [PATCH 18/31] sched_ext: Implement runnable task stall watchdog Tejun Heo
2022-11-30 8:23 ` Tejun Heo [this message]
2022-11-30 8:23 ` [PATCH 20/31] sched_ext: Allow BPF schedulers to switch all eligible tasks into sched_ext Tejun Heo
2022-11-30 8:23 ` [PATCH 21/31] sched_ext: Implement scx_bpf_kick_cpu() and task preemption support Tejun Heo
2022-11-30 8:23 ` [PATCH 22/31] sched_ext: Add task state tracking operations Tejun Heo
2022-11-30 8:23 ` [PATCH 23/31] sched_ext: Implement tickless support Tejun Heo
2022-11-30 8:23 ` [PATCH 24/31] sched_ext: Add cgroup support Tejun Heo
2022-11-30 8:23 ` [PATCH 25/31] sched_ext: Implement SCX_KICK_WAIT Tejun Heo
2022-11-30 8:23 ` [PATCH 26/31] sched_ext: Implement sched_ext_ops.cpu_acquire/release() Tejun Heo
2022-11-30 8:23 ` [PATCH 27/31] sched_ext: Implement sched_ext_ops.cpu_online/offline() Tejun Heo
2022-11-30 8:23 ` [PATCH 28/31] sched_ext: Add Documentation/scheduler/sched-ext.rst Tejun Heo
2022-12-12 4:01 ` Bagas Sanjaya
2022-12-12 6:28 ` Tejun Heo
2022-12-12 13:07 ` Bagas Sanjaya
2022-12-12 17:30 ` Tejun Heo
2022-12-12 12:39 ` Peter Zijlstra
2022-12-12 17:16 ` Tejun Heo
2022-11-30 8:23 ` [PATCH 29/31] sched_ext: Add a basic, userland vruntime scheduler Tejun Heo
2022-11-30 8:23 ` [PATCH 30/31] BPF: [TEMPORARY] Nerf BTF scalar value check Tejun Heo
2022-11-30 8:23 ` [PATCH 31/31] sched_ext: Add a rust userspace hybrid example scheduler Tejun Heo
2022-12-12 14:03 ` Peter Zijlstra
2022-12-12 21:05 ` Peter Oskolkov
2022-12-13 11:02 ` Peter Zijlstra
2022-12-13 18:24 ` Peter Oskolkov
2022-12-12 22:00 ` Tejun Heo
2022-12-12 22:18 ` Josh Don
2022-12-13 11:30 ` Peter Zijlstra
2022-12-13 20:33 ` Tejun Heo
2022-12-14 2:00 ` Josh Don
2022-12-12 9:37 ` [PATCHSET RFC] sched: Implement BPF extensible scheduler class Peter Zijlstra
2022-12-12 17:27 ` Tejun Heo
2022-12-12 10:14 ` Peter Zijlstra
2022-12-14 2:11 ` Josh Don
2022-12-14 8:55 ` Peter Zijlstra
2022-12-14 22:23 ` Tejun Heo
2022-12-14 23:20 ` Barret Rhoden
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20221130082313.3241517-20-tj@kernel.org \
--to=tj@kernel.org \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=brho@google.com \
--cc=bristot@redhat.com \
--cc=bsegall@google.com \
--cc=daniel@iogearbox.net \
--cc=derkling@google.com \
--cc=dietmar.eggemann@arm.com \
--cc=dschatzberg@meta.com \
--cc=dskarlat@cs.cmu.edu \
--cc=dvernet@meta.com \
--cc=haoluo@google.com \
--cc=joshdon@google.com \
--cc=juri.lelli@redhat.com \
--cc=kernel-team@meta.com \
--cc=linux-kernel@vger.kernel.org \
--cc=martin.lau@kernel.org \
--cc=mgorman@suse.de \
--cc=mingo@redhat.com \
--cc=peterz@infradead.org \
--cc=pjt@google.com \
--cc=riel@surriel.com \
--cc=rostedt@goodmis.org \
--cc=torvalds@linux-foundation.org \
--cc=vincent.guittot@linaro.org \
--cc=vschneid@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.