From: David Vernet <void@manifault.com>
To: tj@kernel.org
Cc: linux-kernel@vger.kernel.org, kernel-team@meta.com
Subject: [PATCH 1/2] scx: Allow calling sleepable kfuncs from BPF_PROG_TYPE_SYSCALL
Date: Wed, 31 Jul 2024 00:14:36 -0500 [thread overview]
Message-ID: <20240731051437.69689-1-void@manifault.com> (raw)
We currently only allow calling sleepable scx kfuncs (i.e.
scx_bpf_create_dsq()) from BPF_PROG_TYPE_STRUCT_OPS progs. The idea here
was that we'd never have to call scx_bpf_create_dsq() outside of a
sched_ext struct_ops callback, but that might not actually be true. For
example, a scheduler could do something like the following:
1. Open and load (not yet attach) a scheduler skel
2. Synchronously call into a BPF_PROG_TYPE_SYSCALL prog from user space.
For example, to initialize an LLC domain, or some other global,
read-only state.
3. Attach the skel, which actually enables the scheduler
The advantage of doing this is that it can preclude having to do pretty
ugly boilerplate like initializing a read-only, statically sized array of
u64[]'s which the kernel consumes literally once at init time to then
create struct bpf_cpumask objects which are actually queried at runtime.
Doing the above is already possible given that we can invoke core BPF
kfuncs, such as bpf_cpumask_create(), from BPF_PROG_TYPE_SYSCALL progs. We
already allow many scx kfuncs to be called from BPF_PROG_TYPE_SYSCALL progs
(e.g. scx_bpf_kick_cpu()). Let's allow the sleepable kfuncs as well.
Signed-off-by: David Vernet <void@manifault.com>
---
include/linux/sched/ext.h | 14 ++++++--------
kernel/sched/ext.c | 27 +++++++++++----------------
2 files changed, 17 insertions(+), 24 deletions(-)
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 593d2f4909dd..26e1c33bc844 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -106,16 +106,14 @@ enum scx_ent_dsq_flags {
* mechanism. See scx_kf_allow().
*/
enum scx_kf_mask {
- SCX_KF_UNLOCKED = 0, /* not sleepable, not rq locked */
- /* all non-sleepables may be nested inside SLEEPABLE */
- SCX_KF_SLEEPABLE = 1 << 0, /* sleepable init operations */
+ SCX_KF_UNLOCKED = 0, /* sleepable and not rq locked */
/* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */
- SCX_KF_CPU_RELEASE = 1 << 1, /* ops.cpu_release() */
+ SCX_KF_CPU_RELEASE = 1 << 0, /* ops.cpu_release() */
/* ops.dequeue (in REST) may be nested inside DISPATCH */
- SCX_KF_DISPATCH = 1 << 2, /* ops.dispatch() */
- SCX_KF_ENQUEUE = 1 << 3, /* ops.enqueue() and ops.select_cpu() */
- SCX_KF_SELECT_CPU = 1 << 4, /* ops.select_cpu() */
- SCX_KF_REST = 1 << 5, /* other rq-locked operations */
+ SCX_KF_DISPATCH = 1 << 1, /* ops.dispatch() */
+ SCX_KF_ENQUEUE = 1 << 2, /* ops.enqueue() and ops.select_cpu() */
+ SCX_KF_SELECT_CPU = 1 << 3, /* ops.select_cpu() */
+ SCX_KF_REST = 1 << 4, /* other rq-locked operations */
__SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH |
SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index da9cac6b6cc2..4a07deb30c44 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1029,16 +1029,12 @@ static __always_inline bool scx_kf_allowed(u32 mask)
return false;
}
- if (unlikely((mask & SCX_KF_SLEEPABLE) && in_interrupt())) {
- scx_ops_error("sleepable kfunc called from non-sleepable context");
- return false;
- }
-
/*
* Enforce nesting boundaries. e.g. A kfunc which can be called from
* DISPATCH must not be called if we're running DEQUEUE which is nested
- * inside ops.dispatch(). We don't need to check the SCX_KF_SLEEPABLE
- * boundary thanks to the above in_interrupt() check.
+ * inside ops.dispatch(). We don't need to check boundaries for any
+ * blocking kfuncs as the verifier ensures they're only called from
+ * sleepable progs.
*/
if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE &&
(current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) {
@@ -3224,9 +3220,9 @@ static void handle_hotplug(struct rq *rq, bool online)
atomic_long_inc(&scx_hotplug_seq);
if (online && SCX_HAS_OP(cpu_online))
- SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_online, cpu);
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
else if (!online && SCX_HAS_OP(cpu_offline))
- SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_offline, cpu);
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, cpu);
else
scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
"cpu %d going %s, exiting scheduler", cpu,
@@ -3390,7 +3386,7 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool
.fork = fork,
};
- ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init_task, p, &args);
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, p, &args);
if (unlikely(ret)) {
ret = ops_sanitize_err("init_task", ret);
return ret;
@@ -4648,7 +4644,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
cpus_read_lock();
if (scx_ops.init) {
- ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init);
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init);
if (ret) {
ret = ops_sanitize_err("init", ret);
goto err_disable_unlock_cpus;
@@ -5424,14 +5420,11 @@ __bpf_kfunc_start_defs();
* @dsq_id: DSQ to create
* @node: NUMA node to allocate from
*
- * Create a custom DSQ identified by @dsq_id. Can be called from ops.init() and
- * ops.init_task().
+ * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable
+ * scx callback, and any BPF_PROG_TYPE_SYSCALL prog.
*/
__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
{
- if (!scx_kf_allowed(SCX_KF_SLEEPABLE))
- return -EINVAL;
-
if (unlikely(node >= (int)nr_node_ids ||
(node < 0 && node != NUMA_NO_NODE)))
return -EINVAL;
@@ -6490,6 +6483,8 @@ static int __init scx_init(void)
*/
if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
&scx_kfunc_set_sleepable)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
+ &scx_kfunc_set_sleepable)) ||
(ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
&scx_kfunc_set_select_cpu)) ||
(ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
--
2.45.2
next reply other threads:[~2024-07-31 5:14 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-07-31 5:14 David Vernet [this message]
2024-07-31 5:14 ` [PATCH 2/2] scx/selftests: Verify we can call create_dsq from prog_run David Vernet
2024-07-31 17:48 ` Tejun Heo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240731051437.69689-1-void@manifault.com \
--to=void@manifault.com \
--cc=kernel-team@meta.com \
--cc=linux-kernel@vger.kernel.org \
--cc=tj@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.