From: Tejun Heo <tj@kernel.org>
To: Kumar Kartikeya Dwivedi <memxor@gmail.com>,
Alexei Starovoitov <ast@kernel.org>,
Emil Tsalapatis <emil@etsalapatis.com>,
Eduard Zingerman <eddyz87@gmail.com>,
Andrii Nakryiko <andrii@kernel.org>
Cc: David Vernet <void@manifault.com>,
Andrea Righi <arighi@nvidia.com>,
Changwoo Min <changwoo@igalia.com>,
bpf@vger.kernel.org, sched-ext@lists.linux.dev,
linux-kernel@vger.kernel.org
Subject: [RFC PATCH 7/9] sched_ext: Require MAP_ALWAYS arena for cid-form schedulers
Date: Mon, 27 Apr 2026 00:51:07 -1000 [thread overview]
Message-ID: <20260427105109.2554518-8-tj@kernel.org> (raw)
In-Reply-To: <20260427105109.2554518-1-tj@kernel.org>
Upcoming patches will let the kernel place arena-resident scratch data
shared with the BPF program (e.g. the per-CPU cmask used by
ops.set_cmask()) so the BPF side can dereference it directly via
__arena pointers, replacing the current cmask_copy_from_kernel()
probe-read loop. That requires each cid-form scheduler to expose its
arena to the kernel and to opt into BPF_F_ARENA_MAP_ALWAYS so that
kernel-side stores never fault.
bpf_scx_reg_cid() walks the struct_ops member progs via the new
bpf_struct_ops_for_each_prog() helper and discovers the arena from
prog->aux->used_maps. It requires exactly one BPF_MAP_TYPE_ARENA
across all member progs and rejects registration if
BPF_F_ARENA_MAP_ALWAYS is not set. A reference to the map is held on
the scx_sched and dropped when the sched is destroyed.
cpu-form schedulers (bpf_scx_reg) are unchanged — they have no arena
requirement.
scx_qmap adds BPF_F_ARENA_MAP_ALWAYS to its arena map definition.
v2: Defer consuming cmd->arena_map into sch->arena_map until past
scx_alloc_and_add_sched()'s failure points, so that an early
kzalloc/kstrdup failure leaves cmd->arena_map set and
bpf_scx_reg_cid() drops the ref via the existing cmd.arena_map
cleanup.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/ext.c | 59 +++++++++++++++++++++++++++++++++-
kernel/sched/ext_internal.h | 9 ++++++
tools/sched_ext/scx_qmap.bpf.c | 2 +-
3 files changed, 68 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index a078cd4225c1..835ac505f991 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4916,6 +4916,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
free_exit_info(sch->exit_info);
+ if (sch->arena_map)
+ bpf_map_put(sch->arena_map);
kfree(sch);
}
@@ -6588,6 +6590,7 @@ struct scx_enable_cmd {
struct sched_ext_ops_cid *ops_cid;
};
bool is_cid_type;
+ struct bpf_map *arena_map; /* arena ref to transfer to sch */
int ret;
};
@@ -6751,6 +6754,15 @@ static struct scx_sched *scx_alloc_and_add_sched(struct scx_enable_cmd *cmd,
return ERR_PTR(ret);
}
#endif /* CONFIG_EXT_SUB_SCHED */
+
+ /*
+ * Consume the arena_map ref bpf_scx_reg_cid() took. Defer to here so
+ * earlier failure paths leave cmd->arena_map set and bpf_scx_reg_cid
+ * drops the ref. After this point, sch owns the ref and any cleanup
+ * runs through scx_sched_free_rcu_work() which puts it.
+ */
+ sch->arena_map = cmd->arena_map;
+ cmd->arena_map = NULL;
return sch;
err_free_lb_resched:
@@ -7676,11 +7688,56 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
return scx_enable(&cmd, link);
}
+struct scx_arena_scan {
+ struct bpf_map *arena;
+ int err;
+};
+
+static int scx_arena_scan_map(struct bpf_map *m, void *data)
+{
+ struct scx_arena_scan *s = data;
+
+ if (m->map_type != BPF_MAP_TYPE_ARENA)
+ return 0;
+ if (s->arena && s->arena != m) {
+ s->err = -EINVAL;
+ return 1;
+ }
+ s->arena = m;
+ return 0;
+}
+
+static int scx_arena_scan_prog(struct bpf_prog *prog, void *data)
+{
+ return bpf_prog_for_each_used_map(prog, scx_arena_scan_map, data);
+}
+
static int bpf_scx_reg_cid(void *kdata, struct bpf_link *link)
{
struct scx_enable_cmd cmd = { .ops_cid = kdata, .is_cid_type = true };
+ struct scx_arena_scan scan = {};
+ int ret;
- return scx_enable(&cmd, link);
+ bpf_struct_ops_for_each_prog(kdata, scx_arena_scan_prog, &scan);
+ if (scan.err) {
+ pr_err("sched_ext: cid-form scheduler uses multiple arena maps\n");
+ return scan.err;
+ }
+ if (!scan.arena) {
+ pr_err("sched_ext: cid-form scheduler must use a BPF arena map\n");
+ return -EINVAL;
+ }
+ if (!(scan.arena->map_flags & BPF_F_ARENA_MAP_ALWAYS)) {
+ pr_err("sched_ext: arena map requires BPF_F_ARENA_MAP_ALWAYS for cid-form\n");
+ return -EINVAL;
+ }
+
+ bpf_map_inc(scan.arena);
+ cmd.arena_map = scan.arena;
+ ret = scx_enable(&cmd, link);
+ if (cmd.arena_map) /* not consumed by scx_alloc_and_add_sched() */
+ bpf_map_put(cmd.arena_map);
+ return ret;
}
static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index e5f52986d317..bcffbc32541c 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1102,6 +1102,15 @@ struct scx_sched {
struct sched_ext_ops_cid ops_cid;
};
bool is_cid_type; /* true if registered via bpf_sched_ext_ops_cid */
+
+ /*
+ * Arena map auto-discovered from member progs at struct_ops attach.
+ * cid-form schedulers must use exactly one arena with
+ * BPF_F_ARENA_MAP_ALWAYS to enable direct arena access from kernel
+ * side. NULL on cpu-form.
+ */
+ struct bpf_map *arena_map;
+
DECLARE_BITMAP(has_op, SCX_OPI_END);
/*
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 2ffea8a93217..edce734c3019 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -83,7 +83,7 @@ UEI_DEFINE(uei);
*/
struct {
__uint(type, BPF_MAP_TYPE_ARENA);
- __uint(map_flags, BPF_F_MMAPABLE);
+ __uint(map_flags, BPF_F_MMAPABLE | BPF_F_ARENA_MAP_ALWAYS);
__uint(max_entries, 1 << 16); /* upper bound in pages */
#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__)
__ulong(map_extra, 0x1ull << 32); /* user/BPF mmap base */
--
2.53.0
next prev parent reply other threads:[~2026-04-27 10:51 UTC|newest]
Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-27 10:51 [RFC PATCH 0/9] bpf/arena: Direct kernel-side access Tejun Heo
2026-04-27 10:51 ` [RFC PATCH 1/9] bpf/arena: Plumb struct bpf_arena * through PTE callbacks Tejun Heo
2026-04-27 10:51 ` [RFC PATCH 2/9] bpf/arena: Add BPF_F_ARENA_MAP_ALWAYS for direct kernel access Tejun Heo
2026-05-12 0:31 ` Kumar Kartikeya Dwivedi
2026-05-12 2:05 ` Emil Tsalapatis
2026-05-12 2:43 ` Kumar Kartikeya Dwivedi
2026-05-12 3:25 ` Alexei Starovoitov
2026-05-12 3:48 ` Kumar Kartikeya Dwivedi
2026-05-12 4:24 ` Alexei Starovoitov
2026-05-12 12:29 ` Emil Tsalapatis
2026-05-12 14:07 ` Kumar Kartikeya Dwivedi
2026-05-12 15:59 ` Emil Tsalapatis
2026-05-12 3:42 ` Emil Tsalapatis
2026-04-27 10:51 ` [RFC PATCH 3/9] bpf: Add sleepable variant of bpf_arena_alloc_pages for kernel callers Tejun Heo
2026-04-27 10:51 ` [RFC PATCH 4/9] bpf: Add bpf_struct_ops_for_each_prog() Tejun Heo
2026-04-27 10:51 ` [RFC PATCH 5/9] bpf: Add bpf_prog_for_each_used_map() Tejun Heo
2026-05-11 21:44 ` Kumar Kartikeya Dwivedi
2026-04-27 10:51 ` [RFC PATCH 6/9] bpf/arena: Add bpf_arena_map_kern_vm_start() Tejun Heo
2026-04-27 10:51 ` Tejun Heo [this message]
2026-04-27 10:51 ` [RFC PATCH 8/9] sched_ext: Sub-allocator over kernel-claimed BPF arena pages Tejun Heo
2026-04-27 10:51 ` [RFC PATCH 9/9] sched_ext: Convert ops.set_cmask() to arena-resident cmask Tejun Heo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260427105109.2554518-8-tj@kernel.org \
--to=tj@kernel.org \
--cc=andrii@kernel.org \
--cc=arighi@nvidia.com \
--cc=ast@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=changwoo@igalia.com \
--cc=eddyz87@gmail.com \
--cc=emil@etsalapatis.com \
--cc=linux-kernel@vger.kernel.org \
--cc=memxor@gmail.com \
--cc=sched-ext@lists.linux.dev \
--cc=void@manifault.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.