public inbox for bpf@vger.kernel.org
 help / color / mirror / Atom feed
From: Tejun Heo <tj@kernel.org>
To: Kumar Kartikeya Dwivedi <memxor@gmail.com>,
	Alexei Starovoitov <ast@kernel.org>,
	Emil Tsalapatis <emil@etsalapatis.com>,
	Eduard Zingerman <eddyz87@gmail.com>,
	Andrii Nakryiko <andrii@kernel.org>
Cc: David Vernet <void@manifault.com>,
	Andrea Righi <arighi@nvidia.com>,
	Changwoo Min <changwoo@igalia.com>,
	bpf@vger.kernel.org, sched-ext@lists.linux.dev,
	linux-kernel@vger.kernel.org
Subject: [RFC PATCH 7/9] sched_ext: Require MAP_ALWAYS arena for cid-form schedulers
Date: Mon, 27 Apr 2026 00:51:07 -1000	[thread overview]
Message-ID: <20260427105109.2554518-8-tj@kernel.org> (raw)
In-Reply-To: <20260427105109.2554518-1-tj@kernel.org>

Upcoming patches will let the kernel place arena-resident scratch
memory shared with the BPF program (e.g. the per-CPU cmask passed to
ops.set_cmask()) so the BPF side can dereference it directly via
__arena pointers, replacing the current cmask_copy_from_kernel()
probe-read loop. That requires
each cid-form scheduler to expose its arena to the kernel and to opt
into BPF_F_ARENA_MAP_ALWAYS so kernel-side stores never fault.

bpf_scx_reg_cid() walks the struct_ops member progs via the new
bpf_struct_ops_for_each_prog() helper and discovers the arena from
prog->aux->used_maps. It requires exactly one BPF_MAP_TYPE_ARENA
across all member progs and rejects if BPF_F_ARENA_MAP_ALWAYS is not
set. The map ref is held on scx_sched and dropped on sched destroy.
cpu-form schedulers (bpf_scx_reg) are unchanged - no arena
requirement.

scx_qmap adds BPF_F_ARENA_MAP_ALWAYS to its arena map definition.

v2: Defer sch->arena_map = cmd->arena_map consumption past
    scx_alloc_and_add_sched() failure points so an early kzalloc/kstrdup
    failure leaves cmd->arena_map set; bpf_scx_reg_cid() then drops the
    ref via the existing cmd.arena_map cleanup.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c             | 59 +++++++++++++++++++++++++++++++++-
 kernel/sched/ext_internal.h    |  9 ++++++
 tools/sched_ext/scx_qmap.bpf.c |  2 +-
 3 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index a078cd4225c1..835ac505f991 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4916,6 +4916,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 
 	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
 	free_exit_info(sch->exit_info);
+	if (sch->arena_map)
+		bpf_map_put(sch->arena_map);
 	kfree(sch);
 }
 
@@ -6588,6 +6590,7 @@ struct scx_enable_cmd {
 		struct sched_ext_ops_cid	*ops_cid;
 	};
 	bool			is_cid_type;
+	struct bpf_map		*arena_map;	/* arena ref to transfer to sch */
 	int			ret;
 };
 
@@ -6751,6 +6754,15 @@ static struct scx_sched *scx_alloc_and_add_sched(struct scx_enable_cmd *cmd,
 		return ERR_PTR(ret);
 	}
 #endif	/* CONFIG_EXT_SUB_SCHED */
+
+	/*
+	 * Consume the arena_map ref bpf_scx_reg_cid() took. Defer to here so
+	 * earlier failure paths leave cmd->arena_map set and bpf_scx_reg_cid
+	 * drops the ref. After this point, sch owns the ref and any cleanup
+	 * runs through scx_sched_free_rcu_work() which puts it.
+	 */
+	sch->arena_map = cmd->arena_map;
+	cmd->arena_map = NULL;
 	return sch;
 
 err_free_lb_resched:
@@ -7676,11 +7688,56 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
 	return scx_enable(&cmd, link);
 }
 
+struct scx_arena_scan {
+	struct bpf_map	*arena;
+	int		err;
+};
+
+static int scx_arena_scan_map(struct bpf_map *m, void *data)
+{
+	struct scx_arena_scan *s = data;
+
+	if (m->map_type != BPF_MAP_TYPE_ARENA)
+		return 0;
+	if (s->arena && s->arena != m) {
+		s->err = -EINVAL;
+		return 1;
+	}
+	s->arena = m;
+	return 0;
+}
+
+static int scx_arena_scan_prog(struct bpf_prog *prog, void *data)
+{
+	return bpf_prog_for_each_used_map(prog, scx_arena_scan_map, data);
+}
+
 static int bpf_scx_reg_cid(void *kdata, struct bpf_link *link)
 {
 	struct scx_enable_cmd cmd = { .ops_cid = kdata, .is_cid_type = true };
+	struct scx_arena_scan scan = {};
+	int ret;
 
-	return scx_enable(&cmd, link);
+	bpf_struct_ops_for_each_prog(kdata, scx_arena_scan_prog, &scan);
+	if (scan.err) {
+		pr_err("sched_ext: cid-form scheduler uses multiple arena maps\n");
+		return scan.err;
+	}
+	if (!scan.arena) {
+		pr_err("sched_ext: cid-form scheduler must use a BPF arena map\n");
+		return -EINVAL;
+	}
+	if (!(scan.arena->map_flags & BPF_F_ARENA_MAP_ALWAYS)) {
+		pr_err("sched_ext: arena map requires BPF_F_ARENA_MAP_ALWAYS for cid-form\n");
+		return -EINVAL;
+	}
+
+	bpf_map_inc(scan.arena);
+	cmd.arena_map = scan.arena;
+	ret = scx_enable(&cmd, link);
+	if (cmd.arena_map)		/* not consumed by scx_alloc_and_add_sched() */
+		bpf_map_put(cmd.arena_map);
+	return ret;
 }
 
 static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index e5f52986d317..bcffbc32541c 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1102,6 +1102,15 @@ struct scx_sched {
 		struct sched_ext_ops_cid	ops_cid;
 	};
 	bool			is_cid_type;	/* true if registered via bpf_sched_ext_ops_cid */
+
+	/*
+	 * Arena map auto-discovered from member progs at struct_ops attach.
+	 * cid-form schedulers must use exactly one arena with
+	 * BPF_F_ARENA_MAP_ALWAYS to enable direct arena access from kernel
+	 * side. NULL on cpu-form.
+	 */
+	struct bpf_map		*arena_map;
+
 	DECLARE_BITMAP(has_op, SCX_OPI_END);
 
 	/*
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 2ffea8a93217..edce734c3019 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -83,7 +83,7 @@ UEI_DEFINE(uei);
  */
 struct {
 	__uint(type, BPF_MAP_TYPE_ARENA);
-	__uint(map_flags, BPF_F_MMAPABLE);
+	__uint(map_flags, BPF_F_MMAPABLE | BPF_F_ARENA_MAP_ALWAYS);
 	__uint(max_entries, 1 << 16);		/* upper bound in pages */
 #if defined(__TARGET_ARCH_arm64) || defined(__aarch64__)
 	__ulong(map_extra, 0x1ull << 32);	/* user/BPF mmap base */
-- 
2.53.0


  parent reply	other threads:[~2026-04-27 10:51 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-27 10:51 [RFC PATCH 0/9] bpf/arena: Direct kernel-side access Tejun Heo
2026-04-27 10:51 ` [RFC PATCH 1/9] bpf/arena: Plumb struct bpf_arena * through PTE callbacks Tejun Heo
2026-04-27 10:51 ` [RFC PATCH 2/9] bpf/arena: Add BPF_F_ARENA_MAP_ALWAYS for direct kernel access Tejun Heo
2026-04-27 10:51 ` [RFC PATCH 3/9] bpf: Add sleepable variant of bpf_arena_alloc_pages for kernel callers Tejun Heo
2026-04-27 10:51 ` [RFC PATCH 4/9] bpf: Add bpf_struct_ops_for_each_prog() Tejun Heo
2026-04-27 10:51 ` [RFC PATCH 5/9] bpf: Add bpf_prog_for_each_used_map() Tejun Heo
2026-04-27 10:51 ` [RFC PATCH 6/9] bpf/arena: Add bpf_arena_map_kern_vm_start() Tejun Heo
2026-04-27 10:51 ` Tejun Heo [this message]
2026-04-27 10:51 ` [RFC PATCH 8/9] sched_ext: Sub-allocator over kernel-claimed BPF arena pages Tejun Heo
2026-04-27 10:51 ` [RFC PATCH 9/9] sched_ext: Convert ops.set_cmask() to arena-resident cmask Tejun Heo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260427105109.2554518-8-tj@kernel.org \
    --to=tj@kernel.org \
    --cc=andrii@kernel.org \
    --cc=arighi@nvidia.com \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=changwoo@igalia.com \
    --cc=eddyz87@gmail.com \
    --cc=emil@etsalapatis.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=memxor@gmail.com \
    --cc=sched-ext@lists.linux.dev \
    --cc=void@manifault.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox