From: Vernon Yang <vernon2gm@gmail.com>
To: akpm@linux-foundation.org, david@kernel.org, ljs@kernel.org,
roman.gushchin@linux.dev, inwardvessel@gmail.com,
shakeel.butt@linux.dev, ast@kernel.org, daniel@iogearbox.net,
surenb@google.com
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org,
bpf@vger.kernel.org, baohua@kernel.org, lance.yang@linux.dev,
dev.jain@arm.com, Vernon Yang <yanglincheng@kylinos.cn>
Subject: [PATCH 3/4] mm: introduce bpf_mthp_ops struct ops
Date: Mon, 4 May 2026 00:50:23 +0800 [thread overview]
Message-ID: <20260503165024.1526680-4-vernon2gm@gmail.com> (raw)
In-Reply-To: <20260503165024.1526680-1-vernon2gm@gmail.com>
From: Vernon Yang <yanglincheng@kylinos.cn>
Introduce the bpf_mthp_ops struct_ops, which lets eBPF programs register
an mthp_choose callback via cgroup-bpf.
This allows cgroup-bpf to customize the mTHP size for different scenarios
by automatically selecting different mTHP sizes for different cgroups,
with the goal of making mTHP truly transparent.
Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
---
MAINTAINERS | 3 +
include/linux/bpf_huge_memory.h | 35 +++++++
include/linux/cgroup-defs.h | 1 +
include/linux/huge_mm.h | 6 ++
mm/Kconfig | 14 +++
mm/Makefile | 1 +
mm/bpf_huge_memory.c | 169 ++++++++++++++++++++++++++++++++
7 files changed, 229 insertions(+)
create mode 100644 include/linux/bpf_huge_memory.h
create mode 100644 mm/bpf_huge_memory.c
diff --git a/MAINTAINERS b/MAINTAINERS
index 27a073f53cea..39f00676eeb7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4887,7 +4887,10 @@ M: Shakeel Butt <shakeel.butt@linux.dev>
L: bpf@vger.kernel.org
L: linux-mm@kvack.org
S: Maintained
+F: include/linux/bpf_huge_memory.h
+F: mm/bpf_huge_memory.c
F: mm/bpf_memcontrol.c
+F: samples/bpf/mthp_ext.*
BPF [MISC]
L: bpf@vger.kernel.org
diff --git a/include/linux/bpf_huge_memory.h b/include/linux/bpf_huge_memory.h
new file mode 100644
index 000000000000..1c8a6f7ad8f1
--- /dev/null
+++ b/include/linux/bpf_huge_memory.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#ifndef __BPF_HUGE_MEMORY_H
+#define __BPF_HUGE_MEMORY_H
+
+struct cgroup;	/* forward declaration keeps this header self-contained */
+
+/**
+ * struct bpf_mthp_ops - BPF struct_ops callbacks for mTHP operations
+ * @mthp_choose: Given a cgroup and the candidate mTHP orders, return the
+ *               orders to use; the caller ANDs the result with the candidates.
+ */
+struct bpf_mthp_ops {
+	unsigned long (*mthp_choose)(struct cgroup *cgrp, unsigned long orders);
+};
+
+#if defined(CONFIG_BPF_TRANSPARENT_HUGEPAGE) && defined(CONFIG_BPF_SYSCALL)
+/**
+ * bpf_mthp_choose - Choose the custom mTHP orders using bpf
+ * @mm: task mm_struct
+ * @orders: original orders
+ *
+ * Return: the suitable mTHP orders.
+ */
+unsigned long bpf_mthp_choose(struct mm_struct *mm, unsigned long orders);
+#else
+static inline unsigned long bpf_mthp_choose(struct mm_struct *mm,
+ unsigned long orders)
+{
+ return orders; /* feature compiled out: hand @orders back unchanged */
+}
+#endif /* CONFIG_BPF_TRANSPARENT_HUGEPAGE && CONFIG_BPF_SYSCALL */
+
+#endif /* __BPF_HUGE_MEMORY_H */
+
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index f42563739d2e..78854d0e06ab 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -628,6 +628,7 @@ struct cgroup {
#ifdef CONFIG_BPF_SYSCALL
struct bpf_local_storage __rcu *bpf_cgrp_storage;
+ struct bpf_mthp_ops *mthp_ops;
#endif
#ifdef CONFIG_EXT_SUB_SCHED
struct scx_sched __rcu *scx_sched;
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 2949e5acff35..80ec622213df 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -3,6 +3,7 @@
#define _LINUX_HUGE_MM_H
#include <linux/mm_types.h>
+#include <linux/bpf_huge_memory.h>
#include <linux/fs.h> /* only for vma_is_dax() */
#include <linux/kobject.h>
@@ -291,6 +292,11 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
enum tva_type type,
unsigned long orders)
{
+ /* The eBPF-specified orders overrides which order is selected. */
+ orders &= bpf_mthp_choose(vma->vm_mm, orders);
+ if (!orders)
+ return 0;
+
/*
* Optimization to check if required orders are enabled early. Only
* forced collapse ignores sysfs configs.
diff --git a/mm/Kconfig b/mm/Kconfig
index e8bf1e9e6ad9..12382431ddc7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -963,6 +963,20 @@ config NO_PAGE_MAPCOUNT
EXPERIMENTAL because the impact of some changes is still unclear.
+config BPF_TRANSPARENT_HUGEPAGE
+ bool "BPF-based transparent hugepage (EXPERIMENTAL)"
+ depends on TRANSPARENT_HUGEPAGE
+ help
+ Use cgroup-bpf to customize the mTHP size for different scenarios by
+ automatically selecting different mTHP sizes for different cgroups,
+ with the goal of making mTHP truly transparent.
+
+ This is an experimental feature that might go away at any time.
+ Please do not rely on it in any production environment.
+
+ EXPERIMENTAL because the BPF interface is unstable and may be removed
+ at any time.
+
endif # TRANSPARENT_HUGEPAGE
# simple helper to make the code a bit easier to read
diff --git a/mm/Makefile b/mm/Makefile
index 8ad2ab08244e..b474c21c3253 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -108,6 +108,7 @@ obj-$(CONFIG_MEMCG) += swap_cgroup.o
endif
ifdef CONFIG_BPF_SYSCALL
obj-$(CONFIG_MEMCG) += bpf_memcontrol.o
+obj-$(CONFIG_BPF_TRANSPARENT_HUGEPAGE) += bpf_huge_memory.o
endif
obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
obj-$(CONFIG_GUP_TEST) += gup_test.o
diff --git a/mm/bpf_huge_memory.c b/mm/bpf_huge_memory.c
new file mode 100644
index 000000000000..e34e0a35edac
--- /dev/null
+++ b/mm/bpf_huge_memory.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Huge memory related BPF code
+ *
+ * Author: Vernon Yang <yanglincheng@kylinos.cn>
+ */
+
+#include <linux/bpf.h>
+#include <linux/srcu.h>
+
+/* SRCU for readers of cgrp->mthp_ops; writers hold cgroup_lock(). */
+DEFINE_STATIC_SRCU(mthp_bpf_srcu);
+
+unsigned long bpf_mthp_choose(struct mm_struct *mm, unsigned long orders)
+{
+	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
+	struct bpf_mthp_ops *ops;
+	struct cgroup *cgrp;
+	int idx;
+
+	if (!memcg)
+		return orders;
+	cgrp = memcg->css.cgroup;
+	/*
+	 * Cheap lockless peek first; the pointer actually dereferenced is
+	 * re-loaded under SRCU so that unreg's synchronize_srcu() waits for us.
+	 */
+	if (unlikely(READ_ONCE(cgrp->mthp_ops))) {
+		idx = srcu_read_lock(&mthp_bpf_srcu);
+		ops = READ_ONCE(cgrp->mthp_ops);
+		if (ops && ops->mthp_choose)
+			orders = ops->mthp_choose(cgrp, orders);
+		srcu_read_unlock(&mthp_bpf_srcu, idx);
+	}
+	mem_cgroup_put(memcg);
+	return orders;
+}
+
+static int bpf_mthp_ops_btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg, int off, int size)
+{
+ return -EACCES; /* unconditional refusal; no argument is examined */
+}
+
+static bool bpf_mthp_ops_is_valid_access(int off, int size, enum bpf_access_type type,
+ const struct bpf_prog *prog, struct bpf_insn_access_aux *info)
+{
+ return bpf_tracing_btf_ctx_access(off, size, type, prog, info); /* pure pass-through */
+}
+
+static const struct bpf_verifier_ops bpf_mthp_verifier_ops = { /* file-local */
+	.get_func_proto = bpf_base_func_proto,
+	.btf_struct_access = bpf_mthp_ops_btf_struct_access, /* always -EACCES */
+	.is_valid_access = bpf_mthp_ops_is_valid_access, /* tracing BTF ctx check */
+};
+
+/* Attach @kdata to the link's cgroup and to each descendant with no ops yet. */
+static int bpf_mthp_ops_reg(void *kdata, struct bpf_link *link)
+{
+	struct bpf_struct_ops_link *st_link = (struct bpf_struct_ops_link *)link;
+	struct bpf_mthp_ops *ops = kdata;
+	struct cgroup_subsys_state *pos;
+	struct cgroup *cgrp;
+
+	/* A link is mandatory; test it before st_link->cgroup is read. */
+	if (!link)
+		return -EOPNOTSUPP;
+	cgrp = st_link->cgroup;
+	cgroup_lock();
+	css_for_each_descendant_pre(pos, &cgrp->self) {
+		struct cgroup *child = pos->cgroup;
+
+		if (READ_ONCE(child->mthp_ops)) {
+			/* TODO
+			 * Do not destroy the cgroup hierarchy property.
+			 * If an eBPF program already exists in the sub-cgroup,
+			 * trigger an error and clear the already set
+			 * bpf_mthp_ops data.
+			 */
+			continue;
+		}
+		WRITE_ONCE(child->mthp_ops, ops);
+	}
+	cgroup_unlock();
+	return 0;
+}
+
+static void bpf_mthp_ops_unreg(void *kdata, struct bpf_link *link)
+{
+	struct cgroup *root = ((struct bpf_struct_ops_link *)link)->cgroup;
+	struct bpf_mthp_ops *detached = kdata;
+	struct cgroup_subsys_state *css;
+
+	/* Clear the ops from every cgroup in the subtree that still uses them. */
+	cgroup_lock();
+	css_for_each_descendant_pre(css, &root->self) {
+		if (READ_ONCE(css->cgroup->mthp_ops) != detached)
+			continue;
+		WRITE_ONCE(css->cgroup->mthp_ops, NULL);
+	}
+	cgroup_unlock();
+
+	/* Wait for in-flight SRCU readers before returning. */
+	synchronize_srcu(&mthp_bpf_srcu);
+}
+
+/*
+ * ->check_member callback: vet a program bound to a bpf_mthp_ops member.
+ *
+ * Only the mthp_choose slot is accepted and the program must not be
+ * sleepable; every other combination yields -EINVAL.
+ */
+static int bpf_mthp_ops_check_member(const struct btf_type *t,
+				     const struct btf_member *member,
+				     const struct bpf_prog *prog)
+{
+	const u32 byte_off = __btf_member_bit_offset(t, member) / 8;
+
+	if (byte_off != offsetof(struct bpf_mthp_ops, mthp_choose))
+		return -EINVAL;
+	if (prog->sleepable)
+		return -EINVAL;
+	return 0;
+}
+
+static int bpf_mthp_ops_init_member(const struct btf_type *t,
+ const struct btf_member *member,
+ void *kdata, const void *udata)
+{
+ return 0; /* no per-member setup; every argument is ignored */
+}
+
+static int bpf_mthp_ops_init(struct btf *btf)
+{
+ return 0; /* nothing to initialise; @btf is unused */
+}
+
+static unsigned long cfi_mthp_choose(struct cgroup *cgrp, unsigned long orders)
+{
+ return 0; /* fixed result; both arguments are ignored */
+}
+
+static struct bpf_mthp_ops cfi_bpf_mthp_ops = {
+ .mthp_choose = cfi_mthp_choose, /* stub table passed as .cfi_stubs in bso_bpf_mthp_ops */
+};
+
+static struct bpf_struct_ops bso_bpf_mthp_ops = {
+ .verifier_ops = &bpf_mthp_verifier_ops,
+ .reg = bpf_mthp_ops_reg, /* installs the ops on a cgroup subtree */
+ .unreg = bpf_mthp_ops_unreg, /* clears them, then waits for SRCU readers */
+ .check_member = bpf_mthp_ops_check_member, /* only non-sleepable mthp_choose */
+ .init_member = bpf_mthp_ops_init_member, /* no-op, returns 0 */
+ .init = bpf_mthp_ops_init, /* no-op, returns 0 */
+ .name = "bpf_mthp_ops",
+ .owner = THIS_MODULE,
+ .cfi_stubs = &cfi_bpf_mthp_ops,
+};
+
+/* late_initcall: register the bpf_mthp_ops struct_ops type, warn on failure. */
+static int __init bpf_huge_memory_init(void)
+{
+	int ret = register_bpf_struct_ops(&bso_bpf_mthp_ops, bpf_mthp_ops);
+
+	if (!ret)
+		return 0;
+	pr_warn("Registration of bpf_mthp_ops failed, err %d\n", ret);
+	return ret;
+}
+late_initcall(bpf_huge_memory_init);
--
2.53.0
next prev parent reply other threads:[~2026-05-03 16:51 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-03 16:50 [PATCH 0/4] mm: introduce mthp_ext via cgroup-bpf to make mTHP more transparent Vernon Yang
2026-05-03 16:50 ` [PATCH 1/4] psi: add psi_group_flush_stats() function Vernon Yang
2026-05-03 16:50 ` [PATCH 2/4] bpf: add bpf_cgroup_{flush_stats,stall} function Vernon Yang
2026-05-03 17:23 ` bot+bpf-ci
2026-05-03 16:50 ` Vernon Yang [this message]
2026-05-03 17:35 ` [PATCH 3/4] mm: introduce bpf_mthp_ops struct ops bot+bpf-ci
2026-05-03 16:50 ` [PATCH 4/4] samples: bpf: add mthp_ext Vernon Yang
2026-05-03 17:35 ` bot+bpf-ci
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260503165024.1526680-4-vernon2gm@gmail.com \
--to=vernon2gm@gmail.com \
--cc=akpm@linux-foundation.org \
--cc=ast@kernel.org \
--cc=baohua@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=david@kernel.org \
--cc=dev.jain@arm.com \
--cc=inwardvessel@gmail.com \
--cc=lance.yang@linux.dev \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=ljs@kernel.org \
--cc=roman.gushchin@linux.dev \
--cc=shakeel.butt@linux.dev \
--cc=surenb@google.com \
--cc=yanglincheng@kylinos.cn \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox