From: Vernon Yang <vernon2gm@gmail.com>
To: akpm@linux-foundation.org, david@kernel.org, ljs@kernel.org,
roman.gushchin@linux.dev, inwardvessel@gmail.com,
shakeel.butt@linux.dev, ast@kernel.org, daniel@iogearbox.net,
surenb@google.com
Cc: tz2294@columbia.edu, baohua@kernel.org, lance.yang@linux.dev,
dev.jain@arm.com, laoar.shao@gmail.com,
gutierrez.asier@huawei-partners.com,
linux-kernel@vger.kernel.org, linux-mm@kvack.org,
bpf@vger.kernel.org, Vernon Yang <yanglincheng@kylinos.cn>
Subject: [PATCH v2 3/4] mm: introduce bpf_mthp_ops struct ops
Date: Fri, 8 May 2026 23:00:54 +0800 [thread overview]
Message-ID: <20260508150055.680136-4-vernon2gm@gmail.com> (raw)
In-Reply-To: <20260508150055.680136-1-vernon2gm@gmail.com>
From: Vernon Yang <yanglincheng@kylinos.cn>
Introduce the bpf_mthp_ops struct_ops, which lets eBPF programs register
an mthp_choose callback via cgroup-bpf.
The callback allows the mTHP size to be customized for different
scenarios: different mTHP sizes can be selected automatically for
different cgroups, with the goal of making mTHP truly transparent.
Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
---
MAINTAINERS | 3 +
include/linux/bpf_huge_memory.h | 52 ++++++++++
include/linux/cgroup-defs.h | 1 +
include/linux/huge_mm.h | 6 ++
kernel/cgroup/cgroup.c | 2 +
mm/Kconfig | 14 +++
mm/Makefile | 1 +
mm/bpf_huge_memory.c | 168 ++++++++++++++++++++++++++++++++
8 files changed, 247 insertions(+)
create mode 100644 include/linux/bpf_huge_memory.h
create mode 100644 mm/bpf_huge_memory.c
diff --git a/MAINTAINERS b/MAINTAINERS
index caaa0d6e6056..f1113eaa1193 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4887,7 +4887,10 @@ M: Shakeel Butt <shakeel.butt@linux.dev>
L: bpf@vger.kernel.org
L: linux-mm@kvack.org
S: Maintained
+F: include/linux/bpf_huge_memory.h
+F: mm/bpf_huge_memory.c
F: mm/bpf_memcontrol.c
+F: samples/bpf/mthp_ext.*
BPF [MISC]
L: bpf@vger.kernel.org
diff --git a/include/linux/bpf_huge_memory.h b/include/linux/bpf_huge_memory.h
new file mode 100644
index 000000000000..ffda445c9572
--- /dev/null
+++ b/include/linux/bpf_huge_memory.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#ifndef __BPF_HUGE_MEMORY_H
+#define __BPF_HUGE_MEMORY_H
+
+#include <linux/cgroup-defs.h>
+
+/**
+ * struct bpf_mthp_ops - BPF callbacks for mTHP operations
+ * @mthp_choose: Choose the custom mTHP orders. Called with the cgroup being
+ *               consulted and the candidate orders; returns the orders
+ *               selected by the BPF program.
+ *
+ * This structure defines the interface for BPF programs to customize
+ * mTHP behavior through struct_ops programs.
+ */
+struct bpf_mthp_ops {
+ unsigned long (*mthp_choose)(struct cgroup *cgrp, unsigned long orders);
+};
+
+#ifdef CONFIG_BPF_TRANSPARENT_HUGEPAGE
+/**
+ * bpf_mthp_choose - Choose the custom mTHP orders using bpf
+ * @mm: task mm_struct
+ * @orders: original orders
+ *
+ * Return: the orders chosen by the registered mthp_choose callback, or
+ * @orders unchanged when no callback applies.
+ */
+unsigned long bpf_mthp_choose(struct mm_struct *mm, unsigned long orders);
+
+/**
+ * cgroup_bpf_set_mthp_ops - Inherit the parent cgroup's mthp_ops
+ * @cgrp: sub-cgroup whose mthp_ops is set
+ * @parent: parent cgroup whose mthp_ops is copied into @cgrp
+ */
+static inline void cgroup_bpf_set_mthp_ops(struct cgroup *cgrp,
+ struct cgroup *parent)
+{
+ WRITE_ONCE(cgrp->mthp_ops, parent->mthp_ops);
+}
+#else
+/* Stub used without CONFIG_BPF_TRANSPARENT_HUGEPAGE: keep @orders as-is. */
+static inline unsigned long
+bpf_mthp_choose(struct mm_struct *mm, unsigned long orders)
+{
+	return orders;
+}
+/* Stub used without CONFIG_BPF_TRANSPARENT_HUGEPAGE: nothing to inherit. */
+static inline void
+cgroup_bpf_set_mthp_ops(struct cgroup *cgrp, struct cgroup *parent)
+{
+}
+#endif /* CONFIG_BPF_TRANSPARENT_HUGEPAGE */
+
+#endif /* __BPF_HUGE_MEMORY_H */
+
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index f42563739d2e..78854d0e06ab 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -628,6 +628,7 @@ struct cgroup {
#ifdef CONFIG_BPF_SYSCALL
struct bpf_local_storage __rcu *bpf_cgrp_storage;
+ struct bpf_mthp_ops *mthp_ops;
#endif
#ifdef CONFIG_EXT_SUB_SCHED
struct scx_sched __rcu *scx_sched;
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 127f9e1e7604..65da35fb0980 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -3,6 +3,7 @@
#define _LINUX_HUGE_MM_H
#include <linux/mm_types.h>
+#include <linux/bpf_huge_memory.h>
#include <linux/fs.h> /* only for vma_is_dax() */
#include <linux/kobject.h>
@@ -296,6 +297,11 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
enum tva_type type,
unsigned long orders)
{
+ /* The eBPF-specified orders overrides which order is selected. */
+ orders &= bpf_mthp_choose(vma->vm_mm, orders);
+ if (!orders)
+ return 0;
+
/*
* Optimization to check if required orders are enabled early. Only
* forced collapse ignores sysfs configs.
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 43adc96c7f1a..1dbef3e8b179 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5836,6 +5836,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
if (ret)
goto out_stat_exit;
+ cgroup_bpf_set_mthp_ops(cgrp, parent);
+
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
cgrp->ancestors[tcgrp->level] = tcgrp;
diff --git a/mm/Kconfig b/mm/Kconfig
index 27dc5b0139ba..be49bde783a7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -949,6 +949,20 @@ config NO_PAGE_MAPCOUNT
EXPERIMENTAL because the impact of some changes is still unclear.
+config BPF_TRANSPARENT_HUGEPAGE
+	bool "BPF-based transparent hugepage (EXPERIMENTAL)"
+	depends on TRANSPARENT_HUGEPAGE && CGROUP_BPF && MEMCG
+	help
+	  Use cgroup-bpf to customize the mTHP size for different scenarios,
+	  so that different mTHP sizes can be selected automatically for
+	  different cgroups, with the goal of making mTHP truly transparent.
+
+	  The cgroup is looked up through the memory cgroup of the mm, which
+	  is why MEMCG is required.
+
+	  EXPERIMENTAL because the BPF interface is unstable and may be
+	  changed or removed at any time. Do not rely on it in any
+	  production environment.
+
endif # TRANSPARENT_HUGEPAGE
# simple helper to make the code a bit easier to read
diff --git a/mm/Makefile b/mm/Makefile
index 8ad2ab08244e..b474c21c3253 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -108,6 +108,7 @@ obj-$(CONFIG_MEMCG) += swap_cgroup.o
endif
ifdef CONFIG_BPF_SYSCALL
obj-$(CONFIG_MEMCG) += bpf_memcontrol.o
+obj-$(CONFIG_BPF_TRANSPARENT_HUGEPAGE) += bpf_huge_memory.o
endif
obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
obj-$(CONFIG_GUP_TEST) += gup_test.o
diff --git a/mm/bpf_huge_memory.c b/mm/bpf_huge_memory.c
new file mode 100644
index 000000000000..851c6ebe2933
--- /dev/null
+++ b/mm/bpf_huge_memory.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Huge memory related BPF code
+ *
+ * Author: Vernon Yang <yanglincheng@kylinos.cn>
+ */
+
+#include <linux/bpf.h>
+#include <linux/srcu.h>
+
+/*
+ * Readers of cgrp->mthp_ops run inside this SRCU read-side section;
+ * unregistration clears the pointer and then waits for them with
+ * synchronize_srcu(). The domain is only used within this file, so it is
+ * defined with internal linkage.
+ */
+DEFINE_STATIC_SRCU(mthp_bpf_srcu);
+
+/*
+ * Ask the mthp_ops attached to the cgroup of @mm's memory cgroup which of
+ * @orders to use. @orders is returned unchanged when there is no memory
+ * cgroup or no mthp_choose callback.
+ */
+unsigned long bpf_mthp_choose(struct mm_struct *mm, unsigned long orders)
+{
+	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
+	struct bpf_mthp_ops *mthp_ops;
+	struct cgroup *cgroup;
+	int srcu_idx;
+
+	if (!memcg)
+		return orders;
+
+	cgroup = memcg->css.cgroup;
+
+	/* Stay inside the SRCU section for the whole callback invocation. */
+	srcu_idx = srcu_read_lock(&mthp_bpf_srcu);
+	mthp_ops = READ_ONCE(cgroup->mthp_ops);
+	if (unlikely(mthp_ops && mthp_ops->mthp_choose))
+		orders = mthp_ops->mthp_choose(cgroup, orders);
+	srcu_read_unlock(&mthp_bpf_srcu, srcu_idx);
+
+	/* Pairs with get_mem_cgroup_from_mm() above. */
+	mem_cgroup_put(memcg);
+
+	return orders;
+}
+
+/* Verifier btf_struct_access callback: unconditionally fails with -EACCES. */
+static int
+bpf_mthp_ops_btf_struct_access(struct bpf_verifier_log *log,
+			       const struct bpf_reg_state *reg,
+			       int off, int size)
+{
+	return -EACCES;
+}
+
+/* Verifier is_valid_access callback: defers to bpf_tracing_btf_ctx_access(). */
+static bool
+bpf_mthp_ops_is_valid_access(int off, int size, enum bpf_access_type type,
+			     const struct bpf_prog *prog,
+			     struct bpf_insn_access_aux *info)
+{
+	return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
+}
+
+/*
+ * Verifier callbacks for bpf_mthp_ops programs. Referenced only through
+ * bso_bpf_mthp_ops.verifier_ops, so it has internal linkage.
+ */
+static const struct bpf_verifier_ops bpf_mthp_verifier_ops = {
+	.get_func_proto = bpf_base_func_proto,
+	.btf_struct_access = bpf_mthp_ops_btf_struct_access,
+	.is_valid_access = bpf_mthp_ops_is_valid_access,
+};
+
+/*
+ * Attach @kdata as the mthp_ops of the cgroup carried by @link and of every
+ * cgroup visited below it.
+ *
+ * Returns 0 on success, -EOPNOTSUPP when there is no link, -EINVAL when the
+ * link has no cgroup, and -EBUSY when any visited cgroup already has
+ * mthp_ops set (nothing is modified in that case).
+ */
+static int bpf_mthp_ops_reg(void *kdata, struct bpf_link *link)
+{
+	struct bpf_struct_ops_link *ops_link = (struct bpf_struct_ops_link *)link;
+	struct bpf_mthp_ops *mthp_ops = kdata;
+	struct cgroup_subsys_state *pos;
+	struct cgroup *root;
+	int ret = 0;
+
+	if (!link)
+		return -EOPNOTSUPP;
+
+	root = ops_link->cgroup;
+	if (!root)
+		return -EINVAL;
+
+	cgroup_lock();
+
+	/* First pass: refuse if any cgroup in the walk is already registered. */
+	css_for_each_descendant_pre(pos, &root->self) {
+		if (READ_ONCE(pos->cgroup->mthp_ops)) {
+			pr_warn("sub-cgroup has already registered.\n");
+			ret = -EBUSY;
+			goto out_unlock;
+		}
+	}
+
+	/* Second pass: publish the ops on every cgroup in the walk. */
+	css_for_each_descendant_pre(pos, &root->self)
+		WRITE_ONCE(pos->cgroup->mthp_ops, mthp_ops);
+
+out_unlock:
+	cgroup_unlock();
+	return ret;
+}
+
+/*
+ * Clear mthp_ops on the cgroup carried by @link and on every cgroup visited
+ * below it, then wait for SRCU readers before returning. Does nothing when
+ * there is no link or the link has no cgroup.
+ */
+static void bpf_mthp_ops_unreg(void *kdata, struct bpf_link *link)
+{
+	struct bpf_struct_ops_link *ops_link = (struct bpf_struct_ops_link *)link;
+	struct cgroup *root = link ? ops_link->cgroup : NULL;
+	struct cgroup_subsys_state *pos;
+
+	if (!root)
+		return;
+
+	cgroup_lock();
+	css_for_each_descendant_pre(pos, &root->self)
+		WRITE_ONCE(pos->cgroup->mthp_ops, NULL);
+	cgroup_unlock();
+
+	/* Readers in bpf_mthp_choose() may still hold the old pointer. */
+	synchronize_srcu(&mthp_bpf_srcu);
+}
+
+/*
+ * Validate a program attached to a bpf_mthp_ops member: only mthp_choose is
+ * accepted, and the program must not be sleepable. Returns 0 when accepted,
+ * -EINVAL otherwise.
+ */
+static int bpf_mthp_ops_check_member(const struct btf_type *t,
+				     const struct btf_member *member,
+				     const struct bpf_prog *prog)
+{
+	const u32 byte_off = __btf_member_bit_offset(t, member) / 8;
+
+	if (byte_off != offsetof(struct bpf_mthp_ops, mthp_choose))
+		return -EINVAL;
+
+	if (prog->sleepable)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int bpf_mthp_ops_init_member(const struct btf_type *t,
+ const struct btf_member *member,
+ void *kdata, const void *udata)
+{
+ return 0; /* always 0: no per-member work is done in this callback */
+}
+
+static int bpf_mthp_ops_init(struct btf *btf)
+{
+ return 0; /* always 0: no BTF-dependent setup is performed */
+}
+
+static unsigned long cfi_mthp_choose(struct cgroup *cgrp, unsigned long orders)
+{
+ return 0; /* placeholder body; only used to populate cfi_bpf_mthp_ops */
+}
+
+static struct bpf_mthp_ops cfi_bpf_mthp_ops = { /* passed as .cfi_stubs of bso_bpf_mthp_ops */
+ .mthp_choose = cfi_mthp_choose,
+};
+
+static struct bpf_struct_ops bso_bpf_mthp_ops = { /* registered by bpf_huge_memory_init() */
+ .verifier_ops = &bpf_mthp_verifier_ops,
+ .reg = bpf_mthp_ops_reg, /* sets mthp_ops on the link's cgroup and its descendants */
+ .unreg = bpf_mthp_ops_unreg, /* clears mthp_ops, then waits for SRCU readers */
+ .check_member = bpf_mthp_ops_check_member, /* accepts only non-sleepable mthp_choose */
+ .init_member = bpf_mthp_ops_init_member,
+ .init = bpf_mthp_ops_init,
+ .name = "bpf_mthp_ops",
+ .owner = THIS_MODULE,
+ .cfi_stubs = &cfi_bpf_mthp_ops,
+};
+
+/*
+ * Register the bpf_mthp_ops struct_ops at late_initcall time. A failure is
+ * logged and its error code is returned to the initcall machinery.
+ */
+static int __init bpf_huge_memory_init(void)
+{
+	int ret;
+
+	ret = register_bpf_struct_ops(&bso_bpf_mthp_ops, bpf_mthp_ops);
+	if (!ret)
+		return 0;
+
+	pr_warn("Registration of bpf_mthp_ops failed, err %d\n", ret);
+	return ret;
+}
+late_initcall(bpf_huge_memory_init);
--
2.53.0
next prev parent reply other threads:[~2026-05-08 15:01 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-08 15:00 [PATCH v2 0/4] mm: introduce mthp_ext via cgroup-bpf to make mTHP more transparent Vernon Yang
2026-05-08 15:00 ` [PATCH v2 1/4] psi: add psi_group_flush_stats() function Vernon Yang
2026-05-08 15:19 ` Lorenzo Stoakes
2026-05-08 15:00 ` [PATCH v2 2/4] bpf: add bpf_cgroup_{flush_stats,stall} function Vernon Yang
2026-05-08 15:40 ` bot+bpf-ci
2026-05-08 15:00 ` Vernon Yang [this message]
2026-05-08 15:40 ` [PATCH v2 3/4] mm: introduce bpf_mthp_ops struct ops bot+bpf-ci
2026-05-08 15:57 ` Lorenzo Stoakes
2026-05-08 20:54 ` David Hildenbrand (Arm)
2026-05-08 15:00 ` [PATCH v2 4/4] samples: bpf: add mthp_ext Vernon Yang
2026-05-08 15:40 ` bot+bpf-ci
2026-05-08 15:14 ` [PATCH v2 0/4] mm: introduce mthp_ext via cgroup-bpf to make mTHP more transparent Lorenzo Stoakes
2026-05-08 16:05 ` Lorenzo Stoakes
2026-05-08 16:53 ` Vernon Yang
2026-05-08 16:00 ` Pedro Falcato
2026-05-08 16:15 ` Lorenzo Stoakes
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260508150055.680136-4-vernon2gm@gmail.com \
--to=vernon2gm@gmail.com \
--cc=akpm@linux-foundation.org \
--cc=ast@kernel.org \
--cc=baohua@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=david@kernel.org \
--cc=dev.jain@arm.com \
--cc=gutierrez.asier@huawei-partners.com \
--cc=inwardvessel@gmail.com \
--cc=lance.yang@linux.dev \
--cc=laoar.shao@gmail.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=ljs@kernel.org \
--cc=roman.gushchin@linux.dev \
--cc=shakeel.butt@linux.dev \
--cc=surenb@google.com \
--cc=tz2294@columbia.edu \
--cc=yanglincheng@kylinos.cn \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox