From: Vernon Yang <vernon2gm@gmail.com>
To: akpm@linux-foundation.org, david@kernel.org, ljs@kernel.org,
roman.gushchin@linux.dev, inwardvessel@gmail.com,
shakeel.butt@linux.dev, ast@kernel.org, daniel@iogearbox.net,
surenb@google.com
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org,
bpf@vger.kernel.org, baohua@kernel.org, lance.yang@linux.dev,
dev.jain@arm.com, Vernon Yang <yanglincheng@kylinos.cn>
Subject: [PATCH 3/4] mm: introduce bpf_mthp_ops struct ops
Date: Mon, 4 May 2026 00:50:23 +0800 [thread overview]
Message-ID: <20260503165024.1526680-4-vernon2gm@gmail.com> (raw)
In-Reply-To: <20260503165024.1526680-1-vernon2gm@gmail.com>
From: Vernon Yang <yanglincheng@kylinos.cn>
Introduce the bpf_mthp_ops struct_ops, which lets eBPF programs register
an mthp_choose callback through cgroup-bpf.
With cgroup-bpf, the mTHP sizes can be customized for different scenarios:
different mTHP sizes can be selected automatically for different cgroups,
so that we can focus on making mTHP truly transparent.
Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
---
MAINTAINERS | 3 +
include/linux/bpf_huge_memory.h | 35 +++++++
include/linux/cgroup-defs.h | 1 +
include/linux/huge_mm.h | 6 ++
mm/Kconfig | 14 +++
mm/Makefile | 1 +
mm/bpf_huge_memory.c | 169 ++++++++++++++++++++++++++++++++
7 files changed, 229 insertions(+)
create mode 100644 include/linux/bpf_huge_memory.h
create mode 100644 mm/bpf_huge_memory.c
diff --git a/MAINTAINERS b/MAINTAINERS
index 27a073f53cea..39f00676eeb7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4887,7 +4887,10 @@ M: Shakeel Butt <shakeel.butt@linux.dev>
L: bpf@vger.kernel.org
L: linux-mm@kvack.org
S: Maintained
+F: include/linux/bpf_huge_memory.h
+F: mm/bpf_huge_memory.c
F: mm/bpf_memcontrol.c
+F: samples/bpf/mthp_ext.*
BPF [MISC]
L: bpf@vger.kernel.org
diff --git a/include/linux/bpf_huge_memory.h b/include/linux/bpf_huge_memory.h
new file mode 100644
index 000000000000..1c8a6f7ad8f1
--- /dev/null
+++ b/include/linux/bpf_huge_memory.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#ifndef __BPF_HUGE_MEMORY_H
+#define __BPF_HUGE_MEMORY_H
+
+/* Only pointers to these types are used here, so forward declarations suffice. */
+struct cgroup;
+struct mm_struct;
+
+/**
+ * struct bpf_mthp_ops - BPF callbacks for mTHP operations
+ * @mthp_choose: Choose the custom mTHP orders. It is called with the cgroup
+ *               that the mm's memory cgroup belongs to and the candidate
+ *               @orders; its return value becomes the result of
+ *               bpf_mthp_choose(), which thp_vma_allowable_orders() then
+ *               intersects with the orders it passed in.
+ *
+ * This structure defines the interface for BPF programs to customize
+ * mTHP behavior through struct_ops programs.
+ */
+struct bpf_mthp_ops {
+	unsigned long (*mthp_choose)(struct cgroup *cgrp, unsigned long orders);
+};
+
+#if defined(CONFIG_BPF_TRANSPARENT_HUGEPAGE) && defined(CONFIG_BPF_SYSCALL)
+/**
+ * bpf_mthp_choose() - Choose the custom mTHP orders using BPF
+ * @mm: task mm_struct
+ * @orders: original orders
+ *
+ * Return: the suited mTHP orders.
+ */
+unsigned long bpf_mthp_choose(struct mm_struct *mm, unsigned long orders);
+#else
+/* BPF mTHP support is not built in: pass @orders through unchanged. */
+static inline unsigned long bpf_mthp_choose(struct mm_struct *mm,
+ unsigned long orders)
+{
+ return orders;
+}
+#endif /* CONFIG_BPF_TRANSPARENT_HUGEPAGE && CONFIG_BPF_SYSCALL */
+
+#endif /* __BPF_HUGE_MEMORY_H */
+
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index f42563739d2e..78854d0e06ab 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -628,6 +628,7 @@ struct cgroup {
#ifdef CONFIG_BPF_SYSCALL
struct bpf_local_storage __rcu *bpf_cgrp_storage;
+ struct bpf_mthp_ops *mthp_ops;
#endif
#ifdef CONFIG_EXT_SUB_SCHED
struct scx_sched __rcu *scx_sched;
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 2949e5acff35..80ec622213df 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -3,6 +3,7 @@
#define _LINUX_HUGE_MM_H
#include <linux/mm_types.h>
+#include <linux/bpf_huge_memory.h>
#include <linux/fs.h> /* only for vma_is_dax() */
#include <linux/kobject.h>
@@ -291,6 +292,11 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
enum tva_type type,
unsigned long orders)
{
+ /* The eBPF-specified orders overrides which order is selected. */
+ orders &= bpf_mthp_choose(vma->vm_mm, orders);
+ if (!orders)
+ return 0;
+
/*
* Optimization to check if required orders are enabled early. Only
* forced collapse ignores sysfs configs.
diff --git a/mm/Kconfig b/mm/Kconfig
index e8bf1e9e6ad9..12382431ddc7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -963,6 +963,20 @@ config NO_PAGE_MAPCOUNT
EXPERIMENTAL because the impact of some changes is still unclear.
+config BPF_TRANSPARENT_HUGEPAGE
+	bool "BPF-based transparent hugepage (EXPERIMENTAL)"
+	depends on TRANSPARENT_HUGEPAGE
+	# mm/bpf_huge_memory.c is only built with BPF_SYSCALL and dereferences
+	# struct mem_cgroup, which is only fully defined with MEMCG.
+	depends on BPF_SYSCALL && MEMCG
+	help
+	  Use cgroup-bpf to customize the mTHP size for different scenarios:
+	  a BPF program can automatically select different mTHP sizes for
+	  different cgroups, with the goal of making mTHP truly transparent.
+
+	  EXPERIMENTAL because the BPF interface is unstable and may be
+	  removed at any time. Please do not rely on it in any production
+	  environment.
+
endif # TRANSPARENT_HUGEPAGE
# simple helper to make the code a bit easier to read
diff --git a/mm/Makefile b/mm/Makefile
index 8ad2ab08244e..b474c21c3253 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -108,6 +108,7 @@ obj-$(CONFIG_MEMCG) += swap_cgroup.o
endif
ifdef CONFIG_BPF_SYSCALL
obj-$(CONFIG_MEMCG) += bpf_memcontrol.o
+obj-$(CONFIG_BPF_TRANSPARENT_HUGEPAGE) += bpf_huge_memory.o
endif
obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
obj-$(CONFIG_GUP_TEST) += gup_test.o
diff --git a/mm/bpf_huge_memory.c b/mm/bpf_huge_memory.c
new file mode 100644
index 000000000000..e34e0a35edac
--- /dev/null
+++ b/mm/bpf_huge_memory.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Huge memory related BPF code
+ *
+ * Author: Vernon Yang <yanglincheng@kylinos.cn>
+ */
+
+#include <linux/bpf.h>
+#include <linux/srcu.h>
+
+/*
+ * Readers dereference cgrp->mthp_ops inside an mthp_bpf_srcu read-side
+ * section; unregistration clears the pointer and then waits for a grace
+ * period. Updates of the pointer are made under cgroup_lock().
+ * The domain is only used in this file, hence static.
+ */
+DEFINE_STATIC_SRCU(mthp_bpf_srcu);
+
+/**
+ * bpf_mthp_choose() - let the BPF program of @mm's cgroup pick the mTHP orders
+ * @mm: mm_struct whose memory cgroup is looked up
+ * @orders: candidate orders
+ *
+ * Looks up the cgroup of @mm's memory cgroup and, if a bpf_mthp_ops with an
+ * mthp_choose callback is installed on it, calls that callback.
+ *
+ * Return: the callback's result, or @orders unchanged when there is no
+ * memory cgroup, no ops or no callback.
+ */
+unsigned long bpf_mthp_choose(struct mm_struct *mm, unsigned long orders)
+{
+	struct cgroup *cgrp;
+	struct mem_cgroup *memcg;
+	struct bpf_mthp_ops *ops;
+	int idx;
+
+	memcg = get_mem_cgroup_from_mm(mm);
+	if (!memcg)
+		return orders;
+
+	cgrp = memcg->css.cgroup;
+
+	/* Lockless peek keeps the common "nothing attached" case cheap. */
+	if (unlikely(READ_ONCE(cgrp->mthp_ops))) {
+		idx = srcu_read_lock(&mthp_bpf_srcu);
+		/*
+		 * The pointer must be (re)loaded inside the read-side section.
+		 * Unregistration clears it and then calls synchronize_srcu(),
+		 * which only waits for sections that have already started; a
+		 * value loaded before srcu_read_lock() could therefore still
+		 * be dereferenced after that wait has returned.
+		 */
+		ops = READ_ONCE(cgrp->mthp_ops);
+		if (ops && ops->mthp_choose)
+			orders = ops->mthp_choose(cgrp, orders);
+		srcu_read_unlock(&mthp_bpf_srcu, idx);
+	}
+
+	/* Drop the reference taken by get_mem_cgroup_from_mm(). */
+	mem_cgroup_put(memcg);
+
+	return orders;
+}
+
+/*
+ * Verifier .btf_struct_access hook for bpf_mthp_ops programs.
+ * Every request is refused with -EACCES, whatever the register, offset or
+ * size.
+ */
+static int
+bpf_mthp_ops_btf_struct_access(struct bpf_verifier_log *log,
+			       const struct bpf_reg_state *reg,
+			       int off, int size)
+{
+	return -EACCES;
+}
+
+/*
+ * Verifier .is_valid_access hook for bpf_mthp_ops programs.
+ * The decision is delegated entirely to bpf_tracing_btf_ctx_access().
+ */
+static bool
+bpf_mthp_ops_is_valid_access(int off, int size,
+			     enum bpf_access_type type,
+			     const struct bpf_prog *prog,
+			     struct bpf_insn_access_aux *info)
+{
+	bool allowed = bpf_tracing_btf_ctx_access(off, size, type, prog, info);
+
+	return allowed;
+}
+
+/*
+ * Verifier callbacks for bpf_mthp_ops programs. The table is only referenced
+ * by bso_bpf_mthp_ops in this file, so it does not need external linkage.
+ */
+static const struct bpf_verifier_ops bpf_mthp_verifier_ops = {
+	.get_func_proto = bpf_base_func_proto,
+	.btf_struct_access = bpf_mthp_ops_btf_struct_access,
+	.is_valid_access = bpf_mthp_ops_is_valid_access,
+};
+
+/**
+ * bpf_mthp_ops_reg() - install a bpf_mthp_ops on a cgroup subtree
+ * @kdata: the struct bpf_mthp_ops being registered
+ * @link: struct_ops link that carries the target cgroup; may be NULL
+ *
+ * Walks the link's cgroup and all of its descendants under cgroup_lock()
+ * and sets ->mthp_ops on every cgroup that does not have one yet.
+ *
+ * Return: 0 on success, -EOPNOTSUPP when called without a link.
+ */
+static int bpf_mthp_ops_reg(void *kdata, struct bpf_link *link)
+{
+	struct bpf_struct_ops_link *st_link;
+	struct bpf_mthp_ops *ops = kdata;
+	struct cgroup_subsys_state *pos;
+	struct cgroup *cgrp;
+
+	/*
+	 * Registration without a link is not supported. This has to be
+	 * checked before the link is dereferenced to fetch the cgroup.
+	 */
+	if (!link)
+		return -EOPNOTSUPP;
+
+	/* The link is not yet fully initialized, but cgroup should be set */
+	st_link = (struct bpf_struct_ops_link *)link;
+	cgrp = st_link->cgroup;
+
+	cgroup_lock();
+	css_for_each_descendant_pre(pos, &cgrp->self) {
+		struct cgroup *child = pos->cgroup;
+
+		if (READ_ONCE(child->mthp_ops)) {
+			/* TODO
+			 * Do not destroy the cgroup hierarchy property.
+			 * If an eBPF program already exists in the sub-cgroup,
+			 * trigger an error and clear the already set
+			 * bpf_mthp_ops data.
+			 */
+			continue;
+		}
+		WRITE_ONCE(child->mthp_ops, ops);
+	}
+	cgroup_unlock();
+
+	return 0;
+}
+
+/*
+ * Remove @kdata from the link's cgroup and from every descendant that still
+ * points at it, then wait for an mthp_bpf_srcu grace period so that no
+ * reader inside a read-side section is still using the ops on return.
+ */
+static void bpf_mthp_ops_unreg(void *kdata, struct bpf_link *link)
+{
+	struct bpf_struct_ops_link *ops_link = (struct bpf_struct_ops_link *)link;
+	struct cgroup *top = ops_link->cgroup;
+	struct bpf_mthp_ops *old = kdata;
+	struct cgroup_subsys_state *css;
+
+	cgroup_lock();
+	css_for_each_descendant_pre(css, &top->self) {
+		struct cgroup *cur = css->cgroup;
+
+		/* Cgroups holding a different ops (or none) are left alone. */
+		if (READ_ONCE(cur->mthp_ops) != old)
+			continue;
+
+		WRITE_ONCE(cur->mthp_ops, NULL);
+	}
+	cgroup_unlock();
+
+	synchronize_srcu(&mthp_bpf_srcu);
+}
+
+/*
+ * Validate a program attached to one member of struct bpf_mthp_ops.
+ *
+ * Returns 0 when the member is mthp_choose and the program is not sleepable,
+ * -EINVAL otherwise.
+ */
+static int bpf_mthp_ops_check_member(const struct btf_type *t,
+				     const struct btf_member *member,
+				     const struct bpf_prog *prog)
+{
+	/* Member offset in bytes; the BTF helper reports it in bits. */
+	u32 byte_off = __btf_member_bit_offset(t, member) / 8;
+
+	/* mthp_choose is the only member that is accepted. */
+	if (byte_off != offsetof(struct bpf_mthp_ops, mthp_choose))
+		return -EINVAL;
+
+	return prog->sleepable ? -EINVAL : 0;
+}
+
+/*
+ * struct_ops .init_member hook: no member gets special treatment here, so 0
+ * is returned for every member.
+ * NOTE(review): presumably 0 asks the struct_ops core to apply its default
+ * member handling - confirm against the struct_ops core.
+ */
+static int
+bpf_mthp_ops_init_member(const struct btf_type *t,
+			 const struct btf_member *member,
+			 void *kdata, const void *udata)
+{
+	return 0;
+}
+
+/* struct_ops .init hook: nothing to set up, always reports success (0). */
+static int
+bpf_mthp_ops_init(struct btf *btf)
+{
+	return 0;
+}
+
+/*
+ * Stub with the mthp_choose signature, referenced from cfi_bpf_mthp_ops.
+ * It ignores its arguments and returns 0.
+ */
+static unsigned long
+cfi_mthp_choose(struct cgroup *cgrp, unsigned long orders)
+{
+	return 0;
+}
+
+/* Stub instance of the ops table, passed as bso_bpf_mthp_ops.cfi_stubs. */
+static struct bpf_mthp_ops cfi_bpf_mthp_ops = {
+ .mthp_choose = cfi_mthp_choose,
+};
+
+/*
+ * struct_ops description of "bpf_mthp_ops": ties together the verifier
+ * callbacks, the reg/unreg handlers that install and remove the ops on
+ * cgroups, the member checks and the CFI stub table. It is registered by
+ * bpf_huge_memory_init().
+ */
+static struct bpf_struct_ops bso_bpf_mthp_ops = {
+ .verifier_ops = &bpf_mthp_verifier_ops,
+ .reg = bpf_mthp_ops_reg,
+ .unreg = bpf_mthp_ops_unreg,
+ .check_member = bpf_mthp_ops_check_member,
+ .init_member = bpf_mthp_ops_init_member,
+ .init = bpf_mthp_ops_init,
+ .name = "bpf_mthp_ops",
+ .owner = THIS_MODULE,
+ .cfi_stubs = &cfi_bpf_mthp_ops,
+};
+
+/*
+ * Register the bpf_mthp_ops struct_ops type. Runs as a late_initcall; a
+ * failure is logged and its error code is returned.
+ */
+static int __init bpf_huge_memory_init(void)
+{
+	int ret = register_bpf_struct_ops(&bso_bpf_mthp_ops, bpf_mthp_ops);
+
+	if (!ret)
+		return 0;
+
+	pr_warn("Registration of bpf_mthp_ops failed, err %d\n", ret);
+	return ret;
+}
+late_initcall(bpf_huge_memory_init);
--
2.53.0
next prev parent reply other threads:[~2026-05-03 16:51 UTC|newest]
Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-03 16:50 [PATCH 0/4] mm: introduce mthp_ext via cgroup-bpf to make mTHP more transparent Vernon Yang
2026-05-03 16:50 ` [PATCH 1/4] psi: add psi_group_flush_stats() function Vernon Yang
2026-05-03 16:50 ` [PATCH 2/4] bpf: add bpf_cgroup_{flush_stats,stall} function Vernon Yang
2026-05-03 17:23 ` bot+bpf-ci
2026-05-06 12:38 ` Vernon Yang
2026-05-03 17:25 ` sashiko-bot
2026-05-06 12:55 ` Vernon Yang
2026-05-03 16:50 ` Vernon Yang [this message]
2026-05-03 17:35 ` [PATCH 3/4] mm: introduce bpf_mthp_ops struct ops bot+bpf-ci
2026-05-06 13:06 ` Vernon Yang
2026-05-03 17:41 ` sashiko-bot
2026-05-06 13:26 ` Vernon Yang
2026-05-03 16:50 ` [PATCH 4/4] samples: bpf: add mthp_ext Vernon Yang
2026-05-03 17:35 ` bot+bpf-ci
2026-05-06 13:30 ` Vernon Yang
2026-05-03 17:57 ` sashiko-bot
2026-05-06 13:50 ` Vernon Yang
2026-05-07 3:34 ` [PATCH 0/4] mm: introduce mthp_ext via cgroup-bpf to make mTHP more transparent Yafang Shao
2026-05-07 12:50 ` Vernon Yang
2026-05-07 13:18 ` Yafang Shao
2026-05-07 15:19 ` Vernon Yang
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260503165024.1526680-4-vernon2gm@gmail.com \
--to=vernon2gm@gmail.com \
--cc=akpm@linux-foundation.org \
--cc=ast@kernel.org \
--cc=baohua@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=david@kernel.org \
--cc=dev.jain@arm.com \
--cc=inwardvessel@gmail.com \
--cc=lance.yang@linux.dev \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=ljs@kernel.org \
--cc=roman.gushchin@linux.dev \
--cc=shakeel.butt@linux.dev \
--cc=surenb@google.com \
--cc=yanglincheng@kylinos.cn \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.