The Linux Kernel Mailing List
 help / color / mirror / Atom feed
From: Vernon Yang <vernon2gm@gmail.com>
To: akpm@linux-foundation.org, david@kernel.org, ljs@kernel.org,
	roman.gushchin@linux.dev, inwardvessel@gmail.com,
	shakeel.butt@linux.dev, ast@kernel.org, daniel@iogearbox.net,
	surenb@google.com
Cc: tz2294@columbia.edu, baohua@kernel.org, lance.yang@linux.dev,
	dev.jain@arm.com, laoar.shao@gmail.com,
	gutierrez.asier@huawei-partners.com,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	bpf@vger.kernel.org, Vernon Yang <yanglincheng@kylinos.cn>
Subject: [PATCH v2 3/4] mm: introduce bpf_mthp_ops struct ops
Date: Fri,  8 May 2026 23:00:54 +0800	[thread overview]
Message-ID: <20260508150055.680136-4-vernon2gm@gmail.com> (raw)
In-Reply-To: <20260508150055.680136-1-vernon2gm@gmail.com>

From: Vernon Yang <yanglincheng@kylinos.cn>

Introduce the bpf_mthp_ops struct ops, which enables eBPF programs to
register the mthp_choose callback function via cgroup-bpf.

Use cgroup-bpf to customize the mTHP size for different scenarios and to
automatically select different mTHP sizes for different cgroups, with the
goal of making mTHP truly transparent.

Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
---
 MAINTAINERS                     |   3 +
 include/linux/bpf_huge_memory.h |  52 ++++++++++
 include/linux/cgroup-defs.h     |   1 +
 include/linux/huge_mm.h         |   6 ++
 kernel/cgroup/cgroup.c          |   2 +
 mm/Kconfig                      |  14 +++
 mm/Makefile                     |   1 +
 mm/bpf_huge_memory.c            | 168 ++++++++++++++++++++++++++++++++
 8 files changed, 247 insertions(+)
 create mode 100644 include/linux/bpf_huge_memory.h
 create mode 100644 mm/bpf_huge_memory.c

diff --git a/MAINTAINERS b/MAINTAINERS
index caaa0d6e6056..f1113eaa1193 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4887,7 +4887,10 @@ M:	Shakeel Butt <shakeel.butt@linux.dev>
 L:	bpf@vger.kernel.org
 L:	linux-mm@kvack.org
 S:	Maintained
+F:	include/linux/bpf_huge_memory.h
+F:	mm/bpf_huge_memory.c
 F:	mm/bpf_memcontrol.c
+F:	samples/bpf/mthp_ext.*
 
 BPF [MISC]
 L:	bpf@vger.kernel.org
diff --git a/include/linux/bpf_huge_memory.h b/include/linux/bpf_huge_memory.h
new file mode 100644
index 000000000000..ffda445c9572
--- /dev/null
+++ b/include/linux/bpf_huge_memory.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#ifndef __BPF_HUGE_MEMORY_H
+#define __BPF_HUGE_MEMORY_H
+
+#include <linux/cgroup-defs.h>
+
+/**
+ * struct bpf_mthp_ops - BPF callbacks for mTHP operations
+ * @mthp_choose: Pick the orders to use for a cgroup from the candidate orders
+ *
+ * This structure defines the interface that BPF struct_ops programs
+ * implement to customize mTHP behavior.
+ */
+struct bpf_mthp_ops {
+	unsigned long (*mthp_choose)(struct cgroup *cgrp, unsigned long orders);
+};
+
+#ifdef CONFIG_BPF_TRANSPARENT_HUGEPAGE
+/**
+ * bpf_mthp_choose - Choose the custom mTHP orders using bpf
+ * @mm: mm_struct whose memory cgroup is looked up
+ * @orders: original orders
+ *
+ * Return: the orders picked by the cgroup's BPF callback, else @orders.
+ */
+unsigned long bpf_mthp_choose(struct mm_struct *mm, unsigned long orders);
+
+/**
+ * cgroup_bpf_set_mthp_ops - Inherit mthp_ops from the parent cgroup
+ * @cgrp: cgroup whose mthp_ops pointer is set
+ * @parent: cgroup whose mthp_ops pointer is copied into @cgrp
+ */
+static inline void cgroup_bpf_set_mthp_ops(struct cgroup *cgrp,
+					   struct cgroup *parent)
+{
+	WRITE_ONCE(cgrp->mthp_ops, parent->mthp_ops);
+}
+#else
+static inline unsigned long bpf_mthp_choose(struct mm_struct *mm,
+					    unsigned long orders)
+{
+	return orders;
+}
+static inline void cgroup_bpf_set_mthp_ops(struct cgroup *cgrp,
+					   struct cgroup *parent)
+{
+}
+#endif /* CONFIG_BPF_TRANSPARENT_HUGEPAGE */
+
+#endif /* __BPF_HUGE_MEMORY_H */
+
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index f42563739d2e..78854d0e06ab 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -628,6 +628,7 @@ struct cgroup {
 
 #ifdef CONFIG_BPF_SYSCALL
 	struct bpf_local_storage __rcu  *bpf_cgrp_storage;
+	struct bpf_mthp_ops *mthp_ops;
 #endif
 #ifdef CONFIG_EXT_SUB_SCHED
 	struct scx_sched __rcu *scx_sched;
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 127f9e1e7604..65da35fb0980 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -3,6 +3,7 @@
 #define _LINUX_HUGE_MM_H
 
 #include <linux/mm_types.h>
+#include <linux/bpf_huge_memory.h>
 
 #include <linux/fs.h> /* only for vma_is_dax() */
 #include <linux/kobject.h>
@@ -296,6 +297,11 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
 				       enum tva_type type,
 				       unsigned long orders)
 {
+	/* Let an attached eBPF program restrict the candidate orders. */
+	orders &= bpf_mthp_choose(vma->vm_mm, orders);
+	if (!orders)
+		return 0;
+
 	/*
 	 * Optimization to check if required orders are enabled early. Only
 	 * forced collapse ignores sysfs configs.
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 43adc96c7f1a..1dbef3e8b179 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5836,6 +5836,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
 	if (ret)
 		goto out_stat_exit;
 
+	cgroup_bpf_set_mthp_ops(cgrp, parent);
+
 	for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
 		cgrp->ancestors[tcgrp->level] = tcgrp;
 
diff --git a/mm/Kconfig b/mm/Kconfig
index 27dc5b0139ba..be49bde783a7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -949,6 +949,20 @@ config NO_PAGE_MAPCOUNT
 
 	  EXPERIMENTAL because the impact of some changes is still unclear.
 
+config BPF_TRANSPARENT_HUGEPAGE
+	bool "BPF-based transparent hugepage (EXPERIMENTAL)"
+	depends on TRANSPARENT_HUGEPAGE && CGROUP_BPF && MEMCG
+	help
+	  Use cgroup-bpf to customize the mTHP size for different scenarios
+	  and to automatically select different mTHP sizes for different
+	  cgroups, with the goal of making mTHP truly transparent.
+
+	  This is an experimental feature that might go away at any time.
+	  Please do not rely on it in any production environment.
+
+	  EXPERIMENTAL because the BPF interface is unstable and may be removed
+	  at any time.
+
 endif # TRANSPARENT_HUGEPAGE
 
 # simple helper to make the code a bit easier to read
diff --git a/mm/Makefile b/mm/Makefile
index 8ad2ab08244e..b474c21c3253 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -108,6 +108,7 @@ obj-$(CONFIG_MEMCG) += swap_cgroup.o
 endif
 ifdef CONFIG_BPF_SYSCALL
 obj-$(CONFIG_MEMCG) += bpf_memcontrol.o
+obj-$(CONFIG_BPF_TRANSPARENT_HUGEPAGE) += bpf_huge_memory.o
 endif
 obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
 obj-$(CONFIG_GUP_TEST) += gup_test.o
diff --git a/mm/bpf_huge_memory.c b/mm/bpf_huge_memory.c
new file mode 100644
index 000000000000..851c6ebe2933
--- /dev/null
+++ b/mm/bpf_huge_memory.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Huge memory related BPF code
+ *
+ * Author: Vernon Yang <yanglincheng@kylinos.cn>
+ */
+
+#include <linux/bpf.h>
+#include <linux/srcu.h>
+
+/* Readers of cgrp->mthp_ops hold this; unreg waits on it after clearing. */
+DEFINE_STATIC_SRCU(mthp_bpf_srcu);
+
+/* Let the BPF callback of @mm's memory cgroup, if any, pick the orders. */
+unsigned long bpf_mthp_choose(struct mm_struct *mm, unsigned long orders)
+{
+	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
+	struct bpf_mthp_ops *mthp_ops;
+	struct cgroup *cg;
+	int srcu_idx;
+
+	if (!memcg)
+		return orders;
+	cg = memcg->css.cgroup;
+
+	/* Pairs with synchronize_srcu() in bpf_mthp_ops_unreg(). */
+	srcu_idx = srcu_read_lock(&mthp_bpf_srcu);
+	mthp_ops = READ_ONCE(cg->mthp_ops);
+	if (unlikely(mthp_ops && mthp_ops->mthp_choose))
+		orders = mthp_ops->mthp_choose(cg, orders);
+	srcu_read_unlock(&mthp_bpf_srcu, srcu_idx);
+
+	mem_cgroup_put(memcg);
+
+	return orders;
+}
+
+static int bpf_mthp_ops_btf_struct_access(struct bpf_verifier_log *log,
+		const struct bpf_reg_state *reg, int off, int size)
+{
+	return -EACCES;
+}
+
+static bool bpf_mthp_ops_is_valid_access(int off, int size, enum bpf_access_type type,
+		const struct bpf_prog *prog, struct bpf_insn_access_aux *info)
+{
+	return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
+}
+
+static const struct bpf_verifier_ops bpf_mthp_verifier_ops = {
+	.get_func_proto = bpf_base_func_proto,
+	.btf_struct_access = bpf_mthp_ops_btf_struct_access,
+	.is_valid_access = bpf_mthp_ops_is_valid_access,
+};
+
+/* Install @kdata as mthp_ops across the subtree of the link's cgroup. */
+static int bpf_mthp_ops_reg(void *kdata, struct bpf_link *link)
+{
+	struct bpf_struct_ops_link *st_link = (struct bpf_struct_ops_link *)link;
+	struct cgroup_subsys_state *pos;
+	struct cgroup *root;
+	int ret = 0;
+
+	if (!link)
+		return -EOPNOTSUPP;
+	root = st_link->cgroup;
+	if (!root)
+		return -EINVAL;
+
+	cgroup_lock();
+	css_for_each_descendant_pre(pos, &root->self) {
+		if (READ_ONCE(pos->cgroup->mthp_ops)) {
+			pr_warn("sub-cgroup has already registered.\n");
+			ret = -EBUSY;
+			goto unlock;
+		}
+	}
+	css_for_each_descendant_pre(pos, &root->self)
+		WRITE_ONCE(pos->cgroup->mthp_ops, kdata);
+unlock:
+	cgroup_unlock();
+	return ret;
+}
+
+/* Clear mthp_ops across the subtree of the link's cgroup. */
+static void bpf_mthp_ops_unreg(void *kdata, struct bpf_link *link)
+{
+	struct bpf_struct_ops_link *st_link = (struct bpf_struct_ops_link *)link;
+	struct cgroup_subsys_state *pos;
+	struct cgroup *root;
+
+	/* Nothing to undo without a link or an attached cgroup. */
+	root = link ? st_link->cgroup : NULL;
+	if (!root)
+		return;
+
+	cgroup_lock();
+	css_for_each_descendant_pre(pos, &root->self)
+		WRITE_ONCE(pos->cgroup->mthp_ops, NULL);
+	cgroup_unlock();
+
+	/* Wait for bpf_mthp_choose() readers still using the old ops. */
+	synchronize_srcu(&mthp_bpf_srcu);
+}
+
+/*
+ * Accept only a non-sleepable program attached to the mthp_choose member.
+ */
+static int bpf_mthp_ops_check_member(const struct btf_type *t,
+				     const struct btf_member *member,
+				     const struct bpf_prog *prog)
+{
+	u32 byte_off = __btf_member_bit_offset(t, member) / 8;
+
+	/* mthp_choose is the only member a program may be attached to. */
+	if (byte_off != offsetof(struct bpf_mthp_ops, mthp_choose))
+		return -EINVAL;
+
+	if (prog->sleepable)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int bpf_mthp_ops_init_member(const struct btf_type *t,
+				    const struct btf_member *member,
+				    void *kdata, const void *udata)
+{
+	return 0;
+}
+
+static int bpf_mthp_ops_init(struct btf *btf)
+{
+	return 0;
+}
+
+static unsigned long cfi_mthp_choose(struct cgroup *cgrp, unsigned long orders)
+{
+	return 0;
+}
+
+static struct bpf_mthp_ops cfi_bpf_mthp_ops = {
+	.mthp_choose = cfi_mthp_choose,
+};
+
+static struct bpf_struct_ops bso_bpf_mthp_ops = {
+	.verifier_ops = &bpf_mthp_verifier_ops,
+	.reg = bpf_mthp_ops_reg,
+	.unreg = bpf_mthp_ops_unreg,
+	.check_member = bpf_mthp_ops_check_member,
+	.init_member = bpf_mthp_ops_init_member,
+	.init = bpf_mthp_ops_init,
+	.name = "bpf_mthp_ops",
+	.owner = THIS_MODULE,
+	.cfi_stubs = &cfi_bpf_mthp_ops,
+};
+
+/* Register the bpf_mthp_ops struct_ops type; logs a warning on failure. */
+static int __init bpf_huge_memory_init(void)
+{
+	int ret = register_bpf_struct_ops(&bso_bpf_mthp_ops, bpf_mthp_ops);
+
+	if (ret)
+		pr_warn("Registration of bpf_mthp_ops failed, err %d\n", ret);
+
+	return ret;
+}
+late_initcall(bpf_huge_memory_init);
-- 
2.53.0


  parent reply	other threads:[~2026-05-08 15:01 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-08 15:00 [PATCH v2 0/4] mm: introduce mthp_ext via cgroup-bpf to make mTHP more transparent Vernon Yang
2026-05-08 15:00 ` [PATCH v2 1/4] psi: add psi_group_flush_stats() function Vernon Yang
2026-05-08 15:19   ` Lorenzo Stoakes
2026-05-08 15:00 ` [PATCH v2 2/4] bpf: add bpf_cgroup_{flush_stats,stall} function Vernon Yang
2026-05-08 15:40   ` bot+bpf-ci
2026-05-08 15:00 ` Vernon Yang [this message]
2026-05-08 15:40   ` [PATCH v2 3/4] mm: introduce bpf_mthp_ops struct ops bot+bpf-ci
2026-05-08 15:57   ` Lorenzo Stoakes
2026-05-08 20:54   ` David Hildenbrand (Arm)
2026-05-11 11:25     ` Lorenzo Stoakes
2026-05-08 15:00 ` [PATCH v2 4/4] samples: bpf: add mthp_ext Vernon Yang
2026-05-08 15:40   ` bot+bpf-ci
2026-05-08 15:14 ` [PATCH v2 0/4] mm: introduce mthp_ext via cgroup-bpf to make mTHP more transparent Lorenzo Stoakes
2026-05-08 16:05   ` Lorenzo Stoakes
2026-05-08 16:53     ` Vernon Yang
2026-05-11 11:20       ` Lorenzo Stoakes
2026-05-08 16:00 ` Pedro Falcato
2026-05-08 16:15   ` Lorenzo Stoakes

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260508150055.680136-4-vernon2gm@gmail.com \
    --to=vernon2gm@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=ast@kernel.org \
    --cc=baohua@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=david@kernel.org \
    --cc=dev.jain@arm.com \
    --cc=gutierrez.asier@huawei-partners.com \
    --cc=inwardvessel@gmail.com \
    --cc=lance.yang@linux.dev \
    --cc=laoar.shao@gmail.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=ljs@kernel.org \
    --cc=roman.gushchin@linux.dev \
    --cc=shakeel.butt@linux.dev \
    --cc=surenb@google.com \
    --cc=tz2294@columbia.edu \
    --cc=yanglincheng@kylinos.cn \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox