All of lore.kernel.org
 help / color / mirror / Atom feed
From: Vernon Yang <vernon2gm@gmail.com>
To: akpm@linux-foundation.org, david@kernel.org, ljs@kernel.org,
	roman.gushchin@linux.dev, inwardvessel@gmail.com,
	shakeel.butt@linux.dev, ast@kernel.org, daniel@iogearbox.net,
	surenb@google.com
Cc: tz2294@columbia.edu, baohua@kernel.org, lance.yang@linux.dev,
	dev.jain@arm.com, laoar.shao@gmail.com,
	gutierrez.asier@huawei-partners.com,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	bpf@vger.kernel.org, Vernon Yang <yanglincheng@kylinos.cn>
Subject: [PATCH v2 3/4] mm: introduce bpf_mthp_ops struct ops
Date: Fri,  8 May 2026 23:00:54 +0800	[thread overview]
Message-ID: <20260508150055.680136-4-vernon2gm@gmail.com> (raw)
In-Reply-To: <20260508150055.680136-1-vernon2gm@gmail.com>

From: Vernon Yang <yanglincheng@kylinos.cn>

Introducing bpf_mthp_ops enables eBPF programs to register the
mthp_choose callback function via cgroup-ebpf.

Using cgroup-bpf to customize mTHP size for different scenarios,
automatically select different mTHP sizes for different cgroups,
let's focus on making them truly transparent.

Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
---
 MAINTAINERS                     |   3 +
 include/linux/bpf_huge_memory.h |  52 ++++++++++
 include/linux/cgroup-defs.h     |   1 +
 include/linux/huge_mm.h         |   6 ++
 kernel/cgroup/cgroup.c          |   2 +
 mm/Kconfig                      |  14 +++
 mm/Makefile                     |   1 +
 mm/bpf_huge_memory.c            | 168 ++++++++++++++++++++++++++++++++
 8 files changed, 247 insertions(+)
 create mode 100644 include/linux/bpf_huge_memory.h
 create mode 100644 mm/bpf_huge_memory.c

diff --git a/MAINTAINERS b/MAINTAINERS
index caaa0d6e6056..f1113eaa1193 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4887,7 +4887,10 @@ M:	Shakeel Butt <shakeel.butt@linux.dev>
 L:	bpf@vger.kernel.org
 L:	linux-mm@kvack.org
 S:	Maintained
+F:	include/linux/bpf_huge_memory.h
+F:	mm/bpf_huge_memory.c
 F:	mm/bpf_memcontrol.c
+F:	samples/bpf/mthp_ext.*
 
 BPF [MISC]
 L:	bpf@vger.kernel.org
diff --git a/include/linux/bpf_huge_memory.h b/include/linux/bpf_huge_memory.h
new file mode 100644
index 000000000000..ffda445c9572
--- /dev/null
+++ b/include/linux/bpf_huge_memory.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#ifndef __BPF_HUGE_MEMORY_H
+#define __BPF_HUGE_MEMORY_H
+
+#include <linux/cgroup-defs.h>
+
+/**
+ * struct bpf_mthp_ops - BPF callbacks for mTHP operations
+ * @mthp_choose: Choose the custom mTHP orders
+ *
+ * This structure defines the interface for BPF programs to customize
+ * mTHP behavior through struct_ops programs.
+ */
+struct bpf_mthp_ops {
+	unsigned long (*mthp_choose)(struct cgroup *cgrp, unsigned long orders);
+};
+
+#ifdef CONFIG_BPF_TRANSPARENT_HUGEPAGE
+/**
+ * bpf_mthp_choose - Choose the custom mTHP orders using bpf
+ * @mm: task mm_struct
+ * @orders: original orders
+ *
+ * Return suited mTHP orders.
+ */
+unsigned long bpf_mthp_choose(struct mm_struct *mm, unsigned long orders);
+
+/**
+ * cgroup_bpf_set_mthp_ops - Set sub-cgroup mthp_ops to parent cgroup
+ * @cgrp: want to set mthp_ops of sub-cgroup
+ * @parent: parent cgroup
+ */
+static inline void cgroup_bpf_set_mthp_ops(struct cgroup *cgrp,
+					   struct cgroup *parent)
+{
+	WRITE_ONCE(cgrp->mthp_ops, parent->mthp_ops);
+}
+#else
+static inline unsigned long bpf_mthp_choose(struct mm_struct *mm,
+					    unsigned long orders)
+{
+	return orders;
+}
+static inline void cgroup_bpf_set_mthp_ops(struct cgroup *cgrp,
+					   struct cgroup *parent)
+{
+}
+#endif /* CONFIG_BPF_TRANSPARENT_HUGEPAGE */
+
+#endif /* __BPF_HUGE_MEMORY_H */
+
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index f42563739d2e..78854d0e06ab 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -628,6 +628,7 @@ struct cgroup {
 
 #ifdef CONFIG_BPF_SYSCALL
 	struct bpf_local_storage __rcu  *bpf_cgrp_storage;
+	struct bpf_mthp_ops *mthp_ops;
 #endif
 #ifdef CONFIG_EXT_SUB_SCHED
 	struct scx_sched __rcu *scx_sched;
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 127f9e1e7604..65da35fb0980 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -3,6 +3,7 @@
 #define _LINUX_HUGE_MM_H
 
 #include <linux/mm_types.h>
+#include <linux/bpf_huge_memory.h>
 
 #include <linux/fs.h> /* only for vma_is_dax() */
 #include <linux/kobject.h>
@@ -296,6 +297,11 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
 				       enum tva_type type,
 				       unsigned long orders)
 {
+	/* The eBPF-specified orders overrides which order is selected. */
+	orders &= bpf_mthp_choose(vma->vm_mm, orders);
+	if (!orders)
+		return 0;
+
 	/*
 	 * Optimization to check if required orders are enabled early. Only
 	 * forced collapse ignores sysfs configs.
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 43adc96c7f1a..1dbef3e8b179 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5836,6 +5836,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
 	if (ret)
 		goto out_stat_exit;
 
+	cgroup_bpf_set_mthp_ops(cgrp, parent);
+
 	for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
 		cgrp->ancestors[tcgrp->level] = tcgrp;
 
diff --git a/mm/Kconfig b/mm/Kconfig
index 27dc5b0139ba..be49bde783a7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -949,6 +949,20 @@ config NO_PAGE_MAPCOUNT
 
 	  EXPERIMENTAL because the impact of some changes is still unclear.
 
+config BPF_TRANSPARENT_HUGEPAGE
+	bool "BPF-based transparent hugepage (EXPERIMENTAL)"
+	depends on TRANSPARENT_HUGEPAGE && CGROUP_BPF
+	help
+	  Using cgroup-bpf to customize mTHP size for different scenarios,
+	  automatically select different mTHP sizes for different cgroups,
+	  let's focus on making them truly transparent.
+
+	  This is an experimental feature, that might go away at any time,
+	  Please do not rely any production environment.
+
+	  EXPERIMENTAL because the BPF interface is unstable and may be removed
+	  at any time.
+
 endif # TRANSPARENT_HUGEPAGE
 
 # simple helper to make the code a bit easier to read
diff --git a/mm/Makefile b/mm/Makefile
index 8ad2ab08244e..b474c21c3253 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -108,6 +108,7 @@ obj-$(CONFIG_MEMCG) += swap_cgroup.o
 endif
 ifdef CONFIG_BPF_SYSCALL
 obj-$(CONFIG_MEMCG) += bpf_memcontrol.o
+obj-$(CONFIG_BPF_TRANSPARENT_HUGEPAGE) += bpf_huge_memory.o
 endif
 obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
 obj-$(CONFIG_GUP_TEST) += gup_test.o
diff --git a/mm/bpf_huge_memory.c b/mm/bpf_huge_memory.c
new file mode 100644
index 000000000000..851c6ebe2933
--- /dev/null
+++ b/mm/bpf_huge_memory.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Huge memory related BPF code
+ *
+ * Author: Vernon Yang <yanglincheng@kylinos.cn>
+ */
+
+#include <linux/bpf.h>
+#include <linux/srcu.h>
+
+/* Protects cgrp->mthp_ops pointer for read and write. */
+DEFINE_SRCU(mthp_bpf_srcu);
+
+unsigned long bpf_mthp_choose(struct mm_struct *mm, unsigned long orders)
+{
+	struct cgroup *cgrp;
+	struct mem_cgroup *memcg;
+	struct bpf_mthp_ops *ops;
+	int idx;
+
+	memcg = get_mem_cgroup_from_mm(mm);
+	if (!memcg)
+		return orders;
+
+	cgrp = memcg->css.cgroup;
+
+	idx = srcu_read_lock(&mthp_bpf_srcu);
+	ops = READ_ONCE(cgrp->mthp_ops);
+	if (unlikely(ops && ops->mthp_choose))
+		orders = ops->mthp_choose(cgrp, orders);
+	srcu_read_unlock(&mthp_bpf_srcu, idx);
+
+	mem_cgroup_put(memcg);
+
+	return orders;
+}
+
+static int bpf_mthp_ops_btf_struct_access(struct bpf_verifier_log *log,
+		const struct bpf_reg_state *reg, int off, int size)
+{
+	return -EACCES;
+}
+
+static bool bpf_mthp_ops_is_valid_access(int off, int size, enum bpf_access_type type,
+		const struct bpf_prog *prog, struct bpf_insn_access_aux *info)
+{
+	return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
+}
+
+const struct bpf_verifier_ops bpf_mthp_verifier_ops = {
+	.get_func_proto = bpf_base_func_proto,
+	.btf_struct_access = bpf_mthp_ops_btf_struct_access,
+	.is_valid_access = bpf_mthp_ops_is_valid_access,
+};
+
+static int bpf_mthp_ops_reg(void *kdata, struct bpf_link *link)
+{
+	struct bpf_struct_ops_link *st_link = (struct bpf_struct_ops_link *)link;
+	struct bpf_mthp_ops *ops = kdata;
+	struct cgroup_subsys_state *child;
+	struct cgroup *cgrp;
+
+	if (!link)
+		return -EOPNOTSUPP;
+
+	cgrp = st_link->cgroup;
+	if (!cgrp)
+		return -EINVAL;
+
+	cgroup_lock();
+	css_for_each_descendant_pre(child, &cgrp->self) {
+		if (READ_ONCE(child->cgroup->mthp_ops)) {
+			pr_warn("sub-cgroup has already registered.\n");
+			cgroup_unlock();
+			return -EBUSY;
+		}
+	}
+	css_for_each_descendant_pre(child, &cgrp->self)
+		WRITE_ONCE(child->cgroup->mthp_ops, ops);
+	cgroup_unlock();
+
+	return 0;
+}
+
+static void bpf_mthp_ops_unreg(void *kdata, struct bpf_link *link)
+{
+	struct bpf_struct_ops_link *st_link = (struct bpf_struct_ops_link *)link;
+	struct cgroup_subsys_state *child;
+	struct cgroup *cgrp;
+
+	if (!link)
+		return;
+
+	cgrp = st_link->cgroup;
+	if (!cgrp)
+		return;
+
+	cgroup_lock();
+	css_for_each_descendant_pre(child, &cgrp->self)
+		WRITE_ONCE(child->cgroup->mthp_ops, NULL);
+	cgroup_unlock();
+
+	synchronize_srcu(&mthp_bpf_srcu);
+}
+
+static int bpf_mthp_ops_check_member(const struct btf_type *t,
+				     const struct btf_member *member,
+				     const struct bpf_prog *prog)
+{
+	u32 moff = __btf_member_bit_offset(t, member) / 8;
+
+	switch (moff) {
+	case offsetof(struct bpf_mthp_ops, mthp_choose):
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (prog->sleepable)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int bpf_mthp_ops_init_member(const struct btf_type *t,
+				    const struct btf_member *member,
+				    void *kdata, const void *udata)
+{
+	return 0;
+}
+
+static int bpf_mthp_ops_init(struct btf *btf)
+{
+	return 0;
+}
+
+static unsigned long cfi_mthp_choose(struct cgroup *cgrp, unsigned long orders)
+{
+	return 0;
+}
+
+static struct bpf_mthp_ops cfi_bpf_mthp_ops = {
+	.mthp_choose = cfi_mthp_choose,
+};
+
+static struct bpf_struct_ops bso_bpf_mthp_ops = {
+	.verifier_ops = &bpf_mthp_verifier_ops,
+	.reg = bpf_mthp_ops_reg,
+	.unreg = bpf_mthp_ops_unreg,
+	.check_member = bpf_mthp_ops_check_member,
+	.init_member = bpf_mthp_ops_init_member,
+	.init = bpf_mthp_ops_init,
+	.name = "bpf_mthp_ops",
+	.owner = THIS_MODULE,
+	.cfi_stubs = &cfi_bpf_mthp_ops,
+};
+
+static int __init bpf_huge_memory_init(void)
+{
+	int err;
+
+	err = register_bpf_struct_ops(&bso_bpf_mthp_ops, bpf_mthp_ops);
+	if (err)
+		pr_warn("Registration of bpf_mthp_ops failed, err %d\n", err);
+
+	return err;
+}
+late_initcall(bpf_huge_memory_init);
-- 
2.53.0


  parent reply	other threads:[~2026-05-08 15:01 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-08 15:00 [PATCH v2 0/4] mm: introduce mthp_ext via cgroup-bpf to make mTHP more transparent Vernon Yang
2026-05-08 15:00 ` [PATCH v2 1/4] psi: add psi_group_flush_stats() function Vernon Yang
2026-05-08 15:19   ` Lorenzo Stoakes
2026-05-08 21:36   ` sashiko-bot
2026-05-08 15:00 ` [PATCH v2 2/4] bpf: add bpf_cgroup_{flush_stats,stall} function Vernon Yang
2026-05-08 15:40   ` bot+bpf-ci
2026-05-08 22:01   ` sashiko-bot
2026-05-08 15:00 ` Vernon Yang [this message]
2026-05-08 15:40   ` [PATCH v2 3/4] mm: introduce bpf_mthp_ops struct ops bot+bpf-ci
2026-05-08 15:57   ` Lorenzo Stoakes
2026-05-08 20:54   ` David Hildenbrand (Arm)
2026-05-11 11:25     ` Lorenzo Stoakes
2026-05-08 22:29   ` sashiko-bot
2026-05-08 15:00 ` [PATCH v2 4/4] samples: bpf: add mthp_ext Vernon Yang
2026-05-08 15:40   ` bot+bpf-ci
2026-05-08 22:52   ` sashiko-bot
2026-05-08 15:14 ` [PATCH v2 0/4] mm: introduce mthp_ext via cgroup-bpf to make mTHP more transparent Lorenzo Stoakes
2026-05-08 16:05   ` Lorenzo Stoakes
2026-05-08 16:53     ` Vernon Yang
2026-05-11 11:20       ` Lorenzo Stoakes
2026-05-08 16:00 ` Pedro Falcato
2026-05-08 16:15   ` Lorenzo Stoakes

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260508150055.680136-4-vernon2gm@gmail.com \
    --to=vernon2gm@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=ast@kernel.org \
    --cc=baohua@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=david@kernel.org \
    --cc=dev.jain@arm.com \
    --cc=gutierrez.asier@huawei-partners.com \
    --cc=inwardvessel@gmail.com \
    --cc=lance.yang@linux.dev \
    --cc=laoar.shao@gmail.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=ljs@kernel.org \
    --cc=roman.gushchin@linux.dev \
    --cc=shakeel.butt@linux.dev \
    --cc=surenb@google.com \
    --cc=tz2294@columbia.edu \
    --cc=yanglincheng@kylinos.cn \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.