From: Yafang Shao <laoar.shao@gmail.com>
To: akpm@linux-foundation.org, david@redhat.com, ziy@nvidia.com,
baolin.wang@linux.alibaba.com, lorenzo.stoakes@oracle.com,
Liam.Howlett@oracle.com, npache@redhat.com, ryan.roberts@arm.com,
dev.jain@arm.com, hannes@cmpxchg.org, usamaarif642@gmail.com,
gutierrez.asier@huawei-partners.com, willy@infradead.org,
ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org,
ameryhung@gmail.com, rientjes@google.com, corbet@lwn.net
Cc: bpf@vger.kernel.org, linux-mm@kvack.org,
linux-doc@vger.kernel.org, Yafang Shao <laoar.shao@gmail.com>
Subject: [PATCH v6 mm-new 05/10] selftests/bpf: add a simple BPF based THP policy
Date: Tue, 26 Aug 2025 15:19:43 +0800
Message-ID: <20250826071948.2618-6-laoar.shao@gmail.com>
In-Reply-To: <20250826071948.2618-1-laoar.shao@gmail.com>
This selftest verifies that PMD-mapped THP allocation is restricted during
page faults for tasks within a specific cgroup, while THP allocation via
khugepaged is still permitted.
Since THP allocation depends on various factors (e.g., system memory
pressure), using the actual allocated THP size for validation is
unreliable. Instead, we check the return value of get_suggested_order(),
which indicates whether the system intends to allocate a THP, regardless of
whether the allocation ultimately succeeds.
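As a rough sketch, the check in the fexit program added below boils down
to the following (pf_alloc and pf_disallow are counters defined in the
BPF program):

    if (retval == (1 << pmd_order))     /* kernel intends to allocate a PMD-mapped THP */
            pf_alloc++;
    else if (!retval)                   /* the BPF policy disallowed THP here */
            pf_disallow++;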
This test case defines a simple THP policy. The policy permits
PMD-mapped THP allocation through khugepaged for tasks in a designated
cgroup, but prohibits it for all other tasks and contexts, including the
page fault handler. However, khugepaged might not run immediately during
this test, making its count metrics unreliable.
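With the usual BPF selftest setup, the test can be run via the test_progs
runner, for example:

    cd tools/testing/selftests/bpf
    ./test_progs -t thp_adjust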
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
tools/testing/selftests/bpf/config | 3 +
.../selftests/bpf/prog_tests/thp_adjust.c | 254 ++++++++++++++++++
.../selftests/bpf/progs/test_thp_adjust.c | 76 ++++++
3 files changed, 333 insertions(+)
create mode 100644 tools/testing/selftests/bpf/prog_tests/thp_adjust.c
create mode 100644 tools/testing/selftests/bpf/progs/test_thp_adjust.c
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 8916ab814a3e..27f0249c7600 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -26,6 +26,7 @@ CONFIG_DMABUF_HEAPS=y
CONFIG_DMABUF_HEAPS_SYSTEM=y
CONFIG_DUMMY=y
CONFIG_DYNAMIC_FTRACE=y
+CONFIG_EXPERIMENTAL_BPF_ORDER_SELECTION=y
CONFIG_FPROBE=y
CONFIG_FTRACE_SYSCALLS=y
CONFIG_FUNCTION_ERROR_INJECTION=y
@@ -51,6 +52,7 @@ CONFIG_IPV6_TUNNEL=y
CONFIG_KEYS=y
CONFIG_LIRC=y
CONFIG_LWTUNNEL=y
+CONFIG_MEMCG=y
CONFIG_MODULE_SIG=y
CONFIG_MODULE_SRCVERSION_ALL=y
CONFIG_MODULE_UNLOAD=y
@@ -114,6 +116,7 @@ CONFIG_SECURITY=y
CONFIG_SECURITYFS=y
CONFIG_SYN_COOKIES=y
CONFIG_TEST_BPF=m
+CONFIG_TRANSPARENT_HUGEPAGE=y
CONFIG_UDMABUF=y
CONFIG_USERFAULTFD=y
CONFIG_VSOCKETS=y
diff --git a/tools/testing/selftests/bpf/prog_tests/thp_adjust.c b/tools/testing/selftests/bpf/prog_tests/thp_adjust.c
new file mode 100644
index 000000000000..a4a34ee28301
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/thp_adjust.c
@@ -0,0 +1,254 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <math.h>
+#include <sys/mman.h>
+#include <test_progs.h>
+#include "cgroup_helpers.h"
+#include "test_thp_adjust.skel.h"
+
+#define LEN (16 * 1024 * 1024) /* 16MB */
+#define THP_ENABLED_FILE "/sys/kernel/mm/transparent_hugepage/enabled"
+#define PMD_SIZE_FILE "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
+
+static struct test_thp_adjust *skel;
+static char *thp_addr, old_mode[32];
+static long pagesize;
+
+static int thp_mode_save(void)
+{
+ const char *start, *end;
+	char buf[128] = {};	/* zero-fill so strchr() always sees a NUL-terminated string */
+ int fd, err;
+ size_t len;
+
+ fd = open(THP_ENABLED_FILE, O_RDONLY);
+ if (fd == -1)
+ return -1;
+
+ err = read(fd, buf, sizeof(buf) - 1);
+ if (err == -1)
+ goto close;
+
+ start = strchr(buf, '[');
+ end = start ? strchr(start, ']') : NULL;
+ if (!start || !end || end <= start) {
+ err = -1;
+ goto close;
+ }
+
+ len = end - start - 1;
+ if (len >= sizeof(old_mode))
+ len = sizeof(old_mode) - 1;
+ strncpy(old_mode, start + 1, len);
+ old_mode[len] = '\0';
+
+close:
+ close(fd);
+ return err;
+}
+
+static int thp_mode_set(const char *desired_mode)
+{
+ int fd, err;
+
+ fd = open(THP_ENABLED_FILE, O_RDWR);
+ if (fd == -1)
+ return -1;
+
+ err = write(fd, desired_mode, strlen(desired_mode));
+ close(fd);
+ return err;
+}
+
+static int thp_mode_reset(void)
+{
+ int fd, err;
+
+ fd = open(THP_ENABLED_FILE, O_WRONLY);
+ if (fd == -1)
+ return -1;
+
+ err = write(fd, old_mode, strlen(old_mode));
+ close(fd);
+ return err;
+}
+
+static int thp_alloc(void)
+{
+ int err, i;
+
+ thp_addr = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (thp_addr == MAP_FAILED)
+ return -1;
+
+ err = madvise(thp_addr, LEN, MADV_HUGEPAGE);
+ if (err == -1)
+ goto unmap;
+
+ /* Accessing a single byte within a page is sufficient to trigger a page fault. */
+ for (i = 0; i < LEN; i += pagesize)
+ thp_addr[i] = 1;
+ return 0;
+
+unmap:
+ munmap(thp_addr, LEN);
+ return -1;
+}
+
+static void thp_free(void)
+{
+ if (!thp_addr)
+ return;
+ munmap(thp_addr, LEN);
+}
+
+static int get_pmd_order(void)
+{
+ ssize_t bytes_read, size;
+ int fd, order, ret = -1;
+ char buf[64], *endptr;
+
+ fd = open(PMD_SIZE_FILE, O_RDONLY);
+ if (fd < 0)
+ return -1;
+
+ bytes_read = read(fd, buf, sizeof(buf) - 1);
+ if (bytes_read <= 0)
+ goto close_fd;
+
+ /* Remove potential newline character */
+ if (buf[bytes_read - 1] == '\n')
+ buf[bytes_read - 1] = '\0';
+
+ size = strtoul(buf, &endptr, 10);
+ if (endptr == buf || *endptr != '\0')
+ goto close_fd;
+ if (size % pagesize != 0)
+ goto close_fd;
+ ret = size / pagesize;
+	if (ret & (ret - 1)) {
+		ret = -1;	/* PMD size must be a power-of-two multiple of the page size */
+		goto close_fd;
+	}
+
+	for (order = 0; ret > 1; ret >>= 1)
+		order++;
+	ret = order;
+
+close_fd:
+ close(fd);
+ return ret;
+}
+
+static void subtest_thp_policy(void)
+{
+ struct bpf_link *fentry_link, *ops_link;
+
+	/* After attaching the struct_ops, THP will be allocated only in khugepaged. */
+ ops_link = bpf_map__attach_struct_ops(skel->maps.khugepaged_ops);
+ if (!ASSERT_OK_PTR(ops_link, "attach struct_ops"))
+ return;
+
+ /* Create a new BPF program to detect the result. */
+ fentry_link = bpf_program__attach_trace(skel->progs.thp_run);
+ if (!ASSERT_OK_PTR(fentry_link, "attach fentry"))
+ goto detach_ops;
+ if (!ASSERT_NEQ(thp_alloc(), -1, "THP alloc"))
+ goto detach;
+
+ if (!ASSERT_EQ(skel->bss->pf_alloc, 0, "alloc_in_pf"))
+ goto thp_free;
+ if (!ASSERT_GT(skel->bss->pf_disallow, 0, "disallow_in_pf"))
+ goto thp_free;
+
+ ASSERT_EQ(skel->bss->khugepaged_disallow, 0, "disallow_in_khugepaged");
+thp_free:
+ thp_free();
+detach:
+ bpf_link__destroy(fentry_link);
+detach_ops:
+ bpf_link__destroy(ops_link);
+}
+
+static int thp_adjust_setup(void)
+{
+ int err, cgrp_fd, cgrp_id, pmd_order;
+
+ pagesize = sysconf(_SC_PAGESIZE);
+ pmd_order = get_pmd_order();
+ if (!ASSERT_NEQ(pmd_order, -1, "get_pmd_order"))
+ return -1;
+
+ err = setup_cgroup_environment();
+ if (!ASSERT_OK(err, "cgrp_env_setup"))
+ return -1;
+
+ cgrp_fd = create_and_get_cgroup("thp_adjust");
+ if (!ASSERT_GE(cgrp_fd, 0, "create_and_get_cgroup"))
+ goto cleanup;
+ close(cgrp_fd);
+
+ err = join_cgroup("thp_adjust");
+ if (!ASSERT_OK(err, "join_cgroup"))
+ goto remove_cgrp;
+
+ err = -1;
+ cgrp_id = get_cgroup_id("thp_adjust");
+	if (!ASSERT_GE(cgrp_id, 0, "get_cgroup_id"))
+ goto join_root;
+
+ if (!ASSERT_NEQ(thp_mode_save(), -1, "THP mode save"))
+ goto join_root;
+ if (!ASSERT_GE(thp_mode_set("madvise"), 0, "THP mode set"))
+ goto join_root;
+
+ skel = test_thp_adjust__open();
+ if (!ASSERT_OK_PTR(skel, "open"))
+ goto thp_reset;
+
+ skel->bss->cgrp_id = cgrp_id;
+ skel->bss->pmd_order = pmd_order;
+
+ err = test_thp_adjust__load(skel);
+ if (!ASSERT_OK(err, "load"))
+ goto destroy;
+ return 0;
+
+destroy:
+ test_thp_adjust__destroy(skel);
+thp_reset:
+ ASSERT_GE(thp_mode_reset(), 0, "THP mode reset");
+join_root:
+ /* We must join the root cgroup before removing the created cgroup. */
+ err = join_root_cgroup();
+ ASSERT_OK(err, "join_cgroup to root");
+remove_cgrp:
+ remove_cgroup("thp_adjust");
+cleanup:
+ cleanup_cgroup_environment();
+	return -1;
+}
+
+static void thp_adjust_destroy(void)
+{
+ int err;
+
+ test_thp_adjust__destroy(skel);
+ ASSERT_GE(thp_mode_reset(), 0, "THP mode reset");
+ err = join_root_cgroup();
+ ASSERT_OK(err, "join_cgroup to root");
+ if (!err)
+ remove_cgroup("thp_adjust");
+ cleanup_cgroup_environment();
+}
+
+void test_thp_adjust(void)
+{
+ if (thp_adjust_setup() == -1)
+ return;
+
+ if (test__start_subtest("alloc_in_khugepaged"))
+ subtest_thp_policy();
+
+ thp_adjust_destroy();
+}
diff --git a/tools/testing/selftests/bpf/progs/test_thp_adjust.c b/tools/testing/selftests/bpf/progs/test_thp_adjust.c
new file mode 100644
index 000000000000..635915f31786
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_thp_adjust.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+int pf_alloc, pf_disallow, khugepaged_disallow;
+struct mm_struct *target_mm;
+int pmd_order, cgrp_id;
+
+/* Detecting whether a task can successfully allocate THP is unreliable because
+ * it may be influenced by system memory pressure. Instead of making the result
+ * dependent on unpredictable factors, we should simply check
+ * get_suggested_order()'s return value, which is deterministic.
+ */
+SEC("fexit/get_suggested_order")
+int BPF_PROG(thp_run, struct mm_struct *mm, struct vm_area_struct *vma__nullable,
+ u64 vma_flags, u64 tva_flags, int orders, int retval)
+{
+ if (mm != target_mm)
+ return 0;
+
+ if (orders != (1 << pmd_order))
+ return 0;
+
+ if (tva_flags == TVA_PAGEFAULT) {
+ if (retval == (1 << pmd_order))
+ pf_alloc++;
+ else if (!retval)
+ pf_disallow++;
+ } else if (tva_flags == TVA_KHUGEPAGED || tva_flags == -1) {
+ /* khugepaged is not triggered immediately, so its allocation
+ * counts are unreliable.
+ */
+ if (!retval)
+ khugepaged_disallow++;
+ }
+ return 0;
+}
+
+SEC("struct_ops/get_suggested_order")
+int BPF_PROG(alloc_in_khugepaged, struct mm_struct *mm, struct vm_area_struct *vma__nullable,
+ u64 vma_flags, enum tva_type tva_flags, int orders)
+{
+ struct mem_cgroup *memcg;
+ int suggested_orders = 0;
+
+ if (orders != (1 << pmd_order))
+ return 0;
+
+ /* Only works when CONFIG_MEMCG is enabled. */
+ memcg = bpf_mm_get_mem_cgroup(mm);
+ if (!memcg)
+ return 0;
+
+ if (memcg->css.cgroup->kn->id == cgrp_id) {
+ if (!target_mm)
+ target_mm = mm;
+
+ /* BPF THP allocation policy:
+		 * - Allow PMD allocation in khugepaged only
+ */
+ if (tva_flags == TVA_KHUGEPAGED || tva_flags == -1)
+ suggested_orders = orders;
+ }
+
+ bpf_put_mem_cgroup(memcg);
+ return suggested_orders;
+}
+
+SEC(".struct_ops.link")
+struct bpf_thp_ops khugepaged_ops = {
+ .get_suggested_order = (void *)alloc_in_khugepaged,
+};
--
2.47.3