linux-kselftest.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v3] selftests/mm: add process_madvise() tests
@ 2025-07-03  4:43 wang lian
  2025-07-05 19:32 ` Lorenzo Stoakes
  2025-07-08 17:44 ` Zi Yan
  0 siblings, 2 replies; 25+ messages in thread
From: wang lian @ 2025-07-03  4:43 UTC (permalink / raw)
  To: david, linux-mm, akpm, lorenzo.stoakes, sj
  Cc: lianux.mm, Liam.Howlett, brauner, gkwang, jannh, linux-kernel,
	linux-kselftest, p1ucky0923, ryncsn, shuah, vbabka, zijing.zhang

Add tests for process_madvise(), focusing on verifying behavior under
various conditions including valid usage and error cases.

Signed-off-by: wang lian <lianux.mm@gmail.com>
Suggested-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Acked-by: SeongJae Park <sj@kernel.org>
---

Changelog v3:
- Rebased onto the latest mm-stable branch to ensure clean application.
- Refactor common signal handling logic into vm_util to reduce code duplication.
- Improve test robustness and diagnostics based on community feedback.
- Address minor code style and script corrections.

Changelog v2:
- Drop MADV_DONTNEED tests based on feedback.
- Focus solely on process_madvise() syscall.
- Improve error handling and structure.
- Add future-proof flag test.
- Style and comment cleanups.

 tools/testing/selftests/mm/.gitignore      |   1 +
 tools/testing/selftests/mm/Makefile        |   1 +
 tools/testing/selftests/mm/guard-regions.c |  51 ---
 tools/testing/selftests/mm/process_madv.c  | 358 +++++++++++++++++++++
 tools/testing/selftests/mm/run_vmtests.sh  |   5 +
 tools/testing/selftests/mm/vm_util.c       |  35 ++
 tools/testing/selftests/mm/vm_util.h       |  22 ++
 7 files changed, 422 insertions(+), 51 deletions(-)
 create mode 100644 tools/testing/selftests/mm/process_madv.c

diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index 824266982aa3..95bd9c6ead9e 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -25,6 +25,7 @@ pfnmap
 protection_keys
 protection_keys_32
 protection_keys_64
+process_madv
 madv_populate
 uffd-stress
 uffd-unit-tests
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index ae6f994d3add..d13b3cef2a2b 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -85,6 +85,7 @@ TEST_GEN_FILES += mseal_test
 TEST_GEN_FILES += on-fault-limit
 TEST_GEN_FILES += pagemap_ioctl
 TEST_GEN_FILES += pfnmap
+TEST_GEN_FILES += process_madv
 TEST_GEN_FILES += thuge-gen
 TEST_GEN_FILES += transhuge-stress
 TEST_GEN_FILES += uffd-stress
diff --git a/tools/testing/selftests/mm/guard-regions.c b/tools/testing/selftests/mm/guard-regions.c
index 93af3d3760f9..4cf101b0fe5e 100644
--- a/tools/testing/selftests/mm/guard-regions.c
+++ b/tools/testing/selftests/mm/guard-regions.c
@@ -9,8 +9,6 @@
 #include <linux/limits.h>
 #include <linux/userfaultfd.h>
 #include <linux/fs.h>
-#include <setjmp.h>
-#include <signal.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -24,24 +22,6 @@
 
 #include "../pidfd/pidfd.h"
 
-/*
- * Ignore the checkpatch warning, as per the C99 standard, section 7.14.1.1:
- *
- * "If the signal occurs other than as the result of calling the abort or raise
- *  function, the behavior is undefined if the signal handler refers to any
- *  object with static storage duration other than by assigning a value to an
- *  object declared as volatile sig_atomic_t"
- */
-static volatile sig_atomic_t signal_jump_set;
-static sigjmp_buf signal_jmp_buf;
-
-/*
- * Ignore the checkpatch warning, we must read from x but don't want to do
- * anything with it in order to trigger a read page fault. We therefore must use
- * volatile to stop the compiler from optimising this away.
- */
-#define FORCE_READ(x) (*(volatile typeof(x) *)x)
-
 /*
  * How is the test backing the mapping being tested?
  */
@@ -120,14 +100,6 @@ static int userfaultfd(int flags)
 	return syscall(SYS_userfaultfd, flags);
 }
 
-static void handle_fatal(int c)
-{
-	if (!signal_jump_set)
-		return;
-
-	siglongjmp(signal_jmp_buf, c);
-}
-
 static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec,
 				   size_t n, int advice, unsigned int flags)
 {
@@ -180,29 +152,6 @@ static bool try_read_write_buf(char *ptr)
 	return try_read_buf(ptr) && try_write_buf(ptr);
 }
 
-static void setup_sighandler(void)
-{
-	struct sigaction act = {
-		.sa_handler = &handle_fatal,
-		.sa_flags = SA_NODEFER,
-	};
-
-	sigemptyset(&act.sa_mask);
-	if (sigaction(SIGSEGV, &act, NULL))
-		ksft_exit_fail_perror("sigaction");
-}
-
-static void teardown_sighandler(void)
-{
-	struct sigaction act = {
-		.sa_handler = SIG_DFL,
-		.sa_flags = SA_NODEFER,
-	};
-
-	sigemptyset(&act.sa_mask);
-	sigaction(SIGSEGV, &act, NULL);
-}
-
 static int open_file(const char *prefix, char *path)
 {
 	int fd;
diff --git a/tools/testing/selftests/mm/process_madv.c b/tools/testing/selftests/mm/process_madv.c
new file mode 100644
index 000000000000..3d26105b4781
--- /dev/null
+++ b/tools/testing/selftests/mm/process_madv.c
@@ -0,0 +1,358 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#define _GNU_SOURCE
+#include "../kselftest_harness.h"
+#include <errno.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <sched.h>
+#include <sys/pidfd.h>
+#include "vm_util.h"
+
+#include "../pidfd/pidfd.h"
+
+FIXTURE(process_madvise)
+{
+	int pidfd;
+	int flag;
+};
+
+FIXTURE_SETUP(process_madvise)
+{
+	self->pidfd = PIDFD_SELF;
+	self->flag = 0;
+	setup_sighandler();
+};
+
+FIXTURE_TEARDOWN(process_madvise)
+{
+	teardown_sighandler();
+}
+
+static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec,
+				   size_t vlen, int advice, unsigned int flags)
+{
+	return syscall(__NR_process_madvise, pidfd, iovec, vlen, advice, flags);
+}
+
+/*
+ * Enable our signal catcher and try to read the specified buffer. The
+ * return value indicates whether the read succeeds without a fatal
+ * signal.
+ */
+static bool try_read_buf(char *ptr)
+{
+	bool failed;
+
+	/* Tell signal handler to jump back here on fatal signal. */
+	signal_jump_set = true;
+	/* If a fatal signal arose, we will jump back here and failed is set. */
+	failed = sigsetjmp(signal_jmp_buf, 0) != 0;
+
+	if (!failed)
+		FORCE_READ(ptr);
+
+	signal_jump_set = false;
+	return !failed;
+}
+
+TEST_F(process_madvise, basic)
+{
+	const unsigned long pagesize = (unsigned long)sysconf(_SC_PAGESIZE);
+	const int madvise_pages = 4;
+	char *map;
+	ssize_t ret;
+	struct iovec vec[madvise_pages];
+
+	/*
+	 * Create a single large mapping. We will pick pages from this
+	 * mapping to advise on. This ensures we test non-contiguous iovecs.
+	 */
+	map = mmap(NULL, pagesize * 10, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (map == MAP_FAILED)
+		ksft_exit_skip("mmap failed, not enough memory.\n");
+
+	/* Fill the entire region with a known pattern. */
+	memset(map, 'A', pagesize * 10);
+
+	/*
+	 * Setup the iovec to point to 4 non-contiguous pages
+	 * within the mapping.
+	 */
+	vec[0].iov_base = &map[0 * pagesize];
+	vec[0].iov_len = pagesize;
+	vec[1].iov_base = &map[3 * pagesize];
+	vec[1].iov_len = pagesize;
+	vec[2].iov_base = &map[5 * pagesize];
+	vec[2].iov_len = pagesize;
+	vec[3].iov_base = &map[8 * pagesize];
+	vec[3].iov_len = pagesize;
+
+	ret = sys_process_madvise(PIDFD_SELF, vec, madvise_pages, MADV_DONTNEED,
+				  0);
+	if (ret == -1 && errno == EPERM)
+		ksft_exit_skip(
+			"process_madvise() unsupported or permission denied, try running as root.\n");
+	else if (errno == EINVAL)
+		ksft_exit_skip(
+			"process_madvise() unsupported or parameter invalid, please check arguments.\n");
+
+	/* The call should succeed and report the total bytes processed. */
+	ASSERT_EQ(ret, madvise_pages * pagesize);
+
+	/* Check that advised pages are now zero. */
+	for (int i = 0; i < madvise_pages; i++) {
+		char *advised_page = (char *)vec[i].iov_base;
+
+		/* Access should be successful (kernel provides a new page). */
+		ASSERT_TRUE(try_read_buf(advised_page));
+		/* Content must be 0, not 'A'. */
+		ASSERT_EQ(*advised_page, 0);
+	}
+
+	/* Check that an un-advised page in between is still 'A'. */
+	char *unadvised_page = &map[1 * pagesize];
+
+	ASSERT_TRUE(try_read_buf(unadvised_page));
+	for (int i = 0; i < pagesize; i++)
+		ASSERT_EQ(unadvised_page[i], 'A');
+
+	/* Cleanup. */
+	ASSERT_EQ(munmap(map, pagesize * 10), 0);
+}
+
+static long get_smaps_anon_huge_pages(pid_t pid, void *addr)
+{
+	char smaps_path[64];
+	char *line = NULL;
+	unsigned long start, end;
+	long anon_huge_kb;
+	size_t len;
+	FILE *f;
+	bool in_vma;
+
+	in_vma = false;
+	snprintf(smaps_path, sizeof(smaps_path), "/proc/%d/smaps", pid);
+	f = fopen(smaps_path, "r");
+	if (!f)
+		return -1;
+
+	while (getline(&line, &len, f) != -1) {
+		/* Check if the line describes a VMA range */
+		if (sscanf(line, "%lx-%lx", &start, &end) == 2) {
+			if ((unsigned long)addr >= start &&
+			    (unsigned long)addr < end)
+				in_vma = true;
+			else
+				in_vma = false;
+			continue;
+		}
+
+		/* If we are in the correct VMA, look for the AnonHugePages field */
+		if (in_vma &&
+		    sscanf(line, "AnonHugePages: %ld kB", &anon_huge_kb) == 1)
+			break;
+	}
+
+	free(line);
+	fclose(f);
+
+	return (anon_huge_kb > 0) ? (anon_huge_kb * 1024) : 0;
+}
+
+/**
+ * TEST_F(process_madvise, remote_collapse)
+ *
+ * This test deterministically validates process_madvise() with MADV_COLLAPSE
+ * on a remote process, other advices are difficult to verify reliably.
+ *
+ * The test verifies that a memory region in a child process, initially
+ * backed by small pages, can be collapsed into a Transparent Huge Page by a
+ * request from the parent. The result is verified by parsing the child's
+ * /proc/<pid>/smaps file.
+ */
+TEST_F(process_madvise, remote_collapse)
+{
+	const unsigned long pagesize = (unsigned long)sysconf(_SC_PAGESIZE);
+	pid_t child_pid;
+	int pidfd;
+	long huge_page_size;
+	int pipe_info[2];
+	ssize_t ret;
+	struct iovec vec;
+
+	struct child_info {
+		pid_t pid;
+		void *map_addr;
+	} info;
+
+	huge_page_size = default_huge_page_size();
+	if (huge_page_size <= 0)
+		ksft_exit_skip("Could not determine a valid huge page size.\n");
+
+	ASSERT_EQ(pipe(pipe_info), 0);
+
+	child_pid = fork();
+	ASSERT_NE(child_pid, -1);
+
+	if (child_pid == 0) {
+		char *map;
+		size_t map_size = 2 * huge_page_size;
+
+		close(pipe_info[0]);
+
+		map = mmap(NULL, map_size, PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+		ASSERT_NE(map, MAP_FAILED);
+
+		/* Fault in as small pages */
+		for (size_t i = 0; i < map_size; i += pagesize)
+			map[i] = 'A';
+
+		/* Send info and pause */
+		info.pid = getpid();
+		info.map_addr = map;
+		ret = write(pipe_info[1], &info, sizeof(info));
+		ASSERT_EQ(ret, sizeof(info));
+		close(pipe_info[1]);
+
+		pause();
+		exit(0);
+	}
+
+	close(pipe_info[1]);
+
+	/* Receive child info */
+	ret = read(pipe_info[0], &info, sizeof(info));
+	if (ret <= 0) {
+		waitpid(child_pid, NULL, 0);
+		ksft_exit_skip("Failed to read child info from pipe.\n");
+	}
+	ASSERT_EQ(ret, sizeof(info));
+	close(pipe_info[0]);
+	child_pid = info.pid;
+
+	pidfd = pidfd_open(child_pid, 0);
+	ASSERT_GE(pidfd, 0);
+
+	/* Baseline Check from Parent's perspective */
+	ASSERT_EQ(get_smaps_anon_huge_pages(child_pid, info.map_addr), 0);
+
+	vec.iov_base = info.map_addr;
+	vec.iov_len = huge_page_size;
+	ret = sys_process_madvise(pidfd, &vec, 1, MADV_COLLAPSE, 0);
+	if (ret == -1) {
+		if (errno == EINVAL)
+			ksft_exit_skip(
+				"PROCESS_MADV_ADVISE is not supported.\n");
+		else if (errno == EPERM)
+			ksft_exit_skip(
+				"No process_madvise() permissions, try running as root.\n");
+		goto cleanup;
+	}
+	ASSERT_EQ(ret, huge_page_size);
+
+	ASSERT_EQ(get_smaps_anon_huge_pages(child_pid, info.map_addr),
+		  huge_page_size);
+
+	ksft_test_result_pass(
+		"MADV_COLLAPSE successfully verified via smaps.\n");
+
+cleanup:
+	/* Cleanup */
+	kill(child_pid, SIGKILL);
+	waitpid(child_pid, NULL, 0);
+	if (pidfd >= 0)
+		close(pidfd);
+}
+
+/*
+ * Test process_madvise() with various invalid pidfds to ensure correct error
+ * handling. This includes negative fds, non-pidfd fds, and pidfds for
+ * processes that no longer exist.
+ */
+TEST_F(process_madvise, invalid_pidfd)
+{
+	struct iovec vec;
+	pid_t child_pid;
+	ssize_t ret;
+	int pidfd;
+
+	vec.iov_base = (void *)0x1234;
+	vec.iov_len = 4096;
+
+	/* Using an invalid fd number (-1) should fail with EBADF. */
+	ret = sys_process_madvise(-1, &vec, 1, MADV_DONTNEED, 0);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EBADF);
+
+	/*
+	 * Using a valid fd that is not a pidfd (e.g. stdin) should fail
+	 * with EBADF.
+	 */
+	ret = sys_process_madvise(STDIN_FILENO, &vec, 1, MADV_DONTNEED, 0);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EBADF);
+
+	/*
+	 * Using a pidfd for a process that has already exited should fail
+	 * with ESRCH.
+	 */
+	child_pid = fork();
+	ASSERT_NE(child_pid, -1);
+
+	if (child_pid == 0)
+		exit(0);
+
+	pidfd = pidfd_open(child_pid, 0);
+	ASSERT_GE(pidfd, 0);
+
+	/* Wait for the child to ensure it has terminated. */
+	waitpid(child_pid, NULL, 0);
+
+	ret = sys_process_madvise(pidfd, &vec, 1, MADV_DONTNEED, 0);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, ESRCH);
+	close(pidfd);
+}
+
+/*
+ * Test process_madvise() with an invalid flag value. Now we only support flag=0
+ * future we will use it support sync so reserve this test.
+ */
+TEST_F(process_madvise, flag)
+{
+	const unsigned long pagesize = (unsigned long)sysconf(_SC_PAGESIZE);
+	unsigned int invalid_flag;
+	struct iovec vec;
+	char *map;
+	ssize_t ret;
+
+	map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
+		   0);
+	if (map == MAP_FAILED)
+		ksft_exit_skip("mmap failed, not enough memory.\n");
+
+	vec.iov_base = map;
+	vec.iov_len = pagesize;
+
+	invalid_flag = 0x80000000;
+
+	ret = sys_process_madvise(PIDFD_SELF, &vec, 1, MADV_DONTNEED,
+				  invalid_flag);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EINVAL);
+
+	/* Cleanup. */
+	ASSERT_EQ(munmap(map, pagesize), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index dddd1dd8af14..84fb51902c3e 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -65,6 +65,8 @@ separated by spaces:
 	test pagemap_scan IOCTL
 - pfnmap
 	tests for VM_PFNMAP handling
+- process_madv
+	test process_madvise
 - cow
 	test copy-on-write semantics
 - thp
@@ -422,6 +424,9 @@ CATEGORY="hmm" run_test bash ./test_hmm.sh smoke
 # MADV_GUARD_INSTALL and MADV_GUARD_REMOVE tests
 CATEGORY="madv_guard" run_test ./guard-regions
 
+# PROCESS_MADVISE TEST
+CATEGORY="process_madv" run_test ./process_madv
+
 # MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
 CATEGORY="madv_populate" run_test ./madv_populate
 
diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c
index 5492e3f784df..85b209260e5a 100644
--- a/tools/testing/selftests/mm/vm_util.c
+++ b/tools/testing/selftests/mm/vm_util.c
@@ -20,6 +20,9 @@
 unsigned int __page_size;
 unsigned int __page_shift;
 
+volatile sig_atomic_t signal_jump_set;
+sigjmp_buf signal_jmp_buf;
+
 uint64_t pagemap_get_entry(int fd, char *start)
 {
 	const unsigned long pfn = (unsigned long)start / getpagesize();
@@ -524,3 +527,35 @@ int read_sysfs(const char *file_path, unsigned long *val)
 
 	return 0;
 }
+
+static void handle_fatal(int c)
+{
+	if (!signal_jump_set)
+		return;
+
+	siglongjmp(signal_jmp_buf, c);
+}
+
+void setup_sighandler(void)
+{
+	struct sigaction act = {
+		.sa_handler = &handle_fatal,
+		.sa_flags = SA_NODEFER,
+	};
+
+	sigemptyset(&act.sa_mask);
+	if (sigaction(SIGSEGV, &act, NULL))
+		ksft_exit_fail_perror("sigaction in setup");
+}
+
+void teardown_sighandler(void)
+{
+	struct sigaction act = {
+		.sa_handler = SIG_DFL,
+		.sa_flags = SA_NODEFER,
+	};
+
+	sigemptyset(&act.sa_mask);
+	if (sigaction(SIGSEGV, &act, NULL))
+		ksft_exit_fail_perror("sigaction in teardown");
+}
diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h
index b8136d12a0f8..6bc4177a2807 100644
--- a/tools/testing/selftests/mm/vm_util.h
+++ b/tools/testing/selftests/mm/vm_util.h
@@ -8,6 +8,8 @@
 #include <unistd.h> /* _SC_PAGESIZE */
 #include "../kselftest.h"
 #include <linux/fs.h>
+#include <setjmp.h>
+#include <signal.h>
 
 #define BIT_ULL(nr)                   (1ULL << (nr))
 #define PM_SOFT_DIRTY                 BIT_ULL(55)
@@ -61,6 +63,24 @@ static inline void skip_test_dodgy_fs(const char *op_name)
 	ksft_test_result_skip("%s failed with ENOENT. Filesystem might be buggy (9pfs?)\n", op_name);
 }
 
+/*
+ * Ignore the checkpatch warning, as per the C99 standard, section 7.14.1.1:
+ *
+ * "If the signal occurs other than as the result of calling the abort or raise
+ *  function, the behavior is undefined if the signal handler refers to any
+ *  object with static storage duration other than by assigning a value to an
+ *  object declared as volatile sig_atomic_t"
+ */
+extern volatile sig_atomic_t signal_jump_set;
+extern sigjmp_buf signal_jmp_buf;
+
+/*
+ * Ignore the checkpatch warning, we must read from x but don't want to do
+ * anything with it in order to trigger a read page fault. We therefore must use
+ * volatile to stop the compiler from optimising this away.
+ */
+#define FORCE_READ(x) (*(volatile typeof(x) *)x)
+
 uint64_t pagemap_get_entry(int fd, char *start);
 bool pagemap_is_softdirty(int fd, char *start);
 bool pagemap_is_swapped(int fd, char *start);
@@ -90,6 +110,8 @@ bool find_vma_procmap(struct procmap_fd *procmap, void *address);
 int close_procmap(struct procmap_fd *procmap);
 int write_sysfs(const char *file_path, unsigned long val);
 int read_sysfs(const char *file_path, unsigned long *val);
+void setup_sighandler(void);
+void teardown_sighandler(void);
 
 static inline int open_self_procmap(struct procmap_fd *procmap_out)
 {
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH v3] selftests/mm: add process_madvise() tests
  2025-07-03  4:43 [PATCH v3] " wang lian
@ 2025-07-05 19:32 ` Lorenzo Stoakes
  2025-07-08 17:44 ` Zi Yan
  1 sibling, 0 replies; 25+ messages in thread
From: Lorenzo Stoakes @ 2025-07-05 19:32 UTC (permalink / raw)
  To: wang lian
  Cc: david, linux-mm, akpm, sj, Liam.Howlett, brauner, gkwang, jannh,
	linux-kernel, linux-kselftest, p1ucky0923, ryncsn, shuah, vbabka,
	zijing.zhang

Andrew - can we drop this for now?

Hi Wang,

Something's broken here, the collapse test isn't working properly:

#  RUN           process_madvise.remote_collapse ...
# process_madv.c:304:remote_collapse:Expected get_smaps_anon_huge_pages(child_pid, info.map_addr) (4194304) == 0 (0)
# remote_collapse: Test terminated by assertion
#          FAIL  process_madvise.remote_collapse

And because this assert fails, when run from run_test in run_vmtest.sh it's
causing the whole script to freeze up because you're not cleaning up
properly.

Could you take a look?

When I get a moment I'll review properly, it's on my TODO.

Thanks, Lorenzo

On Thu, Jul 03, 2025 at 12:43:26PM +0800, wang lian wrote:
> Add tests for process_madvise(), focusing on verifying behavior under
> various conditions including valid usage and error cases.
>
> Signed-off-by: wang lian <lianux.mm@gmail.com>
> Suggested-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Suggested-by: David Hildenbrand <david@redhat.com>
> Acked-by: SeongJae Park <sj@kernel.org>
> ---
>
> Changelog v3:
> - Rebased onto the latest mm-stable branch to ensure clean application.
> - Refactor common signal handling logic into vm_util to reduce code duplication.
> - Improve test robustness and diagnostics based on community feedback.
> - Address minor code style and script corrections.
>
> Changelog v2:
> - Drop MADV_DONTNEED tests based on feedback.
> - Focus solely on process_madvise() syscall.
> - Improve error handling and structure.
> - Add future-proof flag test.
> - Style and comment cleanups.

Please try to provide links to previous versions.

>
>  tools/testing/selftests/mm/.gitignore      |   1 +
>  tools/testing/selftests/mm/Makefile        |   1 +
>  tools/testing/selftests/mm/guard-regions.c |  51 ---
>  tools/testing/selftests/mm/process_madv.c  | 358 +++++++++++++++++++++
>  tools/testing/selftests/mm/run_vmtests.sh  |   5 +
>  tools/testing/selftests/mm/vm_util.c       |  35 ++
>  tools/testing/selftests/mm/vm_util.h       |  22 ++
>  7 files changed, 422 insertions(+), 51 deletions(-)
>  create mode 100644 tools/testing/selftests/mm/process_madv.c
>
> diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
> index 824266982aa3..95bd9c6ead9e 100644
> --- a/tools/testing/selftests/mm/.gitignore
> +++ b/tools/testing/selftests/mm/.gitignore
> @@ -25,6 +25,7 @@ pfnmap
>  protection_keys
>  protection_keys_32
>  protection_keys_64
> +process_madv
>  madv_populate
>  uffd-stress
>  uffd-unit-tests
> diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
> index ae6f994d3add..d13b3cef2a2b 100644
> --- a/tools/testing/selftests/mm/Makefile
> +++ b/tools/testing/selftests/mm/Makefile
> @@ -85,6 +85,7 @@ TEST_GEN_FILES += mseal_test
>  TEST_GEN_FILES += on-fault-limit
>  TEST_GEN_FILES += pagemap_ioctl
>  TEST_GEN_FILES += pfnmap
> +TEST_GEN_FILES += process_madv
>  TEST_GEN_FILES += thuge-gen
>  TEST_GEN_FILES += transhuge-stress
>  TEST_GEN_FILES += uffd-stress
> diff --git a/tools/testing/selftests/mm/guard-regions.c b/tools/testing/selftests/mm/guard-regions.c
> index 93af3d3760f9..4cf101b0fe5e 100644
> --- a/tools/testing/selftests/mm/guard-regions.c
> +++ b/tools/testing/selftests/mm/guard-regions.c
> @@ -9,8 +9,6 @@
>  #include <linux/limits.h>
>  #include <linux/userfaultfd.h>
>  #include <linux/fs.h>
> -#include <setjmp.h>
> -#include <signal.h>
>  #include <stdbool.h>
>  #include <stdio.h>
>  #include <stdlib.h>
> @@ -24,24 +22,6 @@
>
>  #include "../pidfd/pidfd.h"
>
> -/*
> - * Ignore the checkpatch warning, as per the C99 standard, section 7.14.1.1:
> - *
> - * "If the signal occurs other than as the result of calling the abort or raise
> - *  function, the behavior is undefined if the signal handler refers to any
> - *  object with static storage duration other than by assigning a value to an
> - *  object declared as volatile sig_atomic_t"
> - */
> -static volatile sig_atomic_t signal_jump_set;
> -static sigjmp_buf signal_jmp_buf;
> -
> -/*
> - * Ignore the checkpatch warning, we must read from x but don't want to do
> - * anything with it in order to trigger a read page fault. We therefore must use
> - * volatile to stop the compiler from optimising this away.
> - */
> -#define FORCE_READ(x) (*(volatile typeof(x) *)x)
> -
>  /*
>   * How is the test backing the mapping being tested?
>   */
> @@ -120,14 +100,6 @@ static int userfaultfd(int flags)
>  	return syscall(SYS_userfaultfd, flags);
>  }
>
> -static void handle_fatal(int c)
> -{
> -	if (!signal_jump_set)
> -		return;
> -
> -	siglongjmp(signal_jmp_buf, c);
> -}
> -
>  static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec,
>  				   size_t n, int advice, unsigned int flags)
>  {
> @@ -180,29 +152,6 @@ static bool try_read_write_buf(char *ptr)
>  	return try_read_buf(ptr) && try_write_buf(ptr);
>  }
>
> -static void setup_sighandler(void)
> -{
> -	struct sigaction act = {
> -		.sa_handler = &handle_fatal,
> -		.sa_flags = SA_NODEFER,
> -	};
> -
> -	sigemptyset(&act.sa_mask);
> -	if (sigaction(SIGSEGV, &act, NULL))
> -		ksft_exit_fail_perror("sigaction");
> -}
> -
> -static void teardown_sighandler(void)
> -{
> -	struct sigaction act = {
> -		.sa_handler = SIG_DFL,
> -		.sa_flags = SA_NODEFER,
> -	};
> -
> -	sigemptyset(&act.sa_mask);
> -	sigaction(SIGSEGV, &act, NULL);
> -}
> -
>  static int open_file(const char *prefix, char *path)
>  {
>  	int fd;
> diff --git a/tools/testing/selftests/mm/process_madv.c b/tools/testing/selftests/mm/process_madv.c
> new file mode 100644
> index 000000000000..3d26105b4781
> --- /dev/null
> +++ b/tools/testing/selftests/mm/process_madv.c
> @@ -0,0 +1,358 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +
> +#define _GNU_SOURCE
> +#include "../kselftest_harness.h"
> +#include <errno.h>
> +#include <setjmp.h>
> +#include <signal.h>
> +#include <stdbool.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <sys/mman.h>
> +#include <sys/syscall.h>
> +#include <unistd.h>
> +#include <sched.h>
> +#include <sys/pidfd.h>
> +#include "vm_util.h"
> +
> +#include "../pidfd/pidfd.h"
> +
> +FIXTURE(process_madvise)
> +{
> +	int pidfd;
> +	int flag;
> +};
> +
> +FIXTURE_SETUP(process_madvise)
> +{
> +	self->pidfd = PIDFD_SELF;
> +	self->flag = 0;
> +	setup_sighandler();
> +};
> +
> +FIXTURE_TEARDOWN(process_madvise)
> +{
> +	teardown_sighandler();
> +}
> +
> +static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec,
> +				   size_t vlen, int advice, unsigned int flags)
> +{
> +	return syscall(__NR_process_madvise, pidfd, iovec, vlen, advice, flags);
> +}
> +
> +/*
> + * Enable our signal catcher and try to read the specified buffer. The
> + * return value indicates whether the read succeeds without a fatal
> + * signal.
> + */
> +static bool try_read_buf(char *ptr)
> +{
> +	bool failed;
> +
> +	/* Tell signal handler to jump back here on fatal signal. */
> +	signal_jump_set = true;
> +	/* If a fatal signal arose, we will jump back here and failed is set. */
> +	failed = sigsetjmp(signal_jmp_buf, 0) != 0;
> +
> +	if (!failed)
> +		FORCE_READ(ptr);
> +
> +	signal_jump_set = false;
> +	return !failed;
> +}
> +
> +TEST_F(process_madvise, basic)
> +{
> +	const unsigned long pagesize = (unsigned long)sysconf(_SC_PAGESIZE);
> +	const int madvise_pages = 4;
> +	char *map;
> +	ssize_t ret;
> +	struct iovec vec[madvise_pages];
> +
> +	/*
> +	 * Create a single large mapping. We will pick pages from this
> +	 * mapping to advise on. This ensures we test non-contiguous iovecs.
> +	 */
> +	map = mmap(NULL, pagesize * 10, PROT_READ | PROT_WRITE,
> +		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> +	if (map == MAP_FAILED)
> +		ksft_exit_skip("mmap failed, not enough memory.\n");
> +
> +	/* Fill the entire region with a known pattern. */
> +	memset(map, 'A', pagesize * 10);
> +
> +	/*
> +	 * Setup the iovec to point to 4 non-contiguous pages
> +	 * within the mapping.
> +	 */
> +	vec[0].iov_base = &map[0 * pagesize];
> +	vec[0].iov_len = pagesize;
> +	vec[1].iov_base = &map[3 * pagesize];
> +	vec[1].iov_len = pagesize;
> +	vec[2].iov_base = &map[5 * pagesize];
> +	vec[2].iov_len = pagesize;
> +	vec[3].iov_base = &map[8 * pagesize];
> +	vec[3].iov_len = pagesize;
> +
> +	ret = sys_process_madvise(PIDFD_SELF, vec, madvise_pages, MADV_DONTNEED,
> +				  0);
> +	if (ret == -1 && errno == EPERM)
> +		ksft_exit_skip(
> +			"process_madvise() unsupported or permission denied, try running as root.\n");
> +	else if (errno == EINVAL)
> +		ksft_exit_skip(
> +			"process_madvise() unsupported or parameter invalid, please check arguments.\n");
> +
> +	/* The call should succeed and report the total bytes processed. */
> +	ASSERT_EQ(ret, madvise_pages * pagesize);
> +
> +	/* Check that advised pages are now zero. */
> +	for (int i = 0; i < madvise_pages; i++) {
> +		char *advised_page = (char *)vec[i].iov_base;
> +
> +		/* Access should be successful (kernel provides a new page). */
> +		ASSERT_TRUE(try_read_buf(advised_page));
> +		/* Content must be 0, not 'A'. */
> +		ASSERT_EQ(*advised_page, 0);
> +	}
> +
> +	/* Check that an un-advised page in between is still 'A'. */
> +	char *unadvised_page = &map[1 * pagesize];
> +
> +	ASSERT_TRUE(try_read_buf(unadvised_page));
> +	for (int i = 0; i < pagesize; i++)
> +		ASSERT_EQ(unadvised_page[i], 'A');
> +
> +	/* Cleanup. */
> +	ASSERT_EQ(munmap(map, pagesize * 10), 0);
> +}
> +
> +static long get_smaps_anon_huge_pages(pid_t pid, void *addr)
> +{
> +	char smaps_path[64];
> +	char *line = NULL;
> +	unsigned long start, end;
> +	long anon_huge_kb;
> +	size_t len;
> +	FILE *f;
> +	bool in_vma;
> +
> +	in_vma = false;
> +	snprintf(smaps_path, sizeof(smaps_path), "/proc/%d/smaps", pid);
> +	f = fopen(smaps_path, "r");
> +	if (!f)
> +		return -1;
> +
> +	while (getline(&line, &len, f) != -1) {
> +		/* Check if the line describes a VMA range */
> +		if (sscanf(line, "%lx-%lx", &start, &end) == 2) {
> +			if ((unsigned long)addr >= start &&
> +			    (unsigned long)addr < end)
> +				in_vma = true;
> +			else
> +				in_vma = false;
> +			continue;
> +		}
> +
> +		/* If we are in the correct VMA, look for the AnonHugePages field */
> +		if (in_vma &&
> +		    sscanf(line, "AnonHugePages: %ld kB", &anon_huge_kb) == 1)
> +			break;
> +	}
> +
> +	free(line);
> +	fclose(f);
> +
> +	return (anon_huge_kb > 0) ? (anon_huge_kb * 1024) : 0;
> +}
> +
> +/**
> + * TEST_F(process_madvise, remote_collapse)
> + *
> + * This test deterministically validates process_madvise() with MADV_COLLAPSE
> + * on a remote process, other advices are difficult to verify reliably.
> + *
> + * The test verifies that a memory region in a child process, initially
> + * backed by small pages, can be collapsed into a Transparent Huge Page by a
> + * request from the parent. The result is verified by parsing the child's
> + * /proc/<pid>/smaps file.
> + */
> +TEST_F(process_madvise, remote_collapse)
> +{
> +	const unsigned long pagesize = (unsigned long)sysconf(_SC_PAGESIZE);
> +	pid_t child_pid;
> +	int pidfd;
> +	long huge_page_size;
> +	int pipe_info[2];
> +	ssize_t ret;
> +	struct iovec vec;
> +
> +	struct child_info {
> +		pid_t pid;
> +		void *map_addr;
> +	} info;
> +
> +	huge_page_size = default_huge_page_size();
> +	if (huge_page_size <= 0)
> +		ksft_exit_skip("Could not determine a valid huge page size.\n");
> +
> +	ASSERT_EQ(pipe(pipe_info), 0);
> +
> +	child_pid = fork();
> +	ASSERT_NE(child_pid, -1);
> +
> +	if (child_pid == 0) {
> +		char *map;
> +		size_t map_size = 2 * huge_page_size;
> +
> +		close(pipe_info[0]);
> +
> +		map = mmap(NULL, map_size, PROT_READ | PROT_WRITE,
> +			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> +		ASSERT_NE(map, MAP_FAILED);
> +
> +		/* Fault in as small pages */
> +		for (size_t i = 0; i < map_size; i += pagesize)
> +			map[i] = 'A';
> +
> +		/* Send info and pause */
> +		info.pid = getpid();
> +		info.map_addr = map;
> +		ret = write(pipe_info[1], &info, sizeof(info));
> +		ASSERT_EQ(ret, sizeof(info));
> +		close(pipe_info[1]);
> +
> +		pause();
> +		exit(0);
> +	}
> +
> +	close(pipe_info[1]);
> +
> +	/* Receive child info */
> +	ret = read(pipe_info[0], &info, sizeof(info));
> +	if (ret <= 0) {
> +		waitpid(child_pid, NULL, 0);
> +		ksft_exit_skip("Failed to read child info from pipe.\n");
> +	}
> +	ASSERT_EQ(ret, sizeof(info));
> +	close(pipe_info[0]);
> +	child_pid = info.pid;
> +
> +	pidfd = pidfd_open(child_pid, 0);
> +	ASSERT_GE(pidfd, 0);
> +
> +	/* Baseline Check from Parent's perspective */
> +	ASSERT_EQ(get_smaps_anon_huge_pages(child_pid, info.map_addr), 0);
> +
> +	vec.iov_base = info.map_addr;
> +	vec.iov_len = huge_page_size;
> +	ret = sys_process_madvise(pidfd, &vec, 1, MADV_COLLAPSE, 0);
> +	if (ret == -1) {
> +		if (errno == EINVAL)
> +			ksft_exit_skip(
> +				"PROCESS_MADV_ADVISE is not supported.\n");
> +		else if (errno == EPERM)
> +			ksft_exit_skip(
> +				"No process_madvise() permissions, try running as root.\n");
> +		goto cleanup;
> +	}
> +	ASSERT_EQ(ret, huge_page_size);
> +
> +	ASSERT_EQ(get_smaps_anon_huge_pages(child_pid, info.map_addr),
> +		  huge_page_size);
> +
> +	ksft_test_result_pass(
> +		"MADV_COLLAPSE successfully verified via smaps.\n");
> +
> +cleanup:
> +	/* Cleanup */
> +	kill(child_pid, SIGKILL);
> +	waitpid(child_pid, NULL, 0);
> +	if (pidfd >= 0)
> +		close(pidfd);
> +}
> +
> +/*
> + * Test process_madvise() with various invalid pidfds to ensure correct error
> + * handling. This includes negative fds, non-pidfd fds, and pidfds for
> + * processes that no longer exist.
> + */
> +TEST_F(process_madvise, invalid_pidfd)
> +{
> +	struct iovec vec;
> +	pid_t child_pid;
> +	ssize_t ret;
> +	int pidfd;
> +
> +	vec.iov_base = (void *)0x1234;
> +	vec.iov_len = 4096;
> +
> +	/* Using an invalid fd number (-1) should fail with EBADF. */
> +	ret = sys_process_madvise(-1, &vec, 1, MADV_DONTNEED, 0);
> +	ASSERT_EQ(ret, -1);
> +	ASSERT_EQ(errno, EBADF);
> +
> +	/*
> +	 * Using a valid fd that is not a pidfd (e.g. stdin) should fail
> +	 * with EBADF.
> +	 */
> +	ret = sys_process_madvise(STDIN_FILENO, &vec, 1, MADV_DONTNEED, 0);
> +	ASSERT_EQ(ret, -1);
> +	ASSERT_EQ(errno, EBADF);
> +
> +	/*
> +	 * Using a pidfd for a process that has already exited should fail
> +	 * with ESRCH.
> +	 */
> +	child_pid = fork();
> +	ASSERT_NE(child_pid, -1);
> +
> +	if (child_pid == 0)
> +		exit(0);
> +
> +	pidfd = pidfd_open(child_pid, 0);
> +	ASSERT_GE(pidfd, 0);
> +
> +	/* Wait for the child to ensure it has terminated. */
> +	waitpid(child_pid, NULL, 0);
> +
> +	ret = sys_process_madvise(pidfd, &vec, 1, MADV_DONTNEED, 0);
> +	ASSERT_EQ(ret, -1);
> +	ASSERT_EQ(errno, ESRCH);
> +	close(pidfd);
> +}
> +
> +/*
> + * Test process_madvise() with an invalid flag value. Now we only support flag=0
> + * future we will use it support sync so reserve this test.
> + */
> +TEST_F(process_madvise, flag)
> +{
> +	const unsigned long pagesize = (unsigned long)sysconf(_SC_PAGESIZE);
> +	unsigned int invalid_flag;
> +	struct iovec vec;
> +	char *map;
> +	ssize_t ret;
> +
> +	map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
> +		   0);
> +	if (map == MAP_FAILED)
> +		ksft_exit_skip("mmap failed, not enough memory.\n");
> +
> +	vec.iov_base = map;
> +	vec.iov_len = pagesize;
> +
> +	invalid_flag = 0x80000000;
> +
> +	ret = sys_process_madvise(PIDFD_SELF, &vec, 1, MADV_DONTNEED,
> +				  invalid_flag);
> +	ASSERT_EQ(ret, -1);
> +	ASSERT_EQ(errno, EINVAL);
> +
> +	/* Cleanup. */
> +	ASSERT_EQ(munmap(map, pagesize), 0);
> +}
> +
> +TEST_HARNESS_MAIN
> diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
> index dddd1dd8af14..84fb51902c3e 100755
> --- a/tools/testing/selftests/mm/run_vmtests.sh
> +++ b/tools/testing/selftests/mm/run_vmtests.sh
> @@ -65,6 +65,8 @@ separated by spaces:
>  	test pagemap_scan IOCTL
>  - pfnmap
>  	tests for VM_PFNMAP handling
> +- process_madv
> +	test process_madvise
>  - cow
>  	test copy-on-write semantics
>  - thp
> @@ -422,6 +424,9 @@ CATEGORY="hmm" run_test bash ./test_hmm.sh smoke
>  # MADV_GUARD_INSTALL and MADV_GUARD_REMOVE tests
>  CATEGORY="madv_guard" run_test ./guard-regions
>
> +# PROCESS_MADVISE TEST
> +CATEGORY="process_madv" run_test ./process_madv
> +
>  # MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
>  CATEGORY="madv_populate" run_test ./madv_populate
>
> diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c
> index 5492e3f784df..85b209260e5a 100644
> --- a/tools/testing/selftests/mm/vm_util.c
> +++ b/tools/testing/selftests/mm/vm_util.c
> @@ -20,6 +20,9 @@
>  unsigned int __page_size;
>  unsigned int __page_shift;
>
> +volatile sig_atomic_t signal_jump_set;
> +sigjmp_buf signal_jmp_buf;
> +
>  uint64_t pagemap_get_entry(int fd, char *start)
>  {
>  	const unsigned long pfn = (unsigned long)start / getpagesize();
> @@ -524,3 +527,35 @@ int read_sysfs(const char *file_path, unsigned long *val)
>
>  	return 0;
>  }
> +
> +static void handle_fatal(int c)
> +{
> +	if (!signal_jump_set)
> +		return;
> +
> +	siglongjmp(signal_jmp_buf, c);
> +}
> +
> +void setup_sighandler(void)
> +{
> +	struct sigaction act = {
> +		.sa_handler = &handle_fatal,
> +		.sa_flags = SA_NODEFER,
> +	};
> +
> +	sigemptyset(&act.sa_mask);
> +	if (sigaction(SIGSEGV, &act, NULL))
> +		ksft_exit_fail_perror("sigaction in setup");
> +}
> +
> +void teardown_sighandler(void)
> +{
> +	struct sigaction act = {
> +		.sa_handler = SIG_DFL,
> +		.sa_flags = SA_NODEFER,
> +	};
> +
> +	sigemptyset(&act.sa_mask);
> +	if (sigaction(SIGSEGV, &act, NULL))
> +		ksft_exit_fail_perror("sigaction in teardown");
> +}
> diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h
> index b8136d12a0f8..6bc4177a2807 100644
> --- a/tools/testing/selftests/mm/vm_util.h
> +++ b/tools/testing/selftests/mm/vm_util.h
> @@ -8,6 +8,8 @@
>  #include <unistd.h> /* _SC_PAGESIZE */
>  #include "../kselftest.h"
>  #include <linux/fs.h>
> +#include <setjmp.h>
> +#include <signal.h>
>
>  #define BIT_ULL(nr)                   (1ULL << (nr))
>  #define PM_SOFT_DIRTY                 BIT_ULL(55)
> @@ -61,6 +63,24 @@ static inline void skip_test_dodgy_fs(const char *op_name)
>  	ksft_test_result_skip("%s failed with ENOENT. Filesystem might be buggy (9pfs?)\n", op_name);
>  }
>
> +/*
> + * Ignore the checkpatch warning, as per the C99 standard, section 7.14.1.1:
> + *
> + * "If the signal occurs other than as the result of calling the abort or raise
> + *  function, the behavior is undefined if the signal handler refers to any
> + *  object with static storage duration other than by assigning a value to an
> + *  object declared as volatile sig_atomic_t"
> + */
> +extern volatile sig_atomic_t signal_jump_set;
> +extern sigjmp_buf signal_jmp_buf;
> +
> +/*
> + * Ignore the checkpatch warning, we must read from x but don't want to do
> + * anything with it in order to trigger a read page fault. We therefore must use
> + * volatile to stop the compiler from optimising this away.
> + */
> +#define FORCE_READ(x) (*(volatile typeof(x) *)x)
> +
>  uint64_t pagemap_get_entry(int fd, char *start);
>  bool pagemap_is_softdirty(int fd, char *start);
>  bool pagemap_is_swapped(int fd, char *start);
> @@ -90,6 +110,8 @@ bool find_vma_procmap(struct procmap_fd *procmap, void *address);
>  int close_procmap(struct procmap_fd *procmap);
>  int write_sysfs(const char *file_path, unsigned long val);
>  int read_sysfs(const char *file_path, unsigned long *val);
> +void setup_sighandler(void);
> +void teardown_sighandler(void);
>
>  static inline int open_self_procmap(struct procmap_fd *procmap_out)
>  {
> --
> 2.43.0
>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v3] selftests/mm: add process_madvise() tests
  2025-07-03  4:43 [PATCH v3] " wang lian
  2025-07-05 19:32 ` Lorenzo Stoakes
@ 2025-07-08 17:44 ` Zi Yan
  2025-07-09 12:32   ` wang lian
  1 sibling, 1 reply; 25+ messages in thread
From: Zi Yan @ 2025-07-08 17:44 UTC (permalink / raw)
  To: wang lian
  Cc: david, linux-mm, akpm, lorenzo.stoakes, sj, Liam.Howlett, brauner,
	gkwang, jannh, linux-kernel, linux-kselftest, p1ucky0923, ryncsn,
	shuah, vbabka, zijing.zhang

On 3 Jul 2025, at 0:43, wang lian wrote:

> Add tests for process_madvise(), focusing on verifying behavior under
> various conditions including valid usage and error cases.
>
> Signed-off-by: wang lian <lianux.mm@gmail.com>
> Suggested-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Suggested-by: David Hildenbrand <david@redhat.com>
> Acked-by: SeongJae Park <sj@kernel.org>
> ---
>
> Changelog v3:
> - Rebased onto the latest mm-stable branch to ensure clean application.
> - Refactor common signal handling logic into vm_util to reduce code duplication.
> - Improve test robustness and diagnostics based on community feedback.
> - Address minor code style and script corrections.
>
> Changelog v2:
> - Drop MADV_DONTNEED tests based on feedback.
> - Focus solely on process_madvise() syscall.
> - Improve error handling and structure.
> - Add future-proof flag test.
> - Style and comment cleanups.
>
>  tools/testing/selftests/mm/.gitignore      |   1 +
>  tools/testing/selftests/mm/Makefile        |   1 +
>  tools/testing/selftests/mm/guard-regions.c |  51 ---
>  tools/testing/selftests/mm/process_madv.c  | 358 +++++++++++++++++++++
>  tools/testing/selftests/mm/run_vmtests.sh  |   5 +
>  tools/testing/selftests/mm/vm_util.c       |  35 ++
>  tools/testing/selftests/mm/vm_util.h       |  22 ++
>  7 files changed, 422 insertions(+), 51 deletions(-)
>  create mode 100644 tools/testing/selftests/mm/process_madv.c
>

<snip>

> diff --git a/tools/testing/selftests/mm/process_madv.c b/tools/testing/selftests/mm/process_madv.c
> new file mode 100644
> index 000000000000..3d26105b4781
> --- /dev/null
> +++ b/tools/testing/selftests/mm/process_madv.c
> @@ -0,0 +1,358 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +
> +#define _GNU_SOURCE
> +#include "../kselftest_harness.h"
> +#include <errno.h>
> +#include <setjmp.h>
> +#include <signal.h>
> +#include <stdbool.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <sys/mman.h>
> +#include <sys/syscall.h>
> +#include <unistd.h>
> +#include <sched.h>
> +#include <sys/pidfd.h>

When I was compiling it on arm64, I got the error below.
“fatal error: sys/pidfd.h: No such file or directory”

I ran “make headers_install” before the compilation,
but still got the error.

It works fine with x86_64. I am not sure what I am missing.

Best Regards,
Yan, Zi

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v3] selftests/mm: add process_madvise() tests
  2025-07-08 17:44 ` Zi Yan
@ 2025-07-09 12:32   ` wang lian
  2025-07-09 12:51     ` Lorenzo Stoakes
  2025-07-09 14:46     ` Zi Yan
  0 siblings, 2 replies; 25+ messages in thread
From: wang lian @ 2025-07-09 12:32 UTC (permalink / raw)
  To: ziy
  Cc: Liam.Howlett, akpm, brauner, david, gkwang, jannh, lianux.mm,
	linux-kernel, linux-kselftest, linux-mm, lorenzo.stoakes,
	p1ucky0923, ryncsn, shuah, sj, vbabka, zijing.zhang

Hi Zi Yan,
Thanks for testing the patch and reporting this build failure.
I don't have an arm64 environment readily available for testing, so I
appreciate you catching this. I suspect this is caused by missing or
older userspace headers in the cross-compilation toolchain.
I will try to fix this in the next version. If the problem persists, a
good solution would be to manually define the syscall wrapper to avoid
the dependency on <sys/pidfd.h>.
I'll send out a v4 with a fix.
Thanks again for your help.
Best regards,
Wang Lian

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v3] selftests/mm: add process_madvise() tests
  2025-07-09 12:32   ` wang lian
@ 2025-07-09 12:51     ` Lorenzo Stoakes
  2025-07-10 11:29       ` wang lian
  2025-07-09 14:46     ` Zi Yan
  1 sibling, 1 reply; 25+ messages in thread
From: Lorenzo Stoakes @ 2025-07-09 12:51 UTC (permalink / raw)
  To: wang lian
  Cc: ziy, Liam.Howlett, akpm, brauner, david, gkwang, jannh,
	linux-kernel, linux-kselftest, linux-mm, p1ucky0923, ryncsn, sj,
	vbabka, zijing.zhang

On Wed, Jul 09, 2025 at 08:32:24PM +0800, wang lian wrote:
> Hi Zi Yan,
> Thanks for testing the patch and reporting this build failure.
> I don't have an arm64 environment readily available for testing, so I
> appreciate you catching this. I suspect this is caused by missing or
> older userspace headers in the cross-compilation toolchain.
> I will try to fix this in the next version. If the problem persists, a
> good solution would be to manually define the syscall wrapper to avoid
> the dependency on <sys/pidfd.h>.
> I'll send out a v4 with a fix.
> Thanks again for your help.
> Best regards,
> Wang Lian

Hi Wang,

Please try to include the context of what you're replying to in your
messages, reading the above I have to _just remember_ what Zi said, and I'm
old so that's hard now ;)

Please note that mm tests _must_ work without make headers_install being
run.

Your test must not rely upon those.

Thanks, Lorenzo

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v3] selftests/mm: add process_madvise() tests
  2025-07-09 12:32   ` wang lian
  2025-07-09 12:51     ` Lorenzo Stoakes
@ 2025-07-09 14:46     ` Zi Yan
  2025-07-10  8:42       ` Mark Brown
  2025-07-10 11:40       ` wang lian
  1 sibling, 2 replies; 25+ messages in thread
From: Zi Yan @ 2025-07-09 14:46 UTC (permalink / raw)
  To: wang lian
  Cc: Liam.Howlett, akpm, brauner, david, gkwang, jannh, linux-kernel,
	linux-kselftest, linux-mm, lorenzo.stoakes, p1ucky0923, ryncsn,
	shuah, sj, vbabka, zijing.zhang

On 9 Jul 2025, at 8:32, wang lian wrote:

> Hi Zi Yan,
> Thanks for testing the patch and reporting this build failure.
> I don't have an arm64 environment readily available for testing, so I
> appreciate you catching this. I suspect this is caused by missing or
> older userspace headers in the cross-compilation toolchain.

Right. My /usr/include/sys does not have pidfd.h. IMHO selftests
should not rely on userspace headers, otherwise we cannot test
latest kernel changes.

> I will try to fix this in the next version. If the problem persists, a
> good solution would be to manually define the syscall wrapper to avoid
> the dependency on <sys/pidfd.h>.

Based on what I see in other mm tests, the following patch fixes my
compilation issue.

diff --git a/tools/testing/selftests/mm/process_madv.c b/tools/testing/selftests/mm/process_madv.c
index 3d26105b4781..8bf11433d6e6 100644
--- a/tools/testing/selftests/mm/process_madv.c
+++ b/tools/testing/selftests/mm/process_madv.c
@@ -10,13 +10,14 @@
 #include <stdlib.h>
 #include <string.h>
 #include <sys/mman.h>
+#include <linux/mman.h>
 #include <sys/syscall.h>
 #include <unistd.h>
 #include <sched.h>
-#include <sys/pidfd.h>
+#include <linux/pidfd.h>
+#include <linux/uio.h>
 #include "vm_util.h"

-#include "../pidfd/pidfd.h"

 FIXTURE(process_madvise)
 {
@@ -240,7 +241,7 @@ TEST_F(process_madvise, remote_collapse)
 	close(pipe_info[0]);
 	child_pid = info.pid;

-	pidfd = pidfd_open(child_pid, 0);
+	pidfd = syscall(__NR_pidfd_open, child_pid, 0);
 	ASSERT_GE(pidfd, 0);

 	/* Baseline Check from Parent's perspective */
@@ -312,7 +313,7 @@ TEST_F(process_madvise, invalid_pidfd)
 	if (child_pid == 0)
 		exit(0);

-	pidfd = pidfd_open(child_pid, 0);
+	pidfd = syscall(__NR_pidfd_open, child_pid, 0);
 	ASSERT_GE(pidfd, 0);

 	/* Wait for the child to ensure it has terminated. */


Best Regards,
Yan, Zi

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH v3] selftests/mm: add process_madvise() tests
  2025-07-09 14:46     ` Zi Yan
@ 2025-07-10  8:42       ` Mark Brown
  2025-07-10 16:28         ` Zi Yan
  2025-07-10 11:40       ` wang lian
  1 sibling, 1 reply; 25+ messages in thread
From: Mark Brown @ 2025-07-10  8:42 UTC (permalink / raw)
  To: Zi Yan
  Cc: wang lian, Liam.Howlett, akpm, brauner, david, gkwang, jannh,
	linux-kernel, linux-kselftest, linux-mm, lorenzo.stoakes,
	p1ucky0923, ryncsn, shuah, sj, vbabka, zijing.zhang

[-- Attachment #1: Type: text/plain, Size: 1413 bytes --]

On Wed, Jul 09, 2025 at 10:46:07AM -0400, Zi Yan wrote:
> On 9 Jul 2025, at 8:32, wang lian wrote:

> > Thanks for testing the patch and reporting this build failure.
> > I don't have an arm64 environment readily available for testing, so I
> > appreciate you catching this. I suspect this is caused by missing or
> > older userspace headers in the cross-compilation toolchain.

> Right. My /usr/include/sys does not have pidfd.h. IMHO selftests
> should not rely on userspace headers, otherwise we cannot test
> latest kernel changes.

That's not realistic, we need to be able to use things like libc and for
many areas you'd just end up copying or reimplmenenting the userspace
libraries.  There's some concerns for sure, for example we used to have
hideous problems with the BPF tests needing extremely recent versions of
LLVM which weren't available from distros, but just saying nothing from
userspace is a big blocker to getting things done.  With some things
they're widely enough available that you can just assume they're there,
with other things they're less standard so we need build time checks.

OTOH in a case like this where we can just refer directly to a kernel
header for some constants or structs then it does make sense to use the
kernel headers, or in other cases where we're testing things that are
intended to be controlled by libc it makes sense to use nolibc avoid
conflicting with libc.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v4] selftests/mm: add process_madvise() tests
@ 2025-07-10 11:22 wang lian
  2025-07-10 13:42 ` Mark Brown
  2025-07-10 16:57 ` [PATCH v4] " Lorenzo Stoakes
  0 siblings, 2 replies; 25+ messages in thread
From: wang lian @ 2025-07-10 11:22 UTC (permalink / raw)
  To: akpm, ziy, lorenzo.stoakes, david, sj, linux-kernel
  Cc: linux-mm, linux-kselftest, shuah, broonie, Liam.Howlett, brauner,
	gkwang, jannh, p1ucky0923, ryncsn, vbabka, zijing.zhang,
	lianux.mm

Add tests for process_madvise(), focusing on verifying behavior under
various conditions including valid usage and error cases.

Signed-off-by: wang lian <lianux.mm@gmail.com>
Suggested-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Suggested-by: Zi Yan <ziy@nvidia.com>
Acked-by: SeongJae Park <sj@kernel.org>
---
Changelog v4:
- Refine resource cleanup logic in test teardown to be more robust.
- Improve remote_collapse test to correctly handle different THP
  (Transparent Huge Page) policies ('always', 'madvise', 'never'),
  including handling race conditions with khugepaged.
- Resolve build errors

Changelog v3: https://lore.kernel.org/lkml/20250703044326.65061-1-lianux.mm@gmail.com/
- Rebased onto the latest mm-stable branch to ensure clean application.
- Refactor common signal handling logic into vm_util to reduce code duplication.
- Improve test robustness and diagnostics based on community feedback.
- Address minor code style and script corrections.

Changelog v2: https://lore.kernel.org/lkml/20250630140957.4000-1-lianux.mm@gmail.com/
- Drop MADV_DONTNEED tests based on feedback.
- Focus solely on process_madvise() syscall.
- Improve error handling and structure.
- Add future-proof flag test.
- Style and comment cleanups.

-V1: https://lore.kernel.org/lkml/20250621133003.4733-1-lianux.mm@gmail.com/

 tools/testing/selftests/mm/.gitignore      |   1 +
 tools/testing/selftests/mm/Makefile        |   1 +
 tools/testing/selftests/mm/guard-regions.c |  51 ---
 tools/testing/selftests/mm/process_madv.c  | 447 +++++++++++++++++++++
 tools/testing/selftests/mm/run_vmtests.sh  |   5 +
 tools/testing/selftests/mm/vm_util.c       |  35 ++
 tools/testing/selftests/mm/vm_util.h       |  22 +
 7 files changed, 511 insertions(+), 51 deletions(-)
 create mode 100644 tools/testing/selftests/mm/process_madv.c

diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index 824266982aa3..95bd9c6ead9e 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -25,6 +25,7 @@ pfnmap
 protection_keys
 protection_keys_32
 protection_keys_64
+process_madv
 madv_populate
 uffd-stress
 uffd-unit-tests
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index ae6f994d3add..d13b3cef2a2b 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -85,6 +85,7 @@ TEST_GEN_FILES += mseal_test
 TEST_GEN_FILES += on-fault-limit
 TEST_GEN_FILES += pagemap_ioctl
 TEST_GEN_FILES += pfnmap
+TEST_GEN_FILES += process_madv
 TEST_GEN_FILES += thuge-gen
 TEST_GEN_FILES += transhuge-stress
 TEST_GEN_FILES += uffd-stress
diff --git a/tools/testing/selftests/mm/guard-regions.c b/tools/testing/selftests/mm/guard-regions.c
index 93af3d3760f9..4cf101b0fe5e 100644
--- a/tools/testing/selftests/mm/guard-regions.c
+++ b/tools/testing/selftests/mm/guard-regions.c
@@ -9,8 +9,6 @@
 #include <linux/limits.h>
 #include <linux/userfaultfd.h>
 #include <linux/fs.h>
-#include <setjmp.h>
-#include <signal.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -24,24 +22,6 @@
 
 #include "../pidfd/pidfd.h"
 
-/*
- * Ignore the checkpatch warning, as per the C99 standard, section 7.14.1.1:
- *
- * "If the signal occurs other than as the result of calling the abort or raise
- *  function, the behavior is undefined if the signal handler refers to any
- *  object with static storage duration other than by assigning a value to an
- *  object declared as volatile sig_atomic_t"
- */
-static volatile sig_atomic_t signal_jump_set;
-static sigjmp_buf signal_jmp_buf;
-
-/*
- * Ignore the checkpatch warning, we must read from x but don't want to do
- * anything with it in order to trigger a read page fault. We therefore must use
- * volatile to stop the compiler from optimising this away.
- */
-#define FORCE_READ(x) (*(volatile typeof(x) *)x)
-
 /*
  * How is the test backing the mapping being tested?
  */
@@ -120,14 +100,6 @@ static int userfaultfd(int flags)
 	return syscall(SYS_userfaultfd, flags);
 }
 
-static void handle_fatal(int c)
-{
-	if (!signal_jump_set)
-		return;
-
-	siglongjmp(signal_jmp_buf, c);
-}
-
 static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec,
 				   size_t n, int advice, unsigned int flags)
 {
@@ -180,29 +152,6 @@ static bool try_read_write_buf(char *ptr)
 	return try_read_buf(ptr) && try_write_buf(ptr);
 }
 
-static void setup_sighandler(void)
-{
-	struct sigaction act = {
-		.sa_handler = &handle_fatal,
-		.sa_flags = SA_NODEFER,
-	};
-
-	sigemptyset(&act.sa_mask);
-	if (sigaction(SIGSEGV, &act, NULL))
-		ksft_exit_fail_perror("sigaction");
-}
-
-static void teardown_sighandler(void)
-{
-	struct sigaction act = {
-		.sa_handler = SIG_DFL,
-		.sa_flags = SA_NODEFER,
-	};
-
-	sigemptyset(&act.sa_mask);
-	sigaction(SIGSEGV, &act, NULL);
-}
-
 static int open_file(const char *prefix, char *path)
 {
 	int fd;
diff --git a/tools/testing/selftests/mm/process_madv.c b/tools/testing/selftests/mm/process_madv.c
new file mode 100644
index 000000000000..7d7509486d46
--- /dev/null
+++ b/tools/testing/selftests/mm/process_madv.c
@@ -0,0 +1,447 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#define _GNU_SOURCE
+#include "../kselftest_harness.h"
+#include <errno.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/mman.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <sched.h>
+#include <linux/pidfd.h>
+#include <linux/uio.h>
+#include "vm_util.h"
+
+FIXTURE(process_madvise)
+{
+	int pidfd;
+	int flag;
+	pid_t child_pid;
+};
+
+FIXTURE_SETUP(process_madvise)
+{
+	self->pidfd = PIDFD_SELF;
+	self->flag = 0;
+	self->child_pid = -1;
+	setup_sighandler();
+};
+
+FIXTURE_TEARDOWN_PARENT(process_madvise)
+{
+	teardown_sighandler();
+	if (self->child_pid > 0) {
+		kill(self->child_pid, SIGKILL);
+		waitpid(self->child_pid, NULL, 0);
+	}
+}
+
+static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec,
+				   size_t vlen, int advice, unsigned int flags)
+{
+	return syscall(__NR_process_madvise, pidfd, iovec, vlen, advice, flags);
+}
+
+/*
+ * Enable our signal catcher and try to read the specified buffer. The
+ * return value indicates whether the read succeeds without a fatal
+ * signal.
+ */
+static bool try_read_buf(char *ptr)
+{
+	bool failed;
+
+	/* Tell signal handler to jump back here on fatal signal. */
+	signal_jump_set = true;
+	/* If a fatal signal arose, we will jump back here and failed is set. */
+	failed = sigsetjmp(signal_jmp_buf, 0) != 0;
+
+	if (!failed)
+		FORCE_READ(ptr);
+
+	signal_jump_set = false;
+	return !failed;
+}
+
+TEST_F(process_madvise, basic)
+{
+	const unsigned long pagesize = (unsigned long)sysconf(_SC_PAGESIZE);
+	const int madvise_pages = 4;
+	char *map;
+	ssize_t ret;
+	struct iovec vec[madvise_pages];
+
+	/*
+	 * Create a single large mapping. We will pick pages from this
+	 * mapping to advise on. This ensures we test non-contiguous iovecs.
+	 */
+	map = mmap(NULL, pagesize * 10, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (map == MAP_FAILED)
+		ksft_exit_skip("mmap failed, not enough memory.\n");
+
+	/* Fill the entire region with a known pattern. */
+	memset(map, 'A', pagesize * 10);
+
+	/*
+	 * Setup the iovec to point to 4 non-contiguous pages
+	 * within the mapping.
+	 */
+	vec[0].iov_base = &map[0 * pagesize];
+	vec[0].iov_len = pagesize;
+	vec[1].iov_base = &map[3 * pagesize];
+	vec[1].iov_len = pagesize;
+	vec[2].iov_base = &map[5 * pagesize];
+	vec[2].iov_len = pagesize;
+	vec[3].iov_base = &map[8 * pagesize];
+	vec[3].iov_len = pagesize;
+
+	ret = sys_process_madvise(PIDFD_SELF, vec, madvise_pages, MADV_DONTNEED,
+				  0);
+	if (ret == -1 && errno == EPERM)
+		ksft_exit_skip(
+			"process_madvise() unsupported or permission denied, try running as root.\n");
+	else if (errno == EINVAL)
+		ksft_exit_skip(
+			"process_madvise() unsupported or parameter invalid, please check arguments.\n");
+
+	/* The call should succeed and report the total bytes processed. */
+	ASSERT_EQ(ret, madvise_pages * pagesize);
+
+	/* Check that advised pages are now zero. */
+	for (int i = 0; i < madvise_pages; i++) {
+		char *advised_page = (char *)vec[i].iov_base;
+
+		/* Access should be successful (kernel provides a new page). */
+		ASSERT_TRUE(try_read_buf(advised_page));
+		/* Content must be 0, not 'A'. */
+		ASSERT_EQ(*advised_page, 0);
+	}
+
+	/* Check that an un-advised page in between is still 'A'. */
+	char *unadvised_page = &map[1 * pagesize];
+
+	ASSERT_TRUE(try_read_buf(unadvised_page));
+	for (int i = 0; i < pagesize; i++)
+		ASSERT_EQ(unadvised_page[i], 'A');
+
+	/* Cleanup. */
+	ASSERT_EQ(munmap(map, pagesize * 10), 0);
+}
+
+static long get_smaps_anon_huge_pages(pid_t pid, void *addr)
+{
+	char smaps_path[64];
+	char *line = NULL;
+	unsigned long start, end;
+	long anon_huge_kb;
+	size_t len;
+	FILE *f;
+	bool in_vma;
+
+	in_vma = false;
+	snprintf(smaps_path, sizeof(smaps_path), "/proc/%d/smaps", pid);
+	f = fopen(smaps_path, "r");
+	if (!f)
+		return -1;
+
+	while (getline(&line, &len, f) != -1) {
+		/* Check if the line describes a VMA range */
+		if (sscanf(line, "%lx-%lx", &start, &end) == 2) {
+			if ((unsigned long)addr >= start &&
+			    (unsigned long)addr < end)
+				in_vma = true;
+			else
+				in_vma = false;
+			continue;
+		}
+
+		/* If we are in the correct VMA, look for the AnonHugePages field */
+		if (in_vma &&
+		    sscanf(line, "AnonHugePages: %ld kB", &anon_huge_kb) == 1)
+			break;
+	}
+
+	free(line);
+	fclose(f);
+
+	return (anon_huge_kb > 0) ? (anon_huge_kb * 1024) : 0;
+}
+
+static bool is_thp_always(void)
+{
+	const char *path = "/sys/kernel/mm/transparent_hugepage/enabled";
+	char buf[32];
+	FILE *f = fopen(path, "r");
+
+	if (!f)
+		return false;
+
+	if (fgets(buf, sizeof(buf), f))
+		if (strstr(buf, "[always]")) {
+			fclose(f);
+			return true;
+		}
+
+	fclose(f);
+	return false;
+}
+
+/**
+ * TEST_F(process_madvise, remote_collapse)
+ *
+ * This test deterministically validates process_madvise() with MADV_COLLAPSE
+ * on a remote process, other advices are difficult to verify reliably.
+ *
+ * The test verifies that a memory region in a child process, initially
+ * backed by small pages, can be collapsed into a Transparent Huge Page by a
+ * request from the parent. The result is verified by parsing the child's
+ * /proc/<pid>/smaps file.
+ */
+TEST_F(process_madvise, remote_collapse)
+{
+	const unsigned long pagesize = (unsigned long)sysconf(_SC_PAGESIZE);
+	int pidfd;
+	long huge_page_size;
+	int pipe_info[2];
+	ssize_t ret;
+	struct iovec vec;
+
+	struct child_info {
+		pid_t pid;
+		void *map_addr;
+	} info;
+
+	huge_page_size = default_huge_page_size();
+	if (huge_page_size <= 0)
+		ksft_exit_skip("Could not determine a valid huge page size.\n");
+
+	ASSERT_EQ(pipe(pipe_info), 0);
+
+	self->child_pid = fork();
+	ASSERT_NE(self->child_pid, -1);
+
+	if (self->child_pid == 0) {
+		char *map;
+		size_t map_size = 2 * huge_page_size;
+
+		close(pipe_info[0]);
+
+		map = mmap(NULL, map_size, PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+		ASSERT_NE(map, MAP_FAILED);
+
+		/* Fault in as small pages */
+		for (size_t i = 0; i < map_size; i += pagesize)
+			map[i] = 'A';
+
+		/* Send info and pause */
+		info.pid = getpid();
+		info.map_addr = map;
+		ret = write(pipe_info[1], &info, sizeof(info));
+		ASSERT_EQ(ret, sizeof(info));
+		close(pipe_info[1]);
+
+		pause();
+		exit(0);
+	}
+
+	close(pipe_info[1]);
+
+	/* Receive child info */
+	ret = read(pipe_info[0], &info, sizeof(info));
+	if (ret <= 0) {
+		waitpid(self->child_pid, NULL, 0);
+		ksft_exit_skip("Failed to read child info from pipe.\n");
+	}
+	ASSERT_EQ(ret, sizeof(info));
+	close(pipe_info[0]);
+	self->child_pid = info.pid;
+
+	pidfd = syscall(__NR_pidfd_open, self->child_pid, 0);
+	ASSERT_GE(pidfd, 0);
+
+	vec.iov_base = info.map_addr;
+	vec.iov_len = huge_page_size;
+
+	if (is_thp_always()) {
+		long initial_huge_pages;
+
+		/*
+		 * When THP is 'always', khugepaged may pre-emptively
+		 * collapse the pages before our MADV_COLLAPSE call. Check
+		 * the initial state to provide a more accurate test report.
+		 */
+		initial_huge_pages =
+			get_smaps_anon_huge_pages(self->child_pid, info.map_addr);
+
+		if (initial_huge_pages == 2 * huge_page_size) {
+			/*
+			 * The pages were already collapsed by khugepaged.
+			 * The test goal narrows to verifying that MADV_COLLAPSE
+			 * correctly returns success on an already-collapsed
+			 * region, as documented.
+			 */
+			ksft_test_result_skip(
+				"THP is 'always' and pages were pre-collapsed; verifying success on already-collapsed page.\n");
+
+			ret = sys_process_madvise(pidfd, &vec, 1, MADV_COLLAPSE,
+						  0);
+			ASSERT_EQ(ret, huge_page_size);
+			goto cleanup;
+		}
+
+		/*
+		 * Pages are still small, creating a race between our call
+		 * and khugepaged. This is the main test scenario for 'always'.
+		 */
+		ret = sys_process_madvise(pidfd, &vec, 1, MADV_COLLAPSE, 0);
+
+		if (ret == -1) {
+			/*
+			 * MADV_COLLAPSE lost the race to khugepaged, which
+			 * likely held a page lock. The kernel correctly
+			 * reports this temporary contention with EAGAIN.
+			 */
+			if (errno == EAGAIN) {
+				ksft_test_result_skip(
+					"THP is 'always', process_madvise returned EAGAIN due to an expected race with khugepaged.\n");
+			} else {
+				ksft_test_result_fail(
+					"process_madvise failed with unexpected errno %d in 'always' mode.\n",
+					errno);
+			}
+			goto cleanup;
+		}
+
+		/*
+		 * MADV_COLLAPSE won the race and successfully collapsed
+		 * the pages. Verify the final state.
+		 */
+		ASSERT_EQ(ret, huge_page_size);
+		ASSERT_EQ(get_smaps_anon_huge_pages(self->child_pid, info.map_addr),
+			  huge_page_size);
+		ksft_test_result_pass(
+			"THP is 'always', MADV_COLLAPSE won race and collapsed pages.\n");
+		goto cleanup;
+	}
+
+	/*
+	 * THP is 'madvise' or 'never'. No race is expected with khugepaged.
+	 * We can perform a straightforward state-change verification.
+	 */
+	ASSERT_EQ(get_smaps_anon_huge_pages(self->child_pid, info.map_addr), 0);
+
+	ret = sys_process_madvise(pidfd, &vec, 1, MADV_COLLAPSE, 0);
+	if (ret == -1) {
+		if (errno == EINVAL)
+			ksft_exit_skip(
+				"PROCESS_MADV_ADVISE is not supported.\n");
+		else if (errno == EPERM)
+			ksft_exit_skip(
+				"No process_madvise() permissions, try running as root.\n");
+		goto cleanup;
+	}
+	ASSERT_EQ(ret, huge_page_size);
+
+	ASSERT_EQ(get_smaps_anon_huge_pages(self->child_pid, info.map_addr),
+		  huge_page_size);
+
+	ksft_test_result_pass(
+		"MADV_COLLAPSE successfully verified via smaps.\n");
+
+cleanup:
+	/* Cleanup */
+	kill(self->child_pid, SIGKILL);
+	waitpid(self->child_pid, NULL, 0);
+	if (pidfd >= 0)
+		close(pidfd);
+}
+
+/*
+ * Test process_madvise() with various invalid pidfds to ensure correct error
+ * handling. This includes negative fds, non-pidfd fds, and pidfds for
+ * processes that no longer exist.
+ */
+TEST_F(process_madvise, invalid_pidfd)
+{
+	struct iovec vec;
+	pid_t child_pid;
+	ssize_t ret;
+	int pidfd;
+
+	vec.iov_base = (void *)0x1234;
+	vec.iov_len = 4096;
+
+	/* Using an invalid fd number (-1) should fail with EBADF. */
+	ret = sys_process_madvise(-1, &vec, 1, MADV_DONTNEED, 0);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EBADF);
+
+	/*
+	 * Using a valid fd that is not a pidfd (e.g. stdin) should fail
+	 * with EBADF.
+	 */
+	ret = sys_process_madvise(STDIN_FILENO, &vec, 1, MADV_DONTNEED, 0);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EBADF);
+
+	/*
+	 * Using a pidfd for a process that has already exited should fail
+	 * with ESRCH.
+	 */
+	child_pid = fork();
+	ASSERT_NE(child_pid, -1);
+
+	if (child_pid == 0)
+		exit(0);
+
+	pidfd = syscall(__NR_pidfd_open, child_pid, 0);
+	ASSERT_GE(pidfd, 0);
+
+	/* Wait for the child to ensure it has terminated. */
+	waitpid(child_pid, NULL, 0);
+
+	ret = sys_process_madvise(pidfd, &vec, 1, MADV_DONTNEED, 0);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, ESRCH);
+	close(pidfd);
+}
+
+/*
+ * Test process_madvise() with an invalid flag value. Now we only support flag=0
+ * future we will use it support sync so reserve this test.
+ */
+TEST_F(process_madvise, flag)
+{
+	const unsigned long pagesize = (unsigned long)sysconf(_SC_PAGESIZE);
+	unsigned int invalid_flag;
+	struct iovec vec;
+	char *map;
+	ssize_t ret;
+
+	map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
+		   0);
+	if (map == MAP_FAILED)
+		ksft_exit_skip("mmap failed, not enough memory.\n");
+
+	vec.iov_base = map;
+	vec.iov_len = pagesize;
+
+	invalid_flag = 0x80000000;
+
+	ret = sys_process_madvise(PIDFD_SELF, &vec, 1, MADV_DONTNEED,
+				  invalid_flag);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EINVAL);
+
+	/* Cleanup. */
+	ASSERT_EQ(munmap(map, pagesize), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index dddd1dd8af14..84fb51902c3e 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -65,6 +65,8 @@ separated by spaces:
 	test pagemap_scan IOCTL
 - pfnmap
 	tests for VM_PFNMAP handling
+- process_madv
+	test process_madvise
 - cow
 	test copy-on-write semantics
 - thp
@@ -422,6 +424,9 @@ CATEGORY="hmm" run_test bash ./test_hmm.sh smoke
 # MADV_GUARD_INSTALL and MADV_GUARD_REMOVE tests
 CATEGORY="madv_guard" run_test ./guard-regions
 
+# PROCESS_MADVISE TEST
+CATEGORY="process_madv" run_test ./process_madv
+
 # MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
 CATEGORY="madv_populate" run_test ./madv_populate
 
diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c
index 5492e3f784df..85b209260e5a 100644
--- a/tools/testing/selftests/mm/vm_util.c
+++ b/tools/testing/selftests/mm/vm_util.c
@@ -20,6 +20,9 @@
 unsigned int __page_size;
 unsigned int __page_shift;
 
+volatile sig_atomic_t signal_jump_set;
+sigjmp_buf signal_jmp_buf;
+
 uint64_t pagemap_get_entry(int fd, char *start)
 {
 	const unsigned long pfn = (unsigned long)start / getpagesize();
@@ -524,3 +527,35 @@ int read_sysfs(const char *file_path, unsigned long *val)
 
 	return 0;
 }
+
+static void handle_fatal(int c)
+{
+	if (!signal_jump_set)
+		return;
+
+	siglongjmp(signal_jmp_buf, c);
+}
+
+void setup_sighandler(void)
+{
+	struct sigaction act = {
+		.sa_handler = &handle_fatal,
+		.sa_flags = SA_NODEFER,
+	};
+
+	sigemptyset(&act.sa_mask);
+	if (sigaction(SIGSEGV, &act, NULL))
+		ksft_exit_fail_perror("sigaction in setup");
+}
+
+void teardown_sighandler(void)
+{
+	struct sigaction act = {
+		.sa_handler = SIG_DFL,
+		.sa_flags = SA_NODEFER,
+	};
+
+	sigemptyset(&act.sa_mask);
+	if (sigaction(SIGSEGV, &act, NULL))
+		ksft_exit_fail_perror("sigaction in teardown");
+}
diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h
index b8136d12a0f8..6bc4177a2807 100644
--- a/tools/testing/selftests/mm/vm_util.h
+++ b/tools/testing/selftests/mm/vm_util.h
@@ -8,6 +8,8 @@
 #include <unistd.h> /* _SC_PAGESIZE */
 #include "../kselftest.h"
 #include <linux/fs.h>
+#include <setjmp.h>
+#include <signal.h>
 
 #define BIT_ULL(nr)                   (1ULL << (nr))
 #define PM_SOFT_DIRTY                 BIT_ULL(55)
@@ -61,6 +63,24 @@ static inline void skip_test_dodgy_fs(const char *op_name)
 	ksft_test_result_skip("%s failed with ENOENT. Filesystem might be buggy (9pfs?)\n", op_name);
 }
 
+/*
+ * Ignore the checkpatch warning, as per the C99 standard, section 7.14.1.1:
+ *
+ * "If the signal occurs other than as the result of calling the abort or raise
+ *  function, the behavior is undefined if the signal handler refers to any
+ *  object with static storage duration other than by assigning a value to an
+ *  object declared as volatile sig_atomic_t"
+ */
+extern volatile sig_atomic_t signal_jump_set;
+extern sigjmp_buf signal_jmp_buf;
+
+/*
+ * Ignore the checkpatch warning, we must read from x but don't want to do
+ * anything with it in order to trigger a read page fault. We therefore must use
+ * volatile to stop the compiler from optimising this away.
+ */
+#define FORCE_READ(x) (*(volatile typeof(x) *)x)
+
 uint64_t pagemap_get_entry(int fd, char *start);
 bool pagemap_is_softdirty(int fd, char *start);
 bool pagemap_is_swapped(int fd, char *start);
@@ -90,6 +110,8 @@ bool find_vma_procmap(struct procmap_fd *procmap, void *address);
 int close_procmap(struct procmap_fd *procmap);
 int write_sysfs(const char *file_path, unsigned long val);
 int read_sysfs(const char *file_path, unsigned long *val);
+void setup_sighandler(void);
+void teardown_sighandler(void);
 
 static inline int open_self_procmap(struct procmap_fd *procmap_out)
 {
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH v3] selftests/mm: add process_madvise() tests
  2025-07-09 12:51     ` Lorenzo Stoakes
@ 2025-07-10 11:29       ` wang lian
  0 siblings, 0 replies; 25+ messages in thread
From: wang lian @ 2025-07-10 11:29 UTC (permalink / raw)
  To: lorenzo.stoakes
  Cc: Liam.Howlett, akpm, brauner, david, gkwang, jannh, lianux.mm,
	linux-kernel, linux-kselftest, linux-mm, p1ucky0923, ryncsn, sj,
	vbabka, zijing.zhang, ziy

Hi Lorenzo,

On <Date of Lorenzo's email>, Lorenzo Stoakes wrote:
> Hi Wang,
>
> Please try to include the context of what you're replying to in your
> messages, reading the above I have to _just remember_ what Zi said, and I'm
> old so that's hard now ;)
>
> Please note that mm tests _must_ work without make headers_install being
> run.
>
> Your test must not rely upon those.

Thank you for the clear feedback and for pointing out these important rules.

You're absolutely right about including the context in my replies. My
apologies for the inconvenience this caused. SeongJae Park recently
recommended `hackmail` to me, and I am learning to use it to format
my emails correctly for the mailing list. I will be sure to get this
right going forward.


As this is one of my first contributions, your patience and clear
guidance, along with the help from everyone on the list, have been
incredibly encouraging. I have learned a great deal through this
process, and it has genuinely deepened my passion for this work.

Thanks again for all your help.

Best regards,
Wang Lian

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v3] selftests/mm: add process_madvise() tests
  2025-07-09 14:46     ` Zi Yan
  2025-07-10  8:42       ` Mark Brown
@ 2025-07-10 11:40       ` wang lian
  1 sibling, 0 replies; 25+ messages in thread
From: wang lian @ 2025-07-10 11:40 UTC (permalink / raw)
  To: ziy
  Cc: Liam.Howlett, akpm, brauner, david, gkwang, jannh, lianux.mm,
	linux-kernel, linux-kselftest, linux-mm, lorenzo.stoakes,
	p1ucky0923, ryncsn, shuah, sj, vbabka, zijing.zhang

Hi Zi,

On <Date of Zi's email>, Zi Yan wrote:
> On 9 Jul 2025, at 8:32, wang lian wrote:
>
>> Hi Zi Yan,
>> Thanks for testing the patch and reporting this build failure.
>> I don't have an arm64 environment readily available for testing, so I
>> appreciate you catching this. I suspect this is caused by missing or
>> older userspace headers in the cross-compilation toolchain.
>
> Right. My /usr/include/sys does not have pidfd.h. IMHO selftests
> should not rely on userspace headers, otherwise we cannot test
> latest kernel changes.
>
>> I will try to fix this in the next version. If the problem persists, a
>> good solution would be to manually define the syscall wrapper to avoid
>> the dependency on <sys/pidfd.h>.
>
> Based on what I see in other mm tests, the following patch fixes my
> compilation issue.
>
> [ ... patch snippet ... ]

Thank you very much for not only identifying the root cause but also
providing a concrete patch to fix the compilation issue. Your analysis
that selftests should be independent of userspace headers is spot on,
and this approach aligns perfectly with the feedback I've received.

I have integrated your suggested changes into my local tree and will
include them in the next version of the patch. I will also be sure
to add your "Suggested-by" tag in the commit message to properly
credit your contribution.

Your help has been invaluable.

Best regards,
Wang Lian

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4] selftests/mm: add process_madvise() tests
  2025-07-10 11:22 [PATCH v4] selftests/mm: add process_madvise() tests wang lian
@ 2025-07-10 13:42 ` Mark Brown
  2025-07-10 16:21   ` Zi Yan
  2025-07-11 12:19   ` [PATCH v3] " wang lian
  2025-07-10 16:57 ` [PATCH v4] " Lorenzo Stoakes
  1 sibling, 2 replies; 25+ messages in thread
From: Mark Brown @ 2025-07-10 13:42 UTC (permalink / raw)
  To: wang lian
  Cc: akpm, ziy, lorenzo.stoakes, david, sj, linux-kernel, linux-mm,
	linux-kselftest, shuah, Liam.Howlett, brauner, gkwang, jannh,
	p1ucky0923, ryncsn, vbabka, zijing.zhang

[-- Attachment #1: Type: text/plain, Size: 3115 bytes --]

On Thu, Jul 10, 2025 at 07:22:49PM +0800, wang lian wrote:

> Add tests for process_madvise(), focusing on verifying behavior under
> various conditions including valid usage and error cases.

> --- a/tools/testing/selftests/mm/guard-regions.c
> +++ b/tools/testing/selftests/mm/guard-regions.c

> -static void handle_fatal(int c)
> -{
> -	if (!signal_jump_set)
> -		return;
> -
> -	siglongjmp(signal_jmp_buf, c);
> -}

I see from looking later in the patch that you're factoring this out of
the guard regions test into vm_util.c so that it can be used by your new
test.  This is good and sensible but it's a bit surprising, especially
since your changelog only said you were adding a new test.  It would be
better to split this out into a separate refactoring patch that just
does the code motion, as covered in submitting-patches.rst it's better
if changes just do one thing.

> +#include <linux/pidfd.h>
> +#include <linux/uio.h>

Does this work without 'make headers_install' for the systems that were
affectd by missing headers?  Lorenzo mentioned that we shouldn't depend
on that for the mm tests (I'm not enthusiastic about that approach
myself, but if it's what mm needs).

> +	ret = read(pipe_info[0], &info, sizeof(info));
> +	if (ret <= 0) {
> +		waitpid(self->child_pid, NULL, 0);
> +		ksft_exit_skip("Failed to read child info from pipe.\n");
> +	}

If you're using the harness you should use SKIP() rather than the ksft
APIs for reporting test results.  Don't mix and match the result
reporting APIs, harness will call the ksft_ APIs appropriately for you.

> +			/*
> +			 * MADV_COLLAPSE lost the race to khugepaged, which
> +			 * likely held a page lock. The kernel correctly
> +			 * reports this temporary contention with EAGAIN.
> +			 */
> +			if (errno == EAGAIN) {
> +				ksft_test_result_skip(
> +					"THP is 'always', process_madvise returned EAGAIN due to an expected race with khugepaged.\n");
> +			} else {
> +				ksft_test_result_fail(
> +					"process_madvise failed with unexpected errno %d in 'always' mode.\n",
> +					errno);
> +			}

Similarly, to fail use an ASSERT or EXPECT.  Note also that when using
the ksft_ API for reporting results each test should report a consistent
test name as the string, if you want to report an error message print it
separately to the test result. 

This applies throughout the program.

> +/*
> + * Test process_madvise() with various invalid pidfds to ensure correct error
> + * handling. This includes negative fds, non-pidfd fds, and pidfds for
> + * processes that no longer exist.
> + */

This sounds like it should be a series of small tests rather than a
single omnibus test, that'd result in clearer error reporting from test
frameworks since they will say which operation failed directly rather
than having to look at the logs then match them to the source.

> +	pidfd = syscall(__NR_pidfd_open, child_pid, 0);
> +	ASSERT_GE(pidfd, 0);

This is particularly the case given the use of ASSERTs, we'll not report
any issues other than the first one we hit.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4] selftests/mm: add process_madvise() tests
  2025-07-10 13:42 ` Mark Brown
@ 2025-07-10 16:21   ` Zi Yan
  2025-07-11  8:05     ` Mark Brown
  2025-07-11 12:19   ` [PATCH v3] " wang lian
  1 sibling, 1 reply; 25+ messages in thread
From: Zi Yan @ 2025-07-10 16:21 UTC (permalink / raw)
  To: Mark Brown
  Cc: wang lian, akpm, lorenzo.stoakes, david, sj, linux-kernel,
	linux-mm, linux-kselftest, shuah, Liam.Howlett, brauner, gkwang,
	jannh, p1ucky0923, ryncsn, vbabka, zijing.zhang

On 10 Jul 2025, at 9:42, Mark Brown wrote:

> On Thu, Jul 10, 2025 at 07:22:49PM +0800, wang lian wrote:
>
>> Add tests for process_madvise(), focusing on verifying behavior under
>> various conditions including valid usage and error cases.
>
>> --- a/tools/testing/selftests/mm/guard-regions.c
>> +++ b/tools/testing/selftests/mm/guard-regions.c
>
>> -static void handle_fatal(int c)
>> -{
>> -	if (!signal_jump_set)
>> -		return;
>> -
>> -	siglongjmp(signal_jmp_buf, c);
>> -}
>
> I see from looking later in the patch that you're factoring this out of
> the guard regions test into vm_util.c so that it can be used by your new
> test.  This is good and sensible but it's a bit surprising, especially
> since your changelog only said you were adding a new test.  It would be
> better to split this out into a separate refactoring patch that just
> does the code motion, as covered in submitting-patches.rst it's better
> if changes just do one thing.
>
>> +#include <linux/pidfd.h>
>> +#include <linux/uio.h>
>
> Does this work without 'make headers_install' for the systems that were
> affectd by missing headers?  Lorenzo mentioned that we shouldn't depend
> on that for the mm tests (I'm not enthusiastic about that approach
> myself, but if it's what mm needs).

No. “make headers_install” is still needed. I tried to get it compiled
without it but failed. It seems that a lot of files will need to be
copied to tools/include from “make headers”.

Best Regards,
Yan, Zi

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v3] selftests/mm: add process_madvise() tests
  2025-07-10  8:42       ` Mark Brown
@ 2025-07-10 16:28         ` Zi Yan
  2025-07-11  8:34           ` Mark Brown
  0 siblings, 1 reply; 25+ messages in thread
From: Zi Yan @ 2025-07-10 16:28 UTC (permalink / raw)
  To: Mark Brown
  Cc: wang lian, Liam.Howlett, akpm, brauner, david, gkwang, jannh,
	linux-kernel, linux-kselftest, linux-mm, lorenzo.stoakes,
	p1ucky0923, ryncsn, shuah, sj, vbabka, zijing.zhang

On 10 Jul 2025, at 4:42, Mark Brown wrote:

> On Wed, Jul 09, 2025 at 10:46:07AM -0400, Zi Yan wrote:
>> On 9 Jul 2025, at 8:32, wang lian wrote:
>
>>> Thanks for testing the patch and reporting this build failure.
>>> I don't have an arm64 environment readily available for testing, so I
>>> appreciate you catching this. I suspect this is caused by missing or
>>> older userspace headers in the cross-compilation toolchain.
>
>> Right. My /usr/include/sys does not have pidfd.h. IMHO selftests
>> should not rely on userspace headers, otherwise we cannot test
>> latest kernel changes.
>
> That's not realistic, we need to be able to use things like libc and for
> many areas you'd just end up copying or reimplmenenting the userspace
> libraries.  There's some concerns for sure, for example we used to have
> hideous problems with the BPF tests needing extremely recent versions of
> LLVM which weren't available from distros, but just saying nothing from
> userspace is a big blocker to getting things done.  With some things
> they're widely enough available that you can just assume they're there,
> with other things they're less standard so we need build time checks.

Sure. For libraries like libc, it is unrealistic to not rely on it.
But for header files, are we expecting to install any kernel headers
to the running system to get selftests compiled? If we are testing
RC versions and header files might change before the actual release,
that would pollute the system header files, right?

>
> OTOH in a case like this where we can just refer directly to a kernel
> header for some constants or structs then it does make sense to use the
> kernel headers, or in other cases where we're testing things that are

That is exactly my point above.

> intended to be controlled by libc it makes sense to use nolibc avoid
> conflicting with libc.


Best Regards,
Yan, Zi

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4] selftests/mm: add process_madvise() tests
  2025-07-10 11:22 [PATCH v4] selftests/mm: add process_madvise() tests wang lian
  2025-07-10 13:42 ` Mark Brown
@ 2025-07-10 16:57 ` Lorenzo Stoakes
  2025-07-11  8:11   ` Mark Brown
  2025-07-11 11:16   ` wang lian
  1 sibling, 2 replies; 25+ messages in thread
From: Lorenzo Stoakes @ 2025-07-10 16:57 UTC (permalink / raw)
  To: wang lian
  Cc: akpm, ziy, david, sj, linux-kernel, linux-mm, linux-kselftest,
	shuah, broonie, Liam.Howlett, brauner, gkwang, jannh, p1ucky0923,
	ryncsn, vbabka, zijing.zhang

This series doesn't apply against mm-new currently, seems some conflict on
vm_util.c. So unable to test.

Am reviewing based on code, will have to double-check functionality against
respin.

On Thu, Jul 10, 2025 at 07:22:49PM +0800, wang lian wrote:
> Add tests for process_madvise(), focusing on verifying behavior under
> various conditions including valid usage and error cases.
>
> Signed-off-by: wang lian <lianux.mm@gmail.com>
> Suggested-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Suggested-by: David Hildenbrand <david@redhat.com>
> Suggested-by: Zi Yan <ziy@nvidia.com>
> Acked-by: SeongJae Park <sj@kernel.org>
> ---
> Changelog v4:
> - Refine resource cleanup logic in test teardown to be more robust.
> - Improve remote_collapse test to correctly handle different THP
>   (Transparent Huge Page) policies ('always', 'madvise', 'never'),
>   including handling race conditions with khugepaged.
> - Resolve build errors
>
> Changelog v3: https://lore.kernel.org/lkml/20250703044326.65061-1-lianux.mm@gmail.com/
> - Rebased onto the latest mm-stable branch to ensure clean application.
> - Refactor common signal handling logic into vm_util to reduce code duplication.
> - Improve test robustness and diagnostics based on community feedback.
> - Address minor code style and script corrections.
>
> Changelog v2: https://lore.kernel.org/lkml/20250630140957.4000-1-lianux.mm@gmail.com/
> - Drop MADV_DONTNEED tests based on feedback.
> - Focus solely on process_madvise() syscall.
> - Improve error handling and structure.
> - Add future-proof flag test.
> - Style and comment cleanups.
>
> -V1: https://lore.kernel.org/lkml/20250621133003.4733-1-lianux.mm@gmail.com/
>
>  tools/testing/selftests/mm/.gitignore      |   1 +
>  tools/testing/selftests/mm/Makefile        |   1 +
>  tools/testing/selftests/mm/guard-regions.c |  51 ---
>  tools/testing/selftests/mm/process_madv.c  | 447 +++++++++++++++++++++
>  tools/testing/selftests/mm/run_vmtests.sh  |   5 +
>  tools/testing/selftests/mm/vm_util.c       |  35 ++
>  tools/testing/selftests/mm/vm_util.h       |  22 +
>  7 files changed, 511 insertions(+), 51 deletions(-)
>  create mode 100644 tools/testing/selftests/mm/process_madv.c
>
> diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
> index 824266982aa3..95bd9c6ead9e 100644
> --- a/tools/testing/selftests/mm/.gitignore
> +++ b/tools/testing/selftests/mm/.gitignore
> @@ -25,6 +25,7 @@ pfnmap
>  protection_keys
>  protection_keys_32
>  protection_keys_64
> +process_madv
>  madv_populate
>  uffd-stress
>  uffd-unit-tests
> diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
> index ae6f994d3add..d13b3cef2a2b 100644
> --- a/tools/testing/selftests/mm/Makefile
> +++ b/tools/testing/selftests/mm/Makefile
> @@ -85,6 +85,7 @@ TEST_GEN_FILES += mseal_test
>  TEST_GEN_FILES += on-fault-limit
>  TEST_GEN_FILES += pagemap_ioctl
>  TEST_GEN_FILES += pfnmap
> +TEST_GEN_FILES += process_madv
>  TEST_GEN_FILES += thuge-gen
>  TEST_GEN_FILES += transhuge-stress
>  TEST_GEN_FILES += uffd-stress
> diff --git a/tools/testing/selftests/mm/guard-regions.c b/tools/testing/selftests/mm/guard-regions.c
> index 93af3d3760f9..4cf101b0fe5e 100644
> --- a/tools/testing/selftests/mm/guard-regions.c
> +++ b/tools/testing/selftests/mm/guard-regions.c
> @@ -9,8 +9,6 @@
>  #include <linux/limits.h>
>  #include <linux/userfaultfd.h>
>  #include <linux/fs.h>
> -#include <setjmp.h>
> -#include <signal.h>
>  #include <stdbool.h>
>  #include <stdio.h>
>  #include <stdlib.h>
> @@ -24,24 +22,6 @@
>
>  #include "../pidfd/pidfd.h"
>
> -/*
> - * Ignore the checkpatch warning, as per the C99 standard, section 7.14.1.1:
> - *
> - * "If the signal occurs other than as the result of calling the abort or raise
> - *  function, the behavior is undefined if the signal handler refers to any
> - *  object with static storage duration other than by assigning a value to an
> - *  object declared as volatile sig_atomic_t"
> - */
> -static volatile sig_atomic_t signal_jump_set;
> -static sigjmp_buf signal_jmp_buf;

Please keep these static.

> -
> -/*
> - * Ignore the checkpatch warning, we must read from x but don't want to do
> - * anything with it in order to trigger a read page fault. We therefore must use
> - * volatile to stop the compiler from optimising this away.
> - */
> -#define FORCE_READ(x) (*(volatile typeof(x) *)x)

This one is ok to put in a shared header.

> -
>  /*
>   * How is the test backing the mapping being tested?
>   */
> @@ -120,14 +100,6 @@ static int userfaultfd(int flags)
>  	return syscall(SYS_userfaultfd, flags);
>  }
>
> -static void handle_fatal(int c)
> -{
> -	if (!signal_jump_set)
> -		return;
> -
> -	siglongjmp(signal_jmp_buf, c);
> -}
> -
>  static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec,
>  				   size_t n, int advice, unsigned int flags)
>  {
> @@ -180,29 +152,6 @@ static bool try_read_write_buf(char *ptr)
>  	return try_read_buf(ptr) && try_write_buf(ptr);
>  }
>
> -static void setup_sighandler(void)
> -{
> -	struct sigaction act = {
> -		.sa_handler = &handle_fatal,
> -		.sa_flags = SA_NODEFER,
> -	};
> -
> -	sigemptyset(&act.sa_mask);
> -	if (sigaction(SIGSEGV, &act, NULL))
> -		ksft_exit_fail_perror("sigaction");
> -}
> -
> -static void teardown_sighandler(void)
> -{
> -	struct sigaction act = {
> -		.sa_handler = SIG_DFL,
> -		.sa_flags = SA_NODEFER,
> -	};
> -
> -	sigemptyset(&act.sa_mask);
> -	sigaction(SIGSEGV, &act, NULL);
> -}
> -

I see what you're doing here, but I really don't feel comfortable with
having different tests share these signal-y variables. This stuff is fiddly
as it is.

Also let's please not have 'setup' or 'teardown' functions in
vm_util.h/vm_util.c - the util still is meant to be there for abstractions,
not test implementation details.

Also note this signal setup stuff is basically customised to the usecase
here - so overall I don't think you should abstract any of this.

Yes it's somewhat duplicative, but these are tests, that's ok.

>  static int open_file(const char *prefix, char *path)
>  {
>  	int fd;
> diff --git a/tools/testing/selftests/mm/process_madv.c b/tools/testing/selftests/mm/process_madv.c
> new file mode 100644
> index 000000000000..7d7509486d46
> --- /dev/null
> +++ b/tools/testing/selftests/mm/process_madv.c
> @@ -0,0 +1,447 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +
> +#define _GNU_SOURCE
> +#include "../kselftest_harness.h"
> +#include <errno.h>
> +#include <setjmp.h>
> +#include <signal.h>
> +#include <stdbool.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <linux/mman.h>
> +#include <sys/syscall.h>
> +#include <unistd.h>
> +#include <sched.h>
> +#include <linux/pidfd.h>

So you've seen the discussion around this.

Basically John has provided an excellent abstraction layer for this kind of
thing in tools/include. This _should_ be automatically available, so all
you _should_ need to do is:

cp include/uapi/linux/pidfd.h tools/include/uapi/linux/pidfd.h

However, the pidfd tests already have a stub in so you can alternatively
use:

#include "../pidfd/pidfd.h"

As is done in guard-regions.c.

> +#include <linux/uio.h>
> +#include "vm_util.h"
> +
> +FIXTURE(process_madvise)
> +{
> +	int pidfd;
> +	int flag;
> +	pid_t child_pid;
> +};
> +
> +FIXTURE_SETUP(process_madvise)
> +{
> +	self->pidfd = PIDFD_SELF;
> +	self->flag = 0;
> +	self->child_pid = -1;
> +	setup_sighandler();
> +};
> +
> +FIXTURE_TEARDOWN_PARENT(process_madvise)
> +{
> +	teardown_sighandler();
> +	if (self->child_pid > 0) {
> +		kill(self->child_pid, SIGKILL);
> +		waitpid(self->child_pid, NULL, 0);
> +	}
> +}
> +
> +static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec,
> +				   size_t vlen, int advice, unsigned int flags)
> +{
> +	return syscall(__NR_process_madvise, pidfd, iovec, vlen, advice, flags);
> +}
> +
> +/*
> + * Enable our signal catcher and try to read the specified buffer. The
> + * return value indicates whether the read succeeds without a fatal
> + * signal.
> + */
> +static bool try_read_buf(char *ptr)
> +{
> +	bool failed;
> +
> +	/* Tell signal handler to jump back here on fatal signal. */
> +	signal_jump_set = true;
> +	/* If a fatal signal arose, we will jump back here and failed is set. */
> +	failed = sigsetjmp(signal_jmp_buf, 0) != 0;
> +
> +	if (!failed)
> +		FORCE_READ(ptr);
> +
> +	signal_jump_set = false;
> +	return !failed;
> +}

At no point do you ever assert this false nor would you in any sane
situation get a page fault on anything you're testing, so I suggest you
just drop this + all related checks.

> +
> +TEST_F(process_madvise, basic)
> +{
> +	const unsigned long pagesize = (unsigned long)sysconf(_SC_PAGESIZE);

You should just put pagesize in self + get on setup. You can always do
something like:

	const unsigned long pagesize = self->pagesize;

Here for brevity after that.

> +	const int madvise_pages = 4;
> +	char *map;
> +	ssize_t ret;
> +	struct iovec vec[madvise_pages];
> +
> +	/*
> +	 * Create a single large mapping. We will pick pages from this
> +	 * mapping to advise on. This ensures we test non-contiguous iovecs.
> +	 */
> +	map = mmap(NULL, pagesize * 10, PROT_READ | PROT_WRITE,
> +		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> +	if (map == MAP_FAILED)
> +		ksft_exit_skip("mmap failed, not enough memory.\n");
> +
> +	/* Fill the entire region with a known pattern. */
> +	memset(map, 'A', pagesize * 10);
> +
> +	/*
> +	 * Setup the iovec to point to 4 non-contiguous pages
> +	 * within the mapping.
> +	 */
> +	vec[0].iov_base = &map[0 * pagesize];
> +	vec[0].iov_len = pagesize;
> +	vec[1].iov_base = &map[3 * pagesize];
> +	vec[1].iov_len = pagesize;
> +	vec[2].iov_base = &map[5 * pagesize];
> +	vec[2].iov_len = pagesize;
> +	vec[3].iov_base = &map[8 * pagesize];
> +	vec[3].iov_len = pagesize;
> +
> +	ret = sys_process_madvise(PIDFD_SELF, vec, madvise_pages, MADV_DONTNEED,
> +				  0);
> +	if (ret == -1 && errno == EPERM)
> +		ksft_exit_skip(
> +			"process_madvise() unsupported or permission denied, try running as root.\n");

I think you can use the SKIP() macro here.

> +	else if (errno == EINVAL)
> +		ksft_exit_skip(
> +			"process_madvise() unsupported or parameter invalid, please check arguments.\n");

Isn't this latter one indicative of a bug?

> +
> +	/* The call should succeed and report the total bytes processed. */
> +	ASSERT_EQ(ret, madvise_pages * pagesize);
> +
> +	/* Check that advised pages are now zero. */
> +	for (int i = 0; i < madvise_pages; i++) {
> +		char *advised_page = (char *)vec[i].iov_base;
> +
> +		/* Access should be successful (kernel provides a new page). */
> +		ASSERT_TRUE(try_read_buf(advised_page));

This is a useless check really. We know page faulting works :)

> +		/* Content must be 0, not 'A'. */
> +		ASSERT_EQ(*advised_page, 0);

This is not clear, you're checking first byte of each page the below would
be clearer:

	ASSERT_EQ(advised_page[0], '\0');

> +	}
> +
> +	/* Check that an un-advised page in between is still 'A'. */
> +	char *unadvised_page = &map[1 * pagesize];
> +
> +	ASSERT_TRUE(try_read_buf(unadvised_page));

I don't see the point in using this. We know page faulting works.

> +	for (int i = 0; i < pagesize; i++)
> +		ASSERT_EQ(unadvised_page[i], 'A');
> +
> +	/* Cleanup. */
> +	ASSERT_EQ(munmap(map, pagesize * 10), 0);
> +}
> +
> +static long get_smaps_anon_huge_pages(pid_t pid, void *addr)
> +{
> +	char smaps_path[64];
> +	char *line = NULL;
> +	unsigned long start, end;
> +	long anon_huge_kb;
> +	size_t len;
> +	FILE *f;
> +	bool in_vma;
> +
> +	in_vma = false;
> +	snprintf(smaps_path, sizeof(smaps_path), "/proc/%d/smaps", pid);
> +	f = fopen(smaps_path, "r");
> +	if (!f)
> +		return -1;
> +
> +	while (getline(&line, &len, f) != -1) {
> +		/* Check if the line describes a VMA range */
> +		if (sscanf(line, "%lx-%lx", &start, &end) == 2) {
> +			if ((unsigned long)addr >= start &&
> +			    (unsigned long)addr < end)
> +				in_vma = true;
> +			else
> +				in_vma = false;
> +			continue;
> +		}
> +
> +		/* If we are in the correct VMA, look for the AnonHugePages field */
> +		if (in_vma &&
> +		    sscanf(line, "AnonHugePages: %ld kB", &anon_huge_kb) == 1)
> +			break;
> +	}
> +
> +	free(line);
> +	fclose(f);
> +
> +	return (anon_huge_kb > 0) ? (anon_huge_kb * 1024) : 0;
> +}

This seems like a lot of effort to check something that's pretty unreliable...

> +
> +static bool is_thp_always(void)
> +{
> +	const char *path = "/sys/kernel/mm/transparent_hugepage/enabled";
> +	char buf[32];
> +	FILE *f = fopen(path, "r");
> +
> +	if (!f)
> +		return false;
> +
> +	if (fgets(buf, sizeof(buf), f))
> +		if (strstr(buf, "[always]")) {
> +			fclose(f);
> +			return true;
> +		}
> +
> +	fclose(f);
> +	return false;
> +}
> +
> +/**
> + * TEST_F(process_madvise, remote_collapse)

We don't need kernel doc style comments in tests :) please just use normal
comments.

> + *
> + * This test deterministically validates process_madvise() with MADV_COLLAPSE
> + * on a remote process, other advices are difficult to verify reliably.
> + *
> + * The test verifies that a memory region in a child process, initially
> + * backed by small pages, can be collapsed into a Transparent Huge Page by a
> + * request from the parent. The result is verified by parsing the child's
> + * /proc/<pid>/smaps file.
> + */

This is clever and you've put a lot of effort in, but this just seems
absolutely prone to flaking and you're essentially testing something that's
highly automated.

I think you're also going way outside of the realms of testing
process_madvise() and are getting into testing essentially MADV_COLLAPSE
here.

We have to try to keep the test specific to what it is you're testing -
which is process_madvise() itself.

So for me, and I realise you've put a ton of work into this and I'm really
sorry to say it, I think you should drop this specific test.

For me simply testing the remote MADV_DONTNEED is enough.

> +TEST_F(process_madvise, remote_collapse)
> +{
> +	const unsigned long pagesize = (unsigned long)sysconf(_SC_PAGESIZE);
> +	int pidfd;
> +	long huge_page_size;
> +	int pipe_info[2];
> +	ssize_t ret;
> +	struct iovec vec;
> +
> +	struct child_info {
> +		pid_t pid;
> +		void *map_addr;
> +	} info;
> +
> +	huge_page_size = default_huge_page_size();
> +	if (huge_page_size <= 0)
> +		ksft_exit_skip("Could not determine a valid huge page size.\n");
> +
> +	ASSERT_EQ(pipe(pipe_info), 0);
> +
> +	self->child_pid = fork();
> +	ASSERT_NE(self->child_pid, -1);
> +
> +	if (self->child_pid == 0) {
> +		char *map;
> +		size_t map_size = 2 * huge_page_size;
> +
> +		close(pipe_info[0]);
> +
> +		map = mmap(NULL, map_size, PROT_READ | PROT_WRITE,
> +			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> +		ASSERT_NE(map, MAP_FAILED);
> +
> +		/* Fault in as small pages */
> +		for (size_t i = 0; i < map_size; i += pagesize)
> +			map[i] = 'A';
> +
> +		/* Send info and pause */
> +		info.pid = getpid();
> +		info.map_addr = map;
> +		ret = write(pipe_info[1], &info, sizeof(info));
> +		ASSERT_EQ(ret, sizeof(info));
> +		close(pipe_info[1]);
> +
> +		pause();
> +		exit(0);
> +	}
> +
> +	close(pipe_info[1]);
> +
> +	/* Receive child info */
> +	ret = read(pipe_info[0], &info, sizeof(info));
> +	if (ret <= 0) {
> +		waitpid(self->child_pid, NULL, 0);
> +		ksft_exit_skip("Failed to read child info from pipe.\n");
> +	}
> +	ASSERT_EQ(ret, sizeof(info));
> +	close(pipe_info[0]);
> +	self->child_pid = info.pid;
> +
> +	pidfd = syscall(__NR_pidfd_open, self->child_pid, 0);
> +	ASSERT_GE(pidfd, 0);
> +
> +	vec.iov_base = info.map_addr;
> +	vec.iov_len = huge_page_size;
> +
> +	if (is_thp_always()) {
> +		long initial_huge_pages;
> +
> +		/*
> +		 * When THP is 'always', khugepaged may pre-emptively
> +		 * collapse the pages before our MADV_COLLAPSE call. Check
> +		 * the initial state to provide a more accurate test report.
> +		 */
> +		initial_huge_pages =
> +			get_smaps_anon_huge_pages(self->child_pid, info.map_addr);
> +
> +		if (initial_huge_pages == 2 * huge_page_size) {
> +			/*
> +			 * The pages were already collapsed by khugepaged.
> +			 * The test goal narrows to verifying that MADV_COLLAPSE
> +			 * correctly returns success on an already-collapsed
> +			 * region, as documented.
> +			 */
> +			ksft_test_result_skip(
> +				"THP is 'always' and pages were pre-collapsed; verifying success on already-collapsed page.\n");
> +
> +			ret = sys_process_madvise(pidfd, &vec, 1, MADV_COLLAPSE,
> +						  0);
> +			ASSERT_EQ(ret, huge_page_size);
> +			goto cleanup;
> +		}

Yeah this is asking for a flake. khugepaged can operate at any time.

> +
> +		/*
> +		 * Pages are still small, creating a race between our call
> +		 * and khugepaged. This is the main test scenario for 'always'.
> +		 */
> +		ret = sys_process_madvise(pidfd, &vec, 1, MADV_COLLAPSE, 0);
> +
> +		if (ret == -1) {
> +			/*
> +			 * MADV_COLLAPSE lost the race to khugepaged, which
> +			 * likely held a page lock. The kernel correctly
> +			 * reports this temporary contention with EAGAIN.
> +			 */
> +			if (errno == EAGAIN) {
> +				ksft_test_result_skip(
> +					"THP is 'always', process_madvise returned EAGAIN due to an expected race with khugepaged.\n");
> +			} else {
> +				ksft_test_result_fail(
> +					"process_madvise failed with unexpected errno %d in 'always' mode.\n",
> +					errno);
> +			}
> +			goto cleanup;
> +		}
> +
> +		/*
> +		 * MADV_COLLAPSE won the race and successfully collapsed
> +		 * the pages. Verify the final state.
> +		 */
> +		ASSERT_EQ(ret, huge_page_size);
> +		ASSERT_EQ(get_smaps_anon_huge_pages(self->child_pid, info.map_addr),
> +			  huge_page_size);
> +		ksft_test_result_pass(
> +			"THP is 'always', MADV_COLLAPSE won race and collapsed pages.\n");
> +		goto cleanup;
> +	}
> +
> +	/*
> +	 * THP is 'madvise' or 'never'. No race is expected with khugepaged.
> +	 * We can perform a straightforward state-change verification.
> +	 */
> +	ASSERT_EQ(get_smaps_anon_huge_pages(self->child_pid, info.map_addr), 0);
> +
> +	ret = sys_process_madvise(pidfd, &vec, 1, MADV_COLLAPSE, 0);
> +	if (ret == -1) {
> +		if (errno == EINVAL)
> +			ksft_exit_skip(
> +				"PROCESS_MADV_ADVISE is not supported.\n");
> +		else if (errno == EPERM)
> +			ksft_exit_skip(
> +				"No process_madvise() permissions, try running as root.\n");
> +		goto cleanup;
> +	}
> +	ASSERT_EQ(ret, huge_page_size);
> +
> +	ASSERT_EQ(get_smaps_anon_huge_pages(self->child_pid, info.map_addr),
> +		  huge_page_size);
> +
> +	ksft_test_result_pass(
> +		"MADV_COLLAPSE successfully verified via smaps.\n");
> +
> +cleanup:
> +	/* Cleanup */
> +	kill(self->child_pid, SIGKILL);
> +	waitpid(self->child_pid, NULL, 0);
> +	if (pidfd >= 0)
> +		close(pidfd);
> +}
> +
> +/*
> + * Test process_madvise() with various invalid pidfds to ensure correct error
> + * handling. This includes negative fds, non-pidfd fds, and pidfds for
> + * processes that no longer exist.
> + */
> +TEST_F(process_madvise, invalid_pidfd)
> +{
> +	struct iovec vec;
> +	pid_t child_pid;
> +	ssize_t ret;
> +	int pidfd;
> +
> +	vec.iov_base = (void *)0x1234;
> +	vec.iov_len = 4096;
> +
> +	/* Using an invalid fd number (-1) should fail with EBADF. */
> +	ret = sys_process_madvise(-1, &vec, 1, MADV_DONTNEED, 0);
> +	ASSERT_EQ(ret, -1);
> +	ASSERT_EQ(errno, EBADF);
> +
> +	/*
> +	 * Using a valid fd that is not a pidfd (e.g. stdin) should fail
> +	 * with EBADF.
> +	 */
> +	ret = sys_process_madvise(STDIN_FILENO, &vec, 1, MADV_DONTNEED, 0);
> +	ASSERT_EQ(ret, -1);
> +	ASSERT_EQ(errno, EBADF);
> +
> +	/*
> +	 * Using a pidfd for a process that has already exited should fail
> +	 * with ESRCH.
> +	 */
> +	child_pid = fork();
> +	ASSERT_NE(child_pid, -1);
> +
> +	if (child_pid == 0)
> +		exit(0);
> +
> +	pidfd = syscall(__NR_pidfd_open, child_pid, 0);
> +	ASSERT_GE(pidfd, 0);
> +
> +	/* Wait for the child to ensure it has terminated. */
> +	waitpid(child_pid, NULL, 0);
> +
> +	ret = sys_process_madvise(pidfd, &vec, 1, MADV_DONTNEED, 0);
> +	ASSERT_EQ(ret, -1);
> +	ASSERT_EQ(errno, ESRCH);
> +	close(pidfd);
> +}
> +
> +/*
> + * Test process_madvise() with an invalid flag value. Now we only support flag=0
> + * future we will use it support sync so reserve this test.
> + */
> +TEST_F(process_madvise, flag)
> +{
> +	const unsigned long pagesize = (unsigned long)sysconf(_SC_PAGESIZE);
> +	unsigned int invalid_flag;
> +	struct iovec vec;
> +	char *map;
> +	ssize_t ret;
> +
> +	map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
> +		   0);
> +	if (map == MAP_FAILED)
> +		ksft_exit_skip("mmap failed, not enough memory.\n");
> +
> +	vec.iov_base = map;
> +	vec.iov_len = pagesize;
> +
> +	invalid_flag = 0x80000000;
> +
> +	ret = sys_process_madvise(PIDFD_SELF, &vec, 1, MADV_DONTNEED,
> +				  invalid_flag);
> +	ASSERT_EQ(ret, -1);
> +	ASSERT_EQ(errno, EINVAL);
> +
> +	/* Cleanup. */
> +	ASSERT_EQ(munmap(map, pagesize), 0);
> +}
> +
> +TEST_HARNESS_MAIN
> diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
> index dddd1dd8af14..84fb51902c3e 100755
> --- a/tools/testing/selftests/mm/run_vmtests.sh
> +++ b/tools/testing/selftests/mm/run_vmtests.sh
> @@ -65,6 +65,8 @@ separated by spaces:
>  	test pagemap_scan IOCTL
>  - pfnmap
>  	tests for VM_PFNMAP handling
> +- process_madv
> +	test process_madvise
>  - cow
>  	test copy-on-write semantics
>  - thp
> @@ -422,6 +424,9 @@ CATEGORY="hmm" run_test bash ./test_hmm.sh smoke
>  # MADV_GUARD_INSTALL and MADV_GUARD_REMOVE tests
>  CATEGORY="madv_guard" run_test ./guard-regions
>
> +# PROCESS_MADVISE TEST
> +CATEGORY="process_madv" run_test ./process_madv
> +
>  # MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
>  CATEGORY="madv_populate" run_test ./madv_populate
>
> diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c
> index 5492e3f784df..85b209260e5a 100644
> --- a/tools/testing/selftests/mm/vm_util.c
> +++ b/tools/testing/selftests/mm/vm_util.c
> @@ -20,6 +20,9 @@
>  unsigned int __page_size;
>  unsigned int __page_shift;
>
> +volatile sig_atomic_t signal_jump_set;
> +sigjmp_buf signal_jmp_buf;
> +
>  uint64_t pagemap_get_entry(int fd, char *start)
>  {
>  	const unsigned long pfn = (unsigned long)start / getpagesize();
> @@ -524,3 +527,35 @@ int read_sysfs(const char *file_path, unsigned long *val)
>
>  	return 0;
>  }
> +
> +static void handle_fatal(int c)
> +{
> +	if (!signal_jump_set)
> +		return;
> +
> +	siglongjmp(signal_jmp_buf, c);
> +}
> +
> +void setup_sighandler(void)
> +{
> +	struct sigaction act = {
> +		.sa_handler = &handle_fatal,
> +		.sa_flags = SA_NODEFER,
> +	};
> +
> +	sigemptyset(&act.sa_mask);
> +	if (sigaction(SIGSEGV, &act, NULL))
> +		ksft_exit_fail_perror("sigaction in setup");
> +}
> +
> +void teardown_sighandler(void)
> +{
> +	struct sigaction act = {
> +		.sa_handler = SIG_DFL,
> +		.sa_flags = SA_NODEFER,
> +	};
> +
> +	sigemptyset(&act.sa_mask);
> +	if (sigaction(SIGSEGV, &act, NULL))
> +		ksft_exit_fail_perror("sigaction in teardown");
> +}

As stated above, please do not abstract these.

> diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h
> index b8136d12a0f8..6bc4177a2807 100644
> --- a/tools/testing/selftests/mm/vm_util.h
> +++ b/tools/testing/selftests/mm/vm_util.h
> @@ -8,6 +8,8 @@
>  #include <unistd.h> /* _SC_PAGESIZE */
>  #include "../kselftest.h"
>  #include <linux/fs.h>
> +#include <setjmp.h>
> +#include <signal.h>
>
>  #define BIT_ULL(nr)                   (1ULL << (nr))
>  #define PM_SOFT_DIRTY                 BIT_ULL(55)
> @@ -61,6 +63,24 @@ static inline void skip_test_dodgy_fs(const char *op_name)
>  	ksft_test_result_skip("%s failed with ENOENT. Filesystem might be buggy (9pfs?)\n", op_name);
>  }
>
> +/*
> + * Ignore the checkpatch warning, as per the C99 standard, section 7.14.1.1:
> + *
> + * "If the signal occurs other than as the result of calling the abort or raise
> + *  function, the behavior is undefined if the signal handler refers to any
> + *  object with static storage duration other than by assigning a value to an
> + *  object declared as volatile sig_atomic_t"
> + */
> +extern volatile sig_atomic_t signal_jump_set;
> +extern sigjmp_buf signal_jmp_buf;
> +
> +/*
> + * Ignore the checkpatch warning, we must read from x but don't want to do
> + * anything with it in order to trigger a read page fault. We therefore must use
> + * volatile to stop the compiler from optimising this away.
> + */
> +#define FORCE_READ(x) (*(volatile typeof(x) *)x)
> +
>  uint64_t pagemap_get_entry(int fd, char *start);
>  bool pagemap_is_softdirty(int fd, char *start);
>  bool pagemap_is_swapped(int fd, char *start);
> @@ -90,6 +110,8 @@ bool find_vma_procmap(struct procmap_fd *procmap, void *address);
>  int close_procmap(struct procmap_fd *procmap);
>  int write_sysfs(const char *file_path, unsigned long val);
>  int read_sysfs(const char *file_path, unsigned long *val);
> +void setup_sighandler(void);
> +void teardown_sighandler(void);

And as stated previously, please un-abstract all these.

>
>  static inline int open_self_procmap(struct procmap_fd *procmap_out)
>  {
> --
> 2.43.0
>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4] selftests/mm: add process_madvise() tests
  2025-07-10 16:21   ` Zi Yan
@ 2025-07-11  8:05     ` Mark Brown
  0 siblings, 0 replies; 25+ messages in thread
From: Mark Brown @ 2025-07-11  8:05 UTC (permalink / raw)
  To: Zi Yan
  Cc: wang lian, akpm, lorenzo.stoakes, david, sj, linux-kernel,
	linux-mm, linux-kselftest, shuah, Liam.Howlett, brauner, gkwang,
	jannh, p1ucky0923, ryncsn, vbabka, zijing.zhang

[-- Attachment #1: Type: text/plain, Size: 862 bytes --]

On Thu, Jul 10, 2025 at 12:21:36PM -0400, Zi Yan wrote:
> On 10 Jul 2025, at 9:42, Mark Brown wrote:
> > On Thu, Jul 10, 2025 at 07:22:49PM +0800, wang lian wrote:

> >> +#include <linux/pidfd.h>
> >> +#include <linux/uio.h>

> > Does this work without 'make headers_install' for the systems that were
> > affectd by missing headers?  Lorenzo mentioned that we shouldn't depend
> > on that for the mm tests (I'm not enthusiastic about that approach
> > myself, but if it's what mm needs).

> No. “make headers_install” is still needed. I tried to get it compiled
> without it but failed. It seems that a lot of files will need to be
> copied to tools/include from “make headers”.

If you're doing that it should again be a separate patch.  Another
option if it's just a few defines or something is to copy just them into
your program.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4] selftests/mm: add process_madvise() tests
  2025-07-10 16:57 ` [PATCH v4] " Lorenzo Stoakes
@ 2025-07-11  8:11   ` Mark Brown
  2025-07-11  8:53     ` Lorenzo Stoakes
  2025-07-11 11:16   ` wang lian
  1 sibling, 1 reply; 25+ messages in thread
From: Mark Brown @ 2025-07-11  8:11 UTC (permalink / raw)
  To: Lorenzo Stoakes
  Cc: wang lian, akpm, ziy, david, sj, linux-kernel, linux-mm,
	linux-kselftest, shuah, Liam.Howlett, brauner, gkwang, jannh,
	p1ucky0923, ryncsn, vbabka, zijing.zhang

[-- Attachment #1: Type: text/plain, Size: 660 bytes --]

On Thu, Jul 10, 2025 at 05:57:23PM +0100, Lorenzo Stoakes wrote:
> On Thu, Jul 10, 2025 at 07:22:49PM +0800, wang lian wrote:

> > +#include <linux/pidfd.h>

> However, the pidfd tests already have a stub in so you can alternatively
> use:

> #include "../pidfd/pidfd.h"

> As is done in guard-regions.c.

One thing to watch out for with peering into the private header files of
other selftests is that it's a routine source of build and sometimes
runtime failures, people have a tendency to update one selftest without
thinking that other selftests might be peering at their code.  The cross
tree aspect can make it painful to deal with the resulting issues.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v3] selftests/mm: add process_madvise() tests
  2025-07-10 16:28         ` Zi Yan
@ 2025-07-11  8:34           ` Mark Brown
  2025-07-11  8:49             ` Lorenzo Stoakes
  0 siblings, 1 reply; 25+ messages in thread
From: Mark Brown @ 2025-07-11  8:34 UTC (permalink / raw)
  To: Zi Yan
  Cc: wang lian, Liam.Howlett, akpm, brauner, david, gkwang, jannh,
	linux-kernel, linux-kselftest, linux-mm, lorenzo.stoakes,
	p1ucky0923, ryncsn, shuah, sj, vbabka, zijing.zhang

[-- Attachment #1: Type: text/plain, Size: 1584 bytes --]

On Thu, Jul 10, 2025 at 12:28:13PM -0400, Zi Yan wrote:
> On 10 Jul 2025, at 4:42, Mark Brown wrote:
> > On Wed, Jul 09, 2025 at 10:46:07AM -0400, Zi Yan wrote:

> >> Right. My /usr/include/sys does not have pidfd.h. IMHO selftests
> >> should not rely on userspace headers, otherwise we cannot test
> >> latest kernel changes.

> > That's not realistic, we need to be able to use things like libc and for
> > many areas you'd just end up copying or reimplmenenting the userspace
> > libraries.  There's some concerns for sure, for example we used to have

> Sure. For libraries like libc, it is unrealistic to not rely on it.
> But for header files, are we expecting to install any kernel headers
> to the running system to get selftests compiled? If we are testing
> RC versions and header files might change before the actual release,
> that would pollute the system header files, right?

Right, for the kernel's headers there's two things - we use a
combination of tools/include and 'make headers_install' which populates
usr/include in the kernel tree (apparently mm rejects the latter but it
is widely used in the selftests, especially for architecture specifics).
These install locally and used before the system headers.

> > OTOH in a case like this where we can just refer directly to a kernel
> > header for some constants or structs then it does make sense to use the
> > kernel headers, or in other cases where we're testing things that are

> That is exactly my point above.

What was said was a bit stronger though, and might lead people down a
wheel reinvention path.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v3] selftests/mm: add process_madvise() tests
  2025-07-11  8:34           ` Mark Brown
@ 2025-07-11  8:49             ` Lorenzo Stoakes
  2025-07-11 12:09               ` wang lian
  0 siblings, 1 reply; 25+ messages in thread
From: Lorenzo Stoakes @ 2025-07-11  8:49 UTC (permalink / raw)
  To: Mark Brown
  Cc: Zi Yan, wang lian, Liam.Howlett, akpm, brauner, david, gkwang,
	jannh, linux-kernel, linux-kselftest, linux-mm, p1ucky0923,
	ryncsn, shuah, sj, vbabka, zijing.zhang

On Fri, Jul 11, 2025 at 09:34:38AM +0100, Mark Brown wrote:
> On Thu, Jul 10, 2025 at 12:28:13PM -0400, Zi Yan wrote:
> > On 10 Jul 2025, at 4:42, Mark Brown wrote:
> > > On Wed, Jul 09, 2025 at 10:46:07AM -0400, Zi Yan wrote:
>
> > >> Right. My /usr/include/sys does not have pidfd.h. IMHO selftests
> > >> should not rely on userspace headers, otherwise we cannot test
> > >> latest kernel changes.
>
> > > That's not realistic, we need to be able to use things like libc and for
> > > many areas you'd just end up copying or reimplmenenting the userspace
> > > libraries.  There's some concerns for sure, for example we used to have
>
> > Sure. For libraries like libc, it is unrealistic to not rely on it.
> > But for header files, are we expecting to install any kernel headers
> > to the running system to get selftests compiled? If we are testing
> > RC versions and header files might change before the actual release,
> > that would pollute the system header files, right?
>
> Right, for the kernel's headers there's two things - we use a
> combination of tools/include and 'make headers_install' which populates
> usr/include in the kernel tree (apparently mm rejects the latter but it
> is widely used in the selftests, especially for architecture specifics).
> These install locally and used before the system headers.
>
> > > OTOH in a case like this where we can just refer directly to a kernel
> > > header for some constants or structs then it does make sense to use the
> > > kernel headers, or in other cases where we're testing things that are
>
> > That is exactly my point above.
>
> What was said was a bit stronger though, and might lead people down a
> wheel reinvention path.

Let's PLEASE not rehash all this again...

This patch literally just needs PIDFD_SELF, I've provided a couple of ways
of doing that without introducing this requirement.

We already have a test that uses this with no problems ever reported on
which this patch was based.

Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4] selftests/mm: add process_madvise() tests
  2025-07-11  8:11   ` Mark Brown
@ 2025-07-11  8:53     ` Lorenzo Stoakes
  2025-07-11  9:29       ` Mark Brown
  0 siblings, 1 reply; 25+ messages in thread
From: Lorenzo Stoakes @ 2025-07-11  8:53 UTC (permalink / raw)
  To: Mark Brown
  Cc: wang lian, akpm, ziy, david, sj, linux-kernel, linux-mm,
	linux-kselftest, shuah, Liam.Howlett, brauner, gkwang, jannh,
	p1ucky0923, ryncsn, vbabka, zijing.zhang

On Fri, Jul 11, 2025 at 09:11:47AM +0100, Mark Brown wrote:
> On Thu, Jul 10, 2025 at 05:57:23PM +0100, Lorenzo Stoakes wrote:
> > On Thu, Jul 10, 2025 at 07:22:49PM +0800, wang lian wrote:
>
> > > +#include <linux/pidfd.h>
>
> > However, the pidfd tests already have a stub in so you can alternatively
> > use:
>
> > #include "../pidfd/pidfd.h"
>
> > As is done in guard-regions.c.
>
> One thing to watch out for with peering into the private header files of
> other selftests is that it's a routine source of build and sometimes
> runtime failures, people have a tendency to update one selftest without
> thinking that other selftests might be peering at their code.  The cross
> tree aspect can make it painful to deal with the resulting issues.

I take it from the lack of reported issues this hasn't happened in reality.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4] selftests/mm: add process_madvise() tests
  2025-07-11  8:53     ` Lorenzo Stoakes
@ 2025-07-11  9:29       ` Mark Brown
  0 siblings, 0 replies; 25+ messages in thread
From: Mark Brown @ 2025-07-11  9:29 UTC (permalink / raw)
  To: Lorenzo Stoakes
  Cc: wang lian, akpm, ziy, david, sj, linux-kernel, linux-mm,
	linux-kselftest, shuah, Liam.Howlett, brauner, gkwang, jannh,
	p1ucky0923, ryncsn, vbabka, zijing.zhang

[-- Attachment #1: Type: text/plain, Size: 996 bytes --]

On Fri, Jul 11, 2025 at 09:53:10AM +0100, Lorenzo Stoakes wrote:
> On Fri, Jul 11, 2025 at 09:11:47AM +0100, Mark Brown wrote:

> > One thing to watch out for with peering into the private header files of
> > other selftests is that it's a routine source of build and sometimes
> > runtime failures, people have a tendency to update one selftest without
> > thinking that other selftests might be peering at their code.  The cross
> > tree aspect can make it painful to deal with the resulting issues.

> I take it from the lack of reported issues this hasn't happened in reality.

That's a general comment about this pattern over the selftests as a
whole rather than this specific header - if this one has been working
well then great (I certainly didn't run into it myself).  In general I'd
say this pattern is up there as one of the most common individual
sources of build breaks in the selftests, and often has a relatively
high level of friction getting things fixed compared to the others.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4] selftests/mm: add process_madvise() tests
  2025-07-10 16:57 ` [PATCH v4] " Lorenzo Stoakes
  2025-07-11  8:11   ` Mark Brown
@ 2025-07-11 11:16   ` wang lian
  2025-07-11 11:33     ` Lorenzo Stoakes
  1 sibling, 1 reply; 25+ messages in thread
From: wang lian @ 2025-07-11 11:16 UTC (permalink / raw)
  To: lorenzo.stoakes
  Cc: Liam.Howlett, akpm, brauner, broonie, david, gkwang, jannh,
	lianux.mm, linux-kernel, linux-kselftest, linux-mm, p1ucky0923,
	ryncsn, shuah, sj, vbabka, zijing.zhang, ziy

Hi Lorenzo Stoakes,

>> + *
>> + * This test deterministically validates process_madvise() with MADV_COLLAPSE
>> + * on a remote process, other advices are difficult to verify reliably.
>> + *
>> + * The test verifies that a memory region in a child process, initially
>> + * backed by small pages, can be collapsed into a Transparent Huge Page by a
>> + * request from the parent. The result is verified by parsing the child's
>> + * /proc/<pid>/smaps file.
>> + */

> This is clever and you've put a lot of effort in, but this just seems
> absolutely prone to flaking and you're essentially testing something that's
> highly automated.

> I think you're also going way outside of the realms of testing
> process_madvise() and are getting into testing essentially MADV_COLLAPSE
> here.

> We have to try to keep the test specific to what it is you're testing -
> which is process_madvise() itself.

> So for me, and I realise you've put a ton of work into this and I'm really
> sorry to say it, I think you should drop this specific test.

> For me simply testing the remote MADV_DONTNEED is enough.

My motivation for this complex test came from the need to verify that
the process_madvise operation was actually successful. Without checking
the outcome, the test would only validate that the syscall returns the
correct number of bytes, not that the advice truly took effect on the
target process's memory.

For remote calls, process_madvise is intentionally limited to
non-destructive advice: MADV_COLD, MADV_PAGEOUT, MADV_WILLNEED,
and MADV_COLLAPSE. However, verifying the effects of COLD, PAGEOUT,
and WILLNEED is very difficult to do reliably in a selftest. This left
MADV_COLLAPSE as what seemed to be the only verifiable option.

But, as you correctly pointed out, MADV_COLLAPSE is too dependent on
the system's THP state and prone to races with khugepaged. This is the
very issue I tried to work around in v4 after the v3 test failures. 
So I think this test is necessary. 
As for your other opinions, I completely agree.



Best regards,
Wang Lian

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4] selftests/mm: add process_madvise() tests
  2025-07-11 11:16   ` wang lian
@ 2025-07-11 11:33     ` Lorenzo Stoakes
  2025-07-11 12:02       ` wang lian
  0 siblings, 1 reply; 25+ messages in thread
From: Lorenzo Stoakes @ 2025-07-11 11:33 UTC (permalink / raw)
  To: wang lian
  Cc: Liam.Howlett, akpm, brauner, broonie, david, gkwang, jannh,
	linux-kernel, linux-kselftest, linux-mm, p1ucky0923, ryncsn,
	shuah, sj, vbabka, zijing.zhang, ziy

On Fri, Jul 11, 2025 at 07:16:00PM +0800, wang lian wrote:
> Hi Lorenzo Stoakes,
>
> >> + *
> >> + * This test deterministically validates process_madvise() with MADV_COLLAPSE
> >> + * on a remote process, other advices are difficult to verify reliably.
> >> + *
> >> + * The test verifies that a memory region in a child process, initially
> >> + * backed by small pages, can be collapsed into a Transparent Huge Page by a
> >> + * request from the parent. The result is verified by parsing the child's
> >> + * /proc/<pid>/smaps file.
> >> + */
>
> > This is clever and you've put a lot of effort in, but this just seems
> > absolutely prone to flaking and you're essentially testing something that's
> > highly automated.
>
> > I think you're also going way outside of the realms of testing
> > process_madvise() and are getting into testing essentially MADV_COLLAPSE
> > here.
>
> > We have to try to keep the test specific to what it is you're testing -
> > which is process_madvise() itself.
>
> > So for me, and I realise you've put a ton of work into this and I'm really
> > sorry to say it, I think you should drop this specific test.
>
> > For me simply testing the remote MADV_DONTNEED is enough.
>
> My motivation for this complex test came from the need to verify that
> the process_madvise operation was actually successful. Without checking
> the outcome, the test would only validate that the syscall returns the
> correct number of bytes, not that the advice truly took effect on the
> target process's memory.
>
> For remote calls, process_madvise is intentionally limited to
> non-destructive advice: MADV_COLD, MADV_PAGEOUT, MADV_WILLNEED,
> and MADV_COLLAPSE. However, verifying the effects of COLD, PAGEOUT,
> and WILLNEED is very difficult to do reliably in a selftest. This left
> MADV_COLLAPSE as what seemed to be the only verifiable option.
>
> But, as you correctly pointed out, MADV_COLLAPSE is too dependent on
> the system's THP state and prone to races with khugepaged. This is the
> very issue I tried to work around in v4 after the v3 test failures.
> So I think this test is necessary.
> As for your other opinions, I completely agree.

MADV_COLLAPSE is not a reliable test and we're going to end up with flakes. The
implementation as-is is unreliable, and I"m not sure there's any way to make it
not-unreliable.

This is especially true as we change THP behaviour over time. I don't want to
see failed test reports because of this.

I think it might be best to simply assert that the operation succesfully
completes without checking whether it actually executes the requested task - it
would render this functionality completely broken if it were not to actually do
what was requested.

>
>
>
> Best regards,
> Wang Lian

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v4] selftests/mm: add process_madvise() tests
  2025-07-11 11:33     ` Lorenzo Stoakes
@ 2025-07-11 12:02       ` wang lian
  0 siblings, 0 replies; 25+ messages in thread
From: wang lian @ 2025-07-11 12:02 UTC (permalink / raw)
  To: lorenzo.stoakes
  Cc: Liam.Howlett, akpm, brauner, broonie, david, gkwang, jannh,
	lianux.mm, linux-kernel, linux-kselftest, linux-mm, p1ucky0923,
	ryncsn, shuah, sj, vbabka, zijing.zhang, ziy

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset=yes, Size: 3324 bytes --]

Hi Lorenzo Stoakes,

> > Hi Lorenzo Stoakes,
> >
> > >> + *
> > >> + * This test deterministically validates process_madvise() with MADV_COLLAPSE
> > >> + * on a remote process, other advices are difficult to verify reliably.
> > >> + *
> > >> + * The test verifies that a memory region in a child process, initially
> > >> + * backed by small pages, can be collapsed into a Transparent Huge Page by a
> > >> + * request from the parent. The result is verified by parsing the child's
> > >> + * /proc/<pid>/smaps file.
> > >> + */
> >
> > > This is clever and you've put a lot of effort in, but this just seems
> > > absolutely prone to flaking and you're essentially testing something that's
> > > highly automated.
> >
> > > I think you're also going way outside of the realms of testing
> > > process_madvise() and are getting into testing essentially MADV_COLLAPSE
> > > here.
> >
> > > > We have to try to keep the test specific to what it is you're testing -
> > > which is process_madvise() itself.
> >
> > > So for me, and I realise you've put a ton of work into this and I'm really
> > > sorry to say it, I think you should drop this specific test.
> >
> > > For me simply testing the remote MADV_DONTNEED is enough.
> >
> > My motivation for this complex test came from the need to verify that
> > the process_madvise operation was actually successful. Without checking
> > the outcome, the test would only validate that the syscall returns the
> > correct number of bytes, not that the advice truly took effect on the
> > target process's memory.
> >
> > For remote calls, process_madvise is intentionally limited to
> > non-destructive advice: MADV_COLD, MADV_PAGEOUT, MADV_WILLNEED,
> > and MADV_COLLAPSE. However, verifying the effects of COLD, PAGEOUT,
> > and WILLNEED is very difficult to do reliably in a selftest. This left
> > MADV_COLLAPSE as what seemed to be the only verifiable option.
> >
> > But, as you correctly pointed out, MADV_COLLAPSE is too dependent on
> > the system's THP state and prone to races with khugepaged. This is the
> > very issue I tried to work around in v4 after the v3 test failures.
> > So I think this test is necessary.
> > As for your other opinions, I completely agree.

> MADV_COLLAPSE is not a reliable test and we're going to end up with flakes. The
> implementation as-is is unreliable, and I"m not sure there's any way to make it
> not-unreliable.

> This is especially true as we change THP behaviour over time. I don't want to
> see failed test reports because of this.

> I think it might be best to simply assert that the operation succesfully
> completes without checking whether it actually executes the requested task - it
> would render this functionality completely broken if it were not to actually do
> what was requested.

> >
> >
> >
> > Best regards,
> > Wang Lian

Thank you for the clarification. You've convinced me.

Your suggestion provides a much cleaner path forward. It allows the test
to focus on the process_madvise syscall's interface—asserting the
successful return—without the flakiness of verifying side-effects that
are difficult to observe reliably. This makes the test much more robust.

I will update the patch to implement this clear assertion logic. Thank
you for guiding me to this better solution.


Best regards,
Wang Lian

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v3] selftests/mm: add process_madvise() tests
  2025-07-11  8:49             ` Lorenzo Stoakes
@ 2025-07-11 12:09               ` wang lian
  0 siblings, 0 replies; 25+ messages in thread
From: wang lian @ 2025-07-11 12:09 UTC (permalink / raw)
  To: lorenzo.stoakes
  Cc: Liam.Howlett, akpm, brauner, broonie, david, gkwang, jannh,
	lianux.mm, linux-kernel, linux-kselftest, linux-mm, p1ucky0923,
	ryncsn, shuah, sj, vbabka, zijing.zhang, ziy

Hi Lorenzo Stoakes,

> On Fri, Jul 11, 2025 at 09:34:38AM +0100, Mark Brown wrote:
> > On Thu, Jul 10, 2025 at 12:28:13PM -0400, Zi Yan wrote:
> > > On 10 Jul 2025, at 4:42, Mark Brown wrote:
> > > > On Wed, Jul 09, 2025 at 10:46:07AM -0400, Zi Yan wrote:
> > > >
> > > > > Right. My /usr/include/sys does not have pidfd.h. IMHO selftests
> > > > > should not rely on userspace headers, otherwise we cannot test
> > > > > latest kernel changes.
> > > >
> > > > That's not realistic, we need to be able to use things like libc and for
> > > > many areas you'd just end up copying or reimplmenenting the userspace
> > > > libraries.  There's some concerns for sure, for example we used to have
> > >
> > > Sure. For libraries like libc, it is unrealistic to not rely on it.
> > > But for header files, are we expecting to install any kernel headers
> > > to the running system to get selftests compiled? If we are testing
> > > RC versions and header files might change before the actual release,
> > > that would pollute the system header files, right?
> >
> > Right, for the kernel's headers there's two things - we use a
> > combination of tools/include and 'make headers_install' which populates
> > usr/include in the kernel tree (apparently mm rejects the latter but it
> > is widely used in the selftests, especially for architecture specifics).
> > These install locally and used before the system headers.
> >
> > > > OTOH in a case like this where we can just refer directly to a kernel
> > > > header for some constants or structs then it does make sense to use the
> > > > kernel headers, or in other cases where we're testing things that are
> >
> > > That is exactly my point above.
> >
> > What was said was a bit stronger though, and might lead people down a
> > wheel reinvention path.
>
> Let's PLEASE not rehash all this again...
>
> This patch literally just needs PIDFD_SELF, I've provided a couple of ways
> of doing that without introducing this requirement.
>
> We already have a test that uses this with no problems ever reported on
> which this patch was based.
>
> Thanks.

You are absolutely right, and my apologies for introducing this
unnecessary header dependency.

Just to clarify, the build failure Zi Yan reported on arm64 was not
caused by PIDFD_SELF. It was from my use of the pidfd_open() glibc
wrapper in the test, which incorrectly pulled in the system's
<sys/pidfd.h>.

Based on our discussion and a review of other tests, I understand the
correct approach is to avoid such dependencies. I will fix this properly
in the next version by using a direct syscall wrapper for pidfd_open.
Thank you for the guidance.

Best regards,
Wang Lian

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v3] selftests/mm: add process_madvise() tests
  2025-07-10 13:42 ` Mark Brown
  2025-07-10 16:21   ` Zi Yan
@ 2025-07-11 12:19   ` wang lian
  1 sibling, 0 replies; 25+ messages in thread
From: wang lian @ 2025-07-11 12:19 UTC (permalink / raw)
  To: broonie
  Cc: Liam.Howlett, akpm, brauner, david, gkwang, jannh, lianux.mm,
	linux-kernel, linux-kselftest, linux-mm, lorenzo.stoakes,
	p1ucky0923, ryncsn, shuah, sj, vbabka, zijing.zhang, ziy

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset=yes, Size: 3754 bytes --]

Hi Mark Brown,

> On Thu, Jul 10, 2025 at 07:22:49PM +0800, wang lian wrote:
>
> > Add tests for process_madvise(), focusing on verifying behavior under
> > various conditions including valid usage and error cases.
>
> > --- a/tools/testing/selftests/mm/guard-regions.c
> > +++ b/tools/testing/selftests/mm/guard-regions.c
>
> > -static void handle_fatal(int c)
> > -{
> > -	if (!signal_jump_set)
> > -		return;
> > -
> > -	siglongjmp(signal_jmp_buf, c);
> > -}

> I see from looking later in the patch that you're factoring this out of
> the guard regions test into vm_util.c so that it can be used by your new
> test.  This is good and sensible but it's a bit surprising, especially
> since your changelog only said you were adding a new test.  It would be
> better to split this out into a separate refactoring patch that just
> does the code motion, as covered in submitting-patches.rst it's better
> if changes just do one thing.

Thanks for the suggestion. I’ll split this out into a separate patch
that just moves the helper to vm_util.c, and follow up with the new
test in a second patch.

> > +#include <linux/pidfd.h>
> > +#include <linux/uio.h>
>
> Does this work without 'make headers_install' for the systems that were
> affectd by missing headers?  Lorenzo mentioned that we shouldn't depend
> on that for the mm tests (I'm not enthusiastic about that approach
> myself, but if it's what mm needs).

You're right, and I’ve seen build issues due to that as well. I’ll drop
<linux/pidfd.h> and define PIDFD_SELF locally to avoid requiring
installed headers.

> > +	ret = read(pipe_info[0], &info, sizeof(info));
> > +	if (ret <= 0) {
> > +		waitpid(self->child_pid, NULL, 0);
> > +		ksft_exit_skip("Failed to read child info from pipe.\n");
> > +	}

> If you're using the harness you should use SKIP() rather than the ksft
> APIs for reporting test results.  Don't mix and match the result
> reporting APIs, harness will call the ksft_ APIs appropriately for you.

Understood. I’ll convert this and other cases to use SKIP() and ensure
the test consistently uses the test harness macros.

> > +			if (errno == EAGAIN) {
> > +				ksft_test_result_skip(
> > +					"THP is 'always', process_madvise returned EAGAIN due to an expected race with khugepaged.\n");
> > +			} else {
> > +				ksft_test_result_fail(
> > +					"process_madvise failed with unexpected errno %d in 'always' mode.\n",
> > +					errno);
> > +			}

> Similarly, to fail use an ASSERT or EXPECT.  Note also that when using
> the ksft_ API for reporting results each test should report a consistent
> test name as the string, if you want to report an error message print it
> separately to the test result.

I’ll revise this to use ASSERT/EXPECT, and separate error output from
test result strings, as you suggested.

> > + * Test process_madvise() with various invalid pidfds to ensure correct
> > + * error handling. This includes negative fds, non-pidfd fds, and pidfds for
> > + * processes that no longer exist.

> This sounds like it should be a series of small tests rather than a
> single omnibus test, that'd result in clearer error reporting from test
> frameworks since they will say which operation failed directly rather
> than having to look at the logs then match them to the source.

That makes sense. I’ll break this out into multiple smaller tests so
each case reports independently.

> > +	pidfd = syscall(__NR_pidfd_open, child_pid, 0);
> > +	ASSERT_GE(pidfd, 0);

> This is particularly the case given the use of ASSERTs, we'll not report
> any issues other than the first one we hit.

Thanks, I’ll switch to EXPECT_* where appropriate to allow multiple
checks per test case.

Thanks again for the detailed review!


Best regards,
Wang Lian

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2025-07-11 12:20 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-07-10 11:22 [PATCH v4] selftests/mm: add process_madvise() tests wang lian
2025-07-10 13:42 ` Mark Brown
2025-07-10 16:21   ` Zi Yan
2025-07-11  8:05     ` Mark Brown
2025-07-11 12:19   ` [PATCH v3] " wang lian
2025-07-10 16:57 ` [PATCH v4] " Lorenzo Stoakes
2025-07-11  8:11   ` Mark Brown
2025-07-11  8:53     ` Lorenzo Stoakes
2025-07-11  9:29       ` Mark Brown
2025-07-11 11:16   ` wang lian
2025-07-11 11:33     ` Lorenzo Stoakes
2025-07-11 12:02       ` wang lian
  -- strict thread matches above, loose matches on Subject: below --
2025-07-03  4:43 [PATCH v3] " wang lian
2025-07-05 19:32 ` Lorenzo Stoakes
2025-07-08 17:44 ` Zi Yan
2025-07-09 12:32   ` wang lian
2025-07-09 12:51     ` Lorenzo Stoakes
2025-07-10 11:29       ` wang lian
2025-07-09 14:46     ` Zi Yan
2025-07-10  8:42       ` Mark Brown
2025-07-10 16:28         ` Zi Yan
2025-07-11  8:34           ` Mark Brown
2025-07-11  8:49             ` Lorenzo Stoakes
2025-07-11 12:09               ` wang lian
2025-07-10 11:40       ` wang lian

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).