[PATCH v4] selftests/mm: add folio_split() and filemap_get

public inbox for linux-fsdevel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH v4] selftests/mm: add folio_split() and filemap_get_entry() race test
@ 2026-03-20 14:22 Zi Yan
  2026-03-20 18:00 ` Zi Yan
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Zi Yan @ 2026-03-20 14:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: David Hildenbrand, Lorenzo Stoakes, Zi Yan, Hugh Dickins,
	Baolin Wang, Liam R. Howlett, Nico Pache, Ryan Roberts, Dev Jain,
	Barry Song, Lance Yang, Matthew Wilcox, Bas van Dijk, Eero Kelly,
	Andrew Battat, Adam Bratschi-Kaye, linux-mm, linux-kernel,
	linux-fsdevel

The added folio_split_race_test is a modified C port of the race condition
test from [1]. The test creates shmem huge pages, where the main thread
punches holes in the shmem to cause folio_split() in the kernel and
a set of 16 threads reads the shmem to cause filemap_get_entry() in the
kernel. filemap_get_entry() reads the folio and xarray split by
folio_split() locklessly. The original test[2] is written in rust and uses
memfd (shmem backed). This C port uses shmem directly and uses a single
process.

Note: the initial rust to C conversion is done by Cursor.

Link: https://lore.kernel.org/all/CAKNNEtw5_kZomhkugedKMPOG-sxs5Q5OLumWJdiWXv+C9Yct0w@mail.gmail.com/ [1]
Link: https://github.com/dfinity/thp-madv-remove-test [2]
Signed-off-by: Bas van Dijk <bas@dfinity.org>
Signed-off-by: Adam Bratschi-Kaye <adam.bratschikaye@dfinity.org>
Signed-off-by: Zi Yan <ziy@nvidia.com>
---
From V3:
1. fixed for loop stepping issue
2. used PRIu64 instead of %zu for uint64_t.

From V2:
1. simplified the program by removing fork.

From V1:
1. added prctl(PR_SET_PDEATHSIG, SIGTERM) to avoid child looping
   forever.
2. removed page_idx % PUNCH_INTERVAL >= 0, since it is a nop. Added a
   comment.
3. added a child process status check to prevent parent looping forever
   and record that as a failure.
4. used ksft_exit_skip() instead of ksft_finished() when the program is
   not running as root.
5. restored THP settings properly when the program exits abnormally.
 tools/testing/selftests/mm/Makefile           |   1 +
 .../selftests/mm/folio_split_race_test.c      | 293 ++++++++++++++++++
 tools/testing/selftests/mm/run_vmtests.sh     |   2 +
 3 files changed, 296 insertions(+)
 create mode 100644 tools/testing/selftests/mm/folio_split_race_test.c

diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 7a5de4e9bf520..cd24596cdd27e 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -105,6 +105,7 @@ TEST_GEN_FILES += droppable
 TEST_GEN_FILES += guard-regions
 TEST_GEN_FILES += merge
 TEST_GEN_FILES += rmap
+TEST_GEN_FILES += folio_split_race_test
 
 ifneq ($(ARCH),arm64)
 TEST_GEN_FILES += soft-dirty
diff --git a/tools/testing/selftests/mm/folio_split_race_test.c b/tools/testing/selftests/mm/folio_split_race_test.c
new file mode 100644
index 0000000000000..c264cc625a7cb
--- /dev/null
+++ b/tools/testing/selftests/mm/folio_split_race_test.c
@@ -0,0 +1,293 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * The test creates shmem PMD huge pages, fills all pages with known patterns,
+ * then continuously verifies non-punched pages with 16 threads. Meanwhile, the
+ * main thread punches holes via MADV_REMOVE on the shmem.
+ *
+ * It tests the race condition between folio_split() and filemap_get_entry(),
+ * where the hole punches on shmem lead to folio_split() and reading the shmem
+ * lead to filemap_get_entry().
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <inttypes.h>
+#include <linux/mman.h>
+#include <pthread.h>
+#include <stdatomic.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <signal.h>
+#include <unistd.h>
+#include "vm_util.h"
+#include "kselftest.h"
+#include "thp_settings.h"
+
+uint64_t page_size;
+uint64_t pmd_pagesize;
+#define NR_PMD_PAGE 5
+#define FILE_SIZE (pmd_pagesize * NR_PMD_PAGE)
+#define TOTAL_PAGES (FILE_SIZE / page_size)
+
+/* Every N-th to N+M-th pages are punched; not aligned with huge page boundaries. */
+#define PUNCH_INTERVAL 50 /* N */
+#define PUNCH_SIZE_FACTOR 3 /* M */
+
+#define NUM_READER_THREADS 16
+#define FILL_BYTE 0xAF
+#define NUM_ITERATIONS 100
+
+/* Shared control block: control reading threads and record stats */
+struct shared_ctl {
+	atomic_uint_fast32_t stop;
+	atomic_size_t reader_failures;
+	atomic_size_t reader_verified;
+};
+
+static void fill_page(unsigned char *base, size_t page_idx)
+{
+	unsigned char *page_ptr = base + page_idx * page_size;
+	uint64_t idx = (uint64_t)page_idx;
+
+	memset(page_ptr, FILL_BYTE, page_size);
+	memcpy(page_ptr, &idx, sizeof(idx));
+}
+
+/* Returns true if valid, false if corrupted. */
+static bool check_page(unsigned char *base, size_t page_idx)
+{
+	unsigned char *page_ptr = base + page_idx * page_size;
+	uint64_t expected_idx = (uint64_t)page_idx;
+	uint64_t got_idx;
+
+	memcpy(&got_idx, page_ptr, 8);
+
+	if (got_idx != expected_idx) {
+		size_t off;
+		int all_zero = 1;
+
+		for (off = 0; off < page_size; off++) {
+			if (page_ptr[off] != 0) {
+				all_zero = 0;
+				break;
+			}
+		}
+		if (all_zero) {
+			ksft_print_msg(
+				"CORRUPTED: page %zu (huge page %zu) is ALL ZEROS\n",
+				page_idx,
+				(page_idx * page_size) / pmd_pagesize);
+		} else {
+			ksft_print_msg(
+				"CORRUPTED: page %zu (huge page %zu): expected idx %zu, got %lu\n",
+				page_idx, (page_idx * page_size) / pmd_pagesize,
+				page_idx, (unsigned long)got_idx);
+		}
+		return false;
+	}
+	return true;
+}
+
+struct reader_arg {
+	unsigned char *base;
+	struct shared_ctl *ctl;
+	int tid;
+	atomic_size_t *failures;
+	atomic_size_t *verified;
+};
+
+static void *reader_thread(void *arg)
+{
+	struct reader_arg *ra = (struct reader_arg *)arg;
+	unsigned char *base = ra->base;
+	struct shared_ctl *ctl = ra->ctl;
+	int tid = ra->tid;
+	atomic_size_t *failures = ra->failures;
+	atomic_size_t *verified = ra->verified;
+	size_t page_idx;
+
+	while (atomic_load_explicit(&ctl->stop, memory_order_acquire) == 0) {
+		for (page_idx = (size_t)tid; page_idx < TOTAL_PAGES;
+		     page_idx += NUM_READER_THREADS) {
+			/*
+			 * page_idx % PUNCH_INTERVAL is in [0, PUNCH_INTERVAL),
+			 * skip [0, PUNCH_SIZE_FACTOR)
+			 */
+			if (page_idx % PUNCH_INTERVAL < PUNCH_SIZE_FACTOR)
+				continue;
+			if (check_page(base, page_idx))
+				atomic_fetch_add_explicit(verified, 1,
+							  memory_order_relaxed);
+			else
+				atomic_fetch_add_explicit(failures, 1,
+							  memory_order_relaxed);
+		}
+		if (atomic_load_explicit(failures, memory_order_relaxed) > 0)
+			break;
+	}
+
+	return NULL;
+}
+
+static void create_readers(pthread_t *threads, struct reader_arg *args,
+			   unsigned char *base, struct shared_ctl *ctl)
+{
+	int i;
+
+	for (i = 0; i < NUM_READER_THREADS; i++) {
+		args[i].base = base;
+		args[i].ctl = ctl;
+		args[i].tid = i;
+		args[i].failures = &ctl->reader_failures;
+		args[i].verified = &ctl->reader_verified;
+		if (pthread_create(&threads[i], NULL, reader_thread,
+				   &args[i]) != 0)
+			ksft_exit_fail_msg("pthread_create failed\n");
+	}
+}
+
+/* Run a single iteration. Returns total number of corrupted pages. */
+static size_t run_iteration(void)
+{
+	size_t reader_failures, reader_verified;
+	struct reader_arg args[NUM_READER_THREADS];
+	pthread_t threads[NUM_READER_THREADS];
+	unsigned char *mmap_base;
+	struct shared_ctl ctl;
+	size_t i;
+
+	memset(&ctl, 0, sizeof(struct shared_ctl));
+
+	mmap_base = mmap(NULL, FILE_SIZE, PROT_READ | PROT_WRITE,
+			 MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+
+	if (mmap_base == MAP_FAILED)
+		ksft_exit_fail_msg("mmap failed: %d\n", errno);
+
+	if (madvise(mmap_base, FILE_SIZE, MADV_HUGEPAGE) != 0)
+		ksft_exit_fail_msg("madvise(MADV_HUGEPAGE) failed: %d\n",
+				   errno);
+
+	for (i = 0; i < TOTAL_PAGES; i++)
+		fill_page(mmap_base, i);
+
+	if (!check_huge_shmem(mmap_base, NR_PMD_PAGE, pmd_pagesize))
+		ksft_exit_fail_msg("No shmem THP is allocated\n");
+
+	create_readers(threads, args, mmap_base, &ctl);
+
+	for (i = 0; i < TOTAL_PAGES; i++) {
+		if (i % PUNCH_INTERVAL != 0)
+			continue;
+		if (madvise(mmap_base + i * page_size,
+			    PUNCH_SIZE_FACTOR * page_size, MADV_REMOVE) != 0) {
+			ksft_exit_fail_msg(
+				"madvise(MADV_REMOVE) failed on page %zu: %d\n",
+				i, errno);
+		}
+
+		i += PUNCH_SIZE_FACTOR - 1;
+	}
+
+	atomic_store_explicit(&ctl.stop, 1, memory_order_release);
+
+	for (i = 0; i < NUM_READER_THREADS; i++)
+		pthread_join(threads[i], NULL);
+
+	reader_failures = atomic_load_explicit(&ctl.reader_failures,
+					       memory_order_acquire);
+	reader_verified = atomic_load_explicit(&ctl.reader_verified,
+					       memory_order_acquire);
+	if (reader_failures)
+		ksft_print_msg("Child: %zu pages verified, %zu failures\n",
+			       reader_verified, reader_failures);
+
+	munmap(mmap_base, FILE_SIZE);
+
+	return reader_failures;
+}
+
+static void thp_cleanup_handler(int signum)
+{
+	thp_restore_settings();
+	/*
+	 * Restore default handler and re-raise the signal to exit.
+	 * This is to ensure the test process exits with the correct
+	 * status code corresponding to the signal.
+	 */
+	signal(signum, SIG_DFL);
+	raise(signum);
+}
+
+static void thp_settings_cleanup(void)
+{
+	thp_restore_settings();
+}
+
+int main(void)
+{
+	struct thp_settings current_settings;
+	bool failed = false;
+	size_t failures;
+	size_t iter;
+
+	ksft_print_header();
+
+	if (!thp_is_enabled())
+		ksft_exit_skip("Transparent Hugepages not available\n");
+
+	if (geteuid() != 0)
+		ksft_exit_skip("Please run the test as root\n");
+
+	thp_save_settings();
+	/* make sure thp settings are restored */
+	if (atexit(thp_settings_cleanup) != 0)
+		ksft_exit_fail_msg("atexit failed\n");
+
+	signal(SIGINT, thp_cleanup_handler);
+	signal(SIGTERM, thp_cleanup_handler);
+
+	thp_read_settings(&current_settings);
+	current_settings.shmem_enabled = SHMEM_ADVISE;
+	thp_write_settings(&current_settings);
+
+	ksft_set_plan(1);
+
+	page_size = getpagesize();
+	pmd_pagesize = read_pmd_pagesize();
+
+	ksft_print_msg("folio split race test\n");
+	ksft_print_msg("===================================================\n");
+	ksft_print_msg("Shmem size:       %" PRIu64 " MiB\n", FILE_SIZE / 1024 / 1024);
+	ksft_print_msg("Total pages:     %" PRIu64 "\n", TOTAL_PAGES);
+	ksft_print_msg("Child readers:   %d\n", NUM_READER_THREADS);
+	ksft_print_msg("Punching every %dth to %dth page\n", PUNCH_INTERVAL,
+		       PUNCH_INTERVAL + PUNCH_SIZE_FACTOR);
+	ksft_print_msg("Iterations:      %d\n", NUM_ITERATIONS);
+
+	for (iter = 1; iter <= NUM_ITERATIONS; iter++) {
+		failures = run_iteration();
+		if (failures > 0) {
+			failed = true;
+			ksft_print_msg(
+				"FAILED on iteration %zu: %zu pages corrupted by MADV_REMOVE!\n",
+				iter, failures);
+			break;
+		}
+	}
+
+	if (failed) {
+		ksft_test_result_fail("Test failed\n");
+		ksft_exit_fail();
+	} else {
+		ksft_test_result_pass("All %d iterations passed\n",
+				      NUM_ITERATIONS);
+		ksft_exit_pass();
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index 606558cc3b098..530980fdf3227 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -553,6 +553,8 @@ if [ -n "${MOUNTED_XFS}" ]; then
     rm -f ${XFS_IMG}
 fi
 
+CATEGORY="thp" run_test ./folio_split_race_test
+
 CATEGORY="migration" run_test ./migration
 
 CATEGORY="mkdirty" run_test ./mkdirty
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v4] selftests/mm: add folio_split() and filemap_get_entry() race test
  2026-03-20 14:22 [PATCH v4] selftests/mm: add folio_split() and filemap_get_entry() race test Zi Yan
@ 2026-03-20 18:00 ` Zi Yan
  2026-03-23  1:12 ` Zi Yan
  2026-03-23 12:48 ` David Hildenbrand (Arm)
  2 siblings, 0 replies; 5+ messages in thread
From: Zi Yan @ 2026-03-20 18:00 UTC (permalink / raw)
  To: Andrew Morton
  Cc: David Hildenbrand, Lorenzo Stoakes (Oracle), Zi Yan, Hugh Dickins,
	Baolin Wang, Liam R. Howlett, Nico Pache, Ryan Roberts, Dev Jain,
	Barry Song, Lance Yang, Matthew Wilcox, Bas van Dijk, Eero Kelly,
	Andrew Battat, Adam Bratschi-Kaye, linux-mm, linux-kernel,
	linux-fsdevel

On 20 Mar 2026, at 10:22, Zi Yan wrote:

> The added folio_split_race_test is a modified C port of the race condition
> test from [1]. The test creates shmem huge pages, where the main thread
> punches holes in the shmem to cause folio_split() in the kernel and
> a set of 16 threads reads the shmem to cause filemap_get_entry() in the
> kernel. filemap_get_entry() reads the folio and xarray split by
> folio_split() locklessly. The original test[2] is written in rust and uses
> memfd (shmem backed). This C port uses shmem directly and use a single
> process.
>
> Note: the initial rust to C conversion is done by Cursor.
>
> Link: https://lore.kernel.org/all/CAKNNEtw5_kZomhkugedKMPOG-sxs5Q5OLumWJdiWXv+C9Yct0w@mail.gmail.com/ [1]
> Link: https://github.com/dfinity/thp-madv-remove-test [2]
> Signed-off-by: Bas van Dijk <bas@dfinity.org>
> Signed-off-by: Adam Bratschi-Kaye <adam.bratschikaye@dfinity.org>
> Signed-off-by: Zi Yan <ziy@nvidia.com>
> ---
> From V3:
> 1. fixed for loop stepping issue
> 2. used PRIu64 instead of %zu for uint64_t.
>
> From V2:
> 1. simplied the program by removing fork.
>
> From V1:
> 1. added prctl(PR_SET_PDEATHSIG, SIGTERM) to avoid child looping
>    forever.
> 2. removed page_idx % PUNCH_INTERVAL >= 0, since it is a nop. Added a
>    comment.
> 3. added a child process status check to prevent parent looping forever
>    and record that as a failure.
> 4. used ksft_exit_skip() instead of ksft_finished() when the program is
>    not running as root.
> 5. restored THP settings properly when the program exits abnormally.
>  tools/testing/selftests/mm/Makefile           |   1 +
>  .../selftests/mm/folio_split_race_test.c      | 293 ++++++++++++++++++
>  tools/testing/selftests/mm/run_vmtests.sh     |   2 +
>  3 files changed, 296 insertions(+)
>  create mode 100644 tools/testing/selftests/mm/folio_split_race_test.c
>

Hi Andrew,

The fixup below adds the folio_split_race_test binary to .gitignore.
Thanks to Lorenzo for pointing this out.

From bd164a7090c0e8b6e3013502c64c161214e38714 Mon Sep 17 00:00:00 2001
From: Zi Yan <ziy@nvidia.com>
Date: Fri, 20 Mar 2026 13:56:04 -0400
Subject: [PATCH] git: add generated binary into gitignore

Signed-off-by: Zi Yan <ziy@nvidia.com>
---
 tools/testing/selftests/mm/.gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index 83ad9454dd9d1..b0c30c5ee9e30 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -61,3 +61,4 @@ guard-regions
 merge
 prctl_thp_disable
 rmap
+folio_split_race_test
-- 
2.51.0



Best Regards,
Yan, Zi

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v4] selftests/mm: add folio_split() and filemap_get_entry() race test
  2026-03-20 14:22 [PATCH v4] selftests/mm: add folio_split() and filemap_get_entry() race test Zi Yan
  2026-03-20 18:00 ` Zi Yan
@ 2026-03-23  1:12 ` Zi Yan
  2026-03-23 12:48 ` David Hildenbrand (Arm)
  2 siblings, 0 replies; 5+ messages in thread
From: Zi Yan @ 2026-03-23  1:12 UTC (permalink / raw)
  To: Andrew Morton
  Cc: David Hildenbrand, Lorenzo Stoakes, Zi Yan, Hugh Dickins,
	Baolin Wang, Liam R. Howlett, Nico Pache, Ryan Roberts, Dev Jain,
	Barry Song, Lance Yang, Matthew Wilcox, Bas van Dijk, Eero Kelly,
	Andrew Battat, Adam Bratschi-Kaye, linux-mm, linux-kernel,
	linux-fsdevel

On 20 Mar 2026, at 10:22, Zi Yan wrote:

> The added folio_split_race_test is a modified C port of the race condition
> test from [1]. The test creates shmem huge pages, where the main thread
> punches holes in the shmem to cause folio_split() in the kernel and
> a set of 16 threads reads the shmem to cause filemap_get_entry() in the
> kernel. filemap_get_entry() reads the folio and xarray split by
> folio_split() locklessly. The original test[2] is written in rust and uses
> memfd (shmem backed). This C port uses shmem directly and use a single
> process.
>
> Note: the initial rust to C conversion is done by Cursor.
>
> Link: https://lore.kernel.org/all/CAKNNEtw5_kZomhkugedKMPOG-sxs5Q5OLumWJdiWXv+C9Yct0w@mail.gmail.com/ [1]
> Link: https://github.com/dfinity/thp-madv-remove-test [2]
> Signed-off-by: Bas van Dijk <bas@dfinity.org>
> Signed-off-by: Adam Bratschi-Kaye <adam.bratschikaye@dfinity.org>
> Signed-off-by: Zi Yan <ziy@nvidia.com>
> ---
> From V3:
> 1. fixed for loop stepping issue
> 2. used PRIu64 instead of %zu for uint64_t.
>
> From V2:
> 1. simplied the program by removing fork.
>
> From V1:
> 1. added prctl(PR_SET_PDEATHSIG, SIGTERM) to avoid child looping
>    forever.
> 2. removed page_idx % PUNCH_INTERVAL >= 0, since it is a nop. Added a
>    comment.
> 3. added a child process status check to prevent parent looping forever
>    and record that as a failure.
> 4. used ksft_exit_skip() instead of ksft_finished() when the program is
>    not running as root.
> 5. restored THP settings properly when the program exits abnormally.
>  tools/testing/selftests/mm/Makefile           |   1 +
>  .../selftests/mm/folio_split_race_test.c      | 293 ++++++++++++++++++
>  tools/testing/selftests/mm/run_vmtests.sh     |   2 +
>  3 files changed, 296 insertions(+)
>  create mode 100644 tools/testing/selftests/mm/folio_split_race_test.c
>

Hi Andrew,

The fixup below addresses the new issues (first and third) raised by sashiko[1].

The second issue is that the test only verifies the first 8 bytes of each
page. That is sufficient, because the test is intended to catch the race
condition that causes a wrong page index to be used.

The fourth issue is addressed in Q3 from V3[2].


[1] https://sashiko.dev/#/patchset/20260320142219.375118-1-ziy%40nvidia.com
[2] https://lore.kernel.org/all/8B720FB8-DE4D-487A-9AEF-AC204E9F5755@nvidia.com/


From a66945de00f33c163cf814ac7c2d9620a725bfed Mon Sep 17 00:00:00 2001
From: Zi Yan <ziy@nvidia.com>
Date: Sun, 22 Mar 2026 19:53:54 -0400
Subject: [PATCH] selftests/mm: fix sashiko complaints about folio_split_race_test

1. used PRIu64 for uint64_t
2. added a pthread_barrier_t to ensure the main thread starts punching holes
   only after all reader threads have been spawned.

Signed-off-by: Zi Yan <ziy@nvidia.com>
---
 .../selftests/mm/folio_split_race_test.c        | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/mm/folio_split_race_test.c b/tools/testing/selftests/mm/folio_split_race_test.c
index c264cc625a7cb..ab6868e7e2efe 100644
--- a/tools/testing/selftests/mm/folio_split_race_test.c
+++ b/tools/testing/selftests/mm/folio_split_race_test.c
@@ -46,6 +46,7 @@ struct shared_ctl {
 	atomic_uint_fast32_t stop;
 	atomic_size_t reader_failures;
 	atomic_size_t reader_verified;
+	pthread_barrier_t barrier;
 };

 static void fill_page(unsigned char *base, size_t page_idx)
@@ -78,14 +79,14 @@ static bool check_page(unsigned char *base, size_t page_idx)
 		}
 		if (all_zero) {
 			ksft_print_msg(
-				"CORRUPTED: page %zu (huge page %zu) is ALL ZEROS\n",
+				"CORRUPTED: page %zu (huge page %" PRIu64 ") is ALL ZEROS\n",
 				page_idx,
 				(page_idx * page_size) / pmd_pagesize);
 		} else {
 			ksft_print_msg(
-				"CORRUPTED: page %zu (huge page %zu): expected idx %zu, got %lu\n",
+				"CORRUPTED: page %zu (huge page %" PRIu64 "): expected idx %zu, got %" PRIu64 "\n",
 				page_idx, (page_idx * page_size) / pmd_pagesize,
-				page_idx, (unsigned long)got_idx);
+				page_idx, got_idx);
 		}
 		return false;
 	}
@@ -110,6 +111,8 @@ static void *reader_thread(void *arg)
 	atomic_size_t *verified = ra->verified;
 	size_t page_idx;

+	pthread_barrier_wait(&ctl->barrier);
+
 	while (atomic_load_explicit(&ctl->stop, memory_order_acquire) == 0) {
 		for (page_idx = (size_t)tid; page_idx < TOTAL_PAGES;
 		     page_idx += NUM_READER_THREADS) {
@@ -178,8 +181,14 @@ static size_t run_iteration(void)
 	if (!check_huge_shmem(mmap_base, NR_PMD_PAGE, pmd_pagesize))
 		ksft_exit_fail_msg("No shmem THP is allocated\n");

+	if (pthread_barrier_init(&ctl.barrier, NULL, NUM_READER_THREADS + 1) != 0)
+		ksft_exit_fail_msg("pthread_barrier_init failed\n");
+
 	create_readers(threads, args, mmap_base, &ctl);

+	/* Wait for all reader threads to be ready before punching holes. */
+	pthread_barrier_wait(&ctl.barrier);
+
 	for (i = 0; i < TOTAL_PAGES; i++) {
 		if (i % PUNCH_INTERVAL != 0)
 			continue;
@@ -198,6 +207,8 @@ static size_t run_iteration(void)
 	for (i = 0; i < NUM_READER_THREADS; i++)
 		pthread_join(threads[i], NULL);

+	pthread_barrier_destroy(&ctl.barrier);
+
 	reader_failures = atomic_load_explicit(&ctl.reader_failures,
 					       memory_order_acquire);
 	reader_verified = atomic_load_explicit(&ctl.reader_verified,
-- 
2.53.0



--
Best Regards,
Yan, Zi

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v4] selftests/mm: add folio_split() and filemap_get_entry() race test
  2026-03-20 14:22 [PATCH v4] selftests/mm: add folio_split() and filemap_get_entry() race test Zi Yan
  2026-03-20 18:00 ` Zi Yan
  2026-03-23  1:12 ` Zi Yan
@ 2026-03-23 12:48 ` David Hildenbrand (Arm)
  2026-03-23 15:24   ` Zi Yan
  2 siblings, 1 reply; 5+ messages in thread
From: David Hildenbrand (Arm) @ 2026-03-23 12:48 UTC (permalink / raw)
  To: Zi Yan, Andrew Morton
  Cc: Lorenzo Stoakes, Hugh Dickins, Baolin Wang, Liam R. Howlett,
	Nico Pache, Ryan Roberts, Dev Jain, Barry Song, Lance Yang,
	Matthew Wilcox, Bas van Dijk, Eero Kelly, Andrew Battat,
	Adam Bratschi-Kaye, linux-mm, linux-kernel, linux-fsdevel

On 3/20/26 15:22, Zi Yan wrote:
> The added folio_split_race_test is a modified C port of the race condition
> test from [1]. The test creates shmem huge pages, where the main thread
> punches holes in the shmem to cause folio_split() in the kernel and
> a set of 16 threads reads the shmem to cause filemap_get_entry() in the
> kernel. filemap_get_entry() reads the folio and xarray split by
> folio_split() locklessly. The original test[2] is written in rust and uses
> memfd (shmem backed). This C port uses shmem directly and use a single
> process.
> 
> Note: the initial rust to C conversion is done by Cursor.
> 
> Link: https://lore.kernel.org/all/CAKNNEtw5_kZomhkugedKMPOG-sxs5Q5OLumWJdiWXv+C9Yct0w@mail.gmail.com/ [1]
> Link: https://github.com/dfinity/thp-madv-remove-test [2]
> Signed-off-by: Bas van Dijk <bas@dfinity.org>
> Signed-off-by: Adam Bratschi-Kaye <adam.bratschikaye@dfinity.org>

You are likely missing two Co-developed-by.

See Documentation/process/submitting-patches.rst on how to handle such
SOBs.

> Signed-off-by: Zi Yan <ziy@nvidia.com>
> ---
> From V3:
> 1. fixed for loop stepping issue
> 2. used PRIu64 instead of %zu for uint64_t.
> 
> From V2:
> 1. simplied the program by removing fork.
> 
> From V1:
> 1. added prctl(PR_SET_PDEATHSIG, SIGTERM) to avoid child looping
>    forever.
> 2. removed page_idx % PUNCH_INTERVAL >= 0, since it is a nop. Added a
>    comment.
> 3. added a child process status check to prevent parent looping forever
>    and record that as a failure.
> 4. used ksft_exit_skip() instead of ksft_finished() when the program is
>    not running as root.
> 5. restored THP settings properly when the program exits abnormally.
>  tools/testing/selftests/mm/Makefile           |   1 +
>  .../selftests/mm/folio_split_race_test.c      | 293 ++++++++++++++++++
>  tools/testing/selftests/mm/run_vmtests.sh     |   2 +
>  3 files changed, 296 insertions(+)
>  create mode 100644 tools/testing/selftests/mm/folio_split_race_test.c
> 
> diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
> index 7a5de4e9bf520..cd24596cdd27e 100644
> --- a/tools/testing/selftests/mm/Makefile
> +++ b/tools/testing/selftests/mm/Makefile
> @@ -105,6 +105,7 @@ TEST_GEN_FILES += droppable
>  TEST_GEN_FILES += guard-regions
>  TEST_GEN_FILES += merge
>  TEST_GEN_FILES += rmap
> +TEST_GEN_FILES += folio_split_race_test
>  
>  ifneq ($(ARCH),arm64)
>  TEST_GEN_FILES += soft-dirty
> diff --git a/tools/testing/selftests/mm/folio_split_race_test.c b/tools/testing/selftests/mm/folio_split_race_test.c
> new file mode 100644
> index 0000000000000..c264cc625a7cb
> --- /dev/null
> +++ b/tools/testing/selftests/mm/folio_split_race_test.c
> @@ -0,0 +1,293 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * The test creates shmem PMD huge pages, fills all pages with known patterns,
> + * then continuously verifies non-punched pages with 16 threads. Meanwhile, the
> + * main thread punches holes via MADV_REMOVE on the shmem.
> + *
> + * It tests the race condition between folio_split() and filemap_get_entry(),
> + * where the hole punches on shmem lead to folio_split() and reading the shmem
> + * lead to filemap_get_entry().
> + */
> +
> +#define _GNU_SOURCE
> +#include <errno.h>
> +#include <inttypes.h>
> +#include <linux/mman.h>
> +#include <pthread.h>
> +#include <stdatomic.h>
> +#include <stdbool.h>
> +#include <stdint.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <sys/mman.h>
> +#include <signal.h>
> +#include <unistd.h>
> +#include "vm_util.h"
> +#include "kselftest.h"
> +#include "thp_settings.h"


[...] some comments on the main() part :)

> +int main(void)
> +{
> +	struct thp_settings current_settings;
> +	bool failed = false;
> +	size_t failures;
> +	size_t iter;

Why are iterations a "size_t" ? Similarly for "failures". Just use int / unsigned long?

> +
> +	ksft_print_header();
> +
> +	if (!thp_is_enabled())
> +		ksft_exit_skip("Transparent Hugepages not available\n");

Is checking thp_is_enabled() the right thing to do when you perform your own setup below either way?

I think you should just use thp_available(). Then, configure THP accordingly below?

> +
> +	if (geteuid() != 0)
> +		ksft_exit_skip("Please run the test as root\n");
> +
> +	thp_save_settings();
> +	/* make sure thp settings are restored */
> +	if (atexit(thp_settings_cleanup) != 0)
> +		ksft_exit_fail_msg("atexit failed\n");
> +
> +	signal(SIGINT, thp_cleanup_handler);
> +	signal(SIGTERM, thp_cleanup_handler);
> +
> +	thp_read_settings(&current_settings);
> +	current_settings.shmem_enabled = SHMEM_ADVISE;
> +	thp_write_settings(&current_settings);
> +
> +	ksft_set_plan(1);
> +
> +	page_size = getpagesize();
> +	pmd_pagesize = read_pmd_pagesize();

I wonder whether we should check for 0 here and skip the test (older kernels?).

> +
> +	ksft_print_msg("folio split race test\n");
> +	ksft_print_msg("===================================================\n");
> +	ksft_print_msg("Shmem size:       %" PRIu64 " MiB\n", FILE_SIZE / 1024 / 1024);
> +	ksft_print_msg("Total pages:     %" PRIu64 "\n", TOTAL_PAGES);
> +	ksft_print_msg("Child readers:   %d\n", NUM_READER_THREADS);
> +	ksft_print_msg("Punching every %dth to %dth page\n", PUNCH_INTERVAL,
> +		       PUNCH_INTERVAL + PUNCH_SIZE_FACTOR);
> +	ksft_print_msg("Iterations:      %d\n", NUM_ITERATIONS);


I don't think printing static test information is that helpful.
Do we need all that at all?


> +
> +	for (iter = 1; iter <= NUM_ITERATIONS; iter++) {

Why not start at 0? You know, to confuse less people :)

	for (iter = 0; iter < NUM_ITERATIONS; iter++) {

> +		failures = run_iteration();

"corrupted_pages" ?

> +		if (failures > 0) {
> +			failed = true;

Do you really need that variable?

> +			ksft_print_msg(
> +				"FAILED on iteration %zu: %zu pages corrupted by MADV_REMOVE!\n",
> +				iter, failures);

Can that simply be printed below?

Like

if (iter < NUM_ITERATIONS) {
	ksft_test_result_fail("Test failed on iterations %zu: %zu pages ...\n",
			      iter + 1, corrupted_pages);
} else {
	ksft_test_result_pass ...
}

> +			break;
> +		}
> +	}
> +
> +	if (failed) {
> +		ksft_test_result_fail("Test failed\n");
> +		ksft_exit_fail();
> +	} else {
> +		ksft_test_result_pass("All %d iterations passed\n",
> +				      NUM_ITERATIONS);
> +		ksft_exit_pass();
> +	}
> +
> +	return 0;
-- 
Cheers,

David

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v4] selftests/mm: add folio_split() and filemap_get_entry() race test
  2026-03-23 12:48 ` David Hildenbrand (Arm)
@ 2026-03-23 15:24   ` Zi Yan
  0 siblings, 0 replies; 5+ messages in thread
From: Zi Yan @ 2026-03-23 15:24 UTC (permalink / raw)
  To: David Hildenbrand (Arm)
  Cc: Andrew Morton, Lorenzo Stoakes, Hugh Dickins, Baolin Wang,
	Liam R. Howlett, Nico Pache, Ryan Roberts, Dev Jain, Barry Song,
	Lance Yang, Matthew Wilcox, Bas van Dijk, Eero Kelly,
	Andrew Battat, Adam Bratschi-Kaye, linux-mm, linux-kernel,
	linux-fsdevel

On 23 Mar 2026, at 8:48, David Hildenbrand (Arm) wrote:

> On 3/20/26 15:22, Zi Yan wrote:
>> The added folio_split_race_test is a modified C port of the race condition
>> test from [1]. The test creates shmem huge pages, where the main thread
>> punches holes in the shmem to cause folio_split() in the kernel and
>> a set of 16 threads reads the shmem to cause filemap_get_entry() in the
>> kernel. filemap_get_entry() reads the folio and xarray split by
>> folio_split() locklessly. The original test[2] is written in rust and uses
>> memfd (shmem backed). This C port uses shmem directly and uses a single
>> process.
>>
>> Note: the initial rust to C conversion is done by Cursor.
>>
>> Link: https://lore.kernel.org/all/CAKNNEtw5_kZomhkugedKMPOG-sxs5Q5OLumWJdiWXv+C9Yct0w@mail.gmail.com/ [1]
>> Link: https://github.com/dfinity/thp-madv-remove-test [2]
>> Signed-off-by: Bas van Dijk <bas@dfinity.org>
>> Signed-off-by: Adam Bratschi-Kaye <adam.bratschikaye@dfinity.org>
>
> You are likely missing two Co-developed-by.
>
> See Documentation/process/submitting-patches.rst on how to handle such
> SOBs.

Will add them.

>
>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>> ---
>> From V3:
>> 1. fixed for loop stepping issue
>> 2. used PRIu64 instead of %zu for uint64_t.
>>
>> From V2:
>> 1. simplified the program by removing fork.
>>
>> From V1:
>> 1. added prctl(PR_SET_PDEATHSIG, SIGTERM) to avoid child looping
>>    forever.
>> 2. removed page_idx % PUNCH_INTERVAL >= 0, since it is a nop. Added a
>>    comment.
>> 3. added a child process status check to prevent parent looping forever
>>    and record that as a failure.
>> 4. used ksft_exit_skip() instead of ksft_finished() when the program is
>>    not running as root.
>> 5. restored THP settings properly when the program exits abnormally.
>>  tools/testing/selftests/mm/Makefile           |   1 +
>>  .../selftests/mm/folio_split_race_test.c      | 293 ++++++++++++++++++
>>  tools/testing/selftests/mm/run_vmtests.sh     |   2 +
>>  3 files changed, 296 insertions(+)
>>  create mode 100644 tools/testing/selftests/mm/folio_split_race_test.c
>>
>> diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
>> index 7a5de4e9bf520..cd24596cdd27e 100644
>> --- a/tools/testing/selftests/mm/Makefile
>> +++ b/tools/testing/selftests/mm/Makefile
>> @@ -105,6 +105,7 @@ TEST_GEN_FILES += droppable
>>  TEST_GEN_FILES += guard-regions
>>  TEST_GEN_FILES += merge
>>  TEST_GEN_FILES += rmap
>> +TEST_GEN_FILES += folio_split_race_test
>>
>>  ifneq ($(ARCH),arm64)
>>  TEST_GEN_FILES += soft-dirty
>> diff --git a/tools/testing/selftests/mm/folio_split_race_test.c b/tools/testing/selftests/mm/folio_split_race_test.c
>> new file mode 100644
>> index 0000000000000..c264cc625a7cb
>> --- /dev/null
>> +++ b/tools/testing/selftests/mm/folio_split_race_test.c
>> @@ -0,0 +1,293 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * The test creates shmem PMD huge pages, fills all pages with known patterns,
>> + * then continuously verifies non-punched pages with 16 threads. Meanwhile, the
>> + * main thread punches holes via MADV_REMOVE on the shmem.
>> + *
>> + * It tests the race condition between folio_split() and filemap_get_entry(),
>> + * where the hole punches on shmem lead to folio_split() and reading the shmem
>> + * leads to filemap_get_entry().
>> + */
>> +
>> +#define _GNU_SOURCE
>> +#include <errno.h>
>> +#include <inttypes.h>
>> +#include <linux/mman.h>
>> +#include <pthread.h>
>> +#include <stdatomic.h>
>> +#include <stdbool.h>
>> +#include <stdint.h>
>> +#include <stdio.h>
>> +#include <stdlib.h>
>> +#include <string.h>
>> +#include <sys/mman.h>
>> +#include <signal.h>
>> +#include <unistd.h>
>> +#include "vm_util.h"
>> +#include "kselftest.h"
>> +#include "thp_settings.h"
>
>
> [...] some comments on the main() part :)
>
>> +int main(void)
>> +{
>> +	struct thp_settings current_settings;
>> +	bool failed = false;
>> +	size_t failures;
>> +	size_t iter;
>
> Why are iterations a "size_t" ? Similarly for "failures". Just use int / unsigned long?

Will change them.

>
>> +
>> +	ksft_print_header();
>> +
>> +	if (!thp_is_enabled())
>> +		ksft_exit_skip("Transparent Hugepages not available\n");
>
> Is checking thp_is_enabled() the right thing to do when you perform your own setup below either way?
>
> I think you should just use thp_available(). Then, configure THP accordingly below?

OK, will use thp_available().

>
>> +
>> +	if (geteuid() != 0)
>> +		ksft_exit_skip("Please run the test as root\n");
>> +
>> +	thp_save_settings();
>> +	/* make sure thp settings are restored */
>> +	if (atexit(thp_settings_cleanup) != 0)
>> +		ksft_exit_fail_msg("atexit failed\n");
>> +
>> +	signal(SIGINT, thp_cleanup_handler);
>> +	signal(SIGTERM, thp_cleanup_handler);
>> +
>> +	thp_read_settings(&current_settings);
>> +	current_settings.shmem_enabled = SHMEM_ADVISE;
>> +	thp_write_settings(&current_settings);
>> +
>> +	ksft_set_plan(1);
>> +
>> +	page_size = getpagesize();
>> +	pmd_pagesize = read_pmd_pagesize();
>
> I wonder whether we should check for 0 here and skip the test (older kernels?).

OK, will take care of that.

>
>> +
>> +	ksft_print_msg("folio split race test\n");
>> +	ksft_print_msg("===================================================\n");
>> +	ksft_print_msg("Shmem size:       %" PRIu64 " MiB\n", FILE_SIZE / 1024 / 1024);
>> +	ksft_print_msg("Total pages:     %" PRIu64 "\n", TOTAL_PAGES);
>> +	ksft_print_msg("Child readers:   %d\n", NUM_READER_THREADS);
>> +	ksft_print_msg("Punching every %dth to %dth page\n", PUNCH_INTERVAL,
>> +		       PUNCH_INTERVAL + PUNCH_SIZE_FACTOR);
>> +	ksft_print_msg("Iterations:      %d\n", NUM_ITERATIONS);
>
>
> I don't think printing static test information is that helpful.
> Do we need all that at all?

To provide some information on what this test is doing? I am OK with
removing them, but one will need to check the source code to get an idea.

>
>
>> +
>> +	for (iter = 1; iter <= NUM_ITERATIONS; iter++) {
>
> Why not start at 0? You know, to confuse less people :)
>
> 	for (iter = 0; iter < NUM_ITERATIONS; iter++) {

Will change it.

>
>> +		failures = run_iteration();
>
> "corrupted_pages" ?
>
>> +		if (failures > 0) {
>> +			failed = true;
>
> Do you really need that variable?
>
>> +			ksft_print_msg(
>> +				"FAILED on iteration %zu: %zu pages corrupted by MADV_REMOVE!\n",
>> +				iter, failures);
>
> Can that simply be printed below?
>
> Like
>
> if (iter < NUM_ITERATIONS) {
> 	ksft_test_result_fail("Test failed on iterations %zu: %zu pages ...\n",
> 			      iter + 1, corrupted_pages);
> } else {
> 	ksft_test_result_pass ...
> }
>

Sure, will simplify it. Thanks for the feedback.

>> +			break;
>> +		}
>> +	}
>> +
>> +	if (failed) {
>> +		ksft_test_result_fail("Test failed\n");
>> +		ksft_exit_fail();
>> +	} else {
>> +		ksft_test_result_pass("All %d iterations passed\n",
>> +				      NUM_ITERATIONS);
>> +		ksft_exit_pass();
>> +	}
>> +
>> +	return 0;
> -- 
> Cheers,
>
> David


Best Regards,
Yan, Zi

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2026-03-23 15:25 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
2026-03-20 14:22 [PATCH v4] selftests/mm: add folio_split() and filemap_get_entry() race test Zi Yan
2026-03-20 18:00 ` Zi Yan
2026-03-23  1:12 ` Zi Yan
2026-03-23 12:48 ` David Hildenbrand (Arm)
2026-03-23 15:24   ` Zi Yan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox