* [RFC 1/2] page_fault: add mmap-backed ioengine for anonymous faults
2026-01-29 18:42 [RFC 0/2] Introduce a page_fault ioengine for MM workflows Nico Pache
@ 2026-01-29 18:43 ` Nico Pache
2026-01-29 18:43 ` [RFC 2/2] page_fault: add hugepage_delay option for delayed MADV_HUGEPAGE Nico Pache
` (2 subsequent siblings)
3 siblings, 0 replies; 8+ messages in thread
From: Nico Pache @ 2026-01-29 18:43 UTC (permalink / raw)
To: fio; +Cc: axboe, vincentfu, npache, david, willy
Introduce a new ioengine that mmaps anonymous memory and copies data
on read/write to trigger page faults. This allows us to leverage fio's
powerful framework for MM-related testing, and should let us expand
test coverage quickly by reusing existing filesystem-oriented fio job
files.
Signed-off-by: Nico Pache <npache@redhat.com>
---
Makefile | 2 +-
engines/page_fault.c | 105 ++++++++++++++++++++++++++++++++++++++++
examples/page_fault.fio | 9 ++++
3 files changed, 115 insertions(+), 1 deletion(-)
create mode 100644 engines/page_fault.c
create mode 100644 examples/page_fault.fio
diff --git a/Makefile b/Makefile
index 0337e8fe..099e2f94 100644
--- a/Makefile
+++ b/Makefile
@@ -57,7 +57,7 @@ SOURCE := $(sort $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/crc/*.c)) \
smalloc.c filehash.c profile.c debug.c engines/cpu.c \
engines/mmap.c engines/sync.c engines/null.c engines/net.c \
engines/ftruncate.c engines/fileoperations.c \
- engines/exec.c \
+ engines/exec.c engines/page_fault.c \
server.c client.c iolog.c backend.c libfio.c flow.c cconv.c \
gettime-thread.c helpers.c json.c idletime.c td_error.c \
profiles/tiobench.c profiles/act.c io_u_queue.c filelock.c \
diff --git a/engines/page_fault.c b/engines/page_fault.c
new file mode 100644
index 00000000..e0a3c9e5
--- /dev/null
+++ b/engines/page_fault.c
@@ -0,0 +1,105 @@
+#include "ioengines.h"
+#include "fio.h"
+#include <sys/mman.h>
+
+/*
+ * Per-file engine data: one anonymous private mapping backs all I/O for
+ * the job; reads/writes copy through it to generate page faults.
+ */
+struct fio_page_fault_data {
+	void *mmap_ptr;		/* base of the anonymous mapping */
+	size_t mmap_sz;		/* mapping length, taken from td->o.size */
+	off_t mmap_off;		/* currently always 0 */
+};
+
+/*
+ * Create the anonymous mapping that backs all I/O for this job.  Called
+ * once per thread; the mapping spans the full job size so every offset
+ * fio generates lands inside it.  Returns 0 on success, 1 on error.
+ */
+static int fio_page_fault_init(struct thread_data *td)
+{
+	struct fio_page_fault_data *fpd;
+	size_t total_io_size = td->o.size;
+
+	/* mmap() of length 0 fails with EINVAL; report it clearly instead */
+	if (!total_io_size) {
+		log_err("fio: page_fault engine requires a non-zero size\n");
+		return 1;
+	}
+
+	fpd = calloc(1, sizeof(*fpd));
+	if (!fpd)
+		return 1;
+
+	fpd->mmap_sz = total_io_size;
+	fpd->mmap_off = 0;
+	fpd->mmap_ptr = mmap(NULL, total_io_size, PROT_READ | PROT_WRITE,
+			     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (fpd->mmap_ptr == MAP_FAILED) {
+		log_err("fio: page_fault mmap of %llu bytes failed\n",
+			(unsigned long long) total_io_size);
+		free(fpd);
+		return 1;
+	}
+
+	FILE_SET_ENG_DATA(td->files[0], fpd);
+	return 0;
+}
+
+/*
+ * No per-I/O preparation is needed: the whole mapping is set up once at
+ * init time.
+ */
+static int fio_page_fault_prep(struct thread_data *td, struct io_u *io_u)
+{
+	return 0;
+}
+
+/*
+ * Service one I/O by copying between the io_u buffer and the anonymous
+ * mapping; first-touch accesses fault pages in.  Errors must be reported
+ * via io_u->error with FIO_Q_COMPLETED returned: the original code
+ * returned 1 on error, but 1 is FIO_Q_QUEUED in enum fio_q_status, so
+ * fio would wait forever for a completion that never arrives.
+ */
+static enum fio_q_status fio_page_fault_queue(struct thread_data *td,
+					      struct io_u *io_u)
+{
+	struct fio_page_fault_data *fpd = FILE_ENG_DATA(io_u->file);
+	void *mmap_head;
+
+	if (!fpd) {
+		io_u->error = EINVAL;
+		return FIO_Q_COMPLETED;
+	}
+
+	/* never copy past the end of the mapping */
+	if (io_u->offset + io_u->buflen > fpd->mmap_sz) {
+		io_u->error = EINVAL;
+		return FIO_Q_COMPLETED;
+	}
+
+	mmap_head = fpd->mmap_ptr + io_u->offset;
+	switch (io_u->ddir) {
+	case DDIR_READ:
+		/* memcpy touches every page just like the byte loop did */
+		memcpy(io_u->xfer_buf, mmap_head, io_u->buflen);
+		break;
+	case DDIR_WRITE:
+		memcpy(mmap_head, io_u->xfer_buf, io_u->buflen);
+		break;
+	default:
+		io_u->error = EINVAL;
+		break;
+	}
+
+	return FIO_Q_COMPLETED;
+}
+
+/*
+ * Nothing to open: the engine is diskless and operates purely on the
+ * anonymous mapping created in init.
+ */
+static int fio_page_fault_open_file(struct thread_data *td, struct fio_file *f)
+{
+	return 0;
+}
+
+/*
+ * Tear down the per-file engine data: unmap the anonymous region and
+ * free the bookkeeping structure.  Also clear the file's engine-data
+ * slot so a repeated close cannot dereference or double-free the stale
+ * pointer.
+ */
+static int fio_page_fault_close_file(struct thread_data *td, struct fio_file *f)
+{
+	struct fio_page_fault_data *fpd = FILE_ENG_DATA(f);
+
+	if (!fpd)
+		return 1;
+
+	if (fpd->mmap_ptr && fpd->mmap_sz)
+		munmap(fpd->mmap_ptr, fpd->mmap_sz);
+	free(fpd);
+	FILE_SET_ENG_DATA(f, NULL);
+	return 0;
+}
+
+/*
+ * Engine registration table.  FIO_SYNCIO: queue() completes I/O inline;
+ * FIO_DISKLESSIO: no real file I/O behind it; FIO_NOEXTEND: the backing
+ * "file" (the mapping) can never be grown.
+ */
+static struct ioengine_ops ioengine = {
+	.name = "page_fault",
+	.version = FIO_IOOPS_VERSION,
+	.init = fio_page_fault_init,
+	.prep = fio_page_fault_prep,
+	.queue = fio_page_fault_queue,
+	.open_file = fio_page_fault_open_file,
+	.close_file = fio_page_fault_close_file,
+	.get_file_size = generic_get_file_size,
+	.flags = FIO_SYNCIO | FIO_NOEXTEND | FIO_DISKLESSIO,
+};
+
+/* Constructor/destructor hooks that (un)register the engine with fio. */
+static void fio_init fio_page_fault_register(void)
+{
+	register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_page_fault_unregister(void)
+{
+	unregister_ioengine(&ioengine);
+}
\ No newline at end of file
diff --git a/examples/page_fault.fio b/examples/page_fault.fio
new file mode 100644
index 00000000..9001f570
--- /dev/null
+++ b/examples/page_fault.fio
@@ -0,0 +1,9 @@
+[global]
+bs=4k
+
+[page_fault]
+ioengine=page_fault
+size=2g
+rw=randrw
+rwmixread=50
+verify=crc32c
\ No newline at end of file
--
2.52.0
^ permalink raw reply related [flat|nested] 8+ messages in thread* [RFC 2/2] page_fault: add hugepage_delay option for delayed MADV_HUGEPAGE
2026-01-29 18:42 [RFC 0/2] Introduce a page_fault ioengine for MM workflows Nico Pache
2026-01-29 18:43 ` [RFC 1/2] page_fault: add mmap-backed ioengine for anonymous faults Nico Pache
@ 2026-01-29 18:43 ` Nico Pache
2026-01-30 20:08 ` Vincent Fu
2026-01-30 21:00 ` [RFC 0/2] Introduce a page_fault ioengine for MM workflows fiotestbot
2026-01-31 13:59 ` Jens Axboe
3 siblings, 1 reply; 8+ messages in thread
From: Nico Pache @ 2026-01-29 18:43 UTC (permalink / raw)
To: fio; +Cc: axboe, vincentfu, npache, david, willy
Introduce hugepage_delay to map with MADV_NOHUGEPAGE first and then,
after a configurable delay, have a helper thread apply MADV_HUGEPAGE.
This makes khugepaged collapse candidates reproducible in page_fault
tests.
Signed-off-by: Nico Pache <npache@redhat.com>
---
cconv.c | 3 ++
engines/page_fault.c | 94 ++++++++++++++++++++++++++++++++++++++++----
options.c | 10 +++++
thread_options.h | 3 ++
4 files changed, 103 insertions(+), 7 deletions(-)
diff --git a/cconv.c b/cconv.c
index 0c4a3f2d..4fafbf83 100644
--- a/cconv.c
+++ b/cconv.c
@@ -242,6 +242,7 @@ int convert_thread_options_to_cpu(struct thread_options *o,
o->random_center.u.f = fio_uint64_to_double(le64_to_cpu(top->random_center.u.i));
o->random_generator = le32_to_cpu(top->random_generator);
o->hugepage_size = le32_to_cpu(top->hugepage_size);
+ o->hugepage_delay = le32_to_cpu(top->hugepage_delay);
o->rw_min_bs = le64_to_cpu(top->rw_min_bs);
o->thinkcycles = le32_to_cpu(top->thinkcycles);
o->thinktime = le32_to_cpu(top->thinktime);
@@ -494,6 +495,8 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
top->random_center.u.i = __cpu_to_le64(fio_double_to_uint64(o->random_center.u.f));
top->random_generator = cpu_to_le32(o->random_generator);
top->hugepage_size = cpu_to_le32(o->hugepage_size);
+ top->hugepage_delay = cpu_to_le32(o->hugepage_delay);
+ top->hugepage_delay_pad = 0;
top->rw_min_bs = __cpu_to_le64(o->rw_min_bs);
top->thinkcycles = cpu_to_le32(o->thinkcycles);
top->thinktime = cpu_to_le32(o->thinktime);
diff --git a/engines/page_fault.c b/engines/page_fault.c
index e0a3c9e5..1724d553 100644
--- a/engines/page_fault.c
+++ b/engines/page_fault.c
@@ -1,20 +1,65 @@
#include "ioengines.h"
#include "fio.h"
+#include <errno.h>
+#include <pthread.h>
#include <sys/mman.h>
+#include <time.h>
struct fio_page_fault_data {
void *mmap_ptr;
size_t mmap_sz;
off_t mmap_off;
+#ifdef CONFIG_HAVE_THP
+ pthread_t mmap_thread;
+ pthread_mutex_t mmap_lock;
+ pthread_cond_t mmap_cond;
+ int mmap_thread_exit;
+ int mmap_thread_started;
+ unsigned int hugepage_delay;
+#endif
};
+#ifdef CONFIG_HAVE_THP
+/*
+ * Helper-thread body: wait fpd->hugepage_delay milliseconds, then
+ * madvise(MADV_HUGEPAGE) the job's mapping so it becomes a khugepaged
+ * collapse candidate.  The wait is a pthread_cond_timedwait against an
+ * absolute CLOCK_REALTIME deadline so close_file() can cut the sleep
+ * short by setting mmap_thread_exit and signalling mmap_cond.
+ */
+static void *mmap_delay_thread(void *data)
+{
+	struct fio_page_fault_data *fpd = data;
+	struct timespec req;
+	int ret;
+
+	/* build the absolute deadline: now + delay (ms split into s/ns) */
+	clock_gettime(CLOCK_REALTIME, &req);
+	req.tv_sec += fpd->hugepage_delay / 1000;
+	req.tv_nsec += (fpd->hugepage_delay % 1000) * 1000000;
+	if (req.tv_nsec >= 1000000000) {
+		req.tv_sec++;
+		req.tv_nsec -= 1000000000;
+	}
+
+	pthread_mutex_lock(&fpd->mmap_lock);
+	/* loop absorbs spurious wakeups; breaks on timeout or exit request */
+	while (!fpd->mmap_thread_exit) {
+		ret = pthread_cond_timedwait(&fpd->mmap_cond, &fpd->mmap_lock, &req);
+		if (ret == ETIMEDOUT)
+			break;
+	}
+
+	/* only madvise when the delay elapsed, not when we were cancelled */
+	if (!fpd->mmap_thread_exit) {
+		dprint(FD_MEM, "fio: madvising hugepage\n");
+		ret = madvise(fpd->mmap_ptr, fpd->mmap_sz, MADV_HUGEPAGE);
+		if (ret < 0)
+			log_err("fio: madvise hugepage failed: %d\n", errno);
+	}
+	pthread_mutex_unlock(&fpd->mmap_lock);
+
+	return NULL;
+}
+#endif
+
static int fio_page_fault_init(struct thread_data *td)
{
size_t total_io_size;
struct fio_page_fault_data *fpd = calloc(1, sizeof(*fpd));
if (!fpd)
return 1;
-
+
total_io_size = td->o.size;
fpd->mmap_sz = total_io_size;
fpd->mmap_off = 0;
@@ -25,6 +70,26 @@ static int fio_page_fault_init(struct thread_data *td)
return 1;
}
+ if (td->o.hugepage_delay) {
+#ifdef CONFIG_HAVE_THP
+ fpd->hugepage_delay = td->o.hugepage_delay;
+ madvise(fpd->mmap_ptr, fpd->mmap_sz, MADV_NOHUGEPAGE);
+
+ pthread_mutex_init(&fpd->mmap_lock, NULL);
+ pthread_cond_init(&fpd->mmap_cond, NULL);
+ fpd->mmap_thread_exit = 0;
+ if (pthread_create(&fpd->mmap_thread, NULL, mmap_delay_thread, fpd)) {
+ log_err("fio: failed to create mmap delay thread\n");
+ pthread_cond_destroy(&fpd->mmap_cond);
+ pthread_mutex_destroy(&fpd->mmap_lock);
+ fpd->hugepage_delay = 0;
+ fpd->mmap_thread_started = 0;
+ } else {
+ fpd->mmap_thread_started = 1;
+ }
+#endif
+ }
+
FILE_SET_ENG_DATA(td->files[0], fpd);
return 0;
}
@@ -73,12 +138,27 @@ static int fio_page_fault_open_file(struct thread_data *td, struct fio_file *f)
static int fio_page_fault_close_file(struct thread_data *td, struct fio_file *f)
{
- struct fio_page_fault_data *fpd = FILE_ENG_DATA(f);
- if (!fpd)
- return 1;
- if (fpd->mmap_ptr && fpd->mmap_sz)
- munmap(fpd->mmap_ptr, fpd->mmap_sz);
- free(fpd);
+ struct fio_page_fault_data *fpd = FILE_ENG_DATA(f);
+
+ if (fpd) {
+#ifdef CONFIG_HAVE_THP
+ if (fpd->mmap_thread_started) {
+ pthread_mutex_lock(&fpd->mmap_lock);
+ fpd->mmap_thread_exit = 1;
+ pthread_cond_signal(&fpd->mmap_cond);
+ pthread_mutex_unlock(&fpd->mmap_lock);
+ pthread_join(fpd->mmap_thread, NULL);
+ pthread_cond_destroy(&fpd->mmap_cond);
+ pthread_mutex_destroy(&fpd->mmap_lock);
+ fpd->mmap_thread_started = 0;
+ }
+#endif
+
+ if (fpd->mmap_ptr && fpd->mmap_sz)
+ munmap(fpd->mmap_ptr, fpd->mmap_sz);
+ free(fpd);
+ }
+
return 0;
}
diff --git a/options.c b/options.c
index f526f5eb..5f8c53cd 100644
--- a/options.c
+++ b/options.c
@@ -5490,6 +5490,16 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
.category = FIO_OPT_C_GENERAL,
.group = FIO_OPT_G_INVALID,
},
+	{
+		.name	= "hugepage_delay",
+		.lname	= "Hugepage delay",
+		.type	= FIO_OPT_INT,
+		.off1	= offsetof(struct thread_options, hugepage_delay),
+		/* fixed: this option belongs to page_fault, not the mmap engine */
+		.help	= "For page_fault, map with MADV_NOHUGEPAGE then MADV_HUGEPAGE after delay (in ms)",
+		.def	= "0",
+		.category = FIO_OPT_C_GENERAL,
+		.group	= FIO_OPT_G_INVALID,
+	},
{
.name = "flow_id",
.lname = "I/O flow ID",
diff --git a/thread_options.h b/thread_options.h
index b4dd8d7a..f288664f 100644
--- a/thread_options.h
+++ b/thread_options.h
@@ -203,6 +203,7 @@ struct thread_options {
unsigned int perc_rand[DDIR_RWDIR_CNT];
unsigned int hugepage_size;
+ unsigned int hugepage_delay;
unsigned long long rw_min_bs;
unsigned int fsync_blocks;
unsigned int fdatasync_blocks;
@@ -539,6 +540,8 @@ struct thread_options_pack {
uint32_t perc_rand[DDIR_RWDIR_CNT];
uint32_t hugepage_size;
+ uint32_t hugepage_delay;
+ uint32_t hugepage_delay_pad;
uint64_t rw_min_bs;
uint32_t fsync_blocks;
uint32_t fdatasync_blocks;
--
2.52.0
^ permalink raw reply related [flat|nested] 8+ messages in thread