From: Nico Pache <npache@redhat.com>
To: fio@vger.kernel.org
Cc: axboe@kernel.dk, vincentfu@gmail.com, npache@redhat.com,
david@kernel.org, willy@infradead.org
Subject: [RFC 2/2] page_fault: add hugepage_delay option for delayed MADV_HUGEPAGE
Date: Thu, 29 Jan 2026 11:43:01 -0700 [thread overview]
Message-ID: <20260129184302.34887-3-npache@redhat.com> (raw)
In-Reply-To: <20260129184302.34887-1-npache@redhat.com>
Introduce a hugepage_delay option that maps memory with MADV_NOHUGEPAGE
first, then applies MADV_HUGEPAGE after a configurable delay (in
milliseconds) from a helper thread. This makes the creation of
khugepaged collapse candidates reproducible in page_fault tests.
Signed-off-by: Nico Pache <npache@redhat.com>
---
cconv.c | 3 ++
engines/page_fault.c | 94 ++++++++++++++++++++++++++++++++++++++++----
options.c | 10 +++++
thread_options.h | 3 ++
4 files changed, 103 insertions(+), 7 deletions(-)
diff --git a/cconv.c b/cconv.c
index 0c4a3f2d..4fafbf83 100644
--- a/cconv.c
+++ b/cconv.c
@@ -242,6 +242,7 @@ int convert_thread_options_to_cpu(struct thread_options *o,
o->random_center.u.f = fio_uint64_to_double(le64_to_cpu(top->random_center.u.i));
o->random_generator = le32_to_cpu(top->random_generator);
o->hugepage_size = le32_to_cpu(top->hugepage_size);
+ o->hugepage_delay = le32_to_cpu(top->hugepage_delay);
o->rw_min_bs = le64_to_cpu(top->rw_min_bs);
o->thinkcycles = le32_to_cpu(top->thinkcycles);
o->thinktime = le32_to_cpu(top->thinktime);
@@ -494,6 +495,8 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
top->random_center.u.i = __cpu_to_le64(fio_double_to_uint64(o->random_center.u.f));
top->random_generator = cpu_to_le32(o->random_generator);
top->hugepage_size = cpu_to_le32(o->hugepage_size);
+ top->hugepage_delay = cpu_to_le32(o->hugepage_delay);
+ top->hugepage_delay_pad = 0;
top->rw_min_bs = __cpu_to_le64(o->rw_min_bs);
top->thinkcycles = cpu_to_le32(o->thinkcycles);
top->thinktime = cpu_to_le32(o->thinktime);
diff --git a/engines/page_fault.c b/engines/page_fault.c
index e0a3c9e5..1724d553 100644
--- a/engines/page_fault.c
+++ b/engines/page_fault.c
@@ -1,20 +1,65 @@
#include "ioengines.h"
#include "fio.h"
+#include <errno.h>
+#include <pthread.h>
#include <sys/mman.h>
+#include <time.h>
struct fio_page_fault_data {
void *mmap_ptr;
size_t mmap_sz;
off_t mmap_off;
+#ifdef CONFIG_HAVE_THP
+ pthread_t mmap_thread;
+ pthread_mutex_t mmap_lock;
+ pthread_cond_t mmap_cond;
+ int mmap_thread_exit;
+ int mmap_thread_started;
+ unsigned int hugepage_delay;
+#endif
};
+#ifdef CONFIG_HAVE_THP
+static void *mmap_delay_thread(void *data)
+{
+ struct fio_page_fault_data *fpd = data;
+ struct timespec req;
+ int ret;
+
+ clock_gettime(CLOCK_REALTIME, &req);
+ req.tv_sec += fpd->hugepage_delay / 1000;
+ req.tv_nsec += (fpd->hugepage_delay % 1000) * 1000000;
+ if (req.tv_nsec >= 1000000000) {
+ req.tv_sec++;
+ req.tv_nsec -= 1000000000;
+ }
+
+ pthread_mutex_lock(&fpd->mmap_lock);
+ while (!fpd->mmap_thread_exit) {
+ ret = pthread_cond_timedwait(&fpd->mmap_cond, &fpd->mmap_lock, &req);
+ if (ret == ETIMEDOUT)
+ break;
+ }
+
+ if (!fpd->mmap_thread_exit) {
+ dprint(FD_MEM, "fio: madvising hugepage\n");
+ ret = madvise(fpd->mmap_ptr, fpd->mmap_sz, MADV_HUGEPAGE);
+ if (ret < 0)
+ log_err("fio: madvise hugepage failed: %d\n", errno);
+ }
+ pthread_mutex_unlock(&fpd->mmap_lock);
+
+ return NULL;
+}
+#endif
+
static int fio_page_fault_init(struct thread_data *td)
{
size_t total_io_size;
struct fio_page_fault_data *fpd = calloc(1, sizeof(*fpd));
if (!fpd)
return 1;
-
+
total_io_size = td->o.size;
fpd->mmap_sz = total_io_size;
fpd->mmap_off = 0;
@@ -25,6 +70,26 @@ static int fio_page_fault_init(struct thread_data *td)
return 1;
}
+ if (td->o.hugepage_delay) {
+#ifdef CONFIG_HAVE_THP
+ fpd->hugepage_delay = td->o.hugepage_delay;
+ madvise(fpd->mmap_ptr, fpd->mmap_sz, MADV_NOHUGEPAGE);
+
+ pthread_mutex_init(&fpd->mmap_lock, NULL);
+ pthread_cond_init(&fpd->mmap_cond, NULL);
+ fpd->mmap_thread_exit = 0;
+ if (pthread_create(&fpd->mmap_thread, NULL, mmap_delay_thread, fpd)) {
+ log_err("fio: failed to create mmap delay thread\n");
+ pthread_cond_destroy(&fpd->mmap_cond);
+ pthread_mutex_destroy(&fpd->mmap_lock);
+ fpd->hugepage_delay = 0;
+ fpd->mmap_thread_started = 0;
+ } else {
+ fpd->mmap_thread_started = 1;
+ }
+#endif
+ }
+
FILE_SET_ENG_DATA(td->files[0], fpd);
return 0;
}
@@ -73,12 +138,27 @@ static int fio_page_fault_open_file(struct thread_data *td, struct fio_file *f)
static int fio_page_fault_close_file(struct thread_data *td, struct fio_file *f)
{
- struct fio_page_fault_data *fpd = FILE_ENG_DATA(f);
- if (!fpd)
- return 1;
- if (fpd->mmap_ptr && fpd->mmap_sz)
- munmap(fpd->mmap_ptr, fpd->mmap_sz);
- free(fpd);
+ struct fio_page_fault_data *fpd = FILE_ENG_DATA(f);
+
+ if (fpd) {
+#ifdef CONFIG_HAVE_THP
+ if (fpd->mmap_thread_started) {
+ pthread_mutex_lock(&fpd->mmap_lock);
+ fpd->mmap_thread_exit = 1;
+ pthread_cond_signal(&fpd->mmap_cond);
+ pthread_mutex_unlock(&fpd->mmap_lock);
+ pthread_join(fpd->mmap_thread, NULL);
+ pthread_cond_destroy(&fpd->mmap_cond);
+ pthread_mutex_destroy(&fpd->mmap_lock);
+ fpd->mmap_thread_started = 0;
+ }
+#endif
+
+ if (fpd->mmap_ptr && fpd->mmap_sz)
+ munmap(fpd->mmap_ptr, fpd->mmap_sz);
+ free(fpd);
+ }
+
return 0;
}
diff --git a/options.c b/options.c
index f526f5eb..5f8c53cd 100644
--- a/options.c
+++ b/options.c
@@ -5490,6 +5490,16 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
.category = FIO_OPT_C_GENERAL,
.group = FIO_OPT_G_INVALID,
},
+ {
+ .name = "hugepage_delay",
+ .lname = "Hugepage delay",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct thread_options, hugepage_delay),
+ .help = "For mmap, map with MADV_NOHUGEPAGE then MADV_HUGEPAGE after delay (in ms)",
+ .def = "0",
+ .category = FIO_OPT_C_GENERAL,
+ .group = FIO_OPT_G_INVALID,
+ },
{
.name = "flow_id",
.lname = "I/O flow ID",
diff --git a/thread_options.h b/thread_options.h
index b4dd8d7a..f288664f 100644
--- a/thread_options.h
+++ b/thread_options.h
@@ -203,6 +203,7 @@ struct thread_options {
unsigned int perc_rand[DDIR_RWDIR_CNT];
unsigned int hugepage_size;
+ unsigned int hugepage_delay;
unsigned long long rw_min_bs;
unsigned int fsync_blocks;
unsigned int fdatasync_blocks;
@@ -539,6 +540,8 @@ struct thread_options_pack {
uint32_t perc_rand[DDIR_RWDIR_CNT];
uint32_t hugepage_size;
+ uint32_t hugepage_delay;
+ uint32_t hugepage_delay_pad;
uint64_t rw_min_bs;
uint32_t fsync_blocks;
uint32_t fdatasync_blocks;
--
2.52.0
next prev parent reply other threads:[~2026-01-29 18:43 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-01-29 18:42 [RFC 0/2] Introduce a page_fault ioengine for MM workflows Nico Pache
2026-01-29 18:43 ` [RFC 1/2] page_fault: add mmap-backed ioengine for anonymous faults Nico Pache
2026-01-29 18:43 ` Nico Pache [this message]
2026-01-30 20:08 ` [RFC 2/2] page_fault: add hugepage_delay option for delayed MADV_HUGEPAGE Vincent Fu
2026-02-02 15:24 ` Nico Pache
2026-01-30 21:00 ` [RFC 0/2] Introduce a page_fault ioengine for MM workflows fiotestbot
2026-01-31 13:59 ` Jens Axboe
2026-02-02 15:22 ` Nico Pache
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260129184302.34887-3-npache@redhat.com \
--to=npache@redhat.com \
--cc=axboe@kernel.dk \
--cc=david@kernel.org \
--cc=fio@vger.kernel.org \
--cc=vincentfu@gmail.com \
--cc=willy@infradead.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox