* [PATCH v3 1/4] mm: add NR_DONTCACHE_DIRTY node page counter
2026-04-26 11:56 [PATCH v3 0/4] mm: improve write performance with RWF_DONTCACHE Jeff Layton
@ 2026-04-26 11:56 ` Jeff Layton
2026-04-26 11:56 ` [PATCH v3 2/4] mm: kick writeback flusher for IOCB_DONTCACHE with targeted dirty tracking Jeff Layton
` (2 subsequent siblings)
3 siblings, 0 replies; 9+ messages in thread
From: Jeff Layton @ 2026-04-26 11:56 UTC (permalink / raw)
To: Alexander Viro, Christian Brauner, Jan Kara,
Matthew Wilcox (Oracle), Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Mike Snitzer, Jens Axboe,
Ritesh Harjani, Christoph Hellwig, Kairui Song, Qi Zheng,
Shakeel Butt, Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Chuck Lever
Cc: linux-fsdevel, linux-kernel, linux-nfs, linux-mm,
linux-trace-kernel, Jeff Layton
Add a per-node page counter that tracks the number of dirty pages with
the dropbehind flag set (i.e., pages dirtied via RWF_DONTCACHE writes).
Increment the counter alongside NR_FILE_DIRTY in folio_account_dirtied()
when the folio has the dropbehind flag set, and decrement it in
folio_clear_dirty_for_io(), folio_account_cleaned(), and when a
non-DONTCACHE access clears the dropbehind flag on a dirty folio.
The counter is visible via /proc/vmstat as "nr_dontcache_dirty" and
will be used by the writeback flusher to determine how many pages to
write back when expediting writeback for IOCB_DONTCACHE writes, without
flushing the entire BDI's dirty pages.
Assisted-by: Claude:claude-opus-4-6
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
include/linux/mmzone.h | 1 +
mm/filemap.c | 6 +++++-
mm/page-writeback.c | 7 +++++++
mm/vmstat.c | 1 +
4 files changed, 14 insertions(+), 1 deletion(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9adb2ad21da5..ed9cc61c7627 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -259,6 +259,7 @@ enum node_stat_item {
only modified from process context */
NR_FILE_PAGES,
NR_FILE_DIRTY,
+ NR_DONTCACHE_DIRTY,
NR_WRITEBACK,
NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */
NR_SHMEM_THPS,
diff --git a/mm/filemap.c b/mm/filemap.c
index 4e636647100c..45089fde5150 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2052,8 +2052,12 @@ struct folio *__filemap_get_folio_mpol(struct address_space *mapping,
if (!folio)
return ERR_PTR(-ENOENT);
/* not an uncached lookup, clear uncached if set */
- if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE))
+ if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE)) {
+ if (folio_test_dirty(folio))
+ lruvec_stat_mod_folio(folio, NR_DONTCACHE_DIRTY,
+ -folio_nr_pages(folio));
folio_clear_dropbehind(folio);
+ }
return folio;
}
EXPORT_SYMBOL(__filemap_get_folio_mpol);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 88cd53d4ba09..e1df93fb3e3b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2630,6 +2630,8 @@ static void folio_account_dirtied(struct folio *folio,
wb = inode_to_wb(inode);
lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr);
+ if (folio_test_dropbehind(folio))
+ lruvec_stat_mod_folio(folio, NR_DONTCACHE_DIRTY, nr);
__zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
__node_stat_mod_folio(folio, NR_DIRTIED, nr);
wb_stat_mod(wb, WB_RECLAIMABLE, nr);
@@ -2651,6 +2653,8 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb)
long nr = folio_nr_pages(folio);
lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
+ if (folio_test_dropbehind(folio))
+ lruvec_stat_mod_folio(folio, NR_DONTCACHE_DIRTY, -nr);
zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
task_io_account_cancelled_write(nr * PAGE_SIZE);
@@ -2920,6 +2924,9 @@ bool folio_clear_dirty_for_io(struct folio *folio)
if (folio_test_clear_dirty(folio)) {
long nr = folio_nr_pages(folio);
lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
+ if (folio_test_dropbehind(folio))
+ lruvec_stat_mod_folio(folio,
+ NR_DONTCACHE_DIRTY, -nr);
zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
ret = true;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f534972f517d..c3e5dfadb9a5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1240,6 +1240,7 @@ const char * const vmstat_text[] = {
[I(NR_FILE_MAPPED)] = "nr_mapped",
[I(NR_FILE_PAGES)] = "nr_file_pages",
[I(NR_FILE_DIRTY)] = "nr_dirty",
+ [I(NR_DONTCACHE_DIRTY)] = "nr_dontcache_dirty",
[I(NR_WRITEBACK)] = "nr_writeback",
[I(NR_SHMEM)] = "nr_shmem",
[I(NR_SHMEM_THPS)] = "nr_shmem_hugepages",
--
2.53.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH v3 2/4] mm: kick writeback flusher for IOCB_DONTCACHE with targeted dirty tracking
2026-04-26 11:56 [PATCH v3 0/4] mm: improve write performance with RWF_DONTCACHE Jeff Layton
2026-04-26 11:56 ` [PATCH v3 1/4] mm: add NR_DONTCACHE_DIRTY node page counter Jeff Layton
@ 2026-04-26 11:56 ` Jeff Layton
2026-04-26 12:28 ` Andrew Morton
2026-04-26 11:56 ` [PATCH v3 3/4] testing: add nfsd-io-bench NFS server benchmark suite Jeff Layton
2026-04-26 11:56 ` [PATCH v3 4/4] testing: add dontcache-bench local filesystem " Jeff Layton
3 siblings, 1 reply; 9+ messages in thread
From: Jeff Layton @ 2026-04-26 11:56 UTC (permalink / raw)
To: Alexander Viro, Christian Brauner, Jan Kara,
Matthew Wilcox (Oracle), Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Mike Snitzer, Jens Axboe,
Ritesh Harjani, Christoph Hellwig, Kairui Song, Qi Zheng,
Shakeel Butt, Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Chuck Lever
Cc: linux-fsdevel, linux-kernel, linux-nfs, linux-mm,
linux-trace-kernel, Jeff Layton
The IOCB_DONTCACHE writeback path in generic_write_sync() calls
filemap_flush_range() on every write, submitting writeback inline in
the writer's context. Perf lock contention profiling shows the
performance problem is not lock contention but the writeback submission
work itself — walking the page tree and submitting I/O blocks the writer
for milliseconds, inflating p99.9 latency from 23ms (buffered) to 93ms
(dontcache).
Replace the inline filemap_flush_range() call with a flusher kick that
drains dirty pages in the background. This moves writeback submission
completely off the writer's hot path.
To avoid flushing unrelated buffered dirty data, add a dedicated
WB_start_dontcache bit and wb_check_start_dontcache() handler that uses
the new NR_DONTCACHE_DIRTY counter to determine how many pages to write
back. The flusher writes back that many pages from the oldest dirty
inodes (not restricted to dontcache-specific inodes). This helps
preserve I/O batching while limiting the scope of expedited writeback.
Like WB_start_all, the WB_start_dontcache bit coalesces multiple
DONTCACHE writes into a single flusher wakeup without per-write
allocations.
Also add WB_REASON_DONTCACHE as a new writeback reason for tracing
visibility, and target the correct cgroup writeback domain via
unlocked_inode_to_wb_begin().
dontcache-bench results on dual-socket Xeon Gold 6138 (80 CPUs, 256 GB
RAM, Samsung MZ1LB1T9HALS 1.7 TB NVMe, local XFS, io_uring, file size
~503 GB, compared to a v6.19-ish baseline):
Single-client sequential write (MB/s):
baseline patched change
buffered 1449.8 1440.1 -0.7%
dontcache 1347.9 1461.5 +8.4%
direct 1450.0 1440.1 -0.7%
Single-client sequential write latency (us):
baseline patched change
dontcache p50 3031.0 10551.3 +248.1%
dontcache p99 74973.2 21626.9 -71.2%
dontcache p99.9 85459.0 23199.7 -72.9%
Single-client random write (MB/s):
baseline patched change
dontcache 284.2 295.4 +3.9%
Single-client random write p99.9 latency (us):
baseline patched change
dontcache 2277.4 872.4 -61.7%
Multi-writer aggregate throughput (MB/s):
baseline patched change
buffered 1619.5 1611.2 -0.5%
dontcache 1281.1 1629.4 +27.2%
direct 1545.4 1609.4 +4.1%
Mixed-mode noisy neighbor (dontcache writer + buffered readers):
baseline patched change
writer (MB/s) 1297.6 1471.1 +13.4%
readers avg (MB/s) 855.0 462.4 -45.9%
nfsd-io-bench results on same hardware (XFS on NVMe, NFSv3 via fio
NFS engine with libnfs, 1024 NFSD threads, pool_mode=pernode,
file size ~502 GB, compared to v6.19-ish baseline):
Single-client sequential write (MB/s):
baseline patched change
buffered 4844.2 4653.4 -3.9%
dontcache 3028.3 3723.1 +22.9%
direct 957.6 987.8 +3.2%
Single-client sequential write p99.9 latency (us):
baseline patched change
dontcache 759169.0 175112.2 -76.9%
Single-client random write (MB/s):
baseline patched change
dontcache 590.0 1561.0 +164.6%
Multi-writer aggregate throughput (MB/s):
baseline patched change
buffered 9636.3 9422.9 -2.2%
dontcache 1894.9 9442.6 +398.3%
direct 809.6 975.1 +20.4%
Noisy neighbor (dontcache writer + random readers):
baseline patched change
writer (MB/s) 1854.5 4063.6 +119.1%
readers avg (MB/s) 131.2 101.6 -22.5%
The NFS results show even larger improvements than the local benchmarks.
Multi-writer dontcache throughput improves nearly 5x, matching buffered
I/O. Dirty page footprint drops 85-95% in sequential workloads vs.
buffered.
Assisted-by: Claude:claude-opus-4-6
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
fs/fs-writeback.c | 60 ++++++++++++++++++++++++++++++++++++++++
include/linux/backing-dev-defs.h | 2 ++
include/linux/fs.h | 6 ++--
include/trace/events/writeback.h | 3 +-
4 files changed, 66 insertions(+), 5 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a65694cbfe68..377767db48f7 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1334,6 +1334,18 @@ static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
wb_wakeup(wb);
}
+static void wb_start_dontcache_writeback(struct bdi_writeback *wb)
+{
+ if (!wb_has_dirty_io(wb))
+ return;
+
+ if (test_bit(WB_start_dontcache, &wb->state) ||
+ test_and_set_bit(WB_start_dontcache, &wb->state))
+ return;
+
+ wb_wakeup(wb);
+}
+
/**
* wb_start_background_writeback - start background writeback
* @wb: bdi_writback to write from
@@ -2373,6 +2385,28 @@ static long wb_check_start_all(struct bdi_writeback *wb)
return nr_pages;
}
+static long wb_check_start_dontcache(struct bdi_writeback *wb)
+{
+ long nr_pages;
+
+ if (!test_bit(WB_start_dontcache, &wb->state))
+ return 0;
+
+ nr_pages = global_node_page_state(NR_DONTCACHE_DIRTY);
+ if (nr_pages) {
+ struct wb_writeback_work work = {
+ .nr_pages = wb_split_bdi_pages(wb, nr_pages),
+ .sync_mode = WB_SYNC_NONE,
+ .range_cyclic = 1,
+ .reason = WB_REASON_DONTCACHE,
+ };
+
+ nr_pages = wb_writeback(wb, &work);
+ }
+
+ clear_bit(WB_start_dontcache, &wb->state);
+ return nr_pages;
+}
/*
* Retrieve work items and do the writeback they describe
@@ -2394,6 +2428,11 @@ static long wb_do_writeback(struct bdi_writeback *wb)
*/
wrote += wb_check_start_all(wb);
+ /*
+ * Check for dontcache writeback request
+ */
+ wrote += wb_check_start_dontcache(wb);
+
/*
* Check for periodic writeback, kupdated() style
*/
@@ -2468,6 +2507,27 @@ void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
rcu_read_unlock();
}
+/**
+ * filemap_dontcache_kick_writeback - kick flusher for IOCB_DONTCACHE writes
+ * @mapping: address_space that was just written to
+ *
+ * Kick the writeback flusher thread to expedite writeback of dontcache
+ * dirty pages. Uses a dedicated WB_start_dontcache bit so that the
+ * flusher writes back a number of pages sized from NR_DONTCACHE_DIRTY
+ * (from the oldest dirty inodes), not the entire BDI's dirty pages.
+ */
+void filemap_dontcache_kick_writeback(struct address_space *mapping)
+{
+ struct inode *inode = mapping->host;
+ struct bdi_writeback *wb;
+ struct wb_lock_cookie cookie = {};
+
+ wb = unlocked_inode_to_wb_begin(inode, &cookie);
+ wb_start_dontcache_writeback(wb);
+ unlocked_inode_to_wb_end(inode, &cookie);
+}
+EXPORT_SYMBOL_GPL(filemap_dontcache_kick_writeback);
+
/*
* Wakeup the flusher threads to start writeback of all currently dirty pages
*/
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index a06b93446d10..74f8a9977f5d 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -26,6 +26,7 @@ enum wb_state {
WB_writeback_running, /* Writeback is in progress */
WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */
WB_start_all, /* nr_pages == 0 (all) work pending */
+ WB_start_dontcache, /* dontcache writeback pending */
};
enum wb_stat_item {
@@ -55,6 +56,7 @@ enum wb_reason {
*/
WB_REASON_FORKER_THREAD,
WB_REASON_FOREIGN_FLUSH,
+ WB_REASON_DONTCACHE,
WB_REASON_MAX,
};
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 11559c513dfb..df72b42a9e9b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2624,6 +2624,7 @@ extern int __must_check file_write_and_wait_range(struct file *file,
loff_t start, loff_t end);
int filemap_flush_range(struct address_space *mapping, loff_t start,
loff_t end);
+void filemap_dontcache_kick_writeback(struct address_space *mapping);
static inline int file_write_and_wait(struct file *file)
{
@@ -2657,10 +2658,7 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
if (ret)
return ret;
} else if (iocb->ki_flags & IOCB_DONTCACHE) {
- struct address_space *mapping = iocb->ki_filp->f_mapping;
-
- filemap_flush_range(mapping, iocb->ki_pos - count,
- iocb->ki_pos - 1);
+ filemap_dontcache_kick_writeback(iocb->ki_filp->f_mapping);
}
return count;
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index bdac0d685a98..13ee076ccd16 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -44,7 +44,8 @@
EM( WB_REASON_PERIODIC, "periodic") \
EM( WB_REASON_FS_FREE_SPACE, "fs_free_space") \
EM( WB_REASON_FORKER_THREAD, "forker_thread") \
- EMe(WB_REASON_FOREIGN_FLUSH, "foreign_flush")
+ EM( WB_REASON_FOREIGN_FLUSH, "foreign_flush") \
+ EMe(WB_REASON_DONTCACHE, "dontcache")
WB_WORK_REASON
--
2.53.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH v3 2/4] mm: kick writeback flusher for IOCB_DONTCACHE with targeted dirty tracking
2026-04-26 11:56 ` [PATCH v3 2/4] mm: kick writeback flusher for IOCB_DONTCACHE with targeted dirty tracking Jeff Layton
@ 2026-04-26 12:28 ` Andrew Morton
2026-04-26 14:05 ` Jeff Layton
0 siblings, 1 reply; 9+ messages in thread
From: Andrew Morton @ 2026-04-26 12:28 UTC (permalink / raw)
To: Jeff Layton
Cc: Alexander Viro, Christian Brauner, Jan Kara,
Matthew Wilcox (Oracle), David Hildenbrand, Lorenzo Stoakes,
Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Mike Snitzer, Jens Axboe,
Ritesh Harjani, Christoph Hellwig, Kairui Song, Qi Zheng,
Shakeel Butt, Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Chuck Lever,
linux-fsdevel, linux-kernel, linux-nfs, linux-mm,
linux-trace-kernel
Naive questions...
On Sun, 26 Apr 2026 07:56:08 -0400 Jeff Layton <jlayton@kernel.org> wrote:
> The IOCB_DONTCACHE writeback path in generic_write_sync() calls
> filemap_flush_range() on every write, submitting writeback inline in
> the writer's context. Perf lock contention profiling shows the
> performance problem is not lock contention but the writeback submission
> work itself — walking the page tree and submitting I/O blocks the writer
> for milliseconds, inflating p99.9 latency from 23ms (buffered) to 93ms
> (dontcache).
So in the current case, when generic_write_sync() returns, all that
memory is written back and clean&reclaimable (or freed?), yes?
> Replace the inline filemap_flush_range() call with a flusher kick that
> drains dirty pages in the background. This moves writeback submission
> completely off the writer's hot path.
Whereas after this change, that pagecache is probably still dirty,
unreclaimable, waiting for the flusher to do its thing?
So is there potential that the system will get all gummed up with
dirty, to-be-written-soon pagecache? Is there something which limits
this buildup?
> ...
>
> dontcache-bench results on dual-socket Xeon Gold 6138 (80 CPUs, 256 GB
> RAM, Samsung MZ1LB1T9HALS 1.7 TB NVMe, local XFS, io_uring, file size
> ~503 GB, compared to a v6.19-ish baseline):
>
> Single-client sequential write (MB/s):
> baseline patched change
> buffered 1449.8 1440.1 -0.7%
> dontcache 1347.9 1461.5 +8.4%
> direct 1450.0 1440.1 -0.7%
>
> Single-client sequential write latency (us):
> baseline patched change
> dontcache p50 3031.0 10551.3 +248.1%
> dontcache p99 74973.2 21626.9 -71.2%
> dontcache p99.9 85459.0 23199.7 -72.9%
>
> Single-client random write (MB/s):
> baseline patched change
> dontcache 284.2 295.4 +3.9%
>
> Single-client random write p99.9 latency (us):
> baseline patched change
> dontcache 2277.4 872.4 -61.7%
>
> Multi-writer aggregate throughput (MB/s):
> baseline patched change
> buffered 1619.5 1611.2 -0.5%
> dontcache 1281.1 1629.4 +27.2%
> direct 1545.4 1609.4 +4.1%
>
> Mixed-mode noisy neighbor (dontcache writer + buffered readers):
> baseline patched change
> writer (MB/s) 1297.6 1471.1 +13.4%
> readers avg (MB/s) 855.0 462.4 -45.9%
These results look ambiguous. Sometimes better, sometimes worse?
> nfsd-io-bench results on same hardware (XFS on NVMe, NFSv3 via fio
> NFS engine with libnfs, 1024 NFSD threads, pool_mode=pernode,
> file size ~502 GB, compared to v6.19-ish baseline):
>
> Single-client sequential write (MB/s):
> baseline patched change
> buffered 4844.2 4653.4 -3.9%
> dontcache 3028.3 3723.1 +22.9%
> direct 957.6 987.8 +3.2%
>
> Single-client sequential write p99.9 latency (us):
> baseline patched change
> dontcache 759169.0 175112.2 -76.9%
>
> Single-client random write (MB/s):
> baseline patched change
> dontcache 590.0 1561.0 +164.6%
>
> Multi-writer aggregate throughput (MB/s):
> baseline patched change
> buffered 9636.3 9422.9 -2.2%
> dontcache 1894.9 9442.6 +398.3%
> direct 809.6 975.1 +20.4%
>
> Noisy neighbor (dontcache writer + random readers):
> baseline patched change
> writer (MB/s) 1854.5 4063.6 +119.1%
> readers avg (MB/s) 131.2 101.6 -22.5%
Ditto but less so.
> The NFS results show even larger improvements than the local benchmarks.
> Multi-writer dontcache throughput improves nearly 5x, matching buffered
> I/O. Dirty page footprint drops 85-95% in sequential workloads vs.
> buffered.
It sounds that you like the results, so OK ;)
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v3 2/4] mm: kick writeback flusher for IOCB_DONTCACHE with targeted dirty tracking
2026-04-26 12:28 ` Andrew Morton
@ 2026-04-26 14:05 ` Jeff Layton
0 siblings, 0 replies; 9+ messages in thread
From: Jeff Layton @ 2026-04-26 14:05 UTC (permalink / raw)
To: Andrew Morton
Cc: Alexander Viro, Christian Brauner, Jan Kara,
Matthew Wilcox (Oracle), David Hildenbrand, Lorenzo Stoakes,
Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Mike Snitzer, Jens Axboe,
Ritesh Harjani, Christoph Hellwig, Kairui Song, Qi Zheng,
Shakeel Butt, Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Chuck Lever,
linux-fsdevel, linux-kernel, linux-nfs, linux-mm,
linux-trace-kernel
On Sun, 2026-04-26 at 05:28 -0700, Andrew Morton wrote:
> Naive questions...
>
> On Sun, 26 Apr 2026 07:56:08 -0400 Jeff Layton <jlayton@kernel.org> wrote:
>
> > The IOCB_DONTCACHE writeback path in generic_write_sync() calls
> > filemap_flush_range() on every write, submitting writeback inline in
> > the writer's context. Perf lock contention profiling shows the
> > performance problem is not lock contention but the writeback submission
> > work itself — walking the page tree and submitting I/O blocks the writer
> > for milliseconds, inflating p99.9 latency from 23ms (buffered) to 93ms
> > (dontcache).
>
> So in the current case, when generic_write_sync() returns, all that
> memory is written back and clean&reclaimable (or freed?), yes?
>
No. Before returning, it submits the I/Os for the portion that it wrote
rather than leaving it to the flusher to take care of things, but it
doesn't wait for the I/Os to complete.
> > Replace the inline filemap_flush_range() call with a flusher kick that
> > drains dirty pages in the background. This moves writeback submission
> > completely off the writer's hot path.
>
> Whereas after this change, that pagecache is probably still dirty,
> unreclaimable, waiting for the flusher to do its thing?
>
Correct, but that's sort of the case today too since DONTCACHE I/Os
don't wait for the completion. With this change we're just deferring
the I/O submission to the flusher thread (which should hopefully soon
wake and take care of business). If the flusher thread can't keep up,
then eventually balance_dirty_pages() will kick in and start slowing
things down.
> So is there potential that the system will get all gummed up with
> dirty, to-be-written-soon pagecache? Is there something which limits
> this buildup?
>
Today in this situation, the writers are limited by the backing device
throughput. Once the I/O submission queues are full, then the DONTCACHE
writers end up stacking up on those. With this change, the writers will
be more limited by traditional VM limits in this situation.
In the test runs I did, the peak pagecache with DONTCACHE writes was
higher than with the unpatched version but still considerably less than
with normal buffered I/O. That's the cost of deferring the I/O
submission to the flusher.
One thing we could consider is going back to submitting the writes
inline when the number of dirty pages is high. But, that could have a
detrimental effect on performance too.
> > ...
> >
> > dontcache-bench results on dual-socket Xeon Gold 6138 (80 CPUs, 256 GB
> > RAM, Samsung MZ1LB1T9HALS 1.7 TB NVMe, local XFS, io_uring, file size
> > ~503 GB, compared to a v6.19-ish baseline):
> >
> > Single-client sequential write (MB/s):
> > baseline patched change
> > buffered 1449.8 1440.1 -0.7%
> > dontcache 1347.9 1461.5 +8.4%
> > direct 1450.0 1440.1 -0.7%
> >
> > Single-client sequential write latency (us):
> > baseline patched change
> > dontcache p50 3031.0 10551.3 +248.1%
> > dontcache p99 74973.2 21626.9 -71.2%
> > dontcache p99.9 85459.0 23199.7 -72.9%
> >
> > Single-client random write (MB/s):
> > baseline patched change
> > dontcache 284.2 295.4 +3.9%
> >
> > Single-client random write p99.9 latency (us):
> > baseline patched change
> > dontcache 2277.4 872.4 -61.7%
> >
> > Multi-writer aggregate throughput (MB/s):
> > baseline patched change
> > buffered 1619.5 1611.2 -0.5%
> > dontcache 1281.1 1629.4 +27.2%
> > direct 1545.4 1609.4 +4.1%
> >
> > Mixed-mode noisy neighbor (dontcache writer + buffered readers):
> > baseline patched change
> > writer (MB/s) 1297.6 1471.1 +13.4%
> > readers avg (MB/s) 855.0 462.4 -45.9%
>
> These results look ambiguous. Sometimes better, sometimes worse?
>
> > nfsd-io-bench results on same hardware (XFS on NVMe, NFSv3 via fio
> > NFS engine with libnfs, 1024 NFSD threads, pool_mode=pernode,
> > file size ~502 GB, compared to v6.19-ish baseline):
> >
> > Single-client sequential write (MB/s):
> > baseline patched change
> > buffered 4844.2 4653.4 -3.9%
> > dontcache 3028.3 3723.1 +22.9%
> > direct 957.6 987.8 +3.2%
> >
> > Single-client sequential write p99.9 latency (us):
> > baseline patched change
> > dontcache 759169.0 175112.2 -76.9%
> >
> > Single-client random write (MB/s):
> > baseline patched change
> > dontcache 590.0 1561.0 +164.6%
> >
> > Multi-writer aggregate throughput (MB/s):
> > baseline patched change
> > buffered 9636.3 9422.9 -2.2%
> > dontcache 1894.9 9442.6 +398.3%
> > direct 809.6 975.1 +20.4%
> >
> > Noisy neighbor (dontcache writer + random readers):
> > baseline patched change
> > writer (MB/s) 1854.5 4063.6 +119.1%
> > readers avg (MB/s) 131.2 101.6 -22.5%
>
> Ditto but less so.
>
> > The NFS results show even larger improvements than the local benchmarks.
> > Multi-writer dontcache throughput improves nearly 5x, matching buffered
> > I/O. Dirty page footprint drops 85-95% in sequential workloads vs.
> > buffered.
>
> It sounds that you like the results, so OK ;)
--
Jeff Layton <jlayton@kernel.org>
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH v3 3/4] testing: add nfsd-io-bench NFS server benchmark suite
2026-04-26 11:56 [PATCH v3 0/4] mm: improve write performance with RWF_DONTCACHE Jeff Layton
2026-04-26 11:56 ` [PATCH v3 1/4] mm: add NR_DONTCACHE_DIRTY node page counter Jeff Layton
2026-04-26 11:56 ` [PATCH v3 2/4] mm: kick writeback flusher for IOCB_DONTCACHE with targeted dirty tracking Jeff Layton
@ 2026-04-26 11:56 ` Jeff Layton
2026-04-26 12:34 ` Andrew Morton
2026-04-26 11:56 ` [PATCH v3 4/4] testing: add dontcache-bench local filesystem " Jeff Layton
3 siblings, 1 reply; 9+ messages in thread
From: Jeff Layton @ 2026-04-26 11:56 UTC (permalink / raw)
To: Alexander Viro, Christian Brauner, Jan Kara,
Matthew Wilcox (Oracle), Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Mike Snitzer, Jens Axboe,
Ritesh Harjani, Christoph Hellwig, Kairui Song, Qi Zheng,
Shakeel Butt, Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Chuck Lever
Cc: linux-fsdevel, linux-kernel, linux-nfs, linux-mm,
linux-trace-kernel, Jeff Layton
Add a benchmark suite for testing NFSD I/O mode performance using fio
with the libnfs backend against an NFS server on localhost. Tests
buffered, dontcache, and direct I/O modes via NFSD debugfs controls.
Includes:
- fio job files for sequential/random read/write, multi-writer,
noisy-neighbor, and latency-sensitive reader workloads
- run-benchmarks.sh: orchestrates test matrix with mode switching
- parse-results.sh: extracts metrics from fio JSON output
- setup-server.sh: configures NFS export for testing
Assisted-by: Claude:claude-opus-4-6
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
.../testing/nfsd-io-bench/fio-jobs/lat-reader.fio | 15 +
.../testing/nfsd-io-bench/fio-jobs/multi-write.fio | 14 +
.../nfsd-io-bench/fio-jobs/noisy-writer.fio | 14 +
tools/testing/nfsd-io-bench/fio-jobs/rand-read.fio | 15 +
.../testing/nfsd-io-bench/fio-jobs/rand-write.fio | 15 +
tools/testing/nfsd-io-bench/fio-jobs/seq-read.fio | 14 +
tools/testing/nfsd-io-bench/fio-jobs/seq-write.fio | 14 +
.../testing/nfsd-io-bench/scripts/parse-results.sh | 238 +++++++++
.../nfsd-io-bench/scripts/run-benchmarks.sh | 591 +++++++++++++++++++++
.../testing/nfsd-io-bench/scripts/setup-server.sh | 94 ++++
10 files changed, 1024 insertions(+)
diff --git a/tools/testing/nfsd-io-bench/fio-jobs/lat-reader.fio b/tools/testing/nfsd-io-bench/fio-jobs/lat-reader.fio
new file mode 100644
index 000000000000..61af37e8b860
--- /dev/null
+++ b/tools/testing/nfsd-io-bench/fio-jobs/lat-reader.fio
@@ -0,0 +1,15 @@
+[global]
+ioengine=nfs
+nfs_url=nfs://localhost/export
+direct=0
+bs=4k
+numjobs=16
+runtime=300
+time_based=1
+group_reporting=1
+rw=randread
+log_avg_msec=1000
+write_bw_log=latreader
+write_lat_log=latreader
+
+[lat_reader]
diff --git a/tools/testing/nfsd-io-bench/fio-jobs/multi-write.fio b/tools/testing/nfsd-io-bench/fio-jobs/multi-write.fio
new file mode 100644
index 000000000000..16b792aecabb
--- /dev/null
+++ b/tools/testing/nfsd-io-bench/fio-jobs/multi-write.fio
@@ -0,0 +1,14 @@
+[global]
+ioengine=nfs
+nfs_url=nfs://localhost/export
+direct=0
+bs=1M
+numjobs=16
+time_based=0
+group_reporting=1
+rw=write
+log_avg_msec=1000
+write_bw_log=multiwrite
+write_lat_log=multiwrite
+
+[writer]
diff --git a/tools/testing/nfsd-io-bench/fio-jobs/noisy-writer.fio b/tools/testing/nfsd-io-bench/fio-jobs/noisy-writer.fio
new file mode 100644
index 000000000000..615154a7737e
--- /dev/null
+++ b/tools/testing/nfsd-io-bench/fio-jobs/noisy-writer.fio
@@ -0,0 +1,14 @@
+[global]
+ioengine=nfs
+nfs_url=nfs://localhost/export
+direct=0
+bs=1M
+numjobs=16
+time_based=0
+group_reporting=1
+rw=write
+log_avg_msec=1000
+write_bw_log=noisywriter
+write_lat_log=noisywriter
+
+[bulk_writer]
diff --git a/tools/testing/nfsd-io-bench/fio-jobs/rand-read.fio b/tools/testing/nfsd-io-bench/fio-jobs/rand-read.fio
new file mode 100644
index 000000000000..501bae7416a8
--- /dev/null
+++ b/tools/testing/nfsd-io-bench/fio-jobs/rand-read.fio
@@ -0,0 +1,15 @@
+[global]
+ioengine=nfs
+nfs_url=nfs://localhost/export
+direct=0
+bs=4k
+numjobs=16
+runtime=300
+time_based=1
+group_reporting=1
+rw=randread
+log_avg_msec=1000
+write_bw_log=randread
+write_lat_log=randread
+
+[randread]
diff --git a/tools/testing/nfsd-io-bench/fio-jobs/rand-write.fio b/tools/testing/nfsd-io-bench/fio-jobs/rand-write.fio
new file mode 100644
index 000000000000..d891d04197ae
--- /dev/null
+++ b/tools/testing/nfsd-io-bench/fio-jobs/rand-write.fio
@@ -0,0 +1,15 @@
+[global]
+ioengine=nfs
+nfs_url=nfs://localhost/export
+direct=0
+bs=64k
+numjobs=16
+runtime=300
+time_based=1
+group_reporting=1
+rw=randwrite
+log_avg_msec=1000
+write_bw_log=randwrite
+write_lat_log=randwrite
+
+[randwrite]
diff --git a/tools/testing/nfsd-io-bench/fio-jobs/seq-read.fio b/tools/testing/nfsd-io-bench/fio-jobs/seq-read.fio
new file mode 100644
index 000000000000..6e24ab355026
--- /dev/null
+++ b/tools/testing/nfsd-io-bench/fio-jobs/seq-read.fio
@@ -0,0 +1,14 @@
+[global]
+ioengine=nfs
+nfs_url=nfs://localhost/export
+direct=0
+bs=1M
+numjobs=16
+time_based=0
+group_reporting=1
+rw=read
+log_avg_msec=1000
+write_bw_log=seqread
+write_lat_log=seqread
+
+[seqread]
diff --git a/tools/testing/nfsd-io-bench/fio-jobs/seq-write.fio b/tools/testing/nfsd-io-bench/fio-jobs/seq-write.fio
new file mode 100644
index 000000000000..260858e345f5
--- /dev/null
+++ b/tools/testing/nfsd-io-bench/fio-jobs/seq-write.fio
@@ -0,0 +1,14 @@
+[global]
+ioengine=nfs
+nfs_url=nfs://localhost/export
+direct=0
+bs=1M
+numjobs=16
+time_based=0
+group_reporting=1
+rw=write
+log_avg_msec=1000
+write_bw_log=seqwrite
+write_lat_log=seqwrite
+
+[seqwrite]
diff --git a/tools/testing/nfsd-io-bench/scripts/parse-results.sh b/tools/testing/nfsd-io-bench/scripts/parse-results.sh
new file mode 100755
index 000000000000..0427d411db04
--- /dev/null
+++ b/tools/testing/nfsd-io-bench/scripts/parse-results.sh
@@ -0,0 +1,238 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Parse fio JSON output and generate comparison tables.
+#
+# Usage: ./parse-results.sh <results-dir>
+
+set -euo pipefail
+
+if [ $# -lt 1 ]; then
+ echo "Usage: $0 <results-dir>"
+ exit 1
+fi
+
+RESULTS_DIR="$1"
+
+if ! command -v jq &>/dev/null; then
+ echo "ERROR: jq is required"
+ exit 1
+fi
+
+# Extract metrics from a single fio JSON result
+extract_metrics() {
+ local json_file=$1
+ local rw_type=$2 # read or write
+
+ if [ ! -f "$json_file" ]; then
+ echo "N/A N/A N/A N/A N/A N/A"
+ return
+ fi
+
+ jq -r --arg rw "$rw_type" '
+ .jobs[0][$rw] as $d |
+ [
+ (($d.bw // 0) / 1024 | . * 10 | round / 10), # MB/s
+ ($d.iops // 0), # IOPS
+ ((($d.clat_ns.mean // 0) / 1000) | . * 10 | round / 10), # avg lat us
+ (($d.clat_ns.percentile["50.000000"] // 0) / 1000), # p50 us
+ (($d.clat_ns.percentile["99.000000"] // 0) / 1000), # p99 us
+ (($d.clat_ns.percentile["99.900000"] // 0) / 1000) # p99.9 us
+ ] | @tsv
+ ' "$json_file" 2>/dev/null || echo "N/A N/A N/A N/A N/A N/A"
+}
+
+# Extract server CPU from vmstat log: prints the mean of the "sy" column with one decimal, or "N/A" if $1 is missing or holds no samples
+extract_cpu() {
+    local vmstat_log=$1
+    if [ ! -f "$vmstat_log" ]; then
+        echo "N/A"
+        return
+    fi
+    # vmstat columns: us sy id wa st (sy is field 14). Count numeric rows only: vmstat re-prints its header periodically.
+    awk '$14 ~ /^[0-9]+$/ {sum+=$14; n++} END {if(n>0) printf "%.1f", sum/n; else print "N/A"}' \
+        "$vmstat_log" 2>/dev/null || echo "N/A"
+}
+
+# Extract peak dirty pages from meminfo log
+extract_peak_dirty() {
+ local meminfo_log=$1
+ if [ ! -f "$meminfo_log" ]; then
+ echo "N/A"
+ return
+ fi
+ grep "^Dirty:" "$meminfo_log" | awk '{print $2}' | sort -n | tail -1 || echo "N/A"
+}
+
+# Extract peak cached from meminfo log
+extract_peak_cached() {
+ local meminfo_log=$1
+ if [ ! -f "$meminfo_log" ]; then
+ echo "N/A"
+ return
+ fi
+ grep "^Cached:" "$meminfo_log" | awk '{print $2}' | sort -n | tail -1 || echo "N/A"
+}
+
+print_separator() {
+ printf '%*s\n' 120 '' | tr ' ' '-'
+}
+
+########################################################################
+# Deliverable 1: Single-client results
+########################################################################
+echo ""
+echo "=================================================================="
+echo " Deliverable 1: Single-Client fio Benchmarks"
+echo "=================================================================="
+echo ""
+
+for workload in seq-write rand-write seq-read rand-read; do
+ case $workload in
+ seq-write|rand-write) rw_type="write" ;;
+ seq-read|rand-read) rw_type="read" ;;
+ esac
+
+ echo "--- $workload ---"
+ printf "%-16s %10s %10s %10s %10s %10s %10s %10s %12s %12s\n" \
+ "Mode" "MB/s" "IOPS" "Avg(us)" "p50(us)" "p99(us)" "p99.9(us)" "Sys CPU%" "PeakDirty(kB)" "PeakCache(kB)"
+ print_separator
+
+ for mode in buffered dontcache direct; do
+ dir="${RESULTS_DIR}/${workload}/${mode}"
+ json_file=$(find "$dir" -name '*.json' -not -name 'client*' 2>/dev/null | head -1 || true)
+ if [ -z "$json_file" ]; then
+ printf "%-16s %10s\n" "$mode" "(no data)"
+ continue
+ fi
+
+ read -r mbps iops avg_lat p50 p99 p999 <<< \
+ "$(extract_metrics "$json_file" "$rw_type")"
+ cpu=$(extract_cpu "${dir}/vmstat.log")
+ dirty=$(extract_peak_dirty "${dir}/meminfo.log")
+ cached=$(extract_peak_cached "${dir}/meminfo.log")
+
+ printf "%-16s %10s %10s %10s %10s %10s %10s %10s %12s %12s\n" \
+ "$mode" "$mbps" "$iops" "$avg_lat" "$p50" "$p99" "$p999" \
+ "$cpu" "${dirty:-N/A}" "${cached:-N/A}"
+ done
+ echo ""
+done
+
+########################################################################
+# Deliverable 2: Multi-client results
+########################################################################
+echo "=================================================================="
+echo " Deliverable 2: Noisy-Neighbor Benchmarks"
+echo "=================================================================="
+echo ""
+
+# Scenario A: Multiple writers (per-client rows plus an aggregate bandwidth line for each mode)
+echo "--- Scenario A: Multiple Writers ---"
+for mode in buffered dontcache direct; do
+    dir="${RESULTS_DIR}/multi-write/${mode}"
+    if [ ! -d "$dir" ]; then
+        continue
+    fi
+
+    echo " Mode: $mode"
+    printf " %-10s %10s %10s %10s %10s %10s %10s\n" \
+        "Client" "MB/s" "IOPS" "Avg(us)" "p50(us)" "p99(us)" "p99.9(us)"
+
+    total_bw=0
+    # Sum with awk rather than bc: awk evaluates a non-numeric "N/A" field as 0, so one bad client file cannot corrupt the total.
+    for json_file in "${dir}"/client*.json; do
+        [ -f "$json_file" ] || continue
+        client=$(basename "$json_file" .json)
+        read -r mbps iops avg_lat p50 p99 p999 <<< \
+            "$(extract_metrics "$json_file" "write")"
+        printf " %-10s %10s %10s %10s %10s %10s %10s\n" \
+            "$client" "$mbps" "$iops" "$avg_lat" "$p50" "$p99" "$p999"
+        total_bw=$(awk -v a="$total_bw" -v b="${mbps:-0}" \
+            'BEGIN {printf "%.1f", a + b}' || echo "$total_bw")
+    done
+
+    cpu=$(extract_cpu "${dir}/vmstat.log")
+    dirty=$(extract_peak_dirty "${dir}/meminfo.log")
+    printf " Aggregate BW: %s MB/s | Sys CPU: %s%% | Peak Dirty: %s kB\n" \
+        "$total_bw" "$cpu" "${dirty:-N/A}"
+    echo ""
+done
+
+# Scenario C: Noisy neighbor
+echo "--- Scenario C: Noisy Writer + Latency-Sensitive Readers ---"
+for mode in buffered dontcache direct; do
+ dir="${RESULTS_DIR}/noisy-neighbor/${mode}"
+ if [ ! -d "$dir" ]; then
+ continue
+ fi
+
+ echo " Mode: $mode"
+ printf " %-14s %10s %10s %10s %10s %10s %10s\n" \
+ "Job" "MB/s" "IOPS" "Avg(us)" "p50(us)" "p99(us)" "p99.9(us)"
+
+ # Writer
+ if [ -f "${dir}/noisy_writer.json" ]; then
+ read -r mbps iops avg_lat p50 p99 p999 <<< \
+ "$(extract_metrics "${dir}/noisy_writer.json" "write")"
+ printf " %-14s %10s %10s %10s %10s %10s %10s\n" \
+ "Bulk writer" "$mbps" "$iops" "$avg_lat" "$p50" "$p99" "$p999"
+ fi
+
+ # Readers
+ for json_file in "${dir}"/reader*.json; do
+ [ -f "$json_file" ] || continue
+ reader=$(basename "$json_file" .json)
+ read -r mbps iops avg_lat p50 p99 p999 <<< \
+ "$(extract_metrics "$json_file" "read")"
+ printf " %-14s %10s %10s %10s %10s %10s %10s\n" \
+ "$reader" "$mbps" "$iops" "$avg_lat" "$p50" "$p99" "$p999"
+ done
+
+ cpu=$(extract_cpu "${dir}/vmstat.log")
+ dirty=$(extract_peak_dirty "${dir}/meminfo.log")
+ printf " Sys CPU: %s%% | Peak Dirty: %s kB\n" "$cpu" "${dirty:-N/A}"
+ echo ""
+done
+
+# Scenario D: Mixed-mode noisy neighbor
+echo "--- Scenario D: Mixed-Mode Noisy Writer + Readers ---"
+for dir in "${RESULTS_DIR}"/noisy-neighbor-mixed/*/; do
+ [ -d "$dir" ] || continue
+ label=$(basename "$dir")
+
+ echo " Mode: $label"
+ printf " %-14s %10s %10s %10s %10s %10s %10s\n" \
+ "Job" "MB/s" "IOPS" "Avg(us)" "p50(us)" "p99(us)" "p99.9(us)"
+
+ # Writer
+ if [ -f "${dir}/noisy_writer.json" ]; then
+ read -r mbps iops avg_lat p50 p99 p999 <<< \
+ "$(extract_metrics "${dir}/noisy_writer.json" "write")"
+ printf " %-14s %10s %10s %10s %10s %10s %10s\n" \
+ "Bulk writer" "$mbps" "$iops" "$avg_lat" "$p50" "$p99" "$p999"
+ fi
+
+ # Readers
+ for json_file in "${dir}"/reader*.json; do
+ [ -f "$json_file" ] || continue
+ reader=$(basename "$json_file" .json)
+ read -r mbps iops avg_lat p50 p99 p999 <<< \
+ "$(extract_metrics "$json_file" "read")"
+ printf " %-14s %10s %10s %10s %10s %10s %10s\n" \
+ "$reader" "$mbps" "$iops" "$avg_lat" "$p50" "$p99" "$p999"
+ done
+
+ cpu=$(extract_cpu "${dir}/vmstat.log")
+ dirty=$(extract_peak_dirty "${dir}/meminfo.log")
+ printf " Sys CPU: %s%% | Peak Dirty: %s kB\n" "$cpu" "${dirty:-N/A}"
+ echo ""
+done
+
+echo "=================================================================="
+echo " System Info"
+echo "=================================================================="
+if [ -f "${RESULTS_DIR}/sysinfo.txt" ]; then
+ head -6 "${RESULTS_DIR}/sysinfo.txt"
+fi
+echo ""
diff --git a/tools/testing/nfsd-io-bench/scripts/run-benchmarks.sh b/tools/testing/nfsd-io-bench/scripts/run-benchmarks.sh
new file mode 100755
index 000000000000..2b0cf6e79dff
--- /dev/null
+++ b/tools/testing/nfsd-io-bench/scripts/run-benchmarks.sh
@@ -0,0 +1,591 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# NFS server I/O mode benchmark suite
+#
+# Runs fio with the NFS ioengine against an NFS server on localhost,
+# testing buffered, dontcache, and direct I/O modes.
+#
+# Usage: ./run-benchmarks.sh [OPTIONS]
+#
+# Options:
+# -e EXPORT_PATH Server export path (default: /export)
+# -s SIZE fio file size, should be >= 2x RAM (default: auto-detect)
+# -r RESULTS_DIR Where to store results (default: ./results)
+# -n NFS_VER NFS version: 3 or 4 (default: 3)
+# -j FIO_JOBS_DIR Path to fio job files (default: ../fio-jobs)
+# -d Dry run: print commands without executing
+# -h Show this help
+
+set -euo pipefail
+
+# Defaults
+EXPORT_PATH="/export"
+SIZE=""
+RESULTS_DIR="./results"
+NFS_VER=3
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+FIO_JOBS_DIR="${SCRIPT_DIR}/../fio-jobs"
+DRY_RUN=0
+MODES="0 1 2"
+PERF_LOCK=0
+
+DEBUGFS_BASE="/sys/kernel/debug/nfsd"
+IO_CACHE_READ="${DEBUGFS_BASE}/io_cache_read"
+IO_CACHE_WRITE="${DEBUGFS_BASE}/io_cache_write"
+DISABLE_SPLICE="${DEBUGFS_BASE}/disable-splice-read"
+
+usage() {
+ echo "Usage: $0 [OPTIONS]"
+ echo " -e EXPORT_PATH Server export path (default: /export)"
+ echo " -s SIZE fio file size (default: 2x RAM)"
+ echo " -r RESULTS_DIR Results directory (default: ./results)"
+ echo " -n NFS_VER NFS version: 3 or 4 (default: 3)"
+ echo " -j FIO_JOBS_DIR Path to fio job files"
+ echo " -D Dontcache only (skip buffered and direct tests)"
+ echo " -p Profile kernel lock contention with perf lock"
+ echo " -d Dry run"
+ echo " -h Help"
+ exit 1
+}
+
+while getopts "e:s:r:n:j:Dpdh" opt; do
+ case $opt in
+ e) EXPORT_PATH="$OPTARG" ;;
+ s) SIZE="$OPTARG" ;;
+ r) RESULTS_DIR="$OPTARG" ;;
+ n) NFS_VER="$OPTARG" ;;
+ j) FIO_JOBS_DIR="$OPTARG" ;;
+ D) MODES="1" ;;
+ p) PERF_LOCK=1 ;;
+ d) DRY_RUN=1 ;;
+ h) usage ;;
+ *) usage ;;
+ esac
+done
+
+# Auto-detect size: 2x total RAM, computed in MiB so the result is not floored to whole GiB (which gave < 2x RAM, or 0G on hosts under 1 GiB)
+if [ -z "$SIZE" ]; then
+    MEM_KB=$(awk '/MemTotal/ {print $2}' /proc/meminfo)
+    MEM_MB=$(( MEM_KB / 1024 ))
+    SIZE="$(( MEM_MB * 2 ))M"
+    echo "Auto-detected RAM: ${MEM_MB}M, using file size: ${SIZE}"
+fi
+
+
+log() {
+ echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"
+}
+
+run_cmd() {
+ if [ "$DRY_RUN" -eq 1 ]; then
+ echo " [DRY RUN] $*"
+ else
+ "$@"
+ fi
+}
+
+# Preflight checks
+preflight() {
+ log "=== Preflight checks ==="
+
+ if ! command -v fio &>/dev/null; then
+ echo "ERROR: fio not found in PATH"
+ exit 1
+ fi
+
+ # Check fio has nfs ioengine
+ if ! fio --enghelp=nfs &>/dev/null; then
+ echo "ERROR: fio does not have the nfs ioengine (needs libnfs)"
+ exit 1
+ fi
+
+ # Check debugfs knobs exist
+ for knob in "$IO_CACHE_READ" "$IO_CACHE_WRITE" "$DISABLE_SPLICE"; do
+ if [ ! -f "$knob" ]; then
+ echo "ERROR: $knob not found. Is the kernel new enough?"
+ exit 1
+ fi
+ done
+
+ # Check NFS server is exporting
+ if ! showmount -e localhost 2>/dev/null | grep -q "$EXPORT_PATH"; then
+ echo "WARNING: $EXPORT_PATH not in showmount output, proceeding anyway"
+ fi
+
+ # Print system info
+ echo "Kernel: $(uname -r)"
+ echo "RAM: $(awk '/MemTotal/ {printf "%.1f GB", $2/1024/1024}' /proc/meminfo)"
+ echo "Export: $EXPORT_PATH"
+ echo "NFS ver: $NFS_VER"
+ echo "File size: $SIZE"
+ echo "Results: $RESULTS_DIR"
+ echo ""
+}
+
+# Set server I/O mode via debugfs
+set_io_mode() {
+ local cache_write=$1
+ local cache_read=$2
+ local splice_off=$3
+
+ log "Setting io_cache_write=$cache_write io_cache_read=$cache_read disable-splice-read=$splice_off"
+ run_cmd bash -c "echo $cache_write > $IO_CACHE_WRITE"
+ run_cmd bash -c "echo $cache_read > $IO_CACHE_READ"
+ run_cmd bash -c "echo $splice_off > $DISABLE_SPLICE"
+}
+
+# Drop page cache on server
+drop_caches() {
+ log "Dropping page cache"
+ run_cmd bash -c "sync && echo 3 > /proc/sys/vm/drop_caches"
+ sleep 1
+}
+
+# Start background server monitoring
+start_monitors() {
+ local outdir=$1
+
+ log "Starting server monitors in $outdir"
+ run_cmd vmstat 1 > "${outdir}/vmstat.log" 2>&1 &
+ VMSTAT_PID=$!
+
+ run_cmd iostat -x 1 > "${outdir}/iostat.log" 2>&1 &
+ IOSTAT_PID=$!
+
+ # Sample /proc/meminfo every second
+ (while true; do
+ echo "=== $(date '+%s') ==="
+ cat /proc/meminfo
+ sleep 1
+ done) > "${outdir}/meminfo.log" 2>&1 &
+ MEMINFO_PID=$!
+}
+
+# Stop background monitors
+stop_monitors() {
+ log "Stopping monitors"
+ kill "$VMSTAT_PID" "$IOSTAT_PID" "$MEMINFO_PID" 2>/dev/null || true
+ wait "$VMSTAT_PID" "$IOSTAT_PID" "$MEMINFO_PID" 2>/dev/null || true
+}
+
+# perf lock profiling — uses BPF-based live contention tracing
+PERF_LOCK_PID=""
+
+start_perf_lock() {
+ local outdir=$1
+
+ if [ "$PERF_LOCK" -ne 1 ]; then
+ return
+ fi
+
+ log "Starting perf lock contention tracing"
+ perf lock contention -a -b --max-stack 8 \
+ > "${outdir}/perf-lock-contention.txt" 2>&1 &
+ PERF_LOCK_PID=$!
+}
+
+stop_perf_lock() {
+ local outdir=$1
+
+ if [ -z "$PERF_LOCK_PID" ]; then
+ return
+ fi
+
+ log "Stopping perf lock contention tracing"
+ kill -TERM "$PERF_LOCK_PID" 2>/dev/null || true
+ wait "$PERF_LOCK_PID" 2>/dev/null || true
+ PERF_LOCK_PID=""
+}
+
+# Run a single fio benchmark. nfs_url is set in the job files; --filename and
+# --size are passed on the command line to vary the target file and data volume.
+# Args: $1 job file, $2 output dir, $3 target filename, $4 size (default $SIZE),
+# $5 "keep" to preserve the test file after the run. Returns non-zero if fio fails.
+run_fio() {
+    local job_file=$1
+    local outdir=$2
+    local filename=$3
+    local fio_size=${4:-$SIZE}
+    local keep=${5:-}
+    local rc=0
+    local job_name
+    job_name=$(basename "$job_file" .fio)
+
+    log "Running fio job: $job_name -> $outdir (file=$filename size=$fio_size)"
+    mkdir -p "$outdir"
+
+    drop_caches
+    start_monitors "$outdir"
+    # Skip perf lock profiling for precreate/setup runs
+    [ "$keep" != "keep" ] && start_perf_lock "$outdir"
+    # Capture fio's status instead of letting set -e abort here, so the background monitors below are always stopped.
+    run_cmd fio "$job_file" \
+        --output-format=json \
+        --output="${outdir}/${job_name}.json" \
+        --filename="$filename" \
+        --size="$fio_size" || rc=$?
+
+    [ "$keep" != "keep" ] && stop_perf_lock "$outdir"
+    stop_monitors
+    [ "$rc" -eq 0 ] || { log "ERROR: fio job $job_name failed (rc=$rc)"; return "$rc"; }
+    log "Finished: $job_name"
+
+    # Clean up test file to free disk space unless told to keep it
+    if [ "$keep" != "keep" ]; then
+        cleanup_test_files "$filename"
+    fi
+}
+
+# Remove test files from the export to free disk space
+cleanup_test_files() {
+ local filename
+ for filename in "$@"; do
+ local filepath="${EXPORT_PATH}/${filename}"
+ log "Cleaning up: $filepath"
+ run_cmd rm -f "$filepath"
+ done
+}
+
+# Ensure parent directories exist under the export for a given filename
+ensure_export_dirs() {
+ local filename
+ for filename in "$@"; do
+ local dirpath="${EXPORT_PATH}/$(dirname "$filename")"
+ if [ "$dirpath" != "${EXPORT_PATH}/." ] && [ ! -d "$dirpath" ]; then
+ log "Creating directory: $dirpath"
+ run_cmd mkdir -p "$dirpath"
+ fi
+ done
+}
+
+# Mode name from numeric value
+mode_name() {
+ case $1 in
+ 0) echo "buffered" ;;
+ 1) echo "dontcache" ;;
+ 2) echo "direct" ;;
+ esac
+}
+
+########################################################################
+# Deliverable 1: Single-client fio benchmarks
+########################################################################
+run_deliverable1() {
+ log "=========================================="
+ log "Deliverable 1: Single-client fio benchmarks"
+ log "=========================================="
+
+ # Write test matrix:
+ # mode 0 (buffered): splice on (default)
+ # mode 1 (dontcache): splice off (required)
+ # mode 2 (direct): splice off (required)
+
+ # Sequential write
+ for wmode in $MODES; do
+ local mname
+ mname=$(mode_name $wmode)
+ local splice_off=0
+ [ "$wmode" -ne 0 ] && splice_off=1
+
+ drop_caches
+ set_io_mode "$wmode" 0 "$splice_off"
+ run_fio "${FIO_JOBS_DIR}/seq-write.fio" \
+ "${RESULTS_DIR}/seq-write/${mname}" \
+ "seq-write_testfile"
+ done
+
+ # Random write
+ for wmode in $MODES; do
+ local mname
+ mname=$(mode_name $wmode)
+ local splice_off=0
+ [ "$wmode" -ne 0 ] && splice_off=1
+
+ drop_caches
+ set_io_mode "$wmode" 0 "$splice_off"
+ run_fio "${FIO_JOBS_DIR}/rand-write.fio" \
+ "${RESULTS_DIR}/rand-write/${mname}" \
+ "rand-write_testfile"
+ done
+
+ # Sequential read — vary read mode, write stays buffered
+ # Pre-create the file for reading
+ log "Pre-creating sequential read test file"
+ set_io_mode 0 0 0
+ run_fio "${FIO_JOBS_DIR}/seq-write.fio" \
+ "${RESULTS_DIR}/seq-read/precreate" \
+ "seq-read_testfile" "$SIZE" "keep"
+
+ # shellcheck disable=SC2086
+ local last_mode
+ last_mode=$(echo $MODES | awk '{print $NF}')
+
+ for rmode in $MODES; do
+ local mname
+ mname=$(mode_name $rmode)
+ local splice_off=0
+ [ "$rmode" -ne 0 ] && splice_off=1
+ # Keep file for subsequent modes; clean up after last
+ local keep="keep"
+ [ "$rmode" = "$last_mode" ] && keep=""
+
+ drop_caches
+ set_io_mode 0 "$rmode" "$splice_off"
+ run_fio "${FIO_JOBS_DIR}/seq-read.fio" \
+ "${RESULTS_DIR}/seq-read/${mname}" \
+ "seq-read_testfile" "$SIZE" "$keep"
+ done
+
+ # Random read — vary read mode, write stays buffered
+ # Pre-create the file for reading
+ log "Pre-creating random read test file"
+ set_io_mode 0 0 0
+ run_fio "${FIO_JOBS_DIR}/seq-write.fio" \
+ "${RESULTS_DIR}/rand-read/precreate" \
+ "rand-read_testfile" "$SIZE" "keep"
+
+ for rmode in $MODES; do
+ local mname
+ mname=$(mode_name $rmode)
+ local splice_off=0
+ [ "$rmode" -ne 0 ] && splice_off=1
+ # Keep file for subsequent modes; clean up after last
+ local keep="keep"
+ [ "$rmode" = "$last_mode" ] && keep=""
+
+ drop_caches
+ set_io_mode 0 "$rmode" "$splice_off"
+ run_fio "${FIO_JOBS_DIR}/rand-read.fio" \
+ "${RESULTS_DIR}/rand-read/${mname}" \
+ "rand-read_testfile" "$SIZE" "$keep"
+ done
+}
+
+########################################################################
+# Deliverable 2: Multi-client (simulated with multiple fio jobs)
+########################################################################
+run_deliverable2() {
+ log "=========================================="
+ log "Deliverable 2: Noisy-neighbor benchmarks"
+ log "=========================================="
+
+ local num_clients=4
+ local client_size
+ local mem_kb
+ mem_kb=$(awk '/MemTotal/ {print $2}' /proc/meminfo)
+ # Each client gets RAM/num_clients so total > RAM
+ client_size="$(( mem_kb / 1024 / num_clients ))M"
+
+ # Scenario A: Multiple writers
+ for mode in $MODES; do
+ local mname
+ mname=$(mode_name $mode)
+ local splice_off=0
+ [ "$mode" -ne 0 ] && splice_off=1
+ local outdir="${RESULTS_DIR}/multi-write/${mname}"
+ mkdir -p "$outdir"
+
+ set_io_mode "$mode" "$mode" "$splice_off"
+ drop_caches
+
+ # Ensure client directories exist on export
+ for i in $(seq 1 $num_clients); do
+ ensure_export_dirs "client${i}/testfile"
+ done
+
+ start_monitors "$outdir"
+ start_perf_lock "$outdir"
+
+ # Launch N parallel fio writers
+ local pids=()
+ for i in $(seq 1 $num_clients); do
+ run_cmd fio "${FIO_JOBS_DIR}/multi-write.fio" \
+ --output-format=json \
+ --output="${outdir}/client${i}.json" \
+ --filename="client${i}/testfile" \
+ --size="$client_size" &
+ pids+=($!)
+ done
+
+ # Wait for all
+ local rc=0
+ for pid in "${pids[@]}"; do
+ wait "$pid" || rc=$?
+ done
+
+ stop_perf_lock "$outdir"
+ stop_monitors
+ [ $rc -ne 0 ] && log "WARNING: some fio jobs exited non-zero"
+
+ # Clean up test files
+ for i in $(seq 1 $num_clients); do
+ cleanup_test_files "client${i}/testfile"
+ done
+ done
+
+ # Scenario C: Noisy writer + latency-sensitive readers
+ for mode in $MODES; do
+ local mname
+ mname=$(mode_name $mode)
+ local splice_off=0
+ [ "$mode" -ne 0 ] && splice_off=1
+ local outdir="${RESULTS_DIR}/noisy-neighbor/${mname}"
+ mkdir -p "$outdir"
+
+ set_io_mode "$mode" "$mode" "$splice_off"
+ drop_caches
+
+ # Pre-create read files for latency readers
+ for i in $(seq 1 $(( num_clients - 1 ))); do
+ ensure_export_dirs "reader${i}/readfile"
+ log "Pre-creating read file for reader $i"
+ run_fio "${FIO_JOBS_DIR}/multi-write.fio" \
+ "${outdir}/precreate_reader${i}" \
+ "reader${i}/readfile" \
+ "512M" "keep"
+ done
+ drop_caches
+ ensure_export_dirs "bulk/testfile"
+ start_monitors "$outdir"
+ start_perf_lock "$outdir"
+
+ # Noisy writer
+ run_cmd fio "${FIO_JOBS_DIR}/noisy-writer.fio" \
+ --output-format=json \
+ --output="${outdir}/noisy_writer.json" \
+ --filename="bulk/testfile" \
+ --size="$SIZE" &
+ local writer_pid=$!
+
+ # Latency-sensitive readers
+ local reader_pids=()
+ for i in $(seq 1 $(( num_clients - 1 ))); do
+ run_cmd fio "${FIO_JOBS_DIR}/lat-reader.fio" \
+ --output-format=json \
+ --output="${outdir}/reader${i}.json" \
+ --filename="reader${i}/readfile" \
+ --size="512M" &
+ reader_pids+=($!)
+ done
+
+ local rc=0
+ wait "$writer_pid" || rc=$?
+ for pid in "${reader_pids[@]}"; do
+ wait "$pid" || rc=$?
+ done
+
+ stop_perf_lock "$outdir"
+ stop_monitors
+ [ $rc -ne 0 ] && log "WARNING: some fio jobs exited non-zero"
+
+ # Clean up test files
+ cleanup_test_files "bulk/testfile"
+ for i in $(seq 1 $(( num_clients - 1 ))); do
+ cleanup_test_files "reader${i}/readfile"
+ done
+ done
+ # Scenario D: Mixed-mode noisy neighbor
+ # Test write/read mode combinations where the writer uses a
+ # cache-friendly mode and readers use buffered reads to benefit
+ # from warm cache.
+ local mixed_modes=(
+ # write_mode read_mode label
+ "1 0 dontcache-w_buffered-r"
+ )
+
+ for combo in "${mixed_modes[@]}"; do
+ local wmode rmode label
+ read -r wmode rmode label <<< "$combo"
+ local splice_off=0
+ [ "$wmode" -ne 0 ] && splice_off=1
+ local outdir="${RESULTS_DIR}/noisy-neighbor-mixed/${label}"
+ mkdir -p "$outdir"
+
+ set_io_mode "$wmode" "$rmode" "$splice_off"
+ drop_caches
+
+ # Pre-create read files for latency readers
+ for i in $(seq 1 $(( num_clients - 1 ))); do
+ ensure_export_dirs "reader${i}/readfile"
+ log "Pre-creating read file for reader $i"
+ run_fio "${FIO_JOBS_DIR}/multi-write.fio" \
+ "${outdir}/precreate_reader${i}" \
+ "reader${i}/readfile" \
+ "512M" "keep"
+ done
+ drop_caches
+ ensure_export_dirs "bulk/testfile"
+ start_monitors "$outdir"
+ start_perf_lock "$outdir"
+
+ # Noisy writer
+ run_cmd fio "${FIO_JOBS_DIR}/noisy-writer.fio" \
+ --output-format=json \
+ --output="${outdir}/noisy_writer.json" \
+ --filename="bulk/testfile" \
+ --size="$SIZE" &
+ local writer_pid=$!
+
+ # Latency-sensitive readers
+ local reader_pids=()
+ for i in $(seq 1 $(( num_clients - 1 ))); do
+ run_cmd fio "${FIO_JOBS_DIR}/lat-reader.fio" \
+ --output-format=json \
+ --output="${outdir}/reader${i}.json" \
+ --filename="reader${i}/readfile" \
+ --size="512M" &
+ reader_pids+=($!)
+ done
+
+ local rc=0
+ wait "$writer_pid" || rc=$?
+ for pid in "${reader_pids[@]}"; do
+ wait "$pid" || rc=$?
+ done
+
+ stop_perf_lock "$outdir"
+ stop_monitors
+ [ $rc -ne 0 ] && log "WARNING: some fio jobs exited non-zero"
+
+ # Clean up test files
+ cleanup_test_files "bulk/testfile"
+ for i in $(seq 1 $(( num_clients - 1 ))); do
+ cleanup_test_files "reader${i}/readfile"
+ done
+ done
+}
+
+########################################################################
+# Main
+########################################################################
+preflight
+
+TIMESTAMP=$(date '+%Y%m%d-%H%M%S')
+RESULTS_DIR="${RESULTS_DIR}/${TIMESTAMP}"
+mkdir -p "$RESULTS_DIR"
+
+# Save system info
+{
+    echo "Timestamp: $TIMESTAMP"
+    echo "Kernel: $(uname -r)"
+    echo "Hostname: $(hostname)"
+    echo "NFS version: $NFS_VER"
+    echo "File size: $SIZE"
+    echo "Export: $EXPORT_PATH"
+    cat /proc/meminfo
+} > "${RESULTS_DIR}/sysinfo.txt"
+
+log "Results will be saved to: $RESULTS_DIR"
+
+# Reset to defaults on every exit, including a set -e abort part-way through a run, so the server is not left in dontcache/direct mode.
+trap 'set_io_mode 0 0 0 || true' EXIT
+
+run_deliverable1
+run_deliverable2
+
+log "=========================================="
+log "All benchmarks complete."
+log "Results in: $RESULTS_DIR"
+log "Run: scripts/parse-results.sh $RESULTS_DIR"
+log "=========================================="
diff --git a/tools/testing/nfsd-io-bench/scripts/setup-server.sh b/tools/testing/nfsd-io-bench/scripts/setup-server.sh
new file mode 100755
index 000000000000..0efdd74a705e
--- /dev/null
+++ b/tools/testing/nfsd-io-bench/scripts/setup-server.sh
@@ -0,0 +1,94 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# One-time setup script for the NFS test server.
+# Run this once before running benchmarks.
+#
+# Usage: sudo ./setup-server.sh [EXPORT_PATH]
+
+set -euo pipefail
+
+EXPORT_PATH="${1:-/export}"
+FSTYPE="ext4"
+
+log() {
+ echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"
+}
+
+if [ "$(id -u)" -ne 0 ]; then
+ echo "ERROR: must run as root"
+ exit 1
+fi
+
+# Check for required tools
+for cmd in fio exportfs showmount jq; do
+ if ! command -v "$cmd" &>/dev/null; then
+ echo "WARNING: $cmd not found, attempting install"
+ dnf install -y "$cmd" 2>/dev/null || \
+ apt-get install -y "$cmd" 2>/dev/null || \
+ echo "ERROR: cannot install $cmd, please install manually"
+ fi
+done
+
+# Check fio has nfs ioengine
+if ! fio --enghelp=nfs &>/dev/null; then
+ echo "ERROR: fio nfs ioengine not available."
+ echo "You may need to install fio with libnfs support."
+ echo "Try: dnf install fio libnfs-devel (or build fio from source with --enable-nfs)"
+ exit 1
+fi
+
+# Create export directory if needed
+if [ ! -d "$EXPORT_PATH" ]; then
+ log "Creating export directory: $EXPORT_PATH"
+ mkdir -p "$EXPORT_PATH"
+fi
+
+# Create subdirectories for multi-client tests
+for i in 1 2 3 4; do
+ mkdir -p "${EXPORT_PATH}/client${i}"
+ mkdir -p "${EXPORT_PATH}/reader${i}"
+done
+mkdir -p "${EXPORT_PATH}/bulk"
+
+# Check if already exported. Compare the first field exactly: a regex/substring grep would let /export match /export2 and skip adding the export.
+if ! exportfs -s 2>/dev/null | awk -v p="$EXPORT_PATH" '$1 == p {f=1} END {exit !f}'; then
+    log "Adding NFS export for $EXPORT_PATH"
+    if ! awk -v p="$EXPORT_PATH" '$1 == p {f=1} END {exit !f}' /etc/exports 2>/dev/null; then
+        echo "${EXPORT_PATH} 127.0.0.1/32(rw,sync,no_root_squash,no_subtree_check)" >> /etc/exports
+    fi
+    exportfs -ra
+fi
+
+# Ensure NFS server is running
+if ! systemctl is-active --quiet nfs-server 2>/dev/null; then
+ log "Starting NFS server"
+ systemctl start nfs-server
+fi
+
+# Verify export
+log "Current exports:"
+showmount -e localhost
+
+# Check debugfs knobs
+log "Checking debugfs knobs:"
+DEBUGFS_BASE="/sys/kernel/debug/nfsd"
+for knob in io_cache_read io_cache_write disable-splice-read; do
+ if [ -f "${DEBUGFS_BASE}/${knob}" ]; then
+ echo " ${knob} = $(cat "${DEBUGFS_BASE}/${knob}")"
+ else
+ echo " ${knob}: NOT FOUND (kernel may be too old)"
+ fi
+done
+
+# Print system summary
+echo ""
+log "=== System Summary ==="
+echo "Kernel: $(uname -r)"
+echo "RAM: $(awk '/MemTotal/ {printf "%.1f GB", $2/1024/1024}' /proc/meminfo)"
+echo "Export: $EXPORT_PATH"
+echo "Filesystem: $(df -T "$EXPORT_PATH" | awk 'NR==2 {print $2}')"
+echo "Disk: $(df -h "$EXPORT_PATH" | awk 'NR==2 {print $2, "total,", $4, "free"}')"
+echo ""
+log "Setup complete. Run benchmarks with:"
+echo " sudo ./scripts/run-benchmarks.sh -e $EXPORT_PATH"
--
2.53.0
^ permalink raw reply related [flat|nested] 9+ messages in thread* Re: [PATCH v3 3/4] testing: add nfsd-io-bench NFS server benchmark suite
2026-04-26 11:56 ` [PATCH v3 3/4] testing: add nfsd-io-bench NFS server benchmark suite Jeff Layton
@ 2026-04-26 12:34 ` Andrew Morton
2026-04-26 14:11 ` Jeff Layton
0 siblings, 1 reply; 9+ messages in thread
From: Andrew Morton @ 2026-04-26 12:34 UTC (permalink / raw)
To: Jeff Layton
Cc: Alexander Viro, Christian Brauner, Jan Kara,
Matthew Wilcox (Oracle), David Hildenbrand, Lorenzo Stoakes,
Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Mike Snitzer, Jens Axboe,
Ritesh Harjani, Christoph Hellwig, Kairui Song, Qi Zheng,
Shakeel Butt, Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Chuck Lever,
linux-fsdevel, linux-kernel, linux-nfs, linux-mm,
linux-trace-kernel
On Sun, 26 Apr 2026 07:56:09 -0400 Jeff Layton <jlayton@kernel.org> wrote:
> Add a benchmark suite for testing NFSD I/O mode performance using fio
> with the libnfs backend against an NFS server on localhost. Tests
> buffered, dontcache, and direct I/O modes via NFSD debugfs controls.
>
> Includes:
> - fio job files for sequential/random read/write, multi-writer,
> noisy-neighbor, and latency-sensitive reader workloads
> - run-benchmarks.sh: orchestrates test matrix with mode switching
> - parse-results.sh: extracts metrics from fio JSON output
> - setup-server.sh: configures NFS export for testing
>
> Assisted-by: Claude:claude-opus-4-6
OK, question.
> 10 files changed, 1024 insertions(+)
Seems that this code was largely machine-generated. So I assume that
you're in possession of the scripts/prompts/whatever which were used to
generate this code.
(Can you please briefly describe the process which you used here?)
So how are we to maintain this? Will other developers have to go in
and hack this machine-generated output by hand? Or would it be better
to provide (in-tree) other developers with the means to regenerate this code,
presumably using Claude?
IOW, this feels a bit like shipping the .s file without giving us the .c
file!
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v3 3/4] testing: add nfsd-io-bench NFS server benchmark suite
2026-04-26 12:34 ` Andrew Morton
@ 2026-04-26 14:11 ` Jeff Layton
0 siblings, 0 replies; 9+ messages in thread
From: Jeff Layton @ 2026-04-26 14:11 UTC (permalink / raw)
To: Andrew Morton
Cc: Alexander Viro, Christian Brauner, Jan Kara,
Matthew Wilcox (Oracle), David Hildenbrand, Lorenzo Stoakes,
Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Mike Snitzer, Jens Axboe,
Ritesh Harjani, Christoph Hellwig, Kairui Song, Qi Zheng,
Shakeel Butt, Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Chuck Lever,
linux-fsdevel, linux-kernel, linux-nfs, linux-mm,
linux-trace-kernel
On Sun, 2026-04-26 at 05:34 -0700, Andrew Morton wrote:
> On Sun, 26 Apr 2026 07:56:09 -0400 Jeff Layton <jlayton@kernel.org> wrote:
>
> > Add a benchmark suite for testing NFSD I/O mode performance using fio
> > with the libnfs backend against an NFS server on localhost. Tests
> > buffered, dontcache, and direct I/O modes via NFSD debugfs controls.
> >
> > Includes:
> > - fio job files for sequential/random read/write, multi-writer,
> > noisy-neighbor, and latency-sensitive reader workloads
> > - run-benchmarks.sh: orchestrates test matrix with mode switching
> > - parse-results.sh: extracts metrics from fio JSON output
> > - setup-server.sh: configures NFS export for testing
> >
> > Assisted-by: Claude:claude-opus-4-6
>
> OK, question.
>
> > 10 files changed, 1024 insertions(+)
>
> Seems that this code was largely machine-generated. So I assume that
> you're in possession of the scripts/prompts/whatever which were used to
> generate this code.
>
> (Can you please briefly describe the process which you used here?)
>
It's been a while since it generated these, but I think I just asked it
to concoct a set of benchmarks for DONTCACHE writes when that involved
file sizes that were larger than the machine's memory.
I ended up asking it to make some changes (e.g. the mixed-mode test,
and some of the perf stuff), but it seemed to do a reasonable job of
creating it.
> So how are we to maintain this? Will other developers have to go in
> and hack this machine-generated output by hand? Or would it be better
> to provide (in-tree) other developers with the means to regenerate this code,
> presumably using Claude?
>
> IOW, this feels a bit like shipping the .s file without giving us the .c
> file!
As I mentioned in the cover letter, I mostly included this in the
series to demonstrate how this was tested. I'm not sure if the two
benchmark suites are suitable for inclusion. I'm fine with leaving
those two patches out of the merge. I found the testcases useful for
this, but they are indeed AI slop, and I'm not sure they have long-term
value or will be maintainable.
--
Jeff Layton <jlayton@kernel.org>
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH v3 4/4] testing: add dontcache-bench local filesystem benchmark suite
2026-04-26 11:56 [PATCH v3 0/4] mm: improve write performance with RWF_DONTCACHE Jeff Layton
` (2 preceding siblings ...)
2026-04-26 11:56 ` [PATCH v3 3/4] testing: add nfsd-io-bench NFS server benchmark suite Jeff Layton
@ 2026-04-26 11:56 ` Jeff Layton
3 siblings, 0 replies; 9+ messages in thread
From: Jeff Layton @ 2026-04-26 11:56 UTC (permalink / raw)
To: Alexander Viro, Christian Brauner, Jan Kara,
Matthew Wilcox (Oracle), Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Mike Snitzer, Jens Axboe,
Ritesh Harjani, Christoph Hellwig, Kairui Song, Qi Zheng,
Shakeel Butt, Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Chuck Lever
Cc: linux-fsdevel, linux-kernel, linux-nfs, linux-mm,
linux-trace-kernel, Jeff Layton
Add a benchmark suite for testing IOCB_DONTCACHE on local filesystems
via fio's io_uring engine with the RWF_DONTCACHE flag.
The suite mirrors the nfsd-io-bench test matrix but selects the I/O mode
through fio options instead of NFSD debugfs mode switching:
- Mode 0 (buffered): io_uring with uncached=0
- Mode 1 (dontcache): io_uring with uncached=1 (RWF_DONTCACHE)
- Mode 2 (direct): O_DIRECT via fio's --direct=1
Includes fio job files, run-benchmarks.sh, and parse-results.sh.
Assisted-by: Claude:claude-opus-4-6
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
.../dontcache-bench/fio-jobs/lat-reader.fio | 12 +
.../dontcache-bench/fio-jobs/multi-write.fio | 9 +
.../dontcache-bench/fio-jobs/noisy-writer.fio | 12 +
.../testing/dontcache-bench/fio-jobs/rand-read.fio | 13 +
.../dontcache-bench/fio-jobs/rand-write.fio | 13 +
.../testing/dontcache-bench/fio-jobs/seq-read.fio | 13 +
.../testing/dontcache-bench/fio-jobs/seq-write.fio | 13 +
.../dontcache-bench/scripts/parse-results.sh | 238 +++++++++
.../dontcache-bench/scripts/run-benchmarks.sh | 562 +++++++++++++++++++++
9 files changed, 885 insertions(+)
diff --git a/tools/testing/dontcache-bench/fio-jobs/lat-reader.fio b/tools/testing/dontcache-bench/fio-jobs/lat-reader.fio
new file mode 100644
index 000000000000..e221e7aedec9
--- /dev/null
+++ b/tools/testing/dontcache-bench/fio-jobs/lat-reader.fio
@@ -0,0 +1,12 @@
+[global]
+ioengine=io_uring
+direct=0
+bs=4k
+numjobs=1
+time_based=0
+rw=read
+log_avg_msec=1000
+write_bw_log=latreader
+write_lat_log=latreader
+
+[latreader]
diff --git a/tools/testing/dontcache-bench/fio-jobs/multi-write.fio b/tools/testing/dontcache-bench/fio-jobs/multi-write.fio
new file mode 100644
index 000000000000..8fc0770f5860
--- /dev/null
+++ b/tools/testing/dontcache-bench/fio-jobs/multi-write.fio
@@ -0,0 +1,9 @@
+[global]
+ioengine=io_uring
+direct=0
+bs=1M
+numjobs=1
+time_based=0
+rw=write
+
+[multiwrite]
diff --git a/tools/testing/dontcache-bench/fio-jobs/noisy-writer.fio b/tools/testing/dontcache-bench/fio-jobs/noisy-writer.fio
new file mode 100644
index 000000000000..4524eebd4642
--- /dev/null
+++ b/tools/testing/dontcache-bench/fio-jobs/noisy-writer.fio
@@ -0,0 +1,12 @@
+[global]
+ioengine=io_uring
+direct=0
+bs=1M
+numjobs=1
+time_based=0
+rw=write
+log_avg_msec=1000
+write_bw_log=noisywriter
+write_lat_log=noisywriter
+
+[noisywriter]
diff --git a/tools/testing/dontcache-bench/fio-jobs/rand-read.fio b/tools/testing/dontcache-bench/fio-jobs/rand-read.fio
new file mode 100644
index 000000000000..e281fa82b86a
--- /dev/null
+++ b/tools/testing/dontcache-bench/fio-jobs/rand-read.fio
@@ -0,0 +1,13 @@
+[global]
+ioengine=io_uring
+direct=0
+bs=4k
+numjobs=1
+iodepth=16
+time_based=0
+rw=randread
+log_avg_msec=1000
+write_bw_log=randread
+write_lat_log=randread
+
+[randread]
diff --git a/tools/testing/dontcache-bench/fio-jobs/rand-write.fio b/tools/testing/dontcache-bench/fio-jobs/rand-write.fio
new file mode 100644
index 000000000000..cf53bc6f14b9
--- /dev/null
+++ b/tools/testing/dontcache-bench/fio-jobs/rand-write.fio
@@ -0,0 +1,13 @@
+[global]
+ioengine=io_uring
+direct=0
+bs=4k
+numjobs=1
+iodepth=16
+time_based=0
+rw=randwrite
+log_avg_msec=1000
+write_bw_log=randwrite
+write_lat_log=randwrite
+
+[randwrite]
diff --git a/tools/testing/dontcache-bench/fio-jobs/seq-read.fio b/tools/testing/dontcache-bench/fio-jobs/seq-read.fio
new file mode 100644
index 000000000000..ef87921465a7
--- /dev/null
+++ b/tools/testing/dontcache-bench/fio-jobs/seq-read.fio
@@ -0,0 +1,13 @@
+[global]
+ioengine=io_uring
+direct=0
+bs=1M
+numjobs=1
+iodepth=16
+time_based=0
+rw=read
+log_avg_msec=1000
+write_bw_log=seqread
+write_lat_log=seqread
+
+[seqread]
diff --git a/tools/testing/dontcache-bench/fio-jobs/seq-write.fio b/tools/testing/dontcache-bench/fio-jobs/seq-write.fio
new file mode 100644
index 000000000000..da3082f9b391
--- /dev/null
+++ b/tools/testing/dontcache-bench/fio-jobs/seq-write.fio
@@ -0,0 +1,13 @@
+[global]
+ioengine=io_uring
+direct=0
+bs=1M
+numjobs=1
+iodepth=16
+time_based=0
+rw=write
+log_avg_msec=1000
+write_bw_log=seqwrite
+write_lat_log=seqwrite
+
+[seqwrite]
diff --git a/tools/testing/dontcache-bench/scripts/parse-results.sh b/tools/testing/dontcache-bench/scripts/parse-results.sh
new file mode 100755
index 000000000000..0427d411db04
--- /dev/null
+++ b/tools/testing/dontcache-bench/scripts/parse-results.sh
@@ -0,0 +1,238 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Parse fio JSON output and generate comparison tables.
+#
+# Usage: ./parse-results.sh <results-dir>
+
+set -euo pipefail
+
+if [ $# -lt 1 ]; then
+ echo "Usage: $0 <results-dir>"
+ exit 1
+fi
+
+RESULTS_DIR="$1"
+
+if ! command -v jq &>/dev/null; then
+ echo "ERROR: jq is required"
+ exit 1
+fi
+
+# Extract metrics from a single fio JSON result
+extract_metrics() {
+ local json_file=$1
+ local rw_type=$2 # read or write
+
+ if [ ! -f "$json_file" ]; then
+ echo "N/A N/A N/A N/A N/A N/A"
+ return
+ fi
+
+ jq -r --arg rw "$rw_type" '
+ .jobs[0][$rw] as $d |
+ [
+ (($d.bw // 0) / 1024 | . * 10 | round / 10), # MB/s
+ ($d.iops // 0), # IOPS
+ ((($d.clat_ns.mean // 0) / 1000) | . * 10 | round / 10), # avg lat us
+ (($d.clat_ns.percentile["50.000000"] // 0) / 1000), # p50 us
+ (($d.clat_ns.percentile["99.000000"] // 0) / 1000), # p99 us
+ (($d.clat_ns.percentile["99.900000"] // 0) / 1000) # p99.9 us
+ ] | @tsv
+ ' "$json_file" 2>/dev/null || echo "N/A N/A N/A N/A N/A N/A"
+}
+
+# Extract average system CPU (sys%) from a vmstat log
+extract_cpu() {
+ local vmstat_log=$1
+ if [ ! -f "$vmstat_log" ]; then
+ echo "N/A"
+ return
+ fi
+ # vmstat columns: us sy id wa st — skip header lines
+ awk 'NR>2 {sum+=$14; n++} END {if(n>0) printf "%.1f", sum/n; else print "N/A"}' \
+ "$vmstat_log" 2>/dev/null || echo "N/A"
+}
+
+# Extract peak dirty pages from meminfo log
+extract_peak_dirty() {
+ local meminfo_log=$1
+ if [ ! -f "$meminfo_log" ]; then
+ echo "N/A"
+ return
+ fi
+ grep "^Dirty:" "$meminfo_log" | awk '{print $2}' | sort -n | tail -1 || echo "N/A"
+}
+
+# Extract peak cached from meminfo log
+extract_peak_cached() {
+ local meminfo_log=$1
+ if [ ! -f "$meminfo_log" ]; then
+ echo "N/A"
+ return
+ fi
+ grep "^Cached:" "$meminfo_log" | awk '{print $2}' | sort -n | tail -1 || echo "N/A"
+}
+
+print_separator() {
+ printf '%*s\n' 120 '' | tr ' ' '-'
+}
+
+########################################################################
+# Deliverable 1: Single-client results
+########################################################################
+echo ""
+echo "=================================================================="
+echo " Deliverable 1: Single-Client fio Benchmarks"
+echo "=================================================================="
+echo ""
+
+for workload in seq-write rand-write seq-read rand-read; do
+ case $workload in
+ seq-write|rand-write) rw_type="write" ;;
+ seq-read|rand-read) rw_type="read" ;;
+ esac
+
+ echo "--- $workload ---"
+ printf "%-16s %10s %10s %10s %10s %10s %10s %10s %12s %12s\n" \
+ "Mode" "MB/s" "IOPS" "Avg(us)" "p50(us)" "p99(us)" "p99.9(us)" "Sys CPU%" "PeakDirty(kB)" "PeakCache(kB)"
+ print_separator
+
+ for mode in buffered dontcache direct; do
+ dir="${RESULTS_DIR}/${workload}/${mode}"
+ json_file=$(find "$dir" -name '*.json' -not -name 'client*' 2>/dev/null | head -1 || true)
+ if [ -z "$json_file" ]; then
+ printf "%-16s %10s\n" "$mode" "(no data)"
+ continue
+ fi
+
+ read -r mbps iops avg_lat p50 p99 p999 <<< \
+ "$(extract_metrics "$json_file" "$rw_type")"
+ cpu=$(extract_cpu "${dir}/vmstat.log")
+ dirty=$(extract_peak_dirty "${dir}/meminfo.log")
+ cached=$(extract_peak_cached "${dir}/meminfo.log")
+
+ printf "%-16s %10s %10s %10s %10s %10s %10s %10s %12s %12s\n" \
+ "$mode" "$mbps" "$iops" "$avg_lat" "$p50" "$p99" "$p999" \
+ "$cpu" "${dirty:-N/A}" "${cached:-N/A}"
+ done
+ echo ""
+done
+
+########################################################################
+# Deliverable 2: Multi-client results
+########################################################################
+echo "=================================================================="
+echo " Deliverable 2: Noisy-Neighbor Benchmarks"
+echo "=================================================================="
+echo ""
+
+# Scenario A: Multiple writers
+echo "--- Scenario A: Multiple Writers ---"
+for mode in buffered dontcache direct; do
+ dir="${RESULTS_DIR}/multi-write/${mode}"
+ if [ ! -d "$dir" ]; then
+ continue
+ fi
+
+ echo " Mode: $mode"
+ printf " %-10s %10s %10s %10s %10s %10s %10s\n" \
+ "Client" "MB/s" "IOPS" "Avg(us)" "p50(us)" "p99(us)" "p99.9(us)"
+
+ total_bw=0
+ count=0
+ for json_file in "${dir}"/client*.json; do
+ [ -f "$json_file" ] || continue
+ client=$(basename "$json_file" .json)
+ read -r mbps iops avg_lat p50 p99 p999 <<< \
+ "$(extract_metrics "$json_file" "write")"
+ printf " %-10s %10s %10s %10s %10s %10s %10s\n" \
+ "$client" "$mbps" "$iops" "$avg_lat" "$p50" "$p99" "$p999"
+ total_bw=$(echo "$total_bw + ${mbps:-0}" | bc 2>/dev/null || echo "$total_bw")
+ count=$(( count + 1 ))
+ done
+
+ cpu=$(extract_cpu "${dir}/vmstat.log")
+ dirty=$(extract_peak_dirty "${dir}/meminfo.log")
+ printf " Aggregate BW: %s MB/s | Sys CPU: %s%% | Peak Dirty: %s kB\n" \
+ "$total_bw" "$cpu" "${dirty:-N/A}"
+ echo ""
+done
+
+# Scenario C: Noisy neighbor
+echo "--- Scenario C: Noisy Writer + Latency-Sensitive Readers ---"
+for mode in buffered dontcache direct; do
+ dir="${RESULTS_DIR}/noisy-neighbor/${mode}"
+ if [ ! -d "$dir" ]; then
+ continue
+ fi
+
+ echo " Mode: $mode"
+ printf " %-14s %10s %10s %10s %10s %10s %10s\n" \
+ "Job" "MB/s" "IOPS" "Avg(us)" "p50(us)" "p99(us)" "p99.9(us)"
+
+ # Writer
+ if [ -f "${dir}/noisy_writer.json" ]; then
+ read -r mbps iops avg_lat p50 p99 p999 <<< \
+ "$(extract_metrics "${dir}/noisy_writer.json" "write")"
+ printf " %-14s %10s %10s %10s %10s %10s %10s\n" \
+ "Bulk writer" "$mbps" "$iops" "$avg_lat" "$p50" "$p99" "$p999"
+ fi
+
+ # Readers
+ for json_file in "${dir}"/reader*.json; do
+ [ -f "$json_file" ] || continue
+ reader=$(basename "$json_file" .json)
+ read -r mbps iops avg_lat p50 p99 p999 <<< \
+ "$(extract_metrics "$json_file" "read")"
+ printf " %-14s %10s %10s %10s %10s %10s %10s\n" \
+ "$reader" "$mbps" "$iops" "$avg_lat" "$p50" "$p99" "$p999"
+ done
+
+ cpu=$(extract_cpu "${dir}/vmstat.log")
+ dirty=$(extract_peak_dirty "${dir}/meminfo.log")
+ printf " Sys CPU: %s%% | Peak Dirty: %s kB\n" "$cpu" "${dirty:-N/A}"
+ echo ""
+done
+
+# Scenario D: Mixed-mode noisy neighbor
+echo "--- Scenario D: Mixed-Mode Noisy Writer + Readers ---"
+for dir in "${RESULTS_DIR}"/noisy-neighbor-mixed/*/; do
+ [ -d "$dir" ] || continue
+ label=$(basename "$dir")
+
+ echo " Mode: $label"
+ printf " %-14s %10s %10s %10s %10s %10s %10s\n" \
+ "Job" "MB/s" "IOPS" "Avg(us)" "p50(us)" "p99(us)" "p99.9(us)"
+
+ # Writer
+ if [ -f "${dir}/noisy_writer.json" ]; then
+ read -r mbps iops avg_lat p50 p99 p999 <<< \
+ "$(extract_metrics "${dir}/noisy_writer.json" "write")"
+ printf " %-14s %10s %10s %10s %10s %10s %10s\n" \
+ "Bulk writer" "$mbps" "$iops" "$avg_lat" "$p50" "$p99" "$p999"
+ fi
+
+ # Readers
+ for json_file in "${dir}"/reader*.json; do
+ [ -f "$json_file" ] || continue
+ reader=$(basename "$json_file" .json)
+ read -r mbps iops avg_lat p50 p99 p999 <<< \
+ "$(extract_metrics "$json_file" "read")"
+ printf " %-14s %10s %10s %10s %10s %10s %10s\n" \
+ "$reader" "$mbps" "$iops" "$avg_lat" "$p50" "$p99" "$p999"
+ done
+
+ cpu=$(extract_cpu "${dir}/vmstat.log")
+ dirty=$(extract_peak_dirty "${dir}/meminfo.log")
+ printf " Sys CPU: %s%% | Peak Dirty: %s kB\n" "$cpu" "${dirty:-N/A}"
+ echo ""
+done
+
+echo "=================================================================="
+echo " System Info"
+echo "=================================================================="
+if [ -f "${RESULTS_DIR}/sysinfo.txt" ]; then
+ head -6 "${RESULTS_DIR}/sysinfo.txt"
+fi
+echo ""
diff --git a/tools/testing/dontcache-bench/scripts/run-benchmarks.sh b/tools/testing/dontcache-bench/scripts/run-benchmarks.sh
new file mode 100755
index 000000000000..11bf400ef092
--- /dev/null
+++ b/tools/testing/dontcache-bench/scripts/run-benchmarks.sh
@@ -0,0 +1,562 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Local filesystem I/O mode benchmark suite.
+#
+# Runs the same test matrix as nfsd-io-bench's run-benchmarks.sh, but on a
+# local filesystem, using fio's io_uring engine with the RWF_DONTCACHE flag
+# instead of NFSD's debugfs mode knobs.
+#
+# Usage: ./run-benchmarks.sh -t <dir> [-s <size>] [-f <path>] [-o <dir>] [-D] [-p] [-d]
+# -t <dir> Test directory (required; must be on a filesystem supporting FOP_DONTCACHE)
+# -s <size> File size (default: 2x RAM)
+# -f <path> Path to fio binary (default: fio in PATH)
+# -o <dir> Output directory for results (default: ./results/local-<timestamp>)
+# -d Dry run (print commands without executing); run with -h for -D and -p
+
+set -euo pipefail
+
+# Defaults
+TEST_DIR=""
+SIZE=""
+FIO_BIN="fio"
+RESULTS_DIR=""
+DRY_RUN=0
+MODES="0 1 2"
+PERF_LOCK=0
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+FIO_JOBS_DIR="${SCRIPT_DIR}/../fio-jobs"
+
+usage() {
+ echo "Usage: $0 -t <test-dir> [-s <size>] [-f <fio-path>] [-o <output-dir>] [-D] [-p] [-d]"
+ echo ""
+ echo " -t <dir> Test directory (required, must support RWF_DONTCACHE)"
+ echo " -s <size> File size (default: 2x RAM)"
+ echo " -f <path> Path to fio binary (default: fio)"
+ echo " -o <dir> Output directory (default: ./results/<timestamp>)"
+ echo " -D Dontcache only (skip buffered and direct tests)"
+ echo " -p Profile kernel lock contention with perf lock"
+ echo " -d Dry run"
+ exit 1
+}
+
+while getopts "t:s:f:o:Dpdh" opt; do
+ case $opt in
+ t) TEST_DIR="$OPTARG" ;;
+ s) SIZE="$OPTARG" ;;
+ f) FIO_BIN="$OPTARG" ;;
+ o) RESULTS_DIR="$OPTARG" ;;
+ D) MODES="1" ;;
+ p) PERF_LOCK=1 ;;
+ d) DRY_RUN=1 ;;
+ h) usage ;;
+ *) usage ;;
+ esac
+done
+
+if [ -z "$TEST_DIR" ]; then
+ echo "ERROR: -t <test-dir> is required"
+ usage
+fi
+
+# Auto-size to 2x RAM if not specified
+if [ -z "$SIZE" ]; then
+ mem_kb=$(awk '/MemTotal/ {print $2}' /proc/meminfo)
+ SIZE="$(( mem_kb * 2 / 1024 ))M"
+fi
+
+if [ -z "$RESULTS_DIR" ]; then
+ RESULTS_DIR="./results/local-$(date +%Y%m%d-%H%M%S)"
+fi
+
+mkdir -p "$RESULTS_DIR"
+
+log() {
+ echo "[$(date '+%H:%M:%S')] $*"
+}
+
+run_cmd() {
+ if [ "$DRY_RUN" -eq 1 ]; then
+ echo " [DRY RUN] $*"
+ else
+ "$@"
+ fi
+}
+
+# I/O mode definitions:
+# buffered: direct=0, uncached=0
+# dontcache: direct=0, uncached=1
+# direct: direct=1, uncached=0
+#
+# Mode name from numeric value
+mode_name() {
+ case $1 in
+ 0) echo "buffered" ;;
+ 1) echo "dontcache" ;;
+ 2) echo "direct" ;;
+ esac
+}
+
+# Return fio command-line flags for a given mode.
+# "direct" is a standard fio option and works on the command line.
+# "uncached" is an io_uring engine option that must be in the job file,
+# so we inject it via make_job_file() below.
+mode_fio_args() {
+ case $1 in
+ 0) echo "--direct=0" ;; # buffered
+ 1) echo "--direct=0" ;; # dontcache
+ 2) echo "--direct=1" ;; # direct
+ esac
+}
+
+# Return the uncached= value for a given mode.
+mode_uncached() {
+ case $1 in
+ 0) echo "0" ;;
+ 1) echo "1" ;;
+ 2) echo "0" ;;
+ esac
+}
+
+# Create a temporary job file with uncached=N injected into [global].
+# For uncached=0 (buffered/direct), return the original file unchanged.
+make_job_file() {
+ local job_file=$1
+ local uncached=$2
+
+ if [ "$uncached" -eq 0 ]; then
+ echo "$job_file"
+ return
+ fi
+
+ local tmp
+ tmp=$(mktemp)
+ sed "/^\[global\]/a uncached=${uncached}" "$job_file" > "$tmp"
+ echo "$tmp"
+}
+
+drop_caches() {
+ run_cmd bash -c "sync && echo 3 > /proc/sys/vm/drop_caches"
+}
+
+# perf lock profiling — uses BPF-based live contention tracing
+PERF_LOCK_PID=""
+
+start_perf_lock() {
+ local outdir=$1
+
+ if [ "$PERF_LOCK" -ne 1 ]; then
+ return
+ fi
+
+ log "Starting perf lock contention tracing"
+ perf lock contention -a -b --max-stack 8 \
+ > "${outdir}/perf-lock-contention.txt" 2>&1 &
+ PERF_LOCK_PID=$!
+}
+
+stop_perf_lock() {
+ local outdir=$1
+
+ if [ -z "$PERF_LOCK_PID" ]; then
+ return
+ fi
+
+ log "Stopping perf lock contention tracing"
+ kill -TERM "$PERF_LOCK_PID" 2>/dev/null || true
+ wait "$PERF_LOCK_PID" 2>/dev/null || true
+ PERF_LOCK_PID=""
+}
+
+# Background monitors
+VMSTAT_PID=""
+IOSTAT_PID=""
+MEMINFO_PID=""
+
+start_monitors() {
+ local outdir=$1
+ log "Starting monitors in $outdir"
+ run_cmd vmstat 1 > "${outdir}/vmstat.log" 2>&1 &
+ VMSTAT_PID=$!
+ run_cmd iostat -x 1 > "${outdir}/iostat.log" 2>&1 &
+ IOSTAT_PID=$!
+ (while true; do
+ echo "=== $(date '+%s') ==="
+ cat /proc/meminfo
+ sleep 1
+ done) > "${outdir}/meminfo.log" 2>&1 &
+ MEMINFO_PID=$!
+}
+
+stop_monitors() {
+ log "Stopping monitors"
+ kill "$VMSTAT_PID" "$IOSTAT_PID" "$MEMINFO_PID" 2>/dev/null || true
+ wait "$VMSTAT_PID" "$IOSTAT_PID" "$MEMINFO_PID" 2>/dev/null || true
+}
+
+cleanup_test_files() {
+ local filepath="${TEST_DIR}/$1"
+ log "Cleaning up $filepath"
+ run_cmd rm -f "$filepath"
+}
+
+# Run a single fio benchmark
+run_fio() {
+ local job_file=$1
+ local outdir=$2
+ local filename=$3
+ local fio_size=${4:-$SIZE}
+ local keep=${5:-}
+ local extra_args=${6:-}
+ local uncached=${7:-0}
+
+ # Inject uncached=N into the job file if needed
+ local actual_job
+ actual_job=$(make_job_file "$job_file" "$uncached")
+
+ local job_name
+ job_name=$(basename "$job_file" .fio)
+
+ log "Running fio job: $job_name -> $outdir (file=${TEST_DIR}/$filename size=$fio_size)"
+ mkdir -p "$outdir"
+
+ drop_caches
+ start_monitors "$outdir"
+ # Skip perf lock profiling for precreate/setup runs
+ [ "$keep" != "keep" ] && start_perf_lock "$outdir"
+
+ # shellcheck disable=SC2086
+ run_cmd "$FIO_BIN" "$actual_job" \
+ --output-format=json \
+ --output="${outdir}/${job_name}.json" \
+ --filename="${TEST_DIR}/$filename" \
+ --size="$fio_size" \
+ $extra_args
+
+ [ "$keep" != "keep" ] && stop_perf_lock "$outdir"
+ stop_monitors
+ log "Finished: $job_name"
+
+ # Clean up temp job file if one was created
+ [ "$actual_job" != "$job_file" ] && rm -f "$actual_job"
+
+ if [ "$keep" != "keep" ]; then
+ cleanup_test_files "$filename"
+ fi
+}
+
+########################################################################
+# Preflight
+########################################################################
+preflight() {
+ log "=== Preflight checks ==="
+
+ if ! command -v "$FIO_BIN" &>/dev/null; then
+ echo "ERROR: fio not found at $FIO_BIN"
+ exit 1
+ fi
+
+ if [ ! -d "$TEST_DIR" ]; then
+ echo "ERROR: Test directory $TEST_DIR does not exist"
+ exit 1
+ fi
+
+ # Quick check that RWF_DONTCACHE works on this filesystem
+ local testfile="${TEST_DIR}/.dontcache_test"
+ if ! "$FIO_BIN" --name=test --ioengine=io_uring --rw=write \
+ --bs=4k --size=4k --direct=0 --uncached=1 \
+ --filename="$testfile" 2>/dev/null; then
+ echo "WARNING: RWF_DONTCACHE may not be supported on $TEST_DIR"
+ echo " (filesystem must support FOP_DONTCACHE)"
+ fi
+ rm -f "$testfile"
+
+ log "Test directory: $TEST_DIR"
+ log "File size: $SIZE"
+ log "fio binary: $FIO_BIN"
+ log "Results: $RESULTS_DIR"
+
+ # Record system info
+ {
+ echo "Timestamp: $(date +%Y%m%d-%H%M%S)"
+ echo "Kernel: $(uname -r)"
+ echo "Hostname: $(hostname)"
+ echo "Filesystem: $(df -T "$TEST_DIR" | tail -1 | awk '{print $2}')"
+ echo "File size: $SIZE"
+ echo "Test dir: $TEST_DIR"
+ } > "${RESULTS_DIR}/sysinfo.txt"
+}
+
+########################################################################
+# Deliverable 1: Single-client benchmarks
+########################################################################
+run_deliverable1() {
+ log "=========================================="
+ log "Deliverable 1: Single-client benchmarks"
+ log "=========================================="
+
+ # Sequential write
+ for mode in $MODES; do
+ local mname
+ mname=$(mode_name $mode)
+ local fio_args
+ fio_args=$(mode_fio_args $mode)
+
+ drop_caches
+ run_fio "${FIO_JOBS_DIR}/seq-write.fio" \
+ "${RESULTS_DIR}/seq-write/${mname}" \
+ "seq-write_testfile" "$SIZE" "" "$fio_args" \
+ "$(mode_uncached $mode)"
+ done
+
+ # Random write
+ for mode in $MODES; do
+ local mname
+ mname=$(mode_name $mode)
+ local fio_args
+ fio_args=$(mode_fio_args $mode)
+
+ drop_caches
+ run_fio "${FIO_JOBS_DIR}/rand-write.fio" \
+ "${RESULTS_DIR}/rand-write/${mname}" \
+ "rand-write_testfile" "$SIZE" "" "$fio_args" \
+ "$(mode_uncached $mode)"
+ done
+
+ # Sequential read — pre-create file, then read with each mode
+ log "Pre-creating sequential read test file"
+ run_fio "${FIO_JOBS_DIR}/seq-write.fio" \
+ "${RESULTS_DIR}/seq-read/precreate" \
+ "seq-read_testfile" "$SIZE" "keep"
+
+ for rmode in $MODES; do
+ local mname
+ mname=$(mode_name $rmode)
+ local fio_args
+ fio_args=$(mode_fio_args $rmode)
+ local keep="keep"
+ [ "$rmode" -eq 2 ] && keep=""
+
+ drop_caches
+ run_fio "${FIO_JOBS_DIR}/seq-read.fio" \
+ "${RESULTS_DIR}/seq-read/${mname}" \
+ "seq-read_testfile" "$SIZE" "$keep" "$fio_args" \
+ "$(mode_uncached $rmode)"
+ done
+
+ # Random read — pre-create file, then read with each mode
+ log "Pre-creating random read test file"
+ run_fio "${FIO_JOBS_DIR}/seq-write.fio" \
+ "${RESULTS_DIR}/rand-read/precreate" \
+ "rand-read_testfile" "$SIZE" "keep"
+
+ for rmode in $MODES; do
+ local mname
+ mname=$(mode_name $rmode)
+ local fio_args
+ fio_args=$(mode_fio_args $rmode)
+ local keep="keep"
+ [ "$rmode" -eq 2 ] && keep=""
+
+ drop_caches
+ run_fio "${FIO_JOBS_DIR}/rand-read.fio" \
+ "${RESULTS_DIR}/rand-read/${mname}" \
+ "rand-read_testfile" "$SIZE" "$keep" "$fio_args" \
+ "$(mode_uncached $rmode)"
+ done
+}
+
+########################################################################
+# Deliverable 2: Multi-client tests
+########################################################################
+run_deliverable2() {
+ log "=========================================="
+ log "Deliverable 2: Noisy-neighbor benchmarks"
+ log "=========================================="
+
+ local num_clients=4
+ local client_size
+ local mem_kb
+ mem_kb=$(awk '/MemTotal/ {print $2}' /proc/meminfo)
+ client_size="$(( mem_kb / 1024 / num_clients ))M"
+
+ # Scenario A: Multiple writers
+ for mode in $MODES; do
+ local mname
+ mname=$(mode_name $mode)
+ local fio_args
+ fio_args=$(mode_fio_args $mode)
+ local uncached
+ uncached=$(mode_uncached $mode)
+ local actual_job
+ actual_job=$(make_job_file "${FIO_JOBS_DIR}/multi-write.fio" "$uncached")
+ local outdir="${RESULTS_DIR}/multi-write/${mname}"
+ mkdir -p "$outdir"
+
+ drop_caches
+ start_monitors "$outdir"
+ start_perf_lock "$outdir"
+
+ local pids=()
+ for i in $(seq 1 $num_clients); do
+ # shellcheck disable=SC2086
+ run_cmd "$FIO_BIN" "$actual_job" \
+ --output-format=json \
+ --output="${outdir}/client${i}.json" \
+ --filename="${TEST_DIR}/client${i}_testfile" \
+ --size="$client_size" \
+ $fio_args &
+ pids+=($!)
+ done
+
+ local rc=0
+ for pid in "${pids[@]}"; do
+ wait "$pid" || rc=$?
+ done
+
+ stop_perf_lock "$outdir"
+ stop_monitors
+ [ $rc -ne 0 ] && log "WARNING: some fio jobs exited non-zero"
+
+ [ "$actual_job" != "${FIO_JOBS_DIR}/multi-write.fio" ] && rm -f "$actual_job"
+ for i in $(seq 1 $num_clients); do
+ cleanup_test_files "client${i}_testfile"
+ done
+ done
+
+ # Scenario C: Noisy writer + latency-sensitive readers
+ for mode in $MODES; do
+ local mname
+ mname=$(mode_name $mode)
+ local fio_args
+ fio_args=$(mode_fio_args $mode)
+ local uncached
+ uncached=$(mode_uncached $mode)
+ local writer_job
+ writer_job=$(make_job_file "${FIO_JOBS_DIR}/noisy-writer.fio" "$uncached")
+ local reader_job
+ reader_job=$(make_job_file "${FIO_JOBS_DIR}/lat-reader.fio" "$uncached")
+ local outdir="${RESULTS_DIR}/noisy-neighbor/${mname}"
+ mkdir -p "$outdir"
+
+ # Pre-create read files
+ for i in $(seq 1 $(( num_clients - 1 ))); do
+ log "Pre-creating read file for reader $i"
+ run_fio "${FIO_JOBS_DIR}/multi-write.fio" \
+ "${outdir}/precreate_reader${i}" \
+ "reader${i}_readfile" \
+ "512M" "keep"
+ done
+ drop_caches
+ start_monitors "$outdir"
+ start_perf_lock "$outdir"
+
+ # Noisy writer
+ # shellcheck disable=SC2086
+ run_cmd "$FIO_BIN" "$writer_job" \
+ --output-format=json \
+ --output="${outdir}/noisy_writer.json" \
+ --filename="${TEST_DIR}/bulk_testfile" \
+ --size="$SIZE" \
+ $fio_args &
+ local writer_pid=$!
+
+ # Latency-sensitive readers
+ local reader_pids=()
+ for i in $(seq 1 $(( num_clients - 1 ))); do
+ # shellcheck disable=SC2086
+ run_cmd "$FIO_BIN" "$reader_job" \
+ --output-format=json \
+ --output="${outdir}/reader${i}.json" \
+ --filename="${TEST_DIR}/reader${i}_readfile" \
+ --size="512M" \
+ $fio_args &
+ reader_pids+=($!)
+ done
+
+ local rc=0
+ wait "$writer_pid" || rc=$?
+ for pid in "${reader_pids[@]}"; do
+ wait "$pid" || rc=$?
+ done
+
+ stop_perf_lock "$outdir"
+ stop_monitors
+ [ $rc -ne 0 ] && log "WARNING: some fio jobs exited non-zero"
+
+ [ "$writer_job" != "${FIO_JOBS_DIR}/noisy-writer.fio" ] && rm -f "$writer_job"
+ [ "$reader_job" != "${FIO_JOBS_DIR}/lat-reader.fio" ] && rm -f "$reader_job"
+ cleanup_test_files "bulk_testfile"
+ for i in $(seq 1 $(( num_clients - 1 ))); do
+ cleanup_test_files "reader${i}_readfile"
+ done
+ done
+
+ # Scenario D: Mixed-mode noisy neighbor
+ # dontcache writes + buffered reads
+ local outdir="${RESULTS_DIR}/noisy-neighbor-mixed/dontcache-w_buffered-r"
+ mkdir -p "$outdir"
+ local writer_job
+ writer_job=$(make_job_file "${FIO_JOBS_DIR}/noisy-writer.fio" 1)
+
+ for i in $(seq 1 $(( num_clients - 1 ))); do
+ log "Pre-creating read file for reader $i"
+ run_fio "${FIO_JOBS_DIR}/multi-write.fio" \
+ "${outdir}/precreate_reader${i}" \
+ "reader${i}_readfile" \
+ "512M" "keep"
+ done
+ drop_caches
+ start_monitors "$outdir"
+ start_perf_lock "$outdir"
+
+ # Writer with dontcache
+ run_cmd "$FIO_BIN" "$writer_job" \
+ --output-format=json \
+ --output="${outdir}/noisy_writer.json" \
+ --filename="${TEST_DIR}/bulk_testfile" \
+ --size="$SIZE" \
+ --direct=0 &
+ local writer_pid=$!
+
+ # Readers with buffered (no uncached flag)
+ local reader_pids=()
+ for i in $(seq 1 $(( num_clients - 1 ))); do
+ run_cmd "$FIO_BIN" "${FIO_JOBS_DIR}/lat-reader.fio" \
+ --output-format=json \
+ --output="${outdir}/reader${i}.json" \
+ --filename="${TEST_DIR}/reader${i}_readfile" \
+ --size="512M" \
+ --direct=0 &
+ reader_pids+=($!)
+ done
+
+ local rc=0
+ wait "$writer_pid" || rc=$?
+ for pid in "${reader_pids[@]}"; do
+ wait "$pid" || rc=$?
+ done
+
+ stop_perf_lock "$outdir"
+ stop_monitors
+ [ $rc -ne 0 ] && log "WARNING: some fio jobs exited non-zero"
+
+ [ "$writer_job" != "${FIO_JOBS_DIR}/noisy-writer.fio" ] && rm -f "$writer_job"
+ cleanup_test_files "bulk_testfile"
+ for i in $(seq 1 $(( num_clients - 1 ))); do
+ cleanup_test_files "reader${i}_readfile"
+ done
+}
+
+########################################################################
+# Main
+########################################################################
+preflight
+run_deliverable1
+run_deliverable2
+
+log "=========================================="
+log "All benchmarks complete."
+log "Results in: $RESULTS_DIR"
+log "Parse with: scripts/parse-results.sh $RESULTS_DIR"
+log "=========================================="
--
2.53.0
^ permalink raw reply related [flat|nested] 9+ messages in thread