Linux Trace Kernel
 help / color / mirror / Atom feed
* [PATCH] tracing: simplify pages allocation
From: Rosen Penev @ 2026-04-25  1:44 UTC (permalink / raw)
  To: linux-trace-kernel
  Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Kees Cook,
	Gustavo A. R. Silva, open list:TRACING,
	open list:KERNEL HARDENING (not covered by other areas):Keyword:\b__counted_by(_le|_be)?\b

Change pages to a flexible array member so that it is allocated together
with the tracing_map_array struct.

This simplifies the code slightly by removing the NULL checks for pages,
which are no longer meaningful, and by removing the kfree() calls.

Signed-off-by: Rosen Penev <rosenp@gmail.com>
---
 kernel/trace/tracing_map.c | 32 +++++++++++---------------------
 kernel/trace/tracing_map.h |  2 +-
 2 files changed, 12 insertions(+), 22 deletions(-)

diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index bf1a507695b6..627cc3fdf69e 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -288,9 +288,6 @@ static void tracing_map_array_clear(struct tracing_map_array *a)
 {
 	unsigned int i;
 
-	if (!a->pages)
-		return;
-
 	for (i = 0; i < a->n_pages; i++)
 		memset(a->pages[i], 0, PAGE_SIZE);
 }
@@ -302,44 +299,37 @@ static void tracing_map_array_free(struct tracing_map_array *a)
 	if (!a)
 		return;
 
-	if (!a->pages)
-		goto free;
-
 	for (i = 0; i < a->n_pages; i++) {
 		if (!a->pages[i])
 			break;
 		kmemleak_free(a->pages[i]);
 		free_page((unsigned long)a->pages[i]);
 	}
-
-	kfree(a->pages);
-
- free:
-	kfree(a);
 }
 
 static struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts,
 						  unsigned int entry_size)
 {
 	struct tracing_map_array *a;
+	unsigned int entry_size_shift;
+	unsigned int entries_per_page;
+	unsigned int n_pages;
 	unsigned int i;
 
-	a = kzalloc_obj(*a);
+	entry_size_shift = fls(roundup_pow_of_two(entry_size) - 1);
+	entries_per_page = PAGE_SIZE / (1 << entry_size_shift);
+	n_pages = max(1, n_elts / entries_per_page);
+
+	a = kzalloc_flex(*a, pages, n_pages);
 	if (!a)
 		return NULL;
 
-	a->entry_size_shift = fls(roundup_pow_of_two(entry_size) - 1);
-	a->entries_per_page = PAGE_SIZE / (1 << a->entry_size_shift);
-	a->n_pages = n_elts / a->entries_per_page;
-	if (!a->n_pages)
-		a->n_pages = 1;
+	a->entry_size_shift = entry_size_shift;
+	a->entries_per_page = entries_per_page;
+	a->n_pages = n_pages;
 	a->entry_shift = fls(a->entries_per_page) - 1;
 	a->entry_mask = (1 << a->entry_shift) - 1;
 
-	a->pages = kcalloc(a->n_pages, sizeof(void *), GFP_KERNEL);
-	if (!a->pages)
-		goto free;
-
 	for (i = 0; i < a->n_pages; i++) {
 		a->pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
 		if (!a->pages[i])
diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
index 99c37eeebc16..18a02959d77b 100644
--- a/kernel/trace/tracing_map.h
+++ b/kernel/trace/tracing_map.h
@@ -167,7 +167,7 @@ struct tracing_map_array {
 	unsigned int entry_shift;
 	unsigned int entry_mask;
 	unsigned int n_pages;
-	void **pages;
+	void *pages[] __counted_by(n_pages);
 };
 
 #define TRACING_MAP_ARRAY_ELT(array, idx)				\
-- 
2.54.0


^ permalink raw reply related

* [PATCH] mm/page_alloc: add tracepoint for PCP refills
From: Bunyod Suvonov @ 2026-04-25  9:13 UTC (permalink / raw)
  To: akpm, vbabka, linux-mm
  Cc: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel,
	linux-kernel, surenb, mhocko, jackmanb, hannes, ziy,
	Bunyod Suvonov

The page allocator already has mm_page_pcpu_drain to trace pages
drained from the per-cpu page lists back to the buddy allocator. There
is no matching tracepoint for the opposite direction, where
rmqueue_bulk() refills a PCP list from the buddy allocator.

mm_page_alloc_zone_locked is not a good substitute for this. It is
emitted from __rmqueue_smallest(), which is used both by rmqueue_bulk()
and by the direct buddy allocation path. Its percpu_refill field is
derived from the allocation order and migratetype, so it does not
reliably identify whether the allocation came from a PCP refill.

Add mm_page_pcpu_refill and emit it from rmqueue_bulk() for each page
added to the PCP list. The new tracepoint uses the same page, order and
migratetype fields as mm_page_pcpu_drain, making refill and drain
activity directly comparable.

Signed-off-by: Bunyod Suvonov <b.suvonov@sjtu.edu.cn>
---
 include/trace/events/kmem.h | 23 +++++++++++++++++++++++
 mm/page_alloc.c             |  1 +
 2 files changed, 24 insertions(+)

diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index cd7920c81f85..16985604fc51 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -243,6 +243,29 @@ DEFINE_EVENT(mm_page, mm_page_alloc_zone_locked,
 	TP_ARGS(page, order, migratetype, percpu_refill)
 );
 
+TRACE_EVENT(mm_page_pcpu_refill,
+
+	TP_PROTO(struct page *page, unsigned int order, int migratetype),
+
+	TP_ARGS(page, order, migratetype),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	pfn		)
+		__field(	unsigned int,	order		)
+		__field(	int,		migratetype	)
+	),
+
+	TP_fast_assign(
+		__entry->pfn		= page ? page_to_pfn(page) : -1UL;
+		__entry->order		= order;
+		__entry->migratetype	= migratetype;
+	),
+
+	TP_printk("page=%p pfn=0x%lx order=%d migratetype=%d",
+		pfn_to_page(__entry->pfn), __entry->pfn,
+		__entry->order, __entry->migratetype)
+);
+
 TRACE_EVENT(mm_page_pcpu_drain,
 
 	TP_PROTO(struct page *page, unsigned int order, int migratetype),
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 65e205111553..a60b73ed39a4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2544,6 +2544,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		 * pages are ordered properly.
 		 */
 		list_add_tail(&page->pcp_list, list);
+		trace_mm_page_pcpu_refill(page, order, migratetype);
 	}
 	spin_unlock_irqrestore(&zone->lock, flags);
 
-- 
2.53.0


^ permalink raw reply related

* [PATCH v3 0/4] mm: improve write performance with RWF_DONTCACHE
From: Jeff Layton @ 2026-04-26 11:56 UTC (permalink / raw)
  To: Alexander Viro, Christian Brauner, Jan Kara,
	Matthew Wilcox (Oracle), Andrew Morton, David Hildenbrand,
	Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Mike Snitzer, Jens Axboe,
	Ritesh Harjani, Christoph Hellwig, Kairui Song, Qi Zheng,
	Shakeel Butt, Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Chuck Lever
  Cc: linux-fsdevel, linux-kernel, linux-nfs, linux-mm,
	linux-trace-kernel, Jeff Layton

This patch series attempts to improve write performance with
RWF_DONTCACHE. The main justification and benchmarks for the series are
in patch #2.

This version implements a scheme that Jan Kara and Christoph Hellwig
suggested during review of the earlier series: after a DONTCACHE write,
kick the flusher thread to do an amount of writeback proportional to the
amount written, but don't target any particular inode or pages when
doing writeback.

The second patch in the series has a summary of the benchmark results.
This seems to work as well as or better than the earlier approaches.

The benchmarks I used are in the last two patches. I'm not sure if we
want to merge those into the tree as they are (mostly) AI slop. There
is probably a better tool for this out there.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
Changes in v3:
- Track dirty DONTCACHE pages in the VM
- Have flusher write back a proportional number of pages after DONTCACHE write
- Link to v2: https://lore.kernel.org/r/20260408-dontcache-v2-0-948dec1e756b@kernel.org

Changes in v2:
- kick flusher thread instead of initiating writeback inline
- add mechanism to run 'perf lock' around the testcases
- Link to v1: https://lore.kernel.org/r/20260401-dontcache-v1-0-1f5746fab47a@kernel.org

---
Jeff Layton (4):
      mm: add NR_DONTCACHE_DIRTY node page counter
      mm: kick writeback flusher for IOCB_DONTCACHE with targeted dirty tracking
      testing: add nfsd-io-bench NFS server benchmark suite
      testing: add dontcache-bench local filesystem benchmark suite

 fs/fs-writeback.c                                  |  60 +++
 include/linux/backing-dev-defs.h                   |   2 +
 include/linux/fs.h                                 |   6 +-
 include/linux/mmzone.h                             |   1 +
 include/trace/events/writeback.h                   |   3 +-
 mm/filemap.c                                       |   6 +-
 mm/page-writeback.c                                |   7 +
 mm/vmstat.c                                        |   1 +
 .../dontcache-bench/fio-jobs/lat-reader.fio        |  12 +
 .../dontcache-bench/fio-jobs/multi-write.fio       |   9 +
 .../dontcache-bench/fio-jobs/noisy-writer.fio      |  12 +
 .../testing/dontcache-bench/fio-jobs/rand-read.fio |  13 +
 .../dontcache-bench/fio-jobs/rand-write.fio        |  13 +
 .../testing/dontcache-bench/fio-jobs/seq-read.fio  |  13 +
 .../testing/dontcache-bench/fio-jobs/seq-write.fio |  13 +
 .../dontcache-bench/scripts/parse-results.sh       | 238 +++++++++
 .../dontcache-bench/scripts/run-benchmarks.sh      | 562 ++++++++++++++++++++
 .../testing/nfsd-io-bench/fio-jobs/lat-reader.fio  |  15 +
 .../testing/nfsd-io-bench/fio-jobs/multi-write.fio |  14 +
 .../nfsd-io-bench/fio-jobs/noisy-writer.fio        |  14 +
 tools/testing/nfsd-io-bench/fio-jobs/rand-read.fio |  15 +
 .../testing/nfsd-io-bench/fio-jobs/rand-write.fio  |  15 +
 tools/testing/nfsd-io-bench/fio-jobs/seq-read.fio  |  14 +
 tools/testing/nfsd-io-bench/fio-jobs/seq-write.fio |  14 +
 .../testing/nfsd-io-bench/scripts/parse-results.sh | 238 +++++++++
 .../nfsd-io-bench/scripts/run-benchmarks.sh        | 591 +++++++++++++++++++++
 .../testing/nfsd-io-bench/scripts/setup-server.sh  |  94 ++++
 27 files changed, 1989 insertions(+), 6 deletions(-)
---
base-commit: 27d128c1cff64c3b8012cc56dd5a1391bb4f1821
change-id: 20260401-dontcache-5811efd7eaf3

Best regards,
-- 
Jeff Layton <jlayton@kernel.org>


^ permalink raw reply

* [PATCH v3 1/4] mm: add NR_DONTCACHE_DIRTY node page counter
From: Jeff Layton @ 2026-04-26 11:56 UTC (permalink / raw)
  To: Alexander Viro, Christian Brauner, Jan Kara,
	Matthew Wilcox (Oracle), Andrew Morton, David Hildenbrand,
	Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Mike Snitzer, Jens Axboe,
	Ritesh Harjani, Christoph Hellwig, Kairui Song, Qi Zheng,
	Shakeel Butt, Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Chuck Lever
  Cc: linux-fsdevel, linux-kernel, linux-nfs, linux-mm,
	linux-trace-kernel, Jeff Layton
In-Reply-To: <20260426-dontcache-v3-0-79eb37da9547@kernel.org>

Add a per-node page counter that tracks the number of dirty pages with
the dropbehind flag set (i.e., pages dirtied via RWF_DONTCACHE writes).

Increment the counter alongside NR_FILE_DIRTY in folio_account_dirtied()
when the folio has the dropbehind flag set, and decrement it in
folio_clear_dirty_for_io(), folio_account_cleaned(), and when a
non-DONTCACHE access clears the dropbehind flag on a dirty folio.

The counter is visible via /proc/vmstat as "nr_dontcache_dirty" and
will be used by the writeback flusher to determine how many pages to
write back when expediting writeback for IOCB_DONTCACHE writes, without
flushing the entire BDI's dirty pages.

Assisted-by: Claude:claude-opus-4-6
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 include/linux/mmzone.h | 1 +
 mm/filemap.c           | 6 +++++-
 mm/page-writeback.c    | 7 +++++++
 mm/vmstat.c            | 1 +
 4 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9adb2ad21da5..ed9cc61c7627 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -259,6 +259,7 @@ enum node_stat_item {
 			   only modified from process context */
 	NR_FILE_PAGES,
 	NR_FILE_DIRTY,
+	NR_DONTCACHE_DIRTY,
 	NR_WRITEBACK,
 	NR_SHMEM,		/* shmem pages (included tmpfs/GEM pages) */
 	NR_SHMEM_THPS,
diff --git a/mm/filemap.c b/mm/filemap.c
index 4e636647100c..45089fde5150 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2052,8 +2052,12 @@ struct folio *__filemap_get_folio_mpol(struct address_space *mapping,
 	if (!folio)
 		return ERR_PTR(-ENOENT);
 	/* not an uncached lookup, clear uncached if set */
-	if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE))
+	if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE)) {
+		if (folio_test_dirty(folio))
+			lruvec_stat_mod_folio(folio, NR_DONTCACHE_DIRTY,
+					      -folio_nr_pages(folio));
 		folio_clear_dropbehind(folio);
+	}
 	return folio;
 }
 EXPORT_SYMBOL(__filemap_get_folio_mpol);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 88cd53d4ba09..e1df93fb3e3b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2630,6 +2630,8 @@ static void folio_account_dirtied(struct folio *folio,
 		wb = inode_to_wb(inode);
 
 		lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr);
+		if (folio_test_dropbehind(folio))
+			lruvec_stat_mod_folio(folio, NR_DONTCACHE_DIRTY, nr);
 		__zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
 		__node_stat_mod_folio(folio, NR_DIRTIED, nr);
 		wb_stat_mod(wb, WB_RECLAIMABLE, nr);
@@ -2651,6 +2653,8 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb)
 	long nr = folio_nr_pages(folio);
 
 	lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
+	if (folio_test_dropbehind(folio))
+		lruvec_stat_mod_folio(folio, NR_DONTCACHE_DIRTY, -nr);
 	zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
 	wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
 	task_io_account_cancelled_write(nr * PAGE_SIZE);
@@ -2920,6 +2924,9 @@ bool folio_clear_dirty_for_io(struct folio *folio)
 		if (folio_test_clear_dirty(folio)) {
 			long nr = folio_nr_pages(folio);
 			lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
+			if (folio_test_dropbehind(folio))
+				lruvec_stat_mod_folio(folio,
+						NR_DONTCACHE_DIRTY, -nr);
 			zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
 			wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
 			ret = true;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f534972f517d..c3e5dfadb9a5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1240,6 +1240,7 @@ const char * const vmstat_text[] = {
 	[I(NR_FILE_MAPPED)]			= "nr_mapped",
 	[I(NR_FILE_PAGES)]			= "nr_file_pages",
 	[I(NR_FILE_DIRTY)]			= "nr_dirty",
+	[I(NR_DONTCACHE_DIRTY)]			= "nr_dontcache_dirty",
 	[I(NR_WRITEBACK)]			= "nr_writeback",
 	[I(NR_SHMEM)]				= "nr_shmem",
 	[I(NR_SHMEM_THPS)]			= "nr_shmem_hugepages",

-- 
2.53.0


^ permalink raw reply related

* [PATCH v3 2/4] mm: kick writeback flusher for IOCB_DONTCACHE with targeted dirty tracking
From: Jeff Layton @ 2026-04-26 11:56 UTC (permalink / raw)
  To: Alexander Viro, Christian Brauner, Jan Kara,
	Matthew Wilcox (Oracle), Andrew Morton, David Hildenbrand,
	Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Mike Snitzer, Jens Axboe,
	Ritesh Harjani, Christoph Hellwig, Kairui Song, Qi Zheng,
	Shakeel Butt, Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Chuck Lever
  Cc: linux-fsdevel, linux-kernel, linux-nfs, linux-mm,
	linux-trace-kernel, Jeff Layton
In-Reply-To: <20260426-dontcache-v3-0-79eb37da9547@kernel.org>

The IOCB_DONTCACHE writeback path in generic_write_sync() calls
filemap_flush_range() on every write, submitting writeback inline in
the writer's context.  Perf lock contention profiling shows the
performance problem is not lock contention but the writeback submission
work itself — walking the page tree and submitting I/O blocks the writer
for milliseconds, inflating p99.9 latency from 23ms (buffered) to 93ms
(dontcache).

Replace the inline filemap_flush_range() call with a flusher kick that
drains dirty pages in the background.  This moves writeback submission
completely off the writer's hot path.

To avoid flushing unrelated buffered dirty data, add a dedicated
WB_start_dontcache bit and wb_check_start_dontcache() handler that uses
the new NR_DONTCACHE_DIRTY counter to determine how many pages to write
back.  The flusher writes back that many pages from the oldest dirty
inodes (not restricted to dontcache-specific inodes). This helps
preserve I/O batching while limiting the scope of expedited writeback.

Like WB_start_all, the WB_start_dontcache bit coalesces multiple
DONTCACHE writes into a single flusher wakeup without per-write
allocations.

Also add WB_REASON_DONTCACHE as a new writeback reason for tracing
visibility, and target the correct cgroup writeback domain via
unlocked_inode_to_wb_begin().

dontcache-bench results on dual-socket Xeon Gold 6138 (80 CPUs, 256 GB
RAM, Samsung MZ1LB1T9HALS 1.7 TB NVMe, local XFS, io_uring, file size
~503 GB, compared to a v6.19-ish baseline):

  Single-client sequential write (MB/s):
                       baseline    patched     change
  buffered              1449.8     1440.1      -0.7%
  dontcache             1347.9     1461.5      +8.4%
  direct                1450.0     1440.1      -0.7%

  Single-client sequential write latency (us):
                       baseline    patched     change
  dontcache p50         3031.0    10551.3    +248.1%
  dontcache p99        74973.2    21626.9     -71.2%
  dontcache p99.9      85459.0    23199.7     -72.9%

  Single-client random write (MB/s):
                       baseline    patched     change
  dontcache              284.2      295.4      +3.9%

  Single-client random write p99.9 latency (us):
                       baseline    patched     change
  dontcache             2277.4      872.4     -61.7%

  Multi-writer aggregate throughput (MB/s):
                       baseline    patched     change
  buffered              1619.5     1611.2      -0.5%
  dontcache             1281.1     1629.4     +27.2%
  direct                1545.4     1609.4      +4.1%

  Mixed-mode noisy neighbor (dontcache writer + buffered readers):
                       baseline    patched     change
  writer (MB/s)         1297.6     1471.1     +13.4%
  readers avg (MB/s)     855.0      462.4     -45.9%

nfsd-io-bench results on same hardware (XFS on NVMe, NFSv3 via fio
NFS engine with libnfs, 1024 NFSD threads, pool_mode=pernode,
file size ~502 GB, compared to v6.19-ish baseline):

  Single-client sequential write (MB/s):
                       baseline    patched     change
  buffered              4844.2     4653.4      -3.9%
  dontcache             3028.3     3723.1     +22.9%
  direct                 957.6      987.8      +3.2%

  Single-client sequential write p99.9 latency (us):
                       baseline    patched     change
  dontcache            759169.0   175112.2     -76.9%

  Single-client random write (MB/s):
                       baseline    patched     change
  dontcache              590.0     1561.0    +164.6%

  Multi-writer aggregate throughput (MB/s):
                       baseline    patched     change
  buffered              9636.3     9422.9      -2.2%
  dontcache             1894.9     9442.6    +398.3%
  direct                 809.6      975.1     +20.4%

  Noisy neighbor (dontcache writer + random readers):
                       baseline    patched     change
  writer (MB/s)         1854.5     4063.6    +119.1%
  readers avg (MB/s)     131.2      101.6     -22.5%

The NFS results show even larger improvements than the local benchmarks.
Multi-writer dontcache throughput improves nearly 5x, matching buffered
I/O. Dirty page footprint drops 85-95% in sequential workloads vs.
buffered.

Assisted-by: Claude:claude-opus-4-6
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/fs-writeback.c                | 60 ++++++++++++++++++++++++++++++++++++++++
 include/linux/backing-dev-defs.h |  2 ++
 include/linux/fs.h               |  6 ++--
 include/trace/events/writeback.h |  3 +-
 4 files changed, 66 insertions(+), 5 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a65694cbfe68..377767db48f7 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1334,6 +1334,18 @@ static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
 	wb_wakeup(wb);
 }
 
+static void wb_start_dontcache_writeback(struct bdi_writeback *wb)
+{
+	if (!wb_has_dirty_io(wb))
+		return;
+
+	if (test_bit(WB_start_dontcache, &wb->state) ||
+	    test_and_set_bit(WB_start_dontcache, &wb->state))
+		return;
+
+	wb_wakeup(wb);
+}
+
 /**
  * wb_start_background_writeback - start background writeback
  * @wb: bdi_writback to write from
@@ -2373,6 +2385,28 @@ static long wb_check_start_all(struct bdi_writeback *wb)
 	return nr_pages;
 }
 
+static long wb_check_start_dontcache(struct bdi_writeback *wb)
+{
+	long nr_pages;
+
+	if (!test_bit(WB_start_dontcache, &wb->state))
+		return 0;
+
+	nr_pages = global_node_page_state(NR_DONTCACHE_DIRTY);
+	if (nr_pages) {
+		struct wb_writeback_work work = {
+			.nr_pages	= wb_split_bdi_pages(wb, nr_pages),
+			.sync_mode	= WB_SYNC_NONE,
+			.range_cyclic	= 1,
+			.reason		= WB_REASON_DONTCACHE,
+		};
+
+		nr_pages = wb_writeback(wb, &work);
+	}
+
+	clear_bit(WB_start_dontcache, &wb->state);
+	return nr_pages;
+}
 
 /*
  * Retrieve work items and do the writeback they describe
@@ -2394,6 +2428,11 @@ static long wb_do_writeback(struct bdi_writeback *wb)
 	 */
 	wrote += wb_check_start_all(wb);
 
+	/*
+	 * Check for dontcache writeback request
+	 */
+	wrote += wb_check_start_dontcache(wb);
+
 	/*
 	 * Check for periodic writeback, kupdated() style
 	 */
@@ -2468,6 +2507,27 @@ void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
 	rcu_read_unlock();
 }
 
+/**
+ * filemap_dontcache_kick_writeback - kick flusher for IOCB_DONTCACHE writes
+ * @mapping:	address_space that was just written to
+ *
+ * Kick the writeback flusher thread to expedite writeback of dontcache
+ * dirty pages.  Uses a dedicated WB_start_dontcache bit so that the
+ * amount written back (from any dirty inode) is sized by
+ * NR_DONTCACHE_DIRTY, rather than flushing the entire BDI's dirty pages.
+ */
+void filemap_dontcache_kick_writeback(struct address_space *mapping)
+{
+	struct inode *inode = mapping->host;
+	struct bdi_writeback *wb;
+	struct wb_lock_cookie cookie = {};
+
+	wb = unlocked_inode_to_wb_begin(inode, &cookie);
+	wb_start_dontcache_writeback(wb);
+	unlocked_inode_to_wb_end(inode, &cookie);
+}
+EXPORT_SYMBOL_GPL(filemap_dontcache_kick_writeback);
+
 /*
  * Wakeup the flusher threads to start writeback of all currently dirty pages
  */
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index a06b93446d10..74f8a9977f5d 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -26,6 +26,7 @@ enum wb_state {
 	WB_writeback_running,	/* Writeback is in progress */
 	WB_has_dirty_io,	/* Dirty inodes on ->b_{dirty|io|more_io} */
 	WB_start_all,		/* nr_pages == 0 (all) work pending */
+	WB_start_dontcache,	/* dontcache writeback pending */
 };
 
 enum wb_stat_item {
@@ -55,6 +56,7 @@ enum wb_reason {
 	 */
 	WB_REASON_FORKER_THREAD,
 	WB_REASON_FOREIGN_FLUSH,
+	WB_REASON_DONTCACHE,
 
 	WB_REASON_MAX,
 };
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 11559c513dfb..df72b42a9e9b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2624,6 +2624,7 @@ extern int __must_check file_write_and_wait_range(struct file *file,
 						loff_t start, loff_t end);
 int filemap_flush_range(struct address_space *mapping, loff_t start,
 		loff_t end);
+void filemap_dontcache_kick_writeback(struct address_space *mapping);
 
 static inline int file_write_and_wait(struct file *file)
 {
@@ -2657,10 +2658,7 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
 		if (ret)
 			return ret;
 	} else if (iocb->ki_flags & IOCB_DONTCACHE) {
-		struct address_space *mapping = iocb->ki_filp->f_mapping;
-
-		filemap_flush_range(mapping, iocb->ki_pos - count,
-				iocb->ki_pos - 1);
+		filemap_dontcache_kick_writeback(iocb->ki_filp->f_mapping);
 	}
 
 	return count;
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index bdac0d685a98..13ee076ccd16 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -44,7 +44,8 @@
 	EM( WB_REASON_PERIODIC,			"periodic")		\
 	EM( WB_REASON_FS_FREE_SPACE,		"fs_free_space")	\
 	EM( WB_REASON_FORKER_THREAD,		"forker_thread")	\
-	EMe(WB_REASON_FOREIGN_FLUSH,		"foreign_flush")
+	EM( WB_REASON_FOREIGN_FLUSH,		"foreign_flush")	\
+	EMe(WB_REASON_DONTCACHE,		"dontcache")
 
 WB_WORK_REASON
 

-- 
2.53.0


^ permalink raw reply related

* [PATCH v3 3/4] testing: add nfsd-io-bench NFS server benchmark suite
From: Jeff Layton @ 2026-04-26 11:56 UTC (permalink / raw)
  To: Alexander Viro, Christian Brauner, Jan Kara,
	Matthew Wilcox (Oracle), Andrew Morton, David Hildenbrand,
	Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Mike Snitzer, Jens Axboe,
	Ritesh Harjani, Christoph Hellwig, Kairui Song, Qi Zheng,
	Shakeel Butt, Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Chuck Lever
  Cc: linux-fsdevel, linux-kernel, linux-nfs, linux-mm,
	linux-trace-kernel, Jeff Layton
In-Reply-To: <20260426-dontcache-v3-0-79eb37da9547@kernel.org>

Add a benchmark suite for testing NFSD I/O mode performance using fio
with the libnfs backend against an NFS server on localhost.  Tests
buffered, dontcache, and direct I/O modes via NFSD debugfs controls.

Includes:
 - fio job files for sequential/random read/write, multi-writer,
   noisy-neighbor, and latency-sensitive reader workloads
 - run-benchmarks.sh: orchestrates test matrix with mode switching
 - parse-results.sh: extracts metrics from fio JSON output
 - setup-server.sh: configures NFS export for testing

Assisted-by: Claude:claude-opus-4-6
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 .../testing/nfsd-io-bench/fio-jobs/lat-reader.fio  |  15 +
 .../testing/nfsd-io-bench/fio-jobs/multi-write.fio |  14 +
 .../nfsd-io-bench/fio-jobs/noisy-writer.fio        |  14 +
 tools/testing/nfsd-io-bench/fio-jobs/rand-read.fio |  15 +
 .../testing/nfsd-io-bench/fio-jobs/rand-write.fio  |  15 +
 tools/testing/nfsd-io-bench/fio-jobs/seq-read.fio  |  14 +
 tools/testing/nfsd-io-bench/fio-jobs/seq-write.fio |  14 +
 .../testing/nfsd-io-bench/scripts/parse-results.sh | 238 +++++++++
 .../nfsd-io-bench/scripts/run-benchmarks.sh        | 591 +++++++++++++++++++++
 .../testing/nfsd-io-bench/scripts/setup-server.sh  |  94 ++++
 10 files changed, 1024 insertions(+)

diff --git a/tools/testing/nfsd-io-bench/fio-jobs/lat-reader.fio b/tools/testing/nfsd-io-bench/fio-jobs/lat-reader.fio
new file mode 100644
index 000000000000..61af37e8b860
--- /dev/null
+++ b/tools/testing/nfsd-io-bench/fio-jobs/lat-reader.fio
@@ -0,0 +1,15 @@
+[global]
+ioengine=nfs
+nfs_url=nfs://localhost/export
+direct=0
+bs=4k
+numjobs=16
+runtime=300
+time_based=1
+group_reporting=1
+rw=randread
+log_avg_msec=1000
+write_bw_log=latreader
+write_lat_log=latreader
+
+[lat_reader]
diff --git a/tools/testing/nfsd-io-bench/fio-jobs/multi-write.fio b/tools/testing/nfsd-io-bench/fio-jobs/multi-write.fio
new file mode 100644
index 000000000000..16b792aecabb
--- /dev/null
+++ b/tools/testing/nfsd-io-bench/fio-jobs/multi-write.fio
@@ -0,0 +1,14 @@
+[global]
+ioengine=nfs
+nfs_url=nfs://localhost/export
+direct=0
+bs=1M
+numjobs=16
+time_based=0
+group_reporting=1
+rw=write
+log_avg_msec=1000
+write_bw_log=multiwrite
+write_lat_log=multiwrite
+
+[writer]
diff --git a/tools/testing/nfsd-io-bench/fio-jobs/noisy-writer.fio b/tools/testing/nfsd-io-bench/fio-jobs/noisy-writer.fio
new file mode 100644
index 000000000000..615154a7737e
--- /dev/null
+++ b/tools/testing/nfsd-io-bench/fio-jobs/noisy-writer.fio
@@ -0,0 +1,14 @@
+[global]
+ioengine=nfs
+nfs_url=nfs://localhost/export
+direct=0
+bs=1M
+numjobs=16
+time_based=0
+group_reporting=1
+rw=write
+log_avg_msec=1000
+write_bw_log=noisywriter
+write_lat_log=noisywriter
+
+[bulk_writer]
diff --git a/tools/testing/nfsd-io-bench/fio-jobs/rand-read.fio b/tools/testing/nfsd-io-bench/fio-jobs/rand-read.fio
new file mode 100644
index 000000000000..501bae7416a8
--- /dev/null
+++ b/tools/testing/nfsd-io-bench/fio-jobs/rand-read.fio
@@ -0,0 +1,15 @@
+[global]
+ioengine=nfs
+nfs_url=nfs://localhost/export
+direct=0
+bs=4k
+numjobs=16
+runtime=300
+time_based=1
+group_reporting=1
+rw=randread
+log_avg_msec=1000
+write_bw_log=randread
+write_lat_log=randread
+
+[randread]
diff --git a/tools/testing/nfsd-io-bench/fio-jobs/rand-write.fio b/tools/testing/nfsd-io-bench/fio-jobs/rand-write.fio
new file mode 100644
index 000000000000..d891d04197ae
--- /dev/null
+++ b/tools/testing/nfsd-io-bench/fio-jobs/rand-write.fio
@@ -0,0 +1,15 @@
+[global]
+ioengine=nfs
+nfs_url=nfs://localhost/export
+direct=0
+bs=64k
+numjobs=16
+runtime=300
+time_based=1
+group_reporting=1
+rw=randwrite
+log_avg_msec=1000
+write_bw_log=randwrite
+write_lat_log=randwrite
+
+[randwrite]
diff --git a/tools/testing/nfsd-io-bench/fio-jobs/seq-read.fio b/tools/testing/nfsd-io-bench/fio-jobs/seq-read.fio
new file mode 100644
index 000000000000..6e24ab355026
--- /dev/null
+++ b/tools/testing/nfsd-io-bench/fio-jobs/seq-read.fio
@@ -0,0 +1,14 @@
+[global]
+ioengine=nfs
+nfs_url=nfs://localhost/export
+direct=0
+bs=1M
+numjobs=16
+time_based=0
+group_reporting=1
+rw=read
+log_avg_msec=1000
+write_bw_log=seqread
+write_lat_log=seqread
+
+[seqread]
diff --git a/tools/testing/nfsd-io-bench/fio-jobs/seq-write.fio b/tools/testing/nfsd-io-bench/fio-jobs/seq-write.fio
new file mode 100644
index 000000000000..260858e345f5
--- /dev/null
+++ b/tools/testing/nfsd-io-bench/fio-jobs/seq-write.fio
@@ -0,0 +1,14 @@
+[global]
+ioengine=nfs
+nfs_url=nfs://localhost/export
+direct=0
+bs=1M
+numjobs=16
+time_based=0
+group_reporting=1
+rw=write
+log_avg_msec=1000
+write_bw_log=seqwrite
+write_lat_log=seqwrite
+
+[seqwrite]
diff --git a/tools/testing/nfsd-io-bench/scripts/parse-results.sh b/tools/testing/nfsd-io-bench/scripts/parse-results.sh
new file mode 100755
index 000000000000..0427d411db04
--- /dev/null
+++ b/tools/testing/nfsd-io-bench/scripts/parse-results.sh
@@ -0,0 +1,238 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Parse fio JSON output and generate comparison tables.
+#
+# Usage: ./parse-results.sh <results-dir>
+
+set -euo pipefail
+
+if [ $# -lt 1 ]; then
+	echo "Usage: $0 <results-dir>"
+	exit 1
+fi
+
+RESULTS_DIR="$1"
+
+if ! command -v jq &>/dev/null; then
+	echo "ERROR: jq is required"
+	exit 1
+fi
+
+# Extract metrics from a single fio JSON result
+extract_metrics() {
+	local json_file=$1
+	local rw_type=$2  # read or write
+
+	if [ ! -f "$json_file" ]; then
+		echo "N/A N/A N/A N/A N/A N/A"
+		return
+	fi
+
+	jq -r --arg rw "$rw_type" '
+		.jobs[0][$rw] as $d |
+		[
+			(($d.bw // 0) / 1024 | . * 10 | round / 10),    # MB/s
+			($d.iops // 0),                                    # IOPS
+			((($d.clat_ns.mean // 0) / 1000) | . * 10 | round / 10), # avg lat us
+			(($d.clat_ns.percentile["50.000000"] // 0) / 1000), # p50 us
+			(($d.clat_ns.percentile["99.000000"] // 0) / 1000), # p99 us
+			(($d.clat_ns.percentile["99.900000"] // 0) / 1000)  # p99.9 us
+		] | @tsv
+	' "$json_file" 2>/dev/null || echo "N/A N/A N/A N/A N/A N/A"
+}
+
+# Extract server CPU from vmstat log (average sys%)
+extract_cpu() {
+	local vmstat_log=$1
+	if [ ! -f "$vmstat_log" ]; then
+		echo "N/A"
+		return
+	fi
+	# Field 14 is "sy"; vmstat reprints its headers, so average numeric rows only
+	awk '$14 ~ /^[0-9]+$/ {sum+=$14; n++} END {if(n>0) printf "%.1f", sum/n; else print "N/A"}' \
+		"$vmstat_log" 2>/dev/null || echo "N/A"
+}
+
+# Extract peak dirty pages from meminfo log
+extract_peak_dirty() {
+	local meminfo_log=$1
+	if [ ! -f "$meminfo_log" ]; then
+		echo "N/A"
+		return
+	fi
+	grep "^Dirty:" "$meminfo_log" | awk '{print $2}' | sort -n | tail -1 || echo "N/A"
+}
+
+# Extract peak cached from meminfo log
+extract_peak_cached() {
+	local meminfo_log=$1
+	if [ ! -f "$meminfo_log" ]; then
+		echo "N/A"
+		return
+	fi
+	grep "^Cached:" "$meminfo_log" | awk '{print $2}' | sort -n | tail -1 || echo "N/A"
+}
+
+print_separator() {
+	printf '%*s\n' 120 '' | tr ' ' '-'
+}
+
+########################################################################
+# Deliverable 1: Single-client results
+########################################################################
+echo ""
+echo "=================================================================="
+echo "  Deliverable 1: Single-Client fio Benchmarks"
+echo "=================================================================="
+echo ""
+
+for workload in seq-write rand-write seq-read rand-read; do
+	case $workload in
+	seq-write|rand-write) rw_type="write" ;;
+	seq-read|rand-read)   rw_type="read" ;;
+	esac
+
+	echo "--- $workload ---"
+	printf "%-16s %10s %10s %10s %10s %10s %10s %10s %12s %12s\n" \
+		"Mode" "MB/s" "IOPS" "Avg(us)" "p50(us)" "p99(us)" "p99.9(us)" "Sys CPU%" "PeakDirty(kB)" "PeakCache(kB)"
+	print_separator
+
+	for mode in buffered dontcache direct; do
+		dir="${RESULTS_DIR}/${workload}/${mode}"
+		json_file=$(find "$dir" -name '*.json' -not -name 'client*' 2>/dev/null | head -1 || true)
+		if [ -z "$json_file" ]; then
+			printf "%-16s %10s\n" "$mode" "(no data)"
+			continue
+		fi
+
+		read -r mbps iops avg_lat p50 p99 p999 <<< \
+			"$(extract_metrics "$json_file" "$rw_type")"
+		cpu=$(extract_cpu "${dir}/vmstat.log")
+		dirty=$(extract_peak_dirty "${dir}/meminfo.log")
+		cached=$(extract_peak_cached "${dir}/meminfo.log")
+
+		printf "%-16s %10s %10s %10s %10s %10s %10s %10s %12s %12s\n" \
+			"$mode" "$mbps" "$iops" "$avg_lat" "$p50" "$p99" "$p999" \
+			"$cpu" "${dirty:-N/A}" "${cached:-N/A}"
+	done
+	echo ""
+done
+
+########################################################################
+# Deliverable 2: Multi-client results
+########################################################################
+echo "=================================================================="
+echo "  Deliverable 2: Noisy-Neighbor Benchmarks"
+echo "=================================================================="
+echo ""
+
+# Scenario A: Multiple writers
+echo "--- Scenario A: Multiple Writers ---"
+for mode in buffered dontcache direct; do
+	dir="${RESULTS_DIR}/multi-write/${mode}"
+	if [ ! -d "$dir" ]; then
+		continue
+	fi
+
+	echo "  Mode: $mode"
+	printf "  %-10s %10s %10s %10s %10s %10s %10s\n" \
+		"Client" "MB/s" "IOPS" "Avg(us)" "p50(us)" "p99(us)" "p99.9(us)"
+	# Sum per-client MB/s with awk; non-numeric values (e.g. N/A) count as 0
+	total_bw=0
+	count=0
+	for json_file in "${dir}"/client*.json; do
+		[ -f "$json_file" ] || continue
+		client=$(basename "$json_file" .json)
+		read -r mbps iops avg_lat p50 p99 p999 <<< \
+			"$(extract_metrics "$json_file" "write")"
+		printf "  %-10s %10s %10s %10s %10s %10s %10s\n" \
+			"$client" "$mbps" "$iops" "$avg_lat" "$p50" "$p99" "$p999"
+		total_bw=$(awk -v t="$total_bw" -v m="${mbps:-0}" 'BEGIN {printf "%.1f", t + (m ~ /^[0-9.]+$/ ? m : 0)}')
+		count=$(( count + 1 ))
+	done
+
+	cpu=$(extract_cpu "${dir}/vmstat.log")
+	dirty=$(extract_peak_dirty "${dir}/meminfo.log")
+	printf "  Aggregate BW: %s MB/s | Sys CPU: %s%% | Peak Dirty: %s kB\n" \
+		"$total_bw" "$cpu" "${dirty:-N/A}"
+	echo ""
+done
+
+# Scenario C: Noisy neighbor
+echo "--- Scenario C: Noisy Writer + Latency-Sensitive Readers ---"
+for mode in buffered dontcache direct; do
+	dir="${RESULTS_DIR}/noisy-neighbor/${mode}"
+	if [ ! -d "$dir" ]; then
+		continue
+	fi
+
+	echo "  Mode: $mode"
+	printf "  %-14s %10s %10s %10s %10s %10s %10s\n" \
+		"Job" "MB/s" "IOPS" "Avg(us)" "p50(us)" "p99(us)" "p99.9(us)"
+
+	# Writer
+	if [ -f "${dir}/noisy_writer.json" ]; then
+		read -r mbps iops avg_lat p50 p99 p999 <<< \
+			"$(extract_metrics "${dir}/noisy_writer.json" "write")"
+		printf "  %-14s %10s %10s %10s %10s %10s %10s\n" \
+			"Bulk writer" "$mbps" "$iops" "$avg_lat" "$p50" "$p99" "$p999"
+	fi
+
+	# Readers
+	for json_file in "${dir}"/reader*.json; do
+		[ -f "$json_file" ] || continue
+		reader=$(basename "$json_file" .json)
+		read -r mbps iops avg_lat p50 p99 p999 <<< \
+			"$(extract_metrics "$json_file" "read")"
+		printf "  %-14s %10s %10s %10s %10s %10s %10s\n" \
+			"$reader" "$mbps" "$iops" "$avg_lat" "$p50" "$p99" "$p999"
+	done
+
+	cpu=$(extract_cpu "${dir}/vmstat.log")
+	dirty=$(extract_peak_dirty "${dir}/meminfo.log")
+	printf "  Sys CPU: %s%% | Peak Dirty: %s kB\n" "$cpu" "${dirty:-N/A}"
+	echo ""
+done
+
+# Scenario D: Mixed-mode noisy neighbor
+echo "--- Scenario D: Mixed-Mode Noisy Writer + Readers ---"
+for dir in "${RESULTS_DIR}"/noisy-neighbor-mixed/*/; do
+	[ -d "$dir" ] || continue
+	label=$(basename "$dir")
+
+	echo "  Mode: $label"
+	printf "  %-14s %10s %10s %10s %10s %10s %10s\n" \
+		"Job" "MB/s" "IOPS" "Avg(us)" "p50(us)" "p99(us)" "p99.9(us)"
+
+	# Writer
+	if [ -f "${dir}/noisy_writer.json" ]; then
+		read -r mbps iops avg_lat p50 p99 p999 <<< \
+			"$(extract_metrics "${dir}/noisy_writer.json" "write")"
+		printf "  %-14s %10s %10s %10s %10s %10s %10s\n" \
+			"Bulk writer" "$mbps" "$iops" "$avg_lat" "$p50" "$p99" "$p999"
+	fi
+
+	# Readers
+	for json_file in "${dir}"/reader*.json; do
+		[ -f "$json_file" ] || continue
+		reader=$(basename "$json_file" .json)
+		read -r mbps iops avg_lat p50 p99 p999 <<< \
+			"$(extract_metrics "$json_file" "read")"
+		printf "  %-14s %10s %10s %10s %10s %10s %10s\n" \
+			"$reader" "$mbps" "$iops" "$avg_lat" "$p50" "$p99" "$p999"
+	done
+
+	cpu=$(extract_cpu "${dir}/vmstat.log")
+	dirty=$(extract_peak_dirty "${dir}/meminfo.log")
+	printf "  Sys CPU: %s%% | Peak Dirty: %s kB\n" "$cpu" "${dirty:-N/A}"
+	echo ""
+done
+
+echo "=================================================================="
+echo "  System Info"
+echo "=================================================================="
+if [ -f "${RESULTS_DIR}/sysinfo.txt" ]; then
+	head -6 "${RESULTS_DIR}/sysinfo.txt"
+fi
+echo ""
diff --git a/tools/testing/nfsd-io-bench/scripts/run-benchmarks.sh b/tools/testing/nfsd-io-bench/scripts/run-benchmarks.sh
new file mode 100755
index 000000000000..2b0cf6e79dff
--- /dev/null
+++ b/tools/testing/nfsd-io-bench/scripts/run-benchmarks.sh
@@ -0,0 +1,591 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# NFS server I/O mode benchmark suite
+#
+# Runs fio with the NFS ioengine against an NFS server on localhost,
+# testing buffered, dontcache, and direct I/O modes.
+#
+# Usage: ./run-benchmarks.sh [OPTIONS]
+#
+# Options:
+#   -e EXPORT_PATH   Server export path (default: /export)
+#   -s SIZE          fio file size, should be >= 2x RAM (default: auto-detect)
+#   -r RESULTS_DIR   Where to store results (default: ./results)
+#   -n NFS_VER       NFS version: 3 or 4 (default: 3)
+#   -j FIO_JOBS_DIR  Path to fio job files (default: ../fio-jobs)
+#   -d               Dry run: print commands without executing
+#   -h               Show this help
+
+set -euo pipefail
+
+# Defaults
+EXPORT_PATH="/export"
+SIZE=""
+RESULTS_DIR="./results"
+NFS_VER=3
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+FIO_JOBS_DIR="${SCRIPT_DIR}/../fio-jobs"
+DRY_RUN=0
+MODES="0 1 2"
+PERF_LOCK=0
+
+DEBUGFS_BASE="/sys/kernel/debug/nfsd"
+IO_CACHE_READ="${DEBUGFS_BASE}/io_cache_read"
+IO_CACHE_WRITE="${DEBUGFS_BASE}/io_cache_write"
+DISABLE_SPLICE="${DEBUGFS_BASE}/disable-splice-read"
+
+usage() {
+	echo "Usage: $0 [OPTIONS]"
+	echo "  -e EXPORT_PATH   Server export path (default: /export)"
+	echo "  -s SIZE          fio file size (default: 2x RAM)"
+	echo "  -r RESULTS_DIR   Results directory (default: ./results)"
+	echo "  -n NFS_VER       NFS version: 3 or 4 (default: 3)"
+	echo "  -j FIO_JOBS_DIR  Path to fio job files"
+	echo "  -D               Dontcache only (skip buffered and direct tests)"
+	echo "  -p               Profile kernel lock contention with perf lock"
+	echo "  -d               Dry run"
+	echo "  -h               Help"
+	exit 1
+}
+
+while getopts "e:s:r:n:j:Dpdh" opt; do
+	case $opt in
+	e) EXPORT_PATH="$OPTARG" ;;
+	s) SIZE="$OPTARG" ;;
+	r) RESULTS_DIR="$OPTARG" ;;
+	n) NFS_VER="$OPTARG" ;;
+	j) FIO_JOBS_DIR="$OPTARG" ;;
+	D) MODES="1" ;;
+	p) PERF_LOCK=1 ;;
+	d) DRY_RUN=1 ;;
+	h) usage ;;
+	*) usage ;;
+	esac
+done
+
+# Auto-detect size: 2x total RAM (RAM rounded up to whole GiB so SIZE >= 2x RAM)
+if [ -z "$SIZE" ]; then
+	MEM_KB=$(awk '/MemTotal/ {print $2}' /proc/meminfo)
+	MEM_GB=$(( (MEM_KB + 1048575) / 1048576 ))
+	SIZE="$(( MEM_GB * 2 ))G"
+	echo "Auto-detected RAM: ${MEM_GB}G, using file size: ${SIZE}"
+fi
+
+
+log() {
+	echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"
+}
+
+run_cmd() {
+	if [ "$DRY_RUN" -eq 1 ]; then
+		echo "  [DRY RUN] $*"
+	else
+		"$@"
+	fi
+}
+
+# Preflight checks
+preflight() {
+	log "=== Preflight checks ==="
+
+	if ! command -v fio &>/dev/null; then
+		echo "ERROR: fio not found in PATH"
+		exit 1
+	fi
+
+	# Check fio has nfs ioengine
+	if ! fio --enghelp=nfs &>/dev/null; then
+		echo "ERROR: fio does not have the nfs ioengine (needs libnfs)"
+		exit 1
+	fi
+
+	# Check debugfs knobs exist
+	for knob in "$IO_CACHE_READ" "$IO_CACHE_WRITE" "$DISABLE_SPLICE"; do
+		if [ ! -f "$knob" ]; then
+			echo "ERROR: $knob not found. Is the kernel new enough?"
+			exit 1
+		fi
+	done
+
+	# Check NFS server is exporting
+	if ! showmount -e localhost 2>/dev/null | grep -q "$EXPORT_PATH"; then
+		echo "WARNING: $EXPORT_PATH not in showmount output, proceeding anyway"
+	fi
+
+	# Print system info
+	echo "Kernel:     $(uname -r)"
+	echo "RAM:        $(awk '/MemTotal/ {printf "%.1f GB", $2/1024/1024}' /proc/meminfo)"
+	echo "Export:     $EXPORT_PATH"
+	echo "NFS ver:    $NFS_VER"
+	echo "File size:  $SIZE"
+	echo "Results:    $RESULTS_DIR"
+	echo ""
+}
+
+# Set server I/O mode via debugfs
+set_io_mode() {
+	local cache_write=$1
+	local cache_read=$2
+	local splice_off=$3
+
+	log "Setting io_cache_write=$cache_write io_cache_read=$cache_read disable-splice-read=$splice_off"
+	run_cmd bash -c "echo $cache_write > $IO_CACHE_WRITE"
+	run_cmd bash -c "echo $cache_read  > $IO_CACHE_READ"
+	run_cmd bash -c "echo $splice_off  > $DISABLE_SPLICE"
+}
+
+# Drop page cache on server
+drop_caches() {
+	log "Dropping page cache"
+	run_cmd bash -c "sync && echo 3 > /proc/sys/vm/drop_caches"
+	sleep 1
+}
+
+# Start background server monitoring
+start_monitors() {
+	local outdir=$1
+
+	log "Starting server monitors in $outdir"
+	run_cmd vmstat 1 > "${outdir}/vmstat.log" 2>&1 &
+	VMSTAT_PID=$!
+
+	run_cmd iostat -x 1 > "${outdir}/iostat.log" 2>&1 &
+	IOSTAT_PID=$!
+
+	# Sample /proc/meminfo every second
+	(while true; do
+		echo "=== $(date '+%s') ==="
+		cat /proc/meminfo
+		sleep 1
+	done) > "${outdir}/meminfo.log" 2>&1 &
+	MEMINFO_PID=$!
+}
+
+# Stop background monitors
+stop_monitors() {
+	log "Stopping monitors"
+	kill "$VMSTAT_PID" "$IOSTAT_PID" "$MEMINFO_PID" 2>/dev/null || true
+	wait "$VMSTAT_PID" "$IOSTAT_PID" "$MEMINFO_PID" 2>/dev/null || true
+}
+
+# perf lock profiling — uses BPF-based live contention tracing
+PERF_LOCK_PID=""
+
+start_perf_lock() {
+	local outdir=$1
+
+	if [ "$PERF_LOCK" -ne 1 ]; then
+		return
+	fi
+	# Go through run_cmd so that -d (dry run) does not actually launch perf
+	log "Starting perf lock contention tracing"
+	run_cmd perf lock contention -a -b --max-stack 8 \
+		> "${outdir}/perf-lock-contention.txt" 2>&1 &
+	PERF_LOCK_PID=$!
+}
+
+stop_perf_lock() {
+	local outdir=$1
+
+	if [ -z "$PERF_LOCK_PID" ]; then
+		return
+	fi
+
+	log "Stopping perf lock contention tracing"
+	kill -TERM "$PERF_LOCK_PID" 2>/dev/null || true
+	wait "$PERF_LOCK_PID" 2>/dev/null || true
+	PERF_LOCK_PID=""
+}
+
+# Run a single fio benchmark.
+# nfs_url is set in the job files; we pass --filename and --size on
+# the command line to vary the target file and data volume per run.
+# Pass "keep" as 5th arg to preserve the test file after the run.
+run_fio() {
+	local job_file=$1
+	local outdir=$2
+	local filename=$3
+	local fio_size=${4:-$SIZE}
+	local keep=${5:-}
+
+	local job_name
+	job_name=$(basename "$job_file" .fio)
+
+	log "Running fio job: $job_name -> $outdir (file=$filename size=$fio_size)"
+	mkdir -p "$outdir"
+
+	drop_caches
+	start_monitors "$outdir"
+	# Skip perf lock profiling for precreate/setup runs
+	[ "$keep" != "keep" ] && start_perf_lock "$outdir"
+
+	run_cmd fio "$job_file" \
+		--output-format=json \
+		--output="${outdir}/${job_name}.json" \
+		--filename="$filename" \
+		--size="$fio_size"
+
+	[ "$keep" != "keep" ] && stop_perf_lock "$outdir"
+	stop_monitors
+
+	log "Finished: $job_name"
+
+	# Clean up test file to free disk space unless told to keep it
+	if [ "$keep" != "keep" ]; then
+		cleanup_test_files "$filename"
+	fi
+}
+
+# Remove test files from the export to free disk space
+cleanup_test_files() {
+	local filename
+	for filename in "$@"; do
+		local filepath="${EXPORT_PATH}/${filename}"
+		log "Cleaning up: $filepath"
+		run_cmd rm -f "$filepath"
+	done
+}
+
+# Ensure parent directories exist under the export for a given filename
+ensure_export_dirs() {
+	local filename
+	for filename in "$@"; do
+		local dirpath="${EXPORT_PATH}/$(dirname "$filename")"
+		if [ "$dirpath" != "${EXPORT_PATH}/." ] && [ ! -d "$dirpath" ]; then
+			log "Creating directory: $dirpath"
+			run_cmd mkdir -p "$dirpath"
+		fi
+	done
+}
+
+# Mode name from numeric value
+mode_name() {
+	case $1 in
+	0) echo "buffered" ;;
+	1) echo "dontcache" ;;
+	2) echo "direct" ;;
+	esac
+}
+
+########################################################################
+# Deliverable 1: Single-client fio benchmarks
+########################################################################
+run_deliverable1() {
+	log "=========================================="
+	log "Deliverable 1: Single-client fio benchmarks"
+	log "=========================================="
+
+	# Write test matrix:
+	# mode 0 (buffered):    splice on  (default)
+	# mode 1 (dontcache):   splice off (required)
+	# mode 2 (direct):      splice off (required)
+
+	# Sequential write
+	for wmode in $MODES; do
+		local mname
+		mname=$(mode_name $wmode)
+		local splice_off=0
+		[ "$wmode" -ne 0 ] && splice_off=1
+
+		drop_caches
+		set_io_mode "$wmode" 0 "$splice_off"
+		run_fio "${FIO_JOBS_DIR}/seq-write.fio" \
+			"${RESULTS_DIR}/seq-write/${mname}" \
+			"seq-write_testfile"
+	done
+
+	# Random write
+	for wmode in $MODES; do
+		local mname
+		mname=$(mode_name $wmode)
+		local splice_off=0
+		[ "$wmode" -ne 0 ] && splice_off=1
+
+		drop_caches
+		set_io_mode "$wmode" 0 "$splice_off"
+		run_fio "${FIO_JOBS_DIR}/rand-write.fio" \
+			"${RESULTS_DIR}/rand-write/${mname}" \
+			"rand-write_testfile"
+	done
+
+	# Sequential read — vary read mode, write stays buffered
+	# Pre-create the file for reading
+	log "Pre-creating sequential read test file"
+	set_io_mode 0 0 0
+	run_fio "${FIO_JOBS_DIR}/seq-write.fio" \
+		"${RESULTS_DIR}/seq-read/precreate" \
+		"seq-read_testfile" "$SIZE" "keep"
+
+	# shellcheck disable=SC2086
+	local last_mode
+	last_mode=$(echo $MODES | awk '{print $NF}')
+
+	for rmode in $MODES; do
+		local mname
+		mname=$(mode_name $rmode)
+		local splice_off=0
+		[ "$rmode" -ne 0 ] && splice_off=1
+		# Keep file for subsequent modes; clean up after last
+		local keep="keep"
+		[ "$rmode" = "$last_mode" ] && keep=""
+
+		drop_caches
+		set_io_mode 0 "$rmode" "$splice_off"
+		run_fio "${FIO_JOBS_DIR}/seq-read.fio" \
+			"${RESULTS_DIR}/seq-read/${mname}" \
+			"seq-read_testfile" "$SIZE" "$keep"
+	done
+
+	# Random read — vary read mode, write stays buffered
+	# Pre-create the file for reading
+	log "Pre-creating random read test file"
+	set_io_mode 0 0 0
+	run_fio "${FIO_JOBS_DIR}/seq-write.fio" \
+		"${RESULTS_DIR}/rand-read/precreate" \
+		"rand-read_testfile" "$SIZE" "keep"
+
+	for rmode in $MODES; do
+		local mname
+		mname=$(mode_name $rmode)
+		local splice_off=0
+		[ "$rmode" -ne 0 ] && splice_off=1
+		# Keep file for subsequent modes; clean up after last
+		local keep="keep"
+		[ "$rmode" = "$last_mode" ] && keep=""
+
+		drop_caches
+		set_io_mode 0 "$rmode" "$splice_off"
+		run_fio "${FIO_JOBS_DIR}/rand-read.fio" \
+			"${RESULTS_DIR}/rand-read/${mname}" \
+			"rand-read_testfile" "$SIZE" "$keep"
+	done
+}
+
+########################################################################
+# Deliverable 2: Multi-client (simulated with multiple fio jobs)
+########################################################################
+run_deliverable2() {
+	log "=========================================="
+	log "Deliverable 2: Noisy-neighbor benchmarks"
+	log "=========================================="
+
+	local num_clients=4
+	local client_size
+	local mem_kb
+	mem_kb=$(awk '/MemTotal/ {print $2}' /proc/meminfo)
+	# Each client gets RAM/num_clients, so the combined data set is about the size of RAM
+	client_size="$(( mem_kb / 1024 / num_clients ))M"
+
+	# Scenario A: Multiple writers
+	for mode in $MODES; do
+		local mname
+		mname=$(mode_name $mode)
+		local splice_off=0
+		[ "$mode" -ne 0 ] && splice_off=1
+		local outdir="${RESULTS_DIR}/multi-write/${mname}"
+		mkdir -p "$outdir"
+
+		set_io_mode "$mode" "$mode" "$splice_off"
+		drop_caches
+
+		# Ensure client directories exist on export
+		for i in $(seq 1 $num_clients); do
+			ensure_export_dirs "client${i}/testfile"
+		done
+
+		start_monitors "$outdir"
+		start_perf_lock "$outdir"
+
+		# Launch N parallel fio writers
+		local pids=()
+		for i in $(seq 1 $num_clients); do
+			run_cmd fio "${FIO_JOBS_DIR}/multi-write.fio" \
+				--output-format=json \
+				--output="${outdir}/client${i}.json" \
+				--filename="client${i}/testfile" \
+				--size="$client_size" &
+			pids+=($!)
+		done
+
+		# Wait for all
+		local rc=0
+		for pid in "${pids[@]}"; do
+			wait "$pid" || rc=$?
+		done
+
+		stop_perf_lock "$outdir"
+		stop_monitors
+		[ $rc -ne 0 ] && log "WARNING: some fio jobs exited non-zero"
+
+		# Clean up test files
+		for i in $(seq 1 $num_clients); do
+			cleanup_test_files "client${i}/testfile"
+		done
+	done
+
+	# Scenario C: Noisy writer + latency-sensitive readers
+	for mode in $MODES; do
+		local mname
+		mname=$(mode_name $mode)
+		local splice_off=0
+		[ "$mode" -ne 0 ] && splice_off=1
+		local outdir="${RESULTS_DIR}/noisy-neighbor/${mname}"
+		mkdir -p "$outdir"
+
+		set_io_mode "$mode" "$mode" "$splice_off"
+		drop_caches
+
+		# Pre-create read files for latency readers
+		for i in $(seq 1 $(( num_clients - 1 ))); do
+			ensure_export_dirs "reader${i}/readfile"
+			log "Pre-creating read file for reader $i"
+			run_fio "${FIO_JOBS_DIR}/multi-write.fio" \
+				"${outdir}/precreate_reader${i}" \
+				"reader${i}/readfile" \
+				"512M" "keep"
+		done
+		drop_caches
+		ensure_export_dirs "bulk/testfile"
+		start_monitors "$outdir"
+		start_perf_lock "$outdir"
+
+		# Noisy writer
+		run_cmd fio "${FIO_JOBS_DIR}/noisy-writer.fio" \
+			--output-format=json \
+			--output="${outdir}/noisy_writer.json" \
+			--filename="bulk/testfile" \
+			--size="$SIZE" &
+		local writer_pid=$!
+
+		# Latency-sensitive readers
+		local reader_pids=()
+		for i in $(seq 1 $(( num_clients - 1 ))); do
+			run_cmd fio "${FIO_JOBS_DIR}/lat-reader.fio" \
+				--output-format=json \
+				--output="${outdir}/reader${i}.json" \
+				--filename="reader${i}/readfile" \
+				--size="512M" &
+			reader_pids+=($!)
+		done
+
+		local rc=0
+		wait "$writer_pid" || rc=$?
+		for pid in "${reader_pids[@]}"; do
+			wait "$pid" || rc=$?
+		done
+
+		stop_perf_lock "$outdir"
+		stop_monitors
+		[ $rc -ne 0 ] && log "WARNING: some fio jobs exited non-zero"
+
+		# Clean up test files
+		cleanup_test_files "bulk/testfile"
+		for i in $(seq 1 $(( num_clients - 1 ))); do
+			cleanup_test_files "reader${i}/readfile"
+		done
+	done
+	# Scenario D: Mixed-mode noisy neighbor
+	# Test write/read mode combinations where the writer uses a
+	# cache-friendly mode and readers use buffered reads to benefit
+	# from warm cache.
+	local mixed_modes=(
+		# write_mode read_mode label
+		"1 0 dontcache-w_buffered-r"
+	)
+
+	for combo in "${mixed_modes[@]}"; do
+		local wmode rmode label
+		read -r wmode rmode label <<< "$combo"
+		local splice_off=0
+		[ "$wmode" -ne 0 ] && splice_off=1
+		local outdir="${RESULTS_DIR}/noisy-neighbor-mixed/${label}"
+		mkdir -p "$outdir"
+
+		set_io_mode "$wmode" "$rmode" "$splice_off"
+		drop_caches
+
+		# Pre-create read files for latency readers
+		for i in $(seq 1 $(( num_clients - 1 ))); do
+			ensure_export_dirs "reader${i}/readfile"
+			log "Pre-creating read file for reader $i"
+			run_fio "${FIO_JOBS_DIR}/multi-write.fio" \
+				"${outdir}/precreate_reader${i}" \
+				"reader${i}/readfile" \
+				"512M" "keep"
+		done
+		drop_caches
+		ensure_export_dirs "bulk/testfile"
+		start_monitors "$outdir"
+		start_perf_lock "$outdir"
+
+		# Noisy writer
+		run_cmd fio "${FIO_JOBS_DIR}/noisy-writer.fio" \
+			--output-format=json \
+			--output="${outdir}/noisy_writer.json" \
+			--filename="bulk/testfile" \
+			--size="$SIZE" &
+		local writer_pid=$!
+
+		# Latency-sensitive readers
+		local reader_pids=()
+		for i in $(seq 1 $(( num_clients - 1 ))); do
+			run_cmd fio "${FIO_JOBS_DIR}/lat-reader.fio" \
+				--output-format=json \
+				--output="${outdir}/reader${i}.json" \
+				--filename="reader${i}/readfile" \
+				--size="512M" &
+			reader_pids+=($!)
+		done
+
+		local rc=0
+		wait "$writer_pid" || rc=$?
+		for pid in "${reader_pids[@]}"; do
+			wait "$pid" || rc=$?
+		done
+
+		stop_perf_lock "$outdir"
+		stop_monitors
+		[ $rc -ne 0 ] && log "WARNING: some fio jobs exited non-zero"
+
+		# Clean up test files
+		cleanup_test_files "bulk/testfile"
+		for i in $(seq 1 $(( num_clients - 1 ))); do
+			cleanup_test_files "reader${i}/readfile"
+		done
+	done
+}
+
+########################################################################
+# Main
+########################################################################
+preflight
+
+TIMESTAMP=$(date '+%Y%m%d-%H%M%S')
+RESULTS_DIR="${RESULTS_DIR}/${TIMESTAMP}"
+mkdir -p "$RESULTS_DIR"
+
+# Save system info
+{
+	echo "Timestamp: $TIMESTAMP"
+	echo "Kernel: $(uname -r)"
+	echo "Hostname: $(hostname)"
+	echo "NFS version: $NFS_VER"
+	echo "File size: $SIZE"
+	echo "Export: $EXPORT_PATH"
+	cat /proc/meminfo
+} > "${RESULTS_DIR}/sysinfo.txt"
+
+log "Results will be saved to: $RESULTS_DIR"
+
+run_deliverable1
+run_deliverable2
+
+# Reset to defaults
+set_io_mode 0 0 0
+
+log "=========================================="
+log "All benchmarks complete."
+log "Results in: $RESULTS_DIR"
+log "Run: scripts/parse-results.sh $RESULTS_DIR"
+log "=========================================="
diff --git a/tools/testing/nfsd-io-bench/scripts/setup-server.sh b/tools/testing/nfsd-io-bench/scripts/setup-server.sh
new file mode 100755
index 000000000000..0efdd74a705e
--- /dev/null
+++ b/tools/testing/nfsd-io-bench/scripts/setup-server.sh
@@ -0,0 +1,94 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# One-time setup script for the NFS test server.
+# Run this once before running benchmarks.
+#
+# Usage: sudo ./setup-server.sh [EXPORT_PATH]
+
+set -euo pipefail
+
+EXPORT_PATH="${1:-/export}"
+FSTYPE="ext4"
+
+log() {
+	echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"
+}
+
+if [ "$(id -u)" -ne 0 ]; then
+	echo "ERROR: must run as root"
+	exit 1
+fi
+
+# Check for required tools
+for cmd in fio exportfs showmount jq; do
+	if ! command -v "$cmd" &>/dev/null; then
+		echo "WARNING: $cmd not found, attempting install"
+		dnf install -y "$cmd" 2>/dev/null || \
+		apt-get install -y "$cmd" 2>/dev/null || \
+		echo "ERROR: cannot install $cmd, please install manually"
+	fi
+done
+
+# Check fio has nfs ioengine
+if ! fio --enghelp=nfs &>/dev/null; then
+	echo "ERROR: fio nfs ioengine not available."
+	echo "You may need to install fio with libnfs support."
+	echo "Try: dnf install fio libnfs-devel  (or build fio from source with --enable-nfs)"
+	exit 1
+fi
+
+# Create export directory if needed
+if [ ! -d "$EXPORT_PATH" ]; then
+	log "Creating export directory: $EXPORT_PATH"
+	mkdir -p "$EXPORT_PATH"
+fi
+
+# Create subdirectories for multi-client tests
+for i in 1 2 3 4; do
+	mkdir -p "${EXPORT_PATH}/client${i}"
+	mkdir -p "${EXPORT_PATH}/reader${i}"
+done
+mkdir -p "${EXPORT_PATH}/bulk"
+
+# Check if already exported
+if ! exportfs -s 2>/dev/null | grep -q "$EXPORT_PATH"; then
+	log "Adding NFS export for $EXPORT_PATH"
+	if ! grep -q "$EXPORT_PATH" /etc/exports 2>/dev/null; then
+		echo "${EXPORT_PATH} 127.0.0.1/32(rw,sync,no_root_squash,no_subtree_check)" >> /etc/exports
+	fi
+	exportfs -ra
+fi
+
+# Ensure NFS server is running
+if ! systemctl is-active --quiet nfs-server 2>/dev/null; then
+	log "Starting NFS server"
+	systemctl start nfs-server
+fi
+
+# Verify export
+log "Current exports:"
+showmount -e localhost
+
+# Check debugfs knobs
+log "Checking debugfs knobs:"
+DEBUGFS_BASE="/sys/kernel/debug/nfsd"
+for knob in io_cache_read io_cache_write disable-splice-read; do
+	if [ -f "${DEBUGFS_BASE}/${knob}" ]; then
+		echo "  ${knob} = $(cat "${DEBUGFS_BASE}/${knob}")"
+	else
+		echo "  ${knob}: NOT FOUND (kernel may be too old)"
+	fi
+done
+
+# Print system summary
+echo ""
+log "=== System Summary ==="
+echo "Kernel:      $(uname -r)"
+echo "RAM:         $(awk '/MemTotal/ {printf "%.1f GB", $2/1024/1024}' /proc/meminfo)"
+echo "Export:      $EXPORT_PATH"
+echo "Filesystem:  $(df -T "$EXPORT_PATH" | awk 'NR==2 {print $2}')"
+echo "Disk:        $(df -h "$EXPORT_PATH" | awk 'NR==2 {print $2, "total,", $4, "free"}')"
+echo ""
+log "Setup complete. Run benchmarks with:"
+echo "  sudo ./scripts/run-benchmarks.sh -e $EXPORT_PATH"

-- 
2.53.0


^ permalink raw reply related

* [PATCH v3 4/4] testing: add dontcache-bench local filesystem benchmark suite
From: Jeff Layton @ 2026-04-26 11:56 UTC (permalink / raw)
  To: Alexander Viro, Christian Brauner, Jan Kara,
	Matthew Wilcox (Oracle), Andrew Morton, David Hildenbrand,
	Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Mike Snitzer, Jens Axboe,
	Ritesh Harjani, Christoph Hellwig, Kairui Song, Qi Zheng,
	Shakeel Butt, Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Chuck Lever
  Cc: linux-fsdevel, linux-kernel, linux-nfs, linux-mm,
	linux-trace-kernel, Jeff Layton
In-Reply-To: <20260426-dontcache-v3-0-79eb37da9547@kernel.org>

Add a benchmark suite for testing IOCB_DONTCACHE on local filesystems
via fio's io_uring engine with the RWF_DONTCACHE flag.

The suite mirrors the nfsd-io-bench test matrix but uses io_uring with
the "uncached" fio option instead of NFSD debugfs mode switching:
 - Mode 0: uncached=0, standard buffered I/O
 - Mode 1: uncached=1, RWF_DONTCACHE
 - Mode 2: O_DIRECT via fio's --direct=1

Includes fio job files, run-benchmarks.sh, and parse-results.sh.

Assisted-by: Claude:claude-opus-4-6
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 .../dontcache-bench/fio-jobs/lat-reader.fio        |  12 +
 .../dontcache-bench/fio-jobs/multi-write.fio       |   9 +
 .../dontcache-bench/fio-jobs/noisy-writer.fio      |  12 +
 .../testing/dontcache-bench/fio-jobs/rand-read.fio |  13 +
 .../dontcache-bench/fio-jobs/rand-write.fio        |  13 +
 .../testing/dontcache-bench/fio-jobs/seq-read.fio  |  13 +
 .../testing/dontcache-bench/fio-jobs/seq-write.fio |  13 +
 .../dontcache-bench/scripts/parse-results.sh       | 238 +++++++++
 .../dontcache-bench/scripts/run-benchmarks.sh      | 562 +++++++++++++++++++++
 9 files changed, 885 insertions(+)

diff --git a/tools/testing/dontcache-bench/fio-jobs/lat-reader.fio b/tools/testing/dontcache-bench/fio-jobs/lat-reader.fio
new file mode 100644
index 000000000000..e221e7aedec9
--- /dev/null
+++ b/tools/testing/dontcache-bench/fio-jobs/lat-reader.fio
@@ -0,0 +1,12 @@
+[global]
+ioengine=io_uring
+direct=0
+bs=4k
+numjobs=1
+time_based=0
+rw=read
+log_avg_msec=1000
+write_bw_log=latreader
+write_lat_log=latreader
+
+[latreader]
diff --git a/tools/testing/dontcache-bench/fio-jobs/multi-write.fio b/tools/testing/dontcache-bench/fio-jobs/multi-write.fio
new file mode 100644
index 000000000000..8fc0770f5860
--- /dev/null
+++ b/tools/testing/dontcache-bench/fio-jobs/multi-write.fio
@@ -0,0 +1,9 @@
+[global]
+ioengine=io_uring
+direct=0
+bs=1M
+numjobs=1
+time_based=0
+rw=write
+
+[multiwrite]
diff --git a/tools/testing/dontcache-bench/fio-jobs/noisy-writer.fio b/tools/testing/dontcache-bench/fio-jobs/noisy-writer.fio
new file mode 100644
index 000000000000..4524eebd4642
--- /dev/null
+++ b/tools/testing/dontcache-bench/fio-jobs/noisy-writer.fio
@@ -0,0 +1,12 @@
+[global]
+ioengine=io_uring
+direct=0
+bs=1M
+numjobs=1
+time_based=0
+rw=write
+log_avg_msec=1000
+write_bw_log=noisywriter
+write_lat_log=noisywriter
+
+[noisywriter]
diff --git a/tools/testing/dontcache-bench/fio-jobs/rand-read.fio b/tools/testing/dontcache-bench/fio-jobs/rand-read.fio
new file mode 100644
index 000000000000..e281fa82b86a
--- /dev/null
+++ b/tools/testing/dontcache-bench/fio-jobs/rand-read.fio
@@ -0,0 +1,13 @@
+[global]
+ioengine=io_uring
+direct=0
+bs=4k
+numjobs=1
+iodepth=16
+time_based=0
+rw=randread
+log_avg_msec=1000
+write_bw_log=randread
+write_lat_log=randread
+
+[randread]
diff --git a/tools/testing/dontcache-bench/fio-jobs/rand-write.fio b/tools/testing/dontcache-bench/fio-jobs/rand-write.fio
new file mode 100644
index 000000000000..cf53bc6f14b9
--- /dev/null
+++ b/tools/testing/dontcache-bench/fio-jobs/rand-write.fio
@@ -0,0 +1,13 @@
+[global]
+ioengine=io_uring
+direct=0
+bs=4k
+numjobs=1
+iodepth=16
+time_based=0
+rw=randwrite
+log_avg_msec=1000
+write_bw_log=randwrite
+write_lat_log=randwrite
+
+[randwrite]
diff --git a/tools/testing/dontcache-bench/fio-jobs/seq-read.fio b/tools/testing/dontcache-bench/fio-jobs/seq-read.fio
new file mode 100644
index 000000000000..ef87921465a7
--- /dev/null
+++ b/tools/testing/dontcache-bench/fio-jobs/seq-read.fio
@@ -0,0 +1,13 @@
+[global]
+ioengine=io_uring
+direct=0
+bs=1M
+numjobs=1
+iodepth=16
+time_based=0
+rw=read
+log_avg_msec=1000
+write_bw_log=seqread
+write_lat_log=seqread
+
+[seqread]
diff --git a/tools/testing/dontcache-bench/fio-jobs/seq-write.fio b/tools/testing/dontcache-bench/fio-jobs/seq-write.fio
new file mode 100644
index 000000000000..da3082f9b391
--- /dev/null
+++ b/tools/testing/dontcache-bench/fio-jobs/seq-write.fio
@@ -0,0 +1,13 @@
+[global]
+ioengine=io_uring
+direct=0
+bs=1M
+numjobs=1
+iodepth=16
+time_based=0
+rw=write
+log_avg_msec=1000
+write_bw_log=seqwrite
+write_lat_log=seqwrite
+
+[seqwrite]
diff --git a/tools/testing/dontcache-bench/scripts/parse-results.sh b/tools/testing/dontcache-bench/scripts/parse-results.sh
new file mode 100755
index 000000000000..0427d411db04
--- /dev/null
+++ b/tools/testing/dontcache-bench/scripts/parse-results.sh
@@ -0,0 +1,238 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Parse fio JSON output and generate comparison tables.
+#
+# Usage: ./parse-results.sh <results-dir>
+
+set -euo pipefail
+
+if [ $# -lt 1 ]; then
+	echo "Usage: $0 <results-dir>"
+	exit 1
+fi
+
+RESULTS_DIR="$1"
+
+if ! command -v jq &>/dev/null; then
+	echo "ERROR: jq is required"
+	exit 1
+fi
+
+# Extract metrics from a single fio JSON result
+extract_metrics() {
+	local json_file=$1
+	local rw_type=$2  # read or write
+
+	if [ ! -f "$json_file" ]; then
+		echo "N/A N/A N/A N/A N/A N/A"
+		return
+	fi
+
+	jq -r --arg rw "$rw_type" '
+		.jobs[0][$rw] as $d |
+		[
+			(($d.bw // 0) / 1024 | . * 10 | round / 10),    # MB/s
+			($d.iops // 0),                                    # IOPS
+			((($d.clat_ns.mean // 0) / 1000) | . * 10 | round / 10), # avg lat us
+			(($d.clat_ns.percentile["50.000000"] // 0) / 1000), # p50 us
+			(($d.clat_ns.percentile["99.000000"] // 0) / 1000), # p99 us
+			(($d.clat_ns.percentile["99.900000"] // 0) / 1000)  # p99.9 us
+		] | @tsv
+	' "$json_file" 2>/dev/null || echo "N/A N/A N/A N/A N/A N/A"
+}
+
+# Extract server CPU from a vmstat log: prints the mean of column 14 (sy), or N/A
+extract_cpu() {
+	local vmstat_log=$1
+	if [ ! -f "$vmstat_log" ]; then
+		echo "N/A"
+		return
+	fi
+	# vmstat re-prints its header lines periodically; average numeric rows only
+	awk '$14 ~ /^[0-9]+$/ {sum+=$14; n++} END {if(n>0) printf "%.1f", sum/n; else print "N/A"}' \
+		"$vmstat_log" 2>/dev/null || echo "N/A"
+}
+
+# Extract peak dirty pages from meminfo log
+extract_peak_dirty() {
+	local meminfo_log=$1
+	if [ ! -f "$meminfo_log" ]; then
+		echo "N/A"
+		return
+	fi
+	grep "^Dirty:" "$meminfo_log" | awk '{print $2}' | sort -n | tail -1 || echo "N/A"
+}
+
+# Extract peak cached from meminfo log
+extract_peak_cached() {
+	local meminfo_log=$1
+	if [ ! -f "$meminfo_log" ]; then
+		echo "N/A"
+		return
+	fi
+	grep "^Cached:" "$meminfo_log" | awk '{print $2}' | sort -n | tail -1 || echo "N/A"
+}
+
+print_separator() {
+	printf '%*s\n' 120 '' | tr ' ' '-'
+}
+
+########################################################################
+# Deliverable 1: Single-client results
+########################################################################
+echo ""
+echo "=================================================================="
+echo "  Deliverable 1: Single-Client fio Benchmarks"
+echo "=================================================================="
+echo ""
+
+for workload in seq-write rand-write seq-read rand-read; do
+	case $workload in
+	seq-write|rand-write) rw_type="write" ;;
+	seq-read|rand-read)   rw_type="read" ;;
+	esac
+
+	echo "--- $workload ---"
+	printf "%-16s %10s %10s %10s %10s %10s %10s %10s %12s %12s\n" \
+		"Mode" "MB/s" "IOPS" "Avg(us)" "p50(us)" "p99(us)" "p99.9(us)" "Sys CPU%" "PeakDirty(kB)" "PeakCache(kB)"
+	print_separator
+
+	for mode in buffered dontcache direct; do
+		dir="${RESULTS_DIR}/${workload}/${mode}"
+		json_file=$(find "$dir" -name '*.json' -not -name 'client*' 2>/dev/null | head -1 || true)
+		if [ -z "$json_file" ]; then
+			printf "%-16s %10s\n" "$mode" "(no data)"
+			continue
+		fi
+
+		read -r mbps iops avg_lat p50 p99 p999 <<< \
+			"$(extract_metrics "$json_file" "$rw_type")"
+		cpu=$(extract_cpu "${dir}/vmstat.log")
+		dirty=$(extract_peak_dirty "${dir}/meminfo.log")
+		cached=$(extract_peak_cached "${dir}/meminfo.log")
+
+		printf "%-16s %10s %10s %10s %10s %10s %10s %10s %12s %12s\n" \
+			"$mode" "$mbps" "$iops" "$avg_lat" "$p50" "$p99" "$p999" \
+			"$cpu" "${dirty:-N/A}" "${cached:-N/A}"
+	done
+	echo ""
+done
+
+########################################################################
+# Deliverable 2: Multi-client results
+########################################################################
+echo "=================================================================="
+echo "  Deliverable 2: Noisy-Neighbor Benchmarks"
+echo "=================================================================="
+echo ""
+
+# Scenario A: Multiple writers
+# Prints one row per client*.json, then the summed write bandwidth for each mode.
+echo "--- Scenario A: Multiple Writers ---"
+for mode in buffered dontcache direct; do
+	dir="${RESULTS_DIR}/multi-write/${mode}"
+	if [ ! -d "$dir" ]; then
+		continue
+	fi
+
+	echo "  Mode: $mode"
+	printf "  %-10s %10s %10s %10s %10s %10s %10s\n" \
+		"Client" "MB/s" "IOPS" "Avg(us)" "p50(us)" "p99(us)" "p99.9(us)"
+
+	total_bw=0
+	for json_file in "${dir}"/client*.json; do
+		[ -f "$json_file" ] || continue
+		client=$(basename "$json_file" .json)
+		read -r mbps iops avg_lat p50 p99 p999 <<< \
+			"$(extract_metrics "$json_file" "write")"
+		printf "  %-10s %10s %10s %10s %10s %10s %10s\n" \
+			"$client" "$mbps" "$iops" "$avg_lat" "$p50" "$p99" "$p999"
+		# Sum with awk rather than bc: no unchecked dependency, and "N/A" adds 0
+		total_bw=$(awk -v t="$total_bw" -v m="${mbps:-0}" 'BEGIN { printf "%.1f", t + m }')
+	done
+
+	cpu=$(extract_cpu "${dir}/vmstat.log")
+	dirty=$(extract_peak_dirty "${dir}/meminfo.log")
+	printf "  Aggregate BW: %s MB/s | Sys CPU: %s%% | Peak Dirty: %s kB\n" \
+		"$total_bw" "$cpu" "${dirty:-N/A}"
+	echo ""
+done
+
+# Scenario C: Noisy neighbor
+echo "--- Scenario C: Noisy Writer + Latency-Sensitive Readers ---"
+for mode in buffered dontcache direct; do
+	dir="${RESULTS_DIR}/noisy-neighbor/${mode}"
+	if [ ! -d "$dir" ]; then
+		continue
+	fi
+
+	echo "  Mode: $mode"
+	printf "  %-14s %10s %10s %10s %10s %10s %10s\n" \
+		"Job" "MB/s" "IOPS" "Avg(us)" "p50(us)" "p99(us)" "p99.9(us)"
+
+	# Writer
+	if [ -f "${dir}/noisy_writer.json" ]; then
+		read -r mbps iops avg_lat p50 p99 p999 <<< \
+			"$(extract_metrics "${dir}/noisy_writer.json" "write")"
+		printf "  %-14s %10s %10s %10s %10s %10s %10s\n" \
+			"Bulk writer" "$mbps" "$iops" "$avg_lat" "$p50" "$p99" "$p999"
+	fi
+
+	# Readers
+	for json_file in "${dir}"/reader*.json; do
+		[ -f "$json_file" ] || continue
+		reader=$(basename "$json_file" .json)
+		read -r mbps iops avg_lat p50 p99 p999 <<< \
+			"$(extract_metrics "$json_file" "read")"
+		printf "  %-14s %10s %10s %10s %10s %10s %10s\n" \
+			"$reader" "$mbps" "$iops" "$avg_lat" "$p50" "$p99" "$p999"
+	done
+
+	cpu=$(extract_cpu "${dir}/vmstat.log")
+	dirty=$(extract_peak_dirty "${dir}/meminfo.log")
+	printf "  Sys CPU: %s%% | Peak Dirty: %s kB\n" "$cpu" "${dirty:-N/A}"
+	echo ""
+done
+
+# Scenario D: Mixed-mode noisy neighbor
+echo "--- Scenario D: Mixed-Mode Noisy Writer + Readers ---"
+for dir in "${RESULTS_DIR}"/noisy-neighbor-mixed/*/; do
+	[ -d "$dir" ] || continue
+	label=$(basename "$dir")
+
+	echo "  Mode: $label"
+	printf "  %-14s %10s %10s %10s %10s %10s %10s\n" \
+		"Job" "MB/s" "IOPS" "Avg(us)" "p50(us)" "p99(us)" "p99.9(us)"
+
+	# Writer
+	if [ -f "${dir}/noisy_writer.json" ]; then
+		read -r mbps iops avg_lat p50 p99 p999 <<< \
+			"$(extract_metrics "${dir}/noisy_writer.json" "write")"
+		printf "  %-14s %10s %10s %10s %10s %10s %10s\n" \
+			"Bulk writer" "$mbps" "$iops" "$avg_lat" "$p50" "$p99" "$p999"
+	fi
+
+	# Readers
+	for json_file in "${dir}"/reader*.json; do
+		[ -f "$json_file" ] || continue
+		reader=$(basename "$json_file" .json)
+		read -r mbps iops avg_lat p50 p99 p999 <<< \
+			"$(extract_metrics "$json_file" "read")"
+		printf "  %-14s %10s %10s %10s %10s %10s %10s\n" \
+			"$reader" "$mbps" "$iops" "$avg_lat" "$p50" "$p99" "$p999"
+	done
+
+	cpu=$(extract_cpu "${dir}/vmstat.log")
+	dirty=$(extract_peak_dirty "${dir}/meminfo.log")
+	printf "  Sys CPU: %s%% | Peak Dirty: %s kB\n" "$cpu" "${dirty:-N/A}"
+	echo ""
+done
+
+echo "=================================================================="
+echo "  System Info"
+echo "=================================================================="
+if [ -f "${RESULTS_DIR}/sysinfo.txt" ]; then
+	head -6 "${RESULTS_DIR}/sysinfo.txt"
+fi
+echo ""
diff --git a/tools/testing/dontcache-bench/scripts/run-benchmarks.sh b/tools/testing/dontcache-bench/scripts/run-benchmarks.sh
new file mode 100755
index 000000000000..11bf400ef092
--- /dev/null
+++ b/tools/testing/dontcache-bench/scripts/run-benchmarks.sh
@@ -0,0 +1,562 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Local filesystem I/O mode benchmark suite.
+#
+# Runs the same test matrix as the nfsd-io-bench run-benchmarks.sh, but on a
+# local filesystem, using fio's io_uring engine with the RWF_DONTCACHE flag
+# instead of NFSD's debugfs mode knobs.
+# Usage: ./run-benchmarks.sh -t <test-dir> [options]
+#   -t <dir>    Test directory (must be on a filesystem supporting FOP_DONTCACHE)
+#   -s <size>   File size (default: 2x RAM)
+#   -f <path>   Path to fio binary (default: fio in PATH)
+#   -o <dir>    Output directory for results (default: ./results/local-<timestamp>)
+#   -D, -p      Dontcache mode only; profile lock contention with perf lock
+#   -d          Dry run (print commands without executing)
+
+set -euo pipefail
+
+# Defaults
+TEST_DIR=""
+SIZE=""
+FIO_BIN="fio"
+RESULTS_DIR=""
+DRY_RUN=0
+MODES="0 1 2"
+PERF_LOCK=0
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+FIO_JOBS_DIR="${SCRIPT_DIR}/../fio-jobs"
+
+usage() {	# Print the option summary and exit with status 1 (also used for -h)
+	echo "Usage: $0 -t <test-dir> [-s <size>] [-f <fio-path>] [-o <output-dir>] [-D] [-p] [-d]"
+	echo ""
+	echo "  -t <dir>    Test directory (required, must support RWF_DONTCACHE)"
+	echo "  -s <size>   File size (default: 2x RAM)"
+	echo "  -f <path>   Path to fio binary (default: fio)"
+	echo "  -o <dir>    Output directory (default: ./results/local-<timestamp>)"
+	echo "  -D          Dontcache only (skip buffered and direct tests)"
+	echo "  -p          Profile kernel lock contention with perf lock"
+	echo "  -d          Dry run"
+	exit 1
+}
+
+while getopts "t:s:f:o:Dpdh" opt; do
+	case $opt in
+	t) TEST_DIR="$OPTARG" ;;
+	s) SIZE="$OPTARG" ;;
+	f) FIO_BIN="$OPTARG" ;;
+	o) RESULTS_DIR="$OPTARG" ;;
+	D) MODES="1" ;;
+	p) PERF_LOCK=1 ;;
+	d) DRY_RUN=1 ;;
+	h) usage ;;
+	*) usage ;;
+	esac
+done
+
+if [ -z "$TEST_DIR" ]; then
+	echo "ERROR: -t <test-dir> is required"
+	usage
+fi
+
+# Auto-size to 2x RAM if not specified
+if [ -z "$SIZE" ]; then
+	mem_kb=$(awk '/MemTotal/ {print $2}' /proc/meminfo)
+	SIZE="$(( mem_kb * 2 / 1024 ))M"
+fi
+
+if [ -z "$RESULTS_DIR" ]; then
+	RESULTS_DIR="./results/local-$(date +%Y%m%d-%H%M%S)"
+fi
+
+mkdir -p "$RESULTS_DIR"
+
+log() {
+	echo "[$(date '+%H:%M:%S')] $*"
+}
+
+run_cmd() {
+	if [ "$DRY_RUN" -eq 1 ]; then
+		echo "  [DRY RUN] $*"
+	else
+		"$@"
+	fi
+}
+
+# I/O mode definitions:
+#   buffered:  direct=0, uncached=0
+#   dontcache: direct=0, uncached=1
+#   direct:    direct=1, uncached=0
+#
+# Mode name from numeric value
+mode_name() {
+	case $1 in
+	0) echo "buffered" ;;
+	1) echo "dontcache" ;;
+	2) echo "direct" ;;
+	esac
+}
+
+# Return fio command-line flags for a given mode.
+# "direct" is a standard fio option and works on the command line.
+# "uncached" is an io_uring engine option that must be in the job file,
+# so we inject it via make_job_file() below.
+mode_fio_args() {
+	case $1 in
+	0) echo "--direct=0" ;;           # buffered
+	1) echo "--direct=0" ;;           # dontcache
+	2) echo "--direct=1" ;;           # direct
+	esac
+}
+
+# Return the uncached= value for a given mode.
+mode_uncached() {
+	case $1 in
+	0) echo "0" ;;
+	1) echo "1" ;;
+	2) echo "0" ;;
+	esac
+}
+
+# Create a temporary job file with uncached=N injected into [global].
+# For uncached=0 (buffered/direct), return the original file unchanged.
+make_job_file() {
+	local job_file=$1
+	local uncached=$2
+
+	if [ "$uncached" -eq 0 ]; then
+		echo "$job_file"
+		return
+	fi
+
+	local tmp
+	tmp=$(mktemp)
+	sed "/^\[global\]/a uncached=${uncached}" "$job_file" > "$tmp"
+	echo "$tmp"
+}
+
+drop_caches() {
+	run_cmd bash -c "sync && echo 3 > /proc/sys/vm/drop_caches"
+}
+
+# perf lock profiling — uses BPF-based live contention tracing
+PERF_LOCK_PID=""
+
+start_perf_lock() {
+	local outdir=$1
+	# Only trace when -p was given, and never on a dry run (perf bypasses run_cmd)
+	if [ "$PERF_LOCK" -ne 1 ] || [ "$DRY_RUN" -eq 1 ]; then
+		return
+	fi
+	# Runs in the background; the PID is saved so stop_perf_lock can signal it
+	log "Starting perf lock contention tracing"
+	perf lock contention -a -b --max-stack 8 \
+		> "${outdir}/perf-lock-contention.txt" 2>&1 &
+	PERF_LOCK_PID=$!
+}
+
+stop_perf_lock() {
+	local outdir=$1
+
+	if [ -z "$PERF_LOCK_PID" ]; then
+		return
+	fi
+
+	log "Stopping perf lock contention tracing"
+	kill -TERM "$PERF_LOCK_PID" 2>/dev/null || true
+	wait "$PERF_LOCK_PID" 2>/dev/null || true
+	PERF_LOCK_PID=""
+}
+
+# Background monitors
+VMSTAT_PID=""
+IOSTAT_PID=""
+MEMINFO_PID=""
+
+start_monitors() {
+	local outdir=$1
+	log "Starting monitors in $outdir"
+	run_cmd vmstat 1 > "${outdir}/vmstat.log" 2>&1 &
+	VMSTAT_PID=$!
+	run_cmd iostat -x 1 > "${outdir}/iostat.log" 2>&1 &
+	IOSTAT_PID=$!
+	(while true; do
+		echo "=== $(date '+%s') ==="
+		cat /proc/meminfo
+		sleep 1
+	done) > "${outdir}/meminfo.log" 2>&1 &
+	MEMINFO_PID=$!
+}
+
+stop_monitors() {
+	log "Stopping monitors"
+	kill "$VMSTAT_PID" "$IOSTAT_PID" "$MEMINFO_PID" 2>/dev/null || true
+	wait "$VMSTAT_PID" "$IOSTAT_PID" "$MEMINFO_PID" 2>/dev/null || true
+}
+
+cleanup_test_files() {
+	local filepath="${TEST_DIR}/$1"
+	log "Cleaning up $filepath"
+	run_cmd rm -f "$filepath"
+}
+
+# Run a single fio benchmark, writing <job>.json and monitor logs into the output dir.
+#   $1 job file  $2 output dir  $3 file name under TEST_DIR  $4 size (default: SIZE)
+#   $5 "keep" leaves the test file in place  $6 extra fio args  $7 uncached= value
+run_fio() {
+	local job_file=$1
+	local outdir=$2
+	local filename=$3
+	local fio_size=${4:-$SIZE}
+	local keep=${5:-}
+	local extra_args=${6:-}
+	local uncached=${7:-0}
+	# Inject uncached=N into the job file if needed
+	local actual_job
+	actual_job=$(make_job_file "$job_file" "$uncached")
+	local job_name
+	job_name=$(basename "$job_file" .fio)
+
+	log "Running fio job: $job_name -> $outdir (file=${TEST_DIR}/$filename size=$fio_size)"
+	mkdir -p "$outdir"
+	drop_caches
+	start_monitors "$outdir"
+	# Profile every measured run. "keep" cannot be the test: read benchmarks pass
+	# it too, so only precreate/setup runs (outdir named precreate*) are skipped.
+	local profile=1
+	case "${outdir##*/}" in precreate*) profile=0 ;; esac
+	[ "$profile" -eq 1 ] && start_perf_lock "$outdir"
+
+	# shellcheck disable=SC2086
+	run_cmd "$FIO_BIN" "$actual_job" \
+		--output-format=json \
+		--output="${outdir}/${job_name}.json" \
+		--filename="${TEST_DIR}/$filename" \
+		--size="$fio_size" \
+		$extra_args
+
+	[ "$profile" -eq 1 ] && stop_perf_lock "$outdir"
+	stop_monitors
+	log "Finished: $job_name"
+	# Clean up temp job file if one was created
+	[ "$actual_job" != "$job_file" ] && rm -f "$actual_job"
+	if [ "$keep" != "keep" ]; then
+		cleanup_test_files "$filename"
+	fi
+}
+
+########################################################################
+# Preflight
+########################################################################
+preflight() {
+	log "=== Preflight checks ==="
+
+	if ! command -v "$FIO_BIN" &>/dev/null; then
+		echo "ERROR: fio not found at $FIO_BIN"
+		exit 1
+	fi
+
+	if [ ! -d "$TEST_DIR" ]; then
+		echo "ERROR: Test directory $TEST_DIR does not exist"
+		exit 1
+	fi
+
+	# Quick check that RWF_DONTCACHE works on this filesystem
+	local testfile="${TEST_DIR}/.dontcache_test"
+	if ! "$FIO_BIN" --name=test --ioengine=io_uring --rw=write \
+		--bs=4k --size=4k --direct=0 --uncached=1 \
+		--filename="$testfile" 2>/dev/null; then
+		echo "WARNING: RWF_DONTCACHE may not be supported on $TEST_DIR"
+		echo "         (filesystem must support FOP_DONTCACHE)"
+	fi
+	rm -f "$testfile"
+
+	log "Test directory: $TEST_DIR"
+	log "File size: $SIZE"
+	log "fio binary: $FIO_BIN"
+	log "Results: $RESULTS_DIR"
+
+	# Record system info
+	{
+		echo "Timestamp: $(date +%Y%m%d-%H%M%S)"
+		echo "Kernel: $(uname -r)"
+		echo "Hostname: $(hostname)"
+		echo "Filesystem: $(df -T "$TEST_DIR" | tail -1 | awk '{print $2}')"
+		echo "File size: $SIZE"
+		echo "Test dir: $TEST_DIR"
+	} > "${RESULTS_DIR}/sysinfo.txt"
+}
+
+########################################################################
+# Deliverable 1: Single-client benchmarks
+########################################################################
+run_deliverable1() {
+	log "=========================================="
+	log "Deliverable 1: Single-client benchmarks"
+	log "=========================================="
+	# Runs seq-write, rand-write, seq-read and rand-read once per mode in MODES.
+	# Sequential write
+	for mode in $MODES; do
+		local mname
+		mname=$(mode_name $mode)
+		local fio_args
+		fio_args=$(mode_fio_args $mode)
+
+		drop_caches
+		run_fio "${FIO_JOBS_DIR}/seq-write.fio" \
+			"${RESULTS_DIR}/seq-write/${mname}" \
+			"seq-write_testfile" "$SIZE" "" "$fio_args" \
+			"$(mode_uncached $mode)"
+	done
+
+	# Random write
+	for mode in $MODES; do
+		local mname
+		mname=$(mode_name $mode)
+		local fio_args
+		fio_args=$(mode_fio_args $mode)
+
+		drop_caches
+		run_fio "${FIO_JOBS_DIR}/rand-write.fio" \
+			"${RESULTS_DIR}/rand-write/${mname}" \
+			"rand-write_testfile" "$SIZE" "" "$fio_args" \
+			"$(mode_uncached $mode)"
+	done
+
+	# Sequential read — pre-create file, then read with each mode
+	log "Pre-creating sequential read test file"
+	run_fio "${FIO_JOBS_DIR}/seq-write.fio" \
+		"${RESULTS_DIR}/seq-read/precreate" \
+		"seq-read_testfile" "$SIZE" "keep"
+
+	for rmode in $MODES; do
+		local mname
+		mname=$(mode_name $rmode)
+		local fio_args
+		fio_args=$(mode_fio_args $rmode)
+		# Keep the file for the next mode; the last mode in MODES (even with -D) removes it
+		local keep="keep"
+		[ "$rmode" = "${MODES##* }" ] && keep=""
+		drop_caches
+		run_fio "${FIO_JOBS_DIR}/seq-read.fio" \
+			"${RESULTS_DIR}/seq-read/${mname}" \
+			"seq-read_testfile" "$SIZE" "$keep" "$fio_args" \
+			"$(mode_uncached $rmode)"
+	done
+
+	# Random read — pre-create file, then read with each mode
+	log "Pre-creating random read test file"
+	run_fio "${FIO_JOBS_DIR}/seq-write.fio" \
+		"${RESULTS_DIR}/rand-read/precreate" \
+		"rand-read_testfile" "$SIZE" "keep"
+
+	for rmode in $MODES; do
+		local mname
+		mname=$(mode_name $rmode)
+		local fio_args
+		fio_args=$(mode_fio_args $rmode)
+		# Keep the file for the next mode; the last mode in MODES (even with -D) removes it
+		local keep="keep"
+		[ "$rmode" = "${MODES##* }" ] && keep=""
+		drop_caches
+		run_fio "${FIO_JOBS_DIR}/rand-read.fio" \
+			"${RESULTS_DIR}/rand-read/${mname}" \
+			"rand-read_testfile" "$SIZE" "$keep" "$fio_args" \
+			"$(mode_uncached $rmode)"
+	done
+}
+
+########################################################################
+# Deliverable 2: Multi-client tests
+########################################################################
+run_deliverable2() {
+	log "=========================================="
+	log "Deliverable 2: Noisy-neighbor benchmarks"
+	log "=========================================="
+
+	local num_clients=4
+	local client_size
+	local mem_kb
+	mem_kb=$(awk '/MemTotal/ {print $2}' /proc/meminfo)
+	client_size="$(( mem_kb / 1024 / num_clients ))M"
+
+	# Scenario A: Multiple writers
+	for mode in $MODES; do
+		local mname
+		mname=$(mode_name $mode)
+		local fio_args
+		fio_args=$(mode_fio_args $mode)
+		local uncached
+		uncached=$(mode_uncached $mode)
+		local actual_job
+		actual_job=$(make_job_file "${FIO_JOBS_DIR}/multi-write.fio" "$uncached")
+		local outdir="${RESULTS_DIR}/multi-write/${mname}"
+		mkdir -p "$outdir"
+
+		drop_caches
+		start_monitors "$outdir"
+		start_perf_lock "$outdir"
+
+		local pids=()
+		for i in $(seq 1 $num_clients); do
+			# shellcheck disable=SC2086
+			run_cmd "$FIO_BIN" "$actual_job" \
+				--output-format=json \
+				--output="${outdir}/client${i}.json" \
+				--filename="${TEST_DIR}/client${i}_testfile" \
+				--size="$client_size" \
+				$fio_args &
+			pids+=($!)
+		done
+
+		local rc=0
+		for pid in "${pids[@]}"; do
+			wait "$pid" || rc=$?
+		done
+
+		stop_perf_lock "$outdir"
+		stop_monitors
+		[ $rc -ne 0 ] && log "WARNING: some fio jobs exited non-zero"
+
+		[ "$actual_job" != "${FIO_JOBS_DIR}/multi-write.fio" ] && rm -f "$actual_job"
+		for i in $(seq 1 $num_clients); do
+			cleanup_test_files "client${i}_testfile"
+		done
+	done
+
+	# Scenario C: Noisy writer + latency-sensitive readers
+	for mode in $MODES; do
+		local mname
+		mname=$(mode_name $mode)
+		local fio_args
+		fio_args=$(mode_fio_args $mode)
+		local uncached
+		uncached=$(mode_uncached $mode)
+		local writer_job
+		writer_job=$(make_job_file "${FIO_JOBS_DIR}/noisy-writer.fio" "$uncached")
+		local reader_job
+		reader_job=$(make_job_file "${FIO_JOBS_DIR}/lat-reader.fio" "$uncached")
+		local outdir="${RESULTS_DIR}/noisy-neighbor/${mname}"
+		mkdir -p "$outdir"
+
+		# Pre-create read files
+		for i in $(seq 1 $(( num_clients - 1 ))); do
+			log "Pre-creating read file for reader $i"
+			run_fio "${FIO_JOBS_DIR}/multi-write.fio" \
+				"${outdir}/precreate_reader${i}" \
+				"reader${i}_readfile" \
+				"512M" "keep"
+		done
+		drop_caches
+		start_monitors "$outdir"
+		start_perf_lock "$outdir"
+
+		# Noisy writer
+		# shellcheck disable=SC2086
+		run_cmd "$FIO_BIN" "$writer_job" \
+			--output-format=json \
+			--output="${outdir}/noisy_writer.json" \
+			--filename="${TEST_DIR}/bulk_testfile" \
+			--size="$SIZE" \
+			$fio_args &
+		local writer_pid=$!
+
+		# Latency-sensitive readers
+		local reader_pids=()
+		for i in $(seq 1 $(( num_clients - 1 ))); do
+			# shellcheck disable=SC2086
+			run_cmd "$FIO_BIN" "$reader_job" \
+				--output-format=json \
+				--output="${outdir}/reader${i}.json" \
+				--filename="${TEST_DIR}/reader${i}_readfile" \
+				--size="512M" \
+				$fio_args &
+			reader_pids+=($!)
+		done
+
+		local rc=0
+		wait "$writer_pid" || rc=$?
+		for pid in "${reader_pids[@]}"; do
+			wait "$pid" || rc=$?
+		done
+
+		stop_perf_lock "$outdir"
+		stop_monitors
+		[ $rc -ne 0 ] && log "WARNING: some fio jobs exited non-zero"
+
+		[ "$writer_job" != "${FIO_JOBS_DIR}/noisy-writer.fio" ] && rm -f "$writer_job"
+		[ "$reader_job" != "${FIO_JOBS_DIR}/lat-reader.fio" ] && rm -f "$reader_job"
+		cleanup_test_files "bulk_testfile"
+		for i in $(seq 1 $(( num_clients - 1 ))); do
+			cleanup_test_files "reader${i}_readfile"
+		done
+	done
+
+	# Scenario D: Mixed-mode noisy neighbor
+	# dontcache writes + buffered reads
+	local outdir="${RESULTS_DIR}/noisy-neighbor-mixed/dontcache-w_buffered-r"
+	mkdir -p "$outdir"
+	local writer_job
+	writer_job=$(make_job_file "${FIO_JOBS_DIR}/noisy-writer.fio" 1)
+
+	for i in $(seq 1 $(( num_clients - 1 ))); do
+		log "Pre-creating read file for reader $i"
+		run_fio "${FIO_JOBS_DIR}/multi-write.fio" \
+			"${outdir}/precreate_reader${i}" \
+			"reader${i}_readfile" \
+			"512M" "keep"
+	done
+	drop_caches
+	start_monitors "$outdir"
+	start_perf_lock "$outdir"
+
+	# Writer with dontcache
+	run_cmd "$FIO_BIN" "$writer_job" \
+		--output-format=json \
+		--output="${outdir}/noisy_writer.json" \
+		--filename="${TEST_DIR}/bulk_testfile" \
+		--size="$SIZE" \
+		--direct=0 &
+	local writer_pid=$!
+
+	# Readers with buffered (no uncached flag)
+	local reader_pids=()
+	for i in $(seq 1 $(( num_clients - 1 ))); do
+		run_cmd "$FIO_BIN" "${FIO_JOBS_DIR}/lat-reader.fio" \
+			--output-format=json \
+			--output="${outdir}/reader${i}.json" \
+			--filename="${TEST_DIR}/reader${i}_readfile" \
+			--size="512M" \
+			--direct=0 &
+		reader_pids+=($!)
+	done
+
+	local rc=0
+	wait "$writer_pid" || rc=$?
+	for pid in "${reader_pids[@]}"; do
+		wait "$pid" || rc=$?
+	done
+
+	stop_perf_lock "$outdir"
+	stop_monitors
+	[ $rc -ne 0 ] && log "WARNING: some fio jobs exited non-zero"
+
+	[ "$writer_job" != "${FIO_JOBS_DIR}/noisy-writer.fio" ] && rm -f "$writer_job"
+	cleanup_test_files "bulk_testfile"
+	for i in $(seq 1 $(( num_clients - 1 ))); do
+		cleanup_test_files "reader${i}_readfile"
+	done
+}
+
+########################################################################
+# Main
+########################################################################
+preflight
+run_deliverable1
+run_deliverable2
+
+log "=========================================="
+log "All benchmarks complete."
+log "Results in: $RESULTS_DIR"
+log "Parse with: scripts/parse-results.sh $RESULTS_DIR"
+log "=========================================="

-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH v3 2/4] mm: kick writeback flusher for IOCB_DONTCACHE with targeted dirty tracking
From: Andrew Morton @ 2026-04-26 12:28 UTC (permalink / raw)
  To: Jeff Layton
  Cc: Alexander Viro, Christian Brauner, Jan Kara,
	Matthew Wilcox (Oracle), David Hildenbrand, Lorenzo Stoakes,
	Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Mike Snitzer, Jens Axboe,
	Ritesh Harjani, Christoph Hellwig, Kairui Song, Qi Zheng,
	Shakeel Butt, Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Chuck Lever,
	linux-fsdevel, linux-kernel, linux-nfs, linux-mm,
	linux-trace-kernel
In-Reply-To: <20260426-dontcache-v3-2-79eb37da9547@kernel.org>

Naive questions...

On Sun, 26 Apr 2026 07:56:08 -0400 Jeff Layton <jlayton@kernel.org> wrote:

> The IOCB_DONTCACHE writeback path in generic_write_sync() calls
> filemap_flush_range() on every write, submitting writeback inline in
> the writer's context.  Perf lock contention profiling shows the
> performance problem is not lock contention but the writeback submission
> work itself — walking the page tree and submitting I/O blocks the writer
> for milliseconds, inflating p99.9 latency from 23ms (buffered) to 93ms
> (dontcache).

So in the current case, when generic_write_sync() returns, all that
memory is written back and clean&reclaimable (or freed?), yes?

> Replace the inline filemap_flush_range() call with a flusher kick that
> drains dirty pages in the background.  This moves writeback submission
> completely off the writer's hot path.

Whereas after this change, that pagecache is probably still dirty,
unreclaimable, waiting for the flusher to do its thing?

So is there potential that the system will get all gummed up with
dirty, to-be-written-soon pagecache?  Is there something which limits
this buildup?

> ...
>
> dontcache-bench results on dual-socket Xeon Gold 6138 (80 CPUs, 256 GB
> RAM, Samsung MZ1LB1T9HALS 1.7 TB NVMe, local XFS, io_uring, file size
> ~503 GB, compared to a v6.19-ish baseline):
> 
>   Single-client sequential write (MB/s):
>                        baseline    patched     change
>   buffered              1449.8     1440.1      -0.7%
>   dontcache             1347.9     1461.5      +8.4%
>   direct                1450.0     1440.1      -0.7%
> 
>   Single-client sequential write latency (us):
>                        baseline    patched     change
>   dontcache p50         3031.0    10551.3    +248.1%
>   dontcache p99        74973.2    21626.9     -71.2%
>   dontcache p99.9      85459.0    23199.7     -72.9%
> 
>   Single-client random write (MB/s):
>                        baseline    patched     change
>   dontcache              284.2      295.4      +3.9%
> 
>   Single-client random write p99.9 latency (us):
>                        baseline    patched     change
>   dontcache             2277.4      872.4     -61.7%
> 
>   Multi-writer aggregate throughput (MB/s):
>                        baseline    patched     change
>   buffered              1619.5     1611.2      -0.5%
>   dontcache             1281.1     1629.4     +27.2%
>   direct                1545.4     1609.4      +4.1%
> 
>   Mixed-mode noisy neighbor (dontcache writer + buffered readers):
>                        baseline    patched     change
>   writer (MB/s)         1297.6     1471.1     +13.4%
>   readers avg (MB/s)     855.0      462.4     -45.9%

These results look ambiguous.  Sometimes better, sometimes worse?

> nfsd-io-bench results on same hardware (XFS on NVMe, NFSv3 via fio
> NFS engine with libnfs, 1024 NFSD threads, pool_mode=pernode,
> file size ~502 GB, compared to v6.19-ish baseline):
> 
>   Single-client sequential write (MB/s):
>                        baseline    patched     change
>   buffered              4844.2     4653.4      -3.9%
>   dontcache             3028.3     3723.1     +22.9%
>   direct                 957.6      987.8      +3.2%
> 
>   Single-client sequential write p99.9 latency (us):
>                        baseline    patched     change
>   dontcache            759169.0   175112.2     -76.9%
> 
>   Single-client random write (MB/s):
>                        baseline    patched     change
>   dontcache              590.0     1561.0    +164.6%
> 
>   Multi-writer aggregate throughput (MB/s):
>                        baseline    patched     change
>   buffered              9636.3     9422.9      -2.2%
>   dontcache             1894.9     9442.6    +398.3%
>   direct                 809.6      975.1     +20.4%
> 
>   Noisy neighbor (dontcache writer + random readers):
>                        baseline    patched     change
>   writer (MB/s)         1854.5     4063.6    +119.1%
>   readers avg (MB/s)     131.2      101.6     -22.5%

Ditto but less so.

> The NFS results show even larger improvements than the local benchmarks.
> Multi-writer dontcache throughput improves nearly 5x, matching buffered
> I/O. Dirty page footprint drops 85-95% in sequential workloads vs.
> buffered.

It sounds like you like the results, so OK ;)


^ permalink raw reply

* Re: [PATCH v3 3/4] testing: add nfsd-io-bench NFS server benchmark suite
From: Andrew Morton @ 2026-04-26 12:34 UTC (permalink / raw)
  To: Jeff Layton
  Cc: Alexander Viro, Christian Brauner, Jan Kara,
	Matthew Wilcox (Oracle), David Hildenbrand, Lorenzo Stoakes,
	Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Mike Snitzer, Jens Axboe,
	Ritesh Harjani, Christoph Hellwig, Kairui Song, Qi Zheng,
	Shakeel Butt, Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Chuck Lever,
	linux-fsdevel, linux-kernel, linux-nfs, linux-mm,
	linux-trace-kernel
In-Reply-To: <20260426-dontcache-v3-3-79eb37da9547@kernel.org>

On Sun, 26 Apr 2026 07:56:09 -0400 Jeff Layton <jlayton@kernel.org> wrote:

> Add a benchmark suite for testing NFSD I/O mode performance using fio
> with the libnfs backend against an NFS server on localhost.  Tests
> buffered, dontcache, and direct I/O modes via NFSD debugfs controls.
> 
> Includes:
>  - fio job files for sequential/random read/write, multi-writer,
>    noisy-neighbor, and latency-sensitive reader workloads
>  - run-benchmarks.sh: orchestrates test matrix with mode switching
>  - parse-results.sh: extracts metrics from fio JSON output
>  - setup-server.sh: configures NFS export for testing
> 
> Assisted-by: Claude:claude-opus-4-6

OK, question.

>  10 files changed, 1024 insertions(+)

Seems that this code was largely machine-generated.  So I assume that
you're in possession of the scripts/prompts/whatever which were used to
generate this code.

(Can you please briefly describe the process which you used here?)

So how are we to maintain this?  Will other developers have to go in
and hack this machine-generated output by hand?  Or would it be better
to provide (in-tree) other developers with the means to regenerate this code,
presumably using Claude?

IOW, this feels a bit like shipping the .s file without giving us the .c
file!

^ permalink raw reply

* Re: [RFC PATCH 1/2] kernel/notifier: replace single-linked list with double-linked list for reverse traversal
From: Song Chen @ 2026-04-26 13:56 UTC (permalink / raw)
  To: Petr Mladek, Masami Hiramatsu
  Cc: chensong_2000, rafael, lenb, mturquette, sboyd, viresh.kumar, agk,
	snitzer, mpatocka, bmarzins, song, yukuai, linan122, jason.wessel,
	danielt, dianders, horms, davem, edumazet, kuba, pabeni, paulmck,
	frederic, mcgrof, petr.pavlu, da.gomez, samitolvanen, atomlin,
	jpoimboe, jikos, mbenes, joe.lawrence, rostedt, mark.rutland,
	mathieu.desnoyers, linux-modules, linux-kernel,
	linux-trace-kernel, linux-acpi, linux-clk, linux-pm,
	live-patching, dm-devel, linux-raid, kgdb-bugreport, netdev
In-Reply-To: <aec90caYZDHDAHgw@pathway.suse.cz>

Hi,

On 4/21/26 17:05, Petr Mladek wrote:
> On Mon 2026-04-20 14:44:29, Masami Hiramatsu wrote:
>> Hi Song,
>>
>> On Wed, 15 Apr 2026 15:01:37 +0800
>> chensong_2000@189.cn wrote:
>>
>>> From: Song Chen <chensong_2000@189.cn>
>>>
>>> The current notifier chain implementation uses a single-linked list
>>> (struct notifier_block *next), which only supports forward traversal
>>> in priority order. This makes it difficult to handle cleanup/teardown
>>> scenarios that require notifiers to be called in reverse priority order.
>>
>> What about introducing a new notification callback API that allows you
>> to describe dependencies between callback functions?
>>
>> For example, when registering a callback, you could register a string
>> as an ID and specify whether to call it before or after that ID,
>> or you could register a comparison function that is called when adding
>> to a list. (I prefer @name and @depends fields so that it can be easily
>> maintained.)
> 
> This looks too complex. It would make sense only
> when this API has more users.
> 
> Also this won't be enough for the ftrace/livepatch callbacks.
> They need to be ordered against each other. But they
> also need to be called before/after all other callbacks.
> For example, when the module is loaded:
> 
>     + 1st ftrace
>     + 2nd livepatch
>     + then other notifiers
> 
> See the commit c1bf08ac26e92122 ("ftrace: Be first to run code
> modification on modules").
> 
>> This would allow for better dependency building when adding to the list.
>   
>>>
>>> A concrete example is the ordering dependency between ftrace and
>>> livepatch during module load/unload. see the detail here [1].
>>
>> If this only concerns notification callback issues with the ftrace
>> and livepatch modules, it's far more robust to simply call the
>> necessary processing directly when the modules load and unload,
>> rather than registering notification callbacks externally.
>>
>> There are fprobe, kprobe and their trace-events, all of which use
>> ftrace as their foundation layer. In this case, I always need to
>> consider callback order when a module is unloaded.
>>
>> If ftrace is working as a part of module callbacks, it will conflict
>> with fprobe/kprobe module callback. Of course we can reorder it with
>> modifying its priority. But this is ugly, because when we introduce
>> a new other feature which depends on another layer, we need to
>> reorder the callback's priority number on the list.
>>
>> Based on the above, I don't think this can be resolved simply by
>> changing the list of notification callbacks to a bidirectional list.
> 
> I agree. I would keep it as is (hardcoded).
> 
> Best Regards,
> Petr
> 


Thanks for the feedback; the necessity isn't convincing enough. I will
try the proposal from Masami Hiramatsu.

Best regards,

Song


^ permalink raw reply

* Re: [PATCH v3 2/4] mm: kick writeback flusher for IOCB_DONTCACHE with targeted dirty tracking
From: Jeff Layton @ 2026-04-26 14:05 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Alexander Viro, Christian Brauner, Jan Kara,
	Matthew Wilcox (Oracle), David Hildenbrand, Lorenzo Stoakes,
	Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Mike Snitzer, Jens Axboe,
	Ritesh Harjani, Christoph Hellwig, Kairui Song, Qi Zheng,
	Shakeel Butt, Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Chuck Lever,
	linux-fsdevel, linux-kernel, linux-nfs, linux-mm,
	linux-trace-kernel
In-Reply-To: <20260426052854.8372fb9d4c616f16a8aa0a0f@linux-foundation.org>

On Sun, 2026-04-26 at 05:28 -0700, Andrew Morton wrote:
> Naive questions...
> 
> On Sun, 26 Apr 2026 07:56:08 -0400 Jeff Layton <jlayton@kernel.org> wrote:
> 
> > The IOCB_DONTCACHE writeback path in generic_write_sync() calls
> > filemap_flush_range() on every write, submitting writeback inline in
> > the writer's context.  Perf lock contention profiling shows the
> > performance problem is not lock contention but the writeback submission
> > work itself — walking the page tree and submitting I/O blocks the writer
> > for milliseconds, inflating p99.9 latency from 23ms (buffered) to 93ms
> > (dontcache).
> 
> So in the current case, when generic_write_sync() returns, all that
> memory is written back and clean&reclaimable (or freed?), yes?
> 

No. Before returning, it submits the I/Os for the portion that it wrote
rather than leaving it to the flusher to take care of things, but it
doesn't wait for the I/Os to complete.

> > Replace the inline filemap_flush_range() call with a flusher kick that
> > drains dirty pages in the background.  This moves writeback submission
> > completely off the writer's hot path.
> 
> Whereas after this change, that pagecache is probably still dirty,
> unreclaimable, waiting for the flusher to do its thing?
> 

Correct, but that's sort of the case today too since DONTCACHE I/Os
don't wait for the completion. With this change we're just deferring
the I/O submission to the flusher thread (which should hopefully soon
wake and take care of business). If the flusher thread can't keep up,
then eventually balance_dirty_pages() will kick in and start slowing
things down.

> So is there potential that the system will get all gummed up with
> dirty, to-be-written-soon pagecache?  Is there something which limits
> this buildup?
> 

Today in this situation, the writers are limited by the backing device
throughput. Once the I/O submission queues are full, then the DONTCACHE
writers end up stacking up on those. With this change, the writers will
be more limited by traditional VM limits in this situation. 

In the test runs I did, the peak pagecache with DONTCACHE writes was
higher than with the unpatched version but still considerably less than
with normal buffered I/O. That's the cost of deferring the I/O
submission to the flusher.

One thing we could consider is going back to submitting the writes
inline when the number of dirty pages is high. But, that could have a
detrimental effect on performance too.

> > ...
> > 
> > dontcache-bench results on dual-socket Xeon Gold 6138 (80 CPUs, 256 GB
> > RAM, Samsung MZ1LB1T9HALS 1.7 TB NVMe, local XFS, io_uring, file size
> > ~503 GB, compared to a v6.19-ish baseline):
> > 
> >   Single-client sequential write (MB/s):
> >                        baseline    patched     change
> >   buffered              1449.8     1440.1      -0.7%
> >   dontcache             1347.9     1461.5      +8.4%
> >   direct                1450.0     1440.1      -0.7%
> > 
> >   Single-client sequential write latency (us):
> >                        baseline    patched     change
> >   dontcache p50         3031.0    10551.3    +248.1%
> >   dontcache p99        74973.2    21626.9     -71.2%
> >   dontcache p99.9      85459.0    23199.7     -72.9%
> > 
> >   Single-client random write (MB/s):
> >                        baseline    patched     change
> >   dontcache              284.2      295.4      +3.9%
> > 
> >   Single-client random write p99.9 latency (us):
> >                        baseline    patched     change
> >   dontcache             2277.4      872.4     -61.7%
> > 
> >   Multi-writer aggregate throughput (MB/s):
> >                        baseline    patched     change
> >   buffered              1619.5     1611.2      -0.5%
> >   dontcache             1281.1     1629.4     +27.2%
> >   direct                1545.4     1609.4      +4.1%
> > 
> >   Mixed-mode noisy neighbor (dontcache writer + buffered readers):
> >                        baseline    patched     change
> >   writer (MB/s)         1297.6     1471.1     +13.4%
> >   readers avg (MB/s)     855.0      462.4     -45.9%
> 
> These results look ambiguous.  Sometimes better, sometimes worse?
> 
> > nfsd-io-bench results on same hardware (XFS on NVMe, NFSv3 via fio
> > NFS engine with libnfs, 1024 NFSD threads, pool_mode=pernode,
> > file size ~502 GB, compared to v6.19-ish baseline):
> > 
> >   Single-client sequential write (MB/s):
> >                        baseline    patched     change
> >   buffered              4844.2     4653.4      -3.9%
> >   dontcache             3028.3     3723.1     +22.9%
> >   direct                 957.6      987.8      +3.2%
> > 
> >   Single-client sequential write p99.9 latency (us):
> >                        baseline    patched     change
> >   dontcache            759169.0   175112.2     -76.9%
> > 
> >   Single-client random write (MB/s):
> >                        baseline    patched     change
> >   dontcache              590.0     1561.0    +164.6%
> > 
> >   Multi-writer aggregate throughput (MB/s):
> >                        baseline    patched     change
> >   buffered              9636.3     9422.9      -2.2%
> >   dontcache             1894.9     9442.6    +398.3%
> >   direct                 809.6      975.1     +20.4%
> > 
> >   Noisy neighbor (dontcache writer + random readers):
> >                        baseline    patched     change
> >   writer (MB/s)         1854.5     4063.6    +119.1%
> >   readers avg (MB/s)     131.2      101.6     -22.5%
> 
> Ditto but less so.
> 
> > The NFS results show even larger improvements than the local benchmarks.
> > Multi-writer dontcache throughput improves nearly 5x, matching buffered
> > I/O. Dirty page footprint drops 85-95% in sequential workloads vs.
> > buffered.
> 
> It sounds that you like the results, so OK ;)

-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply

* Re: [PATCH v3 3/4] testing: add nfsd-io-bench NFS server benchmark suite
From: Jeff Layton @ 2026-04-26 14:11 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Alexander Viro, Christian Brauner, Jan Kara,
	Matthew Wilcox (Oracle), David Hildenbrand, Lorenzo Stoakes,
	Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Mike Snitzer, Jens Axboe,
	Ritesh Harjani, Christoph Hellwig, Kairui Song, Qi Zheng,
	Shakeel Butt, Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Chuck Lever,
	linux-fsdevel, linux-kernel, linux-nfs, linux-mm,
	linux-trace-kernel
In-Reply-To: <20260426053455.4c06140446976964e6fbb8ab@linux-foundation.org>

On Sun, 2026-04-26 at 05:34 -0700, Andrew Morton wrote:
> On Sun, 26 Apr 2026 07:56:09 -0400 Jeff Layton <jlayton@kernel.org> wrote:
> 
> > Add a benchmark suite for testing NFSD I/O mode performance using fio
> > with the libnfs backend against an NFS server on localhost.  Tests
> > buffered, dontcache, and direct I/O modes via NFSD debugfs controls.
> > 
> > Includes:
> >  - fio job files for sequential/random read/write, multi-writer,
> >    noisy-neighbor, and latency-sensitive reader workloads
> >  - run-benchmarks.sh: orchestrates test matrix with mode switching
> >  - parse-results.sh: extracts metrics from fio JSON output
> >  - setup-server.sh: configures NFS export for testing
> > 
> > Assisted-by: Claude:claude-opus-4-6
> 
> OK, question.
> 
> >  10 files changed, 1024 insertions(+)
> 
> Seems that this code was largely machine-generated.  So I assume that
> you're in possession of the scripts/prompts/whatever which were used to
> generate this code.
> 
> (Can you please briefly describe the process which you used here?)
> 

It's been a while since it generated these, but I think I just asked it
to concoct a set of benchmarks for DONTCACHE writes when that involved
file sizes that were larger than the machine's memory. 

I ended up asking it to make some changes (e.g. the mixed-mode test,
and some of the perf stuff), but it seemed to do a reasonable job of
creating it.

> So how are we to maintain this?  Will other developers have to go in
> and hack this machine-generated output by hand?  Or would it be better
> to provide (in-tree) other developers with the means to regenerate this code,
> presumably using Claude?
> 
> IOW, this feels a bit like shipping the .s file without giving us the .c
> file!

As I mentioned in the cover letter, I mostly included this in the
series to demonstrate how this was tested. I'm not sure if the two
benchmark suites are suitable for inclusion. I'm fine with leaving
those two patches out of the merge. I found the testcases useful for
this, but they are indeed AI slop, and I'm not sure they have long-term
value or will be maintainable.
-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply

* Re: [RFC PATCH 1/2] kernel/notifier: replace single-linked list with double-linked list for reverse traversal
From: Song Chen @ 2026-04-26 14:14 UTC (permalink / raw)
  To: Masami Hiramatsu (Google)
  Cc: rafael, lenb, mturquette, sboyd, viresh.kumar, agk, snitzer,
	mpatocka, bmarzins, song, yukuai, linan122, jason.wessel, danielt,
	dianders, horms, davem, edumazet, kuba, pabeni, paulmck, frederic,
	mcgrof, petr.pavlu, da.gomez, samitolvanen, atomlin, jpoimboe,
	jikos, mbenes, pmladek, joe.lawrence, rostedt, mark.rutland,
	mathieu.desnoyers, linux-modules, linux-kernel,
	linux-trace-kernel, linux-acpi, linux-clk, linux-pm,
	live-patching, dm-devel, linux-raid, kgdb-bugreport, netdev
In-Reply-To: <20260420144429.57b45f2beece690bceea96ec@kernel.org>

Hi Hiramatsu san,


On 4/20/26 13:44, Masami Hiramatsu (Google) wrote:
> Hi Song,
> 
> On Wed, 15 Apr 2026 15:01:37 +0800
> chensong_2000@189.cn wrote:
> 
>> From: Song Chen <chensong_2000@189.cn>
>>
>> The current notifier chain implementation uses a single-linked list
>> (struct notifier_block *next), which only supports forward traversal
>> in priority order. This makes it difficult to handle cleanup/teardown
>> scenarios that require notifiers to be called in reverse priority order.
> 
> What about introducing a new notification callback API that allows you
> to describe dependencies between callback functions?
> 
> For example, when registering a callback, you could register a string
> as an ID and specify whether to call it before or after that ID,
> or you could register a comparison function that is called when adding
> to a list. (I prefer @name and @depends fields so that it can be easily
> maintained.)
> 
> This would allow for better dependency building when adding to the list.
> 

Is the new notification callback API going to replace
blocking_notifier_chain in the module loader, or would it be an extension
inside blocking_notifier_chain that introduces less complexity?
>>
>> A concrete example is the ordering dependency between ftrace and
>> livepatch during module load/unload. see the detail here [1].
> 
> If this only concerns notification callback issues with the ftrace
> and livepatch modules, it's far more robust to simply call the
> necessary processing directly when the modules load and unload,
> rather than registering notification callbacks externally.
> 
> There are fprobe, kprobe and their trace-events, all of which use
> ftrace as their foundation layer. In this case, I always need to
> consider callback order when a module is unloaded.
> 
> If ftrace is working as a part of module callbacks, it will conflict
> with fprobe/kprobe module callback. Of course we can reorder it with
> modifying its priority. But this is ugly, because when we introduce
> a new other feature which depends on another layer, we need to
> reorder the callback's priority number on the list.
> 
> Based on the above, I don't think this can be resolved simply by
> changing the list of notification callbacks to a bidirectional list.
> 
> Thank you,
> 

Understood, many thanks for your proposal; I will think about it.

best regards,

Song


^ permalink raw reply

* Re: [RFC PATCH 2/2] kernel/module: Decouple klp and ftrace from load_module
From: Song Chen @ 2026-04-26 14:26 UTC (permalink / raw)
  To: Masami Hiramatsu (Google), Petr Mladek
  Cc: Petr Pavlu, rafael, lenb, mturquette, sboyd, viresh.kumar, agk,
	snitzer, mpatocka, bmarzins, song, yukuai, linan122, jason.wessel,
	danielt, dianders, horms, davem, edumazet, kuba, pabeni, paulmck,
	frederic, mcgrof, da.gomez, samitolvanen, atomlin, jpoimboe,
	jikos, mbenes, joe.lawrence, rostedt, mark.rutland,
	mathieu.desnoyers, linux-modules, linux-kernel,
	linux-trace-kernel, linux-acpi, linux-clk, linux-pm,
	live-patching, dm-devel, linux-raid, kgdb-bugreport, netdev
In-Reply-To: <20260420112707.aa3627ca9f975eeaf7d8ea0e@kernel.org>

Hi,


On 4/20/26 10:27, Masami Hiramatsu (Google) wrote:
> On Thu, 16 Apr 2026 16:49:32 +0200
> Petr Mladek <pmladek@suse.com> wrote:
> 
>> On Thu 2026-04-16 13:18:30, Petr Pavlu wrote:
>>> On 4/15/26 8:43 AM, Song Chen wrote:
>>>> On 4/14/26 22:33, Petr Pavlu wrote:
>>>>> On 4/13/26 10:07 AM, chensong_2000@189.cn wrote:
>>>>>> diff --git a/include/linux/module.h b/include/linux/module.h
>>>>>> index 14f391b186c6..0bdd56f9defd 100644
>>>>>> --- a/include/linux/module.h
>>>>>> +++ b/include/linux/module.h
>>>>>> @@ -308,6 +308,14 @@ enum module_state {
>>>>>>        MODULE_STATE_COMING,    /* Full formed, running module_init. */
>>>>>>        MODULE_STATE_GOING,    /* Going away. */
>>>>>>        MODULE_STATE_UNFORMED,    /* Still setting it up. */
>>>>>> +    MODULE_STATE_FORMED,
>>>>>
>>>>> I don't see a reason to add a new module state. Why is it necessary and
>>>>> how does it fit with the existing states?
>>>>>
>>>> because once a notifier fails in state MODULE_STATE_UNFORMED (now only ftrace has something to do in this state), the notifier chain will roll back by calling blocking_notifier_call_chain_robust. I'm afraid MODULE_STATE_GOING is going to jeopardise the notifiers which don't handle it appropriately, like:
>>>>
>>>> case MODULE_STATE_COMING:
>>>>       kmalloc();
>>>> case MODULE_STATE_GOING:
>>>>       kfree();
>>>
>>> My understanding is that the current module "state machine" operates as
>>> follows. Transitions marked with an asterisk (*) are announced via the
>>> module notifier.
>>>
>>> ---> UNFORMED --*> COMING --*> LIVE --*> GOING -.
>>>          ^            |                     ^    |
>>>          |            '---------------------*    |
>>>          '---------------------------------------'
>>>
>>> The new code aims to replace the current ftrace_module_init() call in
>>> load_module(). To achieve this, it adds a notification for the UNFORMED
>>> state (only when loading a module) and introduces a new FORMED state for
>>> rollback. FORMED is purely a fake state because it never appears in
>>> module::state. The new structure is as follows:
>>>
>>>          ,--*> (FORMED)
>>>          |
>>> --*> UNFORMED --*> COMING --*> LIVE --*> GOING -.
>>>          ^            |                     ^    |
>>>          |            '---------------------*    |
>>>          '---------------------------------------'
>>>
>>> I'm afraid this is quite complex and inconsistent. Unless it can be kept
>>> simple, we would be just replacing one special handling with a different
>>> complexity, which is not worth it.
>>
>>>>>
>>>>>> +    if (err)
>>>>>> +        goto ddebug_cleanup;
>>>>>>          /* Finally it's fully formed, ready to start executing. */
>>>>>>        err = complete_formation(mod, info);
>>>>>> -    if (err)
>>>>>> +    if (err) {
>>>>>> +        blocking_notifier_call_chain_reverse(&module_notify_list,
>>>>>> +                MODULE_STATE_FORMED, mod);
>>>>>>            goto ddebug_cleanup;
>>>>>> +    }
>>>>>>    -    err = prepare_coming_module(mod);
>>>>>> +    err = prepare_module_state_transaction(mod,
>>>>>> +                MODULE_STATE_COMING, MODULE_STATE_GOING);
>>>>>>        if (err)
>>>>>>            goto bug_cleanup;
>>>>>>    @@ -3522,7 +3519,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
>>>>>>        destroy_params(mod->kp, mod->num_kp);
>>>>>>        blocking_notifier_call_chain(&module_notify_list,
>>>>>>                         MODULE_STATE_GOING, mod);
>>>>>
>>>>> My understanding is that all notifier chains for MODULE_STATE_GOING
>>>>> should be reversed.
>>>> yes, all, from lowest priority notifier to highest.
>>>> I will resend patch 1, which failed due to my proxy setting.
>>>
>>> What I meant here is that the call:
>>>
>>> blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod);
>>>
>>> should be replaced with:
>>>
>>> blocking_notifier_call_chain_reverse(&module_notify_list, MODULE_STATE_GOING, mod);
>>>
>>>>
>>>>>
>>>>>> -    klp_module_going(mod);
>>>>>>     bug_cleanup:
>>>>>>        mod->state = MODULE_STATE_GOING;
>>>>>>        /* module_bug_cleanup needs module_mutex protection */
>>>>>
>>>>> The patch removes the klp_module_going() cleanup call in load_module().
>>>>> Similarly, the ftrace_release_mod() call under the ddebug_cleanup label
>>>>> should be removed and appropriately replaced with a cleanup via
>>>>> a notifier.
>>>>>
>>>>      err = prepare_module_state_transaction(mod,
>>>>                  MODULE_STATE_UNFORMED, MODULE_STATE_FORMED);
>>>>      if (err)
>>>>          goto ddebug_cleanup;
>>>>
>>>> ftrace will be cleanup in blocking_notifier_call_chain_robust rolling back.
>>>>
>>>>      err = prepare_module_state_transaction(mod,
>>>>                  MODULE_STATE_COMING, MODULE_STATE_GOING);
>>>>
>>>> each notifier including ftrace and klp will be cleanup in blocking_notifier_call_chain_robust rolling back.
>>>>
>>>> if all notifiers are successful in MODULE_STATE_COMING, they all will be clean up in
>>>>   coming_cleanup:
>>>>      mod->state = MODULE_STATE_GOING;
>>>>      destroy_params(mod->kp, mod->num_kp);
>>>>      blocking_notifier_call_chain(&module_notify_list,
>>>>                       MODULE_STATE_GOING, mod);
>>>>
>>>> if  something wrong underneath.
>>>
>>> My point is that the patch leaves a call to ftrace_release_mod() in
>>> load_module(), which I expected to be handled via a notifier.
>>
>> I think that I have got it. The ftrace code needs two notifiers when
>> the module is being loaded and two when it is going.
>>
>> This is why Song added the new state. But I think that we would
>> need two new states to call:
>>
>>      + ftrace_module_init() in MODULE_STATE_UNFORMED
>>      + ftrace_module_enable() in MODULE_STATE_FORMED
>>
>> and
>>
>>      + ftrace_free_mem() in MODULE_STATE_PRE_GOING
>>      + ftrace_free_mem() in MODULE_STATE_GOING
>>
>>
>> By using the ascii art:
>>
>>   -*> UNFORMED -*> FORMED -> COMING -*> LIVE -*> PRE_GOING -*> GOING -.
>>                |          |         |                ^           ^    ^
>>                |          |         '----------------'           |    |
>>                |          '--------------------------------------'    |
>>                '------------------------------------------------------'
>>
>>
>> But I think that this is not worth it.
> 
> Agree.
> 
> If this needs to be ordered so strictly, why we will use a "single"
> module notifier chain for this complex situation?
> 
> I think the notifier call chain is just for announcing a single signal,
> instead of sending several different signals, especially if there is
> any dependency among the callbacks.
> 
> If notification callbacks need to be ordered, they are currently
> sorted by representing priority numerically, but this is quite
> fragile for updating. It has to look up other registered priorities
> and adjust the order among dependencies each time. For this reason,
> this mechanism is not suitable for global ordering. (It's like line
> numbers in BASIC.)
> It is probably only useful for representing dependencies between
> two components maintained by the same maintainer.
> 
> I'm against a general-purpose system that makes everything modular.
> It unnecessarily complicates things. If there are processes that
> require strict ordering, especially processes that must be performed
> before each stage as part of the framework, they should be called
> directly from the framework, not via notification callbacks.
> 
> This makes it simpler and more robust to maintain.
> 
> Only the framework's end users should utilize notification callbacks.
> 
> Thank you,
> 
> 

My motivation was to decouple ftrace and klp from the module loader and make
blocking_notifier_chain more generic, but it did not become completely
generic. I understand your and Petr's comments and agree.

Thanks

Best regards

Song

>>
>> Best Regards,
>> Petr
>>
> 
> 


^ permalink raw reply

* Re: [PATCH v3 2/4] mm: kick writeback flusher for IOCB_DONTCACHE with targeted dirty tracking
From: Jeff Layton @ 2026-04-26 18:25 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Alexander Viro, Christian Brauner, Jan Kara,
	Matthew Wilcox (Oracle), David Hildenbrand, Lorenzo Stoakes,
	Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Mike Snitzer, Jens Axboe,
	Ritesh Harjani, Christoph Hellwig, Kairui Song, Qi Zheng,
	Shakeel Butt, Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Chuck Lever,
	linux-fsdevel, linux-kernel, linux-nfs, linux-mm,
	linux-trace-kernel
In-Reply-To: <20260426052854.8372fb9d4c616f16a8aa0a0f@linux-foundation.org>

On Sun, 2026-04-26 at 05:28 -0700, Andrew Morton wrote:
> Naive questions...
> 
> On Sun, 26 Apr 2026 07:56:08 -0400 Jeff Layton <jlayton@kernel.org> wrote:
> 
> > The IOCB_DONTCACHE writeback path in generic_write_sync() calls
> > filemap_flush_range() on every write, submitting writeback inline in
> > the writer's context.  Perf lock contention profiling shows the
> > performance problem is not lock contention but the writeback submission
> > work itself — walking the page tree and submitting I/O blocks the writer
> > for milliseconds, inflating p99.9 latency from 23ms (buffered) to 93ms
> > (dontcache).
> 
> So in the current case, when generic_write_sync() returns, all that
> memory is written back and clean&reclaimable (or freed?), yes?
> 
> > Replace the inline filemap_flush_range() call with a flusher kick that
> > drains dirty pages in the background.  This moves writeback submission
> > completely off the writer's hot path.
> 
> Whereas after this change, that pagecache is probably still dirty,
> unreclaimable, waiting for the flusher to do its thing?
> 
> So is there potential that the system will get all gummed up with
> dirty, to-be-written-soon pagecache?  Is there something which limits
> this buildup?
> 
> > ...
> > 
> > dontcache-bench results on dual-socket Xeon Gold 6138 (80 CPUs, 256 GB
> > RAM, Samsung MZ1LB1T9HALS 1.7 TB NVMe, local XFS, io_uring, file size
> > ~503 GB, compared to a v6.19-ish baseline):
> > 
> >   Single-client sequential write (MB/s):
> >                        baseline    patched     change
> >   buffered              1449.8     1440.1      -0.7%
> >   dontcache             1347.9     1461.5      +8.4%
> >   direct                1450.0     1440.1      -0.7%
> > 
> >   Single-client sequential write latency (us):
> >                        baseline    patched     change
> >   dontcache p50         3031.0    10551.3    +248.1%
> >   dontcache p99        74973.2    21626.9     -71.2%
> >   dontcache p99.9      85459.0    23199.7     -72.9%
> > 
> >   Single-client random write (MB/s):
> >                        baseline    patched     change
> >   dontcache              284.2      295.4      +3.9%
> > 
> >   Single-client random write p99.9 latency (us):
> >                        baseline    patched     change
> >   dontcache             2277.4      872.4     -61.7%
> > 
> >   Multi-writer aggregate throughput (MB/s):
> >                        baseline    patched     change
> >   buffered              1619.5     1611.2      -0.5%
> >   dontcache             1281.1     1629.4     +27.2%
> >   direct                1545.4     1609.4      +4.1%
> > 
> >   Mixed-mode noisy neighbor (dontcache writer + buffered readers):
> >                        baseline    patched     change
> >   writer (MB/s)         1297.6     1471.1     +13.4%
> >   readers avg (MB/s)     855.0      462.4     -45.9%
> 
> These results look ambiguous.  Sometimes better, sometimes worse?
> 

Forgot to comment on this part earlier...

This is the "mixed-mode" (dontcache writes + buffered reads). I played
with a bunch of different settings under nfsd, and those settings
turned out to perform the best with this benchmark.

I suspect what's happening is that the increase in write throughput
from writing via the flusher thread is crowding out reads. So, read
throughput suffers in this test from that. There are a number of ways
we could probably make that more fair.

> > nfsd-io-bench results on same hardware (XFS on NVMe, NFSv3 via fio
> > NFS engine with libnfs, 1024 NFSD threads, pool_mode=pernode,
> > file size ~502 GB, compared to v6.19-ish baseline):
> > 
> >   Single-client sequential write (MB/s):
> >                        baseline    patched     change
> >   buffered              4844.2     4653.4      -3.9%
> >   dontcache             3028.3     3723.1     +22.9%
> >   direct                 957.6      987.8      +3.2%
> > 
> >   Single-client sequential write p99.9 latency (us):
> >                        baseline    patched     change
> >   dontcache            759169.0   175112.2     -76.9%
> > 
> >   Single-client random write (MB/s):
> >                        baseline    patched     change
> >   dontcache              590.0     1561.0    +164.6%
> > 
> >   Multi-writer aggregate throughput (MB/s):
> >                        baseline    patched     change
> >   buffered              9636.3     9422.9      -2.2%
> >   dontcache             1894.9     9442.6    +398.3%
> >   direct                 809.6      975.1     +20.4%
> > 
> >   Noisy neighbor (dontcache writer + random readers):
> >                        baseline    patched     change
> >   writer (MB/s)         1854.5     4063.6    +119.1%
> >   readers avg (MB/s)     131.2      101.6     -22.5%
> 
> Ditto but less so.
> 

Same reason for the drop, I think.

> > The NFS results show even larger improvements than the local benchmarks.
> > Multi-writer dontcache throughput improves nearly 5x, matching buffered
> > I/O. Dirty page footprint drops 85-95% in sequential workloads vs.
> > buffered.
> 
> It sounds that you like the results, so OK ;)

I think it's a win overall. As with anything writeback-related, it's a
game of tradeoffs. The good news is that DONTCACHE is still fairly new
and not many applications are using it yet, so the blast radius from
any change here should be rather small.

As a side note: I've long thought that we in general wait too long to
kick off writeback with normal buffered I/O, particularly with modern
memory sizes. DONTCACHE gives us a place to experiment with this
scheme, but we may want to think about kicking off writeback earlier in
the normal buffered case too.
-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply

* [syzbot ci] Re: mm: improve write performance with RWF_DONTCACHE
From: syzbot ci @ 2026-04-26 19:02 UTC (permalink / raw)
  To: akpm, axboe, axelrasmussen, baohua, brauner, chuck.lever, david,
	hch, jack, jlayton, kasong, liam.howlett, linux-fsdevel,
	linux-kernel, linux-mm, linux-nfs, linux-trace-kernel, ljs,
	mathieu.desnoyers, mhiramat, mhocko, qi.zheng, ritesh.list,
	rostedt, rppt, shakeel.butt, snitzer, surenb, vbabka, viro,
	weixugc, willy, yuanchu
  Cc: syzbot, syzkaller-bugs
In-Reply-To: <20260426-dontcache-v3-0-79eb37da9547@kernel.org>

syzbot ci has tested the following series

[v3] mm: improve write performance with RWF_DONTCACHE
https://lore.kernel.org/all/20260426-dontcache-v3-0-79eb37da9547@kernel.org
* [PATCH v3 1/4] mm: add NR_DONTCACHE_DIRTY node page counter
* [PATCH v3 2/4] mm: kick writeback flusher for IOCB_DONTCACHE with targeted dirty tracking
* [PATCH v3 3/4] testing: add nfsd-io-bench NFS server benchmark suite
* [PATCH v3 4/4] testing: add dontcache-bench local filesystem benchmark suite

and found the following issue:
WARNING in __mod_memcg_lruvec_state

Full report is available here:
https://ci.syzbot.org/series/e53aef43-ac7a-4cb7-8714-bb927aaee659

***

WARNING in __mod_memcg_lruvec_state

tree:      torvalds
URL:       https://kernel.googlesource.com/pub/scm/linux/kernel/git/torvalds/linux
base:      27d128c1cff64c3b8012cc56dd5a1391bb4f1821
arch:      amd64
compiler:  Debian clang version 21.1.8 (++20251221033036+2078da43e25a-1~exp1~20251221153213.50), Debian LLD 21.1.8
config:    https://ci.syzbot.org/builds/c10ddd10-bb16-48c2-90fb-3625d3b258aa/config
syz repro: https://ci.syzbot.org/findings/1e8993c1-818b-4ddf-b90b-30f051b3a9d6/syz_repro

------------[ cut here ]------------
__mod_memcg_lruvec_state: missing stat item 21
WARNING: mm/memcontrol.c:911 at __mod_memcg_lruvec_state+0x1f3/0x360 mm/memcontrol.c:911, CPU#0: syz.0.17/5831
Modules linked in:
CPU: 0 UID: 0 PID: 5831 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full) 
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
RIP: 0010:__mod_memcg_lruvec_state+0x1fc/0x360 mm/memcontrol.c:911
Code: 00 11 85 c0 74 31 48 83 c4 08 5b 41 5c 41 5d 41 5e 41 5f 5d e9 95 2e 72 09 cc 48 8d 3d 7d c4 fd 0d 48 c7 c6 5d b4 f5 8d 89 da <67> 48 0f b9 3a eb d5 90 0f 0b 90 eb 90 e8 02 22 fb fe eb c8 48 8d
RSP: 0018:ffffc900039e7520 EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000015 RCX: dffffc0000000000
RDX: 0000000000000015 RSI: ffffffff8df5b45d RDI: ffffffff90363d90
RBP: 0000000000000001 R08: ffffffff82388833 R09: ffffffff8e95cd60
R10: dffffc0000000000 R11: fffff940008c3f49 R12: ffff8881026eee80
R13: 00000000000000ff R14: 0000000000000001 R15: ffff888173a80e00
FS:  00007f5f76bca6c0(0000) GS:ffff88818dc95000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000055d77c624128 CR3: 0000000171fde000 CR4: 00000000000006f0
Call Trace:
 <TASK>
 mod_memcg_lruvec_state+0xa7/0x220 mm/memcontrol.c:941
 mod_lruvec_state mm/memcontrol.c:964 [inline]
 lruvec_stat_mod_folio+0x239/0x3e0 mm/memcontrol.c:984
 folio_account_dirtied mm/page-writeback.c:2634 [inline]
 __folio_mark_dirty+0x633/0xec0 mm/page-writeback.c:2692
 mark_buffer_dirty+0x261/0x410 fs/buffer.c:1110
 block_commit_write+0x15d/0x270 fs/buffer.c:2115
 block_write_end+0x6e/0xb0 fs/buffer.c:2191
 ext4_write_end+0x27d/0xa30 fs/ext4/inode.c:1458
 ext4_da_write_end+0x86/0xcb0 fs/ext4/inode.c:3296
 generic_perform_write+0x620/0x8f0 mm/filemap.c:4350
 ext4_buffered_write_iter+0xcb/0x370 fs/ext4/file.c:316
 ext4_file_write_iter+0x298/0x1bd0 fs/ext4/file.c:-1
 do_iter_readv_writev+0x619/0x8c0 fs/read_write.c:-1
 vfs_writev+0x33c/0x990 fs/read_write.c:1059
 do_pwritev fs/read_write.c:1155 [inline]
 __do_sys_pwritev2 fs/read_write.c:1213 [inline]
 __se_sys_pwritev2+0x184/0x2a0 fs/read_write.c:1204
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0x15f/0xf80 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f5f75d9cdd9
Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f5f76bca028 EFLAGS: 00000246 ORIG_RAX: 0000000000000148
RAX: ffffffffffffffda RBX: 00007f5f76015fa0 RCX: 00007f5f75d9cdd9
RDX: 0000000000000001 RSI: 00002000000001c0 RDI: 0000000000000004
RBP: 00007f5f75e32d69 R08: 0000000000000001 R09: 0000000000000081
R10: 0000000000000003 R11: 0000000000000246 R12: 0000000000000000
R13: 00007f5f76016038 R14: 00007f5f76015fa0 R15: 00007fffe7503ad8
 </TASK>
----------------
Code disassembly (best guess):
   0:	00 11                	add    %dl,(%rcx)
   2:	85 c0                	test   %eax,%eax
   4:	74 31                	je     0x37
   6:	48 83 c4 08          	add    $0x8,%rsp
   a:	5b                   	pop    %rbx
   b:	41 5c                	pop    %r12
   d:	41 5d                	pop    %r13
   f:	41 5e                	pop    %r14
  11:	41 5f                	pop    %r15
  13:	5d                   	pop    %rbp
  14:	e9 95 2e 72 09       	jmp    0x9722eae
  19:	cc                   	int3
  1a:	48 8d 3d 7d c4 fd 0d 	lea    0xdfdc47d(%rip),%rdi        # 0xdfdc49e
  21:	48 c7 c6 5d b4 f5 8d 	mov    $0xffffffff8df5b45d,%rsi
  28:	89 da                	mov    %ebx,%edx
* 2a:	67 48 0f b9 3a       	ud1    (%edx),%rdi <-- trapping instruction
  2f:	eb d5                	jmp    0x6
  31:	90                   	nop
  32:	0f 0b                	ud2
  34:	90                   	nop
  35:	eb 90                	jmp    0xffffffc7
  37:	e8 02 22 fb fe       	call   0xfefb223e
  3c:	eb c8                	jmp    0x6
  3e:	48                   	rex.W
  3f:	8d                   	.byte 0x8d


***

If these findings have caused you to resend the series or submit a
separate fix, please add the following tag to your commit message:
  Tested-by: syzbot@syzkaller.appspotmail.com

---
This report is generated by a bot. It may contain errors.
syzbot ci engineers can be reached at syzkaller@googlegroups.com.

To test a patch for this bug, please reply with `#syz test`
(should be on a separate line).

The patch should be attached to the email.
Note: arguments like custom git repos and branches are not supported.

^ permalink raw reply

* [PATCH] mm/damon: fix damos_stat tracepoint format for sz_applied
From: SeongJae Park @ 2026-04-26 19:31 UTC (permalink / raw)
  To: Andrew Morton
  Cc: SeongJae Park, # 7 . 0 . x, Masami Hiramatsu, Mathieu Desnoyers,
	Steven Rostedt, damon, linux-kernel, linux-mm, linux-trace-kernel

The print format is wrongly marking sz_applied as sz_tried.  Fix it.

Fixes: 804c26b961da ("mm/damon/core: add trace point for damos stat per apply interval")
Cc: <stable@vger.kernel.org> # 7.0.x
Signed-off-by: SeongJae Park <sj@kernel.org>
---
 include/trace/events/damon.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h
index 24fc402ab3c85..7e25f4469b81b 100644
--- a/include/trace/events/damon.h
+++ b/include/trace/events/damon.h
@@ -41,7 +41,7 @@ TRACE_EVENT(damos_stat_after_apply_interval,
 	),
 
 	TP_printk("ctx_idx=%u scheme_idx=%u nr_tried=%lu sz_tried=%lu "
-			"nr_applied=%lu sz_tried=%lu sz_ops_filter_passed=%lu "
+			"nr_applied=%lu sz_applied=%lu sz_ops_filter_passed=%lu "
 			"qt_exceeds=%lu nr_snapshots=%lu",
 			__entry->context_idx, __entry->scheme_idx,
 			__entry->nr_tried, __entry->sz_tried,

base-commit: 2e98f54b5a2b874905c71f3bc40eb8c0e8e757f0
-- 
2.47.3

^ permalink raw reply related

* Re: [PATCH v3 2/4] mm: kick writeback flusher for IOCB_DONTCACHE with targeted dirty tracking
From: Matthew Wilcox @ 2026-04-26 20:44 UTC (permalink / raw)
  To: Jeff Layton
  Cc: Alexander Viro, Christian Brauner, Jan Kara, Andrew Morton,
	David Hildenbrand, Lorenzo Stoakes, Liam R. Howlett,
	Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, Michal Hocko,
	Mike Snitzer, Jens Axboe, Ritesh Harjani, Christoph Hellwig,
	Kairui Song, Qi Zheng, Shakeel Butt, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Chuck Lever, linux-fsdevel, linux-kernel,
	linux-nfs, linux-mm, linux-trace-kernel
In-Reply-To: <20260426-dontcache-v3-2-79eb37da9547@kernel.org>

On Sun, Apr 26, 2026 at 07:56:08AM -0400, Jeff Layton wrote:
>   Mixed-mode noisy neighbor (dontcache writer + buffered readers):
>                        baseline    patched     change
>   writer (MB/s)         1297.6     1471.1     +13.4%
>   readers avg (MB/s)     855.0      462.4     -45.9%

hm.  This wasn't what I thought of when I thought of "noisy neighbour".
I'd have process A doing DONTCACHE writes to file A and process B doing
normal buffered writes to file B.


^ permalink raw reply

* [RFC PATCH 00/19] mm/damon: introduce data attributes monitoring
From: SeongJae Park @ 2026-04-26 20:52 UTC (permalink / raw)
  Cc: SeongJae Park, Liam R. Howlett, Andrew Morton, David Hildenbrand,
	Jonathan Corbet, Lorenzo Stoakes, Masami Hiramatsu,
	Mathieu Desnoyers, Michal Hocko, Mike Rapoport, Shuah Khan,
	Shuah Khan, Steven Rostedt, Suren Baghdasaryan, Vlastimil Babka,
	damon, linux-doc, linux-kernel, linux-kselftest, linux-mm,
	linux-trace-kernel

TL; DR
======

Extend DAMON for monitoring general data attributes other than accesses.
In the short term, this enables light-weight page type (e.g., belonging
cgroup) aware monitoring.  In the long term, this will help extend DAMON
to multiple access event capture primitives (e.g., page faults and PMU),
and eventually pivot DAMON to a "Data Attributes Monitoring and
Operations eNgine".

Background: High Cost of Page Level Properties Monitoring
=========================================================

DAMON is initially introduced as a Data Access MONitor.  It has been
extended for not only access monitoring but also data access-aware
system operations (DAMOS).  But still the monitoring part is only for
data accesses.

Data access patterns are good information, but some users need more
holistic views.  Particularly, users want to show the access pattern
information together with the types of the memory.  For example, users
who work for making huge pages efficiently want to know how much of
DAMON-found hot/cold regions are backed by huge pages.  Users who run
multiple workloads with different cgroups want to know how much of
DAMON-found hot/cold regions belong to specific cgroups.

For the user demand, we developed a DAMOS extension for page level
properties based monitoring [1], which has landed on 6.14.  Using the
feature, users can inform the page level data properties that they are
interested in, in a flexible format that uses DAMOS filters.  Then,
DAMON applies the filters to each folio of the entire DAMON region and
lets users know how many bytes of memory in each DAMON region passed the
given filters.

This gives page level detailed and deterministic information to users.
But, because the operation is done at page level, the overhead is
proportional to the memory size.  It was useful for test or debugging
purposes on a small number of machines.  But it was obviously too heavy
to be enabled always on all machines running the real user workloads.
For real world workloads, it was recommended to use the feature with
user-space controlled sampling approaches.  For example, users could do
the page level monitoring only once per hour, on randomly selected one
percent of machines of their fleet.  If the runtime is long enough and
the fleet is big enough, it should provide statistically meaningful
data.

But users are too busy to implement such controls on their own.

Data Attributes Monitoring
==========================

Extend DAMON to monitor not only data accesses, but also general data
attributes.  Do the extension while keeping the main promise of DAMON,
the bounded and best-effort minimum overhead.

Allow users to specify what data attributes in addition to the data
access they want to monitor.  Users can install one 'data probe' per
data attribute of their interest for this purpose.  The 'data probe'
should be able to be applied to any memory, and determine if the given
memory has the appropriate data attribute.  E.g., if memory of physical
address 42 belongs to cgroup A.  Each 'data probe' is configured with
filters that are very similar to the DAMOS filters.

When DAMON checks if each sampling address memory of each region is
accessed since the last check, it applies data probes if registered.
Similar to how it counts the access check-positive samples
(nr_accesses), it counts the samples that are positive for each data
probe in another per-region counters array, namely 'probe_hits'.  When
DAMON resets nr_accesses every aggregation interval, it resets
'probe_hits' together.

Users can read 'probe_hits' just before the values are reset.  In this
way, users can know how many hot/cold memory regions have data
attributes of their interest.  E.g., 30 percent of this system's hot
memory is belonging to cgroup A and 80 percent of the hot cgroup A
memory is backed by huge pages.

Patches Sequence
================

First eight patches implement the core feature, interface and the
working support.  Patch 1 introduces data probe data structure, namely
damon_probe.  Patch 2 extends damon_ctx for installing data probes.
Patch 3 introduces another data structure for filters of each data
probe, namely damon_filter.  Patch 4 updates damon_ctx commit function
to handle the probes.  Patch 5 extends damon_region for the per-region
per-probe positive samples counter, namely probe_hits.  Patch 6 extends
damon_operations for applying probes on the underlying DAMON operations
implementation.  Patch 7 updates kdamond_fn() to invoke the probes
applying callback.  Patch 8 finally implements the probes support on
paddr ops.

Eight changes for user interface (patches 9-16) come next.  Patches 9-13
implement sysfs directories and files for setting data probes, namely
probes directory, probe directory, filters directory, filter directory
and filter directory internal files, respectively.  Patch 14 connects
the user inputs that are made via the sysfs files to DAMON core.
Patch 15 implements sysfs files for showing the per-region per-probe
positive samples count, namely probe_hits.  Patch 16 introduces a new
tracepoint for showing the counts via tracefs.

Patch 17 adds a selftest for the sysfs files.

Patches 18 and 19 document the design and usage of the new feature,
respectively.

Discussions
===========

This allows the page properties monitoring with overhead that is low
enough to be enabled always on real world workloads.  Because the
sampling time for access check is reused for data attributes check, the
upper-bounded and best-effort minimum overhead of DAMON is kept.
Because the sampling memory for access check is reused for data
attributes check, additional overhead is minimum.

Still DAMOS-based page level properties monitoring should be useful,
because it provides a deterministic page level information.  When in
doubt of the sampling based information, running DAMOS-based one
together and comparing the results would be useful, for debugging and
tuning.

Plan for Dropping RFC tag
=========================

The user ABI for reading probe_hits is not yet convincing.  It is
exposed to users by a tracepoint and new sysfs file.  For the
tracepoint, a new one namely damon:damon_aggregated_v2 is introduced.
The name is not convincing, and its internal mechanism seems to have
room to be improved before dropping RFC.  For the sysfs, a file under
the DAMOS-tried region directory namely 'probe_hits' is added.  Reading
it returns four probe_hits values with ',' as a separator.  This works
with the current maximum number of data probes (four), but can make
future changes of the limit difficult.  I will try to find a better way
before dropping the RFC tag.  Maybe a 'probe_hits/' directory having
files named '0' to 'N-1', one for each of the 'N' user-registered data
probes.

I'm currently hoping to drop the RFC tag by 7.2-rc1.

Future Works: Short Term
========================

This series is introducing only a single type of data attribute:
anonymous page.  Once this is landed, I will extend it for
cgroup-belonging, so that we can do cgroup-level monitoring with low
overhead.  After that, I may further work on supporting all DAMOS filter
types.  And as demands are found, we could extend the types.

This version of implementation is limiting the maximum number of data
probes to four.  I will try to find a way to remove the limit in future,
if it is easy to do.  I personally think it should be enough for common
use cases, though, and therefore not giving high priority at the moment.

Future Works: Long Term
=======================

There are user requests for extending DAMON with detailed access
information, for example, per-CPUs/threads/read/writes monitoring.  For
that, I was working [2] on extending DAMON to use page fault events as
another access check primitive, and making the infrastructure flexible
for future use of yet another access check primitive.  Actually there is
another ongoing work [3] for extending DAMON with PMU events.  The
motivation of the work is reducing the overhead, though.

In my work [2], I was introducing a new interface for access sampling
primitives control.  Now I think this data probe interface can be used
for that, too.  That is, data access becomes just one type of data
attribute.  Also, pg_idle-confirmed access, page fault-confirmed access,
and PMU event-confirmed access will be different types of data
attributes.

The regions adjustment mechanism is currently working based on the
access information.  That's because DAMON is designed for data access
monitoring.  That is, data access information is the primary interest,
and therefore DAMON adjusts regions in a way that can best-present the
information.

Once data access becomes just one of the data attributes, there is no
reason to treat data access as special.  There might be some users who
are not interested in access at all but want to know the location of
memory of a specific type.  The data probes interface will allow doing
that.  Further,
we could extend the interface to let users set any data attribute as the
'primary' attribute.  Then, DAMON will split and merge regions in a way
that can best-present the 'primary' attributes.

DAMOS will also be extended, to specify targets based on not only the
data access pattern, but all user-registered data attributes.  From this
stage, we may be able to call DAMON a "Data Attributes Monitoring and
Operations eNgine".

[1] https://lore.kernel.org/20250106193401.109161-1-sj@kernel.org
[2] https://lore.kernel.org/20251208062943.68824-1-sj@kernel.org/
[3] https://lore.kernel.org/20260423004211.7037-1-akinobu.mita@gmail.com

SeongJae Park (19):
  mm/damon/core: introduce struct damon_probe
  mm/damon/core: embed damon_probe objects in damon_ctx
  mm/damon/core: introduce damon_filter
  mm/damon/core: commit probes
  mm/damon/core: introduce damon_region->probe_hits
  mm/damon/core: introduce damon_ops->apply_probes
  mm/damon/core: do data attributes monitoring
  mm/damon/paddr: support data attributes monitoring
  mm/damon/sysfs: implement probes dir
  mm/damon/sysfs: implement probe dir
  mm/damon/sysfs: implement filters directory
  mm/damon/sysfs: implement filter dir
  mm/damon/sysfs: implement filter dir files
  mm/damon/sysfs: setup probes on DAMON core API parameters
  mm/damon/sysfs-schemes: implement tried_region/probe_hits file
  mm/damon: trace probe_hits
  selftests/damon/sysfs.sh: test probes dir
  Docs/mm/damon/design: document data attributes monitoring
  Docs/admin-guide/mm/damon/usage: document data attributes monitoring

 Documentation/admin-guide/mm/damon/usage.rst |  44 +-
 Documentation/mm/damon/design.rst            |  37 ++
 include/linux/damon.h                        |  60 +++
 include/trace/events/damon.h                 |  41 ++
 mm/damon/core.c                              | 182 +++++++
 mm/damon/paddr.c                             |  45 ++
 mm/damon/sysfs-schemes.c                     |  30 ++
 mm/damon/sysfs.c                             | 502 +++++++++++++++++++
 tools/testing/selftests/damon/sysfs.sh       |  48 ++
 9 files changed, 982 insertions(+), 7 deletions(-)


base-commit: 8f22aa2e28454419ed2031119ad32ea4a6c9f1f1
-- 
2.47.3

^ permalink raw reply

* [RFC PATCH 16/19] mm/damon: trace probe_hits
From: SeongJae Park @ 2026-04-26 20:52 UTC (permalink / raw)
  Cc: SeongJae Park, Andrew Morton, Masami Hiramatsu, Mathieu Desnoyers,
	Steven Rostedt, damon, linux-kernel, linux-mm, linux-trace-kernel
In-Reply-To: <20260426205222.93895-1-sj@kernel.org>

Introduce a new tracepoint for exposing the per-region per-probe
positive sample count via tracefs.

Signed-off-by: SeongJae Park <sj@kernel.org>
---
 include/trace/events/damon.h | 41 ++++++++++++++++++++++++++++++++++++
 mm/damon/core.c              |  1 +
 2 files changed, 42 insertions(+)

diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h
index 7e25f4469b81b..121d7bc3a2c27 100644
--- a/include/trace/events/damon.h
+++ b/include/trace/events/damon.h
@@ -130,6 +130,47 @@ TRACE_EVENT(damon_monitor_intervals_tune,
 	TP_printk("sample_us=%lu", __entry->sample_us)
 );
 
+TRACE_EVENT(damon_aggregated_v2,
+
+	TP_PROTO(unsigned int target_id, struct damon_region *r,
+		unsigned int nr_regions),
+
+	TP_ARGS(target_id, r, nr_regions),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, target_id)
+		__field(unsigned int, nr_regions)
+		__field(unsigned long, start)
+		__field(unsigned long, end)
+		__field(unsigned int, nr_accesses)
+		__field(unsigned int, age)
+		__field(unsigned char, probe_hit0)
+		__field(unsigned char, probe_hit1)
+		__field(unsigned char, probe_hit2)
+		__field(unsigned char, probe_hit3)
+	),
+
+	TP_fast_assign(
+		__entry->target_id = target_id;
+		__entry->nr_regions = nr_regions;
+		__entry->start = r->ar.start;
+		__entry->end = r->ar.end;
+		__entry->nr_accesses = r->nr_accesses;
+		__entry->age = r->age;
+		__entry->probe_hit0 = r->probe_hits[0];
+		__entry->probe_hit1 = r->probe_hits[1];
+		__entry->probe_hit2 = r->probe_hits[2];
+		__entry->probe_hit3 = r->probe_hits[3];
+	),
+
+	TP_printk("target_id=%lu nr_regions=%u %lu-%lu: %u %u %hhu %hhu %hhu %hhu",
+			__entry->target_id, __entry->nr_regions,
+			__entry->start, __entry->end,
+			__entry->nr_accesses, __entry->age,
+			__entry->probe_hit0, __entry->probe_hit1,
+			__entry->probe_hit2, __entry->probe_hit3)
+);
+
 TRACE_EVENT(damon_aggregated,
 
 	TP_PROTO(unsigned int target_id, struct damon_region *r,
diff --git a/mm/damon/core.c b/mm/damon/core.c
index fe14971d72747..54834b74efef4 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1924,6 +1924,7 @@ static void kdamond_reset_aggregated(struct damon_ctx *c)
 			int i;
 
 			trace_damon_aggregated(ti, r, damon_nr_regions(t));
+			trace_damon_aggregated_v2(ti, r, damon_nr_regions(t));
 			damon_warn_fix_nr_accesses_corruption(r);
 			r->last_nr_accesses = r->nr_accesses;
 			r->nr_accesses = 0;
-- 
2.47.3

^ permalink raw reply related

* Re: [PATCH] mm/page_alloc: add tracepoint for PCP refills
From: Vishal Moola @ 2026-04-26 22:06 UTC (permalink / raw)
  To: Bunyod Suvonov
  Cc: akpm, vbabka, linux-mm, rostedt, mhiramat, mathieu.desnoyers,
	linux-trace-kernel, linux-kernel, surenb, mhocko, jackmanb,
	hannes, ziy
In-Reply-To: <20260425091335.346504-1-b.suvonov@sjtu.edu.cn>

On Sat, Apr 25, 2026 at 05:13:35PM +0800, Bunyod Suvonov wrote:
> The page allocator already has mm_page_pcpu_drain to trace pages
> drained from the per-cpu page lists back to the buddy allocator. There
> is no matching tracepoint for the opposite direction, where
> rmqueue_bulk() refills a PCP list from the buddy allocator.

This sounds like a reasonable idea. Does this tracepoint show us
something that a workload might care about? Not opposed, just curious.

For future versions, would you mind including documentation about it
in Documentation/trace/events-kmem.rst?

> mm_page_alloc_zone_locked is not a good substitute for this. It is
> emitted from __rmqueue_smallest(), which is used both by rmqueue_bulk()
> and by the direct buddy allocation path. Its percpu_refill field is
> derived from the allocation order and migratetype, so it does not
> reliably identify whether the allocation came from a PCP refill.
> 
> Add mm_page_pcpu_refill and emit it from rmqueue_bulk() for each page
> added to the PCP list. The new tracepoint uses the same page, order and
> migratetype fields as mm_page_pcpu_drain, making refill and drain
> activity directly comparable.
> 
> Signed-off-by: Bunyod Suvonov <b.suvonov@sjtu.edu.cn>
> ---
>  include/trace/events/kmem.h | 23 +++++++++++++++++++++++
>  mm/page_alloc.c             |  1 +
>  2 files changed, 24 insertions(+)
> 
> diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
> index cd7920c81f85..16985604fc51 100644
> --- a/include/trace/events/kmem.h
> +++ b/include/trace/events/kmem.h
> @@ -243,6 +243,29 @@ DEFINE_EVENT(mm_page, mm_page_alloc_zone_locked,
>  	TP_ARGS(page, order, migratetype, percpu_refill)
>  );
>  
> +TRACE_EVENT(mm_page_pcpu_refill,
> +
> +	TP_PROTO(struct page *page, unsigned int order, int migratetype),
> +
> +	TP_ARGS(page, order, migratetype),
> +
> +	TP_STRUCT__entry(
> +		__field(	unsigned long,	pfn		)
> +		__field(	unsigned int,	order		)
> +		__field(	int,		migratetype	)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->pfn		= page ? page_to_pfn(page) : -1UL;
> +		__entry->order		= order;
> +		__entry->migratetype	= migratetype;
> +	),
> +
> +	TP_printk("page=%p pfn=0x%lx order=%d migratetype=%d",
> +		pfn_to_page(__entry->pfn), __entry->pfn,
> +		__entry->order, __entry->migratetype)
> +);
> +
>  TRACE_EVENT(mm_page_pcpu_drain,
>  
>  	TP_PROTO(struct page *page, unsigned int order, int migratetype),
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 65e205111553..a60b73ed39a4 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2544,6 +2544,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
>  		 * pages are ordered properly.
>  		 */
>  		list_add_tail(&page->pcp_list, list);
> +		trace_mm_page_pcpu_refill(page, order, migratetype);

If you're trying to trace all pages as they come onto the pcp lists,
should you also account for the free_frozen_page_commit() path?

>  	}
>  	spin_unlock_irqrestore(&zone->lock, flags);
>  
> -- 
> 2.53.0
> 

^ permalink raw reply

* Re: [PATCH v3 2/4] mm: kick writeback flusher for IOCB_DONTCACHE with targeted dirty tracking
From: Ritesh Harjani @ 2026-04-26 22:31 UTC (permalink / raw)
  To: Jeff Layton, Alexander Viro, Christian Brauner, Jan Kara,
	Matthew Wilcox (Oracle), Andrew Morton, David Hildenbrand,
	Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Mike Snitzer, Jens Axboe,
	Christoph Hellwig, Kairui Song, Qi Zheng, Shakeel Butt,
	Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Chuck Lever
  Cc: linux-fsdevel, linux-kernel, linux-nfs, linux-mm,
	linux-trace-kernel, Jeff Layton
In-Reply-To: <20260426-dontcache-v3-2-79eb37da9547@kernel.org>

Jeff Layton <jlayton@kernel.org> writes:

> The IOCB_DONTCACHE writeback path in generic_write_sync() calls
> filemap_flush_range() on every write, submitting writeback inline in
> the writer's context.  Perf lock contention profiling shows the
> performance problem is not lock contention but the writeback submission
> work itself — walking the page tree and submitting I/O blocks the writer
> for milliseconds, inflating p99.9 latency from 23ms (buffered) to 93ms
> (dontcache).
>
> Replace the inline filemap_flush_range() call with a flusher kick that
> drains dirty pages in the background.  This moves writeback submission
> completely off the writer's hot path.
>
> To avoid flushing unrelated buffered dirty data, add a dedicated
> WB_start_dontcache bit and wb_check_start_dontcache() handler that uses
> the new NR_DONTCACHE_DIRTY counter to determine how many pages to write
> back.  The flusher writes back that many pages from the oldest dirty
> inodes (not restricted to dontcache-specific inodes). This helps
> preserve I/O batching while limiting the scope of expedited writeback.
>

Yup, so we wake up the writeback flusher, which will write that many
dirty pages. The dirty pages written by writeback can be of any type,
though: DONTCACHE or normal (non-dontcache) dirty pages. IIUC, writeback
doesn't distinguish between them while writing.


IMO, what we could also include in the commit msg is why the above
approach was taken. IIUC, that is because writing NR_DONTCACHE_DIRTY
pages still reduces the page cache pressure and still reduces the
amount of work that reclaim has to do, even though some of those
pages may be non-dontcache pages if there was a parallel
buffered write in the system.


Also should the following change be documented somewhere? Like in Man
page maybe? i.e.
Earlier RWF_DONTCACHE writes made sure that those dirty pages are
immediately submitted for writeback and completion would release those
pages. But now, in certain cases when there is a mixed buffered write in
the system, those dontcache dirty pages might be written back after a
delay (whenever the next time writeback kicks in).
However for RWF_DONTCACHE reads, it should not affect anything.

> Like WB_start_all, the WB_start_dontcache bit coalesces multiple
> DONTCACHE writes into a single flusher wakeup without per-write
> allocations.
>
> Also add WB_REASON_DONTCACHE as a new writeback reason for tracing
> visibility, and target the correct cgroup writeback domain via
> unlocked_inode_to_wb_begin().
>
> dontcache-bench results on dual-socket Xeon Gold 6138 (80 CPUs, 256 GB
> RAM, Samsung MZ1LB1T9HALS 1.7 TB NVMe, local XFS, io_uring, file size
> ~503 GB, compared to a v6.19-ish baseline):
>

Can we please also test parallel buffered writes and dontcache writes? 
Since this patch series definitely affects that.

BTW - adding these numbers in the commit msg itself is very helpful.

>   Single-client sequential write (MB/s):
>                        baseline    patched     change
>   buffered              1449.8     1440.1      -0.7%
>   dontcache             1347.9     1461.5      +8.4%
>   direct                1450.0     1440.1      -0.7%
>
>   Single-client sequential write latency (us):
>                        baseline    patched     change
>   dontcache p50         3031.0    10551.3    +248.1%
>   dontcache p99        74973.2    21626.9     -71.2%
>   dontcache p99.9      85459.0    23199.7     -72.9%
>
>   Single-client random write (MB/s):
>                        baseline    patched     change
>   dontcache              284.2      295.4      +3.9%
>
>   Single-client random write p99.9 latency (us):
>                        baseline    patched     change
>   dontcache             2277.4      872.4     -61.7%
>
>   Multi-writer aggregate throughput (MB/s):

Can you please describe this test scenario, if possible? Above you
mentioned that we are writing a file_size of 2x RAM_SIZE, but your
multi-client test says something else:

local num_clients=4
+	mem_kb=$(awk '/MemTotal/ {print $2}' /proc/meminfo)
+	client_size="$(( mem_kb / 1024 / num_clients ))M"

Also, the multi-writer case is spawning parallel fio jobs, and then
parsing and aggregating the bandwidth results instead of using fio to
spawn multiple parallel threads... which is ok, but a bit weird.
Why not let fio do the aggregate bandwidth and latency calculation
instead?

>                        baseline    patched     change
>   buffered              1619.5     1611.2      -0.5%
>   dontcache             1281.1     1629.4     +27.2%
>   direct                1545.4     1609.4      +4.1%
>
>   Mixed-mode noisy neighbor (dontcache writer + buffered readers):
>                        baseline    patched     change
>   writer (MB/s)         1297.6     1471.1     +13.4%
>   readers avg (MB/s)     855.0      462.4     -45.9%
>
> nfsd-io-bench results on same hardware (XFS on NVMe, NFSv3 via fio
> NFS engine with libnfs, 1024 NFSD threads, pool_mode=pernode,
> file size ~502 GB, compared to v6.19-ish baseline):
>
>   Single-client sequential write (MB/s):
>                        baseline    patched     change
>   buffered              4844.2     4653.4      -3.9%
>   dontcache             3028.3     3723.1     +22.9%
>   direct                 957.6      987.8      +3.2%
>
>   Single-client sequential write p99.9 latency (us):
>                        baseline    patched     change
>   dontcache            759169.0   175112.2     -76.9%
>
>   Single-client random write (MB/s):
>                        baseline    patched     change
>   dontcache              590.0     1561.0    +164.6%
>
>   Multi-writer aggregate throughput (MB/s):
>                        baseline    patched     change
>   buffered              9636.3     9422.9      -2.2%
>   dontcache             1894.9     9442.6    +398.3%
>   direct                 809.6      975.1     +20.4%
>
>   Noisy neighbor (dontcache writer + random readers):
>                        baseline    patched     change
>   writer (MB/s)         1854.5     4063.6    +119.1%
>   readers avg (MB/s)     131.2      101.6     -22.5%
>
> The NFS results show even larger improvements than the local benchmarks.
> Multi-writer dontcache throughput improves nearly 5x, matching buffered
> I/O. Dirty page footprint drops 85-95% in sequential workloads vs.
> buffered.
>

Nice :)
Some explanation here of why there is a 5x improvement with NFS compared
to local filesystems, please?
(I am not much aware of NFS side, but a possible reasoning would help)

-ritesh


^ permalink raw reply

* Re: [PATCH v3 3/4] testing: add nfsd-io-bench NFS server benchmark suite
From: Ritesh Harjani @ 2026-04-26 23:54 UTC (permalink / raw)
  To: Jeff Layton, Andrew Morton
  Cc: Alexander Viro, Christian Brauner, Jan Kara,
	Matthew Wilcox (Oracle), David Hildenbrand, Lorenzo Stoakes,
	Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Mike Snitzer, Jens Axboe,
	Christoph Hellwig, Kairui Song, Qi Zheng, Shakeel Butt,
	Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Chuck Lever, linux-fsdevel,
	linux-kernel, linux-nfs, linux-mm, linux-trace-kernel, Zorro Lang
In-Reply-To: <a1e784d7006fe5d4331d41a0638be117ac67fb21.camel@kernel.org>

Jeff Layton <jlayton@kernel.org> writes:

> On Sun, 2026-04-26 at 05:34 -0700, Andrew Morton wrote:
>> So how are we to maintain this?

Maybe in xfstests? It has tests/perf/, but that has just 1 test.
Maybe others can tell whether it makes sense to maintain such fio-based
performance benchmarking scripts in there.

-ritesh

^ permalink raw reply

* [PATCH v5 0/2] blk-mq: introduce tag starvation observability
From: Aaron Tomlin @ 2026-04-27  2:01 UTC (permalink / raw)
  To: axboe, rostedt, mhiramat, mathieu.desnoyers
  Cc: bvanassche, johannes.thumshirn, kch, dlemoal, ritesh.list,
	loberman, neelx, sean, mproche, chjohnst, linux-block,
	linux-kernel, linux-trace-kernel

Hi Jens, Steve, Masami,

In high-performance storage environments, particularly when utilising RAID
controllers with shared tag sets (BLK_MQ_F_TAG_HCTX_SHARED), severe latency
spikes can occur when fast devices are starved of available tags.
Currently, diagnosing this specific queue contention requires deploying
dynamic kprobes or inferring sleep states, which lacks a simple,
out-of-the-box diagnostic path.

This short series introduces dedicated, low-overhead observability for tag
exhaustion events in the block layer:

  - Patch 1 introduces the "block_rq_tag_wait" tracepoint in the tag
    allocation slow-path to capture precise, event-based starvation.

  - Patch 2 complements this by exposing "wait_on_hw_tag" and
    "wait_on_sched_tag" per-CPU counters via debugfs for quick,
    point-in-time cumulative polling.

Together, these provide storage engineers with zero-configuration
mechanisms to definitively identify shared-tag bottlenecks.

Please let me know your thoughts.


Changes since v4 [1]:
 - Prevented a NULL pointer dereference in the tracepoint fast-assign for
   disk-less request queues by safely checking q->disk before resolving the
   dev_t

 - Fixed a Use-After-Free (UAF) and permanent memory leak by decoupling
   the per-CPU counter allocation from the volatile debugfs lifecycle and
   tying it directly to the core hctx lifecycle (i.e., blk_mq_init_hctx()
   and blk_mq_exit_hctx())

 - Fixed a potential compiler double-fetch bug by wrapping the per-CPU
   pointer evaluations with READ_ONCE() in blk_mq_debugfs_inc_wait_tags()

 - Passed the appropriate gfp_t flags down to the allocation routines to
   maintain the strict GFP_NOIO context

 - Updated kernel-doc descriptions to clarify that the NULL pointer 
   checks guard against memory allocation failures under pressure, rather 
   than initialisation race conditions

Changes since v3 [2]:
 - Transitioned tracking architecture from shared atomic_t variables to
   dynamically allocated per-CPU counters to resolve cache line bouncing
   (Bart Van Assche)

Changes since v2 [3]:
 - Added "Reviewed-by:" and "Tested-by:" tags for patch 1

 - Evaluate is_sched_tag directly within TP_fast_assign (Steven Rostedt)

 - Introduced atomic counters via debugfs 

Changes since v1 [4]:
 - Improved the description of the trace point (Damien Le Moal)

 - Removed the redundant "active requests" (Laurence Oberman)

 - Introduced pool-specific starvation tracking

[1]: https://lore.kernel.org/lkml/20260419023036.1419514-1-atomlin@atomlin.com/
[2]: https://lore.kernel.org/lkml/20260319221956.332770-1-atomlin@atomlin.com/
[3]: https://lore.kernel.org/lkml/20260319015300.287653-1-atomlin@atomlin.com/
[4]: https://lore.kernel.org/lkml/20260317182835.258183-1-atomlin@atomlin.com/


Aaron Tomlin (2):
  blk-mq: add tracepoint block_rq_tag_wait
  blk-mq: expose tag starvation counts via debugfs

 block/blk-mq-debugfs.c       | 109 +++++++++++++++++++++++++++++++++++
 block/blk-mq-debugfs.h       |  19 ++++++
 block/blk-mq-tag.c           |   8 +++
 block/blk-mq.c               |   5 ++
 include/linux/blk-mq.h       |  12 ++++
 include/trace/events/block.h |  43 ++++++++++++++
 6 files changed, 196 insertions(+)

-- 
2.51.0


^ permalink raw reply

* [PATCH v5 1/2] blk-mq: add tracepoint block_rq_tag_wait
From: Aaron Tomlin @ 2026-04-27  2:01 UTC (permalink / raw)
  To: axboe, rostedt, mhiramat, mathieu.desnoyers
  Cc: bvanassche, johannes.thumshirn, kch, dlemoal, ritesh.list,
	loberman, neelx, sean, mproche, chjohnst, linux-block,
	linux-kernel, linux-trace-kernel
In-Reply-To: <20260427020142.358912-1-atomlin@atomlin.com>

In high-performance storage environments, particularly when utilising
RAID controllers with shared tag sets (BLK_MQ_F_TAG_HCTX_SHARED), severe
latency spikes can occur when fast devices (SSDs) are starved of hardware
tags when sharing the same blk_mq_tag_set.

Currently, diagnosing this specific hardware queue contention is
difficult. When a CPU thread exhausts the tag pool, blk_mq_get_tag()
forces the current thread to block uninterruptible via io_schedule().
While this can be inferred via sched:sched_switch or dynamically
traced by attaching a kprobe to blk_mq_mark_tag_wait(), there is no
dedicated, out-of-the-box observability for this event.

This patch introduces the block_rq_tag_wait trace point in the tag
allocation slow-path. It triggers immediately before the thread yields
the CPU, exposing the exact hardware context (hctx) that is starved, the
specific pool experiencing starvation (hardware or software scheduler),
and the total pool depth.

This provides storage engineers and performance monitoring agents
with a zero-configuration, low-overhead mechanism to definitively
identify shared-tag bottlenecks and tune I/O schedulers or cgroup
throttling accordingly.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Laurence Oberman <loberman@redhat.com>
Tested-by: Laurence Oberman <loberman@redhat.com>
Signed-off-by: Aaron Tomlin <atomlin@atomlin.com>
---
 block/blk-mq-tag.c           |  4 ++++
 include/trace/events/block.h | 43 ++++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 33946cdb5716..66138dd043d4 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -13,6 +13,7 @@
 #include <linux/kmemleak.h>
 
 #include <linux/delay.h>
+#include <trace/events/block.h>
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-sched.h"
@@ -187,6 +188,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		if (tag != BLK_MQ_NO_TAG)
 			break;
 
+		trace_block_rq_tag_wait(data->q, data->hctx,
+					data->rq_flags & RQF_SCHED_TAGS);
+
 		bt_prev = bt;
 		io_schedule();
 
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 6aa79e2d799c..7c1026d1cb35 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -226,6 +226,49 @@ DECLARE_EVENT_CLASS(block_rq,
 		  IOPRIO_PRIO_LEVEL(__entry->ioprio), __entry->comm)
 );
 
+/**
+ * block_rq_tag_wait - triggered when a request is starved of a tag
+ * @q: request queue of the target device
+ * @hctx: hardware context of the request experiencing starvation
+ * @is_sched_tag: indicates whether the starved pool is the software scheduler
+ *
+ * Called immediately before the submitting context is forced to block due
+ * to the exhaustion of available tags (i.e., physical hardware driver tags
+ * or software scheduler tags). This trace point indicates that the context
+ * will be placed into an uninterruptible state via io_schedule() until an
+ * active request completes and relinquishes its assigned tag.
+ */
+TRACE_EVENT(block_rq_tag_wait,
+
+	TP_PROTO(struct request_queue *q, struct blk_mq_hw_ctx *hctx, bool is_sched_tag),
+
+	TP_ARGS(q, hctx, is_sched_tag),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( u32,		hctx_id			)
+		__field( u32,		nr_tags			)
+		__field( bool,		is_sched_tag		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= q->disk ? disk_devt(q->disk) : 0;
+		__entry->hctx_id	= hctx->queue_num;
+		__entry->is_sched_tag	= is_sched_tag;
+
+		if (is_sched_tag)
+			__entry->nr_tags = hctx->sched_tags->nr_tags;
+		else
+			__entry->nr_tags = hctx->tags->nr_tags;
+	),
+
+	TP_printk("%d,%d hctx=%u starved on %s tags (depth=%u)",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->hctx_id,
+		  __entry->is_sched_tag ? "scheduler" : "hardware",
+		  __entry->nr_tags)
+);
+
 /**
  * block_rq_insert - insert block operation request into queue
  * @rq: block IO operation request
-- 
2.51.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox