public inbox for linux-fsdevel@vger.kernel.org
 help / color / mirror / Atom feed
* A comparison of the new nfsd iomodes (and an experimental one)
@ 2026-03-26 15:23 Jeff Layton
  2026-03-26 15:30 ` Chuck Lever
  0 siblings, 1 reply; 8+ messages in thread
From: Jeff Layton @ 2026-03-26 15:23 UTC (permalink / raw)
  To: linux-nfs, linux-fsdevel, linux-block
  Cc: Chuck Lever, Mike Snitzer, Jens Axboe

I've been doing some benchmarking of the new nfsd iomodes, using
different fio-based workloads.

The results have been interesting, but one thing that stands out is
that RWF_DONTCACHE is absolutely terrible for streaming write
workloads. That prompted me to experiment with a new iomode that added
some optimizations (DONTCACHE_LAZY).

The results along with Claude's analysis are here:

    https://markdownpastebin.com/?id=387375d00b5443b3a2e37d58a062331f

He gets a bit out over his skis on the upstream plan, but tl;dr is that
DONTCACHE_LAZY (which is DONTCACHE with some optimizations) outperforms
the other write iomodes.

The core DONTCACHE_LAZY patch is below. I doubt we'll want a new iomode
long-term. What we'll probably want to do is modify DONTCACHE to work
like DONTCACHE_LAZY:

-------------------8<-------------------

[PATCH] mm: add IOCB_DONTCACHE_LAZY and RWF_DONTCACHE_LAZY

IOCB_DONTCACHE flushes all dirty pages on every write via
filemap_flush_range() with nr_to_write=LONG_MAX.  Under concurrent
writers, this creates severe serialization: every writer contends on
the writeback submission path, leading to catastrophic throughput
collapse (~1 GB/s vs ~10 GB/s for buffered) and multi-second tail
latency.

Add IOCB_DONTCACHE_LAZY as a gentler alternative with two mechanisms:

 1. Skip-if-busy: check mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)
    flushing.  If writeback is already in progress on the mapping, the
    flush is skipped entirely, eliminating writeback submission
    contention between concurrent writers.

 2. Proportional cap: when flushing does occur, cap nr_to_write to the
    number of pages just written.  This prevents any single write from
    triggering a full-file flush that would starve concurrent readers.

Together these mechanisms rate-limit writeback to match the incoming
write rate while avoiding I/O bursts that cause tail latency spikes.

Like IOCB_DONTCACHE, pages touched under IOCB_DONTCACHE_LAZY are
marked for eviction (dropbehind) to keep page cache usage bounded.

Also add RWF_DONTCACHE_LAZY (0x200) as a user-visible pwritev2/io_uring
flag that maps to IOCB_DONTCACHE_LAZY.  The flag follows the same
validation as RWF_DONTCACHE: the filesystem must support FOP_DONTCACHE,
DAX is not supported, and RWF_DONTCACHE and RWF_DONTCACHE_LAZY are
mutually exclusive.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/iomap/buffered-io.c  |  2 +-
 include/linux/fs.h      | 18 ++++++++++++++++--
 include/linux/pagemap.h |  2 +-
 include/uapi/linux/fs.h |  6 +++++-
 mm/filemap.c            | 40 +++++++++++++++++++++++++++++++++++++---
 5 files changed, 60 insertions(+), 8 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index e3bedcbb5f1ea..069d4378bf457 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1185,7 +1185,7 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
 
 	if (iocb->ki_flags & IOCB_NOWAIT)
 		iter.flags |= IOMAP_NOWAIT;
-	if (iocb->ki_flags & IOCB_DONTCACHE)
+	if (iocb->ki_flags & (IOCB_DONTCACHE | IOCB_DONTCACHE_LAZY))
 		iter.flags |= IOMAP_DONTCACHE;
 
 	while ((ret = iomap_iter(&iter, ops)) > 0)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 94695ce5e25b5..04ff531473e82 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -359,6 +359,7 @@ struct readahead_control;
 /* kiocb is a read or write operation submitted by fs/aio.c. */
 #define IOCB_AIO_RW		(1 << 22)
 #define IOCB_HAS_METADATA	(1 << 23)
+#define IOCB_DONTCACHE_LAZY	(__force int) RWF_DONTCACHE_LAZY
 
 /* for use in trace events */
 #define TRACE_IOCB_STRINGS \
@@ -376,7 +377,8 @@ struct readahead_control;
 	{ IOCB_NOIO,		"NOIO" }, \
 	{ IOCB_ALLOC_CACHE,	"ALLOC_CACHE" }, \
 	{ IOCB_AIO_RW,		"AIO_RW" }, \
-	{ IOCB_HAS_METADATA,	"AIO_HAS_METADATA" }
+	{ IOCB_HAS_METADATA,	"AIO_HAS_METADATA" }, \
+	{ IOCB_DONTCACHE_LAZY,	"DONTCACHE_LAZY" }
 
 struct kiocb {
 	struct file		*ki_filp;
@@ -2589,6 +2591,8 @@ extern int __must_check file_write_and_wait_range(struct file *file,
 						loff_t start, loff_t end);
 int filemap_flush_range(struct address_space *mapping, loff_t start,
 		loff_t end);
+int filemap_dontcache_writeback_range(struct address_space *mapping,
+		loff_t start, loff_t end, ssize_t nr_written);
 
 static inline int file_write_and_wait(struct file *file)
 {
@@ -2626,6 +2630,12 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
 
 		filemap_flush_range(mapping, iocb->ki_pos - count,
 				iocb->ki_pos - 1);
+	} else if (iocb->ki_flags & IOCB_DONTCACHE_LAZY) {
+		struct address_space *mapping = iocb->ki_filp->f_mapping;
+
+		filemap_dontcache_writeback_range(mapping,
+				iocb->ki_pos - count,
+				iocb->ki_pos - 1, count);
 	}
 
 	return count;
@@ -3393,13 +3403,17 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags,
 		if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE))
 			return -EOPNOTSUPP;
 	}
-	if (flags & RWF_DONTCACHE) {
+	if (flags & (RWF_DONTCACHE | RWF_DONTCACHE_LAZY)) {
 		/* file system must support it */
 		if (!(ki->ki_filp->f_op->fop_flags & FOP_DONTCACHE))
 			return -EOPNOTSUPP;
 		/* DAX mappings not supported */
 		if (IS_DAX(ki->ki_filp->f_mapping->host))
 			return -EOPNOTSUPP;
+		/* can't use both at once */
+		if ((flags & (RWF_DONTCACHE | RWF_DONTCACHE_LAZY)) ==
+		    (RWF_DONTCACHE | RWF_DONTCACHE_LAZY))
+			return -EINVAL;
 	}
 	kiocb_flags |= (__force int) (flags & RWF_SUPPORTED);
 	if (flags & RWF_SYNC)
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 9f5c4e8b4a7d3..3539a7b4ed53c 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -798,7 +798,7 @@ static inline struct folio *write_begin_get_folio(const struct kiocb *iocb,
 
         fgp_flags |= fgf_set_order(len);
 
-        if (iocb && iocb->ki_flags & IOCB_DONTCACHE)
+        if (iocb && iocb->ki_flags & (IOCB_DONTCACHE | IOCB_DONTCACHE_LAZY))
                 fgp_flags |= FGP_DONTCACHE;
 
         return __filemap_get_folio(mapping, index, fgp_flags,
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 66ca526cf786c..74f7c75901e0c 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -434,10 +434,14 @@ typedef int __bitwise __kernel_rwf_t;
 /* prevent pipe and socket writes from raising SIGPIPE */
 #define RWF_NOSIGNAL	((__force __kernel_rwf_t)0x00000100)
 
+/* buffered IO that drops the cache after reading or writing data,
+ * with rate-limited writeback (skip if writeback already in progress) */
+#define RWF_DONTCACHE_LAZY	((__force __kernel_rwf_t)0x00000200)
+
 /* mask of flags supported by the kernel */
 #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
 			 RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC |\
-			 RWF_DONTCACHE | RWF_NOSIGNAL)
+			 RWF_DONTCACHE | RWF_NOSIGNAL | RWF_DONTCACHE_LAZY)
 
 #define PROCFS_IOCTL_MAGIC 'f'
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 9697e12dfbdcc..448bee3f3f1ce 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -440,6 +440,40 @@ int filemap_flush_range(struct address_space *mapping, loff_t start,
 }
 EXPORT_SYMBOL_GPL(filemap_flush_range);
 
+/**
+ * filemap_dontcache_writeback_range - rate-limited writeback for dontcache I/O
+ * @mapping:	target address_space
+ * @start:	byte offset to start writeback
+ * @end:	byte offset to end writeback (inclusive)
+ * @nr_written:	number of bytes just written by the caller
+ *
+ * Kick writeback for dontcache I/O, but avoid piling on if writeback is
+ * already in progress.  When writeback is kicked, limit the number of pages
+ * submitted to be proportional to the amount just written, rather than
+ * flushing the entire dirty range.
+ *
+ * This reduces tail latency compared to filemap_flush_range() which submits
+ * writeback for all dirty pages on every call, creating queue contention
+ * under concurrent writers.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int filemap_dontcache_writeback_range(struct address_space *mapping,
+				      loff_t start, loff_t end,
+				      ssize_t nr_written)
+{
+	long nr;
+
+	/* If writeback is already active, don't pile on */
+	if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
+		return 0;
+
+	nr = (nr_written + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	return filemap_writeback(mapping, start, end, WB_SYNC_NONE, &nr,
+			WB_REASON_BACKGROUND);
+}
+EXPORT_SYMBOL_GPL(filemap_dontcache_writeback_range);
+
 /**
  * filemap_flush - mostly a non-blocking flush
  * @mapping:	target address_space
@@ -2633,7 +2667,7 @@ static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch)
 	folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order, NULL);
 	if (!folio)
 		return -ENOMEM;
-	if (iocb->ki_flags & IOCB_DONTCACHE)
+	if (iocb->ki_flags & (IOCB_DONTCACHE | IOCB_DONTCACHE_LAZY))
 		__folio_set_dropbehind(folio);
 
 	/*
@@ -2680,7 +2714,7 @@ static int filemap_readahead(struct kiocb *iocb, struct file *file,
 
 	if (iocb->ki_flags & IOCB_NOIO)
 		return -EAGAIN;
-	if (iocb->ki_flags & IOCB_DONTCACHE)
+	if (iocb->ki_flags & (IOCB_DONTCACHE | IOCB_DONTCACHE_LAZY))
 		ractl.dropbehind = 1;
 	page_cache_async_ra(&ractl, folio, last_index - folio->index);
 	return 0;
@@ -2712,7 +2746,7 @@ static int filemap_get_pages(struct kiocb *iocb, size_t count,
 			return -EAGAIN;
 		if (iocb->ki_flags & IOCB_NOWAIT)
 			flags = memalloc_noio_save();
-		if (iocb->ki_flags & IOCB_DONTCACHE)
+		if (iocb->ki_flags & (IOCB_DONTCACHE | IOCB_DONTCACHE_LAZY))
 			ractl.dropbehind = 1;
 		page_cache_sync_ra(&ractl, last_index - index);
 		if (iocb->ki_flags & IOCB_NOWAIT)

^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2026-03-28 12:37 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-03-26 15:23 A comparison of the new nfsd iomodes (and an experimental one) Jeff Layton
2026-03-26 15:30 ` Chuck Lever
2026-03-26 16:35   ` Jeff Layton
2026-03-26 20:48     ` Mike Snitzer
2026-03-27 11:32       ` Jeff Layton
2026-03-27 13:19         ` Chuck Lever
2026-03-27 16:57           ` Mike Snitzer
2026-03-28 12:37             ` Jeff Layton

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox