linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Mike Snitzer <snitzer@kernel.org>
To: Chuck Lever <chuck.lever@oracle.com>, Jeff Layton <jlayton@kernel.org>
Cc: linux-nfs@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	Jens Axboe <axboe@kernel.dk>
Subject: [PATCH 6/6] NFSD: issue READs using O_DIRECT even if IO is misaligned
Date: Tue, 10 Jun 2025 16:57:37 -0400	[thread overview]
Message-ID: <20250610205737.63343-7-snitzer@kernel.org> (raw)
In-Reply-To: <20250610205737.63343-1-snitzer@kernel.org>

If enable-dontcache is used, expand any misaligned READ to the next
DIO-aligned block (on either end of the READ).

Reserve an extra page in svc_serv_maxpages() because nfsd_iter_read()
might need two extra pages when a READ payload is not DIO-aligned --
but nfsd_iter_read() and nfsd_splice_actor() are mutually exclusive
(so reuse page reserved for nfsd_splice_actor).

Also add nfsd_read_vector_dio trace event. This combination of
trace events is useful:

 echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_vector/enable
 echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_vector_dio/enable
 echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_io_done/enable
 echo 1 > /sys/kernel/tracing/events/xfs/xfs_file_direct_read/enable

Which for this dd command:

 dd if=/mnt/share1/test of=/dev/null bs=47008 count=2 iflag=direct

Results in:

 nfsd-16580   [001] .....  5672.403130: nfsd_read_vector_dio: xid=0x5ccf019c fh_hash=0xe4dadb60 offset=0 len=47008 start=0+0 end=47104-96
 nfsd-16580   [001] .....  5672.403131: nfsd_read_vector: xid=0x5ccf019c fh_hash=0xe4dadb60 offset=0 len=47104
 nfsd-16580   [001] .....  5672.403134: xfs_file_direct_read: dev 253:0 ino 0x1c2388c1 disize 0x16f40 pos 0x0 bytecount 0xb800
 nfsd-16580   [001] .....  5672.404380: nfsd_read_io_done: xid=0x5ccf019c fh_hash=0xe4dadb60 offset=0 len=47008

 nfsd-16580   [001] .....  5672.404672: nfsd_read_vector_dio: xid=0x5dcf019c fh_hash=0xe4dadb60 offset=47008 len=47008 start=46592+416 end=94208-192
 nfsd-16580   [001] .....  5672.404672: nfsd_read_vector: xid=0x5dcf019c fh_hash=0xe4dadb60 offset=46592 len=47616
 nfsd-16580   [001] .....  5672.404673: xfs_file_direct_read: dev 253:0 ino 0x1c2388c1 disize 0x16f40 pos 0xb600 bytecount 0xba00
 nfsd-16580   [001] .....  5672.405771: nfsd_read_io_done: xid=0x5dcf019c fh_hash=0xe4dadb60 offset=47008 len=47008

Suggested-by: Jeff Layton <jlayton@kernel.org>
Suggested-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 fs/nfsd/trace.h            | 37 ++++++++++++++++++++++
 fs/nfsd/vfs.c              | 65 ++++++++++++++++++++++++++++----------
 include/linux/sunrpc/svc.h |  5 ++-
 3 files changed, 90 insertions(+), 17 deletions(-)

diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 3c5505ef5e3a..a46515b953f4 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -473,6 +473,43 @@ DEFINE_NFSD_IO_EVENT(write_done);
 DEFINE_NFSD_IO_EVENT(commit_start);
 DEFINE_NFSD_IO_EVENT(commit_done);
 
+TRACE_EVENT(nfsd_read_vector_dio,
+	TP_PROTO(struct svc_rqst *rqstp,
+		 struct svc_fh	*fhp,
+		 u64		offset,
+		 u32		len,
+		 loff_t         start,
+		 loff_t         start_extra,
+		 loff_t         end,
+		 loff_t         end_extra),
+	TP_ARGS(rqstp, fhp, offset, len, start, start_extra, end, end_extra),
+	TP_STRUCT__entry(
+		__field(u32, xid)
+		__field(u32, fh_hash)
+		__field(u64, offset)
+		__field(u32, len)
+		__field(loff_t, start)
+		__field(loff_t, start_extra)
+		__field(loff_t, end)
+		__field(loff_t, end_extra)
+	),
+	TP_fast_assign(
+		__entry->xid = be32_to_cpu(rqstp->rq_xid);
+		__entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+		__entry->offset = offset;
+		__entry->len = len;
+		__entry->start = start;
+		__entry->start_extra = start_extra;
+		__entry->end = end;
+		__entry->end_extra = end_extra;
+	),
+	TP_printk("xid=0x%08x fh_hash=0x%08x offset=%llu len=%u start=%llu+%llu end=%llu-%llu",
+		  __entry->xid, __entry->fh_hash,
+		  __entry->offset, __entry->len,
+		  __entry->start, __entry->start_extra,
+		  __entry->end, __entry->end_extra)
+);
+
 DECLARE_EVENT_CLASS(nfsd_err_class,
 	TP_PROTO(struct svc_rqst *rqstp,
 		 struct svc_fh	*fhp,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a942609e3ab9..be5d025b4680 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -19,6 +19,7 @@
 #include <linux/splice.h>
 #include <linux/falloc.h>
 #include <linux/fcntl.h>
+#include <linux/math.h>
 #include <linux/namei.h>
 #include <linux/delay.h>
 #include <linux/fsnotify.h>
@@ -1101,15 +1102,41 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		      unsigned int base, u32 *eof)
 {
 	struct file *file = nf->nf_file;
-	unsigned long v, total;
+	unsigned long v, total, in_count = *count;
+	loff_t start_extra = 0, end_extra = 0;
 	struct iov_iter iter;
-	loff_t ppos = offset;
+	loff_t ppos;
 	rwf_t flags = 0;
 	ssize_t host_err;
 	size_t len;
 
+	/*
+	 * If dontcache enabled, expand any misaligned READ to
+	 * the next DIO-aligned block (on either end of the READ).
+	 */
+	if (nfsd_enable_dontcache && nf->nf_dio_mem_align &&
+	    (base & (nf->nf_dio_mem_align-1)) == 0) {
+		const u32 dio_blocksize = nf->nf_dio_read_offset_align;
+		loff_t orig_end = offset + *count;
+		loff_t start = round_down(offset, dio_blocksize);
+		loff_t end = round_up(orig_end, dio_blocksize);
+
+		WARN_ON_ONCE(dio_blocksize > PAGE_SIZE);
+		start_extra = offset - start;
+		end_extra = end - orig_end;
+
+		/* Show original offset and count, and how it was expanded for DIO */
+		trace_nfsd_read_vector_dio(rqstp, fhp, offset, *count,
+					   start, start_extra, end, end_extra);
+
+		/* trace_nfsd_read_vector() will reflect larger DIO-aligned READ */
+		offset = start;
+		in_count = end - start;
+		flags |= RWF_DIRECT;
+	}
+
 	v = 0;
-	total = *count;
+	total = in_count;
 	while (total) {
 		len = min_t(size_t, total, PAGE_SIZE - base);
 		bvec_set_page(&rqstp->rq_bvec[v], *(rqstp->rq_next_page++),
@@ -1120,21 +1147,27 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	}
 	WARN_ON_ONCE(v > rqstp->rq_maxpages);
 
-	trace_nfsd_read_vector(rqstp, fhp, offset, *count);
-	iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v, *count);
-
-	if (nfsd_enable_dontcache) {
-		if (is_dio_aligned(&iter, offset, nf->nf_dio_read_offset_align))
-			flags |= RWF_DIRECT;
-		/* FIXME: not using RWF_DONTCACHE for misaligned IO because it works
-		 * against us (due to RMW needing to read without benefit of cache),
-		 * whereas buffered IO enables misaligned IO to be more performant.
-		 */
-		//else
-		//	flags |= RWF_DONTCACHE;
-	}
+	trace_nfsd_read_vector(rqstp, fhp, offset, in_count);
+	iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v, in_count);
 
+	ppos = offset;
 	host_err = vfs_iter_read(file, &iter, &ppos, flags);
+
+	if ((start_extra || end_extra) && host_err >= 0) {
+		rqstp->rq_bvec[0].bv_offset += start_extra;
+		rqstp->rq_bvec[0].bv_len -= start_extra;
+		rqstp->rq_bvec[v].bv_len -= end_extra;
+		/* Must adjust returned read size to reflect original extent */
+		offset += start_extra;
+		if (likely(host_err >= start_extra)) {
+			host_err -= start_extra;
+			if (host_err > *count)
+				host_err = *count;
+		} else {
+			/* Short read that didn't read any of requested data */
+			host_err = 0;
+		}
+	}
 	return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
 }
 
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 46f7991cea58..52f5c9ec35aa 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -163,10 +163,13 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp);
  * pages, one for the request, and one for the reply.
  * nfsd_splice_actor() might need an extra page when a READ payload
  * is not page-aligned.
+ * nfsd_iter_read() might need two extra pages when a READ payload
+ * is not DIO-aligned -- but nfsd_iter_read() and nfsd_splice_actor()
+ * are mutually exclusive.
  */
 static inline unsigned long svc_serv_maxpages(const struct svc_serv *serv)
 {
-	return DIV_ROUND_UP(serv->sv_max_mesg, PAGE_SIZE) + 2 + 1;
+	return DIV_ROUND_UP(serv->sv_max_mesg, PAGE_SIZE) + 2 + 1 + 1;
 }
 
 /*
-- 
2.44.0


  parent reply	other threads:[~2025-06-10 20:57 UTC|newest]

Thread overview: 75+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-06-10 20:57 [PATCH 0/6] NFSD: add enable-dontcache and initially use it to add DIO support Mike Snitzer
2025-06-10 20:57 ` [PATCH 1/6] NFSD: add the ability to enable use of RWF_DONTCACHE for all IO Mike Snitzer
2025-06-11  6:57   ` Christoph Hellwig
2025-06-11 10:44     ` Mike Snitzer
2025-06-11 13:04       ` Jeff Layton
2025-06-11 13:56     ` Chuck Lever
2025-06-11 14:31   ` Chuck Lever
2025-06-11 19:18     ` Mike Snitzer
2025-06-11 20:29       ` Jeff Layton
2025-06-11 21:36         ` need SUNRPC TCP to receive into aligned pages [was: Re: [PATCH 1/6] NFSD: add the ability to enable use of RWF_DONTCACHE for all IO] Mike Snitzer
2025-06-12 10:28           ` Jeff Layton
2025-06-12 11:28             ` Jeff Layton
2025-06-12 13:28             ` Chuck Lever
2025-06-12 14:17               ` Benjamin Coddington
2025-06-12 15:56                 ` Mike Snitzer
2025-06-12 15:58                   ` Chuck Lever
2025-06-12 16:12                     ` Mike Snitzer
2025-06-12 16:32                       ` Chuck Lever
2025-06-13  5:39                     ` Christoph Hellwig
2025-06-12 16:22               ` Jeff Layton
2025-06-13  5:46                 ` Christoph Hellwig
2025-06-13  9:23                   ` Mike Snitzer
2025-06-13 13:02                     ` Jeff Layton
2025-06-16 12:35                       ` Christoph Hellwig
2025-06-16 12:29                     ` Christoph Hellwig
2025-06-16 16:07                       ` Mike Snitzer
2025-06-17  4:37                         ` Christoph Hellwig
2025-06-17 20:26                           ` Mike Snitzer
2025-06-17 22:23                             ` [RFC PATCH] lib/iov_iter: remove piecewise bvec length checking in iov_iter_aligned_bvec [was: Re: need SUNRPC TCP to receive into aligned pages] Mike Snitzer
2025-07-03  0:12             ` need SUNRPC TCP to receive into aligned pages [was: Re: [PATCH 1/6] NFSD: add the ability to enable use of RWF_DONTCACHE for all IO] NeilBrown
2025-06-12  7:13         ` [PATCH 1/6] NFSD: add the ability to enable use of RWF_DONTCACHE for all IO Christoph Hellwig
2025-06-12 13:15           ` Chuck Lever
2025-06-12 13:21       ` Chuck Lever
2025-06-12 16:00         ` Mike Snitzer
2025-06-16 13:32           ` Chuck Lever
2025-06-16 16:10             ` Mike Snitzer
2025-06-17 17:22               ` Mike Snitzer
2025-06-17 17:31                 ` Chuck Lever
2025-06-19 20:19                   ` Mike Snitzer
2025-06-30 14:50                     ` Chuck Lever
2025-07-04 19:46                       ` Mike Snitzer
2025-07-04 19:49                         ` Chuck Lever
2025-06-10 20:57 ` [PATCH 2/6] NFSD: filecache: add STATX_DIOALIGN and STATX_DIO_READ_ALIGN support Mike Snitzer
2025-06-10 20:57 ` [PATCH 3/6] NFSD: pass nfsd_file to nfsd_iter_read() Mike Snitzer
2025-06-10 20:57 ` [PATCH 4/6] fs: introduce RWF_DIRECT to allow using O_DIRECT on a per-IO basis Mike Snitzer
2025-06-11  6:58   ` Christoph Hellwig
2025-06-11 10:51     ` Mike Snitzer
2025-06-11 14:17     ` Chuck Lever
2025-06-12  7:15       ` Christoph Hellwig
2025-06-10 20:57 ` [PATCH 5/6] NFSD: leverage DIO alignment to selectively issue O_DIRECT reads and writes Mike Snitzer
2025-06-11  7:00   ` Christoph Hellwig
2025-06-11 12:23     ` Mike Snitzer
2025-06-11 13:30       ` Jeff Layton
2025-06-12  7:22         ` Christoph Hellwig
2025-06-12  7:23       ` Christoph Hellwig
2025-06-11 14:42   ` Chuck Lever
2025-06-11 15:07     ` Jeff Layton
2025-06-11 15:11       ` Chuck Lever
2025-06-11 15:44         ` Jeff Layton
2025-06-11 20:51           ` Mike Snitzer
2025-06-12  7:32           ` Christoph Hellwig
2025-06-12  7:28         ` Christoph Hellwig
2025-06-12  7:25       ` Christoph Hellwig
2025-06-10 20:57 ` Mike Snitzer [this message]
2025-06-11 12:55 ` [PATCH 0/6] NFSD: add enable-dontcache and initially use it to add DIO support Jeff Layton
2025-06-12  7:39   ` Christoph Hellwig
2025-06-12 20:37     ` Mike Snitzer
2025-06-13  5:31       ` Christoph Hellwig
2025-06-11 14:16 ` Chuck Lever
2025-06-11 18:02   ` Mike Snitzer
2025-06-11 19:06     ` Chuck Lever
2025-06-11 19:58       ` Mike Snitzer
2025-06-12 13:46 ` Chuck Lever
2025-06-12 19:08   ` Mike Snitzer
2025-06-12 20:17     ` Chuck Lever

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250610205737.63343-7-snitzer@kernel.org \
    --to=snitzer@kernel.org \
    --cc=axboe@kernel.dk \
    --cc=chuck.lever@oracle.com \
    --cc=jlayton@kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-nfs@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).