From: Mike Snitzer <snitzer@kernel.org>
To: Chuck Lever <chuck.lever@oracle.com>, Jeff Layton <jlayton@kernel.org>
Cc: linux-nfs@vger.kernel.org, linux-fsdevel@vger.kernel.org,
Jens Axboe <axboe@kernel.dk>
Subject: [PATCH 6/6] NFSD: issue READs using O_DIRECT even if IO is misaligned
Date: Tue, 10 Jun 2025 16:57:37 -0400 [thread overview]
Message-ID: <20250610205737.63343-7-snitzer@kernel.org> (raw)
In-Reply-To: <20250610205737.63343-1-snitzer@kernel.org>
If enable-dontcache is used, expand any misaligned READ to the next
DIO-aligned block (on either end of the READ).
Reserve one additional page in svc_serv_maxpages(): nfsd_iter_read()
might need two extra pages when a READ payload is not DIO-aligned, but
because nfsd_iter_read() and nfsd_splice_actor() are mutually exclusive,
one of those two pages can be the page already reserved for
nfsd_splice_actor().
Also add the nfsd_read_vector_dio trace event. Enabling the following
combination of trace events is useful:
echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_vector/enable
echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_vector_dio/enable
echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_io_done/enable
echo 1 > /sys/kernel/tracing/events/xfs/xfs_file_direct_read/enable
For example, with those events enabled, this dd command:
dd if=/mnt/share1/test of=/dev/null bs=47008 count=2 iflag=direct
results in the following trace output:
nfsd-16580 [001] ..... 5672.403130: nfsd_read_vector_dio: xid=0x5ccf019c fh_hash=0xe4dadb60 offset=0 len=47008 start=0+0 end=47104-96
nfsd-16580 [001] ..... 5672.403131: nfsd_read_vector: xid=0x5ccf019c fh_hash=0xe4dadb60 offset=0 len=47104
nfsd-16580 [001] ..... 5672.403134: xfs_file_direct_read: dev 253:0 ino 0x1c2388c1 disize 0x16f40 pos 0x0 bytecount 0xb800
nfsd-16580 [001] ..... 5672.404380: nfsd_read_io_done: xid=0x5ccf019c fh_hash=0xe4dadb60 offset=0 len=47008
nfsd-16580 [001] ..... 5672.404672: nfsd_read_vector_dio: xid=0x5dcf019c fh_hash=0xe4dadb60 offset=47008 len=47008 start=46592+416 end=94208-192
nfsd-16580 [001] ..... 5672.404672: nfsd_read_vector: xid=0x5dcf019c fh_hash=0xe4dadb60 offset=46592 len=47616
nfsd-16580 [001] ..... 5672.404673: xfs_file_direct_read: dev 253:0 ino 0x1c2388c1 disize 0x16f40 pos 0xb600 bytecount 0xba00
nfsd-16580 [001] ..... 5672.405771: nfsd_read_io_done: xid=0x5dcf019c fh_hash=0xe4dadb60 offset=47008 len=47008
Suggested-by: Jeff Layton <jlayton@kernel.org>
Suggested-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
fs/nfsd/trace.h | 37 ++++++++++++++++++++++
fs/nfsd/vfs.c | 65 ++++++++++++++++++++++++++++----------
include/linux/sunrpc/svc.h | 5 ++-
3 files changed, 90 insertions(+), 17 deletions(-)
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 3c5505ef5e3a..a46515b953f4 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -473,6 +473,43 @@ DEFINE_NFSD_IO_EVENT(write_done);
DEFINE_NFSD_IO_EVENT(commit_start);
DEFINE_NFSD_IO_EVENT(commit_done);
+TRACE_EVENT(nfsd_read_vector_dio,
+ TP_PROTO(struct svc_rqst *rqstp,
+ struct svc_fh *fhp,
+ u64 offset,
+ u32 len,
+ loff_t start,
+ loff_t start_extra,
+ loff_t end,
+ loff_t end_extra),
+ TP_ARGS(rqstp, fhp, offset, len, start, start_extra, end, end_extra),
+ TP_STRUCT__entry(
+ __field(u32, xid)
+ __field(u32, fh_hash)
+ __field(u64, offset)
+ __field(u32, len)
+ __field(loff_t, start)
+ __field(loff_t, start_extra)
+ __field(loff_t, end)
+ __field(loff_t, end_extra)
+ ),
+ TP_fast_assign(
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+ __entry->offset = offset;
+ __entry->len = len;
+ __entry->start = start;
+ __entry->start_extra = start_extra;
+ __entry->end = end;
+ __entry->end_extra = end_extra;
+ ),
+ TP_printk("xid=0x%08x fh_hash=0x%08x offset=%llu len=%u start=%llu+%llu end=%llu-%llu",
+ __entry->xid, __entry->fh_hash,
+ __entry->offset, __entry->len,
+ __entry->start, __entry->start_extra,
+ __entry->end, __entry->end_extra)
+);
+
DECLARE_EVENT_CLASS(nfsd_err_class,
TP_PROTO(struct svc_rqst *rqstp,
struct svc_fh *fhp,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a942609e3ab9..be5d025b4680 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -19,6 +19,7 @@
#include <linux/splice.h>
#include <linux/falloc.h>
#include <linux/fcntl.h>
+#include <linux/math.h>
#include <linux/namei.h>
#include <linux/delay.h>
#include <linux/fsnotify.h>
@@ -1101,15 +1102,41 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int base, u32 *eof)
{
struct file *file = nf->nf_file;
- unsigned long v, total;
+ unsigned long v, total, in_count = *count;
+ loff_t start_extra = 0, end_extra = 0;
struct iov_iter iter;
- loff_t ppos = offset;
+ loff_t ppos;
rwf_t flags = 0;
ssize_t host_err;
size_t len;
+ /*
+ * If dontcache enabled, expand any misaligned READ to
+ * the next DIO-aligned block (on either end of the READ).
+ */
+ if (nfsd_enable_dontcache && nf->nf_dio_mem_align &&
+ (base & (nf->nf_dio_mem_align-1)) == 0) {
+ const u32 dio_blocksize = nf->nf_dio_read_offset_align;
+ loff_t orig_end = offset + *count;
+ loff_t start = round_down(offset, dio_blocksize);
+ loff_t end = round_up(orig_end, dio_blocksize);
+
+ WARN_ON_ONCE(dio_blocksize > PAGE_SIZE);
+ start_extra = offset - start;
+ end_extra = end - orig_end;
+
+ /* Show original offset and count, and how it was expanded for DIO */
+ trace_nfsd_read_vector_dio(rqstp, fhp, offset, *count,
+ start, start_extra, end, end_extra);
+
+ /* trace_nfsd_read_vector() will reflect larger DIO-aligned READ */
+ offset = start;
+ in_count = end - start;
+ flags |= RWF_DIRECT;
+ }
+
v = 0;
- total = *count;
+ total = in_count;
while (total) {
len = min_t(size_t, total, PAGE_SIZE - base);
bvec_set_page(&rqstp->rq_bvec[v], *(rqstp->rq_next_page++),
@@ -1120,21 +1147,27 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
WARN_ON_ONCE(v > rqstp->rq_maxpages);
- trace_nfsd_read_vector(rqstp, fhp, offset, *count);
- iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v, *count);
-
- if (nfsd_enable_dontcache) {
- if (is_dio_aligned(&iter, offset, nf->nf_dio_read_offset_align))
- flags |= RWF_DIRECT;
- /* FIXME: not using RWF_DONTCACHE for misaligned IO because it works
- * against us (due to RMW needing to read without benefit of cache),
- * whereas buffered IO enables misaligned IO to be more performant.
- */
- //else
- // flags |= RWF_DONTCACHE;
- }
+ trace_nfsd_read_vector(rqstp, fhp, offset, in_count);
+ iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v, in_count);
+ ppos = offset;
host_err = vfs_iter_read(file, &iter, &ppos, flags);
+
+ if ((start_extra || end_extra) && host_err >= 0) {
+ rqstp->rq_bvec[0].bv_offset += start_extra;
+ rqstp->rq_bvec[0].bv_len -= start_extra;
+ rqstp->rq_bvec[v].bv_len -= end_extra;
+ /* Must adjust returned read size to reflect original extent */
+ offset += start_extra;
+ if (likely(host_err >= start_extra)) {
+ host_err -= start_extra;
+ if (host_err > *count)
+ host_err = *count;
+ } else {
+ /* Short read that didn't read any of requested data */
+ host_err = 0;
+ }
+ }
return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
}
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 46f7991cea58..52f5c9ec35aa 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -163,10 +163,13 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp);
* pages, one for the request, and one for the reply.
* nfsd_splice_actor() might need an extra page when a READ payload
* is not page-aligned.
+ * nfsd_iter_read() might need two extra pages when a READ payload
+ * is not DIO-aligned -- but nfsd_iter_read() and nfsd_splice_actor()
+ * are mutually exclusive.
*/
static inline unsigned long svc_serv_maxpages(const struct svc_serv *serv)
{
- return DIV_ROUND_UP(serv->sv_max_mesg, PAGE_SIZE) + 2 + 1;
+ return DIV_ROUND_UP(serv->sv_max_mesg, PAGE_SIZE) + 2 + 1 + 1;
}
/*
--
2.44.0
next prev parent reply other threads:[~2025-06-10 20:57 UTC|newest]
Thread overview: 75+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-06-10 20:57 [PATCH 0/6] NFSD: add enable-dontcache and initially use it to add DIO support Mike Snitzer
2025-06-10 20:57 ` [PATCH 1/6] NFSD: add the ability to enable use of RWF_DONTCACHE for all IO Mike Snitzer
2025-06-11 6:57 ` Christoph Hellwig
2025-06-11 10:44 ` Mike Snitzer
2025-06-11 13:04 ` Jeff Layton
2025-06-11 13:56 ` Chuck Lever
2025-06-11 14:31 ` Chuck Lever
2025-06-11 19:18 ` Mike Snitzer
2025-06-11 20:29 ` Jeff Layton
2025-06-11 21:36 ` need SUNRPC TCP to receive into aligned pages [was: Re: [PATCH 1/6] NFSD: add the ability to enable use of RWF_DONTCACHE for all IO] Mike Snitzer
2025-06-12 10:28 ` Jeff Layton
2025-06-12 11:28 ` Jeff Layton
2025-06-12 13:28 ` Chuck Lever
2025-06-12 14:17 ` Benjamin Coddington
2025-06-12 15:56 ` Mike Snitzer
2025-06-12 15:58 ` Chuck Lever
2025-06-12 16:12 ` Mike Snitzer
2025-06-12 16:32 ` Chuck Lever
2025-06-13 5:39 ` Christoph Hellwig
2025-06-12 16:22 ` Jeff Layton
2025-06-13 5:46 ` Christoph Hellwig
2025-06-13 9:23 ` Mike Snitzer
2025-06-13 13:02 ` Jeff Layton
2025-06-16 12:35 ` Christoph Hellwig
2025-06-16 12:29 ` Christoph Hellwig
2025-06-16 16:07 ` Mike Snitzer
2025-06-17 4:37 ` Christoph Hellwig
2025-06-17 20:26 ` Mike Snitzer
2025-06-17 22:23 ` [RFC PATCH] lib/iov_iter: remove piecewise bvec length checking in iov_iter_aligned_bvec [was: Re: need SUNRPC TCP to receive into aligned pages] Mike Snitzer
2025-07-03 0:12 ` need SUNRPC TCP to receive into aligned pages [was: Re: [PATCH 1/6] NFSD: add the ability to enable use of RWF_DONTCACHE for all IO] NeilBrown
2025-06-12 7:13 ` [PATCH 1/6] NFSD: add the ability to enable use of RWF_DONTCACHE for all IO Christoph Hellwig
2025-06-12 13:15 ` Chuck Lever
2025-06-12 13:21 ` Chuck Lever
2025-06-12 16:00 ` Mike Snitzer
2025-06-16 13:32 ` Chuck Lever
2025-06-16 16:10 ` Mike Snitzer
2025-06-17 17:22 ` Mike Snitzer
2025-06-17 17:31 ` Chuck Lever
2025-06-19 20:19 ` Mike Snitzer
2025-06-30 14:50 ` Chuck Lever
2025-07-04 19:46 ` Mike Snitzer
2025-07-04 19:49 ` Chuck Lever
2025-06-10 20:57 ` [PATCH 2/6] NFSD: filecache: add STATX_DIOALIGN and STATX_DIO_READ_ALIGN support Mike Snitzer
2025-06-10 20:57 ` [PATCH 3/6] NFSD: pass nfsd_file to nfsd_iter_read() Mike Snitzer
2025-06-10 20:57 ` [PATCH 4/6] fs: introduce RWF_DIRECT to allow using O_DIRECT on a per-IO basis Mike Snitzer
2025-06-11 6:58 ` Christoph Hellwig
2025-06-11 10:51 ` Mike Snitzer
2025-06-11 14:17 ` Chuck Lever
2025-06-12 7:15 ` Christoph Hellwig
2025-06-10 20:57 ` [PATCH 5/6] NFSD: leverage DIO alignment to selectively issue O_DIRECT reads and writes Mike Snitzer
2025-06-11 7:00 ` Christoph Hellwig
2025-06-11 12:23 ` Mike Snitzer
2025-06-11 13:30 ` Jeff Layton
2025-06-12 7:22 ` Christoph Hellwig
2025-06-12 7:23 ` Christoph Hellwig
2025-06-11 14:42 ` Chuck Lever
2025-06-11 15:07 ` Jeff Layton
2025-06-11 15:11 ` Chuck Lever
2025-06-11 15:44 ` Jeff Layton
2025-06-11 20:51 ` Mike Snitzer
2025-06-12 7:32 ` Christoph Hellwig
2025-06-12 7:28 ` Christoph Hellwig
2025-06-12 7:25 ` Christoph Hellwig
2025-06-10 20:57 ` Mike Snitzer [this message]
2025-06-11 12:55 ` [PATCH 0/6] NFSD: add enable-dontcache and initially use it to add DIO support Jeff Layton
2025-06-12 7:39 ` Christoph Hellwig
2025-06-12 20:37 ` Mike Snitzer
2025-06-13 5:31 ` Christoph Hellwig
2025-06-11 14:16 ` Chuck Lever
2025-06-11 18:02 ` Mike Snitzer
2025-06-11 19:06 ` Chuck Lever
2025-06-11 19:58 ` Mike Snitzer
2025-06-12 13:46 ` Chuck Lever
2025-06-12 19:08 ` Mike Snitzer
2025-06-12 20:17 ` Chuck Lever
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250610205737.63343-7-snitzer@kernel.org \
--to=snitzer@kernel.org \
--cc=axboe@kernel.dk \
--cc=chuck.lever@oracle.com \
--cc=jlayton@kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-nfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).