From: Mike Snitzer <snitzer@kernel.org>
To: Chuck Lever <chuck.lever@oracle.com>, Jeff Layton <jlayton@kernel.org>
Cc: linux-nfs@vger.kernel.org
Subject: [PATCH v3 3/4] NFSD: issue WRITEs using O_DIRECT even if IO is misaligned
Date: Thu, 31 Jul 2025 19:06:32 -0400 [thread overview]
Message-ID: <20250731230633.89983-4-snitzer@kernel.org> (raw)
In-Reply-To: <20250731230633.89983-1-snitzer@kernel.org>
If NFSD_IO_DIRECT is used, split any misaligned WRITE into a start,
middle and end as needed. The large middle extent is DIO-aligned and
the start and/or end are misaligned. Buffered IO is used for the
misaligned extents and O_DIRECT is used for the middle DIO-aligned
extent.
The nfsd_analyze_write_dio trace event shows how NFSD splits a given
misaligned WRITE into a mix of misaligned extent(s) and a DIO-aligned
extent.
This combination of trace events is useful:
echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_write_opened/enable
echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_analyze_write_dio/enable
echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_write_io_done/enable
echo 1 > /sys/kernel/tracing/events/xfs/xfs_file_direct_write/enable
With those events enabled, this dd command:
dd if=/dev/zero of=/mnt/share1/test bs=47008 count=2 oflag=direct
results in:
nfsd-55714 [043] ..... 79976.260851: nfsd_write_opened: xid=0x966c5d2d fh_hash=0x4d34e6c1 offset=0 len=47008
nfsd-55714 [043] ..... 79976.260852: nfsd_analyze_write_dio: xid=0x966c5d2d fh_hash=0x4d34e6c1 offset=0 len=47008 start=0+0 middle=0+45056 end=45056+1952
nfsd-55714 [043] ..... 79976.260857: xfs_file_direct_write: dev 259:12 ino 0x3e00008f disize 0x0 pos 0x0 bytecount 0xb000
nfsd-55714 [043] ..... 79976.260965: nfsd_write_io_done: xid=0x966c5d2d fh_hash=0x4d34e6c1 offset=0 len=47008
nfsd-55714 [043] ..... 79976.307762: nfsd_write_opened: xid=0x67e5ce6f fh_hash=0x4d34e6c1 offset=47008 len=47008
nfsd-55714 [043] ..... 79976.307762: nfsd_analyze_write_dio: xid=0x67e5ce6f fh_hash=0x4d34e6c1 offset=47008 len=47008 start=47008+2144 middle=49152+40960 end=90112+3904
nfsd-55714 [043] ..... 79976.307797: xfs_file_direct_write: dev 259:12 ino 0x3e00008f disize 0xc000 pos 0xc000 bytecount 0xa000
nfsd-55714 [043] ..... 79976.307866: nfsd_write_io_done: xid=0x67e5ce6f fh_hash=0x4d34e6c1 offset=47008 len=47008
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
fs/nfsd/vfs.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 124 insertions(+), 11 deletions(-)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index edac73349da0f..f0cfd7b457240 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1314,6 +1314,113 @@ static int wait_for_concurrent_writes(struct file *file)
return err;
}
+struct nfsd_write_dio
+{
+ loff_t middle_offset; /* Offset for start of DIO-aligned middle */
+ loff_t end_offset; /* Offset for start of misaligned last extent */
+ ssize_t start_len; /* Length for misaligned first extent */
+ ssize_t middle_len; /* Length for DIO-aligned middle extent */
+ ssize_t end_len; /* Length for misaligned last extent */
+};
+
+static void init_nfsd_write_dio(struct nfsd_write_dio *write_dio)
+{
+ memset(write_dio, 0, sizeof(*write_dio)); /* zero every offset and length */
+}
+
+static bool nfsd_analyze_write_dio(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct nfsd_file *nf, loff_t offset,
+ unsigned long len, struct nfsd_write_dio *write_dio)
+{
+ const u32 dio_blocksize = nf->nf_dio_offset_align;
+ loff_t orig_end, middle_end, start_end, start_offset = offset;
+ ssize_t start_len = len; /* traced as-is when len < dio_blocksize */
+ bool aligned = true;
+
+ if (WARN_ONCE(!nf->nf_dio_mem_align || !dio_blocksize,
+ "%s: underlying filesystem has not provided DIO alignment info\n",
+ __func__))
+ return false; /* @write_dio has not been written to */
+
+ if (WARN_ONCE(dio_blocksize > PAGE_SIZE,
+ "%s: underlying storage's dio_blocksize=%u > PAGE_SIZE=%lu\n",
+ __func__, dio_blocksize, PAGE_SIZE))
+ return false; /* @write_dio has not been written to */
+
+ if (unlikely(len < dio_blocksize)) { /* shorter than one DIO block */
+ aligned = false;
+ goto out;
+ }
+
+ if (((offset | len) & (dio_blocksize-1)) == 0) {
+ /* already DIO-aligned, no misaligned head or tail */
+ write_dio->middle_offset = offset;
+ write_dio->middle_len = len;
+ /* clear these for the benefit of trace_nfsd_analyze_write_dio */
+ start_offset = 0;
+ start_len = 0;
+ goto out;
+ }
+
+ start_end = round_up(offset, dio_blocksize); /* first block boundary at or after offset */
+ start_len = start_end - offset; /* 0 when offset is already on a boundary */
+ orig_end = offset + len;
+ middle_end = round_down(orig_end, dio_blocksize); /* last block boundary at or before orig_end */
+
+ write_dio->start_len = start_len;
+ write_dio->middle_offset = start_end;
+ write_dio->middle_len = middle_end - start_end; /* NOTE(review): 0 when no whole block fits, yet true is still returned */
+ write_dio->end_offset = middle_end;
+ write_dio->end_len = orig_end - middle_end;
+out:
+ trace_nfsd_analyze_write_dio(rqstp, fhp, offset, len, start_offset, start_len,
+ write_dio->middle_offset, write_dio->middle_len,
+ write_dio->end_offset, write_dio->end_len);
+ return aligned; /* false here only when len < dio_blocksize */
+}
+
+/*
+ * Setup as many as 3 iov_iter based on extents possibly described by @write_dio.
+ * @iterp: pointer to pointer to onstack array of 3 iov_iter structs from caller.
+ * @rq_bvec: backing bio_vec used to setup all 3 iov_iter permutations.
+ * @nvecs: number of segments in @rq_bvec
+ * @cnt: size of the request in bytes
+ * @write_dio: nfsd_write_dio struct that describes start, middle and end extents.
+ *
+ * Every iov_iter is initialised over the same @rq_bvec, @nvecs and @cnt and is
+ * then advanced past any preceding extents and/or has its count reduced so that
+ * it is left describing a single extent. The start and end iov_iter are only
+ * set up when their length in @write_dio is non-zero; the middle iov_iter is
+ * always set up, so with an all-zero @write_dio exactly one iov_iter spanning
+ * all @cnt bytes is produced.
+ *
+ * Returns the number of iov_iter that were setup (1 to 3).
+ */
+static int nfsd_setup_write_iters(struct iov_iter **iterp, struct bio_vec *rq_bvec,
+ unsigned int nvecs, unsigned long cnt,
+ struct nfsd_write_dio *write_dio)
+{
+ int n_iters = 0;
+ struct iov_iter *iters = *iterp;
+
+ /* Setup misaligned start? */
+ if (write_dio->start_len) {
+ iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt);
+ iters[n_iters].count = write_dio->start_len; /* limit to the start extent */
+ n_iters++;
+ }
+
+ /* Setup possibly DIO-aligned middle */
+ iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt);
+ if (write_dio->start_len)
+ iov_iter_advance(&iters[n_iters], write_dio->start_len);
+ iters[n_iters].count -= write_dio->end_len; /* leave out the end extent */
+ n_iters++;
+
+ /* Setup misaligned end? */
+ if (write_dio->end_len) {
+ iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt);
+ iov_iter_advance(&iters[n_iters],
+ write_dio->start_len + write_dio->middle_len);
+ n_iters++;
+ }
+
+ return n_iters;
+}
+
/**
* nfsd_vfs_write - write data to an already-open file
* @rqstp: RPC execution context
@@ -1348,9 +1455,11 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int pflags = current->flags;
bool restore_flags = false;
unsigned int nvecs;
- struct iov_iter iter_stack[1];
+ struct iov_iter iter_stack[3];
struct iov_iter *iter = iter_stack;
unsigned int n_iters = 0;
+ bool dio_aligned = false;
+ struct nfsd_write_dio write_dio;
trace_nfsd_write_opened(rqstp, fhp, offset, *cnt);
@@ -1379,18 +1488,12 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (stable && !fhp->fh_use_wgather)
kiocb.ki_flags |= IOCB_DSYNC;
- nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload);
- iov_iter_bvec(&iter[0], ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt);
- n_iters++;
-
+ init_nfsd_write_dio(&write_dio);
switch (nfsd_io_cache_write) {
case NFSD_IO_DIRECT:
- /* direct I/O must be aligned to device logical sector size */
- if (nf->nf_dio_mem_align && nf->nf_dio_offset_align &&
- (((offset | *cnt) & (nf->nf_dio_offset_align-1)) == 0) &&
- iov_iter_is_aligned(&iter[0], nf->nf_dio_mem_align - 1,
- nf->nf_dio_offset_align - 1))
- kiocb.ki_flags = IOCB_DIRECT;
+ if (nfsd_analyze_write_dio(rqstp, fhp, nf, offset,
+ *cnt, &write_dio))
+ dio_aligned = true;
break;
case NFSD_IO_DONTCACHE:
kiocb.ki_flags = IOCB_DONTCACHE;
@@ -1399,11 +1502,21 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
break;
}
+ nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload);
+ n_iters = nfsd_setup_write_iters(&iter, rqstp->rq_bvec, nvecs, *cnt, &write_dio);
+
since = READ_ONCE(file->f_wb_err);
if (verf)
nfsd_copy_write_verifier(verf, nn);
*cnt = 0;
for (int i = 0; i < n_iters; i++) {
+ if (dio_aligned) {
+ if (iov_iter_is_aligned(&iter[i], nf->nf_dio_mem_align - 1,
+ nf->nf_dio_offset_align - 1))
+ kiocb.ki_flags |= IOCB_DIRECT;
+ else
+ kiocb.ki_flags &= ~IOCB_DIRECT;
+ }
host_err = vfs_iocb_iter_write(file, &kiocb, &iter[i]);
if (host_err < 0) {
commit_reset_write_verifier(nn, rqstp, host_err);
--
2.44.0
next prev parent reply other threads:[~2025-07-31 23:06 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-07-31 23:06 [PATCH v3 0/4] NFSD DIRECT: add handling for misaligned WRITEs Mike Snitzer
2025-07-31 23:06 ` [PATCH v3 1/4] NFSD: refactor nfsd_read_vector_dio to EVENT_CLASS useful for READ and WRITE Mike Snitzer
2025-07-31 23:06 ` [PATCH v3 2/4] NFSD: prepare nfsd_vfs_write() to use O_DIRECT on misaligned WRITEs Mike Snitzer
2025-08-01 20:52 ` Chuck Lever
2025-08-01 22:29 ` Mike Snitzer
2025-07-31 23:06 ` Mike Snitzer [this message]
2025-08-05 14:55 ` [PATCH v3 3/4] NFSD: issue WRITEs using O_DIRECT even if IO is misaligned Chuck Lever
2025-08-05 19:02 ` Mike Snitzer
2025-07-31 23:06 ` [PATCH v3 4/4] NFSD: handle unaligned DIO for NFS reexport Mike Snitzer
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250731230633.89983-4-snitzer@kernel.org \
--to=snitzer@kernel.org \
--cc=chuck.lever@oracle.com \
--cc=jlayton@kernel.org \
--cc=linux-nfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).