From: Mike Snitzer <snitzer@kernel.org>
To: Chuck Lever <chuck.lever@oracle.com>, Jeff Layton <jlayton@kernel.org>
Cc: linux-nfs@vger.kernel.org
Subject: [PATCH v3 3/4] NFSD: issue WRITEs using O_DIRECT even if IO is misaligned
Date: Thu, 31 Jul 2025 19:06:32 -0400 [thread overview]
Message-ID: <20250731230633.89983-4-snitzer@kernel.org> (raw)
In-Reply-To: <20250731230633.89983-1-snitzer@kernel.org>
If NFSD_IO_DIRECT is used, split any misaligned WRITE into a start,
middle and end as needed. The large middle extent is DIO-aligned and
the start and/or end are misaligned. Buffered IO is used for the
misaligned extents and O_DIRECT is used for the middle DIO-aligned
extent.
The nfsd_analyze_write_dio trace event shows how NFSD splits a given
misaligned WRITE into a mix of misaligned extent(s) and a DIO-aligned
extent.
This combination of trace events is useful:
echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_write_opened/enable
echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_analyze_write_dio/enable
echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_write_io_done/enable
echo 1 > /sys/kernel/tracing/events/xfs/xfs_file_direct_write/enable
For example, running this dd command:
dd if=/dev/zero of=/mnt/share1/test bs=47008 count=2 oflag=direct
Results in:
nfsd-55714 [043] ..... 79976.260851: nfsd_write_opened: xid=0x966c5d2d fh_hash=0x4d34e6c1 offset=0 len=47008
nfsd-55714 [043] ..... 79976.260852: nfsd_analyze_write_dio: xid=0x966c5d2d fh_hash=0x4d34e6c1 offset=0 len=47008 start=0+0 middle=0+45056 end=45056+1952
nfsd-55714 [043] ..... 79976.260857: xfs_file_direct_write: dev 259:12 ino 0x3e00008f disize 0x0 pos 0x0 bytecount 0xb000
nfsd-55714 [043] ..... 79976.260965: nfsd_write_io_done: xid=0x966c5d2d fh_hash=0x4d34e6c1 offset=0 len=47008
nfsd-55714 [043] ..... 79976.307762: nfsd_write_opened: xid=0x67e5ce6f fh_hash=0x4d34e6c1 offset=47008 len=47008
nfsd-55714 [043] ..... 79976.307762: nfsd_analyze_write_dio: xid=0x67e5ce6f fh_hash=0x4d34e6c1 offset=47008 len=47008 start=47008+2144 middle=49152+40960 end=90112+3904
nfsd-55714 [043] ..... 79976.307797: xfs_file_direct_write: dev 259:12 ino 0x3e00008f disize 0xc000 pos 0xc000 bytecount 0xa000
nfsd-55714 [043] ..... 79976.307866: nfsd_write_io_done: xid=0x67e5ce6f fh_hash=0x4d34e6c1 offset=47008 len=47008
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
fs/nfsd/vfs.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 124 insertions(+), 11 deletions(-)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index edac73349da0f..f0cfd7b457240 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1314,6 +1314,113 @@ static int wait_for_concurrent_writes(struct file *file)
return err;
}
+struct nfsd_write_dio /* How one WRITE is split into start, middle and end extents */
+{
+ loff_t middle_offset; /* Offset at which the DIO-aligned middle extent starts */
+ loff_t end_offset; /* Offset at which the misaligned last extent starts */
+ ssize_t start_len; /* Length of misaligned first extent (0 if there is none) */
+ ssize_t middle_len; /* Length of DIO-aligned middle extent */
+ ssize_t end_len; /* Length of misaligned last extent (0 if there is none) */
+};
+
+static void init_nfsd_write_dio(struct nfsd_write_dio *write_dio)
+{
+ memset(write_dio, 0, sizeof(*write_dio)); /* all-zero: no start/end extent, so nfsd_setup_write_iters() builds one iov_iter for the whole request */
+}
+
+static bool nfsd_analyze_write_dio(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct nfsd_file *nf, loff_t offset,
+ unsigned long len, struct nfsd_write_dio *write_dio)
+{ /* Work out how the WRITE of @len bytes at @offset splits into start/middle/end and record it in @write_dio; returns true if the request is fully aligned or was split, false otherwise */
+ const u32 dio_blocksize = nf->nf_dio_offset_align; /* granularity used for the offset/length split below */
+ loff_t orig_end, middle_end, start_end, start_offset = offset;
+ ssize_t start_len = len; /* with start_offset above: the trace reports the whole request as "start" on the len < dio_blocksize bail-out */
+ bool aligned = true;
+
+ if (WARN_ONCE(!nf->nf_dio_mem_align || !dio_blocksize,
+ "%s: underlying filesystem has not provided DIO alignment info\n",
+ __func__))
+ return false; /* no alignment info: @write_dio is left untouched and nothing is traced */
+
+ if (WARN_ONCE(dio_blocksize > PAGE_SIZE,
+ "%s: underlying storage's dio_blocksize=%u > PAGE_SIZE=%lu\n",
+ __func__, dio_blocksize, PAGE_SIZE))
+ return false; /* block size above PAGE_SIZE is rejected: @write_dio is left untouched and nothing is traced */
+
+ if (unlikely(len < dio_blocksize)) { /* request cannot contain even one whole block */
+ aligned = false;
+ goto out; /* @write_dio keeps whatever the caller initialised it to */
+ }
+
+ if (((offset | len) & (dio_blocksize-1)) == 0) { /* mask test: assumes dio_blocksize is a power of two -- TODO confirm */
+ /* already DIO-aligned, no misaligned head or tail */
+ write_dio->middle_offset = offset;
+ write_dio->middle_len = len;
+ /* clear these for the benefit of trace_nfsd_analyze_write_dio */
+ start_offset = 0;
+ start_len = 0;
+ goto out;
+ }
+
+ start_end = round_up(offset, dio_blocksize); /* first block boundary at or after @offset */
+ start_len = start_end - offset; /* bytes before that boundary; 0 if @offset is already aligned */
+ orig_end = offset + len; /* offset just past the last byte of the request */
+ middle_end = round_down(orig_end, dio_blocksize); /* last block boundary at or before the end of the request */
+
+ write_dio->start_len = start_len;
+ write_dio->middle_offset = start_end;
+ write_dio->middle_len = middle_end - start_end; /* can be 0, e.g. offset=100 len=600 blocksize=512; true is still returned */
+ write_dio->end_offset = middle_end;
+ write_dio->end_len = orig_end - middle_end; /* 0 if the request ends on a block boundary */
+out:
+ trace_nfsd_analyze_write_dio(rqstp, fhp, offset, len, start_offset, start_len,
+ write_dio->middle_offset, write_dio->middle_len,
+ write_dio->end_offset, write_dio->end_len);
+ return aligned; /* false only on the len < dio_blocksize path (the WARN paths return earlier) */
+}
+
+/*
+ * Set up as many as 3 iov_iter based on the extents described by @write_dio.
+ * @iterp: pointer to pointer to onstack array of 3 iov_iter structs from caller.
+ *         Only dereferenced to reach the array; the caller's pointer is not changed.
+ * @rq_bvec: backing bio_vec array; every iov_iter set up here is built over it.
+ * @nvecs: number of segments in @rq_bvec
+ * @cnt: size of the request in bytes
+ * @write_dio: nfsd_write_dio struct that describes start, middle and end extents.
+ *
+ * The iov_iter are stored in consecutive array slots in the order start,
+ * middle, end. The start and end iov_iter are only set up when their length
+ * in @write_dio is non-zero. The middle iov_iter is always set up, so when
+ * @write_dio is all zeroes it is the only one and spans all @cnt bytes.
+ *
+ * Returns the number of iov_iter that were set up (between 1 and 3).
+ */
+static int nfsd_setup_write_iters(struct iov_iter **iterp, struct bio_vec *rq_bvec,
+ unsigned int nvecs, unsigned long cnt,
+ struct nfsd_write_dio *write_dio)
+{
+ int n_iters = 0;
+ struct iov_iter *iters = *iterp;
+
+ /* Setup misaligned start? */
+ if (write_dio->start_len) {
+ iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt);
+ iters[n_iters].count = write_dio->start_len; /* truncate to just the leading start_len bytes */
+ n_iters++;
+ }
+
+ /* Setup possibly DIO-aligned middle */
+ iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt);
+ if (write_dio->start_len)
+ iov_iter_advance(&iters[n_iters], write_dio->start_len); /* skip the bytes covered by the start iov_iter */
+ iters[n_iters].count -= write_dio->end_len; /* stop short of the misaligned tail */
+ n_iters++;
+
+ /* Setup misaligned end? */
+ if (write_dio->end_len) {
+ iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt);
+ iov_iter_advance(&iters[n_iters],
+ write_dio->start_len + write_dio->middle_len); /* skip start and middle; what remains is the tail */
+ n_iters++;
+ }
+
+ return n_iters;
+}
+
/**
* nfsd_vfs_write - write data to an already-open file
* @rqstp: RPC execution context
@@ -1348,9 +1455,11 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int pflags = current->flags;
bool restore_flags = false;
unsigned int nvecs;
- struct iov_iter iter_stack[1];
+ struct iov_iter iter_stack[3];
struct iov_iter *iter = iter_stack;
unsigned int n_iters = 0;
+ bool dio_aligned = false;
+ struct nfsd_write_dio write_dio;
trace_nfsd_write_opened(rqstp, fhp, offset, *cnt);
@@ -1379,18 +1488,12 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (stable && !fhp->fh_use_wgather)
kiocb.ki_flags |= IOCB_DSYNC;
- nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload);
- iov_iter_bvec(&iter[0], ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt);
- n_iters++;
-
+ init_nfsd_write_dio(&write_dio);
switch (nfsd_io_cache_write) {
case NFSD_IO_DIRECT:
- /* direct I/O must be aligned to device logical sector size */
- if (nf->nf_dio_mem_align && nf->nf_dio_offset_align &&
- (((offset | *cnt) & (nf->nf_dio_offset_align-1)) == 0) &&
- iov_iter_is_aligned(&iter[0], nf->nf_dio_mem_align - 1,
- nf->nf_dio_offset_align - 1))
- kiocb.ki_flags = IOCB_DIRECT;
+ if (nfsd_analyze_write_dio(rqstp, fhp, nf, offset,
+ *cnt, &write_dio))
+ dio_aligned = true;
break;
case NFSD_IO_DONTCACHE:
kiocb.ki_flags = IOCB_DONTCACHE;
@@ -1399,11 +1502,21 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
break;
}
+ nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload);
+ n_iters = nfsd_setup_write_iters(&iter, rqstp->rq_bvec, nvecs, *cnt, &write_dio);
+
since = READ_ONCE(file->f_wb_err);
if (verf)
nfsd_copy_write_verifier(verf, nn);
*cnt = 0;
for (int i = 0; i < n_iters; i++) {
+ if (dio_aligned) {
+ if (iov_iter_is_aligned(&iter[i], nf->nf_dio_mem_align - 1,
+ nf->nf_dio_offset_align - 1))
+ kiocb.ki_flags |= IOCB_DIRECT;
+ else
+ kiocb.ki_flags &= ~IOCB_DIRECT;
+ }
host_err = vfs_iocb_iter_write(file, &kiocb, &iter[i]);
if (host_err < 0) {
commit_reset_write_verifier(nn, rqstp, host_err);
--
2.44.0
next prev parent reply other threads:[~2025-07-31 23:06 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-07-31 23:06 [PATCH v3 0/4] NFSD DIRECT: add handling for misaligned WRITEs Mike Snitzer
2025-07-31 23:06 ` [PATCH v3 1/4] NFSD: refactor nfsd_read_vector_dio to EVENT_CLASS useful for READ and WRITE Mike Snitzer
2025-07-31 23:06 ` [PATCH v3 2/4] NFSD: prepare nfsd_vfs_write() to use O_DIRECT on misaligned WRITEs Mike Snitzer
2025-08-01 20:52 ` Chuck Lever
2025-08-01 22:29 ` Mike Snitzer
2025-07-31 23:06 ` Mike Snitzer [this message]
2025-08-05 14:55 ` [PATCH v3 3/4] NFSD: issue WRITEs using O_DIRECT even if IO is misaligned Chuck Lever
2025-08-05 19:02 ` Mike Snitzer
2025-07-31 23:06 ` [PATCH v3 4/4] NFSD: handle unaligned DIO for NFS reexport Mike Snitzer
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250731230633.89983-4-snitzer@kernel.org \
--to=snitzer@kernel.org \
--cc=chuck.lever@oracle.com \
--cc=jlayton@kernel.org \
--cc=linux-nfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.