From: Mike Snitzer <snitzer@kernel.org>
To: Chuck Lever <chuck.lever@oracle.com>, Jeff Layton <jlayton@kernel.org>
Cc: NeilBrown <neil@brown.name>,
Olga Kornievskaia <okorniev@redhat.com>,
Dai Ngo <dai.ngo@oracle.com>, Tom Talpey <tom@talpey.com>,
linux-nfs@vger.kernel.org
Subject: [PATCH 2/2] NFSD: Implement NFSD_IO_DIRECT for NFS WRITE
Date: Tue, 9 Sep 2025 19:33:15 -0400 [thread overview]
Message-ID: <20250909233315.80318-3-snitzer@kernel.org> (raw)
In-Reply-To: <20250909233315.80318-1-snitzer@kernel.org>
If NFSD_IO_DIRECT is used, split any misaligned WRITE into a start,
middle and end as needed. The large middle extent is DIO-aligned and
the start and/or end are misaligned. Buffered IO (with preference
towards using DONTCACHE) is used for the misaligned extents and
O_DIRECT is used for the middle DIO-aligned extent.
If vfs_iocb_iter_write() returns -ENOTBLK, due to its inability to
invalidate the page cache on behalf of the DIO WRITE, then
nfsd_issue_write_dio() will fall back to using buffered IO.
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
fs/nfsd/debugfs.c | 1 +
fs/nfsd/trace.h | 1 +
fs/nfsd/vfs.c | 215 ++++++++++++++++++++++++++++++++++++++++++++--
3 files changed, 212 insertions(+), 5 deletions(-)
diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
index 00eb1ecef6ac1..7f44689e0a53e 100644
--- a/fs/nfsd/debugfs.c
+++ b/fs/nfsd/debugfs.c
@@ -108,6 +108,7 @@ static int nfsd_io_cache_write_set(void *data, u64 val)
switch (val) {
case NFSD_IO_BUFFERED:
case NFSD_IO_DONTCACHE:
+ case NFSD_IO_DIRECT:
nfsd_io_cache_write = val;
break;
default:
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 88901df5fbb11..4e560427f6e1e 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -469,6 +469,7 @@ DEFINE_NFSD_IO_EVENT(read_io_done);
DEFINE_NFSD_IO_EVENT(read_done);
DEFINE_NFSD_IO_EVENT(write_start);
DEFINE_NFSD_IO_EVENT(write_opened);
+DEFINE_NFSD_IO_EVENT(write_direct);
DEFINE_NFSD_IO_EVENT(write_io_done);
DEFINE_NFSD_IO_EVENT(write_done);
DEFINE_NFSD_IO_EVENT(commit_start);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 239266c241d2c..e2039feaf95c8 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1246,6 +1246,208 @@ static int wait_for_concurrent_writes(struct file *file)
return err;
}
+struct nfsd_write_dio {
+ ssize_t start_len; /* Length for misaligned first extent */
+ ssize_t middle_len; /* Length for DIO-aligned middle extent */
+ ssize_t end_len; /* Length for misaligned last extent */
+};
+
+static bool
+nfsd_is_write_dio_possible(loff_t offset, unsigned long len,
+ struct nfsd_file *nf, struct nfsd_write_dio *write_dio)
+{
+ const u32 dio_blocksize = nf->nf_dio_offset_align;
+ loff_t start_end, orig_end, middle_end;
+
+ if (unlikely(!nf->nf_dio_mem_align || !dio_blocksize))
+ return false;
+ if (unlikely(dio_blocksize > PAGE_SIZE))
+ return false;
+ if (unlikely(len < dio_blocksize))
+ return false;
+
+ start_end = round_up(offset, dio_blocksize);
+ orig_end = offset + len;
+ middle_end = round_down(orig_end, dio_blocksize);
+
+ write_dio->start_len = start_end - offset;
+ write_dio->middle_len = middle_end - start_end;
+ write_dio->end_len = orig_end - middle_end;
+
+ return true;
+}
+
+static bool nfsd_iov_iter_aligned_bvec(const struct iov_iter *i,
+ unsigned int addr_mask, unsigned int len_mask)
+{
+ const struct bio_vec *bvec = i->bvec;
+ size_t skip = i->iov_offset;
+ size_t size = i->count;
+
+ if (size & len_mask)
+ return false;
+ do {
+ size_t len = bvec->bv_len;
+
+ if (len > size)
+ len = size;
+ if ((unsigned long)(bvec->bv_offset + skip) & addr_mask)
+ return false;
+ bvec++;
+ size -= len;
+ skip = 0;
+ } while (size);
+
+ return true;
+}
+
+/*
+ * Setup as many as 3 iov_iter based on extents described by @write_dio.
+ * @iterp: pointer to pointer to onstack array of 3 iov_iter structs from caller.
+ * @iter_is_dio_aligned: pointer to onstack array of 3 bools from caller.
+ * @rq_bvec: backing bio_vec used to setup all 3 iov_iter permutations.
+ * @nvecs: number of segments in @rq_bvec
+ * @cnt: size of the request in bytes
+ * @write_dio: nfsd_write_dio struct that describes start, middle and end extents.
+ *
+ * Returns the number of iov_iter that were setup.
+ */
+static int
+nfsd_setup_write_dio_iters(struct iov_iter **iterp, bool *iter_is_dio_aligned,
+ struct bio_vec *rq_bvec, unsigned int nvecs,
+ unsigned long cnt, struct nfsd_write_dio *write_dio,
+ struct nfsd_file *nf)
+{
+ int n_iters = 0;
+ struct iov_iter *iters = *iterp;
+
+ /* Setup misaligned start? */
+ if (write_dio->start_len) {
+ iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt);
+ iters[n_iters].count = write_dio->start_len;
+ iter_is_dio_aligned[n_iters] = false;
+ ++n_iters;
+ }
+
+ /* Setup DIO-aligned middle */
+ iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt);
+ if (write_dio->start_len)
+ iov_iter_advance(&iters[n_iters], write_dio->start_len);
+ iters[n_iters].count -= write_dio->end_len;
+ iter_is_dio_aligned[n_iters] =
+ nfsd_iov_iter_aligned_bvec(&iters[n_iters],
+ nf->nf_dio_mem_align-1, nf->nf_dio_offset_align-1);
+ if (unlikely(!iter_is_dio_aligned[n_iters]))
+ return 0; /* no DIO-aligned IO possible */
+ ++n_iters;
+
+ /* Setup misaligned end? */
+ if (write_dio->end_len) {
+ iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt);
+ iov_iter_advance(&iters[n_iters],
+ write_dio->start_len + write_dio->middle_len);
+ iter_is_dio_aligned[n_iters] = false;
+ ++n_iters;
+ }
+
+ return n_iters;
+}
+
+static int
+nfsd_buffered_write(struct svc_rqst *rqstp, struct file *file,
+ unsigned int nvecs, unsigned long *cnt,
+ struct kiocb *kiocb)
+{
+ struct iov_iter iter;
+ int host_err;
+
+ iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt);
+ host_err = vfs_iocb_iter_write(file, kiocb, &iter);
+ if (host_err < 0)
+ return host_err;
+ *cnt = host_err;
+
+ return 0;
+}
+
+static int
+nfsd_issue_write_dio(struct svc_rqst *rqstp, struct nfsd_file *nf,
+ loff_t offset, unsigned int nvecs,
+ unsigned long *cnt, struct kiocb *kiocb,
+ struct nfsd_write_dio *write_dio)
+{
+ struct file *file = nf->nf_file;
+ bool iter_is_dio_aligned[3];
+ struct iov_iter iter_stack[3];
+ struct iov_iter *iter = iter_stack;
+ unsigned int n_iters = 0;
+ unsigned long in_count = *cnt;
+ loff_t in_offset = kiocb->ki_pos;
+ ssize_t host_err;
+
+ n_iters = nfsd_setup_write_dio_iters(&iter, iter_is_dio_aligned,
+ rqstp->rq_bvec, nvecs, *cnt, write_dio, nf);
+ if (unlikely(!n_iters))
+ return nfsd_buffered_write(rqstp, file, nvecs, cnt, kiocb);
+
+ *cnt = 0;
+ for (int i = 0; i < n_iters; i++) {
+ if (iter_is_dio_aligned[i])
+ kiocb->ki_flags |= IOCB_DIRECT;
+ else
+ kiocb->ki_flags &= ~IOCB_DIRECT;
+
+ host_err = vfs_iocb_iter_write(file, kiocb, &iter[i]);
+ if (host_err < 0) {
+ /* VFS will return -ENOTBLK if DIO WRITE fails to
+ * invalidate the page cache. Retry using buffered IO.
+ */
+ if (unlikely(host_err == -ENOTBLK)) {
+ kiocb->ki_flags &= ~IOCB_DIRECT;
+ *cnt = in_count;
+ kiocb->ki_pos = in_offset;
+ return nfsd_buffered_write(rqstp, file,
+ nvecs, cnt, kiocb);
+ } else if (unlikely(host_err == -EINVAL)) {
+ pr_info_ratelimited("nfsd: Unexpected direct I/O alignment failure\n");
+ host_err = -ESERVERFAULT;
+ }
+ return host_err;
+ }
+ *cnt += host_err;
+ if (host_err < iter[i].count) /* partial write? */
+ break;
+ }
+
+ return 0;
+}
+
+static noinline_for_stack int
+nfsd_direct_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct nfsd_file *nf, loff_t offset, unsigned int nvecs,
+ unsigned long *cnt, struct kiocb *kiocb)
+{
+ struct nfsd_write_dio write_dio;
+
+ /* Any buffered IO issued here will be misaligned, use
+ * IOCB_SYNC to ensure it has completed before returning.
+ */
+ kiocb->ki_flags |= IOCB_SYNC;
+ /* Check if IOCB_DONTCACHE should be used when issuing buffered IO;
+ * if so, it will be ignored for any DIO issued here.
+ */
+ if (nf->nf_file->f_op->fop_flags & FOP_DONTCACHE)
+ kiocb->ki_flags |= IOCB_DONTCACHE;
+
+ if (nfsd_is_write_dio_possible(offset, *cnt, nf, &write_dio)) {
+ trace_nfsd_write_direct(rqstp, fhp, offset, *cnt);
+ return nfsd_issue_write_dio(rqstp, nf, offset, nvecs,
+ cnt, kiocb, &write_dio);
+ }
+
+ return nfsd_buffered_write(rqstp, nf->nf_file, nvecs, cnt, kiocb);
+}
+
/**
* nfsd_vfs_write - write data to an already-open file
* @rqstp: RPC execution context
@@ -1273,7 +1475,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct super_block *sb = file_inode(file)->i_sb;
struct kiocb kiocb;
struct svc_export *exp;
- struct iov_iter iter;
errseq_t since;
__be32 nfserr;
int host_err;
@@ -1310,25 +1511,29 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
kiocb.ki_flags |= IOCB_DSYNC;
nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload);
- iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt);
+
since = READ_ONCE(file->f_wb_err);
if (verf)
nfsd_copy_write_verifier(verf, nn);
switch (nfsd_io_cache_write) {
- case NFSD_IO_BUFFERED:
+ case NFSD_IO_DIRECT:
+ host_err = nfsd_direct_write(rqstp, fhp, nf, offset,
+ nvecs, cnt, &kiocb);
break;
case NFSD_IO_DONTCACHE:
if (file->f_op->fop_flags & FOP_DONTCACHE)
kiocb.ki_flags |= IOCB_DONTCACHE;
+ fallthrough; /* must call nfsd_buffered_write */
+ case NFSD_IO_BUFFERED:
+ host_err = nfsd_buffered_write(rqstp, file,
+ nvecs, cnt, &kiocb);
break;
}
- host_err = vfs_iocb_iter_write(file, &kiocb, &iter);
if (host_err < 0) {
commit_reset_write_verifier(nn, rqstp, host_err);
goto out_nfserr;
}
- *cnt = host_err;
nfsd_stats_io_write_add(nn, exp, *cnt);
fsnotify_modify(file);
host_err = filemap_check_wb_err(file->f_mapping, since);
--
2.44.0
next prev parent reply other threads:[~2025-09-09 23:33 UTC|newest]
Thread overview: 23+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-09-09 19:05 [PATCH v1 0/3] NFSD direct I/O read Chuck Lever
2025-09-09 19:05 ` [PATCH v1 1/3] NFSD: filecache: add STATX_DIOALIGN and STATX_DIO_READ_ALIGN support Chuck Lever
2025-09-09 23:07 ` NeilBrown
2025-09-09 19:05 ` [PATCH v1 2/3] NFSD: pass nfsd_file to nfsd_iter_read() Chuck Lever
2025-09-09 23:20 ` NeilBrown
2025-09-09 19:05 ` [PATCH v1 3/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ Chuck Lever
2025-09-09 23:16 ` Mike Snitzer
2025-09-09 23:37 ` NeilBrown
2025-09-09 23:39 ` Chuck Lever
2025-09-09 23:48 ` Chuck Lever
2025-09-10 1:54 ` NeilBrown
2025-09-10 1:52 ` NeilBrown
2025-09-10 14:23 ` Chuck Lever
2025-09-09 23:56 ` Mike Snitzer
2025-09-10 11:37 ` Jeff Layton
2025-09-09 23:33 ` [PATCH 0/2] NFSD: continuation of NFSD DIRECT Mike Snitzer
2025-09-09 23:33 ` [PATCH 1/2] sunrpc: add an extra reserve page to svc_serv_maxpages() Mike Snitzer
2025-09-10 14:29 ` Chuck Lever
2025-09-09 23:33 ` Mike Snitzer [this message]
2025-10-08 18:59 ` [PATCH v2] NFSD: Implement NFSD_IO_DIRECT for NFS WRITE Mike Snitzer
2025-10-09 15:04 ` Jeff Layton
2025-10-09 17:46 ` Chuck Lever
2025-10-13 15:41 ` Mike Snitzer
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250909233315.80318-3-snitzer@kernel.org \
--to=snitzer@kernel.org \
--cc=chuck.lever@oracle.com \
--cc=dai.ngo@oracle.com \
--cc=jlayton@kernel.org \
--cc=linux-nfs@vger.kernel.org \
--cc=neil@brown.name \
--cc=okorniev@redhat.com \
--cc=tom@talpey.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox