Linux NFS development
 help / color / mirror / Atom feed
From: Chuck Lever <cel@kernel.org>
To: NeilBrown <neil@brown.name>, Jeff Layton <jlayton@kernel.org>,
	Olga Kornievskaia <okorniev@redhat.com>,
	Dai Ngo <dai.ngo@oracle.com>, Tom Talpey <tom@talpey.com>
Cc: <linux-nfs@vger.kernel.org>, Mike Snitzer <snitzer@kernel.org>
Subject: [PATCH v7 04/14] NFSD: Implement NFSD_IO_DIRECT for NFS WRITE
Date: Fri, 24 Oct 2025 10:42:56 -0400	[thread overview]
Message-ID: <20251024144306.35652-5-cel@kernel.org> (raw)
In-Reply-To: <20251024144306.35652-1-cel@kernel.org>

From: Mike Snitzer <snitzer@kernel.org>

If NFSD_IO_DIRECT is used, split any misaligned WRITE into a start,
middle and end as needed. The large middle extent is DIO-aligned and
the start and/or end are misaligned. Synchronous buffered IO (with
preference towards using DONTCACHE) is used for the misaligned extents
and O_DIRECT is used for the middle DIO-aligned extent.

nfsd_issue_write_dio() promotes @stable_how to NFS_FILE_SYNC, which
allows the client to drop its dirty data and avoid needing an extra
COMMIT operation.

If vfs_iocb_iter_write() returns -ENOTBLK, due to its inability to
invalidate the page cache on behalf of the DIO WRITE, then
nfsd_issue_write_dio() will fall back to using buffered IO.

These changes served as the original starting point for the NFS
client's misaligned O_DIRECT support that landed with
commit c817248fc831 ("nfs/localio: add proper O_DIRECT support for
READ and WRITE"). But NFSD's support is simpler because it currently
doesn't use AIO completion.

Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/debugfs.c |   1 +
 fs/nfsd/trace.h   |   1 +
 fs/nfsd/vfs.c     | 197 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 199 insertions(+)

diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
index 00eb1ecef6ac..7f44689e0a53 100644
--- a/fs/nfsd/debugfs.c
+++ b/fs/nfsd/debugfs.c
@@ -108,6 +108,7 @@ static int nfsd_io_cache_write_set(void *data, u64 val)
 	switch (val) {
 	case NFSD_IO_BUFFERED:
 	case NFSD_IO_DONTCACHE:
+	case NFSD_IO_DIRECT:
 		nfsd_io_cache_write = val;
 		break;
 	default:
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index bfd41236aff2..ad74439d0105 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -469,6 +469,7 @@ DEFINE_NFSD_IO_EVENT(read_io_done);
 DEFINE_NFSD_IO_EVENT(read_done);
 DEFINE_NFSD_IO_EVENT(write_start);
 DEFINE_NFSD_IO_EVENT(write_opened);
+DEFINE_NFSD_IO_EVENT(write_direct);
 DEFINE_NFSD_IO_EVENT(write_io_done);
 DEFINE_NFSD_IO_EVENT(write_done);
 DEFINE_NFSD_IO_EVENT(commit_start);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 6076821bb541..2832a66cda5b 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1254,6 +1254,109 @@ static int wait_for_concurrent_writes(struct file *file)
 	return err;
 }
 
+struct nfsd_write_dio {
+	ssize_t	start_len;	/* Length for misaligned first extent */
+	ssize_t	middle_len;	/* Length for DIO-aligned middle extent */
+	ssize_t	end_len;	/* Length for misaligned last extent */
+};
+
+static bool
+nfsd_is_write_dio_possible(loff_t offset, unsigned long len,
+			   struct nfsd_file *nf,
+			   struct nfsd_write_dio *write_dio)
+{
+	const u32 dio_blocksize = nf->nf_dio_offset_align;
+	loff_t start_end, orig_end, middle_end;
+
+	if (unlikely(!nf->nf_dio_mem_align || !dio_blocksize))
+		return false;
+	if (unlikely(dio_blocksize > PAGE_SIZE))
+		return false;
+	if (unlikely(len < dio_blocksize))
+		return false;
+
+	start_end = round_up(offset, dio_blocksize);
+	orig_end = offset + len;
+	middle_end = round_down(orig_end, dio_blocksize);
+
+	write_dio->start_len = start_end - offset;
+	write_dio->middle_len = middle_end - start_end;
+	write_dio->end_len = orig_end - middle_end;
+
+	return true;
+}
+
+static bool
+nfsd_iov_iter_aligned_bvec(const struct iov_iter *i, unsigned int addr_mask,
+			   unsigned int len_mask)
+{
+	const struct bio_vec *bvec = i->bvec;
+	size_t skip = i->iov_offset;
+	size_t size = i->count;
+
+	if (size & len_mask)
+		return false;
+	do {
+		size_t len = bvec->bv_len;
+
+		if (len > size)
+			len = size;
+		if ((unsigned long)(bvec->bv_offset + skip) & addr_mask)
+			return false;
+		bvec++;
+		size -= len;
+		skip = 0;
+	} while (size);
+
+	return true;
+}
+
+/*
+ * Set up as many as 3 iov_iter based on extents described by @write_dio.
+ * Returns the number of iov_iter that were set up.
+ */
+static int
+nfsd_setup_write_dio_iters(struct iov_iter **iterp, bool *iter_is_dio_aligned,
+			   struct bio_vec *rq_bvec, unsigned int nvecs,
+			   unsigned long cnt, struct nfsd_write_dio *write_dio,
+			   struct nfsd_file *nf)
+{
+	int n_iters = 0;
+	struct iov_iter *iters = *iterp;
+
+	/* Set up misaligned start? */
+	if (write_dio->start_len) {
+		iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt);
+		iters[n_iters].count = write_dio->start_len;
+		iter_is_dio_aligned[n_iters] = false;
+		++n_iters;
+	}
+
+	/* Set up DIO-aligned middle */
+	iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt);
+	if (write_dio->start_len)
+		iov_iter_advance(&iters[n_iters], write_dio->start_len);
+	iters[n_iters].count -= write_dio->end_len;
+	iter_is_dio_aligned[n_iters] =
+		nfsd_iov_iter_aligned_bvec(&iters[n_iters],
+					   nf->nf_dio_mem_align - 1,
+					   nf->nf_dio_offset_align - 1);
+	if (unlikely(!iter_is_dio_aligned[n_iters]))
+		return 0; /* no DIO-aligned IO possible */
+	++n_iters;
+
+	/* Set up misaligned end? */
+	if (write_dio->end_len) {
+		iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt);
+		iov_iter_advance(&iters[n_iters],
+				 write_dio->start_len + write_dio->middle_len);
+		iter_is_dio_aligned[n_iters] = false;
+		++n_iters;
+	}
+
+	return n_iters;
+}
+
 static int
 nfsd_iocb_write(struct file *file, struct bio_vec *bvec, unsigned int nvecs,
 		unsigned long *cnt, struct kiocb *kiocb)
@@ -1270,6 +1373,95 @@ nfsd_iocb_write(struct file *file, struct bio_vec *bvec, unsigned int nvecs,
 	return 0;
 }
 
+static int
+nfsd_issue_write_dio(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
+		     u32 *stable_how, unsigned int nvecs, unsigned long *cnt,
+		     struct kiocb *kiocb, struct nfsd_write_dio *write_dio)
+{
+	struct file *file = nf->nf_file;
+	bool iter_is_dio_aligned[3];
+	struct iov_iter iter_stack[3];
+	struct iov_iter *iter = iter_stack;
+	unsigned int n_iters = 0;
+	unsigned long in_count = *cnt;
+	loff_t in_offset = kiocb->ki_pos;
+	ssize_t host_err;
+
+	n_iters = nfsd_setup_write_dio_iters(&iter, iter_is_dio_aligned,
+					     rqstp->rq_bvec, nvecs, *cnt,
+					     write_dio, nf);
+	if (unlikely(!n_iters))
+		return nfsd_iocb_write(file, rqstp->rq_bvec, nvecs,
+				       cnt, kiocb);
+
+	trace_nfsd_write_direct(rqstp, fhp, in_offset, in_count);
+
+	/*
+	 * Any buffered IO issued here will be misaligned, so use
+	 * sync IO to ensure it has completed before returning.
+	 * Also update @stable_how to avoid the need for a COMMIT.
+	 */
+	kiocb->ki_flags |= IOCB_DSYNC;
+	*stable_how = NFS_FILE_SYNC;
+
+	*cnt = 0;
+	for (int i = 0; i < n_iters; i++) {
+		if (iter_is_dio_aligned[i])
+			kiocb->ki_flags |= IOCB_DIRECT;
+		else
+			kiocb->ki_flags &= ~IOCB_DIRECT;
+
+		host_err = vfs_iocb_iter_write(file, kiocb, &iter[i]);
+		if (host_err < 0) {
+			/*
+			 * VFS will return -ENOTBLK if DIO WRITE fails to
+			 * invalidate the page cache. Retry using buffered IO.
+			 */
+			if (unlikely(host_err == -ENOTBLK)) {
+				kiocb->ki_flags &= ~IOCB_DIRECT;
+				*cnt = in_count;
+				kiocb->ki_pos = in_offset;
+				return nfsd_iocb_write(file, rqstp->rq_bvec,
+						       nvecs, cnt, kiocb);
+			} else if (unlikely(host_err == -EINVAL)) {
+				struct inode *inode = d_inode(fhp->fh_dentry);
+
+				pr_info_ratelimited("nfsd: Direct I/O alignment failure on %s/%ld\n",
+						    inode->i_sb->s_id, inode->i_ino);
+				host_err = -ESERVERFAULT;
+			}
+			return host_err;
+		}
+		*cnt += host_err;
+		if (host_err < iter[i].count) /* partial write? */
+			break;
+	}
+
+	return 0;
+}
+
+static noinline_for_stack int
+nfsd_direct_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		  struct nfsd_file *nf, u32 *stable_how, unsigned int nvecs,
+		  unsigned long *cnt, struct kiocb *kiocb)
+{
+	struct nfsd_write_dio write_dio;
+
+	/*
+	 * Check if IOCB_DONTCACHE can be used when issuing buffered IO;
+	 * if so, set it to preserve intent of NFSD_IO_DIRECT (it will
+	 * be ignored for any DIO issued here).
+	 */
+	if (nf->nf_file->f_op->fop_flags & FOP_DONTCACHE)
+		kiocb->ki_flags |= IOCB_DONTCACHE;
+
+	if (nfsd_is_write_dio_possible(kiocb->ki_pos, *cnt, nf, &write_dio))
+		return nfsd_issue_write_dio(rqstp, fhp, nf, stable_how, nvecs,
+					    cnt, kiocb, &write_dio);
+
+	return nfsd_iocb_write(nf->nf_file, rqstp->rq_bvec, nvecs, cnt, kiocb);
+}
+
 /**
  * nfsd_vfs_write - write data to an already-open file
  * @rqstp: RPC execution context
@@ -1346,6 +1538,11 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		nfsd_copy_write_verifier(verf, nn);
 
 	switch (nfsd_io_cache_write) {
+	case NFSD_IO_DIRECT:
+		host_err = nfsd_direct_write(rqstp, fhp, nf, stable_how,
+					     nvecs, cnt, &kiocb);
+		stable = *stable_how;
+		break;
 	case NFSD_IO_DONTCACHE:
 		if (file->f_op->fop_flags & FOP_DONTCACHE)
 			kiocb.ki_flags |= IOCB_DONTCACHE;
-- 
2.51.0


  parent reply	other threads:[~2025-10-24 14:43 UTC|newest]

Thread overview: 85+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-10-24 14:42 [PATCH v7 00/14] NFSD: Implement NFSD_IO_DIRECT for NFS WRITE Chuck Lever
2025-10-24 14:42 ` [PATCH v7 01/14] NFSD: Make FILE_SYNC WRITEs comply with spec Chuck Lever
2025-10-24 15:21   ` Jeff Layton
2025-10-27  8:02   ` Christoph Hellwig
2025-10-24 14:42 ` [PATCH v7 02/14] NFSD: Enable return of an updated stable_how to NFS clients Chuck Lever
2025-10-27  8:03   ` Christoph Hellwig
2025-10-24 14:42 ` [PATCH v7 03/14] NFSD: Refactor nfsd_vfs_write() Chuck Lever
2025-10-27  8:04   ` Christoph Hellwig
2025-10-24 14:42 ` Chuck Lever [this message]
2025-10-24 17:12   ` [PATCH v7 04/14] NFSD: Implement NFSD_IO_DIRECT for NFS WRITE Mike Snitzer
2025-10-24 17:24     ` Chuck Lever
2025-10-24 14:42 ` [PATCH v7 05/14] NFSD: @stable for direct writes is always NFS_FILE_SYNC Chuck Lever
2025-10-24 15:22   ` Jeff Layton
2025-10-24 15:23     ` Chuck Lever
2025-10-27  8:05   ` Christoph Hellwig
2025-10-27 13:23     ` Chuck Lever
2025-10-27 13:27       ` Christoph Hellwig
2025-10-27 14:31         ` Mike Snitzer
2025-10-27 14:36           ` Christoph Hellwig
2025-10-27 14:58             ` Mike Snitzer
2025-10-27 15:04               ` Chuck Lever
2025-10-27 15:19                 ` Mike Snitzer
2025-10-27 15:05               ` Christoph Hellwig
2025-10-24 14:42 ` [PATCH v7 06/14] NFSD: Always set IOCB_SYNC in direct write path Chuck Lever
2025-10-24 15:22   ` Jeff Layton
2025-10-27  8:08   ` Christoph Hellwig
2025-10-27 10:38     ` Jeff Layton
2025-10-27 10:40       ` Christoph Hellwig
2025-10-24 14:42 ` [PATCH v7 07/14] NFSD: Remove specific error handling Chuck Lever
2025-10-24 15:22   ` Jeff Layton
2025-10-24 14:43 ` [PATCH v7 08/14] NFSD: Remove alignment size checking Chuck Lever
2025-10-24 15:22   ` Jeff Layton
2025-10-27  8:09   ` Christoph Hellwig
2025-10-27 13:25     ` Chuck Lever
2025-10-27 13:30       ` Christoph Hellwig
2025-10-24 14:43 ` [PATCH v7 09/14] NFSD: Remove the len_mask check Chuck Lever
2025-10-24 15:23   ` Jeff Layton
2025-10-24 17:16   ` Mike Snitzer
2025-10-24 17:22     ` Chuck Lever
2025-10-24 14:43 ` [PATCH v7 10/14] NFSD: Clean up synopsis of nfsd_iov_iter_aligned_bvec() Chuck Lever
2025-10-24 15:24   ` Jeff Layton
2025-10-24 14:43 ` [PATCH v7 11/14] NFSD: Clean up struct nfsd_write_dio Chuck Lever
2025-10-24 15:26   ` Jeff Layton
2025-10-24 17:20   ` Mike Snitzer
2025-10-24 14:43 ` [PATCH v7 12/14] NFSD: Introduce struct nfsd_write_dio_seg Chuck Lever
2025-10-24 15:30   ` Jeff Layton
2025-10-24 15:37     ` Chuck Lever
2025-10-24 17:57   ` Mike Snitzer
2025-10-24 14:43 ` [PATCH v7 13/14] NFSD: Clean up direct write fall back error flow Chuck Lever
2025-10-24 15:32   ` Jeff Layton
2025-10-24 18:01   ` Mike Snitzer
2025-10-24 14:43 ` [PATCH v7 14/14] NFSD: Initialize separate ki_flags Chuck Lever
2025-10-24 15:34   ` Jeff Layton
2025-10-24 18:13   ` Mike Snitzer
2025-10-24 19:34     ` Chuck Lever
2025-10-24 20:37       ` Mike Snitzer
2025-10-24 21:16         ` Chuck Lever
2025-10-24 23:56           ` Mike Snitzer
2025-10-27  8:15             ` Christoph Hellwig
2025-10-27 10:50               ` Jeff Layton
2025-10-27 10:55                 ` Christoph Hellwig
2025-10-27 13:48                 ` Chuck Lever
2025-10-27 13:49                   ` Christoph Hellwig
2025-10-27 16:18                   ` Mike Snitzer
2025-10-27 16:59                     ` Mike Snitzer
2025-10-29  7:20                     ` Christoph Hellwig
2025-10-27 16:05                 ` Mike Snitzer
2025-10-27 17:57                   ` Chuck Lever
2025-10-28  3:26                     ` Mike Snitzer
2025-10-28 15:37                       ` Chuck Lever
2025-10-28 16:04                         ` Mike Snitzer
2025-10-28 18:48                           ` Chuck Lever
2025-10-28 23:56                             ` Mike Snitzer
2025-10-29 15:22                               ` Chuck Lever
2025-10-29 16:54                                 ` Mike Snitzer
2025-10-29  7:37                         ` Christoph Hellwig
2025-10-29  7:32                       ` Christoph Hellwig
2025-10-29  7:25                     ` Christoph Hellwig
2025-10-27  8:14         ` Christoph Hellwig
2025-10-27  8:12       ` Christoph Hellwig
2025-10-27 13:27         ` Chuck Lever
2025-10-27 13:30           ` Chuck Lever
2025-10-27 13:31             ` Christoph Hellwig
2025-10-27 14:11         ` Chuck Lever
2025-10-27 14:45           ` Christoph Hellwig

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20251024144306.35652-5-cel@kernel.org \
    --to=cel@kernel.org \
    --cc=dai.ngo@oracle.com \
    --cc=jlayton@kernel.org \
    --cc=linux-nfs@vger.kernel.org \
    --cc=neil@brown.name \
    --cc=okorniev@redhat.com \
    --cc=snitzer@kernel.org \
    --cc=tom@talpey.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox