From: Mike Snitzer <snitzer@kernel.org>
To: Chuck Lever <chuck.lever@oracle.com>, Jeff Layton <jlayton@kernel.org>
Cc: linux-nfs@vger.kernel.org
Subject: [PATCH v9 5/9] NFSD: issue READs using O_DIRECT even if IO is misaligned
Date: Wed, 3 Sep 2025 16:51:17 -0400 [thread overview]
Message-ID: <20250903205121.41380-6-snitzer@kernel.org> (raw)
In-Reply-To: <20250903205121.41380-1-snitzer@kernel.org>
If NFSD_IO_DIRECT is used, expand any misaligned READ to the next
DIO-aligned block (on either end of the READ). The expanded READ is
verified to have proper offset/len (logical_block_size) and
dma_alignment checking.
Any misaligned READ that is less than 32K won't be expanded to be
DIO-aligned (this heuristic just avoids excess work, like allocating
start_extra_page, for smaller IO that can generally already perform
well using buffered IO).
Suggested-by: Jeff Layton <jlayton@kernel.org>
Suggested-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
fs/nfsd/vfs.c | 184 +++++++++++++++++++++++++++++++++++--
include/linux/sunrpc/svc.h | 5 +-
2 files changed, 178 insertions(+), 11 deletions(-)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 358d10a0665f6..96ae86419dc80 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -19,6 +19,7 @@
#include <linux/splice.h>
#include <linux/falloc.h>
#include <linux/fcntl.h>
+#include <linux/math.h>
#include <linux/namei.h>
#include <linux/delay.h>
#include <linux/fsnotify.h>
@@ -1073,6 +1074,137 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
}
+struct nfsd_read_dio {
+ loff_t start;
+ loff_t end;
+ unsigned long start_extra;
+ unsigned long end_extra;
+};
+
+static void init_nfsd_read_dio(struct nfsd_read_dio *read_dio)
+{
+ memset(read_dio, 0, sizeof(*read_dio));
+}
+
+#define NFSD_READ_DIO_MIN_KB (32 << 10)
+
+static bool nfsd_analyze_read_dio(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct nfsd_file *nf, loff_t offset,
+ unsigned long len, unsigned int base,
+ struct nfsd_read_dio *read_dio)
+{
+ const u32 dio_blocksize = nf->nf_dio_read_offset_align;
+ loff_t middle_end, orig_end = offset + len;
+
+ if (unlikely(!nf->nf_dio_mem_align || !dio_blocksize))
+ return false;
+ if (unlikely(dio_blocksize > PAGE_SIZE))
+ return false;
+ if (unlikely(len < dio_blocksize))
+ return false;
+
+ /* Return early if IO is irreparably misaligned (base not aligned).
+ * Ondisk alignment is implied by the following code that expands
+ * misaligned IO to have a DIO-aligned offset and len.
+ */
+ if ((base & (nf->nf_dio_mem_align-1)) != 0)
+ return false;
+
+ init_nfsd_read_dio(read_dio);
+
+ read_dio->start = round_down(offset, dio_blocksize);
+ read_dio->end = round_up(orig_end, dio_blocksize);
+ read_dio->start_extra = offset - read_dio->start;
+ read_dio->end_extra = read_dio->end - orig_end;
+
+ /*
+ * Any misaligned READ less than NFSD_READ_DIO_MIN_KB won't be expanded
+ * to be DIO-aligned (this heuristic avoids excess work, for smaller IO
+ * that can generally already perform well using buffered IO).
+ */
+ if ((read_dio->start_extra || read_dio->end_extra) &&
+ (len < NFSD_READ_DIO_MIN_KB)) {
+ init_nfsd_read_dio(read_dio);
+ return false;
+ }
+
+ return true;
+}
+
+static ssize_t nfsd_complete_misaligned_read_dio(struct svc_rqst *rqstp,
+ struct nfsd_read_dio *read_dio,
+ ssize_t bytes_read,
+ unsigned long bytes_expected,
+ loff_t *offset,
+ unsigned long *rq_bvec_numpages)
+{
+ ssize_t host_err = bytes_read;
+ loff_t v;
+
+ if (!read_dio->start_extra && !read_dio->end_extra)
+ return host_err;
+
+ /* If nfsd_analyze_read_dio() found start_extra (front-pad) page needed it
+ * must be removed from rqstp->rq_bvec[] to avoid returning unwanted data.
+ */
+ if (read_dio->start_extra) {
+ *rq_bvec_numpages -= 1;
+ v = *rq_bvec_numpages;
+ memmove(rqstp->rq_bvec, rqstp->rq_bvec + 1,
+ v * sizeof(struct bio_vec));
+ }
+ /* Eliminate any end_extra bytes from the last page */
+ v = *rq_bvec_numpages;
+ rqstp->rq_bvec[v].bv_len -= read_dio->end_extra;
+
+ if (host_err < 0) {
+ /* Underlying FS will return -EINVAL if DIO is misaligned. */
+ if (unlikely(host_err == -EINVAL))
+ pr_warn_ratelimited("%s: unexpected host_err=%zd\n",
+ __func__, host_err);
+ return host_err;
+ }
+
+ /* nfsd_analyze_read_dio() may have expanded the start and end,
+ * if so adjust returned read size to reflect original extent.
+ */
+ *offset += read_dio->start_extra;
+ if (likely(host_err >= read_dio->start_extra)) {
+ host_err -= read_dio->start_extra;
+ if (host_err > bytes_expected)
+ host_err = bytes_expected;
+ } else {
+ /* Short read that didn't read any of requested data */
+ host_err = 0;
+ }
+
+ return host_err;
+}
+
+static bool nfsd_iov_iter_aligned_bvec(const struct iov_iter *i,
+ unsigned int addr_mask, unsigned int len_mask)
+{
+ const struct bio_vec *bvec = i->bvec;
+ size_t skip = i->iov_offset;
+ size_t size = i->count;
+
+ if (size & len_mask)
+ return false;
+ do {
+ size_t len = bvec->bv_len;
+
+ if (len > size)
+ len = size;
+ if ((unsigned long)(bvec->bv_offset + skip) & addr_mask)
+ return false;
+ bvec++;
+ size -= len;
+ skip = 0;
+ } while (size);
+
+ return true;
+}
+
/**
* nfsd_iter_read - Perform a VFS read using an iterator
* @rqstp: RPC transaction context
@@ -1094,7 +1226,8 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int base, u32 *eof)
{
struct file *file = nf->nf_file;
- unsigned long v, total;
+ unsigned long v, total, in_count = *count;
+ struct nfsd_read_dio read_dio;
struct iov_iter iter;
struct kiocb kiocb;
ssize_t host_err;
@@ -1102,13 +1235,34 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
init_sync_kiocb(&kiocb, file);
+ v = 0;
+ total = in_count;
+
switch (nfsd_io_cache_read) {
case NFSD_IO_DIRECT:
- /* Verify ondisk and memory DIO alignment */
- if (nf->nf_dio_mem_align && nf->nf_dio_read_offset_align &&
- (((offset | *count) & (nf->nf_dio_read_offset_align - 1)) == 0) &&
- (base & (nf->nf_dio_mem_align - 1)) == 0)
- kiocb.ki_flags = IOCB_DIRECT;
+ /*
+ * If NFSD_IO_DIRECT enabled, expand any misaligned READ to
+ * the next DIO-aligned block (on either end of the READ).
+ */
+ if (nfsd_analyze_read_dio(rqstp, fhp, nf, offset,
+ in_count, base, &read_dio)) {
+ /* trace_nfsd_read_vector() will reflect larger
+ * DIO-aligned READ.
+ */
+ offset = read_dio.start;
+ in_count = read_dio.end - offset;
+ total = in_count;
+
+ kiocb.ki_flags |= IOCB_DIRECT;
+ if (read_dio.start_extra) {
+ len = read_dio.start_extra;
+ bvec_set_page(&rqstp->rq_bvec[v],
+ *(rqstp->rq_next_page++),
+ len, PAGE_SIZE - len);
+ total -= len;
+ ++v;
+ }
+ }
break;
case NFSD_IO_DONTCACHE:
kiocb.ki_flags = IOCB_DONTCACHE;
@@ -1119,8 +1273,6 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
kiocb.ki_pos = offset;
- v = 0;
- total = *count;
while (total) {
len = min_t(size_t, total, PAGE_SIZE - base);
bvec_set_page(&rqstp->rq_bvec[v], *(rqstp->rq_next_page++),
@@ -1131,9 +1283,21 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
WARN_ON_ONCE(v > rqstp->rq_maxpages);
- trace_nfsd_read_vector(rqstp, fhp, offset, *count);
- iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v, *count);
+ trace_nfsd_read_vector(rqstp, fhp, offset, in_count);
+ iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v, in_count);
+
+ if ((kiocb.ki_flags & IOCB_DIRECT) &&
+ !nfsd_iov_iter_aligned_bvec(&iter, nf->nf_dio_mem_align-1,
+ nf->nf_dio_read_offset_align-1))
+ kiocb.ki_flags &= ~IOCB_DIRECT;
+
host_err = vfs_iocb_iter_read(file, &kiocb, &iter);
+
+ if (in_count != *count) {
+ /* misaligned DIO expanded read to be DIO-aligned */
+ host_err = nfsd_complete_misaligned_read_dio(rqstp, &read_dio,
+ host_err, *count, &offset, &v);
+ }
return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
}
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index e64ab444e0a7f..190c2667500e2 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -163,10 +163,13 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp);
* pages, one for the request, and one for the reply.
* nfsd_splice_actor() might need an extra page when a READ payload
* is not page-aligned.
+ * nfsd_iter_read() might need two extra pages when a READ payload
+ * is not DIO-aligned -- but nfsd_iter_read() and nfsd_splice_actor()
+ * are mutually exclusive (so reuse page reserved for nfsd_splice_actor).
*/
static inline unsigned long svc_serv_maxpages(const struct svc_serv *serv)
{
- return DIV_ROUND_UP(serv->sv_max_mesg, PAGE_SIZE) + 2 + 1;
+ return DIV_ROUND_UP(serv->sv_max_mesg, PAGE_SIZE) + 2 + 1 + 1;
}
/*
--
2.44.0
next prev parent reply other threads:[~2025-09-03 20:51 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-09-03 20:51 [PATCH v9 0/9] NFSD: add "NFSD DIRECT" and "NFSD DONTCACHE" IO modes Mike Snitzer
2025-09-03 20:51 ` [PATCH v9 1/9] NFSD: filecache: add STATX_DIOALIGN and STATX_DIO_READ_ALIGN support Mike Snitzer
2025-09-03 20:51 ` [PATCH v9 2/9] NFSD: pass nfsd_file to nfsd_iter_read() Mike Snitzer
2025-09-03 20:51 ` [PATCH v9 3/9] NFSD: add io_cache_read controls to debugfs interface Mike Snitzer
2025-09-03 20:51 ` [PATCH v9 4/9] NFSD: add io_cache_write " Mike Snitzer
2025-09-03 20:51 ` Mike Snitzer [this message]
2025-09-03 20:51 ` [PATCH v9 6/9] NFSD: issue WRITEs using O_DIRECT even if IO is misaligned Mike Snitzer
2025-09-03 20:51 ` [PATCH v9 7/9] NFSD: add nfsd_analyze_read_dio and nfsd_analyze_write_dio trace events Mike Snitzer
2025-09-03 20:51 ` [PATCH v9 8/9] NFSD: add Documentation/filesystems/nfs/nfsd-io-modes.rst Mike Snitzer
2025-09-03 20:51 ` [PATCH v9 9/9] NFSD: use /end/ of rq_pages for misaligned DIO READ's start_extra page Mike Snitzer
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250903205121.41380-6-snitzer@kernel.org \
--to=snitzer@kernel.org \
--cc=chuck.lever@oracle.com \
--cc=jlayton@kernel.org \
--cc=linux-nfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox