From: Mike Snitzer <snitzer@kernel.org>
To: Chuck Lever <chuck.lever@oracle.com>,
Jeff Layton <jlayton@kernel.org>,
Trond Myklebust <trond.myklebust@hammerspace.com>,
Anna Schumaker <anna.schumaker@oracle.com>
Cc: linux-nfs@vger.kernel.org
Subject: [PATCH v5 12/13] nfs/direct: add misaligned READ handling
Date: Thu, 24 Jul 2025 15:31:01 -0400 [thread overview]
Message-ID: <20250724193102.65111-13-snitzer@kernel.org> (raw)
In-Reply-To: <20250724193102.65111-1-snitzer@kernel.org>
Because the NFS client will already happily handle misaligned O_DIRECT
IO (by sending it out to NFSD via RPC) this commit's new capabilities
are for the benefit of LOCALIO and require the nfs modparam:
localio_O_DIRECT_align_misaligned_IO=Y
When enabled, misaligned READ IO is expanded to consist of a
DIO-aligned extent followed by a single misaligned tail page (due to
it being a partial page).
Also add an nfs_analyze_dio trace event that shows how the NFS client
split a given misaligned IO into a mix of misaligned page(s) and a
DIO-aligned extent.
This combination of trace events is useful for LOCALIO READs:
echo 1 > /sys/kernel/tracing/events/nfs/nfs_analyze_dio/enable
echo 1 > /sys/kernel/tracing/events/nfs/nfs_initiate_read/enable
echo 1 > /sys/kernel/tracing/events/nfs/nfs_readpage_done/enable
echo 1 > /sys/kernel/tracing/events/xfs/xfs_file_direct_read/enable
Which for this dd command:
dd if=/mnt/share1/test of=/dev/null bs=47008 count=2 iflag=direct
Results in:
dd-63258 [002] ..... 83742.428577: nfs_analyze_dio: READ offset=0 len=47008 start=0+0 middle=0+45056 end=45056+1952
dd-63258 [002] ..... 83742.428591: nfs_initiate_read: fileid=00:2e:219750 fhandle=0xf6927a01 offset=0 count=45056
kworker/u193:3-62985 [011] ..... 83742.428594: xfs_file_direct_read: dev 259:22 ino 0x5e0000a3 disize 0x16f40 pos 0x0 bytecount 0xb000
dd-63258 [002] ..... 83742.428595: nfs_initiate_read: fileid=00:2e:219750 fhandle=0xf6927a01 offset=45056 count=1952
kworker/u193:4-63221 [004] ..... 83742.428598: nfs_readpage_done: error=0 fileid=00:2e:219750 fhandle=0xf6927a01 offset=45056 count=1952 res=1952
kworker/u193:4-63221 [004] ..... 83742.428613: nfs_readpage_done: error=0 fileid=00:2e:219750 fhandle=0xf6927a01 offset=0 count=45056 res=45056
dd-63258 [002] ..... 83742.428619: nfs_analyze_dio: READ offset=47008 len=47008 start=45056+1952 middle=47008+43104 end=90112+3904
dd-63258 [002] ..... 83742.428622: nfs_initiate_read: fileid=00:2e:219750 fhandle=0xf6927a01 offset=45056 count=45056
dd-63258 [002] ..... 83742.428624: nfs_initiate_read: fileid=00:2e:219750 fhandle=0xf6927a01 offset=90112 count=3904
kworker/u193:4-63221 [004] ..... 83742.428624: xfs_file_direct_read: dev 259:22 ino 0x5e0000a3 disize 0x16f40 pos 0xb000 bytecount 0xb000
kworker/u193:3-62985 [011] ..... 83742.428628: nfs_readpage_done: error=0 fileid=00:2e:219750 fhandle=0xf6927a01 offset=90112 count=3904 res=3904 eof
kworker/u193:3-62985 [011] ..... 83742.428642: nfs_readpage_done: error=0 fileid=00:2e:219750 fhandle=0xf6927a01 offset=45056 count=45056 res=45056
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
fs/nfs/direct.c | 178 ++++++++++++++++++++++++++++++++++++---
fs/nfs/internal.h | 7 ++
fs/nfs/nfstrace.h | 41 +++++++++
fs/nfs/pagelist.c | 7 ++
include/linux/nfs_page.h | 1 +
5 files changed, 223 insertions(+), 11 deletions(-)
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 48d89716193a..4e1e668eaa1f 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -210,6 +210,13 @@ static void nfs_direct_req_free(struct kref *kref)
nfs_put_lock_context(dreq->l_ctx);
if (dreq->ctx != NULL)
put_nfs_open_context(dreq->ctx);
+
+ if (dreq->start_extra_bvec != NULL) {
+ if (dreq->start_extra_bvec->bv_page != NULL)
+ __free_page(dreq->start_extra_bvec->bv_page);
+ kfree(dreq->start_extra_bvec);
+ }
+
kmem_cache_free(nfs_direct_cachep, dreq);
}
@@ -264,6 +271,10 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
if (dreq->count != 0) {
res = (long) dreq->count;
WARN_ON_ONCE(dreq->count < 0);
+ /* Reduce res by front_pad */
+ if ((dreq->start_extra_bvec != NULL) &&
+ res >= dreq->start_extra_bvec->bv_len)
+ res -= dreq->start_extra_bvec->bv_len;
}
dreq->iocb->ki_complete(dreq->iocb, res);
}
@@ -285,6 +296,15 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
}
nfs_direct_count_bytes(dreq, hdr);
+
+ if (dreq->start_extra_bvec != NULL && (dreq->count == dreq->max_count)) {
+ unsigned front_pad = dreq->start_extra_bvec->bv_len;
+
+ hdr->res.count -= front_pad;
+ hdr->good_bytes -= front_pad;
+ hdr->args.count -= front_pad;
+ hdr->args.offset += front_pad;
+ }
spin_unlock(&dreq->lock);
nfs_update_delegated_atime(dreq->inode);
@@ -353,6 +373,30 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
desc.pg_dreq = dreq;
inode_dio_begin(inode);
+ if (dreq->start_extra_bvec != NULL) {
+ struct nfs_page *req;
+ size_t pgbase = dreq->start_extra_bvec->bv_offset;
+ unsigned int front_pad = dreq->start_extra_bvec->bv_len;
+
+ /* Must force start pos to DIO-aligned start */
+ WARN_ON(pos != dreq->io_start);
+ req = nfs_page_create_from_page(dreq->ctx,
+ dreq->start_extra_bvec->bv_page,
+ pgbase, pos, front_pad);
+ if (IS_ERR(req)) {
+ result = PTR_ERR(req);
+ goto out;
+ }
+ if (!nfs_pageio_add_request(&desc, req)) {
+ result = desc.pg_error;
+ nfs_release_request(req);
+ goto out;
+ }
+
+ requested_bytes += front_pad;
+ pos += front_pad;
+ }
+
while (iov_iter_count(iter)) {
struct page **pagevec;
size_t bytes;
@@ -363,12 +407,19 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
rsize, &pgbase);
if (result < 0)
break;
-
- bytes = result;
- npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
+
+ /* Limit the first batch of pages to DIO-aligned boundary? */
+ if (pos < dreq->end_offset && dreq->middle_len)
+ bytes = min_t(size_t, dreq->middle_len, result);
+ else
+ bytes = result;
+ npages = (bytes + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
+
for (i = 0; i < npages; i++) {
struct nfs_page *req;
unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
+ bool issue_dio_now = false;
+
/* XXX do we need to do the eof zeroing found in async_filler? */
req = nfs_page_create_from_page(dreq->ctx, pagevec[i],
pgbase, pos, req_len);
@@ -376,15 +427,33 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
result = PTR_ERR(req);
break;
}
+
+ pgbase = 0;
+ result -= req_len;
+ bytes -= req_len;
+ requested_bytes += req_len;
+ pos += req_len;
+
+ /* Looking ahead, is this req the end of the DIO-aligned middle? */
+ if (bytes == 0 && dreq->end_len &&
+ pos == dreq->end_offset && result == dreq->end_len) {
+ desc.pg_doio_now = 1;
+ issue_dio_now = true;
+ /* Reset iter to the last page (known misaligned),
+ * issue previous DIO-aligned page and then handle
+ * the last partial page stored in iter
+ */
+ iov_iter_revert(iter, result);
+ }
+
if (!nfs_pageio_add_request(&desc, req)) {
result = desc.pg_error;
nfs_release_request(req);
break;
}
- pgbase = 0;
- bytes -= req_len;
- requested_bytes += req_len;
- pos += req_len;
+
+ if (issue_dio_now)
+ break;
}
nfs_direct_release_pages(pagevec, npages);
kvfree(pagevec);
@@ -398,6 +467,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
* If no bytes were started, return the error, and let the
* generic layer handle the completion.
*/
+out:
if (requested_bytes == 0) {
inode_dio_end(inode);
nfs_direct_req_release(dreq);
@@ -409,6 +479,70 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
return requested_bytes;
}
+/*
+ * nfs_analyze_read_dio - plan how to expand a misaligned O_DIRECT READ
+ * @offset: file offset of the READ as requested by the caller
+ * @len: byte count of the READ as requested by the caller
+ * @dreq: direct request that receives the resulting layout
+ *
+ * If localio_O_DIRECT_align_misaligned_IO is enabled, expand any
+ * misaligned READ so that it starts on the preceding DIO-aligned
+ * block boundary.
+ * - FIXME: expanding the end to also be DIO-aligned requires a
+ * bounce page that must be copied to original partial end page.
+ *
+ * On success the middle/end offsets and lengths, io_start and
+ * max_count are recorded in @dreq. When @offset is not block aligned,
+ * a bio_vec and a page are also allocated in dreq->start_extra_bvec
+ * for the front_pad bytes between the aligned start and @offset;
+ * nfs_direct_req_free() releases both.
+ *
+ * Returns true if @dreq was set up for an expanded READ. Returns
+ * false, leaving the layout fields of @dreq unset, if the feature is
+ * disabled, @len is smaller than the block size, the IO is already
+ * aligned, an allocation fails or CONFIG_NFS_LOCALIO is not enabled.
+ */
+static bool nfs_analyze_read_dio(loff_t offset, __u32 len,
+ struct nfs_direct_req *dreq)
+{
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+ /* Hardcoded to PAGE_SIZE (since we don't have the LOCALIO
+ * nfsd_file's dio_alignment here); this works for smaller
+ * alignments too (e.g. 512b).
+ */
+ u32 dio_blocksize = PAGE_SIZE;
+ loff_t start, front_pad, orig_end, middle_end;
+
+ /* Return early if the feature is disabled, if the IO is
+ * irreparably misaligned (len < PAGE_SIZE) or if the IO is
+ * already DIO-aligned.
+ */
+ if (!nfs_localio_O_DIRECT_align_misaligned_IO() ||
+ unlikely(len < dio_blocksize) ||
+ (((offset | len) & (dio_blocksize-1)) == 0))
+ return false;
+
+ start = round_down(offset, dio_blocksize);
+ front_pad = offset - start;
+ orig_end = offset + len;
+ middle_end = round_down(orig_end, dio_blocksize);
+
+ if (front_pad) {
+ gfp_t gfp_mask = nfs_io_gfp_mask();
+
+ dreq->start_extra_bvec = kmalloc(sizeof(struct bio_vec), gfp_mask);
+ if (dreq->start_extra_bvec == NULL)
+ return false;
+ dreq->start_extra_bvec->bv_page = alloc_page(gfp_mask);
+ if (dreq->start_extra_bvec->bv_page == NULL) {
+ kfree(dreq->start_extra_bvec);
+ dreq->start_extra_bvec = NULL;
+ return false;
+ }
+
+ bvec_set_page(dreq->start_extra_bvec,
+ dreq->start_extra_bvec->bv_page,
+ front_pad, PAGE_SIZE - front_pad);
+ }
+
+ dreq->middle_offset = offset;
+ dreq->middle_len = middle_end - offset;
+ dreq->end_offset = middle_end;
+ dreq->end_len = orig_end - middle_end;
+
+ dreq->io_start = start;
+ dreq->max_count = orig_end - start;
+
+ trace_nfs_analyze_dio(READ, offset, len, start, front_pad,
+ dreq->middle_offset, dreq->middle_len,
+ dreq->end_offset, dreq->end_len);
+ return true;
+#else
+ return false;
+#endif
+}
+
/**
* nfs_file_direct_read - file direct read operation for NFS files
* @iocb: target I/O control block
@@ -439,6 +573,9 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
struct nfs_lock_context *l_ctx;
ssize_t result, requested;
size_t count = iov_iter_count(iter);
+ size_t in_count = count;
+ unsigned int front_pad = 0;
+
nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
@@ -455,9 +592,20 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
if (dreq == NULL)
goto out;
+ if (!swap && nfs_analyze_read_dio(iocb->ki_pos, count, dreq)) {
+ /* note that dreq values do include front_pad
+ * (dreq->io_start -> dreq->start_extra_bvec->bv_offset)
+ */
+ iocb->ki_pos = dreq->io_start;
+ count = dreq->max_count;
+ if (dreq->start_extra_bvec)
+ front_pad = dreq->start_extra_bvec->bv_len;
+ } else {
+ dreq->io_start = iocb->ki_pos;
+ dreq->max_count = count;
+ }
+
dreq->inode = inode;
- dreq->max_count = count;
- dreq->io_start = iocb->ki_pos;
dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
l_ctx = nfs_get_lock_context(dreq->ctx);
if (IS_ERR(l_ctx)) {
@@ -483,16 +631,24 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
}
}
- NFS_I(inode)->read_io += count;
+ NFS_I(inode)->read_io += in_count;
requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
if (!swap)
nfs_end_io_direct(inode);
if (requested > 0) {
+ if (front_pad) {
+ /* given the iov_iter_revert below, must exclude the
+ * front_pad (dreq->start_extra_bvec) from requested.
+ */
+ requested -= front_pad;
+ }
+
result = nfs_direct_wait(dreq);
if (result > 0) {
- requested -= result;
+ if (front_pad && result >= front_pad)
+ result -= front_pad;
iocb->ki_pos += result;
}
iov_iter_revert(iter, requested);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index f54030684c97..06a15bf08357 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -990,4 +990,11 @@ struct nfs_direct_req {
/* for read */
#define NFS_ODIRECT_SHOULD_DIRTY (3) /* dirty user-space page after read */
#define NFS_ODIRECT_DONE INT_MAX /* write verification failed */
+
+ /* State for expanding misaligned IO to be DIO-aligned (for LOCALIO) */
+ struct bio_vec * start_extra_bvec;
+ loff_t middle_offset; /* Offset for start of DIO-aligned middle */
+ loff_t end_offset; /* Offset for start of DIO-aligned end */
+ ssize_t middle_len; /* Length for DIO-aligned middle */
+ ssize_t end_len; /* Length for misaligned last page */
};
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 334e65d6bc72..a0b9af10a744 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -1593,6 +1593,47 @@ DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_completion);
DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_schedule_iovec);
DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_reschedule_io);
+TRACE_EVENT(nfs_analyze_dio,
+ TP_PROTO(u32 rw,
+ u64 offset,
+ u32 len,
+ loff_t start,
+ loff_t start_extra,
+ loff_t middle,
+ loff_t middle_len,
+ loff_t end,
+ loff_t end_len),
+ TP_ARGS(rw, offset, len, start, start_extra, middle, middle_len, end, end_len),
+ TP_STRUCT__entry(
+ __field(u32, rw)
+ __field(u64, offset)
+ __field(u32, len)
+ __field(loff_t, start)
+ __field(loff_t, start_extra)
+ __field(loff_t, middle)
+ __field(loff_t, middle_len)
+ __field(loff_t, end)
+ __field(loff_t, end_len)
+ ),
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->offset = offset;
+ __entry->len = len;
+ __entry->start = start;
+ __entry->start_extra = start_extra;
+ __entry->middle = middle;
+ __entry->middle_len = middle_len;
+ __entry->end = end;
+ __entry->end_len = end_len;
+ ),
+ TP_printk("%s offset=%llu len=%u start=%llu+%llu middle=%llu+%llu end=%llu+%llu",
+ __entry->rw ? "WRITE" : "READ",
+ __entry->offset, __entry->len,
+ __entry->start, __entry->start_extra,
+ __entry->middle, __entry->middle_len,
+ __entry->end, __entry->end_len)
+);
+
TRACE_EVENT(nfs_fh_to_dentry,
TP_PROTO(
const struct super_block *sb,
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 9ddff27e96e9..8d877360042d 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -832,6 +832,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
int io_flags)
{
desc->pg_moreio = 0;
+ desc->pg_doio_now = 0;
desc->pg_inode = inode;
desc->pg_ops = pg_ops;
desc->pg_completion_ops = compl_ops;
@@ -1141,6 +1142,8 @@ nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
return size;
nfs_list_move_request(req, &mirror->pg_list);
mirror->pg_count += req->wb_bytes;
+ if (desc->pg_doio_now)
+ return 0; /* trigger nfs_pageio_doio() in caller */
return req->wb_bytes;
}
@@ -1220,6 +1223,10 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
nfs_pageio_doio(desc);
if (desc->pg_error < 0 || mirror->pg_recoalesce)
return 0;
+ if (desc->pg_doio_now) {
+ desc->pg_doio_now = 0;
+ return 1;
+ }
/* retry add_request for this subreq */
nfs_page_group_lock(req);
continue;
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 169b4ae30ff4..2e88dc2ff3fe 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -117,6 +117,7 @@ struct nfs_pageio_descriptor {
u32 pg_mirror_idx; /* current mirror */
unsigned short pg_maxretrans;
unsigned char pg_moreio : 1;
+ unsigned char pg_doio_now : 1;
};
/* arbitrarily selected limit to number of mirrors */
--
2.44.0
next prev parent reply other threads:[~2025-07-24 19:31 UTC|newest]
Thread overview: 22+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-07-24 19:30 [PATCH v5 00/13] NFSD DIRECT and NFS DIRECT Mike Snitzer
2025-07-24 19:30 ` [PATCH v5 01/13] NFSD: filecache: add STATX_DIOALIGN and STATX_DIO_READ_ALIGN support Mike Snitzer
2025-07-24 19:30 ` [PATCH v5 02/13] NFSD: pass nfsd_file to nfsd_iter_read() Mike Snitzer
2025-07-24 19:30 ` [PATCH v5 03/13] NFSD: add io_cache_read controls to debugfs interface Mike Snitzer
2025-07-24 19:30 ` [PATCH v5 04/13] NFSD: add io_cache_write " Mike Snitzer
2025-07-24 19:30 ` [PATCH v5 05/13] NFSD: filecache: only get DIO alignment attrs if NFSD_IO_DIRECT enabled Mike Snitzer
2025-07-24 19:30 ` [PATCH v5 06/13] NFSD: issue READs using O_DIRECT even if IO is misaligned Mike Snitzer
2025-07-24 19:30 ` [PATCH v5 07/13] nfs/localio: avoid bouncing LOCALIO if nfs_client_is_local() Mike Snitzer
2025-07-24 19:30 ` [PATCH v5 08/13] nfs/localio: make trace_nfs_local_open_fh more useful Mike Snitzer
2025-07-24 19:30 ` [PATCH v5 09/13] nfs/localio: add nfsd_file_dio_alignment Mike Snitzer
2025-07-24 19:30 ` [PATCH v5 10/13] nfs/localio: refactor iocb initialization Mike Snitzer
2025-07-24 19:31 ` [PATCH v5 11/13] nfs/localio: fallback to NFSD for misaligned O_DIRECT READs Mike Snitzer
2025-07-24 19:31 ` Mike Snitzer [this message]
2025-07-24 19:31 ` [PATCH v5 13/13] nfs/direct: add misaligned WRITE handling Mike Snitzer
2025-07-27 15:39 ` [PATCH v5 00/13] NFSD DIRECT and NFS DIRECT Chuck Lever
2025-07-28 13:44 ` Mike Snitzer
2025-07-28 13:48 ` Chuck Lever
2025-07-28 14:08 ` Mike Snitzer
2025-07-27 16:16 ` (subset) " Chuck Lever
2025-07-28 13:51 ` Mike Snitzer
2025-07-28 13:53 ` Chuck Lever
2025-07-28 13:58 ` Mike Snitzer
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250724193102.65111-13-snitzer@kernel.org \
--to=snitzer@kernel.org \
--cc=anna.schumaker@oracle.com \
--cc=chuck.lever@oracle.com \
--cc=jlayton@kernel.org \
--cc=linux-nfs@vger.kernel.org \
--cc=trond.myklebust@hammerspace.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).