From: Mike Snitzer <snitzer@kernel.org>
To: Trond Myklebust <trond.myklebust@hammerspace.com>,
Anna Schumaker <anna@kernel.org>
Cc: linux-nfs@vger.kernel.org
Subject: [PATCH v9 5/7] nfs/localio: add proper O_DIRECT support for READ and WRITE
Date: Mon, 15 Sep 2025 11:41:13 -0400 [thread overview]
Message-ID: <20250915154115.19579-6-snitzer@kernel.org> (raw)
In-Reply-To: <20250915154115.19579-1-snitzer@kernel.org>
Because the NFS client will already happily handle misaligned O_DIRECT
IO (by sending it out to NFSD via RPC) this commit's new capabilities
are for the benefit of LOCALIO.
LOCALIO will make best effort to transform misaligned IO to
DIO-aligned extents when possible.
LOCALIO's misaligned READ and WRITE DIO will be split into as many as
3 component IOs (@start, @middle and @end) as needed -- but only if
the @middle extent is verified to be DIO-aligned; the @start and/or
@end are then the misaligned remainders (each being a partial page).
Otherwise, if the @middle isn't DIO-aligned, the code will fall back to
issuing only a single contiguous buffered IO.
The @middle is only DIO-aligned if both the memory and on-disk offsets
for the IO are aligned relative to the underlying local filesystem's
block device limits (@dma_alignment and @logical_block_size
respectively).
The misaligned @start and/or @end extents are issued using buffered IO
and the DIO-aligned @middle is issued using O_DIRECT. The @start and
@end IOs are issued first using buffered IO and then the @middle is
issued last using direct IO with async completion (AIO). Because of
this out-of-order completion, LOCALIO's IO completion code
(nfs_local_read_done and nfs_local_write_done) is only called for the
completion of the IO's last associated iov_iter; in the case of a
DIO-aligned @middle, that is the AIO completion. nfs_local_pgio_done()
is updated to handle piece-wise partial completion of each iov_iter.
This implementation for LOCALIO's misaligned DIO handling uses 3
iov_iter that share the same backing pages in their bio_vecs (so
unfortunately 'struct nfs_local_kiocb' has 3 instead of only 1).
[Reducing LOCALIO's per-IO (struct nfs_local_kiocb) memory use can be
explored in the future. One logical progression to improve this code,
and eliminate explicit loops over up to 3 iov_iter, is by extending
'struct iov_iter' to support iov_iter_clone() and iov_iter_chain()
interfaces that are comparable to what 'struct bio' is able to support
in the block layer. But even that wouldn't avoid the need to
allocate/use up to 3 iov_iter]
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
fs/nfs/localio.c | 249 ++++++++++++++++++++++++++++++++++++++---------
1 file changed, 203 insertions(+), 46 deletions(-)
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index 82894962966e8..92e5378ad63c6 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -30,14 +30,23 @@
#define NFSDBG_FACILITY NFSDBG_VFS
+#define NFSLOCAL_MAX_IOS 3
+
struct nfs_local_kiocb {
struct kiocb kiocb;
struct bio_vec *bvec;
struct nfs_pgio_header *hdr;
struct work_struct work;
void (*aio_complete_work)(struct work_struct *);
- struct iov_iter iter ____cacheline_aligned;
struct nfsd_file *localio;
+ /* Begin mostly DIO-specific members */
+ size_t end_len;
+ short int end_iter_index;
+ short int n_iters;
+ bool iter_is_dio_aligned[NFSLOCAL_MAX_IOS];
+ loff_t offset[NFSLOCAL_MAX_IOS] ____cacheline_aligned;
+ struct iov_iter iters[NFSLOCAL_MAX_IOS];
+ /* End mostly DIO-specific members */
};
struct nfs_local_fsync_ctx {
@@ -291,7 +300,7 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr,
{
struct nfs_local_kiocb *iocb;
- iocb = kmalloc(sizeof(*iocb), flags);
+ iocb = kzalloc(sizeof(*iocb), flags);
if (iocb == NULL)
return NULL;
@@ -303,25 +312,72 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr,
}
init_sync_kiocb(&iocb->kiocb, file);
- if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags))
- iocb->kiocb.ki_flags = IOCB_DIRECT;
- iocb->kiocb.ki_pos = hdr->args.offset;
iocb->hdr = hdr;
iocb->kiocb.ki_flags &= ~IOCB_APPEND;
iocb->aio_complete_work = NULL;
+ iocb->end_iter_index = -1;
+
return iocb;
}
+struct nfs_local_dio {
+ u32 mem_align;
+ u32 offset_align;
+ loff_t middle_offset;
+ loff_t end_offset;
+ ssize_t start_len; /* Length for misaligned first extent */
+ ssize_t middle_len; /* Length for DIO-aligned middle extent */
+ ssize_t end_len; /* Length for misaligned last extent */
+};
+
+static bool
+nfs_is_local_dio_possible(struct nfs_local_kiocb *iocb, int rw,
+ size_t len, struct nfs_local_dio *local_dio)
+{
+ struct nfs_pgio_header *hdr = iocb->hdr;
+ loff_t offset = hdr->args.offset;
+ u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align;
+ loff_t start_end, orig_end, middle_end;
+
+ nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align,
+ &nf_dio_offset_align, &nf_dio_read_offset_align);
+ if (rw == ITER_DEST)
+ nf_dio_offset_align = nf_dio_read_offset_align;
+
+ if (unlikely(!nf_dio_mem_align || !nf_dio_offset_align))
+ return false;
+ if (unlikely(nf_dio_offset_align > PAGE_SIZE))
+ return false;
+ if (unlikely(len < nf_dio_offset_align))
+ return false;
+
+ local_dio->mem_align = nf_dio_mem_align;
+ local_dio->offset_align = nf_dio_offset_align;
+
+ start_end = round_up(offset, nf_dio_offset_align);
+ orig_end = offset + len;
+ middle_end = round_down(orig_end, nf_dio_offset_align);
+
+ local_dio->middle_offset = start_end;
+ local_dio->end_offset = middle_end;
+
+ local_dio->start_len = start_end - offset;
+ local_dio->middle_len = middle_end - start_end;
+ local_dio->end_len = orig_end - middle_end;
+
+ return true;
+}
+
static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i,
- loff_t offset, unsigned int addr_mask, unsigned int len_mask)
+ unsigned int addr_mask, unsigned int len_mask)
{
const struct bio_vec *bvec = i->bvec;
size_t skip = i->iov_offset;
size_t size = i->count;
- if ((offset | size) & len_mask)
+ if (size & len_mask)
return false;
do {
size_t len = bvec->bv_len;
@@ -338,8 +394,68 @@ static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i,
return true;
}
-static void
-nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int rw)
+/*
+ * Setup as many as 3 iov_iter based on extents described by @local_dio.
+ * Returns the number of iov_iter that were setup.
+ */
+static int
+nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw,
+ unsigned int nvecs, size_t len,
+ struct nfs_local_dio *local_dio)
+{
+ int n_iters = 0;
+ struct iov_iter *iters = iocb->iters;
+
+ /* Setup misaligned start? */
+ if (local_dio->start_len) {
+ iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
+ iters[n_iters].count = local_dio->start_len;
+ iocb->offset[n_iters] = iocb->hdr->args.offset;
+ iocb->iter_is_dio_aligned[n_iters] = false;
+ ++n_iters;
+ }
+
+ /* Setup misaligned end?
+ * If so, the end is purposely setup to be issued using buffered IO
+ * before the middle (which will use DIO, if DIO-aligned, with AIO).
+ * This creates problems if/when the end results in a partial write.
+ * So must save index and length of end to handle this corner case.
+ */
+ if (local_dio->end_len) {
+ iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
+ iocb->offset[n_iters] = local_dio->end_offset;
+ iov_iter_advance(&iters[n_iters],
+ local_dio->start_len + local_dio->middle_len);
+ iocb->iter_is_dio_aligned[n_iters] = false;
+ /* Save index and length of end */
+ iocb->end_iter_index = n_iters;
+ iocb->end_len = local_dio->end_len;
+ ++n_iters;
+ }
+
+ /* Setup DIO-aligned middle to be issued last, to allow for
+ * DIO with AIO completion (see nfs_local_call_{read,write}).
+ */
+ iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
+ if (local_dio->start_len)
+ iov_iter_advance(&iters[n_iters], local_dio->start_len);
+ iters[n_iters].count -= local_dio->end_len;
+ iocb->offset[n_iters] = local_dio->middle_offset;
+
+ iocb->iter_is_dio_aligned[n_iters] =
+ nfs_iov_iter_aligned_bvec(&iters[n_iters],
+ local_dio->mem_align-1, local_dio->offset_align-1);
+
+ if (unlikely(!iocb->iter_is_dio_aligned[n_iters]))
+ return 0; /* no DIO-aligned IO possible */
+ ++n_iters;
+
+ iocb->n_iters = n_iters;
+ return n_iters;
+}
+
+static noinline_for_stack void
+nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw)
{
struct nfs_pgio_header *hdr = iocb->hdr;
struct page **pagevec = hdr->page_array.pagevec;
@@ -360,26 +476,18 @@ nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int rw)
}
len = hdr->args.count - total;
- iov_iter_bvec(i, rw, iocb->bvec, v, len);
+ if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) {
+ struct nfs_local_dio local_dio;
- if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
- u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align;
- /* Verify the IO is DIO-aligned as required */
- nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align,
- &nf_dio_offset_align,
- &nf_dio_read_offset_align);
- if (rw == ITER_DEST)
- nf_dio_offset_align = nf_dio_read_offset_align;
-
- if (nf_dio_mem_align && nf_dio_offset_align &&
- nfs_iov_iter_aligned_bvec(i, hdr->args.offset,
- nf_dio_mem_align - 1,
- nf_dio_offset_align - 1))
+ if (nfs_is_local_dio_possible(iocb, rw, len, &local_dio) &&
+ nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0)
return; /* is DIO-aligned */
-
- /* Fallback to using buffered for this misaligned IO */
- iocb->kiocb.ki_flags &= ~IOCB_DIRECT;
}
+
+ /* Use buffered IO */
+ iocb->offset[0] = hdr->args.offset;
+ iov_iter_bvec(&iocb->iters[0], rw, iocb->bvec, v, len);
+ iocb->n_iters = 1;
}
static void
@@ -402,10 +510,12 @@ nfs_local_pgio_init(struct nfs_pgio_header *hdr,
static void
nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status)
{
+ /* Must handle partial completions */
if (status >= 0) {
- hdr->res.count = status;
- hdr->res.op_status = NFS4_OK;
- hdr->task.tk_status = 0;
+ hdr->res.count += status;
+ /* @hdr was initialized to 0 (zeroed during allocation) */
+ if (hdr->task.tk_status == 0)
+ hdr->res.op_status = NFS4_OK;
} else {
hdr->res.op_status = nfs_localio_errno_to_nfs4_stat(status);
hdr->task.tk_status = status;
@@ -447,14 +557,14 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status)
struct file *filp = iocb->kiocb.ki_filp;
if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
+ /* DIO is last to complete (via AIO) */
if (status == -EINVAL) {
/* Underlying FS will return -EINVAL if misaligned DIO is attempted. */
pr_info_ratelimited("nfs: Unexpected direct I/O read alignment failure\n");
}
+ nfs_local_pgio_done(hdr, status);
}
- nfs_local_pgio_done(hdr, status);
-
/*
* Must clear replen otherwise NFSv3 data corruption will occur
* if/when switching from LOCALIO back to using normal RPC.
@@ -496,12 +606,21 @@ static void nfs_local_call_read(struct work_struct *work)
save_cred = override_creds(filp->f_cred);
- if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
- iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
- iocb->aio_complete_work = nfs_local_read_aio_complete_work;
- }
+ for (int i = 0; i < iocb->n_iters ; i++) {
+ if (iocb->iter_is_dio_aligned[i]) {
+ iocb->kiocb.ki_flags |= IOCB_DIRECT;
+ iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
+ iocb->aio_complete_work = nfs_local_read_aio_complete_work;
+ }
- status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iter);
+ iocb->kiocb.ki_pos = iocb->offset[i];
+ status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]);
+ if (status != -EIOCBQUEUED) {
+ nfs_local_pgio_done(iocb->hdr, status);
+ if (iocb->hdr->task.tk_status)
+ break;
+ }
+ }
revert_creds(save_cred);
@@ -632,13 +751,16 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
dprintk("%s: wrote %ld bytes.\n", __func__, status > 0 ? status : 0);
if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
+ /* DIO is last to complete (via AIO) */
if (status == -EINVAL) {
/* Underlying FS will return -EINVAL if misaligned DIO is attempted. */
pr_info_ratelimited("nfs: Unexpected direct I/O write alignment failure\n");
}
+ nfs_local_pgio_done(hdr, status);
}
/* Handle short writes as if they are ENOSPC */
+ status = hdr->res.count;
if (status > 0 && status < hdr->args.count) {
hdr->mds_offset += status;
hdr->args.offset += status;
@@ -646,11 +768,11 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
hdr->args.count -= status;
nfs_set_pgio_error(hdr, -ENOSPC, hdr->args.offset);
status = -ENOSPC;
+ /* record -ENOSPC in terms of nfs_local_pgio_done */
+ nfs_local_pgio_done(hdr, status);
}
- if (status < 0)
+ if (hdr->task.tk_status < 0)
nfs_reset_boot_verifier(inode);
-
- nfs_local_pgio_done(hdr, status);
}
static void nfs_local_write_aio_complete_work(struct work_struct *work)
@@ -683,13 +805,48 @@ static void nfs_local_call_write(struct work_struct *work)
current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
save_cred = override_creds(filp->f_cred);
- if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
- iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
- iocb->aio_complete_work = nfs_local_write_aio_complete_work;
- }
-
file_start_write(filp);
- status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iter);
+ for (int i = 0; i < iocb->n_iters ; i++) {
+ if (iocb->iter_is_dio_aligned[i]) {
+ iocb->kiocb.ki_flags |= IOCB_DIRECT;
+ iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
+ iocb->aio_complete_work = nfs_local_write_aio_complete_work;
+ }
+retry:
+ iocb->kiocb.ki_pos = iocb->offset[i];
+ status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iters[i]);
+ if (status != -EIOCBQUEUED) {
+ if (unlikely(status >= 0 && status < iocb->iters[i].count)) {
+ /* partial write */
+ if (i == iocb->end_iter_index) {
+ /* Must not account partial end, otherwise, due
+ * to end being issued before middle: the partial
+ * write accounting in nfs_local_write_done()
+ * would incorrectly advance hdr->args.offset
+ */
+ status = 0;
+ } else {
+ /* Partial write at start or buffered middle,
+ * exit early.
+ */
+ nfs_local_pgio_done(iocb->hdr, status);
+ break;
+ }
+ } else if (unlikely(status == -ENOTBLK &&
+ (iocb->kiocb.ki_flags & IOCB_DIRECT))) {
+ /* VFS will return -ENOTBLK if DIO WRITE fails to
+ * invalidate the page cache. Retry using buffered IO.
+ */
+ iocb->kiocb.ki_flags &= ~IOCB_DIRECT;
+ iocb->kiocb.ki_complete = NULL;
+ iocb->aio_complete_work = NULL;
+ goto retry;
+ }
+ nfs_local_pgio_done(iocb->hdr, status);
+ if (iocb->hdr->task.tk_status)
+ break;
+ }
+ }
file_end_write(filp);
revert_creds(save_cred);
@@ -758,7 +915,7 @@ nfs_local_iocb_init(struct nfs_pgio_header *hdr, struct nfsd_file *localio)
iocb->hdr = hdr;
iocb->localio = localio;
- nfs_local_iter_init(&iocb->iter, iocb, rw);
+ nfs_local_iters_init(iocb, rw);
return iocb;
}
--
2.44.0
next prev parent reply other threads:[~2025-09-15 15:41 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-09-15 15:41 [PATCH v9 0/7] NFS DIRECT: align misaligned DIO for LOCALIO Mike Snitzer
2025-09-15 15:41 ` [PATCH v9 1/7] nfs/localio: make trace_nfs_local_open_fh more useful Mike Snitzer
2025-09-15 15:41 ` [PATCH v9 2/7] nfs/localio: avoid issuing misaligned IO using O_DIRECT Mike Snitzer
2025-09-15 15:41 ` [PATCH v9 3/7] nfs/localio: refactor iocb and iov_iter_bvec initialization Mike Snitzer
2025-09-15 15:41 ` [PATCH v9 4/7] nfs/localio: refactor iocb initialization further Mike Snitzer
2025-09-15 15:41 ` Mike Snitzer [this message]
2025-09-15 15:47 ` [PATCH v9 5/7] nfs/localio: add proper O_DIRECT support for READ and WRITE Mike Snitzer
2025-09-15 15:41 ` [PATCH v9 6/7] nfs/localio: add tracepoints for misaligned DIO READ and WRITE support Mike Snitzer
2025-09-15 15:41 ` [PATCH v9 7/7] NFS: add basic STATX_DIOALIGN and STATX_DIO_READ_ALIGN support Mike Snitzer
[not found] ` <aMiMpYAcHV8bYU4W@kernel.org>
[not found] ` <aNLfroQ8Ti1Vh5wh@kernel.org>
[not found] ` <aNQqUprZ3DuJhMe4@kernel.org>
[not found] ` <aNgSOM9EzMS_Q6bR@kernel.org>
2025-09-30 16:26 ` [GIT PULL] NFS LOCALIO O_DIRECT changes for Linux 6.18 Mike Snitzer
2025-09-30 17:15 ` Chuck Lever
2025-09-30 17:35 ` Mike Snitzer
2025-09-30 17:59 ` Chuck Lever
2025-09-30 19:32 ` [GIT PULL v2] " Mike Snitzer
2025-09-30 20:53 ` Anna Schumaker
2025-09-30 21:30 ` Mike Snitzer
2025-10-01 16:04 ` Mike Snitzer
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250915154115.19579-6-snitzer@kernel.org \
--to=snitzer@kernel.org \
--cc=anna@kernel.org \
--cc=linux-nfs@vger.kernel.org \
--cc=trond.myklebust@hammerspace.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).