* [PATCH v9 1/7] nfs/localio: make trace_nfs_local_open_fh more useful
2025-09-15 15:41 [PATCH v9 0/7] NFS DIRECT: align misaligned DIO for LOCALIO Mike Snitzer
@ 2025-09-15 15:41 ` Mike Snitzer
2025-09-15 15:41 ` [PATCH v9 2/7] nfs/localio: avoid issuing misaligned IO using O_DIRECT Mike Snitzer
` (6 subsequent siblings)
7 siblings, 0 replies; 17+ messages in thread
From: Mike Snitzer @ 2025-09-15 15:41 UTC (permalink / raw)
To: Trond Myklebust, Anna Schumaker; +Cc: linux-nfs
Always trigger trace event when LOCALIO opens a file.
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
fs/nfs/localio.c | 5 +++--
fs/nfs/nfstrace.h | 6 +++---
2 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index 97abf62f109d2..42ea50d42c995 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -231,13 +231,13 @@ __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
struct nfsd_file __rcu **pnf,
const fmode_t mode)
{
+ int status = 0;
struct nfsd_file *localio;
localio = nfs_open_local_fh(&clp->cl_uuid, clp->cl_rpcclient,
cred, fh, nfl, pnf, mode);
if (IS_ERR(localio)) {
- int status = PTR_ERR(localio);
- trace_nfs_local_open_fh(fh, mode, status);
+ status = PTR_ERR(localio);
switch (status) {
case -ENOMEM:
case -ENXIO:
@@ -247,6 +247,7 @@ __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
nfs_local_probe(clp);
}
}
+ trace_nfs_local_open_fh(fh, mode, status);
return localio;
}
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 627115179795f..d5949da8c2e5d 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -1713,10 +1713,10 @@ TRACE_EVENT(nfs_local_open_fh,
),
TP_printk(
- "error=%d fhandle=0x%08x mode=%s",
- __entry->error,
+ "fhandle=0x%08x mode=%s result=%d",
__entry->fhandle,
- show_fs_fmode_flags(__entry->fmode)
+ show_fs_fmode_flags(__entry->fmode),
+ __entry->error
)
);
--
2.44.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH v9 2/7] nfs/localio: avoid issuing misaligned IO using O_DIRECT
2025-09-15 15:41 [PATCH v9 0/7] NFS DIRECT: align misaligned DIO for LOCALIO Mike Snitzer
2025-09-15 15:41 ` [PATCH v9 1/7] nfs/localio: make trace_nfs_local_open_fh more useful Mike Snitzer
@ 2025-09-15 15:41 ` Mike Snitzer
2025-09-15 15:41 ` [PATCH v9 3/7] nfs/localio: refactor iocb and iov_iter_bvec initialization Mike Snitzer
` (5 subsequent siblings)
7 siblings, 0 replies; 17+ messages in thread
From: Mike Snitzer @ 2025-09-15 15:41 UTC (permalink / raw)
To: Trond Myklebust, Anna Schumaker; +Cc: linux-nfs
Add nfsd_file_dio_alignment and use it to avoid issuing misaligned IO
using O_DIRECT. Any misaligned DIO falls back to using buffered IO.
Because misaligned DIO is now handled safely, remove the nfs modparam
'localio_O_DIRECT_semantics' that was added to require users opt-in to
the requirement that all O_DIRECT be properly DIO-aligned.
Also, introduce nfs_iov_iter_aligned_bvec() which is a variant of
iov_iter_aligned_bvec() that also verifies the offset associated with
an iov_iter is DIO-aligned. NOTE: in a parallel effort,
iov_iter_aligned_bvec() is being removed along with
iov_iter_is_aligned().
Lastly, add pr_info_ratelimited if underlying filesystem returns
-EINVAL because it was made to try O_DIRECT for IO that is not
DIO-aligned (shouldn't happen, so its best to be louder if it does).
Fixes: 3feec68563d ("nfs/localio: add direct IO enablement with sync and async IO support")
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
fs/nfs/localio.c | 69 ++++++++++++++++++++++++++++++++------
fs/nfsd/localio.c | 11 ++++++
include/linux/nfslocalio.h | 2 ++
3 files changed, 72 insertions(+), 10 deletions(-)
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index 42ea50d42c995..380407c41822c 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -49,11 +49,6 @@ struct nfs_local_fsync_ctx {
static bool localio_enabled __read_mostly = true;
module_param(localio_enabled, bool, 0644);
-static bool localio_O_DIRECT_semantics __read_mostly = false;
-module_param(localio_O_DIRECT_semantics, bool, 0644);
-MODULE_PARM_DESC(localio_O_DIRECT_semantics,
- "LOCALIO will use O_DIRECT semantics to filesystem.");
-
static inline bool nfs_client_is_local(const struct nfs_client *clp)
{
return !!rcu_access_pointer(clp->cl_uuid.net);
@@ -322,12 +317,9 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr,
return NULL;
}
- if (localio_O_DIRECT_semantics &&
- test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) {
- iocb->kiocb.ki_filp = file;
+ init_sync_kiocb(&iocb->kiocb, file);
+ if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags))
iocb->kiocb.ki_flags = IOCB_DIRECT;
- } else
- init_sync_kiocb(&iocb->kiocb, file);
iocb->kiocb.ki_pos = hdr->args.offset;
iocb->hdr = hdr;
@@ -337,6 +329,30 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr,
return iocb;
}
+static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i,
+ loff_t offset, unsigned int addr_mask, unsigned int len_mask)
+{
+ const struct bio_vec *bvec = i->bvec;
+ size_t skip = i->iov_offset;
+ size_t size = i->count;
+
+ if ((offset | size) & len_mask)
+ return false;
+ do {
+ size_t len = bvec->bv_len;
+
+ if (len > size)
+ len = size;
+ if ((unsigned long)(bvec->bv_offset + skip) & addr_mask)
+ return false;
+ bvec++;
+ size -= len;
+ skip = 0;
+ } while (size);
+
+ return true;
+}
+
static void
nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int dir)
{
@@ -346,6 +362,25 @@ nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int dir)
hdr->args.count + hdr->args.pgbase);
if (hdr->args.pgbase != 0)
iov_iter_advance(i, hdr->args.pgbase);
+
+ if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
+ u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align;
+ /* Verify the IO is DIO-aligned as required */
+ nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align,
+ &nf_dio_offset_align,
+ &nf_dio_read_offset_align);
+ if (dir == READ)
+ nf_dio_offset_align = nf_dio_read_offset_align;
+
+ if (nf_dio_mem_align && nf_dio_offset_align &&
+ nfs_iov_iter_aligned_bvec(i, hdr->args.offset,
+ nf_dio_mem_align - 1,
+ nf_dio_offset_align - 1))
+ return; /* is DIO-aligned */
+
+ /* Fallback to using buffered for this misaligned IO */
+ iocb->kiocb.ki_flags &= ~IOCB_DIRECT;
+ }
}
static void
@@ -406,6 +441,13 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status)
struct nfs_pgio_header *hdr = iocb->hdr;
struct file *filp = iocb->kiocb.ki_filp;
+ if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
+ if (status == -EINVAL) {
+ /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */
+ pr_info_ratelimited("nfs: Unexpected direct I/O read alignment failure\n");
+ }
+ }
+
nfs_local_pgio_done(hdr, status);
/*
@@ -598,6 +640,13 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
dprintk("%s: wrote %ld bytes.\n", __func__, status > 0 ? status : 0);
+ if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
+ if (status == -EINVAL) {
+ /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */
+ pr_info_ratelimited("nfs: Unexpected direct I/O write alignment failure\n");
+ }
+ }
+
/* Handle short writes as if they are ENOSPC */
if (status > 0 && status < hdr->args.count) {
hdr->mds_offset += status;
diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
index 269fa9391dc46..be710d809a3ba 100644
--- a/fs/nfsd/localio.c
+++ b/fs/nfsd/localio.c
@@ -117,12 +117,23 @@ nfsd_open_local_fh(struct net *net, struct auth_domain *dom,
return localio;
}
+static void nfsd_file_dio_alignment(struct nfsd_file *nf,
+ u32 *nf_dio_mem_align,
+ u32 *nf_dio_offset_align,
+ u32 *nf_dio_read_offset_align)
+{
+ *nf_dio_mem_align = nf->nf_dio_mem_align;
+ *nf_dio_offset_align = nf->nf_dio_offset_align;
+ *nf_dio_read_offset_align = nf->nf_dio_read_offset_align;
+}
+
static const struct nfsd_localio_operations nfsd_localio_ops = {
.nfsd_net_try_get = nfsd_net_try_get,
.nfsd_net_put = nfsd_net_put,
.nfsd_open_local_fh = nfsd_open_local_fh,
.nfsd_file_put_local = nfsd_file_put_local,
.nfsd_file_file = nfsd_file_file,
+ .nfsd_file_dio_alignment = nfsd_file_dio_alignment,
};
void nfsd_localio_ops_init(void)
diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h
index 59ea90bd136b6..3d91043254e64 100644
--- a/include/linux/nfslocalio.h
+++ b/include/linux/nfslocalio.h
@@ -64,6 +64,8 @@ struct nfsd_localio_operations {
const fmode_t);
struct net *(*nfsd_file_put_local)(struct nfsd_file __rcu **);
struct file *(*nfsd_file_file)(struct nfsd_file *);
+ void (*nfsd_file_dio_alignment)(struct nfsd_file *,
+ u32 *, u32 *, u32 *);
} ____cacheline_aligned;
extern void nfsd_localio_ops_init(void);
--
2.44.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH v9 3/7] nfs/localio: refactor iocb and iov_iter_bvec initialization
2025-09-15 15:41 [PATCH v9 0/7] NFS DIRECT: align misaligned DIO for LOCALIO Mike Snitzer
2025-09-15 15:41 ` [PATCH v9 1/7] nfs/localio: make trace_nfs_local_open_fh more useful Mike Snitzer
2025-09-15 15:41 ` [PATCH v9 2/7] nfs/localio: avoid issuing misaligned IO using O_DIRECT Mike Snitzer
@ 2025-09-15 15:41 ` Mike Snitzer
2025-09-15 15:41 ` [PATCH v9 4/7] nfs/localio: refactor iocb initialization further Mike Snitzer
` (4 subsequent siblings)
7 siblings, 0 replies; 17+ messages in thread
From: Mike Snitzer @ 2025-09-15 15:41 UTC (permalink / raw)
To: Trond Myklebust, Anna Schumaker; +Cc: linux-nfs
nfs_local_iter_init() is updated to follow the same pattern to
initializing LOCALIO's iov_iter_bvec as was established by
nfsd_iter_read().
Other LOCALIO iocb initialization refactoring in this commit offers
incremental cleanup that will be taken further by the next commit.
No functional change.
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
fs/nfs/localio.c | 70 +++++++++++++++++++++++-------------------------
1 file changed, 33 insertions(+), 37 deletions(-)
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index 380407c41822c..2ffe9fb5c5c1a 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -277,23 +277,6 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
}
EXPORT_SYMBOL_GPL(nfs_local_open_fh);
-static struct bio_vec *
-nfs_bvec_alloc_and_import_pagevec(struct page **pagevec,
- unsigned int npages, gfp_t flags)
-{
- struct bio_vec *bvec, *p;
-
- bvec = kmalloc_array(npages, sizeof(*bvec), flags);
- if (bvec != NULL) {
- for (p = bvec; npages > 0; p++, pagevec++, npages--) {
- p->bv_page = *pagevec;
- p->bv_len = PAGE_SIZE;
- p->bv_offset = 0;
- }
- }
- return bvec;
-}
-
static void
nfs_local_iocb_free(struct nfs_local_kiocb *iocb)
{
@@ -310,8 +293,9 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr,
iocb = kmalloc(sizeof(*iocb), flags);
if (iocb == NULL)
return NULL;
- iocb->bvec = nfs_bvec_alloc_and_import_pagevec(hdr->page_array.pagevec,
- hdr->page_array.npages, flags);
+
+ iocb->bvec = kmalloc_array(hdr->page_array.npages,
+ sizeof(struct bio_vec), flags);
if (iocb->bvec == NULL) {
kfree(iocb);
return NULL;
@@ -354,14 +338,28 @@ static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i,
}
static void
-nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int dir)
+nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int rw)
{
struct nfs_pgio_header *hdr = iocb->hdr;
+ struct page **pagevec = hdr->page_array.pagevec;
+ unsigned long v, total;
+ unsigned int base;
+ size_t len;
- iov_iter_bvec(i, dir, iocb->bvec, hdr->page_array.npages,
- hdr->args.count + hdr->args.pgbase);
- if (hdr->args.pgbase != 0)
- iov_iter_advance(i, hdr->args.pgbase);
+ v = 0;
+ total = hdr->args.count;
+ base = hdr->args.pgbase;
+ while (total) {
+ len = min_t(size_t, total, PAGE_SIZE - base);
+ bvec_set_page(&iocb->bvec[v], *(pagevec++), len, base);
+ total -= len;
+ base = 0;
+ if (++v == hdr->page_array.npages)
+ break;
+ }
+ len = hdr->args.count - total;
+
+ iov_iter_bvec(i, rw, iocb->bvec, v, len);
if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align;
@@ -369,7 +367,7 @@ nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int dir)
nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align,
&nf_dio_offset_align,
&nf_dio_read_offset_align);
- if (dir == READ)
+ if (rw == ITER_DEST)
nf_dio_offset_align = nf_dio_read_offset_align;
if (nf_dio_mem_align && nf_dio_offset_align &&
@@ -492,7 +490,11 @@ static void nfs_local_call_read(struct work_struct *work)
save_cred = override_creds(filp->f_cred);
- nfs_local_iter_init(&iter, iocb, READ);
+ nfs_local_iter_init(&iter, iocb, ITER_DEST);
+ if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
+ iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
+ iocb->aio_complete_work = nfs_local_read_aio_complete_work;
+ }
status = filp->f_op->read_iter(&iocb->kiocb, &iter);
@@ -527,11 +529,6 @@ nfs_do_local_read(struct nfs_pgio_header *hdr,
nfs_local_pgio_init(hdr, call_ops);
hdr->res.eof = false;
- if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
- iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
- iocb->aio_complete_work = nfs_local_read_aio_complete_work;
- }
-
INIT_WORK(&iocb->work, nfs_local_call_read);
queue_work(nfslocaliod_workqueue, &iocb->work);
@@ -693,7 +690,11 @@ static void nfs_local_call_write(struct work_struct *work)
current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
save_cred = override_creds(filp->f_cred);
- nfs_local_iter_init(&iter, iocb, WRITE);
+ nfs_local_iter_init(&iter, iocb, ITER_SOURCE);
+ if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
+ iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
+ iocb->aio_complete_work = nfs_local_write_aio_complete_work;
+ }
file_start_write(filp);
status = filp->f_op->write_iter(&iocb->kiocb, &iter);
@@ -744,11 +745,6 @@ nfs_do_local_write(struct nfs_pgio_header *hdr,
nfs_set_local_verifier(hdr->inode, hdr->res.verf, hdr->args.stable);
- if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
- iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
- iocb->aio_complete_work = nfs_local_write_aio_complete_work;
- }
-
INIT_WORK(&iocb->work, nfs_local_call_write);
queue_work(nfslocaliod_workqueue, &iocb->work);
--
2.44.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH v9 4/7] nfs/localio: refactor iocb initialization further
2025-09-15 15:41 [PATCH v9 0/7] NFS DIRECT: align misaligned DIO for LOCALIO Mike Snitzer
` (2 preceding siblings ...)
2025-09-15 15:41 ` [PATCH v9 3/7] nfs/localio: refactor iocb and iov_iter_bvec initialization Mike Snitzer
@ 2025-09-15 15:41 ` Mike Snitzer
2025-09-15 15:41 ` SYNCFrom cb702c86aa12e5477e1e4aca0c1384a821f2afef Mon Sep 17 00:00:00 2001 Mike Snitzer
` (3 subsequent siblings)
7 siblings, 0 replies; 17+ messages in thread
From: Mike Snitzer @ 2025-09-15 15:41 UTC (permalink / raw)
To: Trond Myklebust, Anna Schumaker; +Cc: linux-nfs
The goal of this commit's various refactoring is to have LOCALIO's per
IO initialization occur in process context so that we don't get into a
situation where IO fails to be issued from workqueue (e.g. due to lack
of memory, etc). Better to have LOCALIO's iocb initialization fail
early.
There isn't immediate need but this commit makes it possible for
LOCALIO to fallback to NFS pagelist code in process context to allow
for immediate retry over RPC.
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
fs/nfs/localio.c | 95 ++++++++++++++++++++++++++++--------------------
1 file changed, 56 insertions(+), 39 deletions(-)
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index 2ffe9fb5c5c1a..82894962966e8 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -36,6 +36,7 @@ struct nfs_local_kiocb {
struct nfs_pgio_header *hdr;
struct work_struct work;
void (*aio_complete_work)(struct work_struct *);
+ struct iov_iter iter ____cacheline_aligned;
struct nfsd_file *localio;
};
@@ -412,12 +413,18 @@ nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status)
}
static void
-nfs_local_pgio_release(struct nfs_local_kiocb *iocb)
+nfs_local_iocb_release(struct nfs_local_kiocb *iocb)
{
- struct nfs_pgio_header *hdr = iocb->hdr;
-
nfs_local_file_put(iocb->localio);
nfs_local_iocb_free(iocb);
+}
+
+static void
+nfs_local_pgio_release(struct nfs_local_kiocb *iocb)
+{
+ struct nfs_pgio_header *hdr = iocb->hdr;
+
+ nfs_local_iocb_release(iocb);
nfs_local_hdr_release(hdr, hdr->task.tk_ops);
}
@@ -485,18 +492,16 @@ static void nfs_local_call_read(struct work_struct *work)
container_of(work, struct nfs_local_kiocb, work);
struct file *filp = iocb->kiocb.ki_filp;
const struct cred *save_cred;
- struct iov_iter iter;
ssize_t status;
save_cred = override_creds(filp->f_cred);
- nfs_local_iter_init(&iter, iocb, ITER_DEST);
if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
iocb->aio_complete_work = nfs_local_read_aio_complete_work;
}
- status = filp->f_op->read_iter(&iocb->kiocb, &iter);
+ status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iter);
revert_creds(save_cred);
@@ -507,25 +512,14 @@ static void nfs_local_call_read(struct work_struct *work)
}
static int
-nfs_do_local_read(struct nfs_pgio_header *hdr,
- struct nfsd_file *localio,
+nfs_local_do_read(struct nfs_local_kiocb *iocb,
const struct rpc_call_ops *call_ops)
{
- struct nfs_local_kiocb *iocb;
- struct file *file = nfs_to->nfsd_file_file(localio);
-
- /* Don't support filesystems without read_iter */
- if (!file->f_op->read_iter)
- return -EAGAIN;
+ struct nfs_pgio_header *hdr = iocb->hdr;
dprintk("%s: vfs_read count=%u pos=%llu\n",
__func__, hdr->args.count, hdr->args.offset);
- iocb = nfs_local_iocb_alloc(hdr, file, GFP_KERNEL);
- if (iocb == NULL)
- return -ENOMEM;
- iocb->localio = localio;
-
nfs_local_pgio_init(hdr, call_ops);
hdr->res.eof = false;
@@ -684,20 +678,18 @@ static void nfs_local_call_write(struct work_struct *work)
struct file *filp = iocb->kiocb.ki_filp;
unsigned long old_flags = current->flags;
const struct cred *save_cred;
- struct iov_iter iter;
ssize_t status;
current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
save_cred = override_creds(filp->f_cred);
- nfs_local_iter_init(&iter, iocb, ITER_SOURCE);
if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
iocb->aio_complete_work = nfs_local_write_aio_complete_work;
}
file_start_write(filp);
- status = filp->f_op->write_iter(&iocb->kiocb, &iter);
+ status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iter);
file_end_write(filp);
revert_creds(save_cred);
@@ -711,26 +703,15 @@ static void nfs_local_call_write(struct work_struct *work)
}
static int
-nfs_do_local_write(struct nfs_pgio_header *hdr,
- struct nfsd_file *localio,
+nfs_local_do_write(struct nfs_local_kiocb *iocb,
const struct rpc_call_ops *call_ops)
{
- struct nfs_local_kiocb *iocb;
- struct file *file = nfs_to->nfsd_file_file(localio);
-
- /* Don't support filesystems without write_iter */
- if (!file->f_op->write_iter)
- return -EAGAIN;
+ struct nfs_pgio_header *hdr = iocb->hdr;
dprintk("%s: vfs_write count=%u pos=%llu %s\n",
__func__, hdr->args.count, hdr->args.offset,
(hdr->args.stable == NFS_UNSTABLE) ? "unstable" : "stable");
- iocb = nfs_local_iocb_alloc(hdr, file, GFP_NOIO);
- if (iocb == NULL)
- return -ENOMEM;
- iocb->localio = localio;
-
switch (hdr->args.stable) {
default:
break;
@@ -751,32 +732,68 @@ nfs_do_local_write(struct nfs_pgio_header *hdr,
return 0;
}
+static struct nfs_local_kiocb *
+nfs_local_iocb_init(struct nfs_pgio_header *hdr, struct nfsd_file *localio)
+{
+ struct file *file = nfs_to->nfsd_file_file(localio);
+ struct nfs_local_kiocb *iocb;
+ gfp_t gfp_mask;
+ int rw;
+
+ if (hdr->rw_mode & FMODE_READ) {
+ if (!file->f_op->read_iter)
+ return ERR_PTR(-EOPNOTSUPP);
+ gfp_mask = GFP_KERNEL;
+ rw = ITER_DEST;
+ } else {
+ if (!file->f_op->write_iter)
+ return ERR_PTR(-EOPNOTSUPP);
+ gfp_mask = GFP_NOIO;
+ rw = ITER_SOURCE;
+ }
+
+ iocb = nfs_local_iocb_alloc(hdr, file, gfp_mask);
+ if (iocb == NULL)
+ return ERR_PTR(-ENOMEM);
+ iocb->hdr = hdr;
+ iocb->localio = localio;
+
+ nfs_local_iter_init(&iocb->iter, iocb, rw);
+
+ return iocb;
+}
+
int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio,
struct nfs_pgio_header *hdr,
const struct rpc_call_ops *call_ops)
{
+ struct nfs_local_kiocb *iocb;
int status = 0;
if (!hdr->args.count)
return 0;
+ iocb = nfs_local_iocb_init(hdr, localio);
+ if (IS_ERR(iocb))
+ return PTR_ERR(iocb);
+
switch (hdr->rw_mode) {
case FMODE_READ:
- status = nfs_do_local_read(hdr, localio, call_ops);
+ status = nfs_local_do_read(iocb, call_ops);
break;
case FMODE_WRITE:
- status = nfs_do_local_write(hdr, localio, call_ops);
+ status = nfs_local_do_write(iocb, call_ops);
break;
default:
dprintk("%s: invalid mode: %d\n", __func__,
hdr->rw_mode);
- status = -EINVAL;
+ status = -EOPNOTSUPP;
}
if (status != 0) {
if (status == -EAGAIN)
nfs_localio_disable_client(clp);
- nfs_local_file_put(localio);
+ nfs_local_iocb_release(iocb);
hdr->task.tk_status = status;
nfs_local_hdr_release(hdr, call_ops);
}
--
2.44.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* SYNCFrom cb702c86aa12e5477e1e4aca0c1384a821f2afef Mon Sep 17 00:00:00 2001
2025-09-15 15:41 [PATCH v9 0/7] NFS DIRECT: align misaligned DIO for LOCALIO Mike Snitzer
` (3 preceding siblings ...)
2025-09-15 15:41 ` [PATCH v9 4/7] nfs/localio: refactor iocb initialization further Mike Snitzer
@ 2025-09-15 15:41 ` Mike Snitzer
2025-09-15 15:47 ` [PATCH v9 5/7] nfs/localio: add proper O_DIRECT support for READ and WRITE Mike Snitzer
2025-09-15 15:41 ` [PATCH v9 6/7] nfs/localio: add tracepoints for misaligned DIO READ and WRITE support Mike Snitzer
` (2 subsequent siblings)
7 siblings, 1 reply; 17+ messages in thread
From: Mike Snitzer @ 2025-09-15 15:41 UTC (permalink / raw)
To: Trond Myklebust, Anna Schumaker; +Cc: linux-nfs
Because the NFS client will already happily handle misaligned O_DIRECT
IO (by sending it out to NFSD via RPC) this commit's new capabilities
are for the benefit of LOCALIO.
LOCALIO will make best effort to transform misaligned IO to
DIO-aligned extents when possible.
LOCALIO's READ and WRITE DIO that is misaligned will be split into as
many as 3 component IOs (@start, @middle and @end) as needed -- IFF
the @middle extent is verified to be DIO-aligned, and then the @start
and/or @end are misaligned (due to each being a partial page).
Otherwise if the @middle isn't DIO-aligned the code will fallback to
issuing only a single contiguous buffered IO.
The @middle is only DIO-aligned if both the memory and on-disk offsets
for the IO are aligned relative to the underlying local filesystem's
block device limits (@dma_alignment and @logical_block_size
respectively).
The misaligned @start and/or @end extents are issued using buffered IO
and the DIO-aligned @middle is issued using O_DIRECT. The @start and
@end IOs are issued first using buffered IO and then the @middle is
issued last using direct IO with async completion (AIO). This out of
order IO completion means that LOCALIO's IO completion code
(nfs_local_read_done and nfs_local_write_done) is only called for the
IO's last associated iov_iter completion. And in the case of
DIO-aligned @middle it completes last using AIO. nfs_local_pgio_done()
is updated to handle piece-wise partial completion of each iov_iter.
This implementation for LOCALIO's misaligned DIO handling uses 3
iov_iter that share the same backing pages in their bio_vecs (so
unfortunately 'struct nfs_local_kiocb' has 3 instead of only 1).
[Reducing LOCALIO's per-IO (struct nfs_local_kiocb) memory use can be
explored in the future. One logical progression to improve this code,
and eliminate explicit loops over up to 3 iov_iter, is by extending
'struct iov_iter' to support iov_iter_clone() and iov_iter_chain()
interfaces that are comparable to what 'struct bio' is able to support
in the block layer. But even that wouldn't avoid the need to
allocate/use up to 3 iov_iter]
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
fs/nfs/localio.c | 249 ++++++++++++++++++++++++++++++++++++++---------
1 file changed, 203 insertions(+), 46 deletions(-)
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index 82894962966e8..92e5378ad63c6 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -30,14 +30,23 @@
#define NFSDBG_FACILITY NFSDBG_VFS
+#define NFSLOCAL_MAX_IOS 3
+
struct nfs_local_kiocb {
struct kiocb kiocb;
struct bio_vec *bvec;
struct nfs_pgio_header *hdr;
struct work_struct work;
void (*aio_complete_work)(struct work_struct *);
- struct iov_iter iter ____cacheline_aligned;
struct nfsd_file *localio;
+ /* Begin mostly DIO-specific members */
+ size_t end_len;
+ short int end_iter_index;
+ short int n_iters;
+ bool iter_is_dio_aligned[NFSLOCAL_MAX_IOS];
+ loff_t offset[NFSLOCAL_MAX_IOS] ____cacheline_aligned;
+ struct iov_iter iters[NFSLOCAL_MAX_IOS];
+ /* End mostly DIO-specific members */
};
struct nfs_local_fsync_ctx {
@@ -291,7 +300,7 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr,
{
struct nfs_local_kiocb *iocb;
- iocb = kmalloc(sizeof(*iocb), flags);
+ iocb = kzalloc(sizeof(*iocb), flags);
if (iocb == NULL)
return NULL;
@@ -303,25 +312,72 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr,
}
init_sync_kiocb(&iocb->kiocb, file);
- if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags))
- iocb->kiocb.ki_flags = IOCB_DIRECT;
- iocb->kiocb.ki_pos = hdr->args.offset;
iocb->hdr = hdr;
iocb->kiocb.ki_flags &= ~IOCB_APPEND;
iocb->aio_complete_work = NULL;
+ iocb->end_iter_index = -1;
+
return iocb;
}
+struct nfs_local_dio {
+ u32 mem_align;
+ u32 offset_align;
+ loff_t middle_offset;
+ loff_t end_offset;
+ ssize_t start_len; /* Length for misaligned first extent */
+ ssize_t middle_len; /* Length for DIO-aligned middle extent */
+ ssize_t end_len; /* Length for misaligned last extent */
+};
+
+static bool
+nfs_is_local_dio_possible(struct nfs_local_kiocb *iocb, int rw,
+ size_t len, struct nfs_local_dio *local_dio)
+{
+ struct nfs_pgio_header *hdr = iocb->hdr;
+ loff_t offset = hdr->args.offset;
+ u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align;
+ loff_t start_end, orig_end, middle_end;
+
+ nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align,
+ &nf_dio_offset_align, &nf_dio_read_offset_align);
+ if (rw == ITER_DEST)
+ nf_dio_offset_align = nf_dio_read_offset_align;
+
+ if (unlikely(!nf_dio_mem_align || !nf_dio_offset_align))
+ return false;
+ if (unlikely(nf_dio_offset_align > PAGE_SIZE))
+ return false;
+ if (unlikely(len < nf_dio_offset_align))
+ return false;
+
+ local_dio->mem_align = nf_dio_mem_align;
+ local_dio->offset_align = nf_dio_offset_align;
+
+ start_end = round_up(offset, nf_dio_offset_align);
+ orig_end = offset + len;
+ middle_end = round_down(orig_end, nf_dio_offset_align);
+
+ local_dio->middle_offset = start_end;
+ local_dio->end_offset = middle_end;
+
+ local_dio->start_len = start_end - offset;
+ local_dio->middle_len = middle_end - start_end;
+ local_dio->end_len = orig_end - middle_end;
+
+ return true;
+}
+
static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i,
- loff_t offset, unsigned int addr_mask, unsigned int len_mask)
+ unsigned int addr_mask, unsigned int len_mask)
{
const struct bio_vec *bvec = i->bvec;
size_t skip = i->iov_offset;
size_t size = i->count;
- if ((offset | size) & len_mask)
+ if (size & len_mask)
return false;
do {
size_t len = bvec->bv_len;
@@ -338,8 +394,68 @@ static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i,
return true;
}
-static void
-nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int rw)
+/*
+ * Setup as many as 3 iov_iter based on extents described by @local_dio.
+ * Returns the number of iov_iter that were setup.
+ */
+static int
+nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw,
+ unsigned int nvecs, size_t len,
+ struct nfs_local_dio *local_dio)
+{
+ int n_iters = 0;
+ struct iov_iter *iters = iocb->iters;
+
+ /* Setup misaligned start? */
+ if (local_dio->start_len) {
+ iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
+ iters[n_iters].count = local_dio->start_len;
+ iocb->offset[n_iters] = iocb->hdr->args.offset;
+ iocb->iter_is_dio_aligned[n_iters] = false;
+ ++n_iters;
+ }
+
+ /* Setup misaligned end?
+ * If so, the end is purposely setup to be issued using buffered IO
+ * before the middle (which will use DIO, if DIO-aligned, with AIO).
+ * This creates problems if/when the end results in a partial write.
+ * So must save index and length of end to handle this corner case.
+ */
+ if (local_dio->end_len) {
+ iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
+ iocb->offset[n_iters] = local_dio->end_offset;
+ iov_iter_advance(&iters[n_iters],
+ local_dio->start_len + local_dio->middle_len);
+ iocb->iter_is_dio_aligned[n_iters] = false;
+ /* Save index and length of end */
+ iocb->end_iter_index = n_iters;
+ iocb->end_len = local_dio->end_len;
+ ++n_iters;
+ }
+
+ /* Setup DIO-aligned middle to be issued last, to allow for
+ * DIO with AIO completion (see nfs_local_call_{read,write}).
+ */
+ iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
+ if (local_dio->start_len)
+ iov_iter_advance(&iters[n_iters], local_dio->start_len);
+ iters[n_iters].count -= local_dio->end_len;
+ iocb->offset[n_iters] = local_dio->middle_offset;
+
+ iocb->iter_is_dio_aligned[n_iters] =
+ nfs_iov_iter_aligned_bvec(&iters[n_iters],
+ local_dio->mem_align-1, local_dio->offset_align-1);
+
+ if (unlikely(!iocb->iter_is_dio_aligned[n_iters]))
+ return 0; /* no DIO-aligned IO possible */
+ ++n_iters;
+
+ iocb->n_iters = n_iters;
+ return n_iters;
+}
+
+static noinline_for_stack void
+nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw)
{
struct nfs_pgio_header *hdr = iocb->hdr;
struct page **pagevec = hdr->page_array.pagevec;
@@ -360,26 +476,18 @@ nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int rw)
}
len = hdr->args.count - total;
- iov_iter_bvec(i, rw, iocb->bvec, v, len);
+ if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) {
+ struct nfs_local_dio local_dio;
- if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
- u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align;
- /* Verify the IO is DIO-aligned as required */
- nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align,
- &nf_dio_offset_align,
- &nf_dio_read_offset_align);
- if (rw == ITER_DEST)
- nf_dio_offset_align = nf_dio_read_offset_align;
-
- if (nf_dio_mem_align && nf_dio_offset_align &&
- nfs_iov_iter_aligned_bvec(i, hdr->args.offset,
- nf_dio_mem_align - 1,
- nf_dio_offset_align - 1))
+ if (nfs_is_local_dio_possible(iocb, rw, len, &local_dio) &&
+ nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0)
return; /* is DIO-aligned */
-
- /* Fallback to using buffered for this misaligned IO */
- iocb->kiocb.ki_flags &= ~IOCB_DIRECT;
}
+
+ /* Use buffered IO */
+ iocb->offset[0] = hdr->args.offset;
+ iov_iter_bvec(&iocb->iters[0], rw, iocb->bvec, v, len);
+ iocb->n_iters = 1;
}
static void
@@ -402,10 +510,12 @@ nfs_local_pgio_init(struct nfs_pgio_header *hdr,
static void
nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status)
{
+ /* Must handle partial completions */
if (status >= 0) {
- hdr->res.count = status;
- hdr->res.op_status = NFS4_OK;
- hdr->task.tk_status = 0;
+ hdr->res.count += status;
+ /* @hdr was initialized to 0 (zeroed during allocation) */
+ if (hdr->task.tk_status == 0)
+ hdr->res.op_status = NFS4_OK;
} else {
hdr->res.op_status = nfs_localio_errno_to_nfs4_stat(status);
hdr->task.tk_status = status;
@@ -447,14 +557,14 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status)
struct file *filp = iocb->kiocb.ki_filp;
if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
+ /* DIO is last to complete (via AIO) */
if (status == -EINVAL) {
/* Underlying FS will return -EINVAL if misaligned DIO is attempted. */
pr_info_ratelimited("nfs: Unexpected direct I/O read alignment failure\n");
}
+ nfs_local_pgio_done(hdr, status);
}
- nfs_local_pgio_done(hdr, status);
-
/*
* Must clear replen otherwise NFSv3 data corruption will occur
* if/when switching from LOCALIO back to using normal RPC.
@@ -496,12 +606,21 @@ static void nfs_local_call_read(struct work_struct *work)
save_cred = override_creds(filp->f_cred);
- if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
- iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
- iocb->aio_complete_work = nfs_local_read_aio_complete_work;
- }
+ for (int i = 0; i < iocb->n_iters ; i++) {
+ if (iocb->iter_is_dio_aligned[i]) {
+ iocb->kiocb.ki_flags |= IOCB_DIRECT;
+ iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
+ iocb->aio_complete_work = nfs_local_read_aio_complete_work;
+ }
- status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iter);
+ iocb->kiocb.ki_pos = iocb->offset[i];
+ status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]);
+ if (status != -EIOCBQUEUED) {
+ nfs_local_pgio_done(iocb->hdr, status);
+ if (iocb->hdr->task.tk_status)
+ break;
+ }
+ }
revert_creds(save_cred);
@@ -632,13 +751,16 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
dprintk("%s: wrote %ld bytes.\n", __func__, status > 0 ? status : 0);
if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
+ /* DIO is last to complete (via AIO) */
if (status == -EINVAL) {
/* Underlying FS will return -EINVAL if misaligned DIO is attempted. */
pr_info_ratelimited("nfs: Unexpected direct I/O write alignment failure\n");
}
+ nfs_local_pgio_done(hdr, status);
}
/* Handle short writes as if they are ENOSPC */
+ status = hdr->res.count;
if (status > 0 && status < hdr->args.count) {
hdr->mds_offset += status;
hdr->args.offset += status;
@@ -646,11 +768,11 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
hdr->args.count -= status;
nfs_set_pgio_error(hdr, -ENOSPC, hdr->args.offset);
status = -ENOSPC;
+ /* record -ENOSPC in terms of nfs_local_pgio_done */
+ nfs_local_pgio_done(hdr, status);
}
- if (status < 0)
+ if (hdr->task.tk_status < 0)
nfs_reset_boot_verifier(inode);
-
- nfs_local_pgio_done(hdr, status);
}
static void nfs_local_write_aio_complete_work(struct work_struct *work)
@@ -683,13 +805,48 @@ static void nfs_local_call_write(struct work_struct *work)
current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
save_cred = override_creds(filp->f_cred);
- if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
- iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
- iocb->aio_complete_work = nfs_local_write_aio_complete_work;
- }
-
file_start_write(filp);
- status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iter);
+ for (int i = 0; i < iocb->n_iters ; i++) {
+ if (iocb->iter_is_dio_aligned[i]) {
+ iocb->kiocb.ki_flags |= IOCB_DIRECT;
+ iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
+ iocb->aio_complete_work = nfs_local_write_aio_complete_work;
+ }
+retry:
+ iocb->kiocb.ki_pos = iocb->offset[i];
+ status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iters[i]);
+ if (status != -EIOCBQUEUED) {
+ if (unlikely(status >= 0 && status < iocb->iters[i].count)) {
+ /* partial write */
+ if (i == iocb->end_iter_index) {
+ /* Must not account partial end, otherwise, due
+ * to end being issued before middle: the partial
+ * write accounting in nfs_local_write_done()
+ * would incorrectly advance hdr->args.offset
+ */
+ status = 0;
+ } else {
+ /* Partial write at start or buffered middle,
+ * exit early.
+ */
+ nfs_local_pgio_done(iocb->hdr, status);
+ break;
+ }
+ } else if (unlikely(status == -ENOTBLK &&
+ (iocb->kiocb.ki_flags & IOCB_DIRECT))) {
+ /* VFS will return -ENOTBLK if DIO WRITE fails to
+ * invalidate the page cache. Retry using buffered IO.
+ */
+ iocb->kiocb.ki_flags &= ~IOCB_DIRECT;
+ iocb->kiocb.ki_complete = NULL;
+ iocb->aio_complete_work = NULL;
+ goto retry;
+ }
+ nfs_local_pgio_done(iocb->hdr, status);
+ if (iocb->hdr->task.tk_status)
+ break;
+ }
+ }
file_end_write(filp);
revert_creds(save_cred);
@@ -758,7 +915,7 @@ nfs_local_iocb_init(struct nfs_pgio_header *hdr, struct nfsd_file *localio)
iocb->hdr = hdr;
iocb->localio = localio;
- nfs_local_iter_init(&iocb->iter, iocb, rw);
+ nfs_local_iters_init(iocb, rw);
return iocb;
}
--
2.44.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH v9 5/7] nfs/localio: add proper O_DIRECT support for READ and WRITE
2025-09-15 15:41 ` SYNCFrom cb702c86aa12e5477e1e4aca0c1384a821f2afef Mon Sep 17 00:00:00 2001 Mike Snitzer
@ 2025-09-15 15:47 ` Mike Snitzer
0 siblings, 0 replies; 17+ messages in thread
From: Mike Snitzer @ 2025-09-15 15:47 UTC (permalink / raw)
To: Trond Myklebust, Anna Schumaker; +Cc: linux-nfs
The subject for this patch got messed up (should've used
git-send-email's --dry-run like I generally always do).
Subject: nfs/localio: add proper O_DIRECT support for READ and WRITE
On Mon, Sep 15, 2025 at 11:41:13AM -0400, Mike Snitzer wrote:
> Because the NFS client will already happily handle misaligned O_DIRECT
> IO (by sending it out to NFSD via RPC) this commit's new capabilities
> are for the benefit of LOCALIO.
>
> LOCALIO will make best effort to transform misaligned IO to
> DIO-aligned extents when possible.
>
> LOCALIO's READ and WRITE DIO that is misaligned will be split into as
> many as 3 component IOs (@start, @middle and @end) as needed -- IFF
> the @middle extent is verified to be DIO-aligned, and then the @start
> and/or @end are misaligned (due to each being a partial page).
> Otherwise if the @middle isn't DIO-aligned the code will fallback to
> issuing only a single contiguous buffered IO.
>
> The @middle is only DIO-aligned if both the memory and on-disk offsets
> for the IO are aligned relative to the underlying local filesystem's
> block device limits (@dma_alignment and @logical_block_size
> respectively).
>
> The misaligned @start and/or @end extents are issued using buffered IO
> and the DIO-aligned @middle is issued using O_DIRECT. The @start and
> @end IOs are issued first using buffered IO and then the @middle is
> issued last using direct IO with async completion (AIO). This out of
> order IO completion means that LOCALIO's IO completion code
> (nfs_local_read_done and nfs_local_write_done) is only called for the
> IO's last associated iov_iter completion. And in the case of
> DIO-aligned @middle it completes last using AIO. nfs_local_pgio_done()
> is updated to handle piece-wise partial completion of each iov_iter.
>
> This implementation for LOCALIO's misaligned DIO handling uses 3
> iov_iter that share the same backing pages in their bio_vecs (so
> unfortunately 'struct nfs_local_kiocb' has 3 instead of only 1).
>
> [Reducing LOCALIO's per-IO (struct nfs_local_kiocb) memory use can be
> explored in the future. One logical progression to improve this code,
> and eliminate explicit loops over up to 3 iov_iter, is by extending
> 'struct iov_iter' to support iov_iter_clone() and iov_iter_chain()
> interfaces that are comparable to what 'struct bio' is able to support
> in the block layer. But even that wouldn't avoid the need to
> allocate/use up to 3 iov_iter]
>
> Signed-off-by: Mike Snitzer <snitzer@kernel.org>
> ---
> fs/nfs/localio.c | 249 ++++++++++++++++++++++++++++++++++++++---------
> 1 file changed, 203 insertions(+), 46 deletions(-)
>
> diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
> index 82894962966e8..92e5378ad63c6 100644
> --- a/fs/nfs/localio.c
> +++ b/fs/nfs/localio.c
> @@ -30,14 +30,23 @@
>
> #define NFSDBG_FACILITY NFSDBG_VFS
>
> +#define NFSLOCAL_MAX_IOS 3
> +
> struct nfs_local_kiocb {
> struct kiocb kiocb;
> struct bio_vec *bvec;
> struct nfs_pgio_header *hdr;
> struct work_struct work;
> void (*aio_complete_work)(struct work_struct *);
> - struct iov_iter iter ____cacheline_aligned;
> struct nfsd_file *localio;
> + /* Begin mostly DIO-specific members */
> + size_t end_len;
> + short int end_iter_index;
> + short int n_iters;
> + bool iter_is_dio_aligned[NFSLOCAL_MAX_IOS];
> + loff_t offset[NFSLOCAL_MAX_IOS] ____cacheline_aligned;
> + struct iov_iter iters[NFSLOCAL_MAX_IOS];
> + /* End mostly DIO-specific members */
> };
>
> struct nfs_local_fsync_ctx {
> @@ -291,7 +300,7 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr,
> {
> struct nfs_local_kiocb *iocb;
>
> - iocb = kmalloc(sizeof(*iocb), flags);
> + iocb = kzalloc(sizeof(*iocb), flags);
> if (iocb == NULL)
> return NULL;
>
> @@ -303,25 +312,72 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr,
> }
>
> init_sync_kiocb(&iocb->kiocb, file);
> - if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags))
> - iocb->kiocb.ki_flags = IOCB_DIRECT;
>
> - iocb->kiocb.ki_pos = hdr->args.offset;
> iocb->hdr = hdr;
> iocb->kiocb.ki_flags &= ~IOCB_APPEND;
> iocb->aio_complete_work = NULL;
>
> + iocb->end_iter_index = -1;
> +
> return iocb;
> }
>
> +struct nfs_local_dio {
> + u32 mem_align;
> + u32 offset_align;
> + loff_t middle_offset;
> + loff_t end_offset;
> + ssize_t start_len; /* Length for misaligned first extent */
> + ssize_t middle_len; /* Length for DIO-aligned middle extent */
> + ssize_t end_len; /* Length for misaligned last extent */
> +};
> +
> +static bool
> +nfs_is_local_dio_possible(struct nfs_local_kiocb *iocb, int rw,
> + size_t len, struct nfs_local_dio *local_dio)
> +{
> + struct nfs_pgio_header *hdr = iocb->hdr;
> + loff_t offset = hdr->args.offset;
> + u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align;
> + loff_t start_end, orig_end, middle_end;
> +
> + nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align,
> + &nf_dio_offset_align, &nf_dio_read_offset_align);
> + if (rw == ITER_DEST)
> + nf_dio_offset_align = nf_dio_read_offset_align;
> +
> + if (unlikely(!nf_dio_mem_align || !nf_dio_offset_align))
> + return false;
> + if (unlikely(nf_dio_offset_align > PAGE_SIZE))
> + return false;
> + if (unlikely(len < nf_dio_offset_align))
> + return false;
> +
> + local_dio->mem_align = nf_dio_mem_align;
> + local_dio->offset_align = nf_dio_offset_align;
> +
> + start_end = round_up(offset, nf_dio_offset_align);
> + orig_end = offset + len;
> + middle_end = round_down(orig_end, nf_dio_offset_align);
> +
> + local_dio->middle_offset = start_end;
> + local_dio->end_offset = middle_end;
> +
> + local_dio->start_len = start_end - offset;
> + local_dio->middle_len = middle_end - start_end;
> + local_dio->end_len = orig_end - middle_end;
> +
> + return true;
> +}
> +
> static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i,
> - loff_t offset, unsigned int addr_mask, unsigned int len_mask)
> + unsigned int addr_mask, unsigned int len_mask)
> {
> const struct bio_vec *bvec = i->bvec;
> size_t skip = i->iov_offset;
> size_t size = i->count;
>
> - if ((offset | size) & len_mask)
> + if (size & len_mask)
> return false;
> do {
> size_t len = bvec->bv_len;
> @@ -338,8 +394,68 @@ static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i,
> return true;
> }
>
> -static void
> -nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int rw)
> +/*
> + * Setup as many as 3 iov_iter based on extents described by @local_dio.
> + * Returns the number of iov_iter that were setup.
> + */
> +static int
> +nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw,
> + unsigned int nvecs, size_t len,
> + struct nfs_local_dio *local_dio)
> +{
> + int n_iters = 0;
> + struct iov_iter *iters = iocb->iters;
> +
> + /* Setup misaligned start? */
> + if (local_dio->start_len) {
> + iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
> + iters[n_iters].count = local_dio->start_len;
> + iocb->offset[n_iters] = iocb->hdr->args.offset;
> + iocb->iter_is_dio_aligned[n_iters] = false;
> + ++n_iters;
> + }
> +
> + /* Setup misaligned end?
> + * If so, the end is purposely setup to be issued using buffered IO
> + * before the middle (which will use DIO, if DIO-aligned, with AIO).
> + * This creates problems if/when the end results in a partial write.
> + * So must save index and length of end to handle this corner case.
> + */
> + if (local_dio->end_len) {
> + iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
> + iocb->offset[n_iters] = local_dio->end_offset;
> + iov_iter_advance(&iters[n_iters],
> + local_dio->start_len + local_dio->middle_len);
> + iocb->iter_is_dio_aligned[n_iters] = false;
> + /* Save index and length of end */
> + iocb->end_iter_index = n_iters;
> + iocb->end_len = local_dio->end_len;
> + ++n_iters;
> + }
> +
> + /* Setup DIO-aligned middle to be issued last, to allow for
> + * DIO with AIO completion (see nfs_local_call_{read,write}).
> + */
> + iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
> + if (local_dio->start_len)
> + iov_iter_advance(&iters[n_iters], local_dio->start_len);
> + iters[n_iters].count -= local_dio->end_len;
> + iocb->offset[n_iters] = local_dio->middle_offset;
> +
> + iocb->iter_is_dio_aligned[n_iters] =
> + nfs_iov_iter_aligned_bvec(&iters[n_iters],
> + local_dio->mem_align-1, local_dio->offset_align-1);
> +
> + if (unlikely(!iocb->iter_is_dio_aligned[n_iters]))
> + return 0; /* no DIO-aligned IO possible */
> + ++n_iters;
> +
> + iocb->n_iters = n_iters;
> + return n_iters;
> +}
> +
> +static noinline_for_stack void
> +nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw)
> {
> struct nfs_pgio_header *hdr = iocb->hdr;
> struct page **pagevec = hdr->page_array.pagevec;
> @@ -360,26 +476,18 @@ nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int rw)
> }
> len = hdr->args.count - total;
>
> - iov_iter_bvec(i, rw, iocb->bvec, v, len);
> + if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) {
> + struct nfs_local_dio local_dio;
>
> - if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
> - u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align;
> - /* Verify the IO is DIO-aligned as required */
> - nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align,
> - &nf_dio_offset_align,
> - &nf_dio_read_offset_align);
> - if (rw == ITER_DEST)
> - nf_dio_offset_align = nf_dio_read_offset_align;
> -
> - if (nf_dio_mem_align && nf_dio_offset_align &&
> - nfs_iov_iter_aligned_bvec(i, hdr->args.offset,
> - nf_dio_mem_align - 1,
> - nf_dio_offset_align - 1))
> + if (nfs_is_local_dio_possible(iocb, rw, len, &local_dio) &&
> + nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0)
> return; /* is DIO-aligned */
> -
> - /* Fallback to using buffered for this misaligned IO */
> - iocb->kiocb.ki_flags &= ~IOCB_DIRECT;
> }
> +
> + /* Use buffered IO */
> + iocb->offset[0] = hdr->args.offset;
> + iov_iter_bvec(&iocb->iters[0], rw, iocb->bvec, v, len);
> + iocb->n_iters = 1;
> }
>
> static void
> @@ -402,10 +510,12 @@ nfs_local_pgio_init(struct nfs_pgio_header *hdr,
> static void
> nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status)
> {
> + /* Must handle partial completions */
> if (status >= 0) {
> - hdr->res.count = status;
> - hdr->res.op_status = NFS4_OK;
> - hdr->task.tk_status = 0;
> + hdr->res.count += status;
> + /* @hdr was initialized to 0 (zeroed during allocation) */
> + if (hdr->task.tk_status == 0)
> + hdr->res.op_status = NFS4_OK;
> } else {
> hdr->res.op_status = nfs_localio_errno_to_nfs4_stat(status);
> hdr->task.tk_status = status;
> @@ -447,14 +557,14 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status)
> struct file *filp = iocb->kiocb.ki_filp;
>
> if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
> + /* DIO is last to complete (via AIO) */
> if (status == -EINVAL) {
> /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */
> pr_info_ratelimited("nfs: Unexpected direct I/O read alignment failure\n");
> }
> + nfs_local_pgio_done(hdr, status);
> }
>
> - nfs_local_pgio_done(hdr, status);
> -
> /*
> * Must clear replen otherwise NFSv3 data corruption will occur
> * if/when switching from LOCALIO back to using normal RPC.
> @@ -496,12 +606,21 @@ static void nfs_local_call_read(struct work_struct *work)
>
> save_cred = override_creds(filp->f_cred);
>
> - if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
> - iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
> - iocb->aio_complete_work = nfs_local_read_aio_complete_work;
> - }
> + for (int i = 0; i < iocb->n_iters ; i++) {
> + if (iocb->iter_is_dio_aligned[i]) {
> + iocb->kiocb.ki_flags |= IOCB_DIRECT;
> + iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
> + iocb->aio_complete_work = nfs_local_read_aio_complete_work;
> + }
>
> - status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iter);
> + iocb->kiocb.ki_pos = iocb->offset[i];
> + status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]);
> + if (status != -EIOCBQUEUED) {
> + nfs_local_pgio_done(iocb->hdr, status);
> + if (iocb->hdr->task.tk_status)
> + break;
> + }
> + }
>
> revert_creds(save_cred);
>
> @@ -632,13 +751,16 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
> dprintk("%s: wrote %ld bytes.\n", __func__, status > 0 ? status : 0);
>
> if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
> + /* DIO is last to complete (via AIO) */
> if (status == -EINVAL) {
> /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */
> pr_info_ratelimited("nfs: Unexpected direct I/O write alignment failure\n");
> }
> + nfs_local_pgio_done(hdr, status);
> }
>
> /* Handle short writes as if they are ENOSPC */
> + status = hdr->res.count;
> if (status > 0 && status < hdr->args.count) {
> hdr->mds_offset += status;
> hdr->args.offset += status;
> @@ -646,11 +768,11 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
> hdr->args.count -= status;
> nfs_set_pgio_error(hdr, -ENOSPC, hdr->args.offset);
> status = -ENOSPC;
> + /* record -ENOSPC in terms of nfs_local_pgio_done */
> + nfs_local_pgio_done(hdr, status);
> }
> - if (status < 0)
> + if (hdr->task.tk_status < 0)
> nfs_reset_boot_verifier(inode);
> -
> - nfs_local_pgio_done(hdr, status);
> }
>
> static void nfs_local_write_aio_complete_work(struct work_struct *work)
> @@ -683,13 +805,48 @@ static void nfs_local_call_write(struct work_struct *work)
> current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
> save_cred = override_creds(filp->f_cred);
>
> - if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
> - iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
> - iocb->aio_complete_work = nfs_local_write_aio_complete_work;
> - }
> -
> file_start_write(filp);
> - status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iter);
> + for (int i = 0; i < iocb->n_iters ; i++) {
> + if (iocb->iter_is_dio_aligned[i]) {
> + iocb->kiocb.ki_flags |= IOCB_DIRECT;
> + iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
> + iocb->aio_complete_work = nfs_local_write_aio_complete_work;
> + }
> +retry:
> + iocb->kiocb.ki_pos = iocb->offset[i];
> + status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iters[i]);
> + if (status != -EIOCBQUEUED) {
> + if (unlikely(status >= 0 && status < iocb->iters[i].count)) {
> + /* partial write */
> + if (i == iocb->end_iter_index) {
> + /* Must not account partial end, otherwise, due
> + * to end being issued before middle: the partial
> + * write accounting in nfs_local_write_done()
> + * would incorrectly advance hdr->args.offset
> + */
> + status = 0;
> + } else {
> + /* Partial write at start or buffered middle,
> + * exit early.
> + */
> + nfs_local_pgio_done(iocb->hdr, status);
> + break;
> + }
> + } else if (unlikely(status == -ENOTBLK &&
> + (iocb->kiocb.ki_flags & IOCB_DIRECT))) {
> + /* VFS will return -ENOTBLK if DIO WRITE fails to
> + * invalidate the page cache. Retry using buffered IO.
> + */
> + iocb->kiocb.ki_flags &= ~IOCB_DIRECT;
> + iocb->kiocb.ki_complete = NULL;
> + iocb->aio_complete_work = NULL;
> + goto retry;
> + }
> + nfs_local_pgio_done(iocb->hdr, status);
> + if (iocb->hdr->task.tk_status)
> + break;
> + }
> + }
> file_end_write(filp);
>
> revert_creds(save_cred);
> @@ -758,7 +915,7 @@ nfs_local_iocb_init(struct nfs_pgio_header *hdr, struct nfsd_file *localio)
> iocb->hdr = hdr;
> iocb->localio = localio;
>
> - nfs_local_iter_init(&iocb->iter, iocb, rw);
> + nfs_local_iters_init(iocb, rw);
>
> return iocb;
> }
> --
> 2.44.0
>
>
^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCH v9 6/7] nfs/localio: add tracepoints for misaligned DIO READ and WRITE support
2025-09-15 15:41 [PATCH v9 0/7] NFS DIRECT: align misaligned DIO for LOCALIO Mike Snitzer
` (4 preceding siblings ...)
2025-09-15 15:41 ` SYNCFrom cb702c86aa12e5477e1e4aca0c1384a821f2afef Mon Sep 17 00:00:00 2001 Mike Snitzer
@ 2025-09-15 15:41 ` Mike Snitzer
2025-09-15 15:41 ` [PATCH v9 7/7] NFS: add basic STATX_DIOALIGN and STATX_DIO_READ_ALIGN support Mike Snitzer
[not found] ` <aMiMpYAcHV8bYU4W@kernel.org>
7 siblings, 0 replies; 17+ messages in thread
From: Mike Snitzer @ 2025-09-15 15:41 UTC (permalink / raw)
To: Trond Myklebust, Anna Schumaker; +Cc: linux-nfs
Add nfs_local_dio_class and use it to create nfs_local_dio_read,
nfs_local_dio_write and nfs_local_dio_misaligned trace events.
These trace events show how NFS LOCALIO splits a given misaligned
IO into a mix of misaligned head and/or tail extents and a DIO-aligned
middle extent. The misaligned head and/or tail extents are issued
using buffered IO and the DIO-aligned middle is issued using O_DIRECT.
This combination of trace events is useful for LOCALIO DIO READs:
echo 1 > /sys/kernel/tracing/events/nfs/nfs_local_dio_read/enable
echo 1 > /sys/kernel/tracing/events/nfs/nfs_local_dio_misaligned/enable
echo 1 > /sys/kernel/tracing/events/nfs/nfs_initiate_read/enable
echo 1 > /sys/kernel/tracing/events/nfs/nfs_readpage_done/enable
echo 1 > /sys/kernel/tracing/events/xfs/xfs_file_direct_read/enable
This combination of trace events is useful for LOCALIO DIO WRITEs:
echo 1 > /sys/kernel/tracing/events/nfs/nfs_local_dio_write/enable
echo 1 > /sys/kernel/tracing/events/nfs/nfs_local_dio_misaligned/enable
echo 1 > /sys/kernel/tracing/events/nfs/nfs_initiate_write/enable
echo 1 > /sys/kernel/tracing/events/nfs/nfs_writeback_done/enable
echo 1 > /sys/kernel/tracing/events/xfs/xfs_file_direct_write/enable
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
fs/nfs/internal.h | 10 +++++++
fs/nfs/localio.c | 19 ++++++-------
fs/nfs/nfs3xdr.c | 2 +-
fs/nfs/nfstrace.h | 70 +++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 89 insertions(+), 12 deletions(-)
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index d44233cfd7949..3d380c45b5ef3 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -456,6 +456,16 @@ extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
#if IS_ENABLED(CONFIG_NFS_LOCALIO)
/* localio.c */
+struct nfs_local_dio {
+ u32 mem_align;
+ u32 offset_align;
+ loff_t middle_offset;
+ loff_t end_offset;
+ ssize_t start_len; /* Length for misaligned first extent */
+ ssize_t middle_len; /* Length for DIO-aligned middle extent */
+ ssize_t end_len; /* Length for misaligned last extent */
+};
+
extern void nfs_local_probe_async(struct nfs_client *);
extern void nfs_local_probe_async_work(struct work_struct *);
extern struct nfsd_file *nfs_local_open_fh(struct nfs_client *,
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index 92e5378ad63c6..d53e404566c6d 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -322,16 +322,6 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr,
return iocb;
}
-struct nfs_local_dio {
- u32 mem_align;
- u32 offset_align;
- loff_t middle_offset;
- loff_t end_offset;
- ssize_t start_len; /* Length for misaligned first extent */
- ssize_t middle_len; /* Length for DIO-aligned middle extent */
- ssize_t end_len; /* Length for misaligned last extent */
-};
-
static bool
nfs_is_local_dio_possible(struct nfs_local_kiocb *iocb, int rw,
size_t len, struct nfs_local_dio *local_dio)
@@ -367,6 +357,10 @@ nfs_is_local_dio_possible(struct nfs_local_kiocb *iocb, int rw,
local_dio->middle_len = middle_end - start_end;
local_dio->end_len = orig_end - middle_end;
+ if (rw == ITER_DEST)
+ trace_nfs_local_dio_read(hdr->inode, offset, len, local_dio);
+ else
+ trace_nfs_local_dio_write(hdr->inode, offset, len, local_dio);
return true;
}
@@ -446,8 +440,11 @@ nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw,
nfs_iov_iter_aligned_bvec(&iters[n_iters],
local_dio->mem_align-1, local_dio->offset_align-1);
- if (unlikely(!iocb->iter_is_dio_aligned[n_iters]))
+ if (unlikely(!iocb->iter_is_dio_aligned[n_iters])) {
+ trace_nfs_local_dio_misaligned(iocb->hdr->inode,
+ iocb->hdr->args.offset, len, local_dio);
return 0; /* no DIO-aligned IO possible */
+ }
++n_iters;
iocb->n_iters = n_iters;
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 4ae01c10b7e28..e17d729084125 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -23,8 +23,8 @@
#include <linux/nfsacl.h>
#include <linux/nfs_common.h>
-#include "nfstrace.h"
#include "internal.h"
+#include "nfstrace.h"
#define NFSDBG_FACILITY NFSDBG_XDR
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index d5949da8c2e5d..132c1b87fa3eb 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -1599,6 +1599,76 @@ DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_completion);
DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_schedule_iovec);
DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_reschedule_io);
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+
+DECLARE_EVENT_CLASS(nfs_local_dio_class,
+ TP_PROTO(
+ const struct inode *inode,
+ loff_t offset,
+ ssize_t count,
+ const struct nfs_local_dio *local_dio
+ ),
+ TP_ARGS(inode, offset, count, local_dio),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, fileid)
+ __field(u32, fhandle)
+ __field(loff_t, offset)
+ __field(ssize_t, count)
+ __field(u32, mem_align)
+ __field(u32, offset_align)
+ __field(loff_t, start)
+ __field(ssize_t, start_len)
+ __field(loff_t, middle)
+ __field(ssize_t, middle_len)
+ __field(loff_t, end)
+ __field(ssize_t, end_len)
+ ),
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = &nfsi->fh;
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ __entry->offset = offset;
+ __entry->count = count;
+ __entry->mem_align = local_dio->mem_align;
+ __entry->offset_align = local_dio->offset_align;
+ __entry->start = offset;
+ __entry->start_len = local_dio->start_len;
+ __entry->middle = local_dio->middle_offset;
+ __entry->middle_len = local_dio->middle_len;
+ __entry->end = local_dio->end_offset;
+ __entry->end_len = local_dio->end_len;
+ ),
+ TP_printk("fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%zd "
+ "mem_align=%u offset_align=%u "
+ "start=%llu+%zd middle=%llu+%zd end=%llu+%zd",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->offset, __entry->count,
+ __entry->mem_align, __entry->offset_align,
+ __entry->start, __entry->start_len,
+ __entry->middle, __entry->middle_len,
+ __entry->end, __entry->end_len)
+)
+
+#define DEFINE_NFS_LOCAL_DIO_EVENT(name) \
+DEFINE_EVENT(nfs_local_dio_class, nfs_local_dio_##name, \
+ TP_PROTO(const struct inode *inode, \
+ loff_t offset, \
+ ssize_t count, \
+ const struct nfs_local_dio *local_dio),\
+ TP_ARGS(inode, offset, count, local_dio))
+
+DEFINE_NFS_LOCAL_DIO_EVENT(read);
+DEFINE_NFS_LOCAL_DIO_EVENT(write);
+DEFINE_NFS_LOCAL_DIO_EVENT(misaligned);
+
+#endif /* CONFIG_NFS_LOCALIO */
+
TRACE_EVENT(nfs_fh_to_dentry,
TP_PROTO(
const struct super_block *sb,
--
2.44.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH v9 7/7] NFS: add basic STATX_DIOALIGN and STATX_DIO_READ_ALIGN support
2025-09-15 15:41 [PATCH v9 0/7] NFS DIRECT: align misaligned DIO for LOCALIO Mike Snitzer
` (5 preceding siblings ...)
2025-09-15 15:41 ` [PATCH v9 6/7] nfs/localio: add tracepoints for misaligned DIO READ and WRITE support Mike Snitzer
@ 2025-09-15 15:41 ` Mike Snitzer
[not found] ` <aMiMpYAcHV8bYU4W@kernel.org>
7 siblings, 0 replies; 17+ messages in thread
From: Mike Snitzer @ 2025-09-15 15:41 UTC (permalink / raw)
To: Trond Myklebust, Anna Schumaker; +Cc: linux-nfs
NFS doesn't have DIO alignment constraints, so have NFS respond with
accommodating DIO alignment attributes (rather than plumb in GETATTR
support for STATX_DIOALIGN and STATX_DIO_READ_ALIGN).
The most coarse-grained dio_offset_align is the most accommodating
(e.g. PAGE_SIZE, in future larger may be supported).
Now that NFS has support, NFS reexport will now handle unaligned DIO
(NFSD's NFSD_IO_DIRECT support requires the underlying filesystem
support STATX_DIOALIGN and/or STATX_DIO_READ_ALIGN).
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
fs/nfs/inode.c | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 49df9debb1a69..84bf3d21c25cc 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1073,6 +1073,21 @@ int nfs_getattr(struct mnt_idmap *idmap, const struct path *path,
if (S_ISDIR(inode->i_mode))
stat->blksize = NFS_SERVER(inode)->dtsize;
stat->btime = NFS_I(inode)->btime;
+
+ /* Special handling for STATX_DIOALIGN and STATX_DIO_READ_ALIGN
+ * - NFS doesn't have DIO alignment constraints, avoid getting
+ * these DIO attrs from remote and just respond with most
+ * accommodating limits (so client will issue supported DIO).
+ * - this is unintuitive, but the most coarse-grained
+ * dio_offset_align is the most accommodating.
+ */
+ if ((request_mask & (STATX_DIOALIGN | STATX_DIO_READ_ALIGN)) &&
+ S_ISREG(inode->i_mode)) {
+ stat->result_mask |= STATX_DIOALIGN | STATX_DIO_READ_ALIGN;
+ stat->dio_mem_align = 4; /* 4-byte alignment */
+ stat->dio_offset_align = PAGE_SIZE;
+ stat->dio_read_offset_align = stat->dio_offset_align;
+ }
out:
trace_nfs_getattr_exit(inode, err);
return err;
--
2.44.0
^ permalink raw reply related [flat|nested] 17+ messages in thread[parent not found: <aMiMpYAcHV8bYU4W@kernel.org>]