From: Mike Snitzer <snitzer@kernel.org>
To: Trond Myklebust <trondmy@kernel.org>, Anna Schumaker <anna@kernel.org>
Cc: linux-nfs@vger.kernel.org, snitzer@kernel.org,
Mike Snitzer <snitzer@hammerspace.com>
Subject: [RFC PATCH 6/6] nfs/localio: fallback to NFSD for misaligned O_DIRECT READs
Date: Tue, 8 Jul 2025 12:20:47 -0400 [thread overview]
Message-ID: <20250708162047.65017-7-snitzer@kernel.org> (raw)
In-Reply-To: <20250708162047.65017-1-snitzer@kernel.org>
From: Mike Snitzer <snitzer@hammerspace.com>
LOCALIO is left enabled, it just isn't used for any misaligned
O_DIRECT READ.
If LOCALIO determines that an O_DIRECT READ is misaligned then it
makes sense to immediately issue the READ remotely via NFSD (which has
the ability to expand a misaligned O_DIRECT READ to be DIO-aligned)
if/when NFSD is configured to use O_DIRECT for READ IO with:
echo 2 > /sys/kernel/debug/nfsd/io_cache_read
This change in behavior for LOCALIO's O_DIRECT support really should
be dependent on NFSD running with io_cache_read=2 but there isn't an
interface to check for that (currently).
Add 'localio_O_DIRECT_align_misaligned_READ' modparm, which depends on
localio_O_DIRECT_semantics=Y, to control if LOCALIO will expand
misaligned READ to DIO-aligned (boolean control is Y by default).
Signed-off-by: Mike Snitzer <snitzer@hammerspace.com>
---
fs/nfs/flexfilelayout/flexfilelayout.c | 1 +
fs/nfs/internal.h | 4 +-
fs/nfs/localio.c | 152 +++++++++++++++----------
fs/nfs/pagelist.c | 15 ++-
4 files changed, 107 insertions(+), 65 deletions(-)
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 4bea008dbebd..fbcf7c5ac118 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1911,6 +1911,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh, FMODE_READ);
if (localio) {
hdr->task.tk_start = ktime_get();
+ // FIXME: if fallback occurs is this stats start bogus?
ff_layout_read_record_layoutstats_start(&hdr->task, hdr);
}
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 3a2ff094216c..5c39e143efc0 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -463,7 +463,7 @@ extern struct nfsd_file *nfs_local_open_fh(struct nfs_client *,
struct nfs_file_localio *,
const fmode_t);
extern int nfs_local_doio(struct nfs_client *,
- struct nfsd_file *,
+ struct nfsd_file **,
struct nfs_pgio_header *,
const struct rpc_call_ops *);
extern int nfs_local_commit(struct nfsd_file *,
@@ -482,7 +482,7 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
return NULL;
}
static inline int nfs_local_doio(struct nfs_client *clp,
- struct nfsd_file *localio,
+ struct nfsd_file **localio,
struct nfs_pgio_header *hdr,
const struct rpc_call_ops *call_ops)
{
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index d577fb27fd2f..3eab88715d5b 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -36,11 +36,8 @@ struct nfs_local_kiocb {
struct nfs_pgio_header *hdr;
struct work_struct work;
void (*aio_complete_work)(struct work_struct *);
+ struct iov_iter iter ____cacheline_aligned;
struct nfsd_file *localio;
- /* local copy of nfsd_file's dio alignment attrs */
- u32 nf_dio_mem_align;
- u32 nf_dio_offset_align;
- u32 nf_dio_read_offset_align;
};
struct nfs_local_fsync_ctx {
@@ -63,6 +60,12 @@ module_param(localio_O_DIRECT_semantics, bool, 0644);
MODULE_PARM_DESC(localio_O_DIRECT_semantics,
"LOCALIO will use O_DIRECT semantics to filesystem.");
+static bool localio_O_DIRECT_align_misaligned_READ __read_mostly = true;
+module_param(localio_O_DIRECT_align_misaligned_READ, bool, 0644);
+/* This feature also depends on: echo 2 > /sys/kernel/debug/nfsd/io_cache_read */
+MODULE_PARM_DESC(localio_O_DIRECT_align_misaligned_READ,
+ "If LOCALIO_O_DIRECT_semantics=Y it will expand misaligned READ to DIO-aligned.");
+
static inline bool nfs_client_is_local(const struct nfs_client *clp)
{
return !!rcu_access_pointer(clp->cl_uuid.net);
@@ -324,21 +327,23 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr,
return iocb;
}
-static void
-nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int dir)
+static int
+nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int rw)
{
struct nfs_pgio_header *hdr = iocb->hdr;
struct page **pagevec = hdr->page_array.pagevec;
- u32 nf_dio_mem_align, nf_dio_offset_align;
+ u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align;
unsigned long v, total;
size_t len;
if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
- nf_dio_mem_align = iocb->nf_dio_mem_align;
- if (dir == READ)
- nf_dio_offset_align = iocb->nf_dio_read_offset_align;
+ nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align,
+ &nf_dio_offset_align,
+ &nf_dio_read_offset_align);
+ if (rw == ITER_DEST)
+ nf_dio_offset_align = nf_dio_read_offset_align;
else
- nf_dio_offset_align = iocb->nf_dio_offset_align;
+ nf_dio_offset_align = nf_dio_offset_align;
}
v = 0;
@@ -352,7 +357,7 @@ nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int dir)
}
WARN_ON_ONCE(v != hdr->page_array.npages);
- iov_iter_bvec(i, dir, iocb->bvec, v,
+ iov_iter_bvec(i, rw, iocb->bvec, v,
hdr->args.count + hdr->args.pgbase);
if (hdr->args.pgbase != 0)
iov_iter_advance(i, hdr->args.pgbase);
@@ -361,10 +366,20 @@ nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int dir)
if (iocb->kiocb.ki_flags & IOCB_DIRECT &&
!iov_iter_is_aligned(i, nf_dio_mem_align - 1,
nf_dio_offset_align - 1)) {
- /* Fallback to buffered IO */
+ if (localio_O_DIRECT_align_misaligned_READ &&
+ rw == ITER_DEST) {
+ /*
+ * Fallback to sending this READ to NFSD since it
+ * can expand misaligned READ IO to be DIO-aligned.
+ */
+ return -ENOSYS;
+ }
+ /* Fallback to using buffered IO for this WRITE */
iocb->kiocb.ki_flags &= ~IOCB_DIRECT;
iocb->kiocb.ki_filp->f_flags &= ~O_DIRECT;
}
+
+ return 0;
}
static void
@@ -397,13 +412,18 @@ nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status)
}
}
-static void
-nfs_local_pgio_release(struct nfs_local_kiocb *iocb)
+static void nfs_local_iocb_release(struct nfs_local_kiocb *iocb)
{
- struct nfs_pgio_header *hdr = iocb->hdr;
-
nfs_local_file_put(iocb->localio);
nfs_local_iocb_free(iocb);
+}
+
+static void
+nfs_local_pgio_release(struct nfs_local_kiocb *iocb)
+{
+ struct nfs_pgio_header *hdr = iocb->hdr;
+
+ nfs_local_iocb_release(iocb);
nfs_local_hdr_release(hdr, hdr->task.tk_ops);
}
@@ -464,18 +484,16 @@ static void nfs_local_call_read(struct work_struct *work)
container_of(work, struct nfs_local_kiocb, work);
struct file *filp = iocb->kiocb.ki_filp;
const struct cred *save_cred;
- struct iov_iter iter;
ssize_t status;
save_cred = override_creds(filp->f_cred);
- nfs_local_iter_init(&iter, iocb, READ);
if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
iocb->aio_complete_work = nfs_local_read_aio_complete_work;
}
- status = filp->f_op->read_iter(&iocb->kiocb, &iter);
+ status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iter);
if (status != -EIOCBQUEUED) {
nfs_local_read_done(iocb, status);
nfs_local_pgio_release(iocb);
@@ -485,28 +503,14 @@ static void nfs_local_call_read(struct work_struct *work)
}
static int
-nfs_do_local_read(struct nfs_pgio_header *hdr,
- struct nfsd_file *localio,
+nfs_local_do_read(struct nfs_local_kiocb *iocb,
const struct rpc_call_ops *call_ops)
{
- struct nfs_local_kiocb *iocb;
- struct file *file = nfs_to->nfsd_file_file(localio);
-
- /* Don't support filesystems without read_iter */
- if (!file->f_op->read_iter)
- return -EAGAIN;
+ struct nfs_pgio_header *hdr = iocb->hdr;
dprintk("%s: vfs_read count=%u pos=%llu\n",
__func__, hdr->args.count, hdr->args.offset);
- iocb = nfs_local_iocb_alloc(hdr, file, GFP_KERNEL);
- if (iocb == NULL)
- return -ENOMEM;
- iocb->localio = localio;
- nfs_to->nfsd_file_dio_alignment(localio, &iocb->nf_dio_mem_align,
- &iocb->nf_dio_offset_align,
- &iocb->nf_dio_read_offset_align);
-
nfs_local_pgio_init(hdr, call_ops);
hdr->res.eof = false;
@@ -659,20 +663,18 @@ static void nfs_local_call_write(struct work_struct *work)
struct file *filp = iocb->kiocb.ki_filp;
unsigned long old_flags = current->flags;
const struct cred *save_cred;
- struct iov_iter iter;
ssize_t status;
current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
save_cred = override_creds(filp->f_cred);
- nfs_local_iter_init(&iter, iocb, WRITE);
if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
iocb->aio_complete_work = nfs_local_write_aio_complete_work;
}
file_start_write(filp);
- status = filp->f_op->write_iter(&iocb->kiocb, &iter);
+ status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iter);
file_end_write(filp);
if (status != -EIOCBQUEUED) {
nfs_local_write_done(iocb, status);
@@ -685,29 +687,15 @@ static void nfs_local_call_write(struct work_struct *work)
}
static int
-nfs_do_local_write(struct nfs_pgio_header *hdr,
- struct nfsd_file *localio,
+nfs_local_do_write(struct nfs_local_kiocb *iocb,
const struct rpc_call_ops *call_ops)
{
- struct nfs_local_kiocb *iocb;
- struct file *file = nfs_to->nfsd_file_file(localio);
-
- /* Don't support filesystems without write_iter */
- if (!file->f_op->write_iter)
- return -EAGAIN;
+ struct nfs_pgio_header *hdr = iocb->hdr;
dprintk("%s: vfs_write count=%u pos=%llu %s\n",
__func__, hdr->args.count, hdr->args.offset,
(hdr->args.stable == NFS_UNSTABLE) ? "unstable" : "stable");
- iocb = nfs_local_iocb_alloc(hdr, file, GFP_NOIO);
- if (iocb == NULL)
- return -ENOMEM;
- iocb->localio = localio;
- nfs_to->nfsd_file_dio_alignment(localio, &iocb->nf_dio_mem_align,
- &iocb->nf_dio_offset_align,
- &iocb->nf_dio_read_offset_align);
-
switch (hdr->args.stable) {
default:
break;
@@ -728,21 +716,67 @@ nfs_do_local_write(struct nfs_pgio_header *hdr,
return 0;
}
-int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio,
+static struct nfs_local_kiocb *
+nfs_local_iocb_init(struct nfs_pgio_header *hdr, struct nfsd_file **localio)
+{
+ struct file *file = nfs_to->nfsd_file_file(*localio);
+ struct nfs_local_kiocb *iocb;
+ gfp_t gfp_mask;
+ int rw, status;
+
+ if (hdr->rw_mode & FMODE_READ) {
+ if (!file->f_op->read_iter)
+ return ERR_PTR(-EINVAL);
+ gfp_mask = GFP_KERNEL;
+ rw = ITER_DEST;
+ } else {
+ if (!file->f_op->write_iter)
+ return ERR_PTR(-EINVAL);
+ gfp_mask = GFP_NOIO;
+ rw = ITER_SOURCE;
+ }
+
+ iocb = nfs_local_iocb_alloc(hdr, file, gfp_mask);
+ if (iocb == NULL)
+ return ERR_PTR(-ENOMEM);
+ iocb->hdr = hdr;
+ iocb->localio = *localio;
+
+ status = nfs_local_iter_init(&iocb->iter, iocb, rw);
+ if (status == -ENOSYS) {
+ /* close nfsd_file and clear localio,
+ * this informs callers that IO should
+ * be serviced remotely.
+ */
+ nfs_local_iocb_release(iocb);
+ *localio = NULL;
+ return ERR_PTR(status);
+ }
+ WARN_ON_ONCE(status != 0);
+
+ return iocb;
+}
+
+int nfs_local_doio(struct nfs_client *clp, struct nfsd_file **localio,
struct nfs_pgio_header *hdr,
const struct rpc_call_ops *call_ops)
{
+ struct nfs_local_kiocb *iocb;
int status = 0;
if (!hdr->args.count)
return 0;
+ iocb = nfs_local_iocb_init(hdr, localio);
+ if (IS_ERR(iocb))
+ return PTR_ERR(iocb);
+
switch (hdr->rw_mode) {
case FMODE_READ:
- status = nfs_do_local_read(hdr, localio, call_ops);
+ status = nfs_local_do_read(iocb, call_ops);
break;
case FMODE_WRITE:
- status = nfs_do_local_write(hdr, localio, call_ops);
+ status = nfs_local_do_write(iocb, call_ops);
break;
default:
dprintk("%s: invalid mode: %d\n", __func__,
@@ -753,7 +787,7 @@ int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio,
if (status != 0) {
if (status == -EAGAIN)
nfs_localio_disable_client(clp);
- nfs_local_file_put(localio);
+ nfs_local_iocb_release(iocb);
hdr->task.tk_status = status;
nfs_local_hdr_release(hdr, call_ops);
}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 11968dcb7243..9ddff27e96e9 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -762,9 +762,17 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
hdr->args.count,
(unsigned long long)hdr->args.offset);
- if (localio)
- return nfs_local_doio(NFS_SERVER(hdr->inode)->nfs_client,
- localio, hdr, call_ops);
+ if (localio) {
+ int status = nfs_local_doio(NFS_SERVER(hdr->inode)->nfs_client,
+ &localio, hdr, call_ops);
+ /* nfs_local_doio() will clear localio and return -ENOSYS if
+ * it is prudent to immediately service this IO remotely.
+ */
+ if (status != -ENOSYS)
+ return status;
+ WARN_ON_ONCE(localio != NULL);
+ /* fallthrough */
+ }
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
@@ -959,7 +967,6 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
ret = nfs_generic_pgio(desc, hdr);
if (ret == 0) {
struct nfs_client *clp = NFS_SERVER(hdr->inode)->nfs_client;
-
struct nfsd_file *localio =
nfs_local_open_fh(clp, hdr->cred, hdr->args.fh,
&hdr->args.context->nfl,
--
2.44.0
prev parent reply other threads:[~2025-07-08 16:21 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-07-08 16:20 [RFC PATCH 0/6] NFS: LOCALIO improvements and support for misaligned O_DIRECT READs Mike Snitzer
2025-07-08 16:20 ` [RFC PATCH 1/6] nfs/localio: avoid bouncing LOCALIO if nfs_client_is_local() Mike Snitzer
2025-07-08 16:20 ` [RFC PATCH 2/6] nfs/localio: add localio_async_probe modparm Mike Snitzer
2025-07-08 16:20 ` [RFC PATCH 3/6] nfs/localio: make trace_nfs_local_open_fh more useful Mike Snitzer
2025-07-08 16:20 ` [RFC PATCH 4/6] nfs/localio: add nfsd_file_dio_alignment Mike Snitzer
2025-07-08 16:20 ` [RFC PATCH 5/6] nfs/localio: refactor iocb initialization Mike Snitzer
2025-07-10 7:23 ` Christoph Hellwig
2025-07-10 7:32 ` Mike Snitzer
2025-07-08 16:20 ` Mike Snitzer [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250708162047.65017-7-snitzer@kernel.org \
--to=snitzer@kernel.org \
--cc=anna@kernel.org \
--cc=linux-nfs@vger.kernel.org \
--cc=snitzer@hammerspace.com \
--cc=trondmy@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox