From: Mike Snitzer <snitzer@kernel.org>
To: Trond Myklebust <trond.myklebust@hammerspace.com>,
Anna Schumaker <anna.schumaker@oracle.com>
Cc: linux-nfs@vger.kernel.org
Subject: [PATCH v6 5/7] nfs/localio: fallback to NFSD for misaligned O_DIRECT READs
Date: Fri, 1 Aug 2025 13:10:47 -0400 [thread overview]
Message-ID: <20250801171049.94235-6-snitzer@kernel.org> (raw)
In-Reply-To: <20250801171049.94235-1-snitzer@kernel.org>
But this fallback is sub-optimal because it resorts to using RPC, and
it will only serve as a last resort if the NFS client's O_DIRECT support
fails to align misaligned IO (support is added in subsequent patches).
Add 'localio_O_DIRECT_align_misaligned_IO' modparm, which depends on
localio_O_DIRECT_semantics=Y, to control if LOCALIO will make best
effort to transform misaligned IO to DIO-aligned (e.g. expanding
misaligned READ to DIO-aligned).
If LOCALIO determines that an O_DIRECT READ is misaligned, and is 32K
or larger, then it makes sense to immediately issue the READ remotely
via NFSD (which has the ability to expand a misaligned O_DIRECT READ
to be DIO-aligned) if/when NFSD is configured to use O_DIRECT for READ
IO with: echo 3 > /sys/kernel/debug/nfsd/io_cache_read
This commit's various refactoring makes it possible for LOCALIO to
fall back to NFS pagelist code in process context to allow for
immediate retry over RPC. This refactoring alone makes this commit
worthwhile even though it is highly unlikely that LOCALIO will ever
fall back to NFSD for misaligned READs (again, only a bug in the
subsequent patches would be cause for fallback).
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
fs/nfs/internal.h | 9 ++-
fs/nfs/localio.c | 138 +++++++++++++++++++++++++++++++---------------
fs/nfs/pagelist.c | 15 +++--
3 files changed, 111 insertions(+), 51 deletions(-)
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 217d4c69b6822..ea496a457d194 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -462,13 +462,14 @@ extern struct nfsd_file *nfs_local_open_fh(struct nfs_client *,
struct nfs_file_localio *,
const fmode_t);
extern int nfs_local_doio(struct nfs_client *,
- struct nfsd_file *,
+ struct nfsd_file **,
struct nfs_pgio_header *,
const struct rpc_call_ops *);
extern int nfs_local_commit(struct nfsd_file *,
struct nfs_commit_data *,
const struct rpc_call_ops *, int);
extern bool nfs_server_is_local(const struct nfs_client *clp);
+extern bool nfs_localio_O_DIRECT_align_misaligned_IO(void);
#else /* CONFIG_NFS_LOCALIO */
static inline void nfs_local_probe(struct nfs_client *clp) {}
@@ -481,7 +482,7 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
return NULL;
}
static inline int nfs_local_doio(struct nfs_client *clp,
- struct nfsd_file *localio,
+ struct nfsd_file **localio,
struct nfs_pgio_header *hdr,
const struct rpc_call_ops *call_ops)
{
@@ -497,6 +498,10 @@ static inline bool nfs_server_is_local(const struct nfs_client *clp)
{
return false;
}
+static inline bool nfs_localio_O_DIRECT_align_misaligned_IO(void)
+{
+ return false;
+}
#endif /* CONFIG_NFS_LOCALIO */
/* super.c */
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index 9ce242454c665..8864abc0e1c12 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -36,6 +36,7 @@ struct nfs_local_kiocb {
struct nfs_pgio_header *hdr;
struct work_struct work;
void (*aio_complete_work)(struct work_struct *);
+ struct iov_iter iter ____cacheline_aligned;
struct nfsd_file *localio;
};
@@ -54,6 +55,11 @@ module_param(localio_O_DIRECT_semantics, bool, 0644);
MODULE_PARM_DESC(localio_O_DIRECT_semantics,
"LOCALIO will use O_DIRECT semantics to filesystem.");
+static bool localio_O_DIRECT_align_misaligned_IO __read_mostly = true;
+module_param(localio_O_DIRECT_align_misaligned_IO, bool, 0644);
+MODULE_PARM_DESC(localio_O_DIRECT_align_misaligned_IO,
+ "If LOCALIO_O_DIRECT_semantics=Y make best effort to transform misaligned IO to DIO-aligned.");
+
static inline bool nfs_client_is_local(const struct nfs_client *clp)
{
return !!rcu_access_pointer(clp->cl_uuid.net);
@@ -65,6 +71,12 @@ bool nfs_server_is_local(const struct nfs_client *clp)
}
EXPORT_SYMBOL_GPL(nfs_server_is_local);
+bool nfs_localio_O_DIRECT_align_misaligned_IO(void)
+{
+ return localio_O_DIRECT_align_misaligned_IO;
+}
+EXPORT_SYMBOL_GPL(nfs_localio_O_DIRECT_align_misaligned_IO);
+
/*
* UUID_IS_LOCAL XDR functions
*/
@@ -319,8 +331,8 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr,
return iocb;
}
-static void
-nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int dir)
+static int
+nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int rw)
{
struct nfs_pgio_header *hdr = iocb->hdr;
struct page **pagevec = hdr->page_array.pagevec;
@@ -338,7 +350,7 @@ nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int dir)
}
WARN_ON_ONCE(v != hdr->page_array.npages);
- iov_iter_bvec(i, dir, iocb->bvec, v,
+ iov_iter_bvec(i, rw, iocb->bvec, v,
hdr->args.count + hdr->args.pgbase);
if (hdr->args.pgbase != 0)
iov_iter_advance(i, hdr->args.pgbase);
@@ -349,7 +361,7 @@ nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int dir)
nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align,
&nf_dio_offset_align,
&nf_dio_read_offset_align);
- if (dir == READ)
+ if (rw == ITER_DEST)
nf_dio_offset_align = nf_dio_read_offset_align;
/* direct I/O must be aligned to device logical sector size */
if (nf_dio_mem_align && nf_dio_offset_align &&
@@ -358,10 +370,21 @@ nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int dir)
nf_dio_offset_align - 1))
return 0;
+ /* Only send misaligned READ to NFSD if 32K or larger */
+ if (localio_O_DIRECT_align_misaligned_IO &&
+ (rw == ITER_DEST) && (hdr->args.count >= (32 << 10))) {
+ /*
+ * Fallback to sending this READ to NFSD since it
+ * can expand misaligned READ IO to be DIO-aligned.
+ */
+ return -ENOSYS;
+ }
/* Fallback to using buffered for this misaligned IO */
iocb->kiocb.ki_flags &= ~IOCB_DIRECT;
iocb->kiocb.ki_filp->f_flags &= ~O_DIRECT;
}
+
+ return 0;
}
static void
@@ -394,13 +417,18 @@ nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status)
}
}
-static void
-nfs_local_pgio_release(struct nfs_local_kiocb *iocb)
+static void nfs_local_iocb_release(struct nfs_local_kiocb *iocb)
{
- struct nfs_pgio_header *hdr = iocb->hdr;
-
nfs_local_file_put(iocb->localio);
nfs_local_iocb_free(iocb);
+}
+
+static void
+nfs_local_pgio_release(struct nfs_local_kiocb *iocb)
+{
+ struct nfs_pgio_header *hdr = iocb->hdr;
+
+ nfs_local_iocb_release(iocb);
nfs_local_hdr_release(hdr, hdr->task.tk_ops);
}
@@ -461,18 +489,16 @@ static void nfs_local_call_read(struct work_struct *work)
container_of(work, struct nfs_local_kiocb, work);
struct file *filp = iocb->kiocb.ki_filp;
const struct cred *save_cred;
- struct iov_iter iter;
ssize_t status;
save_cred = override_creds(filp->f_cred);
- nfs_local_iter_init(&iter, iocb, READ);
if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
iocb->aio_complete_work = nfs_local_read_aio_complete_work;
}
- status = filp->f_op->read_iter(&iocb->kiocb, &iter);
+ status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iter);
if (status != -EIOCBQUEUED) {
nfs_local_read_done(iocb, status);
nfs_local_pgio_release(iocb);
@@ -482,25 +508,14 @@ static void nfs_local_call_read(struct work_struct *work)
}
static int
-nfs_do_local_read(struct nfs_pgio_header *hdr,
- struct nfsd_file *localio,
+nfs_local_do_read(struct nfs_local_kiocb *iocb,
const struct rpc_call_ops *call_ops)
{
- struct nfs_local_kiocb *iocb;
- struct file *file = nfs_to->nfsd_file_file(localio);
-
- /* Don't support filesystems without read_iter */
- if (!file->f_op->read_iter)
- return -EAGAIN;
+ struct nfs_pgio_header *hdr = iocb->hdr;
dprintk("%s: vfs_read count=%u pos=%llu\n",
__func__, hdr->args.count, hdr->args.offset);
- iocb = nfs_local_iocb_alloc(hdr, file, GFP_KERNEL);
- if (iocb == NULL)
- return -ENOMEM;
- iocb->localio = localio;
-
nfs_local_pgio_init(hdr, call_ops);
hdr->res.eof = false;
@@ -653,20 +668,18 @@ static void nfs_local_call_write(struct work_struct *work)
struct file *filp = iocb->kiocb.ki_filp;
unsigned long old_flags = current->flags;
const struct cred *save_cred;
- struct iov_iter iter;
ssize_t status;
current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
save_cred = override_creds(filp->f_cred);
- nfs_local_iter_init(&iter, iocb, WRITE);
if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
iocb->aio_complete_work = nfs_local_write_aio_complete_work;
}
file_start_write(filp);
- status = filp->f_op->write_iter(&iocb->kiocb, &iter);
+ status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iter);
file_end_write(filp);
if (status != -EIOCBQUEUED) {
nfs_local_write_done(iocb, status);
@@ -679,26 +692,15 @@ static void nfs_local_call_write(struct work_struct *work)
}
static int
-nfs_do_local_write(struct nfs_pgio_header *hdr,
- struct nfsd_file *localio,
+nfs_local_do_write(struct nfs_local_kiocb *iocb,
const struct rpc_call_ops *call_ops)
{
- struct nfs_local_kiocb *iocb;
- struct file *file = nfs_to->nfsd_file_file(localio);
-
- /* Don't support filesystems without write_iter */
- if (!file->f_op->write_iter)
- return -EAGAIN;
+ struct nfs_pgio_header *hdr = iocb->hdr;
dprintk("%s: vfs_write count=%u pos=%llu %s\n",
__func__, hdr->args.count, hdr->args.offset,
(hdr->args.stable == NFS_UNSTABLE) ? "unstable" : "stable");
- iocb = nfs_local_iocb_alloc(hdr, file, GFP_NOIO);
- if (iocb == NULL)
- return -ENOMEM;
- iocb->localio = localio;
-
switch (hdr->args.stable) {
default:
break;
@@ -719,32 +721,78 @@ nfs_do_local_write(struct nfs_pgio_header *hdr,
return 0;
}
-int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio,
+static struct nfs_local_kiocb *
+nfs_local_iocb_init(struct nfs_pgio_header *hdr, struct nfsd_file **localio)
+{
+ struct file *file = nfs_to->nfsd_file_file(*localio);
+ struct nfs_local_kiocb *iocb;
+ gfp_t gfp_mask;
+ int rw, status;
+
+ if (hdr->rw_mode & FMODE_READ) {
+ if (!file->f_op->read_iter)
+ return ERR_PTR(-EOPNOTSUPP);
+ gfp_mask = GFP_KERNEL;
+ rw = ITER_DEST;
+ } else {
+ if (!file->f_op->write_iter)
+ return ERR_PTR(-EOPNOTSUPP);
+ gfp_mask = GFP_NOIO;
+ rw = ITER_SOURCE;
+ }
+
+ iocb = nfs_local_iocb_alloc(hdr, file, gfp_mask);
+ if (iocb == NULL)
+ return ERR_PTR(-ENOMEM);
+ iocb->hdr = hdr;
+ iocb->localio = *localio;
+
+ status = nfs_local_iter_init(&iocb->iter, iocb, rw);
+ if (status == -ENOSYS) {
+ /* close nfsd_file and clear localio,
+ * this informs callers that IO should
+ * be serviced remotely.
+ */
+ nfs_local_iocb_release(iocb);
+ *localio = NULL;
+ return ERR_PTR(status);
+ }
+ WARN_ON_ONCE(status != 0);
+
+ return iocb;
+}
+
+int nfs_local_doio(struct nfs_client *clp, struct nfsd_file **localio,
struct nfs_pgio_header *hdr,
const struct rpc_call_ops *call_ops)
{
+ struct nfs_local_kiocb *iocb;
int status = 0;
if (!hdr->args.count)
return 0;
+ iocb = nfs_local_iocb_init(hdr, localio);
+ if (IS_ERR(iocb))
+ return PTR_ERR(iocb);
+
switch (hdr->rw_mode) {
case FMODE_READ:
- status = nfs_do_local_read(hdr, localio, call_ops);
+ status = nfs_local_do_read(iocb, call_ops);
break;
case FMODE_WRITE:
- status = nfs_do_local_write(hdr, localio, call_ops);
+ status = nfs_local_do_write(iocb, call_ops);
break;
default:
dprintk("%s: invalid mode: %d\n", __func__,
hdr->rw_mode);
- status = -EINVAL;
+ status = -EOPNOTSUPP;
}
if (status != 0) {
if (status == -EAGAIN)
nfs_localio_disable_client(clp);
- nfs_local_file_put(localio);
+ nfs_local_iocb_release(iocb);
hdr->task.tk_status = status;
nfs_local_hdr_release(hdr, call_ops);
}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 11968dcb72431..9ddff27e96e9f 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -762,9 +762,17 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
hdr->args.count,
(unsigned long long)hdr->args.offset);
- if (localio)
- return nfs_local_doio(NFS_SERVER(hdr->inode)->nfs_client,
- localio, hdr, call_ops);
+ if (localio) {
+ int status = nfs_local_doio(NFS_SERVER(hdr->inode)->nfs_client,
+ &localio, hdr, call_ops);
+ /* nfs_local_doio() will clear localio and return -ENOSYS if
+ * it is prudent to immediately service this IO remotely.
+ */
+ if (status != -ENOSYS)
+ return status;
+ WARN_ON_ONCE(localio != NULL);
+ /* fallthrough */
+ }
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
@@ -959,7 +967,6 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
ret = nfs_generic_pgio(desc, hdr);
if (ret == 0) {
struct nfs_client *clp = NFS_SERVER(hdr->inode)->nfs_client;
-
struct nfsd_file *localio =
nfs_local_open_fh(clp, hdr->cred, hdr->args.fh,
&hdr->args.context->nfl,
--
2.44.0
next prev parent reply other threads:[~2025-08-01 17:10 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-08-01 17:10 [PATCH v6 0/7] NFS DIRECT: align misaligned DIO for LOCALIO Mike Snitzer
2025-08-01 17:10 ` [PATCH v6 1/7] nfs/localio: avoid bouncing LOCALIO if nfs_client_is_local() Mike Snitzer
2025-08-01 17:10 ` [PATCH v6 2/7] nfs/localio: make trace_nfs_local_open_fh more useful Mike Snitzer
2025-08-01 17:10 ` [PATCH v6 3/7] nfs/localio: add nfsd_file_dio_alignment Mike Snitzer
2025-08-01 17:10 ` [PATCH v6 4/7] nfs/localio: refactor iocb initialization Mike Snitzer
2025-08-01 17:10 ` Mike Snitzer [this message]
2025-08-01 17:10 ` [PATCH v6 6/7] nfs/direct: add misaligned READ handling Mike Snitzer
2025-08-01 17:10 ` [PATCH v6 7/7] nfs/direct: add misaligned WRITE handling Mike Snitzer
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250801171049.94235-6-snitzer@kernel.org \
--to=snitzer@kernel.org \
--cc=anna.schumaker@oracle.com \
--cc=linux-nfs@vger.kernel.org \
--cc=trond.myklebust@hammerspace.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).