From: Mike Snitzer <snitzer@kernel.org>
To: linux-nfs@vger.kernel.org
Cc: Jeff Layton <jlayton@kernel.org>,
Chuck Lever <chuck.lever@oracle.com>,
Trond Myklebust <trondmy@hammerspace.com>,
NeilBrown <neilb@suse.de>,
snitzer@hammerspace.com
Subject: [RFC PATCH v2 08/15] NFS: for localio don't call filesystem read() and write() routines directly
Date: Tue, 11 Jun 2024 23:07:45 -0400 [thread overview]
Message-ID: <20240612030752.31754-9-snitzer@kernel.org> (raw)
In-Reply-To: <20240612030752.31754-1-snitzer@kernel.org>
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Some filesystem writeback routines can end up taking up a lot of stack
space (particularly xfs). Instead of risking running over due to the
extra overhead from the NFS stack, we should just call these routines
from a workqueue job.
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
fs/nfs/inode.c | 57 +++++++++++++++++---------
fs/nfs/internal.h | 1 +
fs/nfs/localio.c | 102 +++++++++++++++++++++++++++++++++++-----------
3 files changed, 118 insertions(+), 42 deletions(-)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 4f88b860494f..f5a618887c6f 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -2394,35 +2394,54 @@ static void nfs_destroy_inodecache(void)
kmem_cache_destroy(nfs_inode_cachep);
}
+struct workqueue_struct *nfslocaliod_workqueue;
struct workqueue_struct *nfsiod_workqueue;
EXPORT_SYMBOL_GPL(nfsiod_workqueue);
/*
- * start up the nfsiod workqueue
- */
-static int nfsiod_start(void)
-{
- struct workqueue_struct *wq;
- dprintk("RPC: creating workqueue nfsiod\n");
- wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
- if (wq == NULL)
- return -ENOMEM;
- nfsiod_workqueue = wq;
- return 0;
-}
-
-/*
- * Destroy the nfsiod workqueue
+ * Destroy the nfsiod workqueues
*/
static void nfsiod_stop(void)
{
struct workqueue_struct *wq;
wq = nfsiod_workqueue;
- if (wq == NULL)
- return;
- nfsiod_workqueue = NULL;
- destroy_workqueue(wq);
+ if (wq != NULL) {
+ nfsiod_workqueue = NULL;
+ destroy_workqueue(wq);
+ }
+#if defined(CONFIG_NFS_V3_LOCALIO) || defined(CONFIG_NFS_V4_LOCALIO)
+ wq = nfslocaliod_workqueue;
+ if (wq != NULL) {
+ nfslocaliod_workqueue = NULL;
+ destroy_workqueue(wq);
+ }
+#endif /* CONFIG_NFS_V3_LOCALIO || CONFIG_NFS_V4_LOCALIO */
+}
+
+/*
+ * Start the nfsiod workqueues
+ */
+static int nfsiod_start(void)
+{
+ dprintk("RPC: creating workqueue nfsiod\n");
+ nfsiod_workqueue = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
+ if (nfsiod_workqueue == NULL)
+ return -ENOMEM;
+#if defined(CONFIG_NFS_V3_LOCALIO) || defined(CONFIG_NFS_V4_LOCALIO)
+ /*
+ * localio writes need to use a normal (non-memreclaim) workqueue.
+ * When we start getting low on space, XFS goes and calls flush_work() on
+ * a non-memreclaim work queue, which causes a priority inversion problem.
+ */
+ dprintk("RPC: creating workqueue nfslocaliod\n");
+ nfslocaliod_workqueue = alloc_workqueue("nfslocaliod", WQ_UNBOUND, 0);
+ if (unlikely(nfslocaliod_workqueue == NULL)) {
+ nfsiod_stop();
+ return -ENOMEM;
+ }
+#endif /* CONFIG_NFS_V3_LOCALIO || CONFIG_NFS_V4_LOCALIO */
+ return 0;
}
unsigned int nfs_net_id;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 62909e5594b1..c299b1bfae9b 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -440,6 +440,7 @@ int nfs_check_flags(int);
/* inode.c */
extern struct workqueue_struct *nfsiod_workqueue;
+extern struct workqueue_struct *nfslocaliod_workqueue;
extern struct inode *nfs_alloc_inode(struct super_block *sb);
extern void nfs_free_inode(struct inode *);
extern int nfs_write_inode(struct inode *, struct writeback_control *);
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index b3996bf9c45d..00832c9be9f5 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -64,6 +64,12 @@ struct nfs_local_fsync_ctx {
};
static void nfs_local_fsync_work(struct work_struct *work);
+struct nfs_local_io_args {
+ struct nfs_local_kiocb *iocb;
+ struct work_struct work;
+ struct completion *done;
+};
+
/*
* We need to translate between nfs status return values and
* the local errno values which may not be the same.
@@ -421,21 +427,38 @@ nfs_local_read_aio_complete(struct kiocb *kiocb, long ret)
nfs_local_pgio_complete(iocb);
}
-static int
-nfs_do_local_read(struct nfs_pgio_header *hdr, struct file *filp,
- const struct rpc_call_ops *call_ops)
+static void nfs_local_call_read(struct work_struct *work)
{
- struct nfs_local_kiocb *iocb;
+ struct nfs_local_io_args *args =
+ container_of(work, struct nfs_local_io_args, work);
+ struct nfs_local_kiocb *iocb = args->iocb;
+ struct file *filp = iocb->kiocb.ki_filp;
struct iov_iter iter;
ssize_t status;
+ nfs_local_iter_init(&iter, iocb, READ);
+
+ status = filp->f_op->read_iter(&iocb->kiocb, &iter);
+ if (status != -EIOCBQUEUED) {
+ nfs_local_read_done(iocb, status);
+ nfs_local_pgio_release(iocb);
+ }
+ complete(args->done);
+}
+
+static int nfs_do_local_read(struct nfs_pgio_header *hdr, struct file *filp,
+ const struct rpc_call_ops *call_ops)
+{
+ struct nfs_local_io_args args;
+ DECLARE_COMPLETION_ONSTACK(done);
+ struct nfs_local_kiocb *iocb;
+
dprintk("%s: vfs_read count=%u pos=%llu\n",
__func__, hdr->args.count, hdr->args.offset);
iocb = nfs_local_iocb_alloc(hdr, filp, GFP_KERNEL);
if (iocb == NULL)
return -ENOMEM;
- nfs_local_iter_init(&iter, iocb, READ);
nfs_local_pgio_init(hdr, call_ops);
hdr->res.eof = false;
@@ -445,11 +468,18 @@ nfs_do_local_read(struct nfs_pgio_header *hdr, struct file *filp,
iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
}
- status = filp->f_op->read_iter(&iocb->kiocb, &iter);
- if (status != -EIOCBQUEUED) {
- nfs_local_read_done(iocb, status);
- nfs_local_pgio_release(iocb);
- }
+ /*
+ * Don't call filesystem read() routines directly.
+ * In order to avoid issues with stack overflow,
+ * call the read routines from a workqueue job.
+ */
+ args.iocb = iocb;
+ args.done = &done;
+ INIT_WORK_ONSTACK(&args.work, nfs_local_call_read);
+ queue_work(nfslocaliod_workqueue, &args.work);
+ wait_for_completion(&done);
+ destroy_work_on_stack(&args.work);
+
return 0;
}
@@ -559,14 +589,35 @@ nfs_local_write_aio_complete(struct kiocb *kiocb, long ret)
nfs_local_pgio_complete(iocb);
}
-static int
-nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp,
- const struct rpc_call_ops *call_ops)
+static void nfs_local_call_write(struct work_struct *work)
{
- struct nfs_local_kiocb *iocb;
+ struct nfs_local_io_args *args =
+ container_of(work, struct nfs_local_io_args, work);
+ struct nfs_local_kiocb *iocb = args->iocb;
+ struct file *filp = iocb->kiocb.ki_filp;
struct iov_iter iter;
ssize_t status;
+ nfs_local_iter_init(&iter, iocb, WRITE);
+
+ file_start_write(filp);
+ status = filp->f_op->write_iter(&iocb->kiocb, &iter);
+ file_end_write(filp);
+ if (status != -EIOCBQUEUED) {
+ nfs_local_write_done(iocb, status);
+ nfs_get_vfs_attr(filp, iocb->hdr->res.fattr);
+ nfs_local_pgio_release(iocb);
+ }
+ complete(args->done);
+}
+
+static int nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp,
+ const struct rpc_call_ops *call_ops)
+{
+ struct nfs_local_io_args args;
+ DECLARE_COMPLETION_ONSTACK(done);
+ struct nfs_local_kiocb *iocb;
+
dprintk("%s: vfs_write count=%u pos=%llu %s\n",
__func__, hdr->args.count, hdr->args.offset,
(hdr->args.stable == NFS_UNSTABLE) ? "unstable" : "stable");
@@ -574,7 +625,6 @@ nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp,
iocb = nfs_local_iocb_alloc(hdr, filp, GFP_NOIO);
if (iocb == NULL)
return -ENOMEM;
- nfs_local_iter_init(&iter, iocb, WRITE);
switch (hdr->args.stable) {
default:
@@ -594,14 +644,20 @@ nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp,
nfs_set_local_verifier(hdr->inode, hdr->res.verf, hdr->args.stable);
- file_start_write(filp);
- status = filp->f_op->write_iter(&iocb->kiocb, &iter);
- file_end_write(filp);
- if (status != -EIOCBQUEUED) {
- nfs_local_write_done(iocb, status);
- nfs_get_vfs_attr(filp, hdr->res.fattr);
- nfs_local_pgio_release(iocb);
- }
+ /*
+ * Don't call filesystem write() routines directly.
+ * Some filesystem writeback routines can end up taking up a lot of
+ * stack (particularly xfs). Instead of risking running over due to
+ * the extra overhead from the NFS stack, call these write routines
+ * from a workqueue job.
+ */
+ args.iocb = iocb;
+ args.done = &done;
+ INIT_WORK_ONSTACK(&args.work, nfs_local_call_write);
+ queue_work(nfslocaliod_workqueue, &args.work);
+ wait_for_completion(&done);
+ destroy_work_on_stack(&args.work);
+
return 0;
}
--
2.44.0
next prev parent reply other threads:[~2024-06-12 3:08 UTC|newest]
Thread overview: 26+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-06-12 3:07 [RFC PATCH v2 00/15] nfs/nfsd: add support for localio Mike Snitzer
2024-06-12 3:07 ` [RFC PATCH v2 01/15] nfs: pass nfs_client to nfs_initiate_pgio Mike Snitzer
2024-06-12 3:07 ` [RFC PATCH v2 02/15] nfs: pass descriptor thru nfs_initiate_pgio path Mike Snitzer
2024-06-12 3:07 ` [RFC PATCH v2 03/15] nfs: pass struct file to nfs_init_pgio and nfs_init_commit Mike Snitzer
2024-06-12 3:07 ` [RFC PATCH v2 04/15] sunrpc: add rpcauth_map_to_svc_cred_local Mike Snitzer
2024-06-12 4:28 ` NeilBrown
2024-06-12 18:32 ` Anna Schumaker
2024-06-12 3:07 ` [RFC PATCH v2 05/15] nfs: move nfs_stat_to_errno to nfs.h Mike Snitzer
2024-06-12 3:23 ` NeilBrown
2024-06-12 3:50 ` Mike Snitzer
2024-06-12 3:07 ` [RFC PATCH v2 06/15] nfs_common: add NFS LOCALIO protocol extension enablement Mike Snitzer
2024-06-12 3:07 ` [RFC PATCH v2 07/15] nfs/nfsd: add "localio" support Mike Snitzer
2024-06-12 4:36 ` NeilBrown
2024-06-12 4:51 ` Mike Snitzer
2024-06-12 3:07 ` Mike Snitzer [this message]
2024-06-12 4:39 ` [RFC PATCH v2 08/15] NFS: for localio don't call filesystem read() and write() routines directly NeilBrown
2024-06-12 3:07 ` [RFC PATCH v2 09/15] NFS: Enable localio for non-pNFS I/O Mike Snitzer
2024-06-12 5:12 ` NeilBrown
2024-06-12 3:07 ` [RFC PATCH v2 10/15] pnfs/flexfiles: Enable localio for flexfiles I/O Mike Snitzer
2024-06-12 3:07 ` [RFC PATCH v2 11/15] nfs: implement v3 and v4 client support for NFS_LOCALIO_PROGRAM Mike Snitzer
2024-06-12 3:07 ` [RFC PATCH v2 12/15] nfsd: implement v3 and v4 server " Mike Snitzer
2024-06-12 3:07 ` [RFC PATCH v2 13/15] nfs/nfsd: consolidate {encode,decode}_opaque_fixed in nfs_xdr.h Mike Snitzer
2024-06-12 3:07 ` [RFC PATCH v2 14/15] nfs/localio: move managing nfsd_open_local_fh symbol to nfs_common Mike Snitzer
2024-06-12 3:07 ` [RFC PATCH v2 15/15] nfs/nfsd: ensure localio server always uses its network namespace Mike Snitzer
2024-06-12 5:11 ` NeilBrown
2024-06-12 3:12 ` [RFC PATCH v2 00/15] nfs/nfsd: add support for localio Mike Snitzer
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240612030752.31754-9-snitzer@kernel.org \
--to=snitzer@kernel.org \
--cc=chuck.lever@oracle.com \
--cc=jlayton@kernel.org \
--cc=linux-nfs@vger.kernel.org \
--cc=neilb@suse.de \
--cc=snitzer@hammerspace.com \
--cc=trondmy@hammerspace.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox