public inbox for linux-nfs@vger.kernel.org
 help / color / mirror / Atom feed
From: Mike Snitzer <snitzer@kernel.org>
To: linux-nfs@vger.kernel.org
Cc: Jeff Layton <jlayton@kernel.org>,
	Chuck Lever <chuck.lever@oracle.com>,
	Trond Myklebust <trondmy@hammerspace.com>,
	NeilBrown <neilb@suse.de>,
	snitzer@hammerspace.com
Subject: [PATCH v7 10/20] nfs/localio: use dedicated workqueues for filesystem read and write
Date: Mon, 24 Jun 2024 12:27:31 -0400	[thread overview]
Message-ID: <20240624162741.68216-11-snitzer@kernel.org> (raw)
In-Reply-To: <20240624162741.68216-1-snitzer@kernel.org>

From: Trond Myklebust <trond.myklebust@hammerspace.com>

For localio access, don't call filesystem read() and write() routines
directly.

Some filesystem writeback routines can end up taking up a lot of stack
space (particularly xfs). Instead of risking running over due to the
extra overhead from the NFS stack, we should just call these routines
from a workqueue job.

Use of dedicated workqueues improves performance over using the
system_unbound_wq. Localio is motivated by the promise of improved
performance, it makes little sense to yield it back.

But further analysis of the latest stack depth requirements would be
useful. It'd be nice to root cause and fix the latest stack hogs,
because using workqueues at all can cause a loss in performance due to
context switches.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 fs/nfs/inode.c    |  57 +++++++++++++++++---------
 fs/nfs/internal.h |   1 +
 fs/nfs/localio.c  | 102 +++++++++++++++++++++++++++++++++++-----------
 3 files changed, 118 insertions(+), 42 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index f9923cbf6058..aac8c5302503 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -2394,35 +2394,54 @@ static void nfs_destroy_inodecache(void)
 	kmem_cache_destroy(nfs_inode_cachep);
 }
 
+struct workqueue_struct *nfslocaliod_workqueue;
 struct workqueue_struct *nfsiod_workqueue;
 EXPORT_SYMBOL_GPL(nfsiod_workqueue);
 
 /*
- * start up the nfsiod workqueue
- */
-static int nfsiod_start(void)
-{
-	struct workqueue_struct *wq;
-	dprintk("RPC:       creating workqueue nfsiod\n");
-	wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
-	if (wq == NULL)
-		return -ENOMEM;
-	nfsiod_workqueue = wq;
-	return 0;
-}
-
-/*
- * Destroy the nfsiod workqueue
+ * Destroy the nfsiod workqueues
  */
 static void nfsiod_stop(void)
 {
 	struct workqueue_struct *wq;
 
 	wq = nfsiod_workqueue;
-	if (wq == NULL)
-		return;
-	nfsiod_workqueue = NULL;
-	destroy_workqueue(wq);
+	if (wq != NULL) {
+		nfsiod_workqueue = NULL;
+		destroy_workqueue(wq);
+	}
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+	wq = nfslocaliod_workqueue;
+	if (wq != NULL) {
+		nfslocaliod_workqueue = NULL;
+		destroy_workqueue(wq);
+	}
+#endif /* CONFIG_NFS_LOCALIO */
+}
+
+/*
+ * Start the nfsiod workqueues
+ */
+static int nfsiod_start(void)
+{
+	dprintk("RPC:       creating workqueue nfsiod\n");
+	nfsiod_workqueue = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
+	if (nfsiod_workqueue == NULL)
+		return -ENOMEM;
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+	/*
+	 * localio writes need to use a normal (non-memreclaim) workqueue.
+	 * When we start getting low on space, XFS goes and calls flush_work() on
+	 * a non-memreclaim work queue, which causes a priority inversion problem.
+	 */
+	dprintk("RPC:       creating workqueue nfslocaliod\n");
+	nfslocaliod_workqueue = alloc_workqueue("nfslocaliod", WQ_UNBOUND, 0);
+	if (unlikely(nfslocaliod_workqueue == NULL)) {
+		nfsiod_stop();
+		return -ENOMEM;
+	}
+#endif /* CONFIG_NFS_LOCALIO */
+	return 0;
 }
 
 unsigned int nfs_net_id;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index d352040e3232..9251a357d097 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -440,6 +440,7 @@ int nfs_check_flags(int);
 
 /* inode.c */
 extern struct workqueue_struct *nfsiod_workqueue;
+extern struct workqueue_struct *nfslocaliod_workqueue;
 extern struct inode *nfs_alloc_inode(struct super_block *sb);
 extern void nfs_free_inode(struct inode *);
 extern int nfs_write_inode(struct inode *, struct writeback_control *);
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index 724e81716b16..418b8d76692b 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -44,6 +44,12 @@ struct nfs_local_fsync_ctx {
 };
 static void nfs_local_fsync_work(struct work_struct *work);
 
+struct nfs_local_io_args {
+	struct nfs_local_kiocb *iocb;
+	struct work_struct work;
+	struct completion *done;
+};
+
 /*
  * We need to translate between nfs status return values and
  * the local errno values which may not be the same.
@@ -307,30 +313,54 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status)
 			status > 0 ? status : 0, hdr->res.eof);
 }
 
-static int
-nfs_do_local_read(struct nfs_pgio_header *hdr, struct file *filp,
-		const struct rpc_call_ops *call_ops)
+static void nfs_local_call_read(struct work_struct *work)
 {
-	struct nfs_local_kiocb *iocb;
+	struct nfs_local_io_args *args =
+		container_of(work, struct nfs_local_io_args, work);
+	struct nfs_local_kiocb *iocb = args->iocb;
+	struct file *filp = iocb->kiocb.ki_filp;
 	struct iov_iter iter;
 	ssize_t status;
 
+	nfs_local_iter_init(&iter, iocb, READ);
+
+	status = filp->f_op->read_iter(&iocb->kiocb, &iter);
+	if (status != -EIOCBQUEUED) {
+		nfs_local_read_done(iocb, status);
+		nfs_local_pgio_release(iocb);
+	}
+	complete(args->done);
+}
+
+static int nfs_do_local_read(struct nfs_pgio_header *hdr, struct file *filp,
+			     const struct rpc_call_ops *call_ops)
+{
+	struct nfs_local_io_args args;
+	DECLARE_COMPLETION_ONSTACK(done);
+	struct nfs_local_kiocb *iocb;
+
 	dprintk("%s: vfs_read count=%u pos=%llu\n",
 		__func__, hdr->args.count, hdr->args.offset);
 
 	iocb = nfs_local_iocb_alloc(hdr, filp, GFP_KERNEL);
 	if (iocb == NULL)
 		return -ENOMEM;
-	nfs_local_iter_init(&iter, iocb, READ);
 
 	nfs_local_pgio_init(hdr, call_ops);
 	hdr->res.eof = false;
 
-	status = filp->f_op->read_iter(&iocb->kiocb, &iter);
-	if (status != -EIOCBQUEUED) {
-		nfs_local_read_done(iocb, status);
-		nfs_local_pgio_release(iocb);
-	}
+	/*
+	 * Don't call filesystem read() routines directly.
+	 * In order to avoid issues with stack overflow,
+	 * call the read routines from a workqueue job.
+	 */
+	args.iocb = iocb;
+	args.done = &done;
+	INIT_WORK_ONSTACK(&args.work, nfs_local_call_read);
+	queue_work(nfslocaliod_workqueue, &args.work);
+	wait_for_completion(&done);
+	destroy_work_on_stack(&args.work);
+
 	return 0;
 }
 
@@ -420,14 +450,35 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
 	nfs_local_pgio_done(hdr, status);
 }
 
-static int
-nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp,
-		const struct rpc_call_ops *call_ops)
+static void nfs_local_call_write(struct work_struct *work)
 {
-	struct nfs_local_kiocb *iocb;
+	struct nfs_local_io_args *args =
+		container_of(work, struct nfs_local_io_args, work);
+	struct nfs_local_kiocb *iocb = args->iocb;
+	struct file *filp = iocb->kiocb.ki_filp;
 	struct iov_iter iter;
 	ssize_t status;
 
+	nfs_local_iter_init(&iter, iocb, WRITE);
+
+	file_start_write(filp);
+	status = filp->f_op->write_iter(&iocb->kiocb, &iter);
+	file_end_write(filp);
+	if (status != -EIOCBQUEUED) {
+		nfs_local_write_done(iocb, status);
+		nfs_get_vfs_attr(filp, iocb->hdr->res.fattr);
+		nfs_local_pgio_release(iocb);
+	}
+	complete(args->done);
+}
+
+static int nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp,
+			      const struct rpc_call_ops *call_ops)
+{
+	struct nfs_local_io_args args;
+	DECLARE_COMPLETION_ONSTACK(done);
+	struct nfs_local_kiocb *iocb;
+
 	dprintk("%s: vfs_write count=%u pos=%llu %s\n",
 		__func__, hdr->args.count, hdr->args.offset,
 		(hdr->args.stable == NFS_UNSTABLE) ?  "unstable" : "stable");
@@ -435,7 +486,6 @@ nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp,
 	iocb = nfs_local_iocb_alloc(hdr, filp, GFP_NOIO);
 	if (iocb == NULL)
 		return -ENOMEM;
-	nfs_local_iter_init(&iter, iocb, WRITE);
 
 	switch (hdr->args.stable) {
 	default:
@@ -450,14 +500,20 @@ nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp,
 
 	nfs_set_local_verifier(hdr->inode, hdr->res.verf, hdr->args.stable);
 
-	file_start_write(filp);
-	status = filp->f_op->write_iter(&iocb->kiocb, &iter);
-	file_end_write(filp);
-	if (status != -EIOCBQUEUED) {
-		nfs_local_write_done(iocb, status);
-		nfs_get_vfs_attr(filp, hdr->res.fattr);
-		nfs_local_pgio_release(iocb);
-	}
+	/*
+	 * Don't call filesystem write() routines directly.
+	 * Some filesystem writeback routines can end up taking up a lot of
+	 * stack (particularly xfs). Instead of risking running over due to
+	 * the extra overhead from the NFS stack, call these write routines
+	 * from a workqueue job.
+	 */
+	args.iocb = iocb;
+	args.done = &done;
+	INIT_WORK_ONSTACK(&args.work, nfs_local_call_write);
+	queue_work(nfslocaliod_workqueue, &args.work);
+	wait_for_completion(&done);
+	destroy_work_on_stack(&args.work);
+
 	return 0;
 }
 
-- 
2.44.0


  parent reply	other threads:[~2024-06-24 16:27 UTC|newest]

Thread overview: 40+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-06-24 16:27 [PATCH v7 00/20] nfs/nfsd: add support for localio Mike Snitzer
2024-06-24 16:27 ` [PATCH v7 01/20] nfs: pass nfs_client to nfs_initiate_pgio Mike Snitzer
2024-06-24 16:27 ` [PATCH v7 02/20] nfs: pass descriptor thru nfs_initiate_pgio path Mike Snitzer
2024-06-24 16:27 ` [PATCH v7 03/20] nfs: pass struct file to nfs_init_pgio and nfs_init_commit Mike Snitzer
2024-06-24 16:27 ` [PATCH v7 04/20] sunrpc: add rpcauth_map_to_svc_cred_local Mike Snitzer
2024-06-24 16:27 ` [PATCH v7 05/20] nfs_common: add NFS LOCALIO auxiliary protocol enablement Mike Snitzer
2024-06-25 23:33   ` NeilBrown
2024-06-26 16:50     ` Mike Snitzer
2024-06-24 16:27 ` [PATCH v7 06/20] nfs/nfsd: add "localio" support Mike Snitzer
2024-06-24 18:26   ` Chuck Lever
2024-06-25  4:57     ` Mike Snitzer
2024-06-25 13:59       ` Chuck Lever
2024-06-24 16:27 ` [PATCH v7 07/20] nfsd/localio: manage netns reference in nfsd_open_local_fh Mike Snitzer
2024-06-24 16:27 ` [PATCH v7 08/20] NFS: Enable localio for non-pNFS I/O Mike Snitzer
2024-06-24 16:27 ` [PATCH v7 09/20] pnfs/flexfiles: Enable localio for flexfiles I/O Mike Snitzer
2024-06-24 16:27 ` Mike Snitzer [this message]
2024-06-25 23:15   ` [PATCH v7 10/20] nfs/localio: use dedicated workqueues for filesystem read and write NeilBrown
2024-06-24 16:27 ` [PATCH v7 11/20] nfs/nfsd: factor out {encode,decode}_opaque_fixed to nfs_xdr.h Mike Snitzer
2024-06-24 18:28   ` Chuck Lever
2024-06-24 16:27 ` [PATCH v7 12/20] SUNRPC: remove call_allocate() BUG_ON if p_arglen=0 to allow RPC with void arg Mike Snitzer
2024-06-25 23:19   ` NeilBrown
2024-06-26 16:53     ` Mike Snitzer
2024-06-24 16:27 ` [PATCH v7 13/20] nfs: implement client support for NFS_LOCALIO_PROGRAM Mike Snitzer
2024-06-25 23:21   ` NeilBrown
2024-06-26 16:45     ` Mike Snitzer
2024-06-24 16:27 ` [PATCH v7 14/20] nfsd: implement server " Mike Snitzer
2024-06-24 18:45   ` Chuck Lever
2024-06-25 23:23   ` NeilBrown
2024-06-26 16:27     ` Mike Snitzer
2024-06-24 16:27 ` [PATCH v7 15/20] SUNRPC: replace program list with program array Mike Snitzer
2024-06-24 16:27 ` [PATCH v7 16/20] nfsd: prepare to use SRCU to dereference nn->nfsd_serv Mike Snitzer
2024-06-24 16:27 ` [PATCH v7 17/20] nfsd: " Mike Snitzer
2024-06-25 12:43   ` Jeff Layton
2024-06-25 23:29     ` NeilBrown
2024-06-26 16:49       ` Mike Snitzer
2024-06-24 16:27 ` [PATCH v7 18/20] nfsd/localio: use SRCU to dereference nn->nfsd_serv in nfsd_open_local_fh Mike Snitzer
2024-06-24 16:27 ` [PATCH v7 19/20] nfs: add Documentation/filesystems/nfs/localio.rst Mike Snitzer
2024-06-25 11:59   ` Jeff Layton
2024-06-24 16:27 ` [PATCH v7 20/20] nfs/nfsd: add Kconfig options to allow localio to be enabled Mike Snitzer
2024-06-25 12:49 ` [PATCH v7 00/20] nfs/nfsd: add support for localio Jeff Layton

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240624162741.68216-11-snitzer@kernel.org \
    --to=snitzer@kernel.org \
    --cc=chuck.lever@oracle.com \
    --cc=jlayton@kernel.org \
    --cc=linux-nfs@vger.kernel.org \
    --cc=neilb@suse.de \
    --cc=snitzer@hammerspace.com \
    --cc=trondmy@hammerspace.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox