From: cel@kernel.org
To: <linux-nfs@vger.kernel.org>, <linux-rdma@vger.kernel.org>
Cc: Chuck Lever <chuck.lever@oracle.com>
Subject: [RFC PATCH 3/4] xprtrdma: Delay releasing connection hardware resources
Date: Mon, 29 Apr 2024 11:25:41 -0400 [thread overview]
Message-ID: <20240429152537.212958-9-cel@kernel.org> (raw)
In-Reply-To: <20240429152537.212958-6-cel@kernel.org>
From: Chuck Lever <chuck.lever@oracle.com>
xprtiod_workqueue is a MEM_RECLAIM-enabled workqueue. However, the
RDMA core API functions are not memory reclaim-safe. This mismatch
was partially addressed by commit 6b1eb3b22272 ("SUNRPC: Replace the
use of the xprtiod WQ in rpcrdma").
That earlier commit addressed the issue in the connect path, but not
in the disconnect path. Thus a transport disconnect sometimes results
in this splat:
workqueue: WQ_MEM_RECLAIM xprtiod:xprt_autoclose [sunrpc] is flushing !WQ_MEM_RECLAIM events_highpri:rpcrdma_mr_refresh_worker [rpcrdma]
WARNING: CPU: 1 PID: 20378 at kernel/workqueue.c:3728 check_flush_dependency+0x101/0x120
? check_flush_dependency+0x101/0x120
? report_bug+0x175/0x1a0
? handle_bug+0x44/0x90
? exc_invalid_op+0x1c/0x70
? asm_exc_invalid_op+0x1f/0x30
? __pfx_rpcrdma_mr_refresh_worker+0x10/0x10 [rpcrdma aefd3d1b298311368fa14fa93ae5fb3818c3aeac]
? check_flush_dependency+0x101/0x120
__flush_work.isra.0+0x20a/0x290
__cancel_work_sync+0x129/0x1c0
cancel_work_sync+0x14/0x20
rpcrdma_xprt_disconnect+0x229/0x3f0 [rpcrdma aefd3d1b298311368fa14fa93ae5fb3818c3aeac]
xprt_rdma_close+0x16/0x40 [rpcrdma aefd3d1b298311368fa14fa93ae5fb3818c3aeac]
xprt_autoclose+0x63/0x110 [sunrpc a04d701bce94b5a8fb541cafbe1a489d6b1ab5b3]
process_one_work+0x19e/0x3f0
worker_thread+0x340/0x510
? __pfx_worker_thread+0x10/0x10
kthread+0xf7/0x130
? __pfx_kthread+0x10/0x10
ret_from_fork+0x41/0x60
? __pfx_kthread+0x10/0x10
ret_from_fork_asm+0x1a/0x30
Create a context that can be invoked during transport disconnect and
in which it is safe to manage resources that are not memory
reclaim-safe. Essentially this means that releasing an rpcrdma_ep is
now done completely asynchronously.
Subsequent patches will move the release of transport resources into
this new context.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=218704
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
net/sunrpc/xprtrdma/transport.c | 20 +++++++++++++-
net/sunrpc/xprtrdma/verbs.c | 46 ++++++++++++++++++++-------------
net/sunrpc/xprtrdma/xprt_rdma.h | 5 +++-
3 files changed, 51 insertions(+), 20 deletions(-)
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 29b0562d62e7..237d78c1ec54 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -761,8 +761,12 @@ static struct xprt_class xprt_rdma = {
.netid = { "rdma", "rdma6", "" },
};
+struct workqueue_struct *rpcrdma_release_wq __read_mostly;
+
void xprt_rdma_cleanup(void)
{
+ struct workqueue_struct *wq;
+
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
if (sunrpc_table_header) {
unregister_sysctl_table(sunrpc_table_header);
@@ -772,18 +776,32 @@ void xprt_rdma_cleanup(void)
xprt_unregister_transport(&xprt_rdma);
xprt_unregister_transport(&xprt_rdma_bc);
+
+ wq = rpcrdma_release_wq;
+ rpcrdma_release_wq = NULL;
+ destroy_workqueue(wq);
}
int xprt_rdma_init(void)
{
+ struct workqueue_struct *wq;
int rc;
+ /* provision a WQ that is always unbound and !mem_reclaim */
+ wq = alloc_workqueue("rpcrdma_release", WQ_UNBOUND, 0);
+ if (!wq)
+ return -ENOMEM;
+ rpcrdma_release_wq = wq;
+
rc = xprt_register_transport(&xprt_rdma);
- if (rc)
+ if (rc) {
+ destroy_workqueue(wq);
return rc;
+ }
rc = xprt_register_transport(&xprt_rdma_bc);
if (rc) {
+ destroy_workqueue(wq);
xprt_unregister_transport(&xprt_rdma);
return rc;
}
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index c6d9d94c28ba..f1e4a28325fa 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -73,7 +73,7 @@ static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_ep_get(struct rpcrdma_ep *ep);
-static int rpcrdma_ep_put(struct rpcrdma_ep *ep);
+static void rpcrdma_ep_put(struct rpcrdma_ep *ep);
static struct rpcrdma_regbuf *
rpcrdma_regbuf_alloc_node(size_t size, enum dma_data_direction direction,
int node);
@@ -234,15 +234,15 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
case RDMA_CM_EVENT_ROUTE_RESOLVED:
ep->re_async_rc = 0;
complete(&ep->re_done);
- return 0;
+ break;
case RDMA_CM_EVENT_ADDR_ERROR:
ep->re_async_rc = -EPROTO;
complete(&ep->re_done);
- return 0;
+ break;
case RDMA_CM_EVENT_ROUTE_ERROR:
ep->re_async_rc = -ENETUNREACH;
complete(&ep->re_done);
- return 0;
+ break;
case RDMA_CM_EVENT_DEVICE_REMOVAL:
pr_info("rpcrdma: removing device %s for %pISpc\n",
ep->re_id->device->name, sap);
@@ -269,12 +269,13 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
ep->re_connect_status = -ENOTCONN;
wake_connect_worker:
wake_up_all(&ep->re_connect_wait);
- return 0;
+ break;
case RDMA_CM_EVENT_DISCONNECTED:
ep->re_connect_status = -ECONNABORTED;
disconnected:
rpcrdma_force_disconnect(ep);
- return rpcrdma_ep_put(ep);
+ rpcrdma_ep_put(ep);
+ fallthrough;
default:
break;
}
@@ -328,9 +329,13 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
return ERR_PTR(rc);
}
-static void rpcrdma_ep_destroy(struct kref *kref)
+/* Delayed release of a connection's hardware resources. Releasing
+ * RDMA hardware resources is done in a !MEM_RECLAIM context because
+ * the RDMA core API functions are generally not reclaim-safe.
+ */
+static void rpcrdma_ep_destroy(struct work_struct *work)
{
- struct rpcrdma_ep *ep = container_of(kref, struct rpcrdma_ep, re_kref);
+ struct rpcrdma_ep *ep = container_of(work, struct rpcrdma_ep, re_worker);
if (ep->re_id->qp) {
rdma_destroy_qp(ep->re_id);
@@ -348,22 +353,30 @@ static void rpcrdma_ep_destroy(struct kref *kref)
ib_dealloc_pd(ep->re_pd);
ep->re_pd = NULL;
+ if (ep->re_id)
+ rdma_destroy_id(ep->re_id);
+ ep->re_id = NULL;
+
kfree(ep);
module_put(THIS_MODULE);
}
+static void rpcrdma_ep_release(struct kref *kref)
+{
+ struct rpcrdma_ep *ep = container_of(kref, struct rpcrdma_ep, re_kref);
+
+ INIT_WORK(&ep->re_worker, rpcrdma_ep_destroy);
+ queue_work(rpcrdma_release_wq, &ep->re_worker);
+}
+
static noinline void rpcrdma_ep_get(struct rpcrdma_ep *ep)
{
kref_get(&ep->re_kref);
}
-/* Returns:
- * %0 if @ep still has a positive kref count, or
- * %1 if @ep was destroyed successfully.
- */
-static noinline int rpcrdma_ep_put(struct rpcrdma_ep *ep)
+static noinline void rpcrdma_ep_put(struct rpcrdma_ep *ep)
{
- return kref_put(&ep->re_kref, rpcrdma_ep_destroy);
+ kref_put(&ep->re_kref, rpcrdma_ep_release);
}
static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
@@ -475,7 +488,6 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
out_destroy:
rpcrdma_ep_put(ep);
- rdma_destroy_id(id);
return rc;
}
@@ -566,10 +578,8 @@ void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt)
rpcrdma_mrs_destroy(r_xprt);
rpcrdma_sendctxs_destroy(r_xprt);
- if (rpcrdma_ep_put(ep))
- rdma_destroy_id(id);
-
r_xprt->rx_ep = NULL;
+ rpcrdma_ep_put(ep);
}
/* Fixed-size circular FIFO queue. This implementation is wait-free and
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 08bda29ed953..048d2e329384 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -70,7 +70,6 @@
*/
struct rpcrdma_mr;
struct rpcrdma_ep {
- struct kref re_kref;
struct rdma_cm_id *re_id;
struct ib_pd *re_pd;
unsigned int re_max_rdma_segs;
@@ -100,6 +99,9 @@ struct rpcrdma_ep {
atomic_t re_completion_ids;
char re_write_pad[XDR_UNIT];
+
+ struct kref re_kref;
+ struct work_struct re_worker;
};
/* Pre-allocate extra Work Requests for handling reverse-direction
@@ -583,6 +585,7 @@ void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap);
void xprt_rdma_free_addresses(struct rpc_xprt *xprt);
void xprt_rdma_close(struct rpc_xprt *xprt);
void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq);
+extern struct workqueue_struct *rpcrdma_release_wq;
int xprt_rdma_init(void);
void xprt_rdma_cleanup(void);
--
2.44.0
next prev parent reply other threads:[~2024-04-29 15:25 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-04-29 15:25 [RFC PATCH 0/4] NFS: Fix another 'check_flush_dependency' splat cel
2024-04-29 15:25 ` [RFC PATCH 1/4] xprtrdma: Remove temp allocation of rpcrdma_rep objects cel
2024-04-29 15:25 ` [RFC PATCH 2/4] xprtrdma: Clean up synopsis of frwr_mr_unmap() cel
2024-04-29 15:25 ` cel [this message]
2024-04-29 15:25 ` [RFC PATCH 4/4] xprtrdma: Move MRs to struct rpcrdma_ep cel
2024-04-30 7:26 ` [RFC PATCH 0/4] NFS: Fix another 'check_flush_dependency' splat Zhu Yanjun
2024-04-30 13:42 ` Chuck Lever III
2024-04-30 13:58 ` Zhu Yanjun
2024-04-30 14:13 ` Chuck Lever III
2024-04-30 14:45 ` Zhu Yanjun
2024-04-30 14:52 ` Chuck Lever III
2024-04-30 14:57 ` Zhu Yanjun
2024-06-02 15:40 ` Chuck Lever III
2024-06-02 18:14 ` Zhu Yanjun
2024-06-03 15:59 ` Chuck Lever III
2024-06-03 16:54 ` Zhu Yanjun
2024-06-03 17:06 ` Chuck Lever III
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240429152537.212958-9-cel@kernel.org \
--to=cel@kernel.org \
--cc=chuck.lever@oracle.com \
--cc=linux-nfs@vger.kernel.org \
--cc=linux-rdma@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.