From: cel@kernel.org
To: <linux-nfs@vger.kernel.org>, <linux-rdma@vger.kernel.org>
Cc: Chuck Lever <chuck.lever@oracle.com>
Subject: [RFC PATCH 3/4] xprtrdma: Delay releasing connection hardware resources
Date: Mon, 29 Apr 2024 11:25:41 -0400 [thread overview]
Message-ID: <20240429152537.212958-9-cel@kernel.org> (raw)
In-Reply-To: <20240429152537.212958-6-cel@kernel.org>
From: Chuck Lever <chuck.lever@oracle.com>
xprtiod_workqueue is a MEM_RECLAIM-enabled workqueue. However, the
RDMA core API functions are not memory reclaim-safe. Keeping RDMA core
calls off MEM_RECLAIM workqueues was partially accomplished by commit
6b1eb3b22272 ("SUNRPC: Replace the use of the xprtiod WQ in rpcrdma").
This commit addressed the issue in the connect path, but not in the
disconnect path. Thus sometimes a transport disconnect results in
this splat:
workqueue: WQ_MEM_RECLAIM xprtiod:xprt_autoclose [sunrpc] is flushing !WQ_MEM_RECLAIM events_highpri:rpcrdma_mr_refresh_worker [rpcrdma]
WARNING: CPU: 1 PID: 20378 at kernel/workqueue.c:3728 check_flush_dependency+0x101/0x120
? check_flush_dependency+0x101/0x120
? report_bug+0x175/0x1a0
? handle_bug+0x44/0x90
? exc_invalid_op+0x1c/0x70
? asm_exc_invalid_op+0x1f/0x30
? __pfx_rpcrdma_mr_refresh_worker+0x10/0x10 [rpcrdma aefd3d1b298311368fa14fa93ae5fb3818c3aeac]
? check_flush_dependency+0x101/0x120
__flush_work.isra.0+0x20a/0x290
__cancel_work_sync+0x129/0x1c0
cancel_work_sync+0x14/0x20
rpcrdma_xprt_disconnect+0x229/0x3f0 [rpcrdma aefd3d1b298311368fa14fa93ae5fb3818c3aeac]
xprt_rdma_close+0x16/0x40 [rpcrdma aefd3d1b298311368fa14fa93ae5fb3818c3aeac]
xprt_autoclose+0x63/0x110 [sunrpc a04d701bce94b5a8fb541cafbe1a489d6b1ab5b3]
process_one_work+0x19e/0x3f0
worker_thread+0x340/0x510
? __pfx_worker_thread+0x10/0x10
kthread+0xf7/0x130
? __pfx_kthread+0x10/0x10
ret_from_fork+0x41/0x60
? __pfx_kthread+0x10/0x10
ret_from_fork_asm+0x1a/0x30
Create a context, invocable during transport disconnect, in which it
is safe to manage resources that are not memory reclaim-safe.
Essentially this means that releasing an rpcrdma_ep is now done
completely asynchronously.
Subsequent patches will move the release of transport resources into
this new context.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=218704
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
net/sunrpc/xprtrdma/transport.c | 20 +++++++++++++-
net/sunrpc/xprtrdma/verbs.c | 46 ++++++++++++++++++++-------------
net/sunrpc/xprtrdma/xprt_rdma.h | 5 +++-
3 files changed, 51 insertions(+), 20 deletions(-)
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 29b0562d62e7..237d78c1ec54 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -761,8 +761,12 @@ static struct xprt_class xprt_rdma = {
.netid = { "rdma", "rdma6", "" },
};
+struct workqueue_struct *rpcrdma_release_wq __read_mostly;
+
void xprt_rdma_cleanup(void)
{
+ struct workqueue_struct *wq;
+
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
if (sunrpc_table_header) {
unregister_sysctl_table(sunrpc_table_header);
@@ -772,18 +776,32 @@ void xprt_rdma_cleanup(void)
xprt_unregister_transport(&xprt_rdma);
xprt_unregister_transport(&xprt_rdma_bc);
+
+ wq = rpcrdma_release_wq;
+ rpcrdma_release_wq = NULL;
+ destroy_workqueue(wq);
}
int xprt_rdma_init(void)
{
+ struct workqueue_struct *wq;
int rc;
+ /* provision a WQ that is always unbound and !mem_reclaim */
+ wq = alloc_workqueue("rpcrdma_release", WQ_UNBOUND, 0);
+ if (!wq)
+ return -ENOMEM;
+ rpcrdma_release_wq = wq;
+
rc = xprt_register_transport(&xprt_rdma);
- if (rc)
+ if (rc) {
+ destroy_workqueue(wq);
return rc;
+ }
rc = xprt_register_transport(&xprt_rdma_bc);
if (rc) {
+ destroy_workqueue(wq);
xprt_unregister_transport(&xprt_rdma);
return rc;
}
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index c6d9d94c28ba..f1e4a28325fa 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -73,7 +73,7 @@ static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_ep_get(struct rpcrdma_ep *ep);
-static int rpcrdma_ep_put(struct rpcrdma_ep *ep);
+static void rpcrdma_ep_put(struct rpcrdma_ep *ep);
static struct rpcrdma_regbuf *
rpcrdma_regbuf_alloc_node(size_t size, enum dma_data_direction direction,
int node);
@@ -234,15 +234,15 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
case RDMA_CM_EVENT_ROUTE_RESOLVED:
ep->re_async_rc = 0;
complete(&ep->re_done);
- return 0;
+ break;
case RDMA_CM_EVENT_ADDR_ERROR:
ep->re_async_rc = -EPROTO;
complete(&ep->re_done);
- return 0;
+ break;
case RDMA_CM_EVENT_ROUTE_ERROR:
ep->re_async_rc = -ENETUNREACH;
complete(&ep->re_done);
- return 0;
+ break;
case RDMA_CM_EVENT_DEVICE_REMOVAL:
pr_info("rpcrdma: removing device %s for %pISpc\n",
ep->re_id->device->name, sap);
@@ -269,12 +269,13 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
ep->re_connect_status = -ENOTCONN;
wake_connect_worker:
wake_up_all(&ep->re_connect_wait);
- return 0;
+ break;
case RDMA_CM_EVENT_DISCONNECTED:
ep->re_connect_status = -ECONNABORTED;
disconnected:
rpcrdma_force_disconnect(ep);
- return rpcrdma_ep_put(ep);
+ rpcrdma_ep_put(ep);
+ fallthrough;
default:
break;
}
@@ -328,9 +329,13 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
return ERR_PTR(rc);
}
-static void rpcrdma_ep_destroy(struct kref *kref)
+/* Delayed release of a connection's hardware resources. Releasing
+ * RDMA hardware resources is done in a !MEM_RECLAIM context because
+ * the RDMA core API functions are generally not reclaim-safe.
+ */
+static void rpcrdma_ep_destroy(struct work_struct *work)
{
- struct rpcrdma_ep *ep = container_of(kref, struct rpcrdma_ep, re_kref);
+ struct rpcrdma_ep *ep = container_of(work, struct rpcrdma_ep, re_worker);
if (ep->re_id->qp) {
rdma_destroy_qp(ep->re_id);
@@ -348,22 +353,30 @@ static void rpcrdma_ep_destroy(struct kref *kref)
ib_dealloc_pd(ep->re_pd);
ep->re_pd = NULL;
+ if (ep->re_id)
+ rdma_destroy_id(ep->re_id);
+ ep->re_id = NULL;
+
kfree(ep);
module_put(THIS_MODULE);
}
+static void rpcrdma_ep_release(struct kref *kref)
+{
+ struct rpcrdma_ep *ep = container_of(kref, struct rpcrdma_ep, re_kref);
+
+ INIT_WORK(&ep->re_worker, rpcrdma_ep_destroy);
+ queue_work(rpcrdma_release_wq, &ep->re_worker);
+}
+
static noinline void rpcrdma_ep_get(struct rpcrdma_ep *ep)
{
kref_get(&ep->re_kref);
}
-/* Returns:
- * %0 if @ep still has a positive kref count, or
- * %1 if @ep was destroyed successfully.
- */
-static noinline int rpcrdma_ep_put(struct rpcrdma_ep *ep)
+static noinline void rpcrdma_ep_put(struct rpcrdma_ep *ep)
{
- return kref_put(&ep->re_kref, rpcrdma_ep_destroy);
+ kref_put(&ep->re_kref, rpcrdma_ep_release);
}
static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
@@ -475,7 +488,6 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
out_destroy:
rpcrdma_ep_put(ep);
- rdma_destroy_id(id);
return rc;
}
@@ -566,10 +578,8 @@ void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt)
rpcrdma_mrs_destroy(r_xprt);
rpcrdma_sendctxs_destroy(r_xprt);
- if (rpcrdma_ep_put(ep))
- rdma_destroy_id(id);
-
r_xprt->rx_ep = NULL;
+ rpcrdma_ep_put(ep);
}
/* Fixed-size circular FIFO queue. This implementation is wait-free and
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 08bda29ed953..048d2e329384 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -70,7 +70,6 @@
*/
struct rpcrdma_mr;
struct rpcrdma_ep {
- struct kref re_kref;
struct rdma_cm_id *re_id;
struct ib_pd *re_pd;
unsigned int re_max_rdma_segs;
@@ -100,6 +99,9 @@ struct rpcrdma_ep {
atomic_t re_completion_ids;
char re_write_pad[XDR_UNIT];
+
+ struct kref re_kref;
+ struct work_struct re_worker;
};
/* Pre-allocate extra Work Requests for handling reverse-direction
@@ -583,6 +585,7 @@ void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap);
void xprt_rdma_free_addresses(struct rpc_xprt *xprt);
void xprt_rdma_close(struct rpc_xprt *xprt);
void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq);
+extern struct workqueue_struct *rpcrdma_release_wq;
int xprt_rdma_init(void);
void xprt_rdma_cleanup(void);
--
2.44.0
next prev parent reply other threads:[~2024-04-29 15:25 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-04-29 15:25 [RFC PATCH 0/4] NFS: Fix another 'check_flush_dependency' splat cel
2024-04-29 15:25 ` [RFC PATCH 1/4] xprtrdma: Remove temp allocation of rpcrdma_rep objects cel
2024-04-29 15:25 ` [RFC PATCH 2/4] xprtrdma: Clean up synopsis of frwr_mr_unmap() cel
2024-04-29 15:25 ` cel [this message]
2024-04-29 15:25 ` [RFC PATCH 4/4] xprtrdma: Move MRs to struct rpcrdma_ep cel
2024-04-30 7:26 ` [RFC PATCH 0/4] NFS: Fix another 'check_flush_dependency' splat Zhu Yanjun
2024-04-30 13:42 ` Chuck Lever III
2024-04-30 13:58 ` Zhu Yanjun
2024-04-30 14:13 ` Chuck Lever III
2024-04-30 14:45 ` Zhu Yanjun
2024-04-30 14:52 ` Chuck Lever III
2024-04-30 14:57 ` Zhu Yanjun
2024-06-02 15:40 ` Chuck Lever III
2024-06-02 18:14 ` Zhu Yanjun
2024-06-03 15:59 ` Chuck Lever III
2024-06-03 16:54 ` Zhu Yanjun
2024-06-03 17:06 ` Chuck Lever III
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240429152537.212958-9-cel@kernel.org \
--to=cel@kernel.org \
--cc=chuck.lever@oracle.com \
--cc=linux-nfs@vger.kernel.org \
--cc=linux-rdma@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox