From: James Simmons <jsimmons@infradead.org>
To: Andreas Dilger <adilger@whamcloud.com>,
Oleg Drokin <green@whamcloud.com>, NeilBrown <neilb@suse.de>
Cc: Chris Horn <chris.horn@hpe.com>,
Lustre Development List <lustre-devel@lists.lustre.org>
Subject: [lustre-devel] [PATCH 6/7] lnet: ln_api_mutex deadlocks
Date: Mon, 18 Apr 2022 20:31:03 -0400
Message-ID: <1650328264-8763-7-git-send-email-jsimmons@infradead.org>
In-Reply-To: <1650328264-8763-1-git-send-email-jsimmons@infradead.org>
From: Chris Horn <chris.horn@hpe.com>
LNetNIFini() acquires ln_api_mutex and holds it throughout the various
shutdown routines, some of which wait for LND or discovery threads to
finish. Meanwhile, those threads (LND threads via
lnet_nid2peerni_locked(), the discovery thread via
lnet_peer_data_present()) may need to acquire the same mutex in order
to make progress, so the two sides can deadlock against each other.
Address these potential deadlocks by setting the_lnet.ln_state to
LNET_STATE_STOPPING earlier in LNetNIFini(), and by releasing the
mutex prior to any call into an LND module and before any blocking
wait.
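
In sketch form (mirroring the lib-move.c and peer.c hunks below), each
blocking wait on the shutdown path is now bracketed so the mutex is
not held for the duration of the wait:

        /* e.g. lnet_monitor_thr_stop(): the monitor thread may itself
         * need ln_api_mutex, so drop it while waiting for the thread
         * to signal completion, then retake it for the cleanup work.
         */
        mutex_unlock(&the_lnet.ln_api_mutex);
        wait_for_completion(&the_lnet.ln_mt_signal);
        mutex_lock(&the_lnet.ln_api_mutex);

The same drop/retake bracketing is applied around the lnd_shutdown()
call in api-ni.c; ln_lnd_mutex is still taken there so module unload
remains serialized against the shutdown callback.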
LNetNIInit() is modified to return -ESHUTDOWN if it finds that there
is a concurrent shutdown in progress.
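
Concretely, each configuration entry point now checks ln_state under
the mutex before doing any work (LNetNIInit() tests for
LNET_STATE_STOPPING, the dynamic add/del paths for anything other than
LNET_STATE_RUNNING); the pattern reduces to:

        mutex_lock(&the_lnet.ln_api_mutex);
        if (the_lnet.ln_state != LNET_STATE_RUNNING) {
                mutex_unlock(&the_lnet.ln_api_mutex);
                return -ESHUTDOWN;
        }
        /* ... proceed with the configuration change ... */

so requests that race with LNetNIFini() fail cleanly instead of
blocking on a mutex the shutdown path now releases and retakes.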
HPE-bug-id: LUS-10681
WC-bug-id: https://jira.whamcloud.com/browse/LU-15616
Lustre-commit: 22de0bd145b649768 ("LU-15616 lnet: ln_api_mutex deadlocks")
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Reviewed-on: https://review.whamcloud.com/46727
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Signed-off-by: James Simmons <jsimmons@infradead.org>
---
net/lnet/lnet/api-ni.c | 40 +++++++++++++++++++++++++++++++++++-----
net/lnet/lnet/lib-move.c | 2 ++
net/lnet/lnet/peer.c | 11 ++++++++---
3 files changed, 45 insertions(+), 8 deletions(-)
diff --git a/net/lnet/lnet/api-ni.c b/net/lnet/lnet/api-ni.c
index 1978905..44d5014 100644
--- a/net/lnet/lnet/api-ni.c
+++ b/net/lnet/lnet/api-ni.c
@@ -2244,14 +2244,16 @@ static void lnet_push_target_fini(void)
islo = ni->ni_net->net_lnd->lnd_type == LOLND;
LASSERT(!in_interrupt());
- /* Holding the mutex makes it safe for lnd_shutdown
+ /* Holding the LND mutex makes it safe for lnd_shutdown
* to call module_put(). Module unload cannot finish
* until lnet_unregister_lnd() completes, and that
- * requires the mutex.
+ * requires the LND mutex.
*/
+ mutex_unlock(&the_lnet.ln_api_mutex);
mutex_lock(&the_lnet.ln_lnd_mutex);
net->net_lnd->lnd_shutdown(ni);
mutex_unlock(&the_lnet.ln_lnd_mutex);
+ mutex_lock(&the_lnet.ln_api_mutex);
if (!islo)
CDEBUG(D_LNI, "Removed LNI %s\n",
@@ -2323,7 +2325,8 @@ static void lnet_push_target_fini(void)
/* NB called holding the global mutex */
/* All quiet on the API front */
- LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING);
+ LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING ||
+ the_lnet.ln_state == LNET_STATE_STOPPING);
LASSERT(!the_lnet.ln_refcount);
lnet_net_lock(LNET_LOCK_EX);
@@ -2823,6 +2826,11 @@ void lnet_lib_exit(void)
CDEBUG(D_OTHER, "refs %d\n", the_lnet.ln_refcount);
+ if (the_lnet.ln_state == LNET_STATE_STOPPING) {
+ mutex_unlock(&the_lnet.ln_api_mutex);
+ return -ESHUTDOWN;
+ }
+
if (the_lnet.ln_refcount > 0) {
rc = the_lnet.ln_refcount++;
mutex_unlock(&the_lnet.ln_api_mutex);
@@ -2968,6 +2976,10 @@ void lnet_lib_exit(void)
} else {
LASSERT(!the_lnet.ln_niinit_self);
+ lnet_net_lock(LNET_LOCK_EX);
+ the_lnet.ln_state = LNET_STATE_STOPPING;
+ lnet_net_unlock(LNET_LOCK_EX);
+
lnet_fault_fini();
lnet_router_debugfs_fini();
lnet_monitor_thr_stop();
@@ -3433,6 +3445,10 @@ static int lnet_handle_legacy_ip2nets(char *ip2nets,
lnet_set_tune_defaults(tun);
mutex_lock(&the_lnet.ln_api_mutex);
+ if (the_lnet.ln_state != LNET_STATE_RUNNING) {
+ rc = -ESHUTDOWN;
+ goto out;
+ }
while ((net = list_first_entry_or_null(&net_head,
struct lnet_net,
net_list)) != NULL) {
@@ -3498,8 +3514,10 @@ int lnet_dyn_add_ni(struct lnet_ioctl_config_ni *conf)
lnet_set_tune_defaults(tun);
mutex_lock(&the_lnet.ln_api_mutex);
-
- rc = lnet_add_net_common(net, tun);
+ if (the_lnet.ln_state != LNET_STATE_RUNNING)
+ rc = -ESHUTDOWN;
+ else
+ rc = lnet_add_net_common(net, tun);
mutex_unlock(&the_lnet.ln_api_mutex);
@@ -3522,6 +3540,10 @@ int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf)
return -EINVAL;
mutex_lock(&the_lnet.ln_api_mutex);
+ if (the_lnet.ln_state != LNET_STATE_RUNNING) {
+ rc = -ESHUTDOWN;
+ goto unlock_api_mutex;
+ }
lnet_net_lock(0);
@@ -3615,6 +3637,10 @@ int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf)
return rc == 0 ? -EINVAL : rc;
mutex_lock(&the_lnet.ln_api_mutex);
+ if (the_lnet.ln_state != LNET_STATE_RUNNING) {
+ rc = -ESHUTDOWN;
+ goto out_unlock_clean;
+ }
if (rc > 1) {
rc = -EINVAL; /* only add one network per call */
@@ -3668,6 +3694,10 @@ int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf)
return -EINVAL;
mutex_lock(&the_lnet.ln_api_mutex);
+ if (the_lnet.ln_state != LNET_STATE_RUNNING) {
+ rc = -ESHUTDOWN;
+ goto out;
+ }
lnet_net_lock(0);
diff --git a/net/lnet/lnet/lib-move.c b/net/lnet/lnet/lib-move.c
index 0b3986e..0496bf5 100644
--- a/net/lnet/lnet/lib-move.c
+++ b/net/lnet/lnet/lib-move.c
@@ -3872,7 +3872,9 @@ void lnet_monitor_thr_stop(void)
complete(&the_lnet.ln_mt_wait_complete);
/* block until monitor thread signals that it's done */
+ mutex_unlock(&the_lnet.ln_api_mutex);
wait_for_completion(&the_lnet.ln_mt_signal);
+ mutex_lock(&the_lnet.ln_api_mutex);
LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN);
/* perform cleanup tasks */
diff --git a/net/lnet/lnet/peer.c b/net/lnet/lnet/peer.c
index 98f71dd..714326a 100644
--- a/net/lnet/lnet/peer.c
+++ b/net/lnet/lnet/peer.c
@@ -3244,12 +3244,15 @@ static int lnet_peer_deletion(struct lnet_peer *lp)
if (lp->lp_state & LNET_PEER_MARK_DELETED)
return 0;
- if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING)
- return -ESHUTDOWN;
-
spin_unlock(&lp->lp_lock);
mutex_lock(&the_lnet.ln_api_mutex);
+ if (the_lnet.ln_state != LNET_STATE_RUNNING ||
+ the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) {
+ mutex_unlock(&the_lnet.ln_api_mutex);
+ spin_lock(&lp->lp_lock);
+ return -ESHUTDOWN;
+ }
lnet_net_lock(LNET_LOCK_EX);
/* remove the peer from the discovery work
@@ -3929,8 +3932,10 @@ void lnet_peer_discovery_stop(void)
else
wake_up(&the_lnet.ln_dc_waitq);
+ mutex_unlock(&the_lnet.ln_api_mutex);
wait_event(the_lnet.ln_dc_waitq,
the_lnet.ln_dc_state == LNET_DC_STATE_SHUTDOWN);
+ mutex_lock(&the_lnet.ln_api_mutex);
LASSERT(list_empty(&the_lnet.ln_dc_request));
LASSERT(list_empty(&the_lnet.ln_dc_working));
--
1.8.3.1