lustre-devel-lustre.org archive mirror
 help / color / mirror / Atom feed
From: James Simmons <jsimmons@infradead.org>
To: Andreas Dilger <adilger@whamcloud.com>,
	Oleg Drokin <green@whamcloud.com>, NeilBrown <neilb@suse.de>
Cc: Chris Horn <chris.horn@hpe.com>,
	Lustre Development List <lustre-devel@lists.lustre.org>
Subject: [lustre-devel] [PATCH 6/7] lnet: ln_api_mutex deadlocks
Date: Mon, 18 Apr 2022 20:31:03 -0400	[thread overview]
Message-ID: <1650328264-8763-7-git-send-email-jsimmons@infradead.org> (raw)
In-Reply-To: <1650328264-8763-1-git-send-email-jsimmons@infradead.org>

From: Chris Horn <chris.horn@hpe.com>

LNetNIFini() acquires the ln_api_mutex and holds onto it throughout
various shutdown routines. Meanwhile, LND threads (via
lnet_nid2peerni_locked()) or the discovery thread (via
lnet_peer_data_present()) may need to acquire this mutex in order to
progress.

Address these potential deadlocks by setting the_lnet.ln_state to
LNET_STATE_STOPPING earlier in LNetNIFini(), and release the mutex
prior to any call into LND module or before any wait.

LNetNIInit() is modified to return -ESHUTDOWN if it finds that there
is a concurrent shutdown in progress.

HPE-bug-id: LUS-10681
WC-bug-id: https://jira.whamcloud.com/browse/LU-15616
Lustre-commit: 22de0bd145b649768 ("LU-15616 lnet: ln_api_mutex deadlocks")
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Reviewed-on: https://review.whamcloud.com/46727
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Signed-off-by: James Simmons <jsimmons@infradead.org>
---
 net/lnet/lnet/api-ni.c   | 40 +++++++++++++++++++++++++++++++++++-----
 net/lnet/lnet/lib-move.c |  2 ++
 net/lnet/lnet/peer.c     | 11 ++++++++---
 3 files changed, 45 insertions(+), 8 deletions(-)

diff --git a/net/lnet/lnet/api-ni.c b/net/lnet/lnet/api-ni.c
index 1978905..44d5014 100644
--- a/net/lnet/lnet/api-ni.c
+++ b/net/lnet/lnet/api-ni.c
@@ -2244,14 +2244,16 @@ static void lnet_push_target_fini(void)
 		islo = ni->ni_net->net_lnd->lnd_type == LOLND;
 
 		LASSERT(!in_interrupt());
-		/* Holding the mutex makes it safe for lnd_shutdown
+		/* Holding the LND mutex makes it safe for lnd_shutdown
 		 * to call module_put(). Module unload cannot finish
 		 * until lnet_unregister_lnd() completes, and that
-		 * requires the mutex.
+		 * requires the LND mutex.
 		 */
+		mutex_unlock(&the_lnet.ln_api_mutex);
 		mutex_lock(&the_lnet.ln_lnd_mutex);
 		net->net_lnd->lnd_shutdown(ni);
 		mutex_unlock(&the_lnet.ln_lnd_mutex);
+		mutex_lock(&the_lnet.ln_api_mutex);
 
 		if (!islo)
 			CDEBUG(D_LNI, "Removed LNI %s\n",
@@ -2323,7 +2325,8 @@ static void lnet_push_target_fini(void)
 	/* NB called holding the global mutex */
 
 	/* All quiet on the API front */
-	LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING);
+	LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING ||
+		the_lnet.ln_state == LNET_STATE_STOPPING);
 	LASSERT(!the_lnet.ln_refcount);
 
 	lnet_net_lock(LNET_LOCK_EX);
@@ -2823,6 +2826,11 @@ void lnet_lib_exit(void)
 
 	CDEBUG(D_OTHER, "refs %d\n", the_lnet.ln_refcount);
 
+	if (the_lnet.ln_state == LNET_STATE_STOPPING) {
+		mutex_unlock(&the_lnet.ln_api_mutex);
+		return -ESHUTDOWN;
+	}
+
 	if (the_lnet.ln_refcount > 0) {
 		rc = the_lnet.ln_refcount++;
 		mutex_unlock(&the_lnet.ln_api_mutex);
@@ -2968,6 +2976,10 @@ void lnet_lib_exit(void)
 	} else {
 		LASSERT(!the_lnet.ln_niinit_self);
 
+		lnet_net_lock(LNET_LOCK_EX);
+		the_lnet.ln_state = LNET_STATE_STOPPING;
+		lnet_net_unlock(LNET_LOCK_EX);
+
 		lnet_fault_fini();
 		lnet_router_debugfs_fini();
 		lnet_monitor_thr_stop();
@@ -3433,6 +3445,10 @@ static int lnet_handle_legacy_ip2nets(char *ip2nets,
 	lnet_set_tune_defaults(tun);
 
 	mutex_lock(&the_lnet.ln_api_mutex);
+	if (the_lnet.ln_state != LNET_STATE_RUNNING) {
+		rc = -ESHUTDOWN;
+		goto out;
+	}
 	while ((net = list_first_entry_or_null(&net_head,
 					       struct lnet_net,
 					       net_list)) != NULL) {
@@ -3498,8 +3514,10 @@ int lnet_dyn_add_ni(struct lnet_ioctl_config_ni *conf)
 	lnet_set_tune_defaults(tun);
 
 	mutex_lock(&the_lnet.ln_api_mutex);
-
-	rc = lnet_add_net_common(net, tun);
+	if (the_lnet.ln_state != LNET_STATE_RUNNING)
+		rc = -ESHUTDOWN;
+	else
+		rc = lnet_add_net_common(net, tun);
 
 	mutex_unlock(&the_lnet.ln_api_mutex);
 
@@ -3522,6 +3540,10 @@ int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf)
 		return -EINVAL;
 
 	mutex_lock(&the_lnet.ln_api_mutex);
+	if (the_lnet.ln_state != LNET_STATE_RUNNING) {
+		rc = -ESHUTDOWN;
+		goto unlock_api_mutex;
+	}
 
 	lnet_net_lock(0);
 
@@ -3615,6 +3637,10 @@ int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf)
 		return rc == 0 ? -EINVAL : rc;
 
 	mutex_lock(&the_lnet.ln_api_mutex);
+	if (the_lnet.ln_state != LNET_STATE_RUNNING) {
+		rc = -ESHUTDOWN;
+		goto out_unlock_clean;
+	}
 
 	if (rc > 1) {
 		rc = -EINVAL; /* only add one network per call */
@@ -3668,6 +3694,10 @@ int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf)
 		return -EINVAL;
 
 	mutex_lock(&the_lnet.ln_api_mutex);
+	if (the_lnet.ln_state != LNET_STATE_RUNNING) {
+		rc = -ESHUTDOWN;
+		goto out;
+	}
 
 	lnet_net_lock(0);
 
diff --git a/net/lnet/lnet/lib-move.c b/net/lnet/lnet/lib-move.c
index 0b3986e..0496bf5 100644
--- a/net/lnet/lnet/lib-move.c
+++ b/net/lnet/lnet/lib-move.c
@@ -3872,7 +3872,9 @@ void lnet_monitor_thr_stop(void)
 	complete(&the_lnet.ln_mt_wait_complete);
 
 	/* block until monitor thread signals that it's done */
+	mutex_unlock(&the_lnet.ln_api_mutex);
 	wait_for_completion(&the_lnet.ln_mt_signal);
+	mutex_lock(&the_lnet.ln_api_mutex);
 	LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN);
 
 	/* perform cleanup tasks */
diff --git a/net/lnet/lnet/peer.c b/net/lnet/lnet/peer.c
index 98f71dd..714326a 100644
--- a/net/lnet/lnet/peer.c
+++ b/net/lnet/lnet/peer.c
@@ -3244,12 +3244,15 @@ static int lnet_peer_deletion(struct lnet_peer *lp)
 	if (lp->lp_state & LNET_PEER_MARK_DELETED)
 		return 0;
 
-	if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING)
-		return -ESHUTDOWN;
-
 	spin_unlock(&lp->lp_lock);
 
 	mutex_lock(&the_lnet.ln_api_mutex);
+	if (the_lnet.ln_state != LNET_STATE_RUNNING ||
+	    the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) {
+		mutex_unlock(&the_lnet.ln_api_mutex);
+		spin_lock(&lp->lp_lock);
+		return -ESHUTDOWN;
+	}
 
 	lnet_net_lock(LNET_LOCK_EX);
 	/* remove the peer from the discovery work
@@ -3929,8 +3932,10 @@ void lnet_peer_discovery_stop(void)
 	else
 		wake_up(&the_lnet.ln_dc_waitq);
 
+	mutex_unlock(&the_lnet.ln_api_mutex);
 	wait_event(the_lnet.ln_dc_waitq,
 		   the_lnet.ln_dc_state == LNET_DC_STATE_SHUTDOWN);
+	mutex_lock(&the_lnet.ln_api_mutex);
 
 	LASSERT(list_empty(&the_lnet.ln_dc_request));
 	LASSERT(list_empty(&the_lnet.ln_dc_working));
-- 
1.8.3.1

_______________________________________________
lustre-devel mailing list
lustre-devel@lists.lustre.org
http://lists.lustre.org/listinfo.cgi/lustre-devel-lustre.org

  parent reply	other threads:[~2022-04-19  0:31 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-04-19  0:30 [lustre-devel] [PATCH 0/7] lustre: OpenSFS updates April 18, 2022 James Simmons
2022-04-19  0:30 ` [lustre-devel] [PATCH 1/7] lustre: ptlrpc: unregister reply buffer on rq_err James Simmons
2022-04-19  0:30 ` [lustre-devel] [PATCH 2/7] lustre: llite: Fix use of uninitialized fields James Simmons
2022-04-19  0:31 ` [lustre-devel] [PATCH 3/7] lustre: lov: remove lo_trunc_stripeno James Simmons
2022-04-19  0:31 ` [lustre-devel] [PATCH 4/7] lustre: lmv: change default hash back to fnv_1a_64 James Simmons
2022-04-19  0:31 ` [lustre-devel] [PATCH 5/7] lnet: only update gateway NI status on discovery James Simmons
2022-04-19  0:31 ` James Simmons [this message]
2022-04-19  0:31 ` [lustre-devel] [PATCH 7/7] lustre: clio: Disable lockless for DIO with O_APPEND James Simmons

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1650328264-8763-7-git-send-email-jsimmons@infradead.org \
    --to=jsimmons@infradead.org \
    --cc=adilger@whamcloud.com \
    --cc=chris.horn@hpe.com \
    --cc=green@whamcloud.com \
    --cc=lustre-devel@lists.lustre.org \
    --cc=neilb@suse.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).