All of lore.kernel.org
 help / color / mirror / Atom feed
From: Oleg Drokin <green@linuxhacker.ru>
To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	linux-kernel@vger.kernel.org, devel@driverdev.osuosl.org
Cc: Andriy Skulysh <Andriy_Skulysh@xyratex.com>,
	Oleg Drokin <oleg.drokin@intel.com>
Subject: [PATCH 06/18] staging/lustre/mgc: mgc import reconnect race
Date: Sun, 22 Jun 2014 21:32:10 -0400	[thread overview]
Message-ID: <1403487142-4880-7-git-send-email-green@linuxhacker.ru> (raw)
In-Reply-To: <1403487142-4880-1-git-send-email-green@linuxhacker.ru>

From: Andriy Skulysh <Andriy_Skulysh@xyratex.com>

The mgc import can be reconnected either by the pinger or by
ptlrpc_reconnect_import().
ptlrpc_invalidate_import() isn't protected against
alteration of the imp_invalid state. The import can be
reconnected by the pinger, which sets imp_invalid
to false. Thus LASSERT(imp->imp_invalid) fails
in ptlrpc_invalidate_import().

It is safe to call ptlrpc_invalidate_import() when
import is deactivated, but ptlrpc_reconnect_import() doesn't
deactivate it.
Let's use only the pinger, when available, to reconnect the import.

Signed-off-by: Andriy Skulysh <Andriy_Skulysh@xyratex.com>
Reviewed-on: http://review.whamcloud.com/9967
Xyratex-bug-id: MRP-1746
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-4913
Reviewed-by: Mike Pershin <mike.pershin@intel.com>
Reviewed-by: Lai Siyao <lai.siyao@intel.com>
Signed-off-by: Oleg Drokin <oleg.drokin@intel.com>
---
 drivers/staging/lustre/lustre/obdclass/obd_mount.c | 13 ++-----
 drivers/staging/lustre/lustre/ptlrpc/import.c      | 41 +++++++++++++++++-----
 drivers/staging/lustre/lustre/ptlrpc/pinger.c      |  5 +++
 3 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/drivers/staging/lustre/lustre/obdclass/obd_mount.c b/drivers/staging/lustre/lustre/obdclass/obd_mount.c
index a034aee..03d9a6a 100644
--- a/drivers/staging/lustre/lustre/obdclass/obd_mount.c
+++ b/drivers/staging/lustre/lustre/obdclass/obd_mount.c
@@ -219,7 +219,6 @@ int lustre_start_mgc(struct super_block *sb)
 	lnet_nid_t nid;
 	char *mgcname = NULL, *niduuid = NULL, *mgssec = NULL;
 	char *ptr;
-	int recov_bk;
 	int rc = 0, i = 0, j, len;
 
 	LASSERT(lsi->lsi_lmd);
@@ -269,6 +268,8 @@ int lustre_start_mgc(struct super_block *sb)
 
 	obd = class_name2obd(mgcname);
 	if (obd && !obd->obd_stopping) {
+		int recov_bk;
+
 		rc = obd_set_info_async(NULL, obd->obd_self_export,
 					strlen(KEY_MGSSEC), KEY_MGSSEC,
 					strlen(mgssec), mgssec, NULL);
@@ -429,16 +430,6 @@ int lustre_start_mgc(struct super_block *sb)
 	   so we know when we can get rid of the mgc. */
 	atomic_set(&obd->u.cli.cl_mgc_refcount, 1);
 
-	/* Try all connections, but only once. */
-	recov_bk = 1;
-	rc = obd_set_info_async(NULL, obd->obd_self_export,
-				sizeof(KEY_INIT_RECOV_BACKUP),
-				KEY_INIT_RECOV_BACKUP,
-				sizeof(recov_bk), &recov_bk, NULL);
-	if (rc)
-		/* nonfatal */
-		CWARN("can't set %s %d\n", KEY_INIT_RECOV_BACKUP, rc);
-
 	/* We connect to the MGS at setup, and don't disconnect until cleanup */
 	data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT |
 				  OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV |
diff --git a/drivers/staging/lustre/lustre/ptlrpc/import.c b/drivers/staging/lustre/lustre/ptlrpc/import.c
index 8573f32..b4def8a 100644
--- a/drivers/staging/lustre/lustre/ptlrpc/import.c
+++ b/drivers/staging/lustre/lustre/ptlrpc/import.c
@@ -275,6 +275,7 @@ void ptlrpc_invalidate_import(struct obd_import *imp)
 	if (!imp->imp_invalid || imp->imp_obd->obd_no_recov)
 		ptlrpc_deactivate_import(imp);
 
+	CFS_FAIL_TIMEOUT(OBD_FAIL_MGS_CONNECT_NET, 3 * cfs_fail_val / 2);
 	LASSERT(imp->imp_invalid);
 
 	/* Wait forever until inflight == 0. We really can't do it another
@@ -392,6 +393,19 @@ void ptlrpc_activate_import(struct obd_import *imp)
 }
 EXPORT_SYMBOL(ptlrpc_activate_import);
 
+static void ptlrpc_pinger_force(struct obd_import *imp)
+{
+	CDEBUG(D_HA, "%s: waking up pinger s:%s\n", obd2cli_tgt(imp->imp_obd),
+	       ptlrpc_import_state_name(imp->imp_state));
+
+	spin_lock(&imp->imp_lock);
+	imp->imp_force_verify = 1;
+	spin_unlock(&imp->imp_lock);
+
+	if (imp->imp_state != LUSTRE_IMP_CONNECTING)
+		ptlrpc_pinger_wake_up();
+}
+
 void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt)
 {
 	LASSERT(!imp->imp_dlm_fake);
@@ -406,20 +420,30 @@ void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt)
 			ptlrpc_deactivate_import(imp);
 		}
 
-		CDEBUG(D_HA, "%s: waking up pinger\n",
-		       obd2cli_tgt(imp->imp_obd));
-
-		spin_lock(&imp->imp_lock);
-		imp->imp_force_verify = 1;
-		spin_unlock(&imp->imp_lock);
-
-		ptlrpc_pinger_wake_up();
+		ptlrpc_pinger_force(imp);
 	}
 }
 EXPORT_SYMBOL(ptlrpc_fail_import);
 
 int ptlrpc_reconnect_import(struct obd_import *imp)
 {
+#ifdef ENABLE_PINGER
+	struct l_wait_info lwi;
+	int secs = cfs_time_seconds(obd_timeout);
+	int rc;
+
+	ptlrpc_pinger_force(imp);
+
+	CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n",
+	       obd2cli_tgt(imp->imp_obd), secs);
+
+	lwi = LWI_TIMEOUT(secs, NULL, NULL);
+	rc = l_wait_event(imp->imp_recovery_waitq,
+			  !ptlrpc_import_in_recovery(imp), &lwi);
+	CDEBUG(D_HA, "%s: recovery finished s:%s\n", obd2cli_tgt(imp->imp_obd),
+	       ptlrpc_import_state_name(imp->imp_state));
+	return rc;
+#else
 	ptlrpc_set_import_discon(imp, 0);
 	/* Force a new connect attempt */
 	ptlrpc_invalidate_import(imp);
@@ -444,6 +468,7 @@ int ptlrpc_reconnect_import(struct obd_import *imp)
 	/* Attempt a new connect */
 	ptlrpc_recover_import(imp, NULL, 0);
 	return 0;
+#endif
 }
 EXPORT_SYMBOL(ptlrpc_reconnect_import);
 
diff --git a/drivers/staging/lustre/lustre/ptlrpc/pinger.c b/drivers/staging/lustre/lustre/ptlrpc/pinger.c
index 38099d9..2898087 100644
--- a/drivers/staging/lustre/lustre/ptlrpc/pinger.c
+++ b/drivers/staging/lustre/lustre/ptlrpc/pinger.c
@@ -224,6 +224,11 @@ static void ptlrpc_pinger_process_import(struct obd_import *imp,
 		       "or recovery disabled: %s)\n",
 		       imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd),
 		       ptlrpc_import_state_name(level));
+		if (force) {
+			spin_lock(&imp->imp_lock);
+			imp->imp_force_verify = 1;
+			spin_unlock(&imp->imp_lock);
+		}
 	} else if ((imp->imp_pingable && !suppress) || force_next || force) {
 		ptlrpc_ping(imp);
 	}
-- 
1.9.0


  parent reply	other threads:[~2014-06-23  1:32 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-06-23  1:32 [PATCH 00/18] Lustre fixes Oleg Drokin
2014-06-23  1:32 ` [PATCH 01/18] staging/lustre/libcfs: revert changes to libcfs_sock_ioctl Oleg Drokin
2014-06-23  1:32 ` [PATCH 02/18] staging/lustre/ptlrpc: Protect request buffer changing Oleg Drokin
2014-06-23  1:32 ` [PATCH 03/18] staging/lustre/llite: Only kill SGID/SUID bits Oleg Drokin
2014-06-23  1:32 ` [PATCH 04/18] staging/lustre: fix wrong ldlm flags type used Oleg Drokin
2014-06-23  1:32 ` [PATCH 05/18] staging/lustre/ptlrpc: fix NULL pointer dereference of {exp,imp}_obd Oleg Drokin
2014-06-23  1:32 ` Oleg Drokin [this message]
2014-06-23  1:32 ` [PATCH 07/18] staging/lustre/osc: get rid of old checksum initial value Oleg Drokin
2014-06-23  1:32 ` [PATCH 08/18] staging/lustre/ptlrpc: race at req processing Oleg Drokin
2014-06-23  1:32 ` [PATCH 09/18] staging/lustre/mgc: replace hard-coded MGC_ENQUEUE_LIMIT value Oleg Drokin
2014-06-23  1:32 ` [PATCH 10/18] staging/lustre/ptlrpc: Add schedule point to ptlrpc_check_set() Oleg Drokin
2014-06-23  1:32 ` [PATCH 11/18] staging/lustre/obdclass: Fix uninitialized variables Oleg Drokin
2014-06-23  1:32 ` [PATCH 12/18] staging/lustre/osc: osc_extent_truncate()) ASSERTION( !ext->oe_urgent ) failed Oleg Drokin
2014-06-23  1:32 ` [PATCH 13/18] staging/lustre/llite: Fix uninitialized variable Oleg Drokin
2014-06-23  1:32 ` [PATCH 14/18] staging/lustre/ptlrpc: unlink request buffer correctly Oleg Drokin
2014-06-23  1:32 ` [PATCH 15/18] staging/lustre/obdclass: runtime load lustre client when needed Oleg Drokin
2014-06-23  1:32 ` [PATCH 16/18] staging/lustre/vvp: release mmap_sem in error case Oleg Drokin
2014-06-23  1:32 ` [PATCH 17/18] staging/lustre/llite: fix a flag bug of vvp_io_kernel_fault() Oleg Drokin
2014-06-23  1:32 ` [PATCH 18/18] staging/lustre/lnet: abort messages whose MD has been unlinked Oleg Drokin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1403487142-4880-7-git-send-email-green@linuxhacker.ru \
    --to=green@linuxhacker.ru \
    --cc=Andriy_Skulysh@xyratex.com \
    --cc=devel@driverdev.osuosl.org \
    --cc=gregkh@linuxfoundation.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=oleg.drokin@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.