cluster-devel.redhat.com archive mirror
 help / color / mirror / Atom feed
From: jbrassow@sourceware.org <jbrassow@sourceware.org>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
Date: 27 Sep 2007 20:31:24 -0000	[thread overview]
Message-ID: <20070927203124.27602.qmail@sourceware.org> (raw)

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-09-27 20:31:20

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c 

Log message:
	Bug 290821: cmirror write path appears deadlocked after recovery ...
	
	In some device failure cases, regions must be marked 'out-of-sync'.
	This was causing a subsequent write to block because it thought the
	region had not yet been recovered, when in fact it had just been
	put out-of-sync due to a failing device.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.53&r2=1.1.2.54
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.15&r2=1.1.2.16
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.39&r2=1.1.2.40

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/09/26 03:15:40	1.1.2.53
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/09/27 20:31:18	1.1.2.54
@@ -773,6 +773,7 @@
 	struct region_state *rs, *tmp_rs;
 	struct log_c *lc = (struct log_c *) log->context;
 
+	DMDEBUG("cluster_postsuspend");
 	spin_lock(&lc->state_lock);
 	if (!list_empty(&lc->mark_waiting)) {
 		DMERR("Mark requests remain at postsuspend!");
@@ -833,6 +834,7 @@
 	struct log_c *lc = (struct log_c *) log->context;
 
 	lc->sync_search = 0;
+	lc->recovery_halted = 0;
 	resume_server_requests();
 	atomic_set(&lc->suspended, 0);
 
@@ -861,7 +863,7 @@
 {
 	int rtn;
 	struct log_c *lc = (struct log_c *) log->context;
- 	 
+	 
 	if (atomic_read(&lc->in_sync) == 1) {
 		return 0;
 	}
@@ -1170,6 +1172,10 @@
 	region_t rtn;
 	struct log_c *lc = (struct log_c *) log->context;
 
+	if (atomic_read(&lc->suspended)) {
+		return (atomic_read(&lc->in_sync)) ? lc->region_count : 0;
+	}
+
 	/* Try to get sync count up to five times */
 	for (i = 0; i < 5 && consult_server(lc, 0, LRT_GET_SYNC_COUNT, &rtn); i++);
 	if(i >= 5){
@@ -1226,6 +1232,7 @@
 		DMDEBUG(" ?sync_search : %d", lc->sync_search);
 		DMDEBUG("  in_sync     : %s", (atomic_read(&lc->in_sync)) ? "YES" : "NO");
 		DMDEBUG("  suspended   : %s", (atomic_read(&lc->suspended)) ? "YES" : "NO");
+		DMDEBUG("  recovery_halted : %s", (lc->recovery_halted) ? "YES" : "NO");
 		DMDEBUG("  server_id   : %u", lc->server_id);
 		DMDEBUG("  server_valid: %s",
 			((lc->server_id != 0xDEAD) && lc->server_valid) ? "YES" : "NO");
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/09/26 03:15:40	1.1.2.15
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/09/27 20:31:18	1.1.2.16
@@ -102,6 +102,7 @@
 
 	int sync_pass;          /* number of passes attempting to resync */
 	int sync_search;
+	int recovery_halted;    /* only useful for is_remote_recovering */
 
 	/* Resync flag */
 	enum sync {
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/09/26 03:15:40	1.1.2.39
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/09/27 20:31:18	1.1.2.40
@@ -451,6 +451,14 @@
 	if ((lc->sync_search > lc->region_count) && !lc->sync_pass)
 		return 0;
 
+	if (lc->recovery_halted &&
+	    (lc->recovering_region != lr->u.lr_region)) {
+		DMDEBUG("Recovery halted, allowing client: %Lu/%s",
+			lr->u.lr_region,
+			lc->uuid + (strlen(lc->uuid) - 8));
+		return 0;
+	}
+
 	/*
 	 * If the region hasn't been recovered yet,
 	 * we need to block the write
@@ -598,6 +606,12 @@
 
 	lr->u.lr_int_rtn = 0; /* Default to no work */
 
+	if (lc->recovery_halted) {
+		DMDEBUG("Recovery halted due to error on %s",
+			lc->uuid + (strlen(lc->uuid) - 8));
+		return 0;
+	}
+
 	if (lc->recovering_region != (uint64_t)-1) {
 		DMDEBUG("Someone is already recovering region %Lu/%s",
 			lc->recovering_region, lc->uuid + (strlen(lc->uuid) - 8));
@@ -704,11 +718,18 @@
 	/*
 	 * Recovery failed or mirror is being marked out-of-sync
 	 *
+	 * We need to stop dishing out recovery work.  If we don't,
+	 * writes happening to NOSYNC regions can't proceed and the
+	 * mirror won't be able to suspend for reconfiguration - due
+	 * to the return of is_remote_recovering().
+	 *
 	 * We can recieve multiple calls to mark out-of-sync
 	 * if there were several writes to the same region that
 	 * failed.  In this case, there will not be a record for
 	 * the region.
 	 */
+	lc->recovery_halted = 1;
+
 	ru = find_ru(lc, who, lr->u.lr_region);
 
 	if ((lr->u.lr_region == lc->recovering_region) && !ru) {
@@ -873,8 +894,14 @@
 	 * New node joins and needs to know I am the server
 	 * We shortcut the election here and respond directly
 	 * to the inquirer
-	 */
+	 *
 	if((lc->server_id == my_id) && !atomic_read(&lc->suspended)){
+	*/
+	if (lc->server_id == my_id) {
+		if (atomic_read(&lc->suspended)) {
+			DMDEBUG("I'm suspended, but still responding as server: %s",
+				lc->uuid + (strlen(lc->uuid) - 8));
+		}
 		lr->u.lr_coordinator = my_id;
 		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
 			return -1;



             reply	other threads:[~2007-09-27 20:31 UTC|newest]

Thread overview: 40+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-09-27 20:31 jbrassow [this message]
  -- strict thread matches above, loose matches on Subject: below --
2007-10-03 19:02 [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c jbrassow
2007-09-26  3:15 jbrassow
2007-09-21 20:07 jbrassow
2007-09-13 15:24 jbrassow
2007-07-11 16:18 jbrassow
2007-04-26 16:55 jbrassow
2007-04-26 16:54 jbrassow
2007-04-24 20:10 jbrassow
2007-04-24 20:08 jbrassow
2007-04-10  7:13 jbrassow
2007-04-10  7:12 jbrassow
2007-04-05 21:33 jbrassow
2007-04-05 21:32 jbrassow
2007-04-03 18:23 jbrassow
2007-04-03 18:21 jbrassow
2007-03-22 22:34 jbrassow
2007-03-22 22:22 jbrassow
2007-03-14  4:28 jbrassow
2007-02-26 17:38 jbrassow
2007-02-20 19:35 jbrassow
2007-02-19 16:29 jbrassow
2007-02-14 17:44 jbrassow
2007-02-02 17:22 jbrassow
2007-01-08 19:28 jbrassow
2006-12-07 18:58 jbrassow
2006-09-05 17:50 jbrassow
2006-09-05 17:48 jbrassow
2006-07-27 23:11 jbrassow
2006-07-27 23:11 jbrassow
2006-07-22 22:19 jbrassow
2006-07-22 22:19 jbrassow
2006-07-22 22:12 jbrassow
2006-06-29 19:49 jbrassow
2006-06-29 19:48 jbrassow
2006-06-29 19:46 jbrassow
2006-06-27 20:19 jbrassow
2006-06-15 19:48 jbrassow
2006-06-15 19:34 jbrassow
2006-06-13 16:26 jbrassow

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20070927203124.27602.qmail@sourceware.org \
    --to=jbrassow@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).