From mboxrd@z Thu Jan  1 00:00:00 1970
From: jbrassow@sourceware.org <jbrassow@sourceware.org>
Date: 11 Jul 2007 16:18:04 -0000
Subject: [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
Message-ID: <20070711161804.22419.qmail@sourceware.org>
List-Id: <cluster-devel.redhat.com>
To: cluster-devel.redhat.com
MIME-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-07-11 16:18:03

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 
	                    dm-cmirror-xfr.h 

Log message:
	Bug 238629: dm-cmirror: Remote recovery conflict...
	
	The kernel changes are now in place (marking/clearing the log
	during writes to nosync regions) to allow nominal I/O to
	region that have yet to be recovered.
	
	Also moved around some debugging messages and removed
	'is_remote_recovering()'.  (is_remote_recovering is obviated by
	the new mechanism for handling recovery/write ordering.)

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.48&r2=1.1.2.49
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.35&r2=1.1.2.36
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.5&r2=1.1.2.6

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/05/09 21:44:34	1.1.2.48
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/07/11 16:18:03	1.1.2.49
@@ -286,8 +286,8 @@
 		lc->server_id = lr.u.lr_coordinator;
 	} else {
 		/* ATTENTION -- what do we do with this ? */
-		DMWARN("Failed to receive election results from server: (%s,%d)",
-		       lc->uuid + (strlen(lc->uuid) - 8), len);
+		DMWARN("Failed to receive election results from server: (%s/%d,%d)",
+		       lc->uuid + (strlen(lc->uuid) - 8), lc->uuid_ref, len);
 		error = len;
 	}
 
@@ -601,6 +601,7 @@
 	INIT_LIST_HEAD(&lc->mark_logged);
 	spin_lock_init(&lc->state_lock);
 
+	atomic_set(&lc->suspended, 1);
 	lc->server_valid = 0;
 	lc->server_id = 0xDEAD;
 
@@ -853,15 +854,7 @@
 
 static int cluster_is_remote_recovering(struct dirty_log *log, region_t region)
 {
-	int rtn;
-	struct log_c *lc = (struct log_c *) log->context;
-
-	if(atomic_read(&lc->in_sync) == 1){
-		return 0;
-	}
-
-	rtn = consult_server(lc, region, LRT_IS_REMOTE_RECOVERING, NULL);
-	return rtn;
+	return 0;
 }
 
 static int cluster_in_sync(struct dirty_log *log, region_t region, int block)
@@ -977,57 +970,43 @@
 
 	spin_lock(&lc->state_lock);
 
-
 	/*
-	 * It is possible for the following in the mirror code:
-	 *  0) Mark is already logged for a region
-	 *  1) rh_dec, sets region state to RH_CLEAN (asynchronous)
-	 *  2) rh_update_states (DOESN'T FLUSH!!!, bug #235040)
-	 *  3) do_writes, trys to mark region
-	 *
-	 * The following shouldn't have to be handled b/c of the flush
-	 *  0) Region finishes recovery
-	 *  1) rh_update_states clears region (DOES FLUSH)
-	 *  2) do_writes, trys to mark region
-	 *
-	 * This can lead to this next case being valid.
+	 * An item on the clear_waiting list should have been flushed
+	 * before getting this mark_region call.
 	 */
-	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) {
+	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list)
 		if (region == rs->rs_region) {
-			if (!rs->rs_mark_logged) {
-				DMERR("Moving region(%Lu/%s) from clear_waiting -> mark_waiting",
-				      region, lc->uuid + (strlen(lc->uuid) - 8));
-			}
-			list_del_init(&rs->rs_list);
-			list_add(&rs->rs_list,
-				 (rs->rs_mark_logged) ?
-				 &lc->mark_logged : &lc->mark_waiting);
-			goto out;
+			DMERR("Region being marked found on clear_waiting list (%Lu/%s)",
+			      region, lc->uuid + (strlen(lc->uuid) - 8));
+			BUG();
 		}
-	}
+
+	/*
+	 * We should never get two marks before a flush, unless the
+	 * region is not in-sync.  One valid scenario would be:
+	 *  0) region not in-sync
+	 *  1) rh_inc (mark region)
+	 *  2) rh_update_states
+	 *  3) rh_dec (dec pending and put on clean_region list)
+	 *  4) do_writes -> rh_inc (second mark)
+	 */
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list)
+		if (region == rs->rs_region)
+			goto out;
 
 	/*
 	 * It is possible for the following in the mirror code:
 	 *  0) Mark is already logged for a region
-	 *  1) rh_update_states
+	 *  1) rh_update_states (were the clear_region happens)
 	 *  2) rh_dec, sets region state to RH_CLEAN (asynchronous)
 	 *  3) do_writes, trys to mark region
 	 *
 	 * This can lead to this next case being valid.
 	 */
-	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list){
-		if (region == rs->rs_region) {
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list)
+		if (region == rs->rs_region)
 			goto out;
-		}
-	}
 
-	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list){
-		if (region == rs->rs_region) {
-			DMERR("Mark already waiting (%Lu/%s)",
-			      region, lc->uuid + (strlen(lc->uuid) - 8));
-			BUG();
-		}
-	}
 	spin_unlock(&lc->state_lock);
 
 	rs_new = mempool_alloc(region_state_pool, GFP_NOFS);
@@ -1074,14 +1053,13 @@
 	 * 6) we recover the region
 	 * 7) clearing the region after recovery causes us to get here
 	 *
-	 * Once 235040 is cleared, any entries found in this list should
-	 * cause a bug.
+	 * Bug 235040 cleared.  This should no longer happen.
 	 */
 	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list){
 		if(region == rs->rs_region){
-			DMERR("%d) Double clear on region ("
-			      SECTOR_FORMAT ")", __LINE__, region);
-			goto out;
+			DMERR("Double clear on region (%Lu/%s)",
+			      region, lc->uuid + (strlen(lc->uuid) - 8));
+			BUG();
 		}
 	}
 
@@ -1140,7 +1118,7 @@
 		list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) {
 			if (*region == rs->rs_region) {
 				DMERR("WARNING: Bug 235039/235040 detected!");
-				DMERR("Work-around in place.");
+				BUG();
 			}
 		}
 	}
@@ -1204,16 +1182,6 @@
 
 	switch(status){
 	case STATUSTYPE_INFO:
-		DMDEBUG("LOG INFO:");
-		DMDEBUG("  uuid: %s", lc->uuid);
-		DMDEBUG("  uuid_ref    : %d", lc->uuid_ref);
-		DMDEBUG(" ?region_count: %Lu", lc->region_count);
-		DMDEBUG(" ?sync_count  : %Lu", lc->sync_count);
-		DMDEBUG(" ?sync_search : %d", lc->sync_search);
-		DMDEBUG("  in_sync     : %s", (atomic_read(&lc->in_sync)) ? "YES" : "NO");
-		DMDEBUG("  server_id   : %u", lc->server_id);
-		DMDEBUG("  server_valid: %s",
-			((lc->server_id != 0xDEAD) && lc->server_valid) ? "YES" : "NO");
 		if(lc->sync != DEFAULTSYNC)
 			arg_count++;
 
@@ -1228,6 +1196,42 @@
                 break;
 
         case STATUSTYPE_TABLE:
+		DMDEBUG("LOG INFO:");
+		DMDEBUG("  uuid: %s", lc->uuid);
+		DMDEBUG("  uuid_ref    : %d", lc->uuid_ref);
+		DMDEBUG(" ?region_count: %Lu", lc->region_count);
+		DMDEBUG(" ?sync_count  : %Lu", lc->sync_count);
+		DMDEBUG(" ?sync_search : %d", lc->sync_search);
+		DMDEBUG("  in_sync     : %s", (atomic_read(&lc->in_sync)) ? "YES" : "NO");
+		DMDEBUG("  suspended   : %s", (atomic_read(&lc->suspended)) ? "YES" : "NO");
+		DMDEBUG("  server_id   : %u", lc->server_id);
+		DMDEBUG("  server_valid: %s",
+			((lc->server_id != 0xDEAD) && lc->server_valid) ? "YES" : "NO");
+		{
+			struct log_c *tmp_lc;
+
+			down(&log_list_lock);
+			list_for_each_entry(tmp_lc, &log_list_head, log_list){
+				if(!strncmp(tmp_lc->uuid, lc->uuid, MAX_NAME_LEN) &&
+				   (tmp_lc->uuid_ref != lc->uuid_ref)){
+					DMDEBUG("LOG INFO [COPY FOUND]:");
+					DMDEBUG("  uuid: %s", tmp_lc->uuid);
+					DMDEBUG("  uuid_ref    : %d", tmp_lc->uuid_ref);
+					DMDEBUG(" ?region_count: %Lu", tmp_lc->region_count);
+					DMDEBUG(" ?sync_count  : %Lu", tmp_lc->sync_count);
+					DMDEBUG(" ?sync_search : %d", tmp_lc->sync_search);
+					DMDEBUG("  in_sync     : %s",
+						(atomic_read(&tmp_lc->in_sync)) ? "YES" : "NO");
+					DMDEBUG("  suspended   : %s",
+						(atomic_read(&tmp_lc->suspended)) ? "YES" : "NO");
+					DMDEBUG("  server_id   : %u", tmp_lc->server_id);
+					DMDEBUG("  server_valid: %s",
+						((tmp_lc->server_id != 0xDEAD) &&
+						 tmp_lc->server_valid) ? "YES" : "NO");
+				}
+			}
+			up(&log_list_lock);
+		}
 		if(lc->sync != DEFAULTSYNC)
 			arg_count++;
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/26 16:54:49	1.1.2.35
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/07/11 16:18:03	1.1.2.36
@@ -221,54 +221,6 @@
 	return count;
 }
 
-struct region_user *find_ru_by_region(struct log_c *lc, region_t region);
-static int _core_get_resync_work(struct log_c *lc, region_t *region)
-{
-	int sync_search, conflict = 0;
-
-	if (lc->recovering_region != (uint64_t)-1) {
-		DMDEBUG("Someone is already recovering region %Lu/%s",
-			lc->recovering_region, lc->uuid + (strlen(lc->uuid) - 8));
-		return 0;
-	}
-
-	if (lc->sync_search >= lc->region_count) {
-		/*
-		 * FIXME: pvmove is not supported yet, but when it is,
-		 * an audit of sync_count changes will need to be made
-		 */
-		if ((lc->sync_count < lc->region_count) && !lc->sync_pass) {
-			lc->sync_search = 0;
-			lc->sync_pass++;
-		} else {
-			return 0;
-		}
-	}
-	for (sync_search = lc->sync_search;
-	     sync_search < lc->region_count;
-	     sync_search = (*region + 1)) {
-		*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
-						  lc->region_count,
-						  sync_search);
-		if (find_ru_by_region(lc, *region)) {
-			conflict = 1;
-			DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s",
-				*region, lc->uuid + (strlen(lc->uuid) - 8));
-		} else {
-			break;
-		}
-	}
-	if (!conflict)
-		lc->sync_search = *region + 1;
-
-	if (*region >= lc->region_count)
-		return 0;
-
-	lc->recovering_region = *region;
-	return 1;
-}
-
-
 static int print_zero_bits(unsigned char *str, int offset, int bit_count){
 	int i,j;
 	int count=0;
@@ -492,39 +444,6 @@
 	return 0;
 }
 
-static int server_is_remote_recovering(struct log_c *lc, struct log_request *lr)
-{
-	region_t region;
-	struct region_user *ru;
-
-	/*
-	 * This gets a bit complicated.  I wish we didn't have to use this
-	 * function, but because the mirror code doesn't mark regions which
-	 * it writes to that are out-of-sync, we need this function.
-	 *
-	 * Problem is, we don't know how long the user is going to take to
-	 * write to the region after they have called this function.  So,
-	 * we are forced at this point to deny any writes to regions we
-	 * are recovering or might recover in the future.
-	 *
-	 * We can count on the client side to not send us one of these
-	 * requests if the mirror is known to be in-sync.
-	 *
-	 * And yes, it sucks to take this much time off the I/O.
-	 */
-	region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
-					 lc->region_count, 0);
-
-	if (lr->u.lr_region >= region) {
-		DMDEBUG("Remote recovery conflict: (%Lu >= %Lu)/%s",
-			lr->u.lr_region, region, lc->uuid + (strlen(lc->uuid) - 8));
-		lr->u.lr_int_rtn = 1;
-	} else
-		lr->u.lr_int_rtn = 0;
-
-	return 0;
-}
-
 static int server_in_sync(struct log_c *lc, struct log_request *lr)
 {
 	if (lr->u.lr_region > lc->region_count) {
@@ -555,7 +474,13 @@
 	new->ru_region = lr->u.lr_region;
 	new->ru_rw = RU_WRITE;
 
-	if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) {
+	if (find_ru(lc, who, lr->u.lr_region)) {
+		DMWARN("Attempt to mark a already marked region (%u,"
+		       SECTOR_FORMAT
+		       "/%s)",
+		       who, lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
+		mempool_free(new, region_user_pool);
+	} else if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) {
 		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
 		list_add(&new->ru_list, &lc->region_users);
 	} else if (ru->ru_rw == RU_RECOVER) {
@@ -572,6 +497,22 @@
 		DMDEBUG("Mark requester   : %u", who);
 		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
 		list_add_tail(&new->ru_list, &lc->region_users);
+	} else {
+		list_add(&new->ru_list, &ru->ru_list);
+	}
+
+	/*
+	if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) {
+		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
+		list_add(&new->ru_list, &lc->region_users);
+	} else if (ru->ru_rw == RU_RECOVER) {
+		DMDEBUG("Attempt to mark a region " SECTOR_FORMAT
+		      "/%s which is being recovered.",
+		       lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
+		DMDEBUG("Current recoverer: %u", ru->ru_nodeid);
+		DMDEBUG("Mark requester   : %u", who);
+		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
+		list_add_tail(&new->ru_list, &lc->region_users);
 	} else if (!find_ru(lc, who, lr->u.lr_region)) {
 		list_add(&new->ru_list, &ru->ru_list);
 	} else {
@@ -581,6 +522,7 @@
 		       who, lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
 		mempool_free(new, region_user_pool);
 	}
+	*/
 
 	return 0;
 }
@@ -611,35 +553,28 @@
 {
 	int r = 0;
 	int count = 0;
-	struct region_user *ru, *ru2;
+	struct region_user *ru, *marker = NULL, *recoverer = NULL;
 
 	if (lc->recovering_region != (uint64_t)-1) {
 		list_for_each_entry(ru, &lc->region_users, ru_list)
-			if (ru->ru_region == lc->recovering_region)
-				count++;
-
-		if (count > 1) {
-			list_for_each_entry(ru, &lc->region_users, ru_list)
+			if (ru->ru_region == lc->recovering_region) {
 				if (ru->ru_rw == RU_RECOVER)
-					break;
+					recoverer = ru;
+				else
+					marker = ru;
+				count++;
+			}
 
-			DMDEBUG("Flush includes region which is being recovered (%u/%Lu).  Delaying...",
-				ru->ru_nodeid, ru->ru_region);
-			DMDEBUG("Recovering region: %Lu", lc->recovering_region);
+		if (marker && recoverer) {
+			DMDEBUG("Flush/recovery collision (%Lu/%s): count = %d, marker = %u, recoverer = %u",
+				marker->ru_region, lc->uuid + (strlen(lc->uuid) - 8),
+				count, marker->ru_nodeid, recoverer->ru_nodeid);
+			/*
 			DMDEBUG("  sync_bit: %s, clean_bit: %s",
 				log_test_bit(lc->sync_bits, lc->recovering_region) ? "set" : "unset",
 				log_test_bit(lc->clean_bits, lc->recovering_region) ? "set" : "unset");
+			*/
 
-			list_for_each_entry(ru2, &lc->region_users, ru_list)
-				if (ru->ru_region == ru2->ru_region)
-					DMDEBUG("  %s", (ru2->ru_rw == RU_RECOVER) ? "recover" :
-						(ru2->ru_rw == RU_WRITE) ? "writer" : "unknown");
-
-			/* FIXME: work-around for bug 235040 */
-			DMDEBUG("Revoking resync work");
-			lc->recovering_region = (uint64_t)-1;
-			list_del(&ru->ru_list);
-			mempool_free(ru, region_user_pool);
 			return -EBUSY;
 		}
 	}
@@ -1121,9 +1056,6 @@
 		case LRT_IS_CLEAN:
 			error = server_is_clean(lc, lr);
 			break;
-		case LRT_IS_REMOTE_RECOVERING:
-			error = server_is_remote_recovering(lc, lr);
-			break;
 		case LRT_IN_SYNC:
 			error = server_in_sync(lc, lr);
 			break;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/04/24 20:08:57	1.1.2.5
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/07/11 16:18:03	1.1.2.6
@@ -10,19 +10,18 @@
 #define MAX_NAME_LEN 128
 
 #define LRT_IS_CLEAN			1
-#define LRT_IS_REMOTE_RECOVERING	2
-#define LRT_IN_SYNC             	3
-#define LRT_MARK_REGION         	4
-#define LRT_CLEAR_REGION        	5
-#define LRT_FLUSH                       6
-#define LRT_GET_RESYNC_WORK     	7
-#define LRT_COMPLETE_RESYNC_WORK        8
-#define LRT_GET_SYNC_COUNT      	9
-
-#define LRT_ELECTION			10
-#define LRT_SELECTION			11
-#define LRT_MASTER_ASSIGN		12
-#define LRT_MASTER_LEAVING		13
+#define LRT_IN_SYNC             	2
+#define LRT_MARK_REGION         	3
+#define LRT_CLEAR_REGION        	4
+#define LRT_FLUSH                       5
+#define LRT_GET_RESYNC_WORK     	6
+#define LRT_COMPLETE_RESYNC_WORK        7
+#define LRT_GET_SYNC_COUNT      	8
+
+#define LRT_ELECTION			9
+#define LRT_SELECTION			10
+#define LRT_MASTER_ASSIGN		11
+#define LRT_MASTER_LEAVING		12
 
 #define CLUSTER_LOG_PORT 51005