From mboxrd@z Thu Jan  1 00:00:00 1970
From: jbrassow@sourceware.org <jbrassow@sourceware.org>
Date: 26 Apr 2007 16:55:53 -0000
Subject: [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
Message-ID: <20070426165553.25025.qmail@sourceware.org>
List-Id: <cluster-devel.redhat.com>
To: cluster-devel.redhat.com
MIME-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL45
Changes by:	jbrassow at sourceware.org	2007-04-26 17:55:51

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	Bug 238031: cluster mirrors not handling all recovery/write conflicts
	
	Problem is that the kernel (main mirror code) does not do any marks/clears when
	writing to a region before its recovery.  So, it is not possible for the server
	to detect a conflict.  Basically, we must turn back on the
	'is_remote_recovering' function and disallow any writes to regions that are OR
	WILL BE recovering.
	
	It's really going to cause some pain during writes while mirrors are re-syncing.
	The better fix for the future is to have the writes always mark/clear the
	regions - then we can again remove the 'is_remote_recovering' function.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.41.2.5&r2=1.1.2.41.2.6
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.26.2.7&r2=1.1.2.26.2.8

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/24 20:10:20	1.1.2.41.2.5
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/26 16:55:51	1.1.2.41.2.6
@@ -861,11 +861,10 @@
 	int rtn;
 	struct log_c *lc = (struct log_c *) log->context;
 
-/* take out optimization
 	if(atomic_read(&lc->in_sync) == 1){
 		return 0;
 	}
-*/
+
 	rtn = consult_server(lc, region, LRT_IS_REMOTE_RECOVERING, NULL);
 	return rtn;
 }
@@ -876,11 +875,11 @@
 	struct log_c *lc = (struct log_c *) log->context;
   
 	/* check known_regions, return if found */
-/* take out optimization
+
 	if(atomic_read(&lc->in_sync) == 1){
 		return 1;
 	}
-*/
+
 	if(!block){
 		return -EWOULDBLOCK;
 	}
@@ -1414,7 +1413,7 @@
 	.resume = cluster_resume,
 	.get_region_size = cluster_get_region_size,
 	.is_clean = cluster_is_clean,
-/*	.is_remote_recovering = cluster_is_remote_recovering,*/
+	.is_remote_recovering = cluster_is_remote_recovering,
 	.in_sync = cluster_in_sync,
 	.flush = cluster_flush,
 	.mark_region = cluster_mark_region,
@@ -1436,7 +1435,7 @@
 	.resume = cluster_resume,
 	.get_region_size = cluster_get_region_size,
 	.is_clean = cluster_is_clean,
-/*	.is_remote_recovering = cluster_is_remote_recovering,*/
+	.is_remote_recovering = cluster_is_remote_recovering,
 	.in_sync = cluster_in_sync,
 	.flush = cluster_flush,
 	.mark_region = cluster_mark_region,
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/24 20:10:20	1.1.2.26.2.7
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/26 16:55:51	1.1.2.26.2.8
@@ -494,12 +494,32 @@
 
 static int server_is_remote_recovering(struct log_c *lc, struct log_request *lr)
 {
+	region_t region;
 	struct region_user *ru;
 
-	if ((ru = find_ru_by_region(lc, lr->u.lr_region)) && 
-	    (ru->ru_rw == RU_RECOVER))
+	/*
+	 * This gets a bit complicated.  I wish we didn't have to use this
+	 * function, but because the mirror code doesn't mark regions which
+	 * it writes to that are out-of-sync, we need this function.
+	 *
+	 * Problem is, we don't know how long the user is going to take to
+	 * write to the region after they have called this function.  So,
+	 * we are forced at this point to deny any writes to regions we
+	 * are recovering or might recover in the future.
+	 *
+	 * We can count on the client side to not send us one of these
+	 * requests if the mirror is known to be in-sync.
+	 *
+	 * And yes, it sucks to take this much time off the I/O.
+	 */
+	region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
+					 lc->region_count, 0);
+
+	if (lr->u.lr_region >= region) {
+		DMDEBUG("Remote recovery conflict: (%Lu >= %Lu)/%s",
+			lr->u.lr_region, region, lc->uuid + (strlen(lc->uuid) - 8));
 		lr->u.lr_int_rtn = 1;
-	else
+	} else
 		lr->u.lr_int_rtn = 0;
 
 	return 0;
@@ -639,24 +659,65 @@
 static int server_get_resync_work(struct log_c *lc, struct log_request *lr, uint32_t who)
 {
 	struct region_user *new;
+	int sync_search, conflict = 0;
+	region_t *region = &(lr->u.lr_region_rtn);
 
-	new = mempool_alloc(region_user_pool, GFP_NOFS);
-	if(!new){
-		lr->u.lr_int_rtn = 0;
-		return -ENOMEM;
+	lr->u.lr_int_rtn = 0; /* Default to no work */
+
+	if (lc->recovering_region != (uint64_t)-1) {
+		DMDEBUG("Someone is already recovering region %Lu/%s",
+			lc->recovering_region, lc->uuid + (strlen(lc->uuid) - 8));
+		return 0;
 	}
-	
-	if ((lr->u.lr_int_rtn = _core_get_resync_work(lc, &(lr->u.lr_region_rtn)))){
-		new->ru_nodeid = who;
-		new->ru_region = lr->u.lr_region_rtn;
-		new->ru_rw = RU_RECOVER;
-		list_add(&new->ru_list, &lc->region_users);
-		DMDEBUG("Assigning recovery work to %u/%s: %Lu",
-			who, lc->uuid + (strlen(lc->uuid) - 8), new->ru_region);
-	} else {
-		mempool_free(new, region_user_pool);
+
+	if (lc->sync_search >= lc->region_count) {
+		/*
+		 * FIXME: pvmove is not supported yet, but when it is,
+		 * an audit of sync_count changes will need to be made
+		 */
+		if ((lc->sync_count < lc->region_count) && !lc->sync_pass) {
+			lc->sync_search = 0;
+			lc->sync_pass++;
+		} else {
+			return 0;
+		}
+	}
+
+	for (sync_search = lc->sync_search;
+	     sync_search < lc->region_count;
+	     sync_search = (*region + 1)) {
+		*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
+						  lc->region_count,
+						  sync_search);
+		if (find_ru_by_region(lc, *region)) {
+			conflict = 1;
+			DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s",
+				*region, lc->uuid + (strlen(lc->uuid) - 8));
+		} else {
+			break;
+		}
 	}
 
+	if (*region >= lc->region_count)
+		return 0;
+
+	new = mempool_alloc(region_user_pool, GFP_NOFS);
+	if (!new)
+		return -ENOMEM;
+
+	if (!conflict)
+		lc->sync_search = *region + 1;
+
+	lc->recovering_region = *region;
+
+	lr->u.lr_int_rtn = 1; /* Assigning work */
+	new->ru_nodeid = who;
+	new->ru_region = *region;
+	new->ru_rw = RU_RECOVER;
+	list_add(&new->ru_list, &lc->region_users);
+	DMDEBUG("Assigning recovery work to %u/%s: %Lu",
+		who, lc->uuid + (strlen(lc->uuid) - 8), new->ru_region);
+
 	return 0;
 }