All of lore.kernel.org
 help / color / mirror / Atom feed
From: jbrassow@sourceware.org <jbrassow@sourceware.org>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
Date: 22 Mar 2007 22:22:01 -0000	[thread overview]
Message-ID: <20070322222201.9752.qmail@sourceware.org> (raw)

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-03-22 22:21:59

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 
	                    dm-cmirror-xfr.h 

Log message:
	Bug 233034: cmirror server failure/migration during GFS I/O causes metad...
	(Likely fixes other bugs as well.)
	
	When a cmirror client timed out waiting for a response from the server,
	it would send the request again.  Sometimes, the server simply took to
	long to get back to the client... it would then develop an off-by-one
	error - responding to the first _and_ the second request.  The client
	could then be asking to mark a region, and recieve a response for
	a previous request.  This has the potential to cause many problems.
	
	Sequence numbers have been added to fix the problem.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.41&r2=1.1.2.42
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.26&r2=1.1.2.27
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.2&r2=1.1.2.3

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/03/14 04:28:32	1.1.2.41
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/03/22 22:21:59	1.1.2.42
@@ -53,12 +53,6 @@
 /* These vars are just for stats, and will be removed */
 static uint32_t request_count=0;
 static uint32_t request_retry_count=0;
-static int clear_req=0;
-static int mark_req=0;
-static int insync_req=0;
-static int clear_req2ser=0;
-static int mark_req2ser=0;
-static int insync_req2ser=0;
 
 static void *region_state_alloc(int gfp_mask, void *pool_data){
 	return kmalloc(sizeof(struct region_state), gfp_mask);
@@ -316,6 +310,7 @@
 
 static int _consult_server(struct log_c *lc, region_t region,
 			  int type, region_t *result, int *retry){
+	static int seq = 0;
 	int len;
 	int error=0;
 	struct sockaddr_in saddr_in;
@@ -336,6 +331,7 @@
 	memset(lr, 0, sizeof(struct log_request));
 	
 	lr->lr_type = type;
+	lr->lr_seq = seq;
 	if(type == LRT_MASTER_LEAVING){
 		lr->u.lr_starter = my_id;
 	} else {
@@ -369,18 +365,6 @@
 	iov.iov_len = sizeof(struct log_request);
 	iov.iov_base = lr;
 
-	if(lr->lr_type == LRT_MARK_REGION){
-		mark_req2ser++;
-	}
-
-	if(lr->lr_type == LRT_CLEAR_REGION){
-		clear_req2ser++;
-	}
-	
-	if(lr->lr_type == LRT_IN_SYNC){
-		insync_req2ser++;
-	}
-	
 	fs = get_fs();
 	set_fs(get_ds());
   
@@ -394,6 +378,7 @@
 		goto fail;
 	}
 
+rerecv:
 	iov.iov_len = sizeof(struct log_request);
 	iov.iov_base = lr;
 
@@ -414,9 +399,44 @@
 		DMWARN("Error while listening for server response: %d", len);
 		error = len;
 		*retry = 1;
+		seq++;
 		goto fail;
 	}
     
+	if (seq != lr->lr_seq) {
+		DMERR("Message sequence number mismatch: %d/%d",
+		      seq, lr->lr_seq);
+		if (seq > lr->lr_seq) {
+			DMERR(" Skipping.  Listening again for response to %s",
+			      RQ_STRING(type));
+			memset(lr, 0, sizeof(struct log_request));
+			goto rerecv;
+		}
+		DMERR(" Must try to resend request, %s", RQ_STRING(type));
+		error = -EBADE;
+		*retry = 1;
+		seq++;
+		goto fail;
+	}
+	seq++;
+
+	if (type != lr->lr_type) {
+		DMERR("Got incorrect message type back: %s/%s",
+		      RQ_STRING(type), RQ_STRING(lr->lr_type));
+		error = -EBADE;
+		*retry = 1;
+		goto fail;
+	}
+
+	if (memcmp(lc->uuid, lr->lr_uuid, MAX_NAME_LEN)) {
+		DMERR("Got reply from server for wrong log:");
+		DMERR(" Expected UUID: %s", lc->uuid);
+		DMERR(" Recieved UUID: %s", lr->lr_uuid);
+		error = -EBADE;
+		*retry = 1;
+		goto fail;
+	}
+
 	if(lr->u.lr_int_rtn == -EAGAIN){
 		DMWARN("Server (%u), request type %d, -EAGAIN."
 		       "  Mirror suspended?",
@@ -453,17 +473,7 @@
 			DMDEBUG(" - log uuid:: %s (%s)",
 			       lc->uuid + (strlen(lc->uuid) - 8),
 			       atomic_read(&lc->suspended) ? "suspended" : "active");
-			DMDEBUG(" - request :: %s",
-			       (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-			       (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-			       (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-			       (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-			       (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-			       (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-			       (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-			       (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-			       (type == LRT_ELECTION)? "LRT_ELECTION":
-			       (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+			DMDEBUG(" - request :: %s", RQ_STRING(type));
 			DMDEBUG(" - error   :: %d", error);
 			DMINFO("Too many retries, attempting to re-establish server connection.");
 			lc->server_id = 0xDEAD;
@@ -519,7 +529,7 @@
 			}
 			clear_region_count -= i;
 			DMINFO(" - %d clear region requests wiped", i);
-
+			i=0;
 			DMINFO(" - Resending all mark region requests");
 			list_for_each_entry(rs, &marked_region_list, rs_list){
 				/* Resend only those associated with referenced log */
@@ -527,7 +537,7 @@
 					continue;
 				do {
 					retry = 0;
-					DMINFO("   - " SECTOR_FORMAT, rs->rs_region);
+					i++;
 					rtn = _consult_server(rs->rs_lc, rs->rs_region,
 							      LRT_MARK_REGION, NULL, &retry);
 					if (lc->server_id == 0xDEAD) {
@@ -536,6 +546,7 @@
 					}
 				} while(retry);
 			}
+			DMINFO(" - %d mark region requests resent", i);
 			DMINFO("Clean-up complete");
 			if(type == LRT_MARK_REGION){
 				/* we just handled all marks */
@@ -544,17 +555,7 @@
 				goto out;
 			} else {
 				DMINFO("Continuing request type, %d (%s)", type,
-				      (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-				      (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-				      (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-				      (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-				      (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-				      (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-				      (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-				      (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-				      (type == LRT_ELECTION)? "LRT_ELECTION":
-				      (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN"
-					);
+				       RQ_STRING(type));
 			}
 			new_server = 0;
 		}
@@ -886,7 +887,6 @@
 	struct log_c *lc = (struct log_c *) log->context;
   
 	/* check known_regions, return if found */
-	insync_req++;
 /* take out optimization
 	if(atomic_read(&lc->in_sync) == 1){
 		return 1;
@@ -915,8 +915,6 @@
 	struct region_state *rs, *tmp_rs, *rs_new;
 	struct log_c *lc = (struct log_c *) log->context;
 
-	mark_req++;
-
 	rs_new = mempool_alloc(region_state_pool, GFP_KERNEL);
 
 	memset(rs_new, 0, sizeof(struct region_state));
@@ -924,8 +922,10 @@
 	spin_lock(&region_state_lock);
 	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list){
 		if(lc == rs->rs_lc && region == rs->rs_region){
+			/*
 			DMDEBUG("Mark pre-empting clear (%Lu/%s)",
 				region, lc->uuid + (strlen(lc->uuid) - 8));
+			*/
 			list_del_init(&rs->rs_list);
 			list_add(&rs->rs_list, &marked_region_list);
 			clear_region_count--;
@@ -1007,7 +1007,6 @@
 {
 	struct log_c *lc = (struct log_c *) log->context;
 	struct region_state *rs, *tmp_rs, *rs_new;
-	clear_req++;
 
 	rs_new = mempool_alloc(region_state_pool, GFP_ATOMIC);
 
@@ -1140,21 +1139,6 @@
 		DMINFO("  Regions marked   : %d", j);
 		DMINFO("  Regions clearing : %d", i);
 
-		DMINFO("  Mark requests    : %d", mark_req);
-		if(mark_req)
-			DMINFO("  Mark req to serv : %d (%d%%)", mark_req2ser,
-			       (mark_req2ser*100)/mark_req);
-
-		DMINFO("  Clear requests   : %d", clear_req);
-		if(clear_req)
-			DMINFO("  Clear req to serv: %d (%d%%)", clear_req2ser,
-			       (clear_req2ser*100)/clear_req);
-
-		DMINFO("  Sync  requests   : %d", insync_req);
-		if(insync_req)
-			DMINFO("  Sync req to serv : %d (%d%%)", insync_req2ser,
-			       (insync_req2ser*100)/insync_req);
-
 		if(lc->server_id == my_id){
 			print_server_status(lc);
 		}
@@ -1216,9 +1200,11 @@
 		atomic_set(&lc->in_sync, 0);
 	}
 	spin_unlock(&log_list_lock);
-	
+
+	/*
 	if (likely(!shutting_down))
 		suspend_server();
+	*/
 
 	return 0;
 }
@@ -1267,7 +1253,9 @@
 		BUG();
 		break;
 	}
+	/*
 	resume_server();
+	*/
 	return 0;
 }
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/03/14 04:28:32	1.1.2.26
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/03/22 22:21:59	1.1.2.27
@@ -911,17 +911,7 @@
 			DMDEBUG("Getting request while server (%u) is suspended:", my_id);
 			DMDEBUG(" - Requester :: %u", nodeid);
 			DMDEBUG(" - log uuid  :: %s", lc->uuid + (strlen(lc->uuid) - 8));
-			DMDEBUG(" - req type  :: %s",
-				(lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
-				(lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-				(lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-				(lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-				(lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-				(lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-				(lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-				(lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-				(lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
-				(lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+			DMDEBUG(" - req type  :: %s", RQ_STRING(lr.lr_type));
 			*/
 			if (my_id != nodeid) {
 				lr.u.lr_int_rtn = -ENXIO;
@@ -981,17 +971,7 @@
 		if(error){
 /*
 			DMWARN("Error (%d) while processing request (%s)",
-			       error,
-			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-			       (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-			       (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-			       (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-			       (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-			       (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
-			       (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+			       error, RQ_STRING(lr.lr_type));
 */
 			lr.u.lr_int_rtn = error;
 		}
@@ -1011,17 +991,7 @@
 		set_fs(fs);
 		if(error < 0){
 			DMWARN("unable to sendmsg to client (type = %s, error = %d)",
-			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-			       (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-			       (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-			       (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-			       (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-			       (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
-			       (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
-			       error);
+			       RQ_STRING(lr.lr_type), error);
 			return error;
 		}
 	} else if(error == -EAGAIN || error == -ETIMEDOUT){
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/02/14 17:44:07	1.1.2.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/03/22 22:21:59	1.1.2.3
@@ -25,8 +25,21 @@
 
 #define CLUSTER_LOG_PORT 51005
 
+#define RQ_STRING(x) \
+	((x) == LRT_IS_CLEAN) ? "LRT_IS_CLEAN": \
+	((x) == LRT_IN_SYNC) ? "LRT_IN_SYNC": \
+	((x) == LRT_MARK_REGION) ? "LRT_MARK_REGION": \
+	((x) == LRT_GET_RESYNC_WORK) ? "LRT_GET_RESYNC_WORK": \
+	((x) == LRT_GET_SYNC_COUNT) ? "LRT_GET_SYNC_COUNT": \
+	((x) == LRT_CLEAR_REGION) ? "LRT_CLEAR_REGION": \
+	((x) == LRT_COMPLETE_RESYNC_WORK) ? "LRT_COMPLETE_RESYNC_WORK": \
+	((x) == LRT_MASTER_LEAVING) ? "LRT_MASTER_LEAVING": \
+	((x) == LRT_ELECTION) ? "LRT_ELECTION": \
+	((x) == LRT_SELECTION) ? "LRT_SELECTION": "UNKNOWN"
+
 struct log_request {
 	int lr_type;
+	int lr_seq;
 	union {
 		struct {
 			uint32_t lr_starter;



             reply	other threads:[~2007-03-22 22:22 UTC|newest]

Thread overview: 40+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-03-22 22:22 jbrassow [this message]
  -- strict thread matches above, loose matches on Subject: below --
2007-10-03 19:02 [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c jbrassow
2007-09-27 20:31 jbrassow
2007-09-26  3:15 jbrassow
2007-09-21 20:07 jbrassow
2007-09-13 15:24 jbrassow
2007-07-11 16:18 jbrassow
2007-04-26 16:55 jbrassow
2007-04-26 16:54 jbrassow
2007-04-24 20:10 jbrassow
2007-04-24 20:08 jbrassow
2007-04-10  7:13 jbrassow
2007-04-10  7:12 jbrassow
2007-04-05 21:33 jbrassow
2007-04-05 21:32 jbrassow
2007-04-03 18:23 jbrassow
2007-04-03 18:21 jbrassow
2007-03-22 22:34 jbrassow
2007-03-14  4:28 jbrassow
2007-02-26 17:38 jbrassow
2007-02-20 19:35 jbrassow
2007-02-19 16:29 jbrassow
2007-02-14 17:44 jbrassow
2007-02-02 17:22 jbrassow
2007-01-08 19:28 jbrassow
2006-12-07 18:58 jbrassow
2006-09-05 17:50 jbrassow
2006-09-05 17:48 jbrassow
2006-07-27 23:11 jbrassow
2006-07-27 23:11 jbrassow
2006-07-22 22:19 jbrassow
2006-07-22 22:19 jbrassow
2006-07-22 22:12 jbrassow
2006-06-29 19:49 jbrassow
2006-06-29 19:48 jbrassow
2006-06-29 19:46 jbrassow
2006-06-27 20:19 jbrassow
2006-06-15 19:48 jbrassow
2006-06-15 19:34 jbrassow
2006-06-13 16:26 jbrassow

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20070322222201.9752.qmail@sourceware.org \
    --to=jbrassow@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.