[Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...

All of lore.kernel.org
 help / color / mirror / Atom feed

From: jbrassow@sourceware.org <jbrassow@sourceware.org>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
Date: 10 Apr 2007 07:13:18 -0000	[thread overview]
Message-ID: <20070410071318.2114.qmail@sourceware.org> (raw)

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL45
Changes by:	jbrassow at sourceware.org	2007-04-10 08:13:15

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c dm-cmirror-server.h 

Log message:
	Bug 235686: Kernel BUG at dm_cmirror_server while recovering region
	
	Several fixes have gone in to fix the handling of this bug:
	1) During server relocation (which can happen due to machine failure
	or normal mirror suspension), the server value could get set before
	the client had a chance to clean-up.  This caused the server to
	become confused and issue a BUG().
	
	2) perform a flush of the log before suspending.  This ensures
	that regions which are in-sync get correctly flushed to the disk
	log.  Without this, there will always be recovery work to be done
	when a mirror starts up - even if it was properly in-sync during
	shutdown.
	
	3) clean-up memory used to record region users when a mirror is
	shutdown.  It was possible for some regions to be left over
	(causing a memory leak) during certain fault scenarios.
	
	4) properly initialize the state field (ru_rw) in the region
	user structure when a mark occurs.  Without the initialization,
	it was sometimes possible for the region to be misinterpretted
	as recovering instead of marked.
	
	5) resolve and unhandled case in server_complete_resync_work
	
	6) reset a variable in cluster_complete_resync_work.  Failure
	to do so was causing a retry to include the wrong value for
	the completion of the resync work - confusing the server.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.41.2.3&r2=1.1.2.41.2.4
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.12.2.1&r2=1.1.2.12.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.26.2.4&r2=1.1.2.26.2.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.h.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.2&r2=1.1.2.2.8.1

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/05 21:33:36	1.1.2.41.2.3
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/10 07:13:15	1.1.2.41.2.4
@@ -373,13 +373,8 @@
 	fs = get_fs();
 	set_fs(get_ds());
 
-	if(type == LRT_MASTER_LEAVING){
-		len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
-				 0, 10);
-	} else {
-		len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
-				 0, 5);
-	}
+	len = my_recvmsg(lc->client_sock, &msg,
+			 sizeof(struct log_request), 0, 15);
 	set_fs(fs);
 
 	if(len <= 0){
@@ -492,9 +487,14 @@
 			goto out;
 		}
 	election:
-		while(lc->server_id == 0xDEAD){
+		while ((lc->server_id == 0xDEAD) || (!lc->server_valid)) {
+			DMDEBUG("server_id=%x, server_valid=%u, %s",
+				lc->server_id, lc->server_valid,
+				lc->uuid + (strlen(lc->uuid) - 8));
+			DMDEBUG("trigger = %s", RQ_STRING(type));
 			run_election(lc, my_id);
 			new_server = 1;
+			lc->server_valid = 1;
 		}
 
 		spin_lock(&lc->state_lock);
@@ -600,6 +600,7 @@
 	INIT_LIST_HEAD(&lc->mark_logged);
 	spin_lock_init(&lc->state_lock);
 
+	lc->server_valid = 0;
 	lc->server_id = 0xDEAD;
 
 	if ((error = cluster_connect())) {
@@ -731,19 +732,20 @@
 	}
 
 	if (!list_empty(&lc->mark_logged)) {
-		DMERR("Mark requests remain at cluster log deactivation");
-		/*
-		 * Should I BUG() this?
-		 * No.  In the worst case, they will get cleaned up later
-		 */
-	}
-	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list) {
-		list_del_init(&rs->rs_list);
-		mempool_free(rs, region_state_pool);
+		int i = 0;
+		
+		list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list) {
+			i++;
+			list_del_init(&rs->rs_list);
+			mempool_free(rs, region_state_pool);
+		}
+		DMDEBUG("%d mark requests remain at cluster log deactivation", i);
 	}
 
 	spin_unlock(&lc->state_lock);
 
+	server_free_region_users(lc);
+
 	if (lc->log_dev)
 		disk_dtr(log);
 	else
@@ -753,8 +755,13 @@
 		DMERR("Unable to disconnect from cluster infrastructure.\n");
 }
 
+static int cluster_flush(struct dirty_log *log);
 static int cluster_presuspend(struct dirty_log *log)
 {
+	/* FIXME: flush is work-around for bug 235040 */
+	DMDEBUG("Performing flush to work around bug 235040");
+	cluster_flush(log);
+	DMDEBUG("Log flush complete");
 	return 0;
 }
 
@@ -784,6 +791,7 @@
 	atomic_set(&lc->suspended, 1);
 	if(lc->server_id == my_id) {
 		while (1) {
+			DMDEBUG("Telling everyone I'm suspending");
 			consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 
 			down(&consult_server_lock);
@@ -791,11 +799,13 @@
 			up(&consult_server_lock);
 
 			if ((my_id && (lc->server_id == my_id))) {
+				DMDEBUG("Delaying suspend, work to be done.");
 				atomic_set(&lc->suspended, 0);
 				set_current_state(TASK_INTERRUPTIBLE);
 				schedule_timeout(HZ*2);
 				atomic_set(&lc->suspended, 1);
 			} else {
+				DMDEBUG("Suspending now");
 				break;
 			}
 		}
@@ -907,7 +917,7 @@
 				DMDEBUG("Delaying mark to region %Lu, due to recovery",
 					rs->rs_region);
 				set_current_state(TASK_INTERRUPTIBLE);
-				schedule_timeout(HZ/2);
+				schedule_timeout(HZ);
 				continue;
 			}
 
@@ -933,7 +943,7 @@
 		if (r == -EBUSY) {
 			DMDEBUG("Delaying flush due to recovery");
 			set_current_state(TASK_INTERRUPTIBLE);
-			schedule_timeout(HZ/2);
+			schedule_timeout(HZ);
 			continue;
 		}
 
@@ -1109,6 +1119,10 @@
 		rtn = 0;
 	}
 
+	if (rtn)
+		DMDEBUG("Client received resync work: %Lu/%s",
+			*region, lc->uuid + (strlen(lc->uuid) - 8));
+
 	/*
 	 * Check for bug 235039
 	 * Note the changes in cluser_clear_region
@@ -1129,10 +1143,16 @@
 static void cluster_complete_resync_work(struct dirty_log *log,
 					 region_t region, int success)
 {
+	int i;
 	region_t success_tmp = success;
 	struct log_c *lc = (struct log_c *) log->context;
-	while(consult_server(lc, region, LRT_COMPLETE_RESYNC_WORK, &success_tmp)){
-		DMWARN("unable to notify server of completed resync work");
+
+	for (i = 0; i < 5; i++) {
+		if (!consult_server(lc, region,
+				    LRT_COMPLETE_RESYNC_WORK, &success_tmp))
+			break;
+		success_tmp = success;
+		DMWARN("Unable to notify server of completed resync work");
 	}
 	return;
 }
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/04/03 18:23:01	1.1.2.12.2.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/04/10 07:13:15	1.1.2.12.2.2
@@ -141,6 +141,7 @@
 	struct list_head mark_waiting;
 	struct list_head mark_logged;
 
+	uint32_t server_valid;
 	uint32_t server_id;
 	struct socket *client_sock;
 };
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/05 21:33:36	1.1.2.26.2.4
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/10 07:13:15	1.1.2.26.2.5
@@ -229,7 +229,8 @@
 	int sync_search, conflict = 0;
 
 	if (lc->recovering_region != (uint64_t)-1) {
-		DMDEBUG("Someone is already recovering region %Lu", lc->recovering_region);
+		DMDEBUG("Someone is already recovering region %Lu/%s",
+			lc->recovering_region, lc->uuid + (strlen(lc->uuid) - 8));
 		return 0;
 	}
 
@@ -253,8 +254,8 @@
 						  sync_search);
 		if (find_ru_by_region(lc, *region)) {
 			conflict = 1;
-			DMDEBUG("Recovery blocked by outstanding write on region %Lu",
-			      *region);
+			DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s",
+				*region, lc->uuid + (strlen(lc->uuid) - 8));
 		} else {
 			break;
 		}
@@ -534,7 +535,8 @@
 
 	new->ru_nodeid = who;
 	new->ru_region = lr->u.lr_region;
-    
+	new->ru_rw = RU_WRITE;
+
 	if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) {
 		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
 		list_add(&new->ru_list, &lc->region_users);
@@ -615,6 +617,11 @@
 					DMDEBUG("  %s", (ru2->ru_rw == RU_RECOVER) ? "recover" :
 						(ru2->ru_rw == RU_WRITE) ? "writer" : "unknown");
 
+			/* FIXME: work-around for bug 235040 */
+			DMDEBUG("Revoking resync work");
+			lc->recovering_region = (uint64_t)-1;
+			list_del(&ru->ru_list);
+			mempool_free(ru, region_user_pool);
 			return -EBUSY;
 		}
 	}
@@ -646,7 +653,8 @@
 		new->ru_region = lr->u.lr_region_rtn;
 		new->ru_rw = RU_RECOVER;
 		list_add(&new->ru_list, &lc->region_users);
-		DMDEBUG("Assigning recovery work to %u: %Lu", who, new->ru_region);
+		DMDEBUG("Assigning recovery work to %u/%s: %Lu",
+			who, lc->uuid + (strlen(lc->uuid) - 8), new->ru_region);
 	} else {
 		mempool_free(new, region_user_pool);
 	}
@@ -654,8 +662,8 @@
 	return 0;
 }
 
-
-static int server_complete_resync_work(struct log_c *lc, struct log_request *lr, int success){
+static int server_complete_resync_work(struct log_c *lc, struct log_request *lr,
+				       int success, uint32_t who){
 	struct region_user *ru;
 
 	if (lr->u.lr_region > lc->region_count) {
@@ -679,51 +687,61 @@
 		lc->sync_pass = 0;
 
 		DMDEBUG("Resync work completed: %Lu", lr->u.lr_region);
-	} else if (log_test_bit(lc->sync_bits, lr->u.lr_region)) {
-		ru = find_ru_by_region(lc, lr->u.lr_region);
+		return 0;
+	}
 
-		/*
-		 * The following condition can never happen unless we have
-		 * a corrupted list or we had a communication error.
-		 *
-		 * If a write failed to one of the mirror devices, the ru
-		 * should be RU_WRITE.  If a recovery failed, it should be
-		 * RU_RECOVER
-		 */
-		if (!ru) {
-			DMERR("Unable to find region being marked out-of-sync: %Lu",
-			      lr->u.lr_region);
-			return -EINVAL;
-		}
+	/*
+	 * Recovery failed or mirror is being marked out-of-sync
+	 *
+	 * We can recieve multiple calls to mark out-of-sync
+	 * if there were several writes to the same region that
+	 * failed.  In this case, there will not be a record for
+	 * the region.
+	 */
+	ru = find_ru(lc, who, lr->u.lr_region);
 
-		if (ru->ru_rw == RU_RECOVER) {
-			if (lr->u.lr_region != lc->recovering_region) {
-				DMERR("Recovering region mismatch: (%Lu/%Lu)",
-				      lr->u.lr_region, lc->recovering_region);
-				BUG();
-			}
-			/*
-			 * Clear the recovery
-			 */
-			lc->recovering_region = (uint64_t)-1;
-			list_del(&ru->ru_list);
-			mempool_free(ru, region_user_pool);
-		} else {  /* ru->ru_rw == RU_WRITE */
-			/*
-			 * Mirror has place the region into RH_NOSYNC
-			 * It is safe to pull the ru
-			 */
-			list_del(&ru->ru_list);
-			mempool_free(ru, region_user_pool);			
+	if ((lr->u.lr_region == lc->recovering_region) && !ru) {
+		DMERR("Unable to locate record of recovery");
+		BUG();
+	}
+
+	if (!ru) {
+		DMERR("Unable to find region to be marked out-of-sync: %Lu/%s/%u",
+		      lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8), who);
+		BUG();
+	} else 	if (ru->ru_rw == RU_RECOVER) {
+		if (lr->u.lr_region != lc->recovering_region) {
+			DMERR("Recovering region mismatch from node %u: (%Lu/%Lu)",
+			      who, lr->u.lr_region, lc->recovering_region);
+			BUG();
 		}
-		/* gone again: lc->sync_count--;*/
-		log_clear_bit(lc, lc->sync_bits, lr->u.lr_region);
+		/*
+		 * Clear the recovery
+		 */
+		DMDEBUG("Setting recovering region out-of-sync: %Lu/%s/%u",
+			lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8), who);
+		lc->recovering_region = (uint64_t)-1;
+		list_del(&ru->ru_list);
+		mempool_free(ru, region_user_pool);
 	}
 
+	/* else if (ru->ru_rw == RU_WRITE)
+	 * Mirror has place the region into RH_NOSYNC
+	 * We will leave the record in place.  It is
+	 * likely there are more requests coming to
+	 * mark this region out-of-sync, due to the
+	 * way mirror handles the situation.
+	 *
+	 DMDEBUG("Setting marked region out-of-sync: %Lu/%s",
+                 lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
+	 */
+
+	/* gone again: lc->sync_count--;*/
+	log_clear_bit(lc, lc->sync_bits, lr->u.lr_region);
+
 	return 0;
 }
 
-
 static int server_get_sync_count(struct log_c *lc, struct log_request *lr){
 	if (lc->sync_count > lc->region_count) {
 		DMERR("sync_count (" SECTOR_FORMAT ") > region_count (" SECTOR_FORMAT ") in %s!",
@@ -901,6 +919,7 @@
 		 * If we are the server, assign it
 		 */
 		if(lr->u.lr_coordinator == my_id){
+			lc->server_valid = 0;
 			lc->server_id = my_id;
 		}
 
@@ -988,7 +1007,8 @@
 				DMDEBUG("I'm the cluster mirror log server for %s",
 				       lc->uuid + (strlen(lc->uuid) - 8));
 				if (atomic_read(&lc->suspended)) {
-					DMDEBUG("Not reading disk log because I'm suspended.");
+					DMDEBUG("Not reading disk log because I'm suspended (%s)",
+						lc->uuid + (strlen(lc->uuid) - 8));
 					
 				} else if (disk_resume(lc) == -EDEADLK) {
 					DMDEBUG("Unable to read disk log - deadlock potential.");
@@ -1060,7 +1080,12 @@
 			error = server_get_resync_work(lc, lr, nodeid);
 			break;
 		case LRT_COMPLETE_RESYNC_WORK:
-			error = server_complete_resync_work(lc, lr, lr->u.lr_int_rtn);
+			if(!(nodeid =
+			     ipaddr_to_nodeid((struct sockaddr *)msg.msg_name))){
+				error = -ENXIO;
+				break;
+			}
+			error = server_complete_resync_work(lc, lr, lr->u.lr_int_rtn, nodeid);
 			lr->u.lr_int_rtn = 0;
 			break;
 		case LRT_GET_SYNC_COUNT:
@@ -1236,6 +1261,22 @@
 }
 */
 
+int server_free_region_users(struct log_c *lc)
+{
+	int i = 0;
+	struct region_user *ru, *tmp_ru;
+
+	list_for_each_entry_safe(ru, tmp_ru, &lc->region_users, ru_list) {
+		i++;
+		list_del(&ru->ru_list);
+		mempool_free(ru, region_user_pool);
+	}
+
+	DMDEBUG("%d region user structures freed", i);
+	return 0;
+}
+
+
 int suspend_server(void){
 	atomic_set(&_suspend, 1);
 	return 0;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h	2005/08/11 18:26:19	1.1.2.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h	2007/04/10 07:13:15	1.1.2.2.8.1
@@ -13,5 +13,6 @@
 int start_server(void);
 void stop_server(void);
 void print_server_status(struct log_c *lc);
+int server_free_region_users(struct log_c *lc);
 
 #endif /* __DM_CMIRROR_SERVER_H__ */

next             reply	other threads:[~2007-04-10  7:13 UTC|newest]

Thread overview: 40+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-04-10  7:13 jbrassow [this message]
  -- strict thread matches above, loose matches on Subject: below --
2007-10-03 19:02 [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c jbrassow
2007-09-27 20:31 jbrassow
2007-09-26  3:15 jbrassow
2007-09-21 20:07 jbrassow
2007-09-13 15:24 jbrassow
2007-07-11 16:18 jbrassow
2007-04-26 16:55 jbrassow
2007-04-26 16:54 jbrassow
2007-04-24 20:10 jbrassow
2007-04-24 20:08 jbrassow
2007-04-10  7:12 jbrassow
2007-04-05 21:33 jbrassow
2007-04-05 21:32 jbrassow
2007-04-03 18:23 jbrassow
2007-04-03 18:21 jbrassow
2007-03-22 22:34 jbrassow
2007-03-22 22:22 jbrassow
2007-03-14  4:28 jbrassow
2007-02-26 17:38 jbrassow
2007-02-20 19:35 jbrassow
2007-02-19 16:29 jbrassow
2007-02-14 17:44 jbrassow
2007-02-02 17:22 jbrassow
2007-01-08 19:28 jbrassow
2006-12-07 18:58 jbrassow
2006-09-05 17:50 jbrassow
2006-09-05 17:48 jbrassow
2006-07-27 23:11 jbrassow
2006-07-27 23:11 jbrassow
2006-07-22 22:19 jbrassow
2006-07-22 22:19 jbrassow
2006-07-22 22:12 jbrassow
2006-06-29 19:49 jbrassow
2006-06-29 19:48 jbrassow
2006-06-29 19:46 jbrassow
2006-06-27 20:19 jbrassow
2006-06-15 19:48 jbrassow
2006-06-15 19:34 jbrassow
2006-06-13 16:26 jbrassow

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20070410071318.2114.qmail@sourceware.org \
    --to=jbrassow@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.