All of lore.kernel.org
 help / color / mirror / Atom feed
From: jbrassow@sourceware.org <jbrassow@sourceware.org>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
Date: 27 Jul 2006 23:11:56 -0000	[thread overview]
Message-ID: <20060727231156.29572.qmail@sourceware.org> (raw)

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2006-07-27 23:11:55

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c 

Log message:
	- further tightening for 199826
	
	We now refuse to read/write the disk log if we are suspended.  We also
	add BUG_ON(<suspended>) to operations that do I/O to the log device.
	
	The reason for the BUG_ON() is that it is better to drop the machine
	than to have it hang the cluster while it attempts to read/write from
	a suspended device.  That being said, it should now be impossible to
	reach the functions that perform I/O operations while we are
	suspended.
	
	I have still seen cases where the mirror will stall.  However, I think
	this is due to LVM (clvmd), because it happens when a mirror is created
	while the log device is suspended - which must not happen.  I've only
	seen this when doing simultaneous create/convert/remove from all nodes
	in the cluster.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.22&r2=1.1.2.23
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.9&r2=1.1.2.10
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.15&r2=1.1.2.16

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/22 22:19:34	1.1.2.22
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/27 23:11:55	1.1.2.23
@@ -302,7 +302,7 @@
 		lc->server_id = lr.u.lr_coordinator;
 	} else {
 		/* ATTENTION -- what do we do with this ? */
-		DMWARN("Failed to receive election results from server");
+		DMWARN("Failed to receive election results from server: %d", len);
 		error = len;
 	}
 
@@ -363,21 +363,7 @@
 
 	iov.iov_len = sizeof(struct log_request);
 	iov.iov_base = lr;
-/*
-	DMERR("To  :: 0x%x, %s", 
-	       saddr_in.sin_addr.s_addr,
-	       (lr->lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-	       (lr->lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-	       (lr->lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-	       (lr->lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-	       (lr->lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-	       (lr->lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-	       (lr->lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-	       (lr->lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-	       (lr->lr_type == LRT_ELECTION)? "LRT_ELECTION":
-	       (lr->lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN"
-		);
-*/
+
 	if(lr->lr_type == LRT_MARK_REGION){
 		mark_req2ser++;
 	}
@@ -453,25 +439,28 @@
 			       request_retry_count,
 			       request_count,
 			       dm_div_up(request_retry_count*100, request_count));
+			DMDEBUG("Last request:");
+			DMDEBUG(" - my_id   :: %u", my_id);
+			DMDEBUG(" - server  :: %u", lc->server_id);
+			DMDEBUG(" - log uuid:: %s (%s)",
+			       lc->uuid + (strlen(lc->uuid) - 8),
+			       atomic_read(&lc->suspended) ? "suspended" : "active");
+			DMDEBUG(" - request :: %s",
+			       (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
+			       (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+			       (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+			       (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+			       (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+			       (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+			       (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+			       (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+			       (type == LRT_ELECTION)? "LRT_ELECTION":
+			       (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+			DMDEBUG(" - error   :: %d", error);
 		}
 	}
 
 	if(lr) kfree(lr);
-#if 0
-	DMINFO("My (%u) request (%s) to server (%u) failed :: %d",
-	       my_id,
-	       (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-	       (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-	       (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-	       (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-	       (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-	       (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-	       (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-	       (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-	       (type == LRT_ELECTION)? "LRT_ELECTION":
-	       (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
-	       lc->server_id, error);
-#endif
 	return error;
 }
 
@@ -739,7 +728,7 @@
 		DMINFO("Leaving while clear region requests remain.");
 
 	list_del_init(&lc->log_list);
-	if(lc->server_id == my_id)
+	if ((lc->server_id == my_id) && !atomic_read(&lc->suspended))
 		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 	sock_release(lc->client_sock);
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/07/22 22:19:34	1.1.2.9
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/07/27 23:11:55	1.1.2.10
@@ -22,6 +22,7 @@
 #define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
 #define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x)
 #define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x)
+#define DMDEBUG(f, x...) printk(KERN_DEBUG DM_NAME ": " f "\n" , ## x)
 #define DMEMIT(x...) sz += ((sz >= maxlen) ? \
 	  0 : scnprintf(result + sz, maxlen - sz, x))
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/22 22:50:38	1.1.2.15
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/27 23:11:55	1.1.2.16
@@ -107,6 +107,7 @@
 	if (!log->log_dev)
 		return 0;
 
+	BUG_ON(atomic_read(&log->suspended));
 	r = dm_io_sync_vm(1, &log->header_location, READ,
 			  log->disk_header, &ebits);
 	if (unlikely(r))
@@ -137,6 +138,7 @@
 	if (!log->log_dev)
 		return 0;
 
+	BUG_ON(atomic_read(&log->suspended));
 	header_to_disk(&log->header, log->disk_header);
 	return dm_io_sync_vm(1, &log->header_location, WRITE,
 			     log->disk_header, &ebits);
@@ -180,6 +182,7 @@
 	if (!log->log_dev)
 		return 0;
 
+	BUG_ON(atomic_read(&log->suspended));
 	r = dm_io_sync_vm(1, &log->bits_location, READ,
 			  log->clean_bits, &ebits);
 
@@ -196,6 +199,7 @@
 	if (!log->log_dev)
 		return 0;
 
+	BUG_ON(atomic_read(&log->suspended));
 	return dm_io_sync_vm(1, &log->bits_location, WRITE,
 			     log->clean_bits, &ebits);
 }
@@ -295,7 +299,8 @@
 	struct region_user *tmp_ru, *ru;
 	unsigned char live_nodes[16]; /* Attention -- max of 128 nodes... */
 
-	DMINFO("Disk Resume::");
+	DMDEBUG("Disk Resume::  %s (%s)", lc->uuid + (strlen(lc->uuid) - 8),
+		atomic_read(&lc->suspended) ? "suspended" : "active");
 
 	debug_disk_write = 1;
 	memset(live_nodes, 0, sizeof(live_nodes));
@@ -355,20 +360,20 @@
 		}
 	}
 
-	DMINFO("  Live nodes        :: %d", global_count);
-	DMINFO("  In-Use Regions    :: %d", good_count+bad_count);
-	DMINFO("  Good IUR's        :: %d", good_count);
-	DMINFO("  Bad IUR's         :: %d", bad_count);
+	DMDEBUG("  Live nodes        :: %d", global_count);
+	DMDEBUG("  In-Use Regions    :: %d", good_count+bad_count);
+	DMDEBUG("  Good IUR's        :: %d", good_count);
+	DMDEBUG("  Bad IUR's         :: %d", bad_count);
 
 	lc->sync_count = count_bits32(lc->sync_bits, lc->bitset_uint32_count);
 	lc->sync_search = 0;
 
-	DMINFO("  Sync count        :: %Lu", lc->sync_count);
-	DMINFO("  Disk Region count :: %Lu", lc->header.nr_regions);
-	DMINFO("  Region count      :: %Lu", lc->region_count);
+	DMDEBUG("  Sync count        :: %Lu", lc->sync_count);
+	DMDEBUG("  Disk Region count :: %Lu", lc->header.nr_regions);
+	DMDEBUG("  Region count      :: %Lu", lc->region_count);
 
 	if(lc->header.nr_regions != lc->region_count){
-		DMINFO("  NOTE:  Mapping has changed.");
+		DMDEBUG("  NOTE:  Mapping has changed.");
 	}
 /* Take this out for now.
 	if(list_empty(&lc->region_users) && (lc->sync_count != lc->header.nr_regions)){
@@ -398,13 +403,13 @@
 	}			
 
 */
-	DMINFO("Marked regions::");
+	DMDEBUG("Marked regions::");
 	i = print_zero_bits((unsigned char *)lc->clean_bits, 0, lc->region_count);
-	DMINFO("  Total = %d", i);
+	DMDEBUG("  Total = %d", i);
 
-	DMINFO("Out-of-sync regions::");
+	DMDEBUG("Out-of-sync regions::");
 	i = print_zero_bits((unsigned char *)lc->sync_bits, 0, lc->region_count);
-	DMINFO("  Total = %d", i);
+	DMDEBUG("  Total = %d", i);
 
 	/* set the correct number of regions in the header */
 	lc->header.nr_regions = lc->region_count;
@@ -529,7 +534,7 @@
 
 	ru = find_ru(lc, who, lr->u.lr_region);
 	if(!ru){
-		DMINFO("Request to remove unrecorded region user (%u/%Lu)",
+		DMDEBUG("Request to remove unrecorded region user (%u/%Lu)",
 		       who, lr->u.lr_region);
 		return -EINVAL;
 	} else {
@@ -596,7 +601,7 @@
 	   (info < 1000 && !(info%100)) ||
 	   (info < 200 && !(info%25)) ||
 	   (info < 6)){
-		DMINFO(SECTOR_FORMAT " out-of-sync regions remaining for %s.",
+		DMDEBUG(SECTOR_FORMAT " out-of-sync regions remaining for %s.",
 		       lc->region_count - lc->sync_count,
 		       lc->uuid + (strlen(lc->uuid) - 8));
 	}
@@ -843,9 +848,12 @@
 				return -1;
 			}
 			if(lc && (old != lc->server_id) && (my_id == lc->server_id)){
-				DMINFO("I'm the cluster mirror log server for %s",
+				DMDEBUG("I'm the cluster mirror log server for %s",
 				       lc->uuid + (strlen(lc->uuid) - 8));
-				disk_resume(lc);
+				if (!atomic_read(&lc->suspended))
+					disk_resume(lc);
+				else
+					DMDEBUG("Not reading disk log because I'm suspended.");
 			}
 			goto reply;
 		}
@@ -860,6 +868,30 @@
 			goto reply;
 		}
 
+		if (atomic_read(&lc->suspended)) {
+			nodeid = ipaddr_to_nodeid((struct sockaddr *)msg.msg_name);
+			/*
+			DMDEBUG("Getting request while server (%u) is suspended:", my_id);
+			DMDEBUG(" - Requester :: %u", nodeid);
+			DMDEBUG(" - log uuid  :: %s", lc->uuid + (strlen(lc->uuid) - 8));
+			DMDEBUG(" - req type  :: %s",
+				(lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
+				(lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+				(lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+				(lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+				(lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+				(lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+				(lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+				(lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+				(lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
+				(lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+			*/
+			if (my_id != nodeid) {
+				lr.u.lr_int_rtn = -ENXIO;
+				goto reply;
+			}
+		}			
+
 		switch(lr.lr_type){
 		case LRT_IS_CLEAN:
 			error = server_is_clean(lc, &lr);
@@ -1004,7 +1036,10 @@
 			
 			list_for_each_entry(lc, &log_list_head, log_list){
 				if(lc->server_id == my_id){
-					disk_resume(lc);
+					if (!atomic_read(&lc->suspended))
+						disk_resume(lc);
+					else
+						DMDEBUG("Not reading disk log because I'm suspended.");
 				}
 			}
 			break;



             reply	other threads:[~2006-07-27 23:11 UTC|newest]

Thread overview: 40+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-07-27 23:11 jbrassow [this message]
  -- strict thread matches above, loose matches on Subject: below --
2007-10-03 19:02 [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c jbrassow
2007-09-27 20:31 jbrassow
2007-09-26  3:15 jbrassow
2007-09-21 20:07 jbrassow
2007-09-13 15:24 jbrassow
2007-07-11 16:18 jbrassow
2007-04-26 16:55 jbrassow
2007-04-26 16:54 jbrassow
2007-04-24 20:10 jbrassow
2007-04-24 20:08 jbrassow
2007-04-10  7:13 jbrassow
2007-04-10  7:12 jbrassow
2007-04-05 21:33 jbrassow
2007-04-05 21:32 jbrassow
2007-04-03 18:23 jbrassow
2007-04-03 18:21 jbrassow
2007-03-22 22:34 jbrassow
2007-03-22 22:22 jbrassow
2007-03-14  4:28 jbrassow
2007-02-26 17:38 jbrassow
2007-02-20 19:35 jbrassow
2007-02-19 16:29 jbrassow
2007-02-14 17:44 jbrassow
2007-02-02 17:22 jbrassow
2007-01-08 19:28 jbrassow
2006-12-07 18:58 jbrassow
2006-09-05 17:50 jbrassow
2006-09-05 17:48 jbrassow
2006-07-27 23:11 jbrassow
2006-07-22 22:19 jbrassow
2006-07-22 22:19 jbrassow
2006-07-22 22:12 jbrassow
2006-06-29 19:49 jbrassow
2006-06-29 19:48 jbrassow
2006-06-29 19:46 jbrassow
2006-06-27 20:19 jbrassow
2006-06-15 19:48 jbrassow
2006-06-15 19:34 jbrassow
2006-06-13 16:26 jbrassow

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20060727231156.29572.qmail@sourceware.org \
    --to=jbrassow@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.