From: jbrassow@sourceware.org <jbrassow@sourceware.org>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
Date: 27 Jul 2006 23:11:56 -0000 [thread overview]
Message-ID: <20060727231156.29572.qmail@sourceware.org> (raw)
CVSROOT: /cvs/cluster
Module name: cluster
Branch: RHEL4
Changes by: jbrassow at sourceware.org 2006-07-27 23:11:55
Modified files:
cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h
dm-cmirror-server.c
Log message:
- further tightening for 199826
We now refuse to read/write the disk log if we are suspended. We also
add BUG_ON(<suspended>) to operations that do I/O to the log device.
The reason for the BUG_ON() is that it is better to drop the machine
than to have it hang the cluster while it attempts to read/write from
a suspended device. That being said, it should now be impossible to
get to those functions which would perform I/O operations during
suspension.
I have still seen cases where the mirror will stall. However, I think
this is due to LVM (clvmd), because it happens when a mirror is created
while the log device is suspended - which must not happen. I've only
seen this when doing simultaneous create/convert/remove from all nodes
in the cluster.
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.22&r2=1.1.2.23
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.9&r2=1.1.2.10
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.15&r2=1.1.2.16
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2006/07/22 22:19:34 1.1.2.22
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2006/07/27 23:11:55 1.1.2.23
@@ -302,7 +302,7 @@
lc->server_id = lr.u.lr_coordinator;
} else {
/* ATTENTION -- what do we do with this ? */
- DMWARN("Failed to receive election results from server");
+ DMWARN("Failed to receive election results from server: %d", len);
error = len;
}
@@ -363,21 +363,7 @@
iov.iov_len = sizeof(struct log_request);
iov.iov_base = lr;
-/*
- DMERR("To :: 0x%x, %s",
- saddr_in.sin_addr.s_addr,
- (lr->lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
- (lr->lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
- (lr->lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
- (lr->lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
- (lr->lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
- (lr->lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
- (lr->lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
- (lr->lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
- (lr->lr_type == LRT_ELECTION)? "LRT_ELECTION":
- (lr->lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN"
- );
-*/
+
if(lr->lr_type == LRT_MARK_REGION){
mark_req2ser++;
}
@@ -453,25 +439,28 @@
request_retry_count,
request_count,
dm_div_up(request_retry_count*100, request_count));
+ DMDEBUG("Last request:");
+ DMDEBUG(" - my_id :: %u", my_id);
+ DMDEBUG(" - server :: %u", lc->server_id);
+ DMDEBUG(" - log uuid:: %s (%s)",
+ lc->uuid + (strlen(lc->uuid) - 8),
+ atomic_read(&lc->suspended) ? "suspended" : "active");
+ DMDEBUG(" - request :: %s",
+ (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
+ (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+ (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+ (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+ (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+ (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+ (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+ (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+ (type == LRT_ELECTION)? "LRT_ELECTION":
+ (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+ DMDEBUG(" - error :: %d", error);
}
}
if(lr) kfree(lr);
-#if 0
- DMINFO("My (%u) request (%s) to server (%u) failed :: %d",
- my_id,
- (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
- (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
- (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
- (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
- (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
- (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
- (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
- (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
- (type == LRT_ELECTION)? "LRT_ELECTION":
- (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
- lc->server_id, error);
-#endif
return error;
}
@@ -739,7 +728,7 @@
DMINFO("Leaving while clear region requests remain.");
list_del_init(&lc->log_list);
- if(lc->server_id == my_id)
+ if ((lc->server_id == my_id) && !atomic_read(&lc->suspended))
consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
sock_release(lc->client_sock);
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h 2006/07/22 22:19:34 1.1.2.9
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h 2006/07/27 23:11:55 1.1.2.10
@@ -22,6 +22,7 @@
#define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
#define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x)
#define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x)
+#define DMDEBUG(f, x...) printk(KERN_DEBUG DM_NAME ": " f "\n" , ## x)
#define DMEMIT(x...) sz += ((sz >= maxlen) ? \
0 : scnprintf(result + sz, maxlen - sz, x))
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2006/07/22 22:50:38 1.1.2.15
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2006/07/27 23:11:55 1.1.2.16
@@ -107,6 +107,7 @@
if (!log->log_dev)
return 0;
+ BUG_ON(atomic_read(&log->suspended));
r = dm_io_sync_vm(1, &log->header_location, READ,
log->disk_header, &ebits);
if (unlikely(r))
@@ -137,6 +138,7 @@
if (!log->log_dev)
return 0;
+ BUG_ON(atomic_read(&log->suspended));
header_to_disk(&log->header, log->disk_header);
return dm_io_sync_vm(1, &log->header_location, WRITE,
log->disk_header, &ebits);
@@ -180,6 +182,7 @@
if (!log->log_dev)
return 0;
+ BUG_ON(atomic_read(&log->suspended));
r = dm_io_sync_vm(1, &log->bits_location, READ,
log->clean_bits, &ebits);
@@ -196,6 +199,7 @@
if (!log->log_dev)
return 0;
+ BUG_ON(atomic_read(&log->suspended));
return dm_io_sync_vm(1, &log->bits_location, WRITE,
log->clean_bits, &ebits);
}
@@ -295,7 +299,8 @@
struct region_user *tmp_ru, *ru;
unsigned char live_nodes[16]; /* Attention -- max of 128 nodes... */
- DMINFO("Disk Resume::");
+ DMDEBUG("Disk Resume:: %s (%s)", lc->uuid + (strlen(lc->uuid) - 8),
+ atomic_read(&lc->suspended) ? "suspended" : "active");
debug_disk_write = 1;
memset(live_nodes, 0, sizeof(live_nodes));
@@ -355,20 +360,20 @@
}
}
- DMINFO(" Live nodes :: %d", global_count);
- DMINFO(" In-Use Regions :: %d", good_count+bad_count);
- DMINFO(" Good IUR's :: %d", good_count);
- DMINFO(" Bad IUR's :: %d", bad_count);
+ DMDEBUG(" Live nodes :: %d", global_count);
+ DMDEBUG(" In-Use Regions :: %d", good_count+bad_count);
+ DMDEBUG(" Good IUR's :: %d", good_count);
+ DMDEBUG(" Bad IUR's :: %d", bad_count);
lc->sync_count = count_bits32(lc->sync_bits, lc->bitset_uint32_count);
lc->sync_search = 0;
- DMINFO(" Sync count :: %Lu", lc->sync_count);
- DMINFO(" Disk Region count :: %Lu", lc->header.nr_regions);
- DMINFO(" Region count :: %Lu", lc->region_count);
+ DMDEBUG(" Sync count :: %Lu", lc->sync_count);
+ DMDEBUG(" Disk Region count :: %Lu", lc->header.nr_regions);
+ DMDEBUG(" Region count :: %Lu", lc->region_count);
if(lc->header.nr_regions != lc->region_count){
- DMINFO(" NOTE: Mapping has changed.");
+ DMDEBUG(" NOTE: Mapping has changed.");
}
/* Take this out for now.
if(list_empty(&lc->region_users) && (lc->sync_count != lc->header.nr_regions)){
@@ -398,13 +403,13 @@
}
*/
- DMINFO("Marked regions::");
+ DMDEBUG("Marked regions::");
i = print_zero_bits((unsigned char *)lc->clean_bits, 0, lc->region_count);
- DMINFO(" Total = %d", i);
+ DMDEBUG(" Total = %d", i);
- DMINFO("Out-of-sync regions::");
+ DMDEBUG("Out-of-sync regions::");
i = print_zero_bits((unsigned char *)lc->sync_bits, 0, lc->region_count);
- DMINFO(" Total = %d", i);
+ DMDEBUG(" Total = %d", i);
/* set the correct number of regions in the header */
lc->header.nr_regions = lc->region_count;
@@ -529,7 +534,7 @@
ru = find_ru(lc, who, lr->u.lr_region);
if(!ru){
- DMINFO("Request to remove unrecorded region user (%u/%Lu)",
+ DMDEBUG("Request to remove unrecorded region user (%u/%Lu)",
who, lr->u.lr_region);
return -EINVAL;
} else {
@@ -596,7 +601,7 @@
(info < 1000 && !(info%100)) ||
(info < 200 && !(info%25)) ||
(info < 6)){
- DMINFO(SECTOR_FORMAT " out-of-sync regions remaining for %s.",
+ DMDEBUG(SECTOR_FORMAT " out-of-sync regions remaining for %s.",
lc->region_count - lc->sync_count,
lc->uuid + (strlen(lc->uuid) - 8));
}
@@ -843,9 +848,12 @@
return -1;
}
if(lc && (old != lc->server_id) && (my_id == lc->server_id)){
- DMINFO("I'm the cluster mirror log server for %s",
+ DMDEBUG("I'm the cluster mirror log server for %s",
lc->uuid + (strlen(lc->uuid) - 8));
- disk_resume(lc);
+ if (!atomic_read(&lc->suspended))
+ disk_resume(lc);
+ else
+ DMDEBUG("Not reading disk log because I'm suspended.");
}
goto reply;
}
@@ -860,6 +868,30 @@
goto reply;
}
+ if (atomic_read(&lc->suspended)) {
+ nodeid = ipaddr_to_nodeid((struct sockaddr *)msg.msg_name);
+ /*
+ DMDEBUG("Getting request while server (%u) is suspended:", my_id);
+ DMDEBUG(" - Requester :: %u", nodeid);
+ DMDEBUG(" - log uuid :: %s", lc->uuid + (strlen(lc->uuid) - 8));
+ DMDEBUG(" - req type :: %s",
+ (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
+ (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+ (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+ (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+ (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+ (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+ (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+ (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+ (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
+ (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+ */
+ if (my_id != nodeid) {
+ lr.u.lr_int_rtn = -ENXIO;
+ goto reply;
+ }
+ }
+
switch(lr.lr_type){
case LRT_IS_CLEAN:
error = server_is_clean(lc, &lr);
@@ -1004,7 +1036,10 @@
list_for_each_entry(lc, &log_list_head, log_list){
if(lc->server_id == my_id){
- disk_resume(lc);
+ if (!atomic_read(&lc->suspended))
+ disk_resume(lc);
+ else
+ DMDEBUG("Not reading disk log because I'm suspended.");
}
}
break;
next reply other threads:[~2006-07-27 23:11 UTC|newest]
Thread overview: 40+ messages / expand[flat|nested] mbox.gz Atom feed top
2006-07-27 23:11 jbrassow [this message]
-- strict thread matches above, loose matches on Subject: below --
2007-10-03 19:02 [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c jbrassow
2007-09-27 20:31 jbrassow
2007-09-26 3:15 jbrassow
2007-09-21 20:07 jbrassow
2007-09-13 15:24 jbrassow
2007-07-11 16:18 jbrassow
2007-04-26 16:55 jbrassow
2007-04-26 16:54 jbrassow
2007-04-24 20:10 jbrassow
2007-04-24 20:08 jbrassow
2007-04-10 7:13 jbrassow
2007-04-10 7:12 jbrassow
2007-04-05 21:33 jbrassow
2007-04-05 21:32 jbrassow
2007-04-03 18:23 jbrassow
2007-04-03 18:21 jbrassow
2007-03-22 22:34 jbrassow
2007-03-22 22:22 jbrassow
2007-03-14 4:28 jbrassow
2007-02-26 17:38 jbrassow
2007-02-20 19:35 jbrassow
2007-02-19 16:29 jbrassow
2007-02-14 17:44 jbrassow
2007-02-02 17:22 jbrassow
2007-01-08 19:28 jbrassow
2006-12-07 18:58 jbrassow
2006-09-05 17:50 jbrassow
2006-09-05 17:48 jbrassow
2006-07-27 23:11 jbrassow
2006-07-22 22:19 jbrassow
2006-07-22 22:19 jbrassow
2006-07-22 22:12 jbrassow
2006-06-29 19:49 jbrassow
2006-06-29 19:48 jbrassow
2006-06-29 19:46 jbrassow
2006-06-27 20:19 jbrassow
2006-06-15 19:48 jbrassow
2006-06-15 19:34 jbrassow
2006-06-13 16:26 jbrassow
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20060727231156.29572.qmail@sourceware.org \
--to=jbrassow@sourceware.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.