From: jbrassow@sourceware.org <jbrassow@sourceware.org>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
Date: 22 Mar 2007 22:34:46 -0000 [thread overview]
Message-ID: <20070322223446.14408.qmail@sourceware.org> (raw)
CVSROOT: /cvs/cluster
Module name: cluster
Branch: RHEL45
Changes by: jbrassow at sourceware.org 2007-03-22 22:34:44
Modified files:
cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c
dm-cmirror-xfr.h
Log message:
Bug 233034: cmirror server failure/migration during GFS I/O causes metad...
Add sequence number to messages to ensure
that cmirror clients get the response they expect.
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.41&r2=1.1.2.41.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.26&r2=1.1.2.26.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.2&r2=1.1.2.2.2.1
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2007/03/14 04:28:32 1.1.2.41
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2007/03/22 22:34:44 1.1.2.41.2.1
@@ -53,12 +53,6 @@
/* These vars are just for stats, and will be removed */
static uint32_t request_count=0;
static uint32_t request_retry_count=0;
-static int clear_req=0;
-static int mark_req=0;
-static int insync_req=0;
-static int clear_req2ser=0;
-static int mark_req2ser=0;
-static int insync_req2ser=0;
static void *region_state_alloc(int gfp_mask, void *pool_data){
return kmalloc(sizeof(struct region_state), gfp_mask);
@@ -316,6 +310,7 @@
static int _consult_server(struct log_c *lc, region_t region,
int type, region_t *result, int *retry){
+ static int seq = 0;
int len;
int error=0;
struct sockaddr_in saddr_in;
@@ -336,6 +331,7 @@
memset(lr, 0, sizeof(struct log_request));
lr->lr_type = type;
+ lr->lr_seq = seq;
if(type == LRT_MASTER_LEAVING){
lr->u.lr_starter = my_id;
} else {
@@ -369,18 +365,6 @@
iov.iov_len = sizeof(struct log_request);
iov.iov_base = lr;
- if(lr->lr_type == LRT_MARK_REGION){
- mark_req2ser++;
- }
-
- if(lr->lr_type == LRT_CLEAR_REGION){
- clear_req2ser++;
- }
-
- if(lr->lr_type == LRT_IN_SYNC){
- insync_req2ser++;
- }
-
fs = get_fs();
set_fs(get_ds());
@@ -394,6 +378,7 @@
goto fail;
}
+rerecv:
iov.iov_len = sizeof(struct log_request);
iov.iov_base = lr;
@@ -414,9 +399,44 @@
DMWARN("Error while listening for server response: %d", len);
error = len;
*retry = 1;
+ seq++;
goto fail;
}
+ if (seq != lr->lr_seq) {
+ DMERR("Message sequence number mismatch: %d/%d",
+ seq, lr->lr_seq);
+ if (seq > lr->lr_seq) {
+ DMERR(" Skipping. Listening again for response to %s",
+ RQ_STRING(type));
+ memset(lr, 0, sizeof(struct log_request));
+ goto rerecv;
+ }
+ DMERR(" Must try to resend request, %s", RQ_STRING(type));
+ error = -EBADE;
+ *retry = 1;
+ seq++;
+ goto fail;
+ }
+ seq++;
+
+ if (type != lr->lr_type) {
+ DMERR("Got incorrect message type back: %s/%s",
+ RQ_STRING(type), RQ_STRING(lr->lr_type));
+ error = -EBADE;
+ *retry = 1;
+ goto fail;
+ }
+
+ if (memcmp(lc->uuid, lr->lr_uuid, MAX_NAME_LEN)) {
+ DMERR("Got reply from server for wrong log:");
+ DMERR(" Expected UUID: %s", lc->uuid);
+ DMERR(" Recieved UUID: %s", lr->lr_uuid);
+ error = -EBADE;
+ *retry = 1;
+ goto fail;
+ }
+
if(lr->u.lr_int_rtn == -EAGAIN){
DMWARN("Server (%u), request type %d, -EAGAIN."
" Mirror suspended?",
@@ -453,17 +473,7 @@
DMDEBUG(" - log uuid:: %s (%s)",
lc->uuid + (strlen(lc->uuid) - 8),
atomic_read(&lc->suspended) ? "suspended" : "active");
- DMDEBUG(" - request :: %s",
- (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
- (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
- (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
- (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
- (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
- (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
- (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
- (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
- (type == LRT_ELECTION)? "LRT_ELECTION":
- (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+ DMDEBUG(" - request :: %s", RQ_STRING(type));
DMDEBUG(" - error :: %d", error);
DMINFO("Too many retries, attempting to re-establish server connection.");
lc->server_id = 0xDEAD;
@@ -519,7 +529,7 @@
}
clear_region_count -= i;
DMINFO(" - %d clear region requests wiped", i);
-
+ i=0;
DMINFO(" - Resending all mark region requests");
list_for_each_entry(rs, &marked_region_list, rs_list){
/* Resend only those associated with referenced log */
@@ -527,7 +537,7 @@
continue;
do {
retry = 0;
- DMINFO(" - " SECTOR_FORMAT, rs->rs_region);
+ i++;
rtn = _consult_server(rs->rs_lc, rs->rs_region,
LRT_MARK_REGION, NULL, &retry);
if (lc->server_id == 0xDEAD) {
@@ -536,6 +546,7 @@
}
} while(retry);
}
+ DMINFO(" - %d mark region requests resent", i);
DMINFO("Clean-up complete");
if(type == LRT_MARK_REGION){
/* we just handled all marks */
@@ -544,17 +555,7 @@
goto out;
} else {
DMINFO("Continuing request type, %d (%s)", type,
- (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
- (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
- (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
- (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
- (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
- (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
- (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
- (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
- (type == LRT_ELECTION)? "LRT_ELECTION":
- (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN"
- );
+ RQ_STRING(type));
}
new_server = 0;
}
@@ -886,7 +887,6 @@
struct log_c *lc = (struct log_c *) log->context;
/* check known_regions, return if found */
- insync_req++;
/* take out optimization
if(atomic_read(&lc->in_sync) == 1){
return 1;
@@ -915,8 +915,6 @@
struct region_state *rs, *tmp_rs, *rs_new;
struct log_c *lc = (struct log_c *) log->context;
- mark_req++;
-
rs_new = mempool_alloc(region_state_pool, GFP_KERNEL);
memset(rs_new, 0, sizeof(struct region_state));
@@ -924,8 +922,10 @@
spin_lock(&region_state_lock);
list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list){
if(lc == rs->rs_lc && region == rs->rs_region){
+ /*
DMDEBUG("Mark pre-empting clear (%Lu/%s)",
region, lc->uuid + (strlen(lc->uuid) - 8));
+ */
list_del_init(&rs->rs_list);
list_add(&rs->rs_list, &marked_region_list);
clear_region_count--;
@@ -1007,7 +1007,6 @@
{
struct log_c *lc = (struct log_c *) log->context;
struct region_state *rs, *tmp_rs, *rs_new;
- clear_req++;
rs_new = mempool_alloc(region_state_pool, GFP_ATOMIC);
@@ -1140,21 +1139,6 @@
DMINFO(" Regions marked : %d", j);
DMINFO(" Regions clearing : %d", i);
- DMINFO(" Mark requests : %d", mark_req);
- if(mark_req)
- DMINFO(" Mark req to serv : %d (%d%%)", mark_req2ser,
- (mark_req2ser*100)/mark_req);
-
- DMINFO(" Clear requests : %d", clear_req);
- if(clear_req)
- DMINFO(" Clear req to serv: %d (%d%%)", clear_req2ser,
- (clear_req2ser*100)/clear_req);
-
- DMINFO(" Sync requests : %d", insync_req);
- if(insync_req)
- DMINFO(" Sync req to serv : %d (%d%%)", insync_req2ser,
- (insync_req2ser*100)/insync_req);
-
if(lc->server_id == my_id){
print_server_status(lc);
}
@@ -1216,9 +1200,11 @@
atomic_set(&lc->in_sync, 0);
}
spin_unlock(&log_list_lock);
-
+
+ /*
if (likely(!shutting_down))
suspend_server();
+ */
return 0;
}
@@ -1267,7 +1253,9 @@
BUG();
break;
}
+ /*
resume_server();
+ */
return 0;
}
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2007/03/14 04:28:32 1.1.2.26
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2007/03/22 22:34:44 1.1.2.26.2.1
@@ -911,17 +911,7 @@
DMDEBUG("Getting request while server (%u) is suspended:", my_id);
DMDEBUG(" - Requester :: %u", nodeid);
DMDEBUG(" - log uuid :: %s", lc->uuid + (strlen(lc->uuid) - 8));
- DMDEBUG(" - req type :: %s",
- (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
- (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
- (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
- (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
- (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
- (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
- (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
- (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
- (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
- (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+ DMDEBUG(" - req type :: %s", RQ_STRING(lr.lr_type));
*/
if (my_id != nodeid) {
lr.u.lr_int_rtn = -ENXIO;
@@ -981,17 +971,7 @@
if(error){
/*
DMWARN("Error (%d) while processing request (%s)",
- error,
- (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
- (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
- (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
- (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
- (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
- (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
- (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
- (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
- (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
- (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+ error, RQ_STRING(lr.lr_type));
*/
lr.u.lr_int_rtn = error;
}
@@ -1011,17 +991,7 @@
set_fs(fs);
if(error < 0){
DMWARN("unable to sendmsg to client (type = %s, error = %d)",
- (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
- (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
- (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
- (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
- (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
- (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
- (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
- (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
- (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
- (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
- error);
+ RQ_STRING(lr.lr_type), error);
return error;
}
} else if(error == -EAGAIN || error == -ETIMEDOUT){
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h 2007/02/14 17:44:07 1.1.2.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h 2007/03/22 22:34:44 1.1.2.2.2.1
@@ -25,8 +25,21 @@
#define CLUSTER_LOG_PORT 51005
+#define RQ_STRING(x) \
+ ((x) == LRT_IS_CLEAN) ? "LRT_IS_CLEAN": \
+ ((x) == LRT_IN_SYNC) ? "LRT_IN_SYNC": \
+ ((x) == LRT_MARK_REGION) ? "LRT_MARK_REGION": \
+ ((x) == LRT_GET_RESYNC_WORK) ? "LRT_GET_RESYNC_WORK": \
+ ((x) == LRT_GET_SYNC_COUNT) ? "LRT_GET_SYNC_COUNT": \
+ ((x) == LRT_CLEAR_REGION) ? "LRT_CLEAR_REGION": \
+ ((x) == LRT_COMPLETE_RESYNC_WORK) ? "LRT_COMPLETE_RESYNC_WORK": \
+ ((x) == LRT_MASTER_LEAVING) ? "LRT_MASTER_LEAVING": \
+ ((x) == LRT_ELECTION) ? "LRT_ELECTION": \
+ ((x) == LRT_SELECTION) ? "LRT_SELECTION": "UNKNOWN"
+
struct log_request {
int lr_type;
+ int lr_seq;
union {
struct {
uint32_t lr_starter;
next reply other threads:[~2007-03-22 22:34 UTC|newest]
Thread overview: 40+ messages / expand[flat|nested] mbox.gz Atom feed top
2007-03-22 22:34 jbrassow [this message]
-- strict thread matches above, loose matches on Subject: below --
2007-10-03 19:02 [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c jbrassow
2007-09-27 20:31 jbrassow
2007-09-26 3:15 jbrassow
2007-09-21 20:07 jbrassow
2007-09-13 15:24 jbrassow
2007-07-11 16:18 jbrassow
2007-04-26 16:55 jbrassow
2007-04-26 16:54 jbrassow
2007-04-24 20:10 jbrassow
2007-04-24 20:08 jbrassow
2007-04-10 7:13 jbrassow
2007-04-10 7:12 jbrassow
2007-04-05 21:33 jbrassow
2007-04-05 21:32 jbrassow
2007-04-03 18:23 jbrassow
2007-04-03 18:21 jbrassow
2007-03-22 22:22 jbrassow
2007-03-14 4:28 jbrassow
2007-02-26 17:38 jbrassow
2007-02-20 19:35 jbrassow
2007-02-19 16:29 jbrassow
2007-02-14 17:44 jbrassow
2007-02-02 17:22 jbrassow
2007-01-08 19:28 jbrassow
2006-12-07 18:58 jbrassow
2006-09-05 17:50 jbrassow
2006-09-05 17:48 jbrassow
2006-07-27 23:11 jbrassow
2006-07-27 23:11 jbrassow
2006-07-22 22:19 jbrassow
2006-07-22 22:19 jbrassow
2006-07-22 22:12 jbrassow
2006-06-29 19:49 jbrassow
2006-06-29 19:48 jbrassow
2006-06-29 19:46 jbrassow
2006-06-27 20:19 jbrassow
2006-06-15 19:48 jbrassow
2006-06-15 19:34 jbrassow
2006-06-13 16:26 jbrassow
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20070322223446.14408.qmail@sourceware.org \
--to=jbrassow@sourceware.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).