From: jbrassow@sourceware.org <jbrassow@sourceware.org>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
Date: 27 Sep 2007 20:31:24 -0000 [thread overview]
Message-ID: <20070927203124.27602.qmail@sourceware.org> (raw)
CVSROOT: /cvs/cluster
Module name: cluster
Branch: RHEL4
Changes by: jbrassow at sourceware.org 2007-09-27 20:31:20
Modified files:
cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h
dm-cmirror-server.c
Log message:
Bug 290821: cmirror write path appears deadlocked after recovery ...
In some device failure cases, regions must be marked 'out-of-sync'.
This caused a subsequent write to block because it thought the
region had not yet been recovered, when in fact it had just been
put out-of-sync due to a failing device.
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.53&r2=1.1.2.54
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.15&r2=1.1.2.16
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.39&r2=1.1.2.40
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2007/09/26 03:15:40 1.1.2.53
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2007/09/27 20:31:18 1.1.2.54
@@ -773,6 +773,7 @@
struct region_state *rs, *tmp_rs;
struct log_c *lc = (struct log_c *) log->context;
+ DMDEBUG("cluster_postsuspend");
spin_lock(&lc->state_lock);
if (!list_empty(&lc->mark_waiting)) {
DMERR("Mark requests remain at postsuspend!");
@@ -833,6 +834,7 @@
struct log_c *lc = (struct log_c *) log->context;
lc->sync_search = 0;
+ lc->recovery_halted = 0;
resume_server_requests();
atomic_set(&lc->suspended, 0);
@@ -861,7 +863,7 @@
{
int rtn;
struct log_c *lc = (struct log_c *) log->context;
-
+
if (atomic_read(&lc->in_sync) == 1) {
return 0;
}
@@ -1170,6 +1172,10 @@
region_t rtn;
struct log_c *lc = (struct log_c *) log->context;
+ if (atomic_read(&lc->suspended)) {
+ return (atomic_read(&lc->in_sync)) ? lc->region_count : 0;
+ }
+
/* Try to get sync count up to five times */
for (i = 0; i < 5 && consult_server(lc, 0, LRT_GET_SYNC_COUNT, &rtn); i++);
if(i >= 5){
@@ -1226,6 +1232,7 @@
DMDEBUG(" ?sync_search : %d", lc->sync_search);
DMDEBUG(" in_sync : %s", (atomic_read(&lc->in_sync)) ? "YES" : "NO");
DMDEBUG(" suspended : %s", (atomic_read(&lc->suspended)) ? "YES" : "NO");
+ DMDEBUG(" recovery_halted : %s", (lc->recovery_halted) ? "YES" : "NO");
DMDEBUG(" server_id : %u", lc->server_id);
DMDEBUG(" server_valid: %s",
((lc->server_id != 0xDEAD) && lc->server_valid) ? "YES" : "NO");
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h 2007/09/26 03:15:40 1.1.2.15
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h 2007/09/27 20:31:18 1.1.2.16
@@ -102,6 +102,7 @@
int sync_pass; /* number of passes attempting to resync */
int sync_search;
+ int recovery_halted; /* only useful for is_remote_recovering */
/* Resync flag */
enum sync {
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2007/09/26 03:15:40 1.1.2.39
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2007/09/27 20:31:18 1.1.2.40
@@ -451,6 +451,14 @@
if ((lc->sync_search > lc->region_count) && !lc->sync_pass)
return 0;
+ if (lc->recovery_halted &&
+ (lc->recovering_region != lr->u.lr_region)) {
+ DMDEBUG("Recovery halted, allowing client: %Lu/%s",
+ lr->u.lr_region,
+ lc->uuid + (strlen(lc->uuid) - 8));
+ return 0;
+ }
+
/*
* If the region hasn't been recovered yet,
* we need to block the write
@@ -598,6 +606,12 @@
lr->u.lr_int_rtn = 0; /* Default to no work */
+ if (lc->recovery_halted) {
+ DMDEBUG("Recovery halted due to error on %s",
+ lc->uuid + (strlen(lc->uuid) - 8));
+ return 0;
+ }
+
if (lc->recovering_region != (uint64_t)-1) {
DMDEBUG("Someone is already recovering region %Lu/%s",
lc->recovering_region, lc->uuid + (strlen(lc->uuid) - 8));
@@ -704,11 +718,18 @@
/*
* Recovery failed or mirror is being marked out-of-sync
*
+ * We need to stop dishing out recovery work. If we don't
+ * writes happening to NOSYNC regions can't proceed and the
+ * mirror won't be able to suspend for reconfiguration - due
+ * to the return of is_remote_recovering().
+ *
* We can recieve multiple calls to mark out-of-sync
* if there were several writes to the same region that
* failed. In this case, there will not be a record for
* the region.
*/
+ lc->recovery_halted = 1;
+
ru = find_ru(lc, who, lr->u.lr_region);
if ((lr->u.lr_region == lc->recovering_region) && !ru) {
@@ -873,8 +894,14 @@
* New node joins and needs to know I am the server
* We shortcut the election here and respond directly
* to the inquirer
- */
+ *
if((lc->server_id == my_id) && !atomic_read(&lc->suspended)){
+ */
+ if (lc->server_id == my_id) {
+ if (atomic_read(&lc->suspended)) {
+ DMDEBUG("I'm suspended, but still responding as server: %s",
+ lc->uuid + (strlen(lc->uuid) - 8));
+ }
lr->u.lr_coordinator = my_id;
if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
return -1;
next reply other threads:[~2007-09-27 20:31 UTC|newest]
Thread overview: 40+ messages / expand[flat|nested] mbox.gz Atom feed top
2007-09-27 20:31 jbrassow [this message]
-- strict thread matches above, loose matches on Subject: below --
2007-10-03 19:02 [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c jbrassow
2007-09-26 3:15 jbrassow
2007-09-21 20:07 jbrassow
2007-09-13 15:24 jbrassow
2007-07-11 16:18 jbrassow
2007-04-26 16:55 jbrassow
2007-04-26 16:54 jbrassow
2007-04-24 20:10 jbrassow
2007-04-24 20:08 jbrassow
2007-04-10 7:13 jbrassow
2007-04-10 7:12 jbrassow
2007-04-05 21:33 jbrassow
2007-04-05 21:32 jbrassow
2007-04-03 18:23 jbrassow
2007-04-03 18:21 jbrassow
2007-03-22 22:34 jbrassow
2007-03-22 22:22 jbrassow
2007-03-14 4:28 jbrassow
2007-02-26 17:38 jbrassow
2007-02-20 19:35 jbrassow
2007-02-19 16:29 jbrassow
2007-02-14 17:44 jbrassow
2007-02-02 17:22 jbrassow
2007-01-08 19:28 jbrassow
2006-12-07 18:58 jbrassow
2006-09-05 17:50 jbrassow
2006-09-05 17:48 jbrassow
2006-07-27 23:11 jbrassow
2006-07-27 23:11 jbrassow
2006-07-22 22:19 jbrassow
2006-07-22 22:19 jbrassow
2006-07-22 22:12 jbrassow
2006-06-29 19:49 jbrassow
2006-06-29 19:48 jbrassow
2006-06-29 19:46 jbrassow
2006-06-27 20:19 jbrassow
2006-06-15 19:48 jbrassow
2006-06-15 19:34 jbrassow
2006-06-13 16:26 jbrassow
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20070927203124.27602.qmail@sourceware.org \
--to=jbrassow@sourceware.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.