* [Cluster-devel] cluster/group/gfs_controld recover.c
@ 2006-10-16 15:09 teigland
0 siblings, 0 replies; 10+ messages in thread
From: teigland @ 2006-10-16 15:09 UTC (permalink / raw)
To: cluster-devel.redhat.com
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: teigland at sourceware.org 2006-10-16 15:09:26
Modified files:
group/gfs_controld: recover.c
Log message:
fix typo in debug message
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.20&r2=1.21
--- cluster/group/gfs_controld/recover.c 2006/10/16 14:44:02 1.20
+++ cluster/group/gfs_controld/recover.c 2006/10/16 15:09:25 1.21
@@ -826,7 +826,7 @@
which means no first mounter recovery is needed or is current */
if (mg->global_first_recover_done) {
- log_group(mg, "assign_journal: global_firsts_recover_done");
+ log_group(mg, "assign_journal: global_first_recover_done");
goto out;
}
* [Cluster-devel] cluster/group/gfs_controld recover.c
@ 2007-06-06 15:47 teigland
0 siblings, 0 replies; 10+ messages in thread
From: teigland @ 2007-06-06 15:47 UTC (permalink / raw)
To: cluster-devel.redhat.com
CVSROOT: /cvs/cluster
Module name: cluster
Branch: RHEL5
Changes by: teigland at sourceware.org 2007-06-06 15:47:42
Modified files:
group/gfs_controld: recover.c
Log message:
return a different error number to mount.gfs for each specific failure
case, so mount can translate that into a helpful error message
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.23.2.6&r2=1.23.2.7
--- cluster/group/gfs_controld/recover.c 2007/04/26 19:01:42 1.23.2.6
+++ cluster/group/gfs_controld/recover.c 2007/06/06 15:47:42 1.23.2.7
@@ -1537,7 +1537,7 @@
if (strcmp(mg->dev, dev)) {
log_error("different fs dev %s with same name", mg->dev);
- return -EINVAL;
+ return -EADDRINUSE;
}
if (find_mountpoint(mg, dir)) {
@@ -1575,14 +1575,14 @@
struct mountgroup *mg = NULL;
char table2[MAXLINE];
char *cluster = NULL, *name = NULL;
- int rv;
+ int rv, new_mg = 0;
log_debug("mount: %s %s %s %s %s %s",
dir, type, proto, table, options, dev);
if (strcmp(proto, "lock_dlm")) {
log_error("mount: lockproto %s not supported", proto);
- rv = -EINVAL;
+ rv = -EPROTONOSUPPORT;
goto out;
}
@@ -1590,7 +1590,7 @@
strstr(options, "first=") ||
strstr(options, "id=")) {
log_error("mount: jid, first and id are reserved options");
- rv = -EINVAL;
+ rv = -EOPNOTSUPP;
goto out;
}
@@ -1601,7 +1601,7 @@
name = strstr(table2, ":");
if (!name) {
- rv = -EINVAL;
+ rv = -EBADFD;
goto out;
}
@@ -1625,6 +1625,7 @@
rv = -ENOMEM;
goto out;
}
+ new_mg = 1;
mg->mount_client = ci;
strncpy(mg->type, type, sizeof(mg->type));
@@ -1634,7 +1635,7 @@
if (strlen(cluster) != strlen(clustername) ||
strlen(cluster) == 0 || strcmp(cluster, clustername)) {
- rv = -1;
+ rv = -EBADR;
log_error("mount: fs requires cluster=\"%s\" current=\"%s\"",
cluster, clustername);
goto out;
@@ -1646,7 +1647,7 @@
mg->spectator = 1;
} else {
if (!we_are_in_fence_domain()) {
- rv = -EINVAL;
+ rv = -ENOANO;
log_error("mount: not in default fence domain");
goto out;
}
@@ -1656,7 +1657,7 @@
mg->rw = 1;
else if (strstr(options, "ro")) {
if (mg->spectator) {
- rv = -EINVAL;
+ rv = -EROFS;
log_error("mount: readonly invalid with spectator");
goto out;
}
@@ -1664,7 +1665,7 @@
}
if (strlen(options) > MAX_OPTIONS_LEN-1) {
- rv = -EINVAL;
+ rv = -EMLINK;
log_error("mount: options too long %d", strlen(options));
goto out;
}
@@ -1673,8 +1674,12 @@
group_join(gh, name);
rv = 0;
out:
- *mg_ret = mg;
- log_group(mg, "do_mount: rv %d", rv);
+ if (mg) {
+ *mg_ret = mg;
+ log_group(mg, "do_mount: rv %d", rv);
+ }
+ if (rv && new_mg)
+ free(mg);
return rv;
}
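The translation side of this contract lives in mount.gfs, which is not part of this diff. As a minimal sketch, with an illustrative helper name and message wording (only the errno values come from the patch above), the mapping could look like:

#include <errno.h>

/* Hypothetical sketch of the mount.gfs side: map the negative errno
   returned by gfs_controld's do_mount() to a helpful message. Function
   name and wording are illustrative, not the real mount.gfs code; the
   errno values match the patch above. */
static const char *mount_error_msg(int rv)
{
	switch (-rv) {
	case EADDRINUSE:      return "a different device is already mounted with this fs name";
	case EPROTONOSUPPORT: return "lockproto not supported (lock_dlm required)";
	case EOPNOTSUPP:      return "jid, first and id are reserved mount options";
	case EBADFD:          return "fs table name is malformed (expected cluster:fsname)";
	case EBADR:           return "fs belongs to a different cluster";
	case ENOANO:          return "node is not in the default fence domain";
	case EROFS:           return "ro cannot be combined with spectator";
	case EMLINK:          return "mount options string too long";
	case ENOMEM:          return "out of memory";
	default:              return "unknown mount failure";
	}
}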
* [Cluster-devel] cluster/group/gfs_controld recover.c
@ 2007-06-06 15:44 teigland
0 siblings, 0 replies; 10+ messages in thread
From: teigland @ 2007-06-06 15:44 UTC (permalink / raw)
To: cluster-devel.redhat.com
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: teigland at sourceware.org 2007-06-06 15:44:49
Modified files:
group/gfs_controld: recover.c
Log message:
return a different error number to mount.gfs for each specific failure
case, so mount can translate that into a helpful error message
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.29&r2=1.30
--- cluster/group/gfs_controld/recover.c 2007/04/26 19:01:34 1.29
+++ cluster/group/gfs_controld/recover.c 2007/06/06 15:44:49 1.30
@@ -1537,7 +1537,7 @@
if (strcmp(mg->dev, dev)) {
log_error("different fs dev %s with same name", mg->dev);
- return -EINVAL;
+ return -EADDRINUSE;
}
if (find_mountpoint(mg, dir)) {
@@ -1575,14 +1575,14 @@
struct mountgroup *mg = NULL;
char table2[MAXLINE];
char *cluster = NULL, *name = NULL;
- int rv;
+ int rv, new_mg = 0;
log_debug("mount: %s %s %s %s %s %s",
dir, type, proto, table, options, dev);
if (strcmp(proto, "lock_dlm")) {
log_error("mount: lockproto %s not supported", proto);
- rv = -EINVAL;
+ rv = -EPROTONOSUPPORT;
goto out;
}
@@ -1590,7 +1590,7 @@
strstr(options, "first=") ||
strstr(options, "id=")) {
log_error("mount: jid, first and id are reserved options");
- rv = -EINVAL;
+ rv = -EOPNOTSUPP;
goto out;
}
@@ -1601,7 +1601,7 @@
name = strstr(table2, ":");
if (!name) {
- rv = -EINVAL;
+ rv = -EBADFD;
goto out;
}
@@ -1625,6 +1625,7 @@
rv = -ENOMEM;
goto out;
}
+ new_mg = 1;
mg->mount_client = ci;
strncpy(mg->type, type, sizeof(mg->type));
@@ -1634,7 +1635,7 @@
if (strlen(cluster) != strlen(clustername) ||
strlen(cluster) == 0 || strcmp(cluster, clustername)) {
- rv = -1;
+ rv = -EBADR;
log_error("mount: fs requires cluster=\"%s\" current=\"%s\"",
cluster, clustername);
goto out;
@@ -1646,7 +1647,7 @@
mg->spectator = 1;
} else {
if (!we_are_in_fence_domain()) {
- rv = -EINVAL;
+ rv = -ENOANO;
log_error("mount: not in default fence domain");
goto out;
}
@@ -1656,7 +1657,7 @@
mg->rw = 1;
else if (strstr(options, "ro")) {
if (mg->spectator) {
- rv = -EINVAL;
+ rv = -EROFS;
log_error("mount: readonly invalid with spectator");
goto out;
}
@@ -1664,7 +1665,7 @@
}
if (strlen(options) > MAX_OPTIONS_LEN-1) {
- rv = -EINVAL;
+ rv = -EMLINK;
log_error("mount: options too long %d", strlen(options));
goto out;
}
@@ -1673,8 +1674,12 @@
group_join(gh, name);
rv = 0;
out:
- *mg_ret = mg;
- log_group(mg, "do_mount: rv %d", rv);
+ if (mg) {
+ *mg_ret = mg;
+ log_group(mg, "do_mount: rv %d", rv);
+ }
+ if (rv && new_mg)
+ free(mg);
return rv;
}
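Besides the errno changes, this revision plugs a leak on the error path: do_mount() previously dropped a newly allocated mountgroup when a later check failed. A minimal sketch of the allocate-track-free idiom the new_mg flag implements, with illustrative names and a stubbed lookup (not the committed code):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct resource {
	char name[64];
};

/* stub standing in for the existing-group lookup: real code searches a list */
static struct resource *lookup(const char *name)
{
	(void)name;
	return NULL;
}

static int do_setup(const char *name, struct resource **res_ret)
{
	struct resource *res;
	int rv, newly_allocated = 0;

	res = lookup(name);
	if (!res) {
		res = calloc(1, sizeof(*res));
		if (!res)
			return -ENOMEM;
		newly_allocated = 1;	/* remember: this call owns the allocation */
		snprintf(res->name, sizeof(res->name), "%s", name);
	}

	if (!res->name[0]) {		/* stand-in for the validation steps */
		rv = -EINVAL;
		goto out;
	}
	rv = 0;
 out:
	if (!rv)
		*res_ret = res;
	else if (newly_allocated)
		free(res);	/* failing: don't leak what this call created */
	return rv;
}

The sketch publishes the pointer only on success; the committed version above still assigns *mg_ret first so the failure can be logged against the group, then frees a newly created mg.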
* [Cluster-devel] cluster/group/gfs_controld recover.c
@ 2006-12-19 22:20 teigland
0 siblings, 0 replies; 10+ messages in thread
From: teigland @ 2006-12-19 22:20 UTC (permalink / raw)
To: cluster-devel.redhat.com
CVSROOT: /cvs/cluster
Module name: cluster
Branch: RHEL50
Changes by: teigland at sourceware.org 2006-12-19 22:20:08
Modified files:
group/gfs_controld: recover.c
Log message:
Fixes related to the needs_recovery state and first-mounter recovery.
Probably not perfect yet, but working in the tests I'm able to contrive.
bz 218551
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&only_with_tag=RHEL50&r1=1.23.4.3&r2=1.23.4.4
--- cluster/group/gfs_controld/recover.c 2006/12/19 17:07:22 1.23.4.3
+++ cluster/group/gfs_controld/recover.c 2006/12/19 22:20:08 1.23.4.4
@@ -29,6 +29,7 @@
void start_spectator_2(struct mountgroup *mg);
void notify_mount_client(struct mountgroup *mg);
+
int set_sysfs(struct mountgroup *mg, char *field, int val)
{
char fname[512];
@@ -325,6 +326,9 @@
continue;
if (memb->jid == -9)
continue;
+ if (memb->spectator || memb->readonly || memb->withdrawing ||
+ memb->ms_kernel_mount_done)
+ continue;
if (low == -1 || memb->nodeid < low) {
next = memb;
low = memb->nodeid;
@@ -641,12 +645,11 @@
goto out;
}
- if (mg->needs_recovery) {
- log_group(mg, "receive_remount from %d needs_recovery", from);
- msg = "error: needs recovery";
- error = -1;
- goto out;
- }
+ /* FIXME: check if we've even fully completed our normal mount yet
+ (received our own mount-status?) if not, then disallow remount */
+
+ /* FIXME: going ro->rw may mean we can now do journal or first-mounter
+ recovery that we couldn't do before. */
memb->readonly = ro;
memb->rw = !ro;
@@ -746,30 +749,19 @@
else if (memb->readonly)
ro_count++;
- if (memb->opts & MEMB_OPT_RECOVER)
+ if (memb->opts & MEMB_OPT_RECOVER) {
memb_recover = memb;
+ log_group(mg, "assign_journal: memb %d has OPT_RECOVER",
+ memb->nodeid);
+ }
if (memb->ms_kernel_mount_done && !memb->ms_kernel_mount_error)
memb_mounted = memb;
}
- log_group(mg, "assign_journal: total %d iv %d rw %d ro %d spect %d",
- total, invalid_count, rw_count, ro_count, spect_count);
-
- /* do we let the new member mount? jid=-2 means no.
- - we only allow an rw mount when the fs needs recovery
- - we only allow a single rw mount when the fs needs recovery */
-
- if (mg->needs_recovery) {
- if (!new->rw || rw_count)
- new->jid = -2;
- }
-
- if (new->jid == -2) {
- log_group(mg, "assign_journal: fail - needs_recovery %d",
- mg->needs_recovery);
- goto out;
- }
+ log_group(mg, "assign_journal: total %d iv %d rw %d ro %d spect %d "
+ "needs_recovery %d", total, invalid_count, rw_count,
+ ro_count, spect_count, mg->needs_recovery);
if (new->spectator) {
log_group(mg, "assign_journal: new spectator allowed");
@@ -785,17 +777,33 @@
}
}
- /* Currently the fs needs recovery, i.e. none of the current
- mounters (ro/spectators) can recover journals. So, this new rw
- mounter is told to do first-mounter recovery of all the journals. */
-
+ /* Repeat first-mounter recovery: the fs has been mounted and in-use,
+ but nodes have failed and none of the current mounters has been able
+ to do recovery (all remaining nodes may be ro/spect for example).
+ This puts us into the special "needs_recovery" state where new
+ mounters are asked to do first-mounter recovery of the fs while
+ the current mounters sit in a blocked state. */
+
if (mg->needs_recovery) {
- log_group(mg, "assign_journal: memb %d gets OPT_RECOVER, "
- "needs_recovery", new->nodeid);
- new->opts |= MEMB_OPT_RECOVER;
+ if (!memb_recover) {
+ log_group(mg, "assign_journal: needs_recovery: "
+ "new memb %d gets OPT_RECOVER",
+ new->nodeid);
+ new->opts |= MEMB_OPT_RECOVER;
+ } else {
+ log_group(mg, "assign_journal: needs_recovery: "
+ "new memb %d memb %d has OPT_RECOVER",
+ new->nodeid, memb_recover->nodeid);
+ }
goto out;
}
+ /* Initial first-mounter recovery: the fs is coming online, the first
+ mg member assumes first-mounter role and other nodes join the mg
+ while the first-mounter is working. These non-first mounters wait
+ for the first-mounter to finish before notifying mount.gfs. If the
+ first-mounter fails, one of them will become the first-mounter. */
+
/* it shouldn't be possible to have someone doing first mounter
recovery and also have someone with the fs fully mounted */
@@ -839,7 +847,8 @@
mg->kernel_mount_done, mg->kernel_mount_error,
mg->first_mounter, mg->first_mounter_done);
- log_group(mg, "assign_journal: memb %d gets OPT_RECOVER", new->nodeid);
+ log_group(mg, "assign_journal: new memb %d gets OPT_RECOVER for: "
+ "fs not mounted", new->nodeid);
new->opts |= MEMB_OPT_RECOVER;
out:
@@ -1006,7 +1015,7 @@
/* delay notifying mount client until we get a successful
mount status from the first mounter */
log_group(mg, "other node doing first mounter recovery, "
- "delay notify_mount_client");
+ "set mount_client_delay");
mg->mount_client_delay = 1;
mg->save_plocks = 0;
return;
@@ -1402,7 +1411,6 @@
if (memb_gone_recover) {
log_group(mg, "failed node %d had MEMB_OPT_RECOVER",
memb_gone_recover->nodeid);
- ASSERT(!mg->mount_client_notified);
memb_gone_recover->tell_gfs_to_recover = 0;
}
@@ -2168,14 +2176,39 @@
return 0;
}
-/* FIXME: what happens if a node is unmounting, others have it in members_gone,
- and it crashes? It shouldn't need journal recovery since the kernel umount
- happens before leaving the group. */
+/* After a start that initiated a recovery, everyone will go and see if they
+ can do recovery and try if they can. If a node can't, it does start_done,
+ if it tries and fails, it does start_done, if it tries and succeeds it
+ sends a message and then does start_done once it receives it back. So,
+ when we get a finish we know that we have all the results from the recovery
+ cycle and can judge if everything is recovered properly or not. If so, we
+ can unblock locks (in the finish), if not, we leave them blocked (in the
+ finish).
+
+ If we leave locks blocked in the finish, then they can only be unblocked
+ after someone is able to do the recovery that's needed. So, leaving locks
+ blocked in a finish because recovery hasn't worked puts us into a special
+ state: the fs needs recovery, none of the current mounters has been able to
+ recover it, all current mounters have locks blocked in gfs, new mounters
+ are allowed, nodes can unmount, new mounters are asked to do first-mounter
+ recovery, if one of them succeeds then we can all clear this special state
+ and unblock locks (the unblock would happen upon receiving the success
+ message from the new pseudo-first mounter, not as part of a finish), future
+ finishes would then go back to being able to unblock locks.
+
+ While in this special state, a new node has been added and asked to do
+ first-mounter recovery, other nodes can also be added while the new
+ first-mounter is active. These other nodes don't notify mount.gfs.
+ They'll receive the result of the first mounter and if it succeeded they'll
+ notify mount.gfs, otherwise one of them will become the next first-mounter
+ and notify mount.gfs. */
int do_finish(struct mountgroup *mg)
{
struct mg_member *memb, *safe;
- int leave_blocked = 0;
+
+ log_group(mg, "finish %d needs_recovery %d", mg->last_finish,
+ mg->needs_recovery);
/* members_gone list are the members that were removed from the
members list when processing a start. members are removed
@@ -2192,11 +2225,10 @@
list_del(&memb->list);
free(memb);
} else {
+ log_error("%s finish: needs recovery jid %d nodeid %d "
+ "status %d", mg->name, memb->jid,
+ memb->nodeid, memb->recovery_status);
mg->needs_recovery = 1;
- log_group(mg, "finish: needs recovery "
- "jid %d nodeid %d status %d",
- memb->jid, memb->nodeid,
- memb->recovery_status);
}
}
@@ -2210,12 +2242,7 @@
return 0;
}
- if (mg->needs_recovery) {
- log_group(mg, "finish: leave locks blocked for needs_recovery");
- leave_blocked = 1;
- }
-
- if (!leave_blocked) {
+ if (!mg->needs_recovery) {
set_sysfs(mg, "block", 0);
/* we may have been holding back our local mount due to
@@ -2224,7 +2251,8 @@
mg->mount_client_delay = 0;
notify_mount_client(mg);
}
- }
+ } else
+ log_group(mg, "finish: leave locks blocked for needs_recovery");
return 0;
}
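The long comment above describes the cycle in prose; as a condensed sketch, the finish-time decision it leads to looks roughly like the following (simplified types, printf stubs for the sysfs write and the mount.gfs notification; not the literal gfs_controld code):

#include <stdio.h>

struct group_state {
	int needs_recovery;	/* a gone member's journal was not recovered */
	int mount_client_delay;	/* local mount held back behind another mounter */
};

static void block_locks(struct group_state *g, int block)
{
	(void)g;
	printf("set block=%d\n", block);	/* stub for set_sysfs(mg, "block", ...) */
}

static void notify_mount_client(struct group_state *g)
{
	(void)g;
	printf("notify mount.gfs\n");		/* stub for the real notifier */
}

static void finish(struct group_state *g)
{
	if (!g->needs_recovery) {
		/* every failed journal was recovered this cycle: unblock
		   gfs locks and release any held-back local mount */
		block_locks(g, 0);
		if (g->mount_client_delay) {
			g->mount_client_delay = 0;
			notify_mount_client(g);
		}
	} else {
		/* leave locks blocked; this state clears only when a new
		   mounter succeeds at first-mounter recovery, via its
		   success message rather than a later finish */
		printf("locks stay blocked: needs_recovery\n");
	}
}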
* [Cluster-devel] cluster/group/gfs_controld recover.c
@ 2006-12-19 22:19 teigland
0 siblings, 0 replies; 10+ messages in thread
From: teigland @ 2006-12-19 22:19 UTC (permalink / raw)
To: cluster-devel.redhat.com
CVSROOT: /cvs/cluster
Module name: cluster
Branch: RHEL5
Changes by: teigland at sourceware.org 2006-12-19 22:19:59
Modified files:
group/gfs_controld: recover.c
Log message:
Fixes related to the needs_recovery state and first-mounter recovery.
Probably not perfect yet, but working in the tests I'm able to contrive.
bz 218551
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.23.2.3&r2=1.23.2.4
--- cluster/group/gfs_controld/recover.c 2006/12/19 17:07:12 1.23.2.3
+++ cluster/group/gfs_controld/recover.c 2006/12/19 22:19:59 1.23.2.4
@@ -29,6 +29,7 @@
void start_spectator_2(struct mountgroup *mg);
void notify_mount_client(struct mountgroup *mg);
+
int set_sysfs(struct mountgroup *mg, char *field, int val)
{
char fname[512];
@@ -325,6 +326,9 @@
continue;
if (memb->jid == -9)
continue;
+ if (memb->spectator || memb->readonly || memb->withdrawing ||
+ memb->ms_kernel_mount_done)
+ continue;
if (low == -1 || memb->nodeid < low) {
next = memb;
low = memb->nodeid;
@@ -641,12 +645,11 @@
goto out;
}
- if (mg->needs_recovery) {
- log_group(mg, "receive_remount from %d needs_recovery", from);
- msg = "error: needs recovery";
- error = -1;
- goto out;
- }
+ /* FIXME: check if we've even fully completed our normal mount yet
+ (received our own mount-status?) if not, then disallow remount */
+
+ /* FIXME: going ro->rw may mean we can now do journal or first-mounter
+ recovery that we couldn't do before. */
memb->readonly = ro;
memb->rw = !ro;
@@ -746,30 +749,19 @@
else if (memb->readonly)
ro_count++;
- if (memb->opts & MEMB_OPT_RECOVER)
+ if (memb->opts & MEMB_OPT_RECOVER) {
memb_recover = memb;
+ log_group(mg, "assign_journal: memb %d has OPT_RECOVER",
+ memb->nodeid);
+ }
if (memb->ms_kernel_mount_done && !memb->ms_kernel_mount_error)
memb_mounted = memb;
}
- log_group(mg, "assign_journal: total %d iv %d rw %d ro %d spect %d",
- total, invalid_count, rw_count, ro_count, spect_count);
-
- /* do we let the new member mount? jid=-2 means no.
- - we only allow an rw mount when the fs needs recovery
- - we only allow a single rw mount when the fs needs recovery */
-
- if (mg->needs_recovery) {
- if (!new->rw || rw_count)
- new->jid = -2;
- }
-
- if (new->jid == -2) {
- log_group(mg, "assign_journal: fail - needs_recovery %d",
- mg->needs_recovery);
- goto out;
- }
+ log_group(mg, "assign_journal: total %d iv %d rw %d ro %d spect %d "
+ "needs_recovery %d", total, invalid_count, rw_count,
+ ro_count, spect_count, mg->needs_recovery);
if (new->spectator) {
log_group(mg, "assign_journal: new spectator allowed");
@@ -785,17 +777,33 @@
}
}
- /* Currently the fs needs recovery, i.e. none of the current
- mounters (ro/spectators) can recover journals. So, this new rw
- mounter is told to do first-mounter recovery of all the journals. */
-
+ /* Repeat first-mounter recovery: the fs has been mounted and in-use,
+ but nodes have failed and none of the current mounters has been able
+ to do recovery (all remaining nodes may be ro/spect for example).
+ This puts us into the special "needs_recovery" state where new
+ mounters are asked to do first-mounter recovery of the fs while
+ the current mounters sit in a blocked state. */
+
if (mg->needs_recovery) {
- log_group(mg, "assign_journal: memb %d gets OPT_RECOVER, "
- "needs_recovery", new->nodeid);
- new->opts |= MEMB_OPT_RECOVER;
+ if (!memb_recover) {
+ log_group(mg, "assign_journal: needs_recovery: "
+ "new memb %d gets OPT_RECOVER",
+ new->nodeid);
+ new->opts |= MEMB_OPT_RECOVER;
+ } else {
+ log_group(mg, "assign_journal: needs_recovery: "
+ "new memb %d memb %d has OPT_RECOVER",
+ new->nodeid, memb_recover->nodeid);
+ }
goto out;
}
+ /* Initial first-mounter recovery: the fs is coming online, the first
+ mg member assumes first-mounter role and other nodes join the mg
+ while the first-mounter is working. These non-first mounters wait
+ for the first-mounter to finish before notifying mount.gfs. If the
+ first-mounter fails, one of them will become the first-mounter. */
+
/* it shouldn't be possible to have someone doing first mounter
recovery and also have someone with the fs fully mounted */
@@ -839,7 +847,8 @@
mg->kernel_mount_done, mg->kernel_mount_error,
mg->first_mounter, mg->first_mounter_done);
- log_group(mg, "assign_journal: memb %d gets OPT_RECOVER", new->nodeid);
+ log_group(mg, "assign_journal: new memb %d gets OPT_RECOVER for: "
+ "fs not mounted", new->nodeid);
new->opts |= MEMB_OPT_RECOVER;
out:
@@ -1006,7 +1015,7 @@
/* delay notifying mount client until we get a successful
mount status from the first mounter */
log_group(mg, "other node doing first mounter recovery, "
- "delay notify_mount_client");
+ "set mount_client_delay");
mg->mount_client_delay = 1;
mg->save_plocks = 0;
return;
@@ -1402,7 +1411,6 @@
if (memb_gone_recover) {
log_group(mg, "failed node %d had MEMB_OPT_RECOVER",
memb_gone_recover->nodeid);
- ASSERT(!mg->mount_client_notified);
memb_gone_recover->tell_gfs_to_recover = 0;
}
@@ -2168,14 +2176,39 @@
return 0;
}
-/* FIXME: what happens if a node is unmounting, others have it in members_gone,
- and it crashes? It shouldn't need journal recovery since the kernel umount
- happens before leaving the group. */
+/* After a start that initiated a recovery, everyone will go and see if they
+ can do recovery and try if they can. If a node can't, it does start_done,
+ if it tries and fails, it does start_done, if it tries and succeeds it
+ sends a message and then does start_done once it receives it back. So,
+ when we get a finish we know that we have all the results from the recovery
+ cycle and can judge if everything is recovered properly or not. If so, we
+ can unblock locks (in the finish), if not, we leave them blocked (in the
+ finish).
+
+ If we leave locks blocked in the finish, then they can only be unblocked
+ after someone is able to do the recovery that's needed. So, leaving locks
+ blocked in a finish because recovery hasn't worked puts us into a special
+ state: the fs needs recovery, none of the current mounters has been able to
+ recover it, all current mounters have locks blocked in gfs, new mounters
+ are allowed, nodes can unmount, new mounters are asked to do first-mounter
+ recovery, if one of them succeeds then we can all clear this special state
+ and unblock locks (the unblock would happen upon receiving the success
+ message from the new pseudo-first mounter, not as part of a finish), future
+ finishes would then go back to being able to unblock locks.
+
+ While in this special state, a new node has been added and asked to do
+ first-mounter recovery, other nodes can also be added while the new
+ first-mounter is active. These other nodes don't notify mount.gfs.
+ They'll receive the result of the first mounter and if it succeeded they'll
+ notify mount.gfs, otherwise one of them will become the next first-mounter
+ and notify mount.gfs. */
int do_finish(struct mountgroup *mg)
{
struct mg_member *memb, *safe;
- int leave_blocked = 0;
+
+ log_group(mg, "finish %d needs_recovery %d", mg->last_finish,
+ mg->needs_recovery);
/* members_gone list are the members that were removed from the
members list when processing a start. members are removed
@@ -2192,11 +2225,10 @@
list_del(&memb->list);
free(memb);
} else {
+ log_error("%s finish: needs recovery jid %d nodeid %d "
+ "status %d", mg->name, memb->jid,
+ memb->nodeid, memb->recovery_status);
mg->needs_recovery = 1;
- log_group(mg, "finish: needs recovery "
- "jid %d nodeid %d status %d",
- memb->jid, memb->nodeid,
- memb->recovery_status);
}
}
@@ -2210,12 +2242,7 @@
return 0;
}
- if (mg->needs_recovery) {
- log_group(mg, "finish: leave locks blocked for needs_recovery");
- leave_blocked = 1;
- }
-
- if (!leave_blocked) {
+ if (!mg->needs_recovery) {
set_sysfs(mg, "block", 0);
/* we may have been holding back our local mount due to
@@ -2224,7 +2251,8 @@
mg->mount_client_delay = 0;
notify_mount_client(mg);
}
- }
+ } else
+ log_group(mg, "finish: leave locks blocked for needs_recovery");
return 0;
}
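One detail of the assign_journal() change above is worth spelling out: while needs_recovery is set, MEMB_OPT_RECOVER should be held by exactly one member at a time. A small sketch of that handoff rule with simplified types (illustrative, not the committed code):

#define MEMB_OPT_RECOVER 0x1	/* flag value is illustrative */

struct memb {
	int nodeid;
	unsigned int opts;
};

/* If an existing member already carries OPT_RECOVER, the new mounter just
   waits for that member's first-mounter result; only when nobody holds
   the flag is the new mounter asked to attempt first-mounter recovery. */
static struct memb *pick_recoverer(struct memb *current_holder,
				   struct memb *new_memb)
{
	if (current_holder)
		return current_holder;
	new_memb->opts |= MEMB_OPT_RECOVER;
	return new_memb;
}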
* [Cluster-devel] cluster/group/gfs_controld recover.c
@ 2006-12-19 22:19 teigland
0 siblings, 0 replies; 10+ messages in thread
From: teigland @ 2006-12-19 22:19 UTC (permalink / raw)
To: cluster-devel.redhat.com
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: teigland at sourceware.org 2006-12-19 22:19:02
Modified files:
group/gfs_controld: recover.c
Log message:
Fixes related to the needs_recovery state and first-mounter recovery.
Probably not perfect yet, but working in the tests I'm able to contrive.
bz 218551
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.26&r2=1.27
--- cluster/group/gfs_controld/recover.c 2006/12/19 17:05:59 1.26
+++ cluster/group/gfs_controld/recover.c 2006/12/19 22:19:01 1.27
@@ -29,6 +29,7 @@
void start_spectator_2(struct mountgroup *mg);
void notify_mount_client(struct mountgroup *mg);
+
int set_sysfs(struct mountgroup *mg, char *field, int val)
{
char fname[512];
@@ -325,6 +326,9 @@
continue;
if (memb->jid == -9)
continue;
+ if (memb->spectator || memb->readonly || memb->withdrawing ||
+ memb->ms_kernel_mount_done)
+ continue;
if (low == -1 || memb->nodeid < low) {
next = memb;
low = memb->nodeid;
@@ -641,12 +645,11 @@
goto out;
}
- if (mg->needs_recovery) {
- log_group(mg, "receive_remount from %d needs_recovery", from);
- msg = "error: needs recovery";
- error = -1;
- goto out;
- }
+ /* FIXME: check if we've even fully completed our normal mount yet
+ (received our own mount-status?) if not, then disallow remount */
+
+ /* FIXME: going ro->rw may mean we can now do journal or first-mounter
+ recovery that we couldn't do before. */
memb->readonly = ro;
memb->rw = !ro;
@@ -746,30 +749,19 @@
else if (memb->readonly)
ro_count++;
- if (memb->opts & MEMB_OPT_RECOVER)
+ if (memb->opts & MEMB_OPT_RECOVER) {
memb_recover = memb;
+ log_group(mg, "assign_journal: memb %d has OPT_RECOVER",
+ memb->nodeid);
+ }
if (memb->ms_kernel_mount_done && !memb->ms_kernel_mount_error)
memb_mounted = memb;
}
- log_group(mg, "assign_journal: total %d iv %d rw %d ro %d spect %d",
- total, invalid_count, rw_count, ro_count, spect_count);
-
- /* do we let the new member mount? jid=-2 means no.
- - we only allow an rw mount when the fs needs recovery
- - we only allow a single rw mount when the fs needs recovery */
-
- if (mg->needs_recovery) {
- if (!new->rw || rw_count)
- new->jid = -2;
- }
-
- if (new->jid == -2) {
- log_group(mg, "assign_journal: fail - needs_recovery %d",
- mg->needs_recovery);
- goto out;
- }
+ log_group(mg, "assign_journal: total %d iv %d rw %d ro %d spect %d "
+ "needs_recovery %d", total, invalid_count, rw_count,
+ ro_count, spect_count, mg->needs_recovery);
if (new->spectator) {
log_group(mg, "assign_journal: new spectator allowed");
@@ -785,17 +777,33 @@
}
}
- /* Currently the fs needs recovery, i.e. none of the current
- mounters (ro/spectators) can recover journals. So, this new rw
- mounter is told to do first-mounter recovery of all the journals. */
-
+ /* Repeat first-mounter recovery: the fs has been mounted and in-use,
+ but nodes have failed and none of the current mounters has been able
+ to do recovery (all remaining nodes may be ro/spect for example).
+ This puts us into the special "needs_recovery" state where new
+ mounters are asked to do first-mounter recovery of the fs while
+ the current mounters sit in a blocked state. */
+
if (mg->needs_recovery) {
- log_group(mg, "assign_journal: memb %d gets OPT_RECOVER, "
- "needs_recovery", new->nodeid);
- new->opts |= MEMB_OPT_RECOVER;
+ if (!memb_recover) {
+ log_group(mg, "assign_journal: needs_recovery: "
+ "new memb %d gets OPT_RECOVER",
+ new->nodeid);
+ new->opts |= MEMB_OPT_RECOVER;
+ } else {
+ log_group(mg, "assign_journal: needs_recovery: "
+ "new memb %d memb %d has OPT_RECOVER",
+ new->nodeid, memb_recover->nodeid);
+ }
goto out;
}
+ /* Initial first-mounter recovery: the fs is coming online, the first
+ mg member assumes first-mounter role and other nodes join the mg
+ while the first-mounter is working. These non-first mounters wait
+ for the first-mounter to finish before notifying mount.gfs. If the
+ first-mounter fails, one of them will become the first-mounter. */
+
/* it shouldn't be possible to have someone doing first mounter
recovery and also have someone with the fs fully mounted */
@@ -839,7 +847,8 @@
mg->kernel_mount_done, mg->kernel_mount_error,
mg->first_mounter, mg->first_mounter_done);
- log_group(mg, "assign_journal: memb %d gets OPT_RECOVER", new->nodeid);
+ log_group(mg, "assign_journal: new memb %d gets OPT_RECOVER for: "
+ "fs not mounted", new->nodeid);
new->opts |= MEMB_OPT_RECOVER;
out:
@@ -1006,7 +1015,7 @@
/* delay notifying mount client until we get a successful
mount status from the first mounter */
log_group(mg, "other node doing first mounter recovery, "
- "delay notify_mount_client");
+ "set mount_client_delay");
mg->mount_client_delay = 1;
mg->save_plocks = 0;
return;
@@ -1402,7 +1411,6 @@
if (memb_gone_recover) {
log_group(mg, "failed node %d had MEMB_OPT_RECOVER",
memb_gone_recover->nodeid);
- ASSERT(!mg->mount_client_notified);
memb_gone_recover->tell_gfs_to_recover = 0;
}
@@ -2168,14 +2176,39 @@
return 0;
}
-/* FIXME: what happens if a node is unmounting, others have it in members_gone,
- and it crashes? It shouldn't need journal recovery since the kernel umount
- happens before leaving the group. */
+/* After a start that initiated a recovery, everyone will go and see if they
+ can do recovery and try if they can. If a node can't, it does start_done,
+ if it tries and fails, it does start_done, if it tries and succeeds it
+ sends a message and then does start_done once it receives it back. So,
+ when we get a finish we know that we have all the results from the recovery
+ cycle and can judge if everything is recovered properly or not. If so, we
+ can unblock locks (in the finish), if not, we leave them blocked (in the
+ finish).
+
+ If we leave locks blocked in the finish, then they can only be unblocked
+ after someone is able to do the recovery that's needed. So, leaving locks
+ blocked in a finish because recovery hasn't worked puts us into a special
+ state: the fs needs recovery, none of the current mounters has been able to
+ recover it, all current mounters have locks blocked in gfs, new mounters
+ are allowed, nodes can unmount, new mounters are asked to do first-mounter
+ recovery, if one of them succeeds then we can all clear this special state
+ and unblock locks (the unblock would happen upon receiving the success
+ message from the new pseudo-first mounter, not as part of a finish), future
+ finishes would then go back to being able to unblock locks.
+
+ While in this special state, a new node has been added and asked to do
+ first-mounter recovery, other nodes can also be added while the new
+ first-mounter is active. These other nodes don't notify mount.gfs.
+ They'll receive the result of the first mounter and if it succeeded they'll
+ notify mount.gfs, otherwise one of them will become the next first-mounter
+ and notify mount.gfs. */
int do_finish(struct mountgroup *mg)
{
struct mg_member *memb, *safe;
- int leave_blocked = 0;
+
+ log_group(mg, "finish %d needs_recovery %d", mg->last_finish,
+ mg->needs_recovery);
/* members_gone list are the members that were removed from the
members list when processing a start. members are removed
@@ -2192,11 +2225,10 @@
list_del(&memb->list);
free(memb);
} else {
+ log_error("%s finish: needs recovery jid %d nodeid %d "
+ "status %d", mg->name, memb->jid,
+ memb->nodeid, memb->recovery_status);
mg->needs_recovery = 1;
- log_group(mg, "finish: needs recovery "
- "jid %d nodeid %d status %d",
- memb->jid, memb->nodeid,
- memb->recovery_status);
}
}
@@ -2210,12 +2242,7 @@
return 0;
}
- if (mg->needs_recovery) {
- log_group(mg, "finish: leave locks blocked for needs_recovery");
- leave_blocked = 1;
- }
-
- if (!leave_blocked) {
+ if (!mg->needs_recovery) {
set_sysfs(mg, "block", 0);
/* we may have been holding back our local mount due to
@@ -2224,7 +2251,8 @@
mg->mount_client_delay = 0;
notify_mount_client(mg);
}
- }
+ } else
+ log_group(mg, "finish: leave locks blocked for needs_recovery");
return 0;
}
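The first hunk above adds eligibility conditions to a lowest-nodeid scan. The enclosing function isn't shown, so its purpose here is an inference: it appears to pick the next member that could take over first-mounter recovery, skipping members that mounted as spectator or readonly, are withdrawing, or have already completed their kernel mount. A sketch with that filter pulled into a predicate (field names follow the diff; everything else is illustrative):

#include <stddef.h>

struct candidate {
	struct candidate *next;	/* simplified singly linked list */
	int nodeid;
	int jid;
	int spectator, readonly, withdrawing, ms_kernel_mount_done;
};

static int eligible(const struct candidate *m)
{
	return !(m->spectator || m->readonly || m->withdrawing ||
		 m->ms_kernel_mount_done);
}

static struct candidate *lowest_eligible(struct candidate *head)
{
	struct candidate *m, *next = NULL;
	int low = -1;

	for (m = head; m; m = m->next) {
		if (m->jid == -9)	/* sentinel seen in the diff context */
			continue;
		if (!eligible(m))
			continue;
		if (low == -1 || m->nodeid < low) {
			next = m;
			low = m->nodeid;
		}
	}
	return next;
}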
* [Cluster-devel] cluster/group/gfs_controld recover.c
@ 2006-08-31 18:56 teigland
0 siblings, 0 replies; 10+ messages in thread
From: teigland @ 2006-08-31 18:56 UTC (permalink / raw)
To: cluster-devel.redhat.com
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: teigland at sourceware.org 2006-08-31 18:56:25
Modified files:
group/gfs_controld: recover.c
Log message:
When deciding whether we need to unlink the checkpoint and resend journals
for a failed master node we weren't distinguishing between the master
failing (where we need to do this stuff) and the master just leaving
(where we don't).
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.16&r2=1.17
--- cluster/group/gfs_controld/recover.c 2006/08/21 19:38:53 1.16
+++ cluster/group/gfs_controld/recover.c 2006/08/31 18:56:25 1.17
@@ -967,6 +967,7 @@
{
struct mg_member *memb, *safe;
int i, found, id, pos = 0, neg = 0, prev_master_nodeid;
+ int master_failed = 0;
/* move departed nodes from members list to members_gone */
@@ -1017,6 +1018,10 @@
memb->wait_gfs_recover_done);
purge_plocks(mg, memb->nodeid, 0);
+
+ if (mg->master_nodeid == memb->nodeid &&
+ memb->gone_type == GROUP_NODE_FAILED)
+ master_failed = 1;
}
}
@@ -1048,7 +1053,7 @@
- store plocks in ckpt for the new mounters to read when they
get the journals msg from us */
- if (neg &&
+ if (neg && master_failed &&
(prev_master_nodeid != -1) &&
(prev_master_nodeid != mg->master_nodeid) &&
(our_nodeid == mg->master_nodeid)) {
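As a tiny sketch of the distinction this change draws (constants and field names are illustrative, standing in for GROUP_NODE_FAILED and the mg_member fields): only a master that failed leaves checkpoint and journal-assignment work behind for its successor; a master that left cleanly does not.

enum gone_type { GONE_LEFT, GONE_FAILED };	/* stands in for GROUP_NODE_* */

struct gone_memb {
	int nodeid;
	enum gone_type gone_type;
};

/* Unlinking the checkpoint and resending journal assignments is only
   needed when the departed member was the master AND it failed. */
static int must_redo_master_work(int master_nodeid, const struct gone_memb *m)
{
	return m->nodeid == master_nodeid && m->gone_type == GONE_FAILED;
}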
* [Cluster-devel] cluster/group/gfs_controld recover.c
@ 2006-08-21 19:38 teigland
0 siblings, 0 replies; 10+ messages in thread
From: teigland @ 2006-08-21 19:38 UTC (permalink / raw)
To: cluster-devel.redhat.com
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: teigland at sourceware.org 2006-08-21 19:38:53
Modified files:
group/gfs_controld: recover.c
Log message:
expand the number of cases where we don't tell gfs-kernel to do recovery
because it won't be able to -- esp cases related to a mount in progress
but not yet far enough for gfs to be able to do journal recovery
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.15&r2=1.16
--- cluster/group/gfs_controld/recover.c 2006/08/21 17:46:19 1.15
+++ cluster/group/gfs_controld/recover.c 2006/08/21 19:38:53 1.16
@@ -1286,25 +1286,49 @@
and moves the memb structs for those nodes into members_gone
and sets memb->tell_gfs_to_recover on them */
+/* we don't want to tell gfs-kernel to do journal recovery for a failed
+ node in a number of cases:
+ - we're a spectator or readonly mount
+ - gfs-kernel is currently withdrawing
+ - we're mounting and haven't received a journals message yet
+ - we're mounting and got a kernel mount error back from mount.gfs
+ - we're mounting and haven't notified mount.gfs yet (to do mount(2))
+ - we're mounting and got_kernel_mount is 0, i.e. we've not seen a uevent
+ related to the kernel mount yet
+ (some of the mounting checks should be obviated by others)
+
+ the problem we're trying to avoid here is telling gfs-kernel to do
+ recovery when it can't for some reason and then waiting forever for
+ a recovery_done signal that will never arrive. */
+
void recover_journals(struct mountgroup *mg)
{
struct mg_member *memb;
int rv;
- /* we can't do journal recovery if: we're a spectator or readonly
- mount, gfs is currently withdrawing, or we're mounting and haven't
- received a journals message yet */
+ if (mg->spectator ||
+ mg->readonly ||
+ mg->withdraw ||
+ mg->our_jid == JID_INIT ||
+ mg->kernel_mount_error ||
+ !mg->mount_client_notified ||
+ !mg->got_kernel_mount) {
- if (mg->spectator || mg->readonly || mg->withdraw ||
- mg->our_jid == JID_INIT) {
list_for_each_entry(memb, &mg->members_gone, list) {
if (!memb->tell_gfs_to_recover)
continue;
- log_group(mg, "recover journal %d nodeid %d skip, "
- "spect %d ro %d our_jid %d",
+ log_group(mg, "recover journal %d nodeid %d skip: "
+ "%d %d %d %d %d %d %d",
memb->jid, memb->nodeid,
- mg->spectator, mg->readonly, mg->our_jid);
+ mg->spectator,
+ mg->readonly,
+ mg->withdraw,
+ mg->our_jid,
+ mg->kernel_mount_error,
+ mg->mount_client_notified,
+ mg->got_kernel_mount);
+
memb->tell_gfs_to_recover = 0;
memb->local_recovery_status = RS_READONLY;
}
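Folding the comment's list of cases into a single predicate makes the intent easy to scan. A sketch with simplified types (the JID_INIT value is assumed from the jid == -9 sentinel seen in other hunks; the field names follow the diff):

#define JID_INIT -9	/* assumed value; the real constant lives in a header */

struct mount_state {
	int spectator, readonly, withdraw;
	int our_jid;
	int kernel_mount_error;
	int mount_client_notified;
	int got_kernel_mount;
};

/* True only when this node is far enough through a healthy rw mount that
   gfs-kernel could actually perform journal recovery and signal
   recovery_done back to us. */
static int can_tell_gfs_to_recover(const struct mount_state *g)
{
	return !(g->spectator || g->readonly || g->withdraw ||
		 g->our_jid == JID_INIT || g->kernel_mount_error ||
		 !g->mount_client_notified || !g->got_kernel_mount);
}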
* [Cluster-devel] cluster/group/gfs_controld recover.c
@ 2006-08-15 21:38 teigland
0 siblings, 0 replies; 10+ messages in thread
From: teigland @ 2006-08-15 21:38 UTC (permalink / raw)
To: cluster-devel.redhat.com
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: teigland at sourceware.org 2006-08-15 21:38:00
Modified files:
group/gfs_controld: recover.c
Log message:
errors opening sysfs files are normal/expected in many cases, so
don't complain in syslog about it
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.12&r2=1.13
--- cluster/group/gfs_controld/recover.c 2006/08/14 17:22:53 1.12
+++ cluster/group/gfs_controld/recover.c 2006/08/15 21:38:00 1.13
@@ -41,7 +41,7 @@
fd = open(fname, O_RDWR);
if (fd < 0) {
- log_error("open %s error %d %d", fname, fd, errno);
+ log_group(mg, "set open %s error %d %d", fname, fd, errno);
return -1;
}
@@ -71,7 +71,7 @@
fd = open(fname, O_RDONLY);
if (fd < 0) {
- log_error("open %s error %d %d", fname, fd, errno);
+ log_group(mg, "get open %s error %d %d", fname, fd, errno);
return -1;
}
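For context, set_sysfs()/get_sysfs() poke per-fs gfs attributes under sysfs, and an open failure is expected whenever the kernel mount hasn't happened yet or has already gone away. A minimal sketch of the write-side pattern, assuming a /sys/fs/gfs/<fsname>/lock_module/<field> layout (the path format is an assumption, not taken from this diff):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int sysfs_set(const char *fsname, const char *field, int val)
{
	char path[512], buf[16];
	int fd, rv;

	/* assumed path layout for gfs lock_module attributes */
	snprintf(path, sizeof(path), "/sys/fs/gfs/%s/lock_module/%s",
		 fsname, field);

	fd = open(path, O_RDWR);
	if (fd < 0) {
		/* normal if the fs isn't (or is no longer) mounted:
		   log at debug level, not as an error */
		fprintf(stderr, "debug: open %s: %s\n", path, strerror(errno));
		return -1;
	}

	memset(buf, 0, sizeof(buf));
	snprintf(buf, sizeof(buf), "%d", val);
	rv = write(fd, buf, strlen(buf) + 1);
	rv = (rv < 0) ? -errno : 0;
	close(fd);
	return rv;
}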
* [Cluster-devel] cluster/group/gfs_controld recover.c
@ 2006-08-10 19:40 teigland
0 siblings, 0 replies; 10+ messages in thread
From: teigland @ 2006-08-10 19:40 UTC (permalink / raw)
To: cluster-devel.redhat.com
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: teigland at sourceware.org 2006-08-10 19:40:50
Modified files:
group/gfs_controld: recover.c
Log message:
log_debug() when we receive a withdraw message
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.10&r2=1.11
--- cluster/group/gfs_controld/recover.c 2006/08/08 21:19:17 1.10
+++ cluster/group/gfs_controld/recover.c 2006/08/10 19:40:50 1.11
@@ -170,6 +170,7 @@
log_group(mg, "receive_withdraw no member %d", from);
return;
}
+ log_group(mg, "receive_withdraw from %d", from);
memb->withdrawing = 1;
if (from == our_nodeid)