From mboxrd@z Thu Jan 1 00:00:00 1970
From: teigland@sourceware.org
Date: 19 Dec 2006 22:20:09 -0000
Subject: [Cluster-devel] cluster/group/gfs_controld recover.c
Message-ID: <20061219222009.22891.qmail@sourceware.org>
List-Id:
To: cluster-devel.redhat.com
MIME-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL50
Changes by:	teigland at sourceware.org	2006-12-19 22:20:08

Modified files:
	group/gfs_controld: recover.c

Log message:
	Fixes related to the needs_recovery state and first-mounter recovery.
	Probably not perfect yet, but working in the tests I'm able to
	contrive.  bz 218551

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&only_with_tag=RHEL50&r1=1.23.4.3&r2=1.23.4.4

--- cluster/group/gfs_controld/recover.c	2006/12/19 17:07:22	1.23.4.3
+++ cluster/group/gfs_controld/recover.c	2006/12/19 22:20:08	1.23.4.4
@@ -29,6 +29,7 @@
 void start_spectator_2(struct mountgroup *mg);
 void notify_mount_client(struct mountgroup *mg);
 
+
 int set_sysfs(struct mountgroup *mg, char *field, int val)
 {
 	char fname[512];
@@ -325,6 +326,9 @@
 			continue;
 		if (memb->jid == -9)
 			continue;
+		if (memb->spectator || memb->readonly || memb->withdrawing ||
+		    memb->ms_kernel_mount_done)
+			continue;
 		if (low == -1 || memb->nodeid < low) {
 			next = memb;
 			low = memb->nodeid;
@@ -641,12 +645,11 @@
 		goto out;
 	}
 
-	if (mg->needs_recovery) {
-		log_group(mg, "receive_remount from %d needs_recovery", from);
-		msg = "error: needs recovery";
-		error = -1;
-		goto out;
-	}
+	/* FIXME: check if we've even fully completed our normal mount yet
+	   (received our own mount-status?) if not, then disallow remount */
+
+	/* FIXME: going ro->rw may mean we can now do journal or first-mounter
+	   recovery that we couldn't do before. */
 
 	memb->readonly = ro;
 	memb->rw = !ro;
@@ -746,30 +749,19 @@
 		else if (memb->readonly)
 			ro_count++;
 
-		if (memb->opts & MEMB_OPT_RECOVER)
+		if (memb->opts & MEMB_OPT_RECOVER) {
 			memb_recover = memb;
+			log_group(mg, "assign_journal: memb %d has OPT_RECOVER",
+				  memb->nodeid);
+		}
 
 		if (memb->ms_kernel_mount_done && !memb->ms_kernel_mount_error)
 			memb_mounted = memb;
 	}
 
-	log_group(mg, "assign_journal: total %d iv %d rw %d ro %d spect %d",
-		  total, invalid_count, rw_count, ro_count, spect_count);
-
-	/* do we let the new member mount? jid=-2 means no.
-	   - we only allow an rw mount when the fs needs recovery
-	   - we only allow a single rw mount when the fs needs recovery */
-
-	if (mg->needs_recovery) {
-		if (!new->rw || rw_count)
-			new->jid = -2;
-	}
-
-	if (new->jid == -2) {
-		log_group(mg, "assign_journal: fail - needs_recovery %d",
-			  mg->needs_recovery);
-		goto out;
-	}
+	log_group(mg, "assign_journal: total %d iv %d rw %d ro %d spect %d "
+		  "needs_recovery %d", total, invalid_count, rw_count,
+		  ro_count, spect_count, mg->needs_recovery);
 
 	if (new->spectator) {
 		log_group(mg, "assign_journal: new spectator allowed");
@@ -785,17 +777,33 @@
 		}
 	}
 
-	/* Currently the fs needs recovery, i.e. none of the current
-	   mounters (ro/spectators) can recover journals. So, this new rw
-	   mounter is told to do first-mounter recovery of all the journals. */
-
+	/* Repeat first-mounter recovery: the fs has been mounted and in-use,
+	   but nodes have failed and none of the current mounters has been able
+	   to do recovery (all remaining nodes may be ro/spect for example).
+	   This puts us into the special "needs_recovery" state where new
+	   mounters are asked to do first-mounter recovery of the fs while
+	   the current mounters sit in a blocked state. */
+
 	if (mg->needs_recovery) {
-		log_group(mg, "assign_journal: memb %d gets OPT_RECOVER, "
-			  "needs_recovery", new->nodeid);
-		new->opts |= MEMB_OPT_RECOVER;
+		if (!memb_recover) {
+			log_group(mg, "assign_journal: needs_recovery: "
+				  "new memb %d gets OPT_RECOVER",
+				  new->nodeid);
+			new->opts |= MEMB_OPT_RECOVER;
+		} else {
+			log_group(mg, "assign_journal: needs_recovery: "
+				  "new memb %d memb %d has OPT_RECOVER",
+				  new->nodeid, memb_recover->nodeid);
+		}
 		goto out;
 	}
 
+	/* Initial first-mounter recovery: the fs is coming online, the first
+	   mg member assumes first-mounter role and other nodes join the mg
+	   while the first-mounter is working. These non-first mounters wait
+	   for the first-mounter to finish before notifying mount.gfs. If the
+	   first-mounter fails, one of them will become the first-mounter. */
+
 	/* it shouldn't be possible to have someone doing first mounter
 	   recovery and also have someone with the fs fully mounted */
 
@@ -839,7 +847,8 @@
 		  mg->kernel_mount_done, mg->kernel_mount_error,
 		  mg->first_mounter, mg->first_mounter_done);
 
-	log_group(mg, "assign_journal: memb %d gets OPT_RECOVER", new->nodeid);
+	log_group(mg, "assign_journal: new memb %d gets OPT_RECOVER for: "
+		  "fs not mounted", new->nodeid);
 	new->opts |= MEMB_OPT_RECOVER;
 
 out:
@@ -1006,7 +1015,7 @@
 		/* delay notifying mount client until we get a successful mount
 		   status from the first mounter */
 		log_group(mg, "other node doing first mounter recovery, "
-			  "delay notify_mount_client");
+			  "set mount_client_delay");
 		mg->mount_client_delay = 1;
 		mg->save_plocks = 0;
 		return;
@@ -1402,7 +1411,6 @@
 	if (memb_gone_recover) {
 		log_group(mg, "failed node %d had MEMB_OPT_RECOVER",
 			  memb_gone_recover->nodeid);
-		ASSERT(!mg->mount_client_notified);
 		memb_gone_recover->tell_gfs_to_recover = 0;
 	}
 
@@ -2168,14 +2176,39 @@
 	return 0;
 }
 
-/* FIXME: what happens if a node is unmounting, others have it in members_gone,
-   and it crashes? It shouldn't need journal recovery since the kernel umount
-   happens before leaving the group. */
+/* After a start that initiated a recovery, everyone will go and see if they
+   can do recovery and try if they can. If a node can't, it does start_done,
+   if it tries and fails, it does start_done, if it tries and succeeds it
+   sends a message and then does start_done once it receives it back. So,
+   when we get a finish we know that we have all the results from the recovery
+   cycle and can judge if everything is recovered properly or not. If so, we
+   can unblock locks (in the finish), if not, we leave them blocked (in the
+   finish).
+
+   If we leave locks blocked in the finish, then they can only be unblocked
+   after someone is able to do the recovery that's needed. So, leaving locks
+   blocked in a finish because recovery hasn't worked puts us into a special
+   state: the fs needs recovery, none of the current mounters has been able to
+   recover it, all current mounters have locks blocked in gfs, new mounters
+   are allowed, nodes can unmount, new mounters are asked to do first-mounter
+   recovery, if one of them succeeds then we can all clear this special state
+   and unblock locks (the unblock would happen upon receiving the success
+   message from the new pseudo-first mounter, not as part of a finish), future
+   finishes would then go back to being able to unblock locks.
+
+   While in this special state, a new node can be added and asked to do
+   first-mounter recovery; other nodes can also be added while the new
+   first-mounter is active. These other nodes don't notify mount.gfs.
+   They'll receive the result of the first mounter and if it succeeded they'll
+   notify mount.gfs, otherwise one of them will become the next first-mounter
+   and notify mount.gfs. */
 
 int do_finish(struct mountgroup *mg)
 {
 	struct mg_member *memb, *safe;
-	int leave_blocked = 0;
+
+	log_group(mg, "finish %d needs_recovery %d", mg->last_finish,
+		  mg->needs_recovery);
 
 	/* members_gone list are the members that were removed from the
 	   members list when processing a start. members are removed
@@ -2192,11 +2225,10 @@
 			list_del(&memb->list);
 			free(memb);
 		} else {
+			log_error("%s finish: needs recovery jid %d nodeid %d "
+				  "status %d", mg->name, memb->jid,
+				  memb->nodeid, memb->recovery_status);
 			mg->needs_recovery = 1;
-			log_group(mg, "finish: needs recovery "
-				  "jid %d nodeid %d status %d",
-				  memb->jid, memb->nodeid,
-				  memb->recovery_status);
 		}
 	}
 
@@ -2210,12 +2242,7 @@
 		return 0;
 	}
 
-	if (mg->needs_recovery) {
-		log_group(mg, "finish: leave locks blocked for needs_recovery");
-		leave_blocked = 1;
-	}
-
-	if (!leave_blocked) {
+	if (!mg->needs_recovery) {
 		set_sysfs(mg, "block", 0);
 
 		/* we may have been holding back our local mount due to
@@ -2224,7 +2251,8 @@
 			mg->mount_client_delay = 0;
 			notify_mount_client(mg);
 		}
-	}
+	} else
+		log_group(mg, "finish: leave locks blocked for needs_recovery");
 
 	return 0;
 }
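
The comment blocks above describe how a group stuck in needs_recovery hands the
recovery work to a newly joining mounter. Below is a minimal standalone C sketch
of just that decision, under stated assumptions: struct group, struct member,
OPT_RECOVER and assign_recovery_role() are simplified stand-ins invented for
illustration, not the real gfs_controld definitions (the patch itself works on
struct mountgroup, struct mg_member and MEMB_OPT_RECOVER inside assign_journal()).

/* Illustrative sketch only: simplified stand-ins for the real gfs_controld
   structures, which carry many more fields and are driven by group events. */

#include <stdio.h>

#define OPT_RECOVER 0x1		/* hypothetical flag, stands in for MEMB_OPT_RECOVER */

struct member {
	int nodeid;
	int opts;
};

struct group {
	int needs_recovery;	/* set when a finish left locks blocked */
};

/* Decide whether a newly added mounter should be asked to do first-mounter
   recovery while the group is in needs_recovery. Mirrors the shape of the
   new assign_journal() branch: only one member at a time carries the
   recover option; everyone else waits for that member's result. */
static void assign_recovery_role(struct group *g, struct member *new_memb,
				 struct member *memb_recover)
{
	if (!g->needs_recovery)
		return;

	if (!memb_recover) {
		/* no one is recovering yet: ask the new mounter to act as a
		   (pseudo) first mounter and recover all the journals */
		new_memb->opts |= OPT_RECOVER;
		printf("new memb %d gets OPT_RECOVER\n", new_memb->nodeid);
	} else {
		/* someone else was already asked: the new mounter waits for
		   that node's result before notifying mount.gfs */
		printf("new memb %d waits, memb %d has OPT_RECOVER\n",
		       new_memb->nodeid, memb_recover->nodeid);
	}
}

int main(void)
{
	struct group g = { .needs_recovery = 1 };
	struct member n1 = { .nodeid = 1, .opts = 0 };
	struct member n2 = { .nodeid = 2, .opts = 0 };

	assign_recovery_role(&g, &n1, NULL);	/* n1 is asked to recover */
	assign_recovery_role(&g, &n2, &n1);	/* n2 waits for n1's result */
	return 0;
}

The design point mirrored here, per the comments in the patch, is that only one
member at a time carries the recover option; the others stay blocked until that
member reports success (locks are unblocked) or failure (one of the waiting new
mounters becomes the next first-mounter).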