From mboxrd@z Thu Jan 1 00:00:00 1970 From: teigland@sourceware.org Date: 21 Aug 2006 17:46:20 -0000 Subject: [Cluster-devel] cluster/group/gfs_controld lock_dlm.h plock.c ... Message-ID: <20060821174620.30566.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Changes by: teigland at sourceware.org 2006-08-21 17:46:20 Modified files: group/gfs_controld: lock_dlm.h plock.c recover.c Log message: - the check for us becoming the new low nodeid after the previous one failed and unlinking the ckpt wasn't adequately checking for the old low node having failed - rename low_finished_nodeid to master_nodeid and clarify some of the code using this since it was confusing and misleading Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/lock_dlm.h.diff?cvsroot=cluster&r1=1.15&r2=1.16 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/plock.c.diff?cvsroot=cluster&r1=1.18&r2=1.19 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.14&r2=1.15 --- cluster/group/gfs_controld/lock_dlm.h 2006/08/18 16:33:08 1.15 +++ cluster/group/gfs_controld/lock_dlm.h 2006/08/21 17:46:19 1.16 @@ -148,8 +148,8 @@ int first_mounter_done; int emulate_first_mounter; int wait_first_done; - int low_finished_nodeid; int low_nodeid; + int master_nodeid; int save_plocks; uint64_t cp_handle; --- cluster/group/gfs_controld/plock.c 2006/08/18 16:33:08 1.18 +++ cluster/group/gfs_controld/plock.c 2006/08/21 17:46:19 1.19 @@ -1374,7 +1374,8 @@ saCkptSectionIterationFinalize(itr); out: if (mg->low_nodeid == our_nodeid) { - log_group(mg, "retrieve_plocks: unlink ckpt from old low node"); + /* we're the new low nodeid, will be master */ + log_group(mg, "retrieve_plocks: unlink ckpt from old master"); _unlink_checkpoint(mg, &name); } else saCkptCheckpointClose(h); --- cluster/group/gfs_controld/recover.c 2006/08/18 16:33:08 1.14 +++ cluster/group/gfs_controld/recover.c 2006/08/21 17:46:19 1.15 @@ -514,9 +514,8 @@ free(buf); } -/* We set the new member's jid to the lowest unused jid. - If we're the lowest existing member (by nodeid), then - send jid info to the new node. */ +/* We set the new member's jid to the lowest unused jid. If we're the lowest + existing member (by nodeid), then send jid info to the new node. */ /* Look at rw/ro/spectator status of all existing mounters and whether we need to do recovery. Based on that, decide if the current mount @@ -590,14 +589,14 @@ log_group(mg, "assign_journal: new member %d got jid %d", new->nodeid, new->jid); - if (mg->low_finished_nodeid == our_nodeid) + if (mg->master_nodeid == our_nodeid) { store_plocks(mg, new->nodeid); - /* if we're the first mounter and haven't gotten others_may_mount - yet, then don't send journals until kernel_recovery_done_first - so the second node won't mount the fs until omm. */ + /* if we're the first mounter and haven't gotten + others_may_mount yet, then don't send journals until + kernel_recovery_done_first so the second node won't mount + the fs until omm. */ - if (mg->low_finished_nodeid == our_nodeid) { if (mg->first_mounter && !mg->first_mounter_done) { log_group(mg, "delay sending journals to %d", new->nodeid); @@ -911,13 +910,63 @@ clear_memb_list(&mg->members_gone); } +/* New mounters may be waiting for a journals message that a failed node (as + master) would have sent. If the master failed and we're the new master, + then send a journals message to any nodes for whom we've not seen a journals + message. We also need to checkpoint the plock state for the new nodes to + read after they get their journals message. */ + +void resend_journals(struct mountgroup *mg) +{ + struct mg_member *memb; + int stored_plocks = 0; + + list_for_each_entry(memb, &mg->members, list) { + if (!memb->needs_journals) + continue; + + if (!stored_plocks) { + store_plocks(mg, memb->nodeid); + stored_plocks = 1; + } + + log_group(mg, "resend_journals to %d", memb->nodeid); + send_journals(mg, memb->nodeid); + } +} + +/* The master node is the member of the group with the lowest nodeid who + was also a member of the last "finished" group, i.e. a member of the + group the last time it got a finish callback. The job of the master + is to send state info to new nodes joining the group, and doing that + requires that the master has all the state to send -- a new joining + node that has the lowest nodeid doesn't have any state, which is why + we add the "finished" requirement. */ + +void update_master_nodeid(struct mountgroup *mg) +{ + struct mg_member *memb; + int new = -1, low = -1; + + list_for_each_entry(memb, &mg->members, list) { + if (low == -1 || memb->nodeid < low) + low = memb->nodeid; + if (!memb->finished) + continue; + if (new == -1 || memb->nodeid < new) + new = memb->nodeid; + } + mg->master_nodeid = new; + mg->low_nodeid = low; +} + /* This can happen before we receive a journals message for our mount. */ void recover_members(struct mountgroup *mg, int num_nodes, int *nodeids, int *pos_out, int *neg_out) { struct mg_member *memb, *safe; - int i, found, id, pos = 0, neg = 0, low = -1, old_low_finished_nodeid; + int i, found, id, pos = 0, neg = 0, prev_master_nodeid; /* move departed nodes from members list to members_gone */ @@ -982,30 +1031,31 @@ log_group(mg, "add member %d", id); } - list_for_each_entry(memb, &mg->members, list) { - if (mg->low_nodeid == -1 || memb->nodeid < mg->low_nodeid) - mg->low_nodeid = memb->nodeid; - if (!memb->finished) - continue; - if (low == -1 || memb->nodeid < low) - low = memb->nodeid; - } - old_low_finished_nodeid = mg->low_finished_nodeid; - mg->low_finished_nodeid = low; + prev_master_nodeid = mg->master_nodeid; + update_master_nodeid(mg); *pos_out = pos; *neg_out = neg; - log_group(mg, "total members %d low_finished_nodeid %d", - mg->memb_count, low); + log_group(mg, "total members %d master_nodeid %d prev %d", + mg->memb_count, mg->master_nodeid, prev_master_nodeid); - /* the low nodeid failed and we're the new low nodeid, we need - to unlink the ckpt that the failed node had open so new ckpts - can be created down the road */ - if ((old_low_finished_nodeid != low) && (our_nodeid == low)) { - log_group(mg, "unlink ckpt for failed low node %d", - old_low_finished_nodeid); + /* the master failed and we're the new master, we need to: + - unlink the ckpt that the failed master had open so new ckpts + can be created down the road + - resend journals msg to any nodes that needed one from the + failed master + - store plocks in ckpt for the new mounters to read when they + get the journals msg from us */ + + if (neg && + (prev_master_nodeid != -1) && + (prev_master_nodeid != mg->master_nodeid) && + (our_nodeid == mg->master_nodeid)) { + log_group(mg, "unlink ckpt for failed master %d", + prev_master_nodeid); unlink_checkpoint(mg); + resend_journals(mg); } } @@ -1021,6 +1071,7 @@ INIT_LIST_HEAD(&mg->resources); INIT_LIST_HEAD(&mg->saved_messages); mg->init = 1; + mg->master_nodeid = -1; mg->low_nodeid = -1; strncpy(mg->name, name, MAXNAME); @@ -1925,31 +1976,6 @@ } } -/* New mounters may be waiting for a journals message that a failed node (as - low nodeid) would have sent. If the low nodeid failed and we're the new low - nodeid, then send a journals message to any nodes for whom we've not seen a - journals message. We also need to checkpoint the plock state for the new - nodes to read after they get their journals message. */ - -void resend_journals(struct mountgroup *mg) -{ - struct mg_member *memb; - int stored_plocks = 0; - - list_for_each_entry(memb, &mg->members, list) { - if (!memb->needs_journals) - continue; - - if (!stored_plocks) { - store_plocks(mg, memb->nodeid); - stored_plocks = 1; - } - - log_group(mg, "resend_journals to %d", memb->nodeid); - send_journals(mg, memb->nodeid); - } -} - /* old method: A is rw mount, B mounts rw @@ -1987,7 +2013,7 @@ void do_start(struct mountgroup *mg, int type, int member_count, int *nodeids) { - int pos = 0, neg = 0, low; + int pos = 0, neg = 0; mg->start_event_nr = mg->last_start; mg->start_type = type; @@ -1995,18 +2021,9 @@ log_group(mg, "start %d init %d type %d member_count %d", mg->last_start, mg->init, type, member_count); - low = mg->low_finished_nodeid; - recover_members(mg, member_count, nodeids, &pos, &neg); - reset_unfinished_recoveries(mg); - if (neg && low != mg->low_finished_nodeid && low == our_nodeid) { - log_group(mg, "low nodeid failed old %d new %d", - low, mg->low_finished_nodeid); - resend_journals(mg); - } - if (mg->init) { if (member_count == 1) start_first_mounter(mg);