From: teigland@sourceware.org <teigland@sourceware.org>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] cluster/group/gfs_controld lock_dlm.h plock.c ...
Date: 21 Aug 2006 17:46:20 -0000 [thread overview]
Message-ID: <20060821174620.30566.qmail@sourceware.org> (raw)
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: teigland at sourceware.org 2006-08-21 17:46:20
Modified files:
group/gfs_controld: lock_dlm.h plock.c recover.c
Log message:
- when we became the new low nodeid after the previous one failed, the
check made before unlinking the ckpt did not adequately verify that the
old low node had actually failed
- rename low_finished_nodeid to master_nodeid and clarify some of the
code using this since it was confusing and misleading
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/lock_dlm.h.diff?cvsroot=cluster&r1=1.15&r2=1.16
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/plock.c.diff?cvsroot=cluster&r1=1.18&r2=1.19
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.14&r2=1.15
--- cluster/group/gfs_controld/lock_dlm.h 2006/08/18 16:33:08 1.15
+++ cluster/group/gfs_controld/lock_dlm.h 2006/08/21 17:46:19 1.16
@@ -148,8 +148,8 @@
int first_mounter_done;
int emulate_first_mounter;
int wait_first_done;
- int low_finished_nodeid;
int low_nodeid;
+ int master_nodeid;
int save_plocks;
uint64_t cp_handle;
--- cluster/group/gfs_controld/plock.c 2006/08/18 16:33:08 1.18
+++ cluster/group/gfs_controld/plock.c 2006/08/21 17:46:19 1.19
@@ -1374,7 +1374,8 @@
saCkptSectionIterationFinalize(itr);
out:
if (mg->low_nodeid == our_nodeid) {
- log_group(mg, "retrieve_plocks: unlink ckpt from old low node");
+ /* we're the new low nodeid, will be master */
+ log_group(mg, "retrieve_plocks: unlink ckpt from old master");
_unlink_checkpoint(mg, &name);
} else
saCkptCheckpointClose(h);
--- cluster/group/gfs_controld/recover.c 2006/08/18 16:33:08 1.14
+++ cluster/group/gfs_controld/recover.c 2006/08/21 17:46:19 1.15
@@ -514,9 +514,8 @@
free(buf);
}
-/* We set the new member's jid to the lowest unused jid.
- If we're the lowest existing member (by nodeid), then
- send jid info to the new node. */
+/* We set the new member's jid to the lowest unused jid. If we're the lowest
+ existing member (by nodeid), then send jid info to the new node. */
/* Look at rw/ro/spectator status of all existing mounters and whether
we need to do recovery. Based on that, decide if the current mount
@@ -590,14 +589,14 @@
log_group(mg, "assign_journal: new member %d got jid %d",
new->nodeid, new->jid);
- if (mg->low_finished_nodeid == our_nodeid)
+ if (mg->master_nodeid == our_nodeid) {
store_plocks(mg, new->nodeid);
- /* if we're the first mounter and haven't gotten others_may_mount
- yet, then don't send journals until kernel_recovery_done_first
- so the second node won't mount the fs until omm. */
+ /* if we're the first mounter and haven't gotten
+ others_may_mount yet, then don't send journals until
+ kernel_recovery_done_first so the second node won't mount
+ the fs until omm. */
- if (mg->low_finished_nodeid == our_nodeid) {
if (mg->first_mounter && !mg->first_mounter_done) {
log_group(mg, "delay sending journals to %d",
new->nodeid);
@@ -911,13 +910,63 @@
clear_memb_list(&mg->members_gone);
}
+/* New mounters may be waiting for a journals message that a failed node (as
+ master) would have sent. If the master failed and we're the new master,
+ then send a journals message to any nodes for whom we've not seen a journals
+ message. We also need to checkpoint the plock state for the new nodes to
+ read after they get their journals message. */
+
+void resend_journals(struct mountgroup *mg)
+{
+ struct mg_member *memb;
+ int stored_plocks = 0;
+
+ list_for_each_entry(memb, &mg->members, list) {
+ if (!memb->needs_journals)
+ continue;
+
+ if (!stored_plocks) {
+ store_plocks(mg, memb->nodeid);
+ stored_plocks = 1;
+ }
+
+ log_group(mg, "resend_journals to %d", memb->nodeid);
+ send_journals(mg, memb->nodeid);
+ }
+}
+
+/* The master node is the member of the group with the lowest nodeid who
+ was also a member of the last "finished" group, i.e. a member of the
+ group the last time it got a finish callback. The job of the master
+ is to send state info to new nodes joining the group, and doing that
+ requires that the master has all the state to send -- a new joining
+ node that has the lowest nodeid doesn't have any state, which is why
+ we add the "finished" requirement. */
+
+void update_master_nodeid(struct mountgroup *mg)
+{
+ struct mg_member *memb;
+ int new = -1, low = -1;
+
+ list_for_each_entry(memb, &mg->members, list) {
+ if (low == -1 || memb->nodeid < low)
+ low = memb->nodeid;
+ if (!memb->finished)
+ continue;
+ if (new == -1 || memb->nodeid < new)
+ new = memb->nodeid;
+ }
+ mg->master_nodeid = new;
+ mg->low_nodeid = low;
+}
+
/* This can happen before we receive a journals message for our mount. */
void recover_members(struct mountgroup *mg, int num_nodes,
int *nodeids, int *pos_out, int *neg_out)
{
struct mg_member *memb, *safe;
- int i, found, id, pos = 0, neg = 0, low = -1, old_low_finished_nodeid;
+ int i, found, id, pos = 0, neg = 0, prev_master_nodeid;
/* move departed nodes from members list to members_gone */
@@ -982,30 +1031,31 @@
log_group(mg, "add member %d", id);
}
- list_for_each_entry(memb, &mg->members, list) {
- if (mg->low_nodeid == -1 || memb->nodeid < mg->low_nodeid)
- mg->low_nodeid = memb->nodeid;
- if (!memb->finished)
- continue;
- if (low == -1 || memb->nodeid < low)
- low = memb->nodeid;
- }
- old_low_finished_nodeid = mg->low_finished_nodeid;
- mg->low_finished_nodeid = low;
+ prev_master_nodeid = mg->master_nodeid;
+ update_master_nodeid(mg);
*pos_out = pos;
*neg_out = neg;
- log_group(mg, "total members %d low_finished_nodeid %d",
- mg->memb_count, low);
+ log_group(mg, "total members %d master_nodeid %d prev %d",
+ mg->memb_count, mg->master_nodeid, prev_master_nodeid);
- /* the low nodeid failed and we're the new low nodeid, we need
- to unlink the ckpt that the failed node had open so new ckpts
- can be created down the road */
- if ((old_low_finished_nodeid != low) && (our_nodeid == low)) {
- log_group(mg, "unlink ckpt for failed low node %d",
- old_low_finished_nodeid);
+ /* the master failed and we're the new master, we need to:
+ - unlink the ckpt that the failed master had open so new ckpts
+ can be created down the road
+ - resend journals msg to any nodes that needed one from the
+ failed master
+ - store plocks in ckpt for the new mounters to read when they
+ get the journals msg from us */
+
+ if (neg &&
+ (prev_master_nodeid != -1) &&
+ (prev_master_nodeid != mg->master_nodeid) &&
+ (our_nodeid == mg->master_nodeid)) {
+ log_group(mg, "unlink ckpt for failed master %d",
+ prev_master_nodeid);
unlink_checkpoint(mg);
+ resend_journals(mg);
}
}
@@ -1021,6 +1071,7 @@
INIT_LIST_HEAD(&mg->resources);
INIT_LIST_HEAD(&mg->saved_messages);
mg->init = 1;
+ mg->master_nodeid = -1;
mg->low_nodeid = -1;
strncpy(mg->name, name, MAXNAME);
@@ -1925,31 +1976,6 @@
}
}
-/* New mounters may be waiting for a journals message that a failed node (as
- low nodeid) would have sent. If the low nodeid failed and we're the new low
- nodeid, then send a journals message to any nodes for whom we've not seen a
- journals message. We also need to checkpoint the plock state for the new
- nodes to read after they get their journals message. */
-
-void resend_journals(struct mountgroup *mg)
-{
- struct mg_member *memb;
- int stored_plocks = 0;
-
- list_for_each_entry(memb, &mg->members, list) {
- if (!memb->needs_journals)
- continue;
-
- if (!stored_plocks) {
- store_plocks(mg, memb->nodeid);
- stored_plocks = 1;
- }
-
- log_group(mg, "resend_journals to %d", memb->nodeid);
- send_journals(mg, memb->nodeid);
- }
-}
-
/*
old method:
A is rw mount, B mounts rw
@@ -1987,7 +2013,7 @@
void do_start(struct mountgroup *mg, int type, int member_count, int *nodeids)
{
- int pos = 0, neg = 0, low;
+ int pos = 0, neg = 0;
mg->start_event_nr = mg->last_start;
mg->start_type = type;
@@ -1995,18 +2021,9 @@
log_group(mg, "start %d init %d type %d member_count %d",
mg->last_start, mg->init, type, member_count);
- low = mg->low_finished_nodeid;
-
recover_members(mg, member_count, nodeids, &pos, &neg);
-
reset_unfinished_recoveries(mg);
- if (neg && low != mg->low_finished_nodeid && low == our_nodeid) {
- log_group(mg, "low nodeid failed old %d new %d",
- low, mg->low_finished_nodeid);
- resend_journals(mg);
- }
-
if (mg->init) {
if (member_count == 1)
start_first_mounter(mg);
next reply other threads:[~2006-08-21 17:46 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2006-08-21 17:46 teigland [this message]
-- strict thread matches above, loose matches on Subject: below --
2006-08-18 16:33 [Cluster-devel] cluster/group/gfs_controld lock_dlm.h plock.c teigland
2006-08-08 21:19 teigland
2006-08-07 16:57 teigland
2006-08-04 21:56 teigland
2006-08-02 18:27 teigland
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20060821174620.30566.qmail@sourceware.org \
--to=teigland@sourceware.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.