From mboxrd@z Thu Jan 1 00:00:00 1970 From: teigland@sourceware.org Date: 6 Oct 2006 15:34:53 -0000 Subject: [Cluster-devel] cluster/group/gfs_controld cpg.c plock.c recover.c Message-ID: <20061006153453.9947.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Changes by: teigland at sourceware.org 2006-10-06 15:34:52 Modified files: group/gfs_controld: cpg.c plock.c recover.c Log message: - check cpg flow control status from openais when processing plocks - handle case where we're mounting and the only other mounted node fails -- we need to become the first mounter if we've not begun mount(2) yet - journal recovery requests need to be fed serially to gfs, we weren't doing that in the case where a gfs journal recovery was in progress when another node failed Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/cpg.c.diff?cvsroot=cluster&r1=1.7&r2=1.8 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/plock.c.diff?cvsroot=cluster&r1=1.20&r2=1.21 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.17&r2=1.18 --- cluster/group/gfs_controld/cpg.c 2006/09/08 22:44:33 1.7 +++ cluster/group/gfs_controld/cpg.c 2006/10/06 15:34:52 1.8 @@ -19,6 +19,7 @@ static int saved_nodeid; static int saved_len; static char saved_data[MAX_MSGLEN]; +int message_flow_control_on; void receive_journals(struct mountgroup *mg, char *buf, int len, int from); void receive_options(struct mountgroup *mg, char *buf, int len, int from); @@ -127,6 +128,7 @@ int process_cpg(void) { + cpg_flow_control_state_t flow_control_state; cpg_error_t error; got_msg = 0; @@ -142,6 +144,22 @@ if (got_msg) do_deliver(saved_nodeid, saved_data, saved_len); + + error = cpg_flow_control_state_get(daemon_handle, &flow_control_state); + if (error != CPG_OK) { + log_error("cpg_flow_control_state_get %d", error); + return -1; + } + + if (flow_control_state == CPG_FLOW_CONTROL_ENABLED) { + message_flow_control_on = 1; + log_debug("flow control on"); + } else { + if (message_flow_control_on) + log_debug("flow control off"); + message_flow_control_on = 0; + } + return 0; } --- cluster/group/gfs_controld/plock.c 2006/08/31 19:13:02 1.20 +++ cluster/group/gfs_controld/plock.c 2006/10/06 15:34:52 1.21 @@ -46,6 +46,7 @@ static int control_fd = -1; extern int our_nodeid; static int plocks_online = 0; +extern int message_flow_control_on; static SaCkptHandleT ckpt_handle; static SaCkptCallbacksT callbacks = { 0, 0 }; @@ -297,6 +298,10 @@ char *buf; int len, rv; + /* Don't send more messages while the cpg message queue is backed up */ + if (message_flow_control_on) + return 0; + memset(&info, 0, sizeof(info)); rv = read(control_fd, &info, sizeof(info)); --- cluster/group/gfs_controld/recover.c 2006/08/31 18:56:25 1.17 +++ cluster/group/gfs_controld/recover.c 2006/10/06 15:34:52 1.18 @@ -27,6 +27,7 @@ void start_participant_init_2(struct mountgroup *mg); void start_spectator_init_2(struct mountgroup *mg); void start_spectator_2(struct mountgroup *mg); +void notify_mount_client(struct mountgroup *mg); int set_sysfs(struct mountgroup *mg, char *field, int val) { @@ -1062,6 +1063,41 @@ unlink_checkpoint(mg); resend_journals(mg); } + + /* Tricky situation when we're mounting and the failed node was + the only other node that had the fs mounted. 
If the failed node + didn't send us a journals message, we need to: unlink ckpt, pick a + journal for ourselves, act like the first mounter of the fs (do + first-mounter-recovery, the dead node may have been mounting itself + and not finished first-mounter-recovery). */ + + else if (neg && mg->memb_count == 1) { + if (!mg->got_our_journals) { + log_group(mg, "we are left alone, act first mounter"); + + unlink_checkpoint(mg); + memb = find_memb_nodeid(mg, our_nodeid); + memb->jid = 0; + mg->our_jid = 0; + mg->first_mounter = 1; + mg->first_mounter_done = 0; + mg->got_our_options = 1; + mg->got_our_journals = 1; + mg->mount_client_delay = 0; + notify_mount_client(mg); + } else if (mg->mount_client_notified && !mg->got_kernel_mount) { + + /* FIXME */ + + log_group(mg, "FIXME: case not handled"); + + /* we got journals message from other node before it + died which means it finished first mounter recovery, + but we now need to tell gfs to recover the journal + after our own mount(2) completes */ + + } + } } struct mountgroup *create_mg(char *name) @@ -1121,7 +1157,7 @@ static int we_are_in_fence_domain(void) { group_data_t data; - int i, rv; + int rv; memset(&data, 0, sizeof(data)); @@ -1130,11 +1166,8 @@ if (rv || strcmp(data.client_name, "fence")) return 0; - for (i = 0; i < data.member_count; i++) { - if (data.members[i] == our_nodeid) - return 1; - } - + if (data.member == 1) + return 1; return 0; } @@ -1304,8 +1337,16 @@ the problem we're trying to avoid here is telling gfs-kernel to do recovery when it can't for some reason and then waiting forever for - a recovery_done signal that will never arrive. */ + a recovery_done signal that will never arrive. + FIXME: we want to do more here to avoid telling gfs-kernel to do recovery + until our mount is really complete. I want to keep the join/mount + connection between mount.gfs and gfs_controld open throughout the mount + and have mount.gfs use it to return the result from mount(2). Then we'll + know when the mount(2) is done and we should also be able to remove the + special mount_error_fd since errors can be sent back through the original + connection as well. 
*/ + void recover_journals(struct mountgroup *mg) { struct mg_member *memb; @@ -1318,24 +1359,25 @@ mg->kernel_mount_error || !mg->mount_client_notified || !mg->got_kernel_mount) { + log_group(mg, "recover_journals: unable %d,%d,%d,%d,%d,%d,%d", + mg->spectator, + mg->readonly, + mg->withdraw, + mg->our_jid, + mg->kernel_mount_error, + mg->mount_client_notified, + mg->got_kernel_mount); list_for_each_entry(memb, &mg->members_gone, list) { - if (!memb->tell_gfs_to_recover) - continue; - - log_group(mg, "recover journal %d nodeid %d skip: " - "%d %d %d %d %d %d %d", - memb->jid, memb->nodeid, - mg->spectator, - mg->readonly, - mg->withdraw, - mg->our_jid, - mg->kernel_mount_error, - mg->mount_client_notified, - mg->got_kernel_mount); - - memb->tell_gfs_to_recover = 0; - memb->local_recovery_status = RS_READONLY; + log_group(mg, "member gone %d jid %d " + "tell_gfs_to_recover %d", + memb->nodeid, memb->jid, + memb->tell_gfs_to_recover); + + if (memb->tell_gfs_to_recover) { + memb->tell_gfs_to_recover = 0; + memb->local_recovery_status = RS_READONLY; + } } start_done(mg); return; @@ -1346,6 +1388,15 @@ through the single recovery_done sysfs file */ list_for_each_entry(memb, &mg->members_gone, list) { + if (memb->wait_gfs_recover_done) { + log_group(mg, "delay new gfs recovery, " + "wait_gfs_recover_done for nodeid %d jid %d", + memb->nodeid, memb->jid); + return; + } + } + + list_for_each_entry(memb, &mg->members_gone, list) { if (!memb->tell_gfs_to_recover) continue; @@ -1416,6 +1467,17 @@ return 0; } +int need_kernel_recovery_done(struct mountgroup *mg) +{ + struct mg_member *memb; + + list_for_each_entry(memb, &mg->members_gone, list) { + if (memb->wait_gfs_recover_done) + return 1; + } + return 0; +} + /* Note: when a readonly node fails we do consider its journal (and the fs) to need recovery... not sure this is really necessary, but the readonly node did "own" a journal so it seems proper to recover @@ -1500,19 +1562,13 @@ log_group(mg, "recovery_done jid %d nodeid %d %s", memb->jid, memb->nodeid, ss); - out: - recover_journals(mg); - return 0; -} -int need_kernel_recovery_done(struct mountgroup *mg) -{ - struct mg_member *memb; + /* sanity check */ + if (need_kernel_recovery_done(mg)) + log_error("recovery_done: should be no pending gfs recoveries"); - list_for_each_entry(memb, &mg->members_gone, list) { - if (memb->wait_gfs_recover_done) - return 1; - } + out: + recover_journals(mg); return 0; }
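
A note on the cpg flow-control change, for readers outside the gfs_controld tree: the pattern is simply to poll openais for its flow-control state once process_cpg() has dispatched pending messages, cache the result in a flag, and have the plock path stop pulling requests from the kernel while the flag is set, so nothing new is multicast until openais drains its queue.  A minimal standalone sketch of that pattern follows; the cpg types and calls are the ones used in the patch above, while daemon_handle and the two helper names are illustrative stand-ins, not the committed code.

#include <openais/cpg.h>	/* header location varies across openais/corosync versions */

extern cpg_handle_t daemon_handle;	/* assumed set up elsewhere via cpg_initialize()/cpg_join() */
int message_flow_control_on;

/* Ask openais whether its cpg message queue is backed up and cache the
   answer; meant to run on each pass through the daemon's poll loop,
   after pending cpg messages have been dispatched and delivered. */
static int update_flow_control_state(void)
{
	cpg_flow_control_state_t state;
	cpg_error_t error;

	error = cpg_flow_control_state_get(daemon_handle, &state);
	if (error != CPG_OK)
		return -1;

	message_flow_control_on = (state == CPG_FLOW_CONTROL_ENABLED);
	return 0;
}

/* Plock-sending side: while flow control is on, don't read the next
   request from the kernel; it stays queued there and is picked up on a
   later pass once openais has drained its queue. */
static void send_next_plock_message(void)
{
	if (message_flow_control_on)
		return;

	/* ... read one request and cpg_mcast_joined() it to the group ... */
}

Leaving unread requests queued in the kernel rather than buffering them in the daemon keeps the back-pressure handling trivial: there is nothing to store or retransmit, the next poll simply tries again.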