From mboxrd@z Thu Jan 1 00:00:00 1970 From: teigland@sourceware.org Date: 15 Jun 2006 15:27:44 -0000 Subject: [Cluster-devel] cluster/group/gfs_controld cpg.c lock_dlm.h ma ... Message-ID: <20060615152744.10150.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Changes by: teigland at sourceware.org 2006-06-15 15:27:43 Modified files: group/gfs_controld: cpg.c lock_dlm.h main.c recover.c Log message: Significant reworking of how mounts are processed. The previous approach couldn't deal with certain node failures that occured while processing a new mounter. In this new approach, processing a mounter is largely independent of processing node failures. Nodes failing while processing a mounter hasn't actually been tested yet, so there are sure to be details to fix. Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/cpg.c.diff?cvsroot=cluster&r1=1.1&r2=1.2 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/lock_dlm.h.diff?cvsroot=cluster&r1=1.1&r2=1.2 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/main.c.diff?cvsroot=cluster&r1=1.1&r2=1.2 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.1&r2=1.2 --- cluster/group/gfs_controld/cpg.c 2006/06/09 20:59:57 1.1 +++ cluster/group/gfs_controld/cpg.c 2006/06/15 15:27:43 1.2 @@ -27,7 +27,7 @@ void receive_recovery_status(struct mountgroup *mg, char *buf, int len, int from); void receive_recovery_done(struct mountgroup *mg, char *buf, int len, int from); - +char *msg_name(int type); static void do_deliver(int nodeid, char *data, int len) { @@ -58,8 +58,8 @@ discard them since they're only relevant to the app group. */ if (!mg->last_callback) { - log_group(mg, "discard message type %d len %d from %d", - hd->type, len, nodeid); + log_group(mg, "discard %s len %d from %d", + msg_name(hd->type), len, nodeid); return; } --- cluster/group/gfs_controld/lock_dlm.h 2006/06/09 20:59:57 1.1 +++ cluster/group/gfs_controld/lock_dlm.h 2006/06/15 15:27:43 1.2 @@ -111,37 +111,37 @@ char dir[PATH_MAX+1]; char options[MAX_OPTIONS_LEN+1]; - char error_msg[128]; - int mount_client; - int remount_client; int last_stop; int last_start; int last_finish; int last_callback; int start_event_nr; int start_type; - int needs_recovery; - int our_jid; + + char error_msg[128]; + int mount_client; + int remount_client; + int init; + int got_our_options; + int got_our_journals; + int mount_client_notified; + int mount_client_delay; + int delay_send_journals; + int got_kernel_mount; int first_mounter; int first_mounter_done; int emulate_first_mounter; int wait_first_done; - int init; - int init2; int low_finished_nodeid; + + int needs_recovery; + int our_jid; int spectator; int readonly; int rw; int withdraw; - void *journals_msg; - int journals_msg_len; - int journals_msg_from; - void *options_msg; - int options_msg_len; - int options_msg_from; - struct list_head saved_recovery_status; - + struct list_head saved_messages; void *start2_fn; }; @@ -174,11 +174,12 @@ struct list_head list; int nodeid; int jid; - int new; + int spectator; int readonly; int rw; uint32_t opts; + int tell_gfs_to_recover; int wait_gfs_recover_done; int gone_event; @@ -188,6 +189,7 @@ int recovery_status; int withdraw; struct dlm_lksb wd_lksb; + int needs_journals; }; enum { @@ -234,6 +236,7 @@ int do_remount(int ci, char *dir, char *mode); int do_withdraw(char *name); int kernel_recovery_done(char *name); +void ping_kernel_mount(char *table); int client_send(int ci, char *buf, int len); --- cluster/group/gfs_controld/main.c 2006/06/09 20:59:57 1.1 +++ cluster/group/gfs_controld/main.c 2006/06/15 15:27:43 1.2 @@ -201,9 +201,10 @@ if (!strcmp(act, "change@")) kernel_recovery_done(argv[3]); - else if (!strcmp(act, "offline@")) do_withdraw(argv[3]); + else + ping_kernel_mount(argv[3]); return 0; } --- cluster/group/gfs_controld/recover.c 2006/06/09 20:59:57 1.1 +++ cluster/group/gfs_controld/recover.c 2006/06/15 15:27:43 1.2 @@ -16,10 +16,12 @@ struct list_head list; int nodeid; int len; - char buf[MAX_MSGLEN]; + int type; + char buf[0]; }; #define SYSFS_DIR "/sys/fs" +#define JID_INIT -9 extern char *clustername; extern int our_nodeid; @@ -27,12 +29,12 @@ struct list_head mounts; +void send_journals(struct mountgroup *mg, int nodeid); int hold_withdraw_locks(struct mountgroup *mg); void release_withdraw_lock(struct mountgroup *mg, struct mg_member *memb); void release_withdraw_locks(struct mountgroup *mg); void start_participant_init_2(struct mountgroup *mg); -void start_participant_2(struct mountgroup *mg); void start_spectator_init_2(struct mountgroup *mg); void start_spectator_2(struct mountgroup *mg); @@ -53,6 +55,8 @@ return -1; } + mg->got_kernel_mount = 1; + memset(out, 0, 16); sprintf(out, "%d", val); rv = write(fd, out, strlen(out)); @@ -81,6 +85,8 @@ return -1; } + mg->got_kernel_mount = 1; + rv = read(fd, buf, len); if (rv < 0) log_error("read %s error %d %d", fname, rv, errno); @@ -117,13 +123,6 @@ return NULL; } -void clear_new(struct mountgroup *mg) -{ - struct mg_member *memb; - list_for_each_entry(memb, &mg->members, list) - memb->new = 0; -} - static void start_done(struct mountgroup *mg) { log_group(mg, "start_done %d", mg->start_event_nr); @@ -239,37 +238,59 @@ { struct save_msg *sm, *sm2; - if (list_empty(&mg->saved_recovery_status)) + if (list_empty(&mg->saved_messages)) return; log_group(mg, "process_saved_recovery_status"); - list_for_each_entry_safe(sm, sm2, &mg->saved_recovery_status, list) { + list_for_each_entry_safe(sm, sm2, &mg->saved_messages, list) { + if (sm->type != MSG_RECOVERY_STATUS) + continue; _receive_recovery_status(mg, sm->buf, sm->len, sm->nodeid); list_del(&sm->list); free(sm); } } +char *msg_name(int type) +{ + switch (type) { + case MSG_JOURNAL: + return "MSG_JOURNAL"; + case MSG_OPTIONS: + return "MSG_OPTIONS"; + case MSG_REMOUNT: + return "MSG_REMOUNT"; + case MSG_PLOCK: + return "MSG_PLOCK"; + case MSG_RECOVERY_STATUS: + return "MSG_RECOVERY_STATUS"; + case MSG_RECOVERY_DONE: + return "MSG_RECOVERY_DONE"; + } + return "unknown"; +} + /* we can receive recovery_status messages from other nodes doing start before we actually process the corresponding start callback ourselves */ -void save_recovery_status(struct mountgroup *mg, char *buf, int len, int from) +void save_message(struct mountgroup *mg, char *buf, int len, int from, int type) { struct save_msg *sm; - sm = malloc(sizeof(struct save_msg)); + sm = malloc(sizeof(struct save_msg) + len); if (!sm) return; - memset(sm, 0, sizeof(struct save_msg)); + memset(sm, 0, sizeof(struct save_msg) + len); memcpy(&sm->buf, buf, len); + sm->type = type; sm->len = len; sm->nodeid = from; - list_add_tail(&sm->list, &mg->saved_recovery_status); + log_group(mg, "save %s from %d len %d", msg_name(type), from, len); - log_group(mg, "save_recovery_status from %d len %d", from, len); + list_add_tail(&sm->list, &mg->saved_messages); } void receive_recovery_status(struct mountgroup *mg, char *buf, int len, @@ -277,7 +298,7 @@ { switch (mg->last_callback) { case DO_STOP: - save_recovery_status(mg, buf, len, from); + save_message(mg, buf, len, from, MSG_RECOVERY_STATUS); break; case DO_START: _receive_recovery_status(mg, buf, len, from); @@ -460,6 +481,97 @@ free(buf); } +/* We set the new member's jid to the lowest unused jid. + If we're the lowest existing member (by nodeid), then + send jid info to the new node. */ + +/* Look at rw/ro/spectator status of all existing mounters and whether + we need to do recovery. Based on that, decide if the current mount + mode (ro/spectator) is permitted; if not, set jid = -2. If spectator + mount and it's ok, set jid = -1. If ro or rw mount and it's ok, set + real jid. */ + +int assign_journal(struct mountgroup *mg, struct mg_member *new) +{ + struct mg_member *memb; + int i, total, rw_count, ro_count, spect_count, invalid_count; + + total = rw_count = ro_count = spect_count = invalid_count = 0; + + list_for_each_entry(memb, &mg->members, list) { + if (memb->nodeid == new->nodeid) + continue; + total++; + if (memb->jid == -2) + invalid_count++; + else if (memb->spectator) + spect_count++; + else if (memb->rw) + rw_count++; + else if (memb->readonly) + ro_count++; + } + + log_group(mg, "assign_journal: total %d iv %d rw %d ro %d spect %d", + total, invalid_count, rw_count, ro_count, spect_count); + + /* do we let the new member mount? jid=-2 means no. + - we only allow an rw mount when the fs needs recovery + - we only allow a single rw mount when the fs needs recovery */ + + if (mg->needs_recovery) { + if (!new->rw || rw_count) + new->jid = -2; + } + + if (new->jid == -2) { + log_group(mg, "assign_journal: fail - needs_recovery %d", + mg->needs_recovery); + goto out; + } + + if (new->spectator) { + log_group(mg, "assign_journal: new spectator allowed"); + new->jid = -1; + goto out; + } + + for (i = 0; i < 1024; i++) { + memb = find_memb_jid(mg, i); + if (!memb) { + new->jid = i; + break; + } + } + + /* Currently the fs needs recovery, i.e. none of the current + mounters (ro/spectators) can recover journals. So, this new rw + mounter is told to do first-mounter recovery of all the journals. */ + + if (mg->needs_recovery) { + log_group(mg, "assign_journal: new member OPT_RECOVER"); + new->opts |= MEMB_OPT_RECOVER; + } + + out: + log_group(mg, "assign_journal: new member %d got jid %d", + new->nodeid, new->jid); + + /* if we're the first mounter and haven't gotten others_may_mount + yet, then don't send journals until kernel_recovery_done_first + so the second node won't mount the fs until omm. */ + + if (mg->low_finished_nodeid == our_nodeid) { + if (mg->first_mounter && !mg->first_mounter_done) { + log_group(mg, "delay sending journals to %d", + new->nodeid); + mg->delay_send_journals = new->nodeid; + } else + send_journals(mg, new->nodeid); + } + return 0; +} + void _receive_options(struct mountgroup *mg, char *buf, int len, int from) { struct mg_member *memb; @@ -475,9 +587,6 @@ return; } - if (from == our_nodeid) - return; - if (strstr(options, "spectator")) { memb->spectator = 1; memb->opts |= MEMB_OPT_SPECT; @@ -489,34 +598,58 @@ memb->opts |= MEMB_OPT_RO; } - log_group(mg, "receive_options from %d rw=%d ro=%d spect=%d opts=%x", + log_group(mg, "_receive_options from %d rw=%d ro=%d spect=%d opts=%x", from, memb->rw, memb->readonly, memb->spectator, memb->opts); + + assign_journal(mg, memb); } void receive_options(struct mountgroup *mg, char *buf, int len, int from) { struct gdlm_header *hd = (struct gdlm_header *)buf; - - if (hd->nodeid == our_nodeid) - return; + struct mg_member *memb; log_group(mg, "receive_options from %d len %d last_cb %d", from, len, mg->last_callback); - /* If last_callback isn't DO_START it means we've not gotten - the start callback for the new node addition yet, and we need to - save this message to be processed after we get our first start. */ - - if (mg->last_callback != DO_START) { - mg->options_msg = malloc(len); - mg->options_msg_len = len; - mg->options_msg_from = from; - memcpy(mg->options_msg, buf, len); - } else { - void (*start2)(struct mountgroup *mg) = mg->start2_fn; + if (hd->nodeid == our_nodeid) { + mg->got_our_options = 1; + return; + } + + if (!mg->got_our_options) { + log_group(mg, "ignore options from %d", from); + return; + } + + /* we can receive an options message before getting the start + that adds the mounting node that sent the options, or + we can receive options messages before we get the journals + message for out own mount */ + + memb = find_memb_nodeid(mg, from); + + if (!memb || !mg->got_our_journals) + save_message(mg, buf, len, from, MSG_OPTIONS); + else _receive_options(mg, buf, len, from); - start2(mg); - mg->start2_fn = NULL; +} + +void process_saved_options(struct mountgroup *mg) +{ + struct save_msg *sm, *sm2; + + if (list_empty(&mg->saved_messages)) + return; + + log_group(mg, "process_saved_options"); + + list_for_each_entry_safe(sm, sm2, &mg->saved_messages, list) { + if (sm->type != MSG_OPTIONS) + continue; + _receive_options(mg, sm->buf, sm->len, sm->nodeid); + list_del(&sm->list); + free(sm); } } @@ -564,6 +697,7 @@ void _receive_journals(struct mountgroup *mg, char *buf, int len, int from) { + void (*start2)(struct mountgroup *mg) = mg->start2_fn; struct mg_member *memb, *memb2; struct gdlm_header *hd; int *ids, count, i, nodeid, jid, opts; @@ -571,13 +705,6 @@ hd = (struct gdlm_header *)buf; count = (len - sizeof(struct gdlm_header)) / (NUM * sizeof(int)); - - if (count != mg->memb_count) { - log_error("invalid journals message len %d counts %d %d", - len, count, mg->memb_count); - return; - } - ids = (int *) (buf + sizeof(struct gdlm_header)); for (i = 0; i < count; i++) { @@ -615,140 +742,47 @@ memb->spectator = 1; } } + + /* we delay processing any options messages from new mounters + until after we receive the journals message for our own mount */ + process_saved_options(mg); + + start2(mg); } void receive_journals(struct mountgroup *mg, char *buf, int len, int from) { struct gdlm_header *hd = (struct gdlm_header *)buf; + struct mg_member *memb; int count; - if (hd->to_nodeid && hd->to_nodeid != our_nodeid) - return; - count = (len - sizeof(struct gdlm_header)) / (NUM * sizeof(int)); - log_group(mg, "receive_journals from %d len %d count %d last_cb %d", - from, len, count, mg->last_callback); - - /* If init is still 1 it means we've not run do_start() - for our join yet, and we need to save this message to be - processed after we get our first start. */ - - - /* it should now be impossible to receive a journals message prior to - our start because the node sending journals won't do so until - receiving our options message - if (mg->init) { - mg->journals_msg = malloc(len); - mg->journals_msg_len = len; - mg->journals_msg_from = from; - memcpy(mg->journals_msg, buf, len); - } else { - *******/ - - ASSERT(mg->last_callback == DO_START); - - { - void (*start2)(struct mountgroup *mg) = mg->start2_fn; - _receive_journals(mg, buf, len, from); - start2(mg); - mg->start2_fn = NULL; - } -} - -/* We set the new member's jid to the lowest unused jid. - If we're the lowest existing member (by nodeid), then - send jid info to the new node. */ - -/* Look at rw/ro/spectator status of all existing mounters and whether - we need to do recovery. Based on that, decide if the current mount - mode (ro/spectator) is permitted; if not, set jid = -2. If spectator - mount and it's ok, set jid = -1. If ro or rw mount and it's ok, set - real jid. */ - -int discover_journals(struct mountgroup *mg) -{ - struct mg_member *memb, *new = NULL; - int i, total, rw_count, ro_count, spect_count, invalid_count; - - total = rw_count = ro_count = spect_count = invalid_count = 0; + log_group(mg, "receive_journals from %d to %d len %d count %d cb %d", + from, hd->to_nodeid, len, count, mg->last_callback); - list_for_each_entry(memb, &mg->members, list) { - if (memb->new && new) { - log_error("more than one new member %d %d", - new->nodeid, memb->nodeid); - return -1; - } else if (memb->new) { - new = memb; - } else { - total++; - if (memb->jid == -2) - invalid_count++; - else if (memb->spectator) - spect_count++; - else if (memb->rw) - rw_count++; - else if (memb->readonly) - ro_count++; - } - } - - if (!new) { - log_group(mg, "discover_journals: no new member"); - return 0; - } + /* just like we can receive an options msg from a newly added node + before we get the start adding it, we can receive the journals + message sent to it before we get the start adding it */ - log_group(mg, "discover_journals: total %d iv %d rw %d ro %d spect %d", - total, invalid_count, rw_count, ro_count, spect_count); - - log_group(mg, "discover_journals: new member %d rw=%d ro=%d spect=%d", - new->nodeid, new->rw, new->readonly, new->spectator); - - /* do we let the new member mount? jid=-2 means no. - - we only allow an rw mount when the fs needs recovery - - we only allow a single rw mount when the fs needs recovery */ - - if (mg->needs_recovery) { - if (!new->rw || rw_count) - new->jid = -2; - } - - if (new->jid == -2) { - log_group(mg, "discover_journals: fail - needs_recovery %d", - mg->needs_recovery); - goto out; - } - - if (new->spectator) { - log_group(mg, "discover_journals: new spectator allowed"); - new->jid = -1; - goto out; - } - - for (i = 0; i < 1024; i++) { - memb = find_memb_jid(mg, i); - if (!memb) { - new->jid = i; - break; - } + memb = find_memb_nodeid(mg, hd->to_nodeid); + if (!memb) { + log_group(mg, "receive_journals from %d to unknown %d", + from, hd->to_nodeid); + return; } + memb->needs_journals = 0; - /* Currently the fs needs recovery, i.e. none of the current - mounters (ro/spectators) can recover journals. So, this new rw - mounter is told to do first-mounter recovery of all the journals. */ + if (hd->to_nodeid && hd->to_nodeid != our_nodeid) + return; - if (mg->needs_recovery) { - log_group(mg, "discover_journals: new member OPT_RECOVER"); - new->opts |= MEMB_OPT_RECOVER; + if (mg->got_our_journals) { + log_group(mg, "receive_journals from %d duplicate", from); + return; } + mg->got_our_journals = 1; - out: - log_group(mg, "discover_journals: new member %d got jid %d", - new->nodeid, new->jid); - - if (mg->low_finished_nodeid == our_nodeid) - send_journals(mg, new->nodeid); - return 0; + _receive_journals(mg, buf, len, from); } static void add_ordered_member(struct mountgroup *mg, struct mg_member *new) @@ -786,10 +820,13 @@ memset(memb, 0, sizeof(*memb)); memb->nodeid = nodeid; - memb->jid = -9; - memb->new = 1; + memb->jid = JID_INIT; add_ordered_member(mg, memb); mg->memb_count++; + + if (!mg->init) + memb->needs_journals = 1; + return 0; } @@ -837,6 +874,8 @@ clear_memb_list(&mg->members_gone); } +/* This can happen before we receive a journals message for our mount. */ + void recover_members(struct mountgroup *mg, int num_nodes, int *nodeids, int *pos_out, int *neg_out) { @@ -867,13 +906,13 @@ memb->local_recovery_status = 0; /* - journal cb for failed or withdrawing nodes - - journal cb only if failed node finished joining + - failed node was assigned a journal - no journal cb if failed node was spectator - no journal cb if we've already done a journl cb */ if ((memb->gone_type == GROUP_NODE_FAILED || memb->withdraw) && - memb->mount_finished && + memb->jid != JID_INIT && !memb->spectator && !memb->wait_gfs_recover_done) { memb->tell_gfs_to_recover = 1; @@ -887,7 +926,7 @@ mg->spectator, mg->start_type, memb->withdraw, - memb->mount_finished, + memb->jid, memb->spectator, memb->wait_gfs_recover_done); } @@ -929,9 +968,8 @@ INIT_LIST_HEAD(&mg->members); INIT_LIST_HEAD(&mg->members_gone); INIT_LIST_HEAD(&mg->resources); - INIT_LIST_HEAD(&mg->saved_recovery_status); + INIT_LIST_HEAD(&mg->saved_messages); mg->init = 1; - mg->init2 = 1; strncpy(mg->name, name, MAXNAME); @@ -1130,10 +1168,11 @@ we delayed calling start_done() (to complete adding the second node) until here. */ - if (mg->wait_first_done) { - clear_new(mg); + if (mg->wait_first_done) start_done(mg); - } + + if (mg->delay_send_journals) + send_journals(mg, mg->delay_send_journals); } return 0; } @@ -1147,13 +1186,15 @@ struct mg_member *memb; int rv; - if (mg->spectator || mg->readonly) { + if (mg->spectator || mg->readonly || mg->our_jid == JID_INIT) { list_for_each_entry(memb, &mg->members_gone, list) { if (!memb->tell_gfs_to_recover) continue; - log_group(mg, "recover journal %d nodeid %d skip ro", - memb->jid, memb->nodeid); + log_group(mg, "recover journal %d nodeid %d skip, " + "spect %d ro %d our_jid %d", + memb->jid, memb->nodeid, + mg->spectator, mg->readonly, mg->our_jid); memb->tell_gfs_to_recover = 0; memb->local_recovery_status = RS_READONLY; } @@ -1395,6 +1436,11 @@ strncpy(buf, mg->error_msg, MAXLINE); error = 1; } else { + if (mg->mount_client_delay) { + log_group(mg, "notify_mount_client delayed"); + return; + } + if (mg->our_jid < 0) snprintf(buf, MAXLINE, "hostdata=id=%u:first=%d", mg->id, mg->first_mounter); @@ -1414,7 +1460,24 @@ if (error) { log_group(mg, "leaving due to mount error: %s", mg->error_msg); group_leave(gh, mg->name); - } + } else + mg->mount_client_notified = 1; +} + +void ping_kernel_mount(char *table) +{ + struct mountgroup *mg; + char buf[MAXLINE]; + char *name = strstr(table, ":") + 1; + int rv; + + mg = find_mg(name); + if (!mg) + return; + + rv = get_sysfs(mg, "id", buf, sizeof(buf)); + + log_group(mg, "ping_kernel_mount %d", rv); } /* When mounting a fs, we first join the mountgroup, then tell mount.gfs @@ -1438,9 +1501,54 @@ } } +/* The processing of new mounters (send/recv options, send/recv journals, + notify mount.gfs) is not very integrated with the stop/start/finish + callbacks from libgroup. A start callback just notifies us of a new + mounter and the options/journals messages drive things from there. + Recovery for failed nodes _is_ controlled more directly by the + stop/start/finish callbacks. So, processing new mounters happens + independently of recovery and of the libgroup callbacks. One place + where they need to intersect, though, is in stopping/suspending + gfs-kernel: + - When we get a stop callback, we need to be certain that gfs-kernel + is blocked. + - When a mounter notifies mount.gfs to go ahead, gfs-kernel will + shortly begin running in an unblocked fashion as it goes through + the kernel mounting process. + Given this, we need to be sure that if gfs-kernel is supposed to be + blocked, we don't notify mount.gfs to go ahead and do the kernel mount + since that starts gfs-kernel in an unblocked state. */ + +/* - if we're unmounting, the kernel is gone, so no problem. + - if we've just mounted and notified mount.gfs, then wait for kernel + mount and then block. + - if we're mounting and have not yet notified mount.gfs, then set + a flag that delays the notification until block is set to 0. */ + int do_stop(struct mountgroup *mg) { - set_sysfs(mg, "block", 1); + int rv; + + for (;;) { + rv = set_sysfs(mg, "block", 1); + if (!rv) + break; + + /* if the kernel instance of gfs existed before but now + we can't see it, that must mean it's been unmounted, + so it's implicitly stopped */ + + if (mg->got_kernel_mount) + break; + + if (mg->mount_client_notified) + wait_for_kernel_mount(mg); + else { + mg->mount_client_delay = 1; + break; + } + } + group_stop_done(gh, mg->name); return 0; } @@ -1498,12 +1606,17 @@ leave_blocked = 1; } - if (mg->mount_client) { - notify_mount_client(mg); - wait_for_kernel_mount(mg); - } else if (!leave_blocked) + if (!leave_blocked) { set_sysfs(mg, "block", 0); + /* we may have been holding back our local mount due to + being stopped/blocked */ + if (mg->mount_client_delay) { + mg->mount_client_delay = 0; + notify_mount_client(mg); + } + } + return 0; } @@ -1544,9 +1657,7 @@ struct mg_member *memb; log_group(mg, "start_first_mounter"); - set_our_memb_options(mg); - memb = find_memb_nodeid(mg, our_nodeid); ASSERT(memb); @@ -1561,11 +1672,12 @@ mg->our_jid = 0; mg->first_mounter = 1; mg->first_mounter_done = 0; + mg->got_our_options = 1; + mg->got_our_journals = 1; hold_withdraw_locks(mg); } - clear_new(mg); start_done(mg); - mg->init = 0; + notify_mount_client(mg); } /* called for the initial start on a rw/ro mounter; @@ -1574,25 +1686,11 @@ void start_participant_init(struct mountgroup *mg) { log_group(mg, "start_participant_init"); - set_our_memb_options(mg); send_options(mg); hold_withdraw_locks(mg); - - if (mg->journals_msg) { - _receive_journals(mg, - mg->journals_msg, - mg->journals_msg_len, - mg->journals_msg_from); - free(mg->journals_msg); - mg->journals_msg = NULL; - - start_participant_init_2(mg); - } else { - /* will be called in receive_journals() */ - mg->start2_fn = start_participant_init_2; - } - mg->init = 0; + start_done(mg); + mg->start2_fn = start_participant_init_2; } /* called for the initial start on a rw/ro mounter after _receive_journals() */ @@ -1613,7 +1711,7 @@ /* fs needs recovery and existing mounters can't recover it, i.e. they're spectator/readonly, so we're told to do first-mounter recovery on the fs. */ - + if (first_mounter_recovery(mg)) { log_group(mg, "first_mounter_recovery"); mg->emulate_first_mounter = 1; @@ -1621,12 +1719,13 @@ mg->first_mounter_done = 0; } out: - clear_new(mg); - start_done(mg); - mg->init2 = 0; + notify_mount_client(mg); } -/* called for a non-initial start on a normal mounter */ +/* called for a non-initial start on a normal mounter. + NB we can get here without having received a journals message for + our (recent) mount yet in which case we don't know the jid or ro/rw + status of any members, and don't know our own jid. */ void start_participant(struct mountgroup *mg, int pos, int neg) { @@ -1635,80 +1734,40 @@ if (pos) { hold_withdraw_locks(mg); - if (mg->options_msg) { - _receive_options(mg, - mg->options_msg, - mg->options_msg_len, - mg->options_msg_from); - free(mg->options_msg); - mg->options_msg = NULL; + /* If we're the first mounter, and we're adding a second + node here, but haven't gotten first_done (others_may_mount) + from gfs yet, then don't do the start_done() to complete + adding the second node. Set wait_first_done=1 to have + first_recovery_done() call start_done(). This also requires + that we unblock locking on the first mounter if gfs hasn't + done others_may_mount yet. */ + + if (mg->first_mounter && !mg->first_mounter_done) { + mg->wait_first_done = 1; + set_sysfs(mg, "block", 0); + log_group(mg, "delay start_done til others_may_mount"); + } else + start_done(mg); + + mg->start2_fn = NULL; + process_saved_options(mg); - start_participant_2(mg); - } else { - /* will be called in receive_options() */ - mg->start2_fn = start_participant_2; - } } else if (neg) { recover_journals(mg); process_saved_recovery_status(mg); } } -/* called for a non-initial start on a normal mounter when adding a node, - after _receive_options(). we need to know if the new node is a spectator - or not (from options) before deciding if it should be given a journal - in discover_journals() */ - -void start_participant_2(struct mountgroup *mg) -{ - log_group(mg, "start_participant_2"); - - discover_journals(mg); - - /* If we're the first mounter, and we're adding a second - node here, but haven't gotten first_done (others_may_mount) from gfs - yet, then don't do the start_done() to complete adding the - second node. Set wait_first_done=1 to have first_recovery_done() - call start_done(). - This also requires that we unblock locking on the first - mounter if gfs hasn't done others_may_mount yet. */ - - if (mg->init2 && mg->first_mounter && !mg->first_mounter_done) { - mg->wait_first_done = 1; - set_sysfs(mg, "block", 0); - log_group(mg, "delay start_done until others_may_mount"); - } else { - clear_new(mg); - start_done(mg); - } - - mg->init2 = 0; -} - /* called for the initial start on a spectator mounter */ void start_spectator_init(struct mountgroup *mg) { log_group(mg, "start_spectator_init"); - set_our_memb_options(mg); send_options(mg); hold_withdraw_locks(mg); - - if (mg->journals_msg) { - _receive_journals(mg, - mg->journals_msg, - mg->journals_msg_len, - mg->journals_msg_from); - free(mg->journals_msg); - mg->journals_msg = NULL; - - start_spectator_init_2(mg); - } else { - /* will be called in receive_journals() */ - mg->start2_fn = start_spectator_init_2; - } - mg->init = 0; + start_done(mg); + mg->start2_fn = start_spectator_init_2; } /* called for the initial start on a spectator mounter, @@ -1726,9 +1785,7 @@ else ASSERT(mg->our_jid == -1); - clear_new(mg); - start_done(mg); - mg->init2 = 0; + notify_mount_client(mg); } /* called for a non-initial start on a spectator mounter */ @@ -1739,37 +1796,14 @@ if (pos) { hold_withdraw_locks(mg); - - if (mg->options_msg) { - _receive_options(mg, - mg->options_msg, - mg->options_msg_len, - mg->options_msg_from); - free(mg->options_msg); - mg->options_msg = NULL; - - start_spectator_2(mg); - } else { - /* will be called in receive_options() */ - mg->start2_fn = start_spectator_2; - } + start_done(mg); + process_saved_options(mg); } else if (neg) { recover_journals(mg); process_saved_recovery_status(mg); } } -/* called for a non-initial start on a spectator mounter when adding a - node, after _receive_options() */ - -void start_spectator_2(struct mountgroup *mg) -{ - log_group(mg, "start_spectator_2"); - discover_journals(mg); - clear_new(mg); - start_done(mg); -} - /* If nodeA fails, nodeB is recovering journalA and nodeB fails before finishing, then nodeC needs to tell gfs to recover both journalA and journalB. We do this by setting tell_gfs_to_recover back to 1 for @@ -1792,7 +1826,25 @@ } } +/* New mounters may be waiting for a journals message that a failed node (as + low nodeid) would have sent. If the low nodeid failed and we're the new low + nodeid, then send a journals message to any nodes for whom we've not seen a + journals message. */ + +void resend_journals(struct mountgroup *mg) +{ + struct mg_member *memb; + + list_for_each_entry(memb, &mg->members, list) { + if (!memb->needs_journals) + continue; + log_group(mg, "resend_journals to %d", memb->nodeid); + send_journals(mg, memb->nodeid); + } +} + /* + old method: A is rw mount, B mounts rw do_start do_start @@ -1808,11 +1860,27 @@ start_participant_init_2 group_start_done do_finish do_finish + + new method: decouples stop/start/finish from mount processing + A is rw mount, B mounts rw + + do_start do_start + start_participant start_participant_init + start_done send_options + start_done + do_finish do_finish + + receive_options + assign_journal + send_journals + receive_journals + start_participant_init_2 + notify_mount_client */ void do_start(struct mountgroup *mg, int type, int member_count, int *nodeids) { - int pos = 0, neg = 0; + int pos = 0, neg = 0, low; mg->start_event_nr = mg->last_start; mg->start_type = type; @@ -1820,10 +1888,18 @@ log_group(mg, "start %d init %d type %d member_count %d", mg->last_start, mg->init, type, member_count); + low = mg->low_finished_nodeid; + recover_members(mg, member_count, nodeids, &pos, &neg); reset_unfinished_recoveries(mg); + if (neg && low != mg->low_finished_nodeid && low == our_nodeid) { + log_group(mg, "low nodeid failed old %d new %d", + low, mg->low_finished_nodeid); + resend_journals(mg); + } + if (mg->init) { if (member_count == 1) start_first_mounter(mg); @@ -1831,6 +1907,7 @@ start_spectator_init(mg); else start_participant_init(mg); + mg->init = 0; } else { if (mg->spectator) start_spectator(mg, pos, neg); @@ -1839,8 +1916,7 @@ } } -/* FIXME: - +/* What repurcussions are there from umount shutting down gfs in the kernel before we leave the mountgroup? We can no longer participate in recovery even though we're in the group -- what are the end cases