From mboxrd@z Thu Jan 1 00:00:00 1970 From: teigland@sourceware.org Date: 15 Jun 2006 20:41:47 -0000 Subject: [Cluster-devel] cluster/group/gfs_controld Makefile cpg.c grou ... Message-ID: <20060615204147.14251.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Changes by: teigland at sourceware.org 2006-06-15 20:41:46 Modified files: group/gfs_controld: Makefile cpg.c group.c lock_dlm.h main.c member_cman.c recover.c Log message: Complete the code to support withdraw, not yet tested. This also switches from using dlm locks for withdraw notifications to simply using messages. The way the daemon now works allows a much simpler approach to withdraw than what we had before where we needed the dlm locks. Setting up a dlm lockspace for the daemon was also an annoyingly heavy-weight step and the dlm kernel state of the daemon made cleaning up from crashes difficult. Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/Makefile.diff?cvsroot=cluster&r1=1.1&r2=1.2 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/cpg.c.diff?cvsroot=cluster&r1=1.2&r2=1.3 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/group.c.diff?cvsroot=cluster&r1=1.1&r2=1.2 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/lock_dlm.h.diff?cvsroot=cluster&r1=1.2&r2=1.3 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/main.c.diff?cvsroot=cluster&r1=1.2&r2=1.3 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/member_cman.c.diff?cvsroot=cluster&r1=1.1&r2=1.2 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.2&r2=1.3 --- cluster/group/gfs_controld/Makefile 2006/06/09 20:59:57 1.1 +++ cluster/group/gfs_controld/Makefile 2006/06/15 20:41:46 1.2 @@ -22,8 +22,7 @@ -I../include/ \ -I../lib/ \ -I../../cman/lib/ \ - -I../../cman/daemon/openais/trunk/include/ \ - -I../../dlm/lib/ + -I../../cman/daemon/openais/trunk/include/ TARGET=gfs_controld @@ -38,8 +37,6 @@ group.o \ plock.o \ recover.o \ - withdraw.o \ - ../../dlm/lib/libdlm_lt.a \ ../../cman/lib/libcman.a \ ../../cman/daemon/openais/trunk/lib/libcpg.a \ ../lib/libgroup.a --- cluster/group/gfs_controld/cpg.c 2006/06/15 15:27:43 1.2 +++ cluster/group/gfs_controld/cpg.c 2006/06/15 20:41:46 1.3 @@ -24,6 +24,7 @@ void receive_options(struct mountgroup *mg, char *buf, int len, int from); void receive_remount(struct mountgroup *mg, char *buf, int len, int from); void receive_plock(struct mountgroup *mg, char *buf, int len, int from); +void receive_withdraw(struct mountgroup *mg, char *buf, int len, int from); void receive_recovery_status(struct mountgroup *mg, char *buf, int len, int from); void receive_recovery_done(struct mountgroup *mg, char *buf, int len, int from); @@ -88,6 +89,10 @@ receive_recovery_done(mg, data, len, nodeid); break; + case MSG_WITHDRAW: + receive_withdraw(mg, data, len, nodeid); + break; + default: log_error("unknown message type %d from %d", hd->type, hd->nodeid); --- cluster/group/gfs_controld/group.c 2006/06/09 20:59:57 1.1 +++ cluster/group/gfs_controld/group.c 2006/06/15 20:41:46 1.2 @@ -147,8 +147,6 @@ log_debug("groupd callback: terminate %s", cb_name); mg->last_callback = DO_TERMINATE; do_terminate(mg); - list_del(&mg->list); - free(mg); break; case DO_SETID: --- cluster/group/gfs_controld/lock_dlm.h 2006/06/15 15:27:43 1.2 +++ cluster/group/gfs_controld/lock_dlm.h 2006/06/15 20:41:46 1.3 @@ -36,7 +36,6 @@ #include "list.h" #include "linux_endian.h" #include "libgroup.h" -#include "libdlm.h" #define MAXARGS 64 #define MAXLINE 256 @@ -184,11 +183,10 @@ int wait_gfs_recover_done; int gone_event; int gone_type; - int mount_finished; + int finished; int local_recovery_status; int recovery_status; - int withdraw; - struct dlm_lksb wd_lksb; + int withdrawing; int needs_journals; }; @@ -197,6 +195,7 @@ MSG_OPTIONS, MSG_REMOUNT, MSG_PLOCK, + MSG_WITHDRAW, MSG_RECOVERY_STATUS, MSG_RECOVERY_DONE, }; @@ -223,12 +222,9 @@ int process_cpg(void); int setup_groupd(void); int process_groupd(void); -int setup_libdlm(void); -int process_libdlm(void); int setup_plocks(void); int process_plocks(void); void exit_cman(void); -void exit_libdlm(void); int do_mount(int ci, char *dir, char *type, char *proto, char *table, char *options); --- cluster/group/gfs_controld/main.c 2006/06/15 15:27:43 1.2 +++ cluster/group/gfs_controld/main.c 2006/06/15 20:41:46 1.3 @@ -29,10 +29,10 @@ static int listen_fd; static int groupd_fd; static int uevent_fd; -static int libdlm_fd; static int plocks_fd; extern struct list_head mounts; +extern struct list_head withdrawn_mounts; int no_withdraw; static void make_args(char *buf, int *argc, char **argv, char sep) @@ -266,14 +266,6 @@ goto out; client_add(uevent_fd, &maxi); - if (no_withdraw) - goto next; - - rv = libdlm_fd = setup_libdlm(); - if (rv < 0) - goto next; - client_add(libdlm_fd, &maxi); - next: rv = plocks_fd = setup_plocks(); if (rv < 0) goto out; @@ -309,9 +301,6 @@ process_cpg(); else if (pollfd[i].fd == uevent_fd) process_uevent(); - else if (!no_withdraw && - pollfd[i].fd == libdlm_fd) - process_libdlm(); else if (pollfd[i].fd == plocks_fd) process_plocks(); else @@ -456,6 +445,7 @@ { prog_name = argv[0]; INIT_LIST_HEAD(&mounts); + INIT_LIST_HEAD(&withdrawn_mounts); client_init(); decode_arguments(argc, argv); --- cluster/group/gfs_controld/member_cman.c 2006/06/09 20:59:57 1.1 +++ cluster/group/gfs_controld/member_cman.c 2006/06/15 20:41:46 1.2 @@ -43,10 +43,7 @@ void exit_cman(void) { - /* do we want to try to forcibly clean some stuff up - in the kernel here? */ log_error("cluster is down, exiting"); - exit_libdlm(); exit(1); } --- cluster/group/gfs_controld/recover.c 2006/06/15 15:27:43 1.2 +++ cluster/group/gfs_controld/recover.c 2006/06/15 20:41:46 1.3 @@ -26,14 +26,12 @@ extern char *clustername; extern int our_nodeid; extern group_handle_t gh; +extern int no_withdraw; struct list_head mounts; +struct list_head withdrawn_mounts; void send_journals(struct mountgroup *mg, int nodeid); -int hold_withdraw_locks(struct mountgroup *mg); -void release_withdraw_lock(struct mountgroup *mg, struct mg_member *memb); -void release_withdraw_locks(struct mountgroup *mg); - void start_participant_init_2(struct mountgroup *mg); void start_spectator_init_2(struct mountgroup *mg); void start_spectator_2(struct mountgroup *mg); @@ -146,6 +144,46 @@ mg->remount_client = 0; } +void send_withdraw(struct mountgroup *mg) +{ + struct gdlm_header *hd; + int len; + char *buf; + + len = sizeof(struct gdlm_header); + + buf = malloc(len); + if (!buf) + return; + memset(buf, 0, len); + + hd = (struct gdlm_header *)buf; + hd->type = MSG_WITHDRAW; + hd->nodeid = our_nodeid; + hd->to_nodeid = 0; + + log_group(mg, "send_withdraw"); + + send_group_message(mg, len, buf); + + free(buf); +} + +void receive_withdraw(struct mountgroup *mg, char *buf, int len, int from) +{ + struct mg_member *memb; + + memb = find_memb_nodeid(mg, from); + if (!memb) { + log_group(mg, "receive_withdraw no member %d", from); + return; + } + memb->withdrawing = 1; + + if (from == our_nodeid) + group_leave(gh, mg->name); +} + #define SEND_RS_INTS 3 void send_recovery_status(struct mountgroup *mg) @@ -267,6 +305,8 @@ return "MSG_RECOVERY_STATUS"; case MSG_RECOVERY_DONE: return "MSG_RECOVERY_DONE"; + case MSG_WITHDRAW: + return "MSG_WITHDRAW"; } return "unknown"; } @@ -911,7 +951,7 @@ - no journal cb if we've already done a journl cb */ if ((memb->gone_type == GROUP_NODE_FAILED || - memb->withdraw) && + memb->withdrawing) && memb->jid != JID_INIT && !memb->spectator && !memb->wait_gfs_recover_done) { @@ -925,7 +965,7 @@ memb->nodeid, memb->tell_gfs_to_recover, mg->spectator, mg->start_type, - memb->withdraw, + memb->withdrawing, memb->jid, memb->spectator, memb->wait_gfs_recover_done); @@ -944,7 +984,7 @@ } list_for_each_entry(memb, &mg->members, list) { - if (!memb->mount_finished) + if (!memb->finished) continue; if (low == -1 || memb->nodeid < low) low = memb->nodeid; @@ -1186,7 +1226,12 @@ struct mg_member *memb; int rv; - if (mg->spectator || mg->readonly || mg->our_jid == JID_INIT) { + /* we can't do journal recovery if: we're a spectator or readonly + mount, gfs is currently withdrawing, or we're mounting and haven't + received a journals message yet */ + + if (mg->spectator || mg->readonly || mg->withdraw || + mg->our_jid == JID_INIT) { list_for_each_entry(memb, &mg->members_gone, list) { if (!memb->tell_gfs_to_recover) continue; @@ -1406,11 +1451,25 @@ { struct mountgroup *mg; + list_for_each_entry(mg, &withdrawn_mounts, list) { + if (!strcmp(mg->dir, dir)) { + log_group(mg, "unmount withdrawn fs"); + list_del(&mg->list); + free(mg); + return 0; + } + } + mg = find_mg_dir(dir); if (!mg) { log_error("do_unmount: unknown mount dir %s", dir); return -1; } + + if (mg->withdraw) { + log_error("do_unmount: fs on %s is withdrawing", dir); + return -1; + } /* Check to see if we're waiting for a kernel recovery_done to do a start_done(). If so, call the start_done() here because we won't be @@ -1567,9 +1626,6 @@ from members_gone if their journals have been recovered */ list_for_each_entry_safe(memb, safe, &mg->members_gone, list) { - if (!memb->withdraw) - release_withdraw_lock(mg, memb); - if (!memb->recovery_status) { list_del(&memb->list); free(memb); @@ -1588,18 +1644,8 @@ } } - list_for_each_entry(memb, &mg->members, list) { - memb->mount_finished = 1; - - /* If there are still withdrawing nodes that haven't left - the group, we need to keep lock requests blocked */ - - if (memb->withdraw) { - log_group(mg, "finish: leave locks blocked for " - "withdrawing node %d", memb->nodeid); - leave_blocked = 1; - } - } + list_for_each_entry(memb, &mg->members, list) + memb->finished = 1; if (mg->needs_recovery) { log_group(mg, "finish: leave locks blocked for needs_recovery"); @@ -1674,7 +1720,6 @@ mg->first_mounter_done = 0; mg->got_our_options = 1; mg->got_our_journals = 1; - hold_withdraw_locks(mg); } start_done(mg); notify_mount_client(mg); @@ -1688,7 +1733,6 @@ log_group(mg, "start_participant_init"); set_our_memb_options(mg); send_options(mg); - hold_withdraw_locks(mg); start_done(mg); mg->start2_fn = start_participant_init_2; } @@ -1732,8 +1776,6 @@ log_group(mg, "start_participant pos=%d neg=%d", pos, neg); if (pos) { - hold_withdraw_locks(mg); - /* If we're the first mounter, and we're adding a second node here, but haven't gotten first_done (others_may_mount) from gfs yet, then don't do the start_done() to complete @@ -1765,7 +1807,6 @@ log_group(mg, "start_spectator_init"); set_our_memb_options(mg); send_options(mg); - hold_withdraw_locks(mg); start_done(mg); mg->start2_fn = start_spectator_init_2; } @@ -1795,7 +1836,6 @@ log_group(mg, "start_spectator pos=%d neg=%d", pos, neg); if (pos) { - hold_withdraw_locks(mg); start_done(mg); process_saved_options(mg); } else if (neg) { @@ -1937,12 +1977,57 @@ that needs journal recovery, we have a problem because we wait to call group_start_done() until gfs in the kernel to signal that the journal recovery is done. If we've unmounted gfs isn't there - any more to give us this signal and we'll never call start_done. */ + any more to give us this signal and we'll never call start_done. + + update: we should be dealing with all these issues correctly now. */ int do_terminate(struct mountgroup *mg) { - log_group(mg, "termination of our unmount leave"); - release_withdraw_locks(mg); + /* FIXME: all group members aren't guaranteed to be stopped for + our leave yet when we get terminate. We need that guarantee + before we tell a withdrawing gfs to drop locks. */ + + if (mg->withdraw) { + log_group(mg, "termination of our withdraw leave"); + set_sysfs(mg, "withdraw", 1); + list_move(&mg->list, &withdrawn_mounts); + } else { + log_group(mg, "termination of our unmount leave"); + list_del(&mg->list); + free(mg); + } + + return 0; +} + +/* The basic rule of withdraw is that we don't want to tell the kernel to drop + all locks until we know gfs has been stopped/blocked on all nodes. They'll + be stopped for our leave, we just need to know when they've all arrived + there. + + A withdrawing node is very much like a readonly node, differences are + that others recover its journal when they remove it from the group, + and when it's been removed from the group (gets terminate for its leave), + it tells the locally withdrawing gfs to clear out locks. */ + +int do_withdraw(char *table) +{ + struct mountgroup *mg; + char *name = strstr(table, ":") + 1; + + if (no_withdraw) { + log_error("withdraw feature not enabled"); + return 0; + } + + mg = find_mg(name); + if (!mg) { + log_error("do_withdraw no mountgroup %s", name); + return -1; + } + + mg->withdraw = 1; + send_withdraw(mg); return 0; }