From mboxrd@z Thu Jan  1 00:00:00 1970
From: teigland@sourceware.org <teigland@sourceware.org>
Date: 15 Jun 2006 20:41:47 -0000
Subject: [Cluster-devel] cluster/group/gfs_controld Makefile cpg.c grou ...
Message-ID: <20060615204147.14251.qmail@sourceware.org>
List-Id: <cluster-devel.redhat.com>
To: cluster-devel.redhat.com
MIME-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	teigland at sourceware.org	2006-06-15 20:41:46

Modified files:
	group/gfs_controld: Makefile cpg.c group.c lock_dlm.h main.c 
	                    member_cman.c recover.c 

Log message:
	Complete the code to support withdraw; not yet tested.  This also
	switches from using dlm locks for withdraw notifications to simply
	using messages.  The way the daemon now works allows a much simpler
	approach to withdraw than what we had before, where we needed the
	dlm locks.  Setting up a dlm lockspace for the daemon was also an
	annoyingly heavy-weight step, and the dlm kernel state of the daemon
	made cleaning up from crashes difficult.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/Makefile.diff?cvsroot=cluster&r1=1.1&r2=1.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/cpg.c.diff?cvsroot=cluster&r1=1.2&r2=1.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/group.c.diff?cvsroot=cluster&r1=1.1&r2=1.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/lock_dlm.h.diff?cvsroot=cluster&r1=1.2&r2=1.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/main.c.diff?cvsroot=cluster&r1=1.2&r2=1.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/member_cman.c.diff?cvsroot=cluster&r1=1.1&r2=1.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.2&r2=1.3

--- cluster/group/gfs_controld/Makefile	2006/06/09 20:59:57	1.1
+++ cluster/group/gfs_controld/Makefile	2006/06/15 20:41:46	1.2
@@ -22,8 +22,7 @@
 	-I../include/ \
 	-I../lib/ \
 	-I../../cman/lib/ \
-	-I../../cman/daemon/openais/trunk/include/ \
-	-I../../dlm/lib/
+	-I../../cman/daemon/openais/trunk/include/
 
 TARGET=gfs_controld
 
@@ -38,8 +37,6 @@
 		group.o \
 		plock.o \
 		recover.o \
-		withdraw.o \
-		../../dlm/lib/libdlm_lt.a \
 		../../cman/lib/libcman.a \
 		../../cman/daemon/openais/trunk/lib/libcpg.a \
 		../lib/libgroup.a
--- cluster/group/gfs_controld/cpg.c	2006/06/15 15:27:43	1.2
+++ cluster/group/gfs_controld/cpg.c	2006/06/15 20:41:46	1.3
@@ -24,6 +24,7 @@
 void receive_options(struct mountgroup *mg, char *buf, int len, int from);
 void receive_remount(struct mountgroup *mg, char *buf, int len, int from);
 void receive_plock(struct mountgroup *mg, char *buf, int len, int from);
+void receive_withdraw(struct mountgroup *mg, char *buf, int len, int from);
 void receive_recovery_status(struct mountgroup *mg, char *buf, int len,
 			     int from);
 void receive_recovery_done(struct mountgroup *mg, char *buf, int len, int from);
@@ -88,6 +89,10 @@
 		receive_recovery_done(mg, data, len, nodeid);
 		break;
 
+	case MSG_WITHDRAW:
+		receive_withdraw(mg, data, len, nodeid);
+		break;
+
 	default:
 		log_error("unknown message type %d from %d",
 			  hd->type, hd->nodeid);
--- cluster/group/gfs_controld/group.c	2006/06/09 20:59:57	1.1
+++ cluster/group/gfs_controld/group.c	2006/06/15 20:41:46	1.2
@@ -147,8 +147,6 @@
 		log_debug("groupd callback: terminate %s", cb_name);
 		mg->last_callback = DO_TERMINATE;
 		do_terminate(mg);
-		list_del(&mg->list);
-		free(mg);
 		break;
 
 	case DO_SETID:
--- cluster/group/gfs_controld/lock_dlm.h	2006/06/15 15:27:43	1.2
+++ cluster/group/gfs_controld/lock_dlm.h	2006/06/15 20:41:46	1.3
@@ -36,7 +36,6 @@
 #include "list.h"
 #include "linux_endian.h"
 #include "libgroup.h"
-#include "libdlm.h"
 
 #define MAXARGS			64
 #define MAXLINE			256
@@ -184,11 +183,10 @@
 	int			wait_gfs_recover_done;
 	int			gone_event;
 	int			gone_type;
-	int			mount_finished;
+	int			finished;
 	int			local_recovery_status;
 	int			recovery_status;
-	int			withdraw;
-	struct dlm_lksb		wd_lksb;
+	int			withdrawing;
 	int			needs_journals;
 };
 
@@ -197,6 +195,7 @@
 	MSG_OPTIONS,
 	MSG_REMOUNT,
 	MSG_PLOCK,
+	MSG_WITHDRAW,
 	MSG_RECOVERY_STATUS,
 	MSG_RECOVERY_DONE,
 };
@@ -223,12 +222,9 @@
 int process_cpg(void);
 int setup_groupd(void);
 int process_groupd(void);
-int setup_libdlm(void);
-int process_libdlm(void);
 int setup_plocks(void);
 int process_plocks(void);
 void exit_cman(void);
-void exit_libdlm(void);
 
 int do_mount(int ci, char *dir, char *type, char *proto, char *table,
 	     char *options);
--- cluster/group/gfs_controld/main.c	2006/06/15 15:27:43	1.2
+++ cluster/group/gfs_controld/main.c	2006/06/15 20:41:46	1.3
@@ -29,10 +29,10 @@
 static int listen_fd;
 static int groupd_fd;
 static int uevent_fd;
-static int libdlm_fd;
 static int plocks_fd;
 
 extern struct list_head mounts;
+extern struct list_head withdrawn_mounts;
 int no_withdraw;
 
 static void make_args(char *buf, int *argc, char **argv, char sep)
@@ -266,14 +266,6 @@
 		goto out;
 	client_add(uevent_fd, &maxi);
 
-	if (no_withdraw)
-		goto next;
-
-	rv = libdlm_fd = setup_libdlm();
-	if (rv < 0)
-		goto next;
-	client_add(libdlm_fd, &maxi);
- next:
 	rv = plocks_fd = setup_plocks();
 	if (rv < 0)
 		goto out;
@@ -309,9 +301,6 @@
 					process_cpg();
 				else if (pollfd[i].fd == uevent_fd)
 					process_uevent();
-				else if (!no_withdraw &&
-					 pollfd[i].fd == libdlm_fd)
-					process_libdlm();
 				else if (pollfd[i].fd == plocks_fd)
 					process_plocks();
 				else
@@ -456,6 +445,7 @@
 {
 	prog_name = argv[0];
 	INIT_LIST_HEAD(&mounts);
+	INIT_LIST_HEAD(&withdrawn_mounts);
 	client_init();
 
 	decode_arguments(argc, argv);
--- cluster/group/gfs_controld/member_cman.c	2006/06/09 20:59:57	1.1
+++ cluster/group/gfs_controld/member_cman.c	2006/06/15 20:41:46	1.2
@@ -43,10 +43,7 @@
 
 void exit_cman(void)
 {
-	/* do we want to try to forcibly clean some stuff up
-	   in the kernel here? */
 	log_error("cluster is down, exiting");
-	exit_libdlm();
 	exit(1);
 }
 
--- cluster/group/gfs_controld/recover.c	2006/06/15 15:27:43	1.2
+++ cluster/group/gfs_controld/recover.c	2006/06/15 20:41:46	1.3
@@ -26,14 +26,12 @@
 extern char *clustername;
 extern int our_nodeid;
 extern group_handle_t gh;
+extern int no_withdraw;
 
 struct list_head mounts;
+struct list_head withdrawn_mounts;
 
 void send_journals(struct mountgroup *mg, int nodeid);
-int hold_withdraw_locks(struct mountgroup *mg);
-void release_withdraw_lock(struct mountgroup *mg, struct mg_member *memb);
-void release_withdraw_locks(struct mountgroup *mg);
-
 void start_participant_init_2(struct mountgroup *mg);
 void start_spectator_init_2(struct mountgroup *mg);
 void start_spectator_2(struct mountgroup *mg);
@@ -146,6 +144,46 @@
 	mg->remount_client = 0;
 }
 
+void send_withdraw(struct mountgroup *mg)
+{
+	struct gdlm_header *hd;
+	int len;
+	char *buf;
+
+	len = sizeof(struct gdlm_header);
+
+	buf = malloc(len);
+	if (!buf)
+		return;
+	memset(buf, 0, len);
+
+	hd = (struct gdlm_header *)buf;
+	hd->type = MSG_WITHDRAW;
+	hd->nodeid = our_nodeid;
+	hd->to_nodeid = 0;
+
+	log_group(mg, "send_withdraw");
+
+	send_group_message(mg, len, buf);
+
+	free(buf);
+}
+
+void receive_withdraw(struct mountgroup *mg, char *buf, int len, int from)
+{
+	struct mg_member *memb;
+
+	memb = find_memb_nodeid(mg, from);
+	if (!memb) {
+		log_group(mg, "receive_withdraw no member %d", from);
+		return;
+	}
+	memb->withdrawing = 1;
+
+	if (from == our_nodeid)
+		group_leave(gh, mg->name);
+}
+
 #define SEND_RS_INTS 3
 
 void send_recovery_status(struct mountgroup *mg)
@@ -267,6 +305,8 @@
 		return "MSG_RECOVERY_STATUS";
 	case MSG_RECOVERY_DONE:
 		return "MSG_RECOVERY_DONE";
+	case MSG_WITHDRAW:
+		return "MSG_WITHDRAW";
 	}
 	return "unknown";
 }
@@ -911,7 +951,7 @@
 			   - no journal cb if we've already done a journl cb */
 
 			if ((memb->gone_type == GROUP_NODE_FAILED ||
-			    memb->withdraw) &&
+			    memb->withdrawing) &&
 			    memb->jid != JID_INIT &&
 			    !memb->spectator &&
 			    !memb->wait_gfs_recover_done) {
@@ -925,7 +965,7 @@
 				  memb->nodeid, memb->tell_gfs_to_recover,
 				  mg->spectator,
 				  mg->start_type,
-				  memb->withdraw,
+				  memb->withdrawing,
 				  memb->jid,
 				  memb->spectator,
 				  memb->wait_gfs_recover_done);
@@ -944,7 +984,7 @@
 	}
 
 	list_for_each_entry(memb, &mg->members, list) {
-		if (!memb->mount_finished)
+		if (!memb->finished)
 			continue;
 		if (low == -1 || memb->nodeid < low)
 			low = memb->nodeid;
@@ -1186,7 +1226,12 @@
 	struct mg_member *memb;
 	int rv;
 
-	if (mg->spectator || mg->readonly || mg->our_jid == JID_INIT) {
+	/* we can't do journal recovery if: we're a spectator or readonly
+	   mount, gfs is currently withdrawing, or we're mounting and haven't
+	   received a journals message yet */
+
+	if (mg->spectator || mg->readonly || mg->withdraw ||
+	    mg->our_jid == JID_INIT) {
 		list_for_each_entry(memb, &mg->members_gone, list) {
 			if (!memb->tell_gfs_to_recover)
 				continue;
@@ -1406,11 +1451,25 @@
 {
 	struct mountgroup *mg;
 
+	list_for_each_entry(mg, &withdrawn_mounts, list) {
+		if (!strcmp(mg->dir, dir)) {
+			log_group(mg, "unmount withdrawn fs");
+			list_del(&mg->list);
+			free(mg);
+			return 0;
+		}
+	}
+
 	mg = find_mg_dir(dir);
 	if (!mg) {
 		log_error("do_unmount: unknown mount dir %s", dir);
 		return -1;
 	}
+
+	if (mg->withdraw) {
+		log_error("do_unmount: fs on %s is withdrawing", dir);
+		return -1;
+	}
 	
 	/* Check to see if we're waiting for a kernel recovery_done to do a
 	   start_done().  If so, call the start_done() here because we won't be
@@ -1567,9 +1626,6 @@
 	   from members_gone if their journals have been recovered */
 
 	list_for_each_entry_safe(memb, safe, &mg->members_gone, list) {
-		if (!memb->withdraw)
-			release_withdraw_lock(mg, memb);
-
 		if (!memb->recovery_status) {
 			list_del(&memb->list);
 			free(memb);
@@ -1588,18 +1644,8 @@
 		}
 	}
 
-	list_for_each_entry(memb, &mg->members, list) {
-		memb->mount_finished = 1;
-
-		/* If there are still withdrawing nodes that haven't left
-		   the group, we need to keep lock requests blocked */
-
-		if (memb->withdraw) {
-			log_group(mg, "finish: leave locks blocked for "
-				  "withdrawing node %d", memb->nodeid);
-			leave_blocked = 1;
-		}
-	}
+	list_for_each_entry(memb, &mg->members, list)
+		memb->finished = 1;
 
 	if (mg->needs_recovery) {
 		log_group(mg, "finish: leave locks blocked for needs_recovery");
@@ -1674,7 +1720,6 @@
 		mg->first_mounter_done = 0;
 		mg->got_our_options = 1;
 		mg->got_our_journals = 1;
-		hold_withdraw_locks(mg);
 	}
 	start_done(mg);
 	notify_mount_client(mg);
@@ -1688,7 +1733,6 @@
 	log_group(mg, "start_participant_init");
 	set_our_memb_options(mg);
 	send_options(mg);
-	hold_withdraw_locks(mg);
 	start_done(mg);
 	mg->start2_fn = start_participant_init_2;
 }
@@ -1732,8 +1776,6 @@
 	log_group(mg, "start_participant pos=%d neg=%d", pos, neg);
 
 	if (pos) {
-		hold_withdraw_locks(mg);
-
 		/* If we're the first mounter, and we're adding a second
 		   node here, but haven't gotten first_done (others_may_mount)
 		   from gfs yet, then don't do the start_done() to complete
@@ -1765,7 +1807,6 @@
 	log_group(mg, "start_spectator_init");
 	set_our_memb_options(mg);
 	send_options(mg);
-	hold_withdraw_locks(mg);
 	start_done(mg);
 	mg->start2_fn = start_spectator_init_2;
 }
@@ -1795,7 +1836,6 @@
 	log_group(mg, "start_spectator pos=%d neg=%d", pos, neg);
 
 	if (pos) {
-		hold_withdraw_locks(mg);
 		start_done(mg);
 		process_saved_options(mg);
 	} else if (neg) {
@@ -1937,12 +1977,57 @@
   that needs journal recovery, we have a problem because we wait to
   call group_start_done() until gfs in the kernel to signal that
   the journal recovery is done.  If we've unmounted gfs isn't there
-  any more to give us this signal and we'll never call start_done. */
+  any more to give us this signal and we'll never call start_done.
+ 
+  update: we should be dealing with all these issues correctly now. */
 
 int do_terminate(struct mountgroup *mg)
 {
-	log_group(mg, "termination of our unmount leave");
-	release_withdraw_locks(mg);
+	/* FIXME: not all group members are guaranteed to be stopped for
+	   our leave by the time we get terminate.  We need that guarantee
+	   before we tell a withdrawing gfs to drop locks. */
+
+	if (mg->withdraw) {
+		log_group(mg, "termination of our withdraw leave");
+		set_sysfs(mg, "withdraw", 1);
+		list_move(&mg->list, &withdrawn_mounts);
+	} else {
+		log_group(mg, "termination of our unmount leave");
+		list_del(&mg->list);
+		free(mg);
+	}
+
+	return 0;
+}
+
+/* The basic rule of withdraw is that we don't want to tell the kernel to drop
+   all locks until we know gfs has been stopped/blocked on all nodes.  They'll
+   be stopped for our leave; we just need to know when they've all arrived
+   there.
+
+   A withdrawing node is very much like a readonly node; the differences are
+   that others recover its journal when they remove it from the group,
+   and when it's been removed from the group (gets terminate for its leave),
+   it tells the locally withdrawing gfs to clear out locks. */
+
+int do_withdraw(char *table)
+{
+	struct mountgroup *mg;
+	char *name = strstr(table, ":") + 1;
+
+	if (no_withdraw) {
+		log_error("withdraw feature not enabled");
+		return 0;
+	}
+
+	mg = find_mg(name);
+	if (!mg) {
+		log_error("do_withdraw no mountgroup %s", name);
+		return -1;
+	}
+
+	mg->withdraw = 1;
+	send_withdraw(mg);
 	return 0;
 }