From mboxrd@z Thu Jan 1 00:00:00 1970 From: teigland@sourceware.org Date: 6 Oct 2006 15:34:53 -0000 Subject: [Cluster-devel] cluster/group/gfs_controld cpg.c plock.c recover.c Message-ID: <20061006153453.9947.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Changes by: teigland at sourceware.org 2006-10-06 15:34:52 Modified files: group/gfs_controld: cpg.c plock.c recover.c Log message: - check cpg flow control status from openais when processing plocks - handle case where we're mounting and the only other mounted node fails -- we need to become the first mounter if we've not begun mount(2) yet - journal recovery requests need to be fed serially to gfs, we weren't doing that in the case where a gfs journal recovery was in progress when another node failed Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/cpg.c.diff?cvsroot=cluster&r1=1.7&r2=1.8 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/plock.c.diff?cvsroot=cluster&r1=1.20&r2=1.21 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.17&r2=1.18 --- cluster/group/gfs_controld/cpg.c 2006/09/08 22:44:33 1.7 +++ cluster/group/gfs_controld/cpg.c 2006/10/06 15:34:52 1.8 @@ -19,6 +19,7 @@ static int saved_nodeid; static int saved_len; static char saved_data[MAX_MSGLEN]; +int message_flow_control_on; void receive_journals(struct mountgroup *mg, char *buf, int len, int from); void receive_options(struct mountgroup *mg, char *buf, int len, int from); @@ -127,6 +128,7 @@ int process_cpg(void) { + cpg_flow_control_state_t flow_control_state; cpg_error_t error; got_msg = 0; @@ -142,6 +144,22 @@ if (got_msg) do_deliver(saved_nodeid, saved_data, saved_len); + + error = cpg_flow_control_state_get(daemon_handle, &flow_control_state); + if (error != CPG_OK) { + log_error("cpg_flow_control_state_get %d", error); + return -1; + } + + if (flow_control_state == CPG_FLOW_CONTROL_ENABLED) { + message_flow_control_on = 1; + log_debug("flow control on"); + } else { + if (message_flow_control_on) + log_debug("flow control off"); + message_flow_control_on = 0; + } + return 0; } --- cluster/group/gfs_controld/plock.c 2006/08/31 19:13:02 1.20 +++ cluster/group/gfs_controld/plock.c 2006/10/06 15:34:52 1.21 @@ -46,6 +46,7 @@ static int control_fd = -1; extern int our_nodeid; static int plocks_online = 0; +extern int message_flow_control_on; static SaCkptHandleT ckpt_handle; static SaCkptCallbacksT callbacks = { 0, 0 }; @@ -297,6 +298,10 @@ char *buf; int len, rv; + /* Don't send more messages while the cpg message queue is backed up */ + if (message_flow_control_on) + return 0; + memset(&info, 0, sizeof(info)); rv = read(control_fd, &info, sizeof(info)); --- cluster/group/gfs_controld/recover.c 2006/08/31 18:56:25 1.17 +++ cluster/group/gfs_controld/recover.c 2006/10/06 15:34:52 1.18 @@ -27,6 +27,7 @@ void start_participant_init_2(struct mountgroup *mg); void start_spectator_init_2(struct mountgroup *mg); void start_spectator_2(struct mountgroup *mg); +void notify_mount_client(struct mountgroup *mg); int set_sysfs(struct mountgroup *mg, char *field, int val) { @@ -1062,6 +1063,41 @@ unlink_checkpoint(mg); resend_journals(mg); } + + /* Tricky situation when we're mounting and the failed node was + the only other node that had the fs mounted. 
If the failed node + didn't send us a journals message, we need to: unlink ckpt, pick a + journal for ourselves, act like the first mounter of the fs (do + first-mounter-recovery, the dead node may have been mounting itself + and not finished first-mounter-recovery). */ + + else if (neg && mg->memb_count == 1) { + if (!mg->got_our_journals) { + log_group(mg, "we are left alone, act first mounter"); + + unlink_checkpoint(mg); + memb = find_memb_nodeid(mg, our_nodeid); + memb->jid = 0; + mg->our_jid = 0; + mg->first_mounter = 1; + mg->first_mounter_done = 0; + mg->got_our_options = 1; + mg->got_our_journals = 1; + mg->mount_client_delay = 0; + notify_mount_client(mg); + } else if (mg->mount_client_notified && !mg->got_kernel_mount) { + + /* FIXME */ + + log_group(mg, "FIXME: case not handled"); + + /* we got journals message from other node before it + died which means it finished first mounter recovery, + but we now need to tell gfs to recover the journal + after our own mount(2) completes */ + + } + } } struct mountgroup *create_mg(char *name) @@ -1121,7 +1157,7 @@ static int we_are_in_fence_domain(void) { group_data_t data; - int i, rv; + int rv; memset(&data, 0, sizeof(data)); @@ -1130,11 +1166,8 @@ if (rv || strcmp(data.client_name, "fence")) return 0; - for (i = 0; i < data.member_count; i++) { - if (data.members[i] == our_nodeid) - return 1; - } - + if (data.member == 1) + return 1; return 0; } @@ -1304,8 +1337,16 @@ the problem we're trying to avoid here is telling gfs-kernel to do recovery when it can't for some reason and then waiting forever for - a recovery_done signal that will never arrive. */ + a recovery_done signal that will never arrive. + FIXME: we want to do more here to avoid telling gfs-kernel to do recovery + until our mount is really complete. I want to keep the join/mount + connection between mount.gfs and gfs_controld open throughout the mount + and have mount.gfs use it to return the result from mount(2). Then we'll + know when the mount(2) is done and we should also be able to remove the + special mount_error_fd since errors can be sent back through the original + connection as well. 
*/ + void recover_journals(struct mountgroup *mg) { struct mg_member *memb; @@ -1318,24 +1359,25 @@ mg->kernel_mount_error || !mg->mount_client_notified || !mg->got_kernel_mount) { + log_group(mg, "recover_journals: unable %d,%d,%d,%d,%d,%d,%d", + mg->spectator, + mg->readonly, + mg->withdraw, + mg->our_jid, + mg->kernel_mount_error, + mg->mount_client_notified, + mg->got_kernel_mount); list_for_each_entry(memb, &mg->members_gone, list) { - if (!memb->tell_gfs_to_recover) - continue; - - log_group(mg, "recover journal %d nodeid %d skip: " - "%d %d %d %d %d %d %d", - memb->jid, memb->nodeid, - mg->spectator, - mg->readonly, - mg->withdraw, - mg->our_jid, - mg->kernel_mount_error, - mg->mount_client_notified, - mg->got_kernel_mount); - - memb->tell_gfs_to_recover = 0; - memb->local_recovery_status = RS_READONLY; + log_group(mg, "member gone %d jid %d " + "tell_gfs_to_recover %d", + memb->nodeid, memb->jid, + memb->tell_gfs_to_recover); + + if (memb->tell_gfs_to_recover) { + memb->tell_gfs_to_recover = 0; + memb->local_recovery_status = RS_READONLY; + } } start_done(mg); return; @@ -1346,6 +1388,15 @@ through the single recovery_done sysfs file */ list_for_each_entry(memb, &mg->members_gone, list) { + if (memb->wait_gfs_recover_done) { + log_group(mg, "delay new gfs recovery, " + "wait_gfs_recover_done for nodeid %d jid %d", + memb->nodeid, memb->jid); + return; + } + } + + list_for_each_entry(memb, &mg->members_gone, list) { if (!memb->tell_gfs_to_recover) continue; @@ -1416,6 +1467,17 @@ return 0; } +int need_kernel_recovery_done(struct mountgroup *mg) +{ + struct mg_member *memb; + + list_for_each_entry(memb, &mg->members_gone, list) { + if (memb->wait_gfs_recover_done) + return 1; + } + return 0; +} + /* Note: when a readonly node fails we do consider its journal (and the fs) to need recovery... not sure this is really necessary, but the readonly node did "own" a journal so it seems proper to recover @@ -1500,19 +1562,13 @@ log_group(mg, "recovery_done jid %d nodeid %d %s", memb->jid, memb->nodeid, ss); - out: - recover_journals(mg); - return 0; -} -int need_kernel_recovery_done(struct mountgroup *mg) -{ - struct mg_member *memb; + /* sanity check */ + if (need_kernel_recovery_done(mg)) + log_error("recovery_done: should be no pending gfs recoveries"); - list_for_each_entry(memb, &mg->members_gone, list) { - if (memb->wait_gfs_recover_done) - return 1; - } + out: + recover_journals(mg); return 0; }
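
A note on the cpg flow-control change, for readers outside the gfs_controld tree: the pattern is simply to poll openais for its flow-control state once process_cpg() has dispatched pending messages, cache the result in a flag, and have the plock path stop pulling requests from the kernel while the flag is set, so nothing new is multicast until openais drains its queue.  A minimal standalone sketch of that pattern follows; the cpg types and calls are the ones used in the patch above, while daemon_handle and the two helper names are illustrative stand-ins, not the committed code.

#include <openais/cpg.h>	/* header location varies across openais/corosync versions */

extern cpg_handle_t daemon_handle;	/* assumed set up elsewhere via cpg_initialize()/cpg_join() */
int message_flow_control_on;

/* Ask openais whether its cpg message queue is backed up and cache the
   answer; meant to run on each pass through the daemon's poll loop,
   after pending cpg messages have been dispatched and delivered. */
static int update_flow_control_state(void)
{
	cpg_flow_control_state_t state;
	cpg_error_t error;

	error = cpg_flow_control_state_get(daemon_handle, &state);
	if (error != CPG_OK)
		return -1;

	message_flow_control_on = (state == CPG_FLOW_CONTROL_ENABLED);
	return 0;
}

/* Plock-sending side: while flow control is on, don't read the next
   request from the kernel; it stays queued there and is picked up on a
   later pass once openais has drained its queue. */
static void send_next_plock_message(void)
{
	if (message_flow_control_on)
		return;

	/* ... read one request and cpg_mcast_joined() it to the group ... */
}

Leaving unread requests queued in the kernel rather than buffering them in the daemon keeps the back-pressure handling trivial: there is nothing to store or retransmit, the next poll simply tries again.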