cluster-devel.redhat.com archive mirror
 help / color / mirror / Atom feed
* [Cluster-devel] cluster/group/daemon app.c cpg.c gd_internal.h ...
@ 2006-09-26 19:17 teigland
  0 siblings, 0 replies; 3+ messages in thread
From: teigland @ 2006-09-26 19:17 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	teigland at sourceware.org	2006-09-26 19:17:21

Modified files:
	group/daemon   : app.c cpg.c gd_internal.h joinleave.c main.c 

Log message:
	Add debugging in four areas to help us know more quickly when something
	might be wrong at the cpg level:
	- log if cpg flow control goes on
	- log when we're waiting to receive a cpg event for our own join
	- when we're in a FOO_STOP_WAIT or FOO_START_WAIT state, log how
	many more cpg messages we're waiting to receive before moving on
	to the next state
	- save the event id of the last cpg message we sent, and clear that
	value when we receive that message back (this value is printed to
	the debug log when someone runs group_tool, not shown in the
	group_tool output)

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/app.c.diff?cvsroot=cluster&r1=1.45&r2=1.46
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/cpg.c.diff?cvsroot=cluster&r1=1.31&r2=1.32
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/gd_internal.h.diff?cvsroot=cluster&r1=1.40&r2=1.41
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/joinleave.c.diff?cvsroot=cluster&r1=1.17&r2=1.18
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/main.c.diff?cvsroot=cluster&r1=1.44&r2=1.45

--- cluster/group/daemon/app.c	2006/09/15 20:07:15	1.45
+++ cluster/group/daemon/app.c	2006/09/26 19:17:20	1.46
@@ -692,6 +692,7 @@
 	msg_bswap_out(&msg);
 
 	log_group(g, "send stopped");
+	g->app->sent_event_id = ev->id;
 	return send_message_groupd(g, &msg, sizeof(msg));
 }
 
@@ -710,6 +711,7 @@
 	msg_bswap_out(&msg);
 
 	log_group(g, "send started");
+	g->app->sent_event_id = ev->id;
 	return send_message_groupd(g, &msg, sizeof(msg));
 }
 
@@ -788,7 +790,6 @@
 	}
 }
 
-#if 0
 static int count_nodes_not_stopped(app_t *a)
 {
 	node_t *node;
@@ -800,7 +801,6 @@
 	}
 	return i;
 }
-#endif
 
 int event_state_begin(app_t *a)
 {
@@ -853,7 +853,7 @@
 	event_t *ev = a->current_event;
 	node_t *node, *n;
 	struct nodeid *id;
-	int rv = 0, do_start = 0;
+	int rv = 0, do_start = 0, count;
 
 	if (!(event_state_stopping(a) || event_state_starting(a)))
 		log_group(g, "process_current_event %llx %d %s",
@@ -904,10 +904,9 @@
 		break;
 
 	case EST_JOIN_STOP_WAIT:
-		/*
 		count = count_nodes_not_stopped(a);
-		log_group(g, "waiting for %d more nodes to be stopped", count);
-		*/
+		log_group(g, "waiting for %d more stopped messages "
+			  "before JOIN_ALL_STOPPED", count);
 		break;
 
 	case EST_JOIN_ALL_STOPPED:
@@ -939,10 +938,9 @@
 		break;
 
 	case EST_LEAVE_STOP_WAIT:
-		/*
 		count = count_nodes_not_stopped(a);
-		log_group(g, "waiting for %d more nodes to be stopped", count);
-		*/
+		log_group(g, "waiting for %d more stopped messages "
+			  "before LEAVE_ALL_STOPPED", count);
 		break;
 
 	case EST_LEAVE_ALL_STOPPED:
@@ -993,10 +991,9 @@
 		break;
 
 	case EST_FAIL_STOP_WAIT:
-		/*
 		count = count_nodes_not_stopped(a);
-		log_group(g, "waiting for %d more nodes to be stopped", count);
-		*/
+		log_group(g, "waiting for %d more stopped messages "
+			  "before FAIL_ALL_STOPPED", count);
 		break;
 
 	case EST_FAIL_ALL_STOPPED:
@@ -1470,8 +1467,11 @@
 		}
 
 		if (ev) {
+			a->need_first_event = 0;
 			a->current_event = ev;
 			rv = process_current_event(g);
+		} else if (a->need_first_event) {
+			log_group(g, "waiting for our own cpg join event");
 		}
 	}
  out:
--- cluster/group/daemon/cpg.c	2006/09/08 23:14:56	1.31
+++ cluster/group/daemon/cpg.c	2006/09/26 19:17:20	1.32
@@ -20,6 +20,7 @@
 static int			saved_left_count;
 static cpg_handle_t		saved_handle;
 static struct cpg_name		saved_name;
+static int			message_flow_control_on;
 
 
 static node_t *find_group_node(group_t *g, int nodeid)
@@ -246,6 +247,9 @@
 		  msg_type(msg->ms_type));
 	*/
 
+	if (nodeid == our_nodeid && g->app->sent_event_id == msg->ms_event_id)
+		g->app->sent_event_id = 0;
+
 	save = malloc(sizeof(struct save_msg));
 	memset(save, 0, sizeof(struct save_msg));
 	save->nodeid = nodeid;
@@ -375,6 +379,7 @@
 	cpg_error_t error;
 	cpg_handle_t handle;
 	int found = 0;
+	cpg_flow_control_state_t flow_control_state;
 
 	if (ci == groupd_ci) {
 		handle = groupd_handle;
@@ -404,6 +409,18 @@
 		return;
 	}
 
+	error = cpg_flow_control_state_get(handle, &flow_control_state);
+	if (error != CPG_OK)
+		log_error(g, "cpg_flow_control_state_get %d", error);
+	else if (flow_control_state == CPG_FLOW_CONTROL_ENABLED) {
+		message_flow_control_on = 1;
+		log_debug("flow control on");
+	} else {
+		if (message_flow_control_on)
+			log_debug("flow control off");
+		message_flow_control_on = 0;
+	}
+
 	if (got_confchg)
 		process_confchg();
 }
--- cluster/group/daemon/gd_internal.h	2006/09/15 18:20:36	1.40
+++ cluster/group/daemon/gd_internal.h	2006/09/26 19:17:21	1.41
@@ -180,6 +180,8 @@
 	struct list_head	events;
 	event_t			*current_event;
 	group_t			*g;
+	uint64_t		sent_event_id; /* for debugging */
+	int			need_first_event; /* for debugging */
 };
 
 #define MSG_APP_STOPPED        1
--- cluster/group/daemon/joinleave.c	2006/06/28 22:16:36	1.17
+++ cluster/group/daemon/joinleave.c	2006/09/26 19:17:21	1.18
@@ -63,6 +63,7 @@
 	a = malloc(sizeof(app_t));
 	memset(a, 0, sizeof(app_t));
 
+	a->need_first_event = 1;
 	INIT_LIST_HEAD(&a->nodes);
 	INIT_LIST_HEAD(&a->events);
 	a->g = g;
--- cluster/group/daemon/main.c	2006/09/15 18:20:36	1.44
+++ cluster/group/daemon/main.c	2006/09/26 19:17:21	1.45
@@ -462,6 +462,10 @@
 			else
 				data->event_local_status = -1;
 		}
+
+		if (g->app->sent_event_id)
+			log_group(g, "sent_event_id %llx",
+				  g->app->sent_event_id);
 	}
 
 	data->member_count = g->app->node_count;



^ permalink raw reply	[flat|nested] 3+ messages in thread

* [Cluster-devel] cluster/group/daemon app.c cpg.c gd_internal.h ...
@ 2006-09-26 21:32 teigland
  0 siblings, 0 replies; 3+ messages in thread
From: teigland @ 2006-09-26 21:32 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	teigland at sourceware.org	2006-09-26 21:32:38

Modified files:
	group/daemon   : app.c cpg.c gd_internal.h main.c 

Log message:
	Adding -vv to the groupd command line will result in a log_debug
	for every cpg send and every cpg recv.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/app.c.diff?cvsroot=cluster&r1=1.46&r2=1.47
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/cpg.c.diff?cvsroot=cluster&r1=1.32&r2=1.33
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/gd_internal.h.diff?cvsroot=cluster&r1=1.41&r2=1.42
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/main.c.diff?cvsroot=cluster&r1=1.45&r2=1.46

--- cluster/group/daemon/app.c	2006/09/26 19:17:20	1.46
+++ cluster/group/daemon/app.c	2006/09/26 21:32:38	1.47
@@ -693,7 +693,7 @@
 
 	log_group(g, "send stopped");
 	g->app->sent_event_id = ev->id;
-	return send_message_groupd(g, &msg, sizeof(msg));
+	return send_message_groupd(g, &msg, sizeof(msg), MSG_APP_STOPPED);
 }
 
 static int send_started(group_t *g)
@@ -712,7 +712,7 @@
 
 	log_group(g, "send started");
 	g->app->sent_event_id = ev->id;
-	return send_message_groupd(g, &msg, sizeof(msg));
+	return send_message_groupd(g, &msg, sizeof(msg), MSG_APP_STARTED);
 }
 
 int do_stopdone(char *name, int level)
--- cluster/group/daemon/cpg.c	2006/09/26 19:17:20	1.32
+++ cluster/group/daemon/cpg.c	2006/09/26 21:32:38	1.33
@@ -226,8 +226,15 @@
 		memcpy(&name, &msg->ms_name, MAX_NAMELEN);
 
 		g = find_group_level(name, msg->ms_level);
-		if (!g)
+		if (!g) {
+			if (groupd_debug_verbose > 1) {
+				log_print("%d:%s RECV len %d %s from %d, "
+					  "no group",
+				  	  msg->ms_level, name, data_len,
+				  	  msg_type(msg->ms_type), nodeid);
+			}
 			return;
+		}
 	} else {
 		g = find_group_by_handle(handle);
 		if (!g) {
@@ -242,10 +249,9 @@
 		}
 	}
 
-	/*
-	log_group(g, "deliver_cb from %d len %d type %s", nodeid, data_len,
-		  msg_type(msg->ms_type));
-	*/
+	if (groupd_debug_verbose > 1)
+		log_group(g, "RECV len %d %s from %d", data_len,
+			  msg_type(msg->ms_type), nodeid);
 
 	if (nodeid == our_nodeid && g->app->sent_event_id == msg->ms_event_id)
 		g->app->sent_event_id = 0;
@@ -557,8 +563,11 @@
 	return 0;
 }
 
-int send_message_groupd(group_t *g, void *buf, int len)
+int send_message_groupd(group_t *g, void *buf, int len, int type)
 {
+	if (groupd_debug_verbose > 1)
+		log_group(g, "SEND len %d %s", len, msg_type(type));
+
 	return _send_message(groupd_handle, g, buf, len);
 }
 
--- cluster/group/daemon/gd_internal.h	2006/09/26 19:17:21	1.41
+++ cluster/group/daemon/gd_internal.h	2006/09/26 21:32:38	1.42
@@ -51,6 +51,7 @@
 
 extern char *prog_name;
 extern int groupd_debug_opt;
+extern int groupd_debug_verbose;
 extern char groupd_debug_buf[256];
 extern char dump_buf[DUMP_SIZE];
 extern int dump_point;
@@ -276,7 +277,7 @@
 int do_cpg_join(group_t *g);
 int do_cpg_leave(group_t *g);
 int send_message(group_t *g, void *buf, int len);
-int send_message_groupd(group_t *g, void *buf, int len);
+int send_message_groupd(group_t *g, void *buf, int len, int type);
 void copy_groupd_data(group_data_t *data);
 int in_groupd_cpg(int nodeid);
 
--- cluster/group/daemon/main.c	2006/09/26 19:17:21	1.45
+++ cluster/group/daemon/main.c	2006/09/26 21:32:38	1.46
@@ -12,7 +12,7 @@
 
 #include "gd_internal.h"
 
-#define OPTION_STRING			"DhV"
+#define OPTION_STRING			"DhVv"
 #define LOCKFILE_NAME			"/var/run/groupd.pid"
 
 extern struct list_head recovery_sets;
@@ -598,7 +598,7 @@
 
 	act = get_action(buf);
 
-	log_debug("got %d bytes from client %d %s", rv, ci, last_action);
+	log_debug("got client %d %s", ci, last_action);
 
 	switch (act) {
 
@@ -850,6 +850,10 @@
 			exit(EXIT_SUCCESS);
 			break;
 
+		case 'v':
+			groupd_debug_verbose++;
+			break;
+
 		case 'V':
 			printf("groupd (built %s %s)\n", __DATE__, __TIME__);
 			/* printf("%s\n", REDHAT_COPYRIGHT); */
@@ -934,6 +938,7 @@
 
 char *prog_name;
 int groupd_debug_opt;
+int groupd_debug_verbose;
 char groupd_debug_buf[256];
 char dump_buf[DUMP_SIZE];
 int dump_point;



^ permalink raw reply	[flat|nested] 3+ messages in thread

* [Cluster-devel] cluster/group/daemon app.c cpg.c gd_internal.h ...
@ 2006-10-06 16:55 teigland
  0 siblings, 0 replies; 3+ messages in thread
From: teigland @ 2006-10-06 16:55 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	teigland at sourceware.org	2006-10-06 16:55:19

Modified files:
	group/daemon   : app.c cpg.c gd_internal.h main.c 

Log message:
	This is a big batch of code that gets us further along the path to
	handling recoveries mixed with joins (gfs mounts).  The test I've
	been using to work on this is inserting a BUG() at the start of
	gfs_lm_get_lock() on six of eight nodes and then mounting on all
	of them in parallel.  We should end up with the two nodes without
	the BUG properly mounted and the six with the BUG properly recovered.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/app.c.diff?cvsroot=cluster&r1=1.48&r2=1.49
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/cpg.c.diff?cvsroot=cluster&r1=1.34&r2=1.35
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/gd_internal.h.diff?cvsroot=cluster&r1=1.43&r2=1.44
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/main.c.diff?cvsroot=cluster&r1=1.47&r2=1.48

--- cluster/group/daemon/app.c	2006/10/04 15:52:24	1.48
+++ cluster/group/daemon/app.c	2006/10/06 16:55:19	1.49
@@ -17,6 +17,8 @@
 		return "stopped";
 	case MSG_APP_STARTED:
 		return "started";
+	case MSG_APP_RECOVER:
+		return "recover";
 	case MSG_APP_INTERNAL:
 		return "internal";
 	}
@@ -125,9 +127,12 @@
 		type = event_id_to_type(save->msg.ms_event_id);
 		node = find_app_node(g->app, nodeid);
 
-		if ((type == 1 && node) || (type != 1 && !node)) {
+		if ((type == 1 && node) || (type != 1 && !node) ||
+		    (save->msg.ms_type == MSG_APP_RECOVER)) {
 
-			if (type == 1)
+			if (save->msg.ms_type == MSG_APP_RECOVER)
+				state_str = "MSG_APP_RECOVER";
+			else if (type == 1)
 				state_str = "EST_JOIN_BEGIN";
 			else if (type == 2)
 				state_str = "EST_LEAVE_BEGIN";
@@ -240,7 +245,7 @@
 	}
 }
 
-void _del_recovery_set(group_t *g, int nodeid)
+void _del_recovery_set(group_t *g, int nodeid, int purge)
 {
 	struct recovery_set *rs, *rs2;
 	struct recovery_entry *re, *re2;
@@ -254,10 +259,17 @@
 
 		list_for_each_entry_safe(re, re2, &rs->entries, list) {
 			if (re->group == g) {
-				re->recovered = 1;
-				log_group(g, "done in recovery set %d",
-					  rs->nodeid);
-				found++;
+				if (purge) {
+					list_del(&re->list);
+					free(re);
+					log_group(g, "purged from rs %d",
+						  rs->nodeid);
+				} else {
+					re->recovered = 1;
+					log_group(g, "done in recovery set %d",
+					  	  rs->nodeid);
+					found++;
+				}
 			} else {
 				if (re->recovered == 0)
 					entries_not_recovered++;
@@ -289,18 +301,18 @@
    one failed nodeid).  Remove this group from recovery sets for those nodeids
    and free any recovery sets that are now completed. */
 
-void del_recovery_set(group_t *g, event_t *ev)
+void del_recovery_set(group_t *g, event_t *ev, int purge)
 {
 	struct nodeid *id;
 
 	log_group(g, "rev %llx done, remove group from rs %d",
 		  ev->id, ev->nodeid);
-	_del_recovery_set(g, ev->nodeid);
+	_del_recovery_set(g, ev->nodeid, purge);
 
 	list_for_each_entry(id, &ev->extended, list) {
 		log_group(g, "rev %llx done, remove group from rs %d",
 			  ev->id, id->nodeid);
-		_del_recovery_set(g, id->nodeid);
+		_del_recovery_set(g, id->nodeid, purge);
 	}
 }
 
@@ -332,7 +344,7 @@
 	return 0;
 }
 
-/* all groups referenced by a recovery set have been stopped on all nodes,
+/* all groups referenced by a recovery set are stopped on all nodes,
    and stopped for recovery */
 
 static int set_is_all_stopped(struct recovery_set *rs, event_t *rev)
@@ -343,6 +355,17 @@
 	list_for_each_entry(re, &rs->entries, list) {
 		ev = re->group->app->current_event;
 
+#if 0
+		/* if we're not in the group yet, skip it */
+		if (ev &&
+		    ev->state == EST_JOIN_STOP_WAIT &&
+		    is_our_join(ev)) {
+			log_group(re->group, "skip all_stopped check for rs %d",
+				  rs->nodeid);
+			continue;
+		}
+#endif
+
 		if (ev &&
 		    is_recovery_event(ev) &&
 		    ev->state == EST_FAIL_ALL_STOPPED)
@@ -434,10 +457,25 @@
 		log_group(g, "lower levels not recovered in rs %d", rs->nodeid);
 		return 0;
 	}
-
 	return 1;
 }
 
+/* We're interested in any unrecovered group at a lower level than g, not
+   just lower groups in the same recovery set. */
+
+static int lower_groups_need_recovery(group_t *g)
+{
+	struct recovery_set *rs;
+
+	list_for_each_entry(rs, &recovery_sets, list) {
+		if (rs_lower_levels_recovered(rs, g->level))
+			continue;
+		log_group(g, "lower group not recovered in rs %d", rs->nodeid);
+		return 1;
+	}
+	return 0;
+}
+
 static int level_is_lowest(struct recovery_set *rs, int level)
 {
 	struct recovery_entry *re;
@@ -692,7 +730,6 @@
 	msg_bswap_out(&msg);
 
 	log_group(g, "send stopped");
-	g->app->sent_event_id = ev->id;
 	return send_message_groupd(g, &msg, sizeof(msg), MSG_APP_STOPPED);
 }
 
@@ -711,10 +748,26 @@
 	msg_bswap_out(&msg);
 
 	log_group(g, "send started");
-	g->app->sent_event_id = ev->id;
 	return send_message_groupd(g, &msg, sizeof(msg), MSG_APP_STARTED);
 }
 
+static int send_recover(group_t *g, event_t *rev)
+{
+	msg_t msg;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.ms_type = MSG_APP_RECOVER;
+	msg.ms_global_id = g->global_id;
+	msg.ms_event_id = rev->id;
+	msg.ms_level = g->level;
+	memcpy(&msg.ms_name, &g->name, MAX_NAMELEN);
+
+	msg_bswap_out(&msg);
+
+	log_group(g, "send recover");
+	return send_message_groupd(g, &msg, sizeof(msg), MSG_APP_RECOVER);
+}
+
 int do_stopdone(char *name, int level)
 {
 	group_t *g;
@@ -906,10 +959,28 @@
 	case EST_JOIN_STOP_WAIT:
 		count = count_nodes_not_stopped(a);
 		log_group(g, "waiting for %d more stopped messages "
-			  "before JOIN_ALL_STOPPED", count);
+			  "before JOIN_ALL_STOPPED %d", count, ev->nodeid);
 		break;
 
 	case EST_JOIN_ALL_STOPPED:
+		if (!cman_quorate) {
+			log_group(g, "wait for quorum before starting app");
+			break;
+		}
+
+		/* We want to move ahead to start here if this ev is to be
+		   started before a pending rev that will abort it.  Once
+		   started, the rev becomes current and stops the app
+		   immediately. */
+
+		if (lower_groups_need_recovery(g) &&
+		    !ev->start_app_before_pending_rev) {
+			log_group(g, "wait for lower_groups_need_recovery "
+				  "before starting app");
+			break;
+		}
+		ev->start_app_before_pending_rev = 0;
+
 		ev->state = EST_JOIN_START_WAIT;
 
 		if (!g->have_set_id) {
@@ -940,7 +1011,7 @@
 	case EST_LEAVE_STOP_WAIT:
 		count = count_nodes_not_stopped(a);
 		log_group(g, "waiting for %d more stopped messages "
-			  "before LEAVE_ALL_STOPPED", count);
+			  "before LEAVE_ALL_STOPPED %d", count, ev->nodeid);
 		break;
 
 	case EST_LEAVE_ALL_STOPPED:
@@ -993,7 +1064,7 @@
 	case EST_FAIL_STOP_WAIT:
 		count = count_nodes_not_stopped(a);
 		log_group(g, "waiting for %d more stopped messages "
-			  "before FAIL_ALL_STOPPED", count);
+			  "before FAIL_ALL_STOPPED %d", count, ev->nodeid);
 		break;
 
 	case EST_FAIL_ALL_STOPPED:
@@ -1058,7 +1129,7 @@
 
 	case EST_FAIL_ALL_STARTED:
 		app_finish(a);
-		del_recovery_set(g, ev);
+		del_recovery_set(g, ev, 0);
 		free_event(ev);
 		a->current_event = NULL;
 		rv = 1;
@@ -1075,6 +1146,14 @@
 	return rv;
 }
 
+static void clear_all_nodes_stopped(app_t *a)
+{
+	node_t *node;
+	log_group(a->g, "clear_all_nodes_stopped");
+	list_for_each_entry(node, &a->nodes, list)
+		node->stopped = 0;
+}
+
 static int mark_node_stopped(app_t *a, int nodeid)
 {
 	node_t *node;
@@ -1169,6 +1248,27 @@
 			continue;
 
 		ev = a->current_event;
+
+		if (save->msg.ms_type == MSG_APP_RECOVER) {
+			if (ev && ev->state == EST_JOIN_STOP_WAIT &&
+			    is_our_join(ev)) {
+				/* keep this msg around for
+				   recover_current_event() to see, it will
+				   be purged later */
+				if (!save->print_ignore) {
+					log_group(g, "rev %llx taken on "
+						  "node %d",
+						  save->msg.ms_event_id,
+						  save->nodeid);
+					save->print_ignore = 1;
+				}
+				continue;
+			} else {
+				goto free_save;
+			}
+		}
+
+
 		if (!ev || ev->id != save->msg.ms_event_id) {
 			if (!save->print_ignore) {
 				log_group(g, "ignore msg from %d id %llx %s",
@@ -1199,7 +1299,7 @@
 			log_group(g, "set global_id %x from %d",
 				  g->global_id, save->nodeid);
 		}
-
+	 free_save:
 		list_del(&save->list);
 		if (save->msg_long)
 			free(save->msg_long);
@@ -1328,6 +1428,7 @@
 	app_t *a = g->app;
 	event_t *ev, *rev;
 	node_t *node;
+	struct save_msg *save;
 	struct nodeid *id, *safe;
 	int rv = 0;
 
@@ -1351,9 +1452,7 @@
 
 		if (ev->state > EST_FAIL_ALL_STOPPED) {
 			ev->state = EST_FAIL_BEGIN;
-			list_for_each_entry(node, &a->nodes, list)
-				node->stopped = 0;
-
+			clear_all_nodes_stopped(a);
 		} else if (event_state_stopping(a)) {
 			mark_node_stopped(a, rev->nodeid);
 			list_for_each_entry(id, &rev->extended, list)
@@ -1365,7 +1464,6 @@
 		list_add(&id->list, &ev->extended);
 		log_group(g, "extend active rev %d with failed node %d",
 			  ev->nodeid, rev->nodeid);
-
 		list_for_each_entry_safe(id, safe, &rev->extended, list) {
 			list_del(&id->list);
 			list_add(&id->list, &ev->extended);
@@ -1373,11 +1471,106 @@
 				  ev->nodeid, id->nodeid);
 		}
 
+		send_recover(g, rev);
 		list_del(&rev->list);
 		free_event(rev);
 		return 1;
 	}
 
+	/* This is a really gross situation, wish I could find a better way
+	   to deal with it... (rev's skip ahead of other queued ev's, I think
+	   that's the root of the difficulties here, we don't know if the
+	   rev has skipped ahead of our join on remote nodes or not).
+
+	   If our own join event is current on other nodes, then we want a
+	   rev (which will replace our join ev once it's starting).  If our
+	   join event isn't current on other nodes, then recovery will occur
+	   before we're added to the app group and the rev doesn't apply to us
+	   (apart from needing to remove the failed node from the memb list).
+
+	   We won't know if our join ev is current on other nodes, though,
+	   until we see a message -- if the message event id is for our join,
+	   then our ev is current and we'll process the rev after our ev, if
+	   the message event id is for the rev, then the rev is being done
+	   by the current members without us and our ev will be done later;
+	   the rev doesn't apply to us.
+
+	   Do nothing until we see a message indicating whether other nodes
+	   are on our join ev (in which case go to "rev will abort curr" code),
+	   or whether they're processing this rev (before our join ev comes
+	   up) in which case we can drop the rev (NB attend to rs, too). */
+
+	if (ev->state == EST_JOIN_STOP_WAIT && is_our_join(ev)) {
+
+		log_group(g, "rev %d is for group we're waiting to join",
+			  rev->nodeid);
+
+		/* Look for a remote node with stopped of 1, if we find one,
+		   then fall through to the 'else if (event_state_stopping)'
+		   below.  A remote node with stopped of 1 means we've received
+		   a stopped message with an event_id of our join event. */
+
+		list_for_each_entry(node, &a->nodes, list) {
+			if (node->nodeid == our_nodeid)
+				continue;
+			if (node->stopped) {
+				log_group(g, "our join is current on %d",
+					  node->nodeid);
+				log_group(g, "rev %d behind our join ev %llx",
+					  rev->nodeid, ev->id);
+				goto next;
+			}
+		}
+
+		/* Look through saved messages for one with an event_id
+		   matching the rev, if we find one, then we get rid of this
+		   rev and clear this group (that we're joining) from any
+		   recovery sets that are sequencing recovery of groups the
+		   failed node was in.  The other nodes are processing the
+		   rev before processing our join ev. */
+		   
+		list_for_each_entry(save, &g->messages, list) {
+			if (save->msg.ms_type == MSG_APP_INTERNAL)
+				continue;
+			if (save->msg.ms_event_id != rev->id)
+				continue;
+
+			log_group(g, "rev %d %llx ahead of our join ev %llx",
+				  rev->nodeid, rev->id, ev->id);
+
+			node = find_app_node(a, rev->nodeid);
+			if (node) {
+				a->node_count--;
+				log_group(g, "not joined, remove %d rev %d",
+					  node->nodeid, rev->nodeid);
+				list_del(&node->list);
+				free(node);
+			}
+			list_for_each_entry(id, &rev->extended, list) {
+				node = find_app_node(a, id->nodeid);
+				if (node) {
+					a->node_count--;
+					log_group(g, "not joined, remove %d "
+						  "rev %d", id->nodeid,
+						  rev->nodeid);
+					list_del(&node->list);
+					free(node);
+				}
+			}
+
+			del_recovery_set(g, rev, 1);
+			list_del(&rev->list);
+			log_group(g, "got rid of rev %d for unjoined group",
+				  rev->nodeid);
+			free_event(rev);
+			return 0;
+		}
+
+		log_group(g, "no messages indicating remote state of group");
+		return 0;
+	}
+
+ next:
 	/* Before starting the rev we need to apply the node addition/removal
 	 * of the current ev to the app.  This means processing the current ev
 	 * up through the starting stage.  So, we're sending the app the start
@@ -1390,30 +1583,31 @@
 	 * starting state so the recovery event can then take over. */
 
 	if (event_state_starting(a) || event_state_all_started(a)) {
-
-		log_group(g, "rev for %d replaces current ev %d %s",
+		log_group(g, "rev %d replaces current ev %d %s",
 			  rev->nodeid, ev->nodeid, ev_state_str(ev));
-
+		clear_all_nodes_stopped(a);
 		list_del(&rev->list);
 		a->current_event = rev;
 		free_event(ev);
+		send_recover(g, rev);
 		rv = 1;
 	} else if (event_state_stopping(a)) {
-
 		/* We'll come back through here multiple times until all the
 		   stopped messages are received; we need to continue to
 		   process this event that's stopping so it will get to the
 		   starting state at which point the rev can replace it. */
 
-		log_group(g, "rev for %d will abort current ev %d %s",
+		log_group(g, "rev %d will abort current ev %d %s",
 			  rev->nodeid, ev->nodeid, ev_state_str(ev));
 
+		ev->start_app_before_pending_rev = 1;
+
 		mark_node_stopped(a, rev->nodeid);
 		list_for_each_entry(id, &rev->extended, list)
 			mark_node_stopped(a, id->nodeid);
 		rv = 1;
 	} else {
-		log_group(g, "rev for %d delayed for ev %d %s",
+		log_group(g, "rev %d delayed for ev %d %s",
 			  rev->nodeid, ev->nodeid, ev_state_str(ev));
 	}
 
@@ -1426,7 +1620,7 @@
 	return rv;
 }
 
-static int process_app(group_t *g)
+int process_app(group_t *g)
 {
 	app_t *a = g->app;
 	event_t *ev = NULL;
@@ -1449,19 +1643,31 @@
 			goto out;
 		rv += ret;
 	} else {
+
 		/* We only take on a new non-recovery event if there are
 		   no recovery sets outstanding.  The new event may be
 		   to mount gfs X where there are no living mounters of X,
 		   and the pending recovery set is to fence a node that
-		   had X mounted. */
+		   had X mounted.  update: relax this so events are taken
+		   if there are unrecovered groups _at a lower level_. */
 
 		ev = find_queued_recover_event(g);
 		if (ev) {
 			log_group(g, "set current event to recovery for %d",
 				  ev->nodeid);
 			list_del(&ev->list);
-		} else if (list_empty(&recovery_sets) && cman_quorate &&
-			   !list_empty(&a->events)) {
+		} else if (!list_empty(&a->events)) {
+#if 0
+			if (!cman_quorate) {
+				log_group(g, "no new event while inquorate");
+			} else if (lower_groups_need_recovery(g)) {
+				log_group(g, "no new event while lower level "
+					  "groups need recovery");
+			} else {
+				ev = list_entry(a->events.next, event_t, list);
+				list_del(&ev->list);
+			}
+#endif
 			ev = list_entry(a->events.next, event_t, list);
 			list_del(&ev->list);
 		}
@@ -1471,7 +1677,7 @@
 			a->current_event = ev;
 			rv = process_current_event(g);
 		} else if (a->need_first_event) {
-			log_group(g, "waiting for our own cpg join event");
+			log_group(g, "waiting for first cpg event");
 		}
 	}
  out:
--- cluster/group/daemon/cpg.c	2006/09/28 19:26:45	1.34
+++ cluster/group/daemon/cpg.c	2006/10/06 16:55:19	1.35
@@ -120,6 +120,12 @@
 	}
 
 	queue_app_join(g, nodeid);
+
+	/* if this is for our own join, then make it current immediately;
+	   other code gets confused if we're not joined and have no current
+	   event */
+	if (nodeid == our_nodeid)
+		process_app(g);
 }
 
 static void process_node_leave(group_t *g, int nodeid)
@@ -253,9 +259,6 @@
 		log_group(g, "RECV len %d %s from %d", data_len,
 			  msg_type(msg->ms_type), nodeid);
 
-	if (nodeid == our_nodeid && g->app->sent_event_id == msg->ms_event_id)
-		g->app->sent_event_id = 0;
-
 	save = malloc(sizeof(struct save_msg));
 	memset(save, 0, sizeof(struct save_msg));
 	save->nodeid = nodeid;
--- cluster/group/daemon/gd_internal.h	2006/10/04 15:52:24	1.43
+++ cluster/group/daemon/gd_internal.h	2006/10/06 16:55:19	1.44
@@ -149,6 +149,7 @@
 	int			nodeid;
 	uint64_t		id;
 	struct list_head	extended;
+	int			start_app_before_pending_rev;
 };
 
 /*
@@ -181,13 +182,13 @@
 	struct list_head	events;
 	event_t			*current_event;
 	group_t			*g;
-	uint64_t		sent_event_id; /* for debugging */
 	int			need_first_event; /* for debugging */
 };
 
 #define MSG_APP_STOPPED        1
 #define MSG_APP_STARTED        2
-#define MSG_APP_INTERNAL       3
+#define MSG_APP_RECOVER        3
+#define MSG_APP_INTERNAL       4
 
 #define MSG_VER_MAJOR          1
 #define MSG_VER_MINOR          0
@@ -257,6 +258,7 @@
 struct recovery_set *get_recovery_set(int nodeid);
 void groupd_down(int nodeid);
 char *msg_type(int type);
+int process_app(group_t *g);
 int is_our_join(event_t *ev);
 
 /* main.c */
--- cluster/group/daemon/main.c	2006/10/04 15:52:24	1.47
+++ cluster/group/daemon/main.c	2006/10/06 16:55:19	1.48
@@ -463,10 +463,6 @@
 			else
 				data->event_local_status = -1;
 		}
-
-		if (g->app->sent_event_id)
-			log_group(g, "sent_event_id %llx",
-				  g->app->sent_event_id);
 	}
 
 	data->member_count = g->app->node_count;



^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2006-10-06 16:55 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-09-26 19:17 [Cluster-devel] cluster/group/daemon app.c cpg.c gd_internal.h teigland
  -- strict thread matches above, loose matches on Subject: below --
2006-09-26 21:32 teigland
2006-10-06 16:55 teigland

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).