[Cluster-devel] cluster/group/daemon app.c cpg.c gd_internal.h

cluster-devel.redhat.com archive mirror
 help / color / mirror / Atom feed

From: teigland@sourceware.org <teigland@sourceware.org>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] cluster/group/daemon app.c cpg.c gd_internal.h
Date: 21 Jun 2006 18:10:24 -0000	[thread overview]
Message-ID: <20060621181024.5387.qmail@sourceware.org> (raw)

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	teigland at sourceware.org	2006-06-21 18:10:23

Modified files:
	group/daemon   : app.c cpg.c gd_internal.h 

Log message:
	Don't finalize/terminate a local group leave until we see that all
	remaining group members have stopped.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/app.c.diff?cvsroot=cluster&r1=1.29&r2=1.30
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/cpg.c.diff?cvsroot=cluster&r1=1.24&r2=1.25
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/gd_internal.h.diff?cvsroot=cluster&r1=1.32&r2=1.33

--- cluster/group/daemon/app.c	2006/06/20 20:26:08	1.29
+++ cluster/group/daemon/app.c	2006/06/21 18:10:23	1.30
@@ -592,9 +592,11 @@
 	return (ev->nodeid == our_nodeid);
 }
 
-/* called after the local app has acked that it is stopped as part
-   of our own leave.  We've gotten the final confchg for our leave
-   so we can't send anything out to the group at this point. */
+/* Called after all nodes have acked that they're stopped for our
+   leave.  We get their stopped messages even though we've left the
+   cpg because the messages are sent through the groupd cpg.
+   groupd_down() will fill in stops for us for nodes that fail before
+   sending stopped for our leave. */
 
 void finalize_our_leave(group_t *g)
 {
@@ -620,15 +622,6 @@
 	msg_t msg;
 	event_t *ev = g->app->current_event;
 
-	/* FIXME: see other fixme that mentions that leaving nodes
-	   should also send a stopped message to be counted by the
-	   remaining nodes before they move on to restarted */
-
-	if (ev && ev->state == EST_LEAVE_STOP_WAIT && is_our_leave(ev)) {
-		finalize_our_leave(g);
-		return 0;
-	}
-
 	memset(&msg, 0, sizeof(msg));
 	msg.ms_type = MSG_APP_STOPPED;
 	msg.ms_global_id = g->global_id;
@@ -855,18 +848,6 @@
 	case EST_LEAVE_BEGIN:
 		ev->state = EST_LEAVE_STOP_WAIT;
 		app_stop(a);
-
-		/* FIXME: have leaving node send a stopped message after
-		   the app acks that it's stopped, and then make the
-		   other nodes wait for this stopped message instead of
-		   just setting the leaving node as stopped here */
-
-		if (!is_our_leave(ev)) {
-			node = find_app_node(a, ev->nodeid);
-			ASSERT(node);
-			node->stopped = 1;
-		}
-
 		break;
 
 	case EST_LEAVE_STOP_WAIT:
@@ -877,6 +858,12 @@
 		break;
 
 	case EST_LEAVE_ALL_STOPPED:
+		if (is_our_leave(ev)) {
+			/* frees group structure */
+			finalize_our_leave(g);
+			rv = -1;
+			break;
+		}
 		ev->state = EST_LEAVE_START_WAIT;
 
 		node = find_app_node(a, ev->nodeid);
@@ -1358,14 +1345,16 @@
 {
 	app_t *a = g->app;
 	event_t *ev = NULL;
-	int rv = 0;
+	int rv = 0, ret;
 
 	if (a->current_event) {
-		/* this assumes that we never remove/free the group in
-		   process_current_event */
-
 		rv += process_app_messages(g);
-		rv += process_current_event(g);
+
+		ret = process_current_event(g);
+		if (ret < 0)
+			goto out;
+		rv += ret;
+
 		rv += recover_current_event(g);
 	} else {
 		/* We only take on a new non-recovery event if there are
@@ -1407,3 +1396,30 @@
 	return rv;
 }
 
+/* This is a bit of a hack that may not be entirely necessary.  The problem
+   we're solving with this function is when a node leaves a group and is
+   collecting all the "stopped" messages from the remaining members, some
+   of those members may fail, so we wouldn't get a stopped message from
+   them and never finalize_our_leave (terminate the group).  I'm not entirely
+   sure that we _need_ to wait for stopped messages from remaining members
+   before we do the finalize_our_leave/terminate... The reasoning at this
+   point is that when gfs is withdrawing, we want to be sure gfs is
+   suspended everywhere before we leave the lockspace (which happens at
+   terminate for the withdraw/leave) */
+
+void groupd_down(int nodeid)
+{
+	group_t *g;
+
+	list_for_each_entry(g, &gd_groups, list) {
+		if (g->app &&
+		    g->app->current_event &&
+		    g->app->current_event->state == EST_LEAVE_STOP_WAIT &&
+		    is_our_leave(g->app->current_event)) {
+			log_group(g, "groupd down on %d, push our leave",
+				  nodeid);
+			mark_node_stopped(g->app, nodeid);
+		}
+	}
+}
+
--- cluster/group/daemon/cpg.c	2006/06/20 20:26:08	1.24
+++ cluster/group/daemon/cpg.c	2006/06/21 18:10:23	1.25
@@ -171,8 +171,10 @@
 	   where groupd exits but cman is still running. */
 
 	for (i = 0; i < saved_left_count; i++) {
-		if (saved_left[i].reason != CPG_REASON_LEAVE)
+		if (saved_left[i].reason != CPG_REASON_LEAVE) {
 			add_recovery_set(saved_left[i].nodeId);
+			groupd_down(saved_left[i].nodeId);
+		}
 	}
 }
 
--- cluster/group/daemon/gd_internal.h	2006/06/20 20:26:08	1.32
+++ cluster/group/daemon/gd_internal.h	2006/06/21 18:10:23	1.33
@@ -248,6 +248,7 @@
 void msg_bswap_out(msg_t *msg);
 void msg_bswap_in(msg_t *msg);
 struct recovery_set *get_recovery_set(int nodeid);
+void groupd_down(int nodeid);
 
 /* main.c */
 void app_stop(app_t *a);

next             reply	other threads:[~2006-06-21 18:10 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-06-21 18:10 teigland [this message]
  -- strict thread matches above, loose matches on Subject: below --
2006-06-22 18:39 [Cluster-devel] cluster/group/daemon app.c cpg.c gd_internal.h teigland
2007-01-05 18:49 teigland
2007-01-05 18:50 teigland
2007-01-05 19:56 teigland

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20060621181024.5387.qmail@sourceware.org \
    --to=teigland@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).