All of lore.kernel.org
 help / color / mirror / Atom feed
From: teigland@sourceware.org <teigland@sourceware.org>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] cluster/group/daemon app.c cpg.c gd_internal.h
Date: 21 Jun 2006 18:10:24 -0000	[thread overview]
Message-ID: <20060621181024.5387.qmail@sourceware.org> (raw)

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	teigland at sourceware.org	2006-06-21 18:10:23

Modified files:
	group/daemon   : app.c cpg.c gd_internal.h 

Log message:
	Don't finalize/terminate a local group leave until we see that all
	remaining group members have stopped.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/app.c.diff?cvsroot=cluster&r1=1.29&r2=1.30
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/cpg.c.diff?cvsroot=cluster&r1=1.24&r2=1.25
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/gd_internal.h.diff?cvsroot=cluster&r1=1.32&r2=1.33

--- cluster/group/daemon/app.c	2006/06/20 20:26:08	1.29
+++ cluster/group/daemon/app.c	2006/06/21 18:10:23	1.30
@@ -592,9 +592,11 @@
 	return (ev->nodeid == our_nodeid);
 }
 
-/* called after the local app has acked that it is stopped as part
-   of our own leave.  We've gotten the final confchg for our leave
-   so we can't send anything out to the group at this point. */
+/* Called after all nodes have acked that they're stopped for our
+   leave.  We get their stopped messages even though we've left the
+   cpg because the messages are sent through the groupd cpg.
+   groupd_down() will fill in stops for us for nodes that fail before
+   sending stopped for our leave. */
 
 void finalize_our_leave(group_t *g)
 {
@@ -620,15 +622,6 @@
 	msg_t msg;
 	event_t *ev = g->app->current_event;
 
-	/* FIXME: see other fixme that mentions that leaving nodes
-	   should also send a stopped message to be counted by the
-	   remaining nodes before they move on to restarted */
-
-	if (ev && ev->state == EST_LEAVE_STOP_WAIT && is_our_leave(ev)) {
-		finalize_our_leave(g);
-		return 0;
-	}
-
 	memset(&msg, 0, sizeof(msg));
 	msg.ms_type = MSG_APP_STOPPED;
 	msg.ms_global_id = g->global_id;
@@ -855,18 +848,6 @@
 	case EST_LEAVE_BEGIN:
 		ev->state = EST_LEAVE_STOP_WAIT;
 		app_stop(a);
-
-		/* FIXME: have leaving node send a stopped message after
-		   the app acks that it's stopped, and then make the
-		   other nodes wait for this stopped message instead of
-		   just setting the leaving node as stopped here */
-
-		if (!is_our_leave(ev)) {
-			node = find_app_node(a, ev->nodeid);
-			ASSERT(node);
-			node->stopped = 1;
-		}
-
 		break;
 
 	case EST_LEAVE_STOP_WAIT:
@@ -877,6 +858,12 @@
 		break;
 
 	case EST_LEAVE_ALL_STOPPED:
+		if (is_our_leave(ev)) {
+			/* frees group structure */
+			finalize_our_leave(g);
+			rv = -1;
+			break;
+		}
 		ev->state = EST_LEAVE_START_WAIT;
 
 		node = find_app_node(a, ev->nodeid);
@@ -1358,14 +1345,16 @@
 {
 	app_t *a = g->app;
 	event_t *ev = NULL;
-	int rv = 0;
+	int rv = 0, ret;
 
 	if (a->current_event) {
-		/* this assumes that we never remove/free the group in
-		   process_current_event */
-
 		rv += process_app_messages(g);
-		rv += process_current_event(g);
+
+		ret = process_current_event(g);
+		if (ret < 0)
+			goto out;
+		rv += ret;
+
 		rv += recover_current_event(g);
 	} else {
 		/* We only take on a new non-recovery event if there are
@@ -1407,3 +1396,30 @@
 	return rv;
 }
 
+/* This is a bit of a hack that may not be entirely necessary.  The problem
+   we're solving with this function is when a node leaves a group and is
+   collecting all the "stopped" messages from the remaining members, some
+   of those members may fail, so we wouldn't get a stopped message from
+   them and never finalize_our_leave (terminate the group).  I'm not entirely
+   sure that we _need_ to wait for stopped messages from remaining members
+   before we do the finalize_our_leave/terminate... The reasoning@this
+   point is that when gfs is withdrawing, we want to be sure gfs is
+   suspended everywhere before we leave the lockspace (which happens at
+   terminate for the withdraw/leave) */
+
+void groupd_down(int nodeid)
+{
+	group_t *g;
+
+	list_for_each_entry(g, &gd_groups, list) {
+		if (g->app &&
+		    g->app->current_event &&
+		    g->app->current_event->state == EST_LEAVE_STOP_WAIT &&
+		    is_our_leave(g->app->current_event)) {
+			log_group(g, "groupd down on %d, push our leave",
+				  nodeid);
+			mark_node_stopped(g->app, nodeid);
+		}
+	}
+}
+
--- cluster/group/daemon/cpg.c	2006/06/20 20:26:08	1.24
+++ cluster/group/daemon/cpg.c	2006/06/21 18:10:23	1.25
@@ -171,8 +171,10 @@
 	   where groupd exits but cman is still running. */
 
 	for (i = 0; i < saved_left_count; i++) {
-		if (saved_left[i].reason != CPG_REASON_LEAVE)
+		if (saved_left[i].reason != CPG_REASON_LEAVE) {
 			add_recovery_set(saved_left[i].nodeId);
+			groupd_down(saved_left[i].nodeId);
+		}
 	}
 }
 
--- cluster/group/daemon/gd_internal.h	2006/06/20 20:26:08	1.32
+++ cluster/group/daemon/gd_internal.h	2006/06/21 18:10:23	1.33
@@ -248,6 +248,7 @@
 void msg_bswap_out(msg_t *msg);
 void msg_bswap_in(msg_t *msg);
 struct recovery_set *get_recovery_set(int nodeid);
+void groupd_down(int nodeid);
 
 /* main.c */
 void app_stop(app_t *a);



             reply	other threads:[~2006-06-21 18:10 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-06-21 18:10 teigland [this message]
  -- strict thread matches above, loose matches on Subject: below --
2006-06-22 18:39 [Cluster-devel] cluster/group/daemon app.c cpg.c gd_internal.h teigland
2007-01-05 18:49 teigland
2007-01-05 18:50 teigland
2007-01-05 19:56 teigland

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20060621181024.5387.qmail@sourceware.org \
    --to=teigland@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.