All of lore.kernel.org
 help / color / mirror / Atom feed
From: lhh@sourceware.org <lhh@sourceware.org>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] cluster/rgmanager include/resgroup.h src/daemo ...
Date: 14 Nov 2007 19:03:39 -0000	[thread overview]
Message-ID: <20071114190339.12175.qmail@sourceware.org> (raw)

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	lhh at sourceware.org	2007-11-14 19:03:37

Modified files:
	rgmanager/include: resgroup.h 
	rgmanager/src/daemons: groups.c rg_state.c 

Log message:
	Fix #360401 - hang forever during shutdown due to previous service boot problem

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.3.2.9&r2=1.3.2.10
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.8.2.21&r2=1.8.2.22
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.4.2.21&r2=1.4.2.22

--- cluster/rgmanager/include/resgroup.h	2007/01/03 21:08:17	1.3.2.9
+++ cluster/rgmanager/include/resgroup.h	2007/11/14 19:03:37	1.3.2.10
@@ -156,6 +156,7 @@
 cluster_member_list_t *member_list(void);
 uint64_t my_id(void);
 
+#define RG_ERELO	-9 /* Operation cannot complete here */
 #define RG_ENODEDEATH	-8 /* Processing node died */
 #define RG_ERUN		-7 /* Service is running already */
 #define RG_EAGAIN	-6 /* Try again */
--- cluster/rgmanager/src/daemons/groups.c	2007/09/28 15:14:52	1.8.2.21
+++ cluster/rgmanager/src/daemons/groups.c	2007/11/14 19:03:37	1.8.2.22
@@ -192,7 +192,8 @@
 		}
 
 		if (st.rs_state != RG_STATE_STARTED &&
-		     st.rs_state != RG_STATE_STARTING)
+		     st.rs_state != RG_STATE_STARTING &&
+		     st.rs_state != RG_STATE_STOPPING)
 			continue;
 
 		if (mp->cm_id != st.rs_owner)
--- cluster/rgmanager/src/daemons/rg_state.c	2007/06/28 11:54:50	1.4.2.21
+++ cluster/rgmanager/src/daemons/rg_state.c	2007/11/14 19:03:37	1.4.2.22
@@ -41,10 +41,13 @@
 int set_rg_state(char *servicename, rg_state_t *svcblk);
 int get_rg_state(char *servicename, rg_state_t *svcblk);
 void get_recovery_policy(char *rg_name, char *buf, size_t buflen);
-int have_exclusive_resources();
+int have_exclusive_resources(void);
 int check_exclusive_resources(cluster_member_list_t *membership, char *svcName);
 
 
+pthread_mutex_t exclusive_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+
 uint64_t
 next_node_id(cluster_member_list_t *membership, uint64_t me)
 {
@@ -446,6 +449,7 @@
 		break;
 	
 	case RG_STATE_DISABLED:
+		ret = 2;
 	case RG_STATE_UNINITIALIZED:
 		if (req == RG_DISABLE) {
 			clulog(LOG_NOTICE,
@@ -536,7 +540,7 @@
 			}
 
 			clulog(LOG_NOTICE,
-			       "Starting stopped service%s\n",
+			       "Starting stopped service %s\n",
 			       svcName);
 			ret = 1;
 			break;
@@ -557,7 +561,7 @@
 			snprintf(query,
 				sizeof(query),
 				"/cluster/clusternodes/clusternode[@nodeid=\"%d\"]/@name",
-				svcStatus->rs_owner);
+				(int)svcStatus->rs_owner);
 			ccs_get(fd, query, &nodename);
 			ccs_disconnect(fd);
 		}
@@ -650,42 +654,61 @@
 svc_start(char *svcName, int req)
 {
 	void *lockp = NULL;
-	int ret;
+	int ret, xret;
 	rg_state_t svcStatus;
+	int need_check = have_exclusive_resources();
+	cluster_member_list_t *membership;
+
+ 	if (need_check)
+		pthread_mutex_lock(&exclusive_mutex);
+
+	ret = RG_EFAIL;
 
 	if (rg_lock(svcName, &lockp) < 0) {
 		clulog(LOG_ERR, "#45: Unable to obtain cluster lock: %s\n",
 		       strerror(errno));
-		return FAIL;
+		goto out_nolock;
 	}
 
 	if (get_rg_state(svcName, &svcStatus) != 0) {
-		rg_unlock(svcName, lockp);
 		clulog(LOG_ERR, "#46: Failed getting status for RG %s\n",
 		       svcName);
-		return FAIL;
+		goto out_unlock;
+	}
+
+	if (need_check) {
+		membership = member_list();
+		xret = check_exclusive_resources(membership, svcName);
+		cml_free(membership);
+		if (xret != 0) {
+			if (xret > 0)
+				/* Exc. service running */
+				ret = RG_ERELO;
+			else
+				/* XXX */
+				ret = RG_ENOSERVICE;
+			goto out_unlock;
+		}
 	}
 
 	/* LOCK HELD */
 	switch (svc_advise_start(&svcStatus, svcName, req)) {
 	case 0: /* Don't start service, return FAIL */
-		rg_unlock(svcName, lockp);
-		return FAIL;
+		goto out_unlock;
 	case 2: /* Don't start service, return 0 */
-		rg_unlock(svcName, lockp);
-		return 0;
+		ret = 0;
+		goto out_unlock;
 	case 3:
-		rg_unlock(svcName, lockp);
-		return RG_EAGAIN;
+		ret = RG_EAGAIN;
+		goto out_unlock;
 	case 4:
-		rg_unlock(svcName, lockp);
-		return RG_ERUN;
+		ret = RG_ERUN;
+		goto out_unlock;
 	default:
 		break;
 	}
 
 	/* LOCK HELD if we get here */
-
 	svcStatus.rs_owner = my_id();
 	svcStatus.rs_state = RG_STATE_STARTING;
 	svcStatus.rs_transition = (uint64_t)time(NULL);
@@ -699,10 +722,17 @@
 		clulog(LOG_ERR,
 		       "#47: Failed changing service status\n");
 		rg_unlock(svcName, lockp);
-		return FAIL;
+		goto out_unlock;
 	}
 	
 	rg_unlock(svcName, lockp);
+	
+	/* release excl. mutex during start */
+	if (need_check) {
+		/* Also clear need_check so we don't double-unlock */
+		pthread_mutex_unlock(&exclusive_mutex);
+		need_check = 0;
+	}
 
 	ret = group_op(svcName, RG_START);
 	ret = !!ret; /* Either it worked or it didn't.  Ignore all the
@@ -711,17 +741,17 @@
 	if (rg_lock(svcName, &lockp) < 0) {
 		clulog(LOG_ERR, "#74: Unable to obtain cluster lock: %s\n",
 		       strerror(errno));
-		return FAIL;
+		ret = RG_EFAIL;
+		goto out_nolock;
 	}
 
 	svcStatus.rs_state = RG_STATE_STARTED;
 	if (set_rg_state(svcName, &svcStatus) != 0) {
 		clulog(LOG_ERR,
 		       "#75: Failed changing service status\n");
-		rg_unlock(svcName, lockp);
-		return FAIL;
+		ret = RG_EFAIL;
+		goto out_unlock;
 	}
-	rg_unlock(svcName, lockp);
        
 	if (ret == 0)
 		clulog(LOG_NOTICE,
@@ -732,6 +762,11 @@
 		       "#68: Failed to start %s; return value: %d\n",
 		       svcName, ret);
 
+out_unlock:
+	rg_unlock(svcName, lockp);
+out_nolock:
+	if (need_check)
+		pthread_mutex_unlock(&exclusive_mutex);
 	return ret;
 }
 
@@ -1115,7 +1150,7 @@
 {
 	cluster_member_list_t *allowed_nodes, *backup = NULL;
 	uint64_t target = preferred_target, me = my_id();
-	int ret, x;
+	int ret, x, tried = 0;
 	
 	/*
 	 * Stop the service - if we haven't already done so.
@@ -1181,6 +1216,7 @@
 		 	 * It's legal to start the service on the given
 		 	 * node.  Try to do so.
 		 	 */
+			++tried;
 			if (relocate_service(svcName, request, target) == 0) {
 				*new_owner = target;
 				/*
@@ -1211,9 +1247,12 @@
 		if (target == me)
 			goto exhausted;
 
+		++tried;
+
+		/* Each node gets one try */
+		memb_mark_down(allowed_nodes, target);
 		switch (relocate_service(svcName, request, target)) {
 		case RG_EFAIL:
-			memb_mark_down(allowed_nodes, target);
 			continue;
 		case RG_EABORT:
 			svc_report_failure(svcName);
@@ -1228,9 +1267,10 @@
 			       (uint32_t)(target&0xffffffff), request);
 			return 0;
 		case 0:
-			*new_owner = target;
 			clulog(LOG_NOTICE, "Service %s is now running "
 			       "on member %d\n", svcName, (int)target);
+		case RG_ERUN:
+			*new_owner = target;
 			cml_free(allowed_nodes);
 			return 0;
 		default:
@@ -1254,9 +1294,10 @@
 	 */
 exhausted:
 	if (!rg_locked()) {
-		clulog(LOG_WARNING,
-		       "#70: Attempting to restart service %s locally.\n",
-		       svcName);
+		if (tried)
+			clulog(LOG_WARNING,
+			       "#70: Attempting to restart service %s locally.\n",
+			       svcName);
 		if (svc_start(svcName, RG_START_RECOVER) == 0) {
 			*new_owner = me;
 			return FAIL;
@@ -1275,9 +1316,9 @@
 int
 handle_fd_start_req(char *svcName, int request, uint64_t *new_owner)
 {
-	cluster_member_list_t *allowed_nodes, *backup = NULL;
+	cluster_member_list_t *allowed_nodes;
 	uint64_t target, me = my_id();
-	int ret, x;
+	int ret;
 	
 	allowed_nodes = member_list();
 
@@ -1326,7 +1367,6 @@
 }
 
 
-pthread_mutex_t exclusive_mutex = PTHREAD_MUTEX_INITIALIZER;
 /**
  * handle_start_req - Handle a generic start request from a user or during
  * service manager boot.
@@ -1342,7 +1382,6 @@
 {
 	int ret, tolerance = FOD_BEST;
 	cluster_member_list_t *membership = member_list();
-	int need_check = have_exclusive_resources();
 
 	/*
 	 * When a service request is from a user application (eg, clusvcadm),
@@ -1358,18 +1397,6 @@
 		cml_free(membership);
 		return FAIL;
 	}
-	if (need_check) {
-		pthread_mutex_lock(&exclusive_mutex);
-		ret = check_exclusive_resources(membership, svcName);
-		if (ret != 0) {
-			cml_free(membership);
-			pthread_mutex_unlock(&exclusive_mutex);
-			if (ret > 0)
-				goto relocate;
-			else
-				return FAIL;
-		}
-	}
 	cml_free(membership);
 	
 	/*
@@ -1377,25 +1404,22 @@
 	 * mask here - so that we can try all nodes if necessary.
 	 */
 	ret = svc_start(svcName, req);
-	if (need_check)
-		pthread_mutex_unlock(&exclusive_mutex);
-
-	/* 
-	   If services are locked, return the error 
-	  */
-	if (ret == RG_EAGAIN || ret == RG_ERUN)
+	switch(ret) {
+	case RG_ERELO:
+		goto relocate;
+
+	case RG_EAGAIN:
+		/* If services are locked, return the error */
+	case RG_ENOSERVICE:
+		/* service doesn't exist? */
+	case RG_ERUN:
+		/* If service is already running, return that value */
 		return ret;
 
-	/*
-	 * If we succeeded, then we're done.
-	 */
-	if (ret == SUCCESS) {
+	case SUCCESS:
+		/* If we succeeded, then we're done.  */
 		*new_owner = my_id();
-		return SUCCESS;
-	}
-
-	/* Already running? */
-	if (ret == NO) {
+	case NO: 
 		return SUCCESS;
 	}
 	
@@ -1418,13 +1442,13 @@
 		return RG_EABORT;
 	}
 	
-relocate:
 	/*
 	 * OK, it failed to start - but succeeded to stop.  Now,
 	 * we should relocate the service.
 	 */
 	clulog(LOG_WARNING, "#71: Relocating failed service %s\n",
 	       svcName);
+relocate:
 	ret = handle_relocate_req(svcName, RG_START_RECOVER, -1, new_owner);
 
 	/* If we leave the service stopped, instead of disabled, someone
@@ -1456,7 +1480,6 @@
 	int x;
 	uint64_t me = my_id();
 	cluster_member_list_t *membership = member_list();
-	int need_check = have_exclusive_resources();
 
 	/* XXX ok, so we need to say "should I start this if I was the
 	   only cluster member online */
@@ -1477,23 +1500,29 @@
 		cml_free(membership);
 		return FAIL;
 	}
- 	if (need_check) {
- 		pthread_mutex_lock(&exclusive_mutex);
- 		if (check_exclusive_resources(membership, svcName) != 0) {
- 			pthread_mutex_unlock(&exclusive_mutex);
- 			cml_free(membership);
- 			return FAIL;
- 		}
- 	}
 	cml_free(membership);
 
 	x = svc_start(svcName, req);
-	if (need_check)
- 		pthread_mutex_unlock(&exclusive_mutex);
-	if (x == 0)
-		return 0;
-	if (x == RG_ERUN)
-		return RG_ERUN;
+	switch(x) {
+	case RG_ERELO:
+		/* Don't relocate from here; it was a remote start */
+		/* Return fail so the other node can go ahead and 
+		   try the other nodes in the cluster */
+	case NO: 
+		return RG_EFAIL;
+
+	case RG_EAGAIN:
+		/* If services are locked, return the error */
+	case RG_ENOSERVICE:
+		/* service doesn't exist? */
+	case RG_ERUN:
+		/* If service is already running, return that value */
+		return x;
+
+	case SUCCESS:
+		/* If we succeeded, then we're done.  */
+		return SUCCESS;
+	}
 
 	if (svc_stop(svcName, RG_STOP_RECOVER) == 0)
 		return RG_EFAIL;



             reply	other threads:[~2007-11-14 19:03 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-11-14 19:03 lhh [this message]
  -- strict thread matches above, loose matches on Subject: below --
2006-12-14 22:03 [Cluster-devel] cluster/rgmanager include/resgroup.h src/daemo lhh
2006-12-13 18:39 lhh
2006-12-13 18:38 lhh

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20071114190339.12175.qmail@sourceware.org \
    --to=lhh@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.