All of lore.kernel.org
 help / color / mirror / Atom feed
From: lhh@sourceware.org <lhh@sourceware.org>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] cluster/rgmanager ChangeLog include/resgroup.h ...
Date: 30 Nov 2007 20:36:19 -0000	[thread overview]
Message-ID: <20071130203619.18381.qmail@sourceware.org> (raw)

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	lhh at sourceware.org	2007-11-30 20:36:18

Modified files:
	rgmanager      : ChangeLog 
	rgmanager/include: resgroup.h reslist.h 
	rgmanager/src/daemons: Makefile fo_domain.c groups.c main.c 
	                       reslist.c resrules.c restree.c rg_state.c 
	                       test.c 
Added files:
	rgmanager/include: restart_counter.h 
	rgmanager/src/daemons: restart_counter.c 

Log message:
	Merges from RHEL5 branch - round 2.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&r1=1.60&r2=1.61
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/restart_counter.h.diff?cvsroot=cluster&r1=1.1&r2=1.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/reslist.h.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/restart_counter.c.diff?cvsroot=cluster&r1=1.1&r2=1.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/Makefile.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/fo_domain.c.diff?cvsroot=cluster&r1=1.13&r2=1.14
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&r1=1.39&r2=1.40
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/main.c.diff?cvsroot=cluster&r1=1.44&r2=1.45
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/reslist.c.diff?cvsroot=cluster&r1=1.19&r2=1.20
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/resrules.c.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/restree.c.diff?cvsroot=cluster&r1=1.37&r2=1.38
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&r1=1.40&r2=1.41
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/test.c.diff?cvsroot=cluster&r1=1.12&r2=1.13

--- cluster/rgmanager/ChangeLog	2007/11/30 20:06:55	1.60
+++ cluster/rgmanager/ChangeLog	2007/11/30 20:36:17	1.61
@@ -1,6 +1,8 @@
 2007-11-30 Lon Hohberger <lhh@redhat.com>
-	* src/resources/*: Merge from RHEL5 branch.
-	* src/utils/*: Merge from RHEL5 branch.
+	* src/resources/*: Merge misc. updates from RHEL5 branch.
+	* src/utils/*: Merge misc. updates from RHEL5 branch.
+	* include/*.h, src/daemons/*: Merge status-counter patch
+	from RHEL5 branch.
 
 2007-08-30 Lon Hohberger <lhh@redhat.com>
 	* src/daemons/restree.c, rg_state.c: Fix tree-restart bug
--- cluster/rgmanager/include/restart_counter.h	2007/11/26 21:46:26	1.1
+++ cluster/rgmanager/include/restart_counter.h	2007/11/30 20:36:17	1.2
@@ -0,0 +1,12 @@
+#ifndef _RESTART_COUNTER_H
+#define _RESTART_COUNTER_H
+
+typedef void *restart_counter_t;
+
+int restart_add(restart_counter_t arg);
+int restart_clear(restart_counter_t arg);
+int restart_count(restart_counter_t arg);
+restart_counter_t restart_init(time_t expire_timeout, int max_restarts);
+int restart_cleanup(restart_counter_t arg);
+
+#endif
--- cluster/rgmanager/include/resgroup.h	2007/06/27 14:03:51	1.23
+++ cluster/rgmanager/include/resgroup.h	2007/11/30 20:36:17	1.24
@@ -150,6 +150,8 @@
 int svc_freeze(char *svcName);
 int svc_unfreeze(char *svcName);
 int svc_migrate(char *svcName, int target);
+int check_restart(char *svcName);
+
 int rt_enqueue_request(const char *resgroupname, int request,
 		       msgctx_t *resp_ctx,
        		       int max, uint32_t target, int arg0, int arg1);
--- cluster/rgmanager/include/reslist.h	2007/08/02 14:53:37	1.23
+++ cluster/rgmanager/include/reslist.h	2007/11/30 20:36:17	1.24
@@ -126,6 +126,7 @@
 	struct _rg_node	*rn_child, *rn_parent;
 	resource_t	*rn_resource;
 	resource_act_t	*rn_actions;
+	restart_counter_t rn_restart_counter;
 	int	rn_state; /* State of this instance of rn_resource */
 	int	rn_flags;
 	int	rn_last_status;
--- cluster/rgmanager/src/daemons/restart_counter.c	2007/11/26 21:46:27	1.1
+++ cluster/rgmanager/src/daemons/restart_counter.c	2007/11/30 20:36:17	1.2
@@ -0,0 +1,185 @@
+/*
+  Copyright Red Hat, Inc. 2007
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License version 2 as published
+  by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; see the file COPYING.  If not, write to the
+  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
+  MA 02139, USA.
+*/
+/* Time-based restart counters for rgmanager */
+
+#include <stdio.h>
+#include <list.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <time.h>
+#include <restart_counter.h>
+
+
+
+#define RESTART_INFO_MAGIC 0x184820ab
+
+typedef struct {
+	list_head();		/* list.h linkage; items go through list_insert() */
+	time_t restart_time;	/* time(NULL) value stored by restart_add() */
+} restart_item_t;
+
+typedef struct {
+	int magic;		/* RESTART_INFO_MAGIC; tested by VALIDATE() */
+	time_t expire_timeout;	/* seconds before an entry is purged; 0 = never */
+	int max_restarts;	/* restart_add() returns 1 once count exceeds this */
+	int restart_count;	/* incremented on add, decremented on purge */
+	restart_item_t *restart_nodes;	/* list of recorded restarts */
+} restart_info_t;
+
+/* Make the enclosing function return `ret` (errno = EINVAL) on a bad handle */
+#define VALIDATE(arg, ret) \
+do { \
+	if (!(arg) || ((restart_info_t *)(arg))->magic != RESTART_INFO_MAGIC) {\
+		errno = EINVAL; \
+		return ret; \
+	} \
+} while(0)
+
+/* Unlink and free every entry at least expire_timeout seconds older than
+   `now`.  Returns 0, or -1 with errno = EINVAL if arg fails VALIDATE. */
+static int
+restart_timer_purge(restart_counter_t arg, time_t now)
+{
+	restart_info_t *restarts = (restart_info_t *)arg;
+	restart_item_t *i;
+	int x, done = 0;
+
+	VALIDATE(arg, -1);
+
+	if (restarts->expire_timeout == 0)	/* No timeout */
+		return 0;
+
+	do {
+		done = 1;
+		list_for(&restarts->restart_nodes, i, x) {
+			if ((now - i->restart_time) >=
+			    restarts->expire_timeout) {
+				restarts->restart_count--;
+				list_remove(&restarts->restart_nodes, i);
+				free(i); /* malloc'd in restart_add; was leaked */
+				done = 0;
+				break; /* list changed: rescan from the head */
+			}
+		}
+	} while(!done);
+
+	return 0;
+}
+
+/* Purge expired entries, then return how many restarts remain (-1 if invalid) */
+int
+restart_count(restart_counter_t arg)
+{
+	restart_info_t *info = arg;
+
+	VALIDATE(arg, -1);
+	/* Expire stale entries first so the count reflects the current time */
+	restart_timer_purge(arg, time(NULL));
+
+	return info->restart_count;
+}
+
+
+/* Record a restart at the current time, then expire old entries.
+   Returns 0 if restarting is still allowed (always, when arg is NULL),
+   1 if more than max_restarts entries remain, or -1 on an invalid
+   handle (errno = EINVAL) or allocation failure. */
+int
+restart_add(restart_counter_t arg)
+{
+	restart_info_t *info = (restart_info_t *)arg;
+	restart_item_t *entry;
+	time_t now;
+	int exceeded;
+
+	/* No max restarts / threshold = always ok to restart! */
+	if (info == NULL)
+		return 0;
+
+	VALIDATE(arg, -1);
+
+	entry = malloc(sizeof(*entry));
+	if (entry == NULL)
+		return -1;
+
+	/* Stamp the new entry and link it into the list before purging */
+	now = time(NULL);
+	entry->restart_time = now;
+	list_insert(&info->restart_nodes, entry);
+	info->restart_count++;
+
+	/* Check and remove old entries */
+	restart_timer_purge(info, now);
+
+	/* Over the limit only if the surviving count exceeds the maximum */
+	exceeded = (info->restart_count > info->max_restarts);
+	return exceeded;
+}
+
+/* Free every recorded restart and reset the count.  Returns 0, or -1 with
+   errno = EINVAL if arg fails VALIDATE. */
+int
+restart_clear(restart_counter_t arg)
+{
+	restart_info_t *info = arg;
+	restart_item_t *head;
+
+	VALIDATE(arg, -1);
+
+	for (head = info->restart_nodes; head; head = info->restart_nodes) {
+		list_remove(&info->restart_nodes, head);
+		free(head);
+	}
+	info->restart_count = 0;
+	return 0;
+}
+
+/* Create a counter allowing max_restarts restarts, each forgotten after
+   expire_timeout seconds (0 = never).  Returns NULL on error. */
+restart_counter_t
+restart_init(time_t expire_timeout, int max_restarts)
+{
+	restart_info_t *info;
+
+	if (max_restarts < 0) {
+		errno = EINVAL;
+		return NULL;
+	}
+
+	info = malloc(sizeof(*info));
+	if (info == NULL)
+		return NULL;
+	info->magic = RESTART_INFO_MAGIC;
+	info->expire_timeout = expire_timeout;
+	info->max_restarts = max_restarts;
+	info->restart_count = 0;
+	info->restart_nodes = NULL;	/* empty list; was left uninitialized */
+	return (void *)info;
+}
+
+/* Free every recorded restart, then the counter itself.  Returns 0 or -1. */
+int
+restart_cleanup(restart_counter_t arg)
+{
+	VALIDATE(arg, -1);
+	restart_clear(arg);
+	free(arg);
+	return 0;
+}
--- cluster/rgmanager/src/daemons/Makefile	2007/08/28 04:35:47	1.23
+++ cluster/rgmanager/src/daemons/Makefile	2007/11/30 20:36:17	1.24
@@ -31,12 +31,14 @@
 	rg_queue.o \
 	rg_state.o \
 	rg_thread.o \
+	restart_counter.o \
 	watchdog.o
 
 OBJS2=	clurmtabd.o \
 	clurmtabd_lib.o
 
-OBJS3=	test-noccs.o
+OBJS3=	test-noccs.o \
+	restart_counter.o
 
 OBJS4=	dtest-noccs.o
 
--- cluster/rgmanager/src/daemons/fo_domain.c	2007/03/20 17:09:57	1.13
+++ cluster/rgmanager/src/daemons/fo_domain.c	2007/11/30 20:36:17	1.14
@@ -27,6 +27,7 @@
 #include <list.h>
 #include <clulog.h>
 #include <resgroup.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <ccs.h>
 #include <pthread.h>
--- cluster/rgmanager/src/daemons/groups.c	2007/08/02 14:53:38	1.39
+++ cluster/rgmanager/src/daemons/groups.c	2007/11/30 20:36:17	1.40
@@ -20,6 +20,7 @@
 //#define DEBUG
 #include <platform.h>
 #include <resgroup.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <vf.h>
 #include <message.h>
@@ -179,6 +180,29 @@
 }
 
 
+/* Return the node in the list *tree whose resource name (as built by
+   res_build_name) matches `name` case-insensitively, or NULL. */
+resource_node_t *
+node_by_ref(resource_node_t **tree, char *name)
+{
+	resource_t *res;
+	resource_node_t *node, *ret = NULL;
+	char rgname[64];
+	int x;
+
+	/* Search the caller's list, not the global _tree as before */
+	list_for(tree, node, x) {
+		res = node->rn_resource;
+		res_build_name(rgname, sizeof(rgname), res);
+		if (!strcasecmp(name, rgname)) {
+			ret = node;
+			break;
+		}
+	}
+	return ret;
+}
+
+
 int
 count_resource_groups_local(cman_node_t *mp)
 {
@@ -1587,6 +1611,28 @@
 }
 
 
+/* Count a restart for rg_name; returns restart_add()'s result, 1 if no match */
+int
+check_restart(char *rg_name)
+{
+	resource_node_t *node;
+	int ret = 1;
+
+	pthread_rwlock_rdlock(&resource_lock); /* NOTE(review): read lock, but counter is modified */
+	node = node_by_ref(&_tree, rg_name);
+	if (node) {
+		ret = restart_add(node->rn_restart_counter);
+		if (ret) {
+			/* Clear it out - caller is about to relocate anyway */
+			restart_clear(node->rn_restart_counter);
+		}
+	}
+	pthread_rwlock_unlock(&resource_lock);
+
+	return ret;
+}
+
+
 void
 kill_resource_groups(void)
 {
--- cluster/rgmanager/src/daemons/main.c	2007/09/19 09:54:19	1.44
+++ cluster/rgmanager/src/daemons/main.c	2007/11/30 20:36:17	1.45
@@ -166,6 +166,7 @@
 
 	old_membership = member_list();
 	new_ml = get_member_list(h);
+	memb_mark_down(new_ml, 0);
 
 	for (x = 0; x < new_ml->cml_count; x++) {
 
@@ -182,19 +183,25 @@
 			quorate = cman_is_listening(h,
 					new_ml->cml_members[x].cn_nodeid,
 					port);
+
 			if (quorate == 0) {
 				clulog(LOG_DEBUG, "Node %d is not listening\n",
 					new_ml->cml_members[x].cn_nodeid);
 				new_ml->cml_members[x].cn_member = 0;
 			} else if (quorate < 0) {
+				if (errno == ENOTCONN) {
+					new_ml->cml_members[x].cn_member = 0;
+					break;
+				}
 				perror("cman_is_listening");
 				usleep(50000);
 				continue;
 			}
-
 #ifdef DEBUG
-			printf("Node %d IS listening\n",
-			       new_ml->cml_members[x].cn_nodeid);
+		       	else {
+				printf("Node %d IS listening\n",
+				       new_ml->cml_members[x].cn_nodeid);
+			}
 #endif
 			break;
 		} while(1);
@@ -202,7 +209,6 @@
 
 	cman_finish(h);
 	member_list_update(new_ml);
-	member_set_state(0, 0);		/* Mark qdisk as dead */
 
 	/*
 	 * Handle nodes lost.  Do our local node event first.
--- cluster/rgmanager/src/daemons/reslist.c	2007/07/31 18:00:25	1.19
+++ cluster/rgmanager/src/daemons/reslist.c	2007/11/30 20:36:17	1.20
@@ -26,6 +26,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <list.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <pthread.h>
 #ifndef NO_CCS
--- cluster/rgmanager/src/daemons/resrules.c	2007/07/31 18:00:25	1.23
+++ cluster/rgmanager/src/daemons/resrules.c	2007/11/30 20:36:17	1.24
@@ -27,6 +27,8 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <list.h>
+#include <ctype.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <pthread.h>
 #include <dirent.h>
@@ -230,43 +232,70 @@
 
 
 int
-expand_time(char *val)
+expand_time (char *val)	/* e.g. "1h30m" -> 5400 seconds; NULL -> 0 */
 {
-	int l = strlen(val);
-	char c = val[l - 1];
-	int ret = atoi(val);
+	int curval, len;
+	int ret = 0;
+	char *start = val;
 
-	if (ret <= 0)
-		return 0;
+	if (!val)
+		return 0;
+
+	/* Each pass: one digit run scaled by its unit, or skip one char */
+	while (start[0]) {
+		len = 0;
+		curval = 0;
+
+		/* Measure the digits in place.  They used to be copied into
+		   a 16-byte buffer with no bound, overrunning the stack
+		   when val held 16 or more consecutive digits. */
+		while (isdigit((unsigned char)start[len]))
+			len++;
+
+		if (len) {
+			curval = atoi(start); /* stops at first non-digit */
+		} else {
+			len = 1;
+		}
 
-	if ((c >= '0') && (c <= '9'))
-		return ret;
+		switch(start[len]) {
+		case 0:		/* bare number: seconds */
+		case 'S':
+		case 's':
+			break;
+		case 'M':
+		case 'm':	/* minutes */
+			curval *= 60;
+			break;
+		case 'h':
+		case 'H':	/* hours */
+			curval *= 3600;
+			break;
+		case 'd':
+		case 'D':	/* days */
+			curval *= 86400;
+			break;
+		case 'w':
+		case 'W':	/* weeks */
+			curval *= 604800;
+			break;
+		case 'y':
+		case 'Y':	/* 365-day years */
+			curval *= 31536000;
+			break;
+		default:	/* unknown unit: this term adds nothing */
+			curval = 0;
+		}
 
-	switch(c) {
-	case 'S':
-	case 's':
-		return (ret);
-	case 'M':
-	case 'm':
-		return (ret * 60);
-	case 'h':
-	case 'H':
-		return (ret * 3600);
-	case 'd':
-	case 'D':
-		return (ret * 86400);
-	case 'w':
-	case 'W':
-		return (ret * 604800);
-	case 'y':
-	case 'Y':
-		return (ret * 31536000);
+		ret += curval;
+		start += len;	/* leaves start on the unit char, skipped next pass */
 	}
 
 	return ret;
 }
 
 
+
 /**
  * Store a resource action
  * @param actsp		Action array; may be modified and returned!
--- cluster/rgmanager/src/daemons/restree.c	2007/08/30 16:09:39	1.37
+++ cluster/rgmanager/src/daemons/restree.c	2007/11/30 20:36:17	1.38
@@ -30,6 +30,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <list.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <pthread.h>
 #include <clulog.h>
@@ -432,6 +433,39 @@
 }
 
 
+/* Give a top-level node a restart counter when its resource sets a positive
+   max_restarts (and, if given, a nonzero restart_expire_time); else NULL. */
+static inline void
+assign_restart_policy(resource_t *curres, resource_node_t *parent,
+		      resource_node_t *node)
+{
+	char *val;
+	int max_restarts;
+	time_t expire = 0;	/* 0 = recorded restarts never expire */
+
+	if (!node)	/* must precede the store below (was checked after) */
+		return;
+	node->rn_restart_counter = NULL;	/* default: no restart limit */
+	if (!curres || parent)	/* child nodes don't get one for now */
+		return;
+
+	val = res_attr_value(curres, "max_restarts");
+	if (!val)
+		return;
+	max_restarts = atoi(val);
+	if (max_restarts <= 0)
+		return;
+	val = res_attr_value(curres, "restart_expire_time");
+	if (val) {
+		expire = (time_t)expand_time(val);
+		if (!expire)	/* attribute expanded to 0: no counter */
+			return;
+	}
+
+	node->rn_restart_counter = restart_init(expire, max_restarts);
+}
+
+
 static inline int
 do_load_resource(int ccsfd, char *base,
 	         resource_rule_t *rule,
@@ -514,6 +548,7 @@
 	node->rn_state = RES_STOPPED;
 	node->rn_flags = 0;
 	node->rn_actions = (resource_act_t *)act_dup(curres->r_actions);
+	assign_restart_policy(curres, parent, node);
 
 	snprintf(tok, sizeof(tok), "%s/@__independent_subtree", base);
 #ifndef NO_CCS
@@ -769,6 +804,11 @@
 			destroy_resource_tree(&(*tree)->rn_child);
 
 		list_remove(tree, node);
+
+		if (node->rn_restart_counter) {
+			restart_cleanup(node->rn_restart_counter);
+		}
+
 		if(node->rn_actions){
 			free(node->rn_actions);
 		}
--- cluster/rgmanager/src/daemons/rg_state.c	2007/08/30 16:09:39	1.40
+++ cluster/rgmanager/src/daemons/rg_state.c	2007/11/30 20:36:18	1.41
@@ -1350,7 +1350,8 @@
 	}
 
 	if ((svcStatus.rs_state != RG_STATE_STOPPING) &&
-	     (svcStatus.rs_state != RG_STATE_ERROR)) {
+	    (svcStatus.rs_state != RG_STATE_ERROR) &&
+	    (svcStatus.rs_state != RG_STATE_RECOVER)) {
 		rg_unlock(&lockp);
 		return 0;
 	}
@@ -1829,8 +1830,10 @@
 	 * We got sent here from handle_start_req.
 	 * We're DONE.
 	 */
-	if (request == RG_START_RECOVER)
+	if (request == RG_START_RECOVER) {
+		_svc_stop_finish(svcName, 0, RG_STATE_STOPPED);
 		return RG_EFAIL;
+	}
 
 	/*
 	 * All potential places for the service to start have been exhausted.
@@ -1839,7 +1842,7 @@
 exhausted:
 	if (!rg_locked()) {
 		clulog(LOG_WARNING,
-		       "#70: Attempting to restart service %s locally.\n",
+		       "#70: Failed to relocate %s; restarting locally\n",
 		       svcName);
 		if (svc_start(svcName, RG_START_RECOVER) == 0) {
 			*new_owner = me;
@@ -2078,6 +2081,14 @@
 					   new_owner);
 	}
 
+	/* Check restart counter/timer for this resource */
+	if (check_restart(svcName) > 0) {
+		clulog(LOG_NOTICE, "Restart threshold for %s exceeded; "
+		       "attempting to relocate\n", svcName);
+		return handle_relocate_req(svcName, RG_START_RECOVER, -1,
+					   new_owner);
+	}
+
 	return handle_start_req(svcName, RG_START_RECOVER, new_owner);
 }
 
--- cluster/rgmanager/src/daemons/test.c	2007/07/31 18:02:49	1.12
+++ cluster/rgmanager/src/daemons/test.c	2007/11/30 20:36:18	1.13
@@ -25,6 +25,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <list.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <pthread.h>
 #include <depends.h>



             reply	other threads:[~2007-11-30 20:36 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-11-30 20:36 lhh [this message]
  -- strict thread matches above, loose matches on Subject: below --
2007-12-30  8:27 [Cluster-devel] cluster/rgmanager ChangeLog include/resgroup.h fabbione
2007-12-14 19:37 lhh
2007-06-27 14:03 lhh
2007-06-26 21:55 lhh
2007-06-14 15:06 mgrac
2007-06-14 13:36 mgrac
2007-04-27 18:10 lhh
2006-10-06 21:22 lhh
2006-09-01 19:02 lhh
2006-08-18 15:26 lhh

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20071130203619.18381.qmail@sourceware.org \
    --to=lhh@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.