From: lhh@sourceware.org <lhh@sourceware.org>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] cluster/rgmanager ChangeLog include/resgroup.h ...
Date: 30 Nov 2007 20:36:19 -0000 [thread overview]
Message-ID: <20071130203619.18381.qmail@sourceware.org> (raw)
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: lhh at sourceware.org 2007-11-30 20:36:18
Modified files:
rgmanager : ChangeLog
rgmanager/include: resgroup.h reslist.h
rgmanager/src/daemons: Makefile fo_domain.c groups.c main.c
reslist.c resrules.c restree.c rg_state.c
test.c
Added files:
rgmanager/include: restart_counter.h
rgmanager/src/daemons: restart_counter.c
Log message:
Merges from RHEL5 branch - round 2.
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&r1=1.60&r2=1.61
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/restart_counter.h.diff?cvsroot=cluster&r1=1.1&r2=1.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/reslist.h.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/restart_counter.c.diff?cvsroot=cluster&r1=1.1&r2=1.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/Makefile.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/fo_domain.c.diff?cvsroot=cluster&r1=1.13&r2=1.14
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&r1=1.39&r2=1.40
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/main.c.diff?cvsroot=cluster&r1=1.44&r2=1.45
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/reslist.c.diff?cvsroot=cluster&r1=1.19&r2=1.20
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/resrules.c.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/restree.c.diff?cvsroot=cluster&r1=1.37&r2=1.38
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&r1=1.40&r2=1.41
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/test.c.diff?cvsroot=cluster&r1=1.12&r2=1.13
--- cluster/rgmanager/ChangeLog 2007/11/30 20:06:55 1.60
+++ cluster/rgmanager/ChangeLog 2007/11/30 20:36:17 1.61
@@ -1,6 +1,8 @@
2007-11-30 Lon Hohberger <lhh@redhat.com>
- * src/resources/*: Merge from RHEL5 branch.
- * src/utils/*: Merge from RHEL5 branch.
+ * src/resources/*: Merge misc. updates from RHEL5 branch.
+ * src/utils/*: Merge misc. updates from RHEL5 branch.
+ * include/*.h, src/daemons/*: Merge status-counter patch
+ from RHEL5 branch.
2007-08-30 Lon Hohberger <lhh@redhat.com>
* src/daemons/restree.c, rg_state.c: Fix tree-restart bug
--- cluster/rgmanager/include/restart_counter.h 2007/11/26 21:46:26 1.1
+++ cluster/rgmanager/include/restart_counter.h 2007/11/30 20:36:17 1.2
@@ -0,0 +1,12 @@
+#ifndef _RESTART_COUNTER_H
+#define _RESTART_COUNTER_H
+
+#include <time.h>	/* time_t, used by restart_init() below */
+
+/* Opaque handle to a time-based restart counter. */
+typedef void *restart_counter_t;
+
+/* Record one restart happening now.  Returns 1 if, once expired entries
+   have been purged, more restarts are on record than the configured
+   maximum; 0 if not, or if arg is NULL (no counter configured); -1 if
+   the handle fails validation or memory cannot be allocated. */
+int restart_add(restart_counter_t arg);
+
+/* Discard every recorded restart.  Returns 0, or -1 with errno set to
+   EINVAL if the handle fails validation. */
+int restart_clear(restart_counter_t arg);
+
+/* Purge expired entries, then return how many restarts remain on record;
+   -1 with errno set to EINVAL if the handle fails validation. */
+int restart_count(restart_counter_t arg);
+
+/* Allocate a counter.  expire_timeout is the age at which a recorded
+   restart is dropped (0 = entries never expire); max_restarts must not be
+   negative.  Returns NULL on failure; errno is EINVAL for a negative
+   max_restarts. */
+restart_counter_t restart_init(time_t expire_timeout, int max_restarts);
+
+/* Clear and free a counter obtained from restart_init().  Returns 0, or
+   -1 with errno set to EINVAL if the handle fails validation. */
+int restart_cleanup(restart_counter_t arg);
+
+#endif
--- cluster/rgmanager/include/resgroup.h 2007/06/27 14:03:51 1.23
+++ cluster/rgmanager/include/resgroup.h 2007/11/30 20:36:17 1.24
@@ -150,6 +150,8 @@
int svc_freeze(char *svcName);
int svc_unfreeze(char *svcName);
int svc_migrate(char *svcName, int target);
+int check_restart(char *svcName);
+
int rt_enqueue_request(const char *resgroupname, int request,
msgctx_t *resp_ctx,
int max, uint32_t target, int arg0, int arg1);
--- cluster/rgmanager/include/reslist.h 2007/08/02 14:53:37 1.23
+++ cluster/rgmanager/include/reslist.h 2007/11/30 20:36:17 1.24
@@ -126,6 +126,7 @@
struct _rg_node *rn_child, *rn_parent;
resource_t *rn_resource;
resource_act_t *rn_actions;
+ restart_counter_t rn_restart_counter;
int rn_state; /* State of this instance of rn_resource */
int rn_flags;
int rn_last_status;
--- cluster/rgmanager/src/daemons/restart_counter.c 2007/11/26 21:46:27 1.1
+++ cluster/rgmanager/src/daemons/restart_counter.c 2007/11/30 20:36:17 1.2
@@ -0,0 +1,185 @@
+/*
+ Copyright Red Hat, Inc. 2007
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License version 2 as published
+ by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 675 Mass Ave, Cambridge,
+ MA 02139, USA.
+*/
+/* Time-based restart counters for rgmanager */
+
+#include <stdio.h>
+#include <list.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <time.h>
+#include <restart_counter.h>
+
+
+
+#define RESTART_INFO_MAGIC 0x184820ab
+
+typedef struct {	/* one recorded restart; chained off restart_info_t.restart_nodes */
+ list_head();	/* list linkage used by the list_* macros (presumably from <list.h> -- confirm) */
+ time_t restart_time;	/* time(NULL) value stored by restart_add() */
+} restart_item_t;
+
+typedef struct {	/* state behind a restart_counter_t handle */
+ int magic;	/* RESTART_INFO_MAGIC on a valid object; tested by VALIDATE() */
+ time_t expire_timeout;	/* age at which an entry is purged; 0 = no timeout */
+ int max_restarts;	/* restart_add() returns 1 once restart_count exceeds this */
+ int restart_count;	/* number of entries currently on restart_nodes */
+ restart_item_t *restart_nodes;	/* head of the restart_item_t list; NULL when empty */
+} restart_info_t;
+
+
+/* Return 'ret' from the calling function, with errno set to EINVAL, unless
+   'arg' is a non-NULL pointer to a restart_info_t that carries
+   RESTART_INFO_MAGIC.  Note that 'arg' is evaluated more than once. */
+#define VALIDATE(arg, ret) \
+do { \
+	if (!(arg) || \
+	    ((restart_info_t *)(arg))->magic != RESTART_INFO_MAGIC) { \
+		errno = EINVAL; \
+		return (ret); \
+	} \
+} while(0)
+
+
+/*
+ * Remove expired restarts.
+ *
+ * Every entry whose age at time 'now' is at least expire_timeout is
+ * unlinked from the list, freed, and subtracted from restart_count.
+ * An expire_timeout of 0 disables expiry altogether.
+ *
+ * Returns 0, or -1 with errno set to EINVAL if 'arg' fails validation.
+ */
+static int
+restart_timer_purge(restart_counter_t arg, time_t now)
+{
+	restart_info_t *restarts = (restart_info_t *)arg;
+	restart_item_t *i;
+	int x, done = 0;
+
+	VALIDATE(arg, -1);
+
+	/* No timeout */
+	if (restarts->expire_timeout == 0)
+		return 0;
+
+	/* The scan is restarted from the head after each removal rather
+	   than continuing over a list that was just modified. */
+	do {
+		done = 1;
+		list_for(&restarts->restart_nodes, i, x) {
+			if ((now - i->restart_time) >=
+			    restarts->expire_timeout) {
+				restarts->restart_count--;
+				list_remove(&restarts->restart_nodes, i);
+				/* Entries are malloc()ed by restart_add();
+				   unlinking alone would leak them. */
+				free(i);
+				done = 0;
+				break;
+			}
+		}
+	} while(!done);
+
+	return 0;
+}
+
+
+/*
+ * Report how many restarts are on record, after first discarding the
+ * ones that have expired as of the current time.
+ *
+ * Returns the count, or -1 with errno set to EINVAL if 'arg' fails
+ * validation.
+ */
+int
+restart_count(restart_counter_t arg)
+{
+	restart_info_t *info = arg;
+
+	VALIDATE(arg, -1);
+
+	restart_timer_purge(arg, time(NULL));
+
+	return info->restart_count;
+}
+
+
+/*
+ * Add a restart entry, stamped with the current time, to the list.
+ *
+ * Returns 1 if, after expired entries are purged, the restart count is
+ * above max_restarts; 0 if it is not, or if 'arg' is NULL; -1 if 'arg'
+ * fails validation or the entry cannot be allocated.
+ */
+int
+restart_add(restart_counter_t arg)
+{
+	restart_info_t *info = arg;
+	restart_item_t *entry;
+	time_t now;
+
+	/* No counter means no max restarts / threshold was configured,
+	   so it is always ok to restart. */
+	if (arg == NULL)
+		return 0;
+
+	VALIDATE(arg, -1);
+
+	entry = malloc(sizeof(*entry));
+	if (entry == NULL)
+		return -1;
+
+	now = time(NULL);
+	entry->restart_time = now;
+
+	list_insert(&info->restart_nodes, entry);
+	info->restart_count++;
+
+	/* Drop old entries before comparing against the limit */
+	restart_timer_purge(info, now);
+
+	return (info->restart_count > info->max_restarts) ? 1 : 0;
+}
+
+
+/*
+ * Discard every recorded restart and reset the count to zero.
+ *
+ * Returns 0, or -1 with errno set to EINVAL if 'arg' fails validation.
+ */
+int
+restart_clear(restart_counter_t arg)
+{
+	restart_info_t *info = arg;
+	restart_item_t *head;
+
+	VALIDATE(arg, -1);
+
+	/* Unlink and free the head entry until the list is empty */
+	for (head = info->restart_nodes; head != NULL;
+	     head = info->restart_nodes) {
+		list_remove(&info->restart_nodes, head);
+		free(head);
+	}
+
+	info->restart_count = 0;
+	return 0;
+}
+
+
+/*
+ * Allocate and initialize a restart counter.
+ *
+ * expire_timeout: age at which a recorded restart is purged; 0 means
+ *                 entries are never purged.
+ * max_restarts:   restart_add() reports 1 once more than this many
+ *                 restarts are on record; must not be negative.
+ *
+ * Returns the new handle, or NULL on failure (errno is EINVAL for a
+ * negative max_restarts).  Release it with restart_cleanup().
+ */
+restart_counter_t
+restart_init(time_t expire_timeout, int max_restarts)
+{
+	restart_info_t *info;
+
+	if (max_restarts < 0) {
+		errno = EINVAL;
+		return NULL;
+	}
+
+	info = malloc(sizeof(*info));
+	if (info == NULL)
+		return NULL;
+
+	info->magic = RESTART_INFO_MAGIC;
+	info->expire_timeout = expire_timeout;
+	info->max_restarts = max_restarts;
+	info->restart_count = 0;
+	/* malloc() does not zero memory; the list head must start out
+	   NULL or the first list operation reads an indeterminate pointer. */
+	info->restart_nodes = NULL;
+
+	return (void *)info;
+}
+
+
+/*
+ * Destroy a restart counter: drop all recorded restarts, then free the
+ * counter itself.
+ *
+ * Returns 0, or -1 with errno set to EINVAL if 'arg' fails validation.
+ */
+int
+restart_cleanup(restart_counter_t arg)
+{
+	restart_info_t *info = arg;
+
+	VALIDATE(info, -1);
+
+	restart_clear(info);
+	free(info);
+
+	return 0;
+}
--- cluster/rgmanager/src/daemons/Makefile 2007/08/28 04:35:47 1.23
+++ cluster/rgmanager/src/daemons/Makefile 2007/11/30 20:36:17 1.24
@@ -31,12 +31,14 @@
rg_queue.o \
rg_state.o \
rg_thread.o \
+ restart_counter.o \
watchdog.o
OBJS2= clurmtabd.o \
clurmtabd_lib.o
-OBJS3= test-noccs.o
+OBJS3= test-noccs.o \
+ restart_counter.o
OBJS4= dtest-noccs.o
--- cluster/rgmanager/src/daemons/fo_domain.c 2007/03/20 17:09:57 1.13
+++ cluster/rgmanager/src/daemons/fo_domain.c 2007/11/30 20:36:17 1.14
@@ -27,6 +27,7 @@
#include <list.h>
#include <clulog.h>
#include <resgroup.h>
+#include <restart_counter.h>
#include <reslist.h>
#include <ccs.h>
#include <pthread.h>
--- cluster/rgmanager/src/daemons/groups.c 2007/08/02 14:53:38 1.39
+++ cluster/rgmanager/src/daemons/groups.c 2007/11/30 20:36:17 1.40
@@ -20,6 +20,7 @@
//#define DEBUG
#include <platform.h>
#include <resgroup.h>
+#include <restart_counter.h>
#include <reslist.h>
#include <vf.h>
#include <message.h>
@@ -179,6 +180,29 @@
}
+/*
+ * Find a node by resource name.
+ *
+ * Walks the node list headed by *tree (child nodes are not descended
+ * into) and compares 'name', ignoring case, with the name that
+ * res_build_name() produces for each node's resource.
+ *
+ * Returns the first matching node, or NULL if none matches.
+ */
+resource_node_t *
+node_by_ref(resource_node_t **tree, char *name)
+{
+	resource_t *res;
+	resource_node_t *node, *ret = NULL;
+	char rgname[64];
+	int x;
+
+	/* Search the list the caller passed in, not the global tree */
+	list_for(tree, node, x) {
+
+		res = node->rn_resource;
+		res_build_name(rgname, sizeof(rgname), res);
+
+		if (!strcasecmp(name, rgname)) {
+			ret = node;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+
int
count_resource_groups_local(cman_node_t *mp)
{
@@ -1587,6 +1611,28 @@
}
+/*
+ * Record a restart against the named resource group's restart counter.
+ *
+ * The lookup and the counter update run with resource_lock read-locked.
+ * Returns what restart_add() returned for the matching node, or 1 if no
+ * node matches rg_name.  Whenever restart_add() returns nonzero the
+ * counter is cleared before returning.
+ */
+int
+check_restart(char *rg_name)
+{
+	resource_node_t *rg;
+	int result = 1;
+
+	pthread_rwlock_rdlock(&resource_lock);
+
+	rg = node_by_ref(&_tree, rg_name);
+	if (rg != NULL) {
+		result = restart_add(rg->rn_restart_counter);
+
+		/* Clear it out - caller is about to relocate the
+		   service anyway */
+		if (result != 0)
+			restart_clear(rg->rn_restart_counter);
+	}
+
+	pthread_rwlock_unlock(&resource_lock);
+
+	return result;
+}
+
+
void
kill_resource_groups(void)
{
--- cluster/rgmanager/src/daemons/main.c 2007/09/19 09:54:19 1.44
+++ cluster/rgmanager/src/daemons/main.c 2007/11/30 20:36:17 1.45
@@ -166,6 +166,7 @@
old_membership = member_list();
new_ml = get_member_list(h);
+ memb_mark_down(new_ml, 0);
for (x = 0; x < new_ml->cml_count; x++) {
@@ -182,19 +183,25 @@
quorate = cman_is_listening(h,
new_ml->cml_members[x].cn_nodeid,
port);
+
if (quorate == 0) {
clulog(LOG_DEBUG, "Node %d is not listening\n",
new_ml->cml_members[x].cn_nodeid);
new_ml->cml_members[x].cn_member = 0;
} else if (quorate < 0) {
+ if (errno == ENOTCONN) {
+ new_ml->cml_members[x].cn_member = 0;
+ break;
+ }
perror("cman_is_listening");
usleep(50000);
continue;
}
-
#ifdef DEBUG
- printf("Node %d IS listening\n",
- new_ml->cml_members[x].cn_nodeid);
+ else {
+ printf("Node %d IS listening\n",
+ new_ml->cml_members[x].cn_nodeid);
+ }
#endif
break;
} while(1);
@@ -202,7 +209,6 @@
cman_finish(h);
member_list_update(new_ml);
- member_set_state(0, 0); /* Mark qdisk as dead */
/*
* Handle nodes lost. Do our local node event first.
--- cluster/rgmanager/src/daemons/reslist.c 2007/07/31 18:00:25 1.19
+++ cluster/rgmanager/src/daemons/reslist.c 2007/11/30 20:36:17 1.20
@@ -26,6 +26,7 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <list.h>
+#include <restart_counter.h>
#include <reslist.h>
#include <pthread.h>
#ifndef NO_CCS
--- cluster/rgmanager/src/daemons/resrules.c 2007/07/31 18:00:25 1.23
+++ cluster/rgmanager/src/daemons/resrules.c 2007/11/30 20:36:17 1.24
@@ -27,6 +27,8 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <list.h>
+#include <ctype.h>
+#include <restart_counter.h>
#include <reslist.h>
#include <pthread.h>
#include <dirent.h>
@@ -230,43 +232,70 @@
int
-expand_time(char *val)
+expand_time (char *val)
{
- int l = strlen(val);
- char c = val[l - 1];
- int ret = atoi(val);
+ int curval, len;
+ int ret = 0;
+ char *start = val, ival[16];
- if (ret <= 0)
- return 0;
+ if (!val)
+ return (time_t)0;
+
+ while (start[0]) {
+
+ len = 0;
+ curval = 0;
+ memset(ival, 0, sizeof(ival));
+
+ while (isdigit(start[len])) {
+ ival[len] = start[len];
+ len++;
+ }
+
+ if (len) {
+ curval = atoi(ival);
+ } else {
+ len = 1;
+ }
- if ((c >= '0') && (c <= '9'))
- return ret;
+ switch(start[len]) {
+ case 0:
+ case 'S':
+ case 's':
+ break;
+ case 'M':
+ case 'm':
+ curval *= 60;
+ break;
+ case 'h':
+ case 'H':
+ curval *= 3600;
+ break;
+ case 'd':
+ case 'D':
+ curval *= 86400;
+ break;
+ case 'w':
+ case 'W':
+ curval *= 604800;
+ break;
+ case 'y':
+ case 'Y':
+ curval *= 31536000;
+ break;
+ default:
+ curval = 0;
+ }
- switch(c) {
- case 'S':
- case 's':
- return (ret);
- case 'M':
- case 'm':
- return (ret * 60);
- case 'h':
- case 'H':
- return (ret * 3600);
- case 'd':
- case 'D':
- return (ret * 86400);
- case 'w':
- case 'W':
- return (ret * 604800);
- case 'y':
- case 'Y':
- return (ret * 31536000);
+ ret += (time_t)curval;
+ start += len;
}
return ret;
}
+
/**
* Store a resource action
* @param actsp Action array; may be modified and returned!
--- cluster/rgmanager/src/daemons/restree.c 2007/08/30 16:09:39 1.37
+++ cluster/rgmanager/src/daemons/restree.c 2007/11/30 20:36:17 1.38
@@ -30,6 +30,7 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <list.h>
+#include <restart_counter.h>
#include <reslist.h>
#include <pthread.h>
#include <clulog.h>
@@ -432,6 +433,39 @@
}
+/*
+ * Give a freshly built resource node its restart counter, if the
+ * resource is configured for one.
+ *
+ * node->rn_restart_counter is always reset to NULL first.  A counter is
+ * then created only when all of the following hold:
+ *   - the node has no parent,
+ *   - the resource has a "max_restarts" attribute greater than 0,
+ *   - "restart_expire_time" is either absent (the counter is created
+ *     with an expiry of 0) or expands to a nonzero time.
+ *
+ * If restart_init() fails the counter simply stays NULL.
+ */
+static inline void
+assign_restart_policy(resource_t *curres, resource_node_t *parent,
+		      resource_node_t *node)
+{
+	char *val;
+	int max_restarts = 0;
+	time_t restart_expire_time = 0;
+
+	/* Check node before the first write through it */
+	if (!node)
+		return;
+
+	node->rn_restart_counter = NULL;
+
+	if (!curres)
+		return;
+	if (parent) /* Only nodes without a parent get one for now */
+		return;
+
+	val = res_attr_value(curres, "max_restarts");
+	if (!val)
+		return;
+	max_restarts = atoi(val);
+	if (max_restarts <= 0)
+		return;
+	val = res_attr_value(curres, "restart_expire_time");
+	if (val) {
+		restart_expire_time = (time_t)expand_time(val);
+		if (!restart_expire_time)
+			return;
+	}
+
+	node->rn_restart_counter = restart_init(restart_expire_time,
+						max_restarts);
+}
+
+
static inline int
do_load_resource(int ccsfd, char *base,
resource_rule_t *rule,
@@ -514,6 +548,7 @@
node->rn_state = RES_STOPPED;
node->rn_flags = 0;
node->rn_actions = (resource_act_t *)act_dup(curres->r_actions);
+ assign_restart_policy(curres, parent, node);
snprintf(tok, sizeof(tok), "%s/@__independent_subtree", base);
#ifndef NO_CCS
@@ -769,6 +804,11 @@
destroy_resource_tree(&(*tree)->rn_child);
list_remove(tree, node);
+
+ if (node->rn_restart_counter) {
+ restart_cleanup(node->rn_restart_counter);
+ }
+
if(node->rn_actions){
free(node->rn_actions);
}
--- cluster/rgmanager/src/daemons/rg_state.c 2007/08/30 16:09:39 1.40
+++ cluster/rgmanager/src/daemons/rg_state.c 2007/11/30 20:36:18 1.41
@@ -1350,7 +1350,8 @@
}
if ((svcStatus.rs_state != RG_STATE_STOPPING) &&
- (svcStatus.rs_state != RG_STATE_ERROR)) {
+ (svcStatus.rs_state != RG_STATE_ERROR) &&
+ (svcStatus.rs_state != RG_STATE_RECOVER)) {
rg_unlock(&lockp);
return 0;
}
@@ -1829,8 +1830,10 @@
* We got sent here from handle_start_req.
* We're DONE.
*/
- if (request == RG_START_RECOVER)
+ if (request == RG_START_RECOVER) {
+ _svc_stop_finish(svcName, 0, RG_STATE_STOPPED);
return RG_EFAIL;
+ }
/*
* All potential places for the service to start have been exhausted.
@@ -1839,7 +1842,7 @@
exhausted:
if (!rg_locked()) {
clulog(LOG_WARNING,
- "#70: Attempting to restart service %s locally.\n",
+ "#70: Failed to relocate %s; restarting locally\n",
svcName);
if (svc_start(svcName, RG_START_RECOVER) == 0) {
*new_owner = me;
@@ -2078,6 +2081,14 @@
new_owner);
}
+ /* Check restart counter/timer for this resource */
+ if (check_restart(svcName) > 0) {
+ clulog(LOG_NOTICE, "Restart threshold for %s exceeded; "
+ "attempting to relocate\n", svcName);
+ return handle_relocate_req(svcName, RG_START_RECOVER, -1,
+ new_owner);
+ }
+
return handle_start_req(svcName, RG_START_RECOVER, new_owner);
}
--- cluster/rgmanager/src/daemons/test.c 2007/07/31 18:02:49 1.12
+++ cluster/rgmanager/src/daemons/test.c 2007/11/30 20:36:18 1.13
@@ -25,6 +25,7 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <list.h>
+#include <restart_counter.h>
#include <reslist.h>
#include <pthread.h>
#include <depends.h>
next reply other threads:[~2007-11-30 20:36 UTC|newest]
Thread overview: 11+ messages / expand[flat|nested] mbox.gz Atom feed top
2007-11-30 20:36 lhh [this message]
-- strict thread matches above, loose matches on Subject: below --
2007-12-30 8:27 [Cluster-devel] cluster/rgmanager ChangeLog include/resgroup.h fabbione
2007-12-14 19:37 lhh
2007-06-27 14:03 lhh
2007-06-26 21:55 lhh
2007-06-14 15:06 mgrac
2007-06-14 13:36 mgrac
2007-04-27 18:10 lhh
2006-10-06 21:22 lhh
2006-09-01 19:02 lhh
2006-08-18 15:26 lhh
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20071130203619.18381.qmail@sourceware.org \
--to=lhh@sourceware.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.