From mboxrd@z Thu Jan 1 00:00:00 1970 From: lhh@sourceware.org Date: 26 Nov 2007 21:46:29 -0000 Subject: [Cluster-devel] cluster/rgmanager ChangeLog include/reslist.h ... Message-ID: <20071126214629.1724.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Branch: RHEL5 Changes by: lhh at sourceware.org 2007-11-26 21:46:27 Modified files: rgmanager : ChangeLog rgmanager/include: reslist.h rgmanager/src/daemons: Makefile fo_domain.c groups.c main.c reslist.c resrules.c restree.c rg_state.c test.c rgmanager/src/resources: service.sh vm.sh Added files: rgmanager/include: restart_counter.h rgmanager/src/daemons: restart_counter.c Log message: Implement restart counters per #247139 Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.31.2.28&r2=1.31.2.29 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/restart_counter.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=NONE&r2=1.1.2.1 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/reslist.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.15.2.6&r2=1.15.2.7 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/restart_counter.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=NONE&r2=1.1.2.1 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/Makefile.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.14.2.3&r2=1.14.2.4 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/fo_domain.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.11&r2=1.11.2.1 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.25.2.12&r2=1.25.2.13 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/main.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.34.2.9&r2=1.34.2.10 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/reslist.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.14.2.4&r2=1.14.2.5 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/resrules.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.16.2.7&r2=1.16.2.8 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/restree.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.23.2.12&r2=1.23.2.13 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.24.2.13&r2=1.24.2.14 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/test.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.6.2.5&r2=1.6.2.6 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/service.sh.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.7.2.6&r2=1.7.2.7 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/vm.sh.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.1.2.8&r2=1.1.2.9 --- cluster/rgmanager/ChangeLog 2007/11/26 21:37:17 1.31.2.28 +++ cluster/rgmanager/ChangeLog 2007/11/26 21:46:26 1.31.2.29 @@ -1,3 +1,21 @@ +2007-11-26 Lon Hohberger + * include/reslist.h: Add restart counters to resource node structure + (intended for top-level resources, i.e. services, vms...) + * include/restart_counter.h: Add header file for restart counter + * src/daemons/Makefile: Fix build to include restart counters + * src/daemons/restart_counter.c: Implement restart counters #247139 + * src/daemons/fo_domain.c, groups.c, restart_counter.c, resrules.c, + restree.c, test.c: Glue for restart counters. + * src/daemons/reslist.c: Glue for restart counters. Make expand_time + parser more robust to allow things like '1h30m' as a time value. + * src/daemons/main.c: Mark quorum disk offline in the correct + place to avoid extraneous log messages + * src/daemons/rg_state.c: Allow marking service as stopped if + stuck in recover state. Make service which failed to start + go to stopped state. Glue for restart counters. + * src/resources/service.sh, vm.sh: Add parameters for restart + counters #247139 + 2007-11-14 Lon Hohberger * src/utils/clulog.c: Make clulog honor rgmanager log levels (#289501) --- cluster/rgmanager/include/reslist.h 2007/08/02 14:46:51 1.15.2.6 +++ cluster/rgmanager/include/reslist.h 2007/11/26 21:46:26 1.15.2.7 @@ -126,6 +126,7 @@ struct _rg_node *rn_child, *rn_parent; resource_t *rn_resource; resource_act_t *rn_actions; + restart_counter_t rn_restart_counter; int rn_state; /* State of this instance of rn_resource */ int rn_flags; int rn_last_status; --- cluster/rgmanager/src/daemons/Makefile 2007/07/24 13:53:08 1.14.2.3 +++ cluster/rgmanager/src/daemons/Makefile 2007/11/26 21:46:27 1.14.2.4 @@ -38,7 +38,8 @@ clurgmgrd: rg_thread.o rg_locks.o main.o groups.o \ rg_queue.o rg_forward.o reslist.o \ resrules.o restree.o fo_domain.o nodeevent.o \ - rg_event.o watchdog.o rg_state.o ../clulib/libclulib.a + rg_event.o watchdog.o rg_state.o \ + restart_counter.o ../clulib/libclulib.a $(CC) -o $@ $^ $(INCLUDE) $(CFLAGS) $(LDFLAGS) -lccs -lcman -lpthread -ldlm # @@ -56,7 +57,8 @@ # packages should run 'make check' as part of the build process. # rg_test: rg_locks-noccs.o test-noccs.o reslist-noccs.o \ - resrules-noccs.o restree-noccs.o fo_domain-noccs.o + resrules-noccs.o restree-noccs.o fo_domain-noccs.o \ + restart_counter.o $(CC) -o $@ $^ $(INCLUDE) $(CFLAGS) -llalloc $(LDFLAGS) -lccs -lcman clurmtabd: clurmtabd.o clurmtabd_lib.o --- cluster/rgmanager/src/daemons/fo_domain.c 2006/09/27 16:28:41 1.11 +++ cluster/rgmanager/src/daemons/fo_domain.c 2007/11/26 21:46:27 1.11.2.1 @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include --- cluster/rgmanager/src/daemons/groups.c 2007/08/02 14:46:51 1.25.2.12 +++ cluster/rgmanager/src/daemons/groups.c 2007/11/26 21:46:27 1.25.2.13 @@ -20,6 +20,7 @@ //#define DEBUG #include #include +#include #include #include #include @@ -178,6 +179,29 @@ } +resource_node_t * +node_by_ref(resource_node_t **tree, char *name) +{ + resource_t *res; + resource_node_t *node, *ret = NULL; + char rgname[64]; + int x; + + list_for(&_tree, node, x) { + + res = node->rn_resource; + res_build_name(rgname, sizeof(rgname), res); + + if (!strcasecmp(name, rgname)) { + ret = node; + break; + } + } + + return ret; +} + + int count_resource_groups_local(cman_node_t *mp) { @@ -1583,6 +1607,28 @@ } +int +check_restart(char *rg_name) +{ + resource_node_t *node; + int ret = 1; + + pthread_rwlock_rdlock(&resource_lock); + node = node_by_ref(&_tree, rg_name); + if (node) { + ret = restart_add(node->rn_restart_counter); + if (ret) { + /* Clear it out - caller is about + to relocate the service anyway */ + restart_clear(node->rn_restart_counter); + } + } + pthread_rwlock_unlock(&resource_lock); + + return ret; +} + + void kill_resource_groups(void) { --- cluster/rgmanager/src/daemons/main.c 2007/08/21 16:39:02 1.34.2.9 +++ cluster/rgmanager/src/daemons/main.c 2007/11/26 21:46:27 1.34.2.10 @@ -165,6 +165,7 @@ old_membership = member_list(); new_ml = get_member_list(h); + memb_mark_down(new_ml, 0); for (x = 0; x < new_ml->cml_count; x++) { @@ -181,19 +182,25 @@ quorate = cman_is_listening(h, new_ml->cml_members[x].cn_nodeid, port); + if (quorate == 0) { clulog(LOG_DEBUG, "Node %d is not listening\n", new_ml->cml_members[x].cn_nodeid); new_ml->cml_members[x].cn_member = 0; } else if (quorate < 0) { + if (errno == ENOTCONN) { + new_ml->cml_members[x].cn_member = 0; + break; + } perror("cman_is_listening"); usleep(50000); continue; } - #ifdef DEBUG - printf("Node %d IS listening\n", - new_ml->cml_members[x].cn_nodeid); + else { + printf("Node %d IS listening\n", + new_ml->cml_members[x].cn_nodeid); + } #endif break; } while(1); @@ -201,7 +208,6 @@ cman_finish(h); member_list_update(new_ml); - member_set_state(0, 0); /* Mark qdisk as dead */ /* * Handle nodes lost. Do our local node event first. --- cluster/rgmanager/src/daemons/reslist.c 2007/07/31 17:54:54 1.14.2.4 +++ cluster/rgmanager/src/daemons/reslist.c 2007/11/26 21:46:27 1.14.2.5 @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #ifndef NO_CCS --- cluster/rgmanager/src/daemons/resrules.c 2007/07/31 17:54:54 1.16.2.7 +++ cluster/rgmanager/src/daemons/resrules.c 2007/11/26 21:46:27 1.16.2.8 @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -218,43 +219,70 @@ int -expand_time(char *val) +expand_time (char *val) { - int l = strlen(val); - char c = val[l - 1]; - int ret = atoi(val); + int curval, len; + int ret = 0; + char *start = val, ival[16]; - if (ret <= 0) - return 0; + if (!val) + return (time_t)0; + + while (start[0]) { + + len = 0; + curval = 0; + memset(ival, 0, sizeof(ival)); + + while (isdigit(start[len])) { + ival[len] = start[len]; + len++; + } + + if (len) { + curval = atoi(ival); + } else { + len = 1; + } - if ((c >= '0') && (c <= '9')) - return ret; + switch(start[len]) { + case 0: + case 'S': + case 's': + break; + case 'M': + case 'm': + curval *= 60; + break; + case 'h': + case 'H': + curval *= 3600; + break; + case 'd': + case 'D': + curval *= 86400; + break; + case 'w': + case 'W': + curval *= 604800; + break; + case 'y': + case 'Y': + curval *= 31536000; + break; + default: + curval = 0; + } - switch(c) { - case 'S': - case 's': - return (ret); - case 'M': - case 'm': - return (ret * 60); - case 'h': - case 'H': - return (ret * 3600); - case 'd': - case 'D': - return (ret * 86400); - case 'w': - case 'W': - return (ret * 604800); - case 'y': - case 'Y': - return (ret * 31536000); + ret += (time_t)curval; + start += len; } return ret; } + /** * Store a resource action * @param actsp Action array; may be modified and returned! --- cluster/rgmanager/src/daemons/restree.c 2007/09/25 21:09:23 1.23.2.12 +++ cluster/rgmanager/src/daemons/restree.c 2007/11/26 21:46:27 1.23.2.13 @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -432,6 +433,39 @@ } +static inline void +assign_restart_policy(resource_t *curres, resource_node_t *parent, + resource_node_t *node) +{ + char *val; + int max_restarts = 0; + time_t restart_expire_time = 0; + + node->rn_restart_counter = NULL; + + if (!curres || !node) + return; + if (parent) /* Non-parents don't get one for now */ + return; + + val = res_attr_value(curres, "max_restarts"); + if (!val) + return; + max_restarts = atoi(val); + if (max_restarts <= 0) + return; + val = res_attr_value(curres, "restart_expire_time"); + if (val) { + restart_expire_time = (time_t)expand_time(val); + if (!restart_expire_time) + return; + } + + node->rn_restart_counter = restart_init(restart_expire_time, + max_restarts); +} + + static inline int do_load_resource(int ccsfd, char *base, resource_rule_t *rule, @@ -514,6 +548,7 @@ node->rn_state = RES_STOPPED; node->rn_flags = 0; node->rn_actions = (resource_act_t *)act_dup(curres->r_actions); + assign_restart_policy(curres, parent, node); snprintf(tok, sizeof(tok), "%s/@__independent_subtree", base); #ifndef NO_CCS @@ -768,6 +803,11 @@ destroy_resource_tree(&(*tree)->rn_child); list_remove(tree, node); + + if (node->rn_restart_counter) { + restart_cleanup(node->rn_restart_counter); + } + if(node->rn_actions){ free(node->rn_actions); } --- cluster/rgmanager/src/daemons/rg_state.c 2007/08/30 16:03:03 1.24.2.13 +++ cluster/rgmanager/src/daemons/rg_state.c 2007/11/26 21:46:27 1.24.2.14 @@ -1315,7 +1315,8 @@ } if ((svcStatus.rs_state != RG_STATE_STOPPING) && - (svcStatus.rs_state != RG_STATE_ERROR)) { + (svcStatus.rs_state != RG_STATE_ERROR) && + (svcStatus.rs_state != RG_STATE_RECOVER)) { rg_unlock(&lockp); return 0; } @@ -1721,8 +1722,10 @@ * We got sent here from handle_start_req. * We're DONE. */ - if (request == RG_START_RECOVER) + if (request == RG_START_RECOVER) { + _svc_stop_finish(svcName, 0, RG_STATE_STOPPED); return RG_EFAIL; + } /* * All potential places for the service to start have been exhausted. @@ -1731,7 +1734,7 @@ exhausted: if (!rg_locked()) { clulog(LOG_WARNING, - "#70: Attempting to restart service %s locally.\n", + "#70: Failed to relocate %s; restarting locally\n", svcName); if (svc_start(svcName, RG_START_RECOVER) == 0) { *new_owner = me; @@ -1969,6 +1972,14 @@ new_owner); } + /* Check restart counter/timer for this resource */ + if (check_restart(svcName) > 0) { + clulog(LOG_NOTICE, "Restart threshold for %s exceeded; " + "attempting to relocate\n", svcName); + return handle_relocate_req(svcName, RG_START_RECOVER, -1, + new_owner); + } + return handle_start_req(svcName, RG_START_RECOVER, new_owner); } --- cluster/rgmanager/src/daemons/test.c 2007/07/31 17:54:54 1.6.2.5 +++ cluster/rgmanager/src/daemons/test.c 2007/11/26 21:46:27 1.6.2.6 @@ -25,6 +25,7 @@ #include #include #include +#include #include #include --- cluster/rgmanager/src/resources/service.sh 2007/11/13 17:38:43 1.7.2.6 +++ cluster/rgmanager/src/resources/service.sh 2007/11/26 21:46:27 1.7.2.7 @@ -154,6 +154,32 @@ + + + + Maximum restarts for this service. + + + Maximum restarts for this service. + + + + + + + Restart expiration time + + + Restart expiration time. A restart is forgotten + after this time. When combined with the max_restarts + option, this lets administrators specify a threshold + for when to fail over services. If max_restarts + is exceeded in this given expiration time, the service + is relocated instead of restarted again. + + + + --- cluster/rgmanager/src/resources/vm.sh 2007/11/14 18:58:26 1.1.2.8 +++ cluster/rgmanager/src/resources/vm.sh 2007/11/26 21:46:27 1.1.2.9 @@ -184,6 +184,31 @@ + + + Maximum restarts for this service. + + + Maximum restarts for this service. + + + + + + + Restart expiration time + + + Restart expiration time. A restart is forgotten + after this time. When combined with the max_restarts + option, this lets administrators specify a threshold + for when to fail over services. If max_restarts + is exceeded in this given expiration time, the service + is relocated instead of restarted again. + + + +