From mboxrd@z Thu Jan 1 00:00:00 1970 From: lhh@sourceware.org Date: 1 Feb 2008 15:15:04 -0000 Subject: [Cluster-devel] cluster/rgmanager include/resgroup.h include/r ... Message-ID: <20080201151504.6728.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Branch: RHEL5 Changes by: lhh at sourceware.org 2008-02-01 15:15:03 Modified files: rgmanager/include: resgroup.h restart_counter.h rgmanager/src/daemons: groups.c restart_counter.c rg_state.c slang_event.c rgmanager/src/resources: default_event_script.sl Log message: Allow restart counters to work with central_processing; #400211 / #431130 Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.15.2.9&r2=1.15.2.10 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/restart_counter.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.1.2.2&r2=1.1.2.3 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.25.2.14&r2=1.25.2.15 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/restart_counter.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.1.2.1&r2=1.1.2.2 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.24.2.17&r2=1.24.2.18 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/slang_event.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.3.2.1&r2=1.3.2.2 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/default_event_script.sl.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.1.2.3&r2=1.1.2.4 --- cluster/rgmanager/include/resgroup.h 2007/12/18 17:52:56 1.15.2.9 +++ cluster/rgmanager/include/resgroup.h 2008/02/01 15:15:02 1.15.2.10 @@ -135,6 +135,7 @@ int svc_fail(char *svcName); int 
svc_migrate(char *svcName, int target); int check_restart(char *svcName); +int add_restart(char *svcName); int rt_enqueue_request(const char *resgroupname, int request, msgctx_t *resp_ctx, --- cluster/rgmanager/include/restart_counter.h 2007/12/18 17:52:56 1.1.2.2 +++ cluster/rgmanager/include/restart_counter.h 2008/02/01 15:15:02 1.1.2.3 @@ -25,6 +25,7 @@ int restart_add(restart_counter_t arg); int restart_clear(restart_counter_t arg); int restart_count(restart_counter_t arg); +int restart_threshold_exceeded(restart_counter_t arg); restart_counter_t restart_init(time_t expire_timeout, int max_restarts); int restart_cleanup(restart_counter_t arg); --- cluster/rgmanager/src/daemons/groups.c 2007/12/18 17:52:56 1.25.2.14 +++ cluster/rgmanager/src/daemons/groups.c 2008/02/01 15:15:02 1.25.2.15 @@ -1787,7 +1787,7 @@ int -check_restart(char *rg_name) +add_restart(char *rg_name) { resource_node_t *node; int ret = 1; @@ -1796,11 +1796,24 @@ node = node_by_ref(&_tree, rg_name); if (node) { ret = restart_add(node->rn_restart_counter); - if (ret) { - /* Clear it out - caller is about - to relocate the service anyway */ - restart_clear(node->rn_restart_counter); - } + } + pthread_rwlock_unlock(&resource_lock); + + return ret; +} + + +int +check_restart(char *rg_name) +{ + resource_node_t *node; + int ret = 0; + + pthread_rwlock_rdlock(&resource_lock); + node = node_by_ref(&_tree, rg_name); + if (node) { + printf("%s %p\n", rg_name, node->rn_restart_counter); + ret = restart_threshold_exceeded(node->rn_restart_counter); } pthread_rwlock_unlock(&resource_lock); --- cluster/rgmanager/src/daemons/restart_counter.c 2007/11/26 21:46:27 1.1.2.1 +++ cluster/rgmanager/src/daemons/restart_counter.c 2008/02/01 15:15:02 1.1.2.2 @@ -46,6 +46,10 @@ #define VALIDATE(arg, ret) \ do { \ + if (!arg) {\ + errno = EINVAL; \ + return ret; \ + } \ if (((restart_info_t *)arg)->magic != RESTART_INFO_MAGIC) {\ errno = EINVAL; \ return ret; \ @@ -97,6 +101,21 @@ } +int
+restart_threshold_exceeded(restart_counter_t arg) +{ + restart_info_t *restarts = (restart_info_t *)arg; + time_t now; + + VALIDATE(arg, -1); + now = time(NULL); + restart_timer_purge(arg, now); + if (restarts->restart_count >= restarts->max_restarts) + return 1; + return 0; +} + + /* Add a restart entry to the list. Returns 1 if restart count is exceeded */ int @@ -127,7 +146,7 @@ /* Check and remove old entries */ restart_timer_purge(restarts, t); - if (restarts->restart_count > restarts->max_restarts) + if (restarts->restart_count >= restarts->max_restarts) return 1; return 0; @@ -170,6 +189,7 @@ info->expire_timeout = expire_timeout; info->max_restarts = max_restarts; info->restart_count = 0; + info->restart_nodes = NULL; return (void *)info; } --- cluster/rgmanager/src/daemons/rg_state.c 2008/01/25 18:09:24 1.24.2.17 +++ cluster/rgmanager/src/daemons/rg_state.c 2008/02/01 15:15:02 1.24.2.18 @@ -678,7 +678,6 @@ clulog(LOG_NOTICE, "Recovering failed service %s\n", svcName); - svcStatus->rs_state = RG_STATE_STOPPED; /* Start! 
*/ ret = 1; break; @@ -789,13 +788,16 @@ /* LOCK HELD if we get here */ svcStatus.rs_owner = my_id(); - svcStatus.rs_state = RG_STATE_STARTING; svcStatus.rs_transition = (uint64_t)time(NULL); - if (req == RG_START_RECOVER) + if (svcStatus.rs_state == RG_STATE_RECOVER) { + add_restart(svcName); svcStatus.rs_restarts++; - else + } else { svcStatus.rs_restarts = 0; + } + + svcStatus.rs_state = RG_STATE_STARTING; if (set_rg_state(svcName, &svcStatus) < 0) { clulog(LOG_ERR, @@ -1248,7 +1250,7 @@ { struct dlm_lksb lockp; rg_state_t svcStatus; - int ret; + int ret = 0; int old_state; if (!rg_quorate()) { @@ -1291,6 +1293,18 @@ old_state = svcStatus.rs_state; + if (old_state == RG_STATE_RECOVER) { + clulog(LOG_DEBUG, "%s is clean; skipping double-stop\n", + svcName); + svcStatus.rs_state = newstate; + + if (set_rg_state(svcName, &svcStatus) != 0) { + rg_unlock(&lockp); + clulog(LOG_ERR, "#52: Failed changing RG status\n"); + return RG_EFAIL; + } + } + clulog(LOG_NOTICE, "Stopping service %s\n", svcName); if (recover) --- cluster/rgmanager/src/daemons/slang_event.c 2007/12/18 17:52:56 1.3.2.1 +++ cluster/rgmanager/src/daemons/slang_event.c 2008/02/01 15:15:02 1.3.2.2 @@ -80,6 +80,7 @@ _node_clean = 0, _service_owner = 0, _service_last_owner = 0, + _service_restarts_exceeded = 0, _user_request = 0, _user_arg1 = 0, _user_arg2 = 0, @@ -123,6 +124,8 @@ MAKE_VARIABLE("service_owner", &_service_owner,SLANG_INT_TYPE, 1), MAKE_VARIABLE("service_last_owner", &_service_last_owner, SLANG_INT_TYPE, 1), + MAKE_VARIABLE("service_restarts_exceeded", &_service_restarts_exceeded, + SLANG_INT_TYPE, 1), /* User event information */ MAKE_VARIABLE("user_request", &_user_request, SLANG_INT_TYPE,1), @@ -226,6 +229,7 @@ sl_service_status(char *svcName) { rg_state_t svcStatus; + int restarts_exceeded = 0; char *state_str; if (get_service_state_internal(svcName, &svcStatus) < 0) { @@ -236,6 +240,15 @@ return; } + restarts_exceeded = check_restart(svcName); + if (SLang_push_integer(restarts_exceeded) 
< 0) { + SLang_verror(SL_RunTime_Error, + "%s: Failed to push restarts_exceeded %s", + __FUNCTION__, + svcName); + return; + } + if (SLang_push_integer(svcStatus.rs_restarts) < 0) { SLang_verror(SL_RunTime_Error, "%s: Failed to push restarts for %s", @@ -1085,6 +1098,7 @@ _service_state = (char *)rg_state_str(state); _service_owner = owner; _service_last_owner = last_owner; + _service_restarts_exceeded = check_restart(name); switch(state) { case RG_STATE_DISABLED: @@ -1102,6 +1116,7 @@ _service_state = 0; _service_owner = 0; _service_last_owner = 0; + _service_restarts_exceeded = 0; return ret; } --- cluster/rgmanager/src/resources/default_event_script.sl 2007/12/19 21:33:26 1.1.2.3 +++ cluster/rgmanager/src/resources/default_event_script.sl 2008/02/01 15:15:03 1.1.2.4 @@ -154,7 +154,8 @@ debug("Recovering", " Service: ", service_name, " Last owner: ", service_last_owner, - " Policy: ", policy); + " Policy: ", policy, + " RTE: ", service_restarts_exceeded); if (policy == "disable") { () = service_stop(service_name, 1); @@ -162,13 +163,17 @@ } nodes = allowed_nodes(service_name); - if (policy == "restart") { - tmp = union(service_last_owner, nodes); + if (policy == "restart" and service_restarts_exceeded == 0) { + nodes = union(service_last_owner, nodes); } else { % relocate tmp = subtract(nodes, service_last_owner); - nodes = tmp; - tmp = union(nodes, service_last_owner); + if (length(tmp) == 0) { + () = service_stop(service_name,0); + return; + } + + nodes = union(tmp, service_last_owner); } ()=move_or_start(service_name, nodes);