From mboxrd@z Thu Jan 1 00:00:00 1970 From: lhh@sourceware.org Date: 31 May 2007 18:58:47 -0000 Subject: [Cluster-devel] cluster/rgmanager ChangeLog include/reslist.h ... Message-ID: <20070531185847.12042.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Branch: RHEL5 Changes by: lhh at sourceware.org 2007-05-31 18:58:46 Modified files: rgmanager : ChangeLog rgmanager/include: reslist.h rgmanager/src/daemons: groups.c resrules.c restree.c rgmanager/src/resources: script.sh Log message: Fix bugzilla #229650; implement __independent_subtree feature Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.31.2.9&r2=1.31.2.10 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/reslist.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.15.2.2&r2=1.15.2.3 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.25.2.5&r2=1.25.2.6 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/resrules.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.16.2.4&r2=1.16.2.5 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/restree.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.23.2.3&r2=1.23.2.4 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/script.sh.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.8&r2=1.8.2.1 --- cluster/rgmanager/ChangeLog 2007/05/31 18:38:44 1.31.2.9 +++ cluster/rgmanager/ChangeLog 2007/05/31 18:58:46 1.31.2.10 @@ -1,6 +1,8 @@ 2007-05-31 Lon Hohberger * src/daemons/resrules.c: Fix #234249 - ignore obvious backup files in /usr/share/cluster when processing resource rules + * src/daemons/restree.c, src/daemons/groups.c, include/reslist.h: + Implement independent subtrees, per bug #229650 2007-05-22 Lon Hohberger * src/resources/SAPInstance, SAPDatabase: Add primary attrs --- cluster/rgmanager/include/reslist.h 2007/03/23 00:06:34 1.15.2.2 +++ cluster/rgmanager/include/reslist.h 2007/05/31 18:58:46 1.15.2.3 @@ -35,6 +35,8 @@ #define RF_NEEDSTART (1<<2) /** Used when adding/changing resources */ #define RF_NEEDSTOP (1<<3) /** Used when deleting/changing resources */ #define RF_COMMON (1<<4) /** " */ +#define RF_INDEPENDENT (1<<5) /** Define this for a resource if it is + otherwise an independent subtree */ #define RES_STOPPED (0) #define RES_STARTED (1) @@ -56,10 +58,10 @@ typedef struct _resource_attribute { - int ra_flags; - /* XXX possible alignment problem on ia64 */ char *ra_name; char *ra_value; + int ra_flags; + int _pad_; } resource_attr_t; @@ -78,6 +80,7 @@ time_t ra_last; time_t ra_interval; int ra_depth; + int _pad_; } resource_act_t; --- cluster/rgmanager/src/daemons/groups.c 2007/05/10 16:23:43 1.25.2.5 +++ cluster/rgmanager/src/daemons/groups.c 2007/05/31 18:58:46 1.25.2.6 @@ -813,6 +813,7 @@ } pthread_rwlock_unlock(&resource_lock); +#if 0 /* Do NOT return error codes if we failed to stop for one of these reasons. It didn't start, either, so it's safe to assume that @@ -830,6 +831,7 @@ break; } } +#endif return ret; } --- cluster/rgmanager/src/daemons/resrules.c 2007/05/31 18:37:50 1.16.2.4 +++ cluster/rgmanager/src/daemons/resrules.c 2007/05/31 18:58:46 1.16.2.5 @@ -262,6 +262,7 @@ acts[0].ra_depth = depth; acts[0].ra_timeout = timeout; acts[0].ra_interval = interval; + acts[0].ra_last = 0; acts[1].ra_name = NULL; *actsp = acts; @@ -271,7 +272,7 @@ for (x = 0; acts[x].ra_name; x++) { if (!strcmp(acts[x].ra_name, name) && (depth == acts[x].ra_depth || depth == -1)) { - printf("Replacing action '%s' depth %d: ", + fprintf(stderr, "Replacing action '%s' depth %d: ", name, acts[x].ra_depth); if (timeout >= 0) { printf("timeout: %d->%d ", @@ -306,6 +307,7 @@ acts[x].ra_depth = depth; acts[x].ra_timeout = timeout; acts[x].ra_interval = interval; + acts[x].ra_last = 0; acts[x+1].ra_name = NULL; --- cluster/rgmanager/src/daemons/restree.c 2007/05/03 15:14:16 1.23.2.3 +++ cluster/rgmanager/src/daemons/restree.c 2007/05/31 18:58:46 1.23.2.4 @@ -39,6 +39,9 @@ void malloc_zap_mutex(void); #endif +#define FL_FAILURE 0x1 +#define FL_RECOVERABLE 0x2 + /* XXX from resrules.c */ int store_childtype(resource_child_t **childp, char *name, int start, @@ -507,6 +510,19 @@ node->rn_resource = curres; node->rn_state = RES_STOPPED; node->rn_actions = (resource_act_t *)act_dup(curres->r_actions); + + snprintf(tok, sizeof(tok), "%s/@__independent_subtree", base); +#ifndef NO_CCS + if (ccs_get(ccsfd, tok, &ref) == 0) { +#else + if (conf_get(tok, &ref) == 0) { +#endif + if (atoi(ref) > 0 || strcasecmp(ref, "yes") == 0) + node->rn_flags |= RF_INDEPENDENT; + free(ref); + } + + curres->r_refs++; *newnode = node; @@ -718,7 +734,6 @@ resource_rule_t **rulelist, resource_t **reslist) { - resource_rule_t *curr; resource_node_t *root = NULL; char tok[512]; @@ -777,6 +792,8 @@ printf("NEEDSTART "); if (node->rn_flags & RF_COMMON) printf("COMMON "); + if (node->rn_flags & RF_INDEPENDENT) + printf("INDEPENDENT "); printf("]"); } printf(" {\n"); @@ -838,10 +855,11 @@ #endif /* Do op on all children at our level */ - rv += _res_op(&node->rn_child, first, + rv |= _res_op(&node->rn_child, first, rule->rr_childtypes[x].rc_name, ret, op); - if (rv != 0 && op != RS_STOP) + + if (rv & FL_FAILURE && op != RS_STOP) return rv; } @@ -853,46 +871,6 @@ } -#if 0 -static inline int -_do_child_default_level(resource_node_t **tree, resource_t *first, - void *ret, int op) -{ - resource_node_t *node = *tree; - resource_t *res = node->rn_resource; - resource_rule_t *rule = res->r_rule; - int x, rv = 0, lev; - - for (x = 0; rule->rr_childtypes && - rule->rr_childtypes[x].rc_name; x++) { - - if(op == RS_STOP) - lev = rule->rr_childtypes[x].rc_stoplevel; - else - lev = rule->rr_childtypes[x].rc_startlevel; - - if (lev) - continue; - - /* - printf("%s children of %s type %s (default level)\n", - agent_op_str(op), - node->rn_resource->r_rule->rr_type, - rule->rr_childtypes[x].rc_name); - */ - - rv = _res_op(&node->rn_child, first, - rule->rr_childtypes[x].rc_name, - ret, op); - if (rv != 0) - return rv; - } - - return 0; -} -#endif - - static inline int _xx_child_internal(resource_node_t *node, resource_t *first, resource_node_t *child, void *ret, int op) @@ -926,13 +904,14 @@ if (op == RS_START || op == RS_STATUS) { list_for(&node->rn_child, child, y) { - rv = _xx_child_internal(node, first, child, ret, op); - if (rv) + rv |= _xx_child_internal(node, first, child, ret, op); + + if (rv & FL_FAILURE) return rv; } } else { list_for_rev(&node->rn_child, child, y) { - rv += _xx_child_internal(node, first, child, ret, op); + rv |= _xx_child_internal(node, first, child, ret, op); } } @@ -973,7 +952,7 @@ if (op == RS_START || op == RS_STATUS) { rv = _do_child_levels(tree, first, ret, op); - if (rv != 0) + if (rv & FL_FAILURE) return rv; /* Start default level after specified ones */ @@ -992,6 +971,22 @@ } +void +mark_nodes(resource_node_t *node, int state, int flags) +{ + int x; + resource_node_t *child; + + list_for(&node->rn_child, child, x) { + if (child->rn_child) + mark_nodes(child->rn_child, state, flags); + } + + node->rn_state = state; + node->rn_flags |= (RF_NEEDSTART | RF_NEEDSTOP); +} + + /** Do a status on a resource node. This takes into account the last time the status operation was run and selects the highest possible resource depth @@ -1123,130 +1118,6 @@ in the subtree). @see _res_op_by_level res_exec */ -#if 0 -int -_res_op(resource_node_t **tree, resource_t *first, - char *type, void * __attribute__((unused))ret, int realop) -{ - int rv, me; - resource_node_t *node; - int op; - - list_do(tree, node) { - - /* Restore default operation. */ - op = realop; - - /* If we're starting by type, do that funky thing. */ - if (type && strlen(type) && - strcmp(node->rn_resource->r_rule->rr_type, type)) - continue; - - /* If the resource is found, all nodes in the subtree must - have the operation performed as well. */ - me = !first || (node->rn_resource == first); - - /* - printf("begin %s: %s %s [0x%x]\n", agent_op_str(op), - node->rn_resource->r_rule->rr_type, - primary_attr_value(node->rn_resource), - node->rn_flags); - */ - - if (me) { - /* - If we've been marked as a node which - needs to be started or stopped, clear - that flag and start/stop this resource - and all resource babies. - - Otherwise, don't do anything; look for - children with RF_NEEDSTART and - RF_NEEDSTOP flags. - - CONDSTART and CONDSTOP are no-ops if - the appropriate flag is not set. - */ - if ((op == RS_CONDSTART) && - (node->rn_flags & RF_NEEDSTART)) { - /* - printf("Node %s:%s - CONDSTART\n", - node->rn_resource->r_rule->rr_type, - primary_attr_value(node->rn_resource)); - */ - op = RS_START; - } - - if ((op == RS_CONDSTOP) && - (node->rn_flags & RF_NEEDSTOP)) { - /* - printf("Node %s:%s - CONDSTOP\n", - node->rn_resource->r_rule->rr_type, - primary_attr_value(node->rn_resource)); - */ - op = RS_STOP; - } - } - - /* Start starts before children */ - if (me && (op == RS_START)) { - node->rn_flags &= ~RF_NEEDSTART; - - rv = res_exec(node, agent_op_str(op), NULL, 0); - if (rv != 0) { - node->rn_state = RES_FAILED; - return rv; - } - - set_time("start", 0, node); - clear_checks(node); - - if (node->rn_state != RES_STARTED) { - ++node->rn_resource->r_incarnations; - node->rn_state = RES_STARTED; - } - } - - if (node->rn_child) { - rv = _res_op_by_level(&node, me?NULL:first, ret, op); - if (rv != 0) - return rv; - } - - /* Stop/status/etc stops after children have stopped */ - if (me && (op == RS_STOP)) { - node->rn_flags &= ~RF_NEEDSTOP; - rv = res_exec(node, agent_op_str(op), NULL, 0); - - if (rv != 0) { - node->rn_state = RES_FAILED; - return rv; - } - - if (node->rn_state != RES_STOPPED) { - --node->rn_resource->r_incarnations; - node->rn_state = RES_STOPPED; - } - - } else if (me && (op == RS_STATUS)) { - - rv = do_status(node); - if (rv != 0) - return rv; - } - - /* - printf("end %s: %s %s\n", agent_op_str(op), - node->rn_resource->r_rule->rr_type, - primary_attr_value(node->rn_resource)); - */ - } while (!list_done(tree, node)); - - return 0; -} -#endif - - static inline int _res_op_internal(resource_node_t **tree, resource_t *first, char *type, void *__attribute__((unused))ret, int realop, @@ -1309,7 +1180,7 @@ rv = res_exec(node, agent_op_str(op), NULL, 0); if (rv != 0) { node->rn_state = RES_FAILED; - return rv; + return FL_FAILURE; } set_time("start", 0, node); @@ -1322,14 +1193,43 @@ } else if (me && (op == RS_STATUS)) { /* Check status before children*/ rv = do_status(node); - if (rv != 0) - return rv; + if (rv != 0) { + /* + If this node's status has failed, all of its + dependent children are failed, whether or not this + node is independent or not. + */ + mark_nodes(node, RES_FAILED, + RF_NEEDSTART | RF_NEEDSTOP); + + /* If we're an independent subtree, return a flag + stating that this section is recoverable apart + from siblings in the resource tree. All child + resources of this node must be restarted, + but siblings of this node are not affected. */ + if (node->rn_flags & RF_INDEPENDENT) + return FL_RECOVERABLE; + + return FL_FAILURE; + } + } if (node->rn_child) { rv = _res_op_by_level(&node, me?NULL:first, ret, op); - if (rv != 0) - return rv; + if (rv != 0) { + mark_nodes(node, RES_FAILED, + RF_NEEDSTART | RF_NEEDSTOP); + + /* If this node is independent of its siblings, + that one of its dependent children failed + does not matter: its dependent children must + also be independent of this node's siblings. */ + if (node->rn_flags & RF_INDEPENDENT) + return FL_RECOVERABLE; + + return FL_FAILURE; + } } /* Stop should occur after children have stopped */ @@ -1339,7 +1239,7 @@ if (rv != 0) { node->rn_state = RES_FAILED; - return rv; + return FL_FAILURE; } if (node->rn_state != RES_STOPPED) { @@ -1378,24 +1278,31 @@ char *type, void * __attribute__((unused))ret, int realop) { resource_node_t *node; - int count = 0, rv; + int count = 0, rv = 0; if (realop == RS_STOP) { list_for_rev(tree, node, count) { - rv = _res_op_internal(tree, first, type, ret, realop, - node); - if (rv != 0) - return rv; + rv |= _res_op_internal(tree, first, type, ret, realop, + node); } } else { list_for(tree, node, count) { - rv = _res_op_internal(tree, first, type, ret, realop, - node); - if (rv != 0) + rv |= _res_op_internal(tree, first, type, ret, realop, + node); + + /* If we hit a problem during a 'status' op in an + independent subtree, rv will have the + FL_RECOVERABLE bit set, but not FL_FAILURE. + If we ever hit FL_FAILURE during a status + operation, we're *DONE* - even if the subtree + is flagged w/ indy-subtree */ + + if (rv & FL_FAILURE) return rv; } } - return 0; + + return rv; } /** @@ -1464,7 +1371,30 @@ int res_status(resource_node_t **tree, resource_t *res, void *ret) { - return _res_op(tree, res, NULL, ret, RS_STATUS); + int rv; + rv = _res_op(tree, res, NULL, ret, RS_STATUS); + + if (rv & FL_FAILURE) + return rv; + + clulog(LOG_WARNING, "Some independent resources in %s:%s failed; " + "Attempting inline recovery\n", + res->r_rule->rr_type, res->r_attrs->ra_value); + + rv = res_condstop(tree, res, ret); + if (rv & FL_FAILURE) + goto out_fail; + rv = res_condstart(tree, res, ret); + if (rv & FL_FAILURE) + goto out_fail; + + clulog(LOG_NOTICE, "Inline recovery of %s:%s successful\n", + res->r_rule->rr_type, res->r_attrs->ra_value); + return 0; +out_fail: + clulog(LOG_WARNING, "Inline recovery of %s:%s failed\n", + res->r_rule->rr_type, res->r_attrs->ra_value); + return 1; } --- cluster/rgmanager/src/resources/script.sh 2006/08/18 15:26:23 1.8 +++ cluster/rgmanager/src/resources/script.sh 2007/05/31 18:58:46 1.8.2.1 @@ -115,5 +115,5 @@ declare -i rv=$? if [ $rv -ne 0 ]; then ocf_log err "script:$OCF_RESKEY_name: $1 of $OCF_RESKEY_file failed (returned $rv)" - return $OCF_ERR_GENERIC + exit $OCF_ERR_GENERIC fi