From mboxrd@z Thu Jan 1 00:00:00 1970 From: lhh@sourceware.org Date: 27 Sep 2006 16:28:42 -0000 Subject: [Cluster-devel] cluster/rgmanager ChangeLog include/members.h ... Message-ID: <20060927162842.12961.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Changes by: lhh at sourceware.org 2006-09-27 16:28:41 Modified files: rgmanager : ChangeLog rgmanager/include: members.h reslist.h rgmanager/src/clulib: members.c rgmanager/src/daemons: fo_domain.c groups.c main.c nodeevent.c rg_state.c rgmanager/src/utils: clustat.c Log message: Fix various bugs, incl. 208011, 203762 Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&r1=1.24&r2=1.25 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/members.h.diff?cvsroot=cluster&r1=1.2&r2=1.3 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/reslist.h.diff?cvsroot=cluster&r1=1.14&r2=1.15 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/members.c.diff?cvsroot=cluster&r1=1.3&r2=1.4 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/fo_domain.c.diff?cvsroot=cluster&r1=1.10&r2=1.11 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&r1=1.22&r2=1.23 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/main.c.diff?cvsroot=cluster&r1=1.31&r2=1.32 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/nodeevent.c.diff?cvsroot=cluster&r1=1.3&r2=1.4 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&r1=1.21&r2=1.22 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/utils/clustat.c.diff?cvsroot=cluster&r1=1.20&r2=1.21 --- cluster/rgmanager/ChangeLog 2006/09/27 12:26:16 1.24 +++ cluster/rgmanager/ChangeLog 2006/09/27 16:28:41 1.25 
@@ -1,6 +1,29 @@ 2006-09-27 Lon Hohberger * src/daemons/rg_state.c: Fix #208011 - failed->disabled state - transition + transition. Fix node ID type. + * include/members.h: Add a third state to note that a node does + not need to be fenced as a result of rgmanager crashing. Add protos + for memb_set_state and member_online functions. + * include/reslist.h: Fix type error; node IDs are 32-bit ints, not + 64-bit ints. + * src/clulib/members.c: Add member_set_state/online functions for + quick checks and sets of the protected member list. Zero out + the node structure prior to calling cman_get_nodeid so that we + don't get ENOENT (fixed clustat bug where clustat didn't report + the "local" flag). Fix node ID type. + * src/daemons/fo_domain.c: Fix node ID type, fix shadow declaration + of member_online + * src/daemons/sgroups.c: Unfix logic error that wasn't broken in the + first place. + * src/daemons/main.c: Fix node ID types. Add fourth ("clean") + param to node_event* to decide whether a node death is clean or not. + Nodes get marked clean if we get an RG_EXITING message, otherwise, + they are unclean, and we wait for fencing. + * src/daemons/nodeevent.c: Add fourth param to node_event* to help + decide if we need to wait for a node to be fenced or not. Fix + node ID type. + * src/utils/clustat.c: Fix logic error preventing nodes from properly + being flagged. 
2006-09-01 Lon Hohberger * include/resgroup.h: Add proto for rg_strerror --- cluster/rgmanager/include/members.h 2006/07/12 14:04:06 1.2 +++ cluster/rgmanager/include/members.h 2006/09/27 16:28:41 1.3 @@ -3,10 +3,19 @@ #include +typedef enum { + NODE_STATE_DOWN = 0, + NODE_STATE_UP = 1, + NODE_STATE_CLEAN = 2 +} node_state_t; + + int get_my_nodeid(cman_handle_t h); int my_id(void); cluster_member_list_t * get_member_list(cman_handle_t h); void free_member_list(cluster_member_list_t *ml); +void member_set_state(int nodeid, int state); +int member_online(int nodeid); int memb_online(cluster_member_list_t *ml, int nodeid); int memb_online_name(cluster_member_list_t *ml, char *name); int memb_name_to_id(cluster_member_list_t *ml, char *name); --- cluster/rgmanager/include/reslist.h 2006/07/19 18:43:32 1.14 +++ cluster/rgmanager/include/reslist.h 2006/09/27 16:28:41 1.15 @@ -196,7 +196,7 @@ int construct_domains(int ccsfd, fod_t **domains); void deconstruct_domains(fod_t **domains); void print_domains(fod_t **domains); -int node_should_start(uint64_t nodeid, cluster_member_list_t *membership, +int node_should_start(int nodeid, cluster_member_list_t *membership, char *rg_name, fod_t **domains); --- cluster/rgmanager/src/clulib/members.c 2006/08/09 21:48:34 1.3 +++ cluster/rgmanager/src/clulib/members.c 2006/09/27 16:28:41 1.4 @@ -66,6 +66,7 @@ get_my_nodeid(cman_handle_t h) { cman_node_t node; + memset(&node,0,sizeof(node)); if (cman_get_node(h, CMAN_NODEID_US, &node) != 0) return -1; @@ -212,8 +213,51 @@ } +void +member_set_state(int nodeid, int state) +{ + int x = 0; + + pthread_rwlock_wrlock(&memblock); + if (!membership) { + pthread_rwlock_unlock(&memblock); + return; + } + + for (x = 0; x < membership->cml_count; x++) { + if (membership->cml_members[x].cn_nodeid == nodeid) + membership->cml_members[x].cn_member = state; + } + pthread_rwlock_unlock(&memblock); +} + + +int +member_online(int nodeid) +{ + int x = 0, ret = 0; + + pthread_rwlock_rdlock(&memblock); + if 
(!membership) { + pthread_rwlock_unlock(&memblock); + return 0; + } + + for (x = 0; x < membership->cml_count; x++) { + if (membership->cml_members[x].cn_nodeid == nodeid) { + ret = membership->cml_members[x].cn_member; + break; + } + } + pthread_rwlock_unlock(&memblock); + + return ret; +} + + + char * -member_name(uint64_t id, char *buf, int buflen) +member_name(int id, char *buf, int buflen) { char *n; --- cluster/rgmanager/src/daemons/fo_domain.c 2006/07/19 18:43:32 1.10 +++ cluster/rgmanager/src/daemons/fo_domain.c 2006/09/27 16:28:41 1.11 @@ -266,7 +266,7 @@ node_in_domain(char *nodename, fod_t *domain, cluster_member_list_t *membership) { - int member_online = 0, member_match = 0, preferred = 100, myprio = -1; + int online = 0, member_match = 0, preferred = 100, myprio = -1; fod_node_t *fodn; list_do(&domain->fd_nodes, fodn) { @@ -283,7 +283,7 @@ * If we get here, we know: * A member of the domain is online somewhere */ - member_online = 1; + online = 1; if (!strcmp(nodename, fodn->fdn_name)) { /* * If we get here, we know: @@ -297,7 +297,7 @@ preferred = fodn->fdn_prio; } while (!list_done(&domain->fd_nodes, fodn)); - if (!member_online) + if (!online) return 0; if (!member_match) @@ -322,7 +322,7 @@ * @return 0 on NO, 1 for YES */ int -node_should_start(uint64_t nodeid, cluster_member_list_t *membership, +node_should_start(int nodeid, cluster_member_list_t *membership, char *rg_name, fod_t **domains) { char *nodename = NULL; --- cluster/rgmanager/src/daemons/groups.c 2006/09/01 19:02:22 1.22 +++ cluster/rgmanager/src/daemons/groups.c 2006/09/27 16:28:41 1.23 @@ -273,7 +273,7 @@ * local start. 
*/ if (svcStatus->rs_state == RG_STATE_STARTED && - svcStatus->rs_owner != mp->cn_nodeid) + svcStatus->rs_owner == mp->cn_nodeid) return; if (svcStatus->rs_state == RG_STATE_DISABLED) --- cluster/rgmanager/src/daemons/main.c 2006/09/01 19:02:22 1.31 +++ cluster/rgmanager/src/daemons/main.c 2006/09/27 16:28:41 1.32 @@ -41,13 +41,13 @@ int configure_logging(int ccsfd, int debug); -void node_event(int, uint64_t, int); -void node_event_q(int, uint64_t, int); +void node_event(int, int, int, int); +void node_event_q(int, int, int, int); int daemon_init(char *); int init_resource_groups(int); void kill_resource_groups(void); void set_my_id(int); -int eval_groups(int, uint64_t, int); +int eval_groups(int, int, int); void flag_shutdown(int sig); void hard_exit(void); int send_rg_states(msgctx_t *, int); @@ -60,7 +60,7 @@ static int signalled = 0; static int port = RG_PORT; -uint64_t next_node_id(cluster_member_list_t *membership, uint64_t me); +int next_node_id(cluster_member_list_t *membership, int me); int rg_event_q(char *svcName, uint32_t state, int owner); @@ -190,7 +190,7 @@ clulog(LOG_INFO, "State change: LOCAL OFFLINE\n"); if (node_delta) free_member_list(node_delta); - node_event(1, my_id(), 0); + node_event(1, my_id(), 0, 0); /* NOTREACHED */ } @@ -202,9 +202,9 @@ locked. 
This is just a performance thing */ if (!rg_locked()) { node_event_q(0, node_delta->cml_members[x].cn_nodeid, - 0); + 0, 0); } else { - clulog(LOG_NOTICE, "Not taking action - services" + clulog(LOG_DEBUG, "Not taking action - services" " locked\n"); } } @@ -219,7 +219,7 @@ me = memb_online(node_delta, my_id()); if (me) { clulog(LOG_INFO, "State change: Local UP\n"); - node_event_q(1, my_id(), 1); + node_event_q(1, my_id(), 1, 1); } for (x=0; node_delta && x < node_delta->cml_count; x++) { @@ -232,7 +232,7 @@ clulog(LOG_INFO, "State change: %s UP\n", node_delta->cml_members[x].cn_name); - node_event_q(0, node_delta->cml_members[x].cn_nodeid, 1); + node_event_q(0, node_delta->cml_members[x].cn_nodeid, 1, 1); } free_member_list(node_delta); @@ -490,7 +490,13 @@ break; case RG_EXITING: - clulog(LOG_NOTICE, "Member %d is going offline\n", (int)nodeid); + if (!member_online(msg_hdr->gh_arg1)) + break; + + clulog(LOG_NOTICE, "Member %d shutting down\n", + msg_hdr->gh_arg1); + member_set_state(msg_hdr->gh_arg1, 0); + node_event_q(0, msg_hdr->gh_arg1, 0, 1); break; case VF_MESSAGE: --- cluster/rgmanager/src/daemons/nodeevent.c 2006/07/11 23:52:41 1.3 +++ cluster/rgmanager/src/daemons/nodeevent.c 2006/09/27 16:28:41 1.4 @@ -27,8 +27,9 @@ typedef struct __ne_q { list_head(); int ne_local; - uint64_t ne_nodeid; + int ne_nodeid; int ne_state; + int ne_clean; } nevent_t; /** @@ -37,7 +38,7 @@ static nevent_t *event_queue = NULL; static pthread_mutex_t ne_queue_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_t ne_thread = 0; -int ne_queue_request(int local, uint64_t nodeid, int state); +int ne_queue_request(int local, int nodeid, int state); void hard_exit(void); int init_resource_groups(int); @@ -59,7 +60,7 @@ @see eval_groups */ void -node_event(int local, uint64_t nodeID, int nodeStatus) +node_event(int local, int nodeID, int nodeStatus, int clean) { if (!running) return; @@ -136,7 +137,7 @@ { cman_handle_t ch; int fenced = 0; - uint64_t fence_time; + int fence_time; ch = 
cman_init(NULL); if (cman_get_fenceinfo(ch, nodeid, &fence_time, &fenced, NULL) < 0) @@ -163,7 +164,8 @@ break; /* We're outta here */ pthread_mutex_unlock(&ne_queue_mutex); - if (ev->ne_state == 0 && node_has_fencing(ev->ne_nodeid)) { + if (ev->ne_state == 0 && !ev->ne_clean && + node_has_fencing(ev->ne_nodeid)) { notice = 0; while (!node_fenced(ev->ne_nodeid)) { if (!notice) { @@ -179,7 +181,8 @@ "continuing\n", ev->ne_nodeid); } - node_event(ev->ne_local, ev->ne_nodeid, ev->ne_state); + node_event(ev->ne_local, ev->ne_nodeid, ev->ne_state, + ev->ne_clean); free(ev); } @@ -192,7 +195,7 @@ void -node_event_q(int local, uint64_t nodeID, int state) +node_event_q(int local, int nodeID, int state, int clean) { nevent_t *ev; pthread_attr_t attrs; @@ -210,6 +213,7 @@ ev->ne_state = state; ev->ne_local = local; ev->ne_nodeid = nodeID; + ev->ne_clean = clean; pthread_mutex_lock (&ne_queue_mutex); list_insert(&event_queue, ev); --- cluster/rgmanager/src/daemons/rg_state.c 2006/09/27 12:26:17 1.21 +++ cluster/rgmanager/src/daemons/rg_state.c 2006/09/27 16:28:41 1.22 @@ -38,7 +38,7 @@ int node_should_start_safe(uint32_t, cluster_member_list_t *, char *); -uint32_t next_node_id(cluster_member_list_t *membership, uint32_t me); +int next_node_id(cluster_member_list_t *membership, int me); int rg_exec_script(char *rgname, char *script, char *action); static int _svc_stop_finish(char *svcName, int failed, uint32_t newstate); @@ -50,11 +50,11 @@ int group_migratory(char *servicename); -uint32_t -next_node_id(cluster_member_list_t *membership, uint32_t me) +int +next_node_id(cluster_member_list_t *membership, int me) { - uint32_t low = (uint32_t)(-1); - uint32_t next = me, curr; + int low = (int)(-1); + int next = me, curr; int x; for (x = 0; x < membership->cml_count; x++) { --- cluster/rgmanager/src/utils/clustat.c 2006/09/01 19:02:22 1.20 +++ cluster/rgmanager/src/utils/clustat.c 2006/09/27 16:28:41 1.21 @@ -243,7 +243,7 @@ m = memb_name_to_p(these, all->cml_members[x].cn_name); 
- if (m) { + if (m && m->cn_member) { all->cml_members[x].cn_nodeid = m->cn_nodeid; all->cml_members[x].cn_member |= flag; } @@ -299,7 +299,7 @@ char * -my_memb_id_to_name(cluster_member_list_t *members, uint64_t memb_id) +my_memb_id_to_name(cluster_member_list_t *members, int memb_id) { int x; @@ -637,6 +637,7 @@ /* Grab the local node ID and flag it from the list of reported online nodes */ *lid = get_my_nodeid(ch); + /* */ for (x=0; x<all->cml_count; x++) { if (all->cml_members[x].cn_nodeid == *lid) { m = &all->cml_members[x]; @@ -677,7 +678,7 @@ int local_node_id; int fast = 0; int runtype = 0; - cman_handle_t ch; + cman_handle_t ch = NULL; int refresh_sec = 0, errors = 0; int opt, xml = 0, flags = 0;