From mboxrd@z Thu Jan 1 00:00:00 1970 From: pcaulfield@sourceware.org Date: 3 Nov 2006 15:07:53 -0000 Subject: [Cluster-devel] cluster/cman cman_tool/main.c daemon/cnxman-pr ... Message-ID: <20061103150753.2922.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Changes by: pcaulfield at sourceware.org 2006-11-03 15:07:53 Modified files: cman/cman_tool : main.c cman/daemon : cnxman-private.h commands.c Log message: fix bz#213747. Basically we don't let a node join a cluster that already has "Disallowed" nodes in it as we don't consistently know the state of the cluster in that case (it could be two inquorate halves for example). Sorry, Steven, this is yet another instance where cman has to exit() the aisexec process for the greater good of the cluster. I've also enhanced "cman_tool nodes" to show the disallowed nodes and a warning message that the cluster is in a bit of a mess. 
Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/cman_tool/main.c.diff?cvsroot=cluster&r1=1.50&r2=1.51 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/daemon/cnxman-private.h.diff?cvsroot=cluster&r1=1.25&r2=1.26 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/daemon/commands.c.diff?cvsroot=cluster&r1=1.53&r2=1.54 --- cluster/cman/cman_tool/main.c 2006/10/09 15:54:31 1.50 +++ cluster/cman/cman_tool/main.c 2006/11/03 15:07:51 1.51 @@ -302,6 +302,8 @@ int count; int i; int numnodes; + int dis_count; + cman_node_t *dis_nodes; cman_node_t *nodes; struct tm *jtime; struct tm *ftime; @@ -322,11 +324,46 @@ if (cman_get_nodes(h, count, &numnodes, nodes) < 0) die("cman_get_nodes failed: %s", cman_error(errno)); + + /* Get Disallowed nodes, so we can show them as such */ + dis_nodes = malloc(sizeof(cman_node_t) * count); + + if (cman_get_disallowed_nodes(h, count, &dis_count, dis_nodes) == 0) { + int i,j; + for (i=0; ifence_opt) { --- cluster/cman/daemon/cnxman-private.h 2006/10/05 07:48:33 1.25 +++ cluster/cman/daemon/cnxman-private.h 2006/11/03 15:07:52 1.26 @@ -143,12 +143,15 @@ #define RECONFIG_PARAM_CONFIG_VERSION 3 #define RECONFIG_PARAM_CCS 4 -/* NODE_FLAGS_BEENDOWN - this node has been down. - NODE_FLAGS_FENCED - This node has been fenced since it last went down. +/* NODE_FLAGS_BEENDOWN - This node has been down. + NODE_FLAGS_FENCED - This node has been fenced since it last went down. + NODE_FLAGS_FENCEDWHILEUP - This node was fenced manually (probably). 
+ NODE_FLAGS_SEESDISALLOWED - Only set in a transition message */ #define NODE_FLAGS_BEENDOWN 1 #define NODE_FLAGS_FENCED 2 #define NODE_FLAGS_FENCEDWHILEUP 4 +#define NODE_FLAGS_SEESDISALLOWED 8 /* There's one of these for each node in the cluster */ struct cluster_node { --- cluster/cman/daemon/commands.c 2006/10/16 14:10:21 1.53 +++ cluster/cman/daemon/commands.c 2006/11/03 15:07:52 1.54 @@ -131,6 +131,18 @@ return ((node->port_bits[byte] & (1<state == NODESTATE_AISONLY) + return 1; + } + + return 0; +} + /* If "cluster_is_quorate" is 0 then all activity apart from protected ports is * blocked. */ static void set_quorate(int total_votes) @@ -1532,6 +1544,9 @@ len += 1; } + if (have_disallowed()) + msg->flags |= NODE_FLAGS_SEESDISALLOWED; + comms_send_message(msg, len, 0,0, 0, /* multicast */ @@ -1676,6 +1691,16 @@ P_MEMB("Transition message from %d does not match current config - should quit ?\n", nodeid); return; // PJC ??? } + + /* If the remote node can see AISONLY nodes then we can't join as we don't + know the full state */ + if (msg->flags & NODE_FLAGS_SEESDISALLOWED && !have_disallowed()) { + /* Must use syslog directly here or the message will never arrive */ + syslog(LOG_CRIT, "CMAN: Joined a cluster with disallowed nodes. must die"); + exit(2); + } + msg->flags &= ~NODE_FLAGS_SEESDISALLOWED; + node = find_node_by_nodeid(nodeid); assert(node); @@ -1703,6 +1728,12 @@ add_ais_node(nodeid, incarnation, num_ais_nodes); } + /* If the cluster already has some AISONLY nodes then we can't make + sense of the membership. So the new node has to also be AISONLY + until we are consistent again */ + if (have_disallowed() && !node->us) + node->state = NODESTATE_AISONLY; + node->flags = msg->flags; /* This will clear the BEENDOWN flag of course */ if (node->fence_agent && msg->fence_agent[0] && strcmp(node->fence_agent, msg->fence_agent)) {