From mboxrd@z Thu Jan 1 00:00:00 1970 From: lhh@sourceware.org Date: 29 Jan 2007 19:55:07 -0000 Subject: [Cluster-devel] cluster/fence/fenced fd.h main.c recover.c Message-ID: <20070129195507.15296.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Branch: RHEL4 Changes by: lhh at sourceware.org 2007-01-29 19:55:06 Modified files: fence/fenced : fd.h main.c recover.c Log message: Add manual override for fenced to RHEL4 branch; patch is a backport from HEAD branch; fixes 223060 Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/fence/fenced/fd.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.7.2.5&r2=1.7.2.6 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/fence/fenced/main.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.16.2.11&r2=1.16.2.12 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/fence/fenced/recover.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.10.2.8&r2=1.10.2.9 --- cluster/fence/fenced/fd.h 2006/12/20 18:14:29 1.7.2.5 +++ cluster/fence/fenced/fd.h 2007/01/29 19:55:06 1.7.2.6 @@ -52,6 +52,7 @@ #define FENCED_SOCK_PATH "fenced_socket" +#define DEFAULT_OVERRIDE_PATH "/var/run/cluster/fenced_override" #define DEFAULT_POST_JOIN_DELAY 6 #define DEFAULT_POST_FAIL_DELAY 0 @@ -129,10 +130,12 @@ int debug; int post_join_delay; int post_fail_delay; + char *override_path; int8_t clean_start; int8_t post_join_delay_opt; int8_t post_fail_delay_opt; int8_t clean_start_opt; + int8_t override_path_opt; }; #define FDFL_RUN (0) --- cluster/fence/fenced/main.c 2005/12/20 16:03:58 1.16.2.11 +++ cluster/fence/fenced/main.c 2007/01/29 19:55:06 1.16.2.12 @@ -23,7 +23,7 @@ char our_name[MAX_CLUSTER_MEMBER_NAME_LEN+1]; -#define OPTION_STRING ("cj:f:t:Dn:hVSwQ") +#define OPTION_STRING ("cj:f:t:Dn:O:hVSwQ") #define LOCKFILE_NAME "/var/run/fenced.pid" @@ -40,6 +40,8 @@ DEFAULT_POST_JOIN_DELAY); printf(" -f Post-fail fencing delay (default %d)\n", DEFAULT_POST_FAIL_DELAY); + printf(" -O Override path (default %s)\n", + DEFAULT_OVERRIDE_PATH); printf(" -D Enable debugging code and don't fork\n"); printf(" -h Print this help, then exit\n"); printf(" -n Name of the fence domain, \"default\" if none\n"); @@ -434,6 +436,23 @@ free(str); } + if (fd->comline->override_path_opt == FALSE) { + str = NULL; + memset(path, 0, 256); + sprintf(path, "/cluster/fence_daemon/@override_path"); + + error = ccs_get(cd, path, &str); + if (!error) + /* XXX These are not explicitly freed on exit; if + we decide to make fenced handle SIGHUP at a later + time, we will need to free this. */ + fd->comline->override_path = strdup(str); + else + fd->comline->override_path = strdup(DEFAULT_OVERRIDE_PATH); + if (str) + free(str); + } + log_debug("delay post_join %ds post_fail %ds", fd->comline->post_join_delay, fd->comline->post_fail_delay); @@ -527,6 +546,8 @@ int cont = TRUE; int optchar; + comline->override_path_opt = FALSE; + comline->override_path = NULL; comline->post_join_delay_opt = FALSE; comline->post_fail_delay_opt = FALSE; comline->clean_start_opt = FALSE; @@ -551,6 +572,11 @@ comline->post_fail_delay_opt = TRUE; break; + case 'O': + comline->override_path = strdup(optarg); + comline->override_path_opt = TRUE; + break; + case 'D': comline->debug = TRUE; fenced_debug = TRUE; --- cluster/fence/fenced/recover.c 2006/12/20 18:14:29 1.10.2.8 +++ cluster/fence/fenced/recover.c 2007/01/29 19:55:06 1.10.2.9 @@ -12,6 +12,9 @@ ******************************************************************************/ #include "fd.h" +#include +#include +#include /* Fencing recovery algorithm @@ -358,6 +361,79 @@ return num_victims; } +static inline void close_override(int *fd, char *path) +{ + unlink(path); + if (fd && *fd >= 0) + close(*fd); + *fd = -1; +} + +static int open_override(char *path) +{ + int ret; + mode_t om; + + om = umask(077); + ret = mkfifo(path, (S_IRUSR | S_IWUSR)); + umask(om); + + if (ret < 0) + return -1; + return open(path, O_RDONLY | O_NONBLOCK); +} + +static int check_override(int ofd, char *nodename, int timeout) +{ + char buf[128]; + fd_set rfds; + struct timeval tv = {0, 0}; + int ret, x; + + if (ofd < 0 || !nodename || !strlen(nodename)) { + sleep(timeout); + return 0; + } + + FD_ZERO(&rfds); + FD_SET(ofd, &rfds); + tv.tv_usec = 0; + tv.tv_sec = timeout; + + ret = select(ofd + 1, &rfds, NULL, NULL, &tv); + if (ret < 0) { + syslog(LOG_ERR, "select: %s\n", strerror(errno)); + return -1; + } + + if (ret == 0) + return 0; + + memset(buf, 0, sizeof(buf)); + ret = read(ofd, buf, sizeof(buf) - 1); + if (ret < 0) { + syslog(LOG_ERR, "read: %s\n", strerror(errno)); + return -1; + } + + /* chop off control characters */ + for (x = 0; x < ret; x++) { + if (buf[x] < 0x20) { + buf[x] = 0; + break; + } + } + + if (!strcasecmp(nodename, buf)) { + /* Case insensitive, but not as nice as, say, name_equal + in the other file... */ + return 1; + } + + return 0; +} + + /* If there are victims after a node has joined, it's a good indication that they may be joining the cluster shortly. If we delay a bit they might become members and we can avoid fencing them. This is only really an issue @@ -428,7 +504,7 @@ fd_node_t *node; char *master_name; uint32_t master; - int error; + int error, override = -1; master = find_master_nodeid(fd, &master_name); @@ -462,7 +538,22 @@ list_del(&node->list); free(node); } - sleep(5); + + if (!fd->comline->override_path) { + sleep(5); + continue; + } + + /* Check for manual intervention */ + override = open_override(fd->comline->override_path); + if (check_override(override, node->name, 5) > 0) { + syslog(LOG_WARNING, "fence \"%s\" overridden by " + "administrator intervention", node->name); + + list_del(&node->list); + free(node); + } + close_override(&override, fd->comline->override_path); } }