From mboxrd@z Thu Jan  1 00:00:00 1970
From: lhh@sourceware.org <lhh@sourceware.org>
Date: 22 Jan 2007 22:50:16 -0000
Subject: [Cluster-devel] cluster/cman qdisk/score.h qdisk/score.c qdisk ...
Message-ID: <20070122225016.5775.qmail@sourceware.org>
List-Id: <cluster-devel.redhat.com>
To: cluster-devel.redhat.com
MIME-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL5
Changes by:	lhh at sourceware.org	2007-01-22 22:50:12

Modified files:
	cman/qdisk     : score.h score.c Makefile main.c disk.h clulog.c 
	cman/man       : qdisk.5 mkqdisk.8 
Added files:
	cman/qdisk     : daemon_init.c 

Log message:
	Resolves bugzillas: #213533, #216092, #220211, #223002, #223234/#223240
	
	Detailed comments:
	
	* Lock in memory to prevent being swapped out
	* Turn on RR scheduling for main + score threads
	* Let qdiskd wait for CMAN to start
	* Add option to qdiskd to stop CMAN if qdisk device is not available
	* Make qdisk interval timings more accurate
	* Add option to reboot node if qdiskd detects internal hang > failure time (e.g. interval*tko, in seconds)
	* Add per-heuristic tko counts for unreliable heuristics (e.g. ping packets)
	* Remove nodes from quorate mask immediately on eviction
	* Update man pages with better examples
	* Don't let >1 instance of qdiskd be started
	* Clarify logging output.
	* Improve data in status_file.
	* Allow qdiskd to run with no defined heuristics (master-always-wins mode).
	* Make fencing of nodes optional (default = on).
	* Make sure CMAN is running before we try to talk to it at each point.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/daemon_init.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=NONE&r2=1.1.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/score.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2&r2=1.2.4.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/score.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2&r2=1.2.4.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/Makefile.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.6&r2=1.6.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/main.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.4.2.2&r2=1.4.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/disk.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.4&r2=1.4.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/clulog.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2&r2=1.2.4.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/qdisk.5.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.3&r2=1.3.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/mkqdisk.8.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2&r2=1.2.6.1

--- cluster/cman/qdisk/score.h	2006/05/19 14:41:35	1.2
+++ cluster/cman/qdisk/score.h	2007/01/22 22:50:11	1.2.4.1
@@ -32,7 +32,9 @@
 	char *	program;
 	int	score;
 	int	available;
+	int	tko;
 	int	interval;
+	int	misses;
 	pid_t	childpid;
 	time_t	nextrun;
 };
@@ -50,7 +52,7 @@
 /*
    Start the thread which runs the scoring applets
  */
-int start_score_thread(struct h_data *h, int count);
+int start_score_thread(qd_ctx *ctx, struct h_data *h, int count);
 
 /* 
    Get our score + maxscore
--- cluster/cman/qdisk/score.c	2006/05/19 14:41:35	1.2
+++ cluster/cman/qdisk/score.c	2007/01/22 22:50:11	1.2.4.1
@@ -32,14 +32,20 @@
 #include <string.h>
 #include <ccs.h>
 #include <clulog.h>
+#include <sched.h>
+#include <sys/mman.h>
+#include "disk.h"
 #include "score.h"
 
 static pthread_mutex_t sc_lock = PTHREAD_MUTEX_INITIALIZER;
 static int _score = 0, _maxscore = 0, _score_thread_running = 0;
 static pthread_t score_thread = (pthread_t)0;
+void set_priority(int, int);
 
 struct h_arg {
 	struct h_data *h;
+	int sched_queue;
+	int sched_prio;
 	int count;
 };
 
@@ -97,6 +103,20 @@
 		h->childpid = pid;
 		return 0;
 	}
+	
+	/*
+	 * always use SCHED_OTHER for the child processes 
+	 * nice -1 is fine; but we don't know what the child process
+	 * might do, so leaving it (potentially) in SCHED_RR or SCHED_FIFO
+	 * is out of the question
+	 * 
+	 * XXX if you set SCHED_OTHER in the conf file and nice 20, the below
+	 * will make the heuristics a higher prio than qdiskd.  This should be
+	 * fine in practice, because running qdiskd at nice 20 will cause all
+	 * sorts of problems on a busy system.
+	 */
+	set_priority(SCHED_OTHER, -1);
+	munlockall();
 
 	argv[0] = "/bin/sh";
 	argv[1] = "-c";
@@ -122,6 +142,12 @@
 
 	*score = 0;
 	*maxscore = 0;
+	
+	/* Allow operation w/o any heuristics */
+	if (!max) {
+		*score = *maxscore = 1;
+		return;
+	}
 
 	for (x = 0; x < max; x++) {
 		*maxscore += h[x].score;
@@ -141,22 +167,51 @@
 	int status;
 
 	if (h->childpid == 0)
+		/* No child to check */
 		return 0;
 
 	ret = waitpid(h->childpid, &status, block?0:WNOHANG);
 	if (!block && ret == 0)
+		/* No children exited */
 		return 0;
 
 	h->childpid = 0;
-	h->available = 0;
 	if (ret < 0 && errno == ECHILD)
-		return -1;
-	if (!WIFEXITED(status))
-		return 0;
-	if (WEXITSTATUS(status) != 0)
-		return 0;
-	h->available = 1;
+		/* wrong child? */
+		goto miss;
+	if (!WIFEXITED(status)) {
+		ret = 0;
+		goto miss;
+	}
+	if (WEXITSTATUS(status) != 0) {
+		ret = 0;
+		goto miss;
+	}
+	
+	/* Returned 0 and was not killed */
+	if (!h->available) {
+		h->available = 1;
+		clulog(LOG_INFO, "Heuristic: '%s' UP\n", h->program);
+	}
+	h->misses = 0;
 	return 0;
+	
+miss:
+	if (h->available) {
+		h->misses++;
+		if (h->misses >= h->tko) {
+			clulog(LOG_INFO,
+				"Heuristic: '%s' DOWN (%d/%d)\n",
+				h->program, h->misses, h->tko);
+			h->available = 0;
+		} else {
+			clulog(LOG_DEBUG,
+				"Heuristic: '%s' missed (%d/%d)\n",
+				h->program, h->misses, h->tko);
+		}
+	}
+	
+	return ret;
 }
 
 
@@ -204,7 +259,9 @@
 	do {
 		h[x].program = NULL;
 		h[x].available = 0;
+		h[x].misses = 0;
 		h[x].interval = 2;
+		h[x].tko = 1;
 		h[x].score = 1;
 		h[x].childpid = 0;
 		h[x].nextrun = 0;
@@ -236,9 +293,20 @@
 			if (h[x].interval <= 0)
 				h[x].interval = 2;
 		}
+		
+		/* Get tko for this heuristic */
+		snprintf(query, sizeof(query),
+			 "/cluster/quorumd/heuristic[%d]/@tko", x+1);
+		if (ccs_get(ccsfd, query, &val) == 0) {
+			h[x].tko= atoi(val);
+			free(val);
+			if (h[x].tko <= 0)
+				h[x].tko = 1;
+		}
 
-		clulog(LOG_DEBUG, "Heuristic: '%s' score=%d interval=%d\n",
-		       h[x].program, h[x].score, h[x].interval);
+		clulog(LOG_DEBUG,
+		       "Heuristic: '%s' score=%d interval=%d tko=%d\n",
+		       h[x].program, h[x].score, h[x].interval, h[x].tko);
 
 	} while (++x < max);
 
@@ -271,6 +339,8 @@
 {
 	struct h_arg *args = (struct h_arg *)arg;
 	int score, maxscore;
+	
+	set_priority(args->sched_queue, args->sched_prio);
 
 	while (_score_thread_running) {
 		fork_heuristics(args->h, args->count);
@@ -317,7 +387,7 @@
   to pass in h if it was allocated on the stack.
  */
 int
-start_score_thread(struct h_data *h, int count)
+start_score_thread(qd_ctx *ctx, struct h_data *h, int count)
 {
 	pthread_attr_t attrs;
 	struct h_arg *args;
@@ -337,8 +407,11 @@
 
 	memcpy(args->h, h, (sizeof(struct h_data) * count));
 	args->count = count;
+	args->sched_queue = ctx->qc_sched;
+	args->sched_prio = ctx->qc_sched_prio;
 
 	_score_thread_running = 1;
+	
         pthread_attr_init(&attrs);
         pthread_attr_setinheritsched(&attrs, PTHREAD_INHERIT_SCHED);
         pthread_create(&score_thread, &attrs, score_thread_main, args);
--- cluster/cman/qdisk/Makefile	2006/08/11 15:18:05	1.6
+++ cluster/cman/qdisk/Makefile	2007/01/22 22:50:11	1.6.2.1
@@ -28,7 +28,7 @@
 	install ${TARGET} ${sbindir}
 
 qdiskd: disk.o crc32.o disk_util.o main.o score.o bitmap.o clulog.o \
-	gettid.o proc.o ../lib/libcman.a
+	gettid.o proc.o daemon_init.o  ../lib/libcman.a
 	gcc -o $@ $^ -lpthread -L../lib -L${ccslibdir} -lccs
 
 mkqdisk: disk.o crc32.o disk_util.o \
--- cluster/cman/qdisk/main.c	2007/01/16 15:16:56	1.4.2.2
+++ cluster/cman/qdisk/main.c	2007/01/22 22:50:11	1.4.2.3
@@ -35,11 +35,21 @@
 #include <unistd.h>
 #include <time.h>
 #include <sys/reboot.h>
+#include <sys/time.h>
 #include <linux/reboot.h>
+#include <sched.h>
 #include <signal.h>
 #include <ccs.h>
 #include "score.h"
 #include "clulog.h"
+#if (!defined(LIBCMAN_VERSION) || \
+     (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION < 2))
+#include <cluster/cnxman-socket.h>
+#endif
+
+int daemon_init(char *);
+int check_process_running(char *, pid_t *);
+
 /*
   TODO:
   1) Take into account timings to gracefully extend node timeouts during 
@@ -155,6 +165,11 @@
 		if (sb->ps_timestamp == ni[x].ni_last_seen) {
 			/* XXX check for average + allow grace */
 			ni[x].ni_misses++;
+			if (ni[x].ni_misses > 1) {
+				clulog(LOG_DEBUG,
+					"Node %d missed an update (%d/%d)\n",
+					x+1, ni[x].ni_misses, ctx->qc_tko);
+			}
 			continue;
 		}
 
@@ -208,6 +223,11 @@
 			ni[x].ni_misses = 0;
 			ni[x].ni_state = S_NONE;
 
+			/* Clear our master mask for the node after eviction
+			 * or shutdown */
+			if (mask)
+				clear_bit(mask, (ni[x].ni_status.ps_nodeid-1),
+					  sizeof(memb_mask_t));
 			continue;
 		}
 
@@ -227,15 +247,17 @@
 			   Write eviction notice if we're the master.
 			 */
 			if (ctx->qc_status == S_MASTER) {
-				clulog(LOG_DEBUG,
+				clulog(LOG_NOTICE,
 				       "Writing eviction notice for node %d\n",
 				       ni[x].ni_status.ps_nodeid);
 				qd_write_status(ctx, ni[x].ni_status.ps_nodeid,
 						S_EVICT, NULL, NULL, NULL);
-				clulog(LOG_DEBUG,
-				       "Telling CMAN to kill the node\n");
-				cman_kill_node(ctx->qc_ch,
-					       ni[x].ni_status.ps_nodeid);
+				if (ctx->qc_flags & RF_ALLOW_KILL) {
+					clulog(LOG_DEBUG, "Telling CMAN to "
+						"kill the node\n");
+					cman_kill_node(ctx->qc_ch,
+						ni[x].ni_status.ps_nodeid);
+				}
 			}
 
 			/*
@@ -255,6 +277,10 @@
 			ni[x].ni_evil_incarnation = 
 				ni[x].ni_status.ps_incarnation;
 			
+			/* Clear our master mask for the node after eviction */
+			if (mask)
+				clear_bit(mask, (ni[x].ni_status.ps_nodeid-1),
+					  sizeof(memb_mask_t));
 			continue;
 		}
 
@@ -279,9 +305,12 @@
 			ni[x].ni_status.ps_state = S_EVICT;
 
 			/* XXX Need to fence it again */
-			clulog(LOG_DEBUG, "Telling CMAN to kill the node\n");
-			cman_kill_node(ctx->qc_ch,
-				       ni[x].ni_status.ps_nodeid);
+			if (ctx->qc_flags & RF_ALLOW_KILL) {
+				clulog(LOG_DEBUG, "Telling CMAN to "
+					"kill the node\n");
+				cman_kill_node(ctx->qc_ch,
+					ni[x].ni_status.ps_nodeid);
+			}
 			continue;
 		}
 
@@ -416,6 +445,10 @@
 	int x = 0, score, maxscore;
 
 	clulog(LOG_INFO, "Quorum Daemon Initializing\n");
+	
+	if (mlockall(MCL_CURRENT|MCL_FUTURE) != 0) {
+		clulog(LOG_ERR, "Unable to mlockall()\n");
+	}
 
 	if (qdisk_validate(ctx->qc_device) < 0)
 		return -1;
@@ -427,7 +460,7 @@
 		return -1;
 	}
 	
-	start_score_thread(h, maxh);
+	start_score_thread(ctx, h, maxh);
 
 	node_info_init(ni, max);
 	if (qd_write_status(ctx, ctx->qc_my_id,
@@ -447,7 +480,6 @@
 		}
 
 		sleep(ctx->qc_interval);
-
 	}
 
 	get_my_score(&score,&maxscore);
@@ -500,12 +532,16 @@
 		return;
 
 	memset(master_mask, 0, sizeof(master_mask));
-
 	for (x = 0; x < retnodes; x++) {
 		if (is_bit_set(mask, nodes[x].cn_nodeid-1, sizeof(mask)) &&
-		    nodes[x].cn_member)
+		    nodes[x].cn_member) {
 			set_bit(master_mask, nodes[x].cn_nodeid-1,
 				sizeof(master_mask));
+		} else {
+			/* Not in CMAN output = not allowed */
+			clear_bit(master_mask, (nodes[x].cn_nodeid-1),
+				  sizeof(memb_mask_t));
+		}
 	}
 }
 
@@ -604,12 +640,25 @@
 	}
 
 	fprintf(fp, "Node ID: %d\n", ctx->qc_my_id);
-	fprintf(fp, "Score (current / min req. / max allowed): %d / %d / %d\n",
-		score, score_req, score_max);
+	
+	if (ctx->qc_master)
+		fprintf(fp, "Master Node ID: %d\n", ctx->qc_master);
+	else 
+		fprintf(fp, "Master Node ID: (none)\n");
+	
+	fprintf(fp, "Score: %d/%d (Minimum required = %d)\n",
+		score, score_max, score_req);
 	fprintf(fp, "Current state: %s\n", state_str(ctx->qc_status));
 	fprintf(fp, "Current disk state: %s\n",
 		state_str(ctx->qc_disk_status));
 
+	fprintf(fp, "Initializing Set: {");
+	for (x=0; x<max; x++) {
+		if (ni[x].ni_state == S_INIT)
+			fprintf(fp," %d", ni[x].ni_status.ps_nodeid);
+	}
+	fprintf(fp, " }\n");
+	
 	fprintf(fp, "Visible Set: {");
 	for (x=0; x<max; x++) {
 		if (ni[x].ni_state >= S_RUN || ni[x].ni_status.ps_nodeid == 
@@ -617,13 +666,10 @@
 			fprintf(fp," %d", ni[x].ni_status.ps_nodeid);
 	}
 	fprintf(fp, " }\n");
-
-	if (!ctx->qc_master) {
-		fprintf(fp, "No master node\n");
+	
+	if (!ctx->qc_master)
 		goto out;
-	}
 
-	fprintf(fp, "Master Node ID: %d\n", ctx->qc_master);
 	fprintf(fp, "Quorate Set: {");
 	for (x=0; x<max; x++) {
 		if (is_bit_set(ni[ctx->qc_master-1].ni_status.ps_master_mask,
@@ -642,18 +688,141 @@
 }
 
 
+/* Timeval functions from clumanager */
+/**
+ * Scale a (struct timeval).
+ *
+ * @param tv		The timeval to scale.
+ * @param scale		Positive multiplier.
+ * @return		tv
+ */
+struct timeval *
+_scale_tv(struct timeval *tv, int scale)
+{
+	tv->tv_sec *= scale;
+	tv->tv_usec *= scale;
+
+	if (tv->tv_usec > 1000000) {
+		tv->tv_sec += (tv->tv_usec / 1000000);
+		tv->tv_usec = (tv->tv_usec % 1000000);
+	}
+
+	return tv;
+}
+
+
+static inline void
+_diff_tv(struct timeval *dest, struct timeval *start, struct timeval *end)
+{
+	dest->tv_sec = end->tv_sec - start->tv_sec;
+	dest->tv_usec = end->tv_usec - start->tv_usec;
+
+	if (dest->tv_usec < 0) {
+		dest->tv_usec += 1000000;
+		dest->tv_sec--;
+	}
+}
+
+
+#define _print_tv(val) \
+	printf("%s: %d.%06d\n", #val, (int)((val)->tv_sec), \
+			(int)((val)->tv_usec))
+
+
+static inline int
+_cmp_tv(struct timeval *left, struct timeval *right)
+{
+	if (left->tv_sec > right->tv_sec)
+		return -1;
+
+	if (left->tv_sec < right->tv_sec)
+		return 1;
+
+	if (left->tv_usec > right->tv_usec)
+		return -1;
+	
+	if (left->tv_usec < right->tv_usec)
+		return 1;
+
+	return 0;
+}
+
+
+void
+set_priority(int queue, int prio)
+{
+	struct sched_param s;
+	int ret;
+	char *func = "nice";
+	
+	if (queue == SCHED_OTHER) {
+		s.sched_priority = 0;
+		ret = sched_setscheduler(0, queue, &s);
+		errno = 0;
+		ret = nice(prio);
+	} else {
+		memset(&s,0,sizeof(s));
+		s.sched_priority = prio;
+		ret = sched_setscheduler(0, queue, &s);
+		func = "sched_setscheduler";
+	}
+	
+	if (ret < 0 && errno) {
+		clulog(LOG_WARNING, "set_priority [%s] failed: %s\n", func,
+		       strerror(errno));
+	}
+}
+
+
+int
+cman_alive(cman_handle_t ch)
+{
+	fd_set rfds;
+	int fd = cman_get_fd(ch);
+	struct timeval tv = {0, 0};
+	
+	FD_ZERO(&rfds);
+	FD_SET(fd, &rfds);
+	if (select(fd + 1, &rfds, NULL, NULL, &tv) == 1) {
+		if (cman_dispatch(ch, CMAN_DISPATCH_ALL) < 0) {
+			if (errno == EAGAIN)
+				return 0;
+			return -1;
+		}
+	}
+	return 0;
+}
+
 
 int
 quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
 {
 	disk_msg_t msg = {0, 0, 0};
-	int low_id, bid_pending = 0, score, score_max, score_req;
+	int low_id, bid_pending = 0, score, score_max, score_req,
+	    upgrade = 0;
 	memb_mask_t mask, master_mask;
+	struct timeval maxtime, oldtime, newtime, diff, sleeptime, interval;
 
-	ctx->qc_status = S_RUN;
+	ctx->qc_status = S_NONE;
+	
+	maxtime.tv_usec = 0;
+	maxtime.tv_sec = ctx->qc_interval * ctx->qc_tko;
+	
+	interval.tv_usec = 0;
+	interval.tv_sec = ctx->qc_interval;
+	
+	get_my_score(&score, &score_max);
+	if (score_max < ctx->qc_scoremin) {
+		clulog(LOG_WARNING, "Minimum score (%d) is impossible to "
+		       "achieve (heuristic total = %d)\n",
+		       ctx->qc_scoremin, score_max);
+	}
 	
 	_running = 1;
 	while (_running) {
+		/* XXX this was getuptime() in clumanager */
+		gettimeofday(&oldtime, NULL);
+		
 		/* Read everyone else's status */
 		read_node_blocks(ctx, ni, max);
 
@@ -662,6 +831,7 @@
 
 		/* Check heuristics and remove ourself if necessary */
 		get_my_score(&score, &score_max);
+		upgrade = 0;
 
 		score_req = ctx->qc_scoremin;
 		if (score_req <= 0)
@@ -672,14 +842,19 @@
 			if (ctx->qc_status > S_NONE) {
 				clulog(LOG_NOTICE,
 				       "Score insufficient for master "
-				       "operation (%d/%d; max=%d); "
+				       "operation (%d/%d; required=%d); "
 				       "downgrading\n",
-				       score, score_req, score_max);
+				       score, score_max, score_req);
 				ctx->qc_status = S_NONE;
 				msg.m_msg = M_NONE;
 				++msg.m_seq;
 				bid_pending = 0;
-				cman_poll_quorum_device(ctx->qc_ch, 0);
+				if (cman_alive(ctx->qc_ch) < 0) {
+					clulog(LOG_ERR, "cman: %s\n",
+					       strerror(errno));
+				} else {
+					cman_poll_quorum_device(ctx->qc_ch, 0);
+				}
 				if (ctx->qc_flags & RF_REBOOT)
 					reboot(RB_AUTOBOOT);
 			}
@@ -688,10 +863,13 @@
 			if (ctx->qc_status == S_NONE) {
 				clulog(LOG_NOTICE,
 				       "Score sufficient for master "
-				       "operation (%d/%d; max=%d); "
+				       "operation (%d/%d; required=%d); "
 				       "upgrading\n",
-				       score, score_req, score_max);
+				       score, score_max, score_req);
 				ctx->qc_status = S_RUN;
+				upgrade = (ctx->qc_tko / 3);
+				if (upgrade == 0)
+					upgrade = 1;
 			}
 		}
 
@@ -702,11 +880,13 @@
 		if (!ctx->qc_master &&
 		    low_id == ctx->qc_my_id &&
 		    ctx->qc_status == S_RUN &&
-		    !bid_pending ) {
+		    !bid_pending &&
+		    !upgrade) {
 			/*
 			   If there's no master, and we are the lowest node
 			   ID, make a bid to become master if we're not 
-			   already bidding.
+			   already bidding.  We can't do this if we've just
+			   upgraded.
 			 */
 
 			clulog(LOG_DEBUG,"Making bid for master\n");
@@ -724,10 +904,18 @@
 			/* We're currently bidding for master.
 			   See if anyone's voted, or if we should
 			   rescind our bid */
+			++bid_pending;
 
 			/* Yes, those are all deliberate fallthroughs */
 			switch (check_votes(ctx, ni, max, &msg)) {
 			case 3:
+				/* 
+				 * Give ample time to become aware of other
+				 * nodes
+				 */
+				if (bid_pending < (ctx->qc_tko / 3))
+					break;
+				
 				clulog(LOG_INFO,
 				       "Assuming master role\n");
 				ctx->qc_status = S_MASTER;
@@ -755,6 +943,13 @@
 			/* We are the master.  Poll the quorum device.
 			   We can't be the master unless we score high
 			   enough on our heuristics. */
+			if (cman_alive(ctx->qc_ch) < 0) {
+				clulog(LOG_ERR, "cman_dispatch: %s\n",
+				       strerror(errno));
+				clulog(LOG_ERR,
+				       "Halting qdisk operations\n");
+				return -1;
+			}
 			check_cman(ctx, mask, master_mask);
 			cman_poll_quorum_device(ctx->qc_ch, 1);
 
@@ -768,6 +963,13 @@
 			      ni[ctx->qc_master-1].ni_status.ps_master_mask,
 				       ctx->qc_my_id-1,
 				       sizeof(memb_mask_t))) {
+				if (cman_alive(ctx->qc_ch) < 0) {
+					clulog(LOG_ERR, "cman_dispatch: %s\n",
+						strerror(errno));
+					clulog(LOG_ERR,
+						"Halting qdisk operations\n");
+					return -1;
+				}
 				cman_poll_quorum_device(ctx->qc_ch, 1);
 			}
 		}
@@ -783,8 +985,43 @@
 
 		/* Cycle. We could time the loop and sleep
 		   usleep(interval-looptime), but this is fine for now.*/
+		gettimeofday(&newtime, NULL);
+		_diff_tv(&diff, &oldtime, &newtime);
+		
+		/*
+		 * Reboot if we didn't send a heartbeat in interval*TKO_COUNT
+		 */
+		if (_cmp_tv(&maxtime, &diff) == 1 &&
+		    ctx->qc_flags & RF_PARANOID) {
+			clulog(LOG_EMERG, "Failed to complete a cycle within "
+			       "%d second%s (%d.%06d) - REBOOTING\n",
+			       (int)maxtime.tv_sec,
+			       maxtime.tv_sec==1?"":"s",
+			       (int)diff.tv_sec,
+			       (int)diff.tv_usec);
+			if (!(ctx->qc_flags & RF_DEBUG)) 
+				reboot(RB_AUTOBOOT);
+		}
+
+		/*
+		 * If the amount we took to complete a loop is greater or less
+		 * than our interval, we adjust by the difference each round.
+		 *
+		 * It's not really "realtime", but it helps!
+		 */
+		if (_cmp_tv(&diff, &interval) == 1) {
+			_diff_tv(&sleeptime, &diff, &interval);
+		} else {
+			clulog(LOG_WARNING, "qdisk cycle took more "
+			       "than %d second%s to complete (%d.%06d)\n",
+			       ctx->qc_interval, ctx->qc_interval==1?"":"s",
+			       (int)diff.tv_sec, (int)diff.tv_usec);
+			memcpy(&sleeptime, &interval, sizeof(sleeptime));
+		}
+		
+		/* Could hit a watchdog timer here if we wanted to */
 		if (_running)
-			sleep(ctx->qc_interval);
+			select(0, NULL, NULL, NULL, &sleeptime);
 	}
 
 	return 0;
@@ -829,12 +1066,15 @@
 	ctx->qc_interval = 1;
 	ctx->qc_tko = 10;
 	ctx->qc_scoremin = 0;
-	ctx->qc_flags = RF_REBOOT;
+	ctx->qc_flags = RF_REBOOT | RF_ALLOW_KILL; /* | RF_STOP_CMAN;*/
+	ctx->qc_sched = SCHED_RR;
+	ctx->qc_sched_prio = 1;
 
 	/* Get log log_facility */
 	snprintf(query, sizeof(query), "/cluster/quorumd/@log_facility");
 	if (ccs_get(ccsfd, query, &val) == 0) {
 		clu_set_facility(val);
+		clulog(LOG_DEBUG, "Log facility: %s\n", val);
 		free(val);
 	}
 
@@ -903,6 +1143,37 @@
 		if (ctx->qc_scoremin < 0)
 			ctx->qc_scoremin = 0;
 	}
+	
+	/* Get scheduling queue */
+	snprintf(query, sizeof(query), "/cluster/quorumd/@scheduler");
+	if (ccs_get(ccsfd, query, &val) == 0) {
+		switch(val[0]) {
+		case 'r':
+		case 'R':
+			ctx->qc_sched = SCHED_RR;
+			break;
+		case 'f':
+		case 'F':
+			ctx->qc_sched = SCHED_FIFO;
+			break;
+		case 'o':
+		case 'O':
+			ctx->qc_sched = SCHED_OTHER;
+			break;
+		default:
+			clulog(LOG_WARNING, "Invalid scheduling queue '%s'\n",
+			       val);
+			break;
+		}
+		free(val);
+	}
+	
+	/* Get priority */
+	snprintf(query, sizeof(query), "/cluster/quorumd/@priority");
+	if (ccs_get(ccsfd, query, &val) == 0) {
+		ctx->qc_sched_prio = atoi(val);
+		free(val);
+	}	
 
 	/* Get reboot flag for when we transition -> offline */
 	/* default = on, so, 0 to turn off */
@@ -912,6 +1183,50 @@
 			ctx->qc_flags &= ~RF_REBOOT;
 		free(val);
 	}
+	
+	/*
+	 * Get flag to see if we're supposed to kill cman if qdisk is not 
+	 * available.
+	 */
+	/* default = off, so, 1 to turn on */
+	snprintf(query, sizeof(query), "/cluster/quorumd/@stop_cman");
+	if (ccs_get(ccsfd, query, &val) == 0) {
+		if (!atoi(val))
+			ctx->qc_flags &= ~RF_STOP_CMAN;
+		else
+			ctx->qc_flags |= RF_STOP_CMAN;
+		free(val);
+	}
+	
+	
+	/*
+	 * Get flag to see if we're supposed to reboot if we can't complete
+	 * a pass in failure time
+	 */
+	/* default = off, so, 1 to turn on */
+	snprintf(query, sizeof(query), "/cluster/quorumd/@paranoid");
+	if (ccs_get(ccsfd, query, &val) == 0) {
+		if (!atoi(val))
+			ctx->qc_flags &= ~RF_PARANOID;
+		else
+			ctx->qc_flags |= RF_PARANOID;
+		free(val);
+	}
+	
+	
+	/*
+	 * Get flag to see if we're allowed to tell CMAN to kill nodes that
+	 * we have evicted
+	 */
+	/* default = on, so, 0 to turn off */
+	snprintf(query, sizeof(query), "/cluster/quorumd/@allow_kill");
+	if (ccs_get(ccsfd, query, &val) == 0) {
+		if (!atoi(val))
+			ctx->qc_flags &= ~RF_ALLOW_KILL;
+		else
+			ctx->qc_flags |= RF_ALLOW_KILL;
+		free(val);
+	}
 
 	*cfh = configure_heuristics(ccsfd, h, maxh);
 
@@ -925,18 +1240,47 @@
 }
 
 
+void
+check_stop_cman(qd_ctx *ctx)
+{
+	if (!(ctx->qc_flags & RF_STOP_CMAN))
+		return;
+	
+	clulog(LOG_WARNING, "Telling CMAN to leave the cluster; qdisk is not"
+		" available\n");
+#if (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION >= 2)
+	if (cman_shutdown(ctx->qc_ch, 0) < 0) {
+#else
+	int x = 0;
+	if (ioctl(cman_get_fd(ctx->qc_ch), SIOCCLUSTER_LEAVE_CLUSTER, &x) < 0) {
+#endif
+		clulog(LOG_CRIT, "Could not leave the cluster - rebooting\n");
+		sleep(5);
+		if (ctx->qc_flags & RF_DEBUG)
+			return;
+		reboot(RB_AUTOBOOT);
+	}
+}
+
+
 int
 main(int argc, char **argv)
 {
 	cman_node_t me;
-	int cfh, rv;
+	int cfh, rv, forked = 0;
 	qd_ctx ctx;
 	cman_handle_t ch;
 	node_info_t ni[MAX_NODES_DISK];
 	struct h_data h[10];
 	char debug = 0, foreground = 0;
 	char device[128];
-
+	pid_t pid;
+	
+	if (check_process_running(argv[0], &pid) && pid !=getpid()) {
+		printf("QDisk services already running\n");
+		return 0;
+	}
+	
 	while ((rv = getopt(argc, argv, "fd")) != EOF) {
 		switch (rv) {
 		case 'd':
@@ -944,40 +1288,64 @@
 			break;
 		case 'f':
 			foreground = 1;
+			clu_log_console(1);
 		default:
 			break;
 		}
 	}
+	
 #if (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION >= 2)
 	ch = cman_admin_init(NULL);
 #else
 	ch = cman_init(NULL);
 #endif
 	if (!ch) {
-		printf("Could not connect to cluster (CMAN not running?)\n");
-		return -1;
+		if (!foreground && !forked) {
+			if (daemon_init(argv[0]) < 0)
+				return -1;
+			else
+				forked = 1;
+		}
+		
+		clulog(LOG_INFO, "Waiting for CMAN to start\n");
+		
+		do {
+			sleep(5);
+#if (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION >= 2)
+			ch = cman_admin_init(NULL);
+#else
+			ch = cman_init(NULL);
+#endif
+		} while (!ch);
 	}
 
 	memset(&me, 0, sizeof(me));
-	if (cman_get_node(ch, CMAN_NODEID_US, &me) < 0) {
-		printf("Could not determine local node ID; cannot start\n");
-		return -1;
+	while (cman_get_node(ch, CMAN_NODEID_US, &me) < 0) {
+		if (!foreground && !forked) {
+			if (daemon_init(argv[0]) < 0)
+				return -1;
+			else
+				forked = 1;
+		}
+		sleep(5);
 	}
 
 	qd_init(&ctx, ch, me.cn_nodeid);
 
 	signal(SIGINT, int_handler);
+	signal(SIGTERM, int_handler);
 
-        if (debug)
+        if (debug) {
                 clu_set_loglevel(LOG_DEBUG);
-        if (foreground)
-                clu_log_console(1);
+                ctx.qc_flags |= RF_DEBUG;
+        }
 		
 	if (get_config_data(NULL, &ctx, h, 10, &cfh, debug) < 0) {
 		clulog_and_print(LOG_CRIT, "Configuration failed\n");
+		check_stop_cman(&ctx);
 		return -1;
 	}
-
+	
 	if (ctx.qc_label) {
 		if (find_partitions("/proc/partitions",
 				    ctx.qc_label, device,
@@ -985,6 +1353,7 @@
 			clulog_and_print(LOG_CRIT, "Unable to match label"
 					 " '%s' to any device\n",
 					 ctx.qc_label);
+			check_stop_cman(&ctx);
 			return -1;
 		}
 
@@ -1000,15 +1369,21 @@
 			clulog(LOG_CRIT,
 			       "Specified partition %s does not have a "
 			       "qdisk label\n", ctx.qc_device);
+			check_stop_cman(&ctx);
 			return -1;
 		}
 	}
 
-	if (!foreground)
-                daemon(0,0);
+	if (!foreground && !forked) {
+                if (daemon_init(argv[0]) < 0)
+			return -1;
+	}
+	
+	set_priority(ctx.qc_sched, ctx.qc_sched_prio);
 
 	if (quorum_init(&ctx, ni, MAX_NODES_DISK, h, cfh) < 0) {
 		clulog_and_print(LOG_CRIT, "Initialization failed\n");
+		check_stop_cman(&ctx);
 		return -1;
 	}
 	
@@ -1026,14 +1401,12 @@
 	}
 	*/
 
-	quorum_loop(&ctx, ni, MAX_NODES_DISK);
-	cman_unregister_quorum_device(ctx.qc_ch);
+	if (quorum_loop(&ctx, ni, MAX_NODES_DISK) == 0)
+		cman_unregister_quorum_device(ctx.qc_ch);
 
 	quorum_logout(&ctx);
-
 	qd_destroy(&ctx);
 
 	return 0;
-
 }
 
--- cluster/cman/qdisk/disk.h	2006/10/03 18:06:40	1.4
+++ cluster/cman/qdisk/disk.h	2007/01/22 22:50:11	1.4.2.1
@@ -67,7 +67,11 @@
 
 
 typedef enum {
-	RF_REBOOT = 0x1		/* Reboot if we go from master->none */
+	RF_REBOOT = 0x1,		/* Reboot if we go from master->none */
+	RF_STOP_CMAN = 0x2,
+	RF_DEBUG = 0x4,
+	RF_PARANOID = 0x8,
+	RF_ALLOW_KILL = 0x10
 } run_flag_t;
 
 
@@ -237,6 +241,8 @@
 	int qc_tko;
 	int qc_votes;
 	int qc_scoremin;
+	int qc_sched;
+	int qc_sched_prio;
 	disk_node_state_t qc_disk_status;
 	disk_node_state_t qc_status;
 	int qc_master;		/* Master?! */
--- cluster/cman/qdisk/clulog.c	2006/05/19 14:41:35	1.2
+++ cluster/cman/qdisk/clulog.c	2007/01/22 22:50:11	1.2.4.1
@@ -20,8 +20,6 @@
 /** @file
  * Library routines for communicating with the logging daemon.
  *
- *  $Id: clulog.c,v 1.2 2006/05/19 14:41:35 lhh Exp $
- *
  *  Author: Jeff Moyer <moyer@missioncriticallinux.com>
  */
 #include <stdio.h>
@@ -50,8 +48,6 @@
 #include <string.h>
 
 
-static const char *version __attribute__ ((unused)) = "$Revision: 1.2 $";
-
 #ifdef DEBUG
 #include <assert.h>
 #define Dprintf(fmt,args...) printf(fmt,##args)
@@ -135,7 +131,7 @@
 	}
 	
 	pthread_mutex_unlock(&log_mutex);
-	return "local4";
+	return "daemon";
 }
 
 
@@ -156,7 +152,6 @@
 	for (; facilitynames[x].c_name; x++) {
 		if (strcmp(facilityname, facilitynames[x].c_name))
 			continue;
-
 		syslog_facility = facilitynames[x].c_val;
 		break;
 	}
--- cluster/cman/man/qdisk.5	2006/10/03 18:07:58	1.3
+++ cluster/cman/man/qdisk.5	2007/01/22 22:50:12	1.3.2.1
@@ -1,6 +1,6 @@
-.TH "QDisk" "8" "July 2006" "" "Cluster Quorum Disk"
+.TH "QDisk" "5" "Jan 2007" "" "Cluster Quorum Disk"
 .SH "NAME"
-QDisk 1.0 \- a disk-based quorum daemon for CMAN / Linux-Cluster
+QDisk 1.2 \- a disk-based quorum daemon for CMAN / Linux-Cluster
 .SH "1. Overview"
 .SH "1.1 Problem"
 In some situations, it may be necessary or desirable to sustain
@@ -75,16 +75,24 @@
 
 * Cluster node votes should be more or less equal.
 
-* CMAN must be running before the qdisk program can start.
+* CMAN must be running before the qdisk program can operate in full
+capacity.  If CMAN is not running, qdisk will wait for it.
 
 * CMAN's eviction timeout should be at least 2x the quorum daemon's
 to give the quorum daemon adequate time to converge on a master during a
 failure + load spike situation.
 
-* The total number of votes assigned to the quorum device should be
-equal to or greater than the total number of node-votes in the cluster.
-While it is possible to assign only one (or a few) votes to the quorum
-device, the effects of doing so have not been explored.
+* For 'all-but-one' failure operation, the total number of votes assigned
+to the quorum device should be equal to or greater than the total number
+of node-votes in the cluster.  While it is possible to assign only one
+(or a few) votes to the quorum device, the effects of doing so have not
+been explored.
+
+* For 'tiebreaker' operation in a two-node cluster, unset CMAN's two_node
+flag (or set it to 0), set CMAN's expected votes to '3', set each node's
+vote to '1', and set qdisk's vote count to '1' as well.  This will allow
+the cluster to operate if either both nodes are online, or a single node
+is online and its heuristics pass.
 
 * Currently, the quorum disk daemon is difficult to use with CLVM if
 the quorum disk resides on a CLVM logical volume.  CLVM requires a
@@ -217,23 +225,27 @@
 0 = emergencies; 7 = debug.
 
 .in 9
-\fIlog_facility\fP\fB="\fPlocal4\fB"\fP
+\fIlog_facility\fP\fB="\fPdaemon\fB"\fP
 .in 12
 This controls the syslog facility used by the quorum daemon when logging.
 For a complete list of available facilities, see \fBsyslog.conf(5)\fP.
+The default value for this is 'daemon'.
 
 .in 9
 \fIstatus_file\fP\fB="\fP/foo\fB"\fP
 .in 12
 Write internal states out to this file periodically ("-" = use stdout).
-This is primarily used for debugging.
+This is primarily used for debugging.  The default value for this 
+attribute is undefined.
 
 .in 9
 \fImin_score\fP\fB="\fP3\fB"\fP
 .in 12
 Absolute minimum score to be consider one's self "alive".  If omitted,
 or set to 0, the default function "floor((n+1)/2)" is used, where \fIn\fP
-is the sum-total of all of defined heuristics' \fIscore\fP attribute.
+is the total of all defined heuristics' \fIscore\fP attributes.  This
+must never exceed the sum of the heuristic scores, or else the quorum
+disk will never be available.
 
 .in 9
 \fIreboot\fP\fB="\fP1\fB"\fP
@@ -243,6 +255,45 @@
 this value is 1 (on).
 
 .in 9
+\fIallow_kill\fP\fB="\fP1\fB"\fP
+.in 12
+If set to 0 (off), qdiskd will *not* instruct CMAN to kill nodes it
+thinks are dead (as a result of not writing to the quorum disk).  The
+default for this value is 1 (on).
+
+.in 9
+\fIparanoid\fP\fB="\fP0\fB"\fP
+.in 12
+If set to 1 (on), qdiskd will watch internal timers and reboot the node
+if it takes more than (interval * tko) seconds to complete a quorum disk
+pass.  The default for this value is 0 (off).
+
+.in 9
+\fIscheduler\fP\fB="\fPrr\fB"\fP
+.in 12
+Valid values are 'rr', 'fifo', and 'other'.  Selects the scheduling queue
+in the Linux kernel for operation of the main & score threads (does not
+affect the heuristics; they are always run in the 'other' queue).  Default
+is 'rr'.  See sched_setscheduler(2) for more details.
+
+.in 9
+\fIpriority\fP\fB="\fP1\fB"\fP
+.in 12
+Valid values for 'rr' and 'fifo' are 1..99 inclusive.  Valid values
+for 'other' are -20..19 inclusive.  Sets the priority of the main & score
+threads.  The default value is 1 (in the RR and FIFO queues, higher numbers
+denote higher priority; in OTHER, lower values denote higher priority).
+
+.in 9
+\fIstop_cman\fP\fB="\fP0\fB"\fP
+.in 12
+Ordinarily, cluster membership is left up to CMAN, not qdisk.
+If this parameter is set to 1 (on), qdiskd will tell CMAN to leave the
+cluster if it is unable to initialize the quorum disk during startup.  This
+can be used to prevent cluster participation by a node which has been 
+disconnected from the SAN.  The default for this value is 0 (off).
+
+.in 9
 \fIdevice\fP\fB="\fP/dev/sda1\fB"\fP
 .in 12
 This is the device the quorum daemon will use.  This device must be the
@@ -256,6 +307,8 @@
 on every block device found, comparing the label against the specified
 label.  This is useful in configurations where the block device name
 differs on a per-node basis.
+.in 8
+\fB...>\fP
 .in 0
 
 .SH "3.2.  The <heuristic> tag"
@@ -268,34 +321,80 @@
 .in 12
 This is the program used to determine if this heuristic is alive.  This
 can be anything which may be executed by \fI/bin/sh -c\fP.  A return
-value of zero indicates success; anything else indicates failure.
+value of zero indicates success; anything else indicates failure.  This
+is required.
 
 .in 9
 \fIscore\fP\fB="\fP1\fB"\fP
 .in 12
 This is the weight of this heuristic.  Be careful when determining scores
-for heuristics.
+for heuristics.  The default score for each heuristic is 1.
 
 .in 9
 \fIinterval\fP\fB="\fP2\fB"/>\fP
 .in 12
-This is the frequency at which we poll the heuristic.
+This is the frequency (in seconds) at which we poll the heuristic.  The
+default interval for every heuristic is 2 seconds.
+.in 0
+
+.in 9
+\fItko\fP\fB="\fP1\fB"\fP
+.in 12
+After this many failed attempts to run the heuristic, it is considered DOWN,
+and its score is removed.  The default tko for each heuristic is 1, which 
+may be inadequate for things such as 'ping'.
+.in 8
+\fB/>\fP
 .in 0
 
-.SH "3.3. Example"
+
+.SH "3.3. Examples"
+.SH "3.3.1. 3 cluster nodes & 3 routers"
+.in 8
+<cman expected_votes="6" .../>
+.br
+<clusternodes>
+.in 12
+<clusternode name="node1" votes="1" ... />
+.br
+<clusternode name="node2" votes="1" ... />
+.br
+<clusternode name="node3" votes="1" ... />
 .in 8
+</clusternodes>
+.br
 <quorumd interval="1" tko="10" votes="3" label="testing">
 .in 12
-<heuristic program="ping A -c1 -t1" score="1" interval="2"/>
+<heuristic program="ping A -c1 -t1" score="1" interval="2" tko="3"/>
+.br
+<heuristic program="ping B -c1 -t1" score="1" interval="2" tko="3"/>
+.br
+<heuristic program="ping C -c1 -t1" score="1" interval="2" tko="3"/>
+.br
+.in 8
+</quorumd>
+
+.SH "3.3.2. 2 cluster nodes & 1 IP tiebreaker"
+.in 8
+<cman two_node="0" expected_votes="3" .../>
+.br
+<clusternodes>
+.in 12
+<clusternode name="node1" votes="1" ... />
 .br
-<heuristic program="ping B -c1 -t1" score="1" interval="2"/>
+<clusternode name="node2" votes="1" ... />
+.in 8
+</clusternodes>
 .br
-<heuristic program="ping C -c1 -t1" score="1" interval="2"/>
+<quorumd interval="1" tko="10" votes="1" label="testing">
+.in 12
+<heuristic program="ping A -c1 -t1" score="1" interval="2" tko="3"/>
 .br
 .in 8
 </quorumd>
 .in 0
 
+
 .SH "3.4. Heuristic score considerations"
 * Heuristic timeouts should be set high enough to allow the previous run
 of a given heuristic to complete.
--- cluster/cman/man/mkqdisk.8	2006/07/21 17:55:04	1.2
+++ cluster/cman/man/mkqdisk.8	2007/01/22 22:50:12	1.2.6.1
@@ -13,11 +13,16 @@
 .IP "\-c device \-l label"
 Initialize a new cluster quorum disk.  This will destroy all data on the given
 device.  If a cluster is currently using that device as a quorum disk, the
-entire cluster will malfunction.  Do not ru
+entire cluster will malfunction.  Do not run this on an active cluster when
+qdiskd is running.  Only one device on the SAN should ever have the given
+label; using multiple different devices is currently not supported (a RAID
+array is expected to provide quorum disk redundancy).  The label can be any
+textual string up to 127 characters, which is enough space to hold a UUID
+created with uuidgen(1).
 .IP "\-f label"
-Find the cluster quorum disk with the given label and display information about it..
+Find the cluster quorum disk with the given label and display information about it.
 .IP "\-L"
 Display information on all accessible cluster quorum disks.
 
 .SH "SEE ALSO"
-qdisk(5) qdiskd(8)
+qdisk(5), qdiskd(8), uuidgen(1)