From: lhh@sourceware.org <lhh@sourceware.org>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] cluster/cman qdisk/score.c qdisk/disk.h qdisk/ ...
Date: 22 Jan 2007 22:50:40 -0000 [thread overview]
Message-ID: <20070122225040.7270.qmail@sourceware.org> (raw)
CVSROOT: /cvs/cluster
Module name: cluster
Branch: RHEL4
Changes by: lhh at sourceware.org 2007-01-22 22:50:38
Modified files:
cman/qdisk : score.c disk.h score.h clulog.c main.c Makefile
gettid.c
cman/man : qdisk.5 mkqdisk.8
Added files:
cman/qdisk : daemon_init.c
Log message:
Resolves bugzillas: #213533, #216092, #220211, #223002, #223234/#223240
Detailed comments:
* Lock in memory to prevent being swapped out
* Turn on RR scheduling for main + score threads
* Let qdiskd wait for CMAN to start
* Add option to qdiskd to stop CMAN if qdisk device is not available
* Make qdisk interval timings more accurate
* Add option to reboot node if qdiskd detects an internal hang longer than the failure time (i.e. interval*tko, in seconds)
* Add per-heuristic tko counts for unreliable heuristics (e.g. ping packets)
* Remove nodes from quorate mask immediately on eviction
* Update man pages with better examples
* Don't let >1 instance of qdiskd be started
* Clarify logging output.
* Improve data in status_file.
* Allow qdiskd to run with no defined heuristics (master-always-wins mode).
* Make fencing of nodes optional (default = on).
* Make sure CMAN is running before we try to talk to it at each point.
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/daemon_init.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=NONE&r2=1.1.4.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/score.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.1&r2=1.1.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/disk.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.3&r2=1.1.2.4
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/score.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.1&r2=1.1.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/clulog.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.1&r2=1.1.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/main.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.4&r2=1.1.2.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/Makefile.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.2&r2=1.1.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/gettid.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.2&r2=1.1.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/qdisk.5.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.1&r2=1.1.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/mkqdisk.8.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.1&r2=1.1.2.2
--- cluster/cman/qdisk/score.c 2006/05/18 14:52:49 1.1.2.1
+++ cluster/cman/qdisk/score.c 2007/01/22 22:50:38 1.1.2.2
@@ -32,14 +32,20 @@
#include <string.h>
#include <ccs.h>
#include <clulog.h>
+#include <sched.h>
+#include <sys/mman.h>
+#include "disk.h"
#include "score.h"
static pthread_mutex_t sc_lock = PTHREAD_MUTEX_INITIALIZER;
static int _score = 0, _maxscore = 0, _score_thread_running = 0;
static pthread_t score_thread = (pthread_t)0;
+void set_priority(int, int);
struct h_arg {
struct h_data *h;
+ int sched_queue;
+ int sched_prio;
int count;
};
@@ -97,6 +103,20 @@
h->childpid = pid;
return 0;
}
+
+ /*
+ * always use SCHED_OTHER for the child processes
+ * nice -1 is fine; but we don't know what the child process
+ * might do, so leaving it (potentially) in SCHED_RR or SCHED_FIFO
+ * is out of the question
+ *
+ * XXX if you set SCHED_OTHER in the conf file and nice 20, the below
+ * will make the heuristics a higher prio than qdiskd. This should be
+ * fine in practice, because running qdiskd at nice 20 will cause all
+ * sorts of problems on a busy system.
+ */
+ set_priority(SCHED_OTHER, -1);
+ munlockall();
argv[0] = "/bin/sh";
argv[1] = "-c";
@@ -122,6 +142,12 @@
*score = 0;
*maxscore = 0;
+
+ /* Allow operation w/o any heuristics */
+ if (!max) {
+ *score = *maxscore = 1;
+ return;
+ }
for (x = 0; x < max; x++) {
*maxscore += h[x].score;
@@ -141,22 +167,51 @@
int status;
if (h->childpid == 0)
+ /* No child to check */
return 0;
ret = waitpid(h->childpid, &status, block?0:WNOHANG);
if (!block && ret == 0)
+ /* No children exited */
return 0;
h->childpid = 0;
- h->available = 0;
if (ret < 0 && errno == ECHILD)
- return -1;
- if (!WIFEXITED(status))
- return 0;
- if (WEXITSTATUS(status) != 0)
- return 0;
- h->available = 1;
+ /* wrong child? */
+ goto miss;
+ if (!WIFEXITED(status)) {
+ ret = 0;
+ goto miss;
+ }
+ if (WEXITSTATUS(status) != 0) {
+ ret = 0;
+ goto miss;
+ }
+
+ /* Returned 0 and was not killed */
+ if (!h->available) {
+ h->available = 1;
+ clulog(LOG_INFO, "Heuristic: '%s' UP\n", h->program);
+ }
+ h->misses = 0;
return 0;
+
+miss:
+ if (h->available) {
+ h->misses++;
+ if (h->misses >= h->tko) {
+ clulog(LOG_INFO,
+ "Heuristic: '%s' DOWN (%d/%d)\n",
+ h->program, h->misses, h->tko);
+ h->available = 0;
+ } else {
+ clulog(LOG_DEBUG,
+ "Heuristic: '%s' missed (%d/%d)\n",
+ h->program, h->misses, h->tko);
+ }
+ }
+
+ return ret;
}
@@ -204,7 +259,9 @@
do {
h[x].program = NULL;
h[x].available = 0;
+ h[x].misses = 0;
h[x].interval = 2;
+ h[x].tko = 1;
h[x].score = 1;
h[x].childpid = 0;
h[x].nextrun = 0;
@@ -236,9 +293,20 @@
if (h[x].interval <= 0)
h[x].interval = 2;
}
+
+ /* Get tko for this heuristic */
+ snprintf(query, sizeof(query),
+ "/cluster/quorumd/heuristic[%d]/@tko", x+1);
+ if (ccs_get(ccsfd, query, &val) == 0) {
+ h[x].tko= atoi(val);
+ free(val);
+ if (h[x].tko <= 0)
+ h[x].tko = 1;
+ }
- clulog(LOG_DEBUG, "Heuristic: '%s' score=%d interval=%d\n",
- h[x].program, h[x].score, h[x].interval);
+ clulog(LOG_DEBUG,
+ "Heuristic: '%s' score=%d interval=%d tko=%d\n",
+ h[x].program, h[x].score, h[x].interval, h[x].tko);
} while (++x < max);
@@ -271,6 +339,8 @@
{
struct h_arg *args = (struct h_arg *)arg;
int score, maxscore;
+
+ set_priority(args->sched_queue, args->sched_prio);
while (_score_thread_running) {
fork_heuristics(args->h, args->count);
@@ -317,7 +387,7 @@
to pass in h if it was allocated on the stack.
*/
int
-start_score_thread(struct h_data *h, int count)
+start_score_thread(qd_ctx *ctx, struct h_data *h, int count)
{
pthread_attr_t attrs;
struct h_arg *args;
@@ -337,8 +407,11 @@
memcpy(args->h, h, (sizeof(struct h_data) * count));
args->count = count;
+ args->sched_queue = ctx->qc_sched;
+ args->sched_prio = ctx->qc_sched_prio;
_score_thread_running = 1;
+
pthread_attr_init(&attrs);
pthread_attr_setinheritsched(&attrs, PTHREAD_INHERIT_SCHED);
pthread_create(&score_thread, &attrs, score_thread_main, args);
--- cluster/cman/qdisk/disk.h 2006/10/05 20:30:21 1.1.2.3
+++ cluster/cman/qdisk/disk.h 2007/01/22 22:50:38 1.1.2.4
@@ -67,7 +67,11 @@
typedef enum {
- RF_REBOOT = 0x1 /* Reboot if we go from master->none */
+ RF_REBOOT = 0x1, /* Reboot if we go from master->none */
+ RF_STOP_CMAN = 0x2,
+ RF_DEBUG = 0x4,
+ RF_PARANOID = 0x8,
+ RF_ALLOW_KILL = 0x10
} run_flag_t;
@@ -237,6 +241,8 @@
int qc_tko;
int qc_votes;
int qc_scoremin;
+ int qc_sched;
+ int qc_sched_prio;
disk_node_state_t qc_disk_status;
disk_node_state_t qc_status;
int qc_master; /* Master?! */
--- cluster/cman/qdisk/score.h 2006/05/18 14:52:49 1.1.2.1
+++ cluster/cman/qdisk/score.h 2007/01/22 22:50:38 1.1.2.2
@@ -32,7 +32,9 @@
char * program;
int score;
int available;
+ int tko;
int interval;
+ int misses;
pid_t childpid;
time_t nextrun;
};
@@ -50,7 +52,7 @@
/*
Start the thread which runs the scoring applets
*/
-int start_score_thread(struct h_data *h, int count);
+int start_score_thread(qd_ctx *ctx, struct h_data *h, int count);
/*
Get our score + maxscore
--- cluster/cman/qdisk/clulog.c 2006/05/18 14:52:49 1.1.2.1
+++ cluster/cman/qdisk/clulog.c 2007/01/22 22:50:38 1.1.2.2
@@ -20,8 +20,6 @@
/** @file
* Library routines for communicating with the logging daemon.
*
- * $Id: clulog.c,v 1.1.2.1 2006/05/18 14:52:49 lhh Exp $
- *
* Author: Jeff Moyer <moyer@missioncriticallinux.com>
*/
#include <stdio.h>
@@ -50,8 +48,6 @@
#include <string.h>
-static const char *version __attribute__ ((unused)) = "$Revision: 1.1.2.1 $";
-
#ifdef DEBUG
#include <assert.h>
#define Dprintf(fmt,args...) printf(fmt,##args)
@@ -135,7 +131,7 @@
}
pthread_mutex_unlock(&log_mutex);
- return "local4";
+ return "daemon";
}
@@ -156,7 +152,6 @@
for (; facilitynames[x].c_name; x++) {
if (strcmp(facilityname, facilitynames[x].c_name))
continue;
-
syslog_facility = facilitynames[x].c_val;
break;
}
--- cluster/cman/qdisk/main.c 2007/01/16 15:36:17 1.1.2.4
+++ cluster/cman/qdisk/main.c 2007/01/22 22:50:38 1.1.2.5
@@ -35,11 +35,21 @@
#include <unistd.h>
#include <time.h>
#include <sys/reboot.h>
+#include <sys/time.h>
#include <linux/reboot.h>
+#include <sched.h>
#include <signal.h>
#include <ccs.h>
#include "score.h"
#include "clulog.h"
+#if (!defined(LIBCMAN_VERSION) || \
+ (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION < 2))
+#include <cluster/cnxman-socket.h>
+#endif
+
+int daemon_init(char *);
+int check_process_running(char *, pid_t *);
+
/*
TODO:
1) Take into account timings to gracefully extend node timeouts during
@@ -155,6 +165,11 @@
if (sb->ps_timestamp == ni[x].ni_last_seen) {
/* XXX check for average + allow grace */
ni[x].ni_misses++;
+ if (ni[x].ni_misses > 1) {
+ clulog(LOG_DEBUG,
+ "Node %d missed an update (%d/%d)\n",
+ x+1, ni[x].ni_misses, ctx->qc_tko);
+ }
continue;
}
@@ -208,6 +223,11 @@
ni[x].ni_misses = 0;
ni[x].ni_state = S_NONE;
+ /* Clear our master mask for the node after eviction
+ * or shutdown */
+ if (mask)
+ clear_bit(mask, (ni[x].ni_status.ps_nodeid-1),
+ sizeof(memb_mask_t));
continue;
}
@@ -227,15 +247,17 @@
Write eviction notice if we're the master.
*/
if (ctx->qc_status == S_MASTER) {
- clulog(LOG_DEBUG,
+ clulog(LOG_NOTICE,
"Writing eviction notice for node %d\n",
ni[x].ni_status.ps_nodeid);
qd_write_status(ctx, ni[x].ni_status.ps_nodeid,
S_EVICT, NULL, NULL, NULL);
- clulog(LOG_DEBUG,
- "Telling CMAN to kill the node\n");
- cman_kill_node(ctx->qc_ch,
- ni[x].ni_status.ps_nodeid);
+ if (ctx->qc_flags & RF_ALLOW_KILL) {
+ clulog(LOG_DEBUG, "Telling CMAN to "
+ "kill the node\n");
+ cman_kill_node(ctx->qc_ch,
+ ni[x].ni_status.ps_nodeid);
+ }
}
/*
@@ -255,6 +277,10 @@
ni[x].ni_evil_incarnation =
ni[x].ni_status.ps_incarnation;
+ /* Clear our master mask for the node after eviction */
+ if (mask)
+ clear_bit(mask, (ni[x].ni_status.ps_nodeid-1),
+ sizeof(memb_mask_t));
continue;
}
@@ -279,9 +305,12 @@
ni[x].ni_status.ps_state = S_EVICT;
/* XXX Need to fence it again */
- clulog(LOG_DEBUG, "Telling CMAN to kill the node\n");
- cman_kill_node(ctx->qc_ch,
- ni[x].ni_status.ps_nodeid);
+ if (ctx->qc_flags & RF_ALLOW_KILL) {
+ clulog(LOG_DEBUG, "Telling CMAN to "
+ "kill the node\n");
+ cman_kill_node(ctx->qc_ch,
+ ni[x].ni_status.ps_nodeid);
+ }
continue;
}
@@ -416,6 +445,10 @@
int x = 0, score, maxscore;
clulog(LOG_INFO, "Quorum Daemon Initializing\n");
+
+ if (mlockall(MCL_CURRENT|MCL_FUTURE) != 0) {
+ clulog(LOG_ERR, "Unable to mlockall()\n");
+ }
if (qdisk_validate(ctx->qc_device) < 0)
return -1;
@@ -427,7 +460,7 @@
return -1;
}
- start_score_thread(h, maxh);
+ start_score_thread(ctx, h, maxh);
node_info_init(ni, max);
if (qd_write_status(ctx, ctx->qc_my_id,
@@ -447,7 +480,6 @@
}
sleep(ctx->qc_interval);
-
}
get_my_score(&score,&maxscore);
@@ -500,12 +532,16 @@
return;
memset(master_mask, 0, sizeof(master_mask));
-
for (x = 0; x < retnodes; x++) {
if (is_bit_set(mask, nodes[x].cn_nodeid-1, sizeof(mask)) &&
- nodes[x].cn_member)
+ nodes[x].cn_member) {
set_bit(master_mask, nodes[x].cn_nodeid-1,
sizeof(master_mask));
+ } else {
+ /* Not in CMAN output = not allowed */
+ clear_bit(master_mask, (nodes[x].cn_nodeid-1),
+ sizeof(memb_mask_t));
+ }
}
}
@@ -604,12 +640,25 @@
}
fprintf(fp, "Node ID: %d\n", ctx->qc_my_id);
- fprintf(fp, "Score (current / min req. / max allowed): %d / %d / %d\n",
- score, score_req, score_max);
+
+ if (ctx->qc_master)
+ fprintf(fp, "Master Node ID: %d\n", ctx->qc_master);
+ else
+ fprintf(fp, "Master Node ID: (none)\n");
+
+ fprintf(fp, "Score: %d/%d (Minimum required = %d)\n",
+ score, score_max, score_req);
fprintf(fp, "Current state: %s\n", state_str(ctx->qc_status));
fprintf(fp, "Current disk state: %s\n",
state_str(ctx->qc_disk_status));
+ fprintf(fp, "Initializing Set: {");
+ for (x=0; x<max; x++) {
+ if (ni[x].ni_state == S_INIT)
+ fprintf(fp," %d", ni[x].ni_status.ps_nodeid);
+ }
+ fprintf(fp, " }\n");
+
fprintf(fp, "Visible Set: {");
for (x=0; x<max; x++) {
if (ni[x].ni_state >= S_RUN || ni[x].ni_status.ps_nodeid ==
@@ -617,13 +666,10 @@
fprintf(fp," %d", ni[x].ni_status.ps_nodeid);
}
fprintf(fp, " }\n");
-
- if (!ctx->qc_master) {
- fprintf(fp, "No master node\n");
+
+ if (!ctx->qc_master)
goto out;
- }
- fprintf(fp, "Master Node ID: %d\n", ctx->qc_master);
fprintf(fp, "Quorate Set: {");
for (x=0; x<max; x++) {
if (is_bit_set(ni[ctx->qc_master-1].ni_status.ps_master_mask,
@@ -642,18 +688,141 @@
}
+/* Timeval functions from clumanager */
+/**
+ * Scale a (struct timeval).
+ *
+ * @param tv The timeval to scale.
+ * @param scale Positive multiplier.
+ * @return tv
+ */
+struct timeval *
+_scale_tv(struct timeval *tv, int scale)
+{
+ tv->tv_sec *= scale;
+ tv->tv_usec *= scale;
+
+ if (tv->tv_usec > 1000000) {
+ tv->tv_sec += (tv->tv_usec / 1000000);
+ tv->tv_usec = (tv->tv_usec % 1000000);
+ }
+
+ return tv;
+}
+
+
+static inline void
+_diff_tv(struct timeval *dest, struct timeval *start, struct timeval *end)
+{
+ dest->tv_sec = end->tv_sec - start->tv_sec;
+ dest->tv_usec = end->tv_usec - start->tv_usec;
+
+ if (dest->tv_usec < 0) {
+ dest->tv_usec += 1000000;
+ dest->tv_sec--;
+ }
+}
+
+
+#define _print_tv(val) \
+ printf("%s: %d.%06d\n", #val, (int)((val)->tv_sec), \
+ (int)((val)->tv_usec))
+
+
+static inline int
+_cmp_tv(struct timeval *left, struct timeval *right)
+{
+ if (left->tv_sec > right->tv_sec)
+ return -1;
+
+ if (left->tv_sec < right->tv_sec)
+ return 1;
+
+ if (left->tv_usec > right->tv_usec)
+ return -1;
+
+ if (left->tv_usec < right->tv_usec)
+ return 1;
+
+ return 0;
+}
+
+
+void
+set_priority(int queue, int prio)
+{
+ struct sched_param s;
+ int ret;
+ char *func = "nice";
+
+ if (queue == SCHED_OTHER) {
+ s.sched_priority = 0;
+ ret = sched_setscheduler(0, queue, &s);
+ errno = 0;
+ ret = nice(prio);
+ } else {
+ memset(&s,0,sizeof(s));
+ s.sched_priority = prio;
+ ret = sched_setscheduler(0, queue, &s);
+ func = "sched_setscheduler";
+ }
+
+ if (ret < 0 && errno) {
+ clulog(LOG_WARNING, "set_priority [%s] failed: %s\n", func,
+ strerror(errno));
+ }
+}
+
+
+int
+cman_alive(cman_handle_t ch)
+{
+ fd_set rfds;
+ int fd = cman_get_fd(ch);
+ struct timeval tv = {0, 0};
+
+ FD_ZERO(&rfds);
+ FD_SET(fd, &rfds);
+ if (select(fd + 1, &rfds, NULL, NULL, &tv) == 1) {
+ if (cman_dispatch(ch, CMAN_DISPATCH_ALL) < 0) {
+ if (errno == EAGAIN)
+ return 0;
+ return -1;
+ }
+ }
+ return 0;
+}
+
int
quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
{
disk_msg_t msg = {0, 0, 0};
- int low_id, bid_pending = 0, score, score_max, score_req;
+ int low_id, bid_pending = 0, score, score_max, score_req,
+ upgrade = 0;
memb_mask_t mask, master_mask;
+ struct timeval maxtime, oldtime, newtime, diff, sleeptime, interval;
- ctx->qc_status = S_RUN;
+ ctx->qc_status = S_NONE;
+
+ maxtime.tv_usec = 0;
+ maxtime.tv_sec = ctx->qc_interval * ctx->qc_tko;
+
+ interval.tv_usec = 0;
+ interval.tv_sec = ctx->qc_interval;
+
+ get_my_score(&score, &score_max);
+ if (score_max < ctx->qc_scoremin) {
+ clulog(LOG_WARNING, "Minimum score (%d) is impossible to "
+ "achieve (heuristic total = %d)\n",
+ ctx->qc_scoremin, score_max);
+ }
_running = 1;
while (_running) {
+ /* XXX this was getuptime() in clumanager */
+ gettimeofday(&oldtime, NULL);
+
/* Read everyone else's status */
read_node_blocks(ctx, ni, max);
@@ -662,6 +831,7 @@
/* Check heuristics and remove ourself if necessary */
get_my_score(&score, &score_max);
+ upgrade = 0;
score_req = ctx->qc_scoremin;
if (score_req <= 0)
@@ -672,14 +842,19 @@
if (ctx->qc_status > S_NONE) {
clulog(LOG_NOTICE,
"Score insufficient for master "
- "operation (%d/%d; max=%d); "
+ "operation (%d/%d; required=%d); "
"downgrading\n",
- score, score_req, score_max);
+ score, score_max, score_req);
ctx->qc_status = S_NONE;
msg.m_msg = M_NONE;
++msg.m_seq;
bid_pending = 0;
- cman_poll_quorum_device(ctx->qc_ch, 0);
+ if (cman_alive(ctx->qc_ch) < 0) {
+ clulog(LOG_ERR, "cman: %s\n",
+ strerror(errno));
+ } else {
+ cman_poll_quorum_device(ctx->qc_ch, 0);
+ }
if (ctx->qc_flags & RF_REBOOT)
reboot(RB_AUTOBOOT);
}
@@ -688,10 +863,13 @@
if (ctx->qc_status == S_NONE) {
clulog(LOG_NOTICE,
"Score sufficient for master "
- "operation (%d/%d; max=%d); "
+ "operation (%d/%d; required=%d); "
"upgrading\n",
- score, score_req, score_max);
+ score, score_max, score_req);
ctx->qc_status = S_RUN;
+ upgrade = (ctx->qc_tko / 3);
+ if (upgrade == 0)
+ upgrade = 1;
}
}
@@ -702,11 +880,13 @@
if (!ctx->qc_master &&
low_id == ctx->qc_my_id &&
ctx->qc_status == S_RUN &&
- !bid_pending ) {
+ !bid_pending &&
+ !upgrade) {
/*
If there's no master, and we are the lowest node
ID, make a bid to become master if we're not
- already bidding.
+ already bidding. We can't do this if we've just
+ upgraded.
*/
clulog(LOG_DEBUG,"Making bid for master\n");
@@ -724,10 +904,18 @@
/* We're currently bidding for master.
See if anyone's voted, or if we should
rescind our bid */
+ ++bid_pending;
/* Yes, those are all deliberate fallthroughs */
switch (check_votes(ctx, ni, max, &msg)) {
case 3:
+ /*
+ * Give ample time to become aware of other
+ * nodes
+ */
+ if (bid_pending < (ctx->qc_tko / 3))
+ break;
+
clulog(LOG_INFO,
"Assuming master role\n");
ctx->qc_status = S_MASTER;
@@ -755,6 +943,13 @@
/* We are the master. Poll the quorum device.
We can't be the master unless we score high
enough on our heuristics. */
+ if (cman_alive(ctx->qc_ch) < 0) {
+ clulog(LOG_ERR, "cman_dispatch: %s\n",
+ strerror(errno));
+ clulog(LOG_ERR,
+ "Halting qdisk operations\n");
+ return -1;
+ }
check_cman(ctx, mask, master_mask);
cman_poll_quorum_device(ctx->qc_ch, 1);
@@ -768,6 +963,13 @@
ni[ctx->qc_master-1].ni_status.ps_master_mask,
ctx->qc_my_id-1,
sizeof(memb_mask_t))) {
+ if (cman_alive(ctx->qc_ch) < 0) {
+ clulog(LOG_ERR, "cman_dispatch: %s\n",
+ strerror(errno));
+ clulog(LOG_ERR,
+ "Halting qdisk operations\n");
+ return -1;
+ }
cman_poll_quorum_device(ctx->qc_ch, 1);
}
}
@@ -783,8 +985,43 @@
/* Cycle. We could time the loop and sleep
usleep(interval-looptime), but this is fine for now.*/
+ gettimeofday(&newtime, NULL);
+ _diff_tv(&diff, &oldtime, &newtime);
+
+ /*
+ * Reboot if we didn't send a heartbeat in interval*TKO_COUNT
+ */
+ if (_cmp_tv(&maxtime, &diff) == 1 &&
+ ctx->qc_flags & RF_PARANOID) {
+ clulog(LOG_EMERG, "Failed to complete a cycle within "
+ "%d second%s (%d.%06d) - REBOOTING\n",
+ (int)maxtime.tv_sec,
+ maxtime.tv_sec==1?"":"s",
+ (int)diff.tv_sec,
+ (int)diff.tv_usec);
+ if (!(ctx->qc_flags & RF_DEBUG))
+ reboot(RB_AUTOBOOT);
+ }
+
+ /*
+ * If the loop completed in less time than our interval, sleep only
+ * for the remainder; otherwise warn and sleep a full interval.
+ *
+ * It's not really "realtime", but it helps!
+ */
+ if (_cmp_tv(&diff, &interval) == 1) {
+ _diff_tv(&sleeptime, &diff, &interval);
+ } else {
+ clulog(LOG_WARNING, "qdisk cycle took more "
+ "than %d second%s to complete (%d.%06d)\n",
+ ctx->qc_interval, ctx->qc_interval==1?"":"s",
+ (int)diff.tv_sec, (int)diff.tv_usec);
+ memcpy(&sleeptime, &interval, sizeof(sleeptime));
+ }
+
+ /* Could hit a watchdog timer here if we wanted to */
if (_running)
- sleep(ctx->qc_interval);
+ select(0, NULL, NULL, NULL, &sleeptime);
}
return 0;
@@ -829,12 +1066,15 @@
ctx->qc_interval = 1;
ctx->qc_tko = 10;
ctx->qc_scoremin = 0;
- ctx->qc_flags = RF_REBOOT;
+ ctx->qc_flags = RF_REBOOT | RF_ALLOW_KILL; /* | RF_STOP_CMAN;*/
+ ctx->qc_sched = SCHED_RR;
+ ctx->qc_sched_prio = 1;
/* Get log log_facility */
snprintf(query, sizeof(query), "/cluster/quorumd/@log_facility");
if (ccs_get(ccsfd, query, &val) == 0) {
clu_set_facility(val);
+ clulog(LOG_DEBUG, "Log facility: %s\n", val);
free(val);
}
@@ -903,6 +1143,37 @@
if (ctx->qc_scoremin < 0)
ctx->qc_scoremin = 0;
}
+
+ /* Get scheduling queue */
+ snprintf(query, sizeof(query), "/cluster/quorumd/@scheduler");
+ if (ccs_get(ccsfd, query, &val) == 0) {
+ switch(val[0]) {
+ case 'r':
+ case 'R':
+ ctx->qc_sched = SCHED_RR;
+ break;
+ case 'f':
+ case 'F':
+ ctx->qc_sched = SCHED_FIFO;
+ break;
+ case 'o':
+ case 'O':
+ ctx->qc_sched = SCHED_OTHER;
+ break;
+ default:
+ clulog(LOG_WARNING, "Invalid scheduling queue '%s'\n",
+ val);
+ break;
+ }
+ free(val);
+ }
+
+ /* Get priority */
+ snprintf(query, sizeof(query), "/cluster/quorumd/@priority");
+ if (ccs_get(ccsfd, query, &val) == 0) {
+ ctx->qc_sched_prio = atoi(val);
+ free(val);
+ }
/* Get reboot flag for when we transition -> offline */
/* default = on, so, 0 to turn off */
@@ -912,6 +1183,50 @@
ctx->qc_flags &= ~RF_REBOOT;
free(val);
}
+
+ /*
+ * Get flag to see if we're supposed to kill cman if qdisk is not
+ * available.
+ */
+ /* default = off, so, 1 to turn on */
+ snprintf(query, sizeof(query), "/cluster/quorumd/@stop_cman");
+ if (ccs_get(ccsfd, query, &val) == 0) {
+ if (!atoi(val))
+ ctx->qc_flags &= ~RF_STOP_CMAN;
+ else
+ ctx->qc_flags |= RF_STOP_CMAN;
+ free(val);
+ }
+
+
+ /*
+ * Get flag to see if we're supposed to reboot if we can't complete
+ * a pass in failure time
+ */
+ /* default = off, so, 1 to turn on */
+ snprintf(query, sizeof(query), "/cluster/quorumd/@paranoid");
+ if (ccs_get(ccsfd, query, &val) == 0) {
+ if (!atoi(val))
+ ctx->qc_flags &= ~RF_PARANOID;
+ else
+ ctx->qc_flags |= RF_PARANOID;
+ free(val);
+ }
+
+
+ /*
+ * Get flag to see if we're allowed to tell CMAN to kill nodes
+ * which we have evicted
+ */
+ /* default = on, so, 0 to turn off */
+ snprintf(query, sizeof(query), "/cluster/quorumd/@allow_kill");
+ if (ccs_get(ccsfd, query, &val) == 0) {
+ if (!atoi(val))
+ ctx->qc_flags &= ~RF_ALLOW_KILL;
+ else
+ ctx->qc_flags |= RF_ALLOW_KILL;
+ free(val);
+ }
*cfh = configure_heuristics(ccsfd, h, maxh);
@@ -925,18 +1240,47 @@
}
+void
+check_stop_cman(qd_ctx *ctx)
+{
+ if (!(ctx->qc_flags & RF_STOP_CMAN))
+ return;
+
+ clulog(LOG_WARNING, "Telling CMAN to leave the cluster; qdisk is not"
+ " available\n");
+#if (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION >= 2)
+ if (cman_shutdown(ctx->qc_ch, 0) < 0) {
+#else
+ int x = 0;
+ if (ioctl(cman_get_fd(ctx->qc_ch), SIOCCLUSTER_LEAVE_CLUSTER, &x) < 0) {
+#endif
+ clulog(LOG_CRIT, "Could not leave the cluster - rebooting\n");
+ sleep(5);
+ if (ctx->qc_flags & RF_DEBUG)
+ return;
+ reboot(RB_AUTOBOOT);
+ }
+}
+
+
int
main(int argc, char **argv)
{
cman_node_t me;
- int cfh, rv;
+ int cfh, rv, forked = 0;
qd_ctx ctx;
cman_handle_t ch;
node_info_t ni[MAX_NODES_DISK];
struct h_data h[10];
char debug = 0, foreground = 0;
char device[128];
-
+ pid_t pid;
+
+ if (check_process_running(argv[0], &pid) && pid !=getpid()) {
+ printf("QDisk services already running\n");
+ return 0;
+ }
+
while ((rv = getopt(argc, argv, "fd")) != EOF) {
switch (rv) {
case 'd':
@@ -944,39 +1288,64 @@
break;
case 'f':
foreground = 1;
+ clu_log_console(1);
default:
break;
}
}
+
#if (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION >= 2)
ch = cman_admin_init(NULL);
#else
ch = cman_init(NULL);
#endif
if (!ch) {
- printf("Could not connect to cluster (CMAN not running?)\n");
- return -1;
+ if (!foreground && !forked) {
+ if (daemon_init(argv[0]) < 0)
+ return -1;
+ else
+ forked = 1;
+ }
+
+ clulog(LOG_INFO, "Waiting for CMAN to start\n");
+
+ do {
+ sleep(5);
+#if (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION >= 2)
+ ch = cman_admin_init(NULL);
+#else
+ ch = cman_init(NULL);
+#endif
+ } while (!ch);
}
- if (cman_get_node(ch, CMAN_NODEID_US, &me) < 0) {
- printf("Could not determine local node ID; cannot start\n");
- return -1;
+ memset(&me, 0, sizeof(me));
+ while (cman_get_node(ch, CMAN_NODEID_US, &me) < 0) {
+ if (!foreground && !forked) {
+ if (daemon_init(argv[0]) < 0)
+ return -1;
+ else
+ forked = 1;
+ }
+ sleep(5);
}
qd_init(&ctx, ch, me.cn_nodeid);
signal(SIGINT, int_handler);
+ signal(SIGTERM, int_handler);
- if (debug)
+ if (debug) {
clu_set_loglevel(LOG_DEBUG);
- if (foreground)
- clu_log_console(1);
+ ctx.qc_flags |= RF_DEBUG;
+ }
if (get_config_data(NULL, &ctx, h, 10, &cfh, debug) < 0) {
clulog_and_print(LOG_CRIT, "Configuration failed\n");
+ check_stop_cman(&ctx);
return -1;
}
-
+
if (ctx.qc_label) {
if (find_partitions("/proc/partitions",
ctx.qc_label, device,
@@ -984,6 +1353,7 @@
clulog_and_print(LOG_CRIT, "Unable to match label"
" '%s' to any device\n",
ctx.qc_label);
+ check_stop_cman(&ctx);
return -1;
}
@@ -999,15 +1369,21 @@
clulog(LOG_CRIT,
"Specified partition %s does not have a "
"qdisk label\n", ctx.qc_device);
+ check_stop_cman(&ctx);
return -1;
}
}
- if (!foreground)
- daemon(0,0);
+ if (!foreground && !forked) {
+ if (daemon_init(argv[0]) < 0)
+ return -1;
+ }
+
+ set_priority(ctx.qc_sched, ctx.qc_sched_prio);
if (quorum_init(&ctx, ni, MAX_NODES_DISK, h, cfh) < 0) {
clulog_and_print(LOG_CRIT, "Initialization failed\n");
+ check_stop_cman(&ctx);
return -1;
}
@@ -1025,14 +1401,12 @@
}
*/
- quorum_loop(&ctx, ni, MAX_NODES_DISK);
- cman_unregister_quorum_device(ctx.qc_ch);
+ if (quorum_loop(&ctx, ni, MAX_NODES_DISK) == 0)
+ cman_unregister_quorum_device(ctx.qc_ch);
quorum_logout(&ctx);
-
qd_destroy(&ctx);
return 0;
-
}
--- cluster/cman/qdisk/Makefile 2006/06/23 16:01:02 1.1.2.2
+++ cluster/cman/qdisk/Makefile 2007/01/22 22:50:38 1.1.2.3
@@ -19,6 +19,16 @@
CFLAGS +=-I${incdir} -I${top_srcdir}/config \
-Wall -Werror -Wstrict-prototypes -Wshadow -D_GNU_SOURCE -g
+ifneq (${KERNEL_SRC}, )
+# Use the kernel tree if patched, otherwise, look where cluster headers
+# should be installed
+CFLAGS += $(shell if [ -d ${KERNEL_SRC}/include/cluster ]; then \
+ echo '-I${KERNEL_SRC}/include/cluster'; else \
+ echo '-I${incdir}/cluster'; fi)
+else
+CFLAGS += -I${incdir}/cluster
+endif
+
TARGET=qdiskd mkqdisk
all: ${TARGET}
@@ -31,7 +41,7 @@
install ${TARGET} ${sbindir}
qdiskd: disk.o crc32.o disk_util.o main.o score.o bitmap.o clulog.o \
- gettid.o proc.o ../lib/libcman.a
+ gettid.o proc.o daemon_init.o ../lib/libcman.a
gcc -o $@ $^ -lpthread -L../lib -lccs
mkqdisk: disk.o crc32.o disk_util.o \
--- cluster/cman/qdisk/gettid.c 2006/06/28 18:57:33 1.1.2.2
+++ cluster/cman/qdisk/gettid.c 2007/01/22 22:50:38 1.1.2.3
@@ -2,8 +2,8 @@
#include <sys/syscall.h>
#include <linux/unistd.h>
#include <gettid.h>
-#include <errno.h>
#include <unistd.h>
+#include <errno.h>
/* Patch from Adam Conrad / Ubuntu: Don't use _syscall macro */
--- cluster/cman/man/qdisk.5 2006/07/21 17:53:08 1.1.2.1
+++ cluster/cman/man/qdisk.5 2007/01/22 22:50:38 1.1.2.2
@@ -1,6 +1,6 @@
-.TH "QDisk" "8" "July 2006" "" "Cluster Quorum Disk"
+.TH "QDisk" "5" "Jan 2007" "" "Cluster Quorum Disk"
.SH "NAME"
-QDisk 1.0 \- a disk-based quorum daemon for CMAN / Linux-Cluster
+QDisk 1.2 \- a disk-based quorum daemon for CMAN / Linux-Cluster
.SH "1. Overview"
.SH "1.1 Problem"
In some situations, it may be necessary or desirable to sustain
@@ -75,16 +75,24 @@
* Cluster node votes should be more or less equal.
-* CMAN must be running before the qdisk program can start.
+* CMAN must be running before the qdisk program can operate in full
+capacity. If CMAN is not running, qdisk will wait for it.
* CMAN's eviction timeout should be at least 2x the quorum daemon's
to give the quorum daemon adequate time to converge on a master during a
failure + load spike situation.
-* The total number of votes assigned to the quorum device should be
-equal to or greater than the total number of node-votes in the cluster.
-While it is possible to assign only one (or a few) votes to the quorum
-device, the effects of doing so have not been explored.
+* For 'all-but-one' failure operation, the total number of votes assigned
+to the quorum device should be equal to or greater than the total number
+of node-votes in the cluster. While it is possible to assign only one
+(or a few) votes to the quorum device, the effects of doing so have not
+been explored.
+
+* For 'tiebreaker' operation in a two-node cluster, unset CMAN's two_node
+flag (or set it to 0), set CMAN's expected votes to '3', set each node's
+vote to '1', and set qdisk's vote count to '1' as well. This will allow
+the cluster to operate if either both nodes are online, or a single node is
+online and satisfies the heuristics.
* Currently, the quorum disk daemon is difficult to use with CLVM if
the quorum disk resides on a CLVM logical volume. CLVM requires a
@@ -217,23 +225,73 @@
0 = emergencies; 7 = debug.
.in 9
-\fIlog_facility\fP\fB="\fPlocal4\fB"\fP
+\fIlog_facility\fP\fB="\fPdaemon\fB"\fP
.in 12
This controls the syslog facility used by the quorum daemon when logging.
For a complete list of available facilities, see \fBsyslog.conf(5)\fP.
+The default value for this is 'daemon'.
.in 9
\fIstatus_file\fP\fB="\fP/foo\fB"\fP
.in 12
Write internal states out to this file periodically ("-" = use stdout).
-This is primarily used for debugging.
+This is primarily used for debugging. The default value for this
+attribute is undefined.
.in 9
\fImin_score\fP\fB="\fP3\fB"\fP
.in 12
Absolute minimum score to be consider one's self "alive". If omitted,
or set to 0, the default function "floor((n+1)/2)" is used, where \fIn\fP
-is the sum-total of all of defined heuristics' \fIscore\fP attribute.
+is the total of all defined heuristics' \fIscore\fP attributes. This
+must never exceed the sum of the heuristic scores, or else the quorum
+disk will never be available.
+
+.in 9
+\fIreboot\fP\fB="\fP1\fB"\fP
+.in 12
+If set to 0 (off), qdiskd will *not* reboot after a negative transition
+as a result of a change in score (see section 2.2). The default for
+this value is 1 (on).
+
+.in 9
+\fIallow_kill\fP\fB="\fP1\fB"\fP
+.in 12
+If set to 0 (off), qdiskd will *not* instruct CMAN to kill nodes it thinks
+are dead (as a result of not writing to the quorum disk). The default
+for this value is 1 (on).
+
+.in 9
+\fIparanoid\fP\fB="\fP0\fB"\fP
+.in 12
+If set to 1 (on), qdiskd will watch internal timers and reboot the node
+if it takes more than (interval * tko) seconds to complete a quorum disk
+pass. The default for this value is 0 (off).
+
+.in 9
+\fIscheduler\fP\fB="\fPrr\fB"\fP
+.in 12
+Valid values are 'rr', 'fifo', and 'other'. Selects the scheduling queue
+in the Linux kernel for operation of the main & score threads (does not
+affect the heuristics; they are always run in the 'other' queue). Default
+is 'rr'. See sched_setscheduler(2) for more details.
+
+.in 9
+\fIpriority\fP\fB="\fP1\fB"\fP
+.in 12
+Valid values for 'rr' and 'fifo' are 1..99 inclusive. Valid values
+for 'other' are -20..19 inclusive. Sets the priority of the main & score
+threads. The default value is 1 (in the RR and FIFO queues, higher numbers
+denote higher priority; in OTHER, lower values denote higher priority).
+
+.in 9
+\fIstop_cman\fP\fB="\fP0\fB"\fP
+.in 12
+Ordinarily, cluster membership is left up to CMAN, not qdisk.
+If this parameter is set to 1 (on), qdiskd will tell CMAN to leave the
+cluster if it is unable to initialize the quorum disk during startup. This
+can be used to prevent cluster participation by a node which has been
+disconnected from the SAN. The default for this value is 0 (off).
.in 9
\fIdevice\fP\fB="\fP/dev/sda1\fB"\fP
@@ -249,6 +307,8 @@
on every block device found, comparing the label against the specified
label. This is useful in configurations where the block device name
differs on a per-node basis.
+.in 8
+\fB...>\fP
.in 0
.SH "3.2. The <heuristic> tag"
@@ -261,34 +321,80 @@
.in 12
This is the program used to determine if this heuristic is alive. This
can be anything which may be executed by \fI/bin/sh -c\fP. A return
-value of zero indicates success; anything else indicates failure.
+value of zero indicates success; anything else indicates failure. This
+is required.
.in 9
\fIscore\fP\fB="\fP1\fB"\fP
.in 12
This is the weight of this heuristic. Be careful when determining scores
-for heuristics.
+for heuristics. The default score for each heuristic is 1.
.in 9
\fIinterval\fP\fB="\fP2\fB"/>\fP
.in 12
-This is the frequency at which we poll the heuristic.
+This is the frequency (in seconds) at which we poll the heuristic. The
+default interval for every heuristic is 2 seconds.
.in 0
-.SH "3.3. Example"
+.in 9
+\fItko\fP\fB="\fP1\fB"\fP
+.in 12
+After this many failed attempts to run the heuristic, it is considered DOWN,
+and its score is removed. The default tko for each heuristic is 1, which
+may be inadequate for things such as 'ping'.
.in 8
+\fB/>\fP
+.in 0
+
+
+.SH "3.3. Examples"
+.SH "3.3.1. 3 cluster nodes & 3 routers"
+.in 8
+<cman expected_votes="6" .../>
+.br
+<clusternodes>
+.in 12
+<clusternode name="node1" votes="1" ... />
+.br
+<clusternode name="node2" votes="1" ... />
+.br
+<clusternode name="node3" votes="1" ... />
+.in 8
+</clusternodes>
+.br
<quorumd interval="1" tko="10" votes="3" label="testing">
.in 12
-<heuristic program="ping A -c1 -t1" score="1" interval="2"/>
+<heuristic program="ping A -c1 -t1" score="1" interval="2" tko="3"/>
.br
-<heuristic program="ping B -c1 -t1" score="1" interval="2"/>
+<heuristic program="ping B -c1 -t1" score="1" interval="2" tko="3"/>
.br
-<heuristic program="ping C -c1 -t1" score="1" interval="2"/>
+<heuristic program="ping C -c1 -t1" score="1" interval="2" tko="3"/>
+.br
+.in 8
+</quorumd>
+
+.SH "3.3.2. 2 cluster nodes & 1 IP tiebreaker"
+.in 8
+<cman two_node="0" expected_votes="3" .../>
+.br
+<clusternodes>
+.in 12
+<clusternode name="node1" votes="1" ... />
+.br
+<clusternode name="node2" votes="1" ... />
+.in 8
+</clusternodes>
+.br
+<quorumd interval="1" tko="10" votes="1" label="testing">
+.in 12
+<heuristic program="ping A -c1 -t1" score="1" interval="2" tko="3"/>
.br
.in 8
</quorumd>
.in 0
+
.SH "3.4. Heuristic score considerations"
* Heuristic timeouts should be set high enough to allow the previous run
of a given heuristic to complete.
--- cluster/cman/man/mkqdisk.8 2006/07/21 17:53:08 1.1.2.1
+++ cluster/cman/man/mkqdisk.8 2007/01/22 22:50:38 1.1.2.2
@@ -13,11 +13,16 @@
.IP "\-c device \-l label"
Initialize a new cluster quorum disk. This will destroy all data on the given
device. If a cluster is currently using that device as a quorum disk, the
-entire cluster will malfunction. Do not ru
+entire cluster will malfunction. Do not run this on an active cluster when
+qdiskd is running. Only one device on the SAN should ever have the given
+label; using multiple different devices is currently not supported (it is
+expected that a RAID array is used for quorum disk redundancy). The label can
+be any textual string up to 127 characters, which is enough space to hold
+a UUID created with uuidgen(1).
.IP "\-f label"
-Find the cluster quorum disk with the given label and display information about it..
+Find the cluster quorum disk with the given label and display information about it.
.IP "\-L"
Display information on all accessible cluster quorum disks.
.SH "SEE ALSO"
-qdisk(5) qdiskd(8)
+qdisk(5), qdiskd(8), uuidgen(1)
next reply other threads:[~2007-01-22 22:50 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2007-01-22 22:50 lhh [this message]
-- strict thread matches above, loose matches on Subject: below --
2007-02-21 20:22 [Cluster-devel] cluster/cman qdisk/score.c qdisk/disk.h qdisk/ lhh
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20070122225040.7270.qmail@sourceware.org \
--to=lhh@sourceware.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.