From mboxrd@z Thu Jan 1 00:00:00 1970 From: lhh@sourceware.org Date: 22 Jan 2007 22:50:16 -0000 Subject: [Cluster-devel] cluster/cman qdisk/score.h qdisk/score.c qdisk ... Message-ID: <20070122225016.5775.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Branch: RHEL5 Changes by: lhh at sourceware.org 2007-01-22 22:50:12 Modified files: cman/qdisk : score.h score.c Makefile main.c disk.h clulog.c cman/man : qdisk.5 mkqdisk.8 Added files: cman/qdisk : daemon_init.c Log message: Resolves bugzillas: #213533, #216092, #220211, #223002, #223234/#223240 Detailed comments: * Lock in memory to prevent being swapped out * Turn on RR scheduling for main + score threads * Let qdiskd wait for CMAN to start * Add option to qdiskd to stop CMAN if qdisk device is not available * Make qdisk interval timings more accurate * Add option to reboot node if qdiskd detects internal hang > failure time (e.g. interval*tko, in seconds) * Add per-heuristic tko counts for unreliable heuristics (e.g. ping packets) * Remove nodes from quorate mask immediately on eviction * Update man pages with better examples * Don't let >1 instance of qdiskd be started * Clarify logging output. * Improve data in status_file. * Allow qdiskd to run with no defined heuristics (master-always-wins mode). * Make fencing of nodes optional (default = on). * Make sure CMAN is running before we try to talk to it at each point. Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/daemon_init.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=NONE&r2=1.1.2.1 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/score.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2&r2=1.2.4.1 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/score.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2&r2=1.2.4.1 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/Makefile.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.6&r2=1.6.2.1 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/main.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.4.2.2&r2=1.4.2.3 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/disk.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.4&r2=1.4.2.1 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/clulog.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2&r2=1.2.4.1 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/qdisk.5.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.3&r2=1.3.2.1 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/mkqdisk.8.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2&r2=1.2.6.1 --- cluster/cman/qdisk/score.h 2006/05/19 14:41:35 1.2 +++ cluster/cman/qdisk/score.h 2007/01/22 22:50:11 1.2.4.1 @@ -32,7 +32,9 @@ char * program; int score; int available; + int tko; int interval; + int misses; pid_t childpid; time_t nextrun; }; @@ -50,7 +52,7 @@ /* Start the thread which runs the scoring applets */ -int start_score_thread(struct h_data *h, int count); +int start_score_thread(qd_ctx *ctx, struct h_data *h, int count); /* Get our score + maxscore --- cluster/cman/qdisk/score.c 2006/05/19 14:41:35 1.2 +++ cluster/cman/qdisk/score.c 2007/01/22 22:50:11 1.2.4.1 @@ -32,14 +32,20 @@ #include #include #include +#include +#include +#include "disk.h" #include "score.h" static pthread_mutex_t sc_lock = PTHREAD_MUTEX_INITIALIZER; static int _score = 0, _maxscore = 0, _score_thread_running = 0; static pthread_t score_thread = (pthread_t)0; +void set_priority(int, int); struct h_arg { struct h_data *h; + int sched_queue; + int sched_prio; int count; }; @@ -97,6 +103,20 @@ h->childpid = pid; return 0; } + + /* + * always use SCHED_OTHER for the child processes + * nice -1 is fine; but we don't know what the child process + * might do, so leaving it (potentially) in SCHED_RR or SCHED_FIFO + * is out of the question + * + * XXX if you set SCHED_OTHER in the conf file and nice 20, the below + * will make the heuristics a higher prio than qdiskd. This should be + * fine in practice, because running qdiskd at nice 20 will cause all + * sorts of problems on a busy system. + */ + set_priority(SCHED_OTHER, -1); + munlockall(); argv[0] = "/bin/sh"; argv[1] = "-c"; @@ -122,6 +142,12 @@ *score = 0; *maxscore = 0; + + /* Allow operation w/o any heuristics */ + if (!max) { + *score = *maxscore = 1; + return; + } for (x = 0; x < max; x++) { *maxscore += h[x].score; @@ -141,22 +167,51 @@ int status; if (h->childpid == 0) + /* No child to check */ return 0; ret = waitpid(h->childpid, &status, block?0:WNOHANG); if (!block && ret == 0) + /* No children exited */ return 0; h->childpid = 0; - h->available = 0; if (ret < 0 && errno == ECHILD) - return -1; - if (!WIFEXITED(status)) - return 0; - if (WEXITSTATUS(status) != 0) - return 0; - h->available = 1; + /* wrong child? */ + goto miss; + if (!WIFEXITED(status)) { + ret = 0; + goto miss; + } + if (WEXITSTATUS(status) != 0) { + ret = 0; + goto miss; + } + + /* Returned 0 and was not killed */ + if (!h->available) { + h->available = 1; + clulog(LOG_INFO, "Heuristic: '%s' UP\n", h->program); + } + h->misses = 0; return 0; + +miss: + if (h->available) { + h->misses++; + if (h->misses >= h->tko) { + clulog(LOG_INFO, + "Heuristic: '%s' DOWN (%d/%d)\n", + h->program, h->misses, h->tko); + h->available = 0; + } else { + clulog(LOG_DEBUG, + "Heuristic: '%s' missed (%d/%d)\n", + h->program, h->misses, h->tko); + } + } + + return ret; } @@ -204,7 +259,9 @@ do { h[x].program = NULL; h[x].available = 0; + h[x].misses = 0; h[x].interval = 2; + h[x].tko = 1; h[x].score = 1; h[x].childpid = 0; h[x].nextrun = 0; @@ -236,9 +293,20 @@ if (h[x].interval <= 0) h[x].interval = 2; } + + /* Get tko for this heuristic */ + snprintf(query, sizeof(query), + "/cluster/quorumd/heuristic[%d]/@tko", x+1); + if (ccs_get(ccsfd, query, &val) == 0) { + h[x].tko= atoi(val); + free(val); + if (h[x].tko <= 0) + h[x].tko = 1; + } - clulog(LOG_DEBUG, "Heuristic: '%s' score=%d interval=%d\n", - h[x].program, h[x].score, h[x].interval); + clulog(LOG_DEBUG, + "Heuristic: '%s' score=%d interval=%d tko=%d\n", + h[x].program, h[x].score, h[x].interval, h[x].tko); } while (++x < max); @@ -271,6 +339,8 @@ { struct h_arg *args = (struct h_arg *)arg; int score, maxscore; + + set_priority(args->sched_queue, args->sched_prio); while (_score_thread_running) { fork_heuristics(args->h, args->count); @@ -317,7 +387,7 @@ to pass in h if it was allocated on the stack. */ int -start_score_thread(struct h_data *h, int count) +start_score_thread(qd_ctx *ctx, struct h_data *h, int count) { pthread_attr_t attrs; struct h_arg *args; @@ -337,8 +407,11 @@ memcpy(args->h, h, (sizeof(struct h_data) * count)); args->count = count; + args->sched_queue = ctx->qc_sched; + args->sched_prio = ctx->qc_sched_prio; _score_thread_running = 1; + pthread_attr_init(&attrs); pthread_attr_setinheritsched(&attrs, PTHREAD_INHERIT_SCHED); pthread_create(&score_thread, &attrs, score_thread_main, args); --- cluster/cman/qdisk/Makefile 2006/08/11 15:18:05 1.6 +++ cluster/cman/qdisk/Makefile 2007/01/22 22:50:11 1.6.2.1 @@ -28,7 +28,7 @@ install ${TARGET} ${sbindir} qdiskd: disk.o crc32.o disk_util.o main.o score.o bitmap.o clulog.o \ - gettid.o proc.o ../lib/libcman.a + gettid.o proc.o daemon_init.o ../lib/libcman.a gcc -o $@ $^ -lpthread -L../lib -L${ccslibdir} -lccs mkqdisk: disk.o crc32.o disk_util.o \ --- cluster/cman/qdisk/main.c 2007/01/16 15:16:56 1.4.2.2 +++ cluster/cman/qdisk/main.c 2007/01/22 22:50:11 1.4.2.3 @@ -35,11 +35,21 @@ #include #include #include +#include #include +#include #include #include #include "score.h" #include "clulog.h" +#if (!defined(LIBCMAN_VERSION) || \ + (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION < 2)) +#include +#endif + +int daemon_init(char *); +int check_process_running(char *, pid_t *); + /* TODO: 1) Take into account timings to gracefully extend node timeouts during @@ -155,6 +165,11 @@ if (sb->ps_timestamp == ni[x].ni_last_seen) { /* XXX check for average + allow grace */ ni[x].ni_misses++; + if (ni[x].ni_misses > 1) { + clulog(LOG_DEBUG, + "Node %d missed an update (%d/%d)\n", + x+1, ni[x].ni_misses, ctx->qc_tko); + } continue; } @@ -208,6 +223,11 @@ ni[x].ni_misses = 0; ni[x].ni_state = S_NONE; + /* Clear our master mask for the node after eviction + * or shutdown */ + if (mask) + clear_bit(mask, (ni[x].ni_status.ps_nodeid-1), + sizeof(memb_mask_t)); continue; } @@ -227,15 +247,17 @@ Write eviction notice if we're the master. */ if (ctx->qc_status == S_MASTER) { - clulog(LOG_DEBUG, + clulog(LOG_NOTICE, "Writing eviction notice for node %d\n", ni[x].ni_status.ps_nodeid); qd_write_status(ctx, ni[x].ni_status.ps_nodeid, S_EVICT, NULL, NULL, NULL); - clulog(LOG_DEBUG, - "Telling CMAN to kill the node\n"); - cman_kill_node(ctx->qc_ch, - ni[x].ni_status.ps_nodeid); + if (ctx->qc_flags & RF_ALLOW_KILL) { + clulog(LOG_DEBUG, "Telling CMAN to " + "kill the node\n"); + cman_kill_node(ctx->qc_ch, + ni[x].ni_status.ps_nodeid); + } } /* @@ -255,6 +277,10 @@ ni[x].ni_evil_incarnation = ni[x].ni_status.ps_incarnation; + /* Clear our master mask for the node after eviction */ + if (mask) + clear_bit(mask, (ni[x].ni_status.ps_nodeid-1), + sizeof(memb_mask_t)); continue; } @@ -279,9 +305,12 @@ ni[x].ni_status.ps_state = S_EVICT; /* XXX Need to fence it again */ - clulog(LOG_DEBUG, "Telling CMAN to kill the node\n"); - cman_kill_node(ctx->qc_ch, - ni[x].ni_status.ps_nodeid); + if (ctx->qc_flags & RF_ALLOW_KILL) { + clulog(LOG_DEBUG, "Telling CMAN to " + "kill the node\n"); + cman_kill_node(ctx->qc_ch, + ni[x].ni_status.ps_nodeid); + } continue; } @@ -416,6 +445,10 @@ int x = 0, score, maxscore; clulog(LOG_INFO, "Quorum Daemon Initializing\n"); + + if (mlockall(MCL_CURRENT|MCL_FUTURE) != 0) { + clulog(LOG_ERR, "Unable to mlockall()\n"); + } if (qdisk_validate(ctx->qc_device) < 0) return -1; @@ -427,7 +460,7 @@ return -1; } - start_score_thread(h, maxh); + start_score_thread(ctx, h, maxh); node_info_init(ni, max); if (qd_write_status(ctx, ctx->qc_my_id, @@ -447,7 +480,6 @@ } sleep(ctx->qc_interval); - } get_my_score(&score,&maxscore); @@ -500,12 +532,16 @@ return; memset(master_mask, 0, sizeof(master_mask)); - for (x = 0; x < retnodes; x++) { if (is_bit_set(mask, nodes[x].cn_nodeid-1, sizeof(mask)) && - nodes[x].cn_member) + nodes[x].cn_member) { set_bit(master_mask, nodes[x].cn_nodeid-1, sizeof(master_mask)); + } else { + /* Not in CMAN output = not allowed */ + clear_bit(master_mask, (nodes[x].cn_nodeid-1), + sizeof(memb_mask_t)); + } } } @@ -604,12 +640,25 @@ } fprintf(fp, "Node ID: %d\n", ctx->qc_my_id); - fprintf(fp, "Score (current / min req. / max allowed): %d / %d / %d\n", - score, score_req, score_max); + + if (ctx->qc_master) + fprintf(fp, "Master Node ID: %d\n", ctx->qc_master); + else + fprintf(fp, "Master Node ID: (none)\n"); + + fprintf(fp, "Score: %d/%d (Minimum required = %d)\n", + score, score_max, score_req); fprintf(fp, "Current state: %s\n", state_str(ctx->qc_status)); fprintf(fp, "Current disk state: %s\n", state_str(ctx->qc_disk_status)); + fprintf(fp, "Initializing Set: {"); + for (x=0; x= S_RUN || ni[x].ni_status.ps_nodeid == @@ -617,13 +666,10 @@ fprintf(fp," %d", ni[x].ni_status.ps_nodeid); } fprintf(fp, " }\n"); - - if (!ctx->qc_master) { - fprintf(fp, "No master node\n"); + + if (!ctx->qc_master) goto out; - } - fprintf(fp, "Master Node ID: %d\n", ctx->qc_master); fprintf(fp, "Quorate Set: {"); for (x=0; xqc_master-1].ni_status.ps_master_mask, @@ -642,18 +688,141 @@ } +/* Timeval functions from clumanager */ +/** + * Scale a (struct timeval). + * + * @param tv The timeval to scale. + * @param scale Positive multiplier. + * @return tv + */ +struct timeval * +_scale_tv(struct timeval *tv, int scale) +{ + tv->tv_sec *= scale; + tv->tv_usec *= scale; + + if (tv->tv_usec > 1000000) { + tv->tv_sec += (tv->tv_usec / 1000000); + tv->tv_usec = (tv->tv_usec % 1000000); + } + + return tv; +} + + +static inline void +_diff_tv(struct timeval *dest, struct timeval *start, struct timeval *end) +{ + dest->tv_sec = end->tv_sec - start->tv_sec; + dest->tv_usec = end->tv_usec - start->tv_usec; + + if (dest->tv_usec < 0) { + dest->tv_usec += 1000000; + dest->tv_sec--; + } +} + + +#define _print_tv(val) \ + printf("%s: %d.%06d\n", #val, (int)((val)->tv_sec), \ + (int)((val)->tv_usec)) + + +static inline int +_cmp_tv(struct timeval *left, struct timeval *right) +{ + if (left->tv_sec > right->tv_sec) + return -1; + + if (left->tv_sec < right->tv_sec) + return 1; + + if (left->tv_usec > right->tv_usec) + return -1; + + if (left->tv_usec < right->tv_usec) + return 1; + + return 0; +} + + +void +set_priority(int queue, int prio) +{ + struct sched_param s; + int ret; + char *func = "nice"; + + if (queue == SCHED_OTHER) { + s.sched_priority = 0; + ret = sched_setscheduler(0, queue, &s); + errno = 0; + ret = nice(prio); + } else { + memset(&s,0,sizeof(s)); + s.sched_priority = prio; + ret = sched_setscheduler(0, queue, &s); + func = "sched_setscheduler"; + } + + if (ret < 0 && errno) { + clulog(LOG_WARNING, "set_priority [%s] failed: %s\n", func, + strerror(errno)); + } +} + + +int +cman_alive(cman_handle_t ch) +{ + fd_set rfds; + int fd = cman_get_fd(ch); + struct timeval tv = {0, 0}; + + FD_ZERO(&rfds); + FD_SET(fd, &rfds); + if (select(fd + 1, &rfds, NULL, NULL, &tv) == 1) { + if (cman_dispatch(ch, CMAN_DISPATCH_ALL) < 0) { + if (errno == EAGAIN) + return 0; + return -1; + } + } + return 0; +} + int quorum_loop(qd_ctx *ctx, node_info_t *ni, int max) { disk_msg_t msg = {0, 0, 0}; - int low_id, bid_pending = 0, score, score_max, score_req; + int low_id, bid_pending = 0, score, score_max, score_req, + upgrade = 0; memb_mask_t mask, master_mask; + struct timeval maxtime, oldtime, newtime, diff, sleeptime, interval; - ctx->qc_status = S_RUN; + ctx->qc_status = S_NONE; + + maxtime.tv_usec = 0; + maxtime.tv_sec = ctx->qc_interval * ctx->qc_tko; + + interval.tv_usec = 0; + interval.tv_sec = ctx->qc_interval; + + get_my_score(&score, &score_max); + if (score_max < ctx->qc_scoremin) { + clulog(LOG_WARNING, "Minimum score (%d) is impossible to " + "achieve (heuristic total = %d)\n", + ctx->qc_scoremin, score_max); + } _running = 1; while (_running) { + /* XXX this was getuptime() in clumanager */ + gettimeofday(&oldtime, NULL); + /* Read everyone else's status */ read_node_blocks(ctx, ni, max); @@ -662,6 +831,7 @@ /* Check heuristics and remove ourself if necessary */ get_my_score(&score, &score_max); + upgrade = 0; score_req = ctx->qc_scoremin; if (score_req <= 0) @@ -672,14 +842,19 @@ if (ctx->qc_status > S_NONE) { clulog(LOG_NOTICE, "Score insufficient for master " - "operation (%d/%d; max=%d); " + "operation (%d/%d; required=%d); " "downgrading\n", - score, score_req, score_max); + score, score_max, score_req); ctx->qc_status = S_NONE; msg.m_msg = M_NONE; ++msg.m_seq; bid_pending = 0; - cman_poll_quorum_device(ctx->qc_ch, 0); + if (cman_alive(ctx->qc_ch) < 0) { + clulog(LOG_ERR, "cman: %s\n", + strerror(errno)); + } else { + cman_poll_quorum_device(ctx->qc_ch, 0); + } if (ctx->qc_flags & RF_REBOOT) reboot(RB_AUTOBOOT); } @@ -688,10 +863,13 @@ if (ctx->qc_status == S_NONE) { clulog(LOG_NOTICE, "Score sufficient for master " - "operation (%d/%d; max=%d); " + "operation (%d/%d; required=%d); " "upgrading\n", - score, score_req, score_max); + score, score_max, score_req); ctx->qc_status = S_RUN; + upgrade = (ctx->qc_tko / 3); + if (upgrade == 0) + upgrade = 1; } } @@ -702,11 +880,13 @@ if (!ctx->qc_master && low_id == ctx->qc_my_id && ctx->qc_status == S_RUN && - !bid_pending ) { + !bid_pending && + !upgrade) { /* If there's no master, and we are the lowest node ID, make a bid to become master if we're not - already bidding. + already bidding. We can't do this if we've just + upgraded. */ clulog(LOG_DEBUG,"Making bid for master\n"); @@ -724,10 +904,18 @@ /* We're currently bidding for master. See if anyone's voted, or if we should rescind our bid */ + ++bid_pending; /* Yes, those are all deliberate fallthroughs */ switch (check_votes(ctx, ni, max, &msg)) { case 3: + /* + * Give ample time to become aware of other + * nodes + */ + if (bid_pending < (ctx->qc_tko / 3)) + break; + clulog(LOG_INFO, "Assuming master role\n"); ctx->qc_status = S_MASTER; @@ -755,6 +943,13 @@ /* We are the master. Poll the quorum device. We can't be the master unless we score high enough on our heuristics. */ + if (cman_alive(ctx->qc_ch) < 0) { + clulog(LOG_ERR, "cman_dispatch: %s\n", + strerror(errno)); + clulog(LOG_ERR, + "Halting qdisk operations\n"); + return -1; + } check_cman(ctx, mask, master_mask); cman_poll_quorum_device(ctx->qc_ch, 1); @@ -768,6 +963,13 @@ ni[ctx->qc_master-1].ni_status.ps_master_mask, ctx->qc_my_id-1, sizeof(memb_mask_t))) { + if (cman_alive(ctx->qc_ch) < 0) { + clulog(LOG_ERR, "cman_dispatch: %s\n", + strerror(errno)); + clulog(LOG_ERR, + "Halting qdisk operations\n"); + return -1; + } cman_poll_quorum_device(ctx->qc_ch, 1); } } @@ -783,8 +985,43 @@ /* Cycle. We could time the loop and sleep usleep(interval-looptime), but this is fine for now.*/ + gettimeofday(&newtime, NULL); + _diff_tv(&diff, &oldtime, &newtime); + + /* + * Reboot if we didn't send a heartbeat in interval*TKO_COUNT + */ + if (_cmp_tv(&maxtime, &diff) == 1 && + ctx->qc_flags & RF_PARANOID) { + clulog(LOG_EMERG, "Failed to complete a cycle within " + "%d second%s (%d.%06d) - REBOOTING\n", + (int)maxtime.tv_sec, + maxtime.tv_sec==1?"":"s", + (int)diff.tv_sec, + (int)diff.tv_usec); + if (!(ctx->qc_flags & RF_DEBUG)) + reboot(RB_AUTOBOOT); + } + + /* + * If the amount we took to complete a loop is greater or less + * than our interval, we adjust by the difference each round. + * + * It's not really "realtime", but it helps! + */ + if (_cmp_tv(&diff, &interval) == 1) { + _diff_tv(&sleeptime, &diff, &interval); + } else { + clulog(LOG_WARNING, "qdisk cycle took more " + "than %d second%s to complete (%d.%06d)\n", + ctx->qc_interval, ctx->qc_interval==1?"":"s", + (int)diff.tv_sec, (int)diff.tv_usec); + memcpy(&sleeptime, &interval, sizeof(sleeptime)); + } + + /* Could hit a watchdog timer here if we wanted to */ if (_running) - sleep(ctx->qc_interval); + select(0, NULL, NULL, NULL, &sleeptime); } return 0; @@ -829,12 +1066,15 @@ ctx->qc_interval = 1; ctx->qc_tko = 10; ctx->qc_scoremin = 0; - ctx->qc_flags = RF_REBOOT; + ctx->qc_flags = RF_REBOOT | RF_ALLOW_KILL; /* | RF_STOP_CMAN;*/ + ctx->qc_sched = SCHED_RR; + ctx->qc_sched_prio = 1; /* Get log log_facility */ snprintf(query, sizeof(query), "/cluster/quorumd/@log_facility"); if (ccs_get(ccsfd, query, &val) == 0) { clu_set_facility(val); + clulog(LOG_DEBUG, "Log facility: %s\n", val); free(val); } @@ -903,6 +1143,37 @@ if (ctx->qc_scoremin < 0) ctx->qc_scoremin = 0; } + + /* Get scheduling queue */ + snprintf(query, sizeof(query), "/cluster/quorumd/@scheduler"); + if (ccs_get(ccsfd, query, &val) == 0) { + switch(val[0]) { + case 'r': + case 'R': + ctx->qc_sched = SCHED_RR; + break; + case 'f': + case 'F': + ctx->qc_sched = SCHED_FIFO; + break; + case 'o': + case 'O': + ctx->qc_sched = SCHED_OTHER; + break; + default: + clulog(LOG_WARNING, "Invalid scheduling queue '%s'\n", + val); + break; + } + free(val); + } + + /* Get priority */ + snprintf(query, sizeof(query), "/cluster/quorumd/@priority"); + if (ccs_get(ccsfd, query, &val) == 0) { + ctx->qc_sched_prio = atoi(val); + free(val); + } /* Get reboot flag for when we transition -> offline */ /* default = on, so, 0 to turn off */ @@ -912,6 +1183,50 @@ ctx->qc_flags &= ~RF_REBOOT; free(val); } + + /* + * Get flag to see if we're supposed to kill cman if qdisk is not + * available. + */ + /* default = off, so, 1 to turn on */ + snprintf(query, sizeof(query), "/cluster/quorumd/@stop_cman"); + if (ccs_get(ccsfd, query, &val) == 0) { + if (!atoi(val)) + ctx->qc_flags &= ~RF_STOP_CMAN; + else + ctx->qc_flags |= RF_STOP_CMAN; + free(val); + } + + + /* + * Get flag to see if we're supposed to reboot if we can't complete + * a pass in failure time + */ + /* default = off, so, 1 to turn on */ + snprintf(query, sizeof(query), "/cluster/quorumd/@paranoid"); + if (ccs_get(ccsfd, query, &val) == 0) { + if (!atoi(val)) + ctx->qc_flags &= ~RF_PARANOID; + else + ctx->qc_flags |= RF_PARANOID; + free(val); + } + + + /* + * Get flag to see if we're supposed to reboot if we can't complete + * a pass in failure time + */ + /* default = off, so, 1 to turn on */ + snprintf(query, sizeof(query), "/cluster/quorumd/@allow_kill"); + if (ccs_get(ccsfd, query, &val) == 0) { + if (!atoi(val)) + ctx->qc_flags &= ~RF_ALLOW_KILL; + else + ctx->qc_flags |= RF_ALLOW_KILL; + free(val); + } *cfh = configure_heuristics(ccsfd, h, maxh); @@ -925,18 +1240,47 @@ } +void +check_stop_cman(qd_ctx *ctx) +{ + if (!(ctx->qc_flags & RF_STOP_CMAN)) + return; + + clulog(LOG_WARNING, "Telling CMAN to leave the cluster; qdisk is not" + " available\n"); +#if (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION >= 2) + if (cman_shutdown(ctx->qc_ch, 0) < 0) { +#else + int x = 0; + if (ioctl(cman_get_fd(ctx->qc_ch), SIOCCLUSTER_LEAVE_CLUSTER, &x) < 0) { +#endif + clulog(LOG_CRIT, "Could not leave the cluster - rebooting\n"); + sleep(5); + if (ctx->qc_flags & RF_DEBUG) + return; + reboot(RB_AUTOBOOT); + } +} + + int main(int argc, char **argv) { cman_node_t me; - int cfh, rv; + int cfh, rv, forked = 0; qd_ctx ctx; cman_handle_t ch; node_info_t ni[MAX_NODES_DISK]; struct h_data h[10]; char debug = 0, foreground = 0; char device[128]; - + pid_t pid; + + if (check_process_running(argv[0], &pid) && pid !=getpid()) { + printf("QDisk services already running\n"); + return 0; + } + while ((rv = getopt(argc, argv, "fd")) != EOF) { switch (rv) { case 'd': @@ -944,40 +1288,64 @@ break; case 'f': foreground = 1; + clu_log_console(1); default: break; } } + #if (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION >= 2) ch = cman_admin_init(NULL); #else ch = cman_init(NULL); #endif if (!ch) { - printf("Could not connect to cluster (CMAN not running?)\n"); - return -1; + if (!foreground && !forked) { + if (daemon_init(argv[0]) < 0) + return -1; + else + forked = 1; + } + + clulog(LOG_INFO, "Waiting for CMAN to start\n"); + + do { + sleep(5); +#if (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION >= 2) + ch = cman_admin_init(NULL); +#else + ch = cman_init(NULL); +#endif + } while (!ch); } memset(&me, 0, sizeof(me)); - if (cman_get_node(ch, CMAN_NODEID_US, &me) < 0) { - printf("Could not determine local node ID; cannot start\n"); - return -1; + while (cman_get_node(ch, CMAN_NODEID_US, &me) < 0) { + if (!foreground && !forked) { + if (daemon_init(argv[0]) < 0) + return -1; + else + forked = 1; + } + sleep(5); } qd_init(&ctx, ch, me.cn_nodeid); signal(SIGINT, int_handler); + signal(SIGTERM, int_handler); - if (debug) + if (debug) { clu_set_loglevel(LOG_DEBUG); - if (foreground) - clu_log_console(1); + ctx.qc_flags |= RF_DEBUG; + } if (get_config_data(NULL, &ctx, h, 10, &cfh, debug) < 0) { clulog_and_print(LOG_CRIT, "Configuration failed\n"); + check_stop_cman(&ctx); return -1; } - + if (ctx.qc_label) { if (find_partitions("/proc/partitions", ctx.qc_label, device, @@ -985,6 +1353,7 @@ clulog_and_print(LOG_CRIT, "Unable to match label" " '%s' to any device\n", ctx.qc_label); + check_stop_cman(&ctx); return -1; } @@ -1000,15 +1369,21 @@ clulog(LOG_CRIT, "Specified partition %s does not have a " "qdisk label\n", ctx.qc_device); + check_stop_cman(&ctx); return -1; } } - if (!foreground) - daemon(0,0); + if (!foreground && !forked) { + if (daemon_init(argv[0]) < 0) + return -1; + } + + set_priority(ctx.qc_sched, ctx.qc_sched_prio); if (quorum_init(&ctx, ni, MAX_NODES_DISK, h, cfh) < 0) { clulog_and_print(LOG_CRIT, "Initialization failed\n"); + check_stop_cman(&ctx); return -1; } @@ -1026,14 +1401,12 @@ } */ - quorum_loop(&ctx, ni, MAX_NODES_DISK); - cman_unregister_quorum_device(ctx.qc_ch); + if (quorum_loop(&ctx, ni, MAX_NODES_DISK) == 0) + cman_unregister_quorum_device(ctx.qc_ch); quorum_logout(&ctx); - qd_destroy(&ctx); return 0; - } --- cluster/cman/qdisk/disk.h 2006/10/03 18:06:40 1.4 +++ cluster/cman/qdisk/disk.h 2007/01/22 22:50:11 1.4.2.1 @@ -67,7 +67,11 @@ typedef enum { - RF_REBOOT = 0x1 /* Reboot if we go from master->none */ + RF_REBOOT = 0x1, /* Reboot if we go from master->none */ + RF_STOP_CMAN = 0x2, + RF_DEBUG = 0x4, + RF_PARANOID = 0x8, + RF_ALLOW_KILL = 0x10 } run_flag_t; @@ -237,6 +241,8 @@ int qc_tko; int qc_votes; int qc_scoremin; + int qc_sched; + int qc_sched_prio; disk_node_state_t qc_disk_status; disk_node_state_t qc_status; int qc_master; /* Master?! */ --- cluster/cman/qdisk/clulog.c 2006/05/19 14:41:35 1.2 +++ cluster/cman/qdisk/clulog.c 2007/01/22 22:50:11 1.2.4.1 @@ -20,8 +20,6 @@ /** @file * Library routines for communicating with the logging daemon. * - * $Id: clulog.c,v 1.2 2006/05/19 14:41:35 lhh Exp $ - * * Author: Jeff Moyer */ #include @@ -50,8 +48,6 @@ #include -static const char *version __attribute__ ((unused)) = "$Revision: 1.2 $"; - #ifdef DEBUG #include #define Dprintf(fmt,args...) printf(fmt,##args) @@ -135,7 +131,7 @@ } pthread_mutex_unlock(&log_mutex); - return "local4"; + return "daemon"; } @@ -156,7 +152,6 @@ for (; facilitynames[x].c_name; x++) { if (strcmp(facilityname, facilitynames[x].c_name)) continue; - syslog_facility = facilitynames[x].c_val; break; } --- cluster/cman/man/qdisk.5 2006/10/03 18:07:58 1.3 +++ cluster/cman/man/qdisk.5 2007/01/22 22:50:12 1.3.2.1 @@ -1,6 +1,6 @@ -.TH "QDisk" "8" "July 2006" "" "Cluster Quorum Disk" +.TH "QDisk" "21" "Jan 2007" "" "Cluster Quorum Disk" .SH "NAME" -QDisk 1.0 \- a disk-based quorum daemon for CMAN / Linux-Cluster +QDisk 1.2 \- a disk-based quorum daemon for CMAN / Linux-Cluster .SH "1. Overview" .SH "1.1 Problem" In some situations, it may be necessary or desirable to sustain @@ -75,16 +75,24 @@ * Cluster node votes should be more or less equal. -* CMAN must be running before the qdisk program can start. +* CMAN must be running before the qdisk program can operate in full +capacity. If CMAN is not running, qdisk will wait for it. * CMAN's eviction timeout should be at least 2x the quorum daemon's to give the quorum daemon adequate time to converge on a master during a failure + load spike situation. -* The total number of votes assigned to the quorum device should be -equal to or greater than the total number of node-votes in the cluster. -While it is possible to assign only one (or a few) votes to the quorum -device, the effects of doing so have not been explored. +* For 'all-but-one' failure operation, the total number of votes assigned +to the quorum device should be equal to or greater than the total number +of node-votes in the cluster. While it is possible to assign only one +(or a few) votes to the quorum device, the effects of doing so have not +been explored. + +* For 'tiebreaker' operation in a two-node cluster, unset CMAN's two_node +flag (or set it to 0), set CMAN's expected votes to '3', set each node's +vote to '1', and set qdisk's vote count to '1' as well. This will allow +the cluster to operate if either both nodes are online, or a single node & +the heuristics. * Currently, the quorum disk daemon is difficult to use with CLVM if the quorum disk resides on a CLVM logical volume. CLVM requires a @@ -217,23 +225,27 @@ 0 = emergencies; 7 = debug. .in 9 -\fIlog_facility\fP\fB="\fPlocal4\fB"\fP +\fIlog_facility\fP\fB="\fPdaemon\fB"\fP .in 12 This controls the syslog facility used by the quorum daemon when logging. For a complete list of available facilities, see \fBsyslog.conf(5)\fP. +The default value for this is 'daemon'. .in 9 \fIstatus_file\fP\fB="\fP/foo\fB"\fP .in 12 Write internal states out to this file periodically ("-" = use stdout). -This is primarily used for debugging. +This is primarily used for debugging. The default value for this +attribute is undefined. .in 9 \fImin_score\fP\fB="\fP3\fB"\fP .in 12 Absolute minimum score to be consider one's self "alive". If omitted, or set to 0, the default function "floor((n+1)/2)" is used, where \fIn\fP -is the sum-total of all of defined heuristics' \fIscore\fP attribute. +is the total of all of defined heuristics' \fIscore\fP attribute. This +must never exceed the sum of the heuristic scores, or else the quorum +disk will never be available. .in 9 \fIreboot\fP\fB="\fP1\fB"\fP @@ -243,6 +255,45 @@ this value is 1 (on). .in 9 +\fIallow_kill\fP\fB="\fP1\fB"\fP +.in 12 +If set to 0 (off), qdiskd will *not* instruct to kill nodes it thinks +are dead (as a result of not writing to the quorum disk). The default +for this value is 1 (on). + +.in 9 +\fIparanoid\fP\fB="\fP0\fB"\fP +.in 12 +If set to 1 (on), qdiskd will watch internal timers and reboot the node +if it takes more than (interval * tko) seconds to complete a quorum disk +pass. The default for this value is 0 (off). + +.in 9 +\fIscheduler\fP\fB="\fPrr\fB"\fP +.in 12 +Valid values are 'rr', 'fifo', and 'other'. Selects the scheduling queue +in the Linux kernel for operation of the main & score threads (does not +affect the heuristics; they are always run in the 'other' queue). Default +is 'rr'. See sched_setscheduler(2) for more details. + +.in 9 +\fIpriority\fP\fB="\fP1\fB"\fP +.in 12 +Valid values for 'rr' and 'fifo' are 1..100 inclusive. Valid values +for 'other' are -20..20 inclusive. Sets the priority of the main & score +threads. The default value is 1 (in the RR and FIFO queues, higher numbers +denote higher priority; in OTHER, lower values denote higher priority). + +.in 9 +\fIstop_cman\fP\fB="\fP0\fB"\fP +.in 12 +Ordinarily, cluster membership is left up to CMAN, not qdisk. +If this parameter is set to 1 (on), qdiskd will tell CMAN to leave the +cluster if it is unable to initialize the quorum disk during startup. This +can be used to prevent cluster participation by a node which has been +disconnected from the SAN. The default for this value is 0 (off). + +.in 9 \fIdevice\fP\fB="\fP/dev/sda1\fB"\fP .in 12 This is the device the quorum daemon will use. This device must be the @@ -256,6 +307,8 @@ on every block device found, comparing the label against the specified label. This is useful in configurations where the block device name differs on a per-node basis. +.in 8 +\fB...>\fP .in 0 .SH "3.2. The tag" @@ -268,34 +321,80 @@ .in 12 This is the program used to determine if this heuristic is alive. This can be anything which may be executed by \fI/bin/sh -c\fP. A return -value of zero indicates success; anything else indicates failure. +value of zero indicates success; anything else indicates failure. This +is required. .in 9 \fIscore\fP\fB="\fP1\fB"\fP .in 12 This is the weight of this heuristic. Be careful when determining scores -for heuristics. +for heuristics. The default score for each heuristic is 1. .in 9 \fIinterval\fP\fB="\fP2\fB"/>\fP .in 12 -This is the frequency at which we poll the heuristic. +This is the frequency (in seconds) at which we poll the heuristic. The +default interval for every heuristic is 2 seconds. +.in 0 + +.in 9 +\fItko\fP\fB="\fP1\fB"/>\fP +.in 12 +After this many failed attempts to run the heuristic, it is considered DOWN, +and its score is removed. The default tko for each heuristic is 1, which +may be inadequate for things such as 'ping'. +.in 8 +\fB/>\fP .in 0 -.SH "3.3. Example" + +.SH "3.3. Examples" +.SH "3.3.1. 3 cluster nodes & 3 routers" +.in 8 + +.br + +.in 12 + +.br + +.br + .in 8 + +.br .in 12 - + +.br + +.br + +.br +.in 8 + + +.SH "3.3.2. 2 cluster nodes & 1 IP tiebreaker" +.in 8 + +.br + +.in 12 + .br - + +.in 8 + .br - + +.in 12 + .br .in 8 .in 0 + .SH "3.4. Heuristic score considerations" * Heuristic timeouts should be set high enough to allow the previous run of a given heuristic to complete. --- cluster/cman/man/mkqdisk.8 2006/07/21 17:55:04 1.2 +++ cluster/cman/man/mkqdisk.8 2007/01/22 22:50:12 1.2.6.1 @@ -13,11 +13,16 @@ .IP "\-c device \-l label" Initialize a new cluster quorum disk. This will destroy all data on the given device. If a cluster is currently using that device as a quorum disk, the -entire cluster will malfunction. Do not ru +entire cluster will malfunction. Do not run this on an active cluster when +qdiskd is running. Only one device on the SAN should ever have the given +label; using multiple different devices is currently not supported (it is +expected a RAID array is used for quorum disk redundancy). The label can be +any textual string up to 127 characters - and is therefore enough space to hold +a UUID created with uuidgen(1). .IP "\-f label" -Find the cluster quorum disk with the given label and display information about it.. +Find the cluster quorum disk with the given label and display information about it. .IP "\-L" Display information on all accessible cluster quorum disks. .SH "SEE ALSO" -qdisk(5) qdiskd(8) +qdisk(5), qdiskd(8), uuidgen(1)