From mboxrd@z Thu Jan 1 00:00:00 1970 From: lhh@sourceware.org Date: 21 Feb 2007 20:22:56 -0000 Subject: [Cluster-devel] cluster/cman qdisk/score.c qdisk/disk.h qdisk/ ... Message-ID: <20070221202256.11988.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Branch: RHEL5 Changes by: lhh at sourceware.org 2007-02-21 20:22:54 Modified files: cman/qdisk : score.c disk.h score.h main.c cman/man : qdisk.5 qdiskd.8 cman/init.d : qdiskd Log message: Resolves: 229338 * Makes zero-heuristic mode work (#229338) General (small) fixes: * Add time stamp to status file * Hush stdout/stderr from init script * Give lots of information in status file if debug mode is enabled Fixes for clusters with long failover times (e.g. 2+ minutes): * Enable status file generation during initialization loop * Allow termination (e.g. service qdiskd stop) during initialization loop * Add tunables for clusters with long failure detection times (e.g. 
2+ minutes) Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/score.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2.4.1&r2=1.2.4.2 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/disk.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.4.2.2&r2=1.4.2.3 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/score.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2.4.1&r2=1.2.4.2 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/main.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.4.2.4&r2=1.4.2.5 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/qdisk.5.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.3.2.2&r2=1.3.2.3 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/qdiskd.8.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2&r2=1.2.6.1 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/init.d/qdiskd.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2&r2=1.2.4.1 --- cluster/cman/qdisk/score.c 2007/01/22 22:50:11 1.2.4.1 +++ cluster/cman/qdisk/score.c 2007/02/21 20:22:53 1.2.4.2 @@ -143,6 +143,7 @@ *score = 0; *maxscore = 0; + printf("max = %d\n", max); /* Allow operation w/o any heuristics */ if (!max) { *score = *maxscore = 1; @@ -332,6 +333,20 @@ /** + Call this if no heuristics are set to run in master-wins mode + */ +int +fudge_scoring(void) +{ + pthread_mutex_lock(&sc_lock); + _score = _maxscore = 1; + pthread_mutex_unlock(&sc_lock); + + return 0; +} + + +/** Loop for the scoring thread. */ void * --- cluster/cman/qdisk/disk.h 2007/01/23 17:56:14 1.4.2.2 +++ cluster/cman/qdisk/disk.h 2007/02/21 20:22:53 1.4.2.3 @@ -240,6 +240,9 @@ int qc_writes; int qc_interval; int qc_tko; + int qc_tko_up; + int qc_upgrade_wait; + int qc_master_wait; int qc_votes; int qc_scoremin; int qc_sched; @@ -247,6 +250,7 @@ disk_node_state_t qc_disk_status; disk_node_state_t qc_status; int qc_master; /* Master?! 
*/ + int _pad_; run_flag_t qc_flags; cman_handle_t qc_ch; char *qc_device; --- cluster/cman/qdisk/score.h 2007/01/22 22:50:11 1.2.4.1 +++ cluster/cman/qdisk/score.h 2007/02/21 20:22:53 1.2.4.2 @@ -59,4 +59,11 @@ */ int get_my_score(int *score, int *maxscore); +/* + Set score + maxscore to 1. Call if no heuristics are present + to enable master-wins mode + */ +int fudge_scoring(void); + + #endif --- cluster/cman/qdisk/main.c 2007/01/23 17:56:14 1.4.2.4 +++ cluster/cman/qdisk/main.c 2007/02/21 20:22:53 1.4.2.5 @@ -66,7 +66,9 @@ inline void _diff_tv(struct timeval *dest, struct timeval *start, struct timeval *end); -static int _running = 0; +static int _running = 1; +void update_local_status(qd_ctx *ctx, node_info_t *ni, int max, int score, + int score_req, int score_max); static void @@ -158,6 +160,8 @@ continue; } /* message. */ + memcpy(&(ni[x].ni_last_msg), &(ni[x].ni_msg), + sizeof(ni[x].ni_last_msg)); ni[x].ni_msg.m_arg = sb->ps_arg; ni[x].ni_msg.m_msg = sb->ps_msg; ni[x].ni_msg.m_seq = sb->ps_seq; @@ -325,7 +329,7 @@ Transition from Offline -> Online */ - if (ni[x].ni_seen > (ctx->qc_tko / 2) && + if (ni[x].ni_seen > ctx->qc_tko_up && !state_run(ni[x].ni_state)) { /* Node-join - everyone just kind of "agrees" @@ -446,7 +450,7 @@ int quorum_init(qd_ctx *ctx, node_info_t *ni, int max, struct h_data *h, int maxh) { - int x = 0, score, maxscore; + int x = 0, score, maxscore, score_req; clulog(LOG_INFO, "Quorum Daemon Initializing\n"); @@ -464,16 +468,22 @@ return -1; } - start_score_thread(ctx, h, maxh); + if (h && maxh) { + start_score_thread(ctx, h, maxh); + } else { + clulog(LOG_DEBUG, "Permanently setting score to 1/1\n"); + fudge_scoring(); + } node_info_init(ni, max); + ctx->qc_status = S_INIT; if (qd_write_status(ctx, ctx->qc_my_id, S_INIT, NULL, NULL, NULL) != 0) { clulog(LOG_CRIT, "Could not initialize status block!\n"); return -1; } - while (++x <= ctx->qc_tko) { + while (++x <= ctx->qc_tko && _running) { read_node_blocks(ctx, ni, max); 
check_transitions(ctx, ni, max, NULL); @@ -483,10 +493,16 @@ return -1; } + get_my_score(&score, &maxscore); + score_req = ctx->qc_scoremin; + if (score_req <= 0) + score_req = (maxscore/2 + 1); + update_local_status(ctx, ni, max, score, score_req, maxscore); + sleep(ctx->qc_interval); } - get_my_score(&score,&maxscore); + get_my_score(&score, &maxscore); clulog(LOG_INFO, "Initial score %d/%d\n", score, maxscore); clulog(LOG_INFO, "Initialization complete\n"); @@ -625,11 +641,41 @@ void +print_node_info(FILE *fp, node_info_t *ni) +{ + fprintf(fp, "node_info_t [node %d] {\n", ni->ni_status.ps_nodeid); + fprintf(fp, " ni_incarnation = 0x%08x%08x\n", + ((int)(ni->ni_incarnation>>32))&0xffffffff, + ((int)(ni->ni_incarnation)&0xffffffff)); + fprintf(fp, " ni_evil_incarnation = 0x%08x%08x\n", + ((int)(ni->ni_evil_incarnation>>32))&0xffffffff, + ((int)(ni->ni_evil_incarnation)&0xffffffff)); + fprintf(fp, " ni_last_seen = %s", ctime(&ni->ni_last_seen)); + fprintf(fp, " ni_misses = %d\n", ni->ni_misses); + fprintf(fp, " ni_seen = %d\n", ni->ni_seen); + fprintf(fp, " ni_msg = {\n"); + fprintf(fp, " m_msg = 0x%08x\n", ni->ni_msg.m_msg); + fprintf(fp, " m_arg = %d\n", ni->ni_msg.m_arg); + fprintf(fp, " m_seq = %d\n", ni->ni_msg.m_seq); + fprintf(fp, " }\n"); + fprintf(fp, " ni_last_msg = {\n"); + fprintf(fp, " m_msg = 0x%08x\n", ni->ni_last_msg.m_msg); + fprintf(fp, " m_arg = %d\n", ni->ni_last_msg.m_arg); + fprintf(fp, " m_seq = %d\n", ni->ni_last_msg.m_seq); + fprintf(fp, " }\n"); + fprintf(fp, " ni_state = 0x%08x (%s)\n", ni->ni_state, + state_str(ni->ni_state)); + fprintf(fp, "}\n\n"); +} + + +void update_local_status(qd_ctx *ctx, node_info_t *ni, int max, int score, int score_req, int score_max) { FILE *fp; int x, need_close = 0; + time_t now; if (!ctx->qc_status_file) return; @@ -643,26 +689,25 @@ need_close = 1; } + now = time(NULL); + fprintf(fp, "Time Stamp: %s", ctime(&now)); fprintf(fp, "Node ID: %d\n", ctx->qc_my_id); - if (ctx->qc_master) - fprintf(fp, "Master 
Node ID: %d\n", ctx->qc_master); - else - fprintf(fp, "Master Node ID: (none)\n"); - fprintf(fp, "Score: %d/%d (Minimum required = %d)\n", score, score_max, score_req); fprintf(fp, "Current state: %s\n", state_str(ctx->qc_status)); + + /* fprintf(fp, "Current disk state: %s\n", state_str(ctx->qc_disk_status)); - + */ fprintf(fp, "Initializing Set: {"); for (x=0; x<[... text between '<' and '>' lost in archive extraction ...]>= S_RUN || ni[x].ni_status.ps_nodeid == @@ -671,6 +716,14 @@ } fprintf(fp, " }\n"); + if (ctx->qc_status == S_INIT) + goto out; + + if (ctx->qc_master) + fprintf(fp, "Master Node ID: %d\n", ctx->qc_master); + else + fprintf(fp, "Master Node ID: (none)\n"); + if (!ctx->qc_master) goto out; @@ -686,6 +739,11 @@ fprintf(fp, " }\n"); out: + if (ctx->qc_flags & RF_DEBUG) { + for (x = 0; x < max; x++) + print_node_info(fp, &ni[x]); + } + fprintf(fp, "\n"); if (need_close) fclose(fp); @@ -823,7 +881,10 @@ /* Check heuristics and remove ourself if necessary */ get_my_score(&score, &score_max); - upgrade = 0; + + /* If we recently upgraded, decrement our wait time */ + if (upgrade > 0) + --upgrade; score_req = ctx->qc_scoremin; if (score_req <= 0) @@ -859,9 +920,7 @@ "upgrading\n", score, score_max, score_req); ctx->qc_status = S_RUN; - upgrade = (ctx->qc_tko / 3); - if (upgrade == 0) - upgrade = 1; + upgrade = ctx->qc_upgrade_wait; } } @@ -905,7 +964,7 @@ * Give ample time to become aware of other * nodes */ - if (bid_pending < (ctx->qc_tko / 3)) + if (bid_pending < (ctx->qc_master_wait)) break; clulog(LOG_INFO, @@ -1060,6 +1119,8 @@ ctx->qc_scoremin = 0; ctx->qc_flags = RF_REBOOT | RF_ALLOW_KILL | RF_UPTIME; /* | RF_STOP_CMAN;*/ + if (debug) + ctx->qc_flags |= RF_DEBUG; ctx->qc_sched = SCHED_RR; ctx->qc_sched_prio = 1; @@ -1100,6 +1161,38 @@ if (ctx->qc_tko < 3) ctx->qc_tko = 3; } + + /* Get up-tko (transition off->online) */ + ctx->qc_tko_up = (ctx->qc_tko / 2); + snprintf(query, sizeof(query), "/cluster/quorumd/@tko_up"); + if (ccs_get(ccsfd, query, &val) == 0) { + ctx->qc_tko_up = atoi(val); + free(val); + } + 
if (ctx->qc_tko_up < 2) + ctx->qc_tko_up = 2; + + /* After coming online, wait this many intervals before + being allowed to bid for master. */ + ctx->qc_upgrade_wait = 2; /* (ctx->qc_tko / 3); */ + snprintf(query, sizeof(query), "/cluster/quorumd/@upgrade_wait"); + if (ccs_get(ccsfd, query, &val) == 0) { + ctx->qc_upgrade_wait = atoi(val); + free(val); + } + if (ctx->qc_upgrade_wait < 1) + ctx->qc_upgrade_wait = 1; + + /* wait this many intervals after bidding for master before + becoming Caesar */ + ctx->qc_master_wait = (ctx->qc_tko / 3); + snprintf(query, sizeof(query), "/cluster/quorumd/@master_wait"); + if (ccs_get(ccsfd, query, &val) == 0) { + ctx->qc_master_wait = atoi(val); + free(val); + } + if (ctx->qc_master_wait < 2) + ctx->qc_master_wait = 2; /* Get votes */ snprintf(query, sizeof(query), "/cluster/quorumd/@votes"); @@ -1275,7 +1368,7 @@ main(int argc, char **argv) { cman_node_t me; - int cfh, rv, forked = 0; + int cfh, rv, forked = 0, nfd = -1; qd_ctx ctx; cman_handle_t ch; node_info_t ni[MAX_NODES_DISK]; @@ -1283,13 +1376,13 @@ char debug = 0, foreground = 0; char device[128]; pid_t pid; - + if (check_process_running(argv[0], &pid) && pid !=getpid()) { printf("QDisk services already running\n"); return 0; } - while ((rv = getopt(argc, argv, "fd")) != EOF) { + while ((rv = getopt(argc, argv, "fdQ")) != EOF) { switch (rv) { case 'd': debug = 1; @@ -1297,6 +1390,18 @@ case 'f': foreground = 1; clu_log_console(1); + break; + case 'Q': + /* Make qdisk very quiet */ + nfd = open("/dev/null", O_RDWR); + close(0); + close(1); + close(2); + dup2(nfd, 0); + dup2(nfd, 1); + dup2(nfd, 2); + close(nfd); + break; default: break; } @@ -1394,6 +1499,9 @@ check_stop_cman(&ctx); return -1; } + + if (!_running) + return 0; cman_register_quorum_device(ctx.qc_ch, ctx.qc_device, ctx.qc_votes); /* --- cluster/cman/man/qdisk.5 2007/01/26 21:12:39 1.3.2.2 +++ cluster/cman/man/qdisk.5 2007/02/21 20:22:54 1.3.2.3 @@ -1,4 +1,4 @@ -.TH "QDisk" "21" "Jan 2007" "" "Cluster Quorum 
Disk" +.TH "QDisk" "5" "20 Feb 2007" "" "Cluster Quorum Disk" .SH "NAME" QDisk 1.2 \- a disk-based quorum daemon for CMAN / Linux-Cluster .SH "1. Overview" @@ -205,7 +205,7 @@ .in 9 \fIinterval\fP\fB="\fP1\fB"\fP .in 12 -This is the frequency of read/write cycles +This is the frequency of read/write cycles, in seconds. .in 9 \fItko\fP\fB="\fP10\fB"\fP @@ -213,6 +213,26 @@ This is the number of cycles a node must miss in order to be declared dead. .in 9 +\fItko_up\fP\fB="\fPX\fB"\fP +.in 12 +This is the number of cycles a node must be seen in order to be declared +online. Default is \fBfloor(tko/2)\fP. + +.in 9 +\fIupgrade_wait\fP\fB="\fP2\fB"\fP +.in 12 +This is the number of cycles a node must wait before initiating a bid +for master status after heuristic scoring becomes sufficient. The +default is 2. This can not be set to 0, and should not exceed \fBtko\fP. + +.in 9 +\fImaster_wait\fP\fB="\fPX\fB"\fP +.in 12 +This is the number of cycles a node must wait for votes before declaring +itself master after making a bid. Default is \fBfloor(tko/3)\fP. +This can not be less than 2 and should not exceed \fBtko\fP. + +.in 9 \fIvotes\fP\fB="\fP3\fB"\fP .in 12 This is the number of votes the quorum daemon advertises to CMAN when it --- cluster/cman/man/qdiskd.8 2006/07/21 17:55:04 1.2 +++ cluster/cman/man/qdiskd.8 2007/02/21 20:22:54 1.2.6.1 @@ -15,6 +15,11 @@ Run in the foreground (do not fork / daemonize). .IP "\-d" Enable debug output. +.IP "\-Q" +Close stdin/out/err immediately before doing validations. This +is primarily for use when being called from an init script. Using +this option will stop all output, and can not be used with the -d +option. .SH "SEE ALSO" mkqdisk(8), qdisk(5), cman(5) --- cluster/cman/init.d/qdiskd 2006/05/19 14:41:35 1.2 +++ cluster/cman/init.d/qdiskd 2007/02/21 20:22:54 1.2.4.1 @@ -19,7 +19,7 @@ # See how we were called. 
case "$1" in start) - action "Starting the Quorum Disk Daemon:" qdiskd + action "Starting the Quorum Disk Daemon:" qdiskd -Q rtrn=$? [ $rtrn = 0 ] && touch $LOCK_FILE ;;