All of lore.kernel.org
 help / color / mirror / Atom feed
From: lhh@sourceware.org <lhh@sourceware.org>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] cluster/cman qdisk/score.c qdisk/score.h qdisk ...
Date: 21 Feb 2007 20:25:12 -0000	[thread overview]
Message-ID: <20070221202512.14032.qmail@sourceware.org> (raw)

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	STABLE
Changes by:	lhh at sourceware.org	2007-02-21 20:25:10

Modified files:
	cman/qdisk     : score.c score.h main.c disk.h 
	cman/man       : qdisk.5 qdiskd.8 
	cman/init.d    : qdiskd 

Log message:
	Resolves: 229338
	* Makes zero-heuristic mode work (#229338)
	
	General (small) fixes:
	* Add time stamp to status file
	* Hush stdout/stderr from init script
	* Give lots of information in status file if debug mode is enabled
	
	Fixes for clusters with long failover times (e.g. 2+ minutes):
	* Enable status file generation during initialization loop
	* Allow termination (e.g. service qdiskd stop) during initialization loop
	* Add tunables for clusters with long failure detection times (e.g. 2+ minutes)

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/score.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.2.2.2&r2=1.2.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/score.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.2.2.2&r2=1.2.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/main.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.3.2.3&r2=1.3.2.4
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/disk.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.3.2.3&r2=1.3.2.4
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/qdisk.5.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.2.4.2&r2=1.2.4.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/qdiskd.8.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.2.4.1&r2=1.2.4.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/init.d/qdiskd.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.2.2.1&r2=1.2.2.2

--- cluster/cman/qdisk/score.c	2007/01/22 23:01:53	1.2.2.2
+++ cluster/cman/qdisk/score.c	2007/02/21 20:25:09	1.2.2.3
@@ -143,6 +143,7 @@
 	*score = 0;
 	*maxscore = 0;
 	
+	printf("max = %d\n", max);
 	/* Allow operation w/o any heuristics */
 	if (!max) {
 		*score = *maxscore = 1;
@@ -332,6 +333,20 @@
 
 
 /**
+  Call this when no heuristics are configured, to run in master-wins mode
+ */
+int
+fudge_scoring(void)
+{
+	pthread_mutex_lock(&sc_lock);
+	_score = _maxscore = 1;
+	pthread_mutex_unlock(&sc_lock);
+
+	return 0;
+}
+
+
+/**
   Loop for the scoring thread.
  */
 void *
--- cluster/cman/qdisk/score.h	2007/01/22 23:01:53	1.2.2.2
+++ cluster/cman/qdisk/score.h	2007/02/21 20:25:09	1.2.2.3
@@ -59,4 +59,11 @@
  */
 int get_my_score(int *score, int *maxscore);
 
+/* 
+   Set score + maxscore to 1.  Call if no heuristics are present
+   to enable master-wins mode
+ */
+int fudge_scoring(void);
+
+
 #endif
--- cluster/cman/qdisk/main.c	2007/01/26 14:37:28	1.3.2.3
+++ cluster/cman/qdisk/main.c	2007/02/21 20:25:09	1.3.2.4
@@ -66,7 +66,9 @@
 inline void _diff_tv(struct timeval *dest, struct timeval *start,
 		     struct timeval *end);
 
-static int _running = 0;
+static int _running = 1;
+void update_local_status(qd_ctx *ctx, node_info_t *ni, int max, int score,
+		    	 int score_req, int score_max);
 
 
 static void
@@ -158,6 +160,8 @@
 			continue;
 		} 
 		/* message. */
+		memcpy(&(ni[x].ni_last_msg), &(ni[x].ni_msg),
+		       sizeof(ni[x].ni_last_msg));
 		ni[x].ni_msg.m_arg = sb->ps_arg;
 		ni[x].ni_msg.m_msg = sb->ps_msg;
 		ni[x].ni_msg.m_seq = sb->ps_seq;
@@ -325,7 +329,7 @@
 
 		   Transition from Offline -> Online
 		 */
-		if (ni[x].ni_seen > (ctx->qc_tko / 2) &&
+		if (ni[x].ni_seen > ctx->qc_tko_up &&
 		    !state_run(ni[x].ni_state)) {
 			/*
 			   Node-join - everyone just kind of "agrees"
@@ -446,7 +450,7 @@
 int
 quorum_init(qd_ctx *ctx, node_info_t *ni, int max, struct h_data *h, int maxh)
 {
-	int x = 0, score, maxscore;
+	int x = 0, score, maxscore, score_req;
 
 	clulog(LOG_INFO, "Quorum Daemon Initializing\n");
 	
@@ -464,16 +468,22 @@
 		return -1;
 	}
 	
-	start_score_thread(ctx, h, maxh);
+	if (h && maxh) {
+		start_score_thread(ctx, h, maxh);
+	} else {
+		clulog(LOG_DEBUG, "Permanently setting score to 1/1\n");
+		fudge_scoring();
+	}
 
 	node_info_init(ni, max);
+	ctx->qc_status = S_INIT;
 	if (qd_write_status(ctx, ctx->qc_my_id,
 			    S_INIT, NULL, NULL, NULL) != 0) {
 		clulog(LOG_CRIT, "Could not initialize status block!\n");
 		return -1;
 	}
 
-	while (++x <= ctx->qc_tko) {
+	while (++x <= ctx->qc_tko && _running) {
 		read_node_blocks(ctx, ni, max);
 		check_transitions(ctx, ni, max, NULL);
 
@@ -483,10 +493,16 @@
 			return -1;
 		}
 
+		get_my_score(&score, &maxscore);
+		score_req = ctx->qc_scoremin;
+		if (score_req <= 0)
+			score_req = (maxscore/2 + 1);
+		update_local_status(ctx, ni, max, score, score_req, maxscore);
+
 		sleep(ctx->qc_interval);
 	}
 
-	get_my_score(&score,&maxscore);
+	get_my_score(&score, &maxscore);
 	clulog(LOG_INFO, "Initial score %d/%d\n", score, maxscore);
 	clulog(LOG_INFO, "Initialization complete\n");
 
@@ -625,11 +641,41 @@
 
 
 void
+print_node_info(FILE *fp, node_info_t *ni)
+{
+	fprintf(fp, "node_info_t [node %d] {\n", ni->ni_status.ps_nodeid);
+	fprintf(fp, "    ni_incarnation = 0x%08x%08x\n",
+		((int)(ni->ni_incarnation>>32))&0xffffffff,
+		((int)(ni->ni_incarnation)&0xffffffff));
+	fprintf(fp, "    ni_evil_incarnation = 0x%08x%08x\n",
+		((int)(ni->ni_evil_incarnation>>32))&0xffffffff,
+		((int)(ni->ni_evil_incarnation)&0xffffffff));
+	fprintf(fp, "    ni_last_seen = %s", ctime(&ni->ni_last_seen));
+	fprintf(fp, "    ni_misses = %d\n", ni->ni_misses);
+	fprintf(fp, "    ni_seen = %d\n", ni->ni_seen);
+	fprintf(fp, "    ni_msg = {\n");
+	fprintf(fp, "        m_msg = 0x%08x\n", ni->ni_msg.m_msg);
+	fprintf(fp, "        m_arg = %d\n", ni->ni_msg.m_arg);
+	fprintf(fp, "        m_seq = %d\n", ni->ni_msg.m_seq);
+	fprintf(fp, "    }\n");
+	fprintf(fp, "    ni_last_msg = {\n");
+	fprintf(fp, "        m_msg = 0x%08x\n", ni->ni_last_msg.m_msg);
+	fprintf(fp, "        m_arg = %d\n", ni->ni_last_msg.m_arg);
+	fprintf(fp, "        m_seq = %d\n", ni->ni_last_msg.m_seq);
+	fprintf(fp, "    }\n");
+	fprintf(fp, "    ni_state = 0x%08x (%s)\n", ni->ni_state,
+		state_str(ni->ni_state));
+	fprintf(fp, "}\n\n");
+}
+
+
+void
 update_local_status(qd_ctx *ctx, node_info_t *ni, int max, int score,
 		    int score_req, int score_max)
 {
 	FILE *fp;
 	int x, need_close = 0;
+	time_t now;
 
 	if (!ctx->qc_status_file)
 		return;
@@ -643,26 +689,25 @@
 		need_close = 1;
 	}
 
+	now = time(NULL);
+	fprintf(fp, "Time Stamp: %s", ctime(&now));
 	fprintf(fp, "Node ID: %d\n", ctx->qc_my_id);
 	
-	if (ctx->qc_master)
-		fprintf(fp, "Master Node ID: %d\n", ctx->qc_master);
-	else 
-		fprintf(fp, "Master Node ID: (none)\n");
-	
 	fprintf(fp, "Score: %d/%d (Minimum required = %d)\n",
 		score, score_max, score_req);
 	fprintf(fp, "Current state: %s\n", state_str(ctx->qc_status));
+
+	/*
 	fprintf(fp, "Current disk state: %s\n",
 		state_str(ctx->qc_disk_status));
-
+	 */
 	fprintf(fp, "Initializing Set: {");
 	for (x=0; x<max; x++) {
-		if (ni[x].ni_state == S_INIT)
+		if (ni[x].ni_status.ps_state == S_INIT && ni[x].ni_seen)
 			fprintf(fp," %d", ni[x].ni_status.ps_nodeid);
 	}
 	fprintf(fp, " }\n");
-	
+
 	fprintf(fp, "Visible Set: {");
 	for (x=0; x<max; x++) {
 		if (ni[x].ni_state >= S_RUN || ni[x].ni_status.ps_nodeid == 
@@ -671,6 +716,14 @@
 	}
 	fprintf(fp, " }\n");
 	
+	if (ctx->qc_status == S_INIT)
+		goto out;
+	
+	if (ctx->qc_master)
+		fprintf(fp, "Master Node ID: %d\n", ctx->qc_master);
+	else 
+		fprintf(fp, "Master Node ID: (none)\n");
+
 	if (!ctx->qc_master)
 		goto out;
 
@@ -686,6 +739,11 @@
 	fprintf(fp, " }\n");
 
 out:
+	if (ctx->qc_flags & RF_DEBUG) {
+		for (x = 0; x < max; x++)
+			print_node_info(fp, &ni[x]);
+	}
+
 	fprintf(fp, "\n");
 	if (need_close)
 		fclose(fp);
@@ -823,7 +881,10 @@
 
 		/* Check heuristics and remove ourself if necessary */
 		get_my_score(&score, &score_max);
-		upgrade = 0;
+
+		/* If we recently upgraded, decrement our wait time */
+		if (upgrade > 0)
+			--upgrade;
 
 		score_req = ctx->qc_scoremin;
 		if (score_req <= 0)
@@ -859,9 +920,7 @@
 				       "upgrading\n",
 				       score, score_max, score_req);
 				ctx->qc_status = S_RUN;
-				upgrade = (ctx->qc_tko / 3);
-				if (upgrade == 0)
-					upgrade = 1;
+				upgrade = ctx->qc_upgrade_wait;
 			}
 		}
 
@@ -905,7 +964,7 @@
 				 * Give ample time to become aware of other
 				 * nodes
 				 */
-				if (bid_pending < (ctx->qc_tko / 3))
+				if (bid_pending < (ctx->qc_master_wait))
 					break;
 				
 				clulog(LOG_INFO,
@@ -1060,6 +1119,8 @@
 	ctx->qc_scoremin = 0;
 	ctx->qc_flags = RF_REBOOT | RF_ALLOW_KILL | RF_UPTIME;
 			/* | RF_STOP_CMAN;*/
+	if (debug)
+		ctx->qc_flags |= RF_DEBUG;
 	ctx->qc_sched = SCHED_RR;
 	ctx->qc_sched_prio = 1;
 
@@ -1100,6 +1161,38 @@
 		if (ctx->qc_tko < 3)
 			ctx->qc_tko = 3;
 	}
+
+	/* Get up-tko (transition off->online) */
+	ctx->qc_tko_up = (ctx->qc_tko / 2);
+	snprintf(query, sizeof(query), "/cluster/quorumd/@tko_up");
+	if (ccs_get(ccsfd, query, &val) == 0) {
+		ctx->qc_tko_up = atoi(val);
+		free(val);
+	}
+	if (ctx->qc_tko_up < 2)
+		ctx->qc_tko_up = 2;
+
+	/* After coming online, wait this many intervals before
+	   being allowed to bid for master. */
+	ctx->qc_upgrade_wait = 2; /* (ctx->qc_tko / 3); */
+	snprintf(query, sizeof(query), "/cluster/quorumd/@upgrade_wait");
+	if (ccs_get(ccsfd, query, &val) == 0) {
+		ctx->qc_upgrade_wait = atoi(val);
+		free(val);
+	}
+	if (ctx->qc_upgrade_wait < 1)
+		ctx->qc_upgrade_wait = 1;
+
+	/* wait this many intervals after bidding for master before
+	   becoming Caesar  */
+	ctx->qc_master_wait = (ctx->qc_tko / 3);
+	snprintf(query, sizeof(query), "/cluster/quorumd/@master_wait");
+	if (ccs_get(ccsfd, query, &val) == 0) {
+		ctx->qc_master_wait = atoi(val);
+		free(val);
+	}
+	if (ctx->qc_master_wait < 2)
+		ctx->qc_master_wait = 2;
 		
 	/* Get votes */
 	snprintf(query, sizeof(query), "/cluster/quorumd/@votes");
@@ -1274,7 +1367,7 @@
 main(int argc, char **argv)
 {
 	cman_node_t me;
-	int cfh, rv, forked = 0;
+	int cfh, rv, forked = 0, nfd = -1;
 	qd_ctx ctx;
 	cman_handle_t ch;
 	node_info_t ni[MAX_NODES_DISK];
@@ -1282,13 +1375,13 @@
 	char debug = 0, foreground = 0;
 	char device[128];
 	pid_t pid;
-	
+
 	if (check_process_running(argv[0], &pid) && pid !=getpid()) {
 		printf("QDisk services already running\n");
 		return 0;
 	}
 	
-	while ((rv = getopt(argc, argv, "fd")) != EOF) {
+	while ((rv = getopt(argc, argv, "fdQ")) != EOF) {
 		switch (rv) {
 		case 'd':
 			debug = 1;
@@ -1296,6 +1389,18 @@
 		case 'f':
 			foreground = 1;
 			clu_log_console(1);
+			break;
+		case 'Q':
+			/* Make qdisk very quiet */
+			nfd = open("/dev/null", O_RDWR);
+			close(0);
+			close(1);
+			close(2);
+			dup2(nfd, 0);
+			dup2(nfd, 1);
+			dup2(nfd, 2);
+			close(nfd);
+			break;
 		default:
 			break;
 		}
@@ -1393,6 +1498,9 @@
 		check_stop_cman(&ctx);
 		return -1;
 	}
+
+	if (!_running)
+		return 0;
 	
 	cman_register_quorum_device(ctx.qc_ch, ctx.qc_device, ctx.qc_votes);
 	/*
--- cluster/cman/qdisk/disk.h	2007/01/26 14:37:27	1.3.2.3
+++ cluster/cman/qdisk/disk.h	2007/02/21 20:25:09	1.3.2.4
@@ -240,6 +240,9 @@
 	int qc_writes;
 	int qc_interval;
 	int qc_tko;
+	int qc_tko_up;
+	int qc_upgrade_wait;
+	int qc_master_wait;
 	int qc_votes;
 	int qc_scoremin;
 	int qc_sched;
@@ -247,6 +250,7 @@
 	disk_node_state_t qc_disk_status;
 	disk_node_state_t qc_status;
 	int qc_master;		/* Master?! */
+	int _pad_;
 	run_flag_t qc_flags;
 	cman_handle_t qc_ch;
 	char *qc_device;
--- cluster/cman/man/qdisk.5	2007/01/22 23:01:53	1.2.4.2
+++ cluster/cman/man/qdisk.5	2007/02/21 20:25:09	1.2.4.3
@@ -1,4 +1,4 @@
-.TH "QDisk" "21" "Jan 2007" "" "Cluster Quorum Disk"
+.TH "QDisk" "5" "20 Feb 2007" "" "Cluster Quorum Disk"
 .SH "NAME"
 QDisk 1.2 \- a disk-based quorum daemon for CMAN / Linux-Cluster
 .SH "1. Overview"
@@ -205,7 +205,7 @@
 .in 9
 \fIinterval\fP\fB="\fP1\fB"\fP
 .in 12 
-This is the frequency of read/write cycles
+This is the interval between read/write cycles, in seconds.
 
 .in 9
 \fItko\fP\fB="\fP10\fB"\fP
@@ -213,6 +213,26 @@
 This is the number of cycles a node must miss in order to be declared dead.
 
 .in 9
+\fItko_up\fP\fB="\fPX\fB"\fP
+.in 12
+This is the number of cycles a node must be seen in order to be declared
+online.  Default is \fBfloor(tko/2)\fP.
+
+.in 9
+\fIupgrade_wait\fP\fB="\fP2\fB"\fP
+.in 12
+This is the number of cycles a node must wait before initiating a bid
+for master status after heuristic scoring becomes sufficient.  The
+default is 2.  This cannot be set to 0 and should not exceed \fBtko\fP.
+
+.in 9
+\fImaster_wait\fP\fB="\fPX\fB"\fP
+.in 12
+This is the number of cycles a node must wait for votes before declaring
+itself master after making a bid.  Default is \fBfloor(tko/3)\fP.
+This cannot be less than 2 and should not exceed \fBtko\fP.
+
+.in 9
 \fIvotes\fP\fB="\fP3\fB"\fP
 .in 12
 This is the number of votes the quorum daemon advertises to CMAN when it
--- cluster/cman/man/qdiskd.8	2006/07/21 18:01:38	1.2.4.1
+++ cluster/cman/man/qdiskd.8	2007/02/21 20:25:09	1.2.4.2
@@ -15,6 +15,11 @@
 Run in the foreground (do not fork / daemonize).
 .IP "\-d"
 Enable debug output.
+.IP "\-Q"
+Close stdin/out/err immediately, before doing validations.  This
+is primarily for use when being called from an init script.  Using
+this option will stop all output; it cannot be used with the \-d
+option.
 
 .SH "SEE ALSO"
 mkqdisk(8), qdisk(5), cman(5)
--- cluster/cman/init.d/qdiskd	2006/07/21 18:01:38	1.2.2.1
+++ cluster/cman/init.d/qdiskd	2007/02/21 20:25:09	1.2.2.2
@@ -19,7 +19,7 @@
 # See how we were called.
 case "$1" in
   start)
-	action "Starting the Quorum Disk Daemon:" qdiskd
+	action "Starting the Quorum Disk Daemon:" qdiskd -Q
 	rtrn=$?
 	[ $rtrn = 0 ] && touch $LOCK_FILE
 	;;



                 reply	other threads:[~2007-02-21 20:25 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20070221202512.14032.qmail@sourceware.org \
    --to=lhh@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.