All of lore.kernel.org
 help / color / mirror / Atom feed
From: lhh@sourceware.org <lhh@sourceware.org>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] cluster/cman Makefile init.d/Makefile man/Make ...
Date: 21 Jul 2006 18:01:40 -0000	[thread overview]
Message-ID: <20060721180140.1996.qmail@sourceware.org> (raw)

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	STABLE
Changes by:	lhh at sourceware.org	2006-07-21 18:01:38

Modified files:
	cman           : Makefile 
	cman/init.d    : Makefile 
	cman/man       : Makefile 
Added files:
	cman/init.d    : qdiskd 
	cman/man       : mkqdisk.8 qdisk.5 qdiskd.8 
	cman/qdisk     : Makefile README bitmap.c clulog.c clulog.h 
	                 crc32.c disk.c disk.h disk_util.c gettid.c 
	                 gettid.h main.c mkqdisk.c platform.h proc.c 
	                 score.c score.h 

Log message:
	Merge from RHEL4 branch; add QDisk

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/Makefile.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.4.8.1&r2=1.4.8.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/init.d/qdiskd.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/init.d/Makefile.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1&r2=1.1.8.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/mkqdisk.8.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.4.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/qdisk.5.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.4.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/qdiskd.8.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.4.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/Makefile.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1&r2=1.1.8.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/Makefile.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.5.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/README.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.4.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/bitmap.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/clulog.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/clulog.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/crc32.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/disk.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.4.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/disk.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.3.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/disk_util.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/gettid.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.4.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/gettid.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/main.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.3.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/mkqdisk.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.3.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/platform.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/proc.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/score.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/score.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1

--- cluster/cman/Makefile	2005/07/05 16:01:29	1.4.8.1
+++ cluster/cman/Makefile	2006/07/21 18:01:37	1.4.8.2
@@ -14,14 +14,17 @@
 all:
 	cd cman_tool && ${MAKE} all
 	cd lib && ${MAKE} all
+	cd qdisk && ${MAKE} all
 
 copytobin:
 	cd cman_tool && ${MAKE} copytobin
+	cd qdisk && ${MAKE} copytobin
 	cd lib && ${MAKE} copytobin
 
 clean:
 	cd bin && ${MAKE} clean
 	cd cman_tool && ${MAKE} clean
+	cd qdisk && ${MAKE} clean
 	cd lib && ${MAKE} clean
 
 distclean: clean
@@ -31,10 +34,12 @@
 	cd man && ${MAKE} install
 	cd cman_tool && ${MAKE} install
 	cd lib && ${MAKE} install
+	cd qdisk && ${MAKE} install
 	cd init.d && ${MAKE} install
 
 uninstall:
 	cd cman_tool && ${MAKE} uninstall
 	cd lib && ${MAKE} uninstall
 	cd man && ${MAKE} uninstall
+	cd qdisk && ${MAKE} uninstall
 	cd init.d && ${MAKE} uninstall
/cvs/cluster/cluster/cman/init.d/qdiskd,v  -->  standard output
revision 1.2.2.1
--- cluster/cman/init.d/qdiskd
+++ -	2006-07-21 18:01:38.720108000 +0000
@@ -0,0 +1,63 @@
+#!/bin/bash
+#
+# chkconfig: 345 22 78
+# description: Starts and stops the quorum disk daemon
+#
+#	       
+### BEGIN INIT INFO
+# Provides: 
+### END INIT INFO
+
+. /etc/init.d/functions
+[ -f /etc/sysconfig/cluster ] && . /etc/sysconfig/cluster
+
+LOCK_FILE="/var/lock/subsys/qdiskd"
+
+rtrn=1
+retries=0
+
+# See how we were called.
+case "$1" in
+  start)
+	action "Starting the Quorum Disk Daemon:" qdiskd
+	rtrn=$?
+	[ $rtrn = 0 ] && touch $LOCK_FILE
+	;;
+
+  stop)
+	echo -n "Stopping the Quorum Disk Daemon:"
+	killproc qdiskd
+	while [ -n "`pidof qdiskd`" ] && [ $retries -lt 5 ]; do
+		sleep 1
+		killproc qdiskd
+		((retries++))
+	done
+	if [ -z "`pidof qdiskd`" ]; then
+		echo_success
+		echo
+		rtrn=0
+		rm -f $LOCK_FILE
+	else
+		echo_failure
+		echo
+		rtrn=1
+	fi
+	;;
+
+  restart)
+	$0 stop || exit $?
+	$0 start 
+	rtrn=$?
+	;;
+
+  status)
+	status qdiskd
+	rtrn=$?
+	;;
+
+  *)
+	echo $"Usage: $0 {start|stop|restart|status}"
+	;;
+esac
+
+exit $rtrn
--- cluster/cman/init.d/Makefile	2004/12/17 20:07:59	1.1
+++ cluster/cman/init.d/Makefile	2006/07/21 18:01:38	1.1.8.1
@@ -10,7 +10,7 @@
 ###############################################################################
 ###############################################################################
 
-TARGET= cman
+TARGET= cman qdiskd
 
 UNINSTALL=${top_srcdir}/scripts/uninstall.pl
 
/cvs/cluster/cluster/cman/man/mkqdisk.8,v  -->  standard output
revision 1.2.4.1
--- cluster/cman/man/mkqdisk.8
+++ -	2006-07-21 18:01:38.882836000 +0000
@@ -0,0 +1,23 @@
+.TH "mkqdisk" "8" "July 2006" "" "Quorum Disk Management"
+.SH "NAME"
+mkqdisk \- Cluster Quorum Disk Utility
+.SH "WARNING"
+Use of this command can cause the cluster to malfunction.
+.SH "SYNOPSIS"
+\fBmkqdisk [\-?|\-h] | [\-L] | [\-f \fPlabel\fB] [\-c \fPdevice \fB -l \fPlabel\fB]
+.SH "DESCRIPTION"
+.PP 
+The \fBmkqdisk\fP command is used to create a new quorum disk or display
+existing quorum disks accessible from a given cluster node.
+.SH "OPTIONS"
+.IP "\-c device \-l label"
+Initialize a new cluster quorum disk.  This will destroy all data on the given
+device.  If a cluster is currently using that device as a quorum disk, the
+entire cluster will malfunction.  Do not run this on an active cluster.
+.IP "\-f label"
+Find the cluster quorum disk with the given label and display information about it.
+.IP "\-L"
+Display information on all accessible cluster quorum disks.
+
+.SH "SEE ALSO"
+qdisk(5) qdiskd(8)
/cvs/cluster/cluster/cman/man/qdisk.5,v  -->  standard output
revision 1.2.4.1
--- cluster/cman/man/qdisk.5
+++ -	2006-07-21 18:01:38.970862000 +0000
@@ -0,0 +1,309 @@
+.TH "QDisk" "5" "July 2006" "" "Cluster Quorum Disk"
+.SH "NAME"
+QDisk 1.0 \- a disk-based quorum daemon for CMAN / Linux-Cluster
+.SH "1. Overview"
+.SH "1.1 Problem"
+In some situations, it may be necessary or desirable to sustain
+a majority node failure of a cluster without introducing the need for
+asymmetric cluster configurations (e.g. client-server, or heavily-weighted
+voting nodes).
+
+.SH "1.2. Design Requirements"
+* Ability to sustain 1..(n-1)/n simultaneous node failures, without the
+danger of a simple network partition causing a split brain.  That is, we
+need to be able to ensure that the majority failure case is not merely
+the result of a network partition.
+
+* Ability to use external reasons for deciding which partition is the
+quorate partition in a partitioned cluster.  For example, a user may
+have a service running on one node, and that node must always be the master
+in the event of a network partition.  Or, a node might lose all network
+connectivity except the cluster communication path - in which case, a
+user may wish that node to be evicted from the cluster.
+
+* Integration with CMAN.  We must not require CMAN to run with us (or
+without us).  Linux-Cluster does not require a quorum disk normally -
+introducing new requirements on the base of how Linux-Cluster operates
+is not allowed.
+
+* Data integrity.  In order to recover from a majority failure, fencing
+is required.  The fencing subsystem is already provided by Linux-Cluster.
+
+* Non-reliance on hardware or protocol specific methods (i.e. SCSI
+reservations).  This ensures the quorum disk algorithm can be used on the
+widest range of hardware configurations possible.
+
+* Little or no memory allocation after initialization.  In critical paths
+during failover, we do not want to have to worry about being killed during
+a memory pressure situation because we request a page fault, and the Linux
+OOM killer responds...
+
+.SH "1.3. Hardware Considerations and Requirements"
+.SH "1.3.1. Concurrent, Synchronous, Read/Write Access"
+This quorum daemon requires a shared block device with concurrent read/write
+access from all nodes in the cluster.  The shared block device can be
+a multi-port SCSI RAID array, a Fiber-Channel RAID SAN, a RAIDed iSCSI
+target, or even GNBD.  The quorum daemon uses O_DIRECT to write to the
+device.
+
+.SH "1.3.2. Bargain-basement JBODs need not apply"
+There is a minimum performance requirement inherent when using disk-based
+cluster quorum algorithms, so design your cluster accordingly.  Using a
+cheap JBOD with old SCSI2 disks on a multi-initiator bus will cause 
+problems at the first load spike.  Plan your loads accordingly; a node's
+inability to write to the quorum disk in a timely manner will cause the
+cluster to evict the node.  Using host-RAID or multi-initiator parallel
+SCSI configurations with the qdisk daemon is unlikely to work, and will
+probably cause administrators a lot of frustration.  That having been
+said, because the timeouts are configurable, most hardware should work
+if the timeouts are set high enough.
+
+.SH "1.3.3. Fencing is Required"
+In order to maintain data integrity under all failure scenarios, use of
+this quorum daemon requires adequate fencing, preferably power-based
+fencing.  Watchdog timers and software-based solutions to reboot the node
+internally, while possibly sufficient, are not considered 'fencing' for 
+the purposes of using the quorum disk.
+
+.SH "1.4. Limitations"
+* At this time, this daemon supports a maximum of 16 nodes.  This is
+primarily a scalability issue: As we increase the node count, we increase
+the amount of synchronous I/O contention on the shared quorum disk.
+
+* Cluster node IDs must be statically configured in cluster.conf and
+must be numbered from 1..16 (there can be gaps, of course).
+
+* Cluster node votes should be more or less equal.
+
+* CMAN must be running before the qdisk program can start.
+
+* CMAN's eviction timeout should be at least 2x the quorum daemon's
+to give the quorum daemon adequate time to converge on a master during a
+failure + load spike situation.
+
+* The total number of votes assigned to the quorum device should be
+equal to or greater than the total number of node-votes in the cluster.
+While it is possible to assign only one (or a few) votes to the quorum
+device, the effects of doing so have not been explored.
+
+* Currently, the quorum disk daemon is difficult to use with CLVM if
+the quorum disk resides on a CLVM logical volume.  CLVM requires a
+quorate cluster to correctly operate, which introduces a chicken-and-egg
+problem for starting the cluster: CLVM needs quorum, but the quorum daemon
+needs CLVM (if and only if the quorum device lies on CLVM-managed storage).
+One way to work around this is to *not* set the cluster's expected votes
+to include the quorum daemon's votes.  Bring all nodes online, and start
+the quorum daemon *after* the whole cluster is running.  This will allow
+the expected votes to increase naturally.
+
+.SH "2. Algorithms"
+.SH "2.1. Heartbeating & Liveliness Determination"
+Nodes update individual status blocks on the quorum disk at a user-
+defined rate.  Each write of a status block alters the timestamp, which
+is what other nodes use to decide whether a node has hung or not.
+After a user-defined number of 'misses' (that is, failures to update a
+timestamp), a node is declared offline.  After a certain number of 'hits'
+(changed timestamp + "i am alive" state), the node is declared online.
+
+The status block contains additional information, such as a bitmask of
+the nodes that node believes are online.  Some of this information is
+used by the master - while some is just for performance recording, and
+may be used at a later time.  The most important pieces of information
+a node writes to its status block are:
+
+.in 12
+- Timestamp
+.br
+- Internal state (available / not available)
+.br
+- Score
+.br
+- Known max score (may be used in the future to detect invalid configurations)
+.br
+- Vote/bid messages
+.br
+- Other nodes it thinks are online
+.in 0
+
+.SH "2.2. Scoring & Heuristics"
+The administrator can configure up to 10 purely arbitrary heuristics, and
+must exercise caution in doing so.  At least one administrator-
+defined heuristic is required for operation, but it is generally a good
+idea to have more than one heuristic.  By default, only nodes scoring over
+1/2 of the total maximum score will claim they are available via the
+quorum disk, and a node (master or otherwise) whose score drops too low
+will remove itself (usually, by rebooting).
+
+The heuristics themselves can be any command executable by 'sh -c'.  For
+example, in early testing the following was used:
+
+.ti 12
+<\fBheuristic \fP\fIprogram\fP\fB="\fP[ -f /quorum ]\fB" \fP\fIscore\fP\fB="\fP10\fB" \fP\fIinterval\fP\fB="\fP2\fB"/>\fP
+
+This is a literal sh-ism which tests for the existence of a file called
+"/quorum".  Without that file, the node would claim it was unavailable.
+This is an awful example, and should never, ever be used in production,
+but is provided as an example as to what one could do...
+
+Typically, the heuristics should be snippets of shell code or commands which
+help determine a node's usefulness to the cluster or clients.  Ideally, you
+want to add traces for all of your network paths (e.g. check links, or
+ping routers), and methods to detect availability of shared storage.
+
+.SH "2.3. Master Election"
+Only one master is present at any one time in the cluster, regardless of
+how many partitions exist within the cluster itself.  The master is
+elected by a simple voting scheme in which the lowest node which believes
+it is capable of running (i.e. scores high enough) bids for master status.
+If the other nodes agree, it becomes the master.  This algorithm is 
+run whenever no master is present.
+
+If another node comes online with a lower node ID while a node is still
+bidding for master status, it will rescind its bid and vote for the lower
+node ID.  If a master dies or a bidding node dies, the voting algorithm
+is started over.  The voting algorithm typically takes two passes to
+complete.
+
+Master deaths take marginally longer to recover from than non-master
+deaths, because a new master must be elected before the old master can
+be evicted & fenced.
+
+.SH "2.4. Master Duties"
+The master node decides who is or is not in the master partition, as
+well as handles eviction of dead nodes (both via the quorum disk and via
+the linux-cluster fencing system by using the cman_kill_node() API).
+
+.SH "2.5. How it All Ties Together"
+When a master is present, and if the master believes a node to be online,
+that node will advertise to CMAN that the quorum disk is available.  The
+master will only grant a node membership if:
+
+.in 12
+(a) CMAN believes the node to be online, and
+.br
+(b) that node has made enough consecutive, timely writes
+.in 16
+to the quorum disk, and
+.in 12
+(c) the node has a high enough score to consider itself online.
+.in 0
+
+.SH "3. Configuration"
+.SH "3.1. The <quorumd> tag"
+This tag is a child of the top-level <cluster> tag.
+
+.in 8
+\fB<quorumd\fP
+.in 9
+\fIinterval\fP\fB="\fP1\fB"\fP
+.in 12 
+This is the frequency of read/write cycles
+
+.in 9
+\fItko\fP\fB="\fP10\fB"\fP
+.in 12
+This is the number of cycles a node must miss in order to be declared dead.
+
+.in 9
+\fIvotes\fP\fB="\fP3\fB"\fP
+.in 12
+This is the number of votes the quorum daemon advertises to CMAN when it
+has a high enough score.
+
+.in 9
+\fIlog_level\fP\fB="\fP4\fB"\fP
+.in 12
+This controls the verbosity of the quorum daemon in the system logs.
+0 = emergencies; 7 = debug.
+
+.in 9
+\fIlog_facility\fP\fB="\fPlocal4\fB"\fP
+.in 12
+This controls the syslog facility used by the quorum daemon when logging.
+For a complete list of available facilities, see \fBsyslog.conf(5)\fP.
+
+.in 9
+\fIstatus_file\fP\fB="\fP/foo\fB"\fP
+.in 12
+Write internal states out to this file periodically ("-" = use stdout).
+This is primarily used for debugging.
+
+.in 9
+\fImin_score\fP\fB="\fP3\fB"\fP
+.in 12
+Absolute minimum score to consider one's self "alive".  If omitted,
+or set to 0, the default function "floor((n+1)/2)" is used, where \fIn\fP
+is the sum-total of all defined heuristics' \fIscore\fP attributes.
+
+.in 9
+\fIdevice\fP\fB="\fP/dev/sda1\fB"\fP
+.in 12
+This is the device the quorum daemon will use.  This device must be the
+same on all nodes.
+
+.in 9
+\fIlabel\fP\fB="\fPmylabel\fB"/>\fP
+.in 12
+This overrides the device field if present.  If specified, the quorum
+daemon will read /proc/partitions and check for qdisk signatures
+on every block device found, comparing the label against the specified
+label.  This is useful in configurations where the block device name
+differs on a per-node basis.
+.in 0
+
+.SH "3.2.  The <heuristic> tag"
+This tag is a child of the <quorumd> tag.
+
+.in 8
+\fB<heuristic\fP
+.in 9
+\fIprogram\fP\fB="\fP/test.sh\fB"\fP
+.in 12
+This is the program used to determine if this heuristic is alive.  This
+can be anything which may be executed by \fI/bin/sh -c\fP.  A return
+value of zero indicates success; anything else indicates failure.
+
+.in 9
+\fIscore\fP\fB="\fP1\fB"\fP
+.in 12
+This is the weight of this heuristic.  Be careful when determining scores
+for heuristics.
+
+.in 9
+\fIinterval\fP\fB="\fP2\fB"/>\fP
+.in 12
+This is the frequency at which we poll the heuristic.
+.in 0
+
+.SH "3.3. Example"
+.in 8
+<quorumd interval="1" tko="10" votes="3" label="testing">
+.in 12
+<heuristic program="ping A -c1 -t1" score="1" interval="2"/>
+.br
+<heuristic program="ping B -c1 -t1" score="1" interval="2"/>
+.br
+<heuristic program="ping C -c1 -t1" score="1" interval="2"/>
+.br
+.in 8
+</quorumd>
+.in 0
+
+.SH "3.4. Heuristic score considerations"
+* Heuristic timeouts should be set high enough to allow the previous run
+of a given heuristic to complete.
+
+* Heuristic scripts returning anything except 0 as their return code 
+are considered failed.
+
+* The worst-case for improperly configured quorum heuristics is a race
+to fence where two partitions simultaneously try to kill each other.
+
+.SH "3.5. Creating a quorum disk partition"
+The mkqdisk utility can create and list currently configured quorum disks
+visible to the local node; see
+.B mkqdisk(8)
+for more details.
+
+.SH "SEE ALSO"
+mkqdisk(8), qdiskd(8), cman(5), syslog.conf(5)
/cvs/cluster/cluster/cman/man/qdiskd.8,v  -->  standard output
revision 1.2.4.1
--- cluster/cman/man/qdiskd.8
+++ -	2006-07-21 18:01:39.053646000 +0000
@@ -0,0 +1,20 @@
+.TH "qdiskd" "8" "July 2006" "" "Quorum Disk Management"
+.SH "NAME"
+qdiskd \- Cluster Quorum Disk Daemon
+.SH "SYNOPSIS"
+\fBqdiskd [\-f] [\-d]
+.SH "DESCRIPTION"
+.PP 
+The \fBqdiskd\fP daemon talks to CMAN and provides a mechanism for determining
+node-fitness in a cluster environment.  See
+.B
+qdisk(5)
+for configuration information.
+.SH "OPTIONS"
+.IP "\-f"
+Run in the foreground (do not fork / daemonize).
+.IP "\-d"
+Enable debug output.
+
+.SH "SEE ALSO"
+mkqdisk(8), qdisk(5), cman(5)
--- cluster/cman/man/Makefile	2004/08/13 06:38:22	1.1
+++ cluster/cman/man/Makefile	2006/07/21 18:01:38	1.1.8.1
@@ -18,10 +18,10 @@
 install:
 	install -d ${mandir}/man5
 	install -d ${mandir}/man8
-	install cman.5 ${mandir}/man5
-	install cman_tool.8 ${mandir}/man8
+	install cman.5 qdisk.5 ${mandir}/man5
+	install cman_tool.8 qdiskd.8 mkqdisk.8 ${mandir}/man8
 
 uninstall:
-	${UNINSTALL} cman.5 ${mandir}/man5
-	${UNINSTALL} cman_tool.8 ${mandir}/man8
+	${UNINSTALL} cman.5 qdisk.5 ${mandir}/man5
+	${UNINSTALL} cman_tool.8 qdiskd.8 mkqdisk.8 ${mandir}/man8
 
/cvs/cluster/cluster/cman/qdisk/Makefile,v  -->  standard output
revision 1.5.2.1
--- cluster/cman/qdisk/Makefile
+++ -	2006-07-21 18:01:39.244798000 +0000
@@ -0,0 +1,49 @@
+###############################################################################
+###############################################################################
+##
+##  Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+##
+##  This copyrighted material is made available to anyone wishing to use,
+##  modify, copy, or redistribute it subject to the terms and conditions
+##  of the GNU General Public License v.2.
+##
+###############################################################################
+###############################################################################
+
+top_srcdir=..
+UNINSTALL=${top_srcdir}/scripts/uninstall.pl
+
+include ${top_srcdir}/make/defines.mk
+
+INCLUDES+=-I. -I../lib
+CFLAGS +=-I${incdir} -I${top_srcdir}/config \
+         -Wall -Werror -Wstrict-prototypes -Wshadow -D_GNU_SOURCE -g
+
+TARGET=qdiskd mkqdisk
+
+all: ${TARGET}
+
+copytobin: all
+	cp ${TARGET} ${top_srcdir}/bin
+
+install: ${TARGET}
+	install -d ${sbindir}
+	install ${TARGET} ${sbindir}
+
+qdiskd: disk.o crc32.o disk_util.o main.o score.o bitmap.o clulog.o \
+	gettid.o proc.o ../lib/libcman.a
+	gcc -o $@ $^ -lpthread -L../lib -lccs
+
+mkqdisk: disk.o crc32.o disk_util.o \
+	 proc.o mkqdisk.o
+	gcc -o $@ $^ 
+
+
+%.o: %.c
+	$(CC) -c -o $@ $^ $(INCLUDES) $(CFLAGS)
+
+clean:
+	rm -f *.o ${TARGET}
+
+uninstall:
+	${UNINSTALL} ${TARGET} ${sbindir}
/cvs/cluster/cluster/cman/qdisk/README,v  -->  standard output
revision 1.4.2.1
--- cluster/cman/qdisk/README
+++ -	2006-07-21 18:01:39.324979000 +0000
@@ -0,0 +1,274 @@
+qdisk 1.0 - a disk-based quorum algorithm for Linux-Cluster
+
+(C) 2006 Red Hat, Inc.
+
+1. Overview
+
+1.1. Problem
+
+In some situations, it may be necessary or desirable to sustain
+a majority node failure of a cluster without introducing the need for
+asymmetric cluster configurations (client-server, or heavily-weighted voting nodes).
+
+1.2. Design Requirements
+
+* Ability to sustain 1..(n-1)/n simultaneous node failures, without the
+danger of a simple network partition causing a split brain.  That is, we
+need to be able to ensure that the majority failure case is not merely
+the result of a network partition.
+
+* Ability to use external reasons for deciding which partition is the
+quorate partition in a partitioned cluster.  For example, a user may
+have a service running on one node, and that node must always be the master
+in the event of a network partition.  Or, a node might lose all network
+connectivity except the cluster communication path - in which case, a
+user may wish that node to be evicted from the cluster.
+
+* Integration with CMAN.  We must not require CMAN to run with us (or
+without us).  Linux-Cluster does not require a quorum disk normally -
+introducing new requirements on the base of how Linux-Cluster operates
+is not allowed.
+
+* Data integrity.  In order to recover from a majority failure, fencing
+is required.  The fencing subsystem is already provided by Linux-Cluster.
+
+* Non-reliance on hardware or protocol specific methods (i.e. SCSI
+reservations).  This ensures the quorum disk algorithm can be used on the
+widest range of hardware configurations possible.
+
+* Little or no memory allocation after initialization.  In critical paths
+during failover, we do not want to have to worry about being killed during
+a memory pressure situation because we request a page fault, and the Linux
+OOM killer responds...
+
+
+1.3. Hardware Configuration Considerations
+
+1.3.1. Concurrent, Synchronous, Read/Write Access
+
+This daemon requires a shared block device with concurrent read/write
+access from all nodes in the cluster.  The shared block device can be
+a multi-port SCSI RAID array, a Fiber-Channel RAID SAN, a RAIDed iSCSI
+target, or even GNBD.  The quorum daemon uses O_DIRECT to write to the
+device.
+
+1.3.2. Bargain-basement JBODs need not apply
+
+There is a minimum performance requirement inherent when using disk-based
+cluster quorum algorithms, so design your cluster accordingly.  Using a
+cheap JBOD with old SCSI2 disks on a multi-initiator bus will cause 
+problems at the first load spike.  Plan your loads accordingly; a node's
+inability to write to the quorum disk in a timely manner will cause the
+cluster to evict the node.  Using host-RAID or multi-initiator parallel
+SCSI configurations with the qdisk daemon is unlikely to work, and will
+probably cause administrators a lot of frustration.  That having been
+said, because the timeouts are configurable, most hardware should work
+if the timeouts are set high enough.
+
+1.3.3. Fencing is Required
+
+In order to maintain data integrity under all failure scenarios, use of
+this quorum daemon requires adequate fencing, preferably power-based
+fencing.
+
+
+1.4. Limitations
+
+* At this time, this daemon only supports a maximum of 16 nodes.
+
+* Cluster node IDs must be statically configured in cluster.conf and
+must be numbered from 1..16 (there can be gaps, of course).
+
+* Cluster node votes should be more or less equal.
+
+* CMAN must be running before the qdisk program can start.  This
+limitation will be removed before a production release.
+
+* CMAN's eviction timeout should be at least 2x the quorum daemon's
+to give the quorum daemon adequate time to converge on a master during a
+failure + load spike situation.
+
+* The total number of votes assigned to the quorum device should be
+equal to or greater than the total number of node-votes in the cluster.
+While it is possible to assign only one (or a few) votes to the quorum
+device, the effects of doing so have not been explored.
+
+* Currently, the quorum disk daemon is difficult to use with CLVM if
+the quorum disk resides on a CLVM logical volume.  CLVM requires a
+quorate cluster to correctly operate, which introduces a chicken-and-egg
+problem for starting the cluster: CLVM needs quorum, but the quorum daemon
+needs CLVM (if and only if the quorum device lies on CLVM-managed storage).
+One way to work around this is to *not* set the cluster's expected votes
+to include the quorum daemon's votes.  Bring all nodes online, and start
+the quorum daemon *after* the whole cluster is running.  This will allow
+the expected votes to increase naturally.
+
+
+2. Algorithms
+
+2.1. Heartbeating & Liveliness Determination
+
+Nodes update individual status blocks on the quorum disk at a user-
+defined rate.  Each write of a status block alters the timestamp, which
+is what other nodes use to decide whether a node has hung or not.
+After a user-defined number of 'misses' (that is, failures to update a
+timestamp), a node is declared offline.  After a certain number of 'hits'
+(changed timestamp + "i am alive" state), the node is declared online.
+
+The status block contains additional information, such as a bitmask of
+the nodes that node believes are online.  Some of this information is
+used by the master - while some is just for performance recording, and
+may be used at a later time.  The most important pieces of information
+a node writes to its status block are:
+
+  - timestamp
+  - internal state (available / not available)
+  - score
+  - max score
+  - vote/bid messages
+  - other nodes it thinks are online
+
+
+2.2. Scoring & Heuristics
+
+The administrator can configure up to 10 purely arbitrary heuristics, and
+must exercise caution in doing so.  By default, only nodes scoring over
+1/2 of the total maximum score will claim they are available via the
+quorum disk, and a node (master or otherwise) whose score drops too low
+will remove itself (usually, by rebooting).
+
+The heuristics themselves can be any command executable by 'sh -c'.  For
+example, in early testing, I used this:
+
+    <heuristic program="[ -f /quorum ]" score="10" interval="2"/>
+
+This is a literal sh-ism which tests for the existence of a file called
+"/quorum".  Without that file, the node would claim it was unavailable.
+This is an awful example, and should never, ever be used in production,
+but is provided as an example as to what one could do...
+
+Typically, the heuristics should be snippets of shell code or commands which
+help determine a node's usefulness to the cluster or clients.  Ideally, you
+want to add traces for all of your network paths (e.g. check links, or
+ping routers), and methods to detect availability of shared storage.
+
+
+2.3. Master Election
+
+Only one master is present at any one time in the cluster, regardless of
+how many partitions exist within the cluster itself.  The master is
+elected by a simple voting scheme in which the lowest node which believes
+it is capable of running (i.e. scores high enough) bids for master status.
+If the other nodes agree, it becomes the master.  This algorithm is 
+run whenever no master is present.
+
+If another node comes online with a lower node ID while a node is still
+bidding for master status, it will rescind its bid and vote for the lower
+node ID.  If a master dies or a bidding node dies, the voting algorithm
+is started over.  The voting algorithm typically takes two passes to
+complete.
+
+Master deaths take marginally longer to recover from than non-master
+deaths, because a new master must be elected before the old master can
+be evicted & fenced.
+
+
+2.4. Master Duties
+
+The master node decides who is or is not in the master partition, as
+well as handles eviction of dead nodes (both via the quorum disk and via
+the linux-cluster fencing system by using the cman_kill_node() API).
+
+
+2.5. How it All Ties Together
+
+When a master is present, and if the master believes a node to be online,
+that node will advertise to CMAN that the quorum disk is available.  The
+master will only grant a node membership if:
+
+   (a) CMAN believes the node to be online, and
+   (b) that node has made enough consecutive, timely writes to the quorum
+       disk.
+
+
+3. Configuration
+
+3.1. The <quorumd> tag
+
+This tag is a child of the top-level <cluster> tag.
+
+   <quorumd
+    interval="1"          This is the frequency of read/write cycles
+    tko="10"              This is the number of cycles a node must miss
+                          in order to be declared dead.
+    votes="3"             This is the number of votes the quorum daemon
+                          advertises to CMAN when it has a high enough
+                          score.
+    log_level="4"         This controls the verbosity of the quorum daemon
+                          in the system logs. 0 = emergencies; 7 = debug
+    log_facility="local4" This controls the syslog facility used by the
+			  quorum daemon when logging.
+    status_file="/foo"    Write internal states out to this file
+			  periodically ("-" = use stdout).
+    min_score="3"	  Absolute minimum score to consider one's
+			  self "alive".  If omitted, or set to 0, the
+			  default function "floor((n+1)/2)" is used.
+    device="/dev/sda1"    This is the device the quorum daemon will use.
+			  This device must be the same on all nodes.
+    label="mylabel"/>     This overrides the device field if present.
+			  If specified, the quorum daemon will read
+			  /proc/partitions and check for qdisk signatures
+			  on every block device found, comparing the label
+			  against the specified label.  This is useful in
+			  configurations where the block device name
+			  differs on a per-node basis.
+
+
+3.2.  The <heuristic> tag
+
+This tag is a child of the <quorumd> tag.
+
+   <heuristic
+    program="/test.sh"    This is the program used to determine if this
+                          heuristic is alive.  This can be anything which
+                          may be executed by "/bin/sh -c".  A return value
+                          of zero indicates success.
+    score="1"             This is the weight of this heuristic.  Be careful
+                          when determining scores for heuristics.
+    interval="2"/>        This is the frequency at which we poll the
+                          heuristic.
+
+3.3. Example
+
+  <quorumd interval="1" tko="10" votes="3" device="/dev/gnbd/qdisk">
+    <heuristic program="ping routerA -c1 -t1" score="1" interval="2"/>
+    <heuristic program="ping routerB -c1 -t1" score="1" interval="2"/>
+    <heuristic program="ping routerC -c1 -t1" score="1" interval="2"/>
+  </quorumd>
+
+3.4. Heuristic score considerations
+
+* Heuristic timeouts should be set high enough to allow the previous run
+of a given heuristic to complete.
+
+* Heuristic scripts returning anything except 0 as their return code 
+are considered failed.
+
+* The worst-case for improperly configured quorum heuristics is a race
+to fence where two partitions simultaneously try to kill each other.
+
+3.5. Creating a quorum disk partition
+
+3.5.1. The mkqdisk utility.
+
+The mkqdisk utility can create and list currently configured quorum disks
+visible to the local node.
+
+  mkqdisk -L		List available quorum disks.
+
+  mkqdisk -f <label>	Find a quorum device by the given label.
+
+  mkqdisk -c <device> -l <label>
+			Initialize <device> and name it <label>.  This
+			will destroy all data on the device, so be careful
+			when running this command.
/cvs/cluster/cluster/cman/qdisk/bitmap.c,v  -->  standard output
revision 1.2.2.1
--- cluster/cman/qdisk/bitmap.c
+++ -	2006-07-21 18:01:39.411387000 +0000
@@ -0,0 +1,107 @@
+/*
+  Copyright Red Hat, Inc. 2002-2003, 2006
+
+  The Red Hat Cluster Manager API Library is free software; you can
+  redistribute it and/or modify it under the terms of the GNU Lesser
+  General Public License as published by the Free Software Foundation;
+  either version 2.1 of the License, or (at your option) any later
+  version.
+
+  The Red Hat Cluster Manager API Library is distributed in the hope
+  that it will be useful, but WITHOUT ANY WARRANTY; without even the
+  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+  PURPOSE.  See the GNU Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+  USA.
+ */
+/** @file
+ * Bitmap and membership mask handling routines.
+ */
+#include <stdint.h>
+
+
+/**
+ * Turn off a single bit inside a byte-array bitmap.
+ *
+ * @param mask		Byte array holding the bitmap.
+ * @param bitidx	Zero-based index of the bit to clear.
+ * @param masklen	Size of the bitmap, counted in uint8_t elements.
+ * @return		0 once the bit has been cleared, or -1 when bitidx
+ *			refers to a byte at or beyond masklen.
+ */
+int
+clear_bit(uint8_t *mask, uint32_t bitidx, uint32_t masklen)
+{
+	/* Byte that holds the bit, and the bit's mask within that byte. */
+	const uint32_t byte_no = bitidx / 8;
+	const uint32_t bit_mask = (uint32_t)1 << (bitidx % 8);
+
+	if (byte_no < masklen) {
+		mask[byte_no] &= ~bit_mask;
+		return 0;
+	}
+
+	return -1;
+}
+
+
+/**
+ * Turn on a single bit inside a byte-array bitmap.
+ *
+ * @param mask		Byte array holding the bitmap.
+ * @param bitidx	Zero-based index of the bit to set.
+ * @param masklen	Size of the bitmap, counted in uint8_t elements.
+ * @return		0 once the bit has been set, or -1 when bitidx
+ *			refers to a byte at or beyond masklen.
+ */
+int
+set_bit(uint8_t *mask, uint32_t bitidx, uint32_t masklen)
+{
+	/* Byte that holds the bit, and the bit's mask within that byte. */
+	const uint32_t byte_no = bitidx / 8;
+	const uint32_t bit_mask = (uint32_t)1 << (bitidx % 8);
+
+	if (byte_no < masklen) {
+		mask[byte_no] |= bit_mask;
+		return 0;
+	}
+
+	return -1;
+}
+
+
+/**
+ * Report whether a single bit of a byte-array bitmap is set.
+ *
+ * @param mask		Byte array holding the bitmap.
+ * @param bitidx	Zero-based index of the bit to test.
+ * @param masklen	Size of the bitmap, counted in uint8_t elements.
+ * @return		1 if the bit is set, 0 if it is clear, or -1 when
+ *			bitidx refers to a byte at or beyond masklen.
+ */
+int
+is_bit_set(uint8_t *mask, uint32_t bitidx, uint32_t masklen)
+{
+	/* Byte that holds the bit, and the bit's mask within that byte. */
+	const uint32_t byte_no = bitidx / 8;
+	const uint32_t bit_mask = (uint32_t)1 << (bitidx % 8);
+
+	if (byte_no >= masklen)
+		return -1;
+
+	return (mask[byte_no] & bit_mask) ? 1 : 0;
+}
+
+
/cvs/cluster/cluster/cman/qdisk/clulog.c,v  -->  standard output
revision 1.2.2.1
--- cluster/cman/qdisk/clulog.c
+++ -	2006-07-21 18:01:39.508895000 +0000
@@ -0,0 +1,296 @@
+/*
+  Copyright Red Hat, Inc. 2002
+  Copyright Mission Critical Linux, 2000
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the
+  Free Software Foundation; either version 2, or (at your option) any
+  later version.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; see the file COPYING.  If not, write to the
+  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
+  MA 02139, USA.
+*/
+/** @file
+ * Library routines for communicating with the logging daemon.
+ *
+ *  $Id: clulog.c,v 1.2.2.1 2006/07/21 18:01:38 lhh Exp $
+ *
+ *  Author: Jeff Moyer <moyer@missioncriticallinux.com>
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <malloc.h>
+#include <dirent.h>
+#include <signal.h>
+#include <sys/errno.h>
+#include <sys/types.h>
+#include <sys/file.h>
+#include <sys/socket.h>
+#include <ccs.h>
+#define SYSLOG_NAMES
+#include <sys/syslog.h>
+#undef SYSLOG_NAMES
+
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <linux/unistd.h>
+#include <pthread.h>
+#include <gettid.h>
+#include <clulog.h>
+#include <string.h>
+
+
+static const char *version __attribute__ ((unused)) = "$Revision: 1.2.2.1 $";
+
+#ifdef DEBUG
+#include <assert.h>
+#define Dprintf(fmt,args...) printf(fmt,##args)
+#define DBG_ASSERT(x)  assert(x)
+#else
+#define Dprintf(fmt,args...)
+#define DBG_ASSERT(x)
+#endif
+
+/*
+ * Globals
+ *
+ * File-local logging state.  do_clulog() and clu_set_facility() touch
+ * these while holding log_mutex.
+ */
+/* Non-zero once openlog() has been called; cleared to force a re-open. */
+static int   log_is_open = 0;
+/* Non-zero when every message should also be echoed to stdout. */
+static int   useconsole = 0;
+/* Messages whose severity is numerically greater than this are dropped. */
+static int   loglevel = LOGLEVEL_DFLT;
+/* Facility handed to openlog(). */
+static int   syslog_facility = LOG_DAEMON;
+/* Heap copy (strdup) of the last program name given to do_clulog(). */
+static char  *daemon_name = NULL;
+/* Thread id or caller-supplied pid used for the most recent message. */
+static pid_t daemon_pid = -1;
+/* Serializes access to the logging state above. */
+static pthread_mutex_t log_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/*
+ * Severity names, laid out so that do_clulog() can index the table
+ * directly with a syslog severity value.
+ */
+CODE logger_prioritynames[] = 
+{ {"emerg", LOG_EMERG},
+  {"alert", LOG_ALERT},
+  {"crit", LOG_CRIT},
+  {"err", LOG_ERR},
+  {"warning", LOG_WARNING},
+  {"notice", LOG_NOTICE},
+  {"info", LOG_INFO},
+  {"debug", LOG_DEBUG}
+};
+
+/*
+ *  Exported Functions.
+ */
+
+/**
+ * Report the log level currently in effect for this process.
+ *
+ * @return The current cluster log level.
+ */
+int
+clu_get_loglevel(void)
+{
+	int current_level = loglevel;
+
+	return current_level;
+}
+
+
+/**
+ * Set the cluster log level.
+ *
+ * Any non-negative value is accepted, including 0 (LOG_EMERG), which
+ * matches the contract documented in clulog.h: only a negative severity
+ * is an error.
+ *
+ * @param severity	New log level.
+ * @return 		Old log level, or -1 if 'severity' is negative (the
+ *			log level is left unchanged in that case).
+ */
+int
+clu_set_loglevel(int severity)
+{
+	int previous = loglevel;
+
+	if (severity < 0)
+		return -1;
+
+	loglevel = severity;
+	return previous;
+}
+
+
+/**
+ * Look up the name of the syslog facility currently in use.
+ *
+ * @return The facilitynames[] entry matching the current facility, or
+ *	   "local4" when no entry matches.
+ */
+char *
+clu_get_facility(void)
+{
+	char *name = "local4";
+	int i;
+
+	pthread_mutex_lock(&log_mutex);
+	for (i = 0; facilitynames[i].c_name != NULL; i++) {
+		if (facilitynames[i].c_val == syslog_facility) {
+			name = facilitynames[i].c_name;
+			break;
+		}
+	}
+	pthread_mutex_unlock(&log_mutex);
+
+	return name;
+}
+
+
+/**
+ * Select the syslog facility used for cluster log messages.
+ *
+ * The name is looked up in facilitynames[]; an unknown name leaves the
+ * facility untouched.  When the facility actually changes, the log is
+ * closed so that the next message re-opens it with the new facility.
+ *
+ * @param facilityname  New log facility (see /usr/include/sys/syslog.h).
+ * @return 		0
+ */
+int
+clu_set_facility(char *facilityname)
+{
+	int i;
+	int changed = 0;
+
+	pthread_mutex_lock(&log_mutex);
+
+	for (i = 0; facilitynames[i].c_name != NULL; i++) {
+		if (strcmp(facilityname, facilitynames[i].c_name) == 0) {
+			changed = (facilitynames[i].c_val != syslog_facility);
+			syslog_facility = facilitynames[i].c_val;
+			break;
+		}
+	}
+
+	if (changed) {
+		closelog();
+		log_is_open = 0;
+	}
+
+	pthread_mutex_unlock(&log_mutex);
+	return 0;
+}
+
+
+/**
+ * Switch echoing of log messages to the console on or off.  Does not
+ * work for daemons.
+ *
+ * @param onoff		0 = off, otherwise on.
+ * @return		The log-to-console state that was in effect before
+ *			this call.
+ */
+int
+clu_log_console(int onoff)
+{
+	const int previous = useconsole;
+
+	useconsole = (onoff != 0);
+
+	return previous;
+}
+
+
+/**
+ * Cluster logging function.  Formats a message, hands it to syslog and,
+ * if requested, also writes it to stdout.
+ *
+ * @param severity	syslog severity of the message (LOG_EMERG..LOG_DEBUG).
+ * @param write_to_cons	Non-zero to echo the message to stdout as well.
+ * @param pid		0 to log on behalf of the calling thread (LOG_PID
+ *			is requested from syslog); otherwise a pid that is
+ *			written into the message text as "[pid]: ".
+ * @param prog		Optional program name.  When non-NULL it is copied
+ *			and used as the ident for the next openlog().
+ * @param fmt		printf-style format string, followed by its
+ *			arguments.
+ * @return		0 if the message was logged or was filtered out by
+ *			the current log level; -1 if 'severity' is not a
+ *			valid syslog severity.
+ */
+int
+do_clulog(int        severity,
+	  int        write_to_cons,
+	  pid_t      pid,
+	  char       *prog,
+	  const char *fmt, ...)
+{
+	va_list      args;
+	char         logmsg[MAX_LOGMSG_LEN];	/* message to go to the log */
+	char         printmsg[MAX_LOGMSG_LEN];	/* message to go to stdout */
+	int          syslog_flags = LOG_NDELAY;
+	size_t       used;
+
+	pthread_mutex_lock(&log_mutex);
+	if (severity > loglevel) {
+		pthread_mutex_unlock(&log_mutex);
+		return 0;
+	}
+
+	/*
+	 * The severity indexes logger_prioritynames[] below; refuse
+	 * anything that would read outside of that table.
+	 */
+	if (severity < LOG_EMERG || severity > LOG_DEBUG) {
+		pthread_mutex_unlock(&log_mutex);
+		return -1;
+	}
+
+	memset(logmsg, 0, MAX_LOGMSG_LEN);
+	memset(printmsg, 0, MAX_LOGMSG_LEN);
+
+	/*
+	 * Check to see if the caller has forked.
+	 */
+	if (!pid) {
+
+		/* Use thread IDs */
+		if (daemon_pid != gettid()) {
+
+			daemon_pid = gettid();
+			log_is_open = 0;
+		}
+
+		syslog_flags |= LOG_PID;
+
+	} else {
+
+		/* Explicit pid: put it in the message text ourselves. */
+		daemon_pid = pid;
+		closelog();
+		log_is_open = 0;
+		snprintf(logmsg, MAX_LOGMSG_LEN, "[%d]: ", pid);
+	}
+
+	if (prog) {
+
+		if (daemon_name) {
+
+			free(daemon_name);
+			daemon_name = NULL;
+		}
+
+		daemon_name = strdup(prog);
+	}
+
+	if (!log_is_open) {
+
+		openlog(daemon_name, syslog_flags, syslog_facility);
+		log_is_open = 1;
+	}
+	/*
+	 * Note: This can be called in the context of a CGI program, in which
+	 * case anything printed to stdout goes to the web page.  This can
+	 * cause problems if we have our standard <warning> strings b/c
+	 * the web client will try to interpret this as an html tag.
+	 */
+	used = strlen(logmsg);
+	snprintf(logmsg + used, MAX_LOGMSG_LEN - used,
+		 "<%s> ", logger_prioritynames[severity].c_name);
+
+	used = strlen(logmsg);
+	va_start(args, fmt);
+	vsnprintf(logmsg + used, MAX_LOGMSG_LEN - used, fmt, args);
+	va_end(args);
+
+	if (write_to_cons || useconsole) {
+		snprintf(printmsg, MAX_LOGMSG_LEN, "[%d] %s: ", daemon_pid,
+			 logger_prioritynames[severity].c_name);
+
+		used = strlen(printmsg);
+		va_start(args, fmt);
+		vsnprintf(printmsg + used, MAX_LOGMSG_LEN - used, fmt, args);
+		va_end(args);
+
+		fprintf(stdout, "%s", printmsg);
+	}
+
+	/*
+	 * logmsg already contains the caller's expanded text; pass it as
+	 * an argument, never as the format string, so that any '%' in the
+	 * message is logged literally instead of being interpreted.
+	 */
+	syslog(severity, "%s", logmsg);
+
+	pthread_mutex_unlock(&log_mutex);
+
+	return 0;
+}
+
+
+/**
+ * Stop the cluster logging facility.
+ *
+ * Closes the syslog connection and clears the "log is open" flag under
+ * log_mutex, so that a later do_clulog() call re-opens the log with the
+ * stored ident and facility instead of assuming it is still open.
+ */
+void
+clulog_close(void)
+{
+	pthread_mutex_lock(&log_mutex);
+	closelog();
+	log_is_open = 0;
+	pthread_mutex_unlock(&log_mutex);
+}
/cvs/cluster/cluster/cman/qdisk/clulog.h,v  -->  standard output
revision 1.2.2.1
--- cluster/cman/qdisk/clulog.h
+++ -	2006-07-21 18:01:39.595061000 +0000
@@ -0,0 +1,161 @@
+/*
+  Copyright Red Hat, Inc. 2002
+  Copyright Mission Critical Linux, 2000
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the
+  Free Software Foundation; either version 2, or (at your option) any
+  later version.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; see the file COPYING.  If not, write to the
+  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
+  MA 02139, USA.
+*/
+/** @file
+ * Header for clulog.c
+ */
+/*
+ *  author: Jeff Moyer <moyer@missioncriticallinux.com>
+ */
+
+#ifndef __CLUSTER_LOG_H
+#define __CLUSTER_LOG_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <syslog.h>
+#include <sys/types.h>
+
+#define LOGLEVEL_DFLT         LOG_INFO
+#define MAX_LOGMSG_LEN        512
+
+/*
+ * int clu_set_loglevel(int severity)
+ *
+ * DESCRIPTION
+ *   Set the logging level for this daemon.  This is not a 
+ *   system-wide setting.
+ *
+ * ARGUMENTS
+ *   severity  Severity as documented in sys/syslog.h (i.e. LOG_ERR)
+ *
+ * RETURN VALUES
+ *   On success, the previous loglevel is returned.  On error -1 is returned.
+ *
+ * NOTES
+ *   The only way of generating errors for this call is to give a negative
+ *   value for severity.  Currently, syslog lists severities up to 8, but
+ *   I see no reason for this restriction if, in the future, we decided to
+ *   add more levels.  Thus, any number up to MAXINT will be supported.
+ */
+int clu_set_loglevel(int severity);
+/*
+ * Select the syslog facility by name (see sys/syslog.h).  An unknown
+ * name leaves the facility unchanged.  Always returns 0.
+ */
+int clu_set_facility(char *facility);
+/*
+ * Turn echoing of log messages to stdout on (non-zero) or off (0).
+ * Returns the previous setting.
+ */
+int clu_log_console(int onoff);
+
+/*
+ * int clu_get_loglevel(void)
+ *
+ * DESCRIPTION
+ *   Get the current logging level.
+ *
+ * ARGUMENTS
+ *   none
+ *
+ * RETURN VALUES
+ *   The current logging level is returned.
+ */
+int clu_get_loglevel(void);
+
+/*
+ * DESCRIPTION
+ *   Cluster logging facility.  This is the actual function that does the
+ *   logging.  No one should call this, you should call the wrappers provided.
+ *   i.e. clulog and clulog_and_print.
+ */
+int do_clulog(int severity, int write_to_cons, pid_t pid,
+	      char *prog, const char *fmt, ...);
+/*
+ * int clulog(int severity, const char *fmt, ...)
+ *
+ * DESCRIPTION
+ *   Cluster logging facility.  This is a library routine which sends the 
+ *   supplied parameters to the syslog daemon.  If the supplied severity is 
+ *   numerically larger than the current loglevel, the message is never sent 
+ *   to the log.
+ *
+ * ARGUMENTS
+ *   severity  Severity as documented in sys/syslog.h (i.e. LOG_ERR)
+ *   fmt       Format string as used with printf.
+ *
+ * RETURN VALUES
+ *   On success, 0 is returned.  On error, -1 is returned.
+ *
+ * NOTES
+ *   Inability to contact the logging daemon is the only source of error
+ *   for this function.  Thus, it would behoove you to try a clulog before
+ *   daemonizing your process.  If it fails, print a message to stderr
+ *   explaining that the cluster logging daemon should probably be started.
+ *   If you really want your message to be heard by someone, use
+ *   clulog_and_print().
+ */
+#define clulog(x,fmt,args...)              do_clulog(x,0,0,NULL,fmt,##args)
+#define clulog_pid(x,pid,prog,fmt,args...) do_clulog(x,0,pid,prog,fmt,##args)
+
+/*
+ * int clulog_and_print(int severity, int write_to_cons, const char *fmt, ...)
+ *
+ * DESCRIPTION
+ *   Cluster logging facility.  This is a library routine which sends the 
+ *   supplied parameters to the syslog daemon.  If the supplied severity is 
+ *   numerically larger than the current loglevel, the message is never sent 
+ *   to the log.  This version also prints the given message to the terminal.
+ *
+ * ARGUMENTS
+ *   severity       Severity as documented in sys/syslog.h (i.e. LOG_ERR)
+ *   fmt            Format string as used with printf.
+ *
+ * RETURN VALUES
+ *   On success, 0 is returned.  On error, -1 is returned.
+ */
+#define clulog_and_print(x,fmt,args...)   do_clulog(x,1,0,NULL,fmt,##args)
+
+
+/*
+ * void clulog_close(void)
+ *
+ * DESCRIPTION
+ *   This is an optional call to close the logfile.  This translates into a
+ *   closelog() call.
+ *
+ * ARGUMENTS
+ *   none
+ *
+ * RETURN VALUES
+ *   This function does not return anything.
+ */
+void clulog_close(void);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif				/* __CLUSTER_LOG_H */
+/*
+ * Local variables:
+ *  c-basic-offset: 8
+ *  c-indent-level: 8
+ *  tab-width: 8
+ * End:
+ */
/cvs/cluster/cluster/cman/qdisk/crc32.c,v  -->  standard output
revision 1.2.2.1
--- cluster/cman/qdisk/crc32.c
+++ -	2006-07-21 18:01:39.679326000 +0000
@@ -0,0 +1,125 @@
+/*
+ * Copyright (C) 2000 Bryan Call <bc@fodder.org>
+ *
+ * Modified by Lon H. Hohberger <lhh@redhat.com>
+ * Copyright (C) 2003 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+/** @file
+ * Calculates CRC32s on data.
+ */
+
+#include <stdint.h>
+#include <sys/types.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+static const unsigned long crctable[256] = {
+  0x00000000, 0x77073096, 0xee0e612c, 0x990951ba,
+  0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
+  0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
+  0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91,
+  0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
+  0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
+  0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec,
+  0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5,
+  0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
+  0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
+  0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940,
+  0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
+  0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116,
+  0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f,
+  0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
+  0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,
+  0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a,
+  0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
+  0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818,
+  0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
+  0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
+  0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457,
+  0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c,
+  0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
+  0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
+  0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb,
+  0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
+  0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9,
+  0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086,
+  0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
+  0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4,
+  0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad,
+  0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
+  0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683,
+  0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
+  0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
+  0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe,
+  0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7,
+  0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
+  0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
+  0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252,
+  0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
+  0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60,
+  0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79,
+  0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
+  0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f,
+  0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04,
+  0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
+  0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a,
+  0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
+  0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
+  0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21,
+  0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e,
+  0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
+  0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
+  0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45,
+  0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
+  0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db,
+  0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0,
+  0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
+  0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6,
+  0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf,
+  0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
+  0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d,
+};
+
+
+/**
+ * Calculate CRC32 of a data set.
+ *
+ * The running CRC starts at all-ones, is updated one byte at a time
+ * through crctable[], and is inverted before being returned.  An empty
+ * data set therefore yields 0.
+ *
+ * @param data		Data set for building CRC32
+ * @param count		Size of data set, in bytes.
+ * @return 		CRC32 of data set.
+ */
+uint32_t clu_crc32(const char *data, size_t count)
+{
+	size_t x;	/* same width as count, so the loop always ends */
+	uint32_t crc = (uint32_t)~0;
+
+	for (x = 0; x < count; x++)
+		crc = (crc >> 8) ^
+		      crctable[(crc ^ (unsigned char)data[x]) & 0xff];
+
+	/* Inverting an all-ones CRC gives 0; no special case is needed. */
+	return ~crc;
+}
+
+#if 0
+/*
+ * Disabled command-line driver: prints the CRC32 of argv[1] in hex.
+ * NOTE(review): it calls crc32(), but the routine defined in this file
+ * is clu_crc32(); the call needs updating before this is enabled.
+ */
+int
+main(int argc, const char **argv)
+{
+	printf("%08x\n",crc32(argv[1],strlen(argv[1])));
+}
+#endif
/cvs/cluster/cluster/cman/qdisk/disk.c,v  -->  standard output
revision 1.4.2.1
--- cluster/cman/qdisk/disk.c
+++ -	2006-07-21 18:01:39.760295000 +0000
@@ -0,0 +1,758 @@
+/*
+  Copyright Red Hat, Inc. 2002-2003, 2006
+  Copyright Mission Critical Linux, 2000
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the
+  Free Software Foundation; either version 2, or (at your option) any
+  later version.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; see the file COPYING.  If not, write to the
+  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
+  MA 02139, USA.
+*/
+/** @file
+ * Single-block Raw/Direct I/O Functions
+ */
+/*
+ *  author: Tim Burke <tburke@redhat.com>
+ *  description: Raw IO Interfaces.
+ *
+ * The RAW IO code we are using from 2.2.13 requires user buffers and
+ * disk offsets to be 512 byte aligned.  So this code consists of a 
+ * read and write routine which check to see if the user buffer is 
+ * aligned.  If it isn't a temporary aligned buffer is allocated, a data
+ * copy is performed along with the IO operation itself.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include <errno.h>
+#include <disk.h>
+#include <platform.h>
+#include <unistd.h>
+#include <time.h>
+
+static int diskRawRead(int fd, char *buf, int len);
+uint32_t clu_crc32(const char *data, size_t count);
+
+
+/**
+ * Convert a shared header to its on-disk (big-endian) byte order.
+ *
+ * If the magic field already equals the byte-swapped magic constant,
+ * the header is taken to be encoded already and is left untouched.
+ * swab32()/swab64() are not defined in this file; presumably they swap
+ * the named field in place (see platform.h).
+ *
+ * @param hdr		Header to encode.
+ */
+static void
+header_encode(shared_header_t *hdr)
+{
+	if (hdr->h_magic != be_swap32(SHARED_HEADER_MAGIC)) {
+		swab32(hdr->h_magic);
+		swab32(hdr->h_hcrc);
+		swab32(hdr->h_dcrc);
+		swab32(hdr->h_length);
+		swab64(hdr->h_view);
+		swab64(hdr->h_timestamp);
+	}
+}
+
+
+/**
+ * Convert a shared header read from disk back to host byte order.
+ *
+ * If the magic field already equals the magic constant, the header is
+ * taken to be decoded already and is left untouched.  The field-by-field
+ * work is the same as in header_encode.
+ *
+ * @param hdr		Header to decode.
+ */
+static void
+header_decode(shared_header_t *hdr)
+{
+	if (hdr->h_magic != SHARED_HEADER_MAGIC) {
+		swab32(hdr->h_magic);
+		swab32(hdr->h_hcrc);
+		swab32(hdr->h_dcrc);
+		swab32(hdr->h_length);
+		swab64(hdr->h_view);
+		swab64(hdr->h_timestamp);
+	}
+}
+
+
+/**
+ * Fill in a shared header describing a block of data.
+ *
+ * The header is zeroed, then given the magic number, the data CRC and
+ * length (only when data is supplied), and a timestamp.  The header CRC
+ * is computed last, over the whole header while its own CRC field is
+ * still zero, so it also covers the data CRC.  The finished header is
+ * passed through header_encode().
+ *
+ * @param hdr		Preallocated pointer to shared_header_t structure.
+ * @param data		Data to be stored with hdr.
+ * @param count		Size of data.
+ * @return		-1 if either CRC32 comes out as 0 (treated as
+ *			invalid), or 0 on success.
+ */
+static int
+header_generate(shared_header_t *hdr, const char *data, size_t count)
+{
+	memset(hdr, 0, sizeof(*hdr));
+	hdr->h_magic = SHARED_HEADER_MAGIC;
+
+	if (data != NULL && count != 0) {
+		hdr->h_dcrc = clu_crc32(data, count);
+		hdr->h_length = (uint32_t)count;
+
+		if (!hdr->h_dcrc) {
+			fprintf(stderr, "Invalid CRC32 generated on data!\n");
+			return -1;
+		}
+	}
+
+	hdr->h_timestamp = (uint64_t)time(NULL);
+
+	/* h_hcrc is still zero from the memset while this is computed. */
+	hdr->h_hcrc = clu_crc32((char *)hdr, sizeof(*hdr));
+	if (!hdr->h_hcrc) {
+		fprintf(stderr, "Invalid CRC32 generated on header!\n");
+		return -1;
+	}
+
+	header_encode(hdr);
+	return 0;
+}
+
+
+/**
+ * Check a shared header, and optionally its data, for consistency.
+ *
+ * The header is decoded first.  Its CRC is then recomputed with the CRC
+ * field temporarily zeroed and compared with the stored value, and the
+ * magic number is checked.  The data CRC is only checked when data is
+ * supplied and count is at least the length recorded in the header;
+ * otherwise the function returns success after the header checks.
+ *
+ * @param hdr		Header to verify (decoded in place).
+ * @param data		Data that accompanies hdr; may be NULL.
+ * @param count		Number of bytes available in data.
+ * @return		-1 on a header CRC, magic or data CRC mismatch,
+ *			or 0 on success.
+ */
+static int
+header_verify(shared_header_t *hdr, const char *data, size_t count)
+{
+	uint32_t stored_crc;
+	uint32_t actual_crc;
+	size_t span;
+
+	header_decode(hdr);
+
+	/*
+	 * The header CRC was generated with h_hcrc == 0, so zero the
+	 * field while recomputing and put the stored value back after.
+	 */
+	stored_crc = hdr->h_hcrc;
+	hdr->h_hcrc = 0;
+	actual_crc = clu_crc32((char *)hdr, sizeof(*hdr));
+	hdr->h_hcrc = stored_crc;
+
+	if (actual_crc != stored_crc) {
+#if 0
+		fprintf(stderr, "Header CRC32 mismatch; Exp: 0x%08x "
+			"Got: 0x%08x\n", stored_crc, actual_crc);
+#endif
+		return -1;
+	}
+
+	if (hdr->h_magic != SHARED_HEADER_MAGIC) {
+#if 0
+		fprintf(stderr, "Magic mismatch; Exp: 0x%08x "
+			"Got: 0x%08x\n", SHARED_HEADER_MAGIC, hdr->h_magic);
+#endif
+		return -1;
+	}
+
+	/* Nothing to check, or not enough data to check: stop here. */
+	if (data == NULL || count == 0 || count < hdr->h_length)
+		return 0;
+
+	span = (count > hdr->h_length) ? hdr->h_length : count;
+	actual_crc = clu_crc32(data, span);
+
+	if (actual_crc != hdr->h_dcrc) {
+#if 0
+		fprintf(stderr, "Data CRC32 mismatch; Exp: 0x%08x "
+			"Got: 0x%08x\n", hdr->h_dcrc, actual_crc);
+#endif
+		return -1;
+	}
+
+	return 0;
+}
+
+
+
+/*
+ * qdisk_open
+ * Called to open the shared state partition with appropriate mode.
+ *
+ * The device is opened read/write with O_SYNC | O_DIRECT, a seek to
+ * END_OF_DISK is used to check that the partition is large enough, and
+ * the close-on-exec flag is set on the descriptor.
+ *
+ * Returns - (the file descriptor), a value >= 0 on success.  On failure
+ * a negative value is returned, the descriptor (if one was opened) is
+ * closed, and errno describes the error.
+ */
+int
+qdisk_open(char *name)
+{
+	int fd;
+	int flags;
+	int saved_errno;
+	off_t pos;	/* off_t, so large offsets are not truncated */
+
+	/*
+	 * Open for synchronous writes to insure all writes go directly
+	 * to disk.
+	 */
+	fd = open(name, O_RDWR | O_SYNC | O_DIRECT);
+	if (fd < 0)
+		return fd;
+
+	/* Check to verify that the partition is large enough.*/
+	pos = lseek(fd, END_OF_DISK, SEEK_SET);
+	if (pos < 0) {
+		perror("open_partition: seek");
+		goto fail;
+	}
+
+	if (pos < END_OF_DISK) {
+		fprintf(stderr, "Partition %s too small\n", name);
+		errno = EINVAL;
+		goto fail;
+	}
+
+	/* Set close-on-exec bit */
+	flags = fcntl(fd, F_GETFD, 0);
+	if (flags < 0)
+		goto fail;
+
+	if (fcntl(fd, F_SETFD, flags | FD_CLOEXEC) < 0) {
+		perror("open_partition: fcntl");
+		goto fail;
+	}
+
+	return fd;
+
+fail:
+	/* Do not leak the descriptor; keep errno from the failing call. */
+	saved_errno = errno;
+	close(fd);
+	errno = saved_errno;
+	return -1;
+}
+
+
+/*
+ * qdisk_close
+ * Closes the shared state disk partition and marks the caller's
+ * descriptor as invalid (-1).
+ * Returns - value from close syscall, or -1 with errno set to EINVAL
+ * when fd is NULL or does not hold an open descriptor.
+ */
+int
+qdisk_close(int *fd)
+{
+	if (fd != NULL && *fd >= 0) {
+		int rc = close(*fd);
+
+		*fd = -1;
+		return rc;
+	}
+
+	errno = EINVAL;
+	return -1;
+}
+
+/*
+ * qdisk_validate
+ * Called to verify that the specified device special file representing
+ * the partition appears to be a valid device: it must be stat-able and
+ * must open successfully through qdisk_open().  A file that is neither
+ * a block nor a character device only produces a warning.
+ * Returns: 0 - success, -1 - failure
+ */
+int
+qdisk_validate(char *name)
+{
+	struct stat sb;
+	int fd;
+
+	if (stat(name, &sb) < 0) {
+		perror("stat");
+		return -1;
+	}
+
+	/* Not a device node: warn, but carry on rather than fail. */
+	if (!S_ISCHR(sb.st_mode) && !S_ISBLK(sb.st_mode))
+		fprintf(stderr, "Warning: %s is not a block device\n",
+		        name);
+
+	/* Verify read/write permission by actually opening the device. */
+	fd = qdisk_open(name);
+	if (fd >= 0) {
+		qdisk_close(&fd);
+		return 0;
+	}
+
+	fprintf(stderr, "%s: open of %s for RDWR failed: %s\n",
+		__FUNCTION__, name, strerror(errno));
+	return -1;
+}
+
+
+/*
+ * diskRawReadShadow
+ * Seek to readOffset, read len bytes (header followed by data) into buf
+ * and verify the block.
+ *
+ * The header is byte-swapped with swab_shared_header_t() and checked
+ * with header_verify().  A header whose recorded data length does not
+ * fit in the bytes that follow it in buf is rejected.
+ *
+ * Returns 0 on success.  Returns -1 with errno set to EINVAL when len
+ * is too small to hold a header, ENODATA when the seek or read fails,
+ * or EPROTO when the block fails verification.
+ */
+static int
+diskRawReadShadow(int fd, off_t readOffset, char *buf, int len)
+{
+	off_t pos;	/* off_t, so large offsets are not truncated */
+	int ret;
+	shared_header_t *hdrp;
+	char *data;
+	size_t datalen;
+
+	if (len < (int)sizeof(*hdrp)) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	pos = lseek(fd, readOffset, SEEK_SET);
+	if (pos != readOffset) {
+#if 0
+		fprintf(stderr,
+		       "diskRawReadShadow: can't seek to offset %d.\n",
+		       (int) readOffset);
+#endif
+		errno = ENODATA;
+		return -1;
+	}
+
+	ret = diskRawRead(fd, buf, len);
+	if (ret != len) {
+#if 0
+		fprintf(stderr, "diskRawReadShadow: aligned read "
+		       "returned %d, not %d.\n", ret, len);
+#endif
+		errno = ENODATA;
+		return -1;
+	}
+
+	/* Decode the header portion so we can run a checksum on it. */
+	hdrp = (shared_header_t *)buf;
+	data = buf + sizeof(*hdrp);
+	/* Only the bytes after the header are available as data. */
+	datalen = (size_t)len - sizeof(*hdrp);
+	swab_shared_header_t(hdrp);
+
+	if (header_verify(hdrp, data, datalen)) {
+#if 0
+		fprintf(stderr, "diskRawReadShadow: bad CRC32, "
+		       "fd = %d offset = %d len = %d\n", fd,
+		       (int) readOffset, len);
+#endif
+		errno = EPROTO;
+		return -1;
+	}
+
+	/*
+	 * header_verify() skips the data CRC when the recorded length is
+	 * larger than the data it was given; such a block cannot be valid.
+	 */
+	if (hdrp->h_length > datalen) {
+		errno = EPROTO;
+		return -1;
+	}
+
+	return 0;
+}
+
+
+/*
+ * The RAW IO implementation requires aligned buffers and lengths that
+ * are a multiple of 512.  Here we check for alignment and do a bounceio
+ * if necessary.
+ *
+ * NOTE(review): the alignment test masks with 0x3ff, i.e. it only takes
+ * the direct path for 1024-byte aligned buffers, which is stricter than
+ * the 512 bytes mentioned above - confirm which is intended.
+ *
+ * Bounce reads are limited to 512 bytes.
+ *
+ * Returns the number of bytes placed in buf (at most len), or a value
+ * <= 0 as returned by read().  Returns -1 for a negative len, for an
+ * unaligned request larger than 512 bytes, or when the bounce buffer
+ * cannot be allocated.
+ */
+static int
+diskRawRead(int fd, char *buf, int len)
+{
+	void *bounce = NULL;
+	int rc;
+	int readlen;
+	int readret;
+
+	if (len < 0) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	if ((((unsigned long) buf & (unsigned long) 0x3ff) == 0) &&
+	    ((len % 512) == 0)) {
+		/* Already aligned and even multiple of 512, no bounceio
+		 * required. */
+		return (read(fd, buf, len));
+	}
+
+	if (len > 512) {
+		fprintf(stderr,
+			"diskRawRead: not setup for reads larger than %d.\n",
+		       512);
+		return (-1);
+	}
+
+	/*
+	 * All IOs must be of size which is a multiple of 512.  Here we
+	 * just add in enough extra to accommodate.
+	 * XXX - if the on-disk offsets don't provide enough room we're cooked!
+	 */
+	readlen = len;
+	if (len % 512)
+		readlen += 512 - (len % 512);
+
+	/* posix_memalign() returns 0 or a positive errno value. */
+	rc = posix_memalign(&bounce, 512, 512);
+	if (rc != 0) {
+		errno = rc;
+		return -1;
+	}
+
+	readret = read(fd, bounce, readlen);
+	if (readret > 0) {
+		/* Never hand back more than the caller asked for. */
+		if (readret > len)
+			readret = len;
+		memcpy(buf, bounce, readret);
+	}
+
+	free(bounce);
+	if (readret != len) {
+		fprintf(stderr, "diskRawRead: read err, len=%d, readret=%d\n",
+			len, readret);
+	}
+
+	return (readret);
+}
+
+
+/*
+ * The RAW IO implementation requires aligned buffers and lengths that
+ * are a multiple of 512.  Here we check for alignment and do a bounceio
+ * if necessary.
+ *
+ * NOTE(review): the alignment test masks with 0x3ff, i.e. it only takes
+ * the direct path for 1024-byte aligned buffers, which is stricter than
+ * the 512 bytes mentioned above - confirm which is intended.
+ *
+ * Bounce writes are limited to 512 bytes; the part of the 512-byte
+ * block beyond len is written as zeroes.
+ *
+ * Returns the number of caller bytes written (at most len), or the
+ * value returned by write().  Returns -1 for a negative len, for an
+ * unaligned request larger than 512 bytes, or when the bounce buffer
+ * cannot be allocated.
+ */
+static int
+diskRawWrite(int fd, char *buf, int len)
+{
+	void *bounce = NULL;
+	int rc;
+	int ret;
+	int writelen;
+
+	if (len < 0) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	if ((((unsigned long) buf & (unsigned long) 0x3ff) == 0) &&
+	    ((len % 512) == 0)) {
+		/* Already aligned and even multiple of 512, no bounceio
+		 * required. */
+		return (write(fd, buf, len));
+	}
+
+	if (len > 512) {
+		fprintf(stderr,
+		       "diskRawWrite: not setup for larger than %d.\n",
+		       512);
+		return (-1);
+	}
+
+	/*
+	 * All IOs must be of size which is a multiple of 512.  Here we
+	 * just add in enough extra to accommodate.
+	 * XXX - if the on-disk offsets don't provide enough room we're cooked!
+	 */
+	writelen = len;
+	if (len % 512)
+		writelen += 512 - (len % 512);
+
+	/* posix_memalign() returns 0 or a positive errno value. */
+	rc = posix_memalign(&bounce, 512, 512);
+	if (rc != 0) {
+		errno = rc;
+		return (-1);
+	}
+
+	/* Zero the block so the padding never carries stale heap data. */
+	memset(bounce, 0, 512);
+	memcpy(bounce, buf, len);
+
+	ret = write(fd, bounce, writelen);
+	if (ret > len) {
+		/* Report only the caller's bytes, not the padding. */
+		ret = len;
+	}
+
+	free(bounce);
+	if (ret != len) {
+		fprintf(stderr, "diskRawWrite: write err, len=%d, ret=%d\n",
+		       len, ret);
+	}
+
+	return (ret);
+}
+
+
+/*
+ * diskRawWriteShadow
+ * Seek to writeOffset and write len bytes from buf via diskRawWrite().
+ * Returns 0 when all len bytes were written, -1 on a negative offset or
+ * length, a failed seek, or a short/failed write (a message is printed
+ * to stderr in each case).
+ */
+static int
+diskRawWriteShadow(int fd, __off64_t writeOffset, char *buf, int len)
+{
+	off_t where;
+	ssize_t written;
+
+	if (writeOffset < 0 || len < 0) {
+		fprintf(stderr,
+		       "diskRawWriteShadow: writeOffset=%08x, "
+		       "len=%08x.\n", (int)writeOffset, len);
+		return -1;
+	}
+
+	where = lseek(fd, writeOffset, SEEK_SET);
+	if (where != writeOffset) {
+		fprintf(stderr,
+		       "diskRawWriteShadow: can't seek to offset %d\n",
+		       (int) writeOffset);
+		return -1;
+	}
+
+	written = diskRawWrite(fd, buf, len);
+	if (written == len)
+		return 0;
+
+	/* Short or failed write: report errno first when there is one. */
+	if (written == -1)
+		fprintf(stderr, "%s: %s\n", __FUNCTION__,
+		       strerror(errno));
+
+	fprintf(stderr,
+	       "diskRawWriteShadow: aligned write returned %d"
+	       ", not %d\n", (int)written, (int)len);
+	return -1;
+}
+
+
+/*
+ * qdisk_read
+ * Read one shared block (header + data) from the given offset and copy
+ * its data portion into buf.
+ *
+ * At most count bytes are copied.  If the block holds fewer than count
+ * bytes of data, the remainder of buf is zero-filled.
+ *
+ * Returns count on success.  Returns -1 when buf is NULL or count is
+ * negative (errno = EINVAL), when the bounce buffer cannot be
+ * allocated, or when diskRawReadShadow() fails (errno as set there).
+ */
+int
+qdisk_read(int fd, __off64_t offset, void *buf, int count)
+{
+	shared_header_t *hdrp;
+	void *block = NULL;
+	char *data;
+	size_t total;
+	size_t ncopy;
+	int rv;
+	int saved_errno;
+
+	if (buf == NULL || count < 0) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	/*
+	 * Calculate the total length of the buffer, including the header.
+	 * Raw blocks are 512 byte aligned, so round up to a multiple of
+	 * 512 (total is never 0, so the result is at least 512).
+	 */
+	total = (size_t)count + sizeof(shared_header_t);
+	total = ((total + 511) / 512) * 512;
+
+	/* posix_memalign() returns 0 or a positive errno value. */
+	rv = posix_memalign(&block, sysconf(_SC_PAGESIZE), total);
+	if (rv != 0 || block == NULL) {
+		errno = rv ? rv : ENOMEM;
+		return -1;
+	}
+
+	hdrp = block;
+	data = (char *)block + sizeof(shared_header_t);
+
+	rv = diskRawReadShadow(fd, offset, (char *)hdrp, total);
+	if (rv == -1) {
+		/* Release the block, but keep errno for the caller. */
+		saved_errno = errno;
+		free(block);
+		errno = saved_errno;
+		return -1;
+	}
+
+	/* Copy out the data, never more than the caller's buffer holds. */
+	ncopy = hdrp->h_length;
+	if (ncopy > (size_t)count)
+		ncopy = (size_t)count;
+	memcpy(buf, data, ncopy);
+
+	/* Zero out the remainder. */
+	if (ncopy < (size_t)count)
+		memset((char *)buf + ncopy, 0, (size_t)count - ncopy);
+
+	free(block);
+	return count;
+}
+
+
+/**
+ * Write one headered 512 byte block to the quorum disk.
+ *
+ * The payload is prefixed with a shared_header_t (built by
+ * header_generate()) and the result must fit in a single 512 byte block,
+ * so count may be at most 512 - sizeof(shared_header_t).
+ *
+ * Locking must be performed elsewhere.  We make no assumptions about
+ * locking here.
+ *
+ * @param fd		Descriptor of the quorum partition.
+ * @param offset	Byte offset of the block on disk.
+ * @param buf		Payload to store.
+ * @param count		Payload length in bytes.
+ * @return		count on success; -1 on failure (errno is ENOSPC
+ *			when the payload does not fit in one block).
+ */
+int
+qdisk_write(int fd, __off64_t offset, const void *buf, int count)
+{
+	const size_t psz = 512;	/* raw block size; was sysconf(_SC_PAGESIZE) */
+	const size_t maxsize = psz - sizeof(shared_header_t);
+	shared_header_t *hdrp = NULL;
+	char *data;
+	int rv;
+
+	/*
+	 * Header + payload must fit in one block.  Anything larger used to
+	 * be silently dropped while still reporting success.
+	 */
+	if (count < 0 || (size_t)count > maxsize) {
+		printf("error: count %d > %d\n", (int)count, (int)maxsize);
+		errno = ENOSPC;
+		return -1;
+	}
+
+	/* posix_memalign() returns 0 or a positive errno value, never < 0 */
+	rv = posix_memalign((void **)&hdrp, sysconf(_SC_PAGESIZE), psz);
+	if (rv != 0) {
+		errno = rv;
+		perror("posix_memalign");
+		return -1;
+	}
+
+	/* Do not leak uninitialized heap contents into the on-disk padding */
+	memset(hdrp, 0, psz);
+
+	/* 
+	 * Copy the data into our new buffer
+	 */
+	data = (char *)hdrp + sizeof(shared_header_t);
+	memcpy(data, buf, count);
+
+	if (header_generate(hdrp, buf, count) == -1) {
+		free((char *)hdrp);
+		return -1;
+	}
+	swab_shared_header_t(hdrp);
+
+	rv = diskRawWriteShadow(fd, offset, (char *)hdrp, psz);
+	if (rv == -1)
+		perror("diskRawWriteShadow");
+
+	free((char *)hdrp);
+	if (rv == -1)
+		return -1;
+	return count;
+}
+
+
+/**
+ * Write a fresh quorum header (magic, timestamp, local hostname and
+ * label) at OFFSET_HEADER, warning if a header is already present.
+ *
+ * @param fd		Descriptor of the quorum partition.
+ * @param label		Cluster/label name to store in the header.
+ * @return		0 on success, -1 on failure.
+ */
+static int
+header_init(int fd, char *label)
+{
+	quorum_header_t qh;
+	time_t now;
+
+	if (qdisk_read(fd, OFFSET_HEADER, &qh, sizeof(qh)) == sizeof(qh)) {
+		swab_quorum_header_t(&qh);
+		if (qh.qh_magic == HEADER_MAGIC_OLD) {
+			printf("Warning: Red Hat Cluster Manager 1.2.x "
+			       "header found\n");
+		} else if (qh.qh_magic == HEADER_MAGIC_NUMBER) {
+			printf("Warning: Initializing previously "
+			       "initialized partition\n");
+		}
+	}
+
+	/*
+	 * Start from a clean header: when the read above fails qh is
+	 * uninitialized, and qh_align / string padding are never assigned.
+	 */
+	memset(&qh, 0, sizeof(qh));
+
+	if (gethostname(qh.qh_updatehost, sizeof(qh.qh_updatehost)) < 0) {
+		perror("gethostname");
+		return -1;
+	}
+	/* gethostname() need not terminate a truncated name */
+	qh.qh_updatehost[sizeof(qh.qh_updatehost) - 1] = '\0';
+
+	/* Copy in the cluster/label name; never use it as a format string */
+	snprintf(qh.qh_cluster, sizeof(qh.qh_cluster)-1, "%s", label);
+
+	/* time() signals failure with (time_t)-1 */
+	now = time(NULL);
+	if (now == (time_t)-1) {
+		perror("time");
+		return -1;
+	}
+	qh.qh_timestamp = (uint64_t)now;
+
+	qh.qh_magic = HEADER_MAGIC_NUMBER;
+	swab_quorum_header_t(&qh);
+	if (qdisk_write(fd, OFFSET_HEADER, &qh, sizeof(qh)) != sizeof(qh)) {
+		return -1;
+	}
+
+	return 0;
+}
+
+
+/**
+ * Initialize a quorum partition: write the quorum header and one empty
+ * status block for every node ID 1..MAX_NODES_DISK.
+ *
+ * @param partname	Device / partition to initialize.
+ * @param label		Label stored in the quorum header.
+ * @return		0 on success, -1 on failure.
+ */
+int
+qdisk_init(char *partname, char *label)
+{
+	int fd;
+	status_block_t ps, wps;
+	int nid;
+	time_t t;
+
+	if (qdisk_validate(partname) < 0) {
+		perror("qdisk_validate");
+		return -1;
+	}
+
+	fd = qdisk_open(partname);
+	if (fd < 0) {
+		perror("qdisk_open");
+		return -1;
+	}
+
+	if (header_init(fd, label) < 0) {
+		/* Do not leak the descriptor on failure */
+		qdisk_close(&fd);
+		return -1;
+	}
+
+	time(&t);
+
+	/*
+	 * Zero everything first so that fields not set below (last-cycle
+	 * time, incarnation, message, masks) hit the disk as zeroes rather
+	 * than stack garbage.
+	 */
+	memset(&ps, 0, sizeof(ps));
+	ps.ps_magic = STATE_MAGIC_NUMBER;
+	ps.ps_timestamp = (uint64_t)t;
+	ps.ps_state = (uint8_t)S_NONE;
+
+	/* Node IDs 1..N */
+	for (nid = 1; nid <= MAX_NODES_DISK; nid++) {
+		ps.ps_nodeid = nid;
+
+		printf("Initializing status block for node %d...\n", nid);
+		/* Byte-swap a copy; ps stays in host order for the next pass */
+		wps = ps;
+		swab_status_block_t(&wps);
+
+		if (qdisk_write(fd, qdisk_nodeid_offset(nid), &wps, sizeof(wps)) < 0) {
+			printf("Error writing node ID block %d\n", nid);
+			qdisk_close(&fd);
+			return -1;
+		}
+	}
+
+	qdisk_close(&fd);
+
+	return 0;
+}
+
/cvs/cluster/cluster/cman/qdisk/disk.h,v  -->  standard output
revision 1.3.2.1
--- cluster/cman/qdisk/disk.h
+++ -	2006-07-21 18:01:39.865378000 +0000
@@ -0,0 +1,269 @@
+/**
+  Copyright Red Hat, Inc. 2006
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the
+  Free Software Foundation; either version 2, or (at your option) any
+  later version.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; see the file COPYING.  If not, write to the
+  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
+  MA 02139, USA.
+
+  Author: Lon Hohberger <lhh@redhat.com>
+ */
+/**
+  @file Main quorum daemon include file
+ */
+#ifndef _QUORUM_DISK_H
+#define _QUORUM_DISK_H
+
+#include <stdint.h>
+#include <pthread.h>
+#include <arpa/inet.h>
+#include <libcman.h>
+
+/* Number of nodes that get a status block (see STATUS_BLOCK_COUNT) */
+#define MAX_NODES_DISK		16	
+/* Bytes needed to hold one bit per node, rounded up to a whole byte */
+#define MEMB_MASK_LEN           ((MAX_NODES_DISK / 8) + \
+				 (!!(MAX_NODES_DISK % 8)))
+/* MEMB_MASK_LEN rounded up to the next multiple of 8 bytes */
+#define DISK_MEMB_MASK_LEN	((MEMB_MASK_LEN + 7) & ~7)
+
+/** The membership bitmask type: a byte array of DISK_MEMB_MASK_LEN bytes.
+    Being an array type, it decays to a pointer when used as a function
+    parameter, so use sizeof(memb_mask_t), not sizeof(parameter). */
+typedef uint8_t memb_mask_t [DISK_MEMB_MASK_LEN];
+
+/** Node states as stored in status_block_t.ps_state.  The numeric order
+    matters: other code compares states with <, <= and >=. */
+typedef enum {
+	S_NONE  = 0x0,		// Shutdown / not quorate / not running
+	S_EVICT	= 0x1,		// Voted out / about to be fenced.
+	/* ^^^ Fencing OK */
+	S_INIT	= 0x2,		// Initializing.  Hold your fire.
+	/* vvv Fencing will kill a node */
+	S_RUN	= 0x5,		// I think I'm running.
+	S_MASTER= 0x6		// I know I'm running, and have advertised to
+				// CMAN the availability of the disk vote for my
+				// partition.
+} disk_node_state_t;
+
+
+/** Message IDs exchanged through a status block's ps_msg field (see the
+    "Vote/bid mechanism" fields of status_block_t). */
+typedef enum {
+	M_NONE  = 0x0,
+	M_BID	= 0x1,
+	M_ACK	= 0x2,
+	M_NACK	= 0x3,
+	M_MASK	= 0x4
+} disk_msg_id_t;
+
+
+/** Bit flags (each value is a distinct bit).
+    NOTE(review): presumably meant for status_block_t.ps_flags -- confirm
+    against the users of these constants. */
+typedef enum {
+	FL_MSG	= 0x1,
+	FL_BID	= 0x2,
+	FL_VOTE = 0x4
+} disk_state_flag_t;
+
+
+/* RHEL 2.1 / RHCS3 old magic numbers */
+#define HEADER_MAGIC_OLD	0x39119FCD	/* partition header */
+#define STATE_MAGIC_OLD		0xF1840DCE	/* Status block */
+#define SHARED_HEADER_MAGIC_OLD	0x00DEBB1E	/* Per-block header */
+
+/* Current magic numbers.  Note that the per-block header magic has the
+   same value as SHARED_HEADER_MAGIC_OLD. */
+#define HEADER_MAGIC_NUMBER	0xeb7a62c2	/* Partition header */
+#define STATE_MAGIC_NUMBER	0x47bacef8	/* Status block */
+#define SHARED_HEADER_MAGIC	0x00DEBB1E	/* Per-block header */
+
+
+/**
+ * Per-node status block (packed).  The bare numbers in the comments are
+ * the running byte offset after the preceding field(s); with
+ * DISK_MEMB_MASK_LEN == 8 the structure is 80 bytes long.
+ */
+typedef struct __attribute__ ((packed)) {
+	uint32_t	ps_magic;
+	/* 4 */
+	uint32_t	ps_updatenode;		// Last writer
+	/* 8 */
+	uint64_t	ps_timestamp;		// time of last update
+	/* 16 */
+	uint32_t	ps_nodeid;
+	uint32_t	pad0;
+	/* 24 */
+	uint8_t		ps_state;		// running or stopped
+	uint8_t		pad1[1];
+	uint16_t	ps_flags;
+	/* 28 */
+	uint16_t	ps_score;		// Local points
+	uint16_t	ps_scoremax;		// What we think is our max
+						// points, if other nodes
+						// disagree, we may be voted
+						// out
+	/* 32 */
+	uint32_t	ps_ca_sec;		// Cycle speed (average)
+	uint32_t	ps_ca_usec;
+	/* 40 */
+	uint32_t	ps_lc_sec;		// Cycle speed (last)
+	uint32_t	ps_lc_usec;
+	/* 48 */
+	uint64_t	ps_incarnation;		// Token to detect hung +
+						// restored node
+	/* 56 */
+	uint16_t	ps_msg;			// Vote/bid mechanism 
+	uint16_t	ps_seq;
+	uint32_t	ps_arg;
+	/* 64 */
+	memb_mask_t	ps_mask;		// Bitmap
+	memb_mask_t	ps_master_mask;		// Bitmap
+	/* 80 */
+} status_block_t;
+
+/*
+ * Apply swab16/swab32/swab64 in place to the multi-byte integer fields of
+ * the status_block_t that ptr points to.  ptr is evaluated many times, so
+ * it must not have side effects.  Single-byte fields (ps_state, pad1) and
+ * the byte-array masks are left alone.
+ * NOTE(review): the 64-bit ps_incarnation field is not swapped here --
+ * confirm whether that is intentional.
+ */
+#define swab_status_block_t(ptr) \
+{\
+	swab32((ptr)->ps_magic);\
+	swab32((ptr)->ps_updatenode);\
+	swab64((ptr)->ps_timestamp);\
+	swab32((ptr)->ps_nodeid);\
+	swab32((ptr)->pad0);\
+	/* state + pad */ \
+	swab16((ptr)->ps_flags);\
+	swab16((ptr)->ps_score);\
+	swab16((ptr)->ps_scoremax);\
+	/* Cycle speeds */ \
+	swab32((ptr)->ps_ca_sec);\
+	swab32((ptr)->ps_ca_usec);\
+	swab32((ptr)->ps_lc_sec);\
+	swab32((ptr)->ps_lc_usec);\
+	/* Message */ \
+	swab16((ptr)->ps_msg); \
+	swab16((ptr)->ps_seq); \
+	swab32((ptr)->ps_arg); \
+ }
+
+
+/*
+ * Shared state disk header.  Describes cluster global information.
+ * Packed; 4 + 4 + 8 + 128 + 128 = 272 bytes.
+ */
+typedef struct __attribute__ ((packed)) {
+	uint32_t	qh_magic;	   // HEADER_MAGIC_NUMBER when valid
+	uint32_t	qh_align;	   // 64-bit-ism: alignment fixer.
+	uint64_t	qh_timestamp;	   // time of last update
+	char 		qh_updatehost[128];// Hostname who put this here...
+	char		qh_cluster[128];   // Cluster name
+} quorum_header_t;
+
+/*
+ * Apply swab32/swab64 in place to the integer fields of the
+ * quorum_header_t that ptr points to; the character arrays are untouched.
+ * ptr is evaluated several times, so it must not have side effects.
+ */
+#define swab_quorum_header_t(ptr) \
+{\
+	swab32((ptr)->qh_magic); \
+	swab32((ptr)->qh_align); \
+	swab64((ptr)->qh_timestamp); \
+}
+
+
+
+/*
+ * The user data is stored with this header prepended.
+ * The header ONLY contains CRC information and the length of the data.
+ * The data blocks themselves contain their own respective magic numbers.
+ * Packed; 4 * 4 + 2 * 8 = 32 bytes.
+ */
+typedef struct __attribute__ ((packed)) {
+	uint32_t h_magic;		/* Header magic	       */
+	uint32_t h_hcrc;		/* Header CRC          */
+	uint32_t h_dcrc;		/* CRC32 of data       */
+	uint32_t h_length;		/* Length of real data */
+	uint64_t h_view;		/* View # of real data */
+	uint64_t h_timestamp;		/* Timestamp           */
+} shared_header_t;
+
+/* All-zero initializer for shared_header_t.
+   NOTE(review): the expansion already contains the '=', so the macro only
+   works as "shared_header_t h SHARED_HEADER_INITIALIZER;" -- writing
+   "h = SHARED_HEADER_INITIALIZER" yields "= =".  Confirm against users. */
+#define SHARED_HEADER_INITIALIZER = {0, 0, 0, 0, 0, 0}
+
+/*
+ * Apply swab32/swab64 in place to every field of the shared_header_t that
+ * ptr points to.  ptr is evaluated several times, so it must not have side
+ * effects.
+ */
+#define swab_shared_header_t(ptr) \
+{\
+	swab32((ptr)->h_magic);\
+	swab32((ptr)->h_hcrc);\
+	swab32((ptr)->h_dcrc);\
+	swab32((ptr)->h_length);\
+	swab64((ptr)->h_view);\
+	swab64((ptr)->h_timestamp);\
+}
+
+
+/* Offsets from RHCM 1.2.x */
+#define OFFSET_HEADER	0
+#define HEADER_SIZE	4096		/* Page size for now */
+
+/* Status blocks follow the header, one SPACE_PER_STATUS_BLOCK slot each */
+#define OFFSET_FIRST_STATUS_BLOCK	(OFFSET_HEADER + HEADER_SIZE)
+#define SPACE_PER_STATUS_BLOCK		4096 /* Page size for now */
+#define STATUS_BLOCK_COUNT		MAX_NODES_DISK
+
+#define SPACE_PER_MESSAGE_BLOCK		(4096)
+#define	MESSAGE_BLOCK_COUNT		MAX_NODES_DISK
+
+/*
+ * First byte past the area described above.  The definition must not end
+ * in a line-continuation backslash, or whatever is placed on the next
+ * line would silently become part of the macro.
+ */
+#define END_OF_DISK			(OFFSET_FIRST_STATUS_BLOCK + \
+					 (MAX_NODES_DISK + 1) * \
+					 SPACE_PER_STATUS_BLOCK)
+
+
+
+/* From disk.c */
+int qdisk_open(char *name);
+int qdisk_close(int *fd);
+int qdisk_init(char *name, char *clustername);
+int qdisk_validate(char *name);
+int qdisk_read(int fd, __off64_t ofs, void *buf, int len);
+int qdisk_write(int fd, __off64_t ofs, const void *buf, int len);
+
+/*
+ * Byte offset of the status block belonging to a 1-based node ID.  The
+ * argument is parenthesized so that any expression (for example a
+ * conditional) expands correctly.
+ */
+#define qdisk_nodeid_offset(nodeid) \
+	(OFFSET_FIRST_STATUS_BLOCK + (SPACE_PER_STATUS_BLOCK * ((nodeid) - 1)))
+
+/* From disk_util.c */
+/* Number of write-time samples kept in qd_ctx.qc_last */
+#define HISTORY_LENGTH 60
+/* In-memory form of the message fields of a status block
+   (ps_msg / ps_arg / ps_seq) */
+typedef struct {
+	disk_msg_id_t m_msg;	 /* this is an int, but will be stored as 16bit*/
+	uint32_t m_arg;
+	uint16_t m_seq;
+	uint16_t pad0;
+} disk_msg_t;
+
+/* Quorum disk context; zeroed and seeded by qd_init() */
+typedef struct {
+	uint64_t qc_incarnation;	/* token from generate_token(); written
+					   to disk as ps_incarnation */
+	struct timeval qc_average;	/* average of the recorded write times */
+	struct timeval qc_last[HISTORY_LENGTH];	/* recent write times */
+	int qc_fd;			/* descriptor of the quorum device */
+	int qc_my_id;			/* local node ID (non-zero) */
+	int qc_writes;			/* number of write times recorded */
+	int qc_interval;		/* seconds slept between cycles */
+	int qc_tko;			/* missed updates tolerated before a
+					   node is treated as timed out */
+	int qc_votes;			/* NOTE(review): not used in this
+					   chunk -- presumably vote count */
+	int qc_scoremin;		/* NOTE(review): presumably the minimum
+					   score -- confirm */
+	disk_node_state_t qc_disk_status;
+	disk_node_state_t qc_status;	/* compared against S_MASTER before
+					   eviction notices are written */
+	int qc_master;		/* Master?! */
+	int qc_unused;
+	cman_handle_t qc_ch;		/* CMAN handle passed to qd_init() */
+	char *qc_device;		/* device name; free()d by qd_destroy() */
+	char *qc_label;
+	char *qc_status_file;
+} qd_ctx;
+
+/* Local bookkeeping for one node slot on the quorum disk */
+typedef struct {
+	uint64_t ni_incarnation;	/* incarnation recorded when the node
+					   was last considered up */
+	uint64_t ni_evil_incarnation;	/* incarnation recorded when the node
+					   was evicted for missing updates */
+	time_t	ni_last_seen;		/* last ps_timestamp value observed */
+	int	ni_misses;		/* consecutive reads with an unchanged
+					   timestamp */
+	int	ni_seen;		/* reads with a changed timestamp */
+	disk_msg_t ni_msg;		/* message fields of the last block */
+	disk_msg_t ni_last_msg;
+	disk_node_state_t ni_state;	/* our internal view of the node */
+	status_block_t ni_status;	/* block as last read from disk */
+} node_info_t;
+
+int qd_write_status(qd_ctx *ctx, int nid, disk_node_state_t state,
+		    disk_msg_t *msg, memb_mask_t mask, memb_mask_t master);
+int qd_read_print_status(int fd, int nid);
+int qd_init(qd_ctx *ctx, cman_handle_t ch, int me);
+void qd_destroy(qd_ctx *ctx);
+
+/* proc.c */
+int find_partitions(const char *partfile, const char *label,
+		    char *devname, size_t devlen, int print);
+int check_device(char *device, char *label, quorum_header_t *qh);
+
+
+#endif
/cvs/cluster/cluster/cman/qdisk/disk_util.c,v  -->  standard output
revision 1.2.2.1
--- cluster/cman/qdisk/disk_util.c
+++ -	2006-07-21 18:01:39.967981000 +0000
@@ -0,0 +1,293 @@
+/**
+  Copyright Red Hat, Inc. 2006
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the
+  Free Software Foundation; either version 2, or (at your option) any
+  later version.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; see the file COPYING.  If not, write to the
+  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
+  MA 02139, USA.
+
+  Author: Lon Hohberger <lhh@redhat.com>
+ */
+/**
+  @file Misc. Quorum daemon context utilities / high-level functions
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include <errno.h>
+#include <disk.h>
+#include <platform.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <time.h>
+
+
+/**
+  Store (end - start) in dest, normalized so that tv_usec is not negative.
+  dest may alias start.
+ */
+static inline void
+_diff_tv(struct timeval *dest, struct timeval *start, struct timeval *end)
+{
+	dest->tv_sec = end->tv_sec - start->tv_sec;
+	dest->tv_usec = end->tv_usec - start->tv_usec;
+
+	/* Borrow one second when the microsecond part went negative */
+	if (dest->tv_usec >= 0)
+		return;
+
+	dest->tv_sec -= 1;
+	dest->tv_usec += 1000000;
+}
+
+
+/**
+  Record a new write time and recalculate the average write time.
+
+  Sample number N (counting from 0) is stored in slot N % HISTORY_LENGTH
+  and qc_writes is incremented afterwards, so the most recent sample is
+  always at (qc_writes - 1) % HISTORY_LENGTH -- the slot qd_write_status()
+  reads -- and slots 0..min(qc_writes, HISTORY_LENGTH)-1 are exactly the
+  ones that hold data when the average is taken.
+
+  @param ctx		Quorum disk context to update.
+  @param newtime	Duration of the write that just completed.
+ */
+void
+qd_update_wtime(qd_ctx *ctx, struct timeval *newtime)
+{
+	int x;
+	int max = HISTORY_LENGTH;
+	uint64_t sum = 0;
+
+	/* Store the thing, then count it */
+	ctx->qc_last[ctx->qc_writes % HISTORY_LENGTH].tv_sec = newtime->tv_sec;
+	ctx->qc_last[ctx->qc_writes % HISTORY_LENGTH].tv_usec = newtime->tv_usec;
+	ctx->qc_writes++;
+
+	if (ctx->qc_writes < HISTORY_LENGTH)
+		max = ctx->qc_writes;
+
+	for (x = 0; x < max; x++) {
+		/* Widen before multiplying; tv_sec may be a 32-bit long */
+		sum += ((uint64_t)ctx->qc_last[x].tv_sec * 1000000);
+		sum += ctx->qc_last[x].tv_usec;
+	}
+
+	sum /= max;
+
+	ctx->qc_average.tv_sec = (sum / 1000000);
+	ctx->qc_average.tv_usec = (sum % 1000000);
+}
+
+
+/**
+  Build a status block for node 'nid' from the context plus the given
+  state, message and masks, write it to that node's slot on the quorum
+  disk, and feed the time the write took to qd_update_wtime().
+
+  @param ctx		Quorum disk context.
+  @param nid		Node ID whose block is written (must be > 0).
+  @param state		State to store in the block.
+  @param msg		Message to store, or NULL for an empty message.
+  @param mask		Membership mask to store, or NULL for all zeroes.
+  @param master	Master mask to store, or NULL for all zeroes.
+  @return		0 on success; -1 with errno EINVAL on bad arguments,
+			or -1 if the disk write failed.
+ */
+int
+qd_write_status(qd_ctx *ctx, int nid, disk_node_state_t state,
+		disk_msg_t *msg, memb_mask_t mask, memb_mask_t master)
+{
+	status_block_t blk;
+	struct timeval before, after, elapsed;
+	int timed = 1;
+
+	if (!ctx || nid <= 0) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	/* Identity and state */
+	blk.ps_magic = STATE_MAGIC_NUMBER;
+	blk.ps_updatenode = ctx->qc_my_id;
+	blk.ps_timestamp = (uint64_t)time(NULL);
+	blk.ps_nodeid = nid;
+	blk.pad0 = 0;
+	blk.ps_state = (uint8_t)state;
+	blk.pad1[0] = 0;
+	blk.ps_flags = 0;
+	blk.ps_score = 0;
+	blk.ps_scoremax = 0;
+	blk.ps_incarnation = ctx->qc_incarnation;
+
+	/* Average and most recent write time */
+	blk.ps_ca_sec = ctx->qc_average.tv_sec;
+	blk.ps_ca_usec = ctx->qc_average.tv_usec;
+	if (ctx->qc_writes == 0) {
+		blk.ps_lc_sec = blk.ps_lc_usec = 0;
+	} else {
+		struct timeval *prev =
+		    &ctx->qc_last[(ctx->qc_writes - 1) % HISTORY_LENGTH];
+
+		blk.ps_lc_sec = prev->tv_sec;
+		blk.ps_lc_usec = prev->tv_usec;
+	}
+
+	/* Masks: copy what was given, zero what was not */
+	if (mask)
+		memcpy(blk.ps_mask, mask, sizeof(memb_mask_t));
+	else
+		memset(blk.ps_mask, 0, sizeof(memb_mask_t));
+
+	if (master)
+		memcpy(blk.ps_master_mask, master, sizeof(memb_mask_t));
+	else
+		memset(blk.ps_master_mask, 0, sizeof(memb_mask_t));
+
+	/* Message */
+	blk.ps_msg = msg ? msg->m_msg : 0;
+	blk.ps_seq = msg ? msg->m_seq : 0;
+	blk.ps_arg = msg ? msg->m_arg : 0;
+
+	/* Time the write; fall back to the average if the clock fails */
+	if (gettimeofday(&before, NULL) < 0)
+		timed = 0;
+
+	swab_status_block_t(&blk);
+	if (qdisk_write(ctx->qc_fd, qdisk_nodeid_offset(nid), &blk,
+			sizeof(blk)) < 0) {
+		printf("Error writing node ID block %d\n", nid);
+		return -1;
+	}
+
+	if (timed && (gettimeofday(&after, NULL) < 0))
+		timed = 0;
+
+	if (timed) {
+		_diff_tv(&elapsed, &before, &after);
+	} else {
+		/* Use heuristic */
+		elapsed.tv_sec = ctx->qc_average.tv_sec;
+		elapsed.tv_usec = ctx->qc_average.tv_usec;
+	}
+	qd_update_wtime(ctx, &elapsed);
+
+	return 0;
+}
+
+
+/**
+  Print the fields of a status block (already in host byte order) to
+  stdout.  The masks are printed as hex, highest byte first.
+
+  @param ps		Status block to print.
+  @return		Always 0.
+ */
+int
+qd_print_status(status_block_t *ps)
+{
+	int x;
+
+	printf("Data @ offset %d:\n",
+	       (int)qdisk_nodeid_offset(ps->ps_nodeid));
+	printf("status_block_t {\n");
+	printf("\t.ps_magic = %08x;\n", (int)ps->ps_magic);
+	printf("\t.ps_nodeid = %d;\n", (int)ps->ps_nodeid);
+	printf("\t.ps_updatenode = %d;\n", (int)ps->ps_updatenode);
+	printf("\t.pad0 = %d;\n", (int)ps->pad0);
+	printf("\t.ps_timestamp = %llu;\n", (long long unsigned)
+		ps->ps_timestamp);
+	printf("\t.ps_state = %d;\n", ps->ps_state);
+	printf("\t.pad1[0] = %d;\n", ps->pad1[0]);
+	printf("\t.ps_flags = %d;\n", ps->ps_flags);
+	printf("\t.ps_score = %d;\n", ps->ps_score);
+	printf("\t.ps_scoremax = %d;\n", ps->ps_scoremax);
+	printf("\t.ps_ca_sec = %d;\n", ps->ps_ca_sec);
+	printf("\t.ps_ca_usec = %d;\n", ps->ps_ca_usec);
+	printf("\t.ps_lc_sec = %d;\n", ps->ps_lc_sec);
+	printf("\t.ps_lc_usec = %d;\n", ps->ps_lc_usec);
+	printf("\t.ps_mask = 0x");
+	for (x = (sizeof(memb_mask_t)-1); x >= 0; x--)
+		printf("%02x", ps->ps_mask[x]);
+	printf("\n");
+	printf("\t.ps_master_mask = 0x");
+	/* Print the master mask itself, not ps_mask a second time */
+	for (x = (sizeof(memb_mask_t)-1); x >= 0; x--)
+		printf("%02x", ps->ps_master_mask[x]);
+	printf("\n");
+
+	printf("}\n");
+
+	return 0;
+}
+
+
+/**
+  Read the status block of node 'nid' from the quorum disk and print it.
+
+  @param fd		Descriptor of the quorum device (must be >= 0).
+  @param nid		Node ID to read (must be > 0).
+  @return		0 on success; -1 with errno EINVAL on bad arguments,
+			or -1 if the read failed.
+ */
+int
+qd_read_print_status(int fd, int nid)
+{
+	status_block_t blk;
+	int rv;
+
+	if (fd < 0 || nid <= 0) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	rv = qdisk_read(fd, qdisk_nodeid_offset(nid), &blk, sizeof(blk));
+	if (rv < 0) {
+		printf("Error reading node ID block %d\n", nid);
+		return -1;
+	}
+
+	/* Convert to host byte order before printing */
+	swab_status_block_t(&blk);
+	qd_print_status(&blk);
+
+	return 0;
+}
+
+
+/**
+  Generate a non-zero token based on the current system time: seconds in
+  the upper 32 bits, microseconds in the lower 32 bits.
+
+  @return		The token; never 0.
+ */
+uint64_t
+generate_token(void)
+{
+	uint64_t my_token = 0;
+	struct timeval tv;
+
+	while (my_token == 0) {
+		gettimeofday(&tv, NULL);
+
+		/*
+		 * The low word must come from tv_usec; using tv_sec for
+		 * both halves gives every token generated within the same
+		 * second the same value.
+		 */
+		my_token = ((uint64_t) (tv.tv_sec) << 32) |
+			(uint64_t) (tv.tv_usec & 0x00000000ffffffff);
+	}
+
+	return my_token;
+}
+
+
+/**
+  Initialize a quorum disk context, given a CMAN handle and a nodeid.
+
+  The context is zeroed, given a fresh incarnation token, and its
+  descriptor is set to -1 (the "not open" value qd_destroy() also uses);
+  leaving it at 0 would make qd_destroy() close descriptor 0 on a context
+  whose device was never opened.
+
+  @param ctx		Context to initialize.
+  @param ch		CMAN handle (must be non-NULL).
+  @param me		Local node ID (must be non-zero).
+  @return		0 on success, -1 with errno EINVAL on bad arguments.
+ */
+int
+qd_init(qd_ctx *ctx, cman_handle_t ch, int me)
+{
+	if (!ctx || !ch || !me) {
+		errno = EINVAL;
+		return -1;
+	}	
+
+	memset(ctx, 0, sizeof(*ctx));
+	ctx->qc_incarnation = generate_token();
+	ctx->qc_ch = ch;
+	ctx->qc_my_id = me;
+	ctx->qc_fd = -1;
+
+	return 0;
+}
+
+
+/**
+  Tear down a quorum disk context: release the device name and close the
+  descriptor.  A context whose node ID is 0 is left untouched.
+ */
+void
+qd_destroy(qd_ctx *ctx)
+{
+	if (ctx->qc_my_id != 0) {
+		/* free(NULL) is a no-op, so no separate NULL test is needed */
+		free(ctx->qc_device);
+		ctx->qc_device = NULL;
+
+		close(ctx->qc_fd);
+		ctx->qc_fd = -1;
+	}
+}
/cvs/cluster/cluster/cman/qdisk/gettid.c,v  -->  standard output
revision 1.4.2.1
--- cluster/cman/qdisk/gettid.c
+++ -	2006-07-21 18:01:40.062745000 +0000
@@ -0,0 +1,24 @@
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <linux/unistd.h>
+#include <gettid.h>
+#include <errno.h>
+#include <unistd.h>
+
+/* Patch from Adam Conrad / Ubuntu: Don't use _syscall macro */
+
+#ifdef __NR_gettid
+/* Return the caller's thread ID using the raw gettid system call */
+pid_t gettid (void)
+{
+	return syscall(__NR_gettid);
+}
+#else
+
+/* "#warn" is not a preprocessor directive; the GNU spelling is "#warning" */
+#warning "gettid not available -- substituting with pthread_self()"
+
+#include <pthread.h>
+/* Fallback: use the pthread handle, cast to pid_t, as the thread ID */
+pid_t gettid (void)
+{
+	return (pid_t)pthread_self();
+}
+#endif
/cvs/cluster/cluster/cman/qdisk/gettid.h,v  -->  standard output
revision 1.2.2.1
--- cluster/cman/qdisk/gettid.h
+++ -	2006-07-21 18:01:40.159682000 +0000
@@ -0,0 +1,7 @@
+#ifndef __GETTID_H
+#define __GETTID_H
+
+/* pid_t comes from <sys/types.h>; include it so this header can be used
+   without relying on the includer's include order */
+#include <sys/types.h>
+
+/* Return an identifier for the calling thread */
+pid_t gettid(void);
+
+#endif
+
/cvs/cluster/cluster/cman/qdisk/main.c,v  -->  standard output
revision 1.3.2.1
--- cluster/cman/qdisk/main.c
+++ -	2006-07-21 18:01:40.252627000 +0000
@@ -0,0 +1,1026 @@
+/**
+  Copyright Red Hat, Inc. 2006
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the
+  Free Software Foundation; either version 2, or (at your option) any
+  later version.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; see the file COPYING.  If not, write to the
+  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
+  MA 02139, USA.
+
+  Author: Lon Hohberger <lhh@redhat.com>
+ */
+/**
+  @file Main loop / functions for disk-based quorum daemon.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include <errno.h>
+#include <disk.h>
+#include <platform.h>
+#include <unistd.h>
+#include <time.h>
+#include <sys/reboot.h>
+#include <linux/reboot.h>
+#include <signal.h>
+#include <ccs.h>
+#include "score.h"
+#include "clulog.h"
+/*
+  TODO:
+  1) Take into account timings to gracefully extend node timeouts during 
+     node spikes (that's why they are there!).
+  2) Poll ccsd for configuration changes.
+  3) Logging.
+ */
+
+/* From bitmap.c */
+int clear_bit(uint8_t *mask, uint32_t bitidx, uint32_t masklen);
+int set_bit(uint8_t *mask, uint32_t bitidx, uint32_t masklen);
+int is_bit_set(uint8_t *mask, uint32_t bitidx, uint32_t masklen);
+/* Run flag cleared by int_handler().  It is written from a signal
+   handler, so it must be a volatile sig_atomic_t. */
+static volatile sig_atomic_t _running = 0;
+
+
+/**
+  Signal handler: clear the run flag.  The signal number is ignored.
+ */
+static void
+int_handler(int sig)
+{
+	(void)sig;
+	_running = 0;
+}
+
+
+/**
+  Tell whether a state counts as "running": returns the state itself
+  (non-zero) for S_INIT and above, 0 for anything below S_INIT.
+ */
+inline int
+state_run(disk_node_state_t state)
+{
+	if (state < S_INIT)
+		return 0;
+
+	return state;
+}
+
+
+/**
+  Reset an array of 'max' node info entries: zero everything, then give
+  entry i the node ID i+1 and stamp it with the current time.
+ */
+void
+node_info_init(node_info_t *ni, int max)
+{
+	time_t now = time(NULL);
+	int idx = 0;
+
+	memset(ni, 0, sizeof(*ni) * max);
+
+	while (idx < max) {
+		node_info_t *entry = &ni[idx];
+
+		/* node ids are 1-based */
+		entry->ni_status.ps_nodeid = (idx + 1);
+		entry->ni_status.ps_timestamp = now;
+		entry->ni_last_seen = now;
+		entry->ni_misses = 0;
+		idx++;
+	}
+}
+
+
+/**
+  Examine our own status block as read back from the disk.  If another
+  node was the last writer, react to what it wrote: S_EVICT means we were
+  told to die, so reboot.  Any other state -- or a reboot() call that
+  returns -- is logged as unhandled and the process stops itself.
+  Rare case; usually other nodes would put up the 'Undead' message and
+  re-evict us.
+ */
+void
+check_self(qd_ctx *ctx, status_block_t *sb)
+{
+	/* Nobody, or we ourselves, wrote this block last: nothing to do */
+	if (sb->ps_updatenode == 0 || sb->ps_updatenode == ctx->qc_my_id)
+		return;
+
+	/* I did not update this??! */
+	if (sb->ps_state == S_EVICT) {
+		/* Someone told us to die. */
+		reboot(RB_AUTOBOOT);
+		/* Only reached if reboot() returned */
+	}
+
+	clulog(LOG_EMERG, "Unhandled state: %d\n", sb->ps_state);
+	raise(SIGSTOP);
+}
+
+
+/**
+  Read every node's status block from the quorum disk into ni[] and
+  update the per-node hit/miss counters: for a node in a running state,
+  an unchanged timestamp counts as a miss and a changed one as a
+  sighting.  Our own block is handed to check_self() instead.  See
+  check_transitions as well.
+ */
+void
+read_node_blocks(qd_ctx *ctx, node_info_t *ni, int max)
+{
+	int idx;
+
+	for (idx = 0; idx < max; idx++) {
+		node_info_t *node = &ni[idx];
+		status_block_t *blk = &node->ni_status;
+		int nodeid = idx + 1;
+
+		/* A failed read is only logged; processing continues */
+		if (qdisk_read(ctx->qc_fd, qdisk_nodeid_offset(nodeid),
+			       blk, sizeof(*blk)) < 0) {
+			clulog(LOG_WARNING,"Error reading node ID block %d\n",
+			       nodeid);
+		}
+		swab_status_block_t(blk);
+
+		if (blk->ps_nodeid == ctx->qc_my_id) {
+			check_self(ctx, blk);
+			continue;
+		}
+
+		/* Remember the message carried by the block */
+		node->ni_msg.m_arg = blk->ps_arg;
+		node->ni_msg.m_msg = blk->ps_msg;
+		node->ni_msg.m_seq = blk->ps_seq;
+
+		if (!state_run(blk->ps_state))
+			continue;
+
+		if (blk->ps_timestamp != node->ni_last_seen) {
+			/* Timestamp moved: the node is good. */
+			node->ni_misses = 0;
+			node->ni_seen++;
+			node->ni_last_seen = blk->ps_timestamp;
+		} else {
+			/* Unchanged timestamp: miss */
+			/* XXX check for average + allow grace */
+			node->ni_misses++;
+		}
+	}
+}
+
+
+/**
+  Check for node transitions.
+
+  Walks all 'max' entries of ni[] and compares our internal view
+  (ni_state, ni_incarnation, ni_misses, ni_seen) with the block last read
+  from disk (ni_status).  The cases below are tested in order and the
+  first one that matches ends processing of that node.
+
+  @param ctx	Quorum disk context (tko, our state, CMAN handle).
+  @param ni	Node info array, normally filled by read_node_blocks().
+  @param max	Number of entries in ni.
+  @param mask	If non-NULL, cleared on entry; bit (nodeid - 1) is then
+		set for each node that comes up, becomes master, or is
+		already in a running state.
+ */
+void
+check_transitions(qd_ctx *ctx, node_info_t *ni, int max, memb_mask_t mask)
+{
+	int x;
+
+	if (mask)
+		memset(mask, 0, sizeof(memb_mask_t));
+
+	for (x = 0; x < max; x++) {
+
+		/*
+		   Case 1: check to see if the node is still up
+		   according to our internal state, but has been
+		   evicted by the master or cleanly shut down
+		   (or restarted).
+
+		   Transition from Evicted/Shutdown -> Offline
+		 */
+		if ((ni[x].ni_state >= S_EVICT &&
+		     ni[x].ni_status.ps_state <= S_EVICT) ||
+		     (ni[x].ni_incarnation &&
+		      (ni[x].ni_incarnation !=
+		       ni[x].ni_status.ps_incarnation))) {
+
+			if (ni[x].ni_status.ps_state == S_EVICT) {
+				clulog(LOG_NOTICE, "Node %d evicted\n",
+				       ni[x].ni_status.ps_nodeid);
+			} else {
+				/* State == S_NONE or incarnation change */
+				clulog(LOG_INFO, "Node %d shutdown\n",
+				       ni[x].ni_status.ps_nodeid);
+				ni[x].ni_evil_incarnation = 0;
+			}
+
+			/* Forget everything we knew about this node */
+			ni[x].ni_incarnation = 0;
+			ni[x].ni_seen = 0;
+			ni[x].ni_misses = 0;
+			ni[x].ni_state = S_NONE;
+
+			continue;
+		}
+
+		/*
+		   Case 2: Check for a heartbeat timeout.  Write an eviction
+		   notice if we're the master.  If this is our first notice
+		   of the heartbeat timeout, update our internal state
+		   accordingly.  When the master evicts this node, we will
+		   hit case 1 above.
+
+		   Transition from Online -> Evicted
+		 */
+		if (ni[x].ni_misses > ctx->qc_tko &&
+		     state_run(ni[x].ni_status.ps_state)) {
+
+			/*
+			   Write eviction notice if we're the master.
+			 */
+			if (ctx->qc_status == S_MASTER) {
+				clulog(LOG_DEBUG,
+				       "Writing eviction notice for node %d\n",
+				       ni[x].ni_status.ps_nodeid);
+				qd_write_status(ctx, ni[x].ni_status.ps_nodeid,
+						S_EVICT, NULL, NULL, NULL);
+				clulog(LOG_DEBUG,
+				       "Telling CMAN to kill the node\n");
+				cman_kill_node(ctx->qc_ch,
+					       ni[x].ni_status.ps_nodeid);
+			}
+
+			/*
+			   Mark our internal views as dead if nodes miss too
+			   many heartbeats...  This will cause a master
+			   transition if no live master exists.
+			 */
+			if (ni[x].ni_status.ps_state >= S_RUN &&
+			    ni[x].ni_seen) {
+				clulog(LOG_DEBUG, "Node %d DOWN\n",
+				       ni[x].ni_status.ps_nodeid);
+				ni[x].ni_seen = 0;	
+			}
+
+			/* Remember the incarnation we evicted (see case 3) */
+			ni[x].ni_state = S_EVICT;
+			ni[x].ni_status.ps_state = S_EVICT;
+			ni[x].ni_evil_incarnation = 
+				ni[x].ni_status.ps_incarnation;
+			
+			continue;
+		}
+
+		/*
+		   Case 3:  Check for node who is supposed to be dead, but
+		   has started writing to the disk again with the same
+		   incarnation.
+
+		   Transition from Offline -> Undead (BAD!!!)
+		 */
+		if (ni[x].ni_evil_incarnation &&
+                    (ni[x].ni_evil_incarnation == 
+		     ni[x].ni_status.ps_incarnation)) {
+			clulog(LOG_CRIT, "Node %d is undead.\n",
+			       ni[x].ni_status.ps_nodeid);
+
+			clulog(LOG_ALERT,
+			       "Writing eviction notice for node %d\n",
+			       ni[x].ni_status.ps_nodeid);
+			qd_write_status(ctx, ni[x].ni_status.ps_nodeid,
+					S_EVICT, NULL, NULL, NULL);
+			ni[x].ni_status.ps_state = S_EVICT;
+
+			/* XXX Need to fence it again */
+			clulog(LOG_DEBUG, "Telling CMAN to kill the node\n");
+			cman_kill_node(ctx->qc_ch,
+				       ni[x].ni_status.ps_nodeid);
+			continue;
+		}
+
+
+		/*
+		   Case 4:  Check for a node who has met our minimum # of
+		   'seen' requests.
+
+		   Transition from Offline -> Online
+		 */
+		if (ni[x].ni_seen > (ctx->qc_tko / 2) &&
+		    !state_run(ni[x].ni_state)) {
+			/*
+			   Node-join - everyone just kind of "agrees"
+			   there's no consensus to just have a node join
+			   right now.
+			 */
+			ni[x].ni_state = S_RUN;
+			clulog(LOG_DEBUG, "Node %d is UP\n",
+			       ni[x].ni_status.ps_nodeid);
+			ni[x].ni_incarnation =
+			    ni[x].ni_status.ps_incarnation;
+			if (mask)
+				set_bit(mask, (ni[x].ni_status.ps_nodeid-1),
+					sizeof(memb_mask_t));
+
+			continue;
+		}
+
+		/*
+		   Case 5: Check for a node becoming master.  Not really a
+		   transition.
+		 */
+		if (ni[x].ni_state == S_RUN &&
+		    ni[x].ni_status.ps_state == S_MASTER) {
+			clulog(LOG_INFO, "Node %d is the master\n",
+			       ni[x].ni_status.ps_nodeid);
+			ni[x].ni_state = S_MASTER;
+			if (mask)
+				set_bit(mask, (ni[x].ni_status.ps_nodeid-1),
+					sizeof(memb_mask_t));
+			continue;
+		}
+
+		/*
+		   All other cases: Believe the node's reported state ;)
+		 */
+		if (state_run(ni[x].ni_state)) {
+			ni[x].ni_state = ni[x].ni_status.ps_state;
+			if (mask)
+				set_bit(mask, (ni[x].ni_status.ps_nodeid-1),
+					sizeof(memb_mask_t));
+		}
+	}
+}
+
+
+/**
+  Checks for presence of an online master and works out the lowest node
+  ID among ourselves and the nodes we consider running.
+
+  @param ctx		Quorum disk context (supplies our node ID).
+  @param ni		Node info array.
+  @param max		Number of entries in ni.
+  @param low_id	Out: starts as our own ID and is lowered to the ID of
+			any node with ni_state >= S_RUN that has a smaller ID.
+  @return		Node ID of the first master found, or 0 if there is
+			none.  Finding more than one master is only logged.
+ */
+int
+master_exists(qd_ctx *ctx, node_info_t *ni, int max, int *low_id)
+{
+	int idx;
+	int found = 0;		/* number of masters counted */
+	int master_id = 0;	/* first master seen, 0 if none */
+
+	*low_id = ctx->qc_my_id;
+
+	for (idx = 0; idx < max; idx++) {
+		node_info_t *node = &ni[idx];
+		int says_master = (node->ni_status.ps_state == S_MASTER);
+		int alive = (node->ni_state >= S_RUN);
+
+		/* See if this one's a master */
+		if (alive && says_master) {
+			if (!master_id)
+				master_id = node->ni_status.ps_nodeid;
+			++found;
+		}
+
+		/* See if it's us... */
+		if (says_master &&
+		    node->ni_status.ps_nodeid == ctx->qc_my_id) {
+			if (!master_id)
+				master_id = ctx->qc_my_id;
+			++found;
+			continue;
+		}
+
+		if (!alive) {
+			/* Look for dead master */
+			if (says_master) {
+				clulog(LOG_DEBUG,
+				       "Node %d is marked master, but is dead.\n",
+				       node->ni_status.ps_nodeid);
+			}
+			continue;
+		}
+
+		if (node->ni_status.ps_nodeid < *low_id)
+			*low_id = node->ni_status.ps_nodeid;
+	}
+
+	if (found > 1) {
+		clulog(LOG_CRIT,
+		       "Critical Error: More than one master found!\n");
+		/* XXX Handle this how? */
+	}
+
+	return master_id;
+}
+
+
+/**
+  Validate and open the quorum device, start the score thread, reset the
+  node information blocks and then spend qc_tko cycles in S_INIT watching
+  for a cluster that is already running on this QD.  Note that this will
+  delay master election if multiple nodes start within a second or two
+  of each other.
+
+  @return 0 on success, -1 on failure.
+ */
+int
+quorum_init(qd_ctx *ctx, node_info_t *ni, int max, struct h_data *h, int maxh)
+{
+	int round;
+	int score, maxscore;
+
+	clulog(LOG_INFO, "Quorum Daemon Initializing\n");
+
+	if (qdisk_validate(ctx->qc_device) < 0)
+		return -1;
+
+	ctx->qc_fd = qdisk_open(ctx->qc_device);
+	if (ctx->qc_fd < 0) {
+		clulog(LOG_CRIT, "Failed to open %s: %s\n", ctx->qc_device,
+		       strerror(errno));
+		return -1;
+	}
+
+	start_score_thread(h, maxh);
+
+	node_info_init(ni, max);
+
+	/* Announce ourselves as initializing */
+	if (qd_write_status(ctx, ctx->qc_my_id,
+			    S_INIT, NULL, NULL, NULL) != 0) {
+		clulog(LOG_CRIT, "Could not initialize status block!\n");
+		return -1;
+	}
+
+	/* Observe the other nodes for qc_tko cycles */
+	for (round = 1; round <= ctx->qc_tko; round++) {
+		read_node_blocks(ctx, ni, max);
+		check_transitions(ctx, ni, max, NULL);
+
+		if (qd_write_status(ctx, ctx->qc_my_id,
+				    S_INIT, NULL, NULL, NULL) != 0) {
+			clulog(LOG_CRIT, "Initialization failed\n");
+			return -1;
+		}
+
+		sleep(ctx->qc_interval);
+	}
+
+	get_my_score(&score,&maxscore);
+	clulog(LOG_INFO, "Initial score %d/%d\n", score, maxscore);
+	clulog(LOG_INFO, "Initialization complete\n");
+
+	return 0;
+}
+
+
+/**
+  Vote for a master if it puts a bid in: the first node (in array order)
+  that is in state S_RUN, is bidding, and has a lower ID than ours gets
+  an M_ACK stored in *msg.  *msg is untouched if there is no such node.
+ */
+void
+do_vote(qd_ctx *ctx, node_info_t *ni, int max, disk_msg_t *msg)
+{
+	int idx;
+
+	for (idx = 0; idx < max; idx++) {
+		status_block_t *blk = &ni[idx].ni_status;
+
+		if (ni[idx].ni_state == S_RUN &&
+		    blk->ps_msg == M_BID &&
+		    blk->ps_nodeid < ctx->qc_my_id) {
+			/* Vote for lowest bidding ID that is lower
+			   than us */
+			msg->m_msg = M_ACK;
+			msg->m_arg = blk->ps_nodeid;
+			msg->m_seq = blk->ps_seq;
+			break;
+		}
+	}
+}
+
+
+/*
+  Check to match nodes in mask with nodes online according to CMAN.
+  Only the master needs to do this.
+
+  master_mask is cleared and then gets a bit set for every node that is
+  both set in mask and reported as a member by CMAN.  If the CMAN query
+  fails, master_mask is left untouched.
+
+  memb_mask_t is an array type, so the parameters are really pointers:
+  sizeof(mask) would be the size of a pointer, not of the mask.  Always
+  use sizeof(memb_mask_t) here.
+ */
+void
+check_cman(qd_ctx *ctx, memb_mask_t mask, memb_mask_t master_mask)
+{
+	cman_node_t nodes[MAX_NODES_DISK];
+	int retnodes, x;
+
+	if (cman_get_nodes(ctx->qc_ch, MAX_NODES_DISK,
+			   &retnodes, nodes) <0 )
+		return;
+
+	memset(master_mask, 0, sizeof(memb_mask_t));
+
+	for (x = 0; x < retnodes; x++) {
+		if (is_bit_set(mask, nodes[x].cn_nodeid-1,
+			       sizeof(memb_mask_t)) &&
+		    nodes[x].cn_member)
+			set_bit(master_mask, nodes[x].cn_nodeid-1,
+				sizeof(memb_mask_t));
+	}
+}
+
+
+/*
+   Tally the responses to our own master bid.
+
+   returns:
+	3: every running node has ACKed us - we are the master.
+	2: at least one node NACKed us.
+	1: a node with a lower ID is also bidding; our bid should be
+	   rescinded.
+	0: nothing conclusive yet; keep the bid and wait another round.
+   Modifies:
+	*msg - overwritten with an ACK for the lowest-ID bidder seen,
+	whenever a bidder with an ID lower than ours is found.
+ */
+int
+check_votes(qd_ctx *ctx, node_info_t *ni, int max, disk_msg_t *msg)
+{
+	int idx, n_running = 0, n_ack = 0, n_nack = 0;
+	int lowest = ctx->qc_my_id;
+
+	for (idx = 0; idx < max; idx++) {
+		/* Only nodes in a running state take part in the vote */
+		if (!state_run(ni[idx].ni_state))
+			continue;
+		++n_running;
+
+		/* Count replies that are addressed to us */
+		if (ni[idx].ni_status.ps_arg == ctx->qc_my_id &&
+		    ni[idx].ni_status.ps_msg == M_ACK)
+			++n_ack;
+
+		if (ni[idx].ni_status.ps_arg == ctx->qc_my_id &&
+		    ni[idx].ni_status.ps_msg == M_NACK)
+			++n_nack;
+
+		/* A competing bid from a lower node ID: prepare our vote
+		   for it, tracking the lowest such ID seen so far */
+		if (ni[idx].ni_status.ps_msg == M_BID &&
+		    ni[idx].ni_status.ps_nodeid < lowest) {
+			lowest = ni[idx].ni_status.ps_nodeid;
+			msg->m_msg = M_ACK;
+			msg->m_arg = ni[idx].ni_status.ps_nodeid;
+			msg->m_seq = ni[idx].ni_status.ps_seq;
+		}
+	}
+
+	if (n_ack == n_running)
+		return 3;
+	if (n_nack)
+		return 2;
+	return (lowest != ctx->qc_my_id) ? 1 : 0;
+}
+
+
+/*
+  Map a disk node state to a human-readable name.  Values without a
+  case below yield "ILLEGAL".  The returned string is a literal and
+  must not be freed or modified.
+ */
+char *
+state_str(disk_node_state_t s)
+{
+	char *label = "ILLEGAL";
+
+	switch (s) {
+	case S_NONE:
+		label = "None";
+		break;
+	case S_EVICT:
+		label = "Evicted";
+		break;
+	case S_INIT:
+		label = "Initializing";
+		break;
+	case S_RUN:
+		label = "Running";
+		break;
+	case S_MASTER:
+		label = "Master";
+		break;
+	default:
+		break;
+	}
+
+	return label;
+}
+
+
+/*
+  Write a plain-text status report: node ID, scores, current states,
+  the visible node set and, when a master is known, the master's ID and
+  the set of nodes flagged in the master's mask.
+
+  Nothing is written when ctx->qc_status_file is unset.  A status file
+  name of "-" sends the report to stdout; any other name is opened with
+  mode "w+" and closed again before returning.
+ */
+void
+update_local_status(qd_ctx *ctx, node_info_t *ni, int max, int score,
+		    int score_req, int score_max)
+{
+	FILE *out;
+	int idx, opened_file;
+
+	if (!ctx->qc_status_file)
+		return;
+
+	opened_file = (strcmp(ctx->qc_status_file, "-") != 0);
+	if (opened_file) {
+		out = fopen(ctx->qc_status_file, "w+");
+		if (out == NULL)
+			return;
+	} else {
+		out = stdout;
+	}
+
+	fprintf(out, "Node ID: %d\n", ctx->qc_my_id);
+	fprintf(out, "Score (current / min req. / max allowed): %d / %d / %d\n",
+		score, score_req, score_max);
+	fprintf(out, "Current state: %s\n", state_str(ctx->qc_status));
+	fprintf(out, "Current disk state: %s\n",
+		state_str(ctx->qc_disk_status));
+
+	/* Nodes at S_RUN or above, plus ourselves regardless of state */
+	fprintf(out, "Visible Set: {");
+	for (idx = 0; idx < max; idx++) {
+		if (ni[idx].ni_state >= S_RUN ||
+		    ni[idx].ni_status.ps_nodeid == ctx->qc_my_id)
+			fprintf(out," %d", ni[idx].ni_status.ps_nodeid);
+	}
+	fprintf(out, " }\n");
+
+	if (ctx->qc_master) {
+		fprintf(out, "Master Node ID: %d\n", ctx->qc_master);
+
+		/* Nodes whose bit is set in the master's own mask */
+		fprintf(out, "Quorate Set: {");
+		for (idx = 0; idx < max; idx++) {
+			if (is_bit_set(
+			      ni[ctx->qc_master-1].ni_status.ps_master_mask,
+				       ni[idx].ni_status.ps_nodeid-1,
+				       sizeof(memb_mask_t))) {
+				fprintf(out," %d",
+					ni[idx].ni_status.ps_nodeid);
+			}
+		}
+		fprintf(out, " }\n");
+	} else {
+		fprintf(out, "No master node\n");
+	}
+
+	fprintf(out, "\n");
+	if (opened_file)
+		fclose(out);
+}
+
+
+
+/*
+  Main daemon loop.  Runs until the file-level _running flag is cleared
+  (presumably by the signal handler - it is not cleared in this function).
+
+  Each pass:
+    1. reads every node's status block and processes state transitions;
+    2. fetches the current heuristic score and downgrades to S_NONE or
+       upgrades to S_RUN depending on whether it meets the required
+       minimum (qc_scoremin, or half of the maximum score rounded up
+       when qc_scoremin is not positive);
+    3. looks for an existing master and bids, votes, or evaluates votes
+       as appropriate;
+    4. writes our status block to the quorum disk and refreshes the
+       local status file;
+    5. sleeps for qc_interval seconds.
+
+  Always returns 0.
+ */
+int
+quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
+{
+	disk_msg_t msg = {0, 0, 0};
+	int low_id, bid_pending = 0, score, score_max, score_req;
+	memb_mask_t mask, master_mask;
+
+	/* master_mask is only filled in by check_cman() while we are the
+	   master, but it is handed to qd_write_status() on every pass.
+	   Start both masks out zeroed so uninitialized stack contents are
+	   never passed along. */
+	memset(mask, 0, sizeof(mask));
+	memset(master_mask, 0, sizeof(master_mask));
+
+	ctx->qc_status = S_RUN;
+	
+	_running = 1;
+	while (_running) {
+		/* Read everyone else's status */
+		read_node_blocks(ctx, ni, max);
+
+		/* Check for node transitions */
+		check_transitions(ctx, ni, max, mask);
+
+		/* Check heuristics and remove ourself if necessary */
+		get_my_score(&score, &score_max);
+
+		score_req = ctx->qc_scoremin;
+		if (score_req <= 0)
+			score_req = ((score_max + 1) / 2);
+
+		if (score < score_req) {
+			clear_bit(mask, (ctx->qc_my_id-1), sizeof(mask));
+			if (ctx->qc_status > S_NONE) {
+				clulog(LOG_NOTICE,
+				       "Score insufficient for master "
+				       "operation (%d/%d; max=%d); "
+				       "downgrading\n",
+				       score, score_req, score_max);
+				ctx->qc_status = S_NONE;
+				msg.m_msg = M_NONE;
+				++msg.m_seq;
+				bid_pending = 0;
+				cman_poll_quorum_device(ctx->qc_ch, 0);
+				/* reboot??? */
+			}
+		}  else {
+			set_bit(mask, (ctx->qc_my_id-1), sizeof(mask));
+			if (ctx->qc_status == S_NONE) {
+				clulog(LOG_NOTICE,
+				       "Score sufficient for master "
+				       "operation (%d/%d; max=%d); "
+				       "upgrading\n",
+				       score, score_req, score_max);
+				ctx->qc_status = S_RUN;
+			}
+		}
+
+		/* Find master */
+		ctx->qc_master = master_exists(ctx, ni, max, &low_id);
+
+		/* Figure out what to do based on what we know */
+		if (!ctx->qc_master &&
+		    low_id == ctx->qc_my_id &&
+		    ctx->qc_status == S_RUN &&
+		    !bid_pending ) {
+			/*
+			   If there's no master, and we are the lowest node
+			   ID, make a bid to become master if we're not 
+			   already bidding.
+			 */
+
+			clulog(LOG_DEBUG,"Making bid for master\n");
+			msg.m_msg = M_BID;
+			++msg.m_seq;
+			bid_pending = 1;
+
+		} else if (!ctx->qc_master && !bid_pending) {
+
+			/* We're not the master, and we do not have a bid
+			   pending.  Check for voting on other nodes. */
+			do_vote(ctx, ni, max, &msg);
+		} else if (!ctx->qc_master && bid_pending) {
+
+			/* We're currently bidding for master.
+			   See if anyone's voted, or if we should
+			   rescind our bid */
+
+			/* Yes, those are all deliberate fallthroughs:
+			   3 (won) also clears the message and the bid,
+			   2 (nacked) clears the message and the bid,
+			   1 (lower bidder) only clears the bid, keeping
+			   the vote check_votes() stored in msg. */
+			switch (check_votes(ctx, ni, max, &msg)) {
+			case 3:
+				clulog(LOG_INFO,
+				       "Assuming master role\n");
+				ctx->qc_status = S_MASTER;
+			case 2:
+				msg.m_msg = M_NONE;
+			case 1:
+				bid_pending = 0;
+			default:
+				break;
+			}
+		} else if (ctx->qc_status == S_MASTER &&
+			   ctx->qc_master != ctx->qc_my_id) {
+			
+			/* We think we're master, but someone else claims
+			   that they are master. */
+
+			clulog(LOG_CRIT,
+			       "A master exists, but it's not me?!\n");
+			/* XXX Handle this how? Should not happen*/
+			/* reboot(RB_AUTOBOOT); */
+
+		} else if (ctx->qc_status == S_MASTER &&
+			   ctx->qc_master == ctx->qc_my_id) {
+
+			/* We are the master.  Poll the quorum device.
+			   We can't be the master unless we score high
+			   enough on our heuristics. */
+			check_cman(ctx, mask, master_mask);
+			cman_poll_quorum_device(ctx->qc_ch, 1);
+
+		} else if (ctx->qc_status == S_RUN && ctx->qc_master &&
+			   ctx->qc_master != ctx->qc_my_id) {
+
+			/* We're not the master, but a master exists
+			   Check to see if the master thinks we are 
+			   online.  If we are, tell CMAN so. */
+			if (is_bit_set(
+			      ni[ctx->qc_master-1].ni_status.ps_master_mask,
+				       ctx->qc_my_id-1,
+				       sizeof(memb_mask_t))) {
+				cman_poll_quorum_device(ctx->qc_ch, 1);
+			}
+		}
+		
+		/* Write out our status */
+		if (qd_write_status(ctx, ctx->qc_my_id, ctx->qc_status,
+				    &msg, mask, master_mask) != 0) {
+			clulog(LOG_ERR, "Error writing to quorum disk\n");
+		}
+
+		/* write out our local status */
+		update_local_status(ctx, ni, max, score, score_req, score_max);
+
+		/* Cycle. We could time the loop and sleep
+		   usleep(interval-looptime), but this is fine for now.*/
+		if (_running)
+			sleep(ctx->qc_interval);
+	}
+
+	return 0;
+}
+
+
+/**
+  Announce our departure by writing S_NONE into our own status block.
+  A failed write is logged as a warning; the function returns 0 either way.
+ */
+int
+quorum_logout(qd_ctx *ctx)
+{
+	if (qd_write_status(ctx, ctx->qc_my_id, S_NONE,
+			    NULL, NULL, NULL) == 0)
+		return 0;
+
+	clulog(LOG_WARNING,
+	       "Error writing to quorum disk during logout\n");
+	return 0;
+}
+
+
+/**
+  Fetch one CCS value as a string.  The path is copied into a local
+  query buffer before being handed to ccs_get().  Returns whatever
+  ccs_get() returns; on success *out holds the string obtained.
+ */
+static int
+qd_cfg_str(int ccsfd, const char *path, char **out)
+{
+	char query[256];
+
+	snprintf(query, sizeof(query), "%s", path);
+	return ccs_get(ccsfd, query, out);
+}
+
+
+/**
+  Fetch one CCS value and convert it with atoi().  Returns 0 and stores
+  the number in *out when the lookup succeeds; returns -1 and leaves
+  *out alone otherwise.  The string obtained from CCS is freed here.
+ */
+static int
+qd_cfg_int(int ccsfd, const char *path, int *out)
+{
+	char *str;
+
+	if (qd_cfg_str(ccsfd, path, &str) != 0)
+		return -1;
+
+	*out = atoi(str);
+	free(str);
+	return 0;
+}
+
+
+/**
+  Load the quorum daemon configuration from CCSD into ctx and h.
+
+  Defaults applied before lookup: interval 1, tko 10, min score 0.
+  Values read are clamped: interval >= 1, tko >= 3, votes >= 0,
+  min_score >= 0; a negative log level falls back to 4.  The log level
+  is only applied when debug is not set.  Strings stored in ctx
+  (device, label, status file) are kept as obtained from ccs_get().
+
+  *cfh receives the return value of configure_heuristics().
+  Returns 0 on success, -1 if the CCSD connection cannot be made.
+ */
+int
+get_config_data(char *cluster_name, qd_ctx *ctx, struct h_data *h, int maxh,
+		int *cfh, int debug)
+{
+	int ccsfd = -1, loglevel = 4, num;
+	char *str;
+
+	clulog(LOG_DEBUG, "Loading configuration information\n");
+
+	ccsfd = ccs_force_connect(cluster_name, 1);
+	if (ccsfd < 0) {
+		clulog(LOG_CRIT, "Connection to CCSD failed; cannot start\n");
+		return -1;
+	}
+
+	ctx->qc_interval = 1;
+	ctx->qc_tko = 10;
+	ctx->qc_scoremin = 0;
+
+	/* Get log facility */
+	if (qd_cfg_str(ccsfd, "/cluster/quorumd/@log_facility", &str) == 0) {
+		clu_set_facility(str);
+		free(str);
+	}
+
+	/* Get log level */
+	if (qd_cfg_int(ccsfd, "/cluster/quorumd/@log_level",
+		       &loglevel) == 0) {
+		if (loglevel < 0)
+			loglevel = 4;
+
+		if (!debug)
+			clu_set_loglevel(loglevel);
+	}
+
+	/* Get interval */
+	if (qd_cfg_int(ccsfd, "/cluster/quorumd/@interval", &num) == 0) {
+		ctx->qc_interval = num;
+		if (ctx->qc_interval < 1)
+			ctx->qc_interval = 1;
+	}
+
+	/* Get tko */
+	if (qd_cfg_int(ccsfd, "/cluster/quorumd/@tko", &num) == 0) {
+		ctx->qc_tko = num;
+		if (ctx->qc_tko < 3)
+			ctx->qc_tko = 3;
+	}
+
+	/* Get votes */
+	if (qd_cfg_int(ccsfd, "/cluster/quorumd/@votes", &num) == 0) {
+		ctx->qc_votes = num;
+		if (ctx->qc_votes < 0)
+			ctx->qc_votes = 0;
+	}
+
+	/* Get device */
+	if (qd_cfg_str(ccsfd, "/cluster/quorumd/@device", &str) == 0)
+		ctx->qc_device = str;
+
+	/* Get label (overrides device) */
+	if (qd_cfg_str(ccsfd, "/cluster/quorumd/@label", &str) == 0)
+		ctx->qc_label = str;
+
+	/* Get status file */
+	if (qd_cfg_str(ccsfd, "/cluster/quorumd/@status_file", &str) == 0)
+		ctx->qc_status_file = str;
+
+	/* Get min score */
+	if (qd_cfg_int(ccsfd, "/cluster/quorumd/@min_score", &num) == 0) {
+		ctx->qc_scoremin = num;
+		if (ctx->qc_scoremin < 0)
+			ctx->qc_scoremin = 0;
+	}
+
+	*cfh = configure_heuristics(ccsfd, h, maxh);
+
+	clulog(LOG_DEBUG,
+	       "Quorum Daemon: %d heuristics, %d interval, %d tko, %d votes\n",
+	       *cfh, ctx->qc_interval, ctx->qc_tko, ctx->qc_votes);
+
+	ccs_disconnect(ccsfd);
+
+	return 0;
+}
+
+
+/*
+  Quorum daemon entry point.
+
+  Options:
+	-d	debug: raise the log level to LOG_DEBUG
+	-f	foreground: log to the console and do not daemonize
+
+  Sequence: connect to CMAN, look up the local node ID, load the
+  configuration, resolve the quorum device (by label if one is
+  configured, otherwise by verifying the configured device), daemonize
+  unless -f was given, initialize, register the quorum device with CMAN
+  and run the main loop.  When the loop ends the device is unregistered,
+  a logout status is written and the context is destroyed.
+
+  Returns 0 on normal shutdown and -1 on any startup failure.
+ */
+int
+main(int argc, char **argv)
+{
+	cman_node_t me;
+	int cfh, rv;
+	qd_ctx ctx;
+	cman_handle_t ch;
+	node_info_t ni[MAX_NODES_DISK];
+	struct h_data h[10];
+	char debug = 0, foreground = 0;
+	char device[128];
+
+	while ((rv = getopt(argc, argv, "fd")) != EOF) {
+		switch (rv) {
+		case 'd':
+			debug = 1;
+			break;
+		case 'f':
+			foreground = 1;
+			break;
+		default:
+			break;
+		}
+	}
+#if (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION >= 2)
+	ch = cman_admin_init(NULL);
+#else
+	ch = cman_init(NULL);
+#endif
+	if (!ch) {
+		printf("Could not connect to cluster (CMAN not running?)\n");
+		return -1;
+	}
+
+	if (cman_get_node(ch, CMAN_NODEID_US, &me) < 0) {
+		printf("Could not determine local node ID; cannot start\n");
+		return -1;
+	}
+
+	qd_init(&ctx, ch, me.cn_nodeid);
+
+	signal(SIGINT, int_handler);
+
+	if (debug)
+		clu_set_loglevel(LOG_DEBUG);
+	if (foreground)
+		clu_log_console(1);
+
+	if (get_config_data(NULL, &ctx, h, 10, &cfh, debug) < 0) {
+		clulog_and_print(LOG_CRIT, "Configuration failed\n");
+		return -1;
+	}
+
+	if (ctx.qc_label) {
+		/* A label takes precedence: scan for the device carrying it */
+		if (find_partitions("/proc/partitions",
+				    ctx.qc_label, device,
+				    sizeof(device), 0) != 0) {
+			clulog_and_print(LOG_CRIT, "Unable to match label"
+					 " '%s' to any device\n",
+					 ctx.qc_label);
+			return -1;
+		}
+
+		if (ctx.qc_device)
+			free(ctx.qc_device);
+
+		ctx.qc_device = strdup(device);
+		if (!ctx.qc_device) {
+			clulog_and_print(LOG_CRIT, "Out of memory\n");
+			return -1;
+		}
+
+		clulog(LOG_INFO, "Quorum Partition: %s Label: %s\n",
+		       ctx.qc_device, ctx.qc_label);
+	} else if (ctx.qc_device) {
+		if (check_device(ctx.qc_device, NULL, NULL) != 0) {
+			clulog(LOG_CRIT,
+			       "Specified partition %s does not have a "
+			       "qdisk label\n", ctx.qc_device);
+			return -1;
+		}
+	} else {
+		/* Neither a label nor a device is configured, so there is
+		   nothing to open.  Fail here with a clear message instead
+		   of handing a NULL device name to quorum_init() after
+		   daemonizing. */
+		clulog_and_print(LOG_CRIT, "No quorum device or label "
+				 "specified; cannot start\n");
+		return -1;
+	}
+
+	if (!foreground)
+		daemon(0,0);
+
+	if (quorum_init(&ctx, ni, MAX_NODES_DISK, h, cfh) < 0) {
+		clulog_and_print(LOG_CRIT, "Initialization failed\n");
+		return -1;
+	}
+	
+	cman_register_quorum_device(ctx.qc_ch, ctx.qc_device, ctx.qc_votes);
+	/*
+		XXX this always returns -1 / EBUSY even when it works?!!!
+		
+	if ((rv = cman_register_quorum_device(ctx.qc_ch, ctx.qc_device,
+					      ctx.qc_votes)) < 0) {
+		clulog_and_print(LOG_CRIT,
+				 "Could not register %s with CMAN; "
+				 "return = %d; error = %s\n",
+				 ctx.qc_device, rv, strerror(errno));
+		return -1;
+	}
+	*/
+
+	quorum_loop(&ctx, ni, MAX_NODES_DISK);
+	cman_unregister_quorum_device(ctx.qc_ch);
+
+	quorum_logout(&ctx);
+
+	qd_destroy(&ctx);
+
+	return 0;
+
+}
+
/cvs/cluster/cluster/cman/qdisk/mkqdisk.c,v  -->  standard output
revision 1.3.2.1
--- cluster/cman/qdisk/mkqdisk.c
+++ -	2006-07-21 18:01:40.340826000 +0000
@@ -0,0 +1,93 @@
+/**
+  Copyright Red Hat, Inc. 2006
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the
+  Free Software Foundation; either version 2, or (at your option) any
+  later version.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; see the file COPYING.  If not, write to the
+  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
+  MA 02139, USA.
+
+  Author: Lon Hohberger <lhh@redhat.com>
+ */
+/**
+  @file Quorum disk utility
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <disk.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <platform.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+
+/*
+  mkqdisk entry point.
+
+  Options:
+	-L		list every quorum disk found via /proc/partitions
+	-f <label>	search for the quorum disk carrying <label>
+	-c <device>	device to initialize (requires -l)
+	-l <label>	label to write (requires -c)
+	-h		print usage
+
+  -L and -f act immediately and return the result of find_partitions().
+  Creating a label asks for confirmation on stdin first; anything other
+  than 'y' aborts and returns 0.  Otherwise the return value is that of
+  qdisk_init().
+ */
+int
+main(int argc, char **argv)
+{
+	char device[128];
+	char *newdev = NULL, *newlabel = NULL;
+	int rv;
+
+	printf("mkqdisk v0.5\n");
+
+	/* 'f' takes an argument (the label to look for), so it needs a
+	   trailing ':' - without it optarg would be NULL for -f */
+	while ((rv = getopt(argc, argv, "Lf:c:l:h")) != EOF) {
+		switch (rv) {
+		case 'L':
+			/* List; stderr is closed to hide scan noise */
+			close(2);
+			return find_partitions("/proc/partitions",
+					       NULL, NULL, 0, 1);
+		case 'f':
+			close(2);
+			return find_partitions("/proc/partitions",
+					       optarg, device,
+					       sizeof(device), 0);
+		case 'c':
+			newdev = optarg;
+			break;
+		case 'l':
+			newlabel = optarg;
+			break;
+		case 'h':
+			printf("usage: mkqdisk -L | -f <label> | -c "
+			       "<device> -l <label>\n");
+			return 0;
+		default:
+			break;
+		}
+	}
+
+	if (!newdev && !newlabel) {
+		printf("usage: mkqdisk -L | -f <label> | -c "
+		       "<device> -l <label>\n");
+		return 1;
+	}
+
+	if (!newdev || !newlabel) {
+		printf("Both a device and a label are required\n");
+		return 1;
+	}
+
+	printf("Writing new quorum disk label '%s' to %s.\n",
+	       newlabel, newdev);
+	printf("WARNING: About to destroy all data on %s; proceed [N/y] ? ",
+	       newdev);
+	if (getc(stdin) != 'y') {
+		printf("Good thinking.\n");
+		return 0;
+	}
+
+	return qdisk_init(newdev, newlabel);
+}
/cvs/cluster/cluster/cman/qdisk/platform.h,v  -->  standard output
revision 1.2.2.1
--- cluster/cman/qdisk/platform.h
+++ -	2006-07-21 18:01:40.435441000 +0000
@@ -0,0 +1,59 @@
+/*
+  Copyright Red Hat, Inc. 2002-2003
+
+  The Red Hat Cluster Manager API Library is free software; you can
+  redistribute it and/or modify it under the terms of the GNU Lesser
+  General Public License as published by the Free Software Foundation;
+  either version 2.1 of the License, or (at your option) any later
+  version.
+
+  The Red Hat Cluster Manager API Library is distributed in the hope
+  that it will be useful, but WITHOUT ANY WARRANTY; without even the
+  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+  PURPOSE.  See the GNU Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+  USA. 
+ */
+/** @file
+ * Defines for byte-swapping
+ */
+#ifndef __PLATFORM_H
+#define __PLATFORM_H
+
+#include <endian.h>
+#include <sys/param.h>
+#include <byteswap.h>
+#include <bits/wordsize.h>
+
+/*
+  le_swapNN: convert between host byte order and little-endian.
+  No swapping on little-endian machines.
+ */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define le_swap16(x) (x)
+#define le_swap32(x) (x)
+#define le_swap64(x) (x)
+#else
+#define le_swap16(x) bswap_16(x)
+#define le_swap32(x) bswap_32(x)
+#define le_swap64(x) bswap_64(x)
+#endif
+
+/*
+  be_swapNN: convert between host byte order and big-endian.
+  No swapping on big-endian machines.
+ */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define be_swap16(x) bswap_16(x)
+#define be_swap32(x) bswap_32(x)
+#define be_swap64(x) bswap_64(x)
+#else
+#define be_swap16(x) (x)
+#define be_swap32(x) (x)
+#define be_swap64(x) (x)
+#endif
+
+
+/*
+  In-place helpers: store be_swapNN(x) back into x.  x must be an
+  lvalue and is evaluated twice.  The expansion is fully parenthesized
+  so the macros stay correct when used inside a larger expression.
+ */
+#define swab16(x) ((x) = be_swap16(x))
+#define swab32(x) ((x) = be_swap32(x))
+#define swab64(x) ((x) = be_swap64(x))
+
+
+#endif /* __PLATFORM_H */
/cvs/cluster/cluster/cman/qdisk/proc.c,v  -->  standard output
revision 1.2.2.1
--- cluster/cman/qdisk/proc.c
+++ -	2006-07-21 18:01:40.521042000 +0000
@@ -0,0 +1,128 @@
+/**
+  Copyright Red Hat, Inc. 2006
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the
+  Free Software Foundation; either version 2, or (at your option) any
+  later version.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; see the file COPYING.  If not, write to the
+  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
+  MA 02139, USA.
+
+  Author: Lon Hohberger <lhh@redhat.com>
+ */
+/**
+  @file Quorum disk /proc/partition scanning functions
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <disk.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <platform.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+/*
+  Check whether a device carries a quorum disk header.
+
+  device - path of the device to examine.
+  label  - if non-NULL, the header's qh_cluster field must also match it.
+  qh     - if non-NULL, receives the header that was read (after
+           swab_quorum_header_t()); if NULL a local scratch header is used.
+
+  Returns 0 when the header was read in full, its magic number equals
+  HEADER_MAGIC_NUMBER and the label (if given) matches; -1 otherwise,
+  including when the device cannot be validated or opened.
+ */
+int
+check_device(char *device, char *label, quorum_header_t *qh)
+{
+	int fd = -1, ret = -1;
+	quorum_header_t qh_local;
+
+	if (!qh)
+		qh = &qh_local;
+
+	/* Only the success/failure of validation matters here; the file
+	   descriptor used below comes from qdisk_open() */
+	if (qdisk_validate(device) < 0) {
+		perror("qdisk_validate");
+		return -1;
+	}
+
+	fd = qdisk_open(device);
+	if (fd < 0) {
+		perror("qdisk_open");
+		return -1;
+	}
+
+	if (qdisk_read(fd, OFFSET_HEADER, qh, sizeof(*qh)) == sizeof(*qh)) {
+		swab_quorum_header_t(qh);
+		if (qh->qh_magic == HEADER_MAGIC_NUMBER) {
+			if (!label || !strcmp(qh->qh_cluster, label)) {
+				ret = 0;
+			}
+		}
+	}
+
+	qdisk_close(&fd);
+
+	return ret;
+}
+
+
+/*
+  Scan a /proc/partitions-style file for quorum disks.
+
+  partfile - file to scan (one "major minor #blocks name" entry per line).
+  label    - if non-NULL, only devices whose header matches this label
+             are accepted (see check_device()).
+  devname  - if non-NULL and devlen is non-zero, receives the path of the
+             first accepted device (always NUL-terminated) and the scan
+             stops there.
+  devlen   - size of devname in bytes.
+  print    - if non-zero, print the header details of every accepted
+             device to stdout.
+
+  Returns 0 when a device was stored in devname, or when print is set and
+  the whole file was scanned.  Returns -1 if partfile cannot be opened, or
+  (with errno set to ENOENT) if nothing was found and print is not set.
+ */
+int
+find_partitions(const char *partfile, const char *label,
+	        char *devname, size_t devlen, int print)
+{
+	char line[4096];
+	FILE *fp;
+	int minor, major;
+	unsigned long long blkcnt;
+	char device[128];
+	char realdev[256];
+	quorum_header_t qh;
+	time_t created;
+
+	fp = fopen(partfile, "r");
+	if (!fp)
+		return -1;
+
+	while (fgets(line, sizeof(line), fp) != NULL) {
+		if (strlen(line) > 128 + (22) /* 5 + 5 + 11 + 1 */) {
+			/*printf("Line too long!\n");*/
+			continue;
+		}
+
+		/* This line is taken from 2.6.15.4's proc line.
+		   Lines that do not carry all four fields (the column
+		   header, blank lines) are skipped, so 'device' is never
+		   read uninitialized or left over from a previous line.
+		   The name is bounded to fit device[]. */
+		if (sscanf(line, "%4d %4d %10llu %127s", &major, &minor,
+			   &blkcnt, device) != 4)
+			continue;
+
+		snprintf(realdev, sizeof(realdev), "/dev/%s", device);
+		if (check_device(realdev, (char *)label, &qh) != 0)
+			continue;
+
+		if (print) {
+			/* Copy the timestamp into a real time_t rather
+			   than reinterpreting the header field's storage,
+			   which is only valid if the two types have the
+			   same size and layout */
+			created = (time_t)qh.qh_timestamp;
+
+			printf("%s:\n", realdev);
+			printf("\tMagic:   %08x\n", qh.qh_magic);
+			printf("\tLabel:   %s\n", qh.qh_cluster);
+			printf("\tCreated: %s", ctime(&created));
+			printf("\tHost:    %s\n\n", qh.qh_updatehost);
+		}
+
+		if (devname && devlen) {
+			/* Got it.  strncpy() does not terminate the
+			   destination on truncation, so do it by hand. */
+			strncpy(devname, realdev, devlen - 1);
+			devname[devlen - 1] = '\0';
+			fclose(fp);
+			return 0;
+		}
+	}
+
+	fclose(fp);
+
+	if (print)
+		/* No errors if we're just printing stuff */
+		return 0;
+
+	errno = ENOENT;
+	return -1;
+}
/cvs/cluster/cluster/cman/qdisk/score.c,v  -->  standard output
revision 1.2.2.1
--- cluster/cman/qdisk/score.c
+++ -	2006-07-21 18:01:40.622758000 +0000
@@ -0,0 +1,383 @@
+/**
+  Copyright Red Hat, Inc. 2006
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the
+  Free Software Foundation; either version 2, or (at your option) any
+  later version.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; see the file COPYING.  If not, write to the
+  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
+  MA 02139, USA.
+
+  Author: Lon Hohberger <lhh@redhat.com>
+ */
+/**
+  @file Quorum daemon scoring functions + thread.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <string.h>
+#include <ccs.h>
+#include <clulog.h>
+#include "score.h"
+
+/* Held while _score and _maxscore are read (get_my_score) or updated
+   (score_thread_main) */
+static pthread_mutex_t sc_lock = PTHREAD_MUTEX_INITIALIZER;
+/* Latest totals published by the score thread, plus the flag that keeps
+   the score thread's loop running.  The flag is read and written without
+   taking sc_lock. */
+static int _score = 0, _maxscore = 0, _score_thread_running = 0;
+/* Handle of the score thread; filled in by start_score_thread() and
+   joined in stop_score_thread() */
+static pthread_t score_thread = (pthread_t)0;
+
+/* Argument bundle handed to score_thread_main(): a heap copy of the
+   heuristic table and the number of entries in it.  Both allocations
+   are freed by the thread when it exits. */
+struct h_arg {
+	struct h_data *h;
+	int count;
+};
+
+
+/*
+  Point stdin, stdout and stderr at /dev/null.
+
+  All three descriptors are closed first; /dev/null is then opened once
+  per descriptor (read-only for 0, write-only for 1 and 2).  If an open
+  does not land on the expected descriptor number, it is dup2()'d there.
+ */
+void
+nullify(void)
+{
+	const int mode[3] = { O_RDONLY, O_WRONLY, O_WRONLY };
+	int want, got;
+
+	for (want = 0; want < 3; want++)
+		close(want);
+
+	for (want = 0; want < 3; want++) {
+		got = open("/dev/null", mode[want]);
+		if (got != want)
+			dup2(got, want);
+	}
+}
+
+
+/**
+  Spin off a user-defined heuristic
+
+  Runs h->program through "/bin/sh -c" in a child process, at most once
+  every h->interval seconds.
+
+  Returns 0 if a child was started or the heuristic is not due yet.
+  Returns -1 with errno set to EINPROGRESS if a previous child has not
+  been reaped yet, or -1 if fork() fails.  Never returns in the child.
+ */
+static int
+fork_heuristic(struct h_data *h)
+{
+	int pid;
+	char *argv[4];
+	time_t now;
+
+	if (h->childpid) {	
+		errno = EINPROGRESS;
+		return -1;
+	}
+
+	now = time(NULL);
+	if (now < h->nextrun)
+		return 0;
+
+	h->nextrun = now + h->interval;
+
+	pid = fork();
+	if (pid < 0)
+		return -1;
+
+	if (pid) {
+		/* Parent: remember the child so check_heuristic() can
+		   reap it */
+		h->childpid = pid;
+		return 0;
+	}
+
+	/* Child */
+	argv[0] = "/bin/sh";
+	argv[1] = "-c";
+	argv[2] = h->program;
+	argv[3] = NULL;
+
+	nullify();
+
+	execv("/bin/sh", argv);
+
+	/* execv() only returns on failure.  The child must not return
+	   into the caller, or it would carry on running the parent's
+	   loop as a second copy of the daemon.  Exit immediately with a
+	   non-zero status so the heuristic is counted as failed; _exit()
+	   avoids flushing stdio buffers inherited from the parent. */
+	_exit(127);
+}
+
+
+/**
+  Add up the heuristic table: *maxscore receives the sum of every
+  entry's score, *score the sum over entries currently marked available.
+ */
+static void
+total_score(struct h_data *h, int max, int *score, int *maxscore)
+{
+	int idx = 0;
+	struct h_data *cur;
+
+	*score = 0;
+	*maxscore = 0;
+
+	while (idx < max) {
+		cur = &h[idx++];
+
+		*maxscore += cur->score;
+		if (!cur->available)
+			continue;
+		*score += cur->score;
+	}
+}
+
+
+/**
+  Check for response from a user-defined heuristic / script
+
+  h     - heuristic whose child process (if any) should be reaped.
+  block - non-zero to wait for the child to finish; zero to poll.
+
+  When the child has finished, h->childpid is cleared and h->available
+  is set to 1 only if the child exited normally with status 0; in every
+  other case it is set to 0.
+
+  Returns 0 if there is no child, the child is still running (poll
+  mode), or the child was reaped.  Returns -1 if waitpid() fails; the
+  heuristic is then marked unavailable and the child forgotten.
+ */
+static int
+check_heuristic(struct h_data *h, int block)
+{
+	int ret;
+	int status = 0;
+
+	if (h->childpid == 0)
+		return 0;
+
+	/* A signal may interrupt the wait; retry instead of treating
+	   that as a result and losing track of the child */
+	do {
+		ret = waitpid(h->childpid, &status, block?0:WNOHANG);
+	} while (ret < 0 && errno == EINTR);
+
+	if (!block && ret == 0)
+		return 0;
+
+	h->childpid = 0;
+	h->available = 0;
+
+	/* On any waitpid() failure 'status' was not filled in, so it
+	   must not be examined */
+	if (ret < 0)
+		return -1;
+	if (!WIFEXITED(status))
+		return 0;
+	if (WEXITSTATUS(status) != 0)
+		return 0;
+	h->available = 1;
+	return 0;
+}
+
+
+/**
+  Try to start every heuristic in the table.  Individual results from
+  fork_heuristic() are ignored; always returns 0.
+ */
+static int
+fork_heuristics(struct h_data *h, int max)
+{
+	int idx = 0;
+
+	while (idx < max) {
+		fork_heuristic(&h[idx]);
+		idx++;
+	}
+
+	return 0;
+}
+
+
+/**
+  Collect results for every heuristic in the table.  Individual results
+  from check_heuristic() are ignored; always returns 0.
+ */
+static int
+check_heuristics(struct h_data *h, int max, int block)
+{
+	int idx = 0;
+
+	while (idx < max) {
+		check_heuristic(&h[idx], block);
+		idx++;
+	}
+
+	return 0;
+}
+
+
+/**
+  Fill the heuristic table from CCS.
+
+  Entries are read from /cluster/quorumd/heuristic[N] for N = 1, 2, ...
+  until a program attribute is missing or max entries have been read.
+  Each entry starts with score 1 and interval 2; configured values that
+  are zero or negative fall back to those defaults.
+
+  Returns the number of heuristics loaded, or -1 if h is NULL or max
+  is zero.
+ */
+int
+configure_heuristics(int ccsfd, struct h_data *h, int max)
+{
+	int n = 0;
+	char *str;
+	char path[128];
+	struct h_data *cur;
+
+	if (!h || !max)
+		return -1;
+
+	for (;;) {
+		cur = &h[n];
+
+		/* Defaults; the slot is reset even if no program follows */
+		cur->program = NULL;
+		cur->available = 0;
+		cur->interval = 2;
+		cur->score = 1;
+		cur->childpid = 0;
+		cur->nextrun = 0;
+
+		/* Program: its absence marks the end of the list */
+		snprintf(path, sizeof(path),
+			 "/cluster/quorumd/heuristic[%d]/@program", n+1);
+		if (ccs_get(ccsfd, path, &str) != 0)
+			break;
+		cur->program = str;
+
+		/* Score */
+		snprintf(path, sizeof(path),
+			 "/cluster/quorumd/heuristic[%d]/@score", n+1);
+		if (ccs_get(ccsfd, path, &str) == 0) {
+			cur->score = atoi(str);
+			free(str);
+			if (cur->score <= 0)
+				cur->score = 1;
+		}
+
+		/* Interval between runs */
+		snprintf(path, sizeof(path),
+			 "/cluster/quorumd/heuristic[%d]/@interval", n+1);
+		if (ccs_get(ccsfd, path, &str) == 0) {
+			cur->interval = atoi(str);
+			free(str);
+			if (cur->interval <= 0)
+				cur->interval = 2;
+		}
+
+		clulog(LOG_DEBUG, "Heuristic: '%s' score=%d interval=%d\n",
+		       cur->program, cur->score, cur->interval);
+
+		if (++n >= max)
+			break;
+	}
+
+	clulog(LOG_DEBUG, "%d heuristics loaded\n", n);
+
+	return n;
+}
+
+
+/**
+  Hand the most recently published score and maximum score to the
+  caller.  Both values are sampled under sc_lock.  Always returns 0.
+ */
+int
+get_my_score(int *score, int *maxscore)
+{
+	int cur, top;
+
+	pthread_mutex_lock(&sc_lock);
+	cur = _score;
+	top = _maxscore;
+	pthread_mutex_unlock(&sc_lock);
+
+	*score = cur;
+	*maxscore = top;
+
+	return 0;
+}
+
+
+/**
+  Body of the scoring thread.
+
+  While _score_thread_running is set: start any heuristics that are
+  due, poll for finished ones, total the table and publish the totals
+  under sc_lock, then sleep one second.  On exit the heuristic table
+  copy and the argument structure passed in arg are freed.
+ */
+void *
+score_thread_main(void *arg)
+{
+	struct h_arg *ha = (struct h_arg *)arg;
+	int cur, top;
+
+	while (_score_thread_running) {
+		fork_heuristics(ha->h, ha->count);
+		check_heuristics(ha->h, ha->count, 0);
+		total_score(ha->h, ha->count, &cur, &top);
+
+		/* Publish both values together */
+		pthread_mutex_lock(&sc_lock);
+		_score = cur;
+		_maxscore = top;
+		pthread_mutex_unlock(&sc_lock);
+
+		/* Skip the pause if we were asked to stop meanwhile */
+		if (_score_thread_running)
+			sleep(1);
+	}
+
+	free(ha->h);
+	free(ha);
+	printf("Score thread going away\n");
+	return (NULL);
+}
+
+
+/**
+  Ask the score thread to stop and wait for it to finish.  Does nothing
+  if the thread is not marked as running.  Always returns 0.
+ */
+int
+stop_score_thread(void)
+{
+	if (_score_thread_running) {
+		/* The thread loop exits once it sees the cleared flag */
+		_score_thread_running = 0;
+		pthread_join(score_thread, NULL);
+	}
+
+	return 0;
+}
+
+
+/**
+  Start the score thread.  h is copied into an argument which is
+  passed in as the arg parameter in the score thread, so it is safe
+  to pass in h if it was allocated on the stack.
+
+  h     - heuristic table to copy.
+  count - number of entries in h.
+
+  Returns 0 once the thread has been created.  Returns -1 if h is NULL,
+  count is zero, memory cannot be allocated, or pthread_create() fails
+  (in which case errno carries its error code and the copies made here
+  are released).
+ */
+int
+start_score_thread(struct h_data *h, int count)
+{
+	pthread_attr_t attrs;
+	struct h_arg *args;
+	int rv;
+
+	if (!h || !count)
+		return -1;
+
+	args = malloc(sizeof(struct h_arg));
+	if (!args)
+		return -1;
+
+	args->h = malloc(sizeof(struct h_data) * count);
+	if (!args->h) {
+		free(args);
+		return -1;
+	}
+
+	memcpy(args->h, h, (sizeof(struct h_data) * count));
+	args->count = count;
+
+	/* Must be set before the thread starts, since the thread's loop
+	   tests it straight away */
+	_score_thread_running = 1;
+	pthread_attr_init(&attrs);
+	pthread_attr_setinheritsched(&attrs, PTHREAD_INHERIT_SCHED);
+	rv = pthread_create(&score_thread, &attrs, score_thread_main, args);
+	pthread_attr_destroy(&attrs);
+
+	/* pthread_create() reports failure through its return value; the
+	   thread handle is not meaningful when it fails */
+	if (rv == 0)
+		return 0;
+
+	/* No thread took ownership of the copies, so release them here */
+	_score_thread_running = 0;
+	free(args->h);
+	free(args);
+	errno = rv;
+	return -1;
+}
+
+
+#if 0
+/*
+  Disabled stand-alone test driver for the scoring code: loads up to 10
+  heuristics from CCS (connecting with the name "test"), starts the score
+  thread, prints the current score once a second for 10 seconds, stops
+  the thread and prints the final score.
+ */
+int
+main(int argc, char **argv)
+{
+	struct h_data h[10];
+	int max = 0, score, maxscore, ccsfd;
+
+	ccsfd = ccs_force_connect("test", 1);
+	if (ccsfd < 0) 
+		return -1;
+	max = configure_heuristics(ccsfd, h, 10);
+	ccs_disconnect(ccsfd);
+	
+	start_score_thread(h, max);
+	/* max is reused from here on as the loop counter */
+	max = 0;
+	while (max < 10) {
+		get_my_score(&score,&maxscore);
+		printf("current %d/%d\n", score, maxscore);
+		sleep(1);
+		++max;
+	}
+	stop_score_thread();
+
+	get_my_score(&score,&maxscore);
+	printf("final! %d/%d\n", score, maxscore);
+
+	return 0;
+}
+#endif
+
/cvs/cluster/cluster/cman/qdisk/score.h,v  -->  standard output
revision 1.2.2.1
--- cluster/cman/qdisk/score.h
+++ -	2006-07-21 18:01:40.712579000 +0000
@@ -0,0 +1,60 @@
+/**
+  Copyright Red Hat, Inc. 2006
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the
+  Free Software Foundation; either version 2, or (at your option) any
+  later version.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; see the file COPYING.  If not, write to the
+  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
+  MA 02139, USA.
+
+  Author: Lon Hohberger <lhh@redhat.com>
+ */
+/**
+  @file Quorum daemon scoring functions + thread header file
+ */
+#ifndef _SCORE_H
+#define _SCORE_H
+
+#include <time.h>
+#include <sys/time.h>
+#include <sys/types.h>
+
+/*
+   One configured heuristic and its run-time state.
+ */
+struct h_data {
+	char *	program;	/* command line, run via "/bin/sh -c" */
+	int	score;		/* points counted toward the maximum score,
+				   and toward the current score while
+				   'available' is set */
+	int	available;	/* 1 if the last completed run exited with
+				   status 0, otherwise 0 */
+	int	interval;	/* seconds added to the current time to
+				   compute 'nextrun' when a run is started */
+	pid_t	childpid;	/* pid of the running child; 0 if none */
+	time_t	nextrun;	/* earliest time the next run may start */
+};
+
+/*
+   Grab score data from CCSD
+ */
+int configure_heuristics(int ccsfd, struct h_data *hp, int max);
+
+/* 
+   Stop the thread which runs the scoring applets.
+ */
+int stop_score_thread(void);
+
+/*
+   Start the thread which runs the scoring applets
+ */
+int start_score_thread(struct h_data *h, int count);
+
+/* 
+   Get our score + maxscore
+ */
+int get_my_score(int *score, int *maxscore);
+
+#endif



                 reply	other threads:[~2006-07-21 18:01 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20060721180140.1996.qmail@sourceware.org \
    --to=lhh@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.