[Cluster-devel] cluster/group/dlm_controld Makefile action.c d ...

All of lore.kernel.org
 help / color / mirror / Atom feed

From: teigland@sourceware.org <teigland@sourceware.org>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] cluster/group/dlm_controld Makefile action.c d ...
Date: 26 Oct 2007 21:33:28 -0000	[thread overview]
Message-ID: <20071026213328.7264.qmail@sourceware.org> (raw)

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL5
Changes by:	teigland at sourceware.org	2007-10-26 21:33:27

Modified files:
	group/dlm_controld: Makefile action.c dlm_daemon.h group.c 
	                    main.c member_cman.c 
Added files:
	group/dlm_controld: deadlock.c 

Log message:
	add all the deadlock code from HEAD, and build it into the daemon,
	but make it impossible to enable

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/dlm_controld/deadlock.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=NONE&r2=1.7.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/dlm_controld/Makefile.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.4&r2=1.4.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/dlm_controld/action.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.8.2.5&r2=1.8.2.6
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/dlm_controld/dlm_daemon.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.5.2.4&r2=1.5.2.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/dlm_controld/group.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.3&r2=1.3.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/dlm_controld/main.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.6.2.5&r2=1.6.2.6
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/dlm_controld/member_cman.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.4.2.4&r2=1.4.2.5

/cvs/cluster/cluster/group/dlm_controld/deadlock.c,v  -->  standard output
revision 1.7.2.1
--- cluster/group/dlm_controld/deadlock.c
+++ -	2007-10-26 21:33:28.107576000 +0000
@@ -0,0 +1,1868 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2007 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_daemon.h"
+#include "libdlm.h"
+
+int deadlock_enabled = 0;
+
+extern struct list_head lockspaces;
+extern int our_nodeid;
+
+static SaCkptHandleT global_ckpt_h;
+static SaCkptCallbacksT callbacks = { 0, 0 };
+static SaVersionT version = { 'B', 1, 1 };
+static char section_buf[10 * 1024 * 1024];  /* 10MB of pack_lock's enough? */
+static uint32_t section_len;
+static uint32_t section_max;
+
+struct node {
+	struct list_head	list;
+	int			nodeid;
+	int			checkpoint_ready; /* we've read its ckpt */
+	int			in_cycle;         /* participating in cycle */
+};
+
+enum {
+	LOCAL_COPY = 1,
+	MASTER_COPY = 2,
+};
+
+/* from linux/fs/dlm/dlm_internal.h */
+#define DLM_LKSTS_WAITING       1
+#define DLM_LKSTS_GRANTED       2
+#define DLM_LKSTS_CONVERT       3
+
+struct pack_lock {
+	uint64_t		xid;
+	uint32_t		id;
+	int			nodeid;
+	uint32_t		remid;
+	int			ownpid;
+	uint32_t		exflags;
+	uint32_t		flags;
+	int8_t			status;
+	int8_t			grmode;
+	int8_t			rqmode;
+	int8_t			copy;
+};
+
+struct dlm_rsb {
+	struct list_head	list;
+	struct list_head	locks;
+	char			name[DLM_RESNAME_MAXLEN];
+	int			len;
+};
+
+/* information is saved in the lkb, and lkb->lock, from the perspective of the
+   local or master copy, not the process copy */
+
+struct dlm_lkb {
+	struct list_head        list;       /* r->locks */
+	struct pack_lock	lock;       /* data from debugfs/checkpoint */
+	int			home;       /* node where the lock owner lives*/
+	struct dlm_rsb		*rsb;       /* lock is on resource */
+	struct trans		*trans;     /* lock owned by this transaction */
+	struct list_head	trans_list; /* tr->locks */
+	struct trans		*waitfor_trans; /* the trans that's holding the
+						   lock that's blocking us */
+};
+
+/* waitfor pointers alloc'ed 4 at at time */
+#define TR_NALLOC		4
+
+struct trans {
+	struct list_head	list;
+	struct list_head	locks;
+	uint64_t		xid;
+	int			others_waiting_on_us; /* count of trans's
+							 pointing to us in
+							 waitfor */
+	int			waitfor_alloc;
+	int			waitfor_count;        /* count of in-use
+							 waitfor slots and
+						         num of trans's we're
+						         waiting on */
+	struct trans		**waitfor;	      /* waitfor_alloc trans
+							 pointers */
+};
+
+#define DLM_HEADER_MAJOR	1
+#define DLM_HEADER_MINOR	0
+#define DLM_HEADER_PATCH	0
+
+#define DLM_MSG_CYCLE_START	 1
+#define DLM_MSG_CYCLE_END	 2
+#define DLM_MSG_CHECKPOINT_READY 3
+#define DLM_MSG_CANCEL_LOCK	 4
+
+struct dlm_header {
+	uint16_t		version[3];
+	uint16_t		type; /* MSG_ */
+	uint32_t		nodeid; /* sender */
+	uint32_t		to_nodeid; /* 0 if to all */
+	uint32_t		global_id;
+	uint32_t		lkid;
+	uint32_t		pad;
+	char			name[MAXNAME];
+};
+
+static const int __dlm_compat_matrix[8][8] = {
+      /* UN NL CR CW PR PW EX PD */
+        {1, 1, 1, 1, 1, 1, 1, 0},       /* UN */
+        {1, 1, 1, 1, 1, 1, 1, 0},       /* NL */
+        {1, 1, 1, 1, 1, 1, 0, 0},       /* CR */
+        {1, 1, 1, 1, 0, 0, 0, 0},       /* CW */
+        {1, 1, 1, 0, 1, 0, 0, 0},       /* PR */
+        {1, 1, 1, 0, 0, 0, 0, 0},       /* PW */
+        {1, 1, 0, 0, 0, 0, 0, 0},       /* EX */
+        {0, 0, 0, 0, 0, 0, 0, 0}        /* PD */
+};
+
+static inline int dlm_modes_compat(int mode1, int mode2)
+{
+	return __dlm_compat_matrix[mode1 + 1][mode2 + 1];
+}
+
+static char *status_str(int lksts)
+{
+	switch (lksts) {
+	case DLM_LKSTS_WAITING:
+		return "W";
+	case DLM_LKSTS_GRANTED:
+		return "G";
+	case DLM_LKSTS_CONVERT:
+		return "C";
+	}
+	return "?";
+}
+
+static void free_resources(struct lockspace *ls)
+{
+	struct dlm_rsb *r, *r_safe;
+	struct dlm_lkb *lkb, *lkb_safe;
+
+	list_for_each_entry_safe(r, r_safe, &ls->resources, list) {
+		list_for_each_entry_safe(lkb, lkb_safe, &r->locks, list) {
+			list_del(&lkb->list);
+			if (!list_empty(&lkb->trans_list))
+				list_del(&lkb->trans_list);
+			free(lkb);
+		}
+		list_del(&r->list);
+		free(r);
+	}
+}
+
+static void free_transactions(struct lockspace *ls)
+{
+	struct trans *tr, *tr_safe;
+
+	list_for_each_entry_safe(tr, tr_safe, &ls->transactions, list) {
+		list_del(&tr->list);
+		if (tr->waitfor)
+			free(tr->waitfor);
+		free(tr);
+	}
+}
+
+static void disable_deadlock(void)
+{
+	log_error("FIXME: deadlock detection disabled");
+}
+
+void setup_deadlock(void)
+{
+	SaAisErrorT rv;
+
+	if (!deadlock_enabled)
+		return;
+
+	rv = saCkptInitialize(&global_ckpt_h, &callbacks, &version);
+	if (rv != SA_AIS_OK)
+		log_error("ckpt init error %d", rv);
+}
+
+/* FIXME: use private data hooks into libcpg to save ls */
+
+static struct lockspace *find_ls_by_handle(cpg_handle_t h)
+{
+	struct lockspace *ls;
+
+	list_for_each_entry(ls, &lockspaces, list) {
+		if (ls->cpg_h == h)
+			return ls;
+	}
+	return NULL;
+}
+
+static struct dlm_rsb *get_resource(struct lockspace *ls, char *name, int len)
+{
+	struct dlm_rsb *r;
+
+	list_for_each_entry(r, &ls->resources, list) {
+		if (r->len == len && !strncmp(r->name, name, len))
+			return r;
+	}
+
+	r = malloc(sizeof(struct dlm_rsb));
+	if (!r) {
+		log_error("get_resource: no memory");
+		disable_deadlock();
+		return NULL;
+	}
+	memset(r, 0, sizeof(struct dlm_rsb));
+	memcpy(r->name, name, len);
+	r->len = len;
+	INIT_LIST_HEAD(&r->locks);
+	list_add(&r->list, &ls->resources);
+	return r;
+}
+
+static struct dlm_lkb *create_lkb(void)
+{
+	struct dlm_lkb *lkb;
+
+	lkb = malloc(sizeof(struct dlm_lkb));
+	if (!lkb) {
+		log_error("create_lkb: no memory");
+		disable_deadlock();
+	} else {
+		memset(lkb, 0, sizeof(struct dlm_lkb));
+		INIT_LIST_HEAD(&lkb->list);
+		INIT_LIST_HEAD(&lkb->trans_list);
+	}
+	return lkb;
+}
+
+static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	list_add(&lkb->list, &r->locks);
+	lkb->rsb = r;
+}
+
+/* from linux/fs/dlm/dlm_internal.h */
+#define IFL_MSTCPY 0x00010000
+
+/* called on a lock that's just been read from debugfs */
+
+static void set_copy(struct pack_lock *lock)
+{
+	uint32_t id, remid;
+
+	if (!lock->nodeid)
+		lock->copy = LOCAL_COPY;
+	else if (lock->flags & IFL_MSTCPY)
+		lock->copy = MASTER_COPY;
+	else {
+		/* process copy lock is converted to a partial master copy
+		   lock that will be combined with the real master copy */
+		lock->copy = MASTER_COPY;
+		id = lock->id;
+		remid = lock->remid;
+		lock->id = remid;
+		lock->remid = id;
+		lock->nodeid = our_nodeid;
+	}
+}
+
+/* xid is always zero in the real master copy, xid should always be non-zero
+   in the partial master copy (what was a process copy) */
+/* TODO: confirm or enforce that the partial will always have non-zero xid */
+
+static int partial_master_copy(struct pack_lock *lock)
+{
+	return (lock->xid != 0);
+}
+
+static struct dlm_lkb *get_lkb(struct dlm_rsb *r, struct pack_lock *lock)
+{
+	struct dlm_lkb *lkb;
+
+	if (lock->copy != MASTER_COPY)
+		goto out;
+
+	list_for_each_entry(lkb, &r->locks, list) {
+		if (lkb->lock.nodeid == lock->nodeid &&
+		    lkb->lock.id == lock->id)
+			return lkb;
+	}
+ out:
+	return create_lkb();
+}
+
+static struct dlm_lkb *add_lock(struct lockspace *ls, struct dlm_rsb *r,
+				int from_nodeid, struct pack_lock *lock)
+{
+	struct dlm_lkb *lkb;
+
+	lkb = get_lkb(r, lock);
+	if (!lkb)
+		return NULL;
+
+	switch (lock->copy) {
+	case LOCAL_COPY:
+		lkb->lock.xid     = lock->xid;
+		lkb->lock.nodeid  = lock->nodeid;
+		lkb->lock.id      = lock->id;
+		lkb->lock.remid   = lock->remid;
+		lkb->lock.ownpid  = lock->ownpid;
+		lkb->lock.exflags = lock->exflags;
+		lkb->lock.flags   = lock->flags;
+		lkb->lock.status  = lock->status;
+		lkb->lock.grmode  = lock->grmode;
+		lkb->lock.rqmode  = lock->rqmode;
+		lkb->lock.copy    = LOCAL_COPY;
+		lkb->home = from_nodeid;
+
+		log_group(ls, "add %s local nodeid %d id %x remid %x xid %llx",
+			  r->name, lock->nodeid, lock->id, lock->remid,
+			  (unsigned long long)lock->xid);
+		break;
+
+	case MASTER_COPY:
+		if (partial_master_copy(lock)) {
+			lkb->lock.xid     = lock->xid;
+			lkb->lock.nodeid  = lock->nodeid;
+			lkb->lock.id      = lock->id;
+			lkb->lock.remid   = lock->remid;
+			lkb->lock.copy    = MASTER_COPY;
+		} else {
+			/* only set xid from partial master copy above */
+			lkb->lock.nodeid  = lock->nodeid;
+			lkb->lock.id      = lock->id;
+			lkb->lock.remid   = lock->remid;
+			lkb->lock.copy    = MASTER_COPY;
+			/* set other fields from real master copy */
+			lkb->lock.ownpid  = lock->ownpid;
+			lkb->lock.exflags = lock->exflags;
+			lkb->lock.flags   = lock->flags;
+			lkb->lock.status  = lock->status;
+			lkb->lock.grmode  = lock->grmode;
+			lkb->lock.rqmode  = lock->rqmode;
+		}
+		lkb->home = lock->nodeid;
+
+		log_group(ls, "add %s master nodeid %d id %x remid %x xid %llx",
+			  r->name, lock->nodeid, lock->id, lock->remid,
+			  (unsigned long long)lock->xid);
+		break;
+	}
+
+	if (list_empty(&lkb->list))
+		add_lkb(r, lkb);
+	return lkb;
+}
+
+static void parse_r_name(char *line, char *name)
+{
+	char *p;
+	int i = 0;
+	int begin = 0;
+
+	for (p = line; ; p++) {
+		if (*p == '"') {
+			if (begin)
+				break;
+			begin = 1;
+			continue;
+		}
+		if (begin)
+			name[i++] = *p;
+	}
+}
+
+#define LOCK_LINE_MAX 1024
+
+/* old/original way of dumping (only master state) in 5.1 kernel;
+   does deadlock detection based on pid instead of xid */
+
+static int read_debugfs_master(struct lockspace *ls)
+{
+	FILE *file;
+	char path[PATH_MAX];
+	char line[LOCK_LINE_MAX];
+	struct dlm_rsb *r;
+	struct dlm_lkb *lkb;
+	struct pack_lock lock;
+	char r_name[65];
+	unsigned long long xid;
+	unsigned int waiting;
+	int r_len;
+	int rv;
+
+	snprintf(path, PATH_MAX, "/sys/kernel/debug/dlm/%s_master", ls->name);
+
+	file = fopen(path, "r");
+	if (!file)
+		return -1;
+
+	/* skip the header on the first line */
+	fgets(line, LOCK_LINE_MAX, file);
+
+	while (fgets(line, LOCK_LINE_MAX, file)) {
+		memset(&lock, 0, sizeof(struct pack_lock));
+
+		rv = sscanf(line, "%x %d %x %u %llu %x %hhd %hhd %hhd %u %d",
+			    &lock.id,
+			    &lock.nodeid,
+			    &lock.remid,
+			    &lock.ownpid,
+			    &xid,
+			    &lock.exflags,
+			    &lock.status,
+			    &lock.grmode,
+			    &lock.rqmode,
+			    &waiting,
+			    &r_len);
+
+		if (rv != 11) {
+			log_error("invalid debugfs line %d: %s", rv, line);
+			goto out;
+		}
+
+		memset(r_name, 0, sizeof(r_name));
+		parse_r_name(line, r_name);
+
+		r = get_resource(ls, r_name, r_len);
+		if (!r)
+			break;
+
+		/* we want lock.xid to be zero before calling add_lock
+		   so it will treat this like the full master copy (not
+		   partial).  then set the xid manually at the end to
+		   ownpid (there will be no process copy to merge and
+		   get the xid from in 5.1) */
+
+		set_copy(&lock);
+		lkb = add_lock(ls, r, our_nodeid, &lock);
+		if (!lkb)
+			break;
+		lkb->lock.xid = lock.ownpid;
+	}
+ out:
+	fclose(file);
+	return 0;
+}
+
+static int read_debugfs_locks(struct lockspace *ls)
+{
+	FILE *file;
+	char path[PATH_MAX];
+	char line[LOCK_LINE_MAX];
+	struct dlm_rsb *r;
+	struct pack_lock lock;
+	char r_name[65];
+	unsigned long long xid;
+	unsigned int waiting;
+	int r_nodeid;
+	int r_len;
+	int rv;
+
+	snprintf(path, PATH_MAX, "/sys/kernel/debug/dlm/%s_locks", ls->name);
+
+	file = fopen(path, "r");
+	if (!file)
+		return -1;
+
+	/* skip the header on the first line */
+	fgets(line, LOCK_LINE_MAX, file);
+
+	while (fgets(line, LOCK_LINE_MAX, file)) {
+		memset(&lock, 0, sizeof(struct pack_lock));
+
+		rv = sscanf(line, "%x %d %x %u %llu %x %x %hhd %hhd %hhd %u %d %d",
+			    &lock.id,
+			    &lock.nodeid,
+			    &lock.remid,
+			    &lock.ownpid,
+			    &xid,
+			    &lock.exflags,
+			    &lock.flags,
+			    &lock.status,
+			    &lock.grmode,
+			    &lock.rqmode,
+			    &waiting,
+			    &r_nodeid,
+			    &r_len);
+
+		lock.xid = xid; /* hack to avoid warning */
+
+		if (rv != 13) {
+			log_error("invalid debugfs line %d: %s", rv, line);
+			goto out;
+		}
+
+		memset(r_name, 0, sizeof(r_name));
+		parse_r_name(line, r_name);
+
+		r = get_resource(ls, r_name, r_len);
+		if (!r)
+			break;
+
+		set_copy(&lock);
+		add_lock(ls, r, our_nodeid, &lock);
+	}
+ out:
+	fclose(file);
+	return 0;
+}
+
+static int read_checkpoint_locks(struct lockspace *ls, int from_nodeid,
+			         char *numbuf, int buflen)
+{
+	struct dlm_rsb *r;
+	struct pack_lock *lock;
+	int count = section_len / sizeof(struct pack_lock);
+	int i;
+
+	r = get_resource(ls, numbuf, buflen - 1);
+	if (!r)
+		return -1;
+
+	lock = (struct pack_lock *) &section_buf;
+
+	for (i = 0; i < count; i++) {
+		lock->xid     = le64_to_cpu(lock->xid);
+		lock->id      = le32_to_cpu(lock->id);
+		lock->nodeid  = le32_to_cpu(lock->nodeid);
+		lock->remid   = le32_to_cpu(lock->remid);
+		lock->ownpid  = le32_to_cpu(lock->ownpid);
+		lock->exflags = le32_to_cpu(lock->exflags);
+		lock->flags   = le32_to_cpu(lock->flags);
+
+		add_lock(ls, r, from_nodeid, lock);
+		lock++;
+	}
+	return 0;
+}
+
+static int pack_lkb_list(struct list_head *q, struct pack_lock **lockp)
+{
+	struct dlm_lkb *lkb;
+	struct pack_lock *lock = *lockp;
+	int count = 0;
+
+	list_for_each_entry(lkb, q, list) {
+		if (count + 1 > section_max) {
+			log_error("too many locks %d for ckpt buf", count);
+			break;
+		}
+
+		lock->xid     = cpu_to_le64(lkb->lock.xid);
+		lock->id      = cpu_to_le32(lkb->lock.id);
+		lock->nodeid  = cpu_to_le32(lkb->lock.nodeid);
+		lock->remid   = cpu_to_le32(lkb->lock.remid);
+		lock->ownpid  = cpu_to_le32(lkb->lock.ownpid);
+		lock->exflags = cpu_to_le32(lkb->lock.exflags);
+		lock->flags   = cpu_to_le32(lkb->lock.flags);
+		lock->status  = lkb->lock.status;
+		lock->grmode  = lkb->lock.grmode;
+		lock->rqmode  = lkb->lock.rqmode;
+		lock->copy    = lkb->lock.copy;
+
+		lock++;
+		count++;
+	}
+	return count;
+}
+
+static void pack_section_buf(struct lockspace *ls, struct dlm_rsb *r)
+{
+	struct pack_lock *lock;
+	int count;
+
+	memset(&section_buf, 0, sizeof(section_buf));
+	section_max = sizeof(section_buf) / sizeof(struct pack_lock);
+
+	lock = (struct pack_lock *) &section_buf;
+
+	count = pack_lkb_list(&r->locks, &lock);
+
+	section_len = count * sizeof(struct pack_lock);
+}
+
+static int _unlink_checkpoint(struct lockspace *ls, SaNameT *name)
+{
+	SaCkptCheckpointHandleT h;
+	SaCkptCheckpointDescriptorT s;
+	SaAisErrorT rv;
+	int ret = 0;
+	int retries;
+
+	h = (SaCkptCheckpointHandleT) ls->lock_ckpt_handle;
+	log_group(ls, "unlink ckpt %llx", (unsigned long long)h);
+
+	retries = 0;
+ unlink_retry:
+	rv = saCkptCheckpointUnlink(global_ckpt_h, name);
+	if (rv == SA_AIS_ERR_TRY_AGAIN) {
+		log_group(ls, "unlink ckpt retry");
+		sleep(1);
+		if (retries++ < 10)
+			goto unlink_retry;
+	}
+	if (rv == SA_AIS_OK)
+		goto out_close;
+	if (!h)
+		goto out;
+
+	log_error("unlink ckpt error %d %s", rv, ls->name);
+	ret = -1;
+
+	retries = 0;
+ status_retry:
+	rv = saCkptCheckpointStatusGet(h, &s);
+	if (rv == SA_AIS_ERR_TRY_AGAIN) {
+		log_group(ls, "unlink ckpt status retry");
+		sleep(1);
+		if (retries++ < 10)
+			goto status_retry;
+	}
+	if (rv != SA_AIS_OK) {
+		log_error("unlink ckpt status error %d %s", rv, ls->name);
+		goto out_close;
+	}
+
+	log_group(ls, "unlink ckpt status: size %llu, max sections %u, "
+		      "max section size %llu, section count %u, mem %u",
+		 (unsigned long long)s.checkpointCreationAttributes.checkpointSize,
+		 s.checkpointCreationAttributes.maxSections,
+		 (unsigned long long)s.checkpointCreationAttributes.maxSectionSize,
+		 s.numberOfSections, s.memoryUsed);
+
+ out_close:
+	retries = 0;
+ close_retry:
+	rv = saCkptCheckpointClose(h);
+	if (rv == SA_AIS_ERR_TRY_AGAIN) {
+		log_group(ls, "unlink ckpt close retry");
+		sleep(1);
+		if (retries++ < 10)
+			goto close_retry;
+	}
+	if (rv != SA_AIS_OK) {
+		log_error("unlink ckpt %llx close err %d %s",
+			  (unsigned long long)h, rv, ls->name);
+	}
+ out:
+	ls->lock_ckpt_handle = 0;
+	return ret;
+}
+
+static int unlink_checkpoint(struct lockspace *ls)
+{
+	SaNameT name;
+	int len;
+
+	len = snprintf((char *)name.value, SA_MAX_NAME_LENGTH, "dlmdeadlk.%s.%d",
+		       ls->name, our_nodeid);
+	name.length = len;
+
+	return _unlink_checkpoint(ls, &name);
+}
+
+static void read_checkpoint(struct lockspace *ls, int nodeid)
+{
+	SaCkptCheckpointHandleT h;
+	SaCkptSectionIterationHandleT itr;
+	SaCkptSectionDescriptorT desc;
+	SaCkptIOVectorElementT iov;
+	SaNameT name;
+	SaAisErrorT rv;
+	char buf[DLM_RESNAME_MAXLEN];
+	int len;
+	int retries;
+
+	if (nodeid == our_nodeid)
+		return;
+
+	log_group(ls, "read_checkpoint %d", nodeid);
+
+	len = snprintf((char *)name.value, SA_MAX_NAME_LENGTH, "dlmdeadlk.%s.%d",
+		       ls->name, nodeid);
+	name.length = len;
+
+	retries = 0;
+ open_retry:
+	rv = saCkptCheckpointOpen(global_ckpt_h, &name, NULL,
+				  SA_CKPT_CHECKPOINT_READ, 0, &h);
+	if (rv == SA_AIS_ERR_TRY_AGAIN) {
+		log_group(ls, "read_checkpoint: %d ckpt open retry", nodeid);
+		sleep(1);
+		if (retries++ < 10)
+			goto open_retry;
+	}
+	if (rv != SA_AIS_OK) {
+		log_error("read_checkpoint: %d ckpt open error %d", nodeid, rv);
+		return;
+	}
+
+	retries = 0;
+ init_retry:
+	rv = saCkptSectionIterationInitialize(h, SA_CKPT_SECTIONS_ANY, 0, &itr);
+	if (rv == SA_AIS_ERR_TRY_AGAIN) {
+		log_group(ls, "read_checkpoint: ckpt iterinit retry");
+		sleep(1);
+		if (retries++ < 10)
+			goto init_retry;
+	}
+	if (rv != SA_AIS_OK) {
+		log_error("read_checkpoint: %d ckpt iterinit error %d", nodeid, rv);
+		goto out;
+	}
+
+	while (1) {
+		retries = 0;
+	 next_retry:
+		rv = saCkptSectionIterationNext(itr, &desc);
+		if (rv == SA_AIS_ERR_NO_SECTIONS)
+			break;
+		if (rv == SA_AIS_ERR_TRY_AGAIN) {
+			log_group(ls, "read_checkpoint: ckpt iternext retry");
+			sleep(1);
+			if (retries++ < 10)
+				goto next_retry;
+		}
+		if (rv != SA_AIS_OK) {
+			log_error("read_checkpoint: %d ckpt iternext error %d",
+				  nodeid, rv);
+			goto out_it;
+		}
+
+		if (!desc.sectionSize)
+			continue;
+
+		iov.sectionId = desc.sectionId;
+		iov.dataBuffer = &section_buf;
+		iov.dataSize = desc.sectionSize;
+		iov.dataOffset = 0;
+
+		memset(&buf, 0, sizeof(buf));
+		snprintf(buf, sizeof(buf), "%s", desc.sectionId.id);
+
+		log_group(ls, "read_checkpoint: section size %llu id %u \"%s\"",
+			  (unsigned long long)iov.dataSize,
+			  iov.sectionId.idLen, buf);
+
+		retries = 0;
+	 read_retry:
+		rv = saCkptCheckpointRead(h, &iov, 1, NULL);
+		if (rv == SA_AIS_ERR_TRY_AGAIN) {
+			log_group(ls, "read_checkpoint: ckpt read retry");
+			sleep(1);
+			if (retries++ < 10)
+				goto read_retry;
+		}
+		if (rv != SA_AIS_OK) {
+			log_error("read_checkpoint: %d ckpt read error %d",
+				  nodeid, rv);
+			goto out_it;
+		}
+
+		section_len = iov.readSize;
+
+		if (!section_len)
+		       continue;
+
+		if (section_len % sizeof(struct pack_lock)) {
+			log_error("read_checkpoint: %d bad section len %d",
+				  nodeid, section_len);
+			continue;
+		}
+
+		read_checkpoint_locks(ls, nodeid, (char *)desc.sectionId.id,
+				      desc.sectionId.idLen);
+	}
+
+ out_it:
+	saCkptSectionIterationFinalize(itr);
+	retries = 0;
+ out:
+	rv = saCkptCheckpointClose(h);
+	if (rv == SA_AIS_ERR_TRY_AGAIN) {
+		log_group(ls, "read_checkpoint: unlink ckpt close retry");
+		sleep(1);
+		if (retries++ < 10)
+			goto out;
+	}
+	if (rv != SA_AIS_OK)
+		log_error("read_checkpoint: %d close error %d", nodeid, rv);
+}
+
+static void write_checkpoint(struct lockspace *ls)
+{
+	SaCkptCheckpointCreationAttributesT attr;
+	SaCkptCheckpointHandleT h;
+	SaCkptSectionIdT section_id;
+	SaCkptSectionCreationAttributesT section_attr;
+	SaCkptCheckpointOpenFlagsT flags;
+	SaNameT name;
+	SaAisErrorT rv;
+	char buf[DLM_RESNAME_MAXLEN];
+	struct dlm_rsb *r;
+	struct dlm_lkb *lkb;
+	int r_count, lock_count, total_size, section_size, max_section_size;
+	int len;
+
+	len = snprintf((char *)name.value, SA_MAX_NAME_LENGTH, "dlmdeadlk.%s.%d",
+		      ls->name, our_nodeid);
+	name.length = len;
+
+	/* unlink an old checkpoint before we create a new one */
+	if (ls->lock_ckpt_handle) {
+		log_error("write_checkpoint: old ckpt");
+		if (_unlink_checkpoint(ls, &name))
+			return;
+	}
+
+	/* loop through all locks to figure out sizes to set in
+	   the attr fields */
+
+	r_count = 0;
+	lock_count = 0;
+	total_size = 0;
+	max_section_size = 0;
+
+	list_for_each_entry(r, &ls->resources, list) {
+		r_count++;
+		section_size = 0;
+		list_for_each_entry(lkb, &r->locks, list) {
+			section_size += sizeof(struct pack_lock);
+			lock_count++;
+		}
+		total_size += section_size;
+		if (section_size > max_section_size)
+			max_section_size = section_size;
+	}
+
+	log_group(ls, "write_checkpoint: r_count %d, lock_count %d",
+		  r_count, lock_count);
+
+	log_group(ls, "write_checkpoint: total %d bytes, max_section %d bytes",
+		  total_size, max_section_size);
+
+	attr.creationFlags = SA_CKPT_WR_ALL_REPLICAS;
+	attr.checkpointSize = total_size;
+	attr.retentionDuration = SA_TIME_MAX;
+	attr.maxSections = r_count + 1;      /* don't know why we need +1 */
+	attr.maxSectionSize = max_section_size;
+	attr.maxSectionIdSize = DLM_RESNAME_MAXLEN;
+
+	flags = SA_CKPT_CHECKPOINT_READ |
+		SA_CKPT_CHECKPOINT_WRITE |
+		SA_CKPT_CHECKPOINT_CREATE;
+
+ open_retry:
+	rv = saCkptCheckpointOpen(global_ckpt_h, &name, &attr, flags, 0, &h);
+	if (rv == SA_AIS_ERR_TRY_AGAIN) {
+		log_group(ls, "write_checkpoint: ckpt open retry");
+		sleep(1);
+		goto open_retry;
+	}
+	if (rv == SA_AIS_ERR_EXIST) {
+		log_group(ls, "write_checkpoint: ckpt already exists");
+		return;
+	}
+	if (rv != SA_AIS_OK) {
+		log_group(ls, "write_checkpoint: ckpt open error %d", rv);
+		return;
+	}
+
+	log_group(ls, "write_checkpoint: open ckpt handle %llx",
+		  (unsigned long long)h);
+	ls->lock_ckpt_handle = (uint64_t) h;
+
+	list_for_each_entry(r, &ls->resources, list) {
+		memset(buf, 0, sizeof(buf));
+		len = snprintf(buf, sizeof(buf), "%s", r->name);
+
+		section_id.id = (void *)buf;
+		section_id.idLen = len + 1;
+		section_attr.sectionId = &section_id;
+		section_attr.expirationTime = SA_TIME_END;
+
+		pack_section_buf(ls, r);
+
+		log_group(ls, "write_checkpoint: section size %u id %u \"%s\"",
+			  section_len, section_id.idLen, buf);
+
+	 create_retry:
+		rv = saCkptSectionCreate(h, &section_attr, &section_buf,
+					 section_len);
+		if (rv == SA_AIS_ERR_TRY_AGAIN) {
+			log_group(ls, "write_checkpoint: ckpt create retry");
+			sleep(1);
+			goto create_retry;
+		}
+		if (rv == SA_AIS_ERR_EXIST) {
+			/* this shouldn't happen in general */
+			log_error("write_checkpoint: clearing old ckpt");
+			saCkptCheckpointClose(h);
+			_unlink_checkpoint(ls, &name);
+			goto open_retry;
+		}
+		if (rv != SA_AIS_OK) {
+			log_error("write_checkpoint: section create %d", rv);
+			break;
+		}
+	}
+}
+
+static int _send_message(cpg_handle_t h, void *buf, int len, int type)
+{
+	struct iovec iov;
+	cpg_error_t error;
+	int retries = 0;
+
+	iov.iov_base = buf;
+	iov.iov_len = len;
+ retry:
+	error = cpg_mcast_joined(h, CPG_TYPE_AGREED, &iov, 1);
+	if (error == CPG_ERR_TRY_AGAIN) {
+		retries++;
+		usleep(1000);
+		if (!(retries % 100))
+			log_error("cpg_mcast_joined retry %d", retries);
+		if (retries < 1000)
+			goto retry;
+	}
+	if (error != CPG_OK) {
+		log_error("cpg_mcast_joined error %d handle %llx",
+			  (int)error, (unsigned long long)h);
+		disable_deadlock();
+		return -1;
+	}
+	return 0;
+}
+
+static void send_message(struct lockspace *ls, int type)
+{
+	struct dlm_header *hd;
+	int len;
+	char *buf;
+
+	len = sizeof(struct dlm_header);
+	buf = malloc(len);
+	if (!buf) {
+		log_error("send_message: no memory");
+		disable_deadlock();
+		return;
+	}
+	memset(buf, 0, len);
+
+	hd = (struct dlm_header *)buf;
+	hd->version[0]  = cpu_to_le16(DLM_HEADER_MAJOR);
+	hd->version[1]  = cpu_to_le16(DLM_HEADER_MINOR);
+	hd->version[2]  = cpu_to_le16(DLM_HEADER_PATCH);
+	hd->type	= cpu_to_le16(type);
+	hd->nodeid      = cpu_to_le32(our_nodeid);
+	hd->to_nodeid   = 0;
+	hd->global_id   = cpu_to_le32(ls->global_id);
+	memcpy(hd->name, ls->name, strlen(ls->name));
+
+	_send_message(ls->cpg_h, buf, len, type);
+
+	free(buf);
+}
+
+static void send_checkpoint_ready(struct lockspace *ls)
+{
+	log_group(ls, "send_checkpoint_ready");
+	send_message(ls, DLM_MSG_CHECKPOINT_READY);
+}
+
+void send_cycle_start(struct lockspace *ls)
+{
+	if (!deadlock_enabled)
+		return;
+	log_group(ls, "send_cycle_start");
+	send_message(ls, DLM_MSG_CYCLE_START);
+}
+
+void send_cycle_end(struct lockspace *ls)
+{
+	if (!deadlock_enabled)
+		return;
+	log_group(ls, "send_cycle_end");
+	send_message(ls, DLM_MSG_CYCLE_END);
+}
+
+static void send_cancel_lock(struct lockspace *ls, struct trans *tr,
+			     struct dlm_lkb *lkb)
+{
+	struct dlm_header *hd;
+	int len;
+	char *buf;
+	int to_nodeid;
+	uint32_t lkid;
+
+	len = sizeof(struct dlm_header);
+	buf = malloc(len);
+	if (!buf) {
+		log_error("send_message: no memory");
+		disable_deadlock();
+		return;
+	}
+	memset(buf, 0, len);
+
+	if (!lkb->lock.nodeid)
+		lkid = lkb->lock.id;
+	else
+		lkid = lkb->lock.remid;
+	to_nodeid = lkb->home;
+
+	log_group(ls, "send_cancel_lock to nodeid %d rsb %s id %x xid %llx",
+		  to_nodeid, lkb->rsb->name, lkid,
+		  (unsigned long long)lkb->lock.xid);
+
+	hd = (struct dlm_header *)buf;
+	hd->version[0]  = cpu_to_le16(DLM_HEADER_MAJOR);
+	hd->version[1]  = cpu_to_le16(DLM_HEADER_MINOR);
+	hd->version[2]  = cpu_to_le16(DLM_HEADER_PATCH);
+	hd->type	= cpu_to_le16(DLM_MSG_CANCEL_LOCK);
+	hd->nodeid      = cpu_to_le32(our_nodeid);
+	hd->to_nodeid   = cpu_to_le32(to_nodeid);
+	hd->lkid        = cpu_to_le32(lkid);
+	hd->global_id   = cpu_to_le32(ls->global_id);
+	memcpy(hd->name, ls->name, strlen(ls->name));
+
+	_send_message(ls->cpg_h, buf, len, DLM_MSG_CANCEL_LOCK);
+
+	free(buf);
+}
+
+static void dump_resources(struct lockspace *ls)
+{
+	struct dlm_rsb *r;
+	struct dlm_lkb *lkb;
+
+	log_group(ls, "Resource dump:");
+
+	list_for_each_entry(r, &ls->resources, list) {
+		log_group(ls, "\"%s\" len %d", r->name, r->len);
+		list_for_each_entry(lkb, &r->locks, list) {
+			log_group(ls, "  %s: nodeid %d id %08x remid %08x gr %s rq %s pid %u xid %llx",
+			  	  status_str(lkb->lock.status),
+				  lkb->lock.nodeid,
+			  	  lkb->lock.id,
+			  	  lkb->lock.remid,
+			  	  dlm_mode_str(lkb->lock.grmode),
+			          dlm_mode_str(lkb->lock.rqmode),
+			          lkb->lock.ownpid,
+				  (unsigned long long)lkb->lock.xid);
+		}
+	}
+}
+
+static void find_deadlock(struct lockspace *ls);
+
+static void run_deadlock(struct lockspace *ls)
+{
+	struct node *node;
+	int not_ready = 0;
+	int low = -1;
+
+	if (ls->all_checkpoints_ready)
+		log_group(ls, "WARNING: run_deadlock all_checkpoints_ready");
+
+	list_for_each_entry(node, &ls->nodes, list) {
+		if (!node->in_cycle)
+			continue;
+		if (!node->checkpoint_ready)
+			not_ready++;
+
+		log_group(ls, "nodeid %d checkpoint_ready = %d",
+			  node->nodeid, node->checkpoint_ready);
+	}
+	if (not_ready)
+		return;
+
+	ls->all_checkpoints_ready = 1;
+
+	list_for_each_entry(node, &ls->nodes, list) {
+		if (!node->in_cycle)
+			continue;
+		if (node->nodeid < low || low == -1)
+			low = node->nodeid;
+	}
+	ls->low_nodeid = low;
+
+	if (low == our_nodeid)
+		find_deadlock(ls);
+	else
+		log_group(ls, "defer resolution to low nodeid %d", low);
+}
+
+static void receive_checkpoint_ready(struct lockspace *ls, int nodeid)
+{
+	struct node *node;
+
+	log_group(ls, "receive_checkpoint_ready from %d", nodeid);
+
+	read_checkpoint(ls, nodeid);
+
+	list_for_each_entry(node, &ls->nodes, list) {
+		if (node->nodeid == nodeid) {
+			node->checkpoint_ready = 1;
+			break;
+		}
+	}
+
+	run_deadlock(ls);
+}
+
+static void receive_cycle_start(struct lockspace *ls, int nodeid)
+{
+	struct node *node;
+	int rv;
+
+	log_group(ls, "receive_cycle_start from %d", nodeid);
+
+	if (ls->cycle_running) {
+		log_group(ls, "cycle already running");
+		return;
+	}
+	ls->cycle_running = 1;
+	gettimeofday(&ls->cycle_start_time, NULL);
+
+	list_for_each_entry(node, &ls->nodes, list)
+		node->in_cycle = 1;
+
+	rv = read_debugfs_locks(ls);
+	if (rv < 0) {
+		/* compat for RHEL5.1 kernels */
+		rv = read_debugfs_master(ls);
+		if (rv < 0) {
+			log_error("can't read dlm debugfs file: %s",
+				  strerror(errno));
+			return;
+		}
+	}
+
+	write_checkpoint(ls);
+	send_checkpoint_ready(ls);
+}
+
+static uint64_t dt_usec(struct timeval *start, struct timeval *stop)
+{
+	uint64_t dt;
+
+	dt = stop->tv_sec - start->tv_sec;
+	dt *= 1000000;
+	dt += stop->tv_usec - start->tv_usec;
+	return dt;
+}
+
+/* TODO: nodes added during a cycle - what will they do with messages
+   they recv from other nodes running the cycle? */
+
+static void receive_cycle_end(struct lockspace *ls, int nodeid)
+{
+	struct node *node;
+	uint64_t usec;
+
+	if (!ls->cycle_running) {
+		log_error("receive_cycle_end %s from %d: no cycle running",
+			  ls->name, nodeid);
+		return;
+	}
+
+	gettimeofday(&ls->cycle_end_time, NULL);
+	usec = dt_usec(&ls->cycle_start_time, &ls->cycle_end_time);
+	log_group(ls, "receive_cycle_end: from %d cycle time %.2f s",
+		  nodeid, usec * 1.e-6);
+
+	ls->cycle_running = 0;
+	ls->all_checkpoints_ready = 0;
+
+	list_for_each_entry(node, &ls->nodes, list)
+		node->checkpoint_ready = 0;
+
+	free_resources(ls);
+	free_transactions(ls);
+	unlink_checkpoint(ls);
+}
+
+static void receive_cancel_lock(struct lockspace *ls, int nodeid, uint32_t lkid)
+{
+	dlm_lshandle_t h;
+	int rv;
+
+	if (nodeid != our_nodeid)
+		return;
+
+	h = dlm_open_lockspace(ls->name);
+	if (!h) {
+		log_error("deadlock cancel %x from %d can't open lockspace %s",
+			  lkid, nodeid, ls->name);
+		return;
+	}
+
+	log_group(ls, "receive_cancel_lock %x from %d", lkid, nodeid);
+
+	rv = dlm_ls_deadlock_cancel(h, lkid, 0);
+	if (rv < 0) {
+		log_error("deadlock cancel %x from %x lib cancel errno %d",
+			  lkid, nodeid, errno);
+	}
+
+	dlm_close_lockspace(h);
+}
+
+static void deliver_cb(cpg_handle_t handle, struct cpg_name *group_name,
+		uint32_t nodeid, uint32_t pid, void *data, int data_len)
+{
+	struct lockspace *ls;
+	struct dlm_header *hd;
+
+	ls = find_ls_by_handle(handle);
+	if (!ls)
+		return;
+
+	hd = (struct dlm_header *) data;
+
+	hd->version[0]  = le16_to_cpu(hd->version[0]);
+	hd->version[1]  = le16_to_cpu(hd->version[1]);
+	hd->version[2]  = le16_to_cpu(hd->version[2]);
+	hd->type	= le16_to_cpu(hd->type);
+	hd->nodeid      = le32_to_cpu(hd->nodeid);
+	hd->to_nodeid   = le32_to_cpu(hd->to_nodeid);
+	hd->global_id   = le32_to_cpu(hd->global_id);
+
+	if (hd->version[0] != DLM_HEADER_MAJOR) {
+		log_error("reject message version %u.%u.%u",
+			  hd->version[0], hd->version[1], hd->version[2]);
+		return;
+	}
+
+	switch (hd->type) {
+	case DLM_MSG_CYCLE_START:
+		receive_cycle_start(ls, hd->nodeid);
+		break;
+	case DLM_MSG_CYCLE_END:
+		receive_cycle_end(ls, hd->nodeid);
+		break;
+	case DLM_MSG_CHECKPOINT_READY:
+		receive_checkpoint_ready(ls, hd->nodeid);
+		break;
+	case DLM_MSG_CANCEL_LOCK:
+		receive_cancel_lock(ls, hd->nodeid, hd->lkid);
+		break;
+	default:
+		log_error("unknown message type %d from %d",
+			  hd->type, hd->nodeid);
+	}
+}
+
+static void node_joined(struct lockspace *ls, int nodeid)
+{
+	struct node *node;
+
+	node = malloc(sizeof(struct node));
+	if (!node) {
+		log_error("node_joined: no memory");
+		disable_deadlock();
+		return;
+	}
+	memset(node, 0, sizeof(struct node));
+	node->nodeid = nodeid;
+	list_add_tail(&node->list, &ls->nodes);
+	log_group(ls, "node %d joined deadlock cpg", nodeid);
+}
+
+static void node_left(struct lockspace *ls, int nodeid, int reason)
+{
+	struct node *node, *safe;
+
+	list_for_each_entry_safe(node, safe, &ls->nodes, list) {
+		if (node->nodeid != nodeid)
+			continue;
+
+		list_del(&node->list);
+		free(node);
+		log_group(ls, "node %d left deadlock cpg", nodeid);
+	}
+}
+
+static void purge_locks(struct lockspace *ls, int nodeid);
+
+static void confchg_cb(cpg_handle_t handle, struct cpg_name *group_name,
+		struct cpg_address *member_list, int member_list_entries,
+		struct cpg_address *left_list, int left_list_entries,
+		struct cpg_address *joined_list, int joined_list_entries)
+{
+	struct lockspace *ls;
+	int i;
+
+	ls = find_ls_by_handle(handle);
+	if (!ls)
+		return;
+
+	if (!ls->got_first_confchg) {
+		ls->got_first_confchg = 1;
+		for (i = 0; i < member_list_entries; i++)
+			node_joined(ls, member_list[i].nodeid);
+		return;
+	}
+
+	/* nodes added during a cycle won't have node->in_cycle set so they
+	   won't be included in any of the cycle processing */
+
+	for (i = 0; i < joined_list_entries; i++)
+		node_joined(ls, joined_list[i].nodeid);
+
+	for (i = 0; i < left_list_entries; i++)
+		node_left(ls, left_list[i].nodeid, left_list[i].reason);
+
+	if (!ls->cycle_running)
+		return;
+
+	if (!left_list_entries)
+		return;
+
+	if (!ls->all_checkpoints_ready) {
+		run_deadlock(ls);
+		return;
+	}
+
+	for (i = 0; i < left_list_entries; i++)
+		purge_locks(ls, left_list[i].nodeid);
+
+	for (i = 0; i < left_list_entries; i++) {
+		if (left_list[i].nodeid != ls->low_nodeid)
+			continue;
+		/* this will set a new low node which will call find_deadlock */
+		run_deadlock(ls);
+		break;
+	}
+}
+
+static void process_deadlock_cpg(int ci)
+{
+	struct lockspace *ls;
+	cpg_error_t error;
+
+	ls = get_client_lockspace(ci);
+	if (!ls)
+		return;
+
+	error = cpg_dispatch(ls->cpg_h, CPG_DISPATCH_ONE);
+	if (error != CPG_OK)
+		log_error("cpg_dispatch error %d", error);
+}
+
+cpg_callbacks_t ls_callbacks = {
+	.cpg_deliver_fn = deliver_cb,
+	.cpg_confchg_fn = confchg_cb,
+};
+
+static void make_cpgname(struct lockspace *ls, struct cpg_name *cn)
+{
+	char name[MAXNAME+8];
+
+	memset(name, 0, sizeof(name));
+	strncpy(name, ls->name, sizeof(name));
+	strncat(name, "_deadlk", 7);
+	memset(cn, 0, sizeof(struct cpg_name));
+	strncpy(cn->value, name, strlen(name) + 1);
+	cn->length = strlen(name) + 1;
+}
+
+void join_deadlock_cpg(struct lockspace *ls)
+{
+	cpg_handle_t h;
+	struct cpg_name cpgname;
+	cpg_error_t error;
+	int retries = 0;
+	int fd, ci;
+
+	if (!deadlock_enabled)
+		return;
+
+	unlink_checkpoint(ls); /* not sure about this */
+
+	error = cpg_initialize(&h, &ls_callbacks);
+	if (error != CPG_OK) {
+		log_error("cpg_initialize error %d", error);
+		return;
+	}
+
+	cpg_fd_get(h, &fd);
+	if (fd < 0) {
+		log_error("cpg_fd_get error %d", error);
+		return;
+	}
+
+	ci = client_add(fd, process_deadlock_cpg, NULL);
+
+	make_cpgname(ls, &cpgname);
+
+ retry:
+	error = cpg_join(h, &cpgname);
+	if (error == CPG_ERR_TRY_AGAIN) {
+		sleep(1);
+		if (retries++ < 10)
+			goto retry;
+	}
+	if (error != CPG_OK) {
+		log_error("deadlk cpg join error %d", error);
+		goto fail;
+	}
+
+	ls->cpg_h = h;
+	ls->cpg_ci = ci;
+	set_client_lockspace(ci, ls);
+	log_group(ls, "deadlk cpg ci %d fd %d", ci, fd);
+	return;
+ fail:
+	cpg_finalize(h);
+	client_dead(ci);
+}
+
+void leave_deadlock_cpg(struct lockspace *ls)
+{
+	struct cpg_name cpgname;
+	cpg_error_t error;
+	int retries = 0;
+
+	if (!deadlock_enabled)
+		return;
+
+	make_cpgname(ls, &cpgname);
+ retry:
+	error = cpg_leave(ls->cpg_h, &cpgname);
+	if (error == CPG_ERR_TRY_AGAIN) {
+		sleep(1);
+		if (retries++ < 10)
+			goto retry;
+	}
+	if (error != CPG_OK)
+		log_error("deadlk cpg leave error %d", error);
+
+	cpg_finalize(ls->cpg_h);
+	client_dead(ls->cpg_ci);
+}
+
+/* would we ever call this after we've created the transaction lists?
+   I don't think so; I think it can only be called between reading
+   checkpoints */
+
+static void purge_locks(struct lockspace *ls, int nodeid)
+{
+	struct dlm_rsb *r;
+	struct dlm_lkb *lkb, *safe;
+
+	list_for_each_entry(r, &ls->resources, list) {
+		list_for_each_entry_safe(lkb, safe, &r->locks, list) {
+			if (lkb->home == nodeid) {
+				list_del(&lkb->list);
+				if (list_empty(&lkb->trans_list))
+					free(lkb);
+				else
+					log_group(ls, "purge %d %x on trans",
+						  nodeid, lkb->lock.id);
+			}
+		}
+	}
+}
+
+static void add_lkb_trans(struct trans *tr, struct dlm_lkb *lkb)
+{
+	list_add(&lkb->trans_list, &tr->locks);
+	lkb->trans = tr;
+}
+
+static struct trans *get_trans(struct lockspace *ls, uint64_t xid)
+{
+	struct trans *tr;
+
+	list_for_each_entry(tr, &ls->transactions, list) {
+		if (tr->xid == xid)
+			return tr;
+	}
+
+	tr = malloc(sizeof(struct trans));
+	if (!tr) {
+		log_error("get_trans: no memory");
+		disable_deadlock();
+		return NULL;
+	}
+	memset(tr, 0, sizeof(struct trans));
+	tr->xid = xid;
+	tr->waitfor = NULL;
+	tr->waitfor_alloc = 0;
+	tr->waitfor_count = 0;
+	INIT_LIST_HEAD(&tr->locks);
+	list_add(&tr->list, &ls->transactions);
+	return tr;
+}
+
+/* for each rsb, for each lock, find/create trans, add lkb to the trans list */
+
+static void create_trans_list(struct lockspace *ls)
+{
+	struct dlm_rsb *r;
+	struct dlm_lkb *lkb;
+	struct trans *tr;
+	int r_count = 0, lkb_count = 0;
+
+	list_for_each_entry(r, &ls->resources, list) {
+		r_count++;
+		list_for_each_entry(lkb, &r->locks, list) {
+			lkb_count++;
+			tr = get_trans(ls, lkb->lock.xid);
+			if (!tr)
+				goto out;
+			add_lkb_trans(tr, lkb);
+		}
+	}
+ out:
+	log_group(ls, "create_trans_list: r_count %d lkb_count %d",
+		  r_count, lkb_count);
+}
+
+static int locks_compat(struct dlm_lkb *waiting_lkb,
+			struct dlm_lkb *granted_lkb)
+{
+	if (waiting_lkb == granted_lkb) {
+		log_debug("waiting and granted same lock");
+		return 0;
+	}
+
+	if (waiting_lkb->trans->xid == granted_lkb->trans->xid) {
+		log_debug("waiting and granted same trans %llx",
+			  (unsigned long long)waiting_lkb->trans->xid);
+		return 0;
+	}
+
+	return dlm_modes_compat(granted_lkb->lock.grmode,
+				waiting_lkb->lock.rqmode);
+}
+
+static int in_waitfor(struct trans *tr, struct trans *add_tr)
+{
+	int i;
+
+	for (i = 0; i < tr->waitfor_alloc; i++) {
+		if (!tr->waitfor[i])
+			continue;
+		if (tr->waitfor[i] == add_tr)
+			return 1;
+	}
+	return 0;
+}
+
+static void add_waitfor(struct lockspace *ls, struct dlm_lkb *waiting_lkb,
+			struct dlm_lkb *granted_lkb)
+{
+	struct trans *tr = waiting_lkb->trans;
+	int old_alloc, i;
+
+	if (locks_compat(waiting_lkb, granted_lkb))
+		return;
+
+	/* this shouldn't happen AFAIK */
+	if (tr == granted_lkb->trans) {
+		log_group(ls, "trans %llx waiting on self",
+			  (unsigned long long)tr->xid);
+		return;
+	}
+
+	/* don't add the same trans to the waitfor list multiple times */
+	if (tr->waitfor_count && in_waitfor(tr, granted_lkb->trans)) {
+		log_group(ls, "trans %llx already waiting for trans %llx, "
+			  "waiting %x %s, granted %x %s",
+			  (unsigned long long)waiting_lkb->trans->xid,
+			  (unsigned long long)granted_lkb->trans->xid,
+			  waiting_lkb->lock.id, waiting_lkb->rsb->name,
+			  granted_lkb->lock.id, granted_lkb->rsb->name);
+		return;
+	}
+
+	if (tr->waitfor_count == tr->waitfor_alloc) {
+		old_alloc = tr->waitfor_alloc;
+		tr->waitfor_alloc += TR_NALLOC;
+		tr->waitfor = realloc(tr->waitfor,
+				      tr->waitfor_alloc * sizeof(tr));
+		for (i = old_alloc; i < tr->waitfor_alloc; i++)
+			tr->waitfor[i] = NULL;
+	}
+
+	tr->waitfor[tr->waitfor_count++] = granted_lkb->trans;
+	granted_lkb->trans->others_waiting_on_us++;
+	waiting_lkb->waitfor_trans = granted_lkb->trans;
+}
+
+/* for each trans, for each waiting lock, go to rsb of the lock,
+   find granted locks on that rsb, then find the trans the
+   granted lock belongs to, add that trans to our waitfor list */
+
+static void create_waitfor_graph(struct lockspace *ls)
+{
+	struct dlm_lkb *waiting_lkb, *granted_lkb;
+	struct dlm_rsb *r;
+	struct trans *tr;
+	int depend_count = 0;
+
+	list_for_each_entry(tr, &ls->transactions, list) {
+		list_for_each_entry(waiting_lkb, &tr->locks, trans_list) {
+			if (waiting_lkb->lock.status == DLM_LKSTS_GRANTED)
+				continue;
+			/* waiting_lkb status is CONVERT or WAITING */
+
+			r = waiting_lkb->rsb;
+
+			list_for_each_entry(granted_lkb, &r->locks, list) {
+				if (granted_lkb->lock.status==DLM_LKSTS_WAITING)
+					continue;
+				/* granted_lkb status is GRANTED or CONVERT */
+				add_waitfor(ls, waiting_lkb, granted_lkb);
+				depend_count++;
+			}
+		}
+	}
+
+	log_group(ls, "create_waitfor_graph: depend_count %d", depend_count);
+}
+
+/* Assume a transaction that's not waiting on any locks will complete, release
+   all the locks it currently holds, and exit.  Other transactions that were
+   blocked waiting on the removed transaction's now-released locks may now be
+   unblocked, complete, release all held locks and exit.  Repeat this until
+   no more transactions can be removed.  If there are transactions remaining,
+   then they are deadlocked. */
+
+static void remove_waitfor(struct trans *tr, struct trans *remove_tr)
+{
+	int i;
+
+	for (i = 0; i < tr->waitfor_alloc; i++) {
+		if (!tr->waitfor_count)
+			break;
+
+		if (!tr->waitfor[i])
+			continue;
+
+		if (tr->waitfor[i] == remove_tr) {
+			tr->waitfor[i] = NULL;
+			tr->waitfor_count--;
+			remove_tr->others_waiting_on_us--;
+		}
+	}
+}
+
+/* remove_tr is not waiting for anything, assume it completes and goes away
+   and remove it from any other transaction's waitfor list */
+
+static void remove_trans(struct lockspace *ls, struct trans *remove_tr)
+{
+	struct trans *tr;
+
+	list_for_each_entry(tr, &ls->transactions, list) {
+		if (tr == remove_tr)
+			continue;
+		if (!remove_tr->others_waiting_on_us)
+			break;
+		remove_waitfor(tr, remove_tr);
+	}
+
+	if (remove_tr->others_waiting_on_us)
+		log_group(ls, "trans %llx removed others waiting %d",
+			  (unsigned long long)remove_tr->xid,
+			  remove_tr->others_waiting_on_us);
+}
+
+static int reduce_waitfor_graph(struct lockspace *ls)
+{
+	struct trans *tr, *safe;
+	int blocked = 0;
+	int removed = 0;
+
+	list_for_each_entry_safe(tr, safe, &ls->transactions, list) {
+		if (tr->waitfor_count) {
+			blocked++;
+			continue;
+		}
+		remove_trans(ls, tr);
+		list_del(&tr->list);
+		if (tr->waitfor)
+			free(tr->waitfor);
+		free(tr);
+		removed++;
+	}
+
+	log_group(ls, "reduce_waitfor_graph: %d blocked, %d removed",
+		  blocked, removed);
+	return removed;
+}
+
+static void reduce_waitfor_graph_loop(struct lockspace *ls)
+{
+	int removed;
+
+	while (1) {
+		removed = reduce_waitfor_graph(ls);
+		if (!removed)
+			break;
+	}
+}
+
+static struct trans *find_trans_to_cancel(struct lockspace *ls)
+{
+	struct trans *tr;
+
+	list_for_each_entry(tr, &ls->transactions, list) {
+		if (!tr->others_waiting_on_us)
+			continue;
+		return tr;
+	}
+	return NULL;
+}
+
+static void cancel_trans(struct lockspace *ls)
+{
+	struct trans *tr;
+	struct dlm_lkb *lkb;
+	int removed;
+
+	tr = find_trans_to_cancel(ls);
+	if (!tr) {
+		log_group(ls, "cancel_trans: no trans found");
+		return;
+	}
+
+	list_for_each_entry(lkb, &tr->locks, trans_list) {
+		if (lkb->lock.status == DLM_LKSTS_GRANTED)
+			continue;
+		send_cancel_lock(ls, tr, lkb);
+
+		/* When this canceled trans has multiple locks all blocked by
+		   locks held by one other trans, that other trans is only
+		   added to tr->waitfor once, and only one of these waiting
+		   locks will have waitfor_trans set.  So, the lkb with
+		   non-null waitfor_trans was the first one responsible
+		   for adding waitfor_trans to tr->waitfor.
+
+		   We could potentially forget about keeping track of lkb->
+		   waitfor_trans, forget about calling remove_waitfor()
+		   here and just set tr->waitfor_count = 0 after this loop.
+		   The loss would be that waitfor_trans->others_waiting_on_us
+		   would not get decremented. */
+
+		if (lkb->waitfor_trans)
+			remove_waitfor(tr, lkb->waitfor_trans);
+	}
+
+	/* this shouldn't happen, if it does something's not working right */
+	if (tr->waitfor_count) {
+		log_group(ls, "cancel_trans: %llx non-zero waitfor_count %d",
+			  (unsigned long long)tr->xid, tr->waitfor_count);
+	}
+
+	/* this should now remove the canceled trans since it now has a zero
+	   waitfor_count */
+	removed = reduce_waitfor_graph(ls);
+
+	if (!removed)
+		log_group(ls, "canceled trans not removed from graph");
+
+	/* now call reduce_waitfor_graph() in another loop and it
+	   should completely reduce */
+}
+
+static void dump_trans(struct lockspace *ls, struct trans *tr)
+{
+	struct dlm_lkb *lkb;
+	struct trans *wf;
+	int i;
+
+	log_group(ls, "trans xid %llx waitfor_count %d others_waiting_on_us %d",
+		  (unsigned long long)tr->xid, tr->waitfor_count,
+		  tr->others_waiting_on_us);
+
+	log_group(ls, "locks:");
+	
+	list_for_each_entry(lkb, &tr->locks, trans_list) {
+		log_group(ls, "  %s: id %08x gr %s rq %s pid %u:%u \"%s\"",
+			  status_str(lkb->lock.status),
+			  lkb->lock.id,
+			  dlm_mode_str(lkb->lock.grmode),
+			  dlm_mode_str(lkb->lock.rqmode),
+			  lkb->home,
+			  lkb->lock.ownpid,
+			  lkb->rsb->name);
+	}
+
+	if (!tr->waitfor_count)
+		return;
+
+	log_group(ls, "waitfor:");
+
+	for (i = 0; i < tr->waitfor_alloc; i++) {
+		if (!tr->waitfor[i])
+			continue;
+		wf = tr->waitfor[i];
+		log_group(ls, "  xid %llx", (unsigned long long)wf->xid);
+	}
+}
+
+static void dump_all_trans(struct lockspace *ls)
+{
+	struct trans *tr;
+
+	log_group(ls, "Transaction dump:");
+
+	list_for_each_entry(tr, &ls->transactions, list)
+		dump_trans(ls, tr);
+}
+
+static void find_deadlock(struct lockspace *ls)
+{
+	if (list_empty(&ls->resources)) {
+		log_group(ls, "no deadlock: no resources");
+		goto out;
+	}
+
+	if (!list_empty(&ls->transactions)) {
+		log_group(ls, "transactions list should be empty");
+		goto out;
+	}
+
+	dump_resources(ls);
+	create_trans_list(ls);
+	create_waitfor_graph(ls);
+	dump_all_trans(ls);
+	reduce_waitfor_graph_loop(ls);
+
+	if (list_empty(&ls->transactions)) {
+		log_group(ls, "no deadlock: all transactions reduced");
+		goto out;
+	}
+
+	log_group(ls, "found deadlock");
+	dump_all_trans(ls);
+
+	cancel_trans(ls);
+	reduce_waitfor_graph_loop(ls);
+
+	if (list_empty(&ls->transactions)) {
+		log_group(ls, "resolved deadlock with cancel");
+		goto out;
+	}
+
+	log_error("deadlock resolution failed");
+	dump_all_trans(ls);
+ out:
+	send_cycle_end(ls);
+}
+
--- cluster/group/dlm_controld/Makefile	2006/08/11 15:18:15	1.4
+++ cluster/group/dlm_controld/Makefile	2007/10/26 21:33:27	1.4.2.1
@@ -23,7 +23,7 @@
 	-I../../cman/lib/ \
 	-I../include/
 
-LDFLAGS+= -L../../cman/lib
+LDFLAGS+= -L../../cman/lib -L${libdir}/openais
 
 
 TARGET=dlm_controld
@@ -34,9 +34,10 @@
 		member_cman.o \
 		group.o \
 		action.o \
+		deadlock.o \
 		../lib/libgroup.a \
 		../../ccs/lib/libccs.a
-	$(CC) $(LDFLAGS) -o $@ $^ -lcman 
+	$(CC) $(LDFLAGS) -o $@ $^ -lcman -ldlm -lcpg -lSaCkpt
 
 main.o: main.c
 	$(CC) $(CFLAGS) -c -o $@ $<
@@ -50,6 +51,9 @@
 action.o: action.c
 	$(CC) $(CFLAGS) -c -o $@ $<
 
+deadlock.o: deadlock.c
+	$(CC) $(CFLAGS) -c -o $@ $<
+
 install: all
 	install -d ${sbindir}
 	install ${TARGET} ${sbindir}
--- cluster/group/dlm_controld/action.c	2007/06/06 21:10:03	1.8.2.5
+++ cluster/group/dlm_controld/action.c	2007/10/26 21:33:27	1.8.2.6
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -47,7 +47,24 @@
 #define COMMS_DIR     "/sys/kernel/config/dlm/cluster/comms"
 
 
-static int do_write(int fd, void *buf, size_t count)
+int do_read(int fd, void *buf, size_t count)
+{
+	int rv, off = 0;
+
+	while (off < count) {
+		rv = read(fd, buf + off, count - off);
+		if (rv == 0)
+			return -1;
+		if (rv == -1 && errno == EINTR)
+			continue;
+		if (rv == -1)
+			return -1;
+		off += rv;
+	}
+	return 0;
+}
+
+int do_write(int fd, void *buf, size_t count)
 {
 	int rv, off = 0;
 
--- cluster/group/dlm_controld/dlm_daemon.h	2007/05/04 21:14:32	1.5.2.4
+++ cluster/group/dlm_controld/dlm_daemon.h	2007/10/26 21:33:27	1.5.2.5
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -40,9 +40,15 @@
 #include <syslog.h>
 #include <sched.h>
 #include <signal.h>
-#include <linux/netlink.h>
+#include <sys/time.h>
 
+#include <openais/saAis.h>
+#include <openais/saCkpt.h>
+#include <openais/cpg.h>
+
+#include "dlm_controld.h"
 #include "list.h"
+#include "linux_endian.h"
 #include "libgroup.h"
 
 #define MAXARGS		8
@@ -68,11 +74,32 @@
 	syslog(LOG_ERR, fmt, ##args); \
 } while (0)
 
+#define log_group(ls, fmt, args...) \
+do { \
+	snprintf(daemon_debug_buf, 255, "%ld %s " fmt "\n", time(NULL), \
+		 (ls)->name, ##args); \
+	if (daemon_debug_opt) fprintf(stderr, "%s", daemon_debug_buf); \
+} while (0)
+
 
 struct lockspace {
 	struct list_head	list;
 	char			name[MAXNAME+1];
+	uint32_t		global_id;
+	int			low_nodeid;
 	int			joining;
+	int			cpg_ci;
+	cpg_handle_t		cpg_h;
+	SaCkptCheckpointHandleT lock_ckpt_handle;
+	struct list_head	transactions;
+	struct list_head	resources;
+	struct list_head	nodes;
+	struct timeval		cycle_start_time;
+	struct timeval		cycle_end_time;
+	struct timeval		last_send_cycle_start;
+	int			got_first_confchg;
+	int			cycle_running;
+	int			all_checkpoints_ready;
 };
 
 /* action.c */
@@ -84,23 +111,36 @@
 int set_members(char *name, int new_count, int *new_members);
 int set_id(char *name, uint32_t id);
 void set_ccs_options(void);
+int do_read(int fd, void *buf, size_t count);
+int do_write(int fd, void *buf, size_t count);
 
 /* member_xxx.c */
 int setup_member(void);
-int process_member(void);
+void process_member(int ci);
 char *nodeid2name(int nodeid);
 
 /* group.c */
 int setup_groupd(void);
-int process_groupd(void);
+void process_groupd(int ci);
 
 /* main.c */
+int client_add(int fd, void (*workfn)(int ci), void (*deadfn)(int ci));
+void client_dead(int ci);
+void set_client_lockspace(int ci, struct lockspace *ls);
+struct lockspace *get_client_lockspace(int ci);
 struct lockspace *create_ls(char *name);
 struct lockspace *find_ls(char *name);
+char *dlm_mode_str(int mode);
 
 /* member_cman.c */
 int is_cman_member(int nodeid);
 void cman_statechange(void);
 
+/* deadlock.c */
+void setup_deadlock(void);
+void join_deadlock_cpg(struct lockspace *ls);
+void leave_deadlock_cpg(struct lockspace *ls);
+void send_cycle_start(struct lockspace *ls);
+
 #endif
 
--- cluster/group/dlm_controld/group.c	2006/10/13 16:03:47	1.3
+++ cluster/group/dlm_controld/group.c	2007/10/26 21:33:27	1.3.2.1
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -107,7 +107,7 @@
 	return str_members_buf;
 }
 
-int process_groupd(void)
+void process_groupd(int ci)
 {
 	struct lockspace *ls;
 	int error = 0, val;
@@ -155,13 +155,15 @@
 
 		/* this causes the dlm_new_lockspace() call (typically from
 		   mount) to complete */
-
 		set_event_done(cb_name, 0);
+
+		join_deadlock_cpg(ls);
 		break;
 
 	case DO_SETID:
 		log_debug("groupd callback: set_id %s %x", cb_name, cb_id);
 		set_id(cb_name, cb_id);
+		ls->global_id = cb_id;
 		break;
 
 	case DO_TERMINATE:
@@ -180,6 +182,7 @@
 		}
 
 		set_event_done(cb_name, val);
+		leave_deadlock_cpg(ls);
 		list_del(&ls->list);
 		free(ls);
 		break;
@@ -194,7 +197,7 @@
 
 	cb_action = 0;
  out:
-	return error;
+	return;
 }
 
 int setup_groupd(void)
@@ -203,7 +206,7 @@
 
 	gh = group_init(NULL, "dlm", 1, &callbacks, GROUPD_TIMEOUT);
 	if (!gh) {
-		log_error("group_init error %d %d", (int) gh, errno);
+		log_error("group_init error %p %d", gh, errno);
 		return -ENOTCONN;
 	}
 
--- cluster/group/dlm_controld/main.c	2007/05/04 21:14:32	1.6.2.5
+++ cluster/group/dlm_controld/main.c	2007/10/26 21:33:27	1.6.2.6
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -12,21 +12,112 @@
 
 #include "dlm_daemon.h"
 
-#define OPTION_STRING			"KDhV"
+#include <linux/netlink.h>
+#include <linux/genetlink.h>
+#include <linux/dlm.h>
+#include <linux/dlm_netlink.h>
+
+#define OPTION_STRING			"KDhVd:"
 #define LOCKFILE_NAME			"/var/run/dlm_controld.pid"
 
-static int uevent_fd;
-static int groupd_fd;
-static int member_fd;
+#define DEADLOCK_CHECK_SECS		10
+
+#define NALLOC 16
 
 struct list_head lockspaces;
 
 extern group_handle_t gh;
+extern int deadlock_enabled;
+
+static int daemon_quit;
+static int client_maxi;
+static int client_size = 0;
+static struct client *client = NULL;
+static struct pollfd *pollfd = NULL;
+
+struct client {
+	int fd;
+	void *workfn;
+	void *deadfn;
+	struct lockspace *ls;
+};
+
+static void client_alloc(void)
+{
+	int i;
+
+	if (!client) {
+		client = malloc(NALLOC * sizeof(struct client));
+		pollfd = malloc(NALLOC * sizeof(struct pollfd));
+	} else {
+		client = realloc(client, (client_size + NALLOC) *
+					 sizeof(struct client));
+		pollfd = realloc(pollfd, (client_size + NALLOC) *
+					 sizeof(struct pollfd));
+		if (!pollfd)
+			log_error("can't alloc for pollfd");
+	}
+	if (!client || !pollfd)
+		log_error("can't alloc for client array");
+
+	for (i = client_size; i < client_size + NALLOC; i++) {
+		client[i].workfn = NULL;
+		client[i].deadfn = NULL;
+		client[i].fd = -1;
+		pollfd[i].fd = -1;
+		pollfd[i].revents = 0;
+	}
+	client_size += NALLOC;
+}
+
+void client_dead(int ci)
+{
+	close(client[ci].fd);
+	client[ci].workfn = NULL;
+	client[ci].fd = -1;
+	pollfd[ci].fd = -1;
+}
+
+int client_add(int fd, void (*workfn)(int ci), void (*deadfn)(int ci))
+{
+	int i;
+
+	if (!client)
+		client_alloc();
+ again:
+	for (i = 0; i < client_size; i++) {
+		if (client[i].fd == -1) {
+			client[i].workfn = workfn;
+			if (deadfn)
+				client[i].deadfn = deadfn;
+			else
+				client[i].deadfn = client_dead;
+			client[i].fd = fd;
+			pollfd[i].fd = fd;
+			pollfd[i].events = POLLIN;
+			if (i > client_maxi)
+				client_maxi = i;
+			return i;
+		}
+	}
+
+	client_alloc();
+	goto again;
+}
+
+void set_client_lockspace(int ci, struct lockspace *ls)
+{
+	client[ci].ls = ls;
+}
+
+struct lockspace *get_client_lockspace(int ci)
+{
+	return client[ci].ls;
+}
 
 static void sigterm_handler(int sig)
 {
-	if (list_empty(&lockspaces))
-		clear_configfs();
+	daemon_quit = 1;
 }
 
 struct lockspace *create_ls(char *name)
@@ -38,6 +129,9 @@
 		goto out;
 	memset(ls, 0, sizeof(*ls));
 	strncpy(ls->name, name, MAXNAME);
+	INIT_LIST_HEAD(&ls->transactions);
+	INIT_LIST_HEAD(&ls->resources);
+	INIT_LIST_HEAD(&ls->nodes);
  out:
 	return ls;
 }
@@ -54,25 +148,16 @@
 	return NULL;
 }
 
-#if 0
-void make_args(char *buf, int *argc, char **argv, char sep)
+struct lockspace *find_ls_id(uint32_t id)
 {
-	char *p = buf;
-	int i;
-
-	argv[0] = p;
+	struct lockspace *ls;
 
-	for (i = 1; i < MAXARGS; i++) {
-		p = strchr(buf, sep);
-		if (!p)
-			break;
-		*p = '\0';
-		argv[i] = p + 1;
-		buf = p + 1;
+	list_for_each_entry(ls, &lockspaces, list) {
+		if (ls->global_id == id)
+			return ls;
 	}
-	*argc = i;
+	return NULL;
 }
-#endif
 
 static char *get_args(char *buf, int *argc, char **argv, char sep, int want)
 {
@@ -104,11 +189,31 @@
 	return rp;
 }
 
+char *dlm_mode_str(int mode)
+{
+	switch (mode) {
+	case DLM_LOCK_IV:
+		return "IV";
+	case DLM_LOCK_NL:
+		return "NL";
+	case DLM_LOCK_CR:
+		return "CR";
+	case DLM_LOCK_CW:
+		return "CW";
+	case DLM_LOCK_PR:
+		return "PR";
+	case DLM_LOCK_PW:
+		return "PW";
+	case DLM_LOCK_EX:
+		return "EX";
+	}
+	return "??";
+}
 
 /* recv "online" (join) and "offline" (leave) 
    messages from dlm via uevents and pass them on to groupd */
 
-int process_uevent(void)
+static void process_uevent(int ci)
 {
 	struct lockspace *ls;
 	char buf[MAXLINE];
@@ -119,18 +224,18 @@
 	memset(argv, 0, sizeof(char *) * MAXARGS);
 
  retry_recv:
-	rv = recv(uevent_fd, &buf, sizeof(buf), 0);
+	rv = recv(client[ci].fd, &buf, sizeof(buf), 0);
 	if (rv == -1 && rv == EINTR)
 		goto retry_recv;
 	if (rv == -1 && rv == EAGAIN)
-		return 0;
+		return;
 	if (rv < 0) {
 		log_error("uevent recv error %d errno %d", rv, errno);
 		goto out;
 	}
 
 	if (!strstr(buf, "dlm"))
-		return 0;
+		return;
 
 	log_debug("uevent: %s", buf);
 
@@ -141,7 +246,7 @@
 	sys = argv[2];
 
 	if ((strlen(sys) != strlen("dlm")) || strcmp(sys, "dlm"))
-		return 0;
+		return;
 
 	log_debug("kernel: %s %s", act, argv[3]);
 
@@ -177,17 +282,16 @@
 	if (rv < 0)
 		log_error("process_uevent %s error %d errno %d",
 			  act, rv, errno);
-	return rv;
 }
 
-int setup_uevent(void)
+static int setup_uevent(void)
 {
 	struct sockaddr_nl snl;
 	int s, rv;
 
 	s = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_KOBJECT_UEVENT);
 	if (s < 0) {
-		log_error("netlink socket");
+		log_error("uevent netlink socket");
 		return s;
 	}
 
@@ -206,67 +310,366 @@
 	return s;
 }
 
-int loop(void)
+/* FIXME: look into using libnl/libnetlink */
+
+#define GENLMSG_DATA(glh)       ((void *)(NLMSG_DATA(glh) + GENL_HDRLEN))
+#define GENLMSG_PAYLOAD(glh)    (NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN)
+#define NLA_DATA(na)	    	((void *)((char*)(na) + NLA_HDRLEN))
+#define NLA_PAYLOAD(len)	(len - NLA_HDRLEN)
+
+/* Maximum size of response requested or message sent */
+#define MAX_MSG_SIZE    1024
+
+struct msgtemplate {
+	struct nlmsghdr n;
+	struct genlmsghdr g;
+	char buf[MAX_MSG_SIZE];
+};
+
+static int send_genetlink_cmd(int sd, uint16_t nlmsg_type, uint32_t nlmsg_pid,
+			      uint8_t genl_cmd, uint16_t nla_type,
+			      void *nla_data, int nla_len)
+{
+	struct nlattr *na;
+	struct sockaddr_nl nladdr;
+	int r, buflen;
+	char *buf;
+
+	struct msgtemplate msg;
+
+	msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
+	msg.n.nlmsg_type = nlmsg_type;
+	msg.n.nlmsg_flags = NLM_F_REQUEST;
+	msg.n.nlmsg_seq = 0;
+	msg.n.nlmsg_pid = nlmsg_pid;
+	msg.g.cmd = genl_cmd;
+	msg.g.version = 0x1;
+	na = (struct nlattr *) GENLMSG_DATA(&msg);
+	na->nla_type = nla_type;
+	na->nla_len = nla_len + 1 + NLA_HDRLEN;
+	if (nla_data)
+		memcpy(NLA_DATA(na), nla_data, nla_len);
+	msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
+
+	buf = (char *) &msg;
+	buflen = msg.n.nlmsg_len ;
+	memset(&nladdr, 0, sizeof(nladdr));
+	nladdr.nl_family = AF_NETLINK;
+	while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr,
+			   sizeof(nladdr))) < buflen) {
+		if (r > 0) {
+			buf += r;
+			buflen -= r;
+		} else if (errno != EAGAIN)
+			return -1;
+	}
+	return 0;
+}
+
+/*
+ * Probe the controller in genetlink to find the family id
+ * for the DLM family
+ */
+static int get_family_id(int sd)
+{
+	char genl_name[100];
+	struct {
+		struct nlmsghdr n;
+		struct genlmsghdr g;
+		char buf[256];
+	} ans;
+
+	int id, rc;
+	struct nlattr *na;
+	int rep_len;
+
+	strcpy(genl_name, DLM_GENL_NAME);
+	rc = send_genetlink_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY,
+				CTRL_ATTR_FAMILY_NAME, (void *)genl_name,
+				strlen(DLM_GENL_NAME)+1);
+
+	rep_len = recv(sd, &ans, sizeof(ans), 0);
+	if (ans.n.nlmsg_type == NLMSG_ERROR ||
+	    (rep_len < 0) || !NLMSG_OK((&ans.n), rep_len))
+		return 0;
+
+	na = (struct nlattr *) GENLMSG_DATA(&ans);
+	na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
+	if (na->nla_type == CTRL_ATTR_FAMILY_ID) {
+		id = *(uint16_t *) NLA_DATA(na);
+	}
+	return id;
+}
+
+/* genetlink messages are timewarnings used as part of deadlock detection */
+
+static int setup_netlink(void)
 {
-	struct pollfd *pollfd;
-	int rv, i, maxi;
+	struct sockaddr_nl snl;
+	int s, rv;
+	uint16_t id;
+
+	s = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
+	if (s < 0) {
+		log_error("generic netlink socket");
+		return s;
+	}
 
-	pollfd = malloc(MAXCON * sizeof(struct pollfd));
-	if (!pollfd)
+	memset(&snl, 0, sizeof(snl));
+	snl.nl_family = AF_NETLINK;
+
+	rv = bind(s, (struct sockaddr *) &snl, sizeof(snl));
+	if (rv < 0) {
+		log_error("gen netlink bind error %d errno %d", rv, errno);
+		close(s);
+		return rv;
+	}
+
+	id = get_family_id(s);
+	if (!id) {
+		log_error("Error getting family id, errno %d", errno);
+		close(s);
 		return -1;
+	}
 
-	rv = groupd_fd = setup_groupd();
+	rv = send_genetlink_cmd(s, id, getpid(), DLM_CMD_HELLO, 0, NULL, 0);
+	if (rv < 0) {
+		log_error("error sending hello cmd, errno %d", errno);
+		close(s);
+		return -1;
+	}
+
+	return s;
+}
+
+static void process_timewarn(struct dlm_lock_data *data)
+{
+	struct lockspace *ls;
+	struct timeval now;
+	unsigned int sec;
+
+	ls = find_ls_id(data->lockspace_id);
+	if (!ls)
+		return;
+
+	data->resource_name[data->resource_namelen] = '\0';
+
+	log_group(ls, "timewarn: lkid %x pid %d name %s",
+		  data->id, data->ownpid, data->resource_name);
+
+	/* Problem: we don't want to get a timewarn, assume it's resolved
+	   by the current cycle, but in fact it's from a deadlock that
+	   formed after the checkpoints for the current cycle.  Then we'd
+	   have to hope for another warning (that may not come) to trigger
+	   a new cycle to catch the deadlock.  If our last cycle ckpt
+	   was say N (~5?) sec before we receive the timewarn, then we
+	   can be confident that the cycle included the lock in question.
+	   Otherwise, we're not sure if the warning is for a new deadlock
+	   that's formed since our last cycle ckpt (unless it's a long
+	   enough time since the last cycle that we're confident it *is*
+	   a new deadlock).  When there is a deadlock, I suspect it will
+	   be common to receive warnings before, during, and possibly
+	   after the cycle that resolves it.  Wonder if we should record
+	   timewarns and match them with deadlock cycles so we can tell
+	   which timewarns are addressed by a given cycle and which aren't.  */
+
+
+	gettimeofday(&now, NULL);
+
+	/* don't send a new start until at least SECS after the last
+	   we sent, and at least SECS after the last completed cycle */
+
+	sec = now.tv_sec - ls->last_send_cycle_start.tv_sec;
+
+	if (sec < DEADLOCK_CHECK_SECS) {
+		log_group(ls, "skip send: recent send cycle %d sec", sec);
+		return;
+	}
+
+	sec = now.tv_sec - ls->cycle_end_time.tv_sec;
+
+	if (sec < DEADLOCK_CHECK_SECS) {
+		log_group(ls, "skip send: recent cycle end %d sec", sec);
+		return;
+	}
+
+	gettimeofday(&ls->last_send_cycle_start, NULL);
+	send_cycle_start(ls);
+}
+
+static void process_netlink(int ci)
+{
+	struct msgtemplate msg;
+	struct nlattr *na;
+	int len;
+
+	len = recv(client[ci].fd, &msg, sizeof(msg), 0);
+
+	if (len < 0) {
+		log_error("nonfatal netlink error: errno %d", errno);
+		return;
+	}
+
+	if (msg.n.nlmsg_type == NLMSG_ERROR || !NLMSG_OK((&msg.n), len)) {
+		struct nlmsgerr *err = NLMSG_DATA(&msg);
+		log_error("fatal netlink error: errno %d", err->error);
+		return;
+	}
+
+	na = (struct nlattr *) GENLMSG_DATA(&msg);
+
+	process_timewarn((struct dlm_lock_data *) NLA_DATA(na));
+}
+
+static void process_connection(int ci)
+{
+	char buf[DLM_CONTROLD_MSGLEN], *argv[MAXARGS];
+	int argc = 0, rv;
+	struct lockspace *ls;
+
+	memset(buf, 0, sizeof(buf));
+	memset(argv, 0, sizeof(char *) * MAXARGS);
+
+	rv = do_read(client[ci].fd, buf, DLM_CONTROLD_MSGLEN);
+	if (rv < 0) {
+		log_error("client %d fd %d read error %d %d", ci,
+			  client[ci].fd, rv, errno);
+		client_dead(ci);
+		return;
+	}
+
+	log_debug("ci %d read %s", ci, buf);
+
+	get_args(buf, &argc, argv, ' ', 2);
+
+	if (!strncmp(argv[0], "deadlock_check", 14)) {
+		ls = find_ls(argv[1]);
+		if (ls)
+			send_cycle_start(ls);
+		else
+			log_debug("deadlock_check ls name not found");
+	}
+}
+
+static void process_listener(int ci)
+{
+	int fd, i;
+
+	fd = accept(client[ci].fd, NULL, NULL);
+	if (fd < 0) {
+		log_error("process_listener: accept error %d %d", fd, errno);
+		return;
+	}
+	
+	i = client_add(fd, process_connection, NULL);
+
+	log_debug("client connection %d fd %d", i, fd);
+}
+
+static int setup_listener(void)
+{
+	struct sockaddr_un addr;
+	socklen_t addrlen;
+	int rv, s;
+
+	/* we listen for new client connections on socket s */
+
+	s = socket(AF_LOCAL, SOCK_STREAM, 0);
+	if (s < 0) {
+		log_error("socket error %d %d", s, errno);
+		return s;
+	}
+
+	memset(&addr, 0, sizeof(addr));
+	addr.sun_family = AF_LOCAL;
+	strcpy(&addr.sun_path[1], DLM_CONTROLD_SOCK_PATH);
+	addrlen = sizeof(sa_family_t) + strlen(addr.sun_path+1) + 1;
+
+	rv = bind(s, (struct sockaddr *) &addr, addrlen);
+	if (rv < 0) {
+		log_error("bind error %d %d", rv, errno);
+		close(s);
+		return rv;
+	}
+
+	rv = listen(s, 5);
+	if (rv < 0) {
+		log_error("listen error %d %d", rv, errno);
+		close(s);
+		return rv;
+	}
+	return s;
+}
+
+void cluster_dead(int ci)
+{
+	log_error("cluster is down, exiting");
+	clear_configfs();
+	exit(1);
+}
+
+static int loop(void)
+{
+	int rv, i;
+	void (*workfn) (int ci);
+	void (*deadfn) (int ci);
+
+	rv = setup_listener();
+	if (rv < 0)
+		goto out;
+	client_add(rv, process_listener, NULL);
+
+	rv = setup_groupd();
 	if (rv < 0)
 		goto out;
-	pollfd[0].fd = groupd_fd;
-	pollfd[0].events = POLLIN;
+	client_add(rv, process_groupd, cluster_dead);
 
-	rv = uevent_fd = setup_uevent();
+	rv = setup_uevent();
 	if (rv < 0)
 		goto out;
-	pollfd[1].fd = uevent_fd;
-	pollfd[1].events = POLLIN;
+	client_add(rv, process_uevent, NULL);
 
-	rv = member_fd = setup_member();
+	rv = setup_member();
 	if (rv < 0)
 		goto out;
-	pollfd[2].fd = member_fd;
-	pollfd[2].events = POLLIN;
+	client_add(rv, process_member, cluster_dead);
 
-	maxi = 2;
+	/* netlink stuff is only used for deadlock detection */
+	if (!deadlock_enabled)
+		goto for_loop;
+
+	rv = setup_netlink();
+	if (rv < 0)
+		goto for_loop;
+	client_add(rv, process_netlink, NULL);
+
+ for_loop:
 
 	for (;;) {
-		rv = poll(pollfd, maxi + 1, -1);
-		if (rv == -1 && errno == EINTR)
+		rv = poll(pollfd, client_maxi + 1, -1);
+		if (rv == -1 && errno == EINTR) {
+			if (daemon_quit && list_empty(&lockspaces)) {
+				clear_configfs();
+				exit(1);
+			}
+			daemon_quit = 0;
 			continue;
+		}
 		if (rv < 0) {
 			log_error("poll errno %d", errno);
 			goto out;
 		}
 
-		for (i = 0; i <= maxi; i++) {
+		for (i = 0; i <= client_maxi; i++) {
+			if (client[i].fd < 0)
+				continue;
 			if (pollfd[i].revents & POLLIN) {
-				if (pollfd[i].fd == groupd_fd)
-					process_groupd();
-				else if (pollfd[i].fd == uevent_fd)
-					process_uevent();
-				else if (pollfd[i].fd == member_fd)
-					process_member();
+				workfn = client[i].workfn;
+				workfn(i);
 			}
-
 			if (pollfd[i].revents & POLLHUP) {
-				if (pollfd[i].fd == member_fd) {
-					log_error("cluster is down, exiting");
-					clear_configfs();
-					exit(1);
-				}
-				if (pollfd[i].fd == groupd_fd) {
-					log_error("groupd is down, exiting");
-					clear_configfs();
-					exit(1);
-				}
-				log_debug("closing fd %d", pollfd[i].fd);
-				close(pollfd[i].fd);
+				deadfn = client[i].deadfn;
+				deadfn(i);
 			}
 		}
 	}
@@ -346,6 +749,9 @@
 	printf("\n");
 	printf("Options:\n");
 	printf("\n");
+#if DEADLOCK
+	printf("  -d <num>     Enable (1) or disable (0, default) deadlock code\n");     
+#endif
 	printf("  -D	       Enable debugging code and don't fork\n");
 	printf("  -K	       Enable kernel dlm debugging messages\n");
 	printf("  -h	       Print this help, then exit\n");
@@ -374,7 +780,11 @@
 			print_usage();
 			exit(EXIT_SUCCESS);
 			break;
-
+#if DEADLOCK
+		case 'd':
+			deadlock_enabled = atoi(optarg);
+			break;
+#endif
 		case 'V':
 			printf("dlm_controld (built %s %s)\n", __DATE__, __TIME__);
 			/* printf("%s\n", REDHAT_COPYRIGHT); */
@@ -440,6 +850,8 @@
 	if (!daemon_debug_opt)
 		daemonize();
 
+	setup_deadlock();
+
 	signal(SIGTERM, sigterm_handler);
 
 	set_scheduler();
--- cluster/group/dlm_controld/member_cman.c	2007/10/05 14:20:33	1.4.2.4
+++ cluster/group/dlm_controld/member_cman.c	2007/10/26 21:33:27	1.4.2.5
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -13,15 +13,14 @@
 #include <libcman.h>
 #include "dlm_daemon.h"
 
+int			our_nodeid;
 static cman_handle_t	ch;
 static cman_node_t      old_nodes[MAX_NODES];
 static int              old_node_count;
 static cman_node_t      cman_nodes[MAX_NODES];
 static int              cman_node_count;
-static int		local_nodeid;
 extern struct list_head lockspaces;
 
-
 static int is_member(cman_node_t *node_list, int count, int nodeid)
 {
 	int i;
@@ -82,14 +81,6 @@
 		return;
 	}
 
-	/* Never allow node ID 0 to be considered a member #315711 */
-	for (i = 0; i < cman_node_count; i++) {
-		if (cman_nodes[i].cn_nodeid == 0) {
-			cman_nodes[i].cn_member = 0;
-			break;
-		}
-	}
-
 	for (i = 0; i < old_node_count; i++) {
 		if (old_nodes[i].cn_member &&
 		    !is_cman_member(old_nodes[i].cn_nodeid)) {
@@ -112,7 +103,7 @@
 					  cman_nodes[i].cn_address.cna_address,
 					  cman_nodes[i].cn_address.cna_addrlen,
 					  (cman_nodes[i].cn_nodeid ==
-					   local_nodeid));
+					   our_nodeid));
 		}
 	}
 }
@@ -134,7 +125,7 @@
 	}
 }
 
-int process_member(void)
+void process_member(int ci)
 {
 	int rv;
 
@@ -146,7 +137,6 @@
 		clear_configfs();
 		exit(1);
 	}
-	return 0;
 }
 
 int setup_member(void)
@@ -156,7 +146,7 @@
 
 	ch = cman_init(NULL);
 	if (!ch) {
-		log_error("cman_init error %d %d", (int) ch, errno);
+		log_error("cman_init error %p %d", ch, errno);
 		return -ENOTCONN;
 	}
 
@@ -179,7 +169,7 @@
 		fd = rv;
 		goto out;
 	}
-	local_nodeid = node.cn_nodeid;
+	our_nodeid = node.cn_nodeid;
 
 	old_node_count = 0;
 	memset(&old_nodes, 0, sizeof(old_nodes));

next             reply	other threads:[~2007-10-26 21:33 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-10-26 21:33 teigland [this message]
  -- strict thread matches above, loose matches on Subject: below --
2007-07-24 18:15 [Cluster-devel] cluster/group/dlm_controld Makefile action.c d teigland

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20071026213328.7264.qmail@sourceware.org \
    --to=teigland@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.