From mboxrd@z Thu Jan  1 00:00:00 1970
From: teigland@sourceware.org <teigland@sourceware.org>
Date: 17 Aug 2007 21:17:53 -0000
Subject: [Cluster-devel] cluster/group/dlm_controld deadlock.c dlm_daemon.h main.c
Message-ID: <20070817211753.15992.qmail@sourceware.org>
List-Id: <cluster-devel.redhat.com>
To: cluster-devel.redhat.com
MIME-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	teigland at sourceware.org	2007-08-17 21:17:53

Modified files:
	group/dlm_controld: deadlock.c dlm_daemon.h main.c 

Log message:
	handle addition/removal/failure of nodes during a deadlock cycle;
	serialize deadlock cycles and limit how often cycles are started

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/dlm_controld/deadlock.c.diff?cvsroot=cluster&r1=1.5&r2=1.6
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/dlm_controld/dlm_daemon.h.diff?cvsroot=cluster&r1=1.12&r2=1.13
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/dlm_controld/main.c.diff?cvsroot=cluster&r1=1.13&r2=1.14

--- cluster/group/dlm_controld/deadlock.c	2007/08/13 15:32:05	1.5
+++ cluster/group/dlm_controld/deadlock.c	2007/08/17 21:17:53	1.6
@@ -28,7 +28,8 @@
 struct node {
 	struct list_head	list;
 	int			nodeid;
-	int			checkpoint_ready;
+	int			checkpoint_ready; /* we've read its ckpt */
+	int			in_cycle;         /* participating in cycle */
 };
 
 enum {
@@ -96,8 +97,9 @@
 #define DLM_HEADER_PATCH	0
 
 #define DLM_MSG_CYCLE_START	 1
-#define DLM_MSG_CHECKPOINT_READY 2
-#define DLM_MSG_CANCEL_LOCK	 3
+#define DLM_MSG_CYCLE_END	 2
+#define DLM_MSG_CHECKPOINT_READY 3
+#define DLM_MSG_CANCEL_LOCK	 4
 
 struct dlm_header {
 	uint16_t		version[3];
@@ -816,6 +818,7 @@
 
 	/* unlink an old checkpoint before we create a new one */
 	if (ls->lock_ckpt_handle) {
+		log_error("write_checkpoint: old ckpt");
 		if (_unlink_checkpoint(ls, &name))
 			return;
 	}
@@ -984,6 +987,14 @@
 	send_message(ls, DLM_MSG_CYCLE_START);
 }
 
+void send_cycle_end(struct lockspace *ls)
+{
+	if (!deadlock_enabled)
+		return;
+	log_group(ls, "send_cycle_end");
+	send_message(ls, DLM_MSG_CYCLE_END);
+}
+
 static void send_cancel_lock(struct lockspace *ls, struct trans *tr,
 			     struct dlm_lkb *lkb)
 {
@@ -1053,22 +1064,18 @@
 
 static void find_deadlock(struct lockspace *ls);
 
-static void receive_checkpoint_ready(struct lockspace *ls, int nodeid)
+static void run_deadlock(struct lockspace *ls)
 {
 	struct node *node;
 	int not_ready = 0;
 	int low = -1;
 
-	log_group(ls, "receive_checkpoint_ready from %d", nodeid);
-
-	read_checkpoint(ls, nodeid);
-
-	/* when locks are read from all nodes, then search_deadlock()
-	   to do detection */
+	if (ls->all_checkpoints_ready)
+		log_group(ls, "WARNING: run_deadlock all_checkpoints_ready");
 
 	list_for_each_entry(node, &ls->nodes, list) {
-		if (node->nodeid == nodeid)
-			node->checkpoint_ready = 1;
+		if (!node->in_cycle)
+			continue;
 		if (!node->checkpoint_ready)
 			not_ready++;
 
@@ -1078,23 +1085,56 @@
 	if (not_ready)
 		return;
 
+	ls->all_checkpoints_ready = 1;
+
 	list_for_each_entry(node, &ls->nodes, list) {
+		if (!node->in_cycle)
+			continue;
 		if (node->nodeid < low || low == -1)
 			low = node->nodeid;
 	}
 	ls->low_nodeid = low;
-	log_group(ls, "low nodeid in charge of resolution is %d", low);
 
-	find_deadlock(ls);
+	if (low == our_nodeid)
+		find_deadlock(ls);
+	else
+		log_group(ls, "defer resolution to low nodeid %d", low);
+}
+
+static void receive_checkpoint_ready(struct lockspace *ls, int nodeid)
+{
+	struct node *node;
+
+	log_group(ls, "receive_checkpoint_ready from %d", nodeid);
+
+	read_checkpoint(ls, nodeid);
+
+	list_for_each_entry(node, &ls->nodes, list) {
+		if (node->nodeid == nodeid) {
+			node->checkpoint_ready = 1;
+			break;
+		}
+	}
+
+	run_deadlock(ls);
 }
 
 static void receive_cycle_start(struct lockspace *ls, int nodeid)
 {
+	struct node *node;
 	int rv;
 
-	log_group(ls, "receive_cycle_start %d", nodeid);
+	log_group(ls, "receive_cycle_start from %d", nodeid);
+
+	if (ls->cycle_running) {
+		log_group(ls, "cycle already running");
+		return;
+	}
+	ls->cycle_running = 1;
+	gettimeofday(&ls->cycle_start_time, NULL);
 
-	gettimeofday(&ls->last_deadlock_check, NULL);
+	list_for_each_entry(node, &ls->nodes, list)
+		node->in_cycle = 1;
 
 	rv = read_debugfs_locks(ls);
 	if (rv < 0) {
@@ -1111,6 +1151,46 @@
 	send_checkpoint_ready(ls);
 }
 
+static uint64_t dt_usec(struct timeval *start, struct timeval *stop)
+{
+	uint64_t dt;
+
+	dt = stop->tv_sec - start->tv_sec;
+	dt *= 1000000;
+	dt += stop->tv_usec - start->tv_usec;
+	return dt;
+}
+
+/* TODO: nodes added during a cycle - what will they do with messages
+   they recv from other nodes running the cycle? */
+
+static void receive_cycle_end(struct lockspace *ls, int nodeid)
+{
+	struct node *node;
+	uint64_t usec;
+
+	if (!ls->cycle_running) {
+		log_error("receive_cycle_end %s from %d: no cycle running",
+			  ls->name, nodeid);
+		return;
+	}
+
+	gettimeofday(&ls->cycle_end_time, NULL);
+	usec = dt_usec(&ls->cycle_start_time, &ls->cycle_end_time);
+	log_group(ls, "receive_cycle_end: from %d cycle time %.2f s",
+		  nodeid, usec * 1.e-6);
+
+	ls->cycle_running = 0;
+	ls->all_checkpoints_ready = 0;
+
+	list_for_each_entry(node, &ls->nodes, list)
+		node->checkpoint_ready = 0;
+
+	free_resources(ls);
+	free_transactions(ls);
+	unlink_checkpoint(ls);
+}
+
 static void receive_cancel_lock(struct lockspace *ls, int nodeid, uint32_t lkid)
 {
 	dlm_lshandle_t h;
@@ -1167,6 +1247,9 @@
 	case DLM_MSG_CYCLE_START:
 		receive_cycle_start(ls, hd->nodeid);
 		break;
+	case DLM_MSG_CYCLE_END:
+		receive_cycle_end(ls, hd->nodeid);
+		break;
 	case DLM_MSG_CHECKPOINT_READY:
 		receive_checkpoint_ready(ls, hd->nodeid);
 		break;
@@ -1203,14 +1286,14 @@
 		if (node->nodeid != nodeid)
 			continue;
 
-		/* TODO: purge locks from this node if we're in a cycle */
-
 		list_del(&node->list);
 		free(node);
 		log_group(ls, "node %d left deadlock cpg", nodeid);
 	}
 }
 
+static void purge_locks(struct lockspace *ls, int nodeid);
+
 static void confchg_cb(cpg_handle_t handle, struct cpg_name *group_name,
 		struct cpg_address *member_list, int member_list_entries,
 		struct cpg_address *left_list, int left_list_entries,
@@ -1230,11 +1313,36 @@
 		return;
 	}
 
+	/* nodes added during a cycle won't have node->in_cycle set so they
+	   won't be included in any of the cycle processing */
+
 	for (i = 0; i < joined_list_entries; i++)
 		node_joined(ls, joined_list[i].nodeid);
 
 	for (i = 0; i < left_list_entries; i++)
 		node_left(ls, left_list[i].nodeid, left_list[i].reason);
+
+	if (!ls->cycle_running)
+		return;
+
+	if (!left_list_entries)
+		return;
+
+	if (!ls->all_checkpoints_ready) {
+		run_deadlock(ls);
+		return;
+	}
+
+	for (i = 0; i < left_list_entries; i++)
+		purge_locks(ls, left_list[i].nodeid);
+
+	for (i = 0; i < left_list_entries; i++) {
+		if (left_list[i].nodeid != ls->low_nodeid)
+			continue;
+		/* this will set a new low node which will call find_deadlock */
+		run_deadlock(ls);
+		break;
+	}
 }
 
 static void process_deadlock_cpg(int ci)
@@ -1343,6 +1451,29 @@
 	client_dead(ls->cpg_ci);
 }
 
+/* would we ever call this after we've created the transaction lists?
+   I don't think so; I think it can only be called between reading
+   checkpoints */
+
+static void purge_locks(struct lockspace *ls, int nodeid)
+{
+	struct dlm_rsb *r;
+	struct dlm_lkb *lkb, *safe;
+
+	list_for_each_entry(r, &ls->resources, list) {
+		list_for_each_entry_safe(lkb, safe, &r->locks, list) {
+			if (lkb->home == nodeid) {
+				list_del(&lkb->list);
+				if (list_empty(&lkb->trans_list))
+					free(lkb);
+				else
+					log_group(ls, "purge %d %x on trans",
+						  nodeid, lkb->lock.id);
+			}
+		}
+	}
+}
+
 static void add_lkb_trans(struct trans *tr, struct dlm_lkb *lkb)
 {
 	list_add(&lkb->trans_list, &tr->locks);
@@ -1543,7 +1674,7 @@
 	}
 
 	if (remove_tr->others_waiting_on_us)
-		log_debug("trans %llx removed others waiting %d",
+		log_group(ls, "trans %llx removed others waiting %d",
 			  (unsigned long long)remove_tr->xid,
 			  remove_tr->others_waiting_on_us);
 }
@@ -1675,16 +1806,14 @@
 
 static void find_deadlock(struct lockspace *ls)
 {
-	struct node *node;
-
 	if (list_empty(&ls->resources)) {
 		log_group(ls, "no deadlock: no resources");
-		return;
+		goto out;
 	}
 
 	if (!list_empty(&ls->transactions)) {
 		log_group(ls, "transactions list should be empty");
-		return;
+		goto out;
 	}
 
 	dump_resources(ls);
@@ -1701,13 +1830,6 @@
 	log_group(ls, "found deadlock");
 	dump_all_trans(ls);
 
-	/* TODO: should probably do this above instead */
-	if (ls->low_nodeid != our_nodeid) {
-		log_group(ls, "defer resolution to low nodeid %d",
-			  ls->low_nodeid);
-		goto out;
-	}
-
 	cancel_trans(ls);
 	reduce_waitfor_graph_loop(ls);
 
@@ -1718,12 +1840,7 @@
 
 	log_error("deadlock resolution failed");
 	dump_all_trans(ls);
-
  out:
-	free_resources(ls);
-	free_transactions(ls);
-
-	list_for_each_entry(node, &ls->nodes, list)
-		node->checkpoint_ready = 0;
+	send_cycle_end(ls);
 }
 
--- cluster/group/dlm_controld/dlm_daemon.h	2007/08/10 20:23:07	1.12
+++ cluster/group/dlm_controld/dlm_daemon.h	2007/08/17 21:17:53	1.13
@@ -94,9 +94,12 @@
 	struct list_head	transactions;
 	struct list_head	resources;
 	struct list_head	nodes;
-	struct timeval		last_deadlock_check;
-	unsigned int		timewarn_count;
+	struct timeval		cycle_start_time;
+	struct timeval		cycle_end_time;
+	struct timeval		last_send_cycle_start;
 	int			got_first_confchg;
+	int			cycle_running;
+	int			all_checkpoints_ready;
 };
 
 /* action.c */
--- cluster/group/dlm_controld/main.c	2007/08/06 21:50:26	1.13
+++ cluster/group/dlm_controld/main.c	2007/08/17 21:17:53	1.14
@@ -446,28 +446,55 @@
 {
 	struct lockspace *ls;
 	struct timeval now;
+	unsigned int sec;
 
 	ls = find_ls_id(data->lockspace_id);
 	if (!ls)
 		return;
 
-	log_group(ls, "timewarn: lkid %x pid %d count %d",
-		  data->id, data->ownpid, ls->timewarn_count);
+	data->resource_name[data->resource_namelen] = '\0';
+
+	log_group(ls, "timewarn: lkid %x pid %d name %s",
+		  data->id, data->ownpid, data->resource_name);
+
+	/* Problem: we don't want to get a timewarn, assume it's resolved
+	   by the current cycle, but in fact it's from a deadlock that
+	   formed after the checkpoints for the current cycle.  Then we'd
+	   have to hope for another warning (that may not come) to trigger
+	   a new cycle to catch the deadlock.  If our last cycle ckpt
+	   was say N (~5?) sec before we receive the timewarn, then we
+	   can be confident that the cycle included the lock in question.
+	   Otherwise, we're not sure if the warning is for a new deadlock
+	   that's formed since our last cycle ckpt (unless it's a long
+	   enough time since the last cycle that we're confident it *is*
+	   a new deadlock).  When there is a deadlock, I suspect it will
+	   be common to receive warnings before, during, and possibly
+	   after the cycle that resolves it.  Wonder if we should record
+	   timewarns and match them with deadlock cycles so we can tell
+	   which timewarns are addressed by a given cycle and which aren't.  */
+
 
 	gettimeofday(&now, NULL);
 
-	if (now.tv_sec - ls->last_deadlock_check.tv_sec > DEADLOCK_CHECK_SECS) {
-		ls->timewarn_count = 0;
-		send_cycle_start(ls);
-	} else {
-		/* TODO: set a poll timeout and start another cycle after
-		   DEADLOCK_CHECK_SECS.  Want to save a record of all the
-		   warned locks to see if they're still blocked later before
-		   starting a cycle?  This would only be helpful if we
-		   experienced regular false-warnings, indicating that the
-		   timewarn setting should be larger. */
-		ls->timewarn_count++;
+	/* don't send a new start until at least SECS after the last
+	   we sent, and at least SECS after the last completed cycle */
+
+	sec = now.tv_sec - ls->last_send_cycle_start.tv_sec;
+
+	if (sec < DEADLOCK_CHECK_SECS) {
+		log_group(ls, "skip send: recent send cycle %d sec", sec);
+		return;
+	}
+
+	sec = now.tv_sec - ls->cycle_end_time.tv_sec;
+
+	if (sec < DEADLOCK_CHECK_SECS) {
+		log_group(ls, "skip send: recent cycle end %d sec", sec);
+		return;
 	}
+
+	gettimeofday(&ls->last_send_cycle_start, NULL);
+	send_cycle_start(ls);
 }
 
 static void process_netlink(int ci)