All of lore.kernel.org
 help / color / mirror / Atom feed
From: teigland@sourceware.org <teigland@sourceware.org>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] cluster/group/dlm_controld deadlock.c dlm_daem ...
Date: 17 Aug 2007 21:17:53 -0000	[thread overview]
Message-ID: <20070817211753.15992.qmail@sourceware.org> (raw)

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	teigland at sourceware.org	2007-08-17 21:17:53

Modified files:
	group/dlm_controld: deadlock.c dlm_daemon.h main.c 

Log message:
	handle addition/removal/failure of nodes during a deadlock cycle
	serialize deadlock cycles and limit how often cycles are started

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/dlm_controld/deadlock.c.diff?cvsroot=cluster&r1=1.5&r2=1.6
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/dlm_controld/dlm_daemon.h.diff?cvsroot=cluster&r1=1.12&r2=1.13
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/dlm_controld/main.c.diff?cvsroot=cluster&r1=1.13&r2=1.14

--- cluster/group/dlm_controld/deadlock.c	2007/08/13 15:32:05	1.5
+++ cluster/group/dlm_controld/deadlock.c	2007/08/17 21:17:53	1.6
@@ -28,7 +28,8 @@
 struct node {
 	struct list_head	list;
 	int			nodeid;
-	int			checkpoint_ready;
+	int			checkpoint_ready; /* we've read its ckpt */
+	int			in_cycle;         /* participating in cycle */
 };
 
 enum {
@@ -96,8 +97,9 @@
 #define DLM_HEADER_PATCH	0
 
 #define DLM_MSG_CYCLE_START	 1
-#define DLM_MSG_CHECKPOINT_READY 2
-#define DLM_MSG_CANCEL_LOCK	 3
+#define DLM_MSG_CYCLE_END	 2
+#define DLM_MSG_CHECKPOINT_READY 3
+#define DLM_MSG_CANCEL_LOCK	 4
 
 struct dlm_header {
 	uint16_t		version[3];
@@ -816,6 +818,7 @@
 
 	/* unlink an old checkpoint before we create a new one */
 	if (ls->lock_ckpt_handle) {
+		log_error("write_checkpoint: old ckpt");
 		if (_unlink_checkpoint(ls, &name))
 			return;
 	}
@@ -984,6 +987,14 @@
 	send_message(ls, DLM_MSG_CYCLE_START);
 }
 
+void send_cycle_end(struct lockspace *ls)
+{
+	if (!deadlock_enabled)
+		return;
+	log_group(ls, "send_cycle_end");
+	send_message(ls, DLM_MSG_CYCLE_END);
+}
+
 static void send_cancel_lock(struct lockspace *ls, struct trans *tr,
 			     struct dlm_lkb *lkb)
 {
@@ -1053,22 +1064,18 @@
 
 static void find_deadlock(struct lockspace *ls);
 
-static void receive_checkpoint_ready(struct lockspace *ls, int nodeid)
+static void run_deadlock(struct lockspace *ls)
 {
 	struct node *node;
 	int not_ready = 0;
 	int low = -1;
 
-	log_group(ls, "receive_checkpoint_ready from %d", nodeid);
-
-	read_checkpoint(ls, nodeid);
-
-	/* when locks are read from all nodes, then search_deadlock()
-	   to do detection */
+	if (ls->all_checkpoints_ready)
+		log_group(ls, "WARNING: run_deadlock all_checkpoints_ready");
 
 	list_for_each_entry(node, &ls->nodes, list) {
-		if (node->nodeid == nodeid)
-			node->checkpoint_ready = 1;
+		if (!node->in_cycle)
+			continue;
 		if (!node->checkpoint_ready)
 			not_ready++;
 
@@ -1078,23 +1085,56 @@
 	if (not_ready)
 		return;
 
+	ls->all_checkpoints_ready = 1;
+
 	list_for_each_entry(node, &ls->nodes, list) {
+		if (!node->in_cycle)
+			continue;
 		if (node->nodeid < low || low == -1)
 			low = node->nodeid;
 	}
 	ls->low_nodeid = low;
-	log_group(ls, "low nodeid in charge of resolution is %d", low);
 
-	find_deadlock(ls);
+	if (low == our_nodeid)
+		find_deadlock(ls);
+	else
+		log_group(ls, "defer resolution to low nodeid %d", low);
+}
+
+static void receive_checkpoint_ready(struct lockspace *ls, int nodeid)
+{
+	struct node *node;
+
+	log_group(ls, "receive_checkpoint_ready from %d", nodeid);
+
+	read_checkpoint(ls, nodeid);
+
+	list_for_each_entry(node, &ls->nodes, list) {
+		if (node->nodeid == nodeid) {
+			node->checkpoint_ready = 1;
+			break;
+		}
+	}
+
+	run_deadlock(ls);
 }
 
 static void receive_cycle_start(struct lockspace *ls, int nodeid)
 {
+	struct node *node;
 	int rv;
 
-	log_group(ls, "receive_cycle_start %d", nodeid);
+	log_group(ls, "receive_cycle_start from %d", nodeid);
+
+	if (ls->cycle_running) {
+		log_group(ls, "cycle already running");
+		return;
+	}
+	ls->cycle_running = 1;
+	gettimeofday(&ls->cycle_start_time, NULL);
 
-	gettimeofday(&ls->last_deadlock_check, NULL);
+	list_for_each_entry(node, &ls->nodes, list)
+		node->in_cycle = 1;
 
 	rv = read_debugfs_locks(ls);
 	if (rv < 0) {
@@ -1111,6 +1151,46 @@
 	send_checkpoint_ready(ls);
 }
 
+static uint64_t dt_usec(struct timeval *start, struct timeval *stop)
+{
+	uint64_t dt;
+
+	dt = stop->tv_sec - start->tv_sec;
+	dt *= 1000000;
+	dt += stop->tv_usec - start->tv_usec;
+	return dt;
+}
+
+/* TODO: nodes added during a cycle - what will they do with messages
+   they recv from other nodes running the cycle? */
+
+static void receive_cycle_end(struct lockspace *ls, int nodeid)
+{
+	struct node *node;
+	uint64_t usec;
+
+	if (!ls->cycle_running) {
+		log_error("receive_cycle_end %s from %d: no cycle running",
+			  ls->name, nodeid);
+		return;
+	}
+
+	gettimeofday(&ls->cycle_end_time, NULL);
+	usec = dt_usec(&ls->cycle_start_time, &ls->cycle_end_time);
+	log_group(ls, "receive_cycle_end: from %d cycle time %.2f s",
+		  nodeid, usec * 1.e-6);
+
+	ls->cycle_running = 0;
+	ls->all_checkpoints_ready = 0;
+
+	list_for_each_entry(node, &ls->nodes, list)
+		node->checkpoint_ready = 0;
+
+	free_resources(ls);
+	free_transactions(ls);
+	unlink_checkpoint(ls);
+}
+
 static void receive_cancel_lock(struct lockspace *ls, int nodeid, uint32_t lkid)
 {
 	dlm_lshandle_t h;
@@ -1167,6 +1247,9 @@
 	case DLM_MSG_CYCLE_START:
 		receive_cycle_start(ls, hd->nodeid);
 		break;
+	case DLM_MSG_CYCLE_END:
+		receive_cycle_end(ls, hd->nodeid);
+		break;
 	case DLM_MSG_CHECKPOINT_READY:
 		receive_checkpoint_ready(ls, hd->nodeid);
 		break;
@@ -1203,14 +1286,14 @@
 		if (node->nodeid != nodeid)
 			continue;
 
-		/* TODO: purge locks from this node if we're in a cycle */
-
 		list_del(&node->list);
 		free(node);
 		log_group(ls, "node %d left deadlock cpg", nodeid);
 	}
 }
 
+static void purge_locks(struct lockspace *ls, int nodeid);
+
 static void confchg_cb(cpg_handle_t handle, struct cpg_name *group_name,
 		struct cpg_address *member_list, int member_list_entries,
 		struct cpg_address *left_list, int left_list_entries,
@@ -1230,11 +1313,36 @@
 		return;
 	}
 
+	/* nodes added during a cycle won't have node->in_cycle set so they
+	   won't be included in any of the cycle processing */
+
 	for (i = 0; i < joined_list_entries; i++)
 		node_joined(ls, joined_list[i].nodeid);
 
 	for (i = 0; i < left_list_entries; i++)
 		node_left(ls, left_list[i].nodeid, left_list[i].reason);
+
+	if (!ls->cycle_running)
+		return;
+
+	if (!left_list_entries)
+		return;
+
+	if (!ls->all_checkpoints_ready) {
+		run_deadlock(ls);
+		return;
+	}
+
+	for (i = 0; i < left_list_entries; i++)
+		purge_locks(ls, left_list[i].nodeid);
+
+	for (i = 0; i < left_list_entries; i++) {
+		if (left_list[i].nodeid != ls->low_nodeid)
+			continue;
+		/* this will set a new low node which will call find_deadlock */
+		run_deadlock(ls);
+		break;
+	}
 }
 
 static void process_deadlock_cpg(int ci)
@@ -1343,6 +1451,29 @@
 	client_dead(ls->cpg_ci);
 }
 
+/* would we ever call this after we've created the transaction lists?
+   I don't think so; I think it can only be called between reading
+   checkpoints */
+
+static void purge_locks(struct lockspace *ls, int nodeid)
+{
+	struct dlm_rsb *r;
+	struct dlm_lkb *lkb, *safe;
+
+	list_for_each_entry(r, &ls->resources, list) {
+		list_for_each_entry_safe(lkb, safe, &r->locks, list) {
+			if (lkb->home == nodeid) {
+				list_del(&lkb->list);
+				if (list_empty(&lkb->trans_list))
+					free(lkb);
+				else
+					log_group(ls, "purge %d %x on trans",
+						  nodeid, lkb->lock.id);
+			}
+		}
+	}
+}
+
 static void add_lkb_trans(struct trans *tr, struct dlm_lkb *lkb)
 {
 	list_add(&lkb->trans_list, &tr->locks);
@@ -1543,7 +1674,7 @@
 	}
 
 	if (remove_tr->others_waiting_on_us)
-		log_debug("trans %llx removed others waiting %d",
+		log_group(ls, "trans %llx removed others waiting %d",
 			  (unsigned long long)remove_tr->xid,
 			  remove_tr->others_waiting_on_us);
 }
@@ -1675,16 +1806,14 @@
 
 static void find_deadlock(struct lockspace *ls)
 {
-	struct node *node;
-
 	if (list_empty(&ls->resources)) {
 		log_group(ls, "no deadlock: no resources");
-		return;
+		goto out;
 	}
 
 	if (!list_empty(&ls->transactions)) {
 		log_group(ls, "transactions list should be empty");
-		return;
+		goto out;
 	}
 
 	dump_resources(ls);
@@ -1701,13 +1830,6 @@
 	log_group(ls, "found deadlock");
 	dump_all_trans(ls);
 
-	/* TODO: should probably do this above instead */
-	if (ls->low_nodeid != our_nodeid) {
-		log_group(ls, "defer resolution to low nodeid %d",
-			  ls->low_nodeid);
-		goto out;
-	}
-
 	cancel_trans(ls);
 	reduce_waitfor_graph_loop(ls);
 
@@ -1718,12 +1840,7 @@
 
 	log_error("deadlock resolution failed");
 	dump_all_trans(ls);
-
  out:
-	free_resources(ls);
-	free_transactions(ls);
-
-	list_for_each_entry(node, &ls->nodes, list)
-		node->checkpoint_ready = 0;
+	send_cycle_end(ls);
 }
 
--- cluster/group/dlm_controld/dlm_daemon.h	2007/08/10 20:23:07	1.12
+++ cluster/group/dlm_controld/dlm_daemon.h	2007/08/17 21:17:53	1.13
@@ -94,9 +94,12 @@
 	struct list_head	transactions;
 	struct list_head	resources;
 	struct list_head	nodes;
-	struct timeval		last_deadlock_check;
-	unsigned int		timewarn_count;
+	struct timeval		cycle_start_time;
+	struct timeval		cycle_end_time;
+	struct timeval		last_send_cycle_start;
 	int			got_first_confchg;
+	int			cycle_running;
+	int			all_checkpoints_ready;
 };
 
 /* action.c */
--- cluster/group/dlm_controld/main.c	2007/08/06 21:50:26	1.13
+++ cluster/group/dlm_controld/main.c	2007/08/17 21:17:53	1.14
@@ -446,28 +446,55 @@
 {
 	struct lockspace *ls;
 	struct timeval now;
+	unsigned int sec;
 
 	ls = find_ls_id(data->lockspace_id);
 	if (!ls)
 		return;
 
-	log_group(ls, "timewarn: lkid %x pid %d count %d",
-		  data->id, data->ownpid, ls->timewarn_count);
+	data->resource_name[data->resource_namelen] = '\0';
+
+	log_group(ls, "timewarn: lkid %x pid %d name %s",
+		  data->id, data->ownpid, data->resource_name);
+
+	/* Problem: we don't want to get a timewarn, assume it's resolved
+	   by the current cycle, but in fact it's from a deadlock that
+	   formed after the checkpoints for the current cycle.  Then we'd
+	   have to hope for another warning (that may not come) to trigger
+	   a new cycle to catch the deadlock.  If our last cycle ckpt
+	   was say N (~5?) sec before we receive the timewarn, then we
+	   can be confident that the cycle included the lock in question.
+	   Otherwise, we're not sure if the warning is for a new deadlock
+	   that's formed since our last cycle ckpt (unless it's a long
+	   enough time since the last cycle that we're confident it *is*
+	   a new deadlock).  When there is a deadlock, I suspect it will
+	   be common to receive warnings before, during, and possibly
+	   after the cycle that resolves it.  Wonder if we should record
+	   timewarns and match them with deadlock cycles so we can tell
+	   which timewarns are addressed by a given cycle and which aren't.  */
+
 
 	gettimeofday(&now, NULL);
 
-	if (now.tv_sec - ls->last_deadlock_check.tv_sec > DEADLOCK_CHECK_SECS) {
-		ls->timewarn_count = 0;
-		send_cycle_start(ls);
-	} else {
-		/* TODO: set a poll timeout and start another cycle after
-		   DEADLOCK_CHECK_SECS.  Want to save a record of all the
-		   warned locks to see if they're still blocked later before
-		   starting a cycle?  This would only be helpful if we
-		   experienced regular false-warnings, indicating that the
-		   timewarn setting should be larger. */
-		ls->timewarn_count++;
+	/* don't send a new start until at least SECS after the last
+	   we sent, and at least SECS after the last completed cycle */
+
+	sec = now.tv_sec - ls->last_send_cycle_start.tv_sec;
+
+	if (sec < DEADLOCK_CHECK_SECS) {
+		log_group(ls, "skip send: recent send cycle %d sec", sec);
+		return;
+	}
+
+	sec = now.tv_sec - ls->cycle_end_time.tv_sec;
+
+	if (sec < DEADLOCK_CHECK_SECS) {
+		log_group(ls, "skip send: recent cycle end %d sec", sec);
+		return;
 	}
+
+	gettimeofday(&ls->last_send_cycle_start, NULL);
+	send_cycle_start(ls);
 }
 
 static void process_netlink(int ci)



                 reply	other threads:[~2007-08-17 21:17 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20070817211753.15992.qmail@sourceware.org \
    --to=teigland@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.