All of lore.kernel.org
 help / color / mirror / Atom feed
From: teigland@sourceware.org <teigland@sourceware.org>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] cluster/group/dlm_controld deadlock.c
Date: 23 Aug 2007 19:08:12 -0000	[thread overview]
Message-ID: <20070823190812.7054.qmail@sourceware.org> (raw)

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	teigland at sourceware.org	2007-08-23 19:08:11

Modified files:
	group/dlm_controld: deadlock.c 

Log message:
	needed to be a little more thorough in taking a canceled transaction
	out of the dependency graph for cases where multiple locks were blocked
	between the same two transactions

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/dlm_controld/deadlock.c.diff?cvsroot=cluster&r1=1.6&r2=1.7

--- cluster/group/dlm_controld/deadlock.c	2007/08/17 21:17:53	1.6
+++ cluster/group/dlm_controld/deadlock.c	2007/08/23 19:08:11	1.7
@@ -73,6 +73,8 @@
 	struct dlm_rsb		*rsb;       /* lock is on resource */
 	struct trans		*trans;     /* lock owned by this transaction */
 	struct list_head	trans_list; /* tr->locks */
+	struct trans		*waitfor_trans; /* the trans that's holding the
+						   lock that's blocking us */
 };
 
 /* waitfor pointers alloc'ed 4 at at time */
@@ -87,7 +89,9 @@
 							 waitfor */
 	int			waitfor_alloc;
 	int			waitfor_count;        /* count of in-use
-							 waitfor slots */
+							 waitfor slots and
+						         num of trans's we're
+						         waiting on */
 	struct trans		**waitfor;	      /* waitfor_alloc trans
 							 pointers */
 };
@@ -765,8 +769,6 @@
 			goto out_it;
 		}
 
-		log_group(ls, "read_checkpoint: ckpt read %llu bytes",
-			  (unsigned long long)iov.readSize);
 		section_len = iov.readSize;
 
 		if (!section_len)
@@ -1598,6 +1600,7 @@
 
 	tr->waitfor[tr->waitfor_count++] = granted_lkb->trans;
 	granted_lkb->trans->others_waiting_on_us++;
+	waiting_lkb->waitfor_trans = granted_lkb->trans;
 }
 
 /* for each trans, for each waiting lock, go to rsb of the lock,
@@ -1742,14 +1745,32 @@
 		if (lkb->lock.status == DLM_LKSTS_GRANTED)
 			continue;
 		send_cancel_lock(ls, tr, lkb);
-		tr->waitfor_count--;
-	}
 
-	if (tr->waitfor_count)
-		log_group(ls, "canceled trans has non-zero waitfor_count %d",
-			  tr->waitfor_count);
+		/* When this canceled trans has multiple locks all blocked by
+		   locks held by one other trans, that other trans is only
+		   added to tr->waitfor once, and only one of these waiting
+		   locks will have waitfor_trans set.  So, the lkb with
+		   non-null waitfor_trans was the first one responsible
+		   for adding waitfor_trans to tr->waitfor.
+
+		   We could potentially forget about keeping track of lkb->
+		   waitfor_trans, forget about calling remove_waitfor()
+		   here and just set tr->waitfor_count = 0 after this loop.
+		   The loss would be that waitfor_trans->others_waiting_on_us
+		   would not get decremented. */
+
+		if (lkb->waitfor_trans)
+			remove_waitfor(tr, lkb->waitfor_trans);
+	}
+
+	/* this shouldn't happen, if it does something's not working right */
+	if (tr->waitfor_count) {
+		log_group(ls, "cancel_trans: %llx non-zero waitfor_count %d",
+			  (unsigned long long)tr->xid, tr->waitfor_count);
+	}
 
-	/* this should now remove the canceled trans */
+	/* this should now remove the canceled trans since it now has a zero
+	   waitfor_count */
 	removed = reduce_waitfor_graph(ls);
 
 	if (!removed)
@@ -1772,11 +1793,12 @@
 	log_group(ls, "locks:");
 	
 	list_for_each_entry(lkb, &tr->locks, trans_list) {
-		log_group(ls, "  %s: id %08x gr %s rq %s pid %u \"%s\"",
+		log_group(ls, "  %s: id %08x gr %s rq %s pid %u:%u \"%s\"",
 			  status_str(lkb->lock.status),
 			  lkb->lock.id,
 			  dlm_mode_str(lkb->lock.grmode),
 			  dlm_mode_str(lkb->lock.rqmode),
+			  lkb->home,
 			  lkb->lock.ownpid,
 			  lkb->rsb->name);
 	}



             reply	other threads:[~2007-08-23 19:08 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-08-23 19:08 teigland [this message]
  -- strict thread matches above, loose matches on Subject: below --
2007-08-13 15:32 [Cluster-devel] cluster/group/dlm_controld deadlock.c teigland
2007-08-07 19:20 teigland

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20070823190812.7054.qmail@sourceware.org \
    --to=teigland@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.