From mboxrd@z Thu Jan 1 00:00:00 1970 From: teigland@sourceware.org Date: 23 Aug 2007 19:08:12 -0000 Subject: [Cluster-devel] cluster/group/dlm_controld deadlock.c Message-ID: <20070823190812.7054.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Changes by: teigland at sourceware.org 2007-08-23 19:08:11 Modified files: group/dlm_controld: deadlock.c Log message: needed to be a little more thorough in taking a canceled transaction out of the dependency graph for cases where multiple locks were blocked between the same two transactions Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/dlm_controld/deadlock.c.diff?cvsroot=cluster&r1=1.6&r2=1.7 --- cluster/group/dlm_controld/deadlock.c 2007/08/17 21:17:53 1.6 +++ cluster/group/dlm_controld/deadlock.c 2007/08/23 19:08:11 1.7 @@ -73,6 +73,8 @@ struct dlm_rsb *rsb; /* lock is on resource */ struct trans *trans; /* lock owned by this transaction */ struct list_head trans_list; /* tr->locks */ + struct trans *waitfor_trans; /* the trans that's holding the + lock that's blocking us */ }; /* waitfor pointers alloc'ed 4 at at time */ @@ -87,7 +89,9 @@ waitfor */ int waitfor_alloc; int waitfor_count; /* count of in-use - waitfor slots */ + waitfor slots and + num of trans's we're + waiting on */ struct trans **waitfor; /* waitfor_alloc trans pointers */ }; @@ -765,8 +769,6 @@ goto out_it; } - log_group(ls, "read_checkpoint: ckpt read %llu bytes", - (unsigned long long)iov.readSize); section_len = iov.readSize; if (!section_len) @@ -1598,6 +1600,7 @@ tr->waitfor[tr->waitfor_count++] = granted_lkb->trans; granted_lkb->trans->others_waiting_on_us++; + waiting_lkb->waitfor_trans = granted_lkb->trans; } /* for each trans, for each waiting lock, go to rsb of the lock, @@ -1742,14 +1745,32 @@ if (lkb->lock.status == DLM_LKSTS_GRANTED) continue; send_cancel_lock(ls, tr, lkb); - tr->waitfor_count--; - } - if (tr->waitfor_count) - log_group(ls, "canceled trans has non-zero waitfor_count %d", - tr->waitfor_count); + /* When this canceled trans has multiple locks all blocked by + locks held by one other trans, that other trans is only + added to tr->waitfor once, and only one of these waiting + locks will have waitfor_trans set. So, the lkb with + non-null waitfor_trans was the first one responsible + for adding waitfor_trans to tr->waitfor. + + We could potentially forget about keeping track of lkb-> + waitfor_trans, forget about calling remove_waitfor() + here and just set tr->waitfor_count = 0 after this loop. + The loss would be that waitfor_trans->others_waiting_on_us + would not get decremented. */ + + if (lkb->waitfor_trans) + remove_waitfor(tr, lkb->waitfor_trans); + } + + /* this shouldn't happen, if it does something's not working right */ + if (tr->waitfor_count) { + log_group(ls, "cancel_trans: %llx non-zero waitfor_count %d", + (unsigned long long)tr->xid, tr->waitfor_count); + } - /* this should now remove the canceled trans */ + /* this should now remove the canceled trans since it now has a zero + waitfor_count */ removed = reduce_waitfor_graph(ls); if (!removed) @@ -1772,11 +1793,12 @@ log_group(ls, "locks:"); list_for_each_entry(lkb, &tr->locks, trans_list) { - log_group(ls, " %s: id %08x gr %s rq %s pid %u \"%s\"", + log_group(ls, " %s: id %08x gr %s rq %s pid %u:%u \"%s\"", status_str(lkb->lock.status), lkb->lock.id, dlm_mode_str(lkb->lock.grmode), dlm_mode_str(lkb->lock.rqmode), + lkb->home, lkb->lock.ownpid, lkb->rsb->name); }