From mboxrd@z Thu Jan 1 00:00:00 1970 From: jbrassow@sourceware.org Date: 5 Apr 2007 21:33:37 -0000 Subject: [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ... Message-ID: <20070405213337.30913.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Branch: RHEL45 Changes by: jbrassow at sourceware.org 2007-04-05 22:33:36 Modified files: cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c Log message: Bug 234918 Processed: NMI Watchdog detected LOCKUP while running proces... Bug 217438: scrolling kernel requests to mark mirror regions Item 1: I needed to check for marked regions when getting resync work, not just check for resyncing regions when a mark/flush happens. Item 2: There is a corner case that allows two calls to clear the same region. The second does not need to be logged. Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.41.2.2&r2=1.1.2.41.2.3 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.26.2.3&r2=1.1.2.26.2.4 --- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2007/04/03 18:23:01 1.1.2.41.2.2 +++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2007/04/05 21:33:36 1.1.2.41.2.3 @@ -1034,7 +1034,9 @@ spin_lock(&lc->state_lock); - /* Should find match in this list, or no lists at all */ + /* + * The nominal case is to find the region in the marked list + */ list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list){ if(region == rs->rs_region){ list_del_init(&rs->rs_list); @@ -1043,28 +1045,46 @@ } } - - list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list){ + /* + * It is possible, but unlikely to get to this case. 
It requires + * the following to happen: + * 1) mark the region for writing + * 2) clear the region + * 3) clear doesn't get flushed because of bug 235040 + * 4) suspend due to server relocation + * 5) on-disk log says we need to recover (because it hasn't been cleared) + * 6) we recover the region + * 7) clearing the region after recovery causes us to get here + * + * Once 235040 is cleared, any entries found in this list should + * cause a bug. + */ + list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list){ if(region == rs->rs_region){ - DMERR("Clear pre-empting mark (%Lu/%s)", - region, lc->uuid + (strlen(lc->uuid) - 8)); - BUG(); + DMERR("%d) Double clear on region (" + SECTOR_FORMAT ")", __LINE__, region); + goto out; } } - list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list){ + list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list){ if(region == rs->rs_region){ - DMERR("%d) Double clear on region (" - SECTOR_FORMAT ")", __LINE__, region); + DMERR("Clear pre-empting mark (%Lu/%s)", + region, lc->uuid + (strlen(lc->uuid) - 8)); BUG(); } } + /* We can get here because we may be doing resync_work, and therefore,** ** clearing without ever marking..................................... 
*/ /* Don't need to spin_unlock, because allocation is non-blocking */ rs_new = mempool_alloc(region_state_pool, GFP_ATOMIC); - BUG_ON(!rs_new); + if (!rs_new) { + DMERR("Failed to allocate space for clear region request: %Lu", + region); + BUG(); + } memset(rs_new, 0, sizeof(struct region_state)); rs_new->rs_region = region; @@ -1088,6 +1108,21 @@ DMWARN("Error while getting resync work: bad region"); rtn = 0; } + + /* + * Check for bug 235039 + * Note the changes in cluster_clear_region + */ + if (rtn == 1) { + struct region_state *rs, *tmp_rs; + list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) { + if (*region == rs->rs_region) { + DMERR("WARNING: Bug 235039/235040 detected!"); + DMERR("Work-around in place."); + } + } + } + return rtn; } --- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2007/04/04 21:36:01 1.1.2.26.2.3 +++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2007/04/05 21:33:36 1.1.2.26.2.4 @@ -656,6 +656,8 @@ static int server_complete_resync_work(struct log_c *lc, struct log_request *lr, int success){ + struct region_user *ru; + if (lr->u.lr_region > lc->region_count) { return -EINVAL; } @@ -678,6 +680,42 @@ DMDEBUG("Resync work completed: %Lu", lr->u.lr_region); } else if (log_test_bit(lc->sync_bits, lr->u.lr_region)) { + ru = find_ru_by_region(lc, lr->u.lr_region); + + /* + * The following condition can never happen unless we have + * a corrupted list or we had a communication error. + * + * If a write failed to one of the mirror devices, the ru + * should be RU_WRITE.
If a recovery failed, it should be + * RU_RECOVER + */ + if (!ru) { + DMERR("Unable to find region being marked out-of-sync: %Lu", + lr->u.lr_region); + return -EINVAL; + } + + if (ru->ru_rw == RU_RECOVER) { + if (lr->u.lr_region != lc->recovering_region) { + DMERR("Recovering region mismatch: (%Lu/%Lu)", + lr->u.lr_region, lc->recovering_region); + BUG(); + } + /* + * Clear the recovery + */ + lc->recovering_region = (uint64_t)-1; + list_del(&ru->ru_list); + mempool_free(ru, region_user_pool); + } else { /* ru->ru_rw == RU_WRITE */ + /* + * Mirror has placed the region into RH_NOSYNC + * It is safe to pull the ru + */ + list_del(&ru->ru_list); + mempool_free(ru, region_user_pool); + } /* gone again: lc->sync_count--;*/ log_clear_bit(lc, lc->sync_bits, lr->u.lr_region); }