From mboxrd@z Thu Jan 1 00:00:00 1970 From: jbrassow@sourceware.org Date: 11 Jul 2007 16:18:04 -0000 Subject: [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ... Message-ID: <20070711161804.22419.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Branch: RHEL4 Changes by: jbrassow at sourceware.org 2007-07-11 16:18:03 Modified files: cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c dm-cmirror-xfr.h Log message: Bug 238629: dm-cmirror: Remote recovery conflict... The kernel changes are now in place (marking/clearing the log during writes to nosync regions) to allow nominal I/O to region that have yet to be recovered. Also moved around some debugging messages and removed 'is_remote_recovering()'. (is_remote_recovering is obviated by the new mechanism for handling recovery/write ordering.) Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.48&r2=1.1.2.49 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.35&r2=1.1.2.36 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.5&r2=1.1.2.6 --- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2007/05/09 21:44:34 1.1.2.48 +++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2007/07/11 16:18:03 1.1.2.49 @@ -286,8 +286,8 @@ lc->server_id = lr.u.lr_coordinator; } else { /* ATTENTION -- what do we do with this ? */ - DMWARN("Failed to receive election results from server: (%s,%d)", - lc->uuid + (strlen(lc->uuid) - 8), len); + DMWARN("Failed to receive election results from server: (%s/%d,%d)", + lc->uuid + (strlen(lc->uuid) - 8), lc->uuid_ref, len); error = len; } @@ -601,6 +601,7 @@ INIT_LIST_HEAD(&lc->mark_logged); spin_lock_init(&lc->state_lock); + atomic_set(&lc->suspended, 1); lc->server_valid = 0; lc->server_id = 0xDEAD; @@ -853,15 +854,7 @@ static int cluster_is_remote_recovering(struct dirty_log *log, region_t region) { - int rtn; - struct log_c *lc = (struct log_c *) log->context; - - if(atomic_read(&lc->in_sync) == 1){ - return 0; - } - - rtn = consult_server(lc, region, LRT_IS_REMOTE_RECOVERING, NULL); - return rtn; + return 0; } static int cluster_in_sync(struct dirty_log *log, region_t region, int block) @@ -977,57 +970,43 @@ spin_lock(&lc->state_lock); - /* - * It is possible for the following in the mirror code: - * 0) Mark is already logged for a region - * 1) rh_dec, sets region state to RH_CLEAN (asynchronous) - * 2) rh_update_states (DOESN'T FLUSH!!!, bug #235040) - * 3) do_writes, trys to mark region - * - * The following shouldn't have to be handled b/c of the flush - * 0) Region finishes recovery - * 1) rh_update_states clears region (DOES FLUSH) - * 2) do_writes, trys to mark region - * - * This can lead to this next case being valid. + * An item on the clear_waiting list should have been flushed + * before getting this mark_region call. */ - list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) { + list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) if (region == rs->rs_region) { - if (!rs->rs_mark_logged) { - DMERR("Moving region(%Lu/%s) from clear_waiting -> mark_waiting", - region, lc->uuid + (strlen(lc->uuid) - 8)); - } - list_del_init(&rs->rs_list); - list_add(&rs->rs_list, - (rs->rs_mark_logged) ? - &lc->mark_logged : &lc->mark_waiting); - goto out; + DMERR("Region being marked found on clear_waiting list (%Lu/%s)", + region, lc->uuid + (strlen(lc->uuid) - 8)); + BUG(); } - } + + /* + * We should never get two marks before a flush, unless the + * region is not in-sync. One valid scenario would be: + * 0) region not in-sync + * 1) rh_inc (mark region) + * 2) rh_update_states + * 3) rh_dec (dec pending and put on clean_region list) + * 4) do_writes -> rh_inc (second mark) + */ + list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list) + if (region == rs->rs_region) + goto out; /* * It is possible for the following in the mirror code: * 0) Mark is already logged for a region - * 1) rh_update_states + * 1) rh_update_states (were the clear_region happens) * 2) rh_dec, sets region state to RH_CLEAN (asynchronous) * 3) do_writes, trys to mark region * * This can lead to this next case being valid. */ - list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list){ - if (region == rs->rs_region) { + list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list) + if (region == rs->rs_region) goto out; - } - } - list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list){ - if (region == rs->rs_region) { - DMERR("Mark already waiting (%Lu/%s)", - region, lc->uuid + (strlen(lc->uuid) - 8)); - BUG(); - } - } spin_unlock(&lc->state_lock); rs_new = mempool_alloc(region_state_pool, GFP_NOFS); @@ -1074,14 +1053,13 @@ * 6) we recover the region * 7) clearing the region after recovery causes us to get here * - * Once 235040 is cleared, any entries found in this list should - * cause a bug. + * Bug 235040 cleared. This should no longer happen. */ list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list){ if(region == rs->rs_region){ - DMERR("%d) Double clear on region (" - SECTOR_FORMAT ")", __LINE__, region); - goto out; + DMERR("Double clear on region (%Lu/%s)", + region, lc->uuid + (strlen(lc->uuid) - 8)); + BUG(); } } @@ -1140,7 +1118,7 @@ list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) { if (*region == rs->rs_region) { DMERR("WARNING: Bug 235039/235040 detected!"); - DMERR("Work-around in place."); + BUG(); } } } @@ -1204,16 +1182,6 @@ switch(status){ case STATUSTYPE_INFO: - DMDEBUG("LOG INFO:"); - DMDEBUG(" uuid: %s", lc->uuid); - DMDEBUG(" uuid_ref : %d", lc->uuid_ref); - DMDEBUG(" ?region_count: %Lu", lc->region_count); - DMDEBUG(" ?sync_count : %Lu", lc->sync_count); - DMDEBUG(" ?sync_search : %d", lc->sync_search); - DMDEBUG(" in_sync : %s", (atomic_read(&lc->in_sync)) ? "YES" : "NO"); - DMDEBUG(" server_id : %u", lc->server_id); - DMDEBUG(" server_valid: %s", - ((lc->server_id != 0xDEAD) && lc->server_valid) ? "YES" : "NO"); if(lc->sync != DEFAULTSYNC) arg_count++; @@ -1228,6 +1196,42 @@ break; case STATUSTYPE_TABLE: + DMDEBUG("LOG INFO:"); + DMDEBUG(" uuid: %s", lc->uuid); + DMDEBUG(" uuid_ref : %d", lc->uuid_ref); + DMDEBUG(" ?region_count: %Lu", lc->region_count); + DMDEBUG(" ?sync_count : %Lu", lc->sync_count); + DMDEBUG(" ?sync_search : %d", lc->sync_search); + DMDEBUG(" in_sync : %s", (atomic_read(&lc->in_sync)) ? "YES" : "NO"); + DMDEBUG(" suspended : %s", (atomic_read(&lc->suspended)) ? "YES" : "NO"); + DMDEBUG(" server_id : %u", lc->server_id); + DMDEBUG(" server_valid: %s", + ((lc->server_id != 0xDEAD) && lc->server_valid) ? "YES" : "NO"); + { + struct log_c *tmp_lc; + + down(&log_list_lock); + list_for_each_entry(tmp_lc, &log_list_head, log_list){ + if(!strncmp(tmp_lc->uuid, lc->uuid, MAX_NAME_LEN) && + (tmp_lc->uuid_ref != lc->uuid_ref)){ + DMDEBUG("LOG INFO [COPY FOUND]:"); + DMDEBUG(" uuid: %s", tmp_lc->uuid); + DMDEBUG(" uuid_ref : %d", tmp_lc->uuid_ref); + DMDEBUG(" ?region_count: %Lu", tmp_lc->region_count); + DMDEBUG(" ?sync_count : %Lu", tmp_lc->sync_count); + DMDEBUG(" ?sync_search : %d", tmp_lc->sync_search); + DMDEBUG(" in_sync : %s", + (atomic_read(&tmp_lc->in_sync)) ? "YES" : "NO"); + DMDEBUG(" suspended : %s", + (atomic_read(&tmp_lc->suspended)) ? "YES" : "NO"); + DMDEBUG(" server_id : %u", tmp_lc->server_id); + DMDEBUG(" server_valid: %s", + ((tmp_lc->server_id != 0xDEAD) && + tmp_lc->server_valid) ? "YES" : "NO"); + } + } + up(&log_list_lock); + } if(lc->sync != DEFAULTSYNC) arg_count++; --- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2007/04/26 16:54:49 1.1.2.35 +++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2007/07/11 16:18:03 1.1.2.36 @@ -221,54 +221,6 @@ return count; } -struct region_user *find_ru_by_region(struct log_c *lc, region_t region); -static int _core_get_resync_work(struct log_c *lc, region_t *region) -{ - int sync_search, conflict = 0; - - if (lc->recovering_region != (uint64_t)-1) { - DMDEBUG("Someone is already recovering region %Lu/%s", - lc->recovering_region, lc->uuid + (strlen(lc->uuid) - 8)); - return 0; - } - - if (lc->sync_search >= lc->region_count) { - /* - * FIXME: pvmove is not supported yet, but when it is, - * an audit of sync_count changes will need to be made - */ - if ((lc->sync_count < lc->region_count) && !lc->sync_pass) { - lc->sync_search = 0; - lc->sync_pass++; - } else { - return 0; - } - } - for (sync_search = lc->sync_search; - sync_search < lc->region_count; - sync_search = (*region + 1)) { - *region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits, - lc->region_count, - sync_search); - if (find_ru_by_region(lc, *region)) { - conflict = 1; - DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s", - *region, lc->uuid + (strlen(lc->uuid) - 8)); - } else { - break; - } - } - if (!conflict) - lc->sync_search = *region + 1; - - if (*region >= lc->region_count) - return 0; - - lc->recovering_region = *region; - return 1; -} - - static int print_zero_bits(unsigned char *str, int offset, int bit_count){ int i,j; int count=0; @@ -492,39 +444,6 @@ return 0; } -static int server_is_remote_recovering(struct log_c *lc, struct log_request *lr) -{ - region_t region; - struct region_user *ru; - - /* - * This gets a bit complicated. I wish we didn't have to use this - * function, but because the mirror code doesn't mark regions which - * it writes to that are out-of-sync, we need this function. - * - * Problem is, we don't know how long the user is going to take to - * write to the region after they have called this function. So, - * we are forced at this point to deny any writes to regions we - * are recovering or might recover in the future. - * - * We can count on the client side to not send us one of these - * requests if the mirror is known to be in-sync. - * - * And yes, it sucks to take this much time off the I/O. - */ - region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits, - lc->region_count, 0); - - if (lr->u.lr_region >= region) { - DMDEBUG("Remote recovery conflict: (%Lu >= %Lu)/%s", - lr->u.lr_region, region, lc->uuid + (strlen(lc->uuid) - 8)); - lr->u.lr_int_rtn = 1; - } else - lr->u.lr_int_rtn = 0; - - return 0; -} - static int server_in_sync(struct log_c *lc, struct log_request *lr) { if (lr->u.lr_region > lc->region_count) { @@ -555,7 +474,13 @@ new->ru_region = lr->u.lr_region; new->ru_rw = RU_WRITE; - if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) { + if (find_ru(lc, who, lr->u.lr_region)) { + DMWARN("Attempt to mark a already marked region (%u," + SECTOR_FORMAT + "/%s)", + who, lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8)); + mempool_free(new, region_user_pool); + } else if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) { log_clear_bit(lc, lc->clean_bits, lr->u.lr_region); list_add(&new->ru_list, &lc->region_users); } else if (ru->ru_rw == RU_RECOVER) { @@ -572,6 +497,22 @@ DMDEBUG("Mark requester : %u", who); log_clear_bit(lc, lc->clean_bits, lr->u.lr_region); list_add_tail(&new->ru_list, &lc->region_users); + } else { + list_add(&new->ru_list, &ru->ru_list); + } + + /* + if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) { + log_clear_bit(lc, lc->clean_bits, lr->u.lr_region); + list_add(&new->ru_list, &lc->region_users); + } else if (ru->ru_rw == RU_RECOVER) { + DMDEBUG("Attempt to mark a region " SECTOR_FORMAT + "/%s which is being recovered.", + lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8)); + DMDEBUG("Current recoverer: %u", ru->ru_nodeid); + DMDEBUG("Mark requester : %u", who); + log_clear_bit(lc, lc->clean_bits, lr->u.lr_region); + list_add_tail(&new->ru_list, &lc->region_users); } else if (!find_ru(lc, who, lr->u.lr_region)) { list_add(&new->ru_list, &ru->ru_list); } else { @@ -581,6 +522,7 @@ who, lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8)); mempool_free(new, region_user_pool); } + */ return 0; } @@ -611,35 +553,28 @@ { int r = 0; int count = 0; - struct region_user *ru, *ru2; + struct region_user *ru, *marker = NULL, *recoverer = NULL; if (lc->recovering_region != (uint64_t)-1) { list_for_each_entry(ru, &lc->region_users, ru_list) - if (ru->ru_region == lc->recovering_region) - count++; - - if (count > 1) { - list_for_each_entry(ru, &lc->region_users, ru_list) + if (ru->ru_region == lc->recovering_region) { if (ru->ru_rw == RU_RECOVER) - break; + recoverer = ru; + else + marker = ru; + count++; + } - DMDEBUG("Flush includes region which is being recovered (%u/%Lu). Delaying...", - ru->ru_nodeid, ru->ru_region); - DMDEBUG("Recovering region: %Lu", lc->recovering_region); + if (marker && recoverer) { + DMDEBUG("Flush/recovery collision (%Lu/%s): count = %d, marker = %u, recoverer = %u", + marker->ru_region, lc->uuid + (strlen(lc->uuid) - 8), + count, marker->ru_nodeid, recoverer->ru_nodeid); + /* DMDEBUG(" sync_bit: %s, clean_bit: %s", log_test_bit(lc->sync_bits, lc->recovering_region) ? "set" : "unset", log_test_bit(lc->clean_bits, lc->recovering_region) ? "set" : "unset"); + */ - list_for_each_entry(ru2, &lc->region_users, ru_list) - if (ru->ru_region == ru2->ru_region) - DMDEBUG(" %s", (ru2->ru_rw == RU_RECOVER) ? "recover" : - (ru2->ru_rw == RU_WRITE) ? "writer" : "unknown"); - - /* FIXME: work-around for bug 235040 */ - DMDEBUG("Revoking resync work"); - lc->recovering_region = (uint64_t)-1; - list_del(&ru->ru_list); - mempool_free(ru, region_user_pool); return -EBUSY; } } @@ -1121,9 +1056,6 @@ case LRT_IS_CLEAN: error = server_is_clean(lc, lr); break; - case LRT_IS_REMOTE_RECOVERING: - error = server_is_remote_recovering(lc, lr); - break; case LRT_IN_SYNC: error = server_in_sync(lc, lr); break; --- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h 2007/04/24 20:08:57 1.1.2.5 +++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h 2007/07/11 16:18:03 1.1.2.6 @@ -10,19 +10,18 @@ #define MAX_NAME_LEN 128 #define LRT_IS_CLEAN 1 -#define LRT_IS_REMOTE_RECOVERING 2 -#define LRT_IN_SYNC 3 -#define LRT_MARK_REGION 4 -#define LRT_CLEAR_REGION 5 -#define LRT_FLUSH 6 -#define LRT_GET_RESYNC_WORK 7 -#define LRT_COMPLETE_RESYNC_WORK 8 -#define LRT_GET_SYNC_COUNT 9 - -#define LRT_ELECTION 10 -#define LRT_SELECTION 11 -#define LRT_MASTER_ASSIGN 12 -#define LRT_MASTER_LEAVING 13 +#define LRT_IN_SYNC 2 +#define LRT_MARK_REGION 3 +#define LRT_CLEAR_REGION 4 +#define LRT_FLUSH 5 +#define LRT_GET_RESYNC_WORK 6 +#define LRT_COMPLETE_RESYNC_WORK 7 +#define LRT_GET_SYNC_COUNT 8 + +#define LRT_ELECTION 9 +#define LRT_SELECTION 10 +#define LRT_MASTER_ASSIGN 11 +#define LRT_MASTER_LEAVING 12 #define CLUSTER_LOG_PORT 51005