From mboxrd@z Thu Jan 1 00:00:00 1970 From: jbrassow@sourceware.org Date: 26 Apr 2007 16:55:53 -0000 Subject: [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ... Message-ID: <20070426165553.25025.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Branch: RHEL45 Changes by: jbrassow at sourceware.org 2007-04-26 17:55:51 Modified files: cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c Log message: Bug 238031: cluster mirrors not handling all recovery/write conflicts Problem is that the kernel (main mirror code) does not do any marks/clears when writing to a region before its recovery. So, it is not possible for the server to detect a conflict. Basically, we must turn back on the 'is_remote_recovering' function and disallow any writes to regions that are OR WILL BE recovering. It's really going to cause some pain during writes while mirrors are re-syncing. The better fix for the future is to have the writes always mark/clear the regions - then we can again remove the 'is_remote_recovering' function. Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.41.2.5&r2=1.1.2.41.2.6 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.26.2.7&r2=1.1.2.26.2.8 --- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2007/04/24 20:10:20 1.1.2.41.2.5 +++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2007/04/26 16:55:51 1.1.2.41.2.6 @@ -861,11 +861,10 @@ int rtn; struct log_c *lc = (struct log_c *) log->context; -/* take out optimization if(atomic_read(&lc->in_sync) == 1){ return 0; } -*/ + rtn = consult_server(lc, region, LRT_IS_REMOTE_RECOVERING, NULL); return rtn; } @@ -876,11 +875,11 @@ struct log_c *lc = (struct log_c *) log->context; /* check known_regions, return if found */ -/* take out optimization + if(atomic_read(&lc->in_sync) == 1){ return 1; } -*/ + if(!block){ return -EWOULDBLOCK; } @@ -1414,7 +1413,7 @@ .resume = cluster_resume, .get_region_size = cluster_get_region_size, .is_clean = cluster_is_clean, -/* .is_remote_recovering = cluster_is_remote_recovering,*/ + .is_remote_recovering = cluster_is_remote_recovering, .in_sync = cluster_in_sync, .flush = cluster_flush, .mark_region = cluster_mark_region, @@ -1436,7 +1435,7 @@ .resume = cluster_resume, .get_region_size = cluster_get_region_size, .is_clean = cluster_is_clean, -/* .is_remote_recovering = cluster_is_remote_recovering,*/ + .is_remote_recovering = cluster_is_remote_recovering, .in_sync = cluster_in_sync, .flush = cluster_flush, .mark_region = cluster_mark_region, --- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2007/04/24 20:10:20 1.1.2.26.2.7 +++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2007/04/26 16:55:51 1.1.2.26.2.8 @@ -494,12 +494,32 @@ static int server_is_remote_recovering(struct log_c *lc, struct log_request *lr) { + region_t region; struct region_user *ru; - if ((ru = find_ru_by_region(lc, lr->u.lr_region)) && - (ru->ru_rw == RU_RECOVER)) + /* + * This gets a bit complicated. I wish we didn't have to use this + * function, but because the mirror code doesn't mark regions which + * it writes to that are out-of-sync, we need this function. + * + * Problem is, we don't know how long the user is going to take to + * write to the region after they have called this function. So, + * we are forced at this point to deny any writes to regions we + * are recovering or might recover in the future. + * + * We can count on the client side to not send us one of these + * requests if the mirror is known to be in-sync. + * + * And yes, it sucks to take this much time off the I/O. + */ + region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits, + lc->region_count, 0); + + if (lr->u.lr_region >= region) { + DMDEBUG("Remote recovery conflict: (%Lu >= %Lu)/%s", + lr->u.lr_region, region, lc->uuid + (strlen(lc->uuid) - 8)); lr->u.lr_int_rtn = 1; - else + } else lr->u.lr_int_rtn = 0; return 0; @@ -639,24 +659,65 @@ static int server_get_resync_work(struct log_c *lc, struct log_request *lr, uint32_t who) { struct region_user *new; + int sync_search, conflict = 0; + region_t *region = &(lr->u.lr_region_rtn); - new = mempool_alloc(region_user_pool, GFP_NOFS); - if(!new){ - lr->u.lr_int_rtn = 0; - return -ENOMEM; + lr->u.lr_int_rtn = 0; /* Default to no work */ + + if (lc->recovering_region != (uint64_t)-1) { + DMDEBUG("Someone is already recovering region %Lu/%s", + lc->recovering_region, lc->uuid + (strlen(lc->uuid) - 8)); + return 0; } - - if ((lr->u.lr_int_rtn = _core_get_resync_work(lc, &(lr->u.lr_region_rtn)))){ - new->ru_nodeid = who; - new->ru_region = lr->u.lr_region_rtn; - new->ru_rw = RU_RECOVER; - list_add(&new->ru_list, &lc->region_users); - DMDEBUG("Assigning recovery work to %u/%s: %Lu", - who, lc->uuid + (strlen(lc->uuid) - 8), new->ru_region); - } else { - mempool_free(new, region_user_pool); + + if (lc->sync_search >= lc->region_count) { + /* + * FIXME: pvmove is not supported yet, but when it is, + * an audit of sync_count changes will need to be made + */ + if ((lc->sync_count < lc->region_count) && !lc->sync_pass) { + lc->sync_search = 0; + lc->sync_pass++; + } else { + return 0; + } + } + + for (sync_search = lc->sync_search; + sync_search < lc->region_count; + sync_search = (*region + 1)) { + *region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits, + lc->region_count, + sync_search); + if (find_ru_by_region(lc, *region)) { + conflict = 1; + DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s", + *region, lc->uuid + (strlen(lc->uuid) - 8)); + } else { + break; + } } + if (*region >= lc->region_count) + return 0; + + new = mempool_alloc(region_user_pool, GFP_NOFS); + if (!new) + return -ENOMEM; + + if (!conflict) + lc->sync_search = *region + 1; + + lc->recovering_region = *region; + + lr->u.lr_int_rtn = 1; /* Assigning work */ + new->ru_nodeid = who; + new->ru_region = *region; + new->ru_rw = RU_RECOVER; + list_add(&new->ru_list, &lc->region_users); + DMDEBUG("Assigning recovery work to %u/%s: %Lu", + who, lc->uuid + (strlen(lc->uuid) - 8), new->ru_region); + return 0; }