From mboxrd@z Thu Jan 1 00:00:00 1970 From: jbrassow@sourceware.org Date: 10 Apr 2007 07:12:26 -0000 Subject: [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ... Message-ID: <20070410071226.1753.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Branch: RHEL4 Changes by: jbrassow at sourceware.org 2007-04-10 08:12:24 Modified files: cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h dm-cmirror-server.c dm-cmirror-server.h Log message: Bug 235686: Kernel BUG at dm_cmirror_server while recovering region Several fixes have gone in to fix the handling of this bug: 1) During server relocation (which can happen due to machine failure or normal mirror suspension), the server value could get set before the client had a chance to clean-up. This caused the server to become confused and issue a BUG(). 2) perform a flush of the log before suspending. This ensures that regions which are in-sync get correctly flushed to the disk log. Without this, there will always be recovery work to be done when a mirror starts up - even if it was properly in-sync during shutdown. 3) clean-up memory used to record region users when a mirror is shutdown. It was possible for some regions to be left over (causing a memory leak) during certain fault scenarios. 4) properly initialize the state field (ru_rw) in the region user structure when a mark occurs. Without the initialization, it was sometimes possible for the region to be misinterpreted as recovering instead of marked. 5) resolve an unhandled case in server_complete_resync_work 6) reset a variable in cluster_complete_resync_work. Failure to do so was causing a retry to include the wrong value for the completion of the resync work - confusing the server. 
Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.44&r2=1.1.2.45 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.13&r2=1.1.2.14 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.30&r2=1.1.2.31 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.2&r2=1.1.2.3 --- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2007/04/05 21:32:25 1.1.2.44 +++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2007/04/10 07:12:24 1.1.2.45 @@ -373,13 +373,8 @@ fs = get_fs(); set_fs(get_ds()); - if(type == LRT_MASTER_LEAVING){ - len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request), - 0, 10); - } else { - len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request), - 0, 5); - } + len = my_recvmsg(lc->client_sock, &msg, + sizeof(struct log_request), 0, 15); set_fs(fs); if(len <= 0){ @@ -492,9 +487,14 @@ goto out; } election: - while(lc->server_id == 0xDEAD){ + while ((lc->server_id == 0xDEAD) || (!lc->server_valid)) { + DMDEBUG("server_id=%x, server_valid=%u, %s", + lc->server_id, lc->server_valid, + lc->uuid + (strlen(lc->uuid) - 8)); + DMDEBUG("trigger = %s", RQ_STRING(type)); run_election(lc, my_id); new_server = 1; + lc->server_valid = 1; } spin_lock(&lc->state_lock); @@ -600,6 +600,7 @@ INIT_LIST_HEAD(&lc->mark_logged); spin_lock_init(&lc->state_lock); + lc->server_valid = 0; lc->server_id = 0xDEAD; if ((error = cluster_connect())) { @@ -731,19 +732,20 @@ } if (!list_empty(&lc->mark_logged)) { - DMERR("Mark requests remain at cluster log deactivation"); - /* - * Should I BUG() this? - * No. 
In the worst case, they will get cleaned up later - */ - } - list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list) { - list_del_init(&rs->rs_list); - mempool_free(rs, region_state_pool); + int i = 0; + + list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list) { + i++; + list_del_init(&rs->rs_list); + mempool_free(rs, region_state_pool); + } + DMDEBUG("%d mark requests remain at cluster log deactivation", i); } spin_unlock(&lc->state_lock); + server_free_region_users(lc); + if (lc->log_dev) disk_dtr(log); else @@ -753,8 +755,13 @@ DMERR("Unable to disconnect from cluster infrastructure.\n"); } +static int cluster_flush(struct dirty_log *log); static int cluster_presuspend(struct dirty_log *log) { + /* FIXME: flush is work-around for bug 235040 */ + DMDEBUG("Performing flush to work around bug 235040"); + cluster_flush(log); + DMDEBUG("Log flush complete"); return 0; } @@ -784,6 +791,7 @@ atomic_set(&lc->suspended, 1); if(lc->server_id == my_id) { while (1) { + DMDEBUG("Telling everyone I'm suspending"); consult_server(lc, 0, LRT_MASTER_LEAVING, NULL); down(&consult_server_lock); @@ -791,11 +799,13 @@ up(&consult_server_lock); if ((my_id && (lc->server_id == my_id))) { + DMDEBUG("Delaying suspend, work to be done."); atomic_set(&lc->suspended, 0); set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(HZ*2); atomic_set(&lc->suspended, 1); } else { + DMDEBUG("Suspending now"); break; } } @@ -907,7 +917,7 @@ DMDEBUG("Delaying mark to region %Lu, due to recovery", rs->rs_region); set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ/2); + schedule_timeout(HZ); continue; } @@ -933,7 +943,7 @@ if (r == -EBUSY) { DMDEBUG("Delaying flush due to recovery"); set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ/2); + schedule_timeout(HZ); continue; } @@ -1109,6 +1119,10 @@ rtn = 0; } + if (rtn) + DMDEBUG("Client received resync work: %Lu/%s", + *region, lc->uuid + (strlen(lc->uuid) - 8)); + /* * Check for bug 235039 * Note the changes in 
cluser_clear_region @@ -1129,10 +1143,16 @@ static void cluster_complete_resync_work(struct dirty_log *log, region_t region, int success) { + int i; region_t success_tmp = success; struct log_c *lc = (struct log_c *) log->context; - while(consult_server(lc, region, LRT_COMPLETE_RESYNC_WORK, &success_tmp)){ - DMWARN("unable to notify server of completed resync work"); + + for (i = 0; i < 5; i++) { + if (!consult_server(lc, region, + LRT_COMPLETE_RESYNC_WORK, &success_tmp)) + break; + success_tmp = success; + DMWARN("Unable to notify server of completed resync work"); } return; } --- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h 2007/04/03 18:21:10 1.1.2.13 +++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h 2007/04/10 07:12:24 1.1.2.14 @@ -141,6 +141,7 @@ struct list_head mark_waiting; struct list_head mark_logged; + uint32_t server_valid; uint32_t server_id; struct socket *client_sock; }; --- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2007/04/05 21:32:25 1.1.2.30 +++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2007/04/10 07:12:24 1.1.2.31 @@ -229,7 +229,8 @@ int sync_search, conflict = 0; if (lc->recovering_region != (uint64_t)-1) { - DMDEBUG("Someone is already recovering region %Lu", lc->recovering_region); + DMDEBUG("Someone is already recovering region %Lu/%s", + lc->recovering_region, lc->uuid + (strlen(lc->uuid) - 8)); return 0; } @@ -253,8 +254,8 @@ sync_search); if (find_ru_by_region(lc, *region)) { conflict = 1; - DMDEBUG("Recovery blocked by outstanding write on region %Lu", - *region); + DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s", + *region, lc->uuid + (strlen(lc->uuid) - 8)); } else { break; } @@ -534,7 +535,8 @@ new->ru_nodeid = who; new->ru_region = lr->u.lr_region; - + new->ru_rw = RU_WRITE; + if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) { log_clear_bit(lc, lc->clean_bits, lr->u.lr_region); list_add(&new->ru_list, &lc->region_users); @@ -615,6 +617,11 @@ DMDEBUG(" %s", (ru2->ru_rw == 
RU_RECOVER) ? "recover" : (ru2->ru_rw == RU_WRITE) ? "writer" : "unknown"); + /* FIXME: work-around for bug 235040 */ + DMDEBUG("Revoking resync work"); + lc->recovering_region = (uint64_t)-1; + list_del(&ru->ru_list); + mempool_free(ru, region_user_pool); return -EBUSY; } } @@ -646,7 +653,8 @@ new->ru_region = lr->u.lr_region_rtn; new->ru_rw = RU_RECOVER; list_add(&new->ru_list, &lc->region_users); - DMDEBUG("Assigning recovery work to %u: %Lu", who, new->ru_region); + DMDEBUG("Assigning recovery work to %u/%s: %Lu", + who, lc->uuid + (strlen(lc->uuid) - 8), new->ru_region); } else { mempool_free(new, region_user_pool); } @@ -654,8 +662,8 @@ return 0; } - -static int server_complete_resync_work(struct log_c *lc, struct log_request *lr, int success){ +static int server_complete_resync_work(struct log_c *lc, struct log_request *lr, + int success, uint32_t who){ struct region_user *ru; if (lr->u.lr_region > lc->region_count) { @@ -679,51 +687,61 @@ lc->sync_pass = 0; DMDEBUG("Resync work completed: %Lu", lr->u.lr_region); - } else if (log_test_bit(lc->sync_bits, lr->u.lr_region)) { - ru = find_ru_by_region(lc, lr->u.lr_region); + return 0; + } - /* - * The following condition can never happen unless we have - * a corrupted list or we had a communication error. - * - * If a write failed to one of the mirror devices, the ru - * should be RU_WRITE. If a recovery failed, it should be - * RU_RECOVER - */ - if (!ru) { - DMERR("Unable to find region being marked out-of-sync: %Lu", - lr->u.lr_region); - return -EINVAL; - } + /* + * Recovery failed or mirror is being marked out-of-sync + * + * We can recieve multiple calls to mark out-of-sync + * if there were several writes to the same region that + * failed. In this case, there will not be a record for + * the region. 
+ */ + ru = find_ru(lc, who, lr->u.lr_region); - if (ru->ru_rw == RU_RECOVER) { - if (lr->u.lr_region != lc->recovering_region) { - DMERR("Recovering region mismatch: (%Lu/%Lu)", - lr->u.lr_region, lc->recovering_region); - BUG(); - } - /* - * Clear the recovery - */ - lc->recovering_region = (uint64_t)-1; - list_del(&ru->ru_list); - mempool_free(ru, region_user_pool); - } else { /* ru->ru_rw == RU_WRITE */ - /* - * Mirror has place the region into RH_NOSYNC - * It is safe to pull the ru - */ - list_del(&ru->ru_list); - mempool_free(ru, region_user_pool); + if ((lr->u.lr_region == lc->recovering_region) && !ru) { + DMERR("Unable to locate record of recovery"); + BUG(); + } + + if (!ru) { + DMERR("Unable to find region to be marked out-of-sync: %Lu/%s/%u", + lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8), who); + BUG(); + } else if (ru->ru_rw == RU_RECOVER) { + if (lr->u.lr_region != lc->recovering_region) { + DMERR("Recovering region mismatch from node %u: (%Lu/%Lu)", + who, lr->u.lr_region, lc->recovering_region); + BUG(); } - /* gone again: lc->sync_count--;*/ - log_clear_bit(lc, lc->sync_bits, lr->u.lr_region); + /* + * Clear the recovery + */ + DMDEBUG("Setting recovering region out-of-sync: %Lu/%s/%u", + lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8), who); + lc->recovering_region = (uint64_t)-1; + list_del(&ru->ru_list); + mempool_free(ru, region_user_pool); } + /* else if (ru->ru_rw == RU_WRITE) + * Mirror has place the region into RH_NOSYNC + * We will leave the record in place. It is + * likely there are more requests coming to + * mark this region out-of-sync, due to the + * way mirror handles the situation. 
+ * + DMDEBUG("Setting marked region out-of-sync: %Lu/%s", + lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8)); + */ + + /* gone again: lc->sync_count--;*/ + log_clear_bit(lc, lc->sync_bits, lr->u.lr_region); + return 0; } - static int server_get_sync_count(struct log_c *lc, struct log_request *lr){ if (lc->sync_count > lc->region_count) { DMERR("sync_count (" SECTOR_FORMAT ") > region_count (" SECTOR_FORMAT ") in %s!", @@ -901,6 +919,7 @@ * If we are the server, assign it */ if(lr->u.lr_coordinator == my_id){ + lc->server_valid = 0; lc->server_id = my_id; } @@ -988,7 +1007,8 @@ DMDEBUG("I'm the cluster mirror log server for %s", lc->uuid + (strlen(lc->uuid) - 8)); if (atomic_read(&lc->suspended)) { - DMDEBUG("Not reading disk log because I'm suspended."); + DMDEBUG("Not reading disk log because I'm suspended (%s)", + lc->uuid + (strlen(lc->uuid) - 8)); } else if (disk_resume(lc) == -EDEADLK) { DMDEBUG("Unable to read disk log - deadlock potential."); @@ -1060,7 +1080,12 @@ error = server_get_resync_work(lc, lr, nodeid); break; case LRT_COMPLETE_RESYNC_WORK: - error = server_complete_resync_work(lc, lr, lr->u.lr_int_rtn); + if(!(nodeid = + ipaddr_to_nodeid((struct sockaddr *)msg.msg_name))){ + error = -ENXIO; + break; + } + error = server_complete_resync_work(lc, lr, lr->u.lr_int_rtn, nodeid); lr->u.lr_int_rtn = 0; break; case LRT_GET_SYNC_COUNT: @@ -1236,6 +1261,22 @@ } */ +int server_free_region_users(struct log_c *lc) +{ + int i = 0; + struct region_user *ru, *tmp_ru; + + list_for_each_entry_safe(ru, tmp_ru, &lc->region_users, ru_list) { + i++; + list_del(&ru->ru_list); + mempool_free(ru, region_user_pool); + } + + DMDEBUG("%d region user structures freed", i); + return 0; +} + + int suspend_server(void){ atomic_set(&_suspend, 1); return 0; --- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h 2005/08/11 18:26:19 1.1.2.2 +++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h 2007/04/10 07:12:24 1.1.2.3 @@ -13,5 +13,6 @@ int start_server(void); void 
stop_server(void); void print_server_status(struct log_c *lc); +int server_free_region_users(struct log_c *lc); #endif /* __DM_CMIRROR_SERVER_H__ */