From mboxrd@z Thu Jan 1 00:00:00 1970 From: jbrassow@sourceware.org Date: 24 Apr 2007 20:08:59 -0000 Subject: [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ... Message-ID: <20070424200859.3672.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Branch: RHEL4 Changes by: jbrassow at sourceware.org 2007-04-24 21:08:57 Modified files: cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c dm-cmirror-server.h dm-cmirror-xfr.h Log message: Bug 199433: NULL pointer dereference in cman:process_messages for cmirro... - While this isn't a complete fix for 199433, it is most likely the cause of the error. Cluster mirrors were steadily leaking memory every time they were deactivated. Bug 237028: cmirror recovery deadlock due to machine failure + primary l... - If there is outstanding resync work remaining when the server gets notice to suspend, delay for a moment to wait for it. Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.45&r2=1.1.2.46 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.33&r2=1.1.2.34 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.3&r2=1.1.2.4 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.4&r2=1.1.2.5 --- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2007/04/10 07:12:24 1.1.2.45 +++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2007/04/24 20:08:57 1.1.2.46 @@ -379,7 +379,8 @@ if(len <= 0){ /* ATTENTION -- what do we do with this ? 
*/ - DMWARN("Error while listening for server response: %d", len); + DMWARN("Error listening for server(%u) response for %s: %d", + lc->server_id, lc->uuid + (strlen(lc->uuid) - 8), len); error = len; *retry = 1; seq++; @@ -767,6 +768,7 @@ static int cluster_postsuspend(struct dirty_log *log) { + int i; struct region_state *rs, *tmp_rs; struct log_c *lc = (struct log_c *) log->context; @@ -788,10 +790,20 @@ spin_unlock(&lc->state_lock); + if(lc->server_id == my_id) { + for (i = 0; server_busy(lc) && (i < 10); i++) { + DMDEBUG("Server for %s still busy, waiting for others", + lc->uuid + (strlen(lc->uuid) - 8)); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ*2); + } + } + atomic_set(&lc->suspended, 1); if(lc->server_id == my_id) { while (1) { - DMDEBUG("Telling everyone I'm suspending"); + DMDEBUG("Telling everyone I'm suspending (%s)", + lc->uuid + (strlen(lc->uuid) - 8)); consult_server(lc, 0, LRT_MASTER_LEAVING, NULL); down(&consult_server_lock); @@ -799,13 +811,15 @@ up(&consult_server_lock); if ((my_id && (lc->server_id == my_id))) { - DMDEBUG("Delaying suspend, work to be done."); + DMDEBUG("Delaying suspend, work to be done (%s)", + lc->uuid + (strlen(lc->uuid) - 8)); atomic_set(&lc->suspended, 0); set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(HZ*2); atomic_set(&lc->suspended, 1); } else { - DMDEBUG("Suspending now"); + DMDEBUG("Suspending now (%s)", + lc->uuid + (strlen(lc->uuid) - 8)); break; } } @@ -1196,6 +1210,16 @@ switch(status){ case STATUSTYPE_INFO: + DMDEBUG("LOG INFO:"); + DMDEBUG(" uuid: %s", lc->uuid); + DMDEBUG(" uuid_ref : %d", lc->uuid_ref); + DMDEBUG(" ?region_count: %Lu", lc->region_count); + DMDEBUG(" ?sync_count : %Lu", lc->sync_count); + DMDEBUG(" ?sync_search : %d", lc->sync_search); + DMDEBUG(" in_sync : %s", (atomic_read(&lc->in_sync)) ? "YES" : "NO"); + DMDEBUG(" server_id : %u", lc->server_id); + DMDEBUG(" server_valid: %s", + ((lc->server_id != 0xDEAD) && lc->server_valid) ? 
"YES" : "NO"); if(lc->sync != DEFAULTSYNC) arg_count++; @@ -1254,11 +1278,6 @@ } up(&log_list_lock); - /* - if (likely(!shutting_down)) - suspend_server(); - */ - return 0; } @@ -1311,9 +1330,7 @@ BUG(); break; } - /* - resume_server(); - */ + return 0; } @@ -1452,6 +1469,7 @@ r = dm_register_dirty_log_type(&_clustered_core_type); if (r) { DMWARN("couldn't register clustered_core dirty log type"); + mempool_destroy(region_state_pool); return r; } @@ -1459,6 +1477,7 @@ if (r) { DMWARN("couldn't register clustered_disk dirty log type"); dm_unregister_dirty_log_type(&_clustered_core_type); + mempool_destroy(region_state_pool); return r; } @@ -1475,6 +1494,7 @@ } dm_unregister_dirty_log_type(&_clustered_core_type); dm_unregister_dirty_log_type(&_clustered_disk_type); + mempool_destroy(region_state_pool); DMINFO("dm-cmirror %s (built %s %s) removed", CMIRROR_RELEASE_NAME, __DATE__, __TIME__); } --- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2007/04/17 19:49:11 1.1.2.33 +++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2007/04/24 20:08:57 1.1.2.34 @@ -42,8 +42,6 @@ static atomic_t server_run; static struct completion server_completion; -static wait_queue_head_t _suspend_queue; -static atomic_t _suspend; static atomic_t _do_requests; static int debug_disk_write = 0; @@ -798,6 +796,12 @@ uint32_t lowest, next; uint32_t node_count=global_count, *nodeids=global_nodeids; + DMDEBUG("%s(%d): (%s)", RQ_STRING(lr->lr_type), lr->lr_type, + (lc) ? 
lc->uuid + (strlen(lc->uuid) - 8) : "none"); + DMDEBUG(" starter : %u", lr->u.lr_starter); + DMDEBUG(" co-ordinator: %u", lr->u.lr_coordinator); + DMDEBUG(" node_count : %d", lr->u.lr_node_count); + /* Record the starter's port number so we can get back to him */ if((lr->u.lr_starter == my_id) && (!lr->u.lr_node_count)){ lr->u.lr_starter_port = saddr->sin_port; @@ -1175,12 +1179,12 @@ complete(&server_completion); + DMDEBUG("cluster_log_serverd ready for work"); for(;;){ if(!atomic_read(&server_run)){ break; } - suspend_on(&_suspend_queue, atomic_read(&_suspend)); switch(atomic_read(&restart_event_type)){ case SERVICE_NODE_LEAVE: /* ATTENTION -- may wish to check if regions ** @@ -1206,6 +1210,9 @@ up(&log_list_lock); break; + case SERVICE_NODE_JOIN: + DMDEBUG("Node joining"); + break; default: /* Someone has joined, or there is no event */ break; @@ -1227,6 +1234,7 @@ schedule(); } + DMDEBUG("Closing socket on server side"); sock_release(sock); complete(&server_completion); return 0; @@ -1244,8 +1252,6 @@ void print_server_status(struct log_c *lc){ int i; - atomic_set(&_suspend, 1); - DMINFO("SERVER OUTPUT::"); DMINFO(" Live nodes :: %d", global_count); @@ -1267,11 +1273,18 @@ i = print_zero_bits((unsigned char *)lc->sync_bits, 0, lc->bitset_uint32_count); DMINFO(" Total = %d", i); - atomic_set(&_suspend, 0); - wake_up_all(&_suspend_queue); } */ +int server_busy(struct log_c *lc) +{ + if (!list_empty(&lc->region_users) || + (lc->recovering_region != (uint64_t)-1)) + return 1; + else + return 0; +} + int server_free_region_users(struct log_c *lc) { int i = 0; @@ -1287,18 +1300,6 @@ return 0; } - -int suspend_server(void){ - atomic_set(&_suspend, 1); - return 0; -} - -int resume_server(void){ - atomic_set(&_suspend, 0); - wake_up_all(&_suspend_queue); - return 0; -} - int resume_server_requests(void) { atomic_set(&_do_requests, 1); return 0; @@ -1307,6 +1308,7 @@ int start_server(void /* log_devices ? 
*/){ int error; + DMDEBUG("start_server called"); region_user_pool = mempool_create(1000, region_user_alloc, region_user_free, NULL); if(!region_user_pool){ @@ -1314,20 +1316,20 @@ return -ENOMEM; } - init_waitqueue_head(&_suspend_queue); - atomic_set(&_do_requests, 0); atomic_set(&server_run, 1); init_completion(&server_completion); error = kernel_thread(cluster_log_serverd, NULL, 0); if(error < 0){ + mempool_destroy(region_user_pool); DMWARN("failed to start kernel thread."); return error; } wait_for_completion(&server_completion); if(!atomic_read(&server_run)){ + mempool_destroy(region_user_pool); DMWARN("Cluster mirror log server thread failed to start"); return -1; } @@ -1337,9 +1339,17 @@ void stop_server(void){ + DMDEBUG("stop_server called"); atomic_set(&server_run, 0); wait_for_completion(&server_completion); + down(&log_list_lock); + if (!list_empty(&log_list_head)) { + DMERR("Log elements remain at cluster log server shutdown"); + } + up(&log_list_lock); + mempool_destroy(region_user_pool); + dm_io_put(32); } /* --- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h 2007/04/10 07:12:24 1.1.2.3 +++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h 2007/04/24 20:08:57 1.1.2.4 @@ -7,7 +7,7 @@ #ifndef __DM_CMIRROR_SERVER_H__ #define __DM_CMIRROR_SERVER_H__ -int suspend_server(void); +int server_busy(struct log_c *lc); int resume_server(void); int resume_server_requests(void); int start_server(void); --- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h 2007/04/03 18:21:10 1.1.2.4 +++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h 2007/04/24 20:08:57 1.1.2.5 @@ -30,14 +30,15 @@ ((x) == LRT_IS_CLEAN) ? "LRT_IS_CLEAN": \ ((x) == LRT_IN_SYNC) ? "LRT_IN_SYNC": \ ((x) == LRT_MARK_REGION) ? "LRT_MARK_REGION": \ + ((x) == LRT_CLEAR_REGION) ? "LRT_CLEAR_REGION": \ ((x) == LRT_FLUSH) ? "LRT_FLUSH": \ ((x) == LRT_GET_RESYNC_WORK) ? "LRT_GET_RESYNC_WORK": \ - ((x) == LRT_GET_SYNC_COUNT) ? "LRT_GET_SYNC_COUNT": \ - ((x) == LRT_CLEAR_REGION) ? 
"LRT_CLEAR_REGION": \ ((x) == LRT_COMPLETE_RESYNC_WORK) ? "LRT_COMPLETE_RESYNC_WORK": \ - ((x) == LRT_MASTER_LEAVING) ? "LRT_MASTER_LEAVING": \ + ((x) == LRT_GET_SYNC_COUNT) ? "LRT_GET_SYNC_COUNT": \ ((x) == LRT_ELECTION) ? "LRT_ELECTION": \ - ((x) == LRT_SELECTION) ? "LRT_SELECTION": "UNKNOWN" + ((x) == LRT_SELECTION) ? "LRT_SELECTION": \ + ((x) == LRT_MASTER_ASSIGN) ? "LRT_MASTER_ASSIGN": \ + ((x) == LRT_MASTER_LEAVING) ? "LRT_MASTER_LEAVING" : "UNKNOWN" struct log_request { int lr_type;