From mboxrd@z Thu Jan 1 00:00:00 1970 From: jbrassow@sourceware.org Date: 5 Sep 2006 17:50:12 -0000 Subject: [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ... Message-ID: <20060905175012.3340.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Branch: RHEL4 Changes by: jbrassow at sourceware.org 2006-09-05 17:50:11 Modified files: cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c Log message: - fix the bugs I've seen so far - mostly related to the recently added ability to migrate the log server on suspension - that cause hangs during combinations of create/delete/convert of mirrors Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.23&r2=1.1.2.24 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.16&r2=1.1.2.17 --- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2006/07/27 23:11:55 1.1.2.23 +++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2006/09/05 17:50:11 1.1.2.24 @@ -396,8 +396,8 @@ set_fs(get_ds()); if(type == LRT_MASTER_LEAVING){ - len = sock_recvmsg(lc->client_sock, &msg, sizeof(struct log_request), - /* WAIT for it */0); + len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request), + 0, 10); } else { len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request), 0, 5); @@ -419,7 +419,7 @@ goto fail; } - if(lr->u.lr_int_rtn == -ENXIO){ + if (lr->u.lr_int_rtn == -ENXIO) { lc->server_id = 0xDEAD; *retry = 1; goto fail; @@ -591,7 +591,7 @@ unsigned int argc, char **argv, int disk) { int error = 0; - struct log_c *lc; + struct log_c *lc, *tmp_lc; struct sockaddr_in saddr_in; if (!disk) { @@ -621,6 +621,15 @@ atomic_set(&lc->in_sync, -1); + list_for_each_entry(tmp_lc, &log_list_head, log_list){ + if(!strncmp(tmp_lc->uuid, lc->uuid, MAX_NAME_LEN)){ + DMERR("Log already exists with uuid, %s", + lc->uuid + (strlen(lc->uuid) - 8)); + error = -EINVAL; + goto fail; + } + } + list_add(&lc->log_list, &log_list_head); INIT_LIST_HEAD(&lc->region_users); @@ -730,6 +739,7 @@ list_del_init(&lc->log_list); if ((lc->server_id == my_id) && !atomic_read(&lc->suspended)) consult_server(lc, 0, LRT_MASTER_LEAVING, NULL); + sock_release(lc->client_sock); if (lc->log_dev) @@ -748,6 +758,7 @@ static int cluster_postsuspend(struct dirty_log *log) { + int r; struct log_c *lc = (struct log_c *) log->context; while (1) { @@ -765,12 +776,16 @@ if(lc->server_id == my_id) { while (1) { consult_server(lc, 0, LRT_MASTER_LEAVING, NULL); + down(&consult_server_lock); run_election(lc, 0xDEAD); up(&consult_server_lock); - if (lc->server_id == my_id) { + + if ((my_id && (lc->server_id == my_id))) { + atomic_set(&lc->suspended, 0); set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ/4); + schedule_timeout(HZ*2); + atomic_set(&lc->suspended, 1); } else { break; } @@ -1005,7 +1020,7 @@ if (!success) { DMERR("Attempting to revert sync status of region #%llu", region); set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ/50); + schedule_timeout(HZ/5); } return; --- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2006/07/27 23:11:55 1.1.2.16 +++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2006/09/05 17:50:11 1.1.2.17 @@ -107,7 +107,9 @@ if (!log->log_dev) return 0; - BUG_ON(atomic_read(&log->suspended)); + if (atomic_read(&log->suspended)) + return -EDEADLK; + r = dm_io_sync_vm(1, &log->header_location, READ, log->disk_header, &ebits); if (unlikely(r)) @@ -138,7 +140,9 @@ if (!log->log_dev) return 0; - BUG_ON(atomic_read(&log->suspended)); + if (atomic_read(&log->suspended)) + return -EDEADLK; + header_to_disk(&log->header, log->disk_header); return dm_io_sync_vm(1, &log->header_location, WRITE, log->disk_header, &ebits); @@ -182,7 +186,9 @@ if (!log->log_dev) return 0; - BUG_ON(atomic_read(&log->suspended)); + if (atomic_read(&log->suspended)) + return -EDEADLK; + r = dm_io_sync_vm(1, &log->bits_location, READ, log->clean_bits, &ebits); @@ -199,7 +205,9 @@ if (!log->log_dev) return 0; - BUG_ON(atomic_read(&log->suspended)); + if (atomic_read(&log->suspended)) + return -EDEADLK; + return dm_io_sync_vm(1, &log->bits_location, WRITE, log->clean_bits, &ebits); } @@ -252,9 +260,9 @@ continue; } else if(str[i] == 0xFF){ if(range_count==1){ - DMINFO(" %d", region - 1); + DMDEBUG(" %d", region - 1); } else if(range_count){ - DMINFO(" %d - %d", region-range_count, region-1); + DMDEBUG(" %d - %d", region-range_count, region-1); } range_count = 0; region+=(bit_count < 8)? bit_count: 8; @@ -272,9 +280,9 @@ count++; } else { if(range_count==1){ - DMINFO(" %d", region - 1); + DMDEBUG(" %d", region - 1); } else if(range_count){ - DMINFO(" %d - %d", region-range_count, region-1); + DMDEBUG(" %d - %d", region-range_count, region-1); } range_count = 0; region++; @@ -283,9 +291,9 @@ } if(range_count==1){ - DMINFO(" %d", region - 1); + DMDEBUG(" %d", region - 1); } else if(range_count){ - DMINFO(" %d - %d", region-range_count, region); + DMDEBUG(" %d - %d", region-range_count, region); } return count; } @@ -312,7 +320,7 @@ i = 1; if (!lc->log_dev_failed && ((r = read_header(lc)) || (i = 0) || (r = read_bits(lc)))) { - if (r == -EINVAL) + if (r == -EINVAL || r == -EDEADLK) return r; DMWARN("Read %s failed on mirror log device, %s", @@ -416,9 +424,11 @@ i = 1; if ((r = write_bits(lc)) || (i = 0) || (r = write_header(lc))) { - DMWARN("Write %s failed on mirror log device, %s.", - i ? "bits" : "header", lc->log_dev->name); - lc->log_dev_failed = 1; + if (r != -EDEADLK) { + DMWARN("Write %s failed on mirror log device, %s.", + i ? "bits" : "header", lc->log_dev->name); + lc->log_dev_failed = 1; + } } else lc->log_dev_failed = 0; @@ -469,6 +479,11 @@ static int server_in_sync(struct log_c *lc, struct log_request *lr) { + if (lr->u.lr_region > lc->region_count) { + lr->u.lr_int_rtn = 0; + return -EINVAL; + } + if(likely(log_test_bit(lc->sync_bits, lr->u.lr_region))) /* in-sync */ lr->u.lr_int_rtn = 1; @@ -581,6 +596,11 @@ static int server_complete_resync_work(struct log_c *lc, struct log_request *lr, int success){ uint32_t info; + + if (lr->u.lr_region > lc->region_count) { + return -EINVAL; + } + log_clear_bit(lc, lc->recovering_bits, lr->u.lr_region); if (success) { @@ -678,15 +698,16 @@ /* * Check if we have access to the log. We may not - * get have loaded this device. + * yet have loaded this device. */ - if(!lc){ + if (!lc) { lr->u.lr_node_count++; return 0; } if(lr->lr_type == LRT_MASTER_LEAVING){ - lc->server_id = 0xDEAD; + if (lr->u.lr_starter == lc->server_id) + lc->server_id = 0xDEAD; lr->u.lr_node_count++; return 0; } @@ -696,7 +717,7 @@ * We shortcut the election here and respond directly * to the inquirer */ - if(lc->server_id == my_id){ + if((lc->server_id == my_id) && !atomic_read(&lc->suspended)){ lr->u.lr_coordinator = my_id; if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){ return -1; @@ -850,10 +871,12 @@ if(lc && (old != lc->server_id) && (my_id == lc->server_id)){ DMDEBUG("I'm the cluster mirror log server for %s", lc->uuid + (strlen(lc->uuid) - 8)); - if (!atomic_read(&lc->suspended)) - disk_resume(lc); - else + if (atomic_read(&lc->suspended)) { DMDEBUG("Not reading disk log because I'm suspended."); + + } else if (disk_resume(lc) == -EDEADLK) { + DMDEBUG("Unable to read disk log - deadlock potential."); + } } goto reply; } @@ -944,7 +967,7 @@ /* DMWARN("Error (%d) while processing request (%s)", error, - (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_C LEAN": + (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN": (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC": (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION": (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK": @@ -972,7 +995,18 @@ set_fs(fs); if(error < 0){ - DMWARN("unable to sendmsg to client (error = %d)", error); + DMWARN("unable to sendmsg to client (type = %s, error = %d)", + (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN": + (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC": + (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION": + (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK": + (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT": + (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION": + (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK": + (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING": + (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION": + (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN", + error); return error; } } else if(error == -EAGAIN || error == -ETIMEDOUT){ @@ -1036,10 +1070,11 @@ list_for_each_entry(lc, &log_list_head, log_list){ if(lc->server_id == my_id){ - if (!atomic_read(&lc->suspended)) - disk_resume(lc); - else + if (atomic_read(&lc->suspended)) { DMDEBUG("Not reading disk log because I'm suspended."); + } else if (disk_resume(lc) == -EDEADLK) { + DMDEBUG("Unable to read disk log - deadlock potential."); + } } } break;