From mboxrd@z Thu Jan 1 00:00:00 1970 From: teigland@sourceware.org Date: 17 Aug 2007 21:17:53 -0000 Subject: [Cluster-devel] cluster/group/dlm_controld deadlock.c dlm_daem ... Message-ID: <20070817211753.15992.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Changes by: teigland at sourceware.org 2007-08-17 21:17:53 Modified files: group/dlm_controld: deadlock.c dlm_daemon.h main.c Log message: handle addition/removal/failure of nodes during a deadlock cycle serialize deadlock cycles and limit how often cycles are started Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/dlm_controld/deadlock.c.diff?cvsroot=cluster&r1=1.5&r2=1.6 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/dlm_controld/dlm_daemon.h.diff?cvsroot=cluster&r1=1.12&r2=1.13 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/dlm_controld/main.c.diff?cvsroot=cluster&r1=1.13&r2=1.14 --- cluster/group/dlm_controld/deadlock.c 2007/08/13 15:32:05 1.5 +++ cluster/group/dlm_controld/deadlock.c 2007/08/17 21:17:53 1.6 @@ -28,7 +28,8 @@ struct node { struct list_head list; int nodeid; - int checkpoint_ready; + int checkpoint_ready; /* we've read its ckpt */ + int in_cycle; /* participating in cycle */ }; enum { @@ -96,8 +97,9 @@ #define DLM_HEADER_PATCH 0 #define DLM_MSG_CYCLE_START 1 -#define DLM_MSG_CHECKPOINT_READY 2 -#define DLM_MSG_CANCEL_LOCK 3 +#define DLM_MSG_CYCLE_END 2 +#define DLM_MSG_CHECKPOINT_READY 3 +#define DLM_MSG_CANCEL_LOCK 4 struct dlm_header { uint16_t version[3]; @@ -816,6 +818,7 @@ /* unlink an old checkpoint before we create a new one */ if (ls->lock_ckpt_handle) { + log_error("write_checkpoint: old ckpt"); if (_unlink_checkpoint(ls, &name)) return; } @@ -984,6 +987,14 @@ send_message(ls, DLM_MSG_CYCLE_START); } +void send_cycle_end(struct lockspace *ls) +{ + if (!deadlock_enabled) + return; + log_group(ls, "send_cycle_end"); + send_message(ls, DLM_MSG_CYCLE_END); +} + static void send_cancel_lock(struct lockspace *ls, struct trans *tr, struct dlm_lkb *lkb) { @@ -1053,22 +1064,18 @@ static void find_deadlock(struct lockspace *ls); -static void receive_checkpoint_ready(struct lockspace *ls, int nodeid) +static void run_deadlock(struct lockspace *ls) { struct node *node; int not_ready = 0; int low = -1; - log_group(ls, "receive_checkpoint_ready from %d", nodeid); - - read_checkpoint(ls, nodeid); - - /* when locks are read from all nodes, then search_deadlock() - to do detection */ + if (ls->all_checkpoints_ready) + log_group(ls, "WARNING: run_deadlock all_checkpoints_ready"); list_for_each_entry(node, &ls->nodes, list) { - if (node->nodeid == nodeid) - node->checkpoint_ready = 1; + if (!node->in_cycle) + continue; if (!node->checkpoint_ready) not_ready++; @@ -1078,23 +1085,56 @@ if (not_ready) return; + ls->all_checkpoints_ready = 1; + list_for_each_entry(node, &ls->nodes, list) { + if (!node->in_cycle) + continue; if (node->nodeid < low || low == -1) low = node->nodeid; } ls->low_nodeid = low; - log_group(ls, "low nodeid in charge of resolution is %d", low); - find_deadlock(ls); + if (low == our_nodeid) + find_deadlock(ls); + else + log_group(ls, "defer resolution to low nodeid %d", low); +} + +static void receive_checkpoint_ready(struct lockspace *ls, int nodeid) +{ + struct node *node; + + log_group(ls, "receive_checkpoint_ready from %d", nodeid); + + read_checkpoint(ls, nodeid); + + list_for_each_entry(node, &ls->nodes, list) { + if (node->nodeid == nodeid) { + node->checkpoint_ready = 1; + break; + } + } + + run_deadlock(ls); } static void receive_cycle_start(struct lockspace *ls, int nodeid) { + struct node *node; int rv; - log_group(ls, "receive_cycle_start %d", nodeid); + log_group(ls, "receive_cycle_start from %d", nodeid); + + if (ls->cycle_running) { + log_group(ls, "cycle already running"); + return; + } + ls->cycle_running = 1; + gettimeofday(&ls->cycle_start_time, NULL); - gettimeofday(&ls->last_deadlock_check, NULL); + list_for_each_entry(node, &ls->nodes, list) + node->in_cycle = 1; rv = read_debugfs_locks(ls); if (rv < 0) { @@ -1111,6 +1151,46 @@ send_checkpoint_ready(ls); } +static uint64_t dt_usec(struct timeval *start, struct timeval *stop) +{ + uint64_t dt; + + dt = stop->tv_sec - start->tv_sec; + dt *= 1000000; + dt += stop->tv_usec - start->tv_usec; + return dt; +} + +/* TODO: nodes added during a cycle - what will they do with messages + they recv from other nodes running the cycle? */ + +static void receive_cycle_end(struct lockspace *ls, int nodeid) +{ + struct node *node; + uint64_t usec; + + if (!ls->cycle_running) { + log_error("receive_cycle_end %s from %d: no cycle running", + ls->name, nodeid); + return; + } + + gettimeofday(&ls->cycle_end_time, NULL); + usec = dt_usec(&ls->cycle_start_time, &ls->cycle_end_time); + log_group(ls, "receive_cycle_end: from %d cycle time %.2f s", + nodeid, usec * 1.e-6); + + ls->cycle_running = 0; + ls->all_checkpoints_ready = 0; + + list_for_each_entry(node, &ls->nodes, list) + node->checkpoint_ready = 0; + + free_resources(ls); + free_transactions(ls); + unlink_checkpoint(ls); +} + static void receive_cancel_lock(struct lockspace *ls, int nodeid, uint32_t lkid) { dlm_lshandle_t h; @@ -1167,6 +1247,9 @@ case DLM_MSG_CYCLE_START: receive_cycle_start(ls, hd->nodeid); break; + case DLM_MSG_CYCLE_END: + receive_cycle_end(ls, hd->nodeid); + break; case DLM_MSG_CHECKPOINT_READY: receive_checkpoint_ready(ls, hd->nodeid); break; @@ -1203,14 +1286,14 @@ if (node->nodeid != nodeid) continue; - /* TODO: purge locks from this node if we're in a cycle */ - list_del(&node->list); free(node); log_group(ls, "node %d left deadlock cpg", nodeid); } } +static void purge_locks(struct lockspace *ls, int nodeid); + static void confchg_cb(cpg_handle_t handle, struct cpg_name *group_name, struct cpg_address *member_list, int member_list_entries, struct cpg_address *left_list, int left_list_entries, @@ -1230,11 +1313,36 @@ return; } + /* nodes added during a cycle won't have node->in_cycle set so they + won't be included in any of the cycle processing */ + for (i = 0; i < joined_list_entries; i++) node_joined(ls, joined_list[i].nodeid); for (i = 0; i < left_list_entries; i++) node_left(ls, left_list[i].nodeid, left_list[i].reason); + + if (!ls->cycle_running) + return; + + if (!left_list_entries) + return; + + if (!ls->all_checkpoints_ready) { + run_deadlock(ls); + return; + } + + for (i = 0; i < left_list_entries; i++) + purge_locks(ls, left_list[i].nodeid); + + for (i = 0; i < left_list_entries; i++) { + if (left_list[i].nodeid != ls->low_nodeid) + continue; + /* this will set a new low node which will call find_deadlock */ + run_deadlock(ls); + break; + } } static void process_deadlock_cpg(int ci) @@ -1343,6 +1451,29 @@ client_dead(ls->cpg_ci); } +/* would we ever call this after we've created the transaction lists? + I don't think so; I think it can only be called between reading + checkpoints */ + +static void purge_locks(struct lockspace *ls, int nodeid) +{ + struct dlm_rsb *r; + struct dlm_lkb *lkb, *safe; + + list_for_each_entry(r, &ls->resources, list) { + list_for_each_entry_safe(lkb, safe, &r->locks, list) { + if (lkb->home == nodeid) { + list_del(&lkb->list); + if (list_empty(&lkb->trans_list)) + free(lkb); + else + log_group(ls, "purge %d %x on trans", + nodeid, lkb->lock.id); + } + } + } +} + static void add_lkb_trans(struct trans *tr, struct dlm_lkb *lkb) { list_add(&lkb->trans_list, &tr->locks); @@ -1543,7 +1674,7 @@ } if (remove_tr->others_waiting_on_us) - log_debug("trans %llx removed others waiting %d", + log_group(ls, "trans %llx removed others waiting %d", (unsigned long long)remove_tr->xid, remove_tr->others_waiting_on_us); } @@ -1675,16 +1806,14 @@ static void find_deadlock(struct lockspace *ls) { - struct node *node; - if (list_empty(&ls->resources)) { log_group(ls, "no deadlock: no resources"); - return; + goto out; } if (!list_empty(&ls->transactions)) { log_group(ls, "transactions list should be empty"); - return; + goto out; } dump_resources(ls); @@ -1701,13 +1830,6 @@ log_group(ls, "found deadlock"); dump_all_trans(ls); - /* TODO: should probably do this above instead */ - if (ls->low_nodeid != our_nodeid) { - log_group(ls, "defer resolution to low nodeid %d", - ls->low_nodeid); - goto out; - } - cancel_trans(ls); reduce_waitfor_graph_loop(ls); @@ -1718,12 +1840,7 @@ log_error("deadlock resolution failed"); dump_all_trans(ls); - out: - free_resources(ls); - free_transactions(ls); - - list_for_each_entry(node, &ls->nodes, list) - node->checkpoint_ready = 0; + send_cycle_end(ls); } --- cluster/group/dlm_controld/dlm_daemon.h 2007/08/10 20:23:07 1.12 +++ cluster/group/dlm_controld/dlm_daemon.h 2007/08/17 21:17:53 1.13 @@ -94,9 +94,12 @@ struct list_head transactions; struct list_head resources; struct list_head nodes; - struct timeval last_deadlock_check; - unsigned int timewarn_count; + struct timeval cycle_start_time; + struct timeval cycle_end_time; + struct timeval last_send_cycle_start; int got_first_confchg; + int cycle_running; + int all_checkpoints_ready; }; /* action.c */ --- cluster/group/dlm_controld/main.c 2007/08/06 21:50:26 1.13 +++ cluster/group/dlm_controld/main.c 2007/08/17 21:17:53 1.14 @@ -446,28 +446,55 @@ { struct lockspace *ls; struct timeval now; + unsigned int sec; ls = find_ls_id(data->lockspace_id); if (!ls) return; - log_group(ls, "timewarn: lkid %x pid %d count %d", - data->id, data->ownpid, ls->timewarn_count); + data->resource_name[data->resource_namelen] = '\0'; + + log_group(ls, "timewarn: lkid %x pid %d name %s", + data->id, data->ownpid, data->resource_name); + + /* Problem: we don't want to get a timewarn, assume it's resolved + by the current cycle, but in fact it's from a deadlock that + formed after the checkpoints for the current cycle. Then we'd + have to hope for another warning (that may not come) to trigger + a new cycle to catch the deadlock. If our last cycle ckpt + was say N (~5?) sec before we receive the timewarn, then we + can be confident that the cycle included the lock in question. + Otherwise, we're not sure if the warning is for a new deadlock + that's formed since our last cycle ckpt (unless it's a long + enough time since the last cycle that we're confident it *is* + a new deadlock). When there is a deadlock, I suspect it will + be common to receive warnings before, during, and possibly + after the cycle that resolves it. Wonder if we should record + timewarns and match them with deadlock cycles so we can tell + which timewarns are addressed by a given cycle and which aren't. */ + gettimeofday(&now, NULL); - if (now.tv_sec - ls->last_deadlock_check.tv_sec > DEADLOCK_CHECK_SECS) { - ls->timewarn_count = 0; - send_cycle_start(ls); - } else { - /* TODO: set a poll timeout and start another cycle after - DEADLOCK_CHECK_SECS. Want to save a record of all the - warned locks to see if they're still blocked later before - starting a cycle? This would only be helpful if we - experienced regular false-warnings, indicating that the - timewarn setting should be larger. */ - ls->timewarn_count++; + /* don't send a new start until at least SECS after the last + we sent, and at least SECS after the last completed cycle */ + + sec = now.tv_sec - ls->last_send_cycle_start.tv_sec; + + if (sec < DEADLOCK_CHECK_SECS) { + log_group(ls, "skip send: recent send cycle %d sec", sec); + return; + } + + sec = now.tv_sec - ls->cycle_end_time.tv_sec; + + if (sec < DEADLOCK_CHECK_SECS) { + log_group(ls, "skip send: recent cycle end %d sec", sec); + return; } + + gettimeofday(&ls->last_send_cycle_start, NULL); + send_cycle_start(ls); } static void process_netlink(int ci)