From mboxrd@z Thu Jan 1 00:00:00 1970 From: teigland@sourceware.org Date: 26 Sep 2006 19:17:22 -0000 Subject: [Cluster-devel] cluster/group/daemon app.c cpg.c gd_internal.h ... Message-ID: <20060926191722.21656.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Changes by: teigland at sourceware.org 2006-09-26 19:17:21 Modified files: group/daemon : app.c cpg.c gd_internal.h joinleave.c main.c Log message: Add debugging in four areas to help us know more quickly when something might be wrong at the cpg level: - log if cpg flow control goes on - log when we're waiting to receive a cpg event for our own join - when we're in a FOO_STOP_WAIT or FOO_START_WAIT state, log how many more cpg messages we're waiting to receive before moving on to the next state - save the event id of the last cpg message we sent, and clear that value when we receive that message back (this value is printed to the debug log when someone runs group_tool, not shown in the group_tool output) Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/app.c.diff?cvsroot=cluster&r1=1.45&r2=1.46 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/cpg.c.diff?cvsroot=cluster&r1=1.31&r2=1.32 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/gd_internal.h.diff?cvsroot=cluster&r1=1.40&r2=1.41 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/joinleave.c.diff?cvsroot=cluster&r1=1.17&r2=1.18 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/main.c.diff?cvsroot=cluster&r1=1.44&r2=1.45 --- cluster/group/daemon/app.c 2006/09/15 20:07:15 1.45 +++ cluster/group/daemon/app.c 2006/09/26 19:17:20 1.46 @@ -692,6 +692,7 @@ msg_bswap_out(&msg); log_group(g, "send stopped"); + g->app->sent_event_id = ev->id; return send_message_groupd(g, &msg, sizeof(msg)); } @@ -710,6 +711,7 @@ msg_bswap_out(&msg); log_group(g, "send started"); + g->app->sent_event_id = ev->id; return send_message_groupd(g, &msg, sizeof(msg)); } @@ -788,7 +790,6 @@ } } -#if 0 static int count_nodes_not_stopped(app_t *a) { node_t *node; @@ -800,7 +801,6 @@ } return i; } -#endif int event_state_begin(app_t *a) { @@ -853,7 +853,7 @@ event_t *ev = a->current_event; node_t *node, *n; struct nodeid *id; - int rv = 0, do_start = 0; + int rv = 0, do_start = 0, count; if (!(event_state_stopping(a) || event_state_starting(a))) log_group(g, "process_current_event %llx %d %s", @@ -904,10 +904,9 @@ break; case EST_JOIN_STOP_WAIT: - /* count = count_nodes_not_stopped(a); - log_group(g, "waiting for %d more nodes to be stopped", count); - */ + log_group(g, "waiting for %d more stopped messages " + "before JOIN_ALL_STOPPED", count); break; case EST_JOIN_ALL_STOPPED: @@ -939,10 +938,9 @@ break; case EST_LEAVE_STOP_WAIT: - /* count = count_nodes_not_stopped(a); - log_group(g, "waiting for %d more nodes to be stopped", count); - */ + log_group(g, "waiting for %d more stopped messages " + "before LEAVE_ALL_STOPPED", count); break; case EST_LEAVE_ALL_STOPPED: @@ -993,10 +991,9 @@ break; case EST_FAIL_STOP_WAIT: - /* count = count_nodes_not_stopped(a); - log_group(g, "waiting for %d more nodes to be stopped", count); - */ + log_group(g, "waiting for %d more stopped messages " + "before FAIL_ALL_STOPPED", count); break; case EST_FAIL_ALL_STOPPED: @@ -1470,8 +1467,11 @@ } if (ev) { + a->need_first_event = 0; a->current_event = ev; rv = process_current_event(g); + } else if (a->need_first_event) { + log_group(g, "waiting for our own cpg join event"); } } out: --- cluster/group/daemon/cpg.c 2006/09/08 23:14:56 1.31 +++ cluster/group/daemon/cpg.c 2006/09/26 19:17:20 1.32 @@ -20,6 +20,7 @@ static int saved_left_count; static cpg_handle_t saved_handle; static struct cpg_name saved_name; +static int message_flow_control_on; static node_t *find_group_node(group_t *g, int nodeid) @@ -246,6 +247,9 @@ msg_type(msg->ms_type)); */ + if (nodeid == our_nodeid && g->app->sent_event_id == msg->ms_event_id) + g->app->sent_event_id = 0; + save = malloc(sizeof(struct save_msg)); memset(save, 0, sizeof(struct save_msg)); save->nodeid = nodeid; @@ -375,6 +379,7 @@ cpg_error_t error; cpg_handle_t handle; int found = 0; + cpg_flow_control_state_t flow_control_state; if (ci == groupd_ci) { handle = groupd_handle; @@ -404,6 +409,18 @@ return; } + error = cpg_flow_control_state_get(handle, &flow_control_state); + if (error != CPG_OK) + log_error(g, "cpg_flow_control_state_get %d", error); + else if (flow_control_state == CPG_FLOW_CONTROL_ENABLED) { + message_flow_control_on = 1; + log_debug("flow control on"); + } else { + if (message_flow_control_on) + log_debug("flow control off"); + message_flow_control_on = 0; + } + if (got_confchg) process_confchg(); } --- cluster/group/daemon/gd_internal.h 2006/09/15 18:20:36 1.40 +++ cluster/group/daemon/gd_internal.h 2006/09/26 19:17:21 1.41 @@ -180,6 +180,8 @@ struct list_head events; event_t *current_event; group_t *g; + uint64_t sent_event_id; /* for debugging */ + int need_first_event; /* for debugging */ }; #define MSG_APP_STOPPED 1 --- cluster/group/daemon/joinleave.c 2006/06/28 22:16:36 1.17 +++ cluster/group/daemon/joinleave.c 2006/09/26 19:17:21 1.18 @@ -63,6 +63,7 @@ a = malloc(sizeof(app_t)); memset(a, 0, sizeof(app_t)); + a->need_first_event = 1; INIT_LIST_HEAD(&a->nodes); INIT_LIST_HEAD(&a->events); a->g = g; --- cluster/group/daemon/main.c 2006/09/15 18:20:36 1.44 +++ cluster/group/daemon/main.c 2006/09/26 19:17:21 1.45 @@ -462,6 +462,10 @@ else data->event_local_status = -1; } + + if (g->app->sent_event_id) + log_group(g, "sent_event_id %llx", + g->app->sent_event_id); } data->member_count = g->app->node_count;