From mboxrd@z Thu Jan 1 00:00:00 1970 From: bmarzins@sourceware.org Date: 13 Oct 2006 22:32:33 -0000 Subject: [Cluster-devel] cluster/gnbd client/Makefile client/gnbd_monit ... Message-ID: <20061013223233.6038.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Changes by: bmarzins at sourceware.org 2006-10-13 22:32:30 Modified files: gnbd/client : Makefile gnbd_monitor.c gnbd_monitor.h gnbd/server : Makefile gnbd_clusterd.c Removed files: gnbd/utils : group.c group.h Log message: Make gnbd work with cman correctly. This sort of roughly falls under the heading of bz #210415 Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/gnbd/client/Makefile.diff?cvsroot=cluster&r1=1.9&r2=1.10 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/gnbd/client/gnbd_monitor.c.diff?cvsroot=cluster&r1=1.13&r2=1.14 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/gnbd/client/gnbd_monitor.h.diff?cvsroot=cluster&r1=1.3&r2=1.4 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/gnbd/server/Makefile.diff?cvsroot=cluster&r1=1.9&r2=1.10 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/gnbd/server/gnbd_clusterd.c.diff?cvsroot=cluster&r1=1.5&r2=1.6 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/gnbd/utils/group.c.diff?cvsroot=cluster&r1=1.2&r2=NONE http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/gnbd/utils/group.h.diff?cvsroot=cluster&r1=1.1&r2=NONE --- cluster/gnbd/client/Makefile 2006/08/11 15:18:14 1.9 +++ cluster/gnbd/client/Makefile 2006/10/13 22:32:30 1.10 @@ -19,12 +19,11 @@ $(top_srcdir)/utils/gnbd_utils.c MONITOR_SRC= gnbd_monitor.c monitor_req.c $(top_srcdir)/utils/trans.c \ - $(top_srcdir)/utils/gnbd_utils.c $(top_srcdir)/utils/group.c \ + $(top_srcdir)/utils/gnbd_utils.c \ $(top_srcdir)/utils/member_cman.c INCLUDE= -I$(top_srcdir)/include -I$(top_srcdir)/server -I$(top_srcdir)/utils \ - -I${top_srcdir}/config -I${gnbdkincdir} -I${incdir} -I${cmanincdir} \ - -I../../group/lib + -I${top_srcdir}/config -I${gnbdkincdir} -I${incdir} -I${cmanincdir} ifneq (${KERNEL_SRC}, ) # Use the kernel tree if patched, otherwise, look where cluster headers @@ -33,7 +32,7 @@ echo '-I${KERNEL_SRC}/include'; fi) endif -LDLIBS+= -L${libdir} -L${cmanincdir} -L../../group/lib -lcman -lgroup -ldl -lpthread +LDLIBS+= -L${libdir} -L${cmanincdir} -lcman -ldl -lpthread CFLAGS+= -O2 -DGNBD_RELEASE_NAME=\"${RELEASE}\" --- cluster/gnbd/client/gnbd_monitor.c 2006/08/11 15:18:14 1.13 +++ cluster/gnbd/client/gnbd_monitor.c 2006/10/13 22:32:30 1.14 @@ -28,7 +28,6 @@ #include #include "gnbd.h" -#include "group.h" #include "member_cman.h" #include "gnbd_endian.h" #include "list.h" @@ -52,8 +51,15 @@ }; typedef struct waiter_s waiter_t; +struct down_node_s { + int nodeid; + list_t list; +}; +typedef struct down_node_s down_node_t; + #define MAX_NODES 256 +list_decl(down_node_list); list_decl(waiter_list); connection_t *connections; struct pollfd *polls; @@ -63,8 +69,6 @@ cman_handle_t ch; cman_node_t nodes[MAX_NODES]; int num_nodes; -cman_node_t old_nodes[MAX_NODES]; -int old_num_nodes; int cman_cb; int cman_reason; @@ -74,7 +78,6 @@ #define CLUSTER 0 #define CONNECT 1 -#define GROUP 2 list_t monitor_list; @@ -159,15 +162,7 @@ connections[CLUSTER].dev = -1; polls[CONNECT].fd = start_comm_device("gnbd_monitorcomm"); polls[CONNECT].events = POLLIN; - polls[GROUP].fd = setup_groupd("gnbd_monitor"); - if (polls[GROUP].fd < 0) - fail_startup("cannot get group fd\n"); - polls[GROUP].events = POLLIN; - connections[GROUP].buf = NULL; - connections[GROUP].action = 0; - connections[GROUP].size = 0; - connections[GROUP].dev = -1; - for(i = 3; i < open_max(); i++){ + for(i = 2; i < open_max(); i++){ polls[i].fd = -1; polls[i].revents = 0; } @@ -186,10 +181,6 @@ /* FIXME -- again, don't do this */ exit(1); } - if (index == GROUP){ - log_err("lost connection to groupd\n"); - exit(1); - } polls[index].fd = -1; polls[index].revents = 0; free(connections[index].buf); @@ -356,6 +347,7 @@ waiter_t *waiter; block_sigchld(); + dev->state = FAILED_STATE; list_foreach_safe(list_item, &waiter_list, tmp) { waiter = list_entry(list_item, waiter_t, list); @@ -375,14 +367,75 @@ unblock_sigchld(); } -static void statechange(void) +static void fail_devices(char *node) { - int ret; monitor_t *dev; + list_t *item; + + list_foreach(item, &monitor_list) { + dev = list_entry(item, monitor_t, list); + if (strcmp(dev->server, node) == 0) + fail_device(dev); + } +} + +static char *nodeid_to_name(int nodeid) +{ + int i; + + for(i = 0; i < num_nodes; i++) + if (nodes[i].cn_nodeid == nodeid) + return nodes[i].cn_name; + log_err("cannot find node that matches nodeid %d\n", nodeid); + exit(1); +} + +static void check_down_nodes(void) +{ + uint64_t fence_time; + int fenced; + down_node_t *node; list_t *item, *next; + + list_foreach_safe(item, &down_node_list, next){ + node = list_entry(item, down_node_t, list); + if (cman_get_fenceinfo(ch, node->nodeid, &fence_time, &fenced, NULL) < 0) { + log_err("cannot get fence info for nodeid %d : %s\n", node->nodeid, + strerror(errno)); + exit(1); + } + if (fenced){ + fail_devices(nodeid_to_name(node->nodeid)); + list_del(&node->list); + free(node); + } + } +} - old_num_nodes = num_nodes; - memcpy(&old_nodes, &nodes, sizeof(old_nodes)); +static down_node_t *get_down_node(int nodeid) +{ + list_t *item; + down_node_t *node; + + list_foreach(item, &down_node_list) { + node = list_entry(item, down_node_t, list); + if (node->nodeid == nodeid) + return node; + } + return NULL; +} + +static void get_initial_nodelist(void) +{ + if (cman_get_nodes(ch, MAX_NODES, &num_nodes, nodes) < 0) { + log_err("can't get initial cluster node list : %s\n", strerror(errno)); + exit(1); + } +} + +static void statechange(void) +{ + int ret, i; num_nodes = 0; memset(&nodes, 0, sizeof(nodes)); @@ -391,13 +444,41 @@ log_err("can't get cluster node list : %s\n", strerror(errno)); exit(1); } - list_foreach_safe(item, &monitor_list, next){ - dev = list_entry(item, monitor_t, list); - if (check_for_node(old_nodes, old_num_nodes, dev->server) && - !check_for_node(nodes, num_nodes, dev->server)) - fail_device(dev); + for (i = 0; i < num_nodes; i++){ + if (nodes[i].cn_member) { + down_node_t *node = get_down_node(nodes[i].cn_nodeid); + if (!node) + continue; + fail_devices(nodes[i].cn_name); + list_del(&node->list); + free(node); + } + else { + monitor_t *dev; + list_t *item; + if (get_down_node(nodes[i].cn_nodeid)) + continue; + list_foreach(item, &monitor_list) { + down_node_t *node; + dev = list_entry(item, monitor_t, list); + if (strcmp(dev->server, nodes[i].cn_name) != 0) + continue; + if (dev->state == RESET_STATE || dev->state == RESTARTABLE_STATE || + dev->state == FAILED_STATE) + continue; + node = malloc(sizeof(down_node_t)); + if (!node) { + log_err("cannot allocate memory for down node %s\n", + nodes[i].cn_name); + exit(1); + } + node->nodeid = nodes[i].cn_nodeid; + list_add(&node->list, &down_node_list); + break; + } + } } -} +} void handle_cluster_msg(void) { @@ -608,7 +689,7 @@ exit(1); for(i = open_max()-1; i > 2; --i) close(i); - execlp("gnbd_recvd", "gnbd_recvd", "-f", "-d", minor_str); + execlp("gnbd_recvd", "gnbd_recvd", "-f", "-d", minor_str, NULL); exit(1); } @@ -661,9 +742,9 @@ log_err("cman_admin_init failure : %s\n", strerror(errno)); goto cant_fence; } - if (cman_kill_node(ch, server->cn_nodeid) < 0){ + if (cman_kill_node(ach, server->cn_nodeid) < 0){ log_err("fence of %s failed : %s\n", dev->server, strerror(errno)); - cman_finish(ch); + cman_finish(ach); goto cant_fence; } cman_finish(ach); @@ -685,6 +766,7 @@ start_recvd(dev); break; /* FENCED_STATE */ + /* FAILED_STATE */ } } } @@ -717,6 +799,9 @@ case FENCED_STATE: strcpy(state, "fenced"); break; + case FAILED_STATE: + strcpy(state, "failed"); + break; } printf("%8d %7d %s\n", ptr->minor_nr, ptr->timeout, state); } @@ -734,8 +819,11 @@ log_err("poll error : %s\n", strerror(errno)); return; } - if (err == 0) + if (err == 0) { check_devices(); + check_down_nodes(); + return; + } for (i = 0; i <= max_id; i++){ if (polls[i].revents & (POLLERR | POLLHUP | POLLNVAL)){ log_err("Bad poll result, 0x%x on id %d\n", polls[i].revents, i); @@ -747,8 +835,6 @@ accept_connection(); else if (i == CLUSTER) handle_cluster_msg(); - else if (i == GROUP) - default_process_groupd(); else handle_msg(i); } @@ -819,14 +905,13 @@ list_init(&monitor_list); setup_poll(); - err = monitor_device(minor_nr, timeout, argv[3]); if (err) fail_startup("cannot add device #%d to monitor_list : %s\n", minor_nr, strerror(err)); finish_startup("gnbd_monitor started. Monitoring device #%d\n", minor_nr); - + get_initial_nodelist(); while(1){ do_poll(); } --- cluster/gnbd/client/gnbd_monitor.h 2004/08/14 01:33:20 1.3 +++ cluster/gnbd/client/gnbd_monitor.h 2006/10/13 22:32:30 1.4 @@ -23,6 +23,7 @@ #define RESET_STATE 2 #define RESTARTABLE_STATE 3 #define FENCED_STATE 4 +#define FAILED_STATE 5 struct monitor_info_s { int minor_nr; --- cluster/gnbd/server/Makefile 2006/08/11 15:18:14 1.9 +++ cluster/gnbd/server/Makefile 2006/10/13 22:32:30 1.10 @@ -17,16 +17,15 @@ include ${top_srcdir}/make/defines.mk CLU_SOURCE= gnbd_clusterd.c $(top_srcdir)/utils/gnbd_utils.c \ - $(top_srcdir)/utils/member_cman.c $(top_srcdir)/utils/group.c + $(top_srcdir)/utils/member_cman.c -LDLIBS+= -L${libdir} -L${cmanincdir} -L../../group/lib -lcman -lgroup -ldl -lpthread +LDLIBS+= -L${libdir} -L${cmanincdir} -lcman -ldl -lpthread SRV_SOURCE= gnbd_serv.c local_req.c extern_req.c device.c gserv.c fence.c \ $(top_srcdir)/utils/trans.c $(top_srcdir)/utils/gnbd_utils.c -INCLUDE= -I$(top_srcdir)/include -I$(top_srcdir)/utils -I${groupincdir}\ - -I${top_srcdir}/config -I${gnbdkincdir} -I${incdir} -I${cmanincdir} \ - -I../../group/lib +INCLUDE= -I$(top_srcdir)/include -I$(top_srcdir)/utils \ + -I${top_srcdir}/config -I${gnbdkincdir} -I${incdir} -I${cmanincdir} ifneq (${KERNEL_SRC}, ) # Use the kernel tree if patched, otherwise, look where cluster headers --- cluster/gnbd/server/gnbd_clusterd.c 2006/05/16 19:08:17 1.5 +++ cluster/gnbd/server/gnbd_clusterd.c 2006/10/13 22:32:30 1.6 @@ -20,15 +20,12 @@ #include "gnbd_utils.h" #include "member_cman.h" -#include "group.h" #define CMAN 0 -#define GROUP 1 -struct pollfd polls[2]; +struct pollfd polls[1]; static int quit = 0; -group_callbacks_t callbacks; static void sig_usr1(int sig) {} @@ -56,28 +53,16 @@ { polls[CMAN].fd = setup_member(NULL); if (polls[CMAN].fd < 0) - finish_startup("cannot join cman\n"); - polls[GROUP].fd = setup_groupd("gnbd_clusterd"); - if (polls[GROUP].fd < 0) { - exit_member(); - fail_startup("cannot init group\n"); - } - if (group_join(gh, "default")) { - exit_groupd(); - exit_member(); - fail_startup("cannot join group\n"); - } + fail_startup("cannot join cman\n"); polls[CMAN].events = POLLIN; polls[CMAN].revents = 0; - polls[GROUP].events = POLLIN; - polls[GROUP].revents = 0; } void do_poll(void) { int err; - err = poll(polls, 2, -1); + err = poll(polls, 1, -1); if (err < 0) { if (errno != EINTR) log_err("poll error : %s\n", strerror(errno)); @@ -87,15 +72,9 @@ log_err("Bad poll result 0x%x from cluster\n", polls[CMAN].revents); exit(1); } - if (polls[GROUP].revents & (POLLERR | POLLHUP | POLLNVAL)) { - log_err("Bad poll result 0x%x from groupd\n", polls[GROUP].revents); - exit(1); - } if (polls[CMAN].revents & POLLIN) default_process_member(); - if (polls[GROUP].revents & POLLIN) - default_process_groupd(); } int main(int argc, char **argv){ @@ -137,8 +116,6 @@ while(!quit){ do_poll(); } - group_leave(gh, "default"); - group_exit(gh); cman_finish(ch); return 0; }