From mboxrd@z Thu Jan 1 00:00:00 1970 From: rpeterso@sourceware.org Date: 13 Oct 2006 16:03:49 -0000 Subject: [Cluster-devel] cluster fence/fenced/group.c fence/fenced/main ... Message-ID: <20061013160349.31623.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Changes by: rpeterso at sourceware.org 2006-10-13 16:03:48 Modified files: fence/fenced : group.c main.c gnbd/utils : group.c group/dlm_controld: group.c group/gfs_controld: group.c group/lib : libgroup.c libgroup.h Log message: This fix is for bugzilla 210641: Race condition hang/failure between cman daemons and groupd. Added a retry with timeout to group_init and all its callers. Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/fence/fenced/group.c.diff?cvsroot=cluster&r1=1.9&r2=1.10 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/fence/fenced/main.c.diff?cvsroot=cluster&r1=1.37&r2=1.38 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/gnbd/utils/group.c.diff?cvsroot=cluster&r1=1.1&r2=1.2 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/dlm_controld/group.c.diff?cvsroot=cluster&r1=1.2&r2=1.3 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/group.c.diff?cvsroot=cluster&r1=1.2&r2=1.3 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/lib/libgroup.c.diff?cvsroot=cluster&r1=1.20&r2=1.21 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/lib/libgroup.h.diff?cvsroot=cluster&r1=1.16&r2=1.17 --- cluster/fence/fenced/group.c 2006/06/20 18:11:58 1.9 +++ cluster/fence/fenced/group.c 2006/10/13 16:03:47 1.10 @@ -18,6 +18,8 @@ #define DO_TERMINATE 4 #define DO_SETID 5 +#define GROUPD_TIMEOUT 10 /* seconds */ + /* save all the params from callback functions here because we can't do the processing within the callback function itself */ @@ -157,17 +159,14 @@ { int rv; - gh = group_init(NULL, "fence", 0, &callbacks); + gh = group_init(NULL, "fence", 0, &callbacks, GROUPD_TIMEOUT); if (!gh) { log_error("group_init error %d %d", (int) gh, errno); return -ENOTCONN; } - rv = group_get_fd(gh); - if (rv < 0) { + if (rv < 0) log_error("group_get_fd error %d %d", rv, errno); - } - return rv; } --- cluster/fence/fenced/main.c 2006/08/15 17:17:45 1.37 +++ cluster/fence/fenced/main.c 2006/10/13 16:03:47 1.38 @@ -182,7 +182,7 @@ fd = find_domain(name); if (fd) { - log_debug("join error: domain exists"); + log_debug("join error: domain %s exists", name); rv = -EEXIST; goto out; } @@ -204,6 +204,7 @@ rv = group_join(gh, name); if (rv) { log_error("group_join error %d", rv); + list_del(&fd->list); free(fd); } out: --- cluster/gnbd/utils/group.c 2006/05/16 19:08:18 1.1 +++ cluster/gnbd/utils/group.c 2006/10/13 16:03:47 1.2 @@ -21,6 +21,7 @@ #include "group.h" #define MAXLINE 256 +#define GROUPD_TIMEOUT 10 /* save all the params from callback functions here because we can't do the processing within the callback function itself */ @@ -139,7 +140,7 @@ { int rv; - gh = group_init(NULL, name, 0, &callbacks); + gh = group_init(NULL, name, 0, &callbacks, GROUPD_TIMEOUT); if (!gh) { log_err("group_init error %d %d", (int) gh, errno); return -ENOTCONN; --- cluster/group/dlm_controld/group.c 2006/08/31 18:17:00 1.2 +++ cluster/group/dlm_controld/group.c 2006/10/13 16:03:47 1.3 @@ -18,6 +18,8 @@ #define DO_TERMINATE 4 #define DO_SETID 5 +#define GROUPD_TIMEOUT 10 /* seconds */ + /* save all the params from callback functions here because we can't do the processing within the callback function itself */ @@ -199,7 +201,7 @@ { int rv; - gh = group_init(NULL, "dlm", 1, &callbacks); + gh = group_init(NULL, "dlm", 1, &callbacks, GROUPD_TIMEOUT); if (!gh) { log_error("group_init error %d %d", (int) gh, errno); return -ENOTCONN; --- cluster/group/gfs_controld/group.c 2006/06/15 20:41:46 1.2 +++ cluster/group/gfs_controld/group.c 2006/10/13 16:03:47 1.3 @@ -15,6 +15,8 @@ /* save all the params from callback functions here because we can't do the processing within the callback function itself */ +#define GROUPD_TIMEOUT 10 /* seconds */ + group_handle_t gh; static int cb_action; static char cb_name[MAX_GROUP_NAME_LEN+1]; @@ -168,7 +170,7 @@ int rv; gh = group_init(NULL, LOCK_DLM_GROUP_NAME, LOCK_DLM_GROUP_LEVEL, - &callbacks); + &callbacks, GROUPD_TIMEOUT); if (!gh) { log_error("group_init error %d %d", (int) gh, errno); return -ENOTCONN; --- cluster/group/lib/libgroup.c 2006/09/07 19:24:08 1.20 +++ cluster/group/lib/libgroup.c 2006/10/13 16:03:48 1.21 @@ -279,11 +279,11 @@ } group_handle_t group_init(void *private, char *prog_name, int level, - group_callbacks_t *cbs) + group_callbacks_t *cbs, int timeout) { struct group_handle *h; char buf[GROUPD_MSGLEN]; - int rv, saved_errno; + int rv, saved_errno, i; h = malloc(sizeof(struct group_handle)); if (!h) @@ -295,20 +295,25 @@ h->level = level; strncpy(h->prog_name, prog_name, 32); - h->fd = connect_groupd(); - if (h->fd < 0) - goto fail; - - memset(buf, 0, sizeof(buf)); - snprintf(buf, sizeof(buf), "setup %s %d", prog_name, level); - - rv = do_write(h->fd, &buf, GROUPD_MSGLEN); - if (rv < 0) - goto fail; - - return (group_handle_t) h; - - fail: + for (i = 0; !timeout || i < timeout * 2; i++) { + h->fd = connect_groupd(); + if (h->fd > 0 || !timeout) /* if successful or only once allowed */ + break; + usleep(500000); + } + if (h->fd > 0) { + memset(buf, 0, sizeof(buf)); + snprintf(buf, sizeof(buf), "setup %s %d", prog_name, level); + + for (; !timeout || i < timeout * 2; i++) { + rv = do_write(h->fd, &buf, GROUPD_MSGLEN); + if (rv >= 0) + return (group_handle_t) h; + if (!timeout) + break; + usleep(500000); + } + } saved_errno = errno; close(h->fd); free(h); @@ -475,31 +480,30 @@ return rv; } -int group_get_group(int level, char *name, group_data_t *data) +int group_get_group(int level, const char *name, group_data_t *data) { - char buf[GROUPD_MSGLEN]; - char data_buf[sizeof(group_data_t)]; - int fd, rv, len; + char buf[GROUPD_MSGLEN]; + char data_buf[sizeof(group_data_t)]; + int fd, rv, len; - fd = connect_groupd(); - if (fd < 0) - return fd; + fd = connect_groupd(); + if (fd < 0) + return fd; - memset(buf, 0, sizeof(buf)); - snprintf(buf, sizeof(buf), "get_group %d %s", level, name); + memset(buf, 0, sizeof(buf)); + snprintf(buf, sizeof(buf), "get_group %d %s", level, name); - rv = do_write(fd, &buf, GROUPD_MSGLEN); - if (rv < 0) - goto out; + rv = do_write(fd, &buf, GROUPD_MSGLEN); + if (rv < 0) + goto out; - rv = do_read(fd, &data_buf, sizeof(data_buf)); - if (rv < 0) - goto out; + rv = do_read(fd, &data_buf, sizeof(data_buf)); + if (rv < 0) + goto out; - memcpy(data, data_buf, sizeof(group_data_t)); - rv = 0; + memcpy(data, data_buf, sizeof(group_data_t)); + rv = 0; out: - close(fd); - return rv; + close(fd); + return rv; } - --- cluster/group/lib/libgroup.h 2006/03/02 20:24:17 1.16 +++ cluster/group/lib/libgroup.h 2006/10/13 16:03:48 1.17 @@ -54,7 +54,7 @@ group_deliver_t deliver; } group_callbacks_t; -group_handle_t group_init(void *private, char *prog_name, int level, group_callbacks_t *cbs); +group_handle_t group_init(void *private, char *prog_name, int level, group_callbacks_t *cbs, int timeout); int group_exit(group_handle_t handle); int group_join(group_handle_t handle, char *name); @@ -88,7 +88,7 @@ don't interfere with dispatchable callback messages. */ int group_get_groups(int max, int *count, group_data_t *data); -int group_get_group(int level, char *name, group_data_t *data); +int group_get_group(int level, const char *name, group_data_t *data); #endif