cluster-devel.redhat.com archive mirror
 help / color / mirror / Atom feed
* [Cluster-devel] cluster fence/fenced/group.c fence/fenced/main ...
@ 2006-10-13 16:03 rpeterso
  0 siblings, 0 replies; 2+ messages in thread
From: rpeterso @ 2006-10-13 16:03 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	rpeterso at sourceware.org	2006-10-13 16:03:48

Modified files:
	fence/fenced   : group.c main.c 
	gnbd/utils     : group.c 
	group/dlm_controld: group.c 
	group/gfs_controld: group.c 
	group/lib      : libgroup.c libgroup.h 

Log message:
	Fix for bugzilla 210641: a race condition between the cman
	daemons and groupd could cause a hang or failure.  group_init
	now takes a timeout and retries until it expires; all of its
	callers have been updated to pass the new argument.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/fence/fenced/group.c.diff?cvsroot=cluster&r1=1.9&r2=1.10
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/fence/fenced/main.c.diff?cvsroot=cluster&r1=1.37&r2=1.38
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/gnbd/utils/group.c.diff?cvsroot=cluster&r1=1.1&r2=1.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/dlm_controld/group.c.diff?cvsroot=cluster&r1=1.2&r2=1.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/group.c.diff?cvsroot=cluster&r1=1.2&r2=1.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/lib/libgroup.c.diff?cvsroot=cluster&r1=1.20&r2=1.21
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/lib/libgroup.h.diff?cvsroot=cluster&r1=1.16&r2=1.17

--- cluster/fence/fenced/group.c	2006/06/20 18:11:58	1.9
+++ cluster/fence/fenced/group.c	2006/10/13 16:03:47	1.10
@@ -18,6 +18,8 @@
 #define DO_TERMINATE 4
 #define DO_SETID 5
 
+#define GROUPD_TIMEOUT 10 /* seconds */
+
 /* save all the params from callback functions here because we can't
    do the processing within the callback function itself */
 
@@ -157,17 +159,14 @@
 {
 	int rv;
 
-	gh = group_init(NULL, "fence", 0, &callbacks);
+	gh = group_init(NULL, "fence", 0, &callbacks, GROUPD_TIMEOUT);
 	if (!gh) {
 		log_error("group_init error %d %d", (int) gh, errno);
 		return -ENOTCONN;
 	}
-
 	rv = group_get_fd(gh);
-	if (rv < 0) {
+	if (rv < 0)
 		log_error("group_get_fd error %d %d", rv, errno);
-	}
-
 	return rv;
 }
 
--- cluster/fence/fenced/main.c	2006/08/15 17:17:45	1.37
+++ cluster/fence/fenced/main.c	2006/10/13 16:03:47	1.38
@@ -182,7 +182,7 @@
 
 	fd = find_domain(name);
 	if (fd) {
-		log_debug("join error: domain exists");
+		log_debug("join error: domain %s exists", name);
 		rv = -EEXIST;
 		goto out;
 	}
@@ -204,6 +204,7 @@
 	rv = group_join(gh, name);
 	if (rv) {
 		log_error("group_join error %d", rv);
+		list_del(&fd->list);
 		free(fd);
 	}
  out:
--- cluster/gnbd/utils/group.c	2006/05/16 19:08:18	1.1
+++ cluster/gnbd/utils/group.c	2006/10/13 16:03:47	1.2
@@ -21,6 +21,7 @@
 #include "group.h"
 
 #define MAXLINE 256
+#define GROUPD_TIMEOUT 10
 
 /* save all the params from callback functions here because we can't
    do the processing within the callback function itself */
@@ -139,7 +140,7 @@
 {
 	int rv;
 
-	gh = group_init(NULL, name, 0, &callbacks);
+	gh = group_init(NULL, name, 0, &callbacks, GROUPD_TIMEOUT);
 	if (!gh) {
 		log_err("group_init error %d %d", (int) gh, errno);
 		return -ENOTCONN;
--- cluster/group/dlm_controld/group.c	2006/08/31 18:17:00	1.2
+++ cluster/group/dlm_controld/group.c	2006/10/13 16:03:47	1.3
@@ -18,6 +18,8 @@
 #define DO_TERMINATE 4
 #define DO_SETID 5
 
+#define GROUPD_TIMEOUT 10 /* seconds */
+
 /* save all the params from callback functions here because we can't
    do the processing within the callback function itself */
 
@@ -199,7 +201,7 @@
 {
 	int rv;
 
-	gh = group_init(NULL, "dlm", 1, &callbacks);
+	gh = group_init(NULL, "dlm", 1, &callbacks, GROUPD_TIMEOUT);
 	if (!gh) {
 		log_error("group_init error %d %d", (int) gh, errno);
 		return -ENOTCONN;
--- cluster/group/gfs_controld/group.c	2006/06/15 20:41:46	1.2
+++ cluster/group/gfs_controld/group.c	2006/10/13 16:03:47	1.3
@@ -15,6 +15,8 @@
 /* save all the params from callback functions here because we can't
    do the processing within the callback function itself */
 
+#define GROUPD_TIMEOUT 10 /* seconds */
+
 group_handle_t gh;
 static int cb_action;
 static char cb_name[MAX_GROUP_NAME_LEN+1];
@@ -168,7 +170,7 @@
 	int rv;
 
 	gh = group_init(NULL, LOCK_DLM_GROUP_NAME, LOCK_DLM_GROUP_LEVEL,
-			&callbacks);
+					&callbacks, GROUPD_TIMEOUT);
 	if (!gh) {
 		log_error("group_init error %d %d", (int) gh, errno);
 		return -ENOTCONN;
--- cluster/group/lib/libgroup.c	2006/09/07 19:24:08	1.20
+++ cluster/group/lib/libgroup.c	2006/10/13 16:03:48	1.21
@@ -279,11 +279,11 @@
 }
 
 group_handle_t group_init(void *private, char *prog_name, int level,
-			  group_callbacks_t *cbs)
+			  group_callbacks_t *cbs, int timeout)
 {
 	struct group_handle *h;
 	char buf[GROUPD_MSGLEN];
-	int rv, saved_errno;
+	int rv, saved_errno, i;
 
 	h = malloc(sizeof(struct group_handle));
 	if (!h)
@@ -295,20 +295,25 @@
 	h->level = level;
 	strncpy(h->prog_name, prog_name, 32);
 
-	h->fd = connect_groupd();
-	if (h->fd < 0)
-		goto fail;
-
-	memset(buf, 0, sizeof(buf));
-	snprintf(buf, sizeof(buf), "setup %s %d", prog_name, level);
-
-	rv = do_write(h->fd, &buf, GROUPD_MSGLEN);
-	if (rv < 0)
-		goto fail;
-
-	return (group_handle_t) h;
-
- fail:
+	for (i = 0; !timeout || i < timeout * 2; i++) {
+		h->fd = connect_groupd();
+		if (h->fd > 0 || !timeout) /* if successful or only once allowed */
+			break;
+		usleep(500000);
+	}
+	if (h->fd > 0) {
+		memset(buf, 0, sizeof(buf));
+		snprintf(buf, sizeof(buf), "setup %s %d", prog_name, level);
+
+		for (; !timeout || i < timeout * 2; i++) {
+			rv = do_write(h->fd, &buf, GROUPD_MSGLEN);
+			if (rv >= 0)
+				return (group_handle_t) h;
+			if (!timeout)
+				break;
+			usleep(500000);
+		}
+	}
 	saved_errno = errno;
 	close(h->fd);
 	free(h);
@@ -475,31 +480,30 @@
 	return rv;
 }
 
-int group_get_group(int level, char *name, group_data_t *data)
+int group_get_group(int level, const char *name, group_data_t *data)
 {
-	char buf[GROUPD_MSGLEN];
-	char data_buf[sizeof(group_data_t)];
-	int fd, rv, len;
+       char buf[GROUPD_MSGLEN];
+       char data_buf[sizeof(group_data_t)];
+       int fd, rv, len;
 
-	fd = connect_groupd();
-	if (fd < 0)
-		return fd;
+       fd = connect_groupd();
+       if (fd < 0)
+               return fd;
 
-	memset(buf, 0, sizeof(buf));
-	snprintf(buf, sizeof(buf), "get_group %d %s", level, name);
+       memset(buf, 0, sizeof(buf));
+       snprintf(buf, sizeof(buf), "get_group %d %s", level, name);
 
-	rv = do_write(fd, &buf, GROUPD_MSGLEN);
-	if (rv < 0)
-		goto out;
+       rv = do_write(fd, &buf, GROUPD_MSGLEN);
+       if (rv < 0)
+               goto out;
 
-	rv = do_read(fd, &data_buf, sizeof(data_buf));
-	if (rv < 0)
-		goto out;
+       rv = do_read(fd, &data_buf, sizeof(data_buf));
+       if (rv < 0)
+               goto out;
 
-	memcpy(data, data_buf, sizeof(group_data_t));
-	rv = 0;
+       memcpy(data, data_buf, sizeof(group_data_t));
+       rv = 0;
  out:
-	close(fd);
-	return rv;
+       close(fd);
+       return rv;
 }
-
--- cluster/group/lib/libgroup.h	2006/03/02 20:24:17	1.16
+++ cluster/group/lib/libgroup.h	2006/10/13 16:03:48	1.17
@@ -54,7 +54,7 @@
 	group_deliver_t deliver;
 } group_callbacks_t;
 
-group_handle_t group_init(void *private, char *prog_name, int level, group_callbacks_t *cbs);
+group_handle_t group_init(void *private, char *prog_name, int level, group_callbacks_t *cbs, int timeout);
 int group_exit(group_handle_t handle);
 
 int group_join(group_handle_t handle, char *name);
@@ -88,7 +88,7 @@
    don't interfere with dispatchable callback messages. */
 
 int group_get_groups(int max, int *count, group_data_t *data);
-int group_get_group(int level, char *name, group_data_t *data);
+int group_get_group(int level, const char *name, group_data_t *data);
 
 #endif
 



^ permalink raw reply	[flat|nested] 2+ messages in thread

* [Cluster-devel] cluster fence/fenced/group.c fence/fenced/main ...
@ 2007-10-26 20:34 teigland
  0 siblings, 0 replies; 2+ messages in thread
From: teigland @ 2007-10-26 20:34 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL5
Changes by:	teigland at sourceware.org	2007-10-26 20:34:43

Modified files:
	fence/fenced   : group.c main.c member_cman.c 
	group/daemon   : main.c 
	group/gfs_controld: main.c 
	group/lib      : libgroup.c 
	group/tool     : main.c 

Log message:
	Improve the dumping of debug logs from daemons.
	bz 317181
	
	group_tool reads debug logs from groupd, fenced, and gfs_controld.
	The dumping code in all three daemons is now identical.  The other
	change is that the dumping function terminates the final write
	with \0, and no longer sends the entire 1MB log buffer if it's not full.
	
	(Plus a couple of unrelated small changes to sync with HEAD.)

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/fence/fenced/group.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.10&r2=1.10.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/fence/fenced/main.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.38.2.4&r2=1.38.2.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/fence/fenced/member_cman.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.15.2.2&r2=1.15.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/main.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.51.2.7&r2=1.51.2.8
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/main.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.18.2.12&r2=1.18.2.13
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/lib/libgroup.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.24.2.1&r2=1.24.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/tool/main.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.21.2.3&r2=1.21.2.4

--- cluster/fence/fenced/group.c	2006/10/13 16:03:47	1.10
+++ cluster/fence/fenced/group.c	2007/10/26 20:34:37	1.10.2.1
@@ -161,7 +161,7 @@
 
 	gh = group_init(NULL, "fence", 0, &callbacks, GROUPD_TIMEOUT);
 	if (!gh) {
-		log_error("group_init error %d %d", (int) gh, errno);
+		log_error("group_init error %p %d", gh, errno);
 		return -ENOTCONN;
 	}
 	rv = group_get_fd(gh);
--- cluster/fence/fenced/main.c	2007/10/01 16:32:26	1.38.2.4
+++ cluster/fence/fenced/main.c	2007/10/26 20:34:38	1.38.2.5
@@ -338,21 +338,22 @@
 		client[i].fd = -1;
 }
 
-static int do_dump(int ci)
+static int do_dump(int fd)
 {
-	int rv, len = DUMP_SIZE;
+	int len;
 
 	if (dump_wrap) {
 		len = DUMP_SIZE - dump_point;
-		rv = do_write(client[ci].fd, dump_buf + dump_point, len);
-		if (rv < 0)
-			log_debug("write error %d errno %d", rv, errno);
+		do_write(fd, dump_buf + dump_point, len);
+		len = dump_point;
+	} else
 		len = dump_point;
-	}
 
-	rv = do_write(client[ci].fd, dump_buf, len);
-	if (rv < 0)
-		log_debug("write error %d errno %d", rv, errno);
+	/* NUL terminate the debug string */
+	dump_buf[dump_point] = '\0';
+
+	do_write(fd, dump_buf, len);
+
 	return 0;
 }
 
@@ -386,7 +387,8 @@
 	else if (!strcmp(cmd, "leave"))
 		rv = do_leave(name);
 	else if (!strcmp(cmd, "dump")) {
-		do_dump(ci);
+		do_dump(client[ci].fd);
+		close(client[ci].fd);
 		return 0;
 	} else
 		rv = -EINVAL;
--- cluster/fence/fenced/member_cman.c	2007/08/31 14:26:04	1.15.2.2
+++ cluster/fence/fenced/member_cman.c	2007/10/26 20:34:38	1.15.2.3
@@ -123,7 +123,7 @@
 
 	ch = cman_init(NULL);
 	if (!ch) {
-		log_error("cman_init error %d %d", (int) ch, errno);
+		log_error("cman_init error %p %d", ch, errno);
 		return -ENOTCONN;
 	}
 
--- cluster/group/daemon/main.c	2007/09/07 19:22:08	1.51.2.7
+++ cluster/group/daemon/main.c	2007/10/26 20:34:39	1.51.2.8
@@ -553,19 +553,20 @@
 
 static int do_dump(int fd)
 {
-	int rv, len = DUMP_SIZE;
+	int len;
 
 	if (dump_wrap) {
 		len = DUMP_SIZE - dump_point;
-		rv = do_write(fd, dump_buf + dump_point, len);
-		if (rv < 0)
-			log_print("dump write error %d errno %d", rv, errno);
+		do_write(fd, dump_buf + dump_point, len);
+		len = dump_point;
+	} else
 		len = dump_point;
-	}
 
-	rv = do_write(fd, dump_buf, len);
-	if (rv < 0)
-		log_print("dump write error %d errno %d", rv, errno);
+	/* NUL terminate the debug string */
+	dump_buf[dump_point] = '\0';
+
+	do_write(fd, dump_buf, len);
+
 	return 0;
 }
 
@@ -667,6 +668,7 @@
 
 	case DO_DUMP:
 		do_dump(client[ci].fd);
+		close(client[ci].fd);
 		break;
 
 	case DO_LOG:
--- cluster/group/gfs_controld/main.c	2007/06/12 20:05:12	1.18.2.12
+++ cluster/group/gfs_controld/main.c	2007/10/26 20:34:40	1.18.2.13
@@ -255,17 +255,22 @@
 	return do_write(client[ci].fd, buf, len);
 }
 
-static int dump_debug(int ci)
+static int do_dump(int fd)
 {
-	int len = DUMP_SIZE;
+	int len;
 
 	if (dump_wrap) {
 		len = DUMP_SIZE - dump_point;
-		do_write(client[ci].fd, dump_buf + dump_point, len);
+		do_write(fd, dump_buf + dump_point, len);
 		len = dump_point;
-	}
+	} else
+		len = dump_point;
+
+	/* NUL terminate the debug string */
+	dump_buf[dump_point] = '\0';
+
+	do_write(fd, dump_buf, len);
 
-	do_write(client[ci].fd, dump_buf, len);
 	return 0;
 }
 
@@ -375,7 +380,8 @@
 		goto reply;
 
 	} else if (!strcmp(cmd, "dump")) {
-		dump_debug(ci);
+		do_dump(client[ci].fd);
+		close(client[ci].fd);
 
 	} else if (!strcmp(cmd, "plocks")) {
 		dump_plocks(argv[1], client[ci].fd);
--- cluster/group/lib/libgroup.c	2006/11/17 16:30:45	1.24.2.1
+++ cluster/group/lib/libgroup.c	2007/10/26 20:34:41	1.24.2.2
@@ -378,7 +378,7 @@
 {
 	char buf[GROUPD_MSGLEN], *argv[MAXARGS];
 	char *p;
-	int act, argc, rv, i, count, *nodeids;
+	int act, argc, rv, count, *nodeids;
 	struct group_handle *h = (struct group_handle *) handle;
 	VALIDATE_HANDLE(h);
 
@@ -494,7 +494,7 @@
 {
 	char buf[GROUPD_MSGLEN];
 	char data_buf[sizeof(group_data_t)];
-	int fd, rv, len;
+	int fd, rv;
 
 	fd = connect_groupd();
 	if (fd < 0)
--- cluster/group/tool/main.c	2006/12/01 15:26:40	1.21.2.3
+++ cluster/group/tool/main.c	2007/10/26 20:34:42	1.21.2.4
@@ -379,10 +379,8 @@
 		return -1;
 	}
 
-	rv = do_read(fd, inbuf, sizeof(inbuf));
-	if (rv < 0)
-		printf("dump read error %d errno %d\n", rv, errno);
-	do_write(STDOUT_FILENO, inbuf, sizeof(inbuf));
+	do_read(fd, inbuf, sizeof(inbuf));
+	do_write(STDOUT_FILENO, inbuf, strlen(inbuf));
 
 	close(fd);
 	return 0;
@@ -405,9 +403,7 @@
 		return -1;
 	}
 
-	rv = do_read(fd, inbuf, sizeof(inbuf));
-	if (rv < 0)
-		printf("dump read error %d errno %d\n", rv, errno);
+	do_read(fd, inbuf, sizeof(inbuf));
 	do_write(STDOUT_FILENO, inbuf, sizeof(inbuf));
 
 	close(fd);



^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2007-10-26 20:34 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-10-26 20:34 [Cluster-devel] cluster fence/fenced/group.c fence/fenced/main teigland
  -- strict thread matches above, loose matches on Subject: below --
2006-10-13 16:03 rpeterso

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for the NNTP newsgroup(s).