All of lore.kernel.org
 help / color / mirror / Atom feed
From: lhh@sourceware.org <lhh@sourceware.org>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] cluster/rgmanager ChangeLog src/clulib/msg_clu ...
Date: 23 Jul 2007 20:49:15 -0000	[thread overview]
Message-ID: <20070723204915.1157.qmail@sourceware.org> (raw)

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	lhh at sourceware.org	2007-07-23 20:49:13

Modified files:
	rgmanager      : ChangeLog 
	rgmanager/src/clulib: msg_cluster.c msgtest.c vft.c 
	rgmanager/src/daemons: groups.c main.c nodeevent.c rg_event.c 
	                       rg_forward.c rg_state.c rg_thread.c 
	rgmanager/src/resources: service.sh 
	rgmanager/src/utils: clusvcadm.c 
Added files:
	rgmanager/src/clulib: tmgr.c 

Log message:
	Misc. bugfixes; see ChangeLog

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&r1=1.52&r2=1.53
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/tmgr.c.diff?cvsroot=cluster&r1=NONE&r2=1.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/msg_cluster.c.diff?cvsroot=cluster&r1=1.4&r2=1.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/msgtest.c.diff?cvsroot=cluster&r1=1.2&r2=1.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/vft.c.diff?cvsroot=cluster&r1=1.20&r2=1.21
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&r1=1.36&r2=1.37
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/main.c.diff?cvsroot=cluster&r1=1.40&r2=1.41
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/nodeevent.c.diff?cvsroot=cluster&r1=1.7&r2=1.8
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_event.c.diff?cvsroot=cluster&r1=1.1&r2=1.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_forward.c.diff?cvsroot=cluster&r1=1.9&r2=1.10
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&r1=1.37&r2=1.38
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_thread.c.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/service.sh.diff?cvsroot=cluster&r1=1.9&r2=1.10
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/utils/clusvcadm.c.diff?cvsroot=cluster&r1=1.20&r2=1.21

--- cluster/rgmanager/ChangeLog	2007/07/12 11:25:09	1.52
+++ cluster/rgmanager/ChangeLog	2007/07/23 20:49:13	1.53
@@ -1,3 +1,23 @@
+2007-07-23 Lon Hohberger <lhh@redhat.com>
+	* general: make threads exit with pthread_exit() so we can wrap/track them.
+	Add internal statedump (SIGUSR1) support.
+	* src/clulib/msg_cluster.c: Fix rare deadlock condition.  Add dump support.
+	* src/clulib/tmgr.c: Add thread wrappers so we can report threads in
+	internal state dumps.
+	* src/clulib/vft.c: Fix rare crash if vf_resolve_views gets called with
+	NULL.  Add dump support.
+	* src/daemons/main.c: Fix minor memory leak in membership_update().  Fix
+	crash-on-exit race.  Don't exit if someone requests foreground mode.
+	* src/daemons/rg_forward.c: Clean up forwarding logic and handle missed
+	case (forward-to-self -> ERROR!)
+	* src/daemons/rg_state.c: Move closing / free of contexts out of
+	send_ret/send_response to the caller (where they belong).  Don't let
+	people relocate disabled services.
+	* src/daemons/rg_thread.c: Don't loop forever if the thread exits before
+	we notice that it's started.
+	* src/utils/clusvcadm.c: Fix error codes if you try to relocate when
+	rgmanager isn't running.
+
 2007-07-12 Marek Grac <mgrac@redhat.com>
 	* src/resources/Makefile: Fix #245178 - install RA for named
 
/cvs/cluster/cluster/rgmanager/src/clulib/tmgr.c,v  -->  standard output
revision 1.1
--- cluster/rgmanager/src/clulib/tmgr.c
+++ -	2007-07-23 20:49:14.098409000 +0000
@@ -0,0 +1,128 @@
+/*
+  Copyright Red Hat, Inc. 2007
+  Copyright Crosswalk 2006-2007
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the
+  Free Software Foundation; either version 2, or (at your option) any
+  later version.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; see the file COPYING.  If not, write to the
+  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
+  MA 02139, USA.
+*/
+#ifdef WRAP_THREADS
+#include <stdio.h>
+#include <sys/types.h>
+#include <gettid.h>
+#include <pthread.h>
+#include <string.h>
+#include <errno.h>
+#include <malloc.h>
+#include <string.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <pthread.h>
+#include <list.h>
+#include <execinfo.h>
+
+typedef struct _thr {
+	list_head();
+	void *(*fn)(void *arg);
+	char **name;
+	pthread_t th;
+} mthread_t;
+
+static mthread_t *_tlist = NULL;
+static int _tcount = 0;
+static pthread_rwlock_t _tlock = PTHREAD_RWLOCK_INITIALIZER;
+
+void
+dump_thread_states(FILE *fp)
+{
+	int x;
+	mthread_t *curr;
+	fprintf(fp, "Thread Information\n");
+	pthread_rwlock_rdlock(&_tlock);
+	list_for(&_tlist, curr, x) {
+		fprintf(fp, "  Thread #%d   id: %d   function: %s\n",
+			x, (unsigned)curr->th, curr->name[0]);
+	}
+	pthread_rwlock_unlock(&_tlock);
+	fprintf(fp, "\n\n");
+}
+
+
+int __real_pthread_create(pthread_t *, const pthread_attr_t *,
+			  void *(*)(void*), void *);
+int
+__wrap_pthread_create(pthread_t *th, const pthread_attr_t *attr,
+	 	      void *(*start_routine)(void*),
+	 	      void *arg)
+{
+	void *fn = start_routine;
+	mthread_t *new;
+	int ret;
+
+	new = malloc(sizeof (*new));
+
+	ret = __real_pthread_create(th, attr, start_routine, arg);
+	if (ret) {
+		if (new)
+			free(new);
+		return ret;
+	}
+
+	if (new) {
+		new->th = *th;
+		new->fn = start_routine;
+		new->name = backtrace_symbols(&new->fn, 1);
+		pthread_rwlock_wrlock(&_tlock);
+		list_insert(&_tlist, new);
+		++_tcount;
+		pthread_rwlock_unlock(&_tlock);
+	}
+
+	return ret;
+}
+
+
+void __real_pthread_exit(void *);
+void
+__wrap_pthread_exit(void *exitval)
+{
+	mthread_t *old;
+	int ret = 0, found = 0;
+	pthread_t me = pthread_self();
+
+	pthread_rwlock_rdlock(&_tlock);
+	list_for(&_tlist, old, ret) {
+		if (old->th == me) {
+			found = 1;
+			break;
+		}
+	}
+	if (!found)
+		old = NULL;
+	pthread_rwlock_unlock(&_tlock);
+
+	if (!old)
+		__real_pthread_exit(exitval);
+
+	pthread_rwlock_wrlock(&_tlock);
+	list_remove(&_tlist, old);
+	--_tcount;
+	pthread_rwlock_unlock(&_tlock);
+
+	if (old->name)
+		free(old->name);
+	free(old);
+	__real_pthread_exit(exitval);
+}
+#endif
--- cluster/rgmanager/src/clulib/msg_cluster.c	2006/10/23 22:47:00	1.4
+++ cluster/rgmanager/src/clulib/msg_cluster.c	2007/07/23 20:49:13	1.5
@@ -46,7 +46,7 @@
 static msgctx_t *contexts[MAX_CONTEXTS];
 static int _me = 0;
 pthread_t comms_thread;
-int thread_running;
+int thread_running = 0;
 
 #define is_established(ctx) \
 	(((ctx->type == MSG_CLUSTER) && \
@@ -856,7 +856,6 @@
 	errno = EINVAL;
 	cluster_msg_hdr_t *m;
 	msg_q_t *n;
-	char done = 0;
 	char foo;
 
 	if (!listenctx || !acceptctx)
@@ -884,24 +883,38 @@
 		m = n->message;
 		switch(m->msg_control) {
 		case M_OPEN:
+			/* XXX make this case statement its own function or at
+			   least make it not a big case block. */
 			list_remove(&listenctx->u.cluster_info.queue, n);
 			/*printf("Accepting connection from %d %d\n",
 			  	 m->src_nodeid, m->src_ctx);*/
 
-			/* New connection */
+			/* Release lock on listen context queue; we're done
+			   with it at this point */
+			pthread_mutex_unlock(&listenctx->u.cluster_info.mutex);
+
+			/* New connection: first, create + lock the mutex */
 			pthread_mutex_init(&acceptctx->u.cluster_info.mutex,
 					   NULL);
+			/* Lock this while we finish initializing */
+			pthread_mutex_lock(&acceptctx->u.cluster_info.mutex);
+
 			pthread_cond_init(&acceptctx->u.cluster_info.cond,
 					  NULL);
+
 			acceptctx->u.cluster_info.queue = NULL;
 			acceptctx->u.cluster_info.remote_ctx = m->src_ctx;
 			acceptctx->u.cluster_info.nodeid = m->src_nodeid;
 			acceptctx->u.cluster_info.port = m->msg_port;
 			acceptctx->flags = (SKF_READ | SKF_WRITE);
 
-			if (assign_ctx(acceptctx) < 0) {
+			/* assign_ctx requires the context lock.  We need to 
+			ensure we don't try to take the context lock w/ a local
+			queue lock held on a context that's in progress (i.e.
+			the global cluster context...) */
+			if (assign_ctx(acceptctx) < 0)
 				printf("FAILED TO ASSIGN CONTEXT\n");
-			}
+
 			cluster_send_control_msg(acceptctx, M_OPEN_ACK);
 
 			if (listenctx->u.cluster_info.select_pipe[0] >= 0) {
@@ -910,11 +923,14 @@
 				     &foo, 1);
 			}
 
-			done = 1;
 			free(m);
 			free(n);
 
-			break;
+			/* Let the new context go. */
+			pthread_mutex_unlock(&acceptctx->u.cluster_info.mutex);
+			return 0;
+			/* notreached */
+
 		case M_DATA:
 			/* Data messages (i.e. from broadcast msgs) are
 			   okay too!...  but we don't handle them here */
@@ -925,9 +941,6 @@
 			break;
 		}
 
-		if (done)
-			break;
-
 	} while (!list_done(&listenctx->u.cluster_info.queue, n));
 
 	pthread_mutex_unlock(&listenctx->u.cluster_info.mutex);
@@ -950,7 +963,7 @@
 		poll_cluster_messages(2);
 	}
 
-	return NULL;
+	pthread_exit(NULL);
 }
 
 
@@ -1105,7 +1118,7 @@
 
        	pthread_attr_init(&attrs);
        	pthread_attr_setinheritsched(&attrs, PTHREAD_INHERIT_SCHED);
-       	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
+       	/*pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);*/
 
 	thread_running = 1;	
 	pthread_create(&comms_thread, &attrs, cluster_comms_thread, NULL);
@@ -1130,16 +1143,81 @@
 }
 
 
+void
+dump_cluster_ctx(FILE *fp)
+{
+	int x;
+	msgctx_t *ctx;
+
+	fprintf(fp, "CMAN/mux subsystem status\n");
+	if (thread_running) {
+		fprintf(fp, "  Thread: %d\n", (unsigned)comms_thread);
+	} else {
+		fprintf(fp, "  Thread Offline\n");
+	}
+
+	pthread_mutex_lock(&context_lock);
+	for (x = 0; x < MAX_CONTEXTS; x++) {
+		if (!contexts[x]) 
+			continue;
+		ctx = contexts[x];
+
+		fprintf(fp, "    Cluster Message Context %p\n", ctx);
+		fprintf(fp, "      Flags %08x  ", ctx->flags);
+		if (ctx->flags & SKF_READ)
+			fprintf(fp, "SKF_READ ");
+		if (ctx->flags & SKF_WRITE)
+			fprintf(fp, "SKF_WRITE ");
+		if (ctx->flags & SKF_LISTEN)
+			fprintf(fp, "SKF_LISTEN ");
+		if (ctx->flags & SKF_MCAST)
+			fprintf(fp, "SKF_MCAST ");
+		fprintf(fp, "\n");
+		fprintf(fp, "      Target node ID %d\n", ctx->u.cluster_info.nodeid);
+		fprintf(fp, "      Local Index %d\n", ctx->u.cluster_info.local_ctx);
+		fprintf(fp, "      Remote Index %d\n", ctx->u.cluster_info.remote_ctx);
+	}
+	pthread_mutex_unlock(&context_lock);
+	fprintf(fp, "\n");
+}
+
+
 int
 cluster_msg_shutdown(void)
 {
 	cman_handle_t ch;
+	cluster_msg_hdr_t m;
+	msgctx_t *ctx;
+	int x;
+
+	thread_running = 0;
+	pthread_join(comms_thread, NULL);
 
 	ch = cman_lock(1, SIGUSR2);
 	cman_end_recv_data(ch);
-	pthread_kill(comms_thread, SIGTERM);
 	cman_unlock(ch);
 
+	/* Send close message to all open contexts */
+	memset(&m, 0, sizeof(m));
+	m.msg_control = M_CLOSE;
+
+	pthread_mutex_lock(&context_lock);
+	for (x = 0; x < MAX_CONTEXTS; x++) {
+		if (!contexts[x])
+			continue;
+
+		ctx = contexts[x];
+
+		/* Kill remote side if it exists */
+		if (is_established(ctx))
+			cluster_send_control_msg(ctx, M_CLOSE);
+
+		/* Queue close for local side */
+		queue_for_context(ctx, (void *)&m, sizeof(m));
+	}
+	pthread_mutex_unlock(&context_lock);
+
+
 	return 0;
 }
 
--- cluster/rgmanager/src/clulib/msgtest.c	2006/08/07 22:05:01	1.2
+++ cluster/rgmanager/src/clulib/msgtest.c	2007/07/23 20:49:13	1.3
@@ -49,7 +49,7 @@
 
 	if (msg_open(MSG_CLUSTER, 0, MYPORT, &ctx, 0) != 0) {
 		printf("Could not set up mcast socket!\n");
-		return NULL;
+		pthread_exit(NULL);
 	}
 
 	printf("PIGGYBACK CONTEXT\n");
@@ -66,7 +66,7 @@
 
 	printf("PIGGY flies...\n");
 
-	return NULL;
+	pthread_exit(NULL);
 }
 
 
@@ -102,7 +102,7 @@
 
 		if (msg_open(MSG_CLUSTER, 0, MYPORT, &ctx, 1) != 0) {
 			printf("Could not set up mcast socket!\n");
-			return NULL;
+			pthread_exit(NULL);
 		}
 
 		snprintf(buf, sizeof(buf), "Babble, babble\n");
@@ -116,7 +116,7 @@
 
 	printf("Private thread is outta here...\n");
 
-	return NULL;
+	pthread_exit(NULL);
 }
 
 
--- cluster/rgmanager/src/clulib/vft.c	2007/04/27 04:23:05	1.20
+++ cluster/rgmanager/src/clulib/vft.c	2007/07/23 20:49:13	1.21
@@ -121,9 +121,9 @@
 
 
 struct vf_args {
-	uint16_t port;
-	int local_node_id;
 	msgctx_t *ctx;
+	int local_node_id;
+	uint16_t port;
 };
 
 
@@ -277,6 +277,9 @@
 	uint32_t datalen;
 	uint32_t trans;
 
+	if (!key_node)
+		return 0;
+
 	while ((trans = vf_try_commit(key_node)) != 0) {
 		commits++;
 	}
@@ -895,7 +898,7 @@
 
 	msg_close(ctx);
 	msg_free_ctx(ctx);
-	return NULL;
+	pthread_exit(NULL);
 }
 
 
@@ -1776,3 +1779,40 @@
 	return VFR_OK;
 }
 
+
+void
+dump_vf_states(FILE *fp)
+{
+	key_node_t *cur;
+
+	fprintf(fp, "View-Formation States:\n");
+	fprintf(fp, "  Thread: %d\n", (unsigned)vf_thread);
+	fprintf(fp, "  Default callbacks:\n    Vote: %p\n    Commit: %p\n",
+		default_vote_cb, default_commit_cb);
+	fprintf(fp, "  Distributed key metadata:\n");
+
+	pthread_mutex_lock(&key_list_mutex);
+
+	for (cur = key_list; cur; cur = cur->kn_next) {
+		fprintf(fp, "    %s, View: %d, Size: %d, Address: %p\n",
+			cur->kn_keyid,
+			(int)cur->kn_viewno,
+			cur->kn_datalen,
+			cur->kn_data);
+		if (cur->kn_vote_cb != default_vote_cb) 
+			fprintf(fp, "      Vote callback: %p\n", cur->kn_vote_cb);
+		if (cur->kn_commit_cb != default_commit_cb) 
+			fprintf(fp, "      Commit callback: %p\n", cur->kn_commit_cb);
+
+		if (cur->kn_jvlist)
+			fprintf(fp, "        This key has unresolved "
+			        "new views pending\n");
+ 		if (cur->kn_clist)
+			fprintf(fp, "        This key has unresolved "
+			        "commits pending\n");
+
+	}
+
+	pthread_mutex_unlock(&key_list_mutex);
+	fprintf(fp, "\n");
+}
--- cluster/rgmanager/src/daemons/groups.c	2007/07/10 18:25:26	1.36
+++ cluster/rgmanager/src/daemons/groups.c	2007/07/23 20:49:13	1.37
@@ -1033,7 +1033,7 @@
 		msg_send_simple(ctx, RG_FAIL, RG_EAGAIN, 0);
 		msg_close(ctx);
 		msg_free_ctx(ctx);
-		return NULL;
+		pthread_exit(NULL);
 	}
 	
 	pthread_rwlock_rdlock(&resource_lock);
@@ -1056,7 +1056,7 @@
 	
 	rg_dec_status();
 
-	return NULL;
+	pthread_exit(NULL);
 }
 
 
@@ -1172,7 +1172,7 @@
 	
 	/* Only one status thread at a time, please! */
 	if (pthread_mutex_trylock(&status_mutex) != 0)
-		return NULL;
+		pthread_exit(NULL);
 
 	pthread_rwlock_rdlock(&resource_lock);
 	list_do(&_tree, curr) {
@@ -1198,7 +1198,7 @@
 	pthread_rwlock_unlock(&resource_lock);
 	pthread_mutex_unlock(&status_mutex);
 
-	return NULL;
+	pthread_exit(NULL);
 }
 
 
@@ -1400,6 +1400,13 @@
 }
 
 
+void
+dump_config_version(FILE *fp)
+{
+	fprintf(fp, "Cluster configuration version %d\n\n", config_version);
+}
+
+
 /**
   Initialize resource groups.  This reads all the resource groups from 
   CCS, builds the tree, etc.  Ideally, we'll have a similar function 
--- cluster/rgmanager/src/daemons/main.c	2007/06/27 14:03:51	1.40
+++ cluster/rgmanager/src/daemons/main.c	2007/07/23 20:49:13	1.41
@@ -40,6 +40,9 @@
 #define L_SYS (1<<1)
 #define L_USER (1<<0)
 
+#ifdef WRAP_THREADS
+void dump_thread_states(FILE *);
+#endif
 int configure_logging(int ccsfd, int debug);
 
 void node_event(int, int, int, int);
@@ -63,7 +66,7 @@
 
 int next_node_id(cluster_member_list_t *membership, int me);
 int rg_event_q(char *svcName, uint32_t state, int owner);
-
+void malloc_dump_table(FILE *, size_t, size_t);
 
 void
 segfault(int sig)
@@ -259,6 +262,7 @@
 
 	free_member_list(node_delta);
 	free_member_list(new_ml);
+	free_member_list(old_membership);
 
 	rg_unlockall(L_SYS);
 
@@ -405,7 +409,8 @@
 	sz = msg_receive(ctx, msg_hdr, sizeof(msgbuf), 1);
 	if (sz < sizeof (generic_msg_hdr)) {
 		clulog(LOG_ERR,
-		       "#37: Error receiving message header (%d)\n", sz);
+		       "#37: Error receiving header from %d sz=%d CTX %p\n",
+		       nodeid, sz, ctx);
 		goto out;
 	}
 
@@ -593,6 +598,7 @@
 		break;
 
 	case M_DATA:
+		nodeid = msg_get_nodeid(ctx);
 		dispatch_msg(ctx, nodeid, 0);
 		break;
 		
@@ -629,7 +635,26 @@
 }
 
 
-void dump_threads(void);
+void dump_threads(FILE *fp);
+void dump_config_version(FILE *fp);
+void dump_vf_states(FILE *fp);
+void dump_cluster_ctx(FILE *fp);
+
+void
+dump_internal_state(char *loc)
+{
+	FILE *fp;
+	fp=fopen(loc, "w+");
+ 	dump_config_version(fp);
+ 	dump_threads(fp);
+ 	dump_vf_states(fp);
+#ifdef WRAP_THREADS
+	dump_thread_states(fp);
+#endif
+	dump_cluster_ctx(fp);
+	//malloc_dump_table(fp, 1, 16384); /* Only works if alloc.c is used */
+ 	fclose(fp);
+}
 
 int
 event_loop(msgctx_t *localctx, msgctx_t *clusterctx)
@@ -645,10 +670,8 @@
 
 	if (signalled) {
 		signalled = 0;
-		/*
-		malloc_stats();
-		dump_threads();
-		 */
+ 
+		dump_internal_state("/tmp/rgmanager-dump");
 	}
 
 	while (running && (tv.tv_sec || tv.tv_usec)) {
@@ -747,7 +770,6 @@
 cleanup(msgctx_t *clusterctx)
 {
 	kill_resource_groups();
-	member_list_update(NULL);
 	send_exit_msg(clusterctx);
 }
 
@@ -760,7 +782,7 @@
 }
 
 
-void malloc_dump_table(size_t, size_t);
+void malloc_dump_table(FILE *, size_t, size_t);
 
 
 /*
@@ -846,10 +868,13 @@
 	rg_doall(RG_STOP_EXITING, 1, NULL);
 	running = 0;
 
-	return 0;
+	pthread_exit(NULL);
 }
 
 
+#ifdef WRAP_THREADS
+void dump_thread_states(FILE *);
+#endif
 int
 main(int argc, char **argv)
 {
@@ -871,7 +896,9 @@
 			break;
 		case 'f':
 			foreground = 1;
+			break;
 		default:
+			return 1;
 			break;
 		}
 	}
@@ -984,6 +1011,9 @@
 		event_loop(local_ctx, cluster_ctx);
 
 		if (shutdown_pending == 1) {
+			/* Kill local socket; local requests need to
+			   be ignored here */
+			msg_close(local_ctx);
 			++shutdown_pending;
 			clulog(LOG_NOTICE, "Shutting down\n");
 			pthread_create(&th, NULL, shutdown_thread, NULL);
--- cluster/rgmanager/src/daemons/nodeevent.c	2007/06/27 14:03:51	1.7
+++ cluster/rgmanager/src/daemons/nodeevent.c	2007/07/23 20:49:13	1.8
@@ -196,7 +196,7 @@
 	/* Mutex held */
 	ne_thread = 0;
 	pthread_mutex_unlock(&ne_queue_mutex);
-	return NULL;
+	pthread_exit(NULL);
 }
 
 
--- cluster/rgmanager/src/daemons/rg_event.c	2006/07/11 23:52:41	1.1
+++ cluster/rgmanager/src/daemons/rg_event.c	2007/07/23 20:49:13	1.2
@@ -64,7 +64,7 @@
 	/* Mutex held */
 	rg_ev_thread = 0;
 	pthread_mutex_unlock(&rg_queue_mutex);
-	return NULL;
+	pthread_exit(NULL);
 }
 
 
--- cluster/rgmanager/src/daemons/rg_forward.c	2006/12/14 22:03:17	1.9
+++ cluster/rgmanager/src/daemons/rg_forward.c	2007/07/23 20:49:13	1.10
@@ -24,6 +24,7 @@
 #include <msgsimple.h>
 #include <clulog.h>
 #include <message.h>
+#include <members.h>
 
 
 void
@@ -49,59 +50,100 @@
 	request_t *req = (request_t *)arg;
 	struct dlm_lksb lockp;
 	msgctx_t *ctx = NULL;
+	cluster_member_list_t *m = NULL;
 	SmMessageSt msg;
+	int response_code = RG_EAGAIN, ret;
+	int new_owner = 0, retries = 0;
 
-	if (rg_lock(req->rr_group, &lockp) != 0)
+	if (rg_lock(req->rr_group, &lockp) != 0) {
+		clulog(LOG_WARNING, "FW: Forwarding failed; lock unavailable for %s\n",
+		       req->rr_group);
 		goto out_fail;
-
+	}
 	if (get_rg_state(req->rr_group, &rgs) != 0) {
 		rg_unlock(&lockp);
+		clulog(LOG_WARNING, "FW: Forwarding failed; state unavailable for %s\n",
+		       req->rr_group);
 		goto out_fail;
 	}
-
 	rg_unlock(&lockp);
 
-	/* Construct message */
-	build_message(&msg, req->rr_request, req->rr_group, req->rr_target);
-
 	if (rgs.rs_owner == 0)
 		rgs.rs_owner = req->rr_target;
 	if (rgs.rs_owner == 0) {
-		msg_close(req->rr_resp_ctx);
-		msg_free_ctx(req->rr_resp_ctx);
-		rq_free(req);
-		clulog(LOG_ERR, "Attempt to forward to invalid node ID\n");
-		pthread_exit(NULL);
+		clulog(LOG_ERR, "FW: Attempt to forward to invalid node ID\n");
+       		goto out_fail;
+	}
+	if (rgs.rs_owner == my_id()) {
+		clulog(LOG_WARNING, "BUG! Attempt to forward to myself!\n");
+       		goto out_fail;
 	}
 
-	clulog(LOG_DEBUG, "Forwarding %s request to %d\n",
+	clulog(LOG_DEBUG, "FW: Forwarding %s request to %d\n",
 	       rg_req_str(req->rr_request), rgs.rs_owner);
 
-	while ((ctx = msg_new_ctx()) == NULL)
-		sleep(1);
-
-	if (msg_open(MSG_CLUSTER, rgs.rs_owner, RG_PORT, ctx, 10) < 0)
+	ctx = msg_new_ctx();
+	if (ctx == NULL) {
+		clulog(LOG_DEBUG, "FW: Failed to allocate socket context: %s\n",
+		       strerror(errno));
 		goto out_fail;
-	if (msg_send(ctx, &msg, sizeof(msg)) < sizeof(msg))
+	}
+
+	/* Construct message */
+	build_message(&msg, req->rr_request, req->rr_group, req->rr_target);
+
+	if (msg_open(MSG_CLUSTER, rgs.rs_owner, RG_PORT, ctx, 10) < 0) {
+		clulog(LOG_DEBUG, "FW: Failed to open channel to %d CTX: %p\n",
+		       rgs.rs_owner, ctx);
 		goto out_fail;
-	if (msg_receive(ctx, &msg, sizeof(msg), 600) < sizeof(msg))
+	}
+	if (msg_send(ctx, &msg, sizeof(msg)) < sizeof(msg)) {
+		clulog(LOG_DEBUG, "FW: Failed to send message to %d CTX: %p\n",
+		       rgs.rs_owner, ctx);
 		goto out_fail;
+	}
 
-	msg_close(ctx);
-	msg_free_ctx(ctx);
+        /*
+	 * Ok, we're forwarding a message to another node.  Keep tabs on
+	 * the node to make sure it doesn't die.  Basically, wake up every
+	 * now and again to make sure it's still online.  If it isn't, send
+	 * a response back to the caller.
+	 */
+	do {
+		ret = msg_receive(ctx, &msg, sizeof(msg), 10);
+		if (ret < (int)sizeof(msg)) {
+			if (ret < 0 && errno == ETIMEDOUT) {
+				m = member_list();
+				if (!memb_online(m, rgs.rs_owner)) {
+					response_code = RG_ENODE;
+					goto out_fail;
+				}
+				free_member_list(m);
+				m = NULL;
+				continue;
+			}
+			goto out_fail;
+		}
+		break;
+	} while(++retries < 60); /* 60 tries x timeout of 10 == old timeout of 600 */
 
 	swab_SmMessageSt(&msg);
-	send_response(msg.sm_data.d_ret, msg.sm_data.d_svcOwner, req);
-	rq_free(req);
-	pthread_exit(NULL);
-	
-out_fail: /* Failure path */
+
+	response_code = msg.sm_data.d_ret;
+	new_owner = msg.sm_data.d_svcOwner;
+
+out_fail:
+	send_response(response_code, new_owner, req);
+	msg_close(req->rr_resp_ctx);
+	msg_free_ctx(req->rr_resp_ctx);
+
 	if (ctx) {
 		msg_close(ctx);
 		msg_free_ctx(ctx);
 	}
-	msg_close(req->rr_resp_ctx);
-	msg_free_ctx(req->rr_resp_ctx);
+	if (m)
+		free_member_list(m);
+
 	rq_free(req);
 	pthread_exit(NULL);
 }
--- cluster/rgmanager/src/daemons/rg_state.c	2007/07/02 15:15:00	1.37
+++ cluster/rgmanager/src/daemons/rg_state.c	2007/07/23 20:49:13	1.38
@@ -217,9 +217,6 @@
 
 	swab_SmMessageSt(msgp);
 	msg_send(ctx, msgp, sizeof(*msgp));
-
-	/* :) */
-	msg_close(ctx);
 }
 
 	
@@ -245,11 +242,6 @@
 
 	swab_SmMessageSt(msgp);
 	msg_send(req->rr_resp_ctx, msgp, sizeof(*msgp));
-
-	/* :( */
-	msg_close(req->rr_resp_ctx);
-	msg_free_ctx(req->rr_resp_ctx);
-	req->rr_resp_ctx = NULL;
 }
 
 
@@ -556,6 +548,7 @@
 			break;
 		}
 
+		ret = 2;
 		clulog(LOG_DEBUG, "Not stopping disabled service %s\n",
 		       svcName);
 		break;
@@ -1615,6 +1608,11 @@
 	int ret, x;
 	rg_state_t svcStatus;
 	
+	get_rg_state_local(svcName, &svcStatus);
+	if (svcStatus.rs_state == RG_STATE_DISABLED ||
+	    svcStatus.rs_state == RG_STATE_UNINITIALIZED)
+		return RG_EINVAL;
+
 	if (preferred_target > 0) {
 		/* TODO: simplify this and don't keep alloc/freeing 
 		   member lists */
@@ -1684,8 +1682,10 @@
 		 * I am the ONLY one capable of running this service,
 		 * PERIOD...
 		 */
-		if (target == me && me != preferred_target)
+		if (target == me && me != preferred_target) {
+			free_member_list(backup);
 			goto exhausted;
+		}
 
 		if (target == me) {
 			/*
@@ -1948,8 +1948,16 @@
 	int tolerance = FOD_BEST;
 	int x;
 	uint32_t me = my_id();
-	cluster_member_list_t *membership = member_list();
-	int need_check = have_exclusive_resources();
+	cluster_member_list_t *membership;
+	int need_check;
+
+	if (rg_locked()) {
+		/* don't even calc if rg's locked */
+		return RG_EFAIL;
+	}
+
+	need_check = have_exclusive_resources();
+	membership = member_list();
 
 	/* XXX ok, so we need to say "should I start this if I was the
 	   only cluster member online */
@@ -2042,25 +2050,28 @@
 	 				  svcName, 1);
 	  	if (target == me) {
 	   		ret = handle_start_remote_req(svcName, request);
+			if (ret == RG_EAGAIN)
+				goto out;
 	    	} else if (target < 0) {
-	     		free_member_list(allowed_nodes);
-	      		return RG_EFAIL;
+			goto out;
 	       	} else {
 			ret = relocate_service(svcName, request, target);
 		}
 
 		switch(ret) {
 		case RG_ESUCCESS:
-		    	return RG_ESUCCESS;
+		    	ret = RG_ESUCCESS;
+			goto out;
 		case RG_ERUN:
-		      	return RG_ERUN;
+		      	ret = RG_ERUN;
+			goto out;
 		case RG_EFAIL:
 			memb_mark_down(allowed_nodes, target);
 			continue;
 		case RG_EABORT:
 			svc_report_failure(svcName);
-			free_member_list(allowed_nodes);
-       			return RG_EFAIL;
+			ret = RG_EFAIL;
+			goto out;
       		default:
 			clulog(LOG_ERR,
 	 		       "#6X: Invalid reply [%d] from member %d during"
@@ -2068,6 +2079,7 @@
 	   	}
 	}
 
+out:
 	free_member_list(allowed_nodes);
-	return RG_EFAIL;
+	return ret;
 }
--- cluster/rgmanager/src/daemons/rg_thread.c	2007/07/10 18:25:26	1.23
+++ cluster/rgmanager/src/daemons/rg_thread.c	2007/07/23 20:49:13	1.24
@@ -60,19 +60,39 @@
   SIGUSR1 output
  */
 void
-dump_threads(void)
+dump_threads(FILE *fp)
 {
 	resthread_t *rt;
+	request_t *req;
+	int x = 0, y = 0;
 
-	printf("+++ BEGIN Thread dump\n");
+	fprintf(fp, "Resource Group Threads \n");
 	pthread_mutex_lock(&reslist_mutex);
-	list_do(&resthread_list, rt) {
-		printf("TID %d group %s (@ %p) request %d\n",
-		       (int)rt->rt_thread,
-		       rt->rt_name, rt, rt->rt_request);
-	} while (!list_done(&resthread_list, rt));
+	list_for(&resthread_list, rt, x) {
+		fprintf(fp, "  %s id:%d (@ %p) processing %s request (%d)\n",
+		        rt->rt_name,
+		        (unsigned)rt->rt_thread,
+			rt,
+			rg_req_str(rt->rt_request),
+			rt->rt_request);
+		if (!*rt->rt_queue) {
+			fprintf(fp, "    Pending requests: \n");
+			list_for(rt->rt_queue, req, y) {
+				fprintf(fp, "      %s tgt:%d  ctx:%p  a0:%d  a1:%d\n",
+				        rg_req_str(req->rr_request),
+					req->rr_target,
+					req->rr_resp_ctx,
+					req->rr_arg0,
+					req->rr_arg1);
+			}
+		}
+	}
+
+	x = !!resthread_list;
 	pthread_mutex_unlock(&reslist_mutex);
-	printf("--- END Thread dump\n");
+	if (!x)
+		fprintf(fp, "  (none)\n");
+	fprintf(fp, "\n");
 }
 
 
@@ -151,6 +171,8 @@
 		dprintf("Removed request %d\n", curr->rr_request);
 		if (curr->rr_resp_ctx) {
 			send_response(RG_EABORT, 0, curr);
+			msg_close(curr->rr_resp_ctx);
+			msg_free_ctx(curr->rr_resp_ctx);
 		}
 		rq_free(curr);
 	}
@@ -241,12 +263,14 @@
 			break;
 
 		case RG_ENABLE:
+			#if 0
 			if (req->rr_target != 0 &&
 			    req->rr_target != my_id()) {
 				error = RG_EFORWARD;
 				ret = RG_NONE;
 				break;
 			}
+			#endif
 		case RG_START:
 			if (req->rr_arg0) {
 				error = handle_fd_start_req(myname,
@@ -476,6 +500,8 @@
 		if (ret != RG_NONE && rg_initialized() &&
 		    (req->rr_resp_ctx)) {
 			send_response(error, newowner, req);
+			msg_close(req->rr_resp_ctx);
+			msg_free_ctx(req->rr_resp_ctx);
 		}
 		
 		rq_free(req);
@@ -565,7 +591,6 @@
 	int ret;
 	resthread_t *resgroup = NULL;
 
-retry:
 	pthread_mutex_lock(&reslist_mutex);
 	while (resgroup == NULL) {
 		resgroup = find_resthread_byname(resgroupname);
@@ -584,7 +609,7 @@
 
 	pthread_mutex_unlock(&reslist_mutex);
 	if (wait_initialize(resgroupname) < 0) {
-		goto retry;
+		return -1;
 	}
 
 	return ret;
@@ -689,6 +714,8 @@
 		case RG_ENABLE:
 			send_ret(response_ctx, resgroup->rt_name, RG_EDEADLCK,
 				 request);
+			msg_close(response_ctx);
+			msg_free_ctx(response_ctx);
 			break;
 		}
 		fprintf(stderr, "Failed to queue request: Would block\n");
--- cluster/rgmanager/src/resources/service.sh	2007/04/05 15:08:20	1.9
+++ cluster/rgmanager/src/resources/service.sh	2007/07/23 20:49:13	1.10
@@ -67,7 +67,7 @@
             <content type="string"/>
         </parameter>
 
-        <parameter name="autostart">
+        <parameter name="autostart" reconfig="1">
             <longdesc lang="en">
 	    	If set to yes, this resource group will automatically be started
 		after the cluster forms a quorum.  If set to no, this resource
@@ -80,7 +80,7 @@
             <content type="boolean"/>
         </parameter>
 
-        <parameter name="hardrecovery">
+        <parameter name="hardrecovery" reconfig="1">
             <longdesc lang="en">
 	    	If set to yes, the last owner will reboot if this resource
 		group fails to stop cleanly, thus allowing the resource
@@ -128,7 +128,7 @@
 	    <content type="boolean"/>
 	</parameter>
                 
-        <parameter name="recovery">
+        <parameter name="recovery" reconfig="1">
             <longdesc lang="en">
 	        This currently has three possible options: "restart" tries
 		to restart failed parts of this resource group locally before
--- cluster/rgmanager/src/utils/clusvcadm.c	2007/06/14 15:06:52	1.20
+++ cluster/rgmanager/src/utils/clusvcadm.c	2007/07/23 20:49:13	1.21
@@ -390,7 +390,10 @@
 			printf("Member %s %s %s", nodename, actionstr, svcname);
 		printf("...");
 		fflush(stdout);
-		msg_open(MSG_SOCKET, 0, RG_PORT, &ctx, 5);
+		if (msg_open(MSG_SOCKET, 0, RG_PORT, &ctx, 5) < 0) {
+			printf("Could not connect to resource group manager\n");
+			return 1;
+		}
 	} else {
 		if (!svctarget)
 			printf("Trying to %s %s", actionstr, svcname);
@@ -399,7 +402,10 @@
 			       nodename);
 		printf("...");
 		fflush(stdout);
-		msg_open(MSG_SOCKET, 0, RG_PORT, &ctx, 5);
+		if (msg_open(MSG_SOCKET, 0, RG_PORT, &ctx, 5) < 0) {
+			printf("Could not connect to resource group manager\n");
+			return 1;
+		}
 	}
 
 	if (ctx.type < 0) {



                 reply	other threads:[~2007-07-23 20:49 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20070723204915.1157.qmail@sourceware.org \
    --to=lhh@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.