From: lhh@sourceware.org <lhh@sourceware.org>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] cluster/rgmanager include/resgroup.h src/daemo ...
Date: 14 Nov 2007 19:03:39 -0000 [thread overview]
Message-ID: <20071114190339.12175.qmail@sourceware.org> (raw)
CVSROOT: /cvs/cluster
Module name: cluster
Branch: RHEL4
Changes by: lhh at sourceware.org 2007-11-14 19:03:37
Modified files:
rgmanager/include: resgroup.h
rgmanager/src/daemons: groups.c rg_state.c
Log message:
Fix #360401 - rgmanager hangs forever during shutdown because of an earlier problem while starting a service
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.3.2.9&r2=1.3.2.10
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.8.2.21&r2=1.8.2.22
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.4.2.21&r2=1.4.2.22
--- cluster/rgmanager/include/resgroup.h 2007/01/03 21:08:17 1.3.2.9
+++ cluster/rgmanager/include/resgroup.h 2007/11/14 19:03:37 1.3.2.10
@@ -156,6 +156,7 @@
cluster_member_list_t *member_list(void);
uint64_t my_id(void);
+#define RG_ERELO -9 /* Operation cannot complete here */
#define RG_ENODEDEATH -8 /* Processing node died */
#define RG_ERUN -7 /* Service is running already */
#define RG_EAGAIN -6 /* Try again */
--- cluster/rgmanager/src/daemons/groups.c 2007/09/28 15:14:52 1.8.2.21
+++ cluster/rgmanager/src/daemons/groups.c 2007/11/14 19:03:37 1.8.2.22
@@ -192,7 +192,8 @@
}
if (st.rs_state != RG_STATE_STARTED &&
- st.rs_state != RG_STATE_STARTING)
+ st.rs_state != RG_STATE_STARTING &&
+ st.rs_state != RG_STATE_STOPPING)
continue;
if (mp->cm_id != st.rs_owner)
--- cluster/rgmanager/src/daemons/rg_state.c 2007/06/28 11:54:50 1.4.2.21
+++ cluster/rgmanager/src/daemons/rg_state.c 2007/11/14 19:03:37 1.4.2.22
@@ -41,10 +41,13 @@
int set_rg_state(char *servicename, rg_state_t *svcblk);
int get_rg_state(char *servicename, rg_state_t *svcblk);
void get_recovery_policy(char *rg_name, char *buf, size_t buflen);
-int have_exclusive_resources();
+int have_exclusive_resources(void);
int check_exclusive_resources(cluster_member_list_t *membership, char *svcName);
+pthread_mutex_t exclusive_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+
uint64_t
next_node_id(cluster_member_list_t *membership, uint64_t me)
{
@@ -446,6 +449,7 @@
break;
case RG_STATE_DISABLED:
+ ret = 2;
case RG_STATE_UNINITIALIZED:
if (req == RG_DISABLE) {
clulog(LOG_NOTICE,
@@ -536,7 +540,7 @@
}
clulog(LOG_NOTICE,
- "Starting stopped service%s\n",
+ "Starting stopped service %s\n",
svcName);
ret = 1;
break;
@@ -557,7 +561,7 @@
snprintf(query,
sizeof(query),
"/cluster/clusternodes/clusternode[@nodeid=\"%d\"]/@name",
- svcStatus->rs_owner);
+ (int)svcStatus->rs_owner);
ccs_get(fd, query, &nodename);
ccs_disconnect(fd);
}
@@ -650,42 +654,61 @@
svc_start(char *svcName, int req)
{
void *lockp = NULL;
- int ret;
+ int ret, xret;
rg_state_t svcStatus;
+ int need_check = have_exclusive_resources();
+ cluster_member_list_t *membership;
+
+ if (need_check)
+ pthread_mutex_lock(&exclusive_mutex);
+
+ ret = RG_EFAIL;
if (rg_lock(svcName, &lockp) < 0) {
clulog(LOG_ERR, "#45: Unable to obtain cluster lock: %s\n",
strerror(errno));
- return FAIL;
+ goto out_nolock;
}
if (get_rg_state(svcName, &svcStatus) != 0) {
- rg_unlock(svcName, lockp);
clulog(LOG_ERR, "#46: Failed getting status for RG %s\n",
svcName);
- return FAIL;
+ goto out_unlock;
+ }
+
+ if (need_check) {
+ membership = member_list();
+ xret = check_exclusive_resources(membership, svcName);
+ cml_free(membership);
+ if (xret != 0) {
+ if (xret > 0)
+ /* Exc. service running */
+ ret = RG_ERELO;
+ else
+ /* XXX */
+ ret = RG_ENOSERVICE;
+ goto out_unlock;
+ }
}
/* LOCK HELD */
switch (svc_advise_start(&svcStatus, svcName, req)) {
case 0: /* Don't start service, return FAIL */
- rg_unlock(svcName, lockp);
- return FAIL;
+ goto out_unlock;
case 2: /* Don't start service, return 0 */
- rg_unlock(svcName, lockp);
- return 0;
+ ret = 0;
+ goto out_unlock;
case 3:
- rg_unlock(svcName, lockp);
- return RG_EAGAIN;
+ ret = RG_EAGAIN;
+ goto out_unlock;
case 4:
- rg_unlock(svcName, lockp);
- return RG_ERUN;
+ ret = RG_ERUN;
+ goto out_unlock;
default:
break;
}
/* LOCK HELD if we get here */
-
svcStatus.rs_owner = my_id();
svcStatus.rs_state = RG_STATE_STARTING;
svcStatus.rs_transition = (uint64_t)time(NULL);
@@ -699,10 +722,17 @@
clulog(LOG_ERR,
"#47: Failed changing service status\n");
rg_unlock(svcName, lockp);
- return FAIL;
+ goto out_unlock;
}
rg_unlock(svcName, lockp);
+
+ /* release excl. mutex during start */
+ if (need_check) {
+ /* Also clear need_check so we don't double-unlock */
+ pthread_mutex_unlock(&exclusive_mutex);
+ need_check = 0;
+ }
ret = group_op(svcName, RG_START);
ret = !!ret; /* Either it worked or it didn't. Ignore all the
@@ -711,17 +741,17 @@
if (rg_lock(svcName, &lockp) < 0) {
clulog(LOG_ERR, "#74: Unable to obtain cluster lock: %s\n",
strerror(errno));
- return FAIL;
+ ret = RG_EFAIL;
+ goto out_nolock;
}
svcStatus.rs_state = RG_STATE_STARTED;
if (set_rg_state(svcName, &svcStatus) != 0) {
clulog(LOG_ERR,
"#75: Failed changing service status\n");
- rg_unlock(svcName, lockp);
- return FAIL;
+ ret = RG_EFAIL;
+ goto out_unlock;
}
- rg_unlock(svcName, lockp);
if (ret == 0)
clulog(LOG_NOTICE,
@@ -732,6 +762,11 @@
"#68: Failed to start %s; return value: %d\n",
svcName, ret);
+out_unlock:
+ rg_unlock(svcName, lockp);
+out_nolock:
+ if (need_check)
+ pthread_mutex_unlock(&exclusive_mutex);
return ret;
}
@@ -1115,7 +1150,7 @@
{
cluster_member_list_t *allowed_nodes, *backup = NULL;
uint64_t target = preferred_target, me = my_id();
- int ret, x;
+ int ret, x, tried = 0;
/*
* Stop the service - if we haven't already done so.
@@ -1181,6 +1216,7 @@
* It's legal to start the service on the given
* node. Try to do so.
*/
+ ++tried;
if (relocate_service(svcName, request, target) == 0) {
*new_owner = target;
/*
@@ -1211,9 +1247,12 @@
if (target == me)
goto exhausted;
+ ++tried;
+
+ /* Each node gets one try */
+ memb_mark_down(allowed_nodes, target);
switch (relocate_service(svcName, request, target)) {
case RG_EFAIL:
- memb_mark_down(allowed_nodes, target);
continue;
case RG_EABORT:
svc_report_failure(svcName);
@@ -1228,9 +1267,10 @@
(uint32_t)(target&0xffffffff), request);
return 0;
case 0:
- *new_owner = target;
clulog(LOG_NOTICE, "Service %s is now running "
"on member %d\n", svcName, (int)target);
+ case RG_ERUN:
+ *new_owner = target;
cml_free(allowed_nodes);
return 0;
default:
@@ -1254,9 +1294,10 @@
*/
exhausted:
if (!rg_locked()) {
- clulog(LOG_WARNING,
- "#70: Attempting to restart service %s locally.\n",
- svcName);
+ if (tried)
+ clulog(LOG_WARNING,
+ "#70: Attempting to restart service %s locally.\n",
+ svcName);
if (svc_start(svcName, RG_START_RECOVER) == 0) {
*new_owner = me;
return FAIL;
@@ -1275,9 +1316,9 @@
int
handle_fd_start_req(char *svcName, int request, uint64_t *new_owner)
{
- cluster_member_list_t *allowed_nodes, *backup = NULL;
+ cluster_member_list_t *allowed_nodes;
uint64_t target, me = my_id();
- int ret, x;
+ int ret;
allowed_nodes = member_list();
@@ -1326,7 +1367,6 @@
}
-pthread_mutex_t exclusive_mutex = PTHREAD_MUTEX_INITIALIZER;
/**
* handle_start_req - Handle a generic start request from a user or during
* service manager boot.
@@ -1342,7 +1382,6 @@
{
int ret, tolerance = FOD_BEST;
cluster_member_list_t *membership = member_list();
- int need_check = have_exclusive_resources();
/*
* When a service request is from a user application (eg, clusvcadm),
@@ -1358,18 +1397,6 @@
cml_free(membership);
return FAIL;
}
- if (need_check) {
- pthread_mutex_lock(&exclusive_mutex);
- ret = check_exclusive_resources(membership, svcName);
- if (ret != 0) {
- cml_free(membership);
- pthread_mutex_unlock(&exclusive_mutex);
- if (ret > 0)
- goto relocate;
- else
- return FAIL;
- }
- }
cml_free(membership);
/*
@@ -1377,25 +1404,22 @@
* mask here - so that we can try all nodes if necessary.
*/
ret = svc_start(svcName, req);
- if (need_check)
- pthread_mutex_unlock(&exclusive_mutex);
-
- /*
- If services are locked, return the error
- */
- if (ret == RG_EAGAIN || ret == RG_ERUN)
+ switch(ret) {
+ case RG_ERELO:
+ goto relocate;
+
+ case RG_EAGAIN:
+ /* If services are locked, return the error */
+ case RG_ENOSERVICE:
+ /* service doesn't exist? */
+ case RG_ERUN:
+ /* If service is already running, return that value */
return ret;
- /*
- * If we succeeded, then we're done.
- */
- if (ret == SUCCESS) {
+ case SUCCESS:
+ /* If we succeeded, then we're done. */
*new_owner = my_id();
- return SUCCESS;
- }
-
- /* Already running? */
- if (ret == NO) {
+ case NO:
return SUCCESS;
}
@@ -1418,13 +1442,13 @@
return RG_EABORT;
}
-relocate:
/*
* OK, it failed to start - but succeeded to stop. Now,
* we should relocate the service.
*/
clulog(LOG_WARNING, "#71: Relocating failed service %s\n",
svcName);
+relocate:
ret = handle_relocate_req(svcName, RG_START_RECOVER, -1, new_owner);
/* If we leave the service stopped, instead of disabled, someone
@@ -1456,7 +1480,6 @@
int x;
uint64_t me = my_id();
cluster_member_list_t *membership = member_list();
- int need_check = have_exclusive_resources();
/* XXX ok, so we need to say "should I start this if I was the
only cluster member online */
@@ -1477,23 +1500,29 @@
cml_free(membership);
return FAIL;
}
- if (need_check) {
- pthread_mutex_lock(&exclusive_mutex);
- if (check_exclusive_resources(membership, svcName) != 0) {
- pthread_mutex_unlock(&exclusive_mutex);
- cml_free(membership);
- return FAIL;
- }
- }
cml_free(membership);
x = svc_start(svcName, req);
- if (need_check)
- pthread_mutex_unlock(&exclusive_mutex);
- if (x == 0)
- return 0;
- if (x == RG_ERUN)
- return RG_ERUN;
+ switch(x) {
+ case RG_ERELO:
+ /* Don't relocate from here; it was a remote start */
+ /* Return fail so the other node can go ahead and
+ try the other nodes in the cluster */
+ case NO:
+ return RG_EFAIL;
+
+ case RG_EAGAIN:
+ /* If services are locked, return the error */
+ case RG_ENOSERVICE:
+ /* service doesn't exist? */
+ case RG_ERUN:
+ /* If service is already running, return that value */
+ return x;
+
+ case SUCCESS:
+ /* If we succeeded, then we're done. */
+ return SUCCESS;
+ }
if (svc_stop(svcName, RG_STOP_RECOVER) == 0)
return RG_EFAIL;
next reply other threads:[~2007-11-14 19:03 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2007-11-14 19:03 lhh [this message]
-- strict thread matches above, loose matches on Subject: below --
2006-12-14 22:03 [Cluster-devel] cluster/rgmanager include/resgroup.h src/daemo lhh
2006-12-13 18:39 lhh
2006-12-13 18:38 lhh
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20071114190339.12175.qmail@sourceware.org \
--to=lhh@sourceware.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.