* [Cluster-devel] cluster/rgmanager ChangeLog include/resgroup.h ...
@ 2006-10-06 21:22 lhh
0 siblings, 0 replies; 11+ messages in thread
From: lhh @ 2006-10-06 21:22 UTC (permalink / raw)
To: cluster-devel.redhat.com
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: lhh at sourceware.org 2006-10-06 21:22:28
Modified files:
rgmanager : ChangeLog
rgmanager/include: resgroup.h
rgmanager/src/daemons: groups.c main.c
rgmanager/src/utils: clustat.c
Log message:
Fix #202497
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&r1=1.27&r2=1.28
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&r1=1.14&r2=1.15
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/main.c.diff?cvsroot=cluster&r1=1.32&r2=1.33
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/utils/clustat.c.diff?cvsroot=cluster&r1=1.23&r2=1.24
--- cluster/rgmanager/ChangeLog 2006/09/27 18:58:53 1.27
+++ cluster/rgmanager/ChangeLog 2006/10/06 21:22:27 1.28
@@ -1,3 +1,13 @@
+2006-10-06 Lon Hohberger <lhh@redhat.com>
+ * src/daemons/main.c: Fix #202497: provide rgmanager's view of
+ who is running rgmanager to clustat.
+ * src/daemons/groups.c: Fix tiny memory leak during configuration
+ changes
+ * include/resgroup.h: #202497: Flip unused RG_STATUS_INQUIRY to
+ RG_STATUS_NODE.
+ * src/utils/clustat.c: #202497: Send RG_STATUS_NODE to clurgmgrd
+ in order to obtain rgmanager "group" status information
+
2006-09-27 Lon Hohberger <lhh@redhat.com>
* src/daemons/rg_state.c: Fix #208011 - failed->disabled state
transition. Fix node ID type.
--- cluster/rgmanager/include/resgroup.h 2006/09/01 19:02:21 1.14
+++ cluster/rgmanager/include/resgroup.h 2006/10/06 21:22:27 1.15
@@ -68,7 +68,7 @@
#define RG_EXITING 9
#define RG_INIT 10
#define RG_ENABLE 11
-#define RG_STATUS_INQUIRY 12
+#define RG_STATUS_NODE 12
#define RG_RELOCATE 13
#define RG_CONDSTOP 14
#define RG_CONDSTART 15
--- cluster/rgmanager/src/daemons/groups.c 2006/09/27 16:28:41 1.23
+++ cluster/rgmanager/src/daemons/groups.c 2006/10/06 21:22:27 1.24
@@ -1228,6 +1228,7 @@
pthread_mutex_lock(&config_mutex);
config_version = atoi(val);
pthread_mutex_unlock(&config_mutex);
+ free(val);
}
clulog(LOG_DEBUG, "Building Resource Trees\n");
--- cluster/rgmanager/src/daemons/main.c 2006/09/27 16:28:41 1.32
+++ cluster/rgmanager/src/daemons/main.c 2006/10/06 21:22:27 1.33
@@ -87,6 +87,26 @@
void
+send_node_states(msgctx_t *ctx)
+{
+ int x;
+ generic_msg_hdr hdr;
+ cluster_member_list_t *ml = member_list();
+
+ for (x = 0; x < ml->cml_count; x++) {
+ if (ml->cml_members[x].cn_member == 1) {
+ msg_send_simple(ctx, RG_STATUS_NODE,
+ ml->cml_members[x].cn_nodeid, 0);
+ }
+ }
+ msg_send_simple(ctx, RG_SUCCESS,
+ ml->cml_members[x].cn_nodeid, 0);
+ msg_receive(ctx, &hdr, sizeof(hdr), 10);
+ free_member_list(ml);
+}
+
+
+void
flag_reconfigure(int sig)
{
need_reconfigure++;
@@ -417,8 +437,13 @@
switch (msg_hdr->gh_command) {
case RG_STATUS:
clulog(LOG_DEBUG, "Sending service states to CTX%p\n",ctx);
- send_rg_states(ctx, msg_hdr->gh_arg1);
- need_close = 0;
+ if (send_rg_states(ctx, msg_hdr->gh_arg1) == 0)
+ need_close = 0;
+ break;
+
+ case RG_STATUS_NODE:
+ clulog(LOG_DEBUG, "Sending node states to CTX%p\n",ctx);
+ send_node_states(ctx);
break;
case RG_LOCK:
--- cluster/rgmanager/src/utils/clustat.c 2006/09/27 16:43:39 1.23
+++ cluster/rgmanager/src/utils/clustat.c 2006/10/06 21:22:28 1.24
@@ -43,6 +43,94 @@
} rg_state_list_t;
+void
+flag_rgmanager_nodes(cluster_member_list_t *cml)
+{
+ msgctx_t ctx;
+ int max = 0, n;
+ generic_msg_hdr *msgp;
+ fd_set rfds;
+
+ struct timeval tv;
+
+ if (msg_open(MSG_SOCKET, 0, 0, &ctx, 10) < 0)
+ return;
+
+ msg_send_simple(&ctx, RG_STATUS_NODE, 0, 0);
+
+ while (1) {
+ FD_ZERO(&rfds);
+ msg_fd_set(&ctx, &rfds, &max);
+ tv.tv_sec = 10;
+ tv.tv_usec = 0;
+
+ n = select(max+1, &rfds, NULL, NULL, &tv);
+ if (n == 0) {
+ fprintf(stderr, "Timed out waiting for a response "
+ "from Resource Group Manager\n");
+ break;
+ }
+
+ if (n < 0) {
+ if (errno == EAGAIN ||
+ errno == EINTR)
+ continue;
+ fprintf(stderr, "Failed to receive "
+ "service data: select: %s\n",
+ strerror(errno));
+ break;
+ }
+
+ n = msg_receive_simple(&ctx, &msgp, tv.tv_sec);
+
+ if (n < 0) {
+ if (errno == EAGAIN)
+ continue;
+ perror("msg_receive_simple");
+ break;
+ }
+ if (n < sizeof(generic_msg_hdr)) {
+ printf("Error: Malformed message\n");
+ break;
+ }
+
+ if (!msgp) {
+ printf("Error: no message?!\n");
+ break;
+ }
+
+ swab_generic_msg_hdr(msgp);
+
+ if (msgp->gh_command == RG_FAIL) {
+ printf("Member states unavailable: %s\n",
+ rg_strerror(msgp->gh_arg1));
+ free(msgp);
+ msg_close(&ctx);
+ return;
+ }
+
+ if (msgp->gh_command == RG_SUCCESS) {
+ free(msgp);
+ break;
+ }
+
+ for (n = 0; n < cml->cml_count; n++) {
+ if (cml->cml_members[n].cn_nodeid != msgp->gh_arg1)
+ continue;
+ cml->cml_members[n].cn_member |= FLAG_RGMGR;
+ }
+
+ free(msgp);
+ msgp = NULL;
+ }
+
+ msg_send_simple(&ctx, RG_SUCCESS, 0, 0);
+ msg_close(&ctx);
+
+ return;
+}
+
+
rg_state_list_t *
rg_state_list(int local_node_id, int fast)
{
@@ -791,6 +879,9 @@
membership = build_member_list(ch, &local_node_id);
rgs = rg_state_list(local_node_id, fast);
+ if (rgs) {
+ flag_rgmanager_nodes(membership);
+ }
if (refresh_sec) {
setupterm((char *) 0, STDOUT_FILENO, (int *) 0);
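
Taken together, the two sides above define a simple wire exchange:
clustat sends a single RG_STATUS_NODE request; clurgmgrd's
send_node_states() replies with one RG_STATUS_NODE message per online
member, then an RG_SUCCESS sentinel; the client acknowledges with
RG_SUCCESS so the daemon's final msg_receive() can return. A condensed
sketch of the client side (it assumes rgmanager's internal msg_* API;
mark_rgmanager_node() is a hypothetical stand-in for the FLAG_RGMGR
bookkeeping done in flag_rgmanager_nodes() above):

/* Condensed sketch of the RG_STATUS_NODE exchange; assumes rgmanager's
   internal message API. mark_rgmanager_node() is hypothetical. */
static void
query_rgmanager_nodes(msgctx_t *ctx)
{
	generic_msg_hdr *msgp;
	int n;

	msg_send_simple(ctx, RG_STATUS_NODE, 0, 0);	/* one request */

	while ((n = msg_receive_simple(ctx, &msgp, 10)) >=
	       (int)sizeof(generic_msg_hdr) && msgp) {
		swab_generic_msg_hdr(msgp);
		if (msgp->gh_command == RG_STATUS_NODE) {
			/* gh_arg1 carries the member's node ID */
			mark_rgmanager_node(msgp->gh_arg1); /* hypothetical */
		} else {
			/* RG_SUCCESS sentinel (or error): list is done */
			free(msgp);
			break;
		}
		free(msgp);
	}

	msg_send_simple(ctx, RG_SUCCESS, 0, 0);	/* ack so the daemon's
						   msg_receive() returns */
}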
* [Cluster-devel] cluster/rgmanager ChangeLog include/resgroup.h ...
@ 2007-12-30 8:27 fabbione
0 siblings, 0 replies; 11+ messages in thread
From: fabbione @ 2007-12-30 8:27 UTC (permalink / raw)
To: cluster-devel.redhat.com
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: fabbione at sourceware.org 2007-12-30 08:27:21
Modified files:
rgmanager : ChangeLog
rgmanager/include: resgroup.h
rgmanager/src/clulib: alloc.c
rgmanager/src/daemons: rg_state.c
Log message:
Fix building when -DDEBUG is defined.
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&r1=1.69&r2=1.70
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&r1=1.26&r2=1.27
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/alloc.c.diff?cvsroot=cluster&r1=1.12&r2=1.13
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&r1=1.43&r2=1.44
--- cluster/rgmanager/ChangeLog 2007/12/24 05:26:42 1.69
+++ cluster/rgmanager/ChangeLog 2007/12/30 08:27:21 1.70
@@ -1,3 +1,7 @@
+2007-12-30 Fabio M. Di Nitto <fabbione@ubuntu.com>
+ * src/daemons/rg_state.c, include/resgroup.h, src/clulib/alloc.c:
+ Fix building when -DDEBUG is defined.
+
2007-12-24 Fabio M. Di Nitto <fabbione@ubuntu.com>
* src/clulib/vft.c: Change ifdef to fix build on parisc.
--- cluster/rgmanager/include/resgroup.h 2007/12/14 19:37:00 1.26
+++ cluster/rgmanager/include/resgroup.h 2007/12/30 08:27:21 1.27
@@ -186,7 +186,7 @@
#define rg_lock(name, p) _rg_lock_dbg(name, p, __FILE__, __LINE__)
int _rg_unlock_dbg(struct dlm_lksb *, char *, int);
-#define rg_unlock(name, p) _rg_unlock_dbg(name, p, __FILE__, __LINE__)
+#define rg_unlock(p) _rg_unlock_dbg(p, __FILE__, __LINE__)
#else
int rg_lock(char *name, struct dlm_lksb *p);
--- cluster/rgmanager/src/clulib/alloc.c 2007/08/09 09:22:24 1.12
+++ cluster/rgmanager/src/clulib/alloc.c 2007/12/30 08:27:21 1.13
@@ -116,10 +116,12 @@
#include <unistd.h>
#include <sys/wait.h>
-#if !defined(__ia64__) && !defined(__hppa__)
#ifndef DEBUG
#define DEBUG /* Record program counter of malloc/calloc */
#endif /* or realloc call; print misc stuff out */
+
+#if defined(__ia64__) || defined(__hppa__)
+#undef DEBUG
#endif
/* Tunable stuff XXX This should be external */
--- cluster/rgmanager/src/daemons/rg_state.c 2007/12/14 19:37:00 1.43
+++ cluster/rgmanager/src/daemons/rg_state.c 2007/12/30 08:27:21 1.44
@@ -195,7 +195,6 @@
return _rg_lock(name, p);
}
#endif
-
int
@@ -211,7 +210,7 @@
#ifdef DEBUG
int
-_rg_unlock_dbg(void *p, char *file, int line)
+_rg_unlock_dbg(struct dlm_lksb *p, char *file, int line)
{
dprintf("rg_unlock() @ %s:%d\n", file, line);
return _rg_unlock(p);
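
The breakage was an arity mismatch visible only under -DDEBUG:
resgroup.h expanded rg_unlock(name, p) into a four-argument call to
_rg_unlock_dbg(), while the function takes a lock pointer plus file and
line (and rg_state.c declared that pointer as void * rather than
struct dlm_lksb *). A minimal illustration of the corrected pairing,
using simplified stand-in declarations rather than the real headers:

/* Simplified stand-ins showing the DEBUG-build pairing after this fix;
   not the actual rgmanager headers. */
#include <stdio.h>

struct dlm_lksb;			/* opaque lock status block */
int _rg_unlock(struct dlm_lksb *p);	/* real unlock routine */

static int
_rg_unlock_dbg(struct dlm_lksb *p, const char *file, int line)
{
	printf("rg_unlock() @ %s:%d\n", file, line);	/* cf. dprintf() */
	return _rg_unlock(p);
}
#define rg_unlock(p)	_rg_unlock_dbg(p, __FILE__, __LINE__)

Callers continue to write rg_unlock(&lockp); the macro now supplies
exactly the arguments the function accepts.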
* [Cluster-devel] cluster/rgmanager ChangeLog include/resgroup.h ...
@ 2007-12-14 19:37 lhh
0 siblings, 0 replies; 11+ messages in thread
From: lhh @ 2007-12-14 19:37 UTC (permalink / raw)
To: cluster-devel.redhat.com
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: lhh at sourceware.org 2007-12-14 19:37:00
Modified files:
rgmanager : ChangeLog
rgmanager/include: resgroup.h
rgmanager/src/daemons: rg_state.c service_op.c
Log message:
Add return value for inability to run due to exclusive flag being present
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&r1=1.64&r2=1.65
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&r1=1.25&r2=1.26
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&r1=1.42&r2=1.43
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/service_op.c.diff?cvsroot=cluster&r1=1.1&r2=1.2
--- cluster/rgmanager/ChangeLog 2007/12/12 21:41:32 1.64
+++ cluster/rgmanager/ChangeLog 2007/12/14 19:37:00 1.65
@@ -1,3 +1,7 @@
+2007-12-14 Lon Hohberger <lhh@redhat.com>
+ * Add return code for inability to run because of exclusive
+ tag
+
2007-12-12 Lon Hohberger <lhh@redhat.com>
* Misc changes; add missing ds.h
* src/resources/default*.sl: Make clusvcadm -r go to a different
--- cluster/rgmanager/include/resgroup.h 2007/11/30 21:36:28 1.25
+++ cluster/rgmanager/include/resgroup.h 2007/12/14 19:37:00 1.26
@@ -202,6 +202,8 @@
int my_id(void);
/* Return codes */
+#define RG_EEXCL -16 /* Service not runnable due to
+ inability to start exclusively */
#define RG_EDOMAIN -15 /* Service not runnable given the
set of nodes and its failover
domain */
--- cluster/rgmanager/src/daemons/rg_state.c 2007/11/30 21:36:28 1.42
+++ cluster/rgmanager/src/daemons/rg_state.c 2007/12/14 19:37:00 1.43
@@ -2067,7 +2067,7 @@
if (check_exclusive_resources(membership, svcName) != 0) {
free_member_list(membership);
pthread_mutex_unlock(&exclusive_mutex);
- return RG_EFAIL;
+ return RG_EEXCL;
}
}
free_member_list(membership);
--- cluster/rgmanager/src/daemons/service_op.c 2007/11/30 21:36:28 1.1
+++ cluster/rgmanager/src/daemons/service_op.c 2007/12/14 19:37:00 1.2
@@ -51,6 +51,7 @@
{
int target;
int ret, x;
+ int excl = 0, dep = 0, fail = 0;
rg_state_t svcStatus;
if (get_service_state_internal(svcName, &svcStatus) < 0) {
@@ -74,8 +75,14 @@
if (new_owner)
*new_owner = svcStatus.rs_owner;
return 0;
+ case RG_EEXCL:
+ ++excl;
+ continue;
case RG_EDEPEND:
+ ++dep;
+ continue;
case RG_EFAIL:
+ ++fail;
continue;
case RG_EABORT:
svc_report_failure(svcName);
@@ -100,7 +107,15 @@
}
}
- return RG_EFAIL;
+ ret = RG_EFAIL;
+ if (excl == target_list_len)
+ ret = RG_EEXCL;
+ else if (dep == target_list_len)
+ ret = RG_EDEPEND;
+
+ clulog(LOG_INFO, "Start failed; node reports: %d failures, "
+ "%d exclusive, %d dependency errors\n", fail, excl, dep);
+ return ret;
}
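
service_op_start() now tallies why each candidate node refused: if
every node returned RG_EEXCL the aggregate result is RG_EEXCL, if every
node failed a dependency check it is RG_EDEPEND, and any mixed outcome
remains RG_EFAIL. A sketch of a caller reporting that result follows;
the service_op_start() signature is assumed from the context above, and
rg_strerror() is the existing code-to-string helper used elsewhere in
rgmanager:

/* Caller-side sketch for the new aggregate return codes; the
   service_op_start() signature is assumed, not quoted. */
#include <stdio.h>

int service_op_start(char *svcName, int *target_list,
		     int target_list_len, int *new_owner);	/* assumed */
const char *rg_strerror(int err);

int
start_and_report(char *svcName, int *target_list, int target_list_len,
		 int *new_owner)
{
	int ret = service_op_start(svcName, target_list,
				   target_list_len, new_owner);

	switch (ret) {
	case 0:
		printf("%s started on node %d\n", svcName, *new_owner);
		break;
	case RG_EEXCL:		/* every node runs exclusive services */
	case RG_EDEPEND:	/* every node failed a dependency check */
	default:		/* RG_EFAIL or mixed failures */
		printf("%s not started: %s\n", svcName, rg_strerror(ret));
		break;
	}
	return ret;
}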
* [Cluster-devel] cluster/rgmanager ChangeLog include/resgroup.h ...
@ 2007-11-30 20:36 lhh
0 siblings, 0 replies; 11+ messages in thread
From: lhh @ 2007-11-30 20:36 UTC (permalink / raw)
To: cluster-devel.redhat.com
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: lhh at sourceware.org 2007-11-30 20:36:18
Modified files:
rgmanager : ChangeLog
rgmanager/include: resgroup.h reslist.h
rgmanager/src/daemons: Makefile fo_domain.c groups.c main.c
reslist.c resrules.c restree.c rg_state.c
test.c
Added files:
rgmanager/include: restart_counter.h
rgmanager/src/daemons: restart_counter.c
Log message:
Merges from RHEL5 branch - round 2.
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&r1=1.60&r2=1.61
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/restart_counter.h.diff?cvsroot=cluster&r1=1.1&r2=1.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/reslist.h.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/restart_counter.c.diff?cvsroot=cluster&r1=1.1&r2=1.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/Makefile.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/fo_domain.c.diff?cvsroot=cluster&r1=1.13&r2=1.14
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&r1=1.39&r2=1.40
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/main.c.diff?cvsroot=cluster&r1=1.44&r2=1.45
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/reslist.c.diff?cvsroot=cluster&r1=1.19&r2=1.20
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/resrules.c.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/restree.c.diff?cvsroot=cluster&r1=1.37&r2=1.38
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&r1=1.40&r2=1.41
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/test.c.diff?cvsroot=cluster&r1=1.12&r2=1.13
--- cluster/rgmanager/ChangeLog 2007/11/30 20:06:55 1.60
+++ cluster/rgmanager/ChangeLog 2007/11/30 20:36:17 1.61
@@ -1,6 +1,8 @@
2007-11-30 Lon Hohberger <lhh@redhat.com>
- * src/resources/*: Merge from RHEL5 branch.
- * src/utils/*: Merge from RHEL5 branch.
+ * src/resources/*: Merge misc. updates from RHEL5 branch.
+ * src/utils/*: Merge misc. updates from RHEL5 branch.
+ * include/*.h, src/daemons/*: Merge status-counter patch
+ from RHEL5 branch.
2007-08-30 Lon Hohberger <lhh@redhat.com>
* src/daemons/restree.c, rg_state.c: Fix tree-restart bug
--- cluster/rgmanager/include/restart_counter.h 2007/11/26 21:46:26 1.1
+++ cluster/rgmanager/include/restart_counter.h 2007/11/30 20:36:17 1.2
@@ -0,0 +1,12 @@
+#ifndef _RESTART_COUNTER_H
+#define _RESTART_COUNTER_H
+
+typedef void *restart_counter_t;
+
+int restart_add(restart_counter_t arg);
+int restart_clear(restart_counter_t arg);
+int restart_count(restart_counter_t arg);
+restart_counter_t restart_init(time_t expire_timeout, int max_restarts);
+int restart_cleanup(restart_counter_t arg);
+
+#endif
--- cluster/rgmanager/include/resgroup.h 2007/06/27 14:03:51 1.23
+++ cluster/rgmanager/include/resgroup.h 2007/11/30 20:36:17 1.24
@@ -150,6 +150,8 @@
int svc_freeze(char *svcName);
int svc_unfreeze(char *svcName);
int svc_migrate(char *svcName, int target);
+int check_restart(char *svcName);
+
int rt_enqueue_request(const char *resgroupname, int request,
msgctx_t *resp_ctx,
int max, uint32_t target, int arg0, int arg1);
--- cluster/rgmanager/include/reslist.h 2007/08/02 14:53:37 1.23
+++ cluster/rgmanager/include/reslist.h 2007/11/30 20:36:17 1.24
@@ -126,6 +126,7 @@
struct _rg_node *rn_child, *rn_parent;
resource_t *rn_resource;
resource_act_t *rn_actions;
+ restart_counter_t rn_restart_counter;
int rn_state; /* State of this instance of rn_resource */
int rn_flags;
int rn_last_status;
--- cluster/rgmanager/src/daemons/restart_counter.c 2007/11/26 21:46:27 1.1
+++ cluster/rgmanager/src/daemons/restart_counter.c 2007/11/30 20:36:17 1.2
@@ -0,0 +1,185 @@
+/*
+ Copyright Red Hat, Inc. 2007
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License version 2 as published
+ by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 675 Mass Ave, Cambridge,
+ MA 02139, USA.
+*/
+/* Time-based restart counters for rgmanager */
+
+#include <stdio.h>
+#include <list.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <time.h>
+#include <restart_counter.h>
+
+
+
+#define RESTART_INFO_MAGIC 0x184820ab
+
+typedef struct {
+ list_head();
+ time_t restart_time;
+} restart_item_t;
+
+typedef struct {
+ int magic;
+ time_t expire_timeout;
+ int max_restarts;
+ int restart_count;
+ restart_item_t *restart_nodes;
+} restart_info_t;
+
+
+#define VALIDATE(arg, ret) \
+do { \
+ if (((restart_info_t *)arg)->magic != RESTART_INFO_MAGIC) {\
+ errno = EINVAL; \
+ return ret; \
+ } \
+} while(0)
+
+
+/* Remove expired restarts */
+static int
+restart_timer_purge(restart_counter_t arg, time_t now)
+{
+ restart_info_t *restarts = (restart_info_t *)arg;
+ restart_item_t *i;
+ int x, done = 0;
+
+ VALIDATE(arg, -1);
+
+ /* No timeout */
+ if (restarts->expire_timeout == 0)
+ return 0;
+
+ do {
+ done = 1;
+ list_for(&restarts->restart_nodes, i, x) {
+ if ((now - i->restart_time) >=
+ restarts->expire_timeout) {
+ restarts->restart_count--;
+ list_remove(&restarts->restart_nodes, i);
+ done = 0;
+ break;
+ }
+ }
+ } while(!done);
+
+ return 0;
+}
+
+
+int
+restart_count(restart_counter_t arg)
+{
+ restart_info_t *restarts = (restart_info_t *)arg;
+ time_t now;
+
+ VALIDATE(arg, -1);
+ now = time(NULL);
+ restart_timer_purge(arg, now);
+ return restarts->restart_count;
+}
+
+
+/* Add a restart entry to the list. Returns 1 if restart
+ count is exceeded */
+int
+restart_add(restart_counter_t arg)
+{
+ restart_info_t *restarts = (restart_info_t *)arg;
+ restart_item_t *i;
+ time_t t;
+
+ if (!arg)
+ /* No max restarts / threshold = always
+ ok to restart! */
+ return 0;
+
+ VALIDATE(arg, -1);
+
+ i = malloc(sizeof(*i));
+ if (!i) {
+ return -1;
+ }
+
+ t = time(NULL);
+ i->restart_time = t;
+
+ list_insert(&restarts->restart_nodes, i);
+ restarts->restart_count++;
+
+ /* Check and remove old entries */
+ restart_timer_purge(restarts, t);
+
+ if (restarts->restart_count > restarts->max_restarts)
+ return 1;
+
+ return 0;
+}
+
+
+int
+restart_clear(restart_counter_t arg)
+{
+ restart_info_t *restarts = (restart_info_t *)arg;
+ restart_item_t *i;
+
+ VALIDATE(arg, -1);
+ while ((i = restarts->restart_nodes)) {
+ list_remove(&restarts->restart_nodes, i);
+ free(i);
+ }
+
+ restarts->restart_count = 0;
+
+ return 0;
+}
+
+
+restart_counter_t
+restart_init(time_t expire_timeout, int max_restarts)
+{
+ restart_info_t *info;
+
+ if (max_restarts < 0) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ info = malloc(sizeof(*info));
+ if (info == NULL)
+ return NULL;
+
+ info->magic = RESTART_INFO_MAGIC;
+ info->expire_timeout = expire_timeout;
+ info->max_restarts = max_restarts;
+ info->restart_count = 0;
+
+ return (void *)info;
+}
+
+
+int
+restart_cleanup(restart_counter_t arg)
+{
+ VALIDATE(arg, -1);
+ restart_clear(arg);
+ free(arg);
+ return 0;
+}
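
The counter is a sliding window of restart timestamps: restart_add()
appends the current time, expires entries older than expire_timeout,
and returns 1 once the count exceeds max_restarts; restart_add(NULL)
returns 0, so a node with no policy may always restart. The restree.c
hunk below attaches a counter to top-level resource nodes from the
max_restarts and restart_expire_time attributes. A short usage sketch
with illustrative values (three restarts per half hour is not a
default):

/* Usage sketch for the restart-counter API above. */
#include <time.h>
#include <restart_counter.h>

static void
restart_policy_example(void)
{
	restart_counter_t c = restart_init(1800, 3);	/* 3 per 30 min */

	if (restart_add(c) > 0) {
		/* Threshold exceeded: relocate the service and reset
		   the window, as check_restart() does below. */
		restart_clear(c);
	}

	if (c)
		restart_cleanup(c);	/* restart_clear() + free() */
}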
--- cluster/rgmanager/src/daemons/Makefile 2007/08/28 04:35:47 1.23
+++ cluster/rgmanager/src/daemons/Makefile 2007/11/30 20:36:17 1.24
@@ -31,12 +31,14 @@
rg_queue.o \
rg_state.o \
rg_thread.o \
+ restart_counter.o \
watchdog.o
OBJS2= clurmtabd.o \
clurmtabd_lib.o
-OBJS3= test-noccs.o
+OBJS3= test-noccs.o \
+ restart_counter.o
OBJS4= dtest-noccs.o
--- cluster/rgmanager/src/daemons/fo_domain.c 2007/03/20 17:09:57 1.13
+++ cluster/rgmanager/src/daemons/fo_domain.c 2007/11/30 20:36:17 1.14
@@ -27,6 +27,7 @@
#include <list.h>
#include <clulog.h>
#include <resgroup.h>
+#include <restart_counter.h>
#include <reslist.h>
#include <ccs.h>
#include <pthread.h>
--- cluster/rgmanager/src/daemons/groups.c 2007/08/02 14:53:38 1.39
+++ cluster/rgmanager/src/daemons/groups.c 2007/11/30 20:36:17 1.40
@@ -20,6 +20,7 @@
//#define DEBUG
#include <platform.h>
#include <resgroup.h>
+#include <restart_counter.h>
#include <reslist.h>
#include <vf.h>
#include <message.h>
@@ -179,6 +180,29 @@
}
+resource_node_t *
+node_by_ref(resource_node_t **tree, char *name)
+{
+ resource_t *res;
+ resource_node_t *node, *ret = NULL;
+ char rgname[64];
+ int x;
+
+ list_for(&_tree, node, x) {
+
+ res = node->rn_resource;
+ res_build_name(rgname, sizeof(rgname), res);
+
+ if (!strcasecmp(name, rgname)) {
+ ret = node;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+
int
count_resource_groups_local(cman_node_t *mp)
{
@@ -1587,6 +1611,28 @@
}
+int
+check_restart(char *rg_name)
+{
+ resource_node_t *node;
+ int ret = 1;
+
+ pthread_rwlock_rdlock(&resource_lock);
+ node = node_by_ref(&_tree, rg_name);
+ if (node) {
+ ret = restart_add(node->rn_restart_counter);
+ if (ret) {
+ /* Clear it out - caller is about
+ to relocate the service anyway */
+ restart_clear(node->rn_restart_counter);
+ }
+ }
+ pthread_rwlock_unlock(&resource_lock);
+
+ return ret;
+}
+
+
void
kill_resource_groups(void)
{
--- cluster/rgmanager/src/daemons/main.c 2007/09/19 09:54:19 1.44
+++ cluster/rgmanager/src/daemons/main.c 2007/11/30 20:36:17 1.45
@@ -166,6 +166,7 @@
old_membership = member_list();
new_ml = get_member_list(h);
+ memb_mark_down(new_ml, 0);
for (x = 0; x < new_ml->cml_count; x++) {
@@ -182,19 +183,25 @@
quorate = cman_is_listening(h,
new_ml->cml_members[x].cn_nodeid,
port);
+
if (quorate == 0) {
clulog(LOG_DEBUG, "Node %d is not listening\n",
new_ml->cml_members[x].cn_nodeid);
new_ml->cml_members[x].cn_member = 0;
} else if (quorate < 0) {
+ if (errno == ENOTCONN) {
+ new_ml->cml_members[x].cn_member = 0;
+ break;
+ }
perror("cman_is_listening");
usleep(50000);
continue;
}
-
#ifdef DEBUG
- printf("Node %d IS listening\n",
- new_ml->cml_members[x].cn_nodeid);
+ else {
+ printf("Node %d IS listening\n",
+ new_ml->cml_members[x].cn_nodeid);
+ }
#endif
break;
} while(1);
@@ -202,7 +209,6 @@
cman_finish(h);
member_list_update(new_ml);
- member_set_state(0, 0); /* Mark qdisk as dead */
/*
* Handle nodes lost. Do our local node event first.
--- cluster/rgmanager/src/daemons/reslist.c 2007/07/31 18:00:25 1.19
+++ cluster/rgmanager/src/daemons/reslist.c 2007/11/30 20:36:17 1.20
@@ -26,6 +26,7 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <list.h>
+#include <restart_counter.h>
#include <reslist.h>
#include <pthread.h>
#ifndef NO_CCS
--- cluster/rgmanager/src/daemons/resrules.c 2007/07/31 18:00:25 1.23
+++ cluster/rgmanager/src/daemons/resrules.c 2007/11/30 20:36:17 1.24
@@ -27,6 +27,8 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <list.h>
+#include <ctype.h>
+#include <restart_counter.h>
#include <reslist.h>
#include <pthread.h>
#include <dirent.h>
@@ -230,43 +232,70 @@
int
-expand_time(char *val)
+expand_time (char *val)
{
- int l = strlen(val);
- char c = val[l - 1];
- int ret = atoi(val);
+ int curval, len;
+ int ret = 0;
+ char *start = val, ival[16];
- if (ret <= 0)
- return 0;
+ if (!val)
+ return (time_t)0;
+
+ while (start[0]) {
+
+ len = 0;
+ curval = 0;
+ memset(ival, 0, sizeof(ival));
+
+ while (isdigit(start[len])) {
+ ival[len] = start[len];
+ len++;
+ }
+
+ if (len) {
+ curval = atoi(ival);
+ } else {
+ len = 1;
+ }
- if ((c >= '0') && (c <= '9'))
- return ret;
+ switch(start[len]) {
+ case 0:
+ case 'S':
+ case 's':
+ break;
+ case 'M':
+ case 'm':
+ curval *= 60;
+ break;
+ case 'h':
+ case 'H':
+ curval *= 3600;
+ break;
+ case 'd':
+ case 'D':
+ curval *= 86400;
+ break;
+ case 'w':
+ case 'W':
+ curval *= 604800;
+ break;
+ case 'y':
+ case 'Y':
+ curval *= 31536000;
+ break;
+ default:
+ curval = 0;
+ }
- switch(c) {
- case 'S':
- case 's':
- return (ret);
- case 'M':
- case 'm':
- return (ret * 60);
- case 'h':
- case 'H':
- return (ret * 3600);
- case 'd':
- case 'D':
- return (ret * 86400);
- case 'w':
- case 'W':
- return (ret * 604800);
- case 'y':
- case 'Y':
- return (ret * 31536000);
+ ret += (time_t)curval;
+ start += len;
}
return ret;
}
+
/**
* Store a resource action
* @param actsp Action array; may be modified and returned!
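
The rewritten expand_time() walks the value segment by segment, so
concatenated units now accumulate instead of only a single trailing
suffix being honored. Some illustrative results implied by the unit
table above (these checks are not part of the commit):

#include <assert.h>

int expand_time(char *val);	/* from resrules.c above */

static void
expand_time_examples(void)
{
	assert(expand_time("30") == 30);	/* bare number: seconds */
	assert(expand_time("90s") == 90);
	assert(expand_time("1h30m") == 5400);	/* segments accumulate */
	assert(expand_time("2d") == 172800);
}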
--- cluster/rgmanager/src/daemons/restree.c 2007/08/30 16:09:39 1.37
+++ cluster/rgmanager/src/daemons/restree.c 2007/11/30 20:36:17 1.38
@@ -30,6 +30,7 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <list.h>
+#include <restart_counter.h>
#include <reslist.h>
#include <pthread.h>
#include <clulog.h>
@@ -432,6 +433,39 @@
}
+static inline void
+assign_restart_policy(resource_t *curres, resource_node_t *parent,
+ resource_node_t *node)
+{
+ char *val;
+ int max_restarts = 0;
+ time_t restart_expire_time = 0;
+
+ node->rn_restart_counter = NULL;
+
+ if (!curres || !node)
+ return;
+ if (parent) /* Non-parents don't get one for now */
+ return;
+
+ val = res_attr_value(curres, "max_restarts");
+ if (!val)
+ return;
+ max_restarts = atoi(val);
+ if (max_restarts <= 0)
+ return;
+ val = res_attr_value(curres, "restart_expire_time");
+ if (val) {
+ restart_expire_time = (time_t)expand_time(val);
+ if (!restart_expire_time)
+ return;
+ }
+
+ node->rn_restart_counter = restart_init(restart_expire_time,
+ max_restarts);
+}
+
+
static inline int
do_load_resource(int ccsfd, char *base,
resource_rule_t *rule,
@@ -514,6 +548,7 @@
node->rn_state = RES_STOPPED;
node->rn_flags = 0;
node->rn_actions = (resource_act_t *)act_dup(curres->r_actions);
+ assign_restart_policy(curres, parent, node);
snprintf(tok, sizeof(tok), "%s/@__independent_subtree", base);
#ifndef NO_CCS
@@ -769,6 +804,11 @@
destroy_resource_tree(&(*tree)->rn_child);
list_remove(tree, node);
+
+ if (node->rn_restart_counter) {
+ restart_cleanup(node->rn_restart_counter);
+ }
+
if(node->rn_actions){
free(node->rn_actions);
}
--- cluster/rgmanager/src/daemons/rg_state.c 2007/08/30 16:09:39 1.40
+++ cluster/rgmanager/src/daemons/rg_state.c 2007/11/30 20:36:18 1.41
@@ -1350,7 +1350,8 @@
}
if ((svcStatus.rs_state != RG_STATE_STOPPING) &&
- (svcStatus.rs_state != RG_STATE_ERROR)) {
+ (svcStatus.rs_state != RG_STATE_ERROR) &&
+ (svcStatus.rs_state != RG_STATE_RECOVER)) {
rg_unlock(&lockp);
return 0;
}
@@ -1829,8 +1830,10 @@
* We got sent here from handle_start_req.
* We're DONE.
*/
- if (request == RG_START_RECOVER)
+ if (request == RG_START_RECOVER) {
+ _svc_stop_finish(svcName, 0, RG_STATE_STOPPED);
return RG_EFAIL;
+ }
/*
* All potential places for the service to start have been exhausted.
@@ -1839,7 +1842,7 @@
exhausted:
if (!rg_locked()) {
clulog(LOG_WARNING,
- "#70: Attempting to restart service %s locally.\n",
+ "#70: Failed to relocate %s; restarting locally\n",
svcName);
if (svc_start(svcName, RG_START_RECOVER) == 0) {
*new_owner = me;
@@ -2078,6 +2081,14 @@
new_owner);
}
+ /* Check restart counter/timer for this resource */
+ if (check_restart(svcName) > 0) {
+ clulog(LOG_NOTICE, "Restart threshold for %s exceeded; "
+ "attempting to relocate\n", svcName);
+ return handle_relocate_req(svcName, RG_START_RECOVER, -1,
+ new_owner);
+ }
+
return handle_start_req(svcName, RG_START_RECOVER, new_owner);
}
--- cluster/rgmanager/src/daemons/test.c 2007/07/31 18:02:49 1.12
+++ cluster/rgmanager/src/daemons/test.c 2007/11/30 20:36:18 1.13
@@ -25,6 +25,7 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <list.h>
+#include <restart_counter.h>
#include <reslist.h>
#include <pthread.h>
#include <depends.h>
* [Cluster-devel] cluster/rgmanager ChangeLog include/resgroup.h ...
@ 2007-06-27 14:03 lhh
0 siblings, 0 replies; 11+ messages in thread
From: lhh @ 2007-06-27 14:03 UTC (permalink / raw)
To: cluster-devel.redhat.com
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: lhh at sourceware.org 2007-06-27 14:03:52
Modified files:
rgmanager : ChangeLog
rgmanager/include: resgroup.h reslist.h
rgmanager/src/clulib: rg_strings.c
rgmanager/src/daemons: groups.c main.c nodeevent.c restree.c
rg_state.c rg_thread.c test.c
rgmanager/src/resources: vm.sh
Log message:
Merge from RHEL5 branch
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&r1=1.48&r2=1.49
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&r1=1.22&r2=1.23
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/reslist.h.diff?cvsroot=cluster&r1=1.20&r2=1.21
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/rg_strings.c.diff?cvsroot=cluster&r1=1.8&r2=1.9
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&r1=1.33&r2=1.34
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/main.c.diff?cvsroot=cluster&r1=1.39&r2=1.40
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/nodeevent.c.diff?cvsroot=cluster&r1=1.6&r2=1.7
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/restree.c.diff?cvsroot=cluster&r1=1.33&r2=1.34
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&r1=1.35&r2=1.36
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_thread.c.diff?cvsroot=cluster&r1=1.21&r2=1.22
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/test.c.diff?cvsroot=cluster&r1=1.9&r2=1.10
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/vm.sh.diff?cvsroot=cluster&r1=1.4&r2=1.5
--- cluster/rgmanager/ChangeLog 2007/06/21 18:39:08 1.48
+++ cluster/rgmanager/ChangeLog 2007/06/27 14:03:51 1.49
@@ -1,3 +1,30 @@
+2007-06-27 Lon Hohberger <lhh@redhat.com>
+ * Merge from RHEL5 branch.
+ * src/daemons/vm.sh: Un-break migrate (#231692). Make status
+ checks happen every 30 seconds instead of 30 minutes.
+ * include/resgroup.h: Move inline recovery flags to a header file,
+ add RG_STATUS_INQUIRY for locating virtual machines which may have
+ migrated.
+ * include/reslist.h: Change res_exec() back to using agent_op_str()
+ inline so we can squelch errors while performing RG_STATUS_INQUIRY
+ * src/clulib/rg_strings.c: Add new strings for new error code /
+ request types
+ * src/daemons/groups.c: Change group_migrate() to use the correct
+ calling semantics
+ * src/daemons/main.c, nodeevent.c: Clean up cases which could cause
+ #244143
+ * src/daemons/resrules.c: Clear up noise
+ * src/daemons/restree.c: Squelch errors during RG_STATUS_INQUIRY
+ Patch up inline service recovery (#229650)
+ * src/daemons/rg_state.c: Don't let migrations or relocations to a
+ node running exclusive services occur in the first place and return
+ a useful error. (goes with #237144). Locate virtual machines (or
+ generally, services with the 'migrate' ability) elsewhere in the
+ cluster prior to trying to start one. Detect if someone migrates
+ such a service without using the cluster tools (#232300)
+ * src/daemons/test.c: Make rg_test do the right thing for migrate
+ operations
+
2007-06-21 Fabio M. Di Nitto <fabbione@ubuntu.com>
* rgmanager/src/clulib/alloc.c: Undefine DEBUG when building on IA64.
The __builtin_address functionality should be taken from libunwind
--- cluster/rgmanager/include/resgroup.h 2007/06/14 19:08:57 1.22
+++ cluster/rgmanager/include/resgroup.h 2007/06/27 14:03:51 1.23
@@ -98,6 +98,7 @@
#define RG_MIGRATE 22
#define RG_FREEZE 23
#define RG_UNFREEZE 24
+#define RG_STATUS_INQUIRY 25
#define RG_NONE 999
const char *rg_req_str(int req);
@@ -143,6 +144,7 @@
int svc_start(char *svcName, int req);
int svc_stop(char *svcName, int error);
int svc_status(char *svcName);
+int svc_status_inquiry(char *svcName);
int svc_disable(char *svcName);
int svc_fail(char *svcName);
int svc_freeze(char *svcName);
@@ -188,6 +190,8 @@
int my_id(void);
/* Return codes */
+#define RG_EFENCE -13 /* Fencing operation pending */
+#define RG_ENODE -12 /* Node is dead/nonexistent */
#define RG_EFROZEN -11 /* Service is frozen */
#define RG_ERUN -10 /* Service is already running */
#define RG_EQUORUM -9 /* Operation requires quorum */
@@ -221,6 +225,12 @@
#define FOD_RESTRICTED (1<<1)
#define FOD_NOFAILBACK (1<<2)
+/*
+ Status tree flags
+ */
+#define SFL_FAILURE (1<<0)
+#define SFL_RECOVERABLE (1<<1)
+
//#define DEBUG
#ifdef DEBUG
--- cluster/rgmanager/include/reslist.h 2007/05/31 19:08:14 1.20
+++ cluster/rgmanager/include/reslist.h 2007/06/27 14:03:51 1.21
@@ -144,7 +144,7 @@
int res_status(resource_node_t **tree, resource_t *res, void *ret);
int res_condstart(resource_node_t **tree, resource_t *res, void *ret);
int res_condstop(resource_node_t **tree, resource_t *res, void *ret);
-int res_exec(resource_node_t *node, const char *op, const char *arg, int depth);
+int res_exec(resource_node_t *node, int op, const char *arg, int depth);
/*int res_resinfo(resource_node_t **tree, resource_t *res, void *ret);*/
int expand_time(char *val);
int store_action(resource_act_t **actsp, char *name, int depth, int timeout, int interval);
--- cluster/rgmanager/src/clulib/rg_strings.c 2007/04/27 18:10:10 1.8
+++ cluster/rgmanager/src/clulib/rg_strings.c 2007/06/27 14:03:51 1.9
@@ -26,6 +26,8 @@
const struct string_val rg_error_strings[] = {
+ { RG_EFENCE, "Fencing operation pending; try again later" },
+ { RG_ENODE, "Target node dead / nonexistent" },
{ RG_ERUN, "Service is already running" },
{ RG_EQUORUM, "Operation requires quorum" },
{ RG_EINVAL, "Invalid operation for resource" },
@@ -68,6 +70,7 @@
{RG_UNLOCK, "unlocking"},
{RG_QUERY_LOCK, "lock status inquiry"},
{RG_MIGRATE, "migrate"},
+ {RG_STATUS_INQUIRY, "out of band service status inquiry"},
{RG_NONE, "none"},
{0, NULL}
};
@@ -182,5 +185,6 @@
const char *
agent_op_str(int val)
{
+ printf("searching agent_ops for %d\n", val);
return rg_search_table(agent_ops, val);
}
--- cluster/rgmanager/src/daemons/groups.c 2007/05/31 19:08:14 1.33
+++ cluster/rgmanager/src/daemons/groups.c 2007/06/27 14:03:51 1.34
@@ -896,7 +896,7 @@
}
clulog(LOG_NOTICE, "Migrating %s to %s\n", groupname, tgt_name);
- ret = res_exec(rn, agent_op_str(RS_MIGRATE), tgt_name, 0);
+ ret = res_exec(rn, RS_MIGRATE, tgt_name, 0);
if (ret == 0) {
clulog(LOG_NOTICE,
"Migration of %s to %s completed\n",
--- cluster/rgmanager/src/daemons/main.c 2007/06/14 15:06:51 1.39
+++ cluster/rgmanager/src/daemons/main.c 2007/06/27 14:03:51 1.40
@@ -617,10 +617,12 @@
clulog(LOG_WARNING, "#67: Shutting down uncleanly\n");
rg_set_inquorate();
rg_doall(RG_INIT, 1, "Emergency stop of %s");
+ rg_set_uninitialized();
#if defined(LIBCMAN_VERSION) && LIBCMAN_VERSION >= 2
/* cman_replyto_shutdown() */
#endif
- exit(0);
+ running = 0;
+ break;
}
return ret;
@@ -700,6 +702,9 @@
}
}
+ if (!running)
+ return 0;
+
if (need_reconfigure || check_config_update()) {
need_reconfigure = 0;
configure_logging(-1, 0);
@@ -985,7 +990,8 @@
}
}
- cleanup(cluster_ctx);
+ if (rg_initialized())
+ cleanup(cluster_ctx);
clulog(LOG_NOTICE, "Shutdown complete, exiting\n");
clu_lock_finished(rgmanager_lsname);
cman_finish(clu);
--- cluster/rgmanager/src/daemons/nodeevent.c 2007/03/27 19:33:20 1.6
+++ cluster/rgmanager/src/daemons/nodeevent.c 2007/06/27 14:03:51 1.7
@@ -72,8 +72,10 @@
if (local) {
/* Local Node Event */
- if (nodeStatus == 0)
+ if (nodeStatus == 0) {
+ clulog(LOG_ERR, "Exiting uncleanly\n");
hard_exit();
+ }
if (!rg_initialized()) {
if (init_resource_groups(0) != 0) {
--- cluster/rgmanager/src/daemons/restree.c 2007/06/13 20:32:41 1.33
+++ cluster/rgmanager/src/daemons/restree.c 2007/06/27 14:03:51 1.34
@@ -39,10 +39,6 @@
void malloc_zap_mutex(void);
#endif
-#define FL_FAILURE 0x1
-#define FL_RECOVERABLE 0x2
-
-
/* XXX from resrules.c */
int store_childtype(resource_child_t **childp, char *name, int start,
int stop, int forbid, int flags);
@@ -335,12 +331,13 @@
@see build_env
*/
int
-res_exec(resource_node_t *node, const char *op, const char *arg, int depth)
+res_exec(resource_node_t *node, int op, const char *arg, int depth)
{
int childpid, pid;
int ret = 0;
char **env = NULL;
resource_t *res = node->rn_resource;
+ const char *op_str = agent_op_str(op);
char fullpath[2048];
if (!res->r_rule->rr_agent)
@@ -354,7 +351,7 @@
#ifdef NO_CCS
if (_no_op_mode_) {
- printf("[%s] %s:%s\n", op, res->r_rule->rr_type,
+ printf("[%s] %s:%s\n", op_str, res->r_rule->rr_type,
res->r_attrs->ra_value);
return 0;
}
@@ -392,9 +389,9 @@
restore_signals();
if (arg)
- execle(fullpath, fullpath, op, arg, NULL, env);
+ execle(fullpath, fullpath, op_str, arg, NULL, env);
else
- execle(fullpath, fullpath, op, NULL, env);
+ execle(fullpath, fullpath, op_str, NULL, env);
}
#ifdef DEBUG
@@ -411,10 +408,16 @@
ret = WEXITSTATUS(ret);
+#ifndef NO_CCS
+ if ((op == RS_STATUS &&
+ node->rn_state == RES_STARTED && ret) ||
+ (op != RS_STATUS && ret)) {
+#else
if (ret) {
+#endif
clulog(LOG_NOTICE,
"%s on %s \"%s\" returned %d (%s)\n",
- op, res->r_rule->rr_type,
+ op_str, res->r_rule->rr_type,
res->r_attrs->ra_value, ret,
ocf_strerror(ret));
}
@@ -864,7 +867,7 @@
rule->rr_childtypes[x].rc_name,
ret, op);
- if (rv & FL_FAILURE && op != RS_STOP)
+ if (rv & SFL_FAILURE && op != RS_STOP)
return rv;
}
@@ -911,7 +914,7 @@
list_for(&node->rn_child, child, y) {
rv |= _xx_child_internal(node, first, child, ret, op);
- if (rv & FL_FAILURE)
+ if (rv & SFL_FAILURE)
return rv;
}
} else {
@@ -957,7 +960,7 @@
if (op == RS_START || op == RS_STATUS) {
rv = _do_child_levels(tree, first, ret, op);
- if (rv & FL_FAILURE)
+ if (rv & SFL_FAILURE)
return rv;
/* Start default level after specified ones */
@@ -1016,12 +1019,6 @@
if (strcmp(node->rn_actions[x].ra_name, "status"))
continue;
- /* If a status check has never been done, reset its status. */
- if (!node->rn_actions[x].ra_last) {
- node->rn_actions[x].ra_last = now;
- continue;
- }
-
delta = now - node->rn_actions[x].ra_last;
/*
@@ -1067,7 +1064,8 @@
node->rn_actions[idx].ra_depth,
(int)node->rn_actions[idx].ra_interval);*/
- if ((x = res_exec(node, agent_op_str(RS_STATUS), NULL,
+ node->rn_actions[idx].ra_last = now;
+ if ((x = res_exec(node, RS_STATUS, NULL,
node->rn_actions[idx].ra_depth)) == 0)
return 0;
@@ -1075,7 +1073,7 @@
return x;
/* Strange/failed status. Try to recover inline. */
- if ((x = res_exec(node, agent_op_str(RS_RECOVER), NULL, 0)) == 0)
+ if ((x = res_exec(node, RS_RECOVER, NULL, 0)) == 0)
return 0;
return x;
@@ -1163,7 +1161,7 @@
char *type, void *__attribute__((unused))ret, int realop,
resource_node_t *node)
{
- int rv, me, op;
+ int rv = 0, me, op;
/* Restore default operation. */
op = realop;
@@ -1217,10 +1215,10 @@
if (me && (op == RS_START)) {
node->rn_flags &= ~RF_NEEDSTART;
- rv = res_exec(node, agent_op_str(op), NULL, 0);
+ rv = res_exec(node, op, NULL, 0);
if (rv != 0) {
node->rn_state = RES_FAILED;
- return FL_FAILURE;
+ return SFL_FAILURE;
}
set_time("start", 0, node);
@@ -1248,9 +1246,9 @@
resources of this node must be restarted,
but siblings of this node are not affected. */
if (node->rn_flags & RF_INDEPENDENT)
- return FL_RECOVERABLE;
+ return SFL_RECOVERABLE;
- return FL_FAILURE;
+ return SFL_FAILURE;
}
}
@@ -1266,20 +1264,20 @@
does not matter: its dependent children must
also be independent of this node's siblings. */
if (node->rn_flags & RF_INDEPENDENT)
- return FL_RECOVERABLE;
+ return SFL_RECOVERABLE;
- return FL_FAILURE;
+ return SFL_FAILURE;
}
}
/* Stop should occur after children have stopped */
if (me && (op == RS_STOP)) {
node->rn_flags &= ~RF_NEEDSTOP;
- rv = res_exec(node, agent_op_str(op), NULL, 0);
+ rv = res_exec(node, op, NULL, 0);
if (rv != 0) {
node->rn_state = RES_FAILED;
- return FL_FAILURE;
+ return SFL_FAILURE;
}
if (node->rn_state != RES_STOPPED) {
@@ -1292,7 +1290,7 @@
//node->rn_resource->r_rule->rr_type,
//primary_attr_value(node->rn_resource));
- return 0;
+ return rv;
}
@@ -1332,12 +1330,12 @@
/* If we hit a problem during a 'status' op in an
independent subtree, rv will have the
- FL_RECOVERABLE bit set, but not FL_FAILURE.
- If we ever hit FL_FAILURE during a status
+ SFL_RECOVERABLE bit set, but not SFL_FAILURE.
+ If we ever hit SFL_FAILURE during a status
operation, we're *DONE* - even if the subtree
is flagged w/ indy-subtree */
- if (rv & FL_FAILURE)
+ if (rv & SFL_FAILURE)
return rv;
}
}
@@ -1411,33 +1409,7 @@
int
res_status(resource_node_t **tree, resource_t *res, void *ret)
{
- int rv;
- rv = _res_op(tree, res, NULL, ret, RS_STATUS);
-
- if (rv == 0)
- return 0;
-
- if (rv & FL_FAILURE)
- return rv;
-
- clulog(LOG_WARNING, "Some independent resources in %s:%s failed; "
- "Attempting inline recovery\n",
- res->r_rule->rr_type, res->r_attrs->ra_value);
-
- rv = res_condstop(tree, res, ret);
- if (rv & FL_FAILURE)
- goto out_fail;
- rv = res_condstart(tree, res, ret);
- if (rv & FL_FAILURE)
- goto out_fail;
-
- clulog(LOG_NOTICE, "Inline recovery of %s:%s successful\n",
- res->r_rule->rr_type, res->r_attrs->ra_value);
- return 0;
-out_fail:
- clulog(LOG_WARNING, "Inline recovery of %s:%s failed\n",
- res->r_rule->rr_type, res->r_attrs->ra_value);
- return 1;
+ return _res_op(tree, res, NULL, ret, RS_STATUS);
}
--- cluster/rgmanager/src/daemons/rg_state.c 2007/06/25 16:49:28 1.35
+++ cluster/rgmanager/src/daemons/rg_state.c 2007/06/27 14:03:51 1.36
@@ -36,6 +36,10 @@
#include <rg_queue.h>
#include <msgsimple.h>
+/* XXX - copied :( */
+#define cn_svccount cn_address.cna_address[0] /* Theses are uint8_t size */
+#define cn_svcexcl cn_address.cna_address[1]
+
int node_should_start_safe(uint32_t, cluster_member_list_t *, char *);
int next_node_id(cluster_member_list_t *membership, int me);
@@ -50,6 +54,10 @@
int group_migratory(char *servicename, int lock);
int have_exclusive_resources(void);
int check_exclusive_resources(cluster_member_list_t *membership, char *svcName);
+static int msvc_check_cluster(char *svcName);
+static inline int handle_started_status(char *svcName, int ret, rg_state_t *svcStatus);
+static inline int handle_migrate_status(char *svcName, int ret, rg_state_t *svcStatus);
+int count_resource_groups_local(cman_node_t *mp);
int
@@ -837,10 +845,27 @@
struct dlm_lksb lockp;
rg_state_t svcStatus;
int ret;
+ cluster_member_list_t *membership;
+ cman_node_t *m;
if (!group_migratory(svcName, 1))
return RG_EINVAL;
+ membership = member_list();
+ m = memb_id_to_p(membership, target);
+ if (!m) {
+ free_member_list(membership);
+ return RG_EINVAL;
+ }
+
+ count_resource_groups_local(m);
+ if (m->cn_svcexcl) {
+ free_member_list(membership);
+ return RG_EDEPEND;
+ }
+ free_member_list(membership);
+
+
if (rg_lock(svcName, &lockp) < 0) {
clulog(LOG_ERR, "#45: Unable to obtain cluster lock: %s\n",
strerror(errno));
@@ -905,6 +930,129 @@
/**
+ * Ask the other nodes if they've seen this service. This can be used
+ * to allow users the ability to use non-rgmanager tools to migrate
+ * a virtual machine to another node in the cluster.
+ *
+ * Returns the node ID of the new owner, if any. -1 if no one in the
+ * cluster has seen the service.
+ */
+int
+get_new_owner(char *svcName)
+{
+ SmMessageSt msgp, response;
+ msgctx_t ctx;
+ cluster_member_list_t *membership;
+ int x, ret = -1, me = my_id();
+
+ /* Build message */
+ msgp.sm_hdr.gh_magic = GENERIC_HDR_MAGIC;
+ msgp.sm_hdr.gh_command = RG_ACTION_REQUEST;
+ msgp.sm_hdr.gh_arg1 = RG_STATUS_INQUIRY;
+ msgp.sm_hdr.gh_length = sizeof(msgp);
+ msgp.sm_data.d_action = RG_STATUS_INQUIRY;
+ strncpy(msgp.sm_data.d_svcName, svcName,
+ sizeof(msgp.sm_data.d_svcName));
+ msgp.sm_data.d_svcOwner = 0;
+ msgp.sm_data.d_ret = 0;
+
+ swab_SmMessageSt(&msgp);
+
+ membership = member_list();
+ for (x = 0; x < membership->cml_count && ret < 0; x++) {
+
+ /* don't query down members */
+ if (!membership->cml_members[x].cn_member)
+ continue;
+ /* don't query self */
+ if (membership->cml_members[x].cn_nodeid == me)
+ continue;
+
+ if (msg_open(MSG_CLUSTER, membership->cml_members[x].cn_nodeid,
+ RG_PORT, &ctx, 2) < 0) {
+ /* failed to open: better to claim false successful
+ status rather than claim a failure and possibly
+ end up with a service on >1 node */
+ goto out;
+ }
+
+ msg_send(&ctx, &msgp, sizeof(msgp));
+ msg_receive(&ctx, &response, sizeof (response), 5);
+
+ swab_SmMessageSt(&response);
+ if (response.sm_data.d_ret == RG_SUCCESS)
+ ret = response.sm_data.d_svcOwner;
+ else
+ ret = -1;
+
+ msg_close(&ctx);
+ }
+
+out:
+ free_member_list(membership);
+
+ return ret;
+}
+
+
+/**
+ If a service is 'migratory' - that is, it has the 'migratory' attribute
+ and has no children, this will query other nodes in the cluster, checking
+ to see if the service has migrated to that node using a status inquiry
+ message. Note that this is a very inefficient thing to do; it would be
+ much, much better to simply use the cluster tools to migrate rather than
+ using the standard management tools for the service/virtual machine.
+ */
+static int
+msvc_check_cluster(char *svcName)
+{
+ struct dlm_lksb lockp;
+ int newowner;
+ rg_state_t svcStatus;
+
+ if (!group_migratory(svcName, 1))
+ return -1;
+
+ newowner = get_new_owner(svcName);
+ if (newowner < 0) {
+ clulog(LOG_DEBUG, "No other nodes have seen %s\n", svcName);
+ return -1;
+ }
+
+ /* New owner found */
+ clulog(LOG_NOTICE, "Migration: %s is running on %d\n", svcName, newowner);
+
+ /* If the check succeeds (returns 0), then flip the state back to
+ 'started' - with a new owner */
+ if (rg_lock(svcName, &lockp) < 0) {
+ clulog(LOG_ERR, "#451: Unable to obtain cluster lock: %s\n",
+ strerror(errno));
+ return -1;
+ }
+
+ if (get_rg_state(svcName, &svcStatus) != 0) {
+ rg_unlock(&lockp);
+ clulog(LOG_ERR, "#452: Failed getting status for RG %s\n",
+ svcName);
+ return -1;
+ }
+
+ svcStatus.rs_state = RG_STATE_STARTED;
+ svcStatus.rs_owner = newowner;
+
+ if (set_rg_state(svcName, &svcStatus) != 0) {
+ rg_unlock(&lockp);
+ clulog(LOG_ERR, "#453: Failed setting status for RG %s\n",
+ svcName);
+ return -1;
+ }
+ rg_unlock(&lockp);
+
+ return newowner;
+}
+
+
+/**
* Check status of a cluster service
*
* @param svcName Service name to check.
@@ -946,14 +1094,58 @@
ret = group_op(svcName, RG_STATUS);
- /* For running services, just check the return code */
+ /* For running services, if the return code is 0, we're done*/
if (svcStatus.rs_state == RG_STATE_STARTED)
- return ret;
+ return handle_started_status(svcName, ret, &svcStatus);
+
+ return handle_migrate_status(svcName, ret, &svcStatus);
+}
+
+
+static inline int
+handle_started_status(char *svcName, int ret, rg_state_t *svcStatus)
+{
+ if (ret & SFL_FAILURE) {
+ ret = msvc_check_cluster(svcName);
+ if (ret >= 0)
+ return 1;
+ }
+
+ /* Ok, we have a recoverable service. Try to perform
+ inline recovery */
+ if (ret & SFL_RECOVERABLE) {
+
+ clulog(LOG_WARNING, "Some independent resources in %s failed; "
+ "Attempting inline recovery\n", svcName);
+ ret = group_op(svcName, RG_CONDSTOP);
+ if (!(ret & SFL_FAILURE)) {
+ ret = group_op(svcName, RG_CONDSTART);
+ }
+
+ if (ret) {
+ clulog(LOG_WARNING, "Inline recovery of %s failed\n",
+ svcName);
+ } else {
+ clulog(LOG_NOTICE,
+ "Inline recovery of %s succeeded\n",
+ svcName);
+ return 0;
+ }
+ }
+
+ return ret;
+}
+
+
+static inline int
+handle_migrate_status(char *svcName, int ret, rg_state_t *svcStatus)
+{
+ struct dlm_lksb lockp;
/* For service(s) migrating to the local node, ignore invalid
return codes.
XXX Should put a timeout on migrating services */
- if (ret < 0)
+ if (ret != 0)
return 0;
/* If the check succeeds (returns 0), then flip the state back to
@@ -964,8 +1156,8 @@
return RG_EFAIL;
}
- svcStatus.rs_state = RG_STATE_STARTED;
- if (set_rg_state(svcName, &svcStatus) != 0) {
+ svcStatus->rs_state = RG_STATE_STARTED;
+ if (set_rg_state(svcName, svcStatus) != 0) {
rg_unlock(&lockp);
clulog(LOG_ERR, "#46: Failed getting status for RG %s\n",
svcName);
@@ -1417,8 +1609,10 @@
int *new_owner)
{
cluster_member_list_t *allowed_nodes, *backup = NULL;
+ cman_node_t *m;
int target = preferred_target, me = my_id();
int ret, x;
+ rg_state_t svcStatus;
/*
* Stop the service - if we haven't already done so.
@@ -1436,9 +1630,22 @@
return RG_EFORWARD;
}
- if (preferred_target >= 0) {
+ if (preferred_target > 0) {
allowed_nodes = member_list();
+ m = memb_id_to_p(allowed_nodes, preferred_target);
+ if (!m) {
+ free_member_list(allowed_nodes);
+ return RG_EINVAL;
+ }
+
+ /* Avoid even bothering the other node if we can */
+ count_resource_groups_local(m);
+ if (m->cn_svcexcl) {
+ free_member_list(allowed_nodes);
+ return RG_EDEPEND;
+ }
+
/*
Mark everyone except me and the preferred target DOWN for now
If we can't start it on the preferred target, then we'll try
@@ -1472,7 +1679,6 @@
if (target == me && me != preferred_target)
goto exhausted;
-
if (target == me) {
/*
Relocate to self. Don't send a network request
@@ -1508,7 +1714,7 @@
//count_resource_groups(allowed_nodes);
}
- if (preferred_target >= 0)
+ if (preferred_target > 0)
memb_mark_down(allowed_nodes, preferred_target);
memb_mark_down(allowed_nodes, me);
@@ -1517,7 +1723,16 @@
if (target == me)
goto exhausted;
- switch (relocate_service(svcName, request, target)) {
+ ret = relocate_service(svcName, request, target);
+ switch (ret) {
+ case RG_ERUN:
+ /* Someone stole the service while we were
+ trying to relo it */
+ get_rg_state_local(svcName, &svcStatus);
+ *new_owner = svcStatus.rs_owner;
+ free_member_list(allowed_nodes);
+ return 0;
+ case RG_EDEPEND:
case RG_EFAIL:
memb_mark_down(allowed_nodes, target);
continue;
@@ -1525,12 +1740,17 @@
svc_report_failure(svcName);
free_member_list(allowed_nodes);
return RG_EFAIL;
+ default:
+ /* deliberate fallthrough */
+ clulog(LOG_ERR,
+ "#61: Invalid reply from member %d during"
+ " relocate operation!\n", target);
case RG_NO:
/* state uncertain */
free_member_list(allowed_nodes);
- clulog(LOG_DEBUG, "State Uncertain: svc:%s "
- "nid:%08x req:%d\n", svcName,
- target, request);
+ clulog(LOG_CRIT, "State Uncertain: svc:%s "
+ "nid:%d req:%s ret:%d\n", svcName,
+ target, rg_req_str(request), ret);
return 0;
case 0:
*new_owner = target;
@@ -1538,10 +1758,6 @@
"on member %d\n", svcName, (int)target);
free_member_list(allowed_nodes);
return 0;
- default:
- clulog(LOG_ERR,
- "#61: Invalid reply from member %d during"
- " relocate operation!\n", target);
}
}
free_member_list(allowed_nodes);
@@ -1592,8 +1808,20 @@
handle_start_req(char *svcName, int req, int *new_owner)
{
int ret, tolerance = FOD_BEST;
- cluster_member_list_t *membership = member_list();
- int need_check = have_exclusive_resources();
+ cluster_member_list_t *membership;
+ int need_check, actual_failure = 0;
+
+ /* When we get an enable req. for a migratory service,
+ check other nodes to see if they are already running
+ said service - and ignore failover domain constraints
+ */
+ if ((ret = msvc_check_cluster(svcName)) >= 0) {
+ *new_owner = ret;
+ return RG_SUCCESS;
+ }
+
+ need_check = have_exclusive_resources();
+ membership = member_list();
/*
* When a service request is from a user application (eg, clusvcadm),
@@ -1672,14 +1900,16 @@
*/
return RG_EABORT;
}
+ actual_failure = 1;
relocate:
/*
* OK, it failed to start - but succeeded to stop. Now,
* we should relocate the service.
*/
- clulog(LOG_WARNING, "#71: Relocating failed service %s\n",
- svcName);
+ if (actual_failure)
+ clulog(LOG_WARNING, "#71: Relocating failed service %s\n",
+ svcName);
ret = handle_relocate_req(svcName, RG_START_RECOVER, -1, new_owner);
/* If we leave the service stopped, instead of disabled, someone
@@ -1780,46 +2010,56 @@
return handle_start_req(svcName, RG_START_RECOVER, new_owner);
}
+
int
handle_fd_start_req(char *svcName, int request, int *new_owner)
{
- cluster_member_list_t *allowed_nodes;
- int target, me = my_id();
- int ret;
-
- allowed_nodes = member_list();
-
- while (memb_count(allowed_nodes)) {
- target = best_target_node(allowed_nodes, -1,
- svcName, 1);
- if (target == me) {
- ret = handle_start_remote_req(svcName, request);
- } else if (target < 0) {
- free_member_list(allowed_nodes);
- return RG_EFAIL;
- } else {
- ret = relocate_service(svcName, request, target);
- }
-
- switch(ret) {
- case RG_ESUCCESS:
- return RG_ESUCCESS;
- case RG_ERUN:
- return RG_ERUN;
- case RG_EFAIL:
- memb_mark_down(allowed_nodes, target);
- continue;
- case RG_EABORT:
- svc_report_failure(svcName);
- free_member_list(allowed_nodes);
- return RG_EFAIL;
- default:
- clulog(LOG_ERR,
- "#6X: Invalid reply [%d] from member %d during"
- " relocate operation!\n", ret, target);
- }
- }
+ cluster_member_list_t *allowed_nodes;
+ int target, me = my_id();
+ int ret = RG_EFAIL;
+
+ /* When we get an enable req. for a migratory service,
+ check other nodes to see if they are already running
+ said service - and ignore failover domain constraints
+ */
+ if ((ret = msvc_check_cluster(svcName)) >= 0) {
+ *new_owner = ret;
+ return RG_SUCCESS;
+ }
+
+ allowed_nodes = member_list();
- free_member_list(allowed_nodes);
- return RG_EFAIL;
+ while (memb_count(allowed_nodes)) {
+ target = best_target_node(allowed_nodes, -1,
+ svcName, 1);
+ if (target == me) {
+ ret = handle_start_remote_req(svcName, request);
+ } else if (target < 0) {
+ free_member_list(allowed_nodes);
+ return RG_EFAIL;
+ } else {
+ ret = relocate_service(svcName, request, target);
+ }
+
+ switch(ret) {
+ case RG_ESUCCESS:
+ return RG_ESUCCESS;
+ case RG_ERUN:
+ return RG_ERUN;
+ case RG_EFAIL:
+ memb_mark_down(allowed_nodes, target);
+ continue;
+ case RG_EABORT:
+ svc_report_failure(svcName);
+ free_member_list(allowed_nodes);
+ return RG_EFAIL;
+ default:
+ clulog(LOG_ERR,
+ "#6X: Invalid reply [%d] from member %d during"
+ " relocate operation!\n", ret, target);
+ }
+ }
+
+ free_member_list(allowed_nodes);
+ return RG_EFAIL;
}
--- cluster/rgmanager/src/daemons/rg_thread.c 2007/06/14 15:06:52 1.21
+++ cluster/rgmanager/src/daemons/rg_thread.c 2007/06/27 14:03:51 1.22
@@ -441,6 +441,19 @@
ret = RG_EFAIL;
break;
+ case RG_STATUS_INQUIRY:
+ error = svc_status_inquiry(myname);
+
+ if (error == 0) {
+ ret = RG_SUCCESS;
+ newowner = my_id();
+ } else {
+ ret = RG_EFAIL;
+ newowner = -1;
+ }
+
+ break;
+
default:
printf("Unhandled request %d\n", req->rr_request);
ret = RG_NONE;
--- cluster/rgmanager/src/daemons/test.c 2007/03/22 23:46:58 1.9
+++ cluster/rgmanager/src/daemons/test.c 2007/06/27 14:03:51 1.10
@@ -217,7 +217,7 @@
}
#endif
- if (res_exec(rn, "migrate", argv[4], 0)) {
+ if (res_exec(rn, RS_MIGRATE, argv[4], 0)) {
ret = -1;
goto out;
}
@@ -226,9 +226,9 @@
} else if (!strcmp(argv[1], "status")) {
printf("Checking status of %s...\n", argv[3]);
- if (res_status(&tree, curres, NULL)) {
+ ret = res_status(&tree, curres, NULL);
+ if (ret) {
printf("Status check of %s failed\n", argv[3]);
- ret = -1;
goto out;
}
printf("Status of %s is good\n", argv[3]);
@@ -391,5 +391,5 @@
out:
xmlCleanupParser();
malloc_dump_table();
- return 0;
+ return ret;
}
--- cluster/rgmanager/src/resources/vm.sh 2007/04/19 17:53:05 1.4
+++ cluster/rgmanager/src/resources/vm.sh 2007/06/27 14:03:51 1.5
@@ -182,9 +182,8 @@
<action name="start" timeout="20"/>
<action name="stop" timeout="120"/>
- <!-- No-ops. Groups are abstract resource types. -->
- <action name="status" timeout="10" interval="30m"/>
- <action name="monitor" timeout="10" interval="30m"/>
+ <action name="status" timeout="10" interval="30"/>
+ <action name="monitor" timeout="10" interval="30"/>
<!-- reconfigure - reconfigure with new OCF parameters.
NOT OCF COMPATIBLE AT ALL -->
@@ -273,13 +272,15 @@
# Start a virtual machine given the parameters from
# the environment.
#
-start()
+do_start()
{
# Use /dev/null for the configuration file, if xmdefconfig
# doesn't exist...
#
declare cmdline
+ do_status && return 0
+
cmdline="`build_xm_cmdline`"
echo "# xm command line: $cmdline"
@@ -293,7 +294,7 @@
# Stop a VM. Try to shut it down. Wait a bit, and if it
# doesn't shut down, destroy it.
#
-stop()
+do_stop()
{
declare -i timeout=60
declare -i ret=1
@@ -307,7 +308,7 @@
while [ $timeout -gt 0 ]; do
sleep 5
((timeout -= 5))
- status || return 0
+ do_status || return 0
while read dom state; do
#
# State is "stopped". Kill it.
@@ -346,10 +347,27 @@
# Simple status check: Find the VM in the list of running
# VMs
#
-status()
+do_status()
{
declare line
+ xm list $OCF_RESKEY_name &> /dev/null
+ if [ $? -eq 0 ]; then
+ return $OCF_SUCCESS
+ fi
+ xm list migrating-$OCF_RESKEY_name &> /dev/null
+ if [ $? -eq 1 ]; then
+ return $OCF_NOT_RUNNING
+ fi
+
+ return $OCF_ERR_GENERIC
+
+### NOT REACHED ###
+
+ # virsh doesn't handle migrating domains right now
+ # When this gets fixed, we need to revisit this status
+ # function.
+
line=$(virsh domstate $OCF_RESKEY_name)
if [ "$line" = "" ]; then
return $OCF_NOT_RUNNING
@@ -400,26 +418,26 @@
case $1 in
start)
- start
+ do_start
exit $?
;;
stop)
- stop shutdown destroy
+ do_stop shutdown destroy
exit $?
;;
kill)
- stop destroy
+ do_stop destroy
exit $?
;;
recover|restart)
exit 0
;;
status|monitor)
- status
+ do_status
exit $?
;;
migrate)
- migrate $2 # Send VM to this node
+ do_migrate $2 # Send VM to this node
exit $?
;;
reload)
* [Cluster-devel] cluster/rgmanager ChangeLog include/resgroup.h ...
@ 2007-06-26 21:55 lhh
0 siblings, 0 replies; 11+ messages in thread
From: lhh @ 2007-06-26 21:55 UTC (permalink / raw)
To: cluster-devel.redhat.com
CVSROOT: /cvs/cluster
Module name: cluster
Branch: RHEL5
Changes by: lhh at sourceware.org 2007-06-26 21:55:46
Modified files:
rgmanager : ChangeLog
rgmanager/include: resgroup.h reslist.h
rgmanager/src/clulib: rg_strings.c
rgmanager/src/daemons: groups.c main.c nodeevent.c resrules.c
restree.c rg_state.c rg_thread.c test.c
rgmanager/src/resources: vm.sh
Log message:
Fix #244143 and #232300, and patch up a couple of other fixed bugs (see ChangeLog)
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.31.2.14&r2=1.31.2.15
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.15.2.4&r2=1.15.2.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/reslist.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.15.2.3&r2=1.15.2.4
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/rg_strings.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.5.2.2&r2=1.5.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.25.2.6&r2=1.25.2.7
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/main.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.34.2.5&r2=1.34.2.6
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/nodeevent.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.4.2.2&r2=1.4.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/resrules.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.16.2.5&r2=1.16.2.6
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/restree.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.23.2.7&r2=1.23.2.8
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.24.2.8&r2=1.24.2.9
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_thread.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.15.2.5&r2=1.15.2.6
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/test.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.6.2.2&r2=1.6.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/vm.sh.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.1.2.3&r2=1.1.2.4
--- cluster/rgmanager/ChangeLog 2007/06/18 20:51:44 1.31.2.14
+++ cluster/rgmanager/ChangeLog 2007/06/26 21:55:44 1.31.2.15
@@ -1,3 +1,27 @@
+2007-06-26 Lon Hohberger <lhh@redhat.com>
+ * src/resources/vm.sh: Un-break migrate (#231692). Make status
+ checks happen every 30 seconds instead of 30 minutes.
+ * include/resgroup.h: Move inline recovery flags to a header file,
+ add RG_STATUS_INQUIRY for locating virtual machines which may have
+ migrated.
+ * include/reslist.h: Change res_exec() back to using agent_op_str()
+ inline so we can squelch errors while performing RG_STATUS_INQUIRY
+ * src/clulib/rg_strings.c: Add new strings for new error code /
+ request types
+ * src/daemons/groups.c: Change group_migrate() to use the correct
+ calling semantics
+ * src/daemons/main.c, nodeevent.c: Clean up cases which could cause
+ #244143
+ * src/daemons/resrules.c: Clear up noise
+ * src/daemons/restree.c: Squelch errors during RG_STATUS_INQUIRY
+ Patch up inline service recovery (#229650)
+ * src/daemons/rg_state.c: Don't let migrations or relocations to a
+ node running exclusive services occur in the first place and return
+ a useful error. (goes with #237144). Locate virtual machines (or
+ generally, services with the 'migrate' ability) elsewhere in the
+ cluster prior to trying to start one. Detect if someone migrates
+ such a service without using the cluster tools (#232300)
+
2007-06-18 Lon Hohberger <lhh@redhat.com>
* src/daemons/restree.c: Wait for status check time intervals to
elapse for the first status check in rgmanager, but not in rg_test.
--- cluster/rgmanager/include/resgroup.h 2007/06/14 13:35:59 1.15.2.4
+++ cluster/rgmanager/include/resgroup.h 2007/06/26 21:55:45 1.15.2.5
@@ -79,6 +79,7 @@
#define RG_UNLOCK 20
#define RG_QUERY_LOCK 21
#define RG_MIGRATE 22
+#define RG_STATUS_INQUIRY 23
#define RG_NONE 999
const char *rg_req_str(int req);
@@ -163,6 +164,8 @@
int my_id(void);
/* Return codes */
+#define RG_EFENCE -12 /* Fencing operation pending */
+#define RG_ENODE -11 /* Node is dead/nonexistent */
#define RG_ERUN -10 /* Service is already running */
#define RG_EQUORUM -9 /* Operation requires quorum */
#define RG_EINVAL -8 /* Invalid operation for resource */
@@ -195,6 +198,12 @@
#define FOD_RESTRICTED (1<<1)
#define FOD_NOFAILBACK (1<<2)
+/*
+ Status tree flags
+ */
+#define SFL_FAILURE (1<<0)
+#define SFL_RECOVERABLE (1<<1)
+
//#define DEBUG
#ifdef DEBUG
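(A minimal sketch, not part of the commit, of how the new status-tree
flags are meant to be read; it condenses the handle_started_status()
logic from the rg_state.c hunks further down in this patch.)

/* Sketch: interpreting a status result carrying the new SFL_* bits.
   SFL_FAILURE is fatal for the service as a whole; SFL_RECOVERABLE
   alone means only independent subtrees failed, so inline recovery
   (RG_CONDSTOP followed by RG_CONDSTART) may be attempted. */
static int
interpret_status(int rv)
{
	if (rv == 0)
		return 0;	/* all resources healthy */
	if (rv & SFL_FAILURE)
		return -1;	/* full service recovery required */
	if (rv & SFL_RECOVERABLE)
		return 1;	/* candidate for inline recovery */
	return -1;
}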
--- cluster/rgmanager/include/reslist.h 2007/05/31 18:58:46 1.15.2.3
+++ cluster/rgmanager/include/reslist.h 2007/06/26 21:55:45 1.15.2.4
@@ -144,7 +144,7 @@
int res_status(resource_node_t **tree, resource_t *res, void *ret);
int res_condstart(resource_node_t **tree, resource_t *res, void *ret);
int res_condstop(resource_node_t **tree, resource_t *res, void *ret);
-int res_exec(resource_node_t *node, const char *op, const char *arg, int depth);
+int res_exec(resource_node_t *node, int op, const char *arg, int depth);
/*int res_resinfo(resource_node_t **tree, resource_t *res, void *ret);*/
int expand_time(char *val);
int store_action(resource_act_t **actsp, char *name, int depth, int timeout, int interval);
--- cluster/rgmanager/src/clulib/rg_strings.c 2007/03/20 17:09:11 1.5.2.2
+++ cluster/rgmanager/src/clulib/rg_strings.c 2007/06/26 21:55:45 1.5.2.3
@@ -26,6 +26,8 @@
const struct string_val rg_error_strings[] = {
+ { RG_EFENCE, "Fencing operation pending; try again later" },
+ { RG_ENODE, "Target node dead / nonexistent" },
{ RG_ERUN, "Service is already running" },
{ RG_EQUORUM, "Operation requires quorum" },
{ RG_EINVAL, "Invalid operation for resource" },
@@ -67,6 +69,7 @@
{RG_UNLOCK, "unlocking"},
{RG_QUERY_LOCK, "lock status inquiry"},
{RG_MIGRATE, "migrate"},
+ {RG_STATUS_INQUIRY, "out of band service status inquiry"},
{RG_NONE, "none"},
{0, NULL}
};
@@ -145,5 +148,6 @@
const char *
agent_op_str(int val)
{
+ printf("searching agent_ops for %d\n", val);
return rg_search_table(agent_ops, val);
}
--- cluster/rgmanager/src/daemons/groups.c 2007/05/31 18:58:46 1.25.2.6
+++ cluster/rgmanager/src/daemons/groups.c 2007/06/26 21:55:46 1.25.2.7
@@ -893,7 +893,7 @@
}
clulog(LOG_NOTICE, "Migrating %s to %s\n", groupname, tgt_name);
- ret = res_exec(rn, agent_op_str(RS_MIGRATE), tgt_name, 0);
+ ret = res_exec(rn, RS_MIGRATE, tgt_name, 0);
if (ret == 0) {
clulog(LOG_NOTICE,
"Migration of %s to %s completed\n",
--- cluster/rgmanager/src/daemons/main.c 2007/06/14 13:35:59 1.34.2.5
+++ cluster/rgmanager/src/daemons/main.c 2007/06/26 21:55:46 1.34.2.6
@@ -617,10 +617,12 @@
clulog(LOG_WARNING, "#67: Shutting down uncleanly\n");
rg_set_inquorate();
rg_doall(RG_INIT, 1, "Emergency stop of %s");
+ rg_set_uninitialized();
#if defined(LIBCMAN_VERSION) && LIBCMAN_VERSION >= 2
/* cman_replyto_shutdown() */
#endif
- exit(0);
+ running = 0;
+ break;
}
return ret;
@@ -700,6 +702,9 @@
}
}
+ if (!running)
+ return 0;
+
if (need_reconfigure || check_config_update()) {
need_reconfigure = 0;
configure_logging(-1, 0);
@@ -985,7 +990,8 @@
}
}
- cleanup(cluster_ctx);
+ if (rg_initialized())
+ cleanup(cluster_ctx);
clulog(LOG_NOTICE, "Shutdown complete, exiting\n");
clu_lock_finished(rgmanager_lsname);
cman_finish(clu);
--- cluster/rgmanager/src/daemons/nodeevent.c 2007/05/10 16:23:43 1.4.2.2
+++ cluster/rgmanager/src/daemons/nodeevent.c 2007/06/26 21:55:46 1.4.2.3
@@ -72,8 +72,10 @@
if (local) {
/* Local Node Event */
- if (nodeStatus == 0)
+ if (nodeStatus == 0) {
+ clulog(LOG_ERR, "Exiting uncleanly\n");
hard_exit();
+ }
if (!rg_initialized()) {
if (init_resource_groups(0) != 0) {
--- cluster/rgmanager/src/daemons/resrules.c 2007/05/31 18:58:46 1.16.2.5
+++ cluster/rgmanager/src/daemons/resrules.c 2007/06/26 21:55:46 1.16.2.6
@@ -1058,7 +1058,7 @@
continue;
if (st_buf.st_mode & (S_IXUSR|S_IXOTH|S_IXGRP)) {
- printf("Loading resource rule from %s\n", path);
+ //printf("Loading resource rule from %s\n", path);
load_resource_rulefile(path, rules);
}
}
--- cluster/rgmanager/src/daemons/restree.c 2007/06/18 20:51:44 1.23.2.7
+++ cluster/rgmanager/src/daemons/restree.c 2007/06/26 21:55:46 1.23.2.8
@@ -39,10 +39,6 @@
void malloc_zap_mutex(void);
#endif
-#define FL_FAILURE 0x1
-#define FL_RECOVERABLE 0x2
-
-
/* XXX from resrules.c */
int store_childtype(resource_child_t **childp, char *name, int start,
int stop, int forbid, int flags);
@@ -335,12 +331,13 @@
@see build_env
*/
int
-res_exec(resource_node_t *node, const char *op, const char *arg, int depth)
+res_exec(resource_node_t *node, int op, const char *arg, int depth)
{
int childpid, pid;
int ret = 0;
char **env = NULL;
resource_t *res = node->rn_resource;
+ const char *op_str = agent_op_str(op);
char fullpath[2048];
if (!res->r_rule->rr_agent)
@@ -354,7 +351,7 @@
#ifdef NO_CCS
if (_no_op_mode_) {
- printf("[%s] %s:%s\n", op, res->r_rule->rr_type,
+ printf("[%s] %s:%s\n", op_str, res->r_rule->rr_type,
res->r_attrs->ra_value);
return 0;
}
@@ -392,9 +389,9 @@
restore_signals();
if (arg)
- execle(fullpath, fullpath, op, arg, NULL, env);
+ execle(fullpath, fullpath, op_str, arg, NULL, env);
else
- execle(fullpath, fullpath, op, NULL, env);
+ execle(fullpath, fullpath, op_str, NULL, env);
}
#ifdef DEBUG
@@ -411,10 +408,16 @@
ret = WEXITSTATUS(ret);
+#ifndef NO_CCS
+ if ((op == RS_STATUS &&
+ node->rn_state == RES_STARTED && ret) ||
+ (op != RS_STATUS && ret)) {
+#else
if (ret) {
+#endif
clulog(LOG_NOTICE,
"%s on %s \"%s\" returned %d (%s)\n",
- op, res->r_rule->rr_type,
+ op_str, res->r_rule->rr_type,
res->r_attrs->ra_value, ret,
ocf_strerror(ret));
}
@@ -859,7 +862,7 @@
rule->rr_childtypes[x].rc_name,
ret, op);
- if (rv & FL_FAILURE && op != RS_STOP)
+ if (rv & SFL_FAILURE && op != RS_STOP)
return rv;
}
@@ -906,7 +909,7 @@
list_for(&node->rn_child, child, y) {
rv |= _xx_child_internal(node, first, child, ret, op);
- if (rv & FL_FAILURE)
+ if (rv & SFL_FAILURE)
return rv;
}
} else {
@@ -952,7 +955,7 @@
if (op == RS_START || op == RS_STATUS) {
rv = _do_child_levels(tree, first, ret, op);
- if (rv & FL_FAILURE)
+ if (rv & SFL_FAILURE)
return rv;
/* Start default level after specified ones */
@@ -1011,15 +1014,6 @@
if (strcmp(node->rn_actions[x].ra_name, "status"))
continue;
-#ifndef NO_CCS
- /* If a status check has never been done, reset its status. */
- /* Don't do this from rg_test (ifndef NO_CCS) */
- if (!node->rn_actions[x].ra_last) {
- node->rn_actions[x].ra_last = now;
- continue;
- }
-#endif
-
delta = now - node->rn_actions[x].ra_last;
/*
@@ -1045,7 +1039,7 @@
return 0;
node->rn_actions[idx].ra_last = now;
- if ((x = res_exec(node, agent_op_str(RS_STATUS), NULL,
+ if ((x = res_exec(node, RS_STATUS, NULL,
node->rn_actions[idx].ra_depth)) == 0)
return 0;
@@ -1053,7 +1047,7 @@
return x;
/* Strange/failed status. Try to recover inline. */
- if ((x = res_exec(node, agent_op_str(RS_RECOVER), NULL, 0)) == 0)
+ if ((x = res_exec(node, RS_RECOVER, NULL, 0)) == 0)
return 0;
return x;
@@ -1140,7 +1134,7 @@
char *type, void *__attribute__((unused))ret, int realop,
resource_node_t *node)
{
- int rv, me, op;
+ int rv = 0, me, op;
/* Restore default operation. */
op = realop;
@@ -1194,10 +1188,10 @@
if (me && (op == RS_START)) {
node->rn_flags &= ~RF_NEEDSTART;
- rv = res_exec(node, agent_op_str(op), NULL, 0);
+ rv = res_exec(node, op, NULL, 0);
if (rv != 0) {
node->rn_state = RES_FAILED;
- return FL_FAILURE;
+ return SFL_FAILURE;
}
set_time("start", 0, node);
@@ -1225,9 +1219,9 @@
resources of this node must be restarted,
but siblings of this node are not affected. */
if (node->rn_flags & RF_INDEPENDENT)
- return FL_RECOVERABLE;
+ return SFL_RECOVERABLE;
- return FL_FAILURE;
+ return SFL_FAILURE;
}
}
@@ -1243,20 +1237,20 @@
does not matter: its dependent children must
also be independent of this node's siblings. */
if (node->rn_flags & RF_INDEPENDENT)
- return FL_RECOVERABLE;
+ return SFL_RECOVERABLE;
- return FL_FAILURE;
+ return SFL_FAILURE;
}
}
/* Stop should occur after children have stopped */
if (me && (op == RS_STOP)) {
node->rn_flags &= ~RF_NEEDSTOP;
- rv = res_exec(node, agent_op_str(op), NULL, 0);
+ rv = res_exec(node, op, NULL, 0);
if (rv != 0) {
node->rn_state = RES_FAILED;
- return FL_FAILURE;
+ return SFL_FAILURE;
}
if (node->rn_state != RES_STOPPED) {
@@ -1269,7 +1263,7 @@
//node->rn_resource->r_rule->rr_type,
//primary_attr_value(node->rn_resource));
- return 0;
+ return rv;
}
@@ -1309,12 +1303,12 @@
/* If we hit a problem during a 'status' op in an
independent subtree, rv will have the
- FL_RECOVERABLE bit set, but not FL_FAILURE.
- If we ever hit FL_FAILURE during a status
+ SFL_RECOVERABLE bit set, but not SFL_FAILURE.
+ If we ever hit SFL_FAILURE during a status
operation, we're *DONE* - even if the subtree
is flagged w/ indy-subtree */
- if (rv & FL_FAILURE)
+ if (rv & SFL_FAILURE)
return rv;
}
}
@@ -1388,33 +1382,7 @@
int
res_status(resource_node_t **tree, resource_t *res, void *ret)
{
- int rv;
- rv = _res_op(tree, res, NULL, ret, RS_STATUS);
-
- if (rv == 0)
- return 0;
-
- if (rv & FL_FAILURE)
- return rv;
-
- clulog(LOG_WARNING, "Some independent resources in %s:%s failed; "
- "Attempting inline recovery\n",
- res->r_rule->rr_type, res->r_attrs->ra_value);
-
- rv = res_condstop(tree, res, ret);
- if (rv & FL_FAILURE)
- goto out_fail;
- rv = res_condstart(tree, res, ret);
- if (rv & FL_FAILURE)
- goto out_fail;
-
- clulog(LOG_NOTICE, "Inline recovery of %s:%s successful\n",
- res->r_rule->rr_type, res->r_attrs->ra_value);
- return 0;
-out_fail:
- clulog(LOG_WARNING, "Inline recovery of %s:%s failed\n",
- res->r_rule->rr_type, res->r_attrs->ra_value);
- return 1;
+ return _res_op(tree, res, NULL, ret, RS_STATUS);
}
--- cluster/rgmanager/src/daemons/rg_state.c 2007/06/14 14:53:42 1.24.2.8
+++ cluster/rgmanager/src/daemons/rg_state.c 2007/06/26 21:55:46 1.24.2.9
@@ -36,6 +36,10 @@
#include <rg_queue.h>
#include <msgsimple.h>
+/* XXX - copied :( */
+#define cn_svccount cn_address.cna_address[0] /* These are uint8_t size */
+#define cn_svcexcl cn_address.cna_address[1]
+
int node_should_start_safe(uint32_t, cluster_member_list_t *, char *);
int next_node_id(cluster_member_list_t *membership, int me);
@@ -50,6 +54,10 @@
int group_migratory(char *servicename, int lock);
int have_exclusive_resources(void);
int check_exclusive_resources(cluster_member_list_t *membership, char *svcName);
+static int msvc_check_cluster(char *svcName);
+static inline int handle_started_status(char *svcName, int ret, rg_state_t *svcStatus);
+static inline int handle_migrate_status(char *svcName, int ret, rg_state_t *svcStatus);
+int count_resource_groups_local(cman_node_t *mp);
int
@@ -820,10 +828,27 @@
struct dlm_lksb lockp;
rg_state_t svcStatus;
int ret;
+ cluster_member_list_t *membership;
+ cman_node_t *m;
if (!group_migratory(svcName, 1))
return RG_EINVAL;
+ membership = member_list();
+ m = memb_id_to_p(membership, target);
+ if (!m) {
+ free_member_list(membership);
+ return RG_EINVAL;
+ }
+
+ count_resource_groups_local(m);
+ if (m->cn_svcexcl) {
+ free_member_list(membership);
+ return RG_EDEPEND;
+ }
+ free_member_list(membership);
+
+
if (rg_lock(svcName, &lockp) < 0) {
clulog(LOG_ERR, "#45: Unable to obtain cluster lock: %s\n",
strerror(errno));
@@ -888,6 +913,129 @@
/**
+ * Ask the other nodes if they've seen this service. This can be used
+ * to allow users to use non-rgmanager tools to migrate
+ * a virtual machine to another node in the cluster.
+ *
+ * Returns the node ID of the new owner, if any. -1 if no one in the
+ * cluster has seen the service.
+ */
+int
+get_new_owner(char *svcName)
+{
+ SmMessageSt msgp, response;
+ msgctx_t ctx;
+ cluster_member_list_t *membership;
+ int x, ret = -1, me = my_id();
+
+ /* Build message */
+ msgp.sm_hdr.gh_magic = GENERIC_HDR_MAGIC;
+ msgp.sm_hdr.gh_command = RG_ACTION_REQUEST;
+ msgp.sm_hdr.gh_arg1 = RG_STATUS_INQUIRY;
+ msgp.sm_hdr.gh_length = sizeof(msgp);
+ msgp.sm_data.d_action = RG_STATUS_INQUIRY;
+ strncpy(msgp.sm_data.d_svcName, svcName,
+ sizeof(msgp.sm_data.d_svcName));
+ msgp.sm_data.d_svcOwner = 0;
+ msgp.sm_data.d_ret = 0;
+
+ swab_SmMessageSt(&msgp);
+
+ membership = member_list();
+ for (x = 0; x < membership->cml_count && ret < 0; x++) {
+
+ /* don't query down members */
+ if (!membership->cml_members[x].cn_member)
+ continue;
+ /* don't query self */
+ if (membership->cml_members[x].cn_nodeid == me)
+ continue;
+
+ if (msg_open(MSG_CLUSTER, membership->cml_members[x].cn_nodeid,
+ RG_PORT, &ctx, 2) < 0) {
+ /* failed to open: better to claim false successful
+ status rather than claim a failure and possibly
+ end up with a service on >1 node */
+ goto out;
+ }
+
+ msg_send(&ctx, &msgp, sizeof(msgp));
+ msg_receive(&ctx, &response, sizeof (response), 5);
+
+ swab_SmMessageSt(&response);
+ if (response.sm_data.d_ret == RG_SUCCESS)
+ ret = response.sm_data.d_svcOwner;
+ else
+ ret = -1;
+
+ msg_close(&ctx);
+ }
+
+out:
+ free_member_list(membership);
+
+ return ret;
+}
+
+
+/**
+ If a service is 'migratory' - that is, it has the 'migratory' attribute
+ and has no children, this will query other nodes in the cluster, checking
+ to see if the service has migrated to that node using a status inquiry
+ message. Note that this is a very inefficient thing to do; it would be
+ much, much better to simply use the cluster tools to migrate rather than
+ using the standard management tools for the service/virtual machine.
+ */
+static int
+msvc_check_cluster(char *svcName)
+{
+ struct dlm_lksb lockp;
+ int newowner;
+ rg_state_t svcStatus;
+
+ if (!group_migratory(svcName, 1))
+ return -1;
+
+ newowner = get_new_owner(svcName);
+ if (newowner < 0) {
+ clulog(LOG_DEBUG, "No other nodes have seen %s\n", svcName);
+ return -1;
+ }
+
+ /* New owner found */
+ clulog(LOG_NOTICE, "Migration: %s is running on %d\n", svcName, newowner);
+
+ /* If the check succeeds (returns 0), then flip the state back to
+ 'started' - with a new owner */
+ if (rg_lock(svcName, &lockp) < 0) {
+ clulog(LOG_ERR, "#451: Unable to obtain cluster lock: %s\n",
+ strerror(errno));
+ return -1;
+ }
+
+ if (get_rg_state(svcName, &svcStatus) != 0) {
+ rg_unlock(&lockp);
+ clulog(LOG_ERR, "#452: Failed getting status for RG %s\n",
+ svcName);
+ return -1;
+ }
+
+ svcStatus.rs_state = RG_STATE_STARTED;
+ svcStatus.rs_owner = newowner;
+
+ if (set_rg_state(svcName, &svcStatus) != 0) {
+ rg_unlock(&lockp);
+ clulog(LOG_ERR, "#453: Failed setting status for RG %s\n",
+ svcName);
+ return -1;
+ }
+ rg_unlock(&lockp);
+
+ return newowner;
+}
+
+
+/**
* Check status of a cluster service
*
* @param svcName Service name to check.
@@ -925,14 +1073,58 @@
ret = group_op(svcName, RG_STATUS);
- /* For running services, just check the return code */
+ /* For running services, if the return code is 0, we're done */
if (svcStatus.rs_state == RG_STATE_STARTED)
- return ret;
+ return handle_started_status(svcName, ret, &svcStatus);
+
+ return handle_migrate_status(svcName, ret, &svcStatus);
+}
+
+static inline int
+handle_started_status(char *svcName, int ret, rg_state_t *svcStatus)
+{
+ if (ret & SFL_FAILURE) {
+ ret = msvc_check_cluster(svcName);
+ if (ret >= 0)
+ return 1;
+ }
+
+ /* Ok, we have a recoverable service. Try to perform
+ inline recovery */
+ if (ret & SFL_RECOVERABLE) {
+
+ clulog(LOG_WARNING, "Some independent resources in %s failed; "
+ "Attempting inline recovery\n", svcName);
+
+ ret = group_op(svcName, RG_CONDSTOP);
+ if (!(ret & SFL_FAILURE)) {
+ ret = group_op(svcName, RG_CONDSTART);
+ }
+
+ if (ret) {
+ clulog(LOG_WARNING, "Inline recovery of %s failed\n",
+ svcName);
+ } else {
+ clulog(LOG_NOTICE,
+ "Inline recovery of %s succeeded\n",
+ svcName);
+ return 0;
+ }
+ }
+
+ return ret;
+}
+
+
+static inline int
+handle_migrate_status(char *svcName, int ret, rg_state_t *svcStatus)
+{
+ struct dlm_lksb lockp;
/* For service(s) migrating to the local node, ignore invalid
return codes.
XXX Should put a timeout on migrating services */
- if (ret < 0)
+ if (ret != 0)
return 0;
/* If the check succeeds (returns 0), then flip the state back to
@@ -943,8 +1135,8 @@
return RG_EFAIL;
}
- svcStatus.rs_state = RG_STATE_STARTED;
- if (set_rg_state(svcName, &svcStatus) != 0) {
+ svcStatus->rs_state = RG_STATE_STARTED;
+ if (set_rg_state(svcName, svcStatus) != 0) {
rg_unlock(&lockp);
clulog(LOG_ERR, "#46: Failed getting status for RG %s\n",
svcName);
@@ -1312,8 +1504,10 @@
int *new_owner)
{
cluster_member_list_t *allowed_nodes, *backup = NULL;
+ cman_node_t *m;
int target = preferred_target, me = my_id();
int ret, x;
+ rg_state_t svcStatus;
/*
* Stop the service - if we haven't already done so.
@@ -1328,9 +1522,22 @@
return RG_EFORWARD;
}
- if (preferred_target >= 0) {
+ if (preferred_target > 0) {
allowed_nodes = member_list();
+ m = memb_id_to_p(allowed_nodes, preferred_target);
+ if (!m) {
+ free_member_list(allowed_nodes);
+ return RG_EINVAL;
+ }
+
+ /* Avoid even bothering the other node if we can */
+ count_resource_groups_local(m);
+ if (m->cn_svcexcl) {
+ free_member_list(allowed_nodes);
+ return RG_EDEPEND;
+ }
+
/*
Mark everyone except me and the preferred target DOWN for now
If we can't start it on the preferred target, then we'll try
@@ -1364,7 +1571,6 @@
if (target == me && me != preferred_target)
goto exhausted;
-
if (target == me) {
/*
Relocate to self. Don't send a network request
@@ -1400,7 +1606,7 @@
//count_resource_groups(allowed_nodes);
}
- if (preferred_target >= 0)
+ if (preferred_target > 0)
memb_mark_down(allowed_nodes, preferred_target);
memb_mark_down(allowed_nodes, me);
@@ -1409,7 +1615,16 @@
if (target == me)
goto exhausted;
- switch (relocate_service(svcName, request, target)) {
+ ret = relocate_service(svcName, request, target);
+ switch (ret) {
+ case RG_ERUN:
+ /* Someone stole the service while we were
+ trying to relo it */
+ get_rg_state_local(svcName, &svcStatus);
+ *new_owner = svcStatus.rs_owner;
+ free_member_list(allowed_nodes);
+ return 0;
+ case RG_EDEPEND:
case RG_EFAIL:
memb_mark_down(allowed_nodes, target);
continue;
@@ -1417,12 +1632,17 @@
svc_report_failure(svcName);
free_member_list(allowed_nodes);
return RG_EFAIL;
+ default:
+ /* deliberate fallthrough */
+ clulog(LOG_ERR,
+ "#61: Invalid reply from member %d during"
+ " relocate operation!\n", target);
case RG_NO:
/* state uncertain */
free_member_list(allowed_nodes);
- clulog(LOG_DEBUG, "State Uncertain: svc:%s "
- "nid:%08x req:%d\n", svcName,
- target, request);
+ clulog(LOG_CRIT, "State Uncertain: svc:%s "
+ "nid:%d req:%s ret:%d\n", svcName,
+ target, rg_req_str(request), ret);
return 0;
case 0:
*new_owner = target;
@@ -1430,10 +1650,6 @@
"on member %d\n", svcName, (int)target);
free_member_list(allowed_nodes);
return 0;
- default:
- clulog(LOG_ERR,
- "#61: Invalid reply from member %d during"
- " relocate operation!\n", target);
}
}
free_member_list(allowed_nodes);
@@ -1484,16 +1700,26 @@
handle_start_req(char *svcName, int req, int *new_owner)
{
int ret, tolerance = FOD_BEST;
- cluster_member_list_t *membership = member_list();
- int need_check = have_exclusive_resources();
+ cluster_member_list_t *membership;
+ int need_check, actual_failure = 0;
+ /* When we get an enable req. for a migratory service,
+ check other nodes to see if they are already running
+ said service - and ignore failover domain constraints
+ */
+ if ((ret = msvc_check_cluster(svcName)) >= 0) {
+ *new_owner = ret;
+ return RG_SUCCESS;
+ }
+
+ need_check = have_exclusive_resources();
+ membership = member_list();
/*
* When a service request is from a user application (eg, clusvcadm),
* accept FOD_GOOD instead of FOD_BEST
*/
if (req == RG_ENABLE)
tolerance = FOD_GOOD;
-/*
if (req != RG_RESTART &&
req != RG_START_RECOVER &&
(node_should_start_safe(my_id(), membership, svcName) <
@@ -1514,7 +1740,7 @@
}
}
free_member_list(membership);
-*/
+
/* Check for dependency. We cannot start unless our
dependency is met */
if (check_depend_safe(svcName) == 0)
@@ -1565,14 +1791,16 @@
*/
return RG_EABORT;
}
+ actual_failure = 1;
relocate:
/*
* OK, it failed to start - but succeeded to stop. Now,
* we should relocate the service.
*/
- clulog(LOG_WARNING, "#71: Relocating failed service %s\n",
- svcName);
+ if (actual_failure)
+ clulog(LOG_WARNING, "#71: Relocating failed service %s\n",
+ svcName);
ret = handle_relocate_req(svcName, RG_START_RECOVER, -1, new_owner);
/* If we leave the service stopped, instead of disabled, someone
@@ -1673,6 +1901,7 @@
return handle_start_req(svcName, RG_START_RECOVER, new_owner);
}
+
int
handle_fd_start_req(char *svcName, int request, int *new_owner)
{
@@ -1680,6 +1909,15 @@
int target, me = my_id();
int ret = RG_EFAIL;
+ /* When we get an enable req. for a migratory service,
+ check other nodes to see if they are already running
+ said service - and ignore failover domain constraints
+ */
+ if ((ret = msvc_check_cluster(svcName)) >= 0) {
+ *new_owner = ret;
+ return RG_SUCCESS;
+ }
+
allowed_nodes = member_list();
while (memb_count(allowed_nodes)) {
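(A condensed sketch, not part of the commit, of the out-of-band
migration detection added above; locking and error paths are omitted,
see msvc_check_cluster() and get_new_owner() for the real logic.)

/* Sketch: find a VM that was migrated behind rgmanager's back. */
static int
locate_migrated_service(char *svcName)
{
	int owner;

	if (!group_migratory(svcName, 1))
		return -1;	/* only migrate-capable services */

	owner = get_new_owner(svcName); /* RG_STATUS_INQUIRY per member */
	if (owner < 0)
		return -1;	/* no other node has seen it */

	/* the real code then rg_lock()s the service and flips it back
	   to RG_STATE_STARTED with rs_owner = owner */
	return owner;
}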
--- cluster/rgmanager/src/daemons/rg_thread.c 2007/06/14 13:35:59 1.15.2.5
+++ cluster/rgmanager/src/daemons/rg_thread.c 2007/06/26 21:55:46 1.15.2.6
@@ -429,6 +429,19 @@
break;
+ case RG_STATUS_INQUIRY:
+ error = svc_status_inquiry(myname);
+
+ if (error == 0) {
+ ret = RG_SUCCESS;
+ newowner = my_id();
+ } else {
+ ret = RG_EFAIL;
+ newowner = -1;
+ }
+
+ break;
+
default:
printf("Unhandled request %d\n", req->rr_request);
ret = RG_NONE;
--- cluster/rgmanager/src/daemons/test.c 2007/03/23 00:06:34 1.6.2.2
+++ cluster/rgmanager/src/daemons/test.c 2007/06/26 21:55:46 1.6.2.3
@@ -192,9 +192,9 @@
} else if (!strcmp(argv[1], "status")) {
printf("Checking status of %s...\n", argv[3]);
- if (res_status(&tree, curres, NULL)) {
+ ret = res_status(&tree, curres, NULL);
+ if (ret) {
printf("Status check of %s failed\n", argv[3]);
- ret = -1;
goto out;
}
printf("Status of %s is good\n", argv[3]);
@@ -354,5 +354,5 @@
out:
xmlCleanupParser();
malloc_dump_table();
- return 0;
+ return ret;
}
--- cluster/rgmanager/src/resources/vm.sh 2007/06/22 16:59:50 1.1.2.3
+++ cluster/rgmanager/src/resources/vm.sh 2007/06/26 21:55:46 1.1.2.4
@@ -180,9 +180,8 @@
<action name="start" timeout="20"/>
<action name="stop" timeout="120"/>
- <!-- No-ops. Groups are abstract resource types. -->
- <action name="status" timeout="10" interval="30m"/>
- <action name="monitor" timeout="10" interval="30m"/>
+ <action name="status" timeout="10" interval="30"/>
+ <action name="monitor" timeout="10" interval="30"/>
<!-- reconfigure - reconfigure with new OCF parameters.
NOT OCF COMPATIBLE AT ALL -->
@@ -278,6 +277,8 @@
#
declare cmdline
+ status && return 0
+
cmdline="`build_xm_cmdline`"
echo "# xm command line: $cmdline"
@@ -347,6 +348,10 @@
status()
{
xm list $OCF_RESKEY_name &> /dev/null
+ if [ $? -eq 0 ]; then
+ return 0
+ fi
+ xm list migrating-$OCF_RESKEY_name &> /dev/null
return $?
}
^ permalink raw reply [flat|nested] 11+ messages in thread
* [Cluster-devel] cluster/rgmanager ChangeLog include/resgroup.h ...
@ 2007-06-14 15:06 mgrac
0 siblings, 0 replies; 11+ messages in thread
From: mgrac @ 2007-06-14 15:06 UTC (permalink / raw)
To: cluster-devel.redhat.com
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: mgrac at sourceware.org 2007-06-14 15:06:52
Modified files:
rgmanager : ChangeLog
rgmanager/include: resgroup.h
rgmanager/src/daemons: main.c rg_state.c rg_thread.c
rgmanager/src/utils: clusvcadm.c
Log message:
New flag -F for clusvcadm to respect failover domain rules (#211469). Also changes 'clusvcadm -e service00', which now enables the service on the local node and does not respect failover domains (same as in RHEL4; in RHEL 5.0 it just reported Failure).
The old -F flag (freeze, introduced after RHEL 5.0) was changed to -Z.
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&r1=1.45&r2=1.46
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&r1=1.20&r2=1.21
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/main.c.diff?cvsroot=cluster&r1=1.38&r2=1.39
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&r1=1.32&r2=1.33
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_thread.c.diff?cvsroot=cluster&r1=1.20&r2=1.21
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/utils/clusvcadm.c.diff?cvsroot=cluster&r1=1.19&r2=1.20
--- cluster/rgmanager/ChangeLog 2007/06/13 20:13:26 1.45
+++ cluster/rgmanager/ChangeLog 2007/06/14 15:06:51 1.46
@@ -1,3 +1,10 @@
+2007-06-14 Marek Grac <mgrac@redhat.com>
+ * src/daemons/main.c, rg_state.c, rg_thread.c,
+ src/utils/clusvcadm.c
+ * #211469 - RFE: Flag (-F) for clusvcadm to respect failover domain
+ * 'clusvcadm -e service00' works same as in RHEL4 (differs from RHEL5.0)
+ * -F for freeze was changed to -Z
+
2007-06-13 Lon Hohberger <lhh@redhat.com>
* src/daemons/restree.c: Fix #229650 uninitialized bug
--- cluster/rgmanager/include/resgroup.h 2007/04/27 18:10:07 1.20
+++ cluster/rgmanager/include/resgroup.h 2007/06/14 15:06:51 1.21
@@ -90,6 +90,7 @@
int handle_relocate_req(char *svcName, int request, int preferred_target,
int *new_owner);
int handle_start_req(char *svcName, int req, int *new_owner);
+int handle_fd_start_req(char *svcName, int req, int *new_owner);
int handle_recover_req(char *svcName, int *new_owner);
int handle_start_remote_req(char *svcName, int req);
--- cluster/rgmanager/src/daemons/main.c 2007/04/19 20:21:22 1.38
+++ cluster/rgmanager/src/daemons/main.c 2007/06/14 15:06:51 1.39
@@ -493,7 +493,9 @@
/* Queue request */
rt_enqueue_request(msg_sm->sm_data.d_svcName,
msg_sm->sm_data.d_action,
- ctx, 0, msg_sm->sm_data.d_svcOwner, 0, 0);
+ ctx, 0, msg_sm->sm_data.d_svcOwner,
+ msg_sm->sm_hdr.gh_arg1,
+ msg_sm->sm_hdr.gh_arg2);
return 0;
case RG_EVENT:
--- cluster/rgmanager/src/daemons/rg_state.c 2007/04/27 18:10:10 1.32
+++ cluster/rgmanager/src/daemons/rg_state.c 2007/06/14 15:06:51 1.33
@@ -1316,7 +1316,7 @@
strncpy(msg_relo.sm_data.d_svcName, svcName,
sizeof(msg_relo.sm_data.d_svcName));
msg_relo.sm_data.d_ret = 0;
-
+ msg_relo.sm_data.d_svcOwner = target;
/* Open a connection to the other node */
if (msg_open(MSG_CLUSTER, target, RG_PORT, &ctx, 2)< 0) {
@@ -1592,7 +1592,7 @@
handle_start_req(char *svcName, int req, int *new_owner)
{
int ret, tolerance = FOD_BEST;
- cluster_member_list_t *membership = member_list();
+// cluster_member_list_t *membership = member_list();
int need_check = have_exclusive_resources();
/*
@@ -1601,7 +1601,7 @@
*/
if (req == RG_ENABLE)
tolerance = FOD_GOOD;
-
+/*
if (req != RG_RESTART &&
req != RG_START_RECOVER &&
(node_should_start_safe(my_id(), membership, svcName) <
@@ -1622,7 +1622,7 @@
}
}
free_member_list(membership);
-
+*/
/* Check for dependency. We cannot start unless our
dependency is met */
if (check_depend_safe(svcName) == 0)
@@ -1674,7 +1674,7 @@
return RG_EABORT;
}
-relocate:
+//relocate:
/*
* OK, it failed to start - but succeeded to stop. Now,
* we should relocate the service.
@@ -1743,10 +1743,12 @@
}
free_member_list(membership);
- if (svc_start(svcName, req) == 0) {
+ x = svc_start(svcName, req);
+
+ if ((x == 0) || (x == RG_ERUN)) {
if (need_check)
pthread_mutex_unlock(&exclusive_mutex);
- return 0;
+ return x;
}
if (need_check)
pthread_mutex_unlock(&exclusive_mutex);
@@ -1778,3 +1780,47 @@
return handle_start_req(svcName, RG_START_RECOVER, new_owner);
}
+
+int
+handle_fd_start_req(char *svcName, int request, int *new_owner)
+{
+ cluster_member_list_t *allowed_nodes;
+ int target, me = my_id();
+ int ret;
+
+ allowed_nodes = member_list();
+
+ while (memb_count(allowed_nodes)) {
+ target = best_target_node(allowed_nodes, -1,
+ svcName, 1);
+ if (target == me) {
+ ret = handle_start_remote_req(svcName, request);
+ } else if (target < 0) {
+ free_member_list(allowed_nodes);
+ return RG_EFAIL;
+ } else {
+ ret = relocate_service(svcName, request, target);
+ }
+
+ switch(ret) {
+ case RG_ESUCCESS:
+ return RG_ESUCCESS;
+ case RG_ERUN:
+ return RG_ERUN;
+ case RG_EFAIL:
+ memb_mark_down(allowed_nodes, target);
+ continue;
+ case RG_EABORT:
+ svc_report_failure(svcName);
+ free_member_list(allowed_nodes);
+ return RG_EFAIL;
+ default:
+ clulog(LOG_ERR,
+ "#6X: Invalid reply [%d] from member %d during"
+ " relocate operation!\n", ret, target);
+ }
+ }
+
+ free_member_list(allowed_nodes);
+ return RG_EFAIL;
+}
--- cluster/rgmanager/src/daemons/rg_thread.c 2007/04/27 18:10:10 1.20
+++ cluster/rgmanager/src/daemons/rg_thread.c 2007/06/14 15:06:52 1.21
@@ -248,8 +248,15 @@
break;
}
case RG_START:
- error = handle_start_req(myname, req->rr_request,
- &newowner);
+ if (req->rr_arg0) {
+ error = handle_fd_start_req(myname,
+ req->rr_request,
+ &newowner);
+ } else {
+ error = handle_start_req(myname,
+ req->rr_request,
+ &newowner);
+ }
break;
case RG_RELOCATE:
--- cluster/rgmanager/src/utils/clusvcadm.c 2007/04/27 18:10:10 1.19
+++ cluster/rgmanager/src/utils/clusvcadm.c 2007/06/14 15:06:52 1.20
@@ -39,11 +39,14 @@
void
-build_message(SmMessageSt *msgp, int action, char *svcName, int target)
+build_message(SmMessageSt *msgp, int action, char *svcName, int target,
+ int arg1, int arg2)
{
msgp->sm_hdr.gh_magic = GENERIC_HDR_MAGIC;
msgp->sm_hdr.gh_command = RG_ACTION_REQUEST;
msgp->sm_hdr.gh_length = sizeof(*msgp);
+ msgp->sm_hdr.gh_arg1 = arg1;
+ msgp->sm_hdr.gh_arg2 = arg2;
msgp->sm_data.d_action = action;
strncpy(msgp->sm_data.d_svcName, svcName,
sizeof(msgp->sm_data.d_svcName));
@@ -155,6 +158,8 @@
printf(" %s -d <group> Disable <group>\n", name);
printf(" %s -e <group> Enable <group>\n",
name);
+printf(" %s -e <group> -F Enable <group> according to failover\n"
+ " domain rules\n", name);
printf(" %s -e <group> -m <member> Enable <group>"
" on <member>\n", name);
printf(" %s -r <group> -m <member> Relocate <group> [to <member>]\n",
@@ -230,6 +235,7 @@
SmMessageSt msg;
generic_msg_hdr *h = (generic_msg_hdr *)&msg;
int action = RG_STATUS;
+ int fod = 0;
int node_specified = 0;
int me, svctarget = 0;
char *actionstr = NULL;
@@ -240,7 +246,7 @@
return 1;
}
- while ((opt = getopt(argc, argv, "lSue:M:d:r:n:m:vR:s:F:U:qh?")) != EOF) {
+ while ((opt = getopt(argc, argv, "lSue:M:d:r:n:m:FvR:s:Z:U:qh?")) != EOF) {
switch (opt) {
case 'l':
return do_lock();
@@ -257,6 +263,14 @@
action = RG_ENABLE;
svcname = optarg;
break;
+ case 'F':
+ if (node_specified) {
+ fprintf(stderr,
+ "Cannot use '-F' with '-n' or '-m'\n");
+ return 1;
+ }
+ fod = 1;
+ break;
case 'd':
/* DISABLE */
actionstr = "disabling";
@@ -288,13 +302,18 @@
break;
case 'm': /* member ... */
case 'n': /* node .. same thing */
+ if (fod) {
+ fprintf(stderr,
+ "Cannot use '-F' with '-n' or '-m'\n");
+ return 1;
+ }
strncpy(nodename,optarg,sizeof(nodename));
node_specified = 1;
break;
case 'v':
printf("%s\n",PACKAGE_VERSION);
return 0;
- case 'F':
+ case 'Z':
actionstr = "freezing";
action = RG_FREEZE;
svcname = optarg;
@@ -361,8 +380,8 @@
*/
//strcpy(nodename,"me");
}
-
- build_message(&msg, action, svcname, svctarget);
+
+ build_message(&msg, action, svcname, svctarget, fod, 0);
if (action != RG_RELOCATE && action != RG_MIGRATE) {
if (!node_specified)
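(A short sketch, not part of the commit, of how the new -F flag travels
from clusvcadm to the daemon; the names are the real ones from the
hunks above, the fragment itself is illustrative.)

/* client side: pack the failover-domain flag into the message header */
SmMessageSt msg;
int fod = 1;	/* set by 'clusvcadm -e service00 -F' */

build_message(&msg, RG_ENABLE, "service00", 0, fod, 0);

/* daemon side: main.c copies gh_arg1/gh_arg2 into the queued request,
   and rg_thread.c routes RG_START to handle_fd_start_req() when
   req->rr_arg0 is set, so the start honors failover domain ordering */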
^ permalink raw reply [flat|nested] 11+ messages in thread
* [Cluster-devel] cluster/rgmanager ChangeLog include/resgroup.h ...
@ 2007-06-14 13:36 mgrac
0 siblings, 0 replies; 11+ messages in thread
From: mgrac @ 2007-06-14 13:36 UTC (permalink / raw)
To: cluster-devel.redhat.com
CVSROOT: /cvs/cluster
Module name: cluster
Branch: RHEL5
Changes by: mgrac at sourceware.org 2007-06-14 13:35:59
Modified files:
rgmanager : ChangeLog
rgmanager/include: resgroup.h
rgmanager/src/daemons: main.c rg_state.c rg_thread.c
rgmanager/src/utils: clusvcadm.c
Log message:
New flag -F for clusvcadm to respect failover domain rules (#211469). Also changes 'clusvcadm -e service00', which now enables the service on the local node and does not respect failover domains (same as in RHEL4; in RHEL 5.0 it just reported Failure).
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.31.2.11&r2=1.31.2.12
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.15.2.3&r2=1.15.2.4
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/main.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.34.2.4&r2=1.34.2.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.24.2.6&r2=1.24.2.7
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_thread.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.15.2.4&r2=1.15.2.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/utils/clusvcadm.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.12.2.3&r2=1.12.2.4
--- cluster/rgmanager/ChangeLog 2007/06/13 20:12:19 1.31.2.11
+++ cluster/rgmanager/ChangeLog 2007/06/14 13:35:58 1.31.2.12
@@ -1,3 +1,9 @@
+2007-06-14 Marek Grac <mgrac@redhat.com>
+ * src/daemons/main.c, rg_state.c, rg_thread.c,
+ src/utils/clusvcadm.c
+ * #211469 - RFE: Flag for clusvcadm to respect failover domain
+ * 'clusvcadm -e service00' works same as in RHEL4 (differs from RHEL5.0)
+
2007-06-13 Lon Hohberger <lhh@redhat.com>
* src/daemons/restree.c: Fix #229650 uninitialized bug
--- cluster/rgmanager/include/resgroup.h 2007/03/20 17:09:11 1.15.2.3
+++ cluster/rgmanager/include/resgroup.h 2007/06/14 13:35:59 1.15.2.4
@@ -86,6 +86,7 @@
int handle_relocate_req(char *svcName, int request, int preferred_target,
int *new_owner);
int handle_start_req(char *svcName, int req, int *new_owner);
+int handle_fd_start_req(char *svcName, int req, int *new_owner);
int handle_recover_req(char *svcName, int *new_owner);
int handle_start_remote_req(char *svcName, int req);
--- cluster/rgmanager/src/daemons/main.c 2007/05/10 16:23:43 1.34.2.4
+++ cluster/rgmanager/src/daemons/main.c 2007/06/14 13:35:59 1.34.2.5
@@ -493,7 +493,9 @@
/* Queue request */
rt_enqueue_request(msg_sm->sm_data.d_svcName,
msg_sm->sm_data.d_action,
- ctx, 0, msg_sm->sm_data.d_svcOwner, 0, 0);
+ ctx, 0, msg_sm->sm_data.d_svcOwner,
+ msg_sm->sm_hdr.gh_arg1,
+ msg_sm->sm_hdr.gh_arg2);
return 0;
case RG_EVENT:
--- cluster/rgmanager/src/daemons/rg_state.c 2007/04/19 18:05:37 1.24.2.6
+++ cluster/rgmanager/src/daemons/rg_state.c 2007/06/14 13:35:59 1.24.2.7
@@ -1211,7 +1211,7 @@
strncpy(msg_relo.sm_data.d_svcName, svcName,
sizeof(msg_relo.sm_data.d_svcName));
msg_relo.sm_data.d_ret = 0;
-
+ msg_relo.sm_data.d_svcOwner = target;
/* Open a connection to the other node */
if (msg_open(MSG_CLUSTER, target, RG_PORT, &ctx, 2)< 0) {
@@ -1493,7 +1493,7 @@
*/
if (req == RG_ENABLE)
tolerance = FOD_GOOD;
-
+/*
if (req != RG_RESTART &&
req != RG_START_RECOVER &&
(node_should_start_safe(my_id(), membership, svcName) <
@@ -1514,7 +1514,7 @@
}
}
free_member_list(membership);
-
+*/
/* Check for dependency. We cannot start unless our
dependency is met */
if (check_depend_safe(svcName) == 0)
@@ -1635,10 +1635,12 @@
}
free_member_list(membership);
- if (svc_start(svcName, req) == 0) {
+ x = svc_start(svcName, req);
+
+ if ((x == 0) || (x == RG_ERUN)) {
if (need_check)
pthread_mutex_unlock(&exclusive_mutex);
- return 0;
+ return x;
}
if (need_check)
pthread_mutex_unlock(&exclusive_mutex);
@@ -1670,3 +1672,47 @@
return handle_start_req(svcName, RG_START_RECOVER, new_owner);
}
+
+int
+handle_fd_start_req(char *svcName, int request, int *new_owner)
+{
+ cluster_member_list_t *allowed_nodes;
+ int target, me = my_id();
+ int ret;
+
+ allowed_nodes = member_list();
+
+ while (memb_count(allowed_nodes)) {
+ target = best_target_node(allowed_nodes, -1,
+ svcName, 1);
+ if (target == me) {
+ ret = handle_start_remote_req(svcName, request);
+ } else if (target < 0) {
+ free_member_list(allowed_nodes);
+ return RG_EFAIL;
+ } else {
+ ret = relocate_service(svcName, request, target);
+ }
+
+ switch(ret) {
+ case RG_ESUCCESS:
+ return RG_ESUCCESS;
+ case RG_ERUN:
+ return RG_ERUN;
+ case RG_EFAIL:
+ memb_mark_down(allowed_nodes, target);
+ continue;
+ case RG_EABORT:
+ svc_report_failure(svcName);
+ free_member_list(allowed_nodes);
+ return RG_EFAIL;
+ default:
+ clulog(LOG_ERR,
+ "#6X: Invalid reply [%d] from member %d during"
+ " relocate operation!\n", ret, target);
+ }
+ }
+
+ free_member_list(allowed_nodes);
+ return RG_EFAIL;
+}
--- cluster/rgmanager/src/daemons/rg_thread.c 2007/05/10 16:23:43 1.15.2.4
+++ cluster/rgmanager/src/daemons/rg_thread.c 2007/06/14 13:35:59 1.15.2.5
@@ -248,8 +248,15 @@
break;
}
case RG_START:
- error = handle_start_req(myname, req->rr_request,
- &newowner);
+ if (req->rr_arg0) {
+ error = handle_fd_start_req(myname,
+ req->rr_request,
+ &newowner);
+ } else {
+ error = handle_start_req(myname,
+ req->rr_request,
+ &newowner);
+ }
break;
case RG_RELOCATE:
--- cluster/rgmanager/src/utils/clusvcadm.c 2007/03/20 17:09:12 1.12.2.3
+++ cluster/rgmanager/src/utils/clusvcadm.c 2007/06/14 13:35:59 1.12.2.4
@@ -39,11 +39,14 @@
void
-build_message(SmMessageSt *msgp, int action, char *svcName, int target)
+build_message(SmMessageSt *msgp, int action, char *svcName, int target,
+ int arg1, int arg2)
{
msgp->sm_hdr.gh_magic = GENERIC_HDR_MAGIC;
msgp->sm_hdr.gh_command = RG_ACTION_REQUEST;
msgp->sm_hdr.gh_length = sizeof(*msgp);
+ msgp->sm_hdr.gh_arg1 = arg1;
+ msgp->sm_hdr.gh_arg2 = arg2;
msgp->sm_data.d_action = action;
strncpy(msgp->sm_data.d_svcName, svcName,
sizeof(msgp->sm_data.d_svcName));
@@ -155,6 +158,8 @@
printf(" %s -d <group> Disable <group>\n", name);
printf(" %s -e <group> Enable <group>\n",
name);
+printf(" %s -e <group> -F Enable <group> according to failover\n"
+ " domain rules\n", name);
printf(" %s -e <group> -m <member> Enable <group>"
" on <member>\n", name);
printf(" %s -r <group> -m <member> Relocate <group> [to <member>]\n",
@@ -230,6 +235,7 @@
SmMessageSt msg;
generic_msg_hdr *h = (generic_msg_hdr *)&msg;
int action = RG_STATUS;
+ int fod = 0;
int node_specified = 0;
int me, svctarget = 0;
char *actionstr = NULL;
@@ -240,7 +246,7 @@
return 1;
}
- while ((opt = getopt(argc, argv, "lSue:M:d:r:n:m:vR:s:qh?")) != EOF) {
+ while ((opt = getopt(argc, argv, "lSue:M:d:r:n:m:FvR:s:qh?")) != EOF) {
switch (opt) {
case 'l':
return do_lock();
@@ -257,6 +263,14 @@
action = RG_ENABLE;
svcname = optarg;
break;
+ case 'F':
+ if (node_specified) {
+ fprintf(stderr,
+ "Cannot use '-F' with '-n' or '-m'\n");
+ return 1;
+ }
+ fod = 1;
+ break;
case 'd':
/* DISABLE */
actionstr = "disabling";
@@ -288,6 +302,11 @@
break;
case 'm': /* member ... */
case 'n': /* node .. same thing */
+ if (fod) {
+ fprintf(stderr,
+ "Cannot use '-F' with '-n' or '-m'\n");
+ return 1;
+ }
strncpy(nodename,optarg,sizeof(nodename));
node_specified = 1;
break;
@@ -351,8 +370,8 @@
*/
//strcpy(nodename,"me");
}
-
- build_message(&msg, action, svcname, svctarget);
+
+ build_message(&msg, action, svcname, svctarget, fod, 0);
if (action != RG_RELOCATE && action != RG_MIGRATE) {
if (!node_specified)
^ permalink raw reply [flat|nested] 11+ messages in thread
* [Cluster-devel] cluster/rgmanager ChangeLog include/resgroup.h ...
@ 2007-04-27 18:10 lhh
0 siblings, 0 replies; 11+ messages in thread
From: lhh @ 2007-04-27 18:10 UTC (permalink / raw)
To: cluster-devel.redhat.com
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: lhh at sourceware.org 2007-04-27 19:10:10
Modified files:
rgmanager : ChangeLog
rgmanager/include: resgroup.h
rgmanager/src/clulib: rg_strings.c
rgmanager/src/daemons: groups.c rg_state.c rg_thread.c
rgmanager/src/utils: clustat.c clusvcadm.c
Added files:
rgmanager/src/daemons: sbuf.c
Log message:
Add patch from Simone Gotti to implement service freeze/unfreeze. Add simple buffer handling for later use.
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&r1=1.39&r2=1.40
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&r1=1.19&r2=1.20
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/rg_strings.c.diff?cvsroot=cluster&r1=1.7&r2=1.8
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/sbuf.c.diff?cvsroot=cluster&r1=NONE&r2=1.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&r1=1.31&r2=1.32
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&r1=1.31&r2=1.32
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_thread.c.diff?cvsroot=cluster&r1=1.19&r2=1.20
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/utils/clustat.c.diff?cvsroot=cluster&r1=1.31&r2=1.32
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/utils/clusvcadm.c.diff?cvsroot=cluster&r1=1.18&r2=1.19
--- cluster/rgmanager/ChangeLog 2007/04/27 04:23:05 1.39
+++ cluster/rgmanager/ChangeLog 2007/04/27 18:10:07 1.40
@@ -1,3 +1,10 @@
+2007-04-27 Lon Hohberger <lhh@redhat.com>
+ * include/resgroup.h, src/clulib/rg_strings.c src/daemons/groups.c,
+ rg_state.c, rg_thread.c, src/utils/clustat.c, clusvcadm.c: Apply
+ patch to implement service freeze/unfreeze from Simone Gotti
+ * src/daemons/sbuf.c: Add simple buffer handlers for future use
+ by svc_status_inquiry
+
2007-04-27 Fabio M. Di Nitto <fabbione@ubuntu.com>
* src/clulib/vft.c: Change ifdef to fix build on parisc.
--- cluster/rgmanager/include/resgroup.h 2007/03/20 17:09:56 1.19
+++ cluster/rgmanager/include/resgroup.h 2007/04/27 18:10:07 1.20
@@ -35,6 +35,7 @@
uint32_t rs_restarts; /**< Number of cluster-induced
restarts */
uint64_t rs_transition; /**< Last service transition time */
+ uint32_t rs_flags; /**< User-set flags */
} rg_state_t;
#define swab_rg_state_t(ptr) \
@@ -46,6 +47,7 @@
swab32((ptr)->rs_state);\
swab32((ptr)->rs_restarts);\
swab64((ptr)->rs_transition);\
+ swab32((ptr)->rs_flags);\
}
@@ -79,6 +81,8 @@
#define RG_UNLOCK 20
#define RG_QUERY_LOCK 21
#define RG_MIGRATE 22
+#define RG_FREEZE 23
+#define RG_UNFREEZE 24
#define RG_NONE 999
const char *rg_req_str(int req);
@@ -105,7 +109,11 @@
#define DEFAULT_CHECK_INTERVAL 10
+/* Resource group flags (for now) */
+#define RG_FLAG_FROZEN (1<<0) /** Resource frozen */
+
const char *rg_state_str(int val);
+const char *rg_flags_str(char *flags_string, size_t size, int val, char *separator);
const char *agent_op_str(int val);
int eval_groups(int local, uint32_t nodeid, int nodeStatus);
@@ -121,6 +129,8 @@
int svc_status(char *svcName);
int svc_disable(char *svcName);
int svc_fail(char *svcName);
+int svc_freeze(char *svcName);
+int svc_unfreeze(char *svcName);
int svc_migrate(char *svcName, int target);
int rt_enqueue_request(const char *resgroupname, int request,
msgctx_t *resp_ctx,
@@ -162,6 +172,7 @@
int my_id(void);
/* Return codes */
+#define RG_EFROZEN -11 /* Service is frozen */
#define RG_ERUN -10 /* Service is already running */
#define RG_EQUORUM -9 /* Operation requires quorum */
#define RG_EINVAL -8 /* Invalid operation for resource */
--- cluster/rgmanager/src/clulib/rg_strings.c 2007/03/10 00:20:54 1.7
+++ cluster/rgmanager/src/clulib/rg_strings.c 2007/04/27 18:10:10 1.8
@@ -35,6 +35,7 @@
{ RG_ENOSERVICE,"Service does not exist" },
{ RG_EFORWARD, "Service not mastered locally" },
{ RG_EABORT, "Aborted; service failed" },
+ { RG_EFROZEN, "Failure: Service is frozen"},
{ RG_EFAIL, "Failure" },
{ RG_ESUCCESS, "Success" },
{ RG_YES, "Yes" },
@@ -88,6 +89,12 @@
};
+const struct string_val rg_flags_strings[] = {
+ {RG_FLAG_FROZEN, "frozen"},
+ {0, NULL}
+};
+
+
const struct string_val agent_ops[] = {
{RS_START, "start"},
{RS_STOP, "stop"},
@@ -122,6 +129,20 @@
}
+static inline const char *
+rg_flag_search_table(const struct string_val *table, int val)
+{
+ int x;
+
+ for (x = 0; table[x].str != NULL; x++) {
+ if (table[x].val == val) {
+ return table[x].str;
+ }
+ }
+
+ return "Unknown";
+}
+
const char *
rg_strerror(int val)
{
@@ -134,6 +155,22 @@
return rg_search_table(rg_state_strings, val);
}
+const char *
+rg_flags_str(char *flags_string, size_t size, int val, char *separator)
+{
+ int i;
+ const char *string;
+
+ for (i = 0; i < sizeof(uint32_t) * 8; i++) {
+ if ( val & (1 << i)) {
+ if (strlen(flags_string))
+ strncat(flags_string, separator, size - (strlen(flags_string) + strlen(separator) + 1));
+ string = rg_search_table(rg_flags_strings, (1 << i));
+ strncat(flags_string, string, size - (strlen(flags_string) + strlen(string) + 1));
+ }
+ }
+ return flags_string;
+}
const char *
rg_req_str(int val)
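(A small usage sketch, not part of the commit, for the new
rg_flags_str() helper; it mirrors the clustat.c hunks below.)

char flags_string[255] = "";

rg_flags_str(flags_string, sizeof(flags_string), RG_FLAG_FROZEN, ", ");
/* flags_string now reads "frozen"; multiple set bits would be joined
   with the ", " separator */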
/cvs/cluster/cluster/rgmanager/src/daemons/sbuf.c,v --> standard output
revision 1.1
--- cluster/rgmanager/src/daemons/sbuf.c
+++ - 2007-04-27 19:10:12.507518000 +0100
@@ -0,0 +1,85 @@
+#include <string.h>
+#include <sys/types.h>
+#include <stdio.h>
+#include <errno.h>
+
+struct _retbuf {
+ char *data;
+ ssize_t maxsize;
+ ssize_t cursize;
+ char magic[8];
+};
+
+static char _x_buf_magic[]="m461kz31";
+
+void *
+buf_init(void *buf, size_t len)
+{
+ struct _retbuf *b = (struct _retbuf *)buf;
+
+ errno = EINVAL;
+ if (!len || !buf)
+ return NULL;
+ if (len < sizeof(*b) + 16)
+ return NULL;
+
+ memset(b, 0, len);
+ b->data = buf + sizeof(*b);
+ b->maxsize = len - sizeof (*b);
+ b->cursize = 0;
+ memcpy(b->magic, _x_buf_magic, sizeof(b->magic));
+
+ return buf;
+}
+
+ssize_t
+buf_append(void *buf, char *info)
+{
+ struct _retbuf *b = (struct _retbuf *)buf;
+ ssize_t len;
+
+ errno = EINVAL;
+ if (!buf)
+ return -1;
+ if (memcmp(b->magic, _x_buf_magic, sizeof(b->magic)))
+ return -1;
+ if (!info)
+ return 0;
+ len = strlen(info);
+ if (!len)
+ return 0;
+
+ errno = ENOSPC;
+ if (b->maxsize - b->cursize < len)
+ return -1;
+
+ memcpy(&(b->data[b->cursize]), info, len);
+ b->cursize += len;
+ return len;
+}
+
+char *
+buf_data(void *buf)
+{
+ struct _retbuf *b = (struct _retbuf *)buf;
+ errno = EINVAL;
+ if (!buf)
+ return NULL;
+ if (memcmp(b->magic, _x_buf_magic, sizeof(b->magic)))
+ return NULL;
+ return ((struct _retbuf *)buf)->data;
+}
+
+
+int
+buf_finished(void *buf)
+{
+ struct _retbuf *b = (struct _retbuf *)buf;
+ errno = EINVAL;
+ if (!buf)
+ return -1;
+ if (memcmp(b->magic, _x_buf_magic, sizeof(b->magic)))
+ return -1;
+ memset(b->magic, 0, sizeof(b->magic));
+ return 0;
+}
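(The buffer helpers above are self-contained; a minimal usage sketch,
not part of the commit - per the ChangeLog they are reserved for later
use by svc_status_inquiry. The strings are sample data.)

char raw[256];
void *b = buf_init(raw, sizeof(raw));	/* carves header + data area */

if (b != NULL) {
	buf_append(b, "example ");	/* bytes appended, or -1 */
	buf_append(b, "data");
	printf("%s\n", buf_data(b));	/* prints "example data" */
	buf_finished(b);		/* clears the magic; further
					   calls on b now fail */
}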
--- cluster/rgmanager/src/daemons/groups.c 2007/04/19 17:59:36 1.31
+++ cluster/rgmanager/src/daemons/groups.c 2007/04/27 18:10:10 1.32
@@ -376,6 +376,9 @@
mp = memb_id_to_p(membership, my_id());
assert(mp);
+ /* Service cannot be started if Frozen */
+ if (svcStatus->rs_flags & RG_FLAG_FROZEN)
+ return;
/*
* Service must be not be running elsewhere to consider for a
* local start.
--- cluster/rgmanager/src/daemons/rg_state.c 2007/04/19 17:59:36 1.31
+++ cluster/rgmanager/src/daemons/rg_state.c 2007/04/27 18:10:10 1.32
@@ -282,6 +282,7 @@
svcblk->rs_owner = 0;
svcblk->rs_last_owner = 0;
svcblk->rs_state = RG_STATE_STOPPED;
+ svcblk->rs_flags = 0;
svcblk->rs_restarts = 0;
svcblk->rs_transition = 0;
strncpy(svcblk->rs_name, name, sizeof(svcblk->rs_name));
@@ -418,6 +419,7 @@
svcblk->rs_owner = 0;
svcblk->rs_last_owner = 0;
svcblk->rs_state = RG_STATE_UNINITIALIZED;
+ svcblk->rs_flags = 0;
svcblk->rs_restarts = 0;
svcblk->rs_transition = 0;
strncpy(svcblk->rs_name, name, sizeof(svcblk->rs_name));
@@ -446,6 +448,7 @@
* 2 = DO NOT stop service, return 0 (success)
* 3 = DO NOT stop service, return RG_EFORWARD
* 4 = DO NOT stop service, return RG_EAGAIN
+ * 5 = DO NOT stop service, return RG_EFROZEN
*/
int
svc_advise_stop(rg_state_t *svcStatus, char *svcName, int req)
@@ -453,6 +456,11 @@
cluster_member_list_t *membership = member_list();
int ret = 0;
+ if (svcStatus->rs_flags & RG_FLAG_FROZEN) {
+ clulog(LOG_DEBUG, "Service %s frozen.\n", svcName);
+ return 5;
+ }
+
switch(svcStatus->rs_state) {
case RG_STATE_FAILED:
if (req == RG_DISABLE)
@@ -568,6 +576,7 @@
* 2 = DO NOT start service, return 0
* 3 = DO NOT start service, return RG_EAGAIN
* 4 = DO NOT start service, return RG_ERUN
+ * 5 = DO NOT start service, return RG_EFROZEN
*/
int
svc_advise_start(rg_state_t *svcStatus, char *svcName, int req)
@@ -575,6 +584,11 @@
cluster_member_list_t *membership = member_list();
int ret = 0;
+ if (svcStatus->rs_flags & RG_FLAG_FROZEN) {
+ clulog(LOG_DEBUG, "Service %s frozen.\n", svcName);
+ return 5;
+ }
+
switch(svcStatus->rs_state) {
case RG_STATE_FAILED:
clulog(LOG_ERR,
@@ -752,6 +766,9 @@
case 4:
rg_unlock(&lockp);
return RG_ERUN;
+ case 5:
+ rg_unlock(&lockp);
+ return RG_EFROZEN;
default:
break;
}
@@ -914,6 +931,10 @@
}
rg_unlock(&lockp);
+ if (svcStatus.rs_flags & RG_FLAG_FROZEN)
+ /* Don't check status if the service is frozen */
+ return 0;
+
if (svcStatus.rs_owner != my_id())
/* Don't check status for anything not owned */
return 0;
@@ -961,6 +982,17 @@
int
svc_status_inquiry(char *svcName)
{
+ rg_state_t svcStatus;
+
+ if (get_rg_state_local(svcName, &svcStatus) != 0) {
+ clulog(LOG_ERR, "Failed getting local status for RG %s\n",
+ svcName);
+ return RG_EFAIL;
+ }
+
+ if (svcStatus.rs_flags & RG_FLAG_FROZEN)
+ return 0;
+
return group_op(svcName, RG_STATUS);
}
@@ -1015,6 +1047,9 @@
case 4:
rg_unlock(&lockp);
return RG_EAGAIN;
+ case 5:
+ rg_unlock(&lockp);
+ return RG_EFROZEN;
default:
break;
}
@@ -1191,6 +1226,76 @@
return 0;
}
+/**
+ * Flag/Unflag a cluster service as frozen.
+ *
+ * @param svcName Service ID to flag/unflag as frozen.
+ * @return FAIL, 0
+ */
+int
+_svc_freeze(char *svcName, int enabled)
+{
+ struct dlm_lksb lockp;
+ rg_state_t svcStatus;
+
+ if (rg_lock(svcName, &lockp) == RG_EFAIL) {
+ clulog(LOG_ERR, "#55: Unable to obtain cluster lock: %s\n",
+ strerror(errno));
+ return RG_EFAIL;
+ }
+
+ clulog(LOG_DEBUG, "Handling %s request for RG %s\n", svcName, enabled?"freeze":"unfreeze");
+
+ if (get_rg_state(svcName, &svcStatus) != 0) {
+ rg_unlock(&lockp);
+ clulog(LOG_ERR, "#56: Failed getting status for RG %s\n",
+ svcName);
+ return RG_EFAIL;
+ }
+
+ switch(svcStatus.rs_state) {
+ case RG_STATE_STOPPED:
+ case RG_STATE_STARTED:
+ case RG_STATE_DISABLED:
+
+ if (enabled == 1) {
+ clulog(LOG_DEBUG, "Freezing RG %s\n", svcName);
+ svcStatus.rs_flags |= RG_FLAG_FROZEN;
+ } else {
+ clulog(LOG_DEBUG, "Unfreezing RG %s\n", svcName);
+ svcStatus.rs_flags &= ~RG_FLAG_FROZEN;
+ }
+
+ if (set_rg_state(svcName, &svcStatus) != 0) {
+ rg_unlock(&lockp);
+ clulog(LOG_ERR, "#57: Failed changing RG status\n");
+ return RG_EFAIL;
+ }
+ break;
+
+ default:
+ rg_unlock(&lockp);
+ return RG_EFAIL;
+ break;
+ }
+
+ rg_unlock(&lockp);
+
+ return 0;
+}
+
+int
+svc_freeze(char *svcName)
+{
+ return _svc_freeze(svcName, 1);
+}
+
+int
+svc_unfreeze(char *svcName)
+{
+ return _svc_freeze(svcName, 0);
+}
+
/*
* Send a message to the target node to start the service.
@@ -1324,6 +1429,9 @@
svc_fail(svcName);
return RG_EFAIL;
}
+ if (ret == RG_EFROZEN) {
+ return RG_EFROZEN;
+ }
if (ret == RG_EFORWARD)
return RG_EFORWARD;
}
@@ -1531,7 +1639,7 @@
/*
If services are locked, return the error
*/
- if (ret == RG_EAGAIN || ret == RG_ERUN)
+ if (ret == RG_EAGAIN || ret == RG_ERUN || ret == RG_EFROZEN)
return ret;
/*
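(A sketch, not part of the commit, of what freezing does operationally,
condensed from the svc_advise_start/stop and svc_status hunks above.)

/* Sketch: a frozen service refuses state transitions, and its status
   checks are skipped entirely. */
static int
frozen_guard(rg_state_t *st)
{
	if (st->rs_flags & RG_FLAG_FROZEN)
		return RG_EFROZEN;	/* start/stop short-circuit;
					   svc_status() returns 0 without
					   running the resource agents */
	return 0;
}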
--- cluster/rgmanager/src/daemons/rg_thread.c 2007/03/27 19:33:20 1.19
+++ cluster/rgmanager/src/daemons/rg_thread.c 2007/04/27 18:10:10 1.20
@@ -422,6 +422,18 @@
break;
+ case RG_FREEZE:
+ error = svc_freeze(myname);
+ if (error != 0)
+ ret = RG_EFAIL;
+ break;
+
+ case RG_UNFREEZE:
+ error = svc_unfreeze(myname);
+ if (error != 0)
+ ret = RG_EFAIL;
+ break;
+
default:
printf("Unhandled request %d\n", req->rr_request);
ret = RG_NONE;
--- cluster/rgmanager/src/utils/clustat.c 2007/02/06 20:21:17 1.31
+++ cluster/rgmanager/src/utils/clustat.c 2007/04/27 18:10:10 1.32
@@ -416,7 +416,7 @@
_txt_rg_state(rg_state_t *rs, cluster_member_list_t *members, int flags)
{
char owner[31];
-
+ char flags_string[255] = "";
if (rs->rs_state == RG_STATE_STOPPED ||
rs->rs_state == RG_STATE_DISABLED ||
@@ -430,19 +430,34 @@
snprintf(owner, sizeof(owner), "%-.30s",
my_memb_id_to_name(members, rs->rs_owner));
}
- printf(" %-20.20s %-30.30s %-16.16s\n",
+ rg_flags_str(flags_string, sizeof(flags_string), rs->rs_flags, ", ");
+ printf(" %-20.20s %-30.30s %-16.16s ",
rs->rs_name,
owner,
rg_state_str(rs->rs_state));
+ if(strlen(flags_string))
+ printf ("%-30.30s\n", flags_string);
+ else
+ printf("\n");
}
void
_txt_rg_state_v(rg_state_t *rs, cluster_member_list_t *members, int flags)
{
+ char flags_string[255] = "";
+
+ rg_flags_str(flags_string, sizeof(flags_string), rs->rs_flags, ", ");
+
printf("Service Name : %s\n", rs->rs_name);
printf(" Current State : %s (%d)\n",
rg_state_str(rs->rs_state), rs->rs_state);
+ if (rs->rs_flags)
+ printf(" Flags : %s (%d)\n",
+ flags_string, rs->rs_flags);
+ else
+ printf(" Flags : none (%d)\n",
+ rs->rs_flags);
printf(" Owner : %s\n",
my_memb_id_to_name(members, rs->rs_owner));
printf(" Last Owner : %s\n",
@@ -466,6 +481,7 @@
xml_rg_state(rg_state_t *rs, cluster_member_list_t *members, int flags)
{
char time_str[32];
+ char flags_string[255] = "";
int x;
/* Chop off newlines */
@@ -477,12 +493,15 @@
}
}
- printf(" <group name=\"%s\" state=\"%d\" state_str=\"%s\" "
+ printf(" <group name=\"%s\" state=\"%d\" state_str=\"%s\""
+ " flags=\"%d\" flags_str=\"%s\""
" owner=\"%s\" last_owner=\"%s\" restarts=\"%d\""
" last_transition=\"%llu\" last_transition_str=\"%s\"/>\n",
rs->rs_name,
rs->rs_state,
rg_state_str(rs->rs_state),
+ rs->rs_flags,
+ rg_flags_str(flags_string, sizeof(flags_string), rs->rs_flags, " "),
my_memb_id_to_name(members, rs->rs_owner),
my_memb_id_to_name(members, rs->rs_last_owner),
rs->rs_restarts,
@@ -504,10 +523,10 @@
ret = -1;
if (!(flags & RG_VERBOSE)) {
- printf(" %-20.20s %-30.30s %-14.14s\n",
- "Service Name", "Owner (Last)", "State");
- printf(" %-20.20s %-30.30s %-14.14s\n",
- "------- ----", "----- ------", "-----");
+ printf(" %-20.20s %-30.30s %-16.16s %-30.30s\n",
+ "Service Name", "Owner (Last)", "State", "Flags");
+ printf(" %-20.20s %-30.30s %-16.16s %-30.30s\n",
+ "------- ----", "----- ------", "-----", "-----");
} else {
printf("Service Information\n"
"------- -----------\n\n");
--- cluster/rgmanager/src/utils/clusvcadm.c 2007/03/20 17:09:57 1.18
+++ cluster/rgmanager/src/utils/clusvcadm.c 2007/04/27 18:10:10 1.19
@@ -240,7 +240,7 @@
return 1;
}
- while ((opt = getopt(argc, argv, "lSue:M:d:r:n:m:vR:s:qh?")) != EOF) {
+ while ((opt = getopt(argc, argv, "lSue:M:d:r:n:m:vR:s:F:U:qh?")) != EOF) {
switch (opt) {
case 'l':
return do_lock();
@@ -294,6 +294,16 @@
case 'v':
printf("%s\n",PACKAGE_VERSION);
return 0;
+ case 'F':
+ actionstr = "freezing";
+ action = RG_FREEZE;
+ svcname = optarg;
+ break;
+ case 'U':
+ actionstr = "unfreezing";
+ action = RG_UNFREEZE;
+ svcname = optarg;
+ break;
case 'q':
close(STDOUT_FILENO);
break;
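With the rg_thread.c and clusvcadm.c hunks above in place, freezing becomes a user-visible operation. Assuming the usual clusvcadm invocation style (service name illustrative), the new switches would be used roughly as follows:

	clusvcadm -F user_service1	# freeze: set RG_FLAG_FROZEN on the group
	clusvcadm -U user_service1	# unfreeze: clear the flag again

While a group is frozen, the rg_state.c hunks above suggest that requests which would normally start or move the service are answered with RG_EFROZEN instead of being acted on.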
* [Cluster-devel] cluster/rgmanager ChangeLog include/resgroup.h ...
@ 2006-09-01 19:02 lhh
0 siblings, 0 replies; 11+ messages in thread
From: lhh @ 2006-09-01 19:02 UTC (permalink / raw)
To: cluster-devel.redhat.com
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: lhh at sourceware.org 2006-09-01 19:02:22
Modified files:
rgmanager : ChangeLog
rgmanager/include: resgroup.h vf.h
rgmanager/src/clulib: rg_strings.c vft.c
rgmanager/src/daemons: groups.c main.c
rgmanager/src/utils: clustat.c clusvcadm.c
Log message:
2006-09-01 Lon Hohberger <lhh@redhat.com>
* include/resgroup.h: Add proto for rg_strerror
* include/vf.h: Add proto for vf_invalidate (flushes vf cache)
* src/clulib/rg_strings.c: Add rg_strerror function, define
human-readable strings for rgmanager error values
* src/clulib/vft.c: Add vf_invalidate (separate from vf_shutdown)
* src/daemons/groups.c: Fix obvious logic error
* src/daemons/main.c: Fix rg_doall() message during loss of quorum.
Invalidate local VF cache and kill resource configurations on
loss of quorum (#202497). Send RG_EQUORUM back to clustat/clusvcadm
so that they report why they can't get information. Don't queue
status checks if we've lost quorum. Add command line parameter to
disable internal crash watchdog
* src/utils/clustat.c, clusvcadm.c: Handle SIGPIPE, and produce
useful errors if possible.
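Since rg_strerror() (added below in src/clulib/rg_strings.c) is a plain table lookup over static strings, client code can report any rgmanager error value uniformly. A minimal sketch of the intended call pattern; report_reply() is illustrative only:

#include <stdio.h>
#include <resgroup.h>

static void
report_reply(int err)
{
	/* rg_strerror() returns a static string ("Unknown" for
	 * unrecognized values), so the result must not be freed. */
	fprintf(stderr, "request failed: %s\n", rg_strerror(err));
}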
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&r1=1.22&r2=1.23
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&r1=1.13&r2=1.14
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/vf.h.diff?cvsroot=cluster&r1=1.5&r2=1.6
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/rg_strings.c.diff?cvsroot=cluster&r1=1.4&r2=1.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/vft.c.diff?cvsroot=cluster&r1=1.15&r2=1.16
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&r1=1.21&r2=1.22
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/main.c.diff?cvsroot=cluster&r1=1.30&r2=1.31
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/utils/clustat.c.diff?cvsroot=cluster&r1=1.19&r2=1.20
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/utils/clusvcadm.c.diff?cvsroot=cluster&r1=1.11&r2=1.12
--- cluster/rgmanager/ChangeLog 2006/08/21 15:14:08 1.22
+++ cluster/rgmanager/ChangeLog 2006/09/01 19:02:20 1.23
@@ -1,3 +1,25 @@
+2006-09-01 Lon Hohberger <lhh@redhat.com>
+ * include/resgroup.h: Add proto for rg_strerror
+ * include/vf.h: Add proto for vf_invalidate (flushes vf cache)
+ * src/clulib/rg_strings.c: Add rg_strerror function, define
+ human-readable strings for rgmanager error values
+ * src/clulib/vft.c: Add vf_invalidate (separate from vf_shutdown)
+ * src/daemons/groups.c: Fix obvious logic error
+ * src/daemons/main.c: Fix rg_doall() message during loss of quorum.
+ Invalidate local VF cache and kill resource configurations on
+ loss of quorum (#202497). Send RG_EQUORUM back to clustat/clusvcadm
+ so that they report why they can't get information. Don't queue
+ status checks if we've lost quorum. Add command line parameter to
+ disable internal crash watchdog
+ * src/utils/clustat.c, clusvcadm.c: Handle SIGPIPE, and produce
+ useful errors if possible.
+
+2006-08-31 Marek Grác <mgrac@redhat.com>
+ * src/daemons/restree.c: Fix #203720. Do not run backup copies (ends
+ with ~) of resource agents.
+ * src/resources/apache.*, mysql.*: Add Apache & MySQL resource agents
+ * src/resources/utils/*: Add utility scripts for resource agents
+
2006-08-21 Lon Hohberger <lhh@redhat.com>
* src/daemons/main.c: Fix #202500 - simultaneous starts confuse
rgmanager. This happened due to the fact that rgmanager was not
--- cluster/rgmanager/include/resgroup.h 2006/08/18 15:26:22 1.13
+++ cluster/rgmanager/include/resgroup.h 2006/09/01 19:02:21 1.14
@@ -174,6 +174,9 @@
#define RG_YES 1
#define RG_NO 2
+char *rg_strerror(int val);
+
+
/*
* Fail-over domain states
*/
--- cluster/rgmanager/include/vf.h 2006/07/12 14:04:06 1.5
+++ cluster/rgmanager/include/vf.h 2006/09/01 19:02:21 1.6
@@ -170,6 +170,7 @@
* VF Stuff. VF only talks to peers.
*/
int vf_init(int, uint16_t, vf_vote_cb_t, vf_commit_cb_t);
+int vf_invalidate(void);
int vf_shutdown(void);
/*
--- cluster/rgmanager/src/clulib/rg_strings.c 2006/07/11 23:52:41 1.4
+++ cluster/rgmanager/src/clulib/rg_strings.c 2006/09/01 19:02:22 1.5
@@ -16,6 +16,39 @@
Free Software Foundation, Inc., 675 Mass Ave, Cambridge,
MA 02139, USA.
*/
+#include <resgroup.h>
+
+struct { int val; char *str; } rg_error_strings[] = {
+ { RG_EQUORUM, "Operation requires quorum" },
+ { RG_EINVAL, "Invalid operation for resource" },
+ { RG_EDEPEND, "Operation violates dependency rule" },
+ { RG_EAGAIN, "Temporary failure; try again" },
+ { RG_EDEADLCK, "Operation would cause a deadlock" },
+ { RG_ENOSERVICE,"Service does not exist" },
+ { RG_EFORWARD, "Service not mastered locally" },
+ { RG_EABORT, "Aborted; service failed" },
+ { RG_EFAIL, "Failure" },
+ { RG_ESUCCESS, "Success" },
+ { RG_YES, "Yes" },
+ { RG_NO, "No" },
+ { 0, NULL }
+};
+
+
+char *rg_strerror(int err)
+{
+ int x;
+
+ for (x = 0; rg_error_strings[x].str != NULL; x++) {
+ if (rg_error_strings[x].val == err) {
+ return rg_error_strings[x].str;
+ }
+ }
+
+ return "Unknown";
+}
+
+
const char *rg_state_strings[] = {
"stopped",
"starting",
@@ -51,3 +84,4 @@
"user stop",
""
};
+
--- cluster/rgmanager/src/clulib/vft.c 2006/08/07 22:05:01 1.15
+++ cluster/rgmanager/src/clulib/vft.c 2006/09/01 19:02:22 1.16
@@ -935,22 +935,13 @@
}
-/**
- Shut down VF
- */
int
-vf_shutdown(void)
+vf_invalidate(void)
{
key_node_t *c_key;
view_node_t *c_jv;
commit_node_t *c_cn;
- pthread_mutex_lock(&vf_mutex);
- vf_thread_ready = 0;
- pthread_cancel(vf_thread);
- pthread_join(vf_thread, NULL);
- _port = 0;
- _node_id = (int)-1;
pthread_mutex_lock(&key_list_mutex);
while ((c_key = key_list) != NULL) {
@@ -974,6 +965,29 @@
}
pthread_mutex_unlock(&key_list_mutex);
+ return 0;
+}
+
+
+/**
+ Shut down VF
+ */
+int
+vf_shutdown(void)
+{
+ key_node_t *c_key;
+ view_node_t *c_jv;
+ commit_node_t *c_cn;
+
+ pthread_mutex_lock(&vf_mutex);
+ vf_thread_ready = 0;
+ pthread_cancel(vf_thread);
+ pthread_join(vf_thread, NULL);
+ _port = 0;
+ _node_id = (int)-1;
+
+ vf_invalidate();
+
pthread_mutex_unlock(&vf_mutex);
return 0;
--- cluster/rgmanager/src/daemons/groups.c 2006/08/18 15:26:22 1.21
+++ cluster/rgmanager/src/daemons/groups.c 2006/09/01 19:02:22 1.22
@@ -273,7 +273,7 @@
* local start.
*/
if (svcStatus->rs_state == RG_STATE_STARTED &&
- svcStatus->rs_owner == mp->cn_nodeid)
+ svcStatus->rs_owner != mp->cn_nodeid)
return;
if (svcStatus->rs_state == RG_STATE_DISABLED)
--- cluster/rgmanager/src/daemons/main.c 2006/08/21 15:14:09 1.30
+++ cluster/rgmanager/src/daemons/main.c 2006/09/01 19:02:22 1.31
@@ -123,7 +123,13 @@
rg_set_inquorate();
member_list_update(NULL);/* Clear member list */
rg_lockall(L_SYS);
- rg_doall(RG_INIT, 1, "Emergency stop of %s");
+ rg_doall(RG_INIT, 1, "Emergency stop of %s\n");
+#ifndef USE_OPENAIS
+ clulog(LOG_DEBUG, "Invalidating local VF cache\n");
+ vf_invalidate();
+#endif
+ clulog(LOG_DEBUG, "Flushing resource group cache\n");
+ kill_resource_groups();
rg_set_uninitialized();
return -1;
} else if (!rg_quorate()) {
@@ -131,7 +137,7 @@
rg_set_quorate();
rg_unlockall(L_SYS);
rg_unlockall(L_USER);
- clulog(LOG_NOTICE, "Quorum Formed\n");
+ clulog(LOG_NOTICE, "Quorum Regained\n");
}
old_membership = member_list();
@@ -562,7 +568,7 @@
case M_STATECHANGE:
msg_receive(ctx, NULL, 0, 0);
clulog(LOG_DEBUG, "Membership Change Event\n");
- if (rg_quorate() && running) {
+ if (running) {
rg_unlockall(L_SYS);
membership_update();
}
@@ -644,6 +650,7 @@
}
if (!rg_initialized()) {
+ msg_send_simple(newctx, RG_FAIL, RG_EQUORUM, 0);
msg_close(newctx);
msg_free_ctx(newctx);
continue;
@@ -651,6 +658,7 @@
if (!rg_quorate()) {
printf("Dropping connect: NO QUORUM\n");
+ msg_send_simple(newctx, RG_FAIL, RG_EQUORUM, 0);
msg_close(newctx);
msg_free_ctx(newctx);
}
@@ -668,7 +676,7 @@
return 0;
/* No new messages. Drop in the status check requests. */
- if (n == 0) {
+ if (n == 0 && rg_quorate()) {
do_status_checks();
return 0;
}
@@ -805,15 +813,18 @@
main(int argc, char **argv)
{
int rv;
- char foreground = 0;
+ char foreground = 0, wd = 1;
cman_node_t me;
msgctx_t *cluster_ctx;
msgctx_t *local_ctx;
pthread_t th;
cman_handle_t clu = NULL;
- while ((rv = getopt(argc, argv, "fd")) != EOF) {
+ while ((rv = getopt(argc, argv, "wfd")) != EOF) {
switch (rv) {
+ case 'w':
+ wd = 0;
+ break;
case 'd':
debug = 1;
break;
@@ -834,7 +845,7 @@
if (!foreground && (geteuid() == 0)) {
daemon_init(argv[0]);
- if (!debug && !watchdog_init())
+ if (wd && !debug && !watchdog_init())
clulog(LOG_NOTICE, "Failed to start watchdog\n");
}
--- cluster/rgmanager/src/utils/clustat.c 2006/08/07 22:05:01 1.19
+++ cluster/rgmanager/src/utils/clustat.c 2006/09/01 19:02:22 1.20
@@ -10,6 +10,7 @@
#include <termios.h>
#include <ccs.h>
#include <libcman.h>
+#include <signal.h>
#ifdef HAVE_CONFIG_H
#include <config.h>
@@ -46,7 +47,7 @@
rg_state_list(int local_node_id, int fast)
{
msgctx_t ctx;
- int max, n, x;
+ int max = 0, n, x;
rg_state_list_t *rsl = NULL;
generic_msg_hdr *msgp = NULL;
rg_state_msg_t *rsmp = NULL;
@@ -91,6 +92,7 @@
}
n = msg_receive_simple(&ctx, &msgp, tv.tv_sec);
+
if (n < 0) {
if (errno == EAGAIN)
continue;
@@ -109,6 +111,13 @@
swab_generic_msg_hdr(msgp);
+ if (msgp->gh_command == RG_FAIL) {
+ printf("Service states unavailable: %s\n",
+ rg_strerror(msgp->gh_arg1));
+ msg_close(&ctx);
+ return NULL;
+ }
+
if (msgp->gh_command == RG_SUCCESS) {
free(msgp);
break;
@@ -736,6 +745,8 @@
return 1;
}
+ signal(SIGPIPE, SIG_IGN);
+
/* Connect & grab all our info */
ch = cman_init(NULL);
--- cluster/rgmanager/src/utils/clusvcadm.c 2006/08/09 21:48:34 1.11
+++ cluster/rgmanager/src/utils/clusvcadm.c 2006/09/01 19:02:22 1.12
@@ -31,6 +31,7 @@
#include <libcman.h>
#include <resgroup.h>
#include <msgsimple.h>
+#include <signal.h>
#ifdef HAVE_CONFIG_H
#include <config.h>
@@ -187,6 +188,7 @@
msgctx_t ctx;
cman_handle_t ch;
SmMessageSt msg;
+ generic_msg_hdr *h = (generic_msg_hdr *)&msg;
int action = RG_STATUS;
int node_specified = 0;
int me, svctarget = 0;
@@ -274,6 +276,8 @@
svcname = realsvcname;
}
+ signal(SIGPIPE, SIG_IGN);
+
/* No login */
ch = cman_init(NULL);
if (!ch) {
@@ -320,48 +324,23 @@
return 1;
}
- opt = msg_send(&ctx, &msg, sizeof(msg));
-
- if (opt < sizeof(msg)) {
- perror("msg_send");
- fprintf(stderr, "Could not send entire message!\n");
- return 1;
- }
+ msg_send(&ctx, &msg, sizeof(msg));
- if (msg_receive(&ctx, &msg, sizeof(msg), 0) != sizeof(msg)) {
+ /* Reusing opt here */
+ if ((opt = msg_receive(&ctx, &msg, sizeof(msg), 0)) < sizeof(*h)) {
perror("msg_receive");
fprintf(stderr, "Error receiving reply!\n");
return 1;
}
/* Decode */
- swab_SmMessageSt(&msg);
- switch (msg.sm_data.d_ret) {
- case RG_ESUCCESS:
- printf("success\n");
- break;
- case RG_EFAIL:
- printf("failed\n");
- break;
- case RG_EABORT:
- printf("cancelled by resource manager\n");
- break;
- case RG_ENOSERVICE:
- printf("failed: Service does not exist\n");
- break;
- case RG_EDEADLCK:
- printf("failed: Operation would deadlock\n");
- break;
- case RG_EAGAIN:
- printf("failed: Try again (resource groups locked)\n");
- break;
- case RG_EDEPEND:
- printf("failed: Operation would break dependency\n");
- break;
- default:
- printf("failed: unknown reason %d\n", msg.sm_data.d_ret);
- break;
+ if (opt < sizeof(msg)) {
+ swab_generic_msg_hdr(h);
+ printf("%s\n", rg_strerror(h->gh_arg1));
+ return h->gh_arg1;
}
+ swab_SmMessageSt(&msg);
+ printf("%s\n", rg_strerror(msg.sm_data.d_ret));
return msg.sm_data.d_ret;
}
* [Cluster-devel] cluster/rgmanager ChangeLog include/resgroup.h ...
@ 2006-08-18 15:26 lhh
0 siblings, 0 replies; 11+ messages in thread
From: lhh @ 2006-08-18 15:26 UTC (permalink / raw)
To: cluster-devel.redhat.com
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: lhh at sourceware.org 2006-08-18 15:26:23
Modified files:
rgmanager : ChangeLog
rgmanager/include: resgroup.h
rgmanager/src/clulib: ckpt_state.c
rgmanager/src/daemons: groups.c main.c rg_state.c
rgmanager/src/resources: clusterfs.sh fs.sh nfsclient.sh
ra-api-1-modified.dtd script.sh
Log message:
2006-08-18 Lon Hohberger <lhh@redhat.com>
* include/resgroup.h: Change ordering and add magic field to
rgmanager state field (warning: breaks compatibility from 08/08 CVS!)
* src/clulib/ckpt_state.c, src/daemons/rg_state.c: Fix bug
preventing correct operation of ckpt operation after initial boot.
Get rid of debug info.
* src/daemons/groups.c, main.c: Fix #202499 - shutdown while handling
transitions sometimes allows services to restart (due to not locking
RGs locally)
* src/resources/clusterfs.sh, fs.sh, nfsclient.sh: Add proper
warning messages if status check fails
* src/resources/ra-api-1-modified.dtd: Allow 'migrate' option
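Because this change reorders rg_state_t and adds an rs_magic field, a receiver can now detect state blocks written with an incompatible layout. A sketch of the assumed decode step, using only the swab_rg_state_t() macro and RG_MAGIC constant from the resgroup.h diff below; decode_rg_state() itself is hypothetical:

int
decode_rg_state(rg_state_t *rs)
{
	swab_rg_state_t(rs);		/* fix byte order in place */
	if (rs->rs_magic != RG_MAGIC)	/* stale or foreign layout */
		return -1;
	return 0;
}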
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&r1=1.19&r2=1.20
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&r1=1.12&r2=1.13
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/ckpt_state.c.diff?cvsroot=cluster&r1=1.1&r2=1.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&r1=1.20&r2=1.21
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/main.c.diff?cvsroot=cluster&r1=1.28&r2=1.29
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&r1=1.19&r2=1.20
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/clusterfs.sh.diff?cvsroot=cluster&r1=1.10&r2=1.11
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/fs.sh.diff?cvsroot=cluster&r1=1.16&r2=1.17
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/nfsclient.sh.diff?cvsroot=cluster&r1=1.12&r2=1.13
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/ra-api-1-modified.dtd.diff?cvsroot=cluster&r1=1.3&r2=1.4
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/script.sh.diff?cvsroot=cluster&r1=1.7&r2=1.8
--- cluster/rgmanager/ChangeLog 2006/08/09 21:48:34 1.19
+++ cluster/rgmanager/ChangeLog 2006/08/18 15:26:21 1.20
@@ -1,3 +1,16 @@
+2006-08-18 Lon Hohberger <lhh@redhat.com>
+ * include/resgroup.h: Change ordering and add magic field to
+ rgmanager state field (warning: breaks compatibility from 08/08 CVS!)
+ * src/clulib/ckpt_state.c, src/daemons/rg_state.c: Fix bug
+ preventing correct operation of ckpt operation after initial boot.
+ Get rid of debug info.
+ * src/daemons/groups.c, main.c: Fix #202499 - shutdown while handling
+ transitions sometimes allows services to restart (due to not locking
+ RGs locally)
+ * src/resources/clusterfs.sh, fs.sh, nfsclient.sh: Add proper
+ warning messages if status check fails
+ * src/resources/ra-api-1-modified.dtd: Allow 'migrate' option
+
2006-08-08 Lon Hohberger <lhh@redhat.com>
* src/clulib/members.c: Fix gained/lost list creation so that the
count is actually nonzero (#201713)
--- cluster/rgmanager/include/resgroup.h 2006/07/19 18:43:32 1.12
+++ cluster/rgmanager/include/resgroup.h 2006/08/18 15:26:22 1.13
@@ -27,31 +27,30 @@
*/
typedef struct {
char rs_name[64]; /**< Service name */
+ uint32_t rs_id; /**< Service ID */
+ uint32_t rs_magic; /**< Magic ID */
uint32_t rs_owner; /**< Member ID running service. */
uint32_t rs_last_owner; /**< Last member to run the service. */
uint32_t rs_state; /**< State of service. */
uint32_t rs_restarts; /**< Number of cluster-induced
restarts */
uint64_t rs_transition; /**< Last service transition time */
- uint32_t rs_id; /**< Service ID */
- uint32_t rs_pad; /**< pad to 64-bit boundary */
} rg_state_t;
#define swab_rg_state_t(ptr) \
{\
+ swab32((ptr)->rs_id);\
+ swab32((ptr)->rs_magic);\
swab32((ptr)->rs_owner);\
swab32((ptr)->rs_last_owner);\
swab32((ptr)->rs_state);\
swab32((ptr)->rs_restarts);\
swab64((ptr)->rs_transition);\
- swab32((ptr)->rs_pad);\
}
#define RG_PORT 177
-#define RG_VF_PORT 178
-#define RG_PURPOSE 0x11398fed
-#define RG_SERVICE_GROUP "usrm::manager"
+#define RG_MAGIC 0x11398fed
#define RG_ACTION_REQUEST /* Message header */ 0x138582
#define RG_EVENT 0x138583
--- cluster/rgmanager/src/clulib/ckpt_state.c 2006/08/07 22:05:01 1.1
+++ cluster/rgmanager/src/clulib/ckpt_state.c 2006/08/18 15:26:22 1.2
@@ -75,8 +75,10 @@
{
SaCkptCheckpointCreationAttributesT attrs;
SaCkptCheckpointOpenFlagsT flags;
+#if 0
SaCkptCheckpointDescriptorT status;
- SaAisErrorT err;
+#endif
+ SaAisErrorT err = SA_AIS_OK;
key_node_t *newnode = NULL;
newnode = kn_find_key(keyid);
@@ -111,6 +113,7 @@
&newnode->kn_cph);
if (err == SA_AIS_OK) {
+#if 0
saCkptCheckpointStatusGet(newnode->kn_cph,
&status);
@@ -141,12 +144,10 @@
(int)status.checkpointCreationAttributes.maxSectionIdSize);
printf("Section count = %d\n", status.numberOfSections);
printf("\n");
-
+#endif
goto good;
}
- printf("Retrying w/ create\n");
-
attrs.creationFlags = SA_CKPT_WR_ALL_REPLICAS;
attrs.checkpointSize = (SaSizeT)maxsize;
attrs.retentionDuration = SA_TIME_ONE_HOUR;
@@ -175,7 +176,9 @@
newnode->kn_ready = 1;
newnode->kn_next = key_list;
key_list = newnode;
+#if 0
printf("Opened ckpt %s\n", keyid);
+#endif
return err;
}
--- cluster/rgmanager/src/daemons/groups.c 2006/07/19 18:43:32 1.20
+++ cluster/rgmanager/src/daemons/groups.c 2006/08/18 15:26:22 1.21
@@ -418,7 +418,7 @@
int ret;
if (rg_locked()) {
- clulog(LOG_NOTICE,
+ clulog(LOG_DEBUG,
"Resource groups locked; not evaluating\n");
return -EAGAIN;
}
--- cluster/rgmanager/src/daemons/main.c 2006/08/09 21:48:34 1.28
+++ cluster/rgmanager/src/daemons/main.c 2006/08/18 15:26:22 1.29
@@ -792,6 +792,7 @@
void *
shutdown_thread(void *arg)
{
+ rg_lockall(L_SYS);
rg_doall(RG_STOP_EXITING, 1, NULL);
running = 0;
--- cluster/rgmanager/src/daemons/rg_state.c 2006/08/07 22:05:01 1.19
+++ cluster/rgmanager/src/daemons/rg_state.c 2006/08/18 15:26:22 1.20
@@ -306,11 +306,12 @@
if (errno == ENOENT) {
ds_key_init(res, DS_MIN_SIZE, 10);
} else {
+ perror("ds_read");
return -1;
}
}
- if (datalen < 0) {
+ if (datalen <= 0) {
ret = init_rg(name, svcblk);
if (ret < 0) {
@@ -326,6 +327,7 @@
}
memcpy(svcblk, data, sizeof(*svcblk));
+
return 0;
#else
membership = member_list();
--- cluster/rgmanager/src/resources/clusterfs.sh 2006/06/02 17:37:10 1.10
+++ cluster/rgmanager/src/resources/clusterfs.sh 2006/08/18 15:26:22 1.11
@@ -889,12 +889,16 @@
;;
status|monitor)
isMounted ${OCF_RESKEY_device} ${OCF_RESKEY_mountpoint}
- [ $? -ne $YES ] && exit $OCF_ERR_GENERIC
+ if [ $? -ne $YES ]; then
+ ocf_log err "fs:${OCF_RESKEY_name}: ${OCF_RESKEY_device} is not mounted on ${OCF_RESKEY_mountpoint}"
+ exit $OCF_ERR_GENERIC
+ fi
isAlive ${OCF_RESKEY_mountpoint}
- [ $? -ne $YES ] && exit $OCF_ERR_GENERIC
-
- exit 0
+ [ $? -eq $YES ] && exit 0
+
+ ocf_log err "fs:${OCF_RESKEY_name}: Mount point is not accessible!"
+ exit $OCF_ERR_GENERIC
;;
restart)
stopFilesystem
--- cluster/rgmanager/src/resources/fs.sh 2006/06/02 17:37:10 1.16
+++ cluster/rgmanager/src/resources/fs.sh 2006/08/18 15:26:22 1.17
@@ -243,7 +243,7 @@
{
if [ -z "$OCF_RESKEY_mountpoint" ]; then
ocf_log err "No mount point specified."
- return 1
+ return $OCF_ERR_ARGS
fi
if ! [ -e "$OCF_RESKEY_mountpoint" ]; then
@@ -514,7 +514,7 @@
dev=$(real_device $1)
if [ -z "$dev" ]; then
ocf_log err \
- "isMounted: Could not match $1 with a real device"
+ "fs (isMounted): Could not match $1 with a real device"
return $FAIL
fi
mp=$2
@@ -553,14 +553,14 @@
declare rw
if [ $# -ne 1 ]; then
- logAndPrint $LOG_ERR "Usage: isAlive mount_point"
+ ocf_log err "Usage: isAlive mount_point"
return $FAIL
fi
mount_point=$1
test -d $mount_point
if [ $? -ne 0 ]; then
- logAndPrint $LOG_ERR "$mount_point is not a directory"
+ ocf_log err "fs (isAlive): $mount_point is not a directory"
return $FAIL
fi
@@ -707,6 +707,7 @@
return $ret
}
+
activeMonitor() {
declare monpath=$OCF_RESKEY_mountpoint/.clumanager
declare p
@@ -733,7 +734,7 @@
case $1 in
start)
ocf_log info "Starting active monitoring of $OCF_RESKEY_mountpoint"
- mkdir -p $(dirname $monpath) || return 1
+ mkdir -p $(dirname $monpath) || return $OCF_ERR_GENERIC
devmon $args -p $monpath/devmon.data -P $monpath/devmon.pid
;;
stop)
@@ -794,7 +795,7 @@
if [ -z "`which quotaon`" ]; then
ocf_log err "quotaon not found in $PATH"
- return 1
+ return $OCF_ERR_GENERIC
fi
for mopt in `echo $opts | sed -e s/,/\ /g`; do
@@ -1211,29 +1212,35 @@
;;
status|monitor)
isMounted ${OCF_RESKEY_device} ${OCF_RESKEY_mountpoint}
- [ $? -ne $YES ] && exit $OCF_ERR_GENERIC
+ if [ $? -ne $YES ]; then
+ ocf_log err "fs:${OCF_RESKEY_name}: ${OCF_RESKEY_device} is not mounted on ${OCF_RESKEY_mountpoint}"
+ exit $OCF_ERR_GENERIC
+ fi
if [ "$OCF_RESKEY_active_monitor" = "yes" ] ||
[ "$OCF_RESKEY_active_monitor" = "1" ]; then
- activeMonitor status || exit $OCF_ERR_GENERIC
- exit 0
+ activeMonitor status
+ [ $? -eq 0 ] && exit 0
+ ocf_log err "fs:${OCF_RESKEY_name}: Active Monitoring reported a failure"
+ exit $OCF_ERR_GENERIC
fi
isAlive ${OCF_RESKEY_mountpoint}
- [ $? -ne $YES ] && exit $OCF_ERR_GENERIC
-
- exit 0
+ [ $? -eq $YES ] && exit 0
+
+ ocf_log err "fs:${OCF_RESKEY_name}: Mount point is not accessible!"
+ exit $OCF_ERR_GENERIC
;;
restart)
stopFilesystem
if [ $? -ne 0 ]; then
- exit 1
+ exit $OCF_ERR_GENERIC
fi
startFilesystem
if [ $? -ne 0 ]; then
- exit 1
+ exit $OCF_ERR_GENERIC
fi
exit 0
--- cluster/rgmanager/src/resources/nfsclient.sh 2006/08/02 17:24:31 1.12
+++ cluster/rgmanager/src/resources/nfsclient.sh 2006/08/18 15:26:22 1.13
@@ -320,7 +320,11 @@
sed -e 's/*/[*]/g' -e 's/?/[?]/g' -e 's/\./\\./g')
exportfs -v | tr -d "\n" | sed -e 's/([^)]*)/\n/g' | grep -q \
"^${OCF_RESKEY_path}[\t ]*.*${OCF_RESKEY_target_regexp}"
+
rv=$?
+ if [ $rv -ne 0 ]; then
+ ocf_log err "nfsclient:$OCF_RESKEY_name is missing!"
+ fi
;;
recover)
--- cluster/rgmanager/src/resources/ra-api-1-modified.dtd 2006/07/19 18:43:32 1.3
+++ cluster/rgmanager/src/resources/ra-api-1-modified.dtd 2006/08/18 15:26:22 1.4
@@ -25,7 +25,8 @@
primary (1|0) "0"
required (1|0) "0"
inherit CDATA ""
- unique (1|0) "0">
+ unique (1|0) "0"
+ reconfig (1|0) "0">
<!ELEMENT longdesc ANY>
<!ATTLIST longdesc
@@ -42,7 +43,7 @@
<!ELEMENT action EMPTY>
<!ATTLIST action
- name (start|stop|recover|status|monitor|reload|meta-data|verify-all|migrate) #REQUIRED
+ name (start|stop|recover|status|reconfig|monitor|reload|meta-data|verify-all|migrate) #REQUIRED
timeout CDATA #REQUIRED
interval CDATA #IMPLIED
start-delay CDATA #IMPLIED
--- cluster/rgmanager/src/resources/script.sh 2005/11/21 21:48:42 1.7
+++ cluster/rgmanager/src/resources/script.sh 2006/08/18 15:26:23 1.8
@@ -110,4 +110,10 @@
# Don't need to catch return codes; this one will work.
ocf_log info "Executing ${OCF_RESKEY_file} $1"
-exec /bin/sh ${OCF_RESKEY_file} $1
+${OCF_RESKEY_file} $1
+
+declare -i rv=$?
+if [ $rv -ne 0 ]; then
+ ocf_log err "script:$OCF_RESKEY_name: $1 of $OCF_RESKEY_file failed (returned $rv)"
+ exit $OCF_ERR_GENERIC
+fi