From: lhh@sourceware.org <lhh@sourceware.org>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] cluster/rgmanager ChangeLog include/reslist.h ...
Date: 2 Aug 2007 14:47:46 -0000 [thread overview]
Message-ID: <20070802144746.19944.qmail@sourceware.org> (raw)
CVSROOT: /cvs/cluster
Module name: cluster
Branch: RHEL51
Changes by: lhh at sourceware.org 2007-08-02 14:47:45
Modified files:
rgmanager : ChangeLog
rgmanager/include: reslist.h
rgmanager/src/daemons: groups.c main.c nodeevent.c restree.c
rg_forward.c rg_state.c
rgmanager/src/resources: vm.sh
Log message:
Fix #248727, round 2
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&only_with_tag=RHEL51&r1=1.31.2.19.2.2&r2=1.31.2.19.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/reslist.h.diff?cvsroot=cluster&only_with_tag=RHEL51&r1=1.15.2.4.2.1&r2=1.15.2.4.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&only_with_tag=RHEL51&r1=1.25.2.9.2.2&r2=1.25.2.9.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/main.c.diff?cvsroot=cluster&only_with_tag=RHEL51&r1=1.34.2.6.2.1&r2=1.34.2.6.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/nodeevent.c.diff?cvsroot=cluster&only_with_tag=RHEL51&r1=1.4.2.3.2.1&r2=1.4.2.3.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/restree.c.diff?cvsroot=cluster&only_with_tag=RHEL51&r1=1.23.2.8.2.1&r2=1.23.2.8.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_forward.c.diff?cvsroot=cluster&only_with_tag=RHEL51&r1=1.8.2.1.2.1&r2=1.8.2.1.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&only_with_tag=RHEL51&r1=1.24.2.10.2.1&r2=1.24.2.10.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/vm.sh.diff?cvsroot=cluster&only_with_tag=RHEL51&r1=1.1.2.4.2.2&r2=1.1.2.4.2.3
--- cluster/rgmanager/ChangeLog 2007/07/31 17:56:10 1.31.2.19.2.2
+++ cluster/rgmanager/ChangeLog 2007/08/02 14:47:45 1.31.2.19.2.3
@@ -1,3 +1,16 @@
+2007-08-02 Lon Hohberger <lhh@redhat.com>
+ * general: More fixes around #248727
+ * include/reslist.h, src/daemons/restree.c: Make last-value be
+ returned for resources which have been checked recently
+ * src/daemons/groups.c: Make VMs use migrate semantics instead of
+ relocate semantics when employing failover domain rules
+ * src/daemons/nodeevent.c: Fix VMs ending up on wrong nodes when
+ simultaneous boot occurs
+ * src/daemons/rg_forward.c: Fix erroneous timeout
+ * src/daemons/rg_state.c: Handle RG_STATE_MIGRATE in svc_advise_*
+ Handle certain migration failures.
+ * src/resources/vm.sh: Handle certain migration failures
+
2007-07-31 Lon Hohberger <lhh@redhat.com>
* general: Make VMs not change state when added/removed from the
cluster config or bounce services/VMs when minor config changes
--- cluster/rgmanager/include/reslist.h 2007/07/31 17:56:10 1.15.2.4.2.1
+++ cluster/rgmanager/include/reslist.h 2007/08/02 14:47:45 1.15.2.4.2.2
@@ -128,6 +128,10 @@
resource_act_t *rn_actions;
int rn_state; /* State of this instance of rn_resource */
int rn_flags;
+ int rn_last_status;
+ int rn_last_depth;
+ int rn_checked;
+ int rn_pad;
} resource_node_t;
typedef struct _fod_node {
--- cluster/rgmanager/src/daemons/groups.c 2007/07/31 17:56:10 1.25.2.9.2.2
+++ cluster/rgmanager/src/daemons/groups.c 2007/08/02 14:47:45 1.25.2.9.2.3
@@ -500,13 +500,14 @@
consider_relocate(char *svcName, rg_state_t *svcStatus, uint32_t nodeid,
cluster_member_list_t *membership)
{
- int a, b;
+ int a, b, req = RG_RELOCATE;
/*
Service must be running locally in order to consider for
a relocate
*/
- if (svcStatus->rs_state != RG_STATE_STARTED ||
+ if ((svcStatus->rs_state != RG_STATE_STARTING &&
+ svcStatus->rs_state != RG_STATE_STARTED) ||
svcStatus->rs_owner != my_id())
return;
@@ -526,11 +527,16 @@
if (a <= b)
return;
- clulog(LOG_DEBUG, "Relocating group %s to better node %s\n",
+ if (group_migratory(svcName, 1)) {
+ req = RG_MIGRATE;
+ }
+
+ clulog(LOG_NOTICE, "%s %s to better node %s\n",
+ req==RG_MIGRATE ? "Migrating":"Relocating",
svcName,
memb_id_to_name(membership, nodeid));
- rt_enqueue_request(svcName, RG_RELOCATE, NULL, 0, nodeid, 0, 0);
+ rt_enqueue_request(svcName, req, NULL, 0, nodeid, 0, 0);
}
--- cluster/rgmanager/src/daemons/main.c 2007/07/24 18:49:18 1.34.2.6.2.1
+++ cluster/rgmanager/src/daemons/main.c 2007/08/02 14:47:45 1.34.2.6.2.2
@@ -43,7 +43,7 @@
#ifdef WRAP_THREADS
void dump_thread_states(FILE *);
#endif
-int configure_logging(int ccsfd, int debug);
+int configure_rgmanager(int ccsfd, int debug);
void node_event(int, int, int, int);
void node_event_q(int, int, int, int);
@@ -730,7 +730,7 @@
if (need_reconfigure || check_config_update()) {
need_reconfigure = 0;
- configure_logging(-1, 0);
+ configure_rgmanager(-1, 0);
init_resource_groups(1);
return 0;
}
@@ -789,7 +789,7 @@
* Configure logging based on data in cluster.conf
*/
int
-configure_logging(int ccsfd, int dbg)
+configure_rgmanager(int ccsfd, int dbg)
{
char *v;
char internal = 0;
@@ -812,6 +812,12 @@
free(v);
}
+ if (ccs_get(ccsfd, "/cluster/rm/@transition_throttling", &v) == 0) {
+ if (!dbg)
+ set_transition_throttling(atoi(v));
+ free(v);
+ }
+
if (internal)
ccs_disconnect(ccsfd);
@@ -956,7 +962,7 @@
We know we're quorate. At this point, we need to
read the resource group trees from ccsd.
*/
- configure_logging(-1, debug);
+ configure_rgmanager(-1, debug);
clulog(LOG_NOTICE, "Resource Group Manager Starting\n");
if (init_resource_groups(0) != 0) {
--- cluster/rgmanager/src/daemons/nodeevent.c 2007/07/24 18:49:18 1.4.2.3.2.1
+++ cluster/rgmanager/src/daemons/nodeevent.c 2007/08/02 14:47:45 1.4.2.3.2.2
@@ -42,6 +42,7 @@
#endif
static nevent_t *event_queue = NULL;
static pthread_t ne_thread = 0;
+static int transition_throttling = 5;
int ne_queue_request(int local, int nodeid, int state);
void hard_exit(void);
@@ -53,6 +54,15 @@
extern int shutdown_pending;
+void
+set_transition_throttling(int nsecs)
+{
+ if (nsecs < 0)
+ nsecs = 0;
+ transition_throttling = nsecs;
+}
+
+
/**
Called to handle the transition of a cluster member from up->down or
down->up. This handles initializing services (in the local node-up case),
@@ -88,11 +98,16 @@
if (shutdown_pending) {
clulog(LOG_NOTICE, "Processing delayed exit signal\n");
running = 0;
+ return;
}
setup_signal(SIGINT, flag_shutdown);
setup_signal(SIGTERM, flag_shutdown);
setup_signal(SIGHUP, flag_reconfigure);
+ /* Let things settle if we're booting multiple */
+ if (transition_throttling)
+ sleep(transition_throttling);
+
eval_groups(1, nodeID, 1);
return;
}
--- cluster/rgmanager/src/daemons/restree.c 2007/07/31 17:56:10 1.23.2.8.2.1
+++ cluster/rgmanager/src/daemons/restree.c 2007/08/02 14:47:45 1.23.2.8.2.2
@@ -665,8 +665,10 @@
}
}
/* No resource rule matching the child? Press on... */
- if (!flags)
+ if (!flags) {
+ free(ref);
continue;
+ }
flags = 0;
/* Don't descend on anything we should have already picked
@@ -686,11 +688,9 @@
break;
}
- if (flags == 2) {
- free(ref);
- continue;
- }
free(ref);
+ if (flags == 2)
+ continue;
x = 1;
switch(do_load_resource(ccsfd, tok, childrule, tree,
@@ -1035,12 +1035,21 @@
}
/* No check levels ready at the moment. */
- if (idx == -1)
+ if (idx == -1) {
+ if (node->rn_checked)
+ return node->rn_last_status;
return 0;
+ }
- node->rn_actions[idx].ra_last = now;
- if ((x = res_exec(node, RS_STATUS, NULL,
- node->rn_actions[idx].ra_depth)) == 0)
+
+ node->rn_actions[idx].ra_last = now;
+ x = res_exec(node, RS_STATUS, NULL, node->rn_actions[idx].ra_depth);
+
+ node->rn_last_status = x;
+ node->rn_last_depth = node->rn_actions[idx].ra_depth;
+ node->rn_checked = 1;
+
+ if (x == 0)
return 0;
if (!has_recover)
@@ -1101,14 +1110,18 @@
now = res->r_started;
- for (; node->rn_actions[x].ra_name; x++) {
+ for (; node->rn_actions[x].ra_name; x++) {
- if (strcmp(node->rn_actions[x].ra_name, "monitor") &&
- strcmp(node->rn_actions[x].ra_name, "status"))
+ if (strcmp(node->rn_actions[x].ra_name, "monitor") &&
+ strcmp(node->rn_actions[x].ra_name, "status"))
continue;
- node->rn_actions[x].ra_last = now;
+ node->rn_actions[x].ra_last = now;
}
+
+ node->rn_checked = 0;
+ node->rn_last_status = 0;
+ node->rn_last_depth = 0;
}
--- cluster/rgmanager/src/daemons/rg_forward.c 2007/07/24 18:49:18 1.8.2.1.2.1
+++ cluster/rgmanager/src/daemons/rg_forward.c 2007/08/02 14:47:45 1.8.2.1.2.2
@@ -122,10 +122,12 @@
m = NULL;
continue;
}
- goto out_fail;
+
+ if (ret == 0)
+ continue;
}
break;
- } while(++retries < 60); /* old 60 second rule */
+ } while(++retries < 60); /* old 600 second rule */
swab_SmMessageSt(&msg);
--- cluster/rgmanager/src/daemons/rg_state.c 2007/07/24 18:49:18 1.24.2.10.2.1
+++ cluster/rgmanager/src/daemons/rg_state.c 2007/08/02 14:47:45 1.24.2.10.2.2
@@ -35,6 +35,7 @@
#include <ccs.h>
#include <rg_queue.h>
#include <msgsimple.h>
+#include <res-ocf.h>
/* XXX - copied :( */
#define cn_svccount cn_address.cna_address[0] /* Theses are uint8_t size */
@@ -467,6 +468,7 @@
case RG_STATE_CHECK:
case RG_STATE_STARTING:
case RG_STATE_RECOVER:
+ case RG_STATE_MIGRATE:
if ((svcStatus->rs_owner != my_id()) &&
memb_online(membership, svcStatus->rs_owner)) {
/*
@@ -583,6 +585,10 @@
"#43: Service %s has failed; can not start.\n",
svcName);
break;
+
+ case RG_STATE_MIGRATE:
+ ret = 4;
+ break;
case RG_STATE_STOPPING:
case RG_STATE_STARTED:
@@ -892,16 +898,60 @@
ret = group_migrate(svcName, target);
- if (ret == -1 || ret > 0) {
+ switch(ret) {
+ default:
+ case -1:
+ case OCF_RA_ERROR:
+ svc_fail(svcName);
/* XXX run svc_status again here to see if it's still
healthy; if it is, don't FAIL it; it could be that
the target node simply died; in this case, set status
back to started */
- /* if ret > 0 { svc_status... */
- svc_fail(svcName);
+ return RG_EFAIL;
+ break;
+ case OCF_RA_NOT_RUNNING:
+ /* For these two, the VM was either not running or
+ migration is simply impossible. */
+ /* Don't mark the service as failed; since it's either
+ recoverable or still running. */
ret = RG_EFAIL;
+ break;
+ case OCF_RA_NOT_CONFIGURED:
+ ret = RG_EINVAL;
+ break;
+ case 0:
+ return 0;
}
+ /* Ok, we've hit a recoverable condition. Since VMs and migratory
+ services are ... well, migratable, we can just flip the state
+ back to 'started' and error checking will fix it later. */
+ if (rg_lock(svcName, &lockp) < 0) {
+ clulog(LOG_ERR, "#45: Unable to obtain cluster lock: %s\n",
+ strerror(errno));
+ return ret;
+ }
+
+ if (get_rg_state(svcName, &svcStatus) != 0) {
+ rg_unlock(&lockp);
+ clulog(LOG_ERR, "#46: Failed getting status for RG %s\n",
+ svcName);
+ return ret;
+ }
+
+ if (svcStatus.rs_last_owner != my_id() ||
+ svcStatus.rs_owner != target ||
+ svcStatus.rs_state != RG_STATE_MIGRATE) {
+ rg_unlock(&lockp);
+ return ret;
+ }
+
+ svcStatus.rs_owner = my_id();
+ svcStatus.rs_state = RG_STATE_STARTED;
+
+ set_rg_state(svcName, &svcStatus);
+ rg_unlock(&lockp);
+
return ret;
}
@@ -954,7 +1004,8 @@
}
msg_send(&ctx, &msgp, sizeof(msgp));
- msg_receive(&ctx, &response, sizeof (response), 5);
+ if (msg_receive(&ctx, &response, sizeof (response), 5) != sizeof(response))
+ goto cont;;
swab_SmMessageSt(&response);
if (response.sm_data.d_ret == RG_SUCCESS)
@@ -962,6 +1013,7 @@
else
ret = -1;
+cont:
msg_close(&ctx);
}
@@ -1937,7 +1989,7 @@
allowed_nodes = member_list();
while (memb_count(allowed_nodes)) {
- target = best_target_node(allowed_nodes, -1,
+ target = best_target_node(allowed_nodes, 0,
svcName, 1);
if (target == me) {
ret = handle_start_remote_req(svcName, request);
@@ -1947,7 +1999,7 @@
ret = RG_EFAIL;
goto out;
} else {
- ret = relocate_service(svcName, request, target);
+ ret = relocate_service(svcName, RG_START_REMOTE, target);
}
switch(ret) {
--- cluster/rgmanager/src/resources/vm.sh 2007/07/31 17:56:10 1.1.2.4.2.2
+++ cluster/rgmanager/src/resources/vm.sh 2007/08/02 14:47:45 1.1.2.4.2.3
@@ -22,6 +22,8 @@
export PATH
+. $(dirname $0)/ocf-shellfuncs
+
#
# Virtual Machine start/stop script (requires the xm command)
#
@@ -216,7 +218,7 @@
# controlled externally; the external monitoring app
# should.
#
- declare cmdline="on_shutdown=\"destroy\" on_reboot=\"destroy\" on_crash=\"destroy\""
+ declare cmdline="restart=\"never\""
declare varp val temp
#
@@ -375,8 +377,22 @@
migrate()
{
declare target=$1
+ declare errstr rv
+
+ err=$(xm migrate $OCF_RESKEY_name $target 2>&1 | head -1)
+ rv=$?
+
+ if [ $rv -ne 0 ]; then
+ if [ "$err" != "${err/does not exist/}" ]; then
+ ocf_log warn "Trying to migrate '$OCF_RESKEY_name' - domain does not exist"
+ return $OCF_NOT_RUNNING
+ fi
+ if [ "$err" != "${err/Connection refused/}" ]; then
+ ocf_log warn "Trying to migrate '$OCF_RESKEY_name' - connect refused"
+ return $OCF_ERR_CONFIGURED
+ fi
+ fi
- xm migrate $OCF_RESKEY_name $target
return $?
}
next reply other threads:[~2007-08-02 14:47 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2007-08-02 14:47 lhh [this message]
-- strict thread matches above, loose matches on Subject: below --
2007-11-26 21:46 [Cluster-devel] cluster/rgmanager ChangeLog include/reslist.h lhh
2007-08-02 14:53 lhh
2007-08-02 14:46 lhh
2007-05-31 19:08 lhh
2007-05-31 18:58 lhh
2007-05-03 15:02 lhh
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20070802144746.19944.qmail@sourceware.org \
--to=lhh@sourceware.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.