From mboxrd@z Thu Jan 1 00:00:00 1970 From: lhh@sourceware.org Date: 1 Sep 2006 19:02:24 -0000 Subject: [Cluster-devel] cluster/rgmanager ChangeLog include/resgroup.h ... Message-ID: <20060901190224.14010.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Changes by: lhh at sourceware.org 2006-09-01 19:02:22 Modified files: rgmanager : ChangeLog rgmanager/include: resgroup.h vf.h rgmanager/src/clulib: rg_strings.c vft.c rgmanager/src/daemons: groups.c main.c rgmanager/src/utils: clustat.c clusvcadm.c Log message: 2006-09-01 Lon Hohberger * include/resgroup.h: Add proto for rg_strerror * include/vf.h: Add proto for vf_invalidate (flushes vf cache) * src/clulib/rg_strings.c: Add rg_strerror function, define human-readable strings for rgmanager error values * src/clulib/vft.c: Add vf_invalidate (separate from vf_shutdown) * src/daemons/groups.c: Fix obvious logic error * src/daemons/main.c: Fix rg_doall() message during loss of quorum. Invalidate local VF cache and kill resource configurations on loss of quorum (#202497). Send RG_EQUORUM back to clustat/clusvcadm so that they report why they can't get information. Don't queue status checks if we've lost quorum. Add command line parameter to disable internal crash watchdog * src/utils/clustat.c, clusvcadm.c: Handle SIGPIPE, and produce useful errors if possible. 
Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&r1=1.22&r2=1.23 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&r1=1.13&r2=1.14 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/vf.h.diff?cvsroot=cluster&r1=1.5&r2=1.6 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/rg_strings.c.diff?cvsroot=cluster&r1=1.4&r2=1.5 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/vft.c.diff?cvsroot=cluster&r1=1.15&r2=1.16 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&r1=1.21&r2=1.22 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/main.c.diff?cvsroot=cluster&r1=1.30&r2=1.31 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/utils/clustat.c.diff?cvsroot=cluster&r1=1.19&r2=1.20 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/utils/clusvcadm.c.diff?cvsroot=cluster&r1=1.11&r2=1.12 --- cluster/rgmanager/ChangeLog 2006/08/21 15:14:08 1.22 +++ cluster/rgmanager/ChangeLog 2006/09/01 19:02:20 1.23 @@ -1,3 +1,25 @@ +2006-09-01 Lon Hohberger + * include/resgroup.h: Add proto for rg_strerror + * include/vf.h: Add proto for vf_invalidate (flushes vf cache) + * src/clulib/rg_strings.c: Add rg_strerror function, define + human-readable strings for rgmanager error values + * src/clulib/vft.c: Add vf_invalidate (separate from vf_shutdown) + * src/daemons/groups.c: Fix obvious logic error + * src/daemons/main.c: Fix rg_doall() message during loss of quorum. + Invalidate local VF cache and kill resource configurations on + loss of quorum (#202497). Send RG_EQUORUM back to clustat/clusvcadm + so that they report why they can't get information. Don't queue + status checks if we've lost quorum. 
Add command line parameter to + disable internal crash watchdog + * src/utils/clustat.c, clusvcadm.c: Handle SIGPIPE, and produce + useful errors if possible. + +2006-08-31 Marek Grác + * src/daemons/restree.c: Fix #203720. Do not run backup copies (ends + with ~) of resource agents. + * src/resources/apache.*, mysql.*: Add Apache & MySQL resource agents + * src/resources/utils/*: Add utility scripts for resource agents + 2006-08-21 Lon Hohberger * src/daemons/main.c: Fix #202500 - simultaneous starts confuse rgmanager. This happened due to the fact that rgmanager was not --- cluster/rgmanager/include/resgroup.h 2006/08/18 15:26:22 1.13 +++ cluster/rgmanager/include/resgroup.h 2006/09/01 19:02:21 1.14 @@ -174,6 +174,9 @@ #define RG_YES 1 #define RG_NO 2 +char *rg_strerror(int val); + + /* * Fail-over domain states */ --- cluster/rgmanager/include/vf.h 2006/07/12 14:04:06 1.5 +++ cluster/rgmanager/include/vf.h 2006/09/01 19:02:21 1.6 @@ -170,6 +170,7 @@ * VF Stuff. VF only talks to peers. */ int vf_init(int, uint16_t, vf_vote_cb_t, vf_commit_cb_t); +int vf_invalidate(void); int vf_shutdown(void); /* --- cluster/rgmanager/src/clulib/rg_strings.c 2006/07/11 23:52:41 1.4 +++ cluster/rgmanager/src/clulib/rg_strings.c 2006/09/01 19:02:22 1.5 @@ -16,6 +16,39 @@ Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ +#include + +struct { int val; char *str; } rg_error_strings[] = { + { RG_EQUORUM, "Operation requires quorum" }, + { RG_EINVAL, "Invalid operation for resource" }, + { RG_EDEPEND, "Operation violates dependency rule" }, + { RG_EAGAIN, "Temporary failure; try again" }, + { RG_EDEADLCK, "Operation would cause a deadlock" }, + { RG_ENOSERVICE,"Service does not exist" }, + { RG_EFORWARD, "Service not mastered locally" }, + { RG_EABORT, "Aborted; service failed" }, + { RG_EFAIL, "Failure" }, + { RG_ESUCCESS, "Success" }, + { RG_YES, "Yes" }, + { RG_NO, "No" }, + { 0, NULL } +}; + + +char *rg_strerror(int err) +{ + int x; + + for (x = 0; rg_error_strings[x].str != NULL; x++) { + if (rg_error_strings[x].val == err) { + return rg_error_strings[x].str; + } + } + + return "Unknown"; +} + + const char *rg_state_strings[] = { "stopped", "starting", @@ -51,3 +84,4 @@ "user stop", "" }; + --- cluster/rgmanager/src/clulib/vft.c 2006/08/07 22:05:01 1.15 +++ cluster/rgmanager/src/clulib/vft.c 2006/09/01 19:02:22 1.16 @@ -935,22 +935,13 @@ } -/** - Shut down VF - */ int -vf_shutdown(void) +vf_invalidate(void) { key_node_t *c_key; view_node_t *c_jv; commit_node_t *c_cn; - pthread_mutex_lock(&vf_mutex); - vf_thread_ready = 0; - pthread_cancel(vf_thread); - pthread_join(vf_thread, NULL); - _port = 0; - _node_id = (int)-1; pthread_mutex_lock(&key_list_mutex); while ((c_key = key_list) != NULL) { @@ -974,6 +965,29 @@ } pthread_mutex_unlock(&key_list_mutex); + return 0; +} + + +/** + Shut down VF + */ +int +vf_shutdown(void) +{ + key_node_t *c_key; + view_node_t *c_jv; + commit_node_t *c_cn; + + pthread_mutex_lock(&vf_mutex); + vf_thread_ready = 0; + pthread_cancel(vf_thread); + pthread_join(vf_thread, NULL); + _port = 0; + _node_id = (int)-1; + + vf_invalidate(); + pthread_mutex_unlock(&vf_mutex); return 0; --- cluster/rgmanager/src/daemons/groups.c 2006/08/18 15:26:22 1.21 +++ cluster/rgmanager/src/daemons/groups.c 2006/09/01 19:02:22 1.22 @@ -273,7 +273,7 @@ * local start. 
*/ if (svcStatus->rs_state == RG_STATE_STARTED && - svcStatus->rs_owner == mp->cn_nodeid) + svcStatus->rs_owner != mp->cn_nodeid) return; if (svcStatus->rs_state == RG_STATE_DISABLED) --- cluster/rgmanager/src/daemons/main.c 2006/08/21 15:14:09 1.30 +++ cluster/rgmanager/src/daemons/main.c 2006/09/01 19:02:22 1.31 @@ -123,7 +123,13 @@ rg_set_inquorate(); member_list_update(NULL);/* Clear member list */ rg_lockall(L_SYS); - rg_doall(RG_INIT, 1, "Emergency stop of %s"); + rg_doall(RG_INIT, 1, "Emergency stop of %s\n"); +#ifndef USE_OPENAIS + clulog(LOG_DEBUG, "Invalidating local VF cache\n"); + vf_invalidate(); +#endif + clulog(LOG_DEBUG, "Flushing resource group cache\n"); + kill_resource_groups(); rg_set_uninitialized(); return -1; } else if (!rg_quorate()) { @@ -131,7 +137,7 @@ rg_set_quorate(); rg_unlockall(L_SYS); rg_unlockall(L_USER); - clulog(LOG_NOTICE, "Quorum Formed\n"); + clulog(LOG_NOTICE, "Quorum Regained\n"); } old_membership = member_list(); @@ -562,7 +568,7 @@ case M_STATECHANGE: msg_receive(ctx, NULL, 0, 0); clulog(LOG_DEBUG, "Membership Change Event\n"); - if (rg_quorate() && running) { + if (running) { rg_unlockall(L_SYS); membership_update(); } @@ -644,6 +650,7 @@ } if (!rg_initialized()) { + msg_send_simple(newctx, RG_FAIL, RG_EQUORUM, 0); msg_close(newctx); msg_free_ctx(newctx); continue; @@ -651,6 +658,7 @@ if (!rg_quorate()) { printf("Dropping connect: NO QUORUM\n"); + msg_send_simple(newctx, RG_FAIL, RG_EQUORUM, 0); msg_close(newctx); msg_free_ctx(newctx); } @@ -668,7 +676,7 @@ return 0; /* No new messages. Drop in the status check requests. 
*/ - if (n == 0) { + if (n == 0 && rg_quorate()) { do_status_checks(); return 0; } @@ -805,15 +813,18 @@ main(int argc, char **argv) { int rv; - char foreground = 0; + char foreground = 0, wd = 1; cman_node_t me; msgctx_t *cluster_ctx; msgctx_t *local_ctx; pthread_t th; cman_handle_t clu = NULL; - while ((rv = getopt(argc, argv, "fd")) != EOF) { + while ((rv = getopt(argc, argv, "wfd")) != EOF) { switch (rv) { + case 'w': + wd = 0; + break; case 'd': debug = 1; break; @@ -834,7 +845,7 @@ if (!foreground && (geteuid() == 0)) { daemon_init(argv[0]); - if (!debug && !watchdog_init()) + if (wd && !debug && !watchdog_init()) clulog(LOG_NOTICE, "Failed to start watchdog\n"); } --- cluster/rgmanager/src/utils/clustat.c 2006/08/07 22:05:01 1.19 +++ cluster/rgmanager/src/utils/clustat.c 2006/09/01 19:02:22 1.20 @@ -10,6 +10,7 @@ #include #include #include +#include #ifdef HAVE_CONFIG_H #include @@ -46,7 +47,7 @@ rg_state_list(int local_node_id, int fast) { msgctx_t ctx; - int max, n, x; + int max = 0, n, x; rg_state_list_t *rsl = NULL; generic_msg_hdr *msgp = NULL; rg_state_msg_t *rsmp = NULL; @@ -91,6 +92,7 @@ } n = msg_receive_simple(&ctx, &msgp, tv.tv_sec); + if (n < 0) { if (errno == EAGAIN) continue; @@ -109,6 +111,13 @@ swab_generic_msg_hdr(msgp); + if (msgp->gh_command == RG_FAIL) { + printf("Service states unavailable: %s\n", + rg_strerror(msgp->gh_arg1)); + msg_close(&ctx); + return NULL; + } + if (msgp->gh_command == RG_SUCCESS) { free(msgp); break; @@ -736,6 +745,8 @@ return 1; } + signal(SIGPIPE, SIG_IGN); + /* Connect & grab all our info */ ch = cman_init(NULL); --- cluster/rgmanager/src/utils/clusvcadm.c 2006/08/09 21:48:34 1.11 +++ cluster/rgmanager/src/utils/clusvcadm.c 2006/09/01 19:02:22 1.12 @@ -31,6 +31,7 @@ #include #include #include +#include #ifdef HAVE_CONFIG_H #include @@ -187,6 +188,7 @@ msgctx_t ctx; cman_handle_t ch; SmMessageSt msg; + generic_msg_hdr *h = (generic_msg_hdr *)&msg; int action = RG_STATUS; int node_specified = 0; int me, svctarget 
= 0; @@ -274,6 +276,8 @@ svcname = realsvcname; } + signal(SIGPIPE, SIG_IGN); + /* No login */ ch = cman_init(NULL); if (!ch) { @@ -320,48 +324,23 @@ return 1; } - opt = msg_send(&ctx, &msg, sizeof(msg)); - - if (opt < sizeof(msg)) { - perror("msg_send"); - fprintf(stderr, "Could not send entire message!\n"); - return 1; - } + msg_send(&ctx, &msg, sizeof(msg)); - if (msg_receive(&ctx, &msg, sizeof(msg), 0) != sizeof(msg)) { + /* Reusing opt here */ + if ((opt = msg_receive(&ctx, &msg, sizeof(msg), 0)) < sizeof(*h)) { perror("msg_receive"); fprintf(stderr, "Error receiving reply!\n"); return 1; } /* Decode */ - swab_SmMessageSt(&msg); - switch (msg.sm_data.d_ret) { - case RG_ESUCCESS: - printf("success\n"); - break; - case RG_EFAIL: - printf("failed\n"); - break; - case RG_EABORT: - printf("cancelled by resource manager\n"); - break; - case RG_ENOSERVICE: - printf("failed: Service does not exist\n"); - break; - case RG_EDEADLCK: - printf("failed: Operation would deadlock\n"); - break; - case RG_EAGAIN: - printf("failed: Try again (resource groups locked)\n"); - break; - case RG_EDEPEND: - printf("failed: Operation would break dependency\n"); - break; - default: - printf("failed: unknown reason %d\n", msg.sm_data.d_ret); - break; + if (opt < sizeof(msg)) { + swab_generic_msg_hdr(h); + printf("%s\n", rg_strerror(h->gh_arg1)); + return h->gh_arg1; } + swab_SmMessageSt(&msg); + printf("%s\n", rg_strerror(msg.sm_data.d_ret)); return msg.sm_data.d_ret; }