From mboxrd@z Thu Jan  1 00:00:00 1970
From: lhh@sourceware.org
Date: 3 Jan 2007 21:08:18 -0000
Subject: [Cluster-devel] cluster/rgmanager src/daemons/rg_forward.c src ...
Message-ID: <20070103210818.22006.qmail@sourceware.org>
List-Id:
To: cluster-devel.redhat.com
MIME-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	lhh at sourceware.org	2007-01-03 21:08:17

Modified files:
	rgmanager/src/daemons: rg_forward.c 
	rgmanager/src/utils: clusvcadm.c 
	rgmanager/include: resgroup.h 

Log message:
	Resolves: 201396
	Part 1: Make rgmanager check the states of nodes during forward
	operations to remote nodes

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_forward.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.2.2.2&r2=1.2.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/utils/clusvcadm.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.2.2.7&r2=1.2.2.8
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.3.2.8&r2=1.3.2.9

--- cluster/rgmanager/src/daemons/rg_forward.c	2006/12/13 18:19:56	1.2.2.2
+++ cluster/rgmanager/src/daemons/rg_forward.c	2007/01/03 21:08:16	1.2.2.3
@@ -48,8 +48,9 @@
 	rg_state_t rgs;
 	request_t *req = (request_t *)arg;
 	void *lockp;
-	int fd;
+	int fd, ret;
 	SmMessageSt msg;
+	cluster_member_list_t *m = NULL;
 
 	if (rg_lock(req->rr_group, &lockp) != 0) {
 		msg_close(req->rr_resp_fd);
@@ -88,19 +89,43 @@
 		pthread_exit(NULL);
 	}
 
-	if (msg_receive(fd, &msg, sizeof(msg)) != sizeof(msg)) {
-		msg_close(fd);
-		msg_close(req->rr_resp_fd);
-		rq_free(req);
-		pthread_exit(NULL);
-	}
+	/*
+	 * Ok, we're forwarding a message to another node.  Keep tabs on
+	 * the node to make sure it doesn't die.  Basically, wake up every
+	 * now and again to make sure it's still online.  If it isn't, send
+	 * a response back to the caller.
+	 */
+	do {
+		ret = msg_receive_timeout(fd, &msg, sizeof(msg), 10);
+		if (ret < (int)sizeof(msg)) {
+			if (ret < 0 && errno == ETIMEDOUT) {
+				m = member_list();
+				if (!memb_online(m, rgs.rs_owner)) {
+					msg.sm_data.d_ret = RG_ENODEDEATH;
+					/* we decode down below,
+					 * so encode here */
+					swab_SmMessageSt(&msg);
+					break;
+				}
+				cml_free(m);
+				m = NULL;
+				continue;
+			}
+			msg_close(fd);
+			msg_close(req->rr_resp_fd);
+			goto out;
+		}
+		break;
+	} while(1);
+
+	if (m)
+		cml_free(m);
 
 	msg_close(fd);
 
 	swab_SmMessageSt(&msg);
 	send_response(msg.sm_data.d_ret, req->rr_target, req);
-
+out:
 	rq_free(req);
-
 	pthread_exit(NULL);
 }
--- cluster/rgmanager/src/utils/clusvcadm.c	2006/12/13 18:19:56	1.2.2.7
+++ cluster/rgmanager/src/utils/clusvcadm.c	2007/01/03 21:08:17	1.2.2.8
@@ -147,6 +147,43 @@
 }
 
 
+int
+do_msg_receive(uint64_t msgtarget, int fd, void *buf, size_t len)
+{
+	int ret;
+	cluster_member_list_t *m = NULL;
+
+	if ((int64_t)msgtarget < (int64_t)0)
+		return msg_receive(fd, buf, len);
+
+	/* Make sure a node hasn't died while processing our request. */
+	do {
+		ret = msg_receive_timeout(fd, buf, len, 20);
+		if (ret < (int)len) {
+			if (ret < 0 && errno == ETIMEDOUT) {
+				m = clu_member_list(RG_SERVICE_GROUP);
+				if (!memb_online(m, msgtarget)) {
+					ret = RG_ENODEDEATH;
+					break;
+				}
+				cml_free(m);
+				m = NULL;
+				continue;
+			}
+
+			/* Make sure we don't overwrite ENODEDEATH */
+			if (ret < 0)
+				ret = -1;
+		}
+		break;
+	} while(1);
+
+	if (m)
+		cml_free(m);
+	return ret;
+}
+
+
 void
 usage(char *name)
 {
@@ -259,7 +296,6 @@
 		usage(basename(argv[0]));
 		return 1;
 	}
 
-
 	/* No login */
 	fd = clu_connect(RG_SERVICE_GROUP, 0);
@@ -294,10 +330,15 @@
 		fflush(stdout);
 		msgfd = msg_open(msgtarget, RG_PORT, 0, 5);
 	} else {
-		printf("Trying to relocate %s to %s", svcname, nodename);
+		if (node_specified)
+			printf("Trying to relocate %s to %s", svcname, nodename);
+		else
+			printf("Trying to relocate %s", svcname);
 		printf("...");
 		fflush(stdout);
 		msgfd = msg_open(me, RG_PORT, 0, 5);
+		/* just do a normal receive from the local node */
+		msgtarget = (uint64_t)-1;
 	}
 
 	if (msgfd < 0) {
@@ -312,10 +353,25 @@
 		return 1;
 	}
 
-	if (msg_receive(msgfd, &msg, sizeof(msg)) != sizeof(msg)) {
-		perror("msg_receive");
-		fprintf(stderr, "Error receiving reply!\n");
-		return 1;
+	/* reusing opt */
+	opt = do_msg_receive(msgtarget, msgfd, &msg,
+			     sizeof(msg));
+	if (opt < (int)sizeof(msg)) {
+		if (opt != RG_ENODEDEATH) {
+			perror("msg_receive");
+			fprintf(stderr, "Error receiving reply!\n");
+			return 1;
+		}
+
+		/*
+		 * XXX hack to enable node death processing along side
+		 * all the rest of the possible responses.  If an end-node
+		 * died while processing, this will have been set by the
+		 * rgmanager and a response with RG_ENODEDEATH as the d_ret
+		 * would have been received.
+		 */
+		msg.sm_data.d_ret = RG_ENODEDEATH;
+		swab_SmMessageSt(&msg);
 	}
 
 	/* Decode */
@@ -346,6 +402,10 @@
 	case RG_EFAIL:
 		printf("failed\n");
 		break;
+	case RG_ENODEDEATH:
+		printf("node processing request died\n");
+		printf("(Status unknown)\n");
+		break;
 	case RG_EABORT:
 		printf("cancelled by resource manager\n");
 		break;
--- cluster/rgmanager/include/resgroup.h	2006/12/13 18:19:57	1.3.2.8
+++ cluster/rgmanager/include/resgroup.h	2007/01/03 21:08:17	1.3.2.9
@@ -156,6 +156,7 @@
 cluster_member_list_t *member_list(void);
 uint64_t my_id(void);
 
+#define RG_ENODEDEATH	-8	/* Processing node died */
#define RG_ERUN		-7	/* Service is running already */
 #define RG_EAGAIN	-6	/* Try again */
 #define RG_EDEADLCK	-5	/* Operation would cause deadlock */
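
For readers following the change rather than the diff line-by-line, both hunks
implement the same pattern: replace a single blocking msg_receive() with a loop
around a timed receive, and on every timeout re-check cluster membership, so a
request forwarded to a node that has since died fails with RG_ENODEDEATH
instead of hanging forever.  The standalone C sketch below only illustrates
that pattern; recv_with_timeout(), peer_online() and E_NODEDEATH are
hypothetical stand-ins for msg_receive_timeout(), member_list()/memb_online()
and RG_ENODEDEATH, not rgmanager API.

#include <errno.h>
#include <stddef.h>

/*
 * Hypothetical stand-ins for the calls the patch uses; declared here only
 * so the sketch is self-contained.
 */
int recv_with_timeout(int fd, void *buf, size_t len, int secs); /* cf. msg_receive_timeout() */
int peer_online(int nodeid);          /* cf. memb_online(member_list(), nodeid) */

#define E_NODEDEATH (-8)              /* cf. RG_ENODEDEATH */

/*
 * Wait for a reply of 'len' bytes from node 'nodeid' on 'fd'.  Instead of
 * one blocking receive, wake up every 'interval' seconds; if the peer has
 * dropped out of the membership in the meantime, report node death so the
 * caller is not left waiting for a reply that will never arrive.
 */
int
receive_checking_peer(int fd, void *buf, size_t len, int nodeid, int interval)
{
	int ret;

	for (;;) {
		ret = recv_with_timeout(fd, buf, len, interval);
		if (ret >= (int)len)
			return ret;                 /* complete reply received */

		if (ret < 0 && errno == ETIMEDOUT) {
			if (!peer_online(nodeid))
				return E_NODEDEATH; /* peer died mid-request */
			continue;                   /* peer still up; keep waiting */
		}

		return -1;                          /* short read or hard error */
	}
}

The same shape appears twice in the patch: rg_forward.c polls every 10 seconds
against the service owner (rgs.rs_owner), while clusvcadm.c polls every 20
seconds against the node it sent the request to, falling back to a plain
msg_receive() for local requests (flagged by msgtarget == (uint64_t)-1).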