* [Cluster-devel] cluster group/gfs_controld/lock_dlm.h group/gf ...
@ 2006-08-14 17:22 teigland
0 siblings, 0 replies; only message in thread
From: teigland @ 2006-08-14 17:22 UTC (permalink / raw)
To: cluster-devel.redhat.com
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: teigland at sourceware.org 2006-08-14 18:22:53
Modified files:
group/gfs_controld: lock_dlm.h main.c recover.c
gfs2/mount : util.c
Log message:
There's been a relatively unusual problem explained in the comments that
I'd been putting off fixing for lack of a nice solution. Turns out this
problem could crop up more often than hoped, so I have had to fix it.
1) mount.gfs asks gfs_controld to join mount group
2) gfs_controld does and notifies mount.gfs to go ahead with mount(2)
3) gfs_controld gets a stop callback for the group due to another node
mounting
4) gfs_controld needs to wait for the kernel mount to complete before it
can stop/suspend the mount group (through sysfs)
5) mount(2) fails in the kernel for whatever reason
6) mount.gfs tells gfs_controld the kernel mount failed
gfs_controld is waiting for the kernel mount to complete outside its
normal poll loop, though, so it won't ever get the message in step 6, and
will wait forever for the failed mount to actually complete.
Added a pipe between mount.gfs and gfs_controld that mount.gfs just uses
to send a failed mount message. gfs_controld watches the pipe for this
error message while waiting for the kernel mount. mount.gfs uses unix
socket ancillary data to send an fd to gfs_controld.
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/lock_dlm.h.diff?cvsroot=cluster&r1=1.13&r2=1.14
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/main.c.diff?cvsroot=cluster&r1=1.8&r2=1.9
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.11&r2=1.12
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/gfs2/mount/util.c.diff?cvsroot=cluster&r1=1.12&r2=1.13
--- cluster/group/gfs_controld/lock_dlm.h 2006/08/09 19:35:26 1.13
+++ cluster/group/gfs_controld/lock_dlm.h 2006/08/14 17:22:53 1.14
@@ -142,6 +142,7 @@
int mount_client_delay;
int delay_send_journals;
int kernel_mount_error;
+ int mount_error_fd;
int got_kernel_mount;
int first_mounter;
int first_mounter_done;
@@ -255,6 +256,8 @@
int process_plocks(void);
void exit_cman(void);
+void setup_mount_error_fd(struct mountgroup *mg);
+
int do_mount(int ci, char *dir, char *type, char *proto, char *table,
char *options);
int do_unmount(int ci, char *dir, int mnterr);
--- cluster/group/gfs_controld/main.c 2006/08/09 19:35:26 1.8
+++ cluster/group/gfs_controld/main.c 2006/08/14 17:22:53 1.9
@@ -143,6 +143,56 @@
return 0;
}
+/* mount.gfs sends us a special fd that it will write an error message to
+   if mount(2) fails.  We can monitor this fd for an error message while
+   waiting for the kernel mount outside our main poll loop.
+
+   The fd is expected as SCM_RIGHTS ancillary data on the mount client's
+   socket, carried by a single byte of ordinary data.  On success it is
+   made non-blocking and stored in mg->mount_error_fd.  On any failure the
+   problem is logged and mg->mount_error_fd is left unchanged.
+
+   NOTE(review): the wait loop in recover.c treats mount_error_fd == 0 as
+   "no fd", so a received descriptor numbered 0 would be ignored there. */
+
+void setup_mount_error_fd(struct mountgroup *mg)
+{
+	struct msghdr msg;
+	struct cmsghdr *cmsg;
+	struct iovec vec;
+	/* the union guarantees the control buffer is aligned for a cmsghdr */
+	union {
+		char buf[CMSG_SPACE(sizeof(int))];
+		struct cmsghdr align;
+	} ctl;
+	int fd, flags, sock = client[mg->mount_client].fd;
+	char ch;
+	ssize_t n;
+
+	memset(&msg, 0, sizeof(msg));
+	memset(&ctl, 0, sizeof(ctl));
+
+	vec.iov_base = &ch;
+	vec.iov_len = 1;
+	msg.msg_iov = &vec;
+	msg.msg_iovlen = 1;
+	msg.msg_control = ctl.buf;
+	msg.msg_controllen = sizeof(ctl.buf);
+
+	n = recvmsg(sock, &msg, 0);
+	if (n < 0) {
+		/* n is ssize_t, so print it as long rather than with %d */
+		log_group(mg, "setup_mount_error_fd recvmsg err %ld errno %d",
+			  (long)n, errno);
+		return;
+	}
+	if (n != 1) {
+		log_group(mg, "setup_mount_error_fd recvmsg got %ld", (long)n);
+		return;
+	}
+
+	/* CMSG_FIRSTHDR returns NULL when no ancillary data was received */
+	cmsg = CMSG_FIRSTHDR(&msg);
+	if (!cmsg) {
+		log_group(mg, "setup_mount_error_fd no control message");
+		return;
+	}
+
+	if (cmsg->cmsg_type != SCM_RIGHTS) {
+		log_group(mg, "setup_mount_error_fd expected type %d got %d",
+			  SCM_RIGHTS, cmsg->cmsg_type);
+		return;
+	}
+
+	/* make sure the message really has room for one descriptor */
+	if (cmsg->cmsg_len < CMSG_LEN(sizeof(int))) {
+		log_group(mg, "setup_mount_error_fd short control message %lu",
+			  (unsigned long)cmsg->cmsg_len);
+		return;
+	}
+
+	/* copy the fd out instead of dereferencing a cast pointer */
+	memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
+
+	/* the fd is polled with read() while waiting for the kernel mount,
+	   so it must not be kept if it cannot be made non-blocking */
+	flags = fcntl(fd, F_GETFL, 0);
+	if (flags < 0 || fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) {
+		log_group(mg, "setup_mount_error_fd fcntl fd %d errno %d",
+			  fd, errno);
+		close(fd);
+		return;
+	}
+
+	mg->mount_error_fd = fd;
+
+	log_group(mg, "setup_mount_error_fd got fd %d", fd);
+}
+
static int process_client(int ci)
{
char *cmd, *dir, *type, *proto, *table, *extra;
--- cluster/group/gfs_controld/recover.c 2006/08/10 19:40:50 1.11
+++ cluster/group/gfs_controld/recover.c 2006/08/14 17:22:53 1.12
@@ -1174,6 +1174,8 @@
list_add(&mg->list, &mounts);
+ setup_mount_error_fd(mg);
+
group_join(gh, name);
return 0;
@@ -1565,7 +1567,29 @@
if (!rv)
break;
usleep(100000);
+
+ memset(buf, 0, sizeof(buf));
+
+ /* attempt to solve the problem described below where we
+ don't get the kernel_mount_error until after the stop and
+ this loop... this mount_error_fd was sent from mount.gfs and
+ mount.gfs will write on this fd if there was a mount(2)
+ error */
+
+ if (!mg->mount_error_fd)
+ continue;
+
+ rv = read(mg->mount_error_fd, buf, sizeof(buf));
+ if (rv > 0) {
+ log_group(mg, "wait_for_kernel_mount: mount error %s",
+ buf);
+ mg->kernel_mount_error = 1;
+ break;
+ }
}
+
+ close(mg->mount_error_fd);
+ mg->mount_error_fd = 0;
}
/* The processing of new mounters (send/recv options, send/recv journals,
@@ -1615,7 +1639,8 @@
3) kernel mount fails, 4) mount.gfs sends a leave
with mnterr, 5) we don't recv it and don't set
kernel_mount_error because we're stuck in
- wait_for_kernel_mount() from do_stop */
+ wait_for_kernel_mount() from do_stop. update:
+ attempt to fix above using mount_error_fd */
if (!mg->kernel_mount_error)
wait_for_kernel_mount(mg);
--- cluster/gfs2/mount/util.c 2006/07/20 20:19:04 1.12
+++ cluster/gfs2/mount/util.c 2006/08/14 17:22:53 1.13
@@ -11,6 +11,7 @@
extern char *prog_name;
extern char *fsname;
extern int verbose;
+static int mount_error_fd;
#define LOCK_DLM_SOCK_PATH "gfs_controld_sock" /* FIXME: use a header */
#define MAXLINE 256 /* size of messages with gfs_controld */
@@ -310,6 +311,57 @@
return fd;
}
+/* We create a pipe and pass the receiving end to gfs_controld.  If the
+   mount fails, we write an error message to this pipe.  gfs_controld monitors
+   this fd outside its main poll loop because it may need to detect a mount
+   failure while watching for the kernel mount (while waiting for the kernel
+   mount, gfs_controld is _not_ in its main poll loop which is why the normal
+   leave message w/ mnterr we send isn't sufficient.)
+
+   socket: the socket to gfs_controld; the read end of the pipe is sent
+   over it as SCM_RIGHTS ancillary data together with one byte of payload.
+   On success the write end is saved in mount_error_fd.  On failure the
+   error is logged and mount_error_fd is left unchanged. */
+
+void setup_mount_error_fd(int socket)
+{
+	struct msghdr msg;
+	struct cmsghdr *cmsg;
+	struct iovec vec;
+	/* the union guarantees the control buffer is aligned for a cmsghdr */
+	union {
+		char buf[CMSG_SPACE(sizeof(int))];
+		struct cmsghdr align;
+	} ctl;
+	char ch = '\0';
+	ssize_t n;
+	int rv, fds[2];
+
+	rv = pipe(fds);
+	if (rv < 0) {
+		log_debug("setup_mount_error_fd pipe error %d %d", rv, errno);
+		return;
+	}
+
+	memset(&msg, 0, sizeof(msg));
+	/* zero the control buffer so no uninitialised padding is sent */
+	memset(&ctl, 0, sizeof(ctl));
+
+	msg.msg_control = ctl.buf;
+	msg.msg_controllen = CMSG_LEN(sizeof(int));
+	cmsg = CMSG_FIRSTHDR(&msg);
+	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+	/* copy the fd in instead of storing through a cast pointer */
+	memcpy(CMSG_DATA(cmsg), &fds[0], sizeof(int));
+
+	vec.iov_base = &ch;
+	vec.iov_len = 1;
+	msg.msg_iov = &vec;
+	msg.msg_iovlen = 1;
+
+	n = sendmsg(socket, &msg, 0);
+	if (n < 0) {
+		/* n is ssize_t, so print it as long rather than with %d */
+		log_debug("setup_mount_error_fd sendmsg error %ld %d",
+			  (long)n, errno);
+		close(fds[0]);
+		close(fds[1]);
+		return;
+	}
+
+	/* NOTE(review): fds[0] is deliberately left open here, as before.
+	   Closing our copy of the read end looks tempting, but then a later
+	   write to mount_error_fd could raise SIGPIPE once gfs_controld has
+	   closed its copy -- confirm before changing. */
+	mount_error_fd = fds[1];
+
+	log_debug("setup_mount_error_fd %d %d", fds[0], fds[1]);
+}
+
int lock_dlm_join(struct mount_options *mo, struct gen_sb *sb)
{
int i, fd, rv;
@@ -363,6 +415,8 @@
goto out;
}
+ setup_mount_error_fd(fd);
+
/*
* read response from gfs_controld to our join request:
* it sends back an int as a string, 0 or -EXXX
@@ -481,6 +535,11 @@
log_debug("message to gfs_controld: asking to leave mountgroup:");
log_debug("lock_dlm_leave: write \"%s\"", buf);
+ if (mnterr && mount_error_fd) {
+ rv = write(mount_error_fd, buf, sizeof(buf));
+ log_debug("lock_dlm_leave: write to mount_error_fd %d", rv);
+ }
+
rv = write(fd, buf, sizeof(buf));
if (rv < 0) {
warn("lock_dlm_leave: gfs_controld write error: %d", rv);
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2006-08-14 17:22 UTC | newest]
Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-08-14 17:22 [Cluster-devel] cluster group/gfs_controld/lock_dlm.h group/gf teigland
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.