From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([208.118.235.92]:59656) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1TGoQQ-00011k-Hz for qemu-devel@nongnu.org; Wed, 26 Sep 2012 06:01:04 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1TGoQL-0001YF-5V for qemu-devel@nongnu.org; Wed, 26 Sep 2012 06:01:02 -0400 Received: from mx1.redhat.com ([209.132.183.28]:35800) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1TGoQK-0001Xs-QS for qemu-devel@nongnu.org; Wed, 26 Sep 2012 06:00:57 -0400 Message-ID: <5062D24F.7030304@redhat.com> Date: Wed, 26 Sep 2012 12:00:47 +0200 From: Kevin Wolf MIME-Version: 1.0 References: <20120924091008.GJ18470@in.ibm.com> <20120924091340.GN18470@in.ibm.com> In-Reply-To: <20120924091340.GN18470@in.ibm.com> Content-Type: text/plain; charset=ISO-8859-1 Content-Transfer-Encoding: 7bit Subject: Re: [Qemu-devel] [PATCH v9 4/4] block: Support GlusterFS as a QEMU block backend. List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: bharata@linux.vnet.ibm.com Cc: Anthony Liguori , Anand Avati , Vijay Bellur , Stefan Hajnoczi , Harsh Bora , Amar Tumballi , qemu-devel@nongnu.org, "Richard W.M. Jones" , Blue Swirl , Avi Kivity , Paolo Bonzini , Daniel Veillard Am 24.09.2012 11:13, schrieb Bharata B Rao: > block: Support GlusterFS as a QEMU block backend. > > From: Bharata B Rao > > This patch adds gluster as the new block backend in QEMU. This gives > QEMU the ability to boot VM images from gluster volumes. Its already > possible to boot from VM images on gluster volumes using FUSE mount, but > this patchset provides the ability to boot VM images from gluster volumes > by by-passing the FUSE layer in gluster. This is made possible by > using libgfapi routines to perform IO on gluster volumes directly. > > VM Image on gluster volume is specified like this: > > file=gluster[+transport]://[server[:port]]/volname/image[?socket=...] 
> > 'gluster' is the protocol. > > 'transport' specifies the transport type used to connect to gluster > management daemon (glusterd). Valid transport types are > tcp, unix and rdma. If a transport type isn't specified, then tcp > type is assumed. > > 'server' specifies the server where the volume file specification for > the given volume resides. This can be either hostname, ipv4 address > or ipv6 address. ipv6 address needs to be within square brackets [ ]. > If transport type is 'unix', then server field is ignored, but the > 'socket' field needs to be populated with the path to unix domain > socket. > > 'port' is the port number on which glusterd is listening. This is optional > and if not specified, QEMU will send 0 which will make gluster to use the > default port. port is ignored for unix type of transport. > > 'volname' is the name of the gluster volume which contains the VM image. > > 'image' is the path to the actual VM image that resides on gluster volume. > > Examples: > > file=gluster://1.2.3.4/testvol/a.img > file=gluster+tcp://1.2.3.4/testvol/a.img > file=gluster+tcp://1.2.3.4:24007/testvol/dir/a.img > file=gluster+tcp://[1:2:3:4:5:6:7:8]/testvol/dir/a.img > file=gluster+tcp://[1:2:3:4:5:6:7:8]:24007/testvol/dir/a.img > file=gluster+tcp://server.domain.com:24007/testvol/dir/a.img > file=gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket > file=gluster+rdma://1.2.3.4:24007/testvol/a.img > > Signed-off-by: Bharata B Rao > --- > > block/Makefile.objs | 1 > block/gluster.c | 642 +++++++++++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 643 insertions(+), 0 deletions(-) > create mode 100644 block/gluster.c > > > diff --git a/block/Makefile.objs b/block/Makefile.objs > index b5754d3..a1ae67f 100644 > --- a/block/Makefile.objs > +++ b/block/Makefile.objs > @@ -9,3 +9,4 @@ block-obj-$(CONFIG_POSIX) += raw-posix.o > block-obj-$(CONFIG_LIBISCSI) += iscsi.o > block-obj-$(CONFIG_CURL) += curl.o > block-obj-$(CONFIG_RBD) += rbd.o > 
+block-obj-$(CONFIG_GLUSTERFS) += gluster.o > diff --git a/block/gluster.c b/block/gluster.c > new file mode 100644 > index 0000000..a2f8303 > --- /dev/null > +++ b/block/gluster.c > @@ -0,0 +1,642 @@ > +/* > + * GlusterFS backend for QEMU > + * > + * Copyright (C) 2012 Bharata B Rao > + * > + * Pipe handling mechanism in AIO implementation is derived from > + * block/rbd.c. Hence, > + * > + * Copyright (C) 2010-2011 Christian Brunner , > + * Josh Durgin > + * > + * This work is licensed under the terms of the GNU GPL, version 2. See > + * the COPYING file in the top-level directory. > + * > + * Contributions after 2012-01-13 are licensed under the terms of the > + * GNU GPL, version 2 or (at your option) any later version. > + */ > +#include > +#include "block_int.h" > +#include "qemu_socket.h" > +#include "uri.h" > + > +typedef struct GlusterAIOCB { > + BlockDriverAIOCB common; > + int64_t size; > + int ret; > + bool *finished; > + QEMUBH *bh; > +} GlusterAIOCB; > + > +typedef struct BDRVGlusterState { > + struct glfs *glfs; > + int fds[2]; > + struct glfs_fd *fd; > + int qemu_aio_count; > + int event_reader_pos; > + GlusterAIOCB *event_acb; > +} BDRVGlusterState; > + > +#define GLUSTER_FD_READ 0 > +#define GLUSTER_FD_WRITE 1 > + > +typedef struct GlusterConf { > + char *server; > + int port; > + char *volname; > + char *image; > + char *transport; > +} GlusterConf; > + > +static void qemu_gluster_gconf_free(GlusterConf *gconf) > +{ > + g_free(gconf->server); > + g_free(gconf->volname); > + g_free(gconf->image); > + g_free(gconf->transport); > + g_free(gconf); > +} > + > +static int parse_volume_options(GlusterConf *gconf, char *path) > +{ > + char *token, *saveptr; > + > + /* volname */ > + token = strtok_r(path, "/", &saveptr); > + if (!token) { > + return -EINVAL; > + } > + gconf->volname = g_strdup(token); > + > + /* image */ > + token = strtok_r(NULL, "?", &saveptr); If I understand uri.c right, there is no ? in the path, so there's no reason to call strtok. 
You could just use the rest of the string. > + if (!token) { > + return -EINVAL; > + } > + gconf->image = g_strdup(token); > + return 0; > +} > + > +/* > + * file=gluster[+transport]://[server[:port]]/volname/image[?socket=...] > + * > + * 'gluster' is the protocol. > + * > + * 'transport' specifies the transport type used to connect to gluster > + * management daemon (glusterd). Valid transport types are > + * tcp, unix and rdma. If a transport type isn't specified, then tcp > + * type is assumed. > + * > + * 'server' specifies the server where the volume file specification for > + * the given volume resides. This can be either hostname, ipv4 address > + * or ipv6 address. ipv6 address needs to be within square brackets [ ]. > + * If transport type is 'unix', then server field is ignored, but the > + * 'socket' field needs to be populated with the path to unix domain > + * socket. > + * > + * 'port' is the port number on which glusterd is listening. This is optional > + * and if not specified, QEMU will send 0 which will make gluster to use the > + * default port. port is ignored for unix type of transport. > + * > + * 'volname' is the name of the gluster volume which contains the VM image. > + * > + * 'image' is the path to the actual VM image that resides on gluster volume. 
> + * > + * Examples: > + * > + * file=gluster://1.2.3.4/testvol/a.img > + * file=gluster+tcp://1.2.3.4/testvol/a.img > + * file=gluster+tcp://1.2.3.4:24007/testvol/dir/a.img > + * file=gluster+tcp://[1:2:3:4:5:6:7:8]/testvol/dir/a.img > + * file=gluster+tcp://[1:2:3:4:5:6:7:8]:24007/testvol/dir/a.img > + * file=gluster+tcp://server.domain.com:24007/testvol/dir/a.img > + * file=gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket > + * file=gluster+rdma://1.2.3.4:24007/testvol/a.img > + */ > +static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename) > +{ > + URI *uri; > + QueryParams *qp = NULL; > + bool is_unix = false; > + int ret = 0; > + char *unescape_str = NULL; > + > + uri = uri_parse(filename); > + if (!uri) { > + return -EINVAL; > + } > + > + /* transport */ > + if (!strcmp(uri->scheme, "gluster")) { > + gconf->transport = g_strdup("tcp"); > + } else if (!strcmp(uri->scheme, "gluster+tcp")) { > + gconf->transport = g_strdup("tcp"); > + } else if (!strcmp(uri->scheme, "gluster+unix")) { > + gconf->transport = g_strdup("unix"); > + is_unix = true; > + } else if (!strcmp(uri->scheme, "gluster+rdma")) { > + gconf->transport = g_strdup("rdma"); > + } else { > + ret = -EINVAL; > + goto out; > + } > + > + ret = parse_volume_options(gconf, uri->path); > + if (ret < 0) { > + goto out; > + } > + > + if (uri->query) { > + unescape_str = uri_string_unescape(uri->query, -1, NULL); > + if (!unescape_str) { > + ret = -EINVAL; > + goto out; > + } > + } I agree with Paolo here, this needs to go away. The example that you posted ("gluster+unix:///b?c=d%26e=f") has indeed only one argument with name 'c' and value 'd&e=f'. If you unescape here, you would incorrectly require double escaping. 
> + > + qp = query_params_parse(unescape_str); > + if (qp->n > 1 || (is_unix && !qp->n) || (!is_unix && qp->n)) { > + ret = -EINVAL; > + goto out; > + } > + > + if (is_unix) { > + if (strcmp(qp->p[0].name, "socket")) { > + ret = -EINVAL; > + goto out; > + } > + gconf->server = g_strdup(qp->p[0].value); Maybe add a check that uri->server is empty? > + } else { > + gconf->server = g_strdup(uri->server); > + gconf->port = uri->port; > + } > + > +out: > + if (qp) { > + query_params_free(qp); > + } > + g_free(unescape_str); > + uri_free(uri); > + return ret; > +} > + > +static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename) > +{ > + struct glfs *glfs = NULL; > + int ret; > + > + ret = qemu_gluster_parseuri(gconf, filename); > + if (ret < 0) { > + error_report("Usage: file=gluster[+transport]://[server[:port]]/" > + "volname/image[?socket=...]"); > + errno = -ret; > + goto out; > + } > + > + glfs = glfs_new(gconf->volname); > + if (!glfs) { > + goto out; > + } > + > + ret = glfs_set_volfile_server(glfs, gconf->transport, gconf->server, > + gconf->port); > + if (ret < 0) { > + goto out; > + } > + > + /* > + * TODO: Use GF_LOG_ERROR instead of hard code value of 4 here when > + * GlusterFS makes GF_LOG_* macros available to libgfapi users. > + */ > + ret = glfs_set_logging(glfs, "-", 4); > + if (ret < 0) { > + goto out; > + } > + > + ret = glfs_init(glfs); > + if (ret) { > + error_report("Gluster connection failed for server=%s port=%d " > + "volume=%s image=%s transport=%s\n", gconf->server, gconf->port, > + gconf->volname, gconf->image, gconf->transport); > + goto out; > + } > + return glfs; > + > +out: > + if (glfs) { > + glfs_fini(glfs); Does this corrupt errno? 
> + } > + return NULL; > +} > + > +static void qemu_gluster_complete_aio(GlusterAIOCB *acb, BDRVGlusterState *s) > +{ > + int ret; > + bool *finished = acb->finished; > + BlockDriverCompletionFunc *cb = acb->common.cb; > + void *opaque = acb->common.opaque; > + > + if (!acb->ret || acb->ret == acb->size) { > + ret = 0; /* Success */ > + } else if (acb->ret < 0) { > + ret = acb->ret; /* Read/Write failed */ > + } else { > + ret = -EIO; /* Partial read/write - fail it */ > + } > + > + s->qemu_aio_count--; > + qemu_aio_release(acb); > + cb(opaque, ret); > + if (finished) { > + *finished = true; > + } > +} > + > +static void qemu_gluster_aio_event_reader(void *opaque) > +{ > + BDRVGlusterState *s = opaque; > + ssize_t ret; > + > + do { > + char *p = (char *)&s->event_acb; > + > + ret = read(s->fds[GLUSTER_FD_READ], p + s->event_reader_pos, > + sizeof(s->event_acb) - s->event_reader_pos); > + if (ret > 0) { > + s->event_reader_pos += ret; > + if (s->event_reader_pos == sizeof(s->event_acb)) { > + s->event_reader_pos = 0; > + qemu_gluster_complete_aio(s->event_acb, s); > + } > + } > + } while (ret < 0 && errno == EINTR); > +} > + > +static int qemu_gluster_aio_flush_cb(void *opaque) > +{ > + BDRVGlusterState *s = opaque; > + > + return (s->qemu_aio_count > 0); > +} > + > +static int qemu_gluster_open(BlockDriverState *bs, const char *filename, > + int bdrv_flags) > +{ > + BDRVGlusterState *s = bs->opaque; > + int open_flags = 0; > + int ret = 0; > + GlusterConf *gconf = g_malloc0(sizeof(GlusterConf)); > + > + s->glfs = qemu_gluster_init(gconf, filename); > + if (!s->glfs) { > + ret = -errno; > + goto out; > + } > + > + open_flags |= O_BINARY; > + open_flags &= ~O_ACCMODE; open_flags == O_BINARY here, so no O_ACCMODE bits to clear. 
> + if (bdrv_flags & BDRV_O_RDWR) { > + open_flags |= O_RDWR; > + } else { > + open_flags |= O_RDONLY; > + } > + > + if ((bdrv_flags & BDRV_O_NOCACHE)) { > + open_flags |= O_DIRECT; > + } > + > + s->fd = glfs_open(s->glfs, gconf->image, open_flags); > + if (!s->fd) { > + ret = -errno; > + goto out; > + } > + > + ret = qemu_pipe(s->fds); > + if (ret < 0) { > + ret = -errno; > + goto out; > + } > + fcntl(s->fds[GLUSTER_FD_READ], F_SETFL, O_NONBLOCK); > + qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], > + qemu_gluster_aio_event_reader, NULL, qemu_gluster_aio_flush_cb, s); > + > +out: > + qemu_gluster_gconf_free(gconf); > + if (!ret) { > + return ret; > + } > + if (s->fd) { > + glfs_close(s->fd); > + } > + if (s->glfs) { > + glfs_fini(s->glfs); > + } > + return ret; > +} > + > +static int qemu_gluster_create(const char *filename, > + QEMUOptionParameter *options) > +{ > + struct glfs *glfs; > + struct glfs_fd *fd; > + int ret = 0; > + int64_t total_size = 0; > + GlusterConf *gconf = g_malloc0(sizeof(GlusterConf)); > + > + glfs = qemu_gluster_init(gconf, filename); > + if (!glfs) { > + ret = -errno; > + goto out; > + } > + > + while (options && options->name) { > + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { > + total_size = options->value.n / BDRV_SECTOR_SIZE; > + } > + options++; > + } > + > + fd = glfs_creat(glfs, gconf->image, > + O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR); > + if (!fd) { > + ret = -errno; > + } else { > + if (glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) { > + ret = -errno; > + } > + if (glfs_close(fd) != 0) { > + ret = -errno; > + } > + } > +out: > + qemu_gluster_gconf_free(gconf); > + if (glfs) { > + glfs_fini(glfs); > + } > + return ret; > +} > + > +static void qemu_gluster_aio_cancel(BlockDriverAIOCB *blockacb) > +{ > + GlusterAIOCB *acb = (GlusterAIOCB *)blockacb; > + bool finished = false; > + > + acb->finished = &finished; > + while (!finished) { > + qemu_aio_wait(); > + } > +} > + > +static AIOPool 
gluster_aio_pool = { > + .aiocb_size = sizeof(GlusterAIOCB), > + .cancel = qemu_gluster_aio_cancel, > +}; > + > +static int qemu_gluster_send_pipe(BDRVGlusterState *s, GlusterAIOCB *acb) > +{ > + int ret = 0; > + > + while (1) { > + int fd = s->fds[GLUSTER_FD_WRITE]; > + > + ret = write(fd, (void *)&acb, sizeof(acb)); > + if (ret >= 0) { > + break; > + } > + if (errno == EINTR) { > + continue; > + } > + if (errno != EAGAIN) { > + break; > + } Variatio delectat? ;-) How about just do { ... } while (errno == EINTR || errno == EAGAIN); ? > + } > + return ret; > +} Kevin