* [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
@ 2010-05-19 19:22 Christian Brunner
  2010-05-20 20:31 ` Blue Swirl
  0 siblings, 1 reply; 64+ messages in thread
From: Christian Brunner @ 2010-05-19 19:22 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: ceph-devel

The attached patch is a block driver for the distributed file system
Ceph (http://ceph.newdream.net/). The driver uses librados (which
is part of the Ceph server) for direct access to the Ceph object
store and runs entirely in userspace. It is therefore
called "rbd" - rados block device.

To compile the driver, a recent version of Ceph (>= 0.20.1) is needed,
and you have to pass "--enable-rbd" when running configure.

Additional information is available on the Ceph-Wiki:

http://ceph.newdream.net/wiki/Kvm-rbd
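
For illustration, building and using the driver could look roughly like
this (pool and image names are placeholders; the rados pool is assumed
to exist already):

    ./configure --enable-rbd
    make
    qemu-img create rbd:mypool/myimage 10G
    qemu -drive file=rbd:mypool/myimage,if=virtio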

---
 Makefile          |    3 +
 Makefile.objs     |    1 +
 block/rados.h     |  376 ++++++++++++++++++++++++++++++++++
 block/rbd.c       |  585 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 block/rbd_types.h |   48 +++++
 configure         |   27 +++
 6 files changed, 1040 insertions(+), 0 deletions(-)
 create mode 100644 block/rados.h
 create mode 100644 block/rbd.c
 create mode 100644 block/rbd_types.h

diff --git a/Makefile b/Makefile
index eb9e02b..b1ab3e9 100644
--- a/Makefile
+++ b/Makefile
@@ -27,6 +27,9 @@ configure: ;
 $(call set-vpath, $(SRC_PATH):$(SRC_PATH)/hw)
 
 LIBS+=-lz $(LIBS_TOOLS)
+ifdef CONFIG_RBD
+LIBS+=-lrados
+endif
 
 ifdef BUILD_DOCS
 DOCS=qemu-doc.html qemu-tech.html qemu.1 qemu-img.1 qemu-nbd.8
diff --git a/Makefile.objs b/Makefile.objs
index acbaf22..85791ac 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -18,6 +18,7 @@ block-nested-y += parallels.o nbd.o blkdebug.o
 block-nested-$(CONFIG_WIN32) += raw-win32.o
 block-nested-$(CONFIG_POSIX) += raw-posix.o
 block-nested-$(CONFIG_CURL) += curl.o
+block-nested-$(CONFIG_RBD) += rbd.o
 
 block-obj-y +=  $(addprefix block/, $(block-nested-y))
 
diff --git a/block/rados.h b/block/rados.h
new file mode 100644
index 0000000..6cde9a1
--- /dev/null
+++ b/block/rados.h
@@ -0,0 +1,376 @@
+#ifndef __RADOS_H
+#define __RADOS_H
+
+/*
+ * Data types for the Ceph distributed object storage layer RADOS
+ * (Reliable Autonomic Distributed Object Store).
+ */
+
+
+
+/*
+ * osdmap encoding versions
+ */
+#define CEPH_OSDMAP_INC_VERSION     5
+#define CEPH_OSDMAP_INC_VERSION_EXT 5
+#define CEPH_OSDMAP_VERSION         5
+#define CEPH_OSDMAP_VERSION_EXT     5
+
+/*
+ * fs id
+ */
+struct ceph_fsid {
+	unsigned char fsid[16];
+};
+
+static inline int ceph_fsid_compare(const struct ceph_fsid *a,
+				    const struct ceph_fsid *b)
+{
+	return memcmp(a, b, sizeof(*a));
+}
+
+/*
+ * ino, object, etc.
+ */
+typedef __le64 ceph_snapid_t;
+#define CEPH_SNAPDIR ((__u64)(-1))  /* reserved for hidden .snap dir */
+#define CEPH_NOSNAP  ((__u64)(-2))  /* "head", "live" revision */
+#define CEPH_MAXSNAP ((__u64)(-3))  /* largest valid snapid */
+
+struct ceph_timespec {
+	__le32 tv_sec;
+	__le32 tv_nsec;
+} __attribute__ ((packed));
+
+
+/*
+ * object layout - how objects are mapped into PGs
+ */
+#define CEPH_OBJECT_LAYOUT_HASH     1
+#define CEPH_OBJECT_LAYOUT_LINEAR   2
+#define CEPH_OBJECT_LAYOUT_HASHINO  3
+
+/*
+ * pg layout -- how PGs are mapped onto (sets of) OSDs
+ */
+#define CEPH_PG_LAYOUT_CRUSH  0
+#define CEPH_PG_LAYOUT_HASH   1
+#define CEPH_PG_LAYOUT_LINEAR 2
+#define CEPH_PG_LAYOUT_HYBRID 3
+
+
+/*
+ * placement group.
+ * we encode this into one __le64.
+ */
+struct ceph_pg {
+	__le16 preferred; /* preferred primary osd */
+	__le16 ps;        /* placement seed */
+	__le32 pool;      /* object pool */
+} __attribute__ ((packed));
+
+/*
+ * pg_pool is a set of pgs storing a pool of objects
+ *
+ *  pg_num -- base number of pseudorandomly placed pgs
+ *
+ *  pgp_num -- effective number when calculating pg placement.  this
+ * is used for pg_num increases.  new pgs result in data being "split"
+ * into new pgs.  for this to proceed smoothly, new pgs are initially
+ * colocated with their parents; that is, pgp_num doesn't increase
+ * until the new pgs have successfully split.  only _then_ are the new
+ * pgs placed independently.
+ *
+ *  lpg_num -- localized pg count (per device).  replicas are randomly
+ * selected.
+ *
+ *  lpgp_num -- as above.
+ */
+#define CEPH_PG_TYPE_REP     1
+#define CEPH_PG_TYPE_RAID4   2
+#define CEPH_PG_POOL_VERSION 2
+struct ceph_pg_pool {
+	__u8 type;                /* CEPH_PG_TYPE_* */
+	__u8 size;                /* number of osds in each pg */
+	__u8 crush_ruleset;       /* crush placement rule */
+	__u8 object_hash;         /* hash mapping object name to ps */
+	__le32 pg_num, pgp_num;   /* number of pg's */
+	__le32 lpg_num, lpgp_num; /* number of localized pg's */
+	__le32 last_change;       /* most recent epoch changed */
+	__le64 snap_seq;          /* seq for per-pool snapshot */
+	__le32 snap_epoch;        /* epoch of last snap */
+	__le32 num_snaps;
+	__le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
+	__le64 auid;               /* who owns the pg */
+} __attribute__ ((packed));
+
+/*
+ * stable_mod func is used to control number of placement groups.
+ * similar to straight-up modulo, but produces a stable mapping as b
+ * increases over time.  b is the number of bins, and bmask is the
+ * containing power of 2 minus 1.
+ *
+ * b <= bmask and bmask=(2**n)-1
+ * e.g., b=12 -> bmask=15, b=123 -> bmask=127
+ */
+static inline int ceph_stable_mod(int x, int b, int bmask)
+{
+	if ((x & bmask) < b)
+		return x & bmask;
+	else
+		return x & (bmask >> 1);
+}
+
+/*
+ * object layout - how a given object should be stored.
+ */
+struct ceph_object_layout {
+	struct ceph_pg ol_pgid;   /* raw pg, with _full_ ps precision. */
+	__le32 ol_stripe_unit;    /* for per-object parity, if any */
+} __attribute__ ((packed));
+
+/*
+ * compound epoch+version, used by storage layer to serialize mutations
+ */
+struct ceph_eversion {
+	__le32 epoch;
+	__le64 version;
+} __attribute__ ((packed));
+
+/*
+ * osd map bits
+ */
+
+/* status bits */
+#define CEPH_OSD_EXISTS 1
+#define CEPH_OSD_UP     2
+
+/* osd weights.  fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
+#define CEPH_OSD_IN  0x10000
+#define CEPH_OSD_OUT 0
+
+
+/*
+ * osd map flag bits
+ */
+#define CEPH_OSDMAP_NEARFULL (1<<0)  /* sync writes (near ENOSPC) */
+#define CEPH_OSDMAP_FULL     (1<<1)  /* no data writes (ENOSPC) */
+#define CEPH_OSDMAP_PAUSERD  (1<<2)  /* pause all reads */
+#define CEPH_OSDMAP_PAUSEWR  (1<<3)  /* pause all writes */
+#define CEPH_OSDMAP_PAUSEREC (1<<4)  /* pause recovery */
+
+/*
+ * osd ops
+ */
+#define CEPH_OSD_OP_MODE       0xf000
+#define CEPH_OSD_OP_MODE_RD    0x1000
+#define CEPH_OSD_OP_MODE_WR    0x2000
+#define CEPH_OSD_OP_MODE_RMW   0x3000
+#define CEPH_OSD_OP_MODE_SUB   0x4000
+
+#define CEPH_OSD_OP_TYPE       0x0f00
+#define CEPH_OSD_OP_TYPE_LOCK  0x0100
+#define CEPH_OSD_OP_TYPE_DATA  0x0200
+#define CEPH_OSD_OP_TYPE_ATTR  0x0300
+#define CEPH_OSD_OP_TYPE_EXEC  0x0400
+#define CEPH_OSD_OP_TYPE_PG    0x0500
+
+enum {
+	/** data **/
+	/* read */
+	CEPH_OSD_OP_READ      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
+	CEPH_OSD_OP_STAT      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
+
+	/* fancy read */
+	CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
+
+	/* write */
+	CEPH_OSD_OP_WRITE     = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
+	CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
+	CEPH_OSD_OP_TRUNCATE  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
+	CEPH_OSD_OP_ZERO      = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
+	CEPH_OSD_OP_DELETE    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
+
+	/* fancy write */
+	CEPH_OSD_OP_APPEND    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
+	CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
+	CEPH_OSD_OP_SETTRUNC  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
+	CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
+
+	CEPH_OSD_OP_TMAPUP  = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
+	CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
+	CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
+
+	CEPH_OSD_OP_CREATE  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
+
+	/** attrs **/
+	/* read */
+	CEPH_OSD_OP_GETXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
+	CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
+
+	/* write */
+	CEPH_OSD_OP_SETXATTR  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
+	CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
+	CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
+	CEPH_OSD_OP_RMXATTR   = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
+
+	/** subop **/
+	CEPH_OSD_OP_PULL           = CEPH_OSD_OP_MODE_SUB | 1,
+	CEPH_OSD_OP_PUSH           = CEPH_OSD_OP_MODE_SUB | 2,
+	CEPH_OSD_OP_BALANCEREADS   = CEPH_OSD_OP_MODE_SUB | 3,
+	CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
+	CEPH_OSD_OP_SCRUB          = CEPH_OSD_OP_MODE_SUB | 5,
+
+	/** lock **/
+	CEPH_OSD_OP_WRLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
+	CEPH_OSD_OP_WRUNLOCK  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
+	CEPH_OSD_OP_RDLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
+	CEPH_OSD_OP_RDUNLOCK  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
+	CEPH_OSD_OP_UPLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
+	CEPH_OSD_OP_DNLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
+
+	/** exec **/
+	CEPH_OSD_OP_CALL    = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
+
+	/** pg **/
+	CEPH_OSD_OP_PGLS      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
+};
+
+static inline int ceph_osd_op_type_lock(int op)
+{
+	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
+}
+static inline int ceph_osd_op_type_data(int op)
+{
+	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
+}
+static inline int ceph_osd_op_type_attr(int op)
+{
+	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
+}
+static inline int ceph_osd_op_type_exec(int op)
+{
+	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
+}
+static inline int ceph_osd_op_type_pg(int op)
+{
+	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
+}
+
+static inline int ceph_osd_op_mode_subop(int op)
+{
+	return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
+}
+static inline int ceph_osd_op_mode_read(int op)
+{
+	return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
+}
+static inline int ceph_osd_op_mode_modify(int op)
+{
+	return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
+}
+
+#define CEPH_OSD_TMAP_HDR 'h'
+#define CEPH_OSD_TMAP_SET 's'
+#define CEPH_OSD_TMAP_RM  'r'
+
+extern const char *ceph_osd_op_name(int op);
+
+
+/*
+ * osd op flags
+ *
+ * An op may be READ, WRITE, or READ|WRITE.
+ */
+enum {
+	CEPH_OSD_FLAG_ACK = 1,          /* want (or is) "ack" ack */
+	CEPH_OSD_FLAG_ONNVRAM = 2,      /* want (or is) "onnvram" ack */
+	CEPH_OSD_FLAG_ONDISK = 4,       /* want (or is) "ondisk" ack */
+	CEPH_OSD_FLAG_RETRY = 8,        /* resend attempt */
+	CEPH_OSD_FLAG_READ = 16,        /* op may read */
+	CEPH_OSD_FLAG_WRITE = 32,       /* op may write */
+	CEPH_OSD_FLAG_ORDERSNAP = 64,   /* EOLDSNAP if snapc is out of order */
+	CEPH_OSD_FLAG_PEERSTAT = 128,   /* msg includes osd_peer_stat */
+	CEPH_OSD_FLAG_BALANCE_READS = 256,
+	CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
+	CEPH_OSD_FLAG_PGOP = 1024,      /* pg op, no object */
+	CEPH_OSD_FLAG_EXEC = 2048,      /* op may exec */
+};
+
+enum {
+	CEPH_OSD_OP_FLAG_EXCL = 1,      /* EXCL object create */
+};
+
+#define EOLDSNAPC    ERESTART  /* ORDERSNAP flag set; writer has old snapc*/
+#define EBLACKLISTED ESHUTDOWN /* blacklisted */
+
+/*
+ * an individual object operation.  each may be accompanied by some data
+ * payload
+ */
+struct ceph_osd_op {
+	__le16 op;           /* CEPH_OSD_OP_* */
+	__le32 flags;        /* CEPH_OSD_FLAG_* */
+	union {
+		struct {
+			__le64 offset, length;
+			__le64 truncate_size;
+			__le32 truncate_seq;
+		} __attribute__ ((packed)) extent;
+		struct {
+			__le32 name_len;
+			__le32 value_len;
+		} __attribute__ ((packed)) xattr;
+		struct {
+			__u8 class_len;
+			__u8 method_len;
+			__u8 argc;
+			__le32 indata_len;
+		} __attribute__ ((packed)) cls;
+		struct {
+			__le64 cookie, count;
+		} __attribute__ ((packed)) pgls;
+	};
+	__le32 payload_len;
+} __attribute__ ((packed));
+
+/*
+ * osd request message header.  each request may include multiple
+ * ceph_osd_op object operations.
+ */
+struct ceph_osd_request_head {
+	__le32 client_inc;                 /* client incarnation */
+	struct ceph_object_layout layout;  /* pgid */
+	__le32 osdmap_epoch;               /* client's osdmap epoch */
+
+	__le32 flags;
+
+	struct ceph_timespec mtime;        /* for mutations only */
+	struct ceph_eversion reassert_version; /* if we are replaying op */
+
+	__le32 object_len;     /* length of object name */
+
+	__le64 snapid;         /* snapid to read */
+	__le64 snap_seq;       /* writer's snap context */
+	__le32 num_snaps;
+
+	__le16 num_ops;
+	struct ceph_osd_op ops[];  /* followed by ops[], obj, ticket, snaps */
+} __attribute__ ((packed));
+
+struct ceph_osd_reply_head {
+	__le32 client_inc;                /* client incarnation */
+	__le32 flags;
+	struct ceph_object_layout layout;
+	__le32 osdmap_epoch;
+	struct ceph_eversion reassert_version; /* for replaying uncommitted */
+
+	__le32 result;                    /* result code */
+
+	__le32 object_len;                /* length of object name */
+	__le32 num_ops;
+	struct ceph_osd_op ops[0];  /* ops[], object */
+} __attribute__ ((packed));
+
+
+#endif
diff --git a/block/rbd.c b/block/rbd.c
new file mode 100644
index 0000000..eedae50
--- /dev/null
+++ b/block/rbd.c
@@ -0,0 +1,585 @@
+/*
+ * QEMU Block driver for RADOS (Ceph)
+ *
+ * Copyright (C) 2010 Christian Brunner <chb@muc.de>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include <sys/types.h>
+#include <stdbool.h>
+
+#include <qemu-common.h>
+
+#include "rbd_types.h"
+#include "rados.h"
+#include "module.h"
+#include "block_int.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <rados/librados.h>
+
+#include <signal.h>
+
+/*
+ * When specifying the image filename use:
+ *
+ * rbd:poolname/devicename
+ *
+ * poolname must be the name of an existing rados pool
+ *
+ * devicename is the basename for all objects used to
+ * emulate the raw device.
+ *
+ * Metadata information (image size, ...) is stored in an
+ * object with the name "devicename.rbd".
+ *
+ * The raw device is split into 4MB sized objects by default.
+ * The sequencenumber is encoded in a 12 byte long hex-string,
+ * and is attached to the devicename, separated by a dot.
+ * e.g. "devicename.1234567890ab"
+ *
+ */
+
+#define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER)
+
+typedef struct RBDAIOCB {
+    BlockDriverAIOCB common;
+    QEMUBH *bh;
+    int ret;
+    QEMUIOVector *qiov;
+    char *bounce;
+    int write;
+    int64_t sector_num;
+    int aiocnt;
+    int error;
+} RBDAIOCB;
+
+typedef struct RADOSCB {
+    int rcbid;
+    RBDAIOCB *acb;
+    int done;
+    int64_t segsize;
+    char *buf;
+} RADOSCB;
+
+typedef struct RBDRVRBDState {
+    rados_pool_t pool;
+    char name[RBD_MAX_OBJ_NAME_SIZE];
+    int name_len;
+    uint64_t size;
+    uint64_t objsize;
+} RBDRVRBDState;
+
+typedef struct rbd_obj_header_ondisk RbdHeader1;
+
+static int rbd_parsename(const char *filename, char *pool, char *name)
+{
+    const char *rbdname;
+    char *p, *n;
+    int l;
+
+    if (!strstart(filename, "rbd:", &rbdname)) {
+        return -EINVAL;
+    }
+
+    pstrcpy(pool, 2 * RBD_MAX_SEG_NAME_SIZE, rbdname);
+    p = strchr(pool, '/');
+    if (p == NULL) {
+        return -EINVAL;
+    }
+
+    *p = '\0';
+    n = ++p;
+
+    l = strlen(n);
+
+    if (l > RBD_MAX_OBJ_NAME_SIZE) {
+        fprintf(stderr, "object name to long\n");
+        return -EINVAL;
+    } else if (l <= 0) {
+        fprintf(stderr, "object name to short\n");
+        return -EINVAL;
+    }
+
+    strcpy(name, n);
+
+    return l;
+}
+
+static int create_tmap_op(uint8_t op, const char *name, char **tmap_desc)
+{
+    uint32_t len = strlen(name);
+    uint32_t total_len = 1 + (sizeof(uint32_t) + len) + sizeof(uint32_t);       /* encoding op + name + empty buffer */
+    char *desc;
+
+    desc = qemu_malloc(total_len);
+    if (!desc) {
+        return -ENOMEM;
+    }
+
+    *tmap_desc = desc;
+
+    *desc = op;
+    desc++;
+    memcpy(desc, &len, sizeof(len));
+    desc += sizeof(len);
+    memcpy(desc, name, len);
+    desc += len;
+    len = 0;
+    memcpy(desc, &len, sizeof(len));
+    desc += sizeof(len);
+
+    return desc - *tmap_desc;
+}
+
+static void free_tmap_op(char *tmap_desc)
+{
+    qemu_free(tmap_desc);
+}
+
+static int rbd_register_image(rados_pool_t pool, const char *name)
+{
+    char *tmap_desc;
+    const char *dir = RBD_DIRECTORY;
+    int ret;
+
+    ret = create_tmap_op(CEPH_OSD_TMAP_SET, name, &tmap_desc);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = rados_tmap_update(pool, dir, tmap_desc, ret);
+    free_tmap_op(tmap_desc);
+
+    return ret;
+}
+
+static int rbd_create(const char *filename, QEMUOptionParameter *options)
+{
+    int64_t bytes = 0;
+    int64_t objsize;
+    uint64_t size;
+    time_t mtime;
+    uint8_t obj_order = RBD_DEFAULT_OBJ_ORDER;
+    char pool[RBD_MAX_SEG_NAME_SIZE];
+    char n[RBD_MAX_SEG_NAME_SIZE];
+    char name[RBD_MAX_SEG_NAME_SIZE];
+    RbdHeader1 header;
+    rados_pool_t p;
+    int name_len;
+    int ret;
+
+    if ((name_len = rbd_parsename(filename, pool, name)) < 0) {
+        return -EINVAL;
+    }
+
+    snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s%s", name, RBD_SUFFIX);
+
+    /* Read out options */
+    while (options && options->name) {
+        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+            bytes = options->value.n;
+        } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
+            if (options->value.n) {
+                objsize = options->value.n;
+                if (!objsize || ((objsize - 1) & objsize)) {    /* not a power of 2? */
+                    fprintf(stderr, "obj size needs to be power of 2\n");
+                    return -EINVAL;
+                }
+                if (objsize < 4096) {
+                    fprintf(stderr, "obj size too small\n");
+                    return -EINVAL;
+                }
+
+                for (obj_order = 0; obj_order < 64; obj_order++) {
+                    if (objsize == 1)
+                        break;
+                    objsize >>= 1;
+                }
+            }
+        }
+        options++;
+    }
+
+    memset(&header, 0, sizeof(header));
+    pstrcpy(header.text, sizeof(header.text), rbd_text);
+    pstrcpy(header.signature, sizeof(header.signature), rbd_signature);
+    pstrcpy(header.version, sizeof(header.version), rbd_version);
+    header.image_size = bytes;
+    cpu_to_le64s((uint64_t *) & header.image_size);
+    header.obj_order = obj_order;
+    header.crypt_type = RBD_CRYPT_NONE;
+    header.comp_type = RBD_COMP_NONE;
+    header.snap_seq = 0;
+    header.snap_count = 0;
+    cpu_to_le32s(&header.snap_count);
+
+    if (rados_initialize(0, NULL) < 0) {
+        fprintf(stderr, "error initializing\n");
+        return -EIO;
+    }
+
+    if (rados_open_pool(pool, &p)) {
+        fprintf(stderr, "error opening pool %s\n", pool);
+        return -EIO;
+    }
+
+    /* check for existing rbd header file */
+    ret = rados_stat(p, n, &size, &mtime);
+    if (ret == 0) {
+        ret=-EEXIST;
+        goto done;
+    }
+
+    /* create header file */
+    ret = rados_write(p, n, 0, (const char *)&header, sizeof(header));
+    if (ret < 0) {
+        goto done;
+    }
+
+    ret = rbd_register_image(p, name);
+done:
+    rados_close_pool(p);
+    rados_deinitialize();
+
+    return ret;
+}
+
+static int rbd_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    RBDRVRBDState *s = bs->opaque;
+    char pool[RBD_MAX_SEG_NAME_SIZE];
+    char n[RBD_MAX_SEG_NAME_SIZE];
+    char hbuf[4096];
+
+    if ((s->name_len = rbd_parsename(filename, pool, s->name)) < 0) {
+        return -EINVAL;
+    }
+    snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s%s", s->name, RBD_SUFFIX);
+
+    if (rados_initialize(0, NULL) < 0) {
+        fprintf(stderr, "error initializing\n");
+        return -EIO;
+    }
+
+    if (rados_open_pool(pool, &s->pool)) {
+        fprintf(stderr, "error opening pool %s\n", pool);
+        return -EIO;
+    }
+
+    if (rados_read(s->pool, n, 0, hbuf, 4096) < 0) {
+        fprintf(stderr, "error reading header from %s\n", s->name);
+        return -EIO;
+    }
+    if (!strncmp(hbuf + 64, rbd_signature, 4)) {
+        if (!strncmp(hbuf + 68, rbd_version, 8)) {
+            RbdHeader1 *header;
+
+            header = (RbdHeader1 *) hbuf;
+            le64_to_cpus((uint64_t *) & header->image_size);
+            s->size = header->image_size;
+            s->objsize = 1 << header->obj_order;
+        } else {
+            fprintf(stderr, "Unknown image version %s\n", hbuf + 68);
+            return -EIO;
+        }
+    } else {
+        fprintf(stderr, "Invalid header signature %s\n", hbuf + 64);
+        return -EIO;
+    }
+
+    return 0;
+}
+
+static void rbd_close(BlockDriverState *bs)
+{
+    RBDRVRBDState *s = bs->opaque;
+
+    rados_close_pool(s->pool);
+    rados_deinitialize();
+}
+
+static int rbd_rw(BlockDriverState *bs, int64_t sector_num,
+                  uint8_t *buf, int nb_sectors, int write)
+{
+    RBDRVRBDState *s = bs->opaque;
+    char n[RBD_MAX_SEG_NAME_SIZE];
+
+    int64_t segnr, segoffs, segsize, r;
+    int64_t off, size;
+
+    off = sector_num * 512;
+    size = nb_sectors * 512;
+    segnr = (int64_t) (off / s->objsize);
+    segoffs = (int64_t) (off % s->objsize);
+    segsize = (int64_t) (s->objsize - segoffs);
+
+    while (size > 0) {
+        if (size < segsize) {
+            segsize = size;
+        }
+
+        snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s.%012llx", s->name,
+                 (long long unsigned int)segnr);
+
+        if (write) {
+            if ((r = rados_write(s->pool, n, segoffs, (const char *)buf,
+                segsize)) < 0) {
+                return r;
+            }
+        } else {
+            r = rados_read(s->pool, n, segoffs, (char *)buf, segsize);
+            if (r == -ENOENT) {
+                memset(buf, 0, segsize);
+            } else if (r < 0) {
+                return(r);
+            } else if (r < segsize) {
+                memset(buf + r, 0, segsize - r);
+            }
+            r = segsize;
+        }
+
+        buf += segsize;
+        size -= segsize;
+        segoffs = 0;
+        segsize = s->objsize;
+        segnr++;
+    }
+
+    return (0);
+}
+
+static int rbd_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    return rbd_rw(bs, sector_num, buf, nb_sectors, 0);
+}
+
+static int rbd_write(BlockDriverState *bs, int64_t sector_num,
+                     const uint8_t *buf, int nb_sectors)
+{
+    return rbd_rw(bs, sector_num, (uint8_t *) buf, nb_sectors, 1);
+}
+
+static void rbd_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    RBDAIOCB *acb = (RBDAIOCB *) blockacb;
+    qemu_bh_delete(acb->bh);
+    acb->bh = NULL;
+    qemu_aio_release(acb);
+}
+
+static AIOPool rbd_aio_pool = {
+    .aiocb_size = sizeof(RBDAIOCB),
+    .cancel = rbd_aio_cancel,
+};
+
+/* This is the callback function for rados_aio_read and _write */
+static void rbd_finish_aiocb(rados_completion_t c, RADOSCB *rcb)
+{
+    RBDAIOCB *acb = rcb->acb;
+    int64_t r;
+    int i;
+
+    acb->aiocnt--;
+    r = rados_aio_get_return_value(c);
+    rados_aio_release(c);
+    if (acb->write) {
+        if (r < 0) {
+            acb->ret = r;
+            acb->error = 1;
+        } else if (!acb->error) {
+            acb->ret += rcb->segsize;
+        }
+    } else {
+        if (r == -ENOENT) {
+            memset(rcb->buf, 0, rcb->segsize);
+            if (!acb->error) {
+                acb->ret += rcb->segsize;
+            }
+        } else if (r < 0) {
+            acb->ret = r;
+            acb->error = 1;
+        } else if (r < rcb->segsize) {
+            memset(rcb->buf + r, 0, rcb->segsize - r);
+            if (!acb->error) {
+                acb->ret += rcb->segsize;
+            }
+        } else if (!acb->error) {
+            acb->ret += r;
+        }
+    }
+    qemu_free(rcb);
+    i = 0;
+    if (!acb->aiocnt && acb->bh) {
+        qemu_bh_schedule(acb->bh);
+    }
+}
+
+/* Callback when all queued rados_aio requests are complete */
+static void rbd_aio_bh_cb(void *opaque)
+{
+    RBDAIOCB *acb = opaque;
+
+    if (!acb->write) {
+        qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
+    }
+    qemu_vfree(acb->bounce);
+    acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
+    qemu_bh_delete(acb->bh);
+    acb->bh = NULL;
+    qemu_aio_release(acb);
+}
+
+static BlockDriverAIOCB *rbd_aio_rw_vector(BlockDriverState *bs,
+                                           int64_t sector_num,
+                                           QEMUIOVector *qiov,
+                                           int nb_sectors,
+                                           BlockDriverCompletionFunc *cb,
+                                           void *opaque, int write)
+{
+    RBDAIOCB *acb;
+    RADOSCB *rcb;
+    rados_completion_t c;
+    char n[RBD_MAX_SEG_NAME_SIZE];
+    int64_t segnr, segoffs, segsize, last_segnr;
+    int64_t off, size;
+    char *buf;
+
+    RBDRVRBDState *s = bs->opaque;
+
+    acb = qemu_aio_get(&rbd_aio_pool, bs, cb, opaque);
+    acb->write = write;
+    acb->qiov = qiov;
+    acb->bounce = qemu_blockalign(bs, qiov->size);
+    acb->aiocnt = 0;
+    acb->ret = 0;
+    acb->error = 0;
+
+    if (!acb->bh) {
+        acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb);
+    }
+
+    if (write) {
+        qemu_iovec_to_buffer(acb->qiov, acb->bounce);
+    }
+
+    buf = acb->bounce;
+
+    off = sector_num * 512;
+    size = nb_sectors * 512;
+    segnr = (int64_t) (off / s->objsize);
+    segoffs = (int64_t) (off % s->objsize);
+    segsize = (int64_t) (s->objsize - segoffs);
+
+    last_segnr = ((off + size - 1) / s->objsize);
+    acb->aiocnt = (last_segnr - segnr) + 1;
+
+    while (size > 0) {
+        if (size < segsize) {
+            segsize = size;
+        }
+
+        snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s.%012llx", s->name,
+                 (long long unsigned int)segnr);
+
+        rcb = qemu_malloc(sizeof(RADOSCB));
+        rcb->done = 0;
+        rcb->acb = acb;
+        rcb->segsize = segsize;
+        rcb->buf = buf;
+
+        if (write) {
+            rados_aio_create_completion(rcb, NULL,
+                                        (rados_callback_t) rbd_finish_aiocb, &c);
+            rados_aio_write(s->pool, n, segoffs, buf, segsize, c);
+        } else {
+            rados_aio_create_completion(rcb, (rados_callback_t) rbd_finish_aiocb,
+                                        NULL, &c);
+            rados_aio_read(s->pool, n, segoffs, buf, segsize, c);
+        }
+
+        buf += segsize;
+        size -= segsize;
+        segoffs = 0;
+        segsize = s->objsize;
+        segnr++;
+    }
+
+    return &acb->common;
+}
+
+static BlockDriverAIOCB *rbd_aio_readv(BlockDriverState *bs,
+                                       int64_t sector_num, QEMUIOVector *qiov,
+                                       int nb_sectors,
+                                       BlockDriverCompletionFunc *cb,
+                                       void *opaque)
+{
+    return rbd_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
+}
+
+static BlockDriverAIOCB *rbd_aio_writev(BlockDriverState *bs,
+                                        int64_t sector_num, QEMUIOVector *qiov,
+                                        int nb_sectors,
+                                        BlockDriverCompletionFunc *cb,
+                                        void *opaque)
+{
+    return rbd_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
+}
+
+static int rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+    RBDRVRBDState *s = bs->opaque;
+    bdi->cluster_size = s->objsize;
+    return 0;
+}
+
+static int64_t rbd_getlength(BlockDriverState *bs)
+{
+    RBDRVRBDState *s = bs->opaque;
+
+    return s->size;
+}
+
+static QEMUOptionParameter rbd_create_options[] = {
+    {
+     .name = BLOCK_OPT_SIZE,
+     .type = OPT_SIZE,
+     .help = "Virtual disk size"
+    },
+    {
+     .name = BLOCK_OPT_CLUSTER_SIZE,
+     .type = OPT_SIZE,
+     .help = "RBD object size"
+    },
+    {NULL}
+};
+
+static BlockDriver bdrv_rbd = {
+    .format_name = "rbd",
+    .instance_size = sizeof(RBDRVRBDState),
+    .bdrv_open = rbd_open,
+    .bdrv_read = rbd_read,
+    .bdrv_write = rbd_write,
+    .bdrv_close = rbd_close,
+    .bdrv_create = rbd_create,
+    .bdrv_get_info = rbd_getinfo,
+    .create_options = rbd_create_options,
+    .bdrv_getlength = rbd_getlength,
+    .protocol_name = "rbd",
+
+    .bdrv_aio_readv = rbd_aio_readv,
+    .bdrv_aio_writev = rbd_aio_writev,
+};
+
+static void bdrv_rbd_init(void)
+{
+    bdrv_register(&bdrv_rbd);
+}
+
+block_init(bdrv_rbd_init);
diff --git a/block/rbd_types.h b/block/rbd_types.h
new file mode 100644
index 0000000..dfd5aa0
--- /dev/null
+++ b/block/rbd_types.h
@@ -0,0 +1,48 @@
+#ifndef _FS_CEPH_RBD
+#define _FS_CEPH_RBD
+
+#include <linux/types.h>
+
+/*
+ * rbd image 'foo' consists of objects
+ *   foo.rbd      - image metadata
+ *   foo.00000000
+ *   foo.00000001
+ *   ...          - data
+ */
+
+#define RBD_SUFFIX	 	".rbd"
+#define RBD_DIRECTORY           "rbd_directory"
+
+#define RBD_DEFAULT_OBJ_ORDER	22   /* 4MB */
+
+#define RBD_MAX_OBJ_NAME_SIZE	96
+#define RBD_MAX_SEG_NAME_SIZE	128
+
+#define RBD_COMP_NONE		0
+#define RBD_CRYPT_NONE		0
+
+static const char rbd_text[] = "<<< Rados Block Device Image >>>\n";
+static const char rbd_signature[] = "RBD";
+static const char rbd_version[] = "001.001";
+
+struct rbd_obj_snap_ondisk {
+	__le64 id;
+	__le64 image_size;
+} __attribute__((packed));
+
+struct rbd_obj_header_ondisk {
+	char text[64];
+	char signature[4];
+	char version[8];
+	__le64 image_size;
+	__u8 obj_order;
+	__u8 crypt_type;
+	__u8 comp_type;
+	__le32 snap_seq;
+	__le32 snap_count;
+	__le64 snap_names_len;
+	struct rbd_obj_snap_ondisk snaps[0];
+} __attribute__((packed));
+
+#endif
diff --git a/configure b/configure
index 36d028f..d07a7e5 100755
--- a/configure
+++ b/configure
@@ -299,6 +299,7 @@ pkgversion=""
 check_utests="no"
 user_pie="no"
 zero_malloc=""
+rbd="no"
 
 # OS specific
 if check_define __linux__ ; then
@@ -660,6 +661,8 @@ for opt do
   ;;
   --enable-vhost-net) vhost_net="yes"
   ;;
+  --enable-rbd) rbd="yes"
+  ;;
   *) echo "ERROR: unknown option $opt"; show_help="yes"
   ;;
   esac
@@ -826,6 +829,7 @@ echo "  --enable-docs            enable documentation build"
 echo "  --disable-docs           disable documentation build"
 echo "  --disable-vhost-net      disable vhost-net acceleration support"
 echo "  --enable-vhost-net       enable vhost-net acceleration support"
+echo "  --enable-rbd		 enable building the rados block device (rbd)"
 echo ""
 echo "NOTE: The object files are built at the place where configure is launched"
 exit 1
@@ -1569,6 +1573,25 @@ if test "$mingw32" != yes -a "$pthread" = no; then
 fi
 
 ##########################################
+# rbd probe
+if test "$rbd" != "no" ; then
+  cat > $TMPC <<EOF
+#include <stdio.h>
+#include <rados/librados.h>
+int main(void) { rados_initialize(0, NULL); return 0; }
+EOF
+  if compile_prog "" "-lrados -lcrypto" ; then
+    rbd=yes
+    LIBS="$LIBS -lrados -lcrypto"
+  else
+    if test "$rbd" = "yes" ; then
+      feature_not_found "rados block device"
+    fi
+    rbd=no
+  fi
+fi
+
+##########################################
 # linux-aio probe
 
 if test "$linux_aio" != "no" ; then
@@ -2031,6 +2054,7 @@ echo "preadv support    $preadv"
 echo "fdatasync         $fdatasync"
 echo "uuid support      $uuid"
 echo "vhost-net support $vhost_net"
+echo "rbd support       $rbd"
 
 if test $sdl_too_old = "yes"; then
 echo "-> Your SDL version is too old - please upgrade to have SDL support"
@@ -2260,6 +2284,9 @@ echo "CONFIG_UNAME_RELEASE=\"$uname_release\"" >> $config_host_mak
 if test "$zero_malloc" = "yes" ; then
   echo "CONFIG_ZERO_MALLOC=y" >> $config_host_mak
 fi
+if test "$rbd" = "yes" ; then
+  echo "CONFIG_RBD=y" >> $config_host_mak
+fi
 
 # USB host support
 case "$usb" in
-- 
1.7.0.4


* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-19 19:22 [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm Christian Brunner
@ 2010-05-20 20:31 ` Blue Swirl
  2010-05-20 21:18   ` Christian Brunner
  2010-05-20 23:02   ` Yehuda Sadeh Weinraub
  0 siblings, 2 replies; 64+ messages in thread
From: Blue Swirl @ 2010-05-20 20:31 UTC (permalink / raw)
  To: Christian Brunner; +Cc: ceph-devel, qemu-devel, kvm

On Wed, May 19, 2010 at 7:22 PM, Christian Brunner <chb@muc.de> wrote:
> The attached patch is a block driver for the distributed file system
> Ceph (http://ceph.newdream.net/). The driver uses librados (which
> is part of the Ceph server) for direct access to the Ceph object
> store and runs entirely in userspace. It is therefore
> called "rbd" - rados block device.
>
> To compile the driver, a recent version of Ceph (>= 0.20.1) is needed,
> and you have to pass "--enable-rbd" when running configure.
>
> Additional information is available on the Ceph-Wiki:
>
> http://ceph.newdream.net/wiki/Kvm-rbd


I have no idea whether it makes sense to add Ceph (no objection
either). I have some minor comments below.

>
> ---
>  Makefile          |    3 +
>  Makefile.objs     |    1 +
>  block/rados.h     |  376 ++++++++++++++++++++++++++++++++++
>  block/rbd.c       |  585 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  block/rbd_types.h |   48 +++++
>  configure         |   27 +++
>  6 files changed, 1040 insertions(+), 0 deletions(-)
>  create mode 100644 block/rados.h
>  create mode 100644 block/rbd.c
>  create mode 100644 block/rbd_types.h
>
> diff --git a/Makefile b/Makefile
> index eb9e02b..b1ab3e9 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -27,6 +27,9 @@ configure: ;
>  $(call set-vpath, $(SRC_PATH):$(SRC_PATH)/hw)
>
>  LIBS+=-lz $(LIBS_TOOLS)
> +ifdef CONFIG_RBD
> +LIBS+=-lrados
> +endif
>
>  ifdef BUILD_DOCS
>  DOCS=qemu-doc.html qemu-tech.html qemu.1 qemu-img.1 qemu-nbd.8
> diff --git a/Makefile.objs b/Makefile.objs
> index acbaf22..85791ac 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -18,6 +18,7 @@ block-nested-y += parallels.o nbd.o blkdebug.o
>  block-nested-$(CONFIG_WIN32) += raw-win32.o
>  block-nested-$(CONFIG_POSIX) += raw-posix.o
>  block-nested-$(CONFIG_CURL) += curl.o
> +block-nested-$(CONFIG_RBD) += rbd.o
>
>  block-obj-y +=  $(addprefix block/, $(block-nested-y))
>
> diff --git a/block/rados.h b/block/rados.h
> new file mode 100644
> index 0000000..6cde9a1
> --- /dev/null
> +++ b/block/rados.h
> @@ -0,0 +1,376 @@
> +#ifndef __RADOS_H
> +#define __RADOS_H

IIRC identifiers starting with a double underscore are reserved, so this
may conflict with system headers. Please use something like QEMU_BLOCK_RADOS_H.
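i.e. something along these lines (the exact name is only a suggestion):

    #ifndef QEMU_BLOCK_RADOS_H
    #define QEMU_BLOCK_RADOS_H
    ...
    #endif /* QEMU_BLOCK_RADOS_H */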

> +
> +/*
> + * Data types for the Ceph distributed object storage layer RADOS
> + * (Reliable Autonomic Distributed Object Store).
> + */
> +
> +
> +
> +/*
> + * osdmap encoding versions
> + */
> +#define CEPH_OSDMAP_INC_VERSION     5
> +#define CEPH_OSDMAP_INC_VERSION_EXT 5
> +#define CEPH_OSDMAP_VERSION         5
> +#define CEPH_OSDMAP_VERSION_EXT     5
> +
> +/*
> + * fs id
> + */
> +struct ceph_fsid {
> +       unsigned char fsid[16];

The indentation is too deep; please check elsewhere too.

> +};
> +
> +static inline int ceph_fsid_compare(const struct ceph_fsid *a,
> +                                   const struct ceph_fsid *b)
> +{
> +       return memcmp(a, b, sizeof(*a));
> +}
> +
> +/*
> + * ino, object, etc.
> + */
> +typedef __le64 ceph_snapid_t;

Please use uint64_t and le_to_cpu()/cpu_to_le().

> +#define CEPH_SNAPDIR ((__u64)(-1))  /* reserved for hidden .snap dir */

Likewise, uint64_t is the standard type. Also other places.
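A rough sketch of what that would look like (the conversion then happens
explicitly wherever a field is read or written):

    typedef uint64_t ceph_snapid_t;
    #define CEPH_SNAPDIR ((uint64_t)(-1))  /* reserved for hidden .snap dir */

    /* e.g. when filling in the header in rbd_create(): */
    header.image_size = cpu_to_le64(bytes);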

> +#define CEPH_NOSNAP  ((__u64)(-2))  /* "head", "live" revision */
> +#define CEPH_MAXSNAP ((__u64)(-3))  /* largest valid snapid */
> +
> +struct ceph_timespec {
> +       __le32 tv_sec;
> +       __le32 tv_nsec;
> +} __attribute__ ((packed));
> +
> +
> +/*
> + * object layout - how objects are mapped into PGs
> + */
> +#define CEPH_OBJECT_LAYOUT_HASH     1
> +#define CEPH_OBJECT_LAYOUT_LINEAR   2
> +#define CEPH_OBJECT_LAYOUT_HASHINO  3
> +
> +/*
> + * pg layout -- how PGs are mapped onto (sets of) OSDs
> + */
> +#define CEPH_PG_LAYOUT_CRUSH  0
> +#define CEPH_PG_LAYOUT_HASH   1
> +#define CEPH_PG_LAYOUT_LINEAR 2
> +#define CEPH_PG_LAYOUT_HYBRID 3
> +
> +
> +/*
> + * placement group.
> + * we encode this into one __le64.
> + */
> +struct ceph_pg {
> +       __le16 preferred; /* preferred primary osd */
> +       __le16 ps;        /* placement seed */
> +       __le32 pool;      /* object pool */
> +} __attribute__ ((packed));
> +
> +/*
> + * pg_pool is a set of pgs storing a pool of objects
> + *
> + *  pg_num -- base number of pseudorandomly placed pgs
> + *
> + *  pgp_num -- effective number when calculating pg placement.  this
> + * is used for pg_num increases.  new pgs result in data being "split"
> + * into new pgs.  for this to proceed smoothly, new pgs are initially
> + * colocated with their parents; that is, pgp_num doesn't increase
> + * until the new pgs have successfully split.  only _then_ are the new
> + * pgs placed independently.
> + *
> + *  lpg_num -- localized pg count (per device).  replicas are randomly
> + * selected.
> + *
> + *  lpgp_num -- as above.
> + */
> +#define CEPH_PG_TYPE_REP     1
> +#define CEPH_PG_TYPE_RAID4   2
> +#define CEPH_PG_POOL_VERSION 2
> +struct ceph_pg_pool {
> +       __u8 type;                /* CEPH_PG_TYPE_* */
> +       __u8 size;                /* number of osds in each pg */
> +       __u8 crush_ruleset;       /* crush placement rule */
> +       __u8 object_hash;         /* hash mapping object name to ps */
> +       __le32 pg_num, pgp_num;   /* number of pg's */
> +       __le32 lpg_num, lpgp_num; /* number of localized pg's */
> +       __le32 last_change;       /* most recent epoch changed */

Is the intent here that one uint32_t is implicitly added for padding
or is the structure really unaligned? I'd make the padding explicit to
be sure.

> +       __le64 snap_seq;          /* seq for per-pool snapshot */
> +       __le32 snap_epoch;        /* epoch of last snap */
> +       __le32 num_snaps;
> +       __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */

Unaligned?

> +       __le64 auid;               /* who owns the pg */
> +} __attribute__ ((packed));
> +
> +/*
> + * stable_mod func is used to control number of placement groups.
> + * similar to straight-up modulo, but produces a stable mapping as b
> + * increases over time.  b is the number of bins, and bmask is the
> + * containing power of 2 minus 1.
> + *
> + * b <= bmask and bmask=(2**n)-1
> + * e.g., b=12 -> bmask=15, b=123 -> bmask=127
> + */
> +static inline int ceph_stable_mod(int x, int b, int bmask)
> +{
> +       if ((x & bmask) < b)
> +               return x & bmask;
> +       else
> +               return x & (bmask >> 1);

Please check CODING_STYLE for brace use.
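i.e. with the mandatory braces:

    if ((x & bmask) < b) {
        return x & bmask;
    } else {
        return x & (bmask >> 1);
    }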

> +}
> +
> +/*
> + * object layout - how a given object should be stored.
> + */
> +struct ceph_object_layout {
> +       struct ceph_pg ol_pgid;   /* raw pg, with _full_ ps precision. */
> +       __le32 ol_stripe_unit;    /* for per-object parity, if any */
> +} __attribute__ ((packed));
> +
> +/*
> + * compound epoch+version, used by storage layer to serialize mutations
> + */
> +struct ceph_eversion {
> +       __le32 epoch;

Unaligned?

> +       __le64 version;
> +} __attribute__ ((packed));
> +
> +/*
> + * osd map bits
> + */
> +
> +/* status bits */
> +#define CEPH_OSD_EXISTS 1
> +#define CEPH_OSD_UP     2
> +
> +/* osd weights.  fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
> +#define CEPH_OSD_IN  0x10000
> +#define CEPH_OSD_OUT 0
> +
> +
> +/*
> + * osd map flag bits
> + */
> +#define CEPH_OSDMAP_NEARFULL (1<<0)  /* sync writes (near ENOSPC) */
> +#define CEPH_OSDMAP_FULL     (1<<1)  /* no data writes (ENOSPC) */
> +#define CEPH_OSDMAP_PAUSERD  (1<<2)  /* pause all reads */
> +#define CEPH_OSDMAP_PAUSEWR  (1<<3)  /* pause all writes */
> +#define CEPH_OSDMAP_PAUSEREC (1<<4)  /* pause recovery */
> +
> +/*
> + * osd ops
> + */
> +#define CEPH_OSD_OP_MODE       0xf000
> +#define CEPH_OSD_OP_MODE_RD    0x1000
> +#define CEPH_OSD_OP_MODE_WR    0x2000
> +#define CEPH_OSD_OP_MODE_RMW   0x3000
> +#define CEPH_OSD_OP_MODE_SUB   0x4000
> +
> +#define CEPH_OSD_OP_TYPE       0x0f00
> +#define CEPH_OSD_OP_TYPE_LOCK  0x0100
> +#define CEPH_OSD_OP_TYPE_DATA  0x0200
> +#define CEPH_OSD_OP_TYPE_ATTR  0x0300
> +#define CEPH_OSD_OP_TYPE_EXEC  0x0400
> +#define CEPH_OSD_OP_TYPE_PG    0x0500
> +
> +enum {
> +       /** data **/
> +       /* read */
> +       CEPH_OSD_OP_READ      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
> +       CEPH_OSD_OP_STAT      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
> +
> +       /* fancy read */
> +       CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
> +
> +       /* write */
> +       CEPH_OSD_OP_WRITE     = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
> +       CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
> +       CEPH_OSD_OP_TRUNCATE  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
> +       CEPH_OSD_OP_ZERO      = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
> +       CEPH_OSD_OP_DELETE    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
> +
> +       /* fancy write */
> +       CEPH_OSD_OP_APPEND    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
> +       CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
> +       CEPH_OSD_OP_SETTRUNC  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
> +       CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
> +
> +       CEPH_OSD_OP_TMAPUP  = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
> +       CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
> +       CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
> +
> +       CEPH_OSD_OP_CREATE  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
> +
> +       /** attrs **/
> +       /* read */
> +       CEPH_OSD_OP_GETXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
> +       CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
> +
> +       /* write */
> +       CEPH_OSD_OP_SETXATTR  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
> +       CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
> +       CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
> +       CEPH_OSD_OP_RMXATTR   = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
> +
> +       /** subop **/
> +       CEPH_OSD_OP_PULL           = CEPH_OSD_OP_MODE_SUB | 1,
> +       CEPH_OSD_OP_PUSH           = CEPH_OSD_OP_MODE_SUB | 2,
> +       CEPH_OSD_OP_BALANCEREADS   = CEPH_OSD_OP_MODE_SUB | 3,
> +       CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
> +       CEPH_OSD_OP_SCRUB          = CEPH_OSD_OP_MODE_SUB | 5,
> +
> +       /** lock **/
> +       CEPH_OSD_OP_WRLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
> +       CEPH_OSD_OP_WRUNLOCK  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
> +       CEPH_OSD_OP_RDLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
> +       CEPH_OSD_OP_RDUNLOCK  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
> +       CEPH_OSD_OP_UPLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
> +       CEPH_OSD_OP_DNLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
> +
> +       /** exec **/
> +       CEPH_OSD_OP_CALL    = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
> +
> +       /** pg **/
> +       CEPH_OSD_OP_PGLS      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
> +};
> +
> +static inline int ceph_osd_op_type_lock(int op)
> +{
> +       return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
> +}
> +static inline int ceph_osd_op_type_data(int op)
> +{
> +       return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
> +}
> +static inline int ceph_osd_op_type_attr(int op)
> +{
> +       return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
> +}
> +static inline int ceph_osd_op_type_exec(int op)
> +{
> +       return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
> +}
> +static inline int ceph_osd_op_type_pg(int op)
> +{
> +       return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
> +}
> +
> +static inline int ceph_osd_op_mode_subop(int op)
> +{
> +       return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
> +}
> +static inline int ceph_osd_op_mode_read(int op)
> +{
> +       return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
> +}
> +static inline int ceph_osd_op_mode_modify(int op)
> +{
> +       return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
> +}
> +
> +#define CEPH_OSD_TMAP_HDR 'h'
> +#define CEPH_OSD_TMAP_SET 's'
> +#define CEPH_OSD_TMAP_RM  'r'
> +
> +extern const char *ceph_osd_op_name(int op);
> +
> +
> +/*
> + * osd op flags
> + *
> + * An op may be READ, WRITE, or READ|WRITE.
> + */
> +enum {
> +       CEPH_OSD_FLAG_ACK = 1,          /* want (or is) "ack" ack */
> +       CEPH_OSD_FLAG_ONNVRAM = 2,      /* want (or is) "onnvram" ack */
> +       CEPH_OSD_FLAG_ONDISK = 4,       /* want (or is) "ondisk" ack */
> +       CEPH_OSD_FLAG_RETRY = 8,        /* resend attempt */
> +       CEPH_OSD_FLAG_READ = 16,        /* op may read */
> +       CEPH_OSD_FLAG_WRITE = 32,       /* op may write */
> +       CEPH_OSD_FLAG_ORDERSNAP = 64,   /* EOLDSNAP if snapc is out of order */
> +       CEPH_OSD_FLAG_PEERSTAT = 128,   /* msg includes osd_peer_stat */
> +       CEPH_OSD_FLAG_BALANCE_READS = 256,
> +       CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
> +       CEPH_OSD_FLAG_PGOP = 1024,      /* pg op, no object */
> +       CEPH_OSD_FLAG_EXEC = 2048,      /* op may exec */
> +};
> +
> +enum {
> +       CEPH_OSD_OP_FLAG_EXCL = 1,      /* EXCL object create */
> +};
> +
> +#define EOLDSNAPC    ERESTART  /* ORDERSNAP flag set; writer has old snapc*/
> +#define EBLACKLISTED ESHUTDOWN /* blacklisted */

Are these used somewhere? Maybe these could clash with system errnos.

> +
> +/*
> + * an individual object operation.  each may be accompanied by some data
> + * payload
> + */
> +struct ceph_osd_op {
> +       __le16 op;           /* CEPH_OSD_OP_* */

Unaligned?

> +       __le32 flags;        /* CEPH_OSD_FLAG_* */
> +       union {
> +               struct {
> +                       __le64 offset, length;
> +                       __le64 truncate_size;
> +                       __le32 truncate_seq;
> +               } __attribute__ ((packed)) extent;
> +               struct {
> +                       __le32 name_len;
> +                       __le32 value_len;
> +               } __attribute__ ((packed)) xattr;
> +               struct {
> +                       __u8 class_len;
> +                       __u8 method_len;
> +                       __u8 argc;

Unaligned?

> +                       __le32 indata_len;
> +               } __attribute__ ((packed)) cls;
> +               struct {
> +                       __le64 cookie, count;
> +               } __attribute__ ((packed)) pgls;
> +       };
> +       __le32 payload_len;
> +} __attribute__ ((packed));
> +
> +/*
> + * osd request message header.  each request may include multiple
> + * ceph_osd_op object operations.
> + */
> +struct ceph_osd_request_head {
> +       __le32 client_inc;                 /* client incarnation */
> +       struct ceph_object_layout layout;  /* pgid */

Unaligned on 64 bit hosts?

> +       __le32 osdmap_epoch;               /* client's osdmap epoch */
> +
> +       __le32 flags;
> +
> +       struct ceph_timespec mtime;        /* for mutations only */
> +       struct ceph_eversion reassert_version; /* if we are replaying op */
> +
> +       __le32 object_len;     /* length of object name */
> +
> +       __le64 snapid;         /* snapid to read */

Unaligned?

> +       __le64 snap_seq;       /* writer's snap context */
> +       __le32 num_snaps;
> +
> +       __le16 num_ops;
> +       struct ceph_osd_op ops[];  /* followed by ops[], obj, ticket, snaps */

Unaligned?

> +} __attribute__ ((packed));
> +
> +struct ceph_osd_reply_head {
> +       __le32 client_inc;                /* client incarnation */
> +       __le32 flags;
> +       struct ceph_object_layout layout;
> +       __le32 osdmap_epoch;

Unaligned on 64 bit hosts?

> +       struct ceph_eversion reassert_version; /* for replaying uncommitted */
> +
> +       __le32 result;                    /* result code */
> +
> +       __le32 object_len;                /* length of object name */
> +       __le32 num_ops;
> +       struct ceph_osd_op ops[0];  /* ops[], object */

Unaligned on 64 bit hosts?

> +} __attribute__ ((packed));
> +
> +
> +#endif
> diff --git a/block/rbd.c b/block/rbd.c
> new file mode 100644
> index 0000000..eedae50
> --- /dev/null
> +++ b/block/rbd.c
> @@ -0,0 +1,585 @@
> +/*
> + * QEMU Block driver for RADOS (Ceph)
> + *
> + * Copyright (C) 2010 Christian Brunner <chb@muc.de>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "qemu-common.h"
> +#include <sys/types.h>
> +#include <stdbool.h>
> +
> +#include <qemu-common.h>
> +
> +#include "rbd_types.h"
> +#include "rados.h"
> +#include "module.h"
> +#include "block_int.h"
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <rados/librados.h>
> +
> +#include <signal.h>
> +
> +/*
> + * When specifying the image filename use:
> + *
> + * rbd:poolname/devicename
> + *
> + * poolname must be the name of an existing rados pool
> + *
> + * devicename is the basename for all objects used to
> + * emulate the raw device.
> + *
> + * Metadata information (image size, ...) is stored in an
> + * object with the name "devicename.rbd".
> + *
> + * The raw device is split into 4MB sized objects by default.
> + * The sequence number is encoded as a 12-digit hex string,
> + * and is appended to the devicename, separated by a dot.
> + * e.g. "devicename.1234567890ab"
> + *
> + */
> +
> +#define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER)
> +
> +typedef struct RBDAIOCB {
> +    BlockDriverAIOCB common;
> +    QEMUBH *bh;
> +    int ret;
> +    QEMUIOVector *qiov;
> +    char *bounce;
> +    int write;
> +    int64_t sector_num;
> +    int aiocnt;
> +    int error;
> +} RBDAIOCB;
> +
> +typedef struct RADOSCB {
> +    int rcbid;
> +    RBDAIOCB *acb;
> +    int done;
> +    int64_t segsize;
> +    char *buf;
> +} RADOSCB;
> +
> +typedef struct RBDRVRBDState {
> +    rados_pool_t pool;
> +    char name[RBD_MAX_OBJ_NAME_SIZE];
> +    int name_len;
> +    uint64_t size;
> +    uint64_t objsize;
> +} RBDRVRBDState;
> +
> +typedef struct rbd_obj_header_ondisk RbdHeader1;
> +
> +static int rbd_parsename(const char *filename, char *pool, char *name)
> +{
> +    const char *rbdname;
> +    char *p, *n;
> +    int l;
> +
> +    if (!strstart(filename, "rbd:", &rbdname)) {
> +        return -EINVAL;
> +    }
> +
> +    pstrcpy(pool, 2 * RBD_MAX_SEG_NAME_SIZE, rbdname);
> +    p = strchr(pool, '/');
> +    if (p == NULL) {
> +        return -EINVAL;
> +    }
> +
> +    *p = '\0';
> +    n = ++p;
> +
> +    l = strlen(n);
> +
> +    if (l > RBD_MAX_OBJ_NAME_SIZE) {
> +        fprintf(stderr, "object name to long\n");
> +        return -EINVAL;
> +    } else if (l <= 0) {
> +        fprintf(stderr, "object name to short\n");
> +        return -EINVAL;
> +    }
> +
> +    strcpy(name, n);

pstrcpy(name, l, n);

> +
> +    return l;
> +}
> +
> +static int create_tmap_op(uint8_t op, const char *name, char **tmap_desc)
> +{
> +    uint32_t len = strlen(name);
> +    uint32_t total_len = 1 + (sizeof(uint32_t) + len) + sizeof(uint32_t);       /* encoding op + name + empty buffer */
> +    char *desc;
> +
> +    desc = qemu_malloc(total_len);
> +    if (!desc) {
> +        return -ENOMEM;
> +    }

qemu_malloc won't return NULL, the check is not useful.
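i.e. the check can simply be dropped:

    desc = qemu_malloc(total_len);
    *tmap_desc = desc;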

> +
> +    *tmap_desc = desc;
> +
> +    *desc = op;
> +    desc++;
> +    memcpy(desc, &len, sizeof(len));
> +    desc += sizeof(len);
> +    memcpy(desc, name, len);
> +    desc += len;
> +    len = 0;
> +    memcpy(desc, &len, sizeof(len));
> +    desc += sizeof(len);
> +
> +    return desc - *tmap_desc;
> +}
> +
> +static void free_tmap_op(char *tmap_desc)
> +{
> +    qemu_free(tmap_desc);
> +}
> +
> +static int rbd_register_image(rados_pool_t pool, const char *name)
> +{
> +    char *tmap_desc;
> +    const char *dir = RBD_DIRECTORY;
> +    int ret;
> +
> +    ret = create_tmap_op(CEPH_OSD_TMAP_SET, name, &tmap_desc);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = rados_tmap_update(pool, dir, tmap_desc, ret);
> +    free_tmap_op(tmap_desc);
> +
> +    return ret;
> +}
> +
> +static int rbd_create(const char *filename, QEMUOptionParameter *options)
> +{
> +    int64_t bytes = 0;
> +    int64_t objsize;
> +    uint64_t size;
> +    time_t mtime;
> +    uint8_t obj_order = RBD_DEFAULT_OBJ_ORDER;
> +    char pool[RBD_MAX_SEG_NAME_SIZE];
> +    char n[RBD_MAX_SEG_NAME_SIZE];
> +    char name[RBD_MAX_SEG_NAME_SIZE];
> +    RbdHeader1 header;
> +    rados_pool_t p;
> +    int name_len;
> +    int ret;
> +
> +    if ((name_len = rbd_parsename(filename, pool, name)) < 0) {
> +        return -EINVAL;
> +    }
> +
> +    snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s%s", name, RBD_SUFFIX);
> +
> +    /* Read out options */
> +    while (options && options->name) {
> +        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
> +            bytes = options->value.n;
> +        } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
> +            if (options->value.n) {
> +                objsize = options->value.n;
> +                if (!objsize || ((objsize - 1) & objsize)) {    /* not a power of 2? */
> +                    fprintf(stderr, "obj size needs to be power of 2\n");
> +                    return -EINVAL;
> +                }
> +                if (objsize < 4096) {
> +                    fprintf(stderr, "obj size too small\n");
> +                    return -EINVAL;
> +                }
> +
> +                for (obj_order = 0; obj_order < 64; obj_order++) {
> +                    if (objsize == 1)
> +                        break;
> +                    objsize >>= 1;
> +                }
> +            }
> +        }
> +        options++;
> +    }
> +
> +    memset(&header, 0, sizeof(header));
> +    pstrcpy(header.text, sizeof(header.text), rbd_text);
> +    pstrcpy(header.signature, sizeof(header.signature), rbd_signature);
> +    pstrcpy(header.version, sizeof(header.version), rbd_version);
> +    header.image_size = bytes;
> +    cpu_to_le64s((uint64_t *) & header.image_size);
> +    header.obj_order = obj_order;
> +    header.crypt_type = RBD_CRYPT_NONE;
> +    header.comp_type = RBD_COMP_NONE;
> +    header.snap_seq = 0;
> +    header.snap_count = 0;
> +    cpu_to_le32s(&header.snap_count);
> +
> +    if (rados_initialize(0, NULL) < 0) {
> +        fprintf(stderr, "error initializing\n");
> +        return -EIO;
> +    }
> +
> +    if (rados_open_pool(pool, &p)) {
> +        fprintf(stderr, "error opening pool %s\n", pool);
> +        return -EIO;
> +    }
> +
> +    /* check for existing rbd header file */
> +    ret = rados_stat(p, n, &size, &mtime);
> +    if (ret == 0) {
> +        ret=-EEXIST;
> +        goto done;
> +    }
> +
> +    /* create header file */
> +    ret = rados_write(p, n, 0, (const char *)&header, sizeof(header));
> +    if (ret < 0) {
> +        goto done;
> +    }
> +
> +    ret = rbd_register_image(p, name);
> +done:
> +    rados_close_pool(p);
> +    rados_deinitialize();
> +
> +    return ret;
> +}
> +
> +static int rbd_open(BlockDriverState *bs, const char *filename, int flags)
> +{
> +    RBDRVRBDState *s = bs->opaque;
> +    char pool[RBD_MAX_SEG_NAME_SIZE];
> +    char n[RBD_MAX_SEG_NAME_SIZE];
> +    char hbuf[4096];
> +
> +    if ((s->name_len = rbd_parsename(filename, pool, s->name)) < 0) {
> +        return -EINVAL;
> +    }
> +    snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s%s", s->name, RBD_SUFFIX);
> +
> +    if (rados_initialize(0, NULL) < 0) {
> +        fprintf(stderr, "error initializing\n");
> +        return -EIO;
> +    }
> +
> +    if (rados_open_pool(pool, &s->pool)) {
> +        fprintf(stderr, "error opening pool %s\n", pool);
> +        return -EIO;
> +    }
> +
> +    if (rados_read(s->pool, n, 0, hbuf, 4096) < 0) {
> +        fprintf(stderr, "error reading header from %s\n", s->name);
> +        return -EIO;
> +    }
> +    if (!strncmp(hbuf + 64, rbd_signature, 4)) {
> +        if (!strncmp(hbuf + 68, rbd_version, 8)) {
> +            RbdHeader1 *header;
> +
> +            header = (RbdHeader1 *) hbuf;
> +            le64_to_cpus((uint64_t *) & header->image_size);
> +            s->size = header->image_size;
> +            s->objsize = 1 << header->obj_order;
> +        } else {
> +            fprintf(stderr, "Unknown image version %s\n", hbuf + 68);
> +            return -EIO;
> +        }
> +    } else {
> +        fprintf(stderr, "Invalid header signature %s\n", hbuf + 64);
> +        return -EIO;
> +    }
> +
> +    return 0;
> +}
> +
> +static void rbd_close(BlockDriverState *bs)
> +{
> +    RBDRVRBDState *s = bs->opaque;
> +
> +    rados_close_pool(s->pool);
> +    rados_deinitialize();
> +}
> +
> +static int rbd_rw(BlockDriverState *bs, int64_t sector_num,
> +                  uint8_t *buf, int nb_sectors, int write)
> +{
> +    RBDRVRBDState *s = bs->opaque;
> +    char n[RBD_MAX_SEG_NAME_SIZE];
> +
> +    int64_t segnr, segoffs, segsize, r;
> +    int64_t off, size;
> +
> +    off = sector_num * 512;
> +    size = nb_sectors * 512;
> +    segnr = (int64_t) (off / s->objsize);
> +    segoffs = (int64_t) (off % s->objsize);
> +    segsize = (int64_t) (s->objsize - segoffs);
> +
> +    while (size > 0) {
> +        if (size < segsize) {
> +            segsize = size;
> +        }
> +
> +        snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s.%012llx", s->name,
> +                 (long long unsigned int)segnr);

Please use PRIx64 instead of llx, the cast won't be needed then. Also elsewhere.
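
For example, something like this (just a sketch of the suggested change,
assuming <inttypes.h> is available here):

    snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s.%012" PRIx64, s->name, segnr);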

> +
> +        if (write) {
> +            if ((r = rados_write(s->pool, n, segoffs, (const char *)buf,
> +                segsize)) < 0) {
> +                return r;
> +            }
> +        } else {
> +            r = rados_read(s->pool, n, segoffs, (char *)buf, segsize);
> +            if (r == -ENOENT) {
> +                memset(buf, 0, segsize);
> +            } else if (r < 0) {
> +                return(r);
> +            } else if (r < segsize) {
> +                memset(buf + r, 0, segsize - r);
> +            }
> +            r = segsize;
> +        }
> +
> +        buf += segsize;
> +        size -= segsize;
> +        segoffs = 0;
> +        segsize = s->objsize;
> +        segnr++;
> +    }
> +
> +    return (0);
> +}
> +
> +static int rbd_read(BlockDriverState *bs, int64_t sector_num,
> +                    uint8_t *buf, int nb_sectors)
> +{
> +    return rbd_rw(bs, sector_num, buf, nb_sectors, 0);
> +}
> +
> +static int rbd_write(BlockDriverState *bs, int64_t sector_num,
> +                     const uint8_t *buf, int nb_sectors)
> +{
> +    return rbd_rw(bs, sector_num, (uint8_t *) buf, nb_sectors, 1);
> +}
> +
> +static void rbd_aio_cancel(BlockDriverAIOCB *blockacb)
> +{
> +    RBDAIOCB *acb = (RBDAIOCB *) blockacb;
> +    qemu_bh_delete(acb->bh);
> +    acb->bh = NULL;
> +    qemu_aio_release(acb);
> +}
> +
> +static AIOPool rbd_aio_pool = {
> +    .aiocb_size = sizeof(RBDAIOCB),
> +    .cancel = rbd_aio_cancel,
> +};
> +
> +/* This is the callback function for rados_aio_read and _write */
> +static void rbd_finish_aiocb(rados_completion_t c, RADOSCB *rcb)
> +{
> +    RBDAIOCB *acb = rcb->acb;
> +    int64_t r;
> +    int i;
> +
> +    acb->aiocnt--;
> +    r = rados_aio_get_return_value(c);
> +    rados_aio_release(c);
> +    if (acb->write) {
> +        if (r < 0) {
> +            acb->ret = r;
> +            acb->error = 1;
> +        } else if (!acb->error) {
> +            acb->ret += rcb->segsize;
> +        }
> +    } else {
> +        if (r == -ENOENT) {
> +            memset(rcb->buf, 0, rcb->segsize);
> +            if (!acb->error) {
> +                acb->ret += rcb->segsize;
> +            }
> +        } else if (r < 0) {
> +            acb->ret = r;
> +            acb->error = 1;
> +        } else if (r < rcb->segsize) {
> +            memset(rcb->buf + r, 0, rcb->segsize - r);
> +            if (!acb->error) {
> +                acb->ret += rcb->segsize;
> +            }
> +        } else if (!acb->error) {
> +            acb->ret += r;
> +        }
> +    }
> +    qemu_free(rcb);
> +    i = 0;
> +    if (!acb->aiocnt && acb->bh) {
> +        qemu_bh_schedule(acb->bh);
> +    }
> +}
> +
> +/* Callback when all queued rados_aio requests are complete */
> +static void rbd_aio_bh_cb(void *opaque)
> +{
> +    RBDAIOCB *acb = opaque;
> +
> +    if (!acb->write) {
> +        qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
> +    }
> +    qemu_vfree(acb->bounce);
> +    acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
> +    qemu_bh_delete(acb->bh);
> +    acb->bh = NULL;
> +    qemu_aio_release(acb);
> +}
> +
> +static BlockDriverAIOCB *rbd_aio_rw_vector(BlockDriverState *bs,
> +                                           int64_t sector_num,
> +                                           QEMUIOVector *qiov,
> +                                           int nb_sectors,
> +                                           BlockDriverCompletionFunc *cb,
> +                                           void *opaque, int write)
> +{
> +    RBDAIOCB *acb;
> +    RADOSCB *rcb;
> +    rados_completion_t c;
> +    char n[RBD_MAX_SEG_NAME_SIZE];
> +    int64_t segnr, segoffs, segsize, last_segnr;
> +    int64_t off, size;
> +    char *buf;
> +
> +    RBDRVRBDState *s = bs->opaque;
> +
> +    acb = qemu_aio_get(&rbd_aio_pool, bs, cb, opaque);
> +    acb->write = write;
> +    acb->qiov = qiov;
> +    acb->bounce = qemu_blockalign(bs, qiov->size);
> +    acb->aiocnt = 0;
> +    acb->ret = 0;
> +    acb->error = 0;
> +
> +    if (!acb->bh) {
> +        acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb);
> +    }
> +
> +    if (write) {
> +        qemu_iovec_to_buffer(acb->qiov, acb->bounce);
> +    }
> +
> +    buf = acb->bounce;
> +
> +    off = sector_num * 512;
> +    size = nb_sectors * 512;
> +    segnr = (int64_t) (off / s->objsize);
> +    segoffs = (int64_t) (off % s->objsize);
> +    segsize = (int64_t) (s->objsize - segoffs);
> +
> +    last_segnr = ((off + size - 1) / s->objsize);
> +    acb->aiocnt = (last_segnr - segnr) + 1;
> +
> +    while (size > 0) {
> +        if (size < segsize) {
> +            segsize = size;
> +        }
> +
> +        snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s.%012llx", s->name,
> +                 (long long unsigned int)segnr);
> +
> +        rcb = qemu_malloc(sizeof(RADOSCB));
> +        rcb->done = 0;
> +        rcb->acb = acb;
> +        rcb->segsize = segsize;
> +        rcb->buf = buf;
> +
> +        if (write) {
> +            rados_aio_create_completion(rcb, NULL,
> +                                        (rados_callback_t) rbd_finish_aiocb, &c);
> +            rados_aio_write(s->pool, n, segoffs, buf, segsize, c);
> +        } else {
> +            rados_aio_create_completion(rcb, (rados_callback_t) rbd_finish_aiocb,
> +                                        NULL, &c);
> +            rados_aio_read(s->pool, n, segoffs, buf, segsize, c);
> +        }
> +
> +        buf += segsize;
> +        size -= segsize;
> +        segoffs = 0;
> +        segsize = s->objsize;
> +        segnr++;
> +    }
> +
> +    return &acb->common;
> +}
> +
> +static BlockDriverAIOCB *rbd_aio_readv(BlockDriverState *bs,
> +                                       int64_t sector_num, QEMUIOVector *qiov,
> +                                       int nb_sectors,
> +                                       BlockDriverCompletionFunc *cb,
> +                                       void *opaque)
> +{
> +    return rbd_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
> +}
> +
> +static BlockDriverAIOCB *rbd_aio_writev(BlockDriverState *bs,
> +                                        int64_t sector_num, QEMUIOVector *qiov,
> +                                        int nb_sectors,
> +                                        BlockDriverCompletionFunc *cb,
> +                                        void *opaque)
> +{
> +    return rbd_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
> +}
> +
> +static int rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
> +{
> +    RBDRVRBDState *s = bs->opaque;
> +    bdi->cluster_size = s->objsize;
> +    return 0;
> +}
> +
> +static int64_t rbd_getlength(BlockDriverState *bs)
> +{
> +    RBDRVRBDState *s = bs->opaque;
> +
> +    return s->size;
> +}
> +
> +static QEMUOptionParameter rbd_create_options[] = {
> +    {
> +     .name = BLOCK_OPT_SIZE,
> +     .type = OPT_SIZE,
> +     .help = "Virtual disk size"
> +    },
> +    {
> +     .name = BLOCK_OPT_CLUSTER_SIZE,
> +     .type = OPT_SIZE,
> +     .help = "RBD object size"
> +    },
> +    {NULL}
> +};
> +
> +static BlockDriver bdrv_rbd = {
> +    .format_name = "rbd",
> +    .instance_size = sizeof(RBDRVRBDState),
> +    .bdrv_open = rbd_open,
> +    .bdrv_read = rbd_read,
> +    .bdrv_write = rbd_write,
> +    .bdrv_close = rbd_close,
> +    .bdrv_create = rbd_create,
> +    .bdrv_get_info = rbd_getinfo,
> +    .create_options = rbd_create_options,
> +    .bdrv_getlength = rbd_getlength,
> +    .protocol_name = "rbd",
> +
> +    .bdrv_aio_readv = rbd_aio_readv,
> +    .bdrv_aio_writev = rbd_aio_writev,
> +};
> +
> +static void bdrv_rbd_init(void)
> +{
> +    bdrv_register(&bdrv_rbd);
> +}
> +
> +block_init(bdrv_rbd_init);
> diff --git a/block/rbd_types.h b/block/rbd_types.h
> new file mode 100644
> index 0000000..dfd5aa0
> --- /dev/null
> +++ b/block/rbd_types.h
> @@ -0,0 +1,48 @@
> +#ifndef _FS_CEPH_RBD
> +#define _FS_CEPH_RBD

QEMU_BLOCK_RBD?

> +
> +#include <linux/types.h>

Can you use standard includes, like <sys/types.h> or <inttypes.h>? Are
Ceph libraries used in other systems than Linux?

> +
> +/*
> + * rbd image 'foo' consists of objects
> + *   foo.rbd      - image metadata
> + *   foo.00000000
> + *   foo.00000001
> + *   ...          - data
> + */
> +
> +#define RBD_SUFFIX             ".rbd"
> +#define RBD_DIRECTORY           "rbd_directory"
> +
> +#define RBD_DEFAULT_OBJ_ORDER  22   /* 4MB */
> +
> +#define RBD_MAX_OBJ_NAME_SIZE  96
> +#define RBD_MAX_SEG_NAME_SIZE  128
> +
> +#define RBD_COMP_NONE          0
> +#define RBD_CRYPT_NONE         0
> +
> +static const char rbd_text[] = "<<< Rados Block Device Image >>>\n";
> +static const char rbd_signature[] = "RBD";
> +static const char rbd_version[] = "001.001";
> +
> +struct rbd_obj_snap_ondisk {
> +       __le64 id;
> +       __le64 image_size;
> +} __attribute__((packed));
> +
> +struct rbd_obj_header_ondisk {
> +       char text[64];
> +       char signature[4];
> +       char version[8];
> +       __le64 image_size;

Unaligned? Is the disk format fixed?

> +       __u8 obj_order;
> +       __u8 crypt_type;
> +       __u8 comp_type;

Unaligned?

> +       __le32 snap_seq;
> +       __le32 snap_count;
> +       __le64 snap_names_len;

Unaligned?

> +       struct rbd_obj_snap_ondisk snaps[0];
> +} __attribute__((packed));
> +
> +#endif
> diff --git a/configure b/configure
> index 36d028f..d07a7e5 100755
> --- a/configure
> +++ b/configure
> @@ -299,6 +299,7 @@ pkgversion=""
>  check_utests="no"
>  user_pie="no"
>  zero_malloc=""
> +rbd="no"
>
>  # OS specific
>  if check_define __linux__ ; then
> @@ -660,6 +661,8 @@ for opt do
>   ;;
>   --enable-vhost-net) vhost_net="yes"
>   ;;
> +  --enable-rbd) rbd="yes"
> +  ;;
>   *) echo "ERROR: unknown option $opt"; show_help="yes"
>   ;;
>   esac
> @@ -826,6 +829,7 @@ echo "  --enable-docs            enable documentation build"
>  echo "  --disable-docs           disable documentation build"
>  echo "  --disable-vhost-net      disable vhost-net acceleration support"
>  echo "  --enable-vhost-net       enable vhost-net acceleration support"
> +echo "  --enable-rbd            enable building the rados block device (rbd)"
>  echo ""
>  echo "NOTE: The object files are built at the place where configure is launched"
>  exit 1
> @@ -1569,6 +1573,25 @@ if test "$mingw32" != yes -a "$pthread" = no; then
>  fi
>
>  ##########################################
> +# rbd probe
> +if test "$rbd" != "no" ; then
> +  cat > $TMPC <<EOF
> +#include <stdio.h>
> +#include <rados/librados.h>
> +int main(void) { rados_initialize(0, NULL); return 0; }
> +EOF
> +  if compile_prog "" "-lrados -lcrypto" ; then
> +    rbd=yes
> +    LIBS="$LIBS -lrados -lcrypto"
> +  else
> +    if test "$rbd" = "yes" ; then
> +      feature_not_found "rados block device"
> +    fi
> +    rbd=no
> +  fi
> +fi
> +
> +##########################################
>  # linux-aio probe
>
>  if test "$linux_aio" != "no" ; then
> @@ -2031,6 +2054,7 @@ echo "preadv support    $preadv"
>  echo "fdatasync         $fdatasync"
>  echo "uuid support      $uuid"
>  echo "vhost-net support $vhost_net"
> +echo "rbd support       $rbd"
>
>  if test $sdl_too_old = "yes"; then
>  echo "-> Your SDL version is too old - please upgrade to have SDL support"
> @@ -2260,6 +2284,9 @@ echo "CONFIG_UNAME_RELEASE=\"$uname_release\"" >> $config_host_mak
>  if test "$zero_malloc" = "yes" ; then
>   echo "CONFIG_ZERO_MALLOC=y" >> $config_host_mak
>  fi
> +if test "$rbd" = "yes" ; then
> +  echo "CONFIG_RBD=y" >> $config_host_mak
> +fi
>
>  # USB host support
>  case "$usb" in
> --
> 1.7.0.4
>
>
>

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-20 20:31 ` Blue Swirl
@ 2010-05-20 21:18   ` Christian Brunner
  2010-05-20 21:29     ` Anthony Liguori
  2010-05-20 23:02   ` Yehuda Sadeh Weinraub
  1 sibling, 1 reply; 64+ messages in thread
From: Christian Brunner @ 2010-05-20 21:18 UTC (permalink / raw)
  To: Blue Swirl; +Cc: ceph-devel, qemu-devel, kvm

2010/5/20 Blue Swirl <blauwirbel@gmail.com>:
> On Wed, May 19, 2010 at 7:22 PM, Christian Brunner <chb@muc.de> wrote:
>> The attached patch is a block driver for the distributed file system
>> Ceph (http://ceph.newdream.net/). This driver uses librados (which
>> is part of the Ceph server) for direct access to the Ceph object
>> store and is running entirely in userspace. Therefore it is
>> called "rbd" - rados block device.
>>
>> To compile the driver a recent version of ceph (>= 0.20.1) is needed
>> and you have to "--enable-rbd" when running configure.
>>
>> Additional information is available on the Ceph-Wiki:
>>
>> http://ceph.newdream.net/wiki/Kvm-rbd
>
>
> I have no idea whether it makes sense to add Ceph (no objection
> either). I have some minor comments below.

Thanks for your comments. I'll send an updated patch in a few days.

Having a central storage system is quite essential in larger hosting
environments, as it enables you to move your guest systems from one node
to another easily (live migration or dynamic restart). Traditionally
this has been done using SAN, iSCSI or NFS. However, most of these
systems don't scale very well, and the costs for high availability
are quite high.

With new approaches like Sheepdog or Ceph, things are getting a lot
cheaper and you can scale your system without disrupting your service.
The concepts are quite similar to what Amazon is doing in their EC2
environment, but they certainly won't publish it as OpenSource anytime
soon.

Both projects have advantages and disadvantages. Ceph is a bit more
universal, as it implements a whole filesystem. Sheepdog is more
feature-complete with regard to managing images (e.g. snapshots). Both
projects require some additional work to become stable, but they are
well on their way.

I would really like to see both drivers in the qemu tree, as they are
the key to a design shift in how storage in the datacenter is being
built.

Christian

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-20 21:18   ` Christian Brunner
@ 2010-05-20 21:29     ` Anthony Liguori
  2010-05-20 22:16       ` Christian Brunner
  2010-05-23 12:01       ` Avi Kivity
  0 siblings, 2 replies; 64+ messages in thread
From: Anthony Liguori @ 2010-05-20 21:29 UTC (permalink / raw)
  To: Christian Brunner; +Cc: Blue Swirl, ceph-devel, qemu-devel, kvm

On 05/20/2010 04:18 PM, Christian Brunner wrote:
> Thanks for your comments. I'll send an updated patch in a few days.
>
> Having a central storage system is quite essential in larger hosting
> environments, as it enables you to move your guest systems from one node
> to another easily (live migration or dynamic restart). Traditionally
> this has been done using SAN, iSCSI or NFS. However, most of these
> systems don't scale very well, and the costs for high availability
> are quite high.
>
> With new approaches like Sheepdog or Ceph, things are getting a lot
> cheaper and you can scale your system without disrupting your service.
> The concepts are quite similar to what Amazon is doing in their EC2
> environment, but they certainly won't publish it as OpenSource anytime
> soon.
>
> Both projects have advantages and disadvantages. Ceph is a bit more
> universal, as it implements a whole filesystem. Sheepdog is more
> feature-complete with regard to managing images (e.g. snapshots). Both
> projects require some additional work to become stable, but they are
> well on their way.
>
> I would really like to see both drivers in the qemu tree, as they are
> the key to a design shift in how storage in the datacenter is being
> built.
>    

I'd be more interested in enabling people to build these types of 
storage systems without touching qemu.

Both sheepdog and ceph ultimately transmit I/O over a socket to a 
central daemon, right?  So could we not standardize a protocol for this 
that both sheepdog and ceph could implement?

Regards,

Anthony Liguori

> Christian

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-20 21:29     ` Anthony Liguori
@ 2010-05-20 22:16       ` Christian Brunner
  2010-05-21  5:28         ` Stefan Hajnoczi
  2010-05-21  5:54         ` MORITA Kazutaka
  2010-05-23 12:01       ` Avi Kivity
  1 sibling, 2 replies; 64+ messages in thread
From: Christian Brunner @ 2010-05-20 22:16 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Blue Swirl, ceph-devel, qemu-devel, kvm

2010/5/20 Anthony Liguori <anthony@codemonkey.ws>:
>> With new approaches like Sheepdog or Ceph, things are getting a lot
>> cheaper and you can scale your system without disrupting your service.
>> The concepts are quite similar to what Amazon is doing in their EC2
>> environment, but they certainly won't publish it as OpenSource anytime
>> soon.
>>
>> Both projects have advantages and disadvantages. Ceph is a bit more
>> universal, as it implements a whole filesystem. Sheepdog is more
>> feature-complete with regard to managing images (e.g. snapshots). Both
>> projects require some additional work to become stable, but they are
>> well on their way.
>>
>> I would really like to see both drivers in the qemu tree, as they are
>> the key to a design shift in how storage in the datacenter is being
>> built.
>>
>
> I'd be more interested in enabling people to build these types of storage
> systems without touching qemu.

You could do this by using Yehuda's rbd kernel driver, but I think
that it would be better to avoid this additional layer.

> Both sheepdog and ceph ultimately transmit I/O over a socket to a central
> daemon, right?  So could we not standardize a protocol for this that both
> sheepdog and ceph could implement?

There is no central daemon. The concept is that they talk to many
storage nodes at the same time. Data is distributed and replicated
over many nodes in the network. The mechanism to do this is quite
complex. I don't know about sheepdog, but in Ceph this is called RADOS
(reliable autonomic distributed object store). Sheepdog and Ceph may
look similar, but this is where they act differently. I don't think that
it would be possible to implement a common protocol.

Regards,
Christian

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-20 20:31 ` Blue Swirl
  2010-05-20 21:18   ` Christian Brunner
@ 2010-05-20 23:02   ` Yehuda Sadeh Weinraub
  2010-05-23  7:59     ` Blue Swirl
  1 sibling, 1 reply; 64+ messages in thread
From: Yehuda Sadeh Weinraub @ 2010-05-20 23:02 UTC (permalink / raw)
  To: Blue Swirl; +Cc: ceph-devel, Christian Brunner, kvm, qemu-devel

On Thu, May 20, 2010 at 1:31 PM, Blue Swirl <blauwirbel@gmail.com> wrote:
> On Wed, May 19, 2010 at 7:22 PM, Christian Brunner <chb@muc.de> wrote:
>> The attached patch is a block driver for the distributed file system
>> Ceph (http://ceph.newdream.net/). This driver uses librados (which
>> is part of the Ceph server) for direct access to the Ceph object
>> store and is running entirely in userspace. Therefore it is
>> called "rbd" - rados block device.
...
>
> IIRC underscores here may conflict with system header use. Please use
> something like QEMU_BLOCK_RADOS_H.

This header is shared between the linux kernel client and the ceph
userspace servers and client. We can actually get rid of it, as we
only need it to define CEPH_OSD_TMAP_SET. We can move this definition
to librados.h.

>> diff --git a/block/rbd_types.h b/block/rbd_types.h
>> new file mode 100644
>> index 0000000..dfd5aa0
>> --- /dev/null
>> +++ b/block/rbd_types.h
>> @@ -0,0 +1,48 @@
>> +#ifndef _FS_CEPH_RBD
>> +#define _FS_CEPH_RBD
>
> QEMU_BLOCK_RBD?

This header is shared between the ceph kernel client, between the qemu
rbd module (and between other ceph utilities). It'd be much easier
maintaining it without having to have a different implementation for
each. The same goes to the use of __le32/64 and __u32/64 within these
headers.

>
>> +
>> +#include <linux/types.h>
>
> Can you use standard includes, like <sys/types.h> or <inttypes.h>? Are
> Ceph libraries used in other systems than Linux?

Not at the moment. I guess that we can take this include out.

>
>> +
>> +/*
>> + * rbd image 'foo' consists of objects
>> + *   foo.rbd      - image metadata
>> + *   foo.00000000
>> + *   foo.00000001
>> + *   ...          - data
>> + */
>> +
>> +#define RBD_SUFFIX             ".rbd"
>> +#define RBD_DIRECTORY           "rbd_directory"
>> +
>> +#define RBD_DEFAULT_OBJ_ORDER  22   /* 4MB */
>> +
>> +#define RBD_MAX_OBJ_NAME_SIZE  96
>> +#define RBD_MAX_SEG_NAME_SIZE  128
>> +
>> +#define RBD_COMP_NONE          0
>> +#define RBD_CRYPT_NONE         0
>> +
>> +static const char rbd_text[] = "<<< Rados Block Device Image >>>\n";
>> +static const char rbd_signature[] = "RBD";
>> +static const char rbd_version[] = "001.001";
>> +
>> +struct rbd_obj_snap_ondisk {
>> +       __le64 id;
>> +       __le64 image_size;
>> +} __attribute__((packed));
>> +
>> +struct rbd_obj_header_ondisk {
>> +       char text[64];
>> +       char signature[4];
>> +       char version[8];
>> +       __le64 image_size;
>
> Unaligned? Is the disk format fixed?

This is a packed structure that represents the on disk format.
Operations on it are being done only to read from the disk header or
to write to the disk header.


Yehuda

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-20 22:16       ` Christian Brunner
@ 2010-05-21  5:28         ` Stefan Hajnoczi
  2010-05-21  6:13           ` MORITA Kazutaka
  2010-05-21  5:54         ` MORITA Kazutaka
  1 sibling, 1 reply; 64+ messages in thread
From: Stefan Hajnoczi @ 2010-05-21  5:28 UTC (permalink / raw)
  To: Christian Brunner; +Cc: Blue Swirl, ceph-devel, qemu-devel, kvm

On Thu, May 20, 2010 at 11:16 PM, Christian Brunner <chb@muc.de> wrote:
> 2010/5/20 Anthony Liguori <anthony@codemonkey.ws>:
>> Both sheepdog and ceph ultimately transmit I/O over a socket to a central
>> daemon, right?  So could we not standardize a protocol for this that both
>> sheepdog and ceph could implement?
>
> There is no central daemon. The concept is that they talk to many
> storage nodes at the same time. Data is distributed and replicated
> over many nodes in the network. The mechanism to do this is quite
> complex. I don't know about sheepdog, but in Ceph this is called RADOS
> (reliable autonomic distributed object store). Sheepdog and Ceph may
> look similar, but this is where they act differently. I don't think that
> it would be possible to implement a common protocol.

I believe Sheepdog has a local daemon on each node.  The QEMU storage
backend talks to the daemon on the same node, which then does the real
network communication with the rest of the distributed storage system.
So I think we're not talking about a network protocol here; we're
talking about a common interface that can be used by QEMU and other
programs to take advantage of Ceph, Sheepdog, etc. services available
on the local node.

Haven't looked into your patch enough yet, but does librados talk
directly over the network or does it connect to a local daemon/driver?

Stefan

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-20 22:16       ` Christian Brunner
  2010-05-21  5:28         ` Stefan Hajnoczi
@ 2010-05-21  5:54         ` MORITA Kazutaka
  1 sibling, 0 replies; 64+ messages in thread
From: MORITA Kazutaka @ 2010-05-21  5:54 UTC (permalink / raw)
  To: Christian Brunner; +Cc: Blue Swirl, ceph-devel, kvm, qemu-devel

At Fri, 21 May 2010 00:16:46 +0200,
Christian Brunner wrote:
> 
> 2010/5/20 Anthony Liguori <anthony@codemonkey.ws>:
> >> With new approaches like Sheepdog or Ceph, things are getting a lot
> >> cheaper and you can scale your system without disrupting your service.
> >> The concepts are quite similar to what Amazon is doing in their EC2
> >> environment, but they certainly won't publish it as OpenSource anytime
> >> soon.
> >>
> >> Both projects have advantages and disadvantages. Ceph is a bit more
> >> universal, as it implements a whole filesystem. Sheepdog is more
> >> feature-complete with regard to managing images (e.g. snapshots). Both

I think a major difference is that Sheepdog servers act fully
autonomously.  No Sheepdog server has a fixed role such as a monitor
server, and Sheepdog doesn't require any configuration describing the
list of nodes in the cluster.


> >> projects require some additional work to become stable, but they are
> >> well on their way.
> >>
> >> I would really like to see both drivers in the qemu tree, as they are
> >> the key to a design shift in how storage in the datacenter is being
> >> built.
> >>
> >
> > I'd be more interested in enabling people to build these types of storage
> > systems without touching qemu.
> 
> You could do this by using Yehuda's rbd kernel driver, but I think
> that it would be better to avoid this additional layer.
> 

I agree.  In addition, if a storage client is a qemu driver, the
storage system can support some features specific to qemu, such as live
snapshots from the qemu monitor.

Regards,

Kazutaka

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-21  5:28         ` Stefan Hajnoczi
@ 2010-05-21  6:13           ` MORITA Kazutaka
  0 siblings, 0 replies; 64+ messages in thread
From: MORITA Kazutaka @ 2010-05-21  6:13 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner

At Fri, 21 May 2010 06:28:42 +0100,
Stefan Hajnoczi wrote:
> 
> On Thu, May 20, 2010 at 11:16 PM, Christian Brunner <chb@muc.de> wrote:
> > 2010/5/20 Anthony Liguori <anthony@codemonkey.ws>:
> >> Both sheepdog and ceph ultimately transmit I/O over a socket to a central
> >> daemon, right?  So could we not standardize a protocol for this that both
> >> sheepdog and ceph could implement?
> >
> > There is no central daemon. The concept is that they talk to many
> > storage nodes at the same time. Data is distributed and replicated
> > over many nodes in the network. The mechanism to do this is quite
> > complex. I don't know about sheepdog, but in Ceph this is called RADOS
> > (reliable autonomic distributed object store). Sheepdog and Ceph may
> > look similar, but this is where they act differently. I don't think that
> > it would be possible to implement a common protocol.
> 
> I believe Sheepdog has a local daemon on each node.  The QEMU storage
> backend talks to the daemon on the same node, which then does the real
> network communication with the rest of the distributed storage system.

Yes.  That's because Sheepdog doesn't have any configuration describing
cluster membership, as I mentioned in another mail, so the driver
doesn't know which node to access other than localhost.

> So I think we're not talking about a network protocol here; we're
> talking about a common interface that can be used by QEMU and other
> programs to take advantage of Ceph, Sheepdog, etc. services available
> on the local node.
> 
> Haven't looked into your patch enough yet, but does librados talk
> directly over the network or does it connect to a local daemon/driver?
> 

AFAIK, librados talks directly over the network, so I think it is
difficult to define a common interface.


Thanks,

Kazutaka

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-20 23:02   ` Yehuda Sadeh Weinraub
@ 2010-05-23  7:59     ` Blue Swirl
  2010-05-24  2:17       ` Yehuda Sadeh Weinraub
  0 siblings, 1 reply; 64+ messages in thread
From: Blue Swirl @ 2010-05-23  7:59 UTC (permalink / raw)
  To: Yehuda Sadeh Weinraub; +Cc: ceph-devel, Christian Brunner, kvm, qemu-devel

On Thu, May 20, 2010 at 11:02 PM, Yehuda Sadeh Weinraub
<yehudasa@gmail.com> wrote:
> On Thu, May 20, 2010 at 1:31 PM, Blue Swirl <blauwirbel@gmail.com> wrote:
>> On Wed, May 19, 2010 at 7:22 PM, Christian Brunner <chb@muc.de> wrote:
>>> The attached patch is a block driver for the distributed file system
>>> Ceph (http://ceph.newdream.net/). This driver uses librados (which
>>> is part of the Ceph server) for direct access to the Ceph object
>>> store and is running entirely in userspace. Therefore it is
>>> called "rbd" - rados block device.
> ...
>>
>> IIRC underscores here may conflict with system header use. Please use
>> something like QEMU_BLOCK_RADOS_H.
>
> This header is shared between the linux kernel client and the ceph
> userspace servers and client. We can actually get rid of it, as we
> only need it to define CEPH_OSD_TMAP_SET. We can move this definition
> to librados.h.
>
>>> diff --git a/block/rbd_types.h b/block/rbd_types.h
>>> new file mode 100644
>>> index 0000000..dfd5aa0
>>> --- /dev/null
>>> +++ b/block/rbd_types.h
>>> @@ -0,0 +1,48 @@
>>> +#ifndef _FS_CEPH_RBD
>>> +#define _FS_CEPH_RBD
>>
>> QEMU_BLOCK_RBD?
>
> This header is shared between the ceph kernel client, between the qemu
> rbd module (and between other ceph utilities). It'd be much easier
> maintaining it without having to have a different implementation for
> each. The same goes to the use of __le32/64 and __u32/64 within these
> headers.

This is user space, so identifiers must conform to C standards. The
identifiers beginning with underscores are reserved.

Doesn't __le32/64 also depend on some GCC extension? Or sparse magic?

>
>>
>>> +
>>> +#include <linux/types.h>
>>
>> Can you use standard includes, like <sys/types.h> or <inttypes.h>? Are
>> Ceph libraries used in other systems than Linux?
>
> Not at the moment. I guess that we can take this include out.
>
>>
>>> +
>>> +/*
>>> + * rbd image 'foo' consists of objects
>>> + *   foo.rbd      - image metadata
>>> + *   foo.00000000
>>> + *   foo.00000001
>>> + *   ...          - data
>>> + */
>>> +
>>> +#define RBD_SUFFIX             ".rbd"
>>> +#define RBD_DIRECTORY           "rbd_directory"
>>> +
>>> +#define RBD_DEFAULT_OBJ_ORDER  22   /* 4MB */
>>> +
>>> +#define RBD_MAX_OBJ_NAME_SIZE  96
>>> +#define RBD_MAX_SEG_NAME_SIZE  128
>>> +
>>> +#define RBD_COMP_NONE          0
>>> +#define RBD_CRYPT_NONE         0
>>> +
>>> +static const char rbd_text[] = "<<< Rados Block Device Image >>>\n";
>>> +static const char rbd_signature[] = "RBD";
>>> +static const char rbd_version[] = "001.001";
>>> +
>>> +struct rbd_obj_snap_ondisk {
>>> +       __le64 id;
>>> +       __le64 image_size;
>>> +} __attribute__((packed));
>>> +
>>> +struct rbd_obj_header_ondisk {
>>> +       char text[64];
>>> +       char signature[4];
>>> +       char version[8];
>>> +       __le64 image_size;
>>
>> Unaligned? Is the disk format fixed?
>
> This is a packed structure that represents the on disk format.
> Operations on it are being done only to read from the disk header or
> to write to the disk header.

That's clear. But what exactly is the alignment of field 'image_size'?
Could there be implicit padding to mod 8 between 'version' and
'image_size' with some compilers?

If there were no other constraints, I'd either make the padding
explicit, or rearrange/resize fields so that the field alignment is
natural. Thus my question, can you change the disk format or are there
already some deployments?
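
For illustration, an explicitly padded variant could look something like
this (a sketch only; the reserved fields are invented here, not something
the posted format defines):

    struct rbd_obj_header_ondisk {
        char text[64];
        char signature[4];
        char version[8];
        __u8 reserved1[4];      /* pad so image_size is 8-byte aligned */
        __le64 image_size;
        __u8 obj_order;
        __u8 crypt_type;
        __u8 comp_type;
        __u8 reserved2;         /* pad so snap_seq is 4-byte aligned */
        __le32 snap_seq;
        __le32 snap_count;
        __u8 reserved3[4];      /* pad so snap_names_len is 8-byte aligned */
        __le64 snap_names_len;
        struct rbd_obj_snap_ondisk snaps[0];
    } __attribute__((packed));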

Otherwise, I'd just add some warning comment so people don't try to
use clever pointer tricks which will crash on machines with enforced
alignment.

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-20 21:29     ` Anthony Liguori
  2010-05-20 22:16       ` Christian Brunner
@ 2010-05-23 12:01       ` Avi Kivity
  2010-05-24  7:12         ` MORITA Kazutaka
                           ` (2 more replies)
  1 sibling, 3 replies; 64+ messages in thread
From: Avi Kivity @ 2010-05-23 12:01 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Blue Swirl, ceph-devel, Christian Brunner, kvm, qemu-devel

On 05/21/2010 12:29 AM, Anthony Liguori wrote:
>
> I'd be more interested in enabling people to build these types of 
> storage systems without touching qemu.
>
> Both sheepdog and ceph ultimately transmit I/O over a socket to a 
> central daemon, right? 

That incurs an extra copy.

> So could we not standardize a protocol for this that both sheepdog and 
> ceph could implement?

The protocol already exists, nbd.  It doesn't support snapshotting etc. 
but we could extend it.

But IMO what's needed is a plugin API for the block layer.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-23  7:59     ` Blue Swirl
@ 2010-05-24  2:17       ` Yehuda Sadeh Weinraub
  2010-05-25 20:13         ` Blue Swirl
  0 siblings, 1 reply; 64+ messages in thread
From: Yehuda Sadeh Weinraub @ 2010-05-24  2:17 UTC (permalink / raw)
  To: Blue Swirl; +Cc: ceph-devel, Christian Brunner, kvm, qemu-devel

On Sun, May 23, 2010 at 12:59 AM, Blue Swirl <blauwirbel@gmail.com> wrote:
> On Thu, May 20, 2010 at 11:02 PM, Yehuda Sadeh Weinraub
> <yehudasa@gmail.com> wrote:
>> On Thu, May 20, 2010 at 1:31 PM, Blue Swirl <blauwirbel@gmail.com> wrote:
>>> On Wed, May 19, 2010 at 7:22 PM, Christian Brunner <chb@muc.de> wrote:
>>>> The attached patch is a block driver for the distributed file system
>>>> Ceph (http://ceph.newdream.net/). This driver uses librados (which
>>>> is part of the Ceph server) for direct access to the Ceph object
>>>> store and is running entirely in userspace. Therefore it is
>>>> called "rbd" - rados block device.
>> ...
>>>
>>> IIRC underscores here may conflict with system header use. Please use
>>> something like QEMU_BLOCK_RADOS_H.
>>
>> This header is shared between the linux kernel client and the ceph
>> userspace servers and client. We can actually get rid of it, as we
>> only need it to define CEPH_OSD_TMAP_SET. We can move this definition
>> to librados.h.
>>
>>>> diff --git a/block/rbd_types.h b/block/rbd_types.h
>>>> new file mode 100644
>>>> index 0000000..dfd5aa0
>>>> --- /dev/null
>>>> +++ b/block/rbd_types.h
>>>> @@ -0,0 +1,48 @@
>>>> +#ifndef _FS_CEPH_RBD
>>>> +#define _FS_CEPH_RBD
>>>
>>> QEMU_BLOCK_RBD?
>>
>> This header is shared between the ceph kernel client, between the qemu
>> rbd module (and between other ceph utilities). It'd be much easier
>> maintaining it without having to have a different implementation for
>> each. The same goes to the use of __le32/64 and __u32/64 within these
>> headers.
>
> This is user space, so identifiers must conform to C standards. The
> identifiers beginning with underscores are reserved.
>
> Doesn't __le32/64 also depend on some GCC extension? Or sparse magic?
It depends on a gcc extension. If needed we can probably have a separate
header for the qemu block device that uses alternative types. Though
looking at the qemu code I see use of other gcc extensions so I'm not
sure this is a real issue.

>
>>
>>>
>>>> +
>>>> +#include <linux/types.h>
>>>
>>> Can you use standard includes, like <sys/types.h> or <inttypes.h>? Are
>>> Ceph libraries used in other systems than Linux?
>>
>> Not at the moment. I guess that we can take this include out.
>>
>>>
>>>> +
>>>> +/*
>>>> + * rbd image 'foo' consists of objects
>>>> + *   foo.rbd      - image metadata
>>>> + *   foo.00000000
>>>> + *   foo.00000001
>>>> + *   ...          - data
>>>> + */
>>>> +
>>>> +#define RBD_SUFFIX             ".rbd"
>>>> +#define RBD_DIRECTORY           "rbd_directory"
>>>> +
>>>> +#define RBD_DEFAULT_OBJ_ORDER  22   /* 4MB */
>>>> +
>>>> +#define RBD_MAX_OBJ_NAME_SIZE  96
>>>> +#define RBD_MAX_SEG_NAME_SIZE  128
>>>> +
>>>> +#define RBD_COMP_NONE          0
>>>> +#define RBD_CRYPT_NONE         0
>>>> +
>>>> +static const char rbd_text[] = "<<< Rados Block Device Image >>>\n";
>>>> +static const char rbd_signature[] = "RBD";
>>>> +static const char rbd_version[] = "001.001";
>>>> +
>>>> +struct rbd_obj_snap_ondisk {
>>>> +       __le64 id;
>>>> +       __le64 image_size;
>>>> +} __attribute__((packed));
>>>> +
>>>> +struct rbd_obj_header_ondisk {
>>>> +       char text[64];
>>>> +       char signature[4];
>>>> +       char version[8];
>>>> +       __le64 image_size;
>>>
>>> Unaligned? Is the disk format fixed?
>>
>> This is a packed structure that represents the on disk format.
>> Operations on it are being done only to read from the disk header or
>> to write to the disk header.
>
> That's clear. But what exactly is the alignment of field 'image_size'?
> Could there be implicit padding to mod 8 between 'version' and
> 'image_size' with some compilers?

Obviously it's not 64-bit aligned. As it's an on-disk header, I don't
see alignment as a real issue. As was said before, any operation on these
fields has to go through endianness conversion anyway, and this
structure should not be used directly. For such data structures I'd
rather have the fields in some logical order than maintain the
alignment ourselves. That's why we have that __attribute__((packed))
at the end, to let the compiler deal with those issues. Other
compilers, though, have their own syntax for packed structures (but I do
see other uses of this packed syntax in the qemu code).

>
> If there were no other constraints, I'd either make the padding
> explicit, or rearrange/resize fields so that the field alignment is
> natural. Thus my question, can you change the disk format or are there
> already some deployments?

We can certainly make changes to the disk format at this point. I'm
not very happy with those 3 __u8 in the middle, and they can probably
be changed to a 32 bit flags field. We can get it 64 bit aligned too.
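
Something along those lines, maybe (only a rough sketch, not a final
layout; every field would then sit on its natural alignment):

    struct rbd_obj_header_ondisk {
        char text[64];
        char signature[4];
        char version[8];
        __le32 flags;      /* could fold obj_order/crypt_type/comp_type in here */
        __le64 image_size;
        __le32 snap_seq;
        __le32 snap_count;
        __le64 snap_names_len;
        struct rbd_obj_snap_ondisk snaps[0];
    } __attribute__((packed));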

>
> Otherwise, I'd just add some warning comment so people don't try to
> use clever pointer tricks which will crash on machines with enforced
> alignment.
>
Any clever pointer tricks that'll work on one architecture will
probably be wrong on another (different word
size/alignment/endianness), so maybe crashing machines is a good
indicator of a bad implementation. We shouldn't try to hide the
problems.

Thanks,
Yehuda

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-23 12:01       ` Avi Kivity
@ 2010-05-24  7:12         ` MORITA Kazutaka
  2010-05-24 11:05           ` Avi Kivity
  2010-05-24  8:27         ` Stefan Hajnoczi
  2010-05-25 11:02         ` Kevin Wolf
  2 siblings, 1 reply; 64+ messages in thread
From: MORITA Kazutaka @ 2010-05-24  7:12 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner

At Sun, 23 May 2010 15:01:59 +0300,
Avi Kivity wrote:
> 
> On 05/21/2010 12:29 AM, Anthony Liguori wrote:
> >
> > I'd be more interested in enabling people to build these types of 
> > storage systems without touching qemu.
> >
> > Both sheepdog and ceph ultimately transmit I/O over a socket to a 
> > central daemon, right? 
> 
> That incurs an extra copy.
> 
> > So could we not standardize a protocol for this that both sheepdog and 
> > ceph could implement?
> 
> The protocol already exists, nbd.  It doesn't support snapshotting etc. 
> but we could extend it.
> 

I have no objection to using another protocol for Sheepdog support, but
I think the nbd protocol is unsuitable for a large storage pool with
many VM images, because nbd doesn't support specifying a file name to
open.  If we used nbd with such a storage system, the server would need
to listen on as many ports as there are VM images.  As far as I can see,
the protocol looks difficult to extend without breaking backward
compatibility.

Regards,

Kazutaka

> But IMO what's needed is a plugin API for the block layer.
> 

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-23 12:01       ` Avi Kivity
  2010-05-24  7:12         ` MORITA Kazutaka
@ 2010-05-24  8:27         ` Stefan Hajnoczi
  2010-05-24 11:03           ` Avi Kivity
  2010-05-25 11:02         ` Kevin Wolf
  2 siblings, 1 reply; 64+ messages in thread
From: Stefan Hajnoczi @ 2010-05-24  8:27 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner

On Sun, May 23, 2010 at 1:01 PM, Avi Kivity <avi@redhat.com> wrote:
> On 05/21/2010 12:29 AM, Anthony Liguori wrote:
>>
>> I'd be more interested in enabling people to build these types of storage
>> systems without touching qemu.
>>
>> Both sheepdog and ceph ultimately transmit I/O over a socket to a central
>> daemon, right?
>
> That incurs an extra copy.

Besides a shared memory approach, I wonder if the splice() family of
syscalls could be used to send/receive data through a storage daemon
without the daemon looking at or copying the data?
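
Roughly the kind of thing I have in mind (an untested sketch; error
handling and short writes are glossed over):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <unistd.h>

    /* Move len bytes from one fd to another through a pipe, so the
     * daemon never copies the payload into its own buffers. */
    static int forward(int from_fd, int to_fd, size_t len)
    {
        int p[2];

        if (pipe(p) < 0) {
            return -1;
        }
        while (len > 0) {
            ssize_t n = splice(from_fd, NULL, p[1], NULL, len,
                               SPLICE_F_MOVE | SPLICE_F_MORE);
            if (n <= 0) {
                break;
            }
            splice(p[0], NULL, to_fd, NULL, n,
                   SPLICE_F_MOVE | SPLICE_F_MORE);
            len -= n;
        }
        close(p[0]);
        close(p[1]);
        return len ? -1 : 0;
    }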

Stefan

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-24  8:27         ` Stefan Hajnoczi
@ 2010-05-24 11:03           ` Avi Kivity
  2010-05-24 19:19             ` Anthony Liguori
  0 siblings, 1 reply; 64+ messages in thread
From: Avi Kivity @ 2010-05-24 11:03 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner

On 05/24/2010 11:27 AM, Stefan Hajnoczi wrote:
> On Sun, May 23, 2010 at 1:01 PM, Avi Kivity<avi@redhat.com>  wrote:
>    
>> On 05/21/2010 12:29 AM, Anthony Liguori wrote:
>>      
>>> I'd be more interested in enabling people to build these types of storage
>>> systems without touching qemu.
>>>
>>> Both sheepdog and ceph ultimately transmit I/O over a socket to a central
>>> daemon, right?
>>>        
>> That incurs an extra copy.
>>      
> Besides a shared memory approach, I wonder if the splice() family of
> syscalls could be used to send/receive data through a storage daemon
> without the daemon looking at or copying the data?
>    

Excellent idea.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-24  7:12         ` MORITA Kazutaka
@ 2010-05-24 11:05           ` Avi Kivity
  2010-05-24 11:42             ` MORITA Kazutaka
  0 siblings, 1 reply; 64+ messages in thread
From: Avi Kivity @ 2010-05-24 11:05 UTC (permalink / raw)
  To: MORITA Kazutaka
  Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner

On 05/24/2010 10:12 AM, MORITA Kazutaka wrote:
> At Sun, 23 May 2010 15:01:59 +0300,
> Avi Kivity wrote:
>    
>> On 05/21/2010 12:29 AM, Anthony Liguori wrote:
>>      
>>> I'd be more interested in enabling people to build these types of
>>> storage systems without touching qemu.
>>>
>>> Both sheepdog and ceph ultimately transmit I/O over a socket to a
>>> central daemon, right?
>>>        
>> That incurs an extra copy.
>>
>>      
>>> So could we not standardize a protocol for this that both sheepdog and
>>> ceph could implement?
>>>        
>> The protocol already exists, nbd.  It doesn't support snapshotting etc.
>> but we could extend it.
>>
>>      
> I have no objection to using another protocol for Sheepdog support, but
> I think the nbd protocol is unsuitable for a large storage pool with
> many VM images, because nbd doesn't support specifying a file name to
> open.  If we used nbd with such a storage system, the server would need
> to listen on as many ports as there are VM images.  As far as I can see,
> the protocol looks difficult to extend without breaking backward
> compatibility.
>    

The server would be local and talk over a unix domain socket, perhaps 
anonymous.

nbd has other issues though, such as requiring a copy and no support for 
metadata operations such as snapshot and file size extension.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-24 11:05           ` Avi Kivity
@ 2010-05-24 11:42             ` MORITA Kazutaka
  2010-05-24 11:56               ` Avi Kivity
  0 siblings, 1 reply; 64+ messages in thread
From: MORITA Kazutaka @ 2010-05-24 11:42 UTC (permalink / raw)
  To: Avi Kivity
  Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner,
	MORITA Kazutaka

At Mon, 24 May 2010 14:05:58 +0300,
Avi Kivity wrote:
> 
> On 05/24/2010 10:12 AM, MORITA Kazutaka wrote:
> > At Sun, 23 May 2010 15:01:59 +0300,
> > Avi Kivity wrote:
> >    
> >> On 05/21/2010 12:29 AM, Anthony Liguori wrote:
> >>      
> >>> I'd be more interested in enabling people to build these types of
> >>> storage systems without touching qemu.
> >>>
> >>> Both sheepdog and ceph ultimately transmit I/O over a socket to a
> >>> central daemon, right?
> >>>        
> >> That incurs an extra copy.
> >>
> >>      
> >>> So could we not standardize a protocol for this that both sheepdog and
> >>> ceph could implement?
> >>>        
> >> The protocol already exists, nbd.  It doesn't support snapshotting etc.
> >> but we could extend it.
> >>
> >>      
> > I have no objection to using another protocol for Sheepdog support, but
> > I think the nbd protocol is unsuitable for a large storage pool with
> > many VM images, because nbd doesn't support specifying a file name to
> > open.  If we used nbd with such a storage system, the server would need
> > to listen on as many ports as there are VM images.  As far as I can see,
> > the protocol looks difficult to extend without breaking backward
> > compatibility.
> >    
> 
> The server would be local and talk over a unix domain socket, perhaps 
> anonymous.
> 
> nbd has other issues though, such as requiring a copy and no support for 
> metadata operations such as snapshot and file size extension.
> 

Sorry, my explanation was unclear.  I'm not sure how running servers
on localhost can solve the problem.

What I wanted to say was that we cannot specify the image of VM. With
nbd protocol, command line arguments are as follows:

 $ qemu nbd:hostname:port

As this syntax shows, with nbd protocol the client cannot pass the VM
image name to the server.

Regards,

Kazutaka

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-24 11:42             ` MORITA Kazutaka
@ 2010-05-24 11:56               ` Avi Kivity
  2010-05-24 12:07                 ` Cláudio Martins
                                   ` (2 more replies)
  0 siblings, 3 replies; 64+ messages in thread
From: Avi Kivity @ 2010-05-24 11:56 UTC (permalink / raw)
  To: MORITA Kazutaka
  Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner

On 05/24/2010 02:42 PM, MORITA Kazutaka wrote:
>
>> The server would be local and talk over a unix domain socket, perhaps
>> anonymous.
>>
>> nbd has other issues though, such as requiring a copy and no support for
>> metadata operations such as snapshot and file size extension.
>>
>>      
> Sorry, my explanation was unclear.  I'm not sure how running servers
> on localhost can solve the problem.
>    

The local server can convert from the local (nbd) protocol to the remote 
(sheepdog, ceph) protocol.

> What I wanted to say was that we cannot specify the image of VM. With
> nbd protocol, command line arguments are as follows:
>
>   $ qemu nbd:hostname:port
>
> As this syntax shows, with nbd protocol the client cannot pass the VM
> image name to the server.
>    

We would extend it to allow it to connect to a unix domain socket:

   qemu nbd:unix:/path/to/socket

The server at the other end would associate the socket with a filename 
and forward it to the server using the remote protocol.

However, I don't think nbd would be a good protocol.  My preference 
would be for a plugin API, or for a new local protocol that uses 
splice() to avoid copies.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-24 11:56               ` Avi Kivity
@ 2010-05-24 12:07                 ` Cláudio Martins
  2010-05-24 14:01                 ` MORITA Kazutaka
  2010-05-24 19:16                 ` Anthony Liguori
  2 siblings, 0 replies; 64+ messages in thread
From: Cláudio Martins @ 2010-05-24 12:07 UTC (permalink / raw)
  To: Avi Kivity
  Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner,
	MORITA Kazutaka


On Mon, 24 May 2010 14:56:29 +0300 Avi Kivity <avi@redhat.com> wrote:
> On 05/24/2010 02:42 PM, MORITA Kazutaka wrote:
> >
> >> The server would be local and talk over a unix domain socket, perhaps
> >> anonymous.
> >>
> >> nbd has other issues though, such as requiring a copy and no support for
> >> metadata operations such as snapshot and file size extension.
> >>
> >>      
> > Sorry, my explanation was unclear.  I'm not sure how running servers
> > on localhost can solve the problem.
> >    
> 
> The local server can convert from the local (nbd) protocol to the remote 
> (sheepdog, ceph) protocol.
> 

 Please note that this shouldn't be relevant to the block driver based
on ceph, as it does not use a local daemon -- it connects to the Object
Storage Devices directly over the network.

 Best regards

Cláudio

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-24 11:56               ` Avi Kivity
  2010-05-24 12:07                 ` Cláudio Martins
@ 2010-05-24 14:01                 ` MORITA Kazutaka
  2010-05-24 19:07                   ` Christian Brunner
  2010-05-24 19:16                 ` Anthony Liguori
  2 siblings, 1 reply; 64+ messages in thread
From: MORITA Kazutaka @ 2010-05-24 14:01 UTC (permalink / raw)
  To: Avi Kivity
  Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner,
	MORITA Kazutaka

At Mon, 24 May 2010 14:56:29 +0300,
Avi Kivity wrote:
> 
> On 05/24/2010 02:42 PM, MORITA Kazutaka wrote:
> >
> >> The server would be local and talk over a unix domain socket, perhaps
> >> anonymous.
> >>
> >> nbd has other issues though, such as requiring a copy and no support for
> >> metadata operations such as snapshot and file size extension.
> >>
> >>      
> > Sorry, my explanation was unclear.  I'm not sure how running servers
> > on localhost can solve the problem.
> >    
> 
> The local server can convert from the local (nbd) protocol to the remote 
> (sheepdog, ceph) protocol.
> 
> > What I wanted to say was that we cannot specify the image of VM. With
> > nbd protocol, command line arguments are as follows:
> >
> >   $ qemu nbd:hostname:port
> >
> > As this syntax shows, with nbd protocol the client cannot pass the VM
> > image name to the server.
> >    
> 
> We would extend it to allow it to connect to a unix domain socket:
> 
>    qemu nbd:unix:/path/to/socket
> 
> The server at the other end would associate the socket with a filename 
> and forward it to the server using the remote protocol.
> 

Thank you for the explanation.  Sheepdog could achieve the desired
behavior by creating socket files for all the VM images when the
daemon starts up.

> However, I don't think nbd would be a good protocol.  My preference 
> would be for a plugin API, or for a new local protocol that uses 
> splice() to avoid copies.
> 

Both would be okay for Sheepdog.  I want to take a suitable approach
for qemu.

Thanks,

Kazutaka

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-24 14:01                 ` MORITA Kazutaka
@ 2010-05-24 19:07                   ` Christian Brunner
  2010-05-24 19:38                     ` Anthony Liguori
  0 siblings, 1 reply; 64+ messages in thread
From: Christian Brunner @ 2010-05-24 19:07 UTC (permalink / raw)
  To: MORITA Kazutaka; +Cc: kvm, qemu-devel, Blue Swirl, Avi Kivity, ceph-devel

2010/5/24 MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>:

>> However, I don't think nbd would be a good protocol.  My preference
>> would be for a plugin API, or for a new local protocol that uses
>> splice() to avoid copies.
>>
>
> Both would be okay for Sheepdog.  I want to take a suitable approach
> for qemu.

I think both should be possible:

- Using splice() we would need a daemon that is listening on a control
  socket for requests from qemu processes or admin commands. When a qemu
  process wants to open an image it could call
  open_image("protocol:imagename") on the control socket, and the daemon
  would have to create a pipe to which the image is mapped.
  (What I'm unsure about are the security implications. Do we need some
  kind of authentication for the sockets? What about sVirt?)

- Building a plugin API seems a bit simpler to me, although I'm not sure
  if I get the idea correctly: the block layer already has some kind of
  API (.bdrv_file_open, .bdrv_read). We could simply compile the block
  drivers as shared objects and create a method for loading the
  necessary modules at runtime (a rough sketch follows below).
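
For instance (a sketch only; the qemu_block_module_init symbol name and
the loader below are made up, and you'd have to link with -ldl):

    #include <dlfcn.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical loader: each block driver is built as a shared object
     * exporting an init function, which in turn calls bdrv_register(). */
    static int load_block_module(const char *path)
    {
        void *handle = dlopen(path, RTLD_NOW);
        void (*init)(void);

        if (!handle) {
            fprintf(stderr, "failed to load %s: %s\n", path, dlerror());
            return -1;
        }
        init = (void (*)(void))(intptr_t)dlsym(handle,
                                               "qemu_block_module_init");
        if (!init) {
            fprintf(stderr, "%s: no qemu_block_module_init symbol\n", path);
            dlclose(handle);
            return -1;
        }
        init();  /* e.g. the rbd module would call bdrv_register(&bdrv_rbd) */
        return 0;
    }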

Are you planning to use this for all block drivers?

Regards,
Christian

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-24 11:56               ` Avi Kivity
  2010-05-24 12:07                 ` Cláudio Martins
  2010-05-24 14:01                 ` MORITA Kazutaka
@ 2010-05-24 19:16                 ` Anthony Liguori
  2010-05-25  9:19                   ` Avi Kivity
  2010-05-25 13:26                   ` MORITA Kazutaka
  2 siblings, 2 replies; 64+ messages in thread
From: Anthony Liguori @ 2010-05-24 19:16 UTC (permalink / raw)
  To: Avi Kivity
  Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner,
	MORITA Kazutaka

On 05/24/2010 06:56 AM, Avi Kivity wrote:
> On 05/24/2010 02:42 PM, MORITA Kazutaka wrote:
>>
>>> The server would be local and talk over a unix domain socket, perhaps
>>> anonymous.
>>>
>>> nbd has other issues though, such as requiring a copy and no support 
>>> for
>>> metadata operations such as snapshot and file size extension.
>>>
>> Sorry, my explanation was unclear.  I'm not sure how running servers
>> on localhost can solve the problem.
>
> The local server can convert from the local (nbd) protocol to the 
> remote (sheepdog, ceph) protocol.
>
>> What I wanted to say was that we cannot specify the image of VM. With
>> nbd protocol, command line arguments are as follows:
>>
>>   $ qemu nbd:hostname:port
>>
>> As this syntax shows, with nbd protocol the client cannot pass the VM
>> image name to the server.
>
> We would extend it to allow it to connect to a unix domain socket:
>
>   qemu nbd:unix:/path/to/socket

nbd is a no-go because it only supports a single, synchronous I/O 
operation at a time and has no mechanism for extensibility.

If we go this route, I think two options are worth considering.  The 
first would be a purely socket based approach where we just accepted the 
extra copy.

The other potential approach would be shared memory based.  We export 
all guest ram as shared memory along with a small bounce buffer pool.  
We would then use a ring queue (potentially even using virtio-blk) and 
an eventfd for notification.
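To sketch the shared memory idea a bit more concretely (all structure and
field names below are illustrative only; ring-full handling and the
completion path are omitted):

/* Illustrative sketch of a shared-memory block transport: qemu and the
 * storage daemon map the same region (guest RAM or a bounce buffer
 * pool) plus a request ring, and the daemon is kicked via an eventfd. */
#include <stdint.h>
#include <unistd.h>
#include <sys/eventfd.h>

struct blk_request {
    uint64_t sector;        /* first sector of the request          */
    uint32_t nb_sectors;    /* length in 512-byte sectors           */
    uint32_t is_write;      /* 0 = read, 1 = write                  */
    uint64_t buf_offset;    /* offset of the data in the shared map */
};

struct blk_ring {
    uint32_t prod;                  /* producer index, owned by qemu   */
    uint32_t cons;                  /* consumer index, owned by daemon */
    struct blk_request req[256];
};

/* qemu side: publish one request and wake the daemon.  kick_fd would be
 * an eventfd(0, 0) handed to the daemon, e.g. over a unix socket. */
static void blk_submit(struct blk_ring *ring, int kick_fd,
                       const struct blk_request *req)
{
    uint64_t one = 1;

    ring->req[ring->prod % 256] = *req;
    __sync_synchronize();               /* publish the slot first */
    ring->prod++;

    if (write(kick_fd, &one, sizeof(one)) < 0) {
        /* a lost kick just means the daemon wakes on the next request */
    }
}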

> The server at the other end would associate the socket with a filename 
> and forward it to the server using the remote protocol.
>
> However, I don't think nbd would be a good protocol.  My preference 
> would be for a plugin API, or for a new local protocol that uses 
> splice() to avoid copies.

I think a good shared memory implementation would be preferable to 
plugins.  I think it's worth attempting to do a plugin interface for the 
block layer but I strongly suspect it would not be sufficient.

I would not want to see plugins that interacted with BlockDriverState 
directly, for instance.  We change it far too often.  Our main loop 
functions are also not terribly stable so I'm not sure how we would 
handle that (unless we forced all block plugins to be in a separate thread).

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-24 11:03           ` Avi Kivity
@ 2010-05-24 19:19             ` Anthony Liguori
  2010-05-25  9:22               ` Avi Kivity
  0 siblings, 1 reply; 64+ messages in thread
From: Anthony Liguori @ 2010-05-24 19:19 UTC (permalink / raw)
  To: Avi Kivity
  Cc: kvm, Stefan Hajnoczi, qemu-devel, Blue Swirl, ceph-devel,
	Christian Brunner

On 05/24/2010 06:03 AM, Avi Kivity wrote:
> On 05/24/2010 11:27 AM, Stefan Hajnoczi wrote:
>> On Sun, May 23, 2010 at 1:01 PM, Avi Kivity<avi@redhat.com>  wrote:
>>> On 05/21/2010 12:29 AM, Anthony Liguori wrote:
>>>> I'd be more interested in enabling people to build these types of 
>>>> storage
>>>> systems without touching qemu.
>>>>
>>>> Both sheepdog and ceph ultimately transmit I/O over a socket to a 
>>>> central
>>>> daemon, right?
>>> That incurs an extra copy.
>> Besides a shared memory approach, I wonder if the splice() family of
>> syscalls could be used to send/receive data through a storage daemon
>> without the daemon looking at or copying the data?
>
> Excellent idea.

splice() eventually requires a copy.  You cannot splice() to linux-aio 
so you'd have to splice() to a temporary buffer and then call into 
linux-aio.  With shared memory, you can avoid ever bringing the data 
into memory via O_DIRECT and linux-aio.
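A simplified illustration of that limitation, assuming a daemon that
forwards socket data to disk via linux-aio (error handling, partial
transfers and O_DIRECT alignment are ignored):

/* Simplified: data spliced from a socket stays in kernel pipe buffers,
 * but io_submit() needs a user buffer, so a read() -- i.e. a copy --
 * is still required before the linux-aio write can be issued. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <libaio.h>

static int forward_with_copy(int sock_fd, int disk_fd, io_context_t ctx,
                             void *buf, size_t len, long long offset)
{
    struct iocb cb;
    struct iocb *cbs[1] = { &cb };
    int pipefd[2];

    if (pipe(pipefd) < 0)
        return -1;

    /* zero-copy from the socket into the pipe... */
    splice(sock_fd, NULL, pipefd[1], NULL, len, SPLICE_F_MOVE);

    /* ...but linux-aio wants a user buffer, so copy out of the pipe */
    if (read(pipefd[0], buf, len) != (ssize_t)len) {
        /* short-read handling omitted in this sketch */
    }

    close(pipefd[0]);
    close(pipefd[1]);

    io_prep_pwrite(&cb, disk_fd, buf, len, offset);
    return io_submit(ctx, 1, cbs);
}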

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-24 19:07                   ` Christian Brunner
@ 2010-05-24 19:38                     ` Anthony Liguori
  2010-05-25  9:14                       ` Avi Kivity
  0 siblings, 1 reply; 64+ messages in thread
From: Anthony Liguori @ 2010-05-24 19:38 UTC (permalink / raw)
  To: Christian Brunner
  Cc: kvm, qemu-devel, Blue Swirl, Avi Kivity, ceph-devel,
	MORITA Kazutaka

[-- Attachment #1: Type: text/plain, Size: 1895 bytes --]

On 05/24/2010 02:07 PM, Christian Brunner wrote:
> 2010/5/24 MORITA Kazutaka<morita.kazutaka@lab.ntt.co.jp>:
>
>    
>>> However, I don't think nbd would be a good protocol.  My preference
>>> would be for a plugin API, or for a new local protocol that uses
>>> splice() to avoid copies.
>>>
>>>        
>> Both would be okay for Sheepdog.  I want to take a suitable approach
>> for qemu.
>>      
> I think both should be possible:
>
> - Using splice() we would need a daemon that is listening on a control
>   socket for requests from qemu-processes or admin commands. When a
>   qemu-process wants to open an image it could call
>   open_image("protocol:imagename") on the control socket and the daemon
>   would have to create a pipe to which the image is mapped.
>   (What I'm unsure about are the security implications. Do we need some
>   kind of authentication for the sockets? What about sVirt?)
>

This is a fairly old patch that I dug out of a backup.  It uses the 9p
protocol and properly supports AIO.

At one point in time, I actually implemented splice() support but it 
didn't result in a significant improvement in benchmarks.

> - Building a plugin API seems a bit simpler to me, although I'm not sure
>   if I got the idea correctly:
>   The block layer already has some kind of API (.bdrv_file_open,
>   .bdrv_read). We could simply compile the block drivers as shared
>   objects and create a method for loading the necessary modules at
>   runtime.
>

That approach would be a recipe for disaster.   We would have to 
introduce a new, reduced functionality block API that was supported for 
plugins.  Otherwise, the only way a plugin could keep up with our API 
changes would be if it was in tree which defeats the purpose of having 
plugins.

Regards,

Anthony Liguori

> Are you planning to use this for all block drivers?
>
> Regards,
> Christian
>    


[-- Attachment #2: block-9p.patch --]
[-- Type: text/plain, Size: 50279 bytes --]

diff --git a/Makefile b/Makefile
index 4f7a55a..541b26a 100644
--- a/Makefile
+++ b/Makefile
@@ -53,7 +53,7 @@ BLOCK_OBJS=cutils.o qemu-malloc.o
 BLOCK_OBJS+=block-cow.o block-qcow.o aes.o block-vmdk.o block-cloop.o
 BLOCK_OBJS+=block-dmg.o block-bochs.o block-vpc.o block-vvfat.o
 BLOCK_OBJS+=block-qcow2.o block-parallels.o block-nbd.o
-BLOCK_OBJS+=nbd.o block.o aio.o
+BLOCK_OBJS+=nbd.o block.o aio.o block-9p.o p9.o p9c.o
 
 ifdef CONFIG_WIN32
 BLOCK_OBJS += block-raw-win32.o
diff --git a/block-9p.c b/block-9p.c
new file mode 100644
index 0000000..5570f37
--- /dev/null
+++ b/block-9p.c
@@ -0,0 +1,573 @@
+/*
+ * 9p based block driver for QEMU
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "block_int.h"
+#include "p9c.h"
+#include "qemu_socket.h"
+
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+
+//#define DEBUG_BLOCK_9P
+
+#ifdef DEBUG_BLOCK_9P
+#define dprintf(fmt, ...) \
+    do { printf("block-9p: " fmt, ## __VA_ARGS__); } while (0)
+#define _dprintf(fmt, ...) \
+    do { printf(fmt, ## __VA_ARGS__); } while (0)
+#else
+#define dprintf(fmt, ...) \
+    do { } while (0)
+#define _dprintf(fmt, ...) \
+    do { } while (0)
+#endif
+
+typedef struct BDRV9pState {
+    P9IOState iops;
+    BlockDriverState *bs;
+    P9ClientState *client_state;
+    int fd;
+    char filename[1024];
+    int nwnames;
+    const char *wnames[256];
+    int do_loop;
+    int64_t length;
+    int32_t msize;
+    int count;
+} BDRV9pState;
+
+typedef struct P9AIOCB {
+    BlockDriverAIOCB common;
+    BDRV9pState *s;
+    int64_t offset;
+    size_t size;
+    void *buf;
+} P9AIOCB;
+
+static void p9_recv_notify(void *opaque)
+{
+    BDRV9pState *s = opaque;
+    p9c_notify_can_recv(s->client_state);
+}
+
+static void p9_send_notify(void *opaque)
+{
+    BDRV9pState *s = opaque;
+    p9c_notify_can_send(s->client_state);
+}
+
+static BDRV9pState *to_bs(P9IOState *iops)
+{
+    return container_of(iops, BDRV9pState, iops);
+}
+
+static ssize_t p9_send(P9IOState *iops, const void *data, size_t size)
+{
+    BDRV9pState *s = to_bs(iops);
+    ssize_t len;
+    len = send(s->fd, data, size, 0);
+    if (len == -1)
+        errno = socket_error();
+    return len;
+}
+
+static ssize_t p9_recv(P9IOState *iops, void *data, size_t size)
+{
+    BDRV9pState *s = to_bs(iops);
+    ssize_t len;
+    len = recv(s->fd, data, size, 0);
+    if (len == -1)
+        errno = socket_error();
+    return len;
+}
+
+static int p9_flush(void *opaque)
+{
+    BDRV9pState *s = opaque;
+    return !!s->count || s->do_loop;
+}
+
+static void p9_set_send_notify(P9IOState *iops, int enable)
+{
+    BDRV9pState *s = to_bs(iops);
+
+    if (enable)
+        qemu_aio_set_fd_handler(s->fd, p9_recv_notify, p9_send_notify, p9_flush, s);
+    else 
+        qemu_aio_set_fd_handler(s->fd, p9_recv_notify, NULL, p9_flush, s);
+}
+
+static int p9_open_cb(void *opaque, int ret, const P9QID *qid, int32_t iounit)
+{
+    BDRV9pState *s = opaque;
+
+    if (ret) {
+        dprintf("Rerror: %s\n", strerror(ret));
+        s->do_loop = 0;
+        return -ret;
+    }
+
+    dprintf("Ropen(qid={type=%d, version=%d, path=%" PRId64 "}, iounit=%d)\n",
+            qid->type, qid->version, qid->path, iounit);
+
+    s->do_loop = 0;
+
+    return 0;
+}
+
+static int p9_stat_cb(void *opaque, int ret, const P9Stat *stbuf)
+{
+    BDRV9pState *s = opaque;
+
+    if (ret) {
+        dprintf("Rstat error: %s\n", strerror(ret));
+        s->do_loop = 0;
+        return -ret;
+    }
+
+    dprintf("Rstat(size=%d, type=%d, dev=%d, "
+            "qid={type=%d, version=%d, path=%" PRId64 "}, "
+            "mode=%d, atime=%d, mtime=%d, length=%" PRId64 ", name=%s, uid=%s, "
+            "gid=%s, muid=%s, extension=%s, nuid=%d, ngid=%d, nmuid=%d)\n",
+            stbuf->size, stbuf->type, stbuf->dev, stbuf->qid.type,
+            stbuf->qid.version, stbuf->qid.path, stbuf->mode, stbuf->atime,
+            stbuf->mtime, stbuf->length, stbuf->name, stbuf->uid, stbuf->gid,
+            stbuf->muid, stbuf->extension, stbuf->n_uid, stbuf->n_gid,
+            stbuf->n_muid);
+
+    s->length = stbuf->length;
+
+    if (p9c_open(s->client_state, 1, P9_O_RDWR, p9_open_cb, s) < 0) {
+        dprintf("Topen failed\n");
+        s->do_loop = 0;
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+static int p9_walk_cb(void *opaque, int ret, int16_t nwqid, const P9QID *wqids)
+{
+    BDRV9pState *s = opaque;
+    int i;
+
+    if (ret) {
+        dprintf("Rerror: %s\n", strerror(ret));
+        s->do_loop = 0;
+        return -ret;
+    }
+
+    dprintf("Rwalk(nwqid=%d, wqids={", nwqid);
+    for (i = 0; i < nwqid; i++) {
+        if (i)
+            _dprintf(", ");
+        _dprintf("{type=%d, version=%d, path=%" PRId64 "}",
+                 wqids[i].type, wqids[i].version, wqids[i].path);
+    }
+    _dprintf("})\n");
+
+    if (p9c_stat(s->client_state, 1, p9_stat_cb, s) < 0) {
+        dprintf("Tstat failed\n");
+        s->do_loop = 0;
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+static int p9_attach_cb(void *opaque, int ret, const P9QID *qid)
+{
+    BDRV9pState *s = opaque;
+
+    if (ret) {
+        dprintf("Rerror: %s\n", strerror(ret));
+        s->do_loop = 0;
+        return -ret;
+    }
+
+    dprintf("Rattach(qid={type=%d, version=%d, path=%" PRId64 "})\n",
+            qid->type, qid->version, qid->path);
+
+    if (p9c_walk(s->client_state, 0, 1, s->nwnames, s->wnames,
+                 p9_walk_cb, s) < 0) {
+        dprintf("Twalk failed\n");
+        s->do_loop = 0;
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+static int p9_version_cb(void *opaque, int ret, int32_t msize,
+                         const char *version)
+{
+    BDRV9pState *s = opaque;
+
+    if (ret) {
+        dprintf("Rerror: %s\n", strerror(ret));
+        s->do_loop = 0;
+        return -ret;
+
+    }
+
+    s->msize = msize;
+
+    dprintf("Rversion(msize=%d, version=%s)\n", msize, version);
+
+    /* FIXME get username */
+    if (p9c_attach(s->client_state, 0, -1, "anthony", NULL, 200,
+                   p9_attach_cb, s) < 0) {
+        dprintf("Tattach failed\n");
+        s->do_loop = 0;
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+static int p9dial_outgoing_unix(const char *path)
+{
+    int s;
+    struct sockaddr_un addr;
+
+    s = socket(PF_UNIX, SOCK_STREAM, 0);
+    if (s == -1)
+        return -1;
+
+    memset(&addr, 0, sizeof(addr));
+    addr.sun_family = AF_UNIX;
+    snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", path);
+
+    if (connect(s, (struct sockaddr *)&addr, sizeof(addr)) == -1)
+        goto error;
+
+    return s;
+error:
+    close(s);
+    return -1;
+}
+
+static int p9dial_outgoing_tcp(const char *hostname, const char *service)
+{
+    int s;
+    struct in_addr in;
+    struct sockaddr_in addr;
+    uint16_t port;
+    char *endptr;
+
+    s = socket(PF_INET, SOCK_STREAM, 0);
+    if (s == -1)
+        return -1;
+
+    if (inet_aton(hostname, &in) == 0) {
+        struct hostent *ent;
+
+        ent = gethostbyname(hostname);
+        if (ent == NULL)
+            goto error;
+
+        memcpy(&in, ent->h_addr, sizeof(in));
+    }
+
+    port = strtol(service, &endptr, 10);
+    if (endptr && *endptr) {
+	struct servent *ent;
+
+	ent = getservbyname(service, "tcp");
+	if (ent == NULL)
+	    goto error;
+
+	port = ntohs(ent->s_port);
+    }
+
+    addr.sin_family = AF_INET;
+    addr.sin_port = htons(port);
+    memcpy(&addr.sin_addr.s_addr, &in, sizeof(in));
+
+    if (connect(s, (struct sockaddr *)&addr, sizeof(addr)) == -1)
+        goto error;
+
+    return s;
+error:
+    close(s);
+    return -1;
+}
+
+static int p9dial(const char *path)
+{
+    int fd = -1;
+    const char *p;
+
+    if (strstart(path, "tcp!", &p)) {
+	char hostname[1024];
+	char *service;
+	size_t len;
+
+	service = strchr(p, '!');
+	if (!service) {
+	    errno = EINVAL;
+	    goto out;
+	}
+
+	len = MIN(sizeof(hostname) - 1, service - p);
+	memcpy(hostname, p, len);
+	hostname[len] = 0;
+
+	fd = p9dial_outgoing_tcp(hostname, service + 1);
+    } else if (strstart(path, "unix!", &p)) {
+	fd = p9dial_outgoing_unix(p);
+    } else
+	errno = EINVAL;
+
+    out:
+    return fd;
+}
+
+static int p9_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRV9pState *s = bs->opaque;
+    const char *p;
+    char *file, *ptr;
+    char host[1024];
+    int len;
+
+    if (!strstart(filename, "9p:", &p))
+        return -EINVAL;
+
+    /* FIXME handle quoting */
+
+    file = strchr(p, ':');
+    if (file == NULL)
+        return -EINVAL;
+
+    snprintf(s->filename, sizeof(s->filename), "%s", file + 1);
+
+    /* FIXME be dynamic */
+
+    s->nwnames = 0;
+    ptr = s->filename;
+    while (ptr && s->nwnames < 256) {
+        s->wnames[s->nwnames++] = ptr;
+        ptr = strchr(ptr, '/');
+        if (ptr) {
+            *ptr = 0;
+            ptr++;
+        }
+    }
+
+    s->count = 0;
+
+    len = MIN(file - p, sizeof(host) - 1);
+    memcpy(host, p, len);
+    host[len] = 0;
+
+    s->fd = p9dial(host);
+
+    socket_set_nonblock(s->fd);
+
+    qemu_aio_set_fd_handler(s->fd, p9_recv_notify, NULL, p9_flush, s);
+
+    /* FIXME better cleanup */
+
+    s->iops.send = p9_send;
+    s->iops.recv = p9_recv;
+    s->iops.set_send_notify = p9_set_send_notify;
+
+    s->client_state = p9c_init(&s->iops);
+    if (s->client_state == NULL) {
+        dprintf("p9c_init failed\n");
+        return -EINVAL;
+    }
+
+    if (p9c_version(s->client_state, p9_version_cb, s) < 0) {
+        dprintf("Tversion failed\n");
+        return -EINVAL;
+    }
+
+    dprintf("Entering wait loop\n");
+    s->do_loop = 1;
+    while (s->do_loop)
+        qemu_aio_wait();
+    dprintf("Left wait loop\n");
+    
+    return 0;
+}
+
+static int p9c_read_cb(void *opaque, int ret, int32_t count, const void *data)
+{
+    P9AIOCB *aiocb = opaque;
+    BDRV9pState *s = aiocb->s;
+
+    s->count--;
+
+    if (ret) {
+        dprintf("Rerror: %s\n", strerror(ret));
+        aiocb->common.cb(aiocb->common.opaque, ret);
+        qemu_aio_release(aiocb);
+        return -ret;
+    }
+
+    memcpy(aiocb->buf, data, count);
+    aiocb->buf += count;
+    aiocb->offset += count;
+    aiocb->size -= count;
+
+    dprintf("Rread(count=%d, data=...)\n", count);
+
+    if (aiocb->size) {
+        s->count++;
+        if (p9c_read(aiocb->s->client_state, 1, aiocb->offset,
+                     MIN(aiocb->size, aiocb->s->msize - 24),
+                     p9c_read_cb, aiocb) < 0) {
+            dprintf("Tread failed\n");
+            return -1;
+        }
+    } else {
+        aiocb->common.cb(aiocb->common.opaque, 0);
+        qemu_aio_release(aiocb);
+    }
+
+    return 0;
+}
+
+static BlockDriverAIOCB *p9_aio_read(BlockDriverState *bs, int64_t sector_num,
+                                     uint8_t *buf, int nb_sectors,
+                                     BlockDriverCompletionFunc *cb,
+                                     void *opaque)
+{
+    BDRV9pState *s = bs->opaque;
+    P9AIOCB *aiocb;
+
+    dprintf("aio_read(sector_num=%" PRId64 ", nb_sectors=%d)\n",
+            sector_num, nb_sectors);
+
+    aiocb = qemu_aio_get(bs, cb, opaque);
+    if (aiocb == NULL)
+        return NULL;
+
+    aiocb->s = s;
+    aiocb->offset = sector_num * 512;
+    aiocb->size = nb_sectors * 512;
+    aiocb->buf = buf;
+
+    s->count++;
+    if (p9c_read(aiocb->s->client_state, 1, aiocb->offset,
+                 MIN(aiocb->size, s->msize - 24),
+                 p9c_read_cb, aiocb) < 0) {
+        dprintf("Tread failed\n");
+        return NULL;
+    }
+
+    return &aiocb->common;
+}
+
+static int p9c_write_cb(void *opaque, int ret, int32_t count)
+{
+    P9AIOCB *aiocb = opaque;
+    BDRV9pState *s = aiocb->s;
+
+    s->count--;
+
+    if (ret) {
+        dprintf("Rerror: %s\n", strerror(ret));
+        aiocb->common.cb(aiocb->common.opaque, ret);
+        qemu_aio_release(aiocb);
+        return -ret;
+    }
+
+    aiocb->buf += count;
+    aiocb->offset += count;
+    aiocb->size -= count;
+
+    dprintf("Rwrite(count=%d)\n", count);
+
+    if (aiocb->size) {
+        s->count++;
+        if (p9c_write(aiocb->s->client_state, 1, aiocb->offset,
+                      MIN(aiocb->size, aiocb->s->msize - 24),
+                      aiocb->buf, p9c_write_cb, aiocb) < 0) {
+            dprintf("Twrite failed\n");
+            return -1;
+        }
+    } else {
+        aiocb->common.cb(aiocb->common.opaque, 0);
+        qemu_aio_release(aiocb);
+    }
+
+    return 0;
+}
+
+static BlockDriverAIOCB *p9_aio_write(BlockDriverState *bs, int64_t sector_num,
+                                      const uint8_t *buf, int nb_sectors,
+                                      BlockDriverCompletionFunc *cb,
+                                      void *opaque)
+{
+    BDRV9pState *s = bs->opaque;
+    P9AIOCB *aiocb;
+
+    dprintf("aio_write(sector_num=%" PRId64 ", nb_sectors=%d)\n",
+            sector_num, nb_sectors);
+
+    aiocb = qemu_aio_get(bs, cb, opaque);
+    if (aiocb == NULL)
+        return NULL;
+
+    aiocb->s = s;
+    aiocb->offset = sector_num * 512;
+    aiocb->size = nb_sectors * 512;
+    aiocb->buf = (void *)buf;
+
+    s->count++;
+    if (p9c_write(aiocb->s->client_state, 1, aiocb->offset,
+                  MIN(aiocb->size, s->msize - 24),
+                  aiocb->buf,
+                  p9c_write_cb, aiocb) < 0) {
+        dprintf("Twrite failed\n");
+        return NULL;
+    }
+
+    return &aiocb->common;
+}
+
+static void p9_close(BlockDriverState *bs)
+{
+    BDRV9pState *s = bs->opaque;
+
+    dprintf("Closing\n");
+
+    /* FIXME should I clunk? */
+    qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL);
+    closesocket(s->fd);
+    p9c_free(s->client_state);
+}
+
+static int64_t p9_getlength(BlockDriverState *bs)
+{
+    BDRV9pState *s = bs->opaque;
+    return s->length;
+}
+
+BlockDriver bdrv_9p = {
+    .format_name = "9p",
+    .instance_size = sizeof(BDRV9pState),
+    .bdrv_open = p9_open,
+    .bdrv_aio_read = p9_aio_read,
+    .bdrv_aio_write = p9_aio_write,
+    .aiocb_size = sizeof(P9AIOCB),
+    .bdrv_close = p9_close,
+    .bdrv_getlength = p9_getlength,
+    .protocol_name = "9p",
+};
+
diff --git a/block.c b/block.c
index 7c744c7..7bb4f98 100644
--- a/block.c
+++ b/block.c
@@ -1535,6 +1535,7 @@ void bdrv_init(void)
     bdrv_register(&bdrv_qcow2);
     bdrv_register(&bdrv_parallels);
     bdrv_register(&bdrv_nbd);
+    bdrv_register(&bdrv_9p);
 }
 
 void *qemu_aio_get(BlockDriverState *bs, BlockDriverCompletionFunc *cb,
diff --git a/block.h b/block.h
index e1927dd..bcde8e0 100644
--- a/block.h
+++ b/block.h
@@ -20,6 +20,7 @@ extern BlockDriver bdrv_vvfat;
 extern BlockDriver bdrv_qcow2;
 extern BlockDriver bdrv_parallels;
 extern BlockDriver bdrv_nbd;
+extern BlockDriver bdrv_9p;
 
 typedef struct BlockDriverInfo {
     /* in bytes, 0 if irrelevant */
diff --git a/p9.c b/p9.c
new file mode 100644
index 0000000..4e151a5
--- /dev/null
+++ b/p9.c
@@ -0,0 +1,637 @@
+#include <string.h>
+#include <inttypes.h>
+#include <malloc.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <stdlib.h>
+
+#include "sys-queue.h"
+
+#include "p9.h"
+
+#define cpu_to_le8(val) (val)
+#define cpu_to_le16(val) (val)
+#define cpu_to_le32(val) (val)
+#define cpu_to_le64(val) (val)
+
+#define BUG() do { abort(); } while (0)
+
+typedef struct _P9PDU
+{
+    P9PDU pdu;
+    TAILQ_ENTRY(_P9PDU) node;
+} _P9PDU;
+
+struct P9State
+{
+    int32_t msize;
+
+    size_t active_tx_offset;
+    P9PDU *active_tx;
+    TAILQ_HEAD(, _P9PDU) tx_queue;
+
+    P9PDU *active_rx;
+
+    P9IOState *iops;
+    P9PDUState *pduops;
+};
+
+static size_t pdu_read(P9PDU *pdu, void *data, size_t size)
+{
+    size_t len = MIN(pdu->size - pdu->offset, size);
+    memcpy(data, &pdu->buffer[pdu->offset], len);
+    pdu->offset += len;
+    return size - len;
+}
+
+static size_t pdu_write(P9PDU *pdu, const void *data, size_t size)
+{
+    size_t len = MIN(pdu->capacity - pdu->size, size);
+    memcpy(&pdu->buffer[pdu->size], data, len);
+    pdu->size += len;
+    return size - len;
+}
+
+/* b - int8_t
+   w - int16_t
+   d - int32_t
+   q - int64_t
+   s - string
+   S - stat
+   Q - qid
+   D - data blob (int32_t size followed by void *, the results are not freed)
+   T - array of strings (int16_t count, followed by strings)
+   R - array of qids (int16_t count, followed by qids)
+   ? - if optional = 1, continue parsing
+*/
+
+int p9pdu_vreadf(P9PDU *pdu, int optional, const char *fmt, va_list ap)
+{
+    const char *ptr;
+    int errcode = 0;
+
+    for (ptr = fmt; *ptr; ptr++) {
+        switch (*ptr) {
+        case 'b': {
+            int8_t *val = va_arg(ap, int8_t *);
+            if (pdu_read(pdu, val, sizeof(*val))) {
+                errcode = -EFAULT;
+                break;
+            }
+            *val = cpu_to_le8(*val);
+        }   break;
+        case 'w': {
+            int16_t *val = va_arg(ap, int16_t *);
+            if (pdu_read(pdu, val, sizeof(*val))) {
+                errcode = -EFAULT;
+                break;
+            }
+            *val = cpu_to_le16(*val);
+        }   break;
+        case 'd': {
+            int32_t *val = va_arg(ap, int32_t *);
+            if (pdu_read(pdu, val, sizeof(*val))) {
+                errcode = -EFAULT;
+                break;
+            }
+            *val = cpu_to_le32(*val);
+        }   break;
+        case 'q': {
+            int64_t *val = va_arg(ap, int64_t *);
+            if (pdu_read(pdu, val, sizeof(*val))) {
+                errcode = -EFAULT;
+                break;
+            }
+            *val = cpu_to_le64(*val);
+        }   break;
+        case 's': {
+            char **ptr = va_arg(ap, char **);
+            int16_t len;
+            int size;
+
+            errcode = p9pdu_readf(pdu, optional, "w", &len);
+            if (errcode)
+                break;
+
+            size = MAX(len, 0);
+
+            *ptr = malloc(size + 1);
+            if (*ptr == NULL) {
+                errcode = -EFAULT;
+                break;
+            }
+            if (pdu_read(pdu, *ptr, size)) {
+                errcode = -EFAULT;
+                free(*ptr);
+                *ptr = NULL;
+            } else
+                (*ptr)[size] = 0;
+        }   break;
+        case 'Q': {
+            P9QID *qid = va_arg(ap, P9QID *);
+
+            errcode = p9pdu_readf(pdu, optional, "bdq",
+                                  &qid->type, &qid->version, &qid->path);
+        }   break;
+        case 'S': {
+            P9Stat *stbuf = va_arg(ap, P9Stat *);
+
+            stbuf->extension = NULL;
+            stbuf->n_uid = stbuf->n_gid = stbuf->n_muid = -1;
+
+            errcode = p9pdu_readf(pdu, optional, "wwdQdddqssss?sddd",
+                                  &stbuf->size, &stbuf->type,
+                                  &stbuf->dev, &stbuf->qid,
+                                  &stbuf->mode, &stbuf->atime,
+                                  &stbuf->mtime, &stbuf->length,
+                                  &stbuf->name, &stbuf->uid,
+                                  &stbuf->gid, &stbuf->muid,
+                                  &stbuf->extension, &stbuf->n_uid,
+                                  &stbuf->n_gid, &stbuf->n_muid);
+            if (errcode)
+                p9stat_free(stbuf);
+        }   break;
+        case 'D': {
+            int32_t *count = va_arg(ap, int32_t *);
+            void **data = va_arg(ap, void **);
+
+            errcode = p9pdu_readf(pdu, optional, "d", count);
+            if (!errcode) {
+                *count = MIN(*count, pdu->size - pdu->offset);
+                *data = &pdu->buffer[pdu->offset];
+            }
+        }   break;
+        case 'T': {
+            int16_t *nwname = va_arg(ap, int16_t *);
+            char ***wnames = va_arg(ap, char ***);
+
+            errcode = p9pdu_readf(pdu, optional, "w", nwname);
+            if (!errcode) {
+                *wnames = malloc(sizeof(char *) * *nwname);
+                if (!*wnames)
+                    errcode = -ENOMEM;
+            }
+
+            if (!errcode) {
+                int i;
+
+                for (i = 0; i < *nwname; i++) {
+                    errcode = p9pdu_readf(pdu, optional, "s", &(*wnames)[i]);
+                    if (errcode)
+                        break;
+                }
+            }
+
+            if (errcode) {
+                if (*wnames) {
+                    int i;
+
+                    for (i = 0 ; i < *nwname; i++)
+                        free((*wnames)[i]);
+                }
+                free(*wnames);
+                *wnames = NULL;
+            }
+        }   break;
+        case 'R': {
+            int16_t *nwqid = va_arg(ap, int16_t *);
+            P9QID **wqids = va_arg(ap, P9QID **);
+
+            *wqids = NULL;
+
+            errcode = p9pdu_readf(pdu, optional, "w", nwqid);
+            if (!errcode) {
+                *wqids = malloc(*nwqid * sizeof(P9QID));
+                if (*wqids == NULL)
+                    errcode = -ENOMEM;
+            }
+
+            if (!errcode) {
+                int i;
+
+                for (i = 0; i < *nwqid; i++) {
+                    errcode = p9pdu_readf(pdu, optional, "Q", &(*wqids)[i]);
+                    if (errcode)
+                        break;
+                }
+            }
+
+            if (errcode) {
+                free(*wqids);
+                *wqids = NULL;
+            }
+        }   break;
+        case '?':
+            if (!optional)
+                return 0;
+            break;
+        default:
+            BUG();
+            break;
+        }
+
+        if (errcode)
+            break;
+    }
+
+    return errcode;
+}
+
+int p9pdu_vwritef(P9PDU *pdu, int optional, const char *fmt, va_list ap)
+{
+    const char *ptr;
+    int errcode = 0;
+
+    for (ptr = fmt; *ptr; ptr++) {
+        switch (*ptr) {
+        case 'b': {
+            int8_t val = va_arg(ap, int);
+            if (pdu_write(pdu, &val, sizeof(val)))
+                errcode = -EFAULT;
+        }   break;
+        case 'w': {
+            int16_t val = va_arg(ap, int);
+            if (pdu_write(pdu, &val, sizeof(val)))
+                errcode = -EFAULT;
+        }   break;
+        case 'd': {
+            int32_t val = va_arg(ap, int32_t);
+            if (pdu_write(pdu, &val, sizeof(val)))
+                errcode = -EFAULT;
+        }   break;
+        case 'q': {
+            int64_t val = va_arg(ap, int64_t);
+            if (pdu_write(pdu, &val, sizeof(val)))
+                errcode = -EFAULT;
+        }   break;
+        case 's': {
+            const char *ptr = va_arg(ap, const char *);
+            int16_t len = 0;
+
+            if (ptr)
+                len = MIN(strlen(ptr), INT16_MAX);
+
+            errcode = p9pdu_writef(pdu, optional, "w", len);
+            if (!errcode && pdu_write(pdu, ptr, len))
+                errcode = -EFAULT;
+        }   break;
+        case 'Q': {
+            const P9QID *qid = va_arg(ap, const P9QID *);
+            errcode = p9pdu_writef(pdu, optional, "bdq",
+                                   qid->type, qid->version, qid->path);
+        }   break;
+        case 'S': {
+            const P9Stat *stbuf = va_arg(ap, const P9Stat *);
+            errcode = p9pdu_writef(pdu, optional, "wwdQdddqssss?sddd",
+                                   stbuf->size, stbuf->type,
+                                   stbuf->dev, stbuf->qid,
+                                   stbuf->mode, stbuf->atime,
+                                   stbuf->mtime, stbuf->length,
+                                   stbuf->name, stbuf->uid,
+                                   stbuf->gid, stbuf->muid,
+                                   stbuf->extension, stbuf->n_uid,
+                                   stbuf->n_gid, stbuf->n_muid);
+        }   break;
+        case 'D': {
+            int32_t count = va_arg(ap, int32_t);
+            const void *data = va_arg(ap, const void *);
+
+            errcode = p9pdu_writef(pdu, optional, "d", count);
+            if (!errcode && pdu_write(pdu, data, count))
+                errcode = -EFAULT;
+        }   break;
+        case 'T': {
+            int16_t nwname = va_arg(ap, int);
+            const char **wnames = va_arg(ap, const char **);
+
+            errcode = p9pdu_writef(pdu, optional, "w", nwname);
+            if (!errcode) {
+                int i;
+
+                for (i = 0; i < nwname; i++) {
+                    errcode = p9pdu_writef(pdu, optional, "s", wnames[i]);
+                    if (errcode)
+                        break;
+                }
+            }
+        }   break;
+        case 'R': {
+            int16_t nwqid = va_arg(ap, int);
+            P9QID *wqids = va_arg(ap, P9QID *);
+
+            errcode = p9pdu_writef(pdu, optional, "w", nwqid);
+            if (!errcode) {
+                int i;
+
+                for (i = 0; i < nwqid; i++) {
+                    errcode = p9pdu_writef(pdu, optional, "Q", &wqids[i]);
+                    if (errcode)
+                        break;
+                }
+            }
+        }   break;
+        case '?':
+            if (!optional)
+                return 0;
+            break;
+        default:
+            BUG();
+            break;
+        }
+
+        if (errcode)
+            break;
+    }
+
+    return errcode;
+}
+
+int p9pdu_readf(P9PDU *pdu, int optional, const char *fmt, ...)
+{
+    va_list ap;
+    int ret;
+
+    va_start(ap, fmt);
+    ret = p9pdu_vreadf(pdu, optional, fmt, ap);
+    va_end(ap);
+
+    return ret;
+}
+
+int p9pdu_writef(P9PDU *pdu, int optional, const char *fmt, ...)
+{
+    va_list ap;
+    int ret;
+
+    va_start(ap, fmt);
+    ret = p9pdu_vwritef(pdu, optional, fmt, ap);
+    va_end(ap);
+
+    return ret;
+}
+
+void p9stat_free(P9Stat *stbuf)
+{
+    free(stbuf->name);
+    free(stbuf->uid);
+    free(stbuf->gid);
+    free(stbuf->muid);
+    free(stbuf->extension);
+}
+
+static P9PDU *p9pdu_get(P9State *s)
+{
+    _P9PDU *pdu;
+
+    pdu = malloc(sizeof(*pdu) + s->msize);
+    if (pdu == NULL)
+        return NULL;
+
+    pdu->pdu.offset = 0;
+    pdu->pdu.size = 0;
+    pdu->pdu.capacity = s->msize;
+    pdu->pdu.buffer = (uint8_t *)pdu + sizeof(*pdu);
+
+    return &pdu->pdu;
+}
+
+static void p9pdu_put(P9State *s, P9PDU *pdu)
+{
+    _P9PDU *_pdu = container_of(pdu, _P9PDU, pdu);
+    free(_pdu);
+}
+
+#include <stdio.h>
+
+static int p9_try_to_tx(P9State *s)
+{
+    ssize_t ret;
+
+    do {
+        P9PDU *pdu;
+        size_t len;
+
+        if (!s->active_tx) {
+            _P9PDU *_pdu;
+
+            if (TAILQ_EMPTY(&s->tx_queue))
+                break;
+
+            _pdu = TAILQ_FIRST(&s->tx_queue);
+            TAILQ_REMOVE(&s->tx_queue, _pdu, node);
+            s->active_tx_offset = 0;
+            s->active_tx = &_pdu->pdu;
+        }
+
+        pdu = s->active_tx;
+
+        len = pdu->size - s->active_tx_offset;
+
+        ret = s->iops->send(s->iops, pdu->buffer + s->active_tx_offset, len);
+        if (ret == -1) {
+            if (errno == EINTR)
+                continue;
+            if (errno == EAGAIN) {
+                s->iops->set_send_notify(s->iops, 1);
+                break;
+            }
+            return -errno;
+        } else if (ret == 0)
+            return -EPIPE;
+
+        s->active_tx_offset += ret;
+        if (s->active_tx_offset == pdu->size) {
+            p9pdu_put(s, pdu);
+            s->active_tx = NULL;
+            s->active_tx_offset = 0;
+        }
+    } while (ret > 0);
+
+    return 0;
+}
+
+int p9_notify_can_send(P9State *s)
+{
+    s->iops->set_send_notify(s->iops, 0);
+
+    return p9_try_to_tx(s);
+}
+
+int p9_notify_can_recv(P9State *s)
+{
+    P9PDU *rx;
+    int ret;
+
+    while (1) {
+        int32_t size;
+
+        if (s->active_rx == NULL)
+            s->active_rx = p9pdu_get(s);
+
+        rx = s->active_rx;
+
+        while (rx->size < 7) {
+            ssize_t len;
+
+            len = s->iops->recv(s->iops, rx->buffer + rx->size, 7 - rx->size);
+            if (len == -1 && errno == EINTR)
+                continue;
+            else if (len == -1 && errno == EAGAIN)
+                return 0;
+            else if (len == 0) {
+                ret = -EPIPE;
+                goto err;
+            } else if (len == -1) {
+                ret = -errno;
+                goto err;
+            }
+
+            rx->size += len;
+        }
+
+        memcpy(&size, rx->buffer, 4);
+        size = cpu_to_le32(size);
+        if (size < 0 || size < 7) {
+            ret = -EFAULT;
+            goto err;
+        }
+
+        /* Our packet size is greater than msize, FIXME we should drain this
+         * many bytes from the socket in order to allow us to continue */
+        if (size > rx->capacity) {
+            ret = -EFAULT;
+            goto err;
+        }
+    
+        while (rx->size < size) {
+            ssize_t len;
+            
+            len = s->iops->recv(s->iops, rx->buffer + rx->size, size - rx->size);
+            if (len == -1 && errno == EINTR)
+                continue;
+            else if (len == -1 && errno == EAGAIN)
+                return 0;
+            else if (len == 0) {
+                ret = -EPIPE;
+                goto err;
+            } else if (len == -1) {
+                ret = -errno;
+                goto err;
+            }
+
+            rx->size += len;
+        }
+
+        ret = s->pduops->dispatch_pdu(s->pduops, rx);
+        if (ret)
+            goto err;
+
+        p9pdu_put(s, rx);
+        s->active_rx = NULL;
+    }
+
+    return 0;
+
+err:
+    p9pdu_put(s, rx);
+    s->active_rx = NULL;
+    return ret;
+}
+
+int p9_set_msize(P9State *s, int32_t msize)
+{
+    if (msize < 7)
+        return -EINVAL;
+
+    s->msize = msize;
+
+    return 0;
+}
+
+int p9_send_pdu(P9State *s, P9PDU *pdu)
+{
+    _P9PDU *_pdu = container_of(pdu, _P9PDU, pdu);
+
+    TAILQ_INSERT_TAIL(&s->tx_queue, _pdu, node);
+
+    return p9_try_to_tx(s);
+}
+
+int p9_send_vpduf(P9State *s, int optional, int16_t tag, int8_t type, const char *fmt, va_list ap)
+{
+    P9PDU *pdu;
+    int32_t size;
+    int errcode;
+
+    pdu = p9pdu_get(s);
+    if (pdu == NULL)
+        return -ENOMEM;
+
+    pdu->size = 7;
+    errcode = p9pdu_vwritef(pdu, optional, fmt, ap);
+
+    if (errcode) {
+        p9pdu_put(s, pdu);
+        return errcode;
+    }
+
+    /* FIXME endianness */
+    size = pdu->size;
+    memcpy(pdu->buffer + 0, &size, 4);
+    pdu->buffer[4] = type;
+    memcpy(pdu->buffer + 5, &tag, 2);
+
+    return p9_send_pdu(s, pdu);
+}
+
+int p9_send_pduf(P9State *s, int optional, int16_t tag, int8_t type, const char *fmt, ...)
+{
+    int errcode;
+    va_list ap;
+
+    va_start(ap, fmt);
+    errcode = p9_send_vpduf(s, optional, tag, type, fmt, ap);
+    va_end(ap);
+
+    return errcode;
+}
+
+P9State *p9_init(P9IOState *iops, P9PDUState *pduops)
+{
+    P9State *s;
+
+    s = malloc(sizeof(*s));
+    if (s == NULL)
+        return NULL;
+
+    s->msize = 4096;
+
+    s->active_tx_offset = 0;
+    s->active_tx = NULL;
+    TAILQ_INIT(&s->tx_queue);
+
+    s->active_rx = NULL;
+
+    s->iops = iops;
+    s->pduops = pduops;
+
+    return s;
+}
+
+void p9_free(P9State *s)
+{
+    if (s->active_rx)
+        p9pdu_put(s, s->active_rx);
+    if (s->active_tx)
+        p9pdu_put(s, s->active_tx);
+
+    while (!TAILQ_EMPTY(&s->tx_queue)) {
+        _P9PDU *_pdu;
+
+        _pdu = TAILQ_FIRST(&s->tx_queue);
+        TAILQ_REMOVE(&s->tx_queue, _pdu, node);
+        p9pdu_put(s, &_pdu->pdu);
+    }
+
+    free(s);
+}
diff --git a/p9.h b/p9.h
new file mode 100644
index 0000000..f63e424
--- /dev/null
+++ b/p9.h
@@ -0,0 +1,163 @@
+/*
+ * 9p client library
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef LIBP9_H
+#define LIBP9_H
+
+#include <sys/types.h>
+#include <inttypes.h>
+#include <stdarg.h>
+
+#include "sys-queue.h"
+
+#ifndef MIN
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+#endif
+
+#ifndef MAX
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef offset_of
+#define offset_of(type, memb) \
+    ((unsigned long)(&((type *)0)->memb))
+#endif
+#ifndef container_of
+#define container_of(obj, type, memb) \
+    ((type *)(((char *)obj) - offset_of(type, memb)))
+#endif
+
+#define P9_VERSION	100
+#define P9_AUTH		102
+#define P9_ATTACH	104
+#define P9_ERROR	106
+#define P9_FLUSH	108
+#define P9_WALK		110
+#define P9_OPEN		112
+#define P9_CREATE	114
+#define P9_READ		116
+#define P9_WRITE	118
+#define P9_CLUNK	120
+#define P9_REMOVE	122
+#define P9_STAT		124
+#define P9_WSTAT	126
+
+#define P9_O_READ	0x00
+#define P9_O_WRITE	0x01
+#define P9_O_RDWR	0x02
+#define P9_O_EXEC	0x03
+#define P9_O_EXCL	0x04
+#define P9_O_TRUNC	0x10
+#define P9_O_REXEC	0x20
+#define P9_O_RCLOSE	0x40
+#define P9_O_APPEND	0x80
+
+#define P9_STAT_MODE_DIR	0x80000000
+#define P9_STAT_MODE_APPEND	0x40000000
+#define P9_STAT_MODE_EXCL	0x20000000
+#define P9_STAT_MODE_MOUNT	0x10000000
+#define P9_STAT_MODE_AUTH	0x08000000
+#define P9_STAT_MODE_TMP	0x04000000
+#define P9_STAT_MODE_SYMLINK	0x02000000
+#define P9_STAT_MODE_LINK	0x01000000
+#define P9_STAT_MODE_DEVICE	0x00800000
+#define P9_STAT_MODE_NAMED_PIPE	0x00200000
+#define P9_STAT_MODE_SOCKET	0x00100000
+#define P9_STAT_MODE_SETUID	0x00080000
+#define P9_STAT_MODE_SETGID	0x00040000
+#define P9_STAT_MODE_SETVTX	0x00010000
+
+#define P9_STAT_MODE_SPECIAL	(P9_STAT_MODE_NAMED_PIPE | \
+				 P9_STAT_MODE_SYMLINK | \
+				 P9_STAT_MODE_LINK | \
+				 P9_STAT_MODE_DEVICE)
+
+
+#define P9_QID_TYPE_DIR		0x80
+#define P9_QID_TYPE_SYMLINK	0x02
+
+typedef struct P9PDU
+{
+    size_t offset;
+    size_t size;
+    size_t capacity;
+    uint8_t *buffer;
+} P9PDU;
+
+typedef struct P9QID
+{
+    int8_t type;
+    int32_t version;
+    int64_t path;
+} P9QID;
+
+typedef struct P9Stat
+{
+    int16_t size;
+    int16_t type;
+    int32_t dev;
+    P9QID qid;
+    int32_t mode;
+    int32_t atime;
+    int32_t mtime;
+    int64_t length;
+    char *name;
+    char *uid;
+    char *gid;
+    char *muid;
+    char *extension;
+    int32_t n_uid;
+    int32_t n_gid;
+    int32_t n_muid;
+} P9Stat;
+
+typedef struct P9State P9State;
+
+typedef struct P9IOState P9IOState;
+
+struct P9IOState
+{
+    /* IO helpers */
+    ssize_t (*send)(P9IOState *s, const void *data, size_t size);
+    ssize_t (*recv)(P9IOState *s, void *data, size_t size);
+    void (*set_send_notify)(P9IOState *s, int enable);
+};
+
+typedef struct P9PDUState P9PDUState;
+
+struct P9PDUState
+{
+    int (*dispatch_pdu)(P9PDUState *s, P9PDU *pdu);
+};
+
+P9State *p9_init(P9IOState *iops, P9PDUState *pduops);
+
+void p9_free(P9State *s);
+
+int p9_set_msize(P9State *s, int32_t msize);
+
+int p9_send_vpduf(P9State *s, int optional, int16_t tag, int8_t type, const char *fmt, va_list ap);
+int p9_send_pduf(P9State *s, int optional, int16_t tag, int8_t type, const char *fmt, ...);
+
+int p9_notify_can_send(P9State *s);
+int p9_notify_can_recv(P9State *s);
+
+int p9pdu_vreadf(P9PDU *pdu, int optional, const char *fmt, va_list ap);
+int p9pdu_vwritef(P9PDU *pdu, int optional, const char *fmt, va_list ap);
+
+int p9pdu_readf(P9PDU *pdu, int optional, const char *fmt, ...);
+int p9pdu_writef(P9PDU *pdu, int optional, const char *fmt, ...);
+
+void p9stat_free(P9Stat *stbuf);
+
+#endif
diff --git a/p9c.c b/p9c.c
new file mode 100644
index 0000000..3e4d8be
--- /dev/null
+++ b/p9c.c
@@ -0,0 +1,437 @@
+/*
+ * 9p client library
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <string.h>
+#include <errno.h>
+#include <malloc.h>
+#include <stdbool.h>
+
+#include "p9c.h"
+#include "sys-queue.h"
+
+#define P9_MSIZE	(64 << 10)
+
+typedef struct P9Tag {
+    int8_t type;
+    int16_t tag;
+    union {
+        P9VersionFunc *version;
+        P9AuthFunc *auth;
+        P9AttachFunc *attach;
+        P9WalkFunc *walk;
+        P9OpenFunc *open;
+        P9CreateFunc *create;
+        P9StatFunc *stat;
+        P9WStatFunc *wstat;
+        P9WriteFunc *write;
+        P9ReadFunc *read;
+        P9FlushFunc *flush;
+        P9ClunkFunc *clunk;
+        void *generic;
+    } cb;
+    void *opaque;
+    TAILQ_ENTRY(P9Tag) node;
+} P9Tag;
+
+struct P9ClientState
+{
+    P9State *p9_state;
+    P9PDUState pdu_state;
+
+    bool dotu;
+
+    int max_tag;
+
+    TAILQ_HEAD(, P9Tag) inflight_requests;
+    TAILQ_HEAD(, P9Tag) tag_pool;
+};
+
+static P9Tag *p9c_alloc_tag(P9ClientState *s, int8_t type, void *cb, void *opaque)
+{
+    P9Tag *tag;
+
+    if (TAILQ_EMPTY(&s->tag_pool)) {
+        tag = malloc(sizeof(*tag));
+        if (tag == NULL)
+            return NULL;
+
+        if (s->max_tag == (1 << 16))
+            return NULL;
+
+        tag->tag = s->max_tag++;
+    } else {
+        tag = TAILQ_FIRST(&s->tag_pool);
+        TAILQ_REMOVE(&s->tag_pool, tag, node);
+    }
+
+    tag->type = type;
+    tag->cb.generic = cb;
+    tag->opaque = opaque;
+
+    return tag;
+}
+
+static P9Tag *p9c_find_tag(P9ClientState *s, int16_t tag)
+{
+    P9Tag *i;
+
+    TAILQ_FOREACH(i, &s->inflight_requests, node) {
+        if (i->tag == tag)
+            break;
+    }
+
+    if (i)
+        TAILQ_REMOVE(&s->inflight_requests, i, node);
+
+    return i;
+}
+
+static void p9c_dispatch_error(P9Tag *tag, const char *ename, int32_t ecode)
+{
+    switch (tag->type) {
+    case P9_VERSION:
+        tag->cb.version(tag->opaque, ecode, 0, NULL);
+        break;
+    case P9_AUTH:
+        tag->cb.auth(tag->opaque, ecode, NULL);
+        break;
+    case P9_ATTACH:
+        tag->cb.attach(tag->opaque, ecode, NULL);
+        break;
+    case P9_WALK:
+        tag->cb.walk(tag->opaque, ecode, 0, NULL);
+        break;
+    case P9_OPEN:
+        tag->cb.open(tag->opaque, ecode, NULL, 0);
+        break;
+    case P9_CREATE:
+        tag->cb.create(tag->opaque, ecode, NULL, 0);
+        break;
+    case P9_STAT:
+        tag->cb.stat(tag->opaque, ecode, NULL);
+        break;
+    case P9_WSTAT:
+        tag->cb.wstat(tag->opaque, ecode);
+        break;
+    case P9_WRITE:
+        tag->cb.write(tag->opaque, ecode, 0);
+        break;
+    case P9_READ:
+        tag->cb.read(tag->opaque, ecode, 0, NULL);
+        break;
+    case P9_FLUSH:
+        tag->cb.flush(tag->opaque, ecode);
+        break;
+    case P9_CLUNK:
+        tag->cb.clunk(tag->opaque, ecode);
+        break;
+    }
+}
+
+static int p9c_dispatch_pdu(P9PDUState *pdu_state, P9PDU *pdu)
+{
+    P9ClientState *s = container_of(pdu_state, P9ClientState, pdu_state);
+    int32_t size;
+    int8_t type;
+    int16_t ntag;
+    P9Tag *tag;
+    int errcode;
+
+    errcode = p9pdu_readf(pdu, s->dotu, "dbw", &size, &type, &ntag);
+    if (errcode)
+        return errcode;
+
+    tag = p9c_find_tag(s, ntag);
+    if (tag == NULL)
+        return -EFAULT;
+
+    switch (type - 1) {
+    case P9_VERSION: {
+        int32_t msize;
+        char *version = NULL;
+
+        errcode = p9pdu_readf(pdu, s->dotu, "ds", &msize, &version);
+        if (!errcode) {
+            if (strcmp(version, "9P2000.u") == 0)
+                s->dotu = true;
+            else if (strcmp(version, "9P2000") == 0)
+                s->dotu = false;
+            else
+                errcode = -EINVAL;
+        }
+
+        if (!errcode) {
+            if (msize > 24)
+                errcode = p9_set_msize(s->p9_state, msize);
+            else
+                errcode = -EFAULT;
+        }
+
+        if (!errcode)
+            errcode = tag->cb.version(tag->opaque, 0, msize, version);
+
+        free(version);
+    }   break;
+    case P9_AUTH: {
+        P9QID qid;
+
+        errcode = p9pdu_readf(pdu, s->dotu, "Q", &qid);
+        if (!errcode)
+            errcode = tag->cb.auth(tag->opaque, 0, &qid);
+    }   break;
+    case P9_ATTACH: {
+        P9QID qid;
+
+        errcode = p9pdu_readf(pdu, s->dotu, "Q", &qid);
+        if (!errcode)
+            errcode = tag->cb.attach(tag->opaque, 0, &qid);
+    }   break;
+    case P9_WALK: {
+        P9QID *wqids = NULL;
+        int16_t nwqid;
+
+        errcode = p9pdu_readf(pdu, s->dotu, "R", &nwqid, &wqids);
+        if (!errcode)
+            errcode = tag->cb.walk(tag->opaque, 0, nwqid, wqids);
+
+        free(wqids);
+    }   break;
+    case P9_OPEN: {
+        P9QID qid;
+        int32_t iounit;
+
+        errcode = p9pdu_readf(pdu, s->dotu, "Qd", &qid, &iounit);
+        if (!errcode)
+            errcode = tag->cb.open(tag->opaque, 0, &qid, iounit);
+    }   break;
+    case P9_CREATE: {
+        P9QID qid;
+        int32_t iounit;
+
+        errcode = p9pdu_readf(pdu, s->dotu, "Qd", &qid, &iounit);
+        if (!errcode)
+            errcode = tag->cb.create(tag->opaque, 0, &qid, iounit);
+    }   break;
+    case P9_STAT: {
+        P9Stat stbuf;
+
+        memset(&stbuf, 0, sizeof(stbuf));
+
+        errcode = p9pdu_readf(pdu, s->dotu, "S", &stbuf);
+        if (!errcode)
+            errcode = tag->cb.stat(tag->opaque, 0, &stbuf);
+
+        p9stat_free(&stbuf);
+    }   break;
+    case P9_WSTAT:
+        tag->cb.wstat(tag->opaque, 0);
+        break;
+    case P9_WRITE: {
+        int32_t count;
+
+        errcode = p9pdu_readf(pdu, s->dotu, "d", &count);
+        if (!errcode)
+            errcode = tag->cb.write(tag->opaque, 0, count);
+    }   break;
+    case P9_READ: {
+        int32_t count;
+        const void *data = NULL;
+
+        errcode = p9pdu_readf(pdu, s->dotu, "D", &count, &data);
+        if (!errcode)
+            errcode = tag->cb.read(tag->opaque, 0, count, data);
+    }   break;
+    case P9_FLUSH:
+        tag->cb.flush(tag->opaque, 0);
+        break;
+    case P9_CLUNK:
+        tag->cb.clunk(tag->opaque, 0);
+        break;
+    case P9_ERROR: {
+        char *ename = NULL;
+        int32_t ecode = -1;
+
+        errcode = p9pdu_readf(pdu, s->dotu, "s?d", &ename, &ecode);
+        if (!errcode)
+            p9c_dispatch_error(tag, ename, ecode);
+
+        free(ename);
+    }   break;
+    default:
+        break;
+    }
+
+    TAILQ_INSERT_HEAD(&s->tag_pool, tag, node);
+
+    return errcode;
+}
+
+void p9c_notify_can_recv(P9ClientState *s)
+{
+    p9_notify_can_recv(s->p9_state);
+}
+
+void p9c_notify_can_send(P9ClientState *s)
+{
+    p9_notify_can_send(s->p9_state);
+}
+
+P9ClientState *p9c_init(P9IOState *iops)
+{
+    P9ClientState *s;
+
+    s = malloc(sizeof(*s));
+    if (s == NULL)
+        return NULL;
+
+    s->pdu_state.dispatch_pdu = p9c_dispatch_pdu;
+    s->p9_state = p9_init(iops, &s->pdu_state);
+    if (s->p9_state == NULL) {
+        free(s);
+        return NULL;
+    }
+
+    s->dotu = false;
+    s->max_tag = 0;
+
+    TAILQ_INIT(&s->inflight_requests);
+    TAILQ_INIT(&s->tag_pool);
+
+    return s;
+}
+
+void p9c_free(P9ClientState *s)
+{
+    p9_free(s->p9_state);
+
+    while (!TAILQ_EMPTY(&s->inflight_requests)) {
+        P9Tag *node;
+        node = TAILQ_FIRST(&s->inflight_requests);
+        TAILQ_REMOVE(&s->inflight_requests, node, node);
+        p9c_dispatch_error(node, "Interrupted", EINTR);
+        free(node);
+    }
+
+    while (!TAILQ_EMPTY(&s->tag_pool)) {
+        P9Tag *node;
+        node = TAILQ_FIRST(&s->tag_pool);
+        TAILQ_REMOVE(&s->tag_pool, node, node);
+        free(node);
+    }
+
+    free(s);
+}
+
+static int p9c_send_pduf(P9ClientState *s, int8_t type,
+                         void *cb, void *opaque,
+                         const char *fmt, ...)
+{
+    P9Tag *tag;
+    int errcode;
+    va_list ap;
+
+    tag = p9c_alloc_tag(s, type, cb, opaque);
+    if (tag == NULL)
+        return -ENOMEM;
+
+    va_start(ap, fmt);
+    errcode = p9_send_vpduf(s->p9_state, s->dotu, tag->tag, type, fmt, ap);
+    va_end(ap);
+
+    if (errcode)
+        TAILQ_INSERT_HEAD(&s->tag_pool, tag, node);
+    else
+        TAILQ_INSERT_HEAD(&s->inflight_requests, tag, node);
+
+    return errcode;
+}
+
+int p9c_version(P9ClientState *s, P9VersionFunc *cb, void *opaque)
+{
+    return p9c_send_pduf(s, P9_VERSION, cb, opaque,
+                         "ds", P9_MSIZE, "9P2000.u");
+}
+
+int p9c_auth(P9ClientState *s, int32_t afid, const char *uname,
+             const char *aname, int32_t n_uname, P9AuthFunc *cb, void *opaque)
+{
+    return p9c_send_pduf(s, P9_AUTH, cb, opaque,
+                         "dss?d", afid, uname, aname, n_uname);
+}
+
+int p9c_attach(P9ClientState *s, int32_t fid, int32_t afid,
+               const char *uname, const char *aname, int32_t n_uname,
+               P9AttachFunc *cb, void *opaque)
+{
+    return p9c_send_pduf(s, P9_ATTACH, cb, opaque,
+                         "ddss?d", fid, afid, uname, aname, n_uname);
+}
+
+int p9c_walk(P9ClientState *s, int32_t fid, int32_t newfid,
+             int16_t nwname, const char **wnames,
+             P9WalkFunc *cb, void *opaque)
+{
+    return  p9c_send_pduf(s, P9_WALK, cb, opaque, 
+                          "ddT", fid, newfid, nwname, wnames);
+}
+
+int p9c_open(P9ClientState *s, int32_t fid, int8_t mode,
+             P9OpenFunc *cb, void *opaque)
+{
+    return p9c_send_pduf(s, P9_OPEN, cb, opaque,
+                         "db", fid, mode);
+}
+
+int p9c_create(P9ClientState *s, int32_t fid, const char *name, int32_t perm,
+               int8_t mode, const char *extension, P9OpenFunc *cb, void *opaque)
+{
+    return p9c_send_pduf(s, P9_CREATE, cb, opaque,
+                         "dsdb?s", fid, name, perm, mode, extension);
+}
+
+int p9c_stat(P9ClientState *s, int32_t fid, P9StatFunc *cb, void *opaque)
+{
+    return p9c_send_pduf(s, P9_STAT, cb, opaque, "d", fid);
+}
+
+int p9c_wstat(P9ClientState *s, int32_t fid, const P9Stat *stbuf,
+              P9StatFunc *cb, void *opaque)
+{
+    return p9c_send_pduf(s, P9_WSTAT, cb, opaque,
+                         "dwS", fid, 0, stbuf);
+}
+
+int p9c_write(P9ClientState *s, int32_t fid, int64_t offset,
+              int32_t count, const void *data,
+              P9WriteFunc *cb, void *opaque)
+{
+    return p9c_send_pduf(s, P9_WRITE, cb, opaque,
+                         "dqD", fid, offset, count, data);
+}
+
+int p9c_read(P9ClientState *s, int32_t fid, int64_t offset,
+             int32_t count, P9ReadFunc *cb, void *opaque)
+{
+    return p9c_send_pduf(s, P9_READ, cb, opaque,
+                         "dqd", fid, offset, count);
+}
+
+int p9c_flush(P9ClientState *s, int16_t oldtag, P9FlushFunc *cb, void *opaque)
+{
+    return p9c_send_pduf(s, P9_FLUSH, cb, opaque, "d", oldtag);
+}
+
+int p9c_clunk(P9ClientState *s, int32_t fid, P9ClunkFunc *cb, void *opaque)
+{
+    return p9c_send_pduf(s, P9_CLUNK, cb, opaque, "d", fid);
+}
diff --git a/p9c.h b/p9c.h
new file mode 100644
index 0000000..32a805f
--- /dev/null
+++ b/p9c.h
@@ -0,0 +1,81 @@
+/*
+ * 9p client library
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef P9C_H
+#define P9C_H
+
+#include "p9.h"
+
+typedef struct P9ClientState P9ClientState;
+
+typedef int (P9VersionFunc)(void *opaque, int ret, int32_t msize,
+                            const char *version);
+typedef int (P9AuthFunc)(void *opaque, int ret, const P9QID *qid);
+typedef int (P9AttachFunc)(void *opaque, int ret, const P9QID *qid);
+typedef int (P9WalkFunc)(void *opaque, int ret, int16_t nwqid,
+                         const P9QID *wqids);
+typedef int (P9OpenFunc)(void *opaque, int ret, const P9QID *qid,
+                         int32_t iounit);
+typedef int (P9CreateFunc)(void *opaque, int ret, const P9QID *qid,
+                           int32_t iounit);
+typedef int (P9StatFunc)(void *opaque, int ret, const P9Stat *stbuf);
+typedef int (P9WStatFunc)(void *opaque, int ret);
+typedef int (P9WriteFunc)(void *opaque, int ret, int32_t count);
+typedef int (P9ReadFunc)(void *opaque, int ret, int32_t count,
+                         const void *data);
+typedef int (P9FlushFunc)(void *opaque, int ret);
+typedef int (P9ClunkFunc)(void *opaque, int ret);
+
+P9ClientState *p9c_init(P9IOState *ops);
+
+void p9c_notify_can_send(P9ClientState *s);
+
+void p9c_notify_can_recv(P9ClientState *s);
+
+void p9c_free(P9ClientState *s);
+
+/* client messages */
+
+int p9c_version(P9ClientState *s, P9VersionFunc *cb, void *opaque);
+
+int p9c_auth(P9ClientState *s, int32_t afid, const char *uname,
+             const char *aname, int32_t n_uname, P9AuthFunc *cb, void *opaque);
+
+int p9c_attach(P9ClientState *s, int32_t fid, int32_t afid, const char *uname,
+               const char *aname, int32_t n_uname, P9AttachFunc *cb, void *opaque);
+
+int p9c_walk(P9ClientState *s, int32_t fid, int32_t newfid, int16_t nwname,
+             const char **wnames, P9WalkFunc *cb, void *opaque);
+
+int p9c_open(P9ClientState *s, int32_t fid, int8_t mode,
+             P9OpenFunc *cb, void *opaque);
+
+int p9c_create(P9ClientState *s, int32_t fid, const char *name, int32_t perm,
+               int8_t mode, const char *extension, P9OpenFunc *cb, void *opaque);
+
+int p9c_stat(P9ClientState *s, int32_t fid, P9StatFunc *cb, void *opaque);
+
+int p9c_wstat(P9ClientState *s, int32_t fid, const P9Stat *stbuf,
+              P9StatFunc *cb, void *opaque);
+
+int p9c_write(P9ClientState *s, int32_t fid, int64_t offset, int32_t count,
+              const void *data, P9WriteFunc *cb, void *opaque);
+
+int p9c_read(P9ClientState *s, int32_t fid, int64_t offset, int32_t count,
+             P9ReadFunc *cb, void *opaque);
+
+int p9c_flush(P9ClientState *s, int16_t oldtag, P9FlushFunc *cb, void *opaque);
+
+int p9c_clunk(P9ClientState *s, int32_t fid, P9ClunkFunc *cb, void *opaque);
+
+#endif

^ permalink raw reply related	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-24 19:38                     ` Anthony Liguori
@ 2010-05-25  9:14                       ` Avi Kivity
  2010-05-25 13:17                         ` Anthony Liguori
  0 siblings, 1 reply; 64+ messages in thread
From: Avi Kivity @ 2010-05-25  9:14 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner,
	MORITA Kazutaka

On 05/24/2010 10:38 PM, Anthony Liguori wrote:
>
>> - Building a plugin API seems a bit simpler to me, although I'm not sure
>>   if I got the idea correctly:
>>   The block layer already has some kind of API (.bdrv_file_open,
>>   .bdrv_read). We could simply compile the block drivers as shared
>>   objects and create a method for loading the necessary modules at
>>   runtime.
>
> That approach would be a recipe for disaster.   We would have to 
> introduce a new, reduced functionality block API that was supported 
> for plugins.  Otherwise, the only way a plugin could keep up with our 
> API changes would be if it was in tree which defeats the purpose of 
> having plugins.

We could guarantee API/ABI stability in a stable branch but not across 
releases.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-24 19:16                 ` Anthony Liguori
@ 2010-05-25  9:19                   ` Avi Kivity
  2010-05-25 13:26                   ` MORITA Kazutaka
  1 sibling, 0 replies; 64+ messages in thread
From: Avi Kivity @ 2010-05-25  9:19 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner,
	MORITA Kazutaka

On 05/24/2010 10:16 PM, Anthony Liguori wrote:
> On 05/24/2010 06:56 AM, Avi Kivity wrote:
>> On 05/24/2010 02:42 PM, MORITA Kazutaka wrote:
>>>
>>>> The server would be local and talk over a unix domain socket, perhaps
>>>> anonymous.
>>>>
>>>> nbd has other issues though, such as requiring a copy and no 
>>>> support for
>>>> metadata operations such as snapshot and file size extension.
>>>>
>>> Sorry, my explanation was unclear.  I'm not sure how running servers
>>> on localhost can solve the problem.
>>
>> The local server can convert from the local (nbd) protocol to the 
>> remote (sheepdog, ceph) protocol.
>>
>>> What I wanted to say was that we cannot specify the image of a VM. With
>>> nbd protocol, command line arguments are as follows:
>>>
>>>   $ qemu nbd:hostname:port
>>>
>>> As this syntax shows, with nbd protocol the client cannot pass the VM
>>> image name to the server.
>>
>> We would extend it to allow it to connect to a unix domain socket:
>>
>>   qemu nbd:unix:/path/to/socket
>
> nbd is a no-go because it only supports a single, synchronous I/O 
> operation at a time and has no mechanism for extensibility.
>
> If we go this route, I think two options are worth considering.  The 
> first would be a purely socket based approach where we just accepted 
> the extra copy.
>
> The other potential approach would be shared memory based.  We export 
> all guest ram as shared memory along with a small bounce buffer pool.  
> We would then use a ring queue (potentially even using virtio-blk) and 
> an eventfd for notification.

We can't actually export guest memory unless we allocate it as a shared 
memory object, which has many disadvantages.  The only way to export 
anonymous memory now is vmsplice(), which is fairly limited.
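
(Purely for illustration, allocating guest RAM as a shared memory object
instead of anonymous memory would look roughly like the sketch below; the
resulting fd is what could then be handed to an external block daemon.  This
is the alternative being described as having many disadvantages, not a
proposed change; the shm name is made up.)

    #include <fcntl.h>
    #include <sys/mman.h>
    #include <unistd.h>

    /* Back "guest RAM" with a named shared memory object so another process
     * can map it; the caller checks the result against MAP_FAILED. */
    static void *alloc_shared_ram(size_t size, int *fd_out)
    {
        int fd = shm_open("/qemu-ram-XXXX", O_CREAT | O_EXCL | O_RDWR, 0600);

        if (fd < 0)
            return MAP_FAILED;
        shm_unlink("/qemu-ram-XXXX");    /* reachable only via the fd now */
        if (ftruncate(fd, size) < 0) {
            close(fd);
            return MAP_FAILED;
        }
        *fd_out = fd;                    /* pass this fd to the block daemon */
        return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    }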


>
>> The server at the other end would associate the socket with a 
>> filename and forward it to the server using the remote protocol.
>>
>> However, I don't think nbd would be a good protocol.  My preference 
>> would be for a plugin API, or for a new local protocol that uses 
>> splice() to avoid copies.
>
> I think a good shared memory implementation would be preferable to 
> plugins.  I think it's worth attempting to do a plugin interface for 
> the block layer but I strongly suspect it would not be sufficient.
>
> I would not want to see plugins that interacted with BlockDriverState 
> directly, for instance.  We change it far too often.  Our main loop 
> functions are also not terribly stable so I'm not sure how we would 
> handle that (unless we forced all block plugins to be in a separate 
> thread).

If we manage to make a good long-term stable plugin API, it would be a 
good candidate for the block layer itself.

Some OSes manage to have a stable block driver ABI, so it should be 
possible, if difficult.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-24 19:19             ` Anthony Liguori
@ 2010-05-25  9:22               ` Avi Kivity
  0 siblings, 0 replies; 64+ messages in thread
From: Avi Kivity @ 2010-05-25  9:22 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: kvm, Stefan Hajnoczi, qemu-devel, Blue Swirl, ceph-devel,
	Christian Brunner

On 05/24/2010 10:19 PM, Anthony Liguori wrote:
> On 05/24/2010 06:03 AM, Avi Kivity wrote:
>> On 05/24/2010 11:27 AM, Stefan Hajnoczi wrote:
>>> On Sun, May 23, 2010 at 1:01 PM, Avi Kivity<avi@redhat.com>  wrote:
>>>> On 05/21/2010 12:29 AM, Anthony Liguori wrote:
>>>>> I'd be more interested in enabling people to build these types of 
>>>>> storage
>>>>> systems without touching qemu.
>>>>>
>>>>> Both sheepdog and ceph ultimately transmit I/O over a socket to a 
>>>>> central
>>>>> daemon, right?
>>>> That incurs an extra copy.
>>> Besides a shared memory approach, I wonder if the splice() family of
>>> syscalls could be used to send/receive data through a storage daemon
>>> without the daemon looking at or copying the data?
>>
>> Excellent idea.
>
> splice() eventually requires a copy.  You cannot splice() to linux-aio 
> so you'd have to splice() to a temporary buffer and then call into 
> linux-aio.  With shared memory, you can avoid ever bringing the data 
> into memory via O_DIRECT and linux-aio.

If the final destination is a socket, then you end up queuing guest 
memory as an skbuff.  In theory we could do an aio splice to block 
devices but I don't think that's realistic given our experience with aio 
changes.
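
(For reference, the zero-copy path under discussion is roughly the
following: gift the guest pages into a pipe with vmsplice() and move them on
to the daemon's socket with splice().  Illustrative sketch only; partial
transfers and error-path cleanup are ignored.)

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/uio.h>

    /* Push a guest buffer to the storage daemon's socket without copying it
     * through userspace; the pages end up queued as skbuffs, as noted above. */
    static int send_zero_copy(int sock_fd, void *guest_buf, size_t len)
    {
        struct iovec iov = { .iov_base = guest_buf, .iov_len = len };
        int p[2];

        if (pipe(p) < 0)
            return -1;
        if (vmsplice(p[1], &iov, 1, SPLICE_F_GIFT) < 0)      /* pages -> pipe */
            return -1;
        if (splice(p[0], NULL, sock_fd, NULL, len, SPLICE_F_MOVE) < 0)
            return -1;                                       /* pipe -> socket */
        close(p[0]);
        close(p[1]);
        return 0;
    }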

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-23 12:01       ` Avi Kivity
  2010-05-24  7:12         ` MORITA Kazutaka
  2010-05-24  8:27         ` Stefan Hajnoczi
@ 2010-05-25 11:02         ` Kevin Wolf
  2010-05-25 11:25           ` Avi Kivity
  2 siblings, 1 reply; 64+ messages in thread
From: Kevin Wolf @ 2010-05-25 11:02 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner

Am 23.05.2010 14:01, schrieb Avi Kivity:
> On 05/21/2010 12:29 AM, Anthony Liguori wrote:
>>
>> I'd be more interested in enabling people to build these types of 
>> storage systems without touching qemu.
>>
>> Both sheepdog and ceph ultimately transmit I/O over a socket to a 
>> central daemon, right? 
> 
> That incurs an extra copy.
> 
>> So could we not standardize a protocol for this that both sheepdog and 
>> ceph could implement?
> 
> The protocol already exists, nbd.  It doesn't support snapshotting etc. 
> but we could extend it.
> 
> But IMO what's needed is a plugin API for the block layer.

What would it buy us, apart from more downstreams and having to maintain
a stable API and ABI? Hiding block drivers somewhere else doesn't make
them stop existing, they just might not be properly integrated, but
rather hacked in to fit that limited stable API.

Kevin

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 11:02         ` Kevin Wolf
@ 2010-05-25 11:25           ` Avi Kivity
  2010-05-25 12:03             ` Christoph Hellwig
  2010-05-25 13:25             ` Anthony Liguori
  0 siblings, 2 replies; 64+ messages in thread
From: Avi Kivity @ 2010-05-25 11:25 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner

On 05/25/2010 02:02 PM, Kevin Wolf wrote:
>
>>
>>> So could we not standardize a protocol for this that both sheepdog and
>>> ceph could implement?
>>>        
>> The protocol already exists, nbd.  It doesn't support snapshotting etc.
>> but we could extend it.
>>
>> But IMO what's needed is a plugin API for the block layer.
>>      
> What would it buy us, apart from more downstreams and having to maintain
> a stable API and ABI?

Currently if someone wants to add a new block format, they have to 
upstream it and wait for a new qemu to be released.  With a plugin API, 
they can add a new block format to an existing, supported qemu.

> Hiding block drivers somewhere else doesn't make
> them stop existing, they just might not be properly integrated, but
> rather hacked in to fit that limited stable API.
>    

They would hack it to fit the current API, and hack the API in qemu.git 
to fit their requirements for the next release.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 11:25           ` Avi Kivity
@ 2010-05-25 12:03             ` Christoph Hellwig
  2010-05-25 12:13               ` Avi Kivity
  2010-05-25 13:25             ` Anthony Liguori
  1 sibling, 1 reply; 64+ messages in thread
From: Christoph Hellwig @ 2010-05-25 12:03 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Kevin Wolf, kvm, qemu-devel, Blue Swirl, ceph-devel,
	Christian Brunner

On Tue, May 25, 2010 at 02:25:53PM +0300, Avi Kivity wrote:
> Currently if someone wants to add a new block format, they have to  
> upstream it and wait for a new qemu to be released.  With a plugin API,  
> they can add a new block format to an existing, supported qemu.

So?  Unless we want a stable driver ABI which I fundamentally oppose as
it would make block driver development hell they'd have to wait for
a new release of the block layer.  It's really just going to be a lot
of pain for no major gain.  qemu releases are frequent enough, and if
users care enough they can also easily patch qemu.

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 12:03             ` Christoph Hellwig
@ 2010-05-25 12:13               ` Avi Kivity
  0 siblings, 0 replies; 64+ messages in thread
From: Avi Kivity @ 2010-05-25 12:13 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Kevin Wolf, kvm, qemu-devel, Blue Swirl, ceph-devel,
	Christian Brunner

On 05/25/2010 03:03 PM, Christoph Hellwig wrote:
> On Tue, May 25, 2010 at 02:25:53PM +0300, Avi Kivity wrote:
>    
>> Currently if someone wants to add a new block format, they have to
>> upstream it and wait for a new qemu to be released.  With a plugin API,
>> they can add a new block format to an existing, supported qemu.
>>      
> So?  Unless we want a stable driver ABI which I fundamentally oppose as
> it would make block driver development hell

We'd only freeze it for a major release.

> they'd have to wait for
> a new release of the block layer.  It's really just going to be a lot
> of pain for no major gain.  qemu releases are frequent enough, and if
> users care enough they can also easily patch qemu.
>    

May not be so easy for them, they lose binary updates from their distro 
and have to keep repatching.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25  9:14                       ` Avi Kivity
@ 2010-05-25 13:17                         ` Anthony Liguori
  2010-05-25 13:25                           ` Avi Kivity
  0 siblings, 1 reply; 64+ messages in thread
From: Anthony Liguori @ 2010-05-25 13:17 UTC (permalink / raw)
  To: Avi Kivity
  Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner,
	MORITA Kazutaka

On 05/25/2010 04:14 AM, Avi Kivity wrote:
> On 05/24/2010 10:38 PM, Anthony Liguori wrote:
>>
>>> - Building a plugin API seems a bit simpler to me, although I'm not
>>> sure if I get the
>>>    idea correctly:
>>>    The block layer has already some kind of api (.bdrv_file_open, 
>>> .bdrv_read). We
>>>    could simply compile the block-drivers as shared objects and 
>>> create a method
>>>    for loading the necessary modules at runtime.
>>
>> That approach would be a recipe for disaster.   We would have to 
>> introduce a new, reduced functionality block API that was supported 
>> for plugins.  Otherwise, the only way a plugin could keep up with our 
>> API changes would be if it was in tree which defeats the purpose of 
>> having plugins.
>
> We could guarantee API/ABI stability in a stable branch but not across 
> releases.

We have releases every six months.  There would be tons of block plugins 
that didn't work for random sets of releases.  That creates a lot of 
user confusion and unhappiness.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 11:25           ` Avi Kivity
  2010-05-25 12:03             ` Christoph Hellwig
@ 2010-05-25 13:25             ` Anthony Liguori
  2010-05-25 13:31               ` Avi Kivity
  2010-05-25 13:53               ` Kevin Wolf
  1 sibling, 2 replies; 64+ messages in thread
From: Anthony Liguori @ 2010-05-25 13:25 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Kevin Wolf, kvm, qemu-devel, Blue Swirl, ceph-devel,
	Christian Brunner

On 05/25/2010 06:25 AM, Avi Kivity wrote:
> On 05/25/2010 02:02 PM, Kevin Wolf wrote:
>>
>>>
>>>> So could we not standardize a protocol for this that both sheepdog and
>>>> ceph could implement?
>>> The protocol already exists, nbd.  It doesn't support snapshotting etc.
>>> but we could extend it.
>>>
>>> But IMO what's needed is a plugin API for the block layer.
>> What would it buy us, apart from more downstreams and having to maintain
>> a stable API and ABI?
>
> Currently if someone wants to add a new block format, they have to 
> upstream it and wait for a new qemu to be released.  With a plugin 
> API, they can add a new block format to an existing, supported qemu.

Whether we have a plugin or protocol based mechanism to implement block 
formats really ends up being just an implementation detail.

In order to implement either, we need to take a subset of block 
functionality that we feel we can support long term and expose that.  
Right now, that's basically just querying characteristics (like size and 
geometry) and asynchronous reads and writes.
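
(Purely as an illustration of how small that supportable subset would be --
this is not something proposed in the thread, and all names are made up:)

    #include <stdint.h>
    #include <sys/uio.h>

    /* Hypothetical reduced interface: characteristics plus async I/O only. */
    typedef void (*BlockExtCompletion)(void *opaque, int ret);

    typedef struct BlockExtOps {
        int     (*open)(void **state, const char *filename, int flags);
        void    (*close)(void *state);
        int64_t (*getlength)(void *state);            /* size in bytes */
        void    (*get_geometry)(void *state, int *cyls, int *heads, int *secs);
        int     (*aio_readv)(void *state, int64_t sector_num, struct iovec *iov,
                             int niov, BlockExtCompletion cb, void *opaque);
        int     (*aio_writev)(void *state, int64_t sector_num, struct iovec *iov,
                              int niov, BlockExtCompletion cb, void *opaque);
    } BlockExtOps;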

A protocol based mechanism has the advantage of being more robust in the 
face of poorly written block backends so if it's possible to make it 
perform as well as a plugin, it's a preferable approach.

Plugins that just expose chunks of QEMU internal state directly (like 
BlockDriver) are a really bad idea IMHO.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 13:17                         ` Anthony Liguori
@ 2010-05-25 13:25                           ` Avi Kivity
  2010-05-25 13:29                             ` Anthony Liguori
  2010-05-25 14:01                             ` Kevin Wolf
  0 siblings, 2 replies; 64+ messages in thread
From: Avi Kivity @ 2010-05-25 13:25 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner,
	MORITA Kazutaka

On 05/25/2010 04:17 PM, Anthony Liguori wrote:
> On 05/25/2010 04:14 AM, Avi Kivity wrote:
>> On 05/24/2010 10:38 PM, Anthony Liguori wrote:
>>>
>>>> - Building a plugin API seems a bit simpler to me, although I'm not
>>>> sure if I get the
>>>>    idea correctly:
>>>>    The block layer has already some kind of api (.bdrv_file_open, 
>>>> .bdrv_read). We
>>>>    could simply compile the block-drivers as shared objects and 
>>>> create a method
>>>>    for loading the necessary modules at runtime.
>>>
>>> That approach would be a recipe for disaster.   We would have to 
>>> introduce a new, reduced functionality block API that was supported 
>>> for plugins.  Otherwise, the only way a plugin could keep up with 
>>> our API changes would be if it was in tree which defeats the purpose 
>>> of having plugins.
>>
>> We could guarantee API/ABI stability in a stable branch but not 
>> across releases.
>
> We have releases every six months.  There would be tons of block 
> plugins that didn't work for random sets of releases.  That creates a 
> lot of user confusion and unhappiness.

The current situation is that those block format drivers only exist in 
qemu.git or as patches.  Surely that's even more unhappiness.

Confusion could be mitigated:

   $ qemu -module my-fancy-block-format-driver.so
   my-fancy-block-format-driver.so does not support this version of qemu 
(0.19.2).  Please contact my-fancy-block-format-driver-devel@example.org.

The question is how many such block format drivers we expect.  We now 
have two in the pipeline (ceph, sheepdog), it's reasonable to assume 
we'll want an lvm2 driver and btrfs driver.  This is an area with a lot 
of activity and a relatively simple interface.
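
(A rough sketch of how the -module check above could be implemented;
"qemu_block_module_version" is a made-up symbol name, and QEMU_VERSION is
assumed to be the usual version string macro.)

    #include <dlfcn.h>
    #include <stdio.h>
    #include <string.h>

    static int load_block_module(const char *path)
    {
        void *handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
        const char **mod_version;

        if (!handle) {
            fprintf(stderr, "%s: %s\n", path, dlerror());
            return -1;
        }
        mod_version = dlsym(handle, "qemu_block_module_version");
        if (!mod_version || strcmp(*mod_version, QEMU_VERSION) != 0) {
            fprintf(stderr, "%s does not support this version of qemu (%s).\n",
                    path, QEMU_VERSION);
            dlclose(handle);
            return -1;
        }
        return 0;   /* module then registers its BlockDriver(s) via a hook */
    }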

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-24 19:16                 ` Anthony Liguori
  2010-05-25  9:19                   ` Avi Kivity
@ 2010-05-25 13:26                   ` MORITA Kazutaka
  1 sibling, 0 replies; 64+ messages in thread
From: MORITA Kazutaka @ 2010-05-25 13:26 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: kvm, qemu-devel, Blue Swirl, Avi Kivity, ceph-devel,
	Christian Brunner, MORITA Kazutaka

At Mon, 24 May 2010 14:16:32 -0500,
Anthony Liguori wrote:
> 
> On 05/24/2010 06:56 AM, Avi Kivity wrote:
> > On 05/24/2010 02:42 PM, MORITA Kazutaka wrote:
> >>
> >>> The server would be local and talk over a unix domain socket, perhaps
> >>> anonymous.
> >>>
> >>> nbd has other issues though, such as requiring a copy and no support 
> >>> for
> >>> metadata operations such as snapshot and file size extension.
> >>>
> >> Sorry, my explanation was unclear.  I'm not sure how running servers
> >> on localhost can solve the problem.
> >
> > The local server can convert from the local (nbd) protocol to the 
> > remote (sheepdog, ceph) protocol.
> >
> >> What I wanted to say was that we cannot specify the image of a VM. With
> >> nbd protocol, command line arguments are as follows:
> >>
> >>   $ qemu nbd:hostname:port
> >>
> >> As this syntax shows, with nbd protocol the client cannot pass the VM
> >> image name to the server.
> >
> > We would extend it to allow it to connect to a unix domain socket:
> >
> >   qemu nbd:unix:/path/to/socket
> 
> nbd is a no-go because it only supports a single, synchronous I/O 
> operation at a time and has no mechanism for extensibility.
> 
> If we go this route, I think two options are worth considering.  The 
> first would be a purely socket based approach where we just accepted the 
> extra copy.
> 
> The other potential approach would be shared memory based.  We export 
> all guest ram as shared memory along with a small bounce buffer pool.  
> We would then use a ring queue (potentially even using virtio-blk) and 
> an eventfd for notification.
> 

The shared memory approach assumes that there is a local server that
can talk to the storage system.  But Ceph doesn't require a local
server, and Sheepdog will be extended to support VMs running outside
the storage system.  We could run a local daemon that only acts as a
proxy, but I don't think that is a clean approach.  So I think a
socket-based approach is the right way to go.

BTW, is it required to design a common interface?  The way Sheepdog
replicates data is different from Ceph, so, as Christian says, I don't
think it is possible to define a common protocol.

Regards,

Kazutaka

> > The server at the other end would associate the socket with a filename 
> > and forward it to the server using the remote protocol.
> >
> > However, I don't think nbd would be a good protocol.  My preference 
> > would be for a plugin API, or for a new local protocol that uses 
> > splice() to avoid copies.
> 
> I think a good shared memory implementation would be preferable to 
> plugins.  I think it's worth attempting to do a plugin interface for the 
> block layer but I strongly suspect it would not be sufficient.
> 
> I would not want to see plugins that interacted with BlockDriverState 
> directly, for instance.  We change it far too often.  Our main loop 
> functions are also not terribly stable so I'm not sure how we would 
> handle that (unless we forced all block plugins to be in a separate thread).
> 

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 13:25                           ` Avi Kivity
@ 2010-05-25 13:29                             ` Anthony Liguori
  2010-05-25 13:36                               ` Avi Kivity
  2010-05-25 14:01                             ` Kevin Wolf
  1 sibling, 1 reply; 64+ messages in thread
From: Anthony Liguori @ 2010-05-25 13:29 UTC (permalink / raw)
  To: Avi Kivity
  Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner,
	MORITA Kazutaka

On 05/25/2010 08:25 AM, Avi Kivity wrote:
> On 05/25/2010 04:17 PM, Anthony Liguori wrote:
>> On 05/25/2010 04:14 AM, Avi Kivity wrote:
>>> On 05/24/2010 10:38 PM, Anthony Liguori wrote:
>>>>
>>>>> - Building a plugin API seems a bit simpler to me, although I'm not
>>>>> sure if I get the
>>>>>    idea correctly:
>>>>>    The block layer has already some kind of api (.bdrv_file_open, 
>>>>> .bdrv_read). We
>>>>>    could simply compile the block-drivers as shared objects and 
>>>>> create a method
>>>>>    for loading the necessary modules at runtime.
>>>>
>>>> That approach would be a recipe for disaster.   We would have to 
>>>> introduce a new, reduced functionality block API that was supported 
>>>> for plugins.  Otherwise, the only way a plugin could keep up with 
>>>> our API changes would be if it was in tree which defeats the 
>>>> purpose of having plugins.
>>>
>>> We could guarantee API/ABI stability in a stable branch but not 
>>> across releases.
>>
>> We have releases every six months.  There would be tons of block 
>> plugins that didn't work for random sets of releases.  That creates a 
>> lot of user confusion and unhappiness.
>
> The current situation is that those block format drivers only exist in 
> qemu.git or as patches.  Surely that's even more unhappiness.
>
> Confusion could be mitigated:
>
>   $ qemu -module my-fancy-block-format-driver.so
>   my-fancy-block-format-driver.so does not support this version of 
> qemu (0.19.2).  Please contact 
> my-fancy-block-format-driver-devel@example.org.
>
> The question is how many such block format drivers we expect.  We now 
> have two in the pipeline (ceph, sheepdog), it's reasonable to assume 
> we'll want an lvm2 driver and btrfs driver.  This is an area with a 
> lot of activity and a relatively simple interface.

If we expose a simple interface, I'm all for it.  But BlockDriver is not 
simple and things like the snapshoting API need love.

Of course, there's certainly a question of why we're solving this in 
qemu at all.  Wouldn't it be more appropriate to either (1) implement a 
kernel module for ceph/sheepdog if performance matters or (2) implement 
BUSE to complement FUSE and CUSE to enable proper userspace block devices.

If you want to use a block device within qemu, you almost certainly want 
to be able to manipulate it on the host using standard tools (like mount 
and parted) so it stands to reason that addressing this in the kernel 
makes more sense.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 13:25             ` Anthony Liguori
@ 2010-05-25 13:31               ` Avi Kivity
  2010-05-25 13:35                 ` Anthony Liguori
  2010-05-25 13:53               ` Kevin Wolf
  1 sibling, 1 reply; 64+ messages in thread
From: Avi Kivity @ 2010-05-25 13:31 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Kevin Wolf, kvm, qemu-devel, Blue Swirl, ceph-devel,
	Christian Brunner

On 05/25/2010 04:25 PM, Anthony Liguori wrote:
>> Currently if someone wants to add a new block format, they have to 
>> upstream it and wait for a new qemu to be released.  With a plugin 
>> API, they can add a new block format to an existing, supported qemu.
>
>
> Whether we have a plugin or protocol based mechanism to implement 
> block formats really ends up being just an implementation detail.

True.

> In order to implement either, we need to take a subset of block 
> functionality that we feel we can support long term and expose that.  
> Right now, that's basically just querying characteristics (like size 
> and geometry) and asynchronous reads and writes.

Unfortunately, you're right.

> A protocol based mechanism has the advantage of being more robust in 
> the face of poorly written block backends so if it's possible to make 
> it perform as well as a plugin, it's a preferable approach.

May be hard due to difficulty of exposing guest memory.

>
> Plugins that just expose chunks of QEMU internal state directly (like 
> BlockDriver) are a really bad idea IMHO.

Also, we don't want to expose all of the qemu API.  We should default 
the visibility attribute to "hidden" and expose only select functions, 
perhaps under their own interface.  And no inlines.
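
(Concretely, with gcc that means building with -fvisibility=hidden and
whitelisting only the sanctioned entry points; the function names below are
invented for illustration.)

    #include <stdint.h>
    #include <sys/uio.h>

    /* Compile qemu with -fvisibility=hidden so nothing is exported by
     * default, then mark just the plugin-facing interface as visible: */
    #define QEMU_PLUGIN_API __attribute__((visibility("default")))

    QEMU_PLUGIN_API int64_t qemu_plugin_getlength(void *bs);
    QEMU_PLUGIN_API int qemu_plugin_aio_readv(void *bs, int64_t sector_num,
                                              struct iovec *iov, int niov,
                                              void (*cb)(void *opaque, int ret),
                                              void *opaque);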

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 13:31               ` Avi Kivity
@ 2010-05-25 13:35                 ` Anthony Liguori
  2010-05-25 13:38                   ` Avi Kivity
  0 siblings, 1 reply; 64+ messages in thread
From: Anthony Liguori @ 2010-05-25 13:35 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Kevin Wolf, kvm, qemu-devel, Blue Swirl, ceph-devel,
	Christian Brunner

On 05/25/2010 08:31 AM, Avi Kivity wrote:
>> A protocol based mechanism has the advantage of being more robust in 
>> the face of poorly written block backends so if it's possible to make 
>> it perform as well as a plugin, it's a preferable approach.
>
> May be hard due to difficulty of exposing guest memory.

If someone did a series to add plugins, I would expect a very strong 
argument as to why a shared memory mechanism was not possible or at 
least plausible.

I'm not sure I understand why shared memory is such a bad thing wrt 
KVM.  Can you elaborate?  Is it simply a matter of fork()?

>>
>> Plugins that just expose chunks of QEMU internal state directly (like 
>> BlockDriver) are a really bad idea IMHO.
>
> Also, we don't want to expose all of the qemu API.  We should default 
> the visibility attribute to "hidden" and expose only select functions, 
> perhaps under their own interface.  And no inlines.

Yeah, if we did plugins, this would be a key requirement.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 13:29                             ` Anthony Liguori
@ 2010-05-25 13:36                               ` Avi Kivity
  2010-05-25 13:54                                 ` Anthony Liguori
  0 siblings, 1 reply; 64+ messages in thread
From: Avi Kivity @ 2010-05-25 13:36 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner,
	MORITA Kazutaka

On 05/25/2010 04:29 PM, Anthony Liguori wrote:
>> The current situation is that those block format drivers only exist 
>> in qemu.git or as patches.  Surely that's even more unhappiness.
>>
>> Confusion could be mitigated:
>>
>>   $ qemu -module my-fancy-block-format-driver.so
>>   my-fancy-block-format-driver.so does not support this version of 
>> qemu (0.19.2).  Please contact 
>> my-fancy-block-format-driver-devel@example.org.
>>
>> The question is how many such block format drivers we expect.  We now 
>> have two in the pipeline (ceph, sheepdog), it's reasonable to assume 
>> we'll want an lvm2 driver and btrfs driver.  This is an area with a 
>> lot of activity and a relatively simple interface.
>
>
> If we expose a simple interface, I'm all for it.  But BlockDriver is 
> not simple and things like the snapshoting API need love.
>
> Of course, there's certainly a question of why we're solving this in 
> qemu at all.  Wouldn't it be more appropriate to either (1) implement 
> a kernel module for ceph/sheepdog if performance matters 

We'd need a kernel-level generic snapshot API for this eventually.

> or (2) implement BUSE to complement FUSE and CUSE to enable proper 
> userspace block devices.

Likely slow due to lots of copying.  Also needs a snapshot API.

(ABUSE was proposed a while ago by Zach).

> If you want to use a block device within qemu, you almost certainly 
> want to be able to manipulate it on the host using standard tools 
> (like mount and parted) so it stands to reason that addressing this in 
> the kernel makes more sense.

qemu-nbd also allows this.

This reasoning also applies to qcow2, btw.
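
(For example, something along these lines exposes an image to the host's
standard tools; exact options depend on the qemu-nbd version:)

   $ modprobe nbd
   $ qemu-nbd --connect=/dev/nbd0 guest.img
   $ mount /dev/nbd0 /mnt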

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 13:35                 ` Anthony Liguori
@ 2010-05-25 13:38                   ` Avi Kivity
  2010-05-25 13:55                     ` Anthony Liguori
  0 siblings, 1 reply; 64+ messages in thread
From: Avi Kivity @ 2010-05-25 13:38 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Kevin Wolf, kvm, qemu-devel, Blue Swirl, ceph-devel,
	Christian Brunner

On 05/25/2010 04:35 PM, Anthony Liguori wrote:
> On 05/25/2010 08:31 AM, Avi Kivity wrote:
>>> A protocol based mechanism has the advantage of being more robust in 
>>> the face of poorly written block backends so if it's possible to 
>>> make it perform as well as a plugin, it's a preferable approach.
>>
>> May be hard due to difficulty of exposing guest memory.
>
> If someone did a series to add plugins, I would expect a very strong 
> argument as to why a shared memory mechanism was not possible or at 
> least plausible.
>
> I'm not sure I understand why shared memory is such a bad thing wrt 
> KVM.  Can you elaborate?  Is it simply a matter of fork()?

fork() doesn't work in the presence of memory hotplug.  What else is there?

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 13:25             ` Anthony Liguori
  2010-05-25 13:31               ` Avi Kivity
@ 2010-05-25 13:53               ` Kevin Wolf
  2010-05-25 13:55                 ` Avi Kivity
  1 sibling, 1 reply; 64+ messages in thread
From: Kevin Wolf @ 2010-05-25 13:53 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: kvm, qemu-devel, Blue Swirl, Avi Kivity, ceph-devel,
	Christian Brunner

Am 25.05.2010 15:25, schrieb Anthony Liguori:
> On 05/25/2010 06:25 AM, Avi Kivity wrote:
>> On 05/25/2010 02:02 PM, Kevin Wolf wrote:
>>>
>>>>
>>>>> So could we not standardize a protocol for this that both sheepdog and
>>>>> ceph could implement?
>>>> The protocol already exists, nbd.  It doesn't support snapshotting etc.
>>>> but we could extend it.
>>>>
>>>> But IMO what's needed is a plugin API for the block layer.
>>> What would it buy us, apart from more downstreams and having to maintain
>>> a stable API and ABI?
>>
>> Currently if someone wants to add a new block format, they have to 
>> upstream it and wait for a new qemu to be released.  With a plugin 
>> API, they can add a new block format to an existing, supported qemu.
> 
> Whether we have a plugin or protocol based mechanism to implement block 
> formats really ends up being just an implementation detail.
> 
> In order to implement either, we need to take a subset of block 
> functionality that we feel we can support long term and expose that.  
> Right now, that's basically just querying characteristics (like size and 
> geometry) and asynchronous reads and writes.
> 
> A protocol based mechanism has the advantage of being more robust in the 
> face of poorly written block backends so if it's possible to make it 
> perform as well as a plugin, it's a preferable approach.
> 
> Plugins that just expose chunks of QEMU internal state directly (like 
> BlockDriver) are a really bad idea IMHO.

I'm still not convinced that we need either. I share Christoph's concern
that we would make our life harder for almost no gain. It's probably a
very small group of users (if it exists at all) that wants to add new
block drivers themselves, but at the same time can't run upstream qemu.

But if we were to decide that there's no way around it, I agree with you
that directly exposing the internal API isn't going to work.

Kevin

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 13:36                               ` Avi Kivity
@ 2010-05-25 13:54                                 ` Anthony Liguori
  2010-05-25 13:57                                   ` Avi Kivity
  0 siblings, 1 reply; 64+ messages in thread
From: Anthony Liguori @ 2010-05-25 13:54 UTC (permalink / raw)
  To: Avi Kivity
  Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner,
	MORITA Kazutaka

On 05/25/2010 08:36 AM, Avi Kivity wrote:
>
> We'd need a kernel-level generic snapshot API for this eventually.
>
>> or (2) implement BUSE to complement FUSE and CUSE to enable proper 
>> userspace block devices.
>
> Likely slow due to lots of copying.  Also needs a snapshot API.

The kernel could use splice.

> (ABUSE was proposed a while ago by Zach).
>
>> If you want to use a block device within qemu, you almost certainly 
>> want to be able to manipulate it on the host using standard tools 
>> (like mount and parted) so it stands to reason that addressing this 
>> in the kernel makes more sense.
>
> qemu-nbd also allows this.
>
> This reasoning also applies to qcow2, btw.

I know.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 13:38                   ` Avi Kivity
@ 2010-05-25 13:55                     ` Anthony Liguori
  2010-05-25 14:01                       ` Avi Kivity
  0 siblings, 1 reply; 64+ messages in thread
From: Anthony Liguori @ 2010-05-25 13:55 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Kevin Wolf, kvm, qemu-devel, Blue Swirl, ceph-devel,
	Christian Brunner

On 05/25/2010 08:38 AM, Avi Kivity wrote:
> On 05/25/2010 04:35 PM, Anthony Liguori wrote:
>> On 05/25/2010 08:31 AM, Avi Kivity wrote:
>>>> A protocol based mechanism has the advantage of being more robust 
>>>> in the face of poorly written block backends so if it's possible to 
>>>> make it perform as well as a plugin, it's a preferable approach.
>>>
>>> May be hard due to difficulty of exposing guest memory.
>>
>> If someone did a series to add plugins, I would expect a very strong 
>> argument as to why a shared memory mechanism was not possible or at 
>> least plausible.
>>
>> I'm not sure I understand why shared memory is such a bad thing wrt 
>> KVM.  Can you elaborate?  Is it simply a matter of fork()?
>
> fork() doesn't work in the presence of memory hotplug.  What else is there?
>

Is it that fork() doesn't work or is it that fork() is very expensive?

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 13:53               ` Kevin Wolf
@ 2010-05-25 13:55                 ` Avi Kivity
  2010-05-25 14:03                   ` Anthony Liguori
  2010-05-25 14:09                   ` Kevin Wolf
  0 siblings, 2 replies; 64+ messages in thread
From: Avi Kivity @ 2010-05-25 13:55 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner

On 05/25/2010 04:53 PM, Kevin Wolf wrote:
>
> I'm still not convinced that we need either. I share Christoph's concern
> that we would make our life harder for almost no gain. It's probably a
> very small group of users (if it exists at all) that wants to add new
> block drivers themselves, but at the same time can't run upstream qemu.
>
>    

The first part of your argument may be true, but the second isn't.  No 
user can run upstream qemu.git.  It's not tested or supported, and has 
no backwards compatibility guarantees.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 13:54                                 ` Anthony Liguori
@ 2010-05-25 13:57                                   ` Avi Kivity
  2010-05-25 14:02                                     ` Anthony Liguori
  0 siblings, 1 reply; 64+ messages in thread
From: Avi Kivity @ 2010-05-25 13:57 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner,
	MORITA Kazutaka

On 05/25/2010 04:54 PM, Anthony Liguori wrote:
> On 05/25/2010 08:36 AM, Avi Kivity wrote:
>>
>> We'd need a kernel-level generic snapshot API for this eventually.
>>
>>> or (2) implement BUSE to complement FUSE and CUSE to enable proper 
>>> userspace block devices.
>>
>> Likely slow due to lots of copying.  Also needs a snapshot API.
>
> The kernel could use splice.

Still can't make guest memory appear in (A)BUSE process memory without 
either mmu tricks (vmsplice in reverse) or a copy.  May be workable for 
an (A)BUSE driver that talks over a network, and thus can splice() its 
way out.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 13:55                     ` Anthony Liguori
@ 2010-05-25 14:01                       ` Avi Kivity
  2010-05-25 14:05                         ` Anthony Liguori
  0 siblings, 1 reply; 64+ messages in thread
From: Avi Kivity @ 2010-05-25 14:01 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Kevin Wolf, kvm, qemu-devel, Blue Swirl, ceph-devel,
	Christian Brunner

On 05/25/2010 04:55 PM, Anthony Liguori wrote:
> On 05/25/2010 08:38 AM, Avi Kivity wrote:
>> On 05/25/2010 04:35 PM, Anthony Liguori wrote:
>>> On 05/25/2010 08:31 AM, Avi Kivity wrote:
>>>>> A protocol based mechanism has the advantage of being more robust 
>>>>> in the face of poorly written block backends so if it's possible 
>>>>> to make it perform as well as a plugin, it's a preferable approach.
>>>>
>>>> May be hard due to difficulty of exposing guest memory.
>>>
>>> If someone did a series to add plugins, I would expect a very strong 
>>> argument as to why a shared memory mechanism was not possible or at 
>>> least plausible.
>>>
>>> I'm not sure I understand why shared memory is such a bad thing wrt 
>>> KVM.  Can you elaborate?  Is it simply a matter of fork()?
>>
>> fork() doesn't work in the presence of memory hotplug.  What else is there?
>>
>
> Is it that fork() doesn't work or is it that fork() is very expensive?

It doesn't work, fork() is done at block device creation time, which 
freezes the child memory map, while guest memory is allocated at hotplug 
time.

fork() actually isn't very expensive since we use MADV_DONTFORK 
(probably fast enough for everything except realtime).
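
(The MADV_DONTFORK usage referred to is simply a madvise() on the guest RAM
mapping; guest_ram and ram_size stand in for the real mapping:)

    #include <sys/mman.h>

    /* Don't duplicate the guest RAM mapping into child processes, so fork()
     * doesn't need to copy its page tables (and the child can't touch it). */
    static void protect_guest_ram(void *guest_ram, size_t ram_size)
    {
        madvise(guest_ram, ram_size, MADV_DONTFORK);
    }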

It may be possible to do a processfd() which can be mmap()ed by another 
process to export anonymous memory using mmu notifiers, not sure how 
easy or mergeable that is.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 13:25                           ` Avi Kivity
  2010-05-25 13:29                             ` Anthony Liguori
@ 2010-05-25 14:01                             ` Kevin Wolf
  2010-05-25 16:21                               ` Avi Kivity
  1 sibling, 1 reply; 64+ messages in thread
From: Kevin Wolf @ 2010-05-25 14:01 UTC (permalink / raw)
  To: Avi Kivity
  Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner,
	MORITA Kazutaka

Am 25.05.2010 15:25, schrieb Avi Kivity:
> On 05/25/2010 04:17 PM, Anthony Liguori wrote:
>> On 05/25/2010 04:14 AM, Avi Kivity wrote:
>>> On 05/24/2010 10:38 PM, Anthony Liguori wrote:
>>>>
>>>>> - Building a plugin API seems a bit simpler to me, although I'm not
>>>>> sure if I get the
>>>>>    idea correctly:
>>>>>    The block layer has already some kind of api (.bdrv_file_open, 
>>>>> .bdrv_read). We
>>>>>    could simply compile the block-drivers as shared objects and 
>>>>> create a method
>>>>>    for loading the necessary modules at runtime.
>>>>
>>>> That approach would be a recipe for disaster.   We would have to 
>>>> introduce a new, reduced functionality block API that was supported 
>>>> for plugins.  Otherwise, the only way a plugin could keep up with 
>>>> our API changes would be if it was in tree which defeats the purpose 
>>>> of having plugins.
>>>
>>> We could guarantee API/ABI stability in a stable branch but not 
>>> across releases.
>>
>> We have releases every six months.  There would be tons of block 
>> plugins that didn't work for random sets of releases.  That creates a 
>> lot of user confusion and unhappiness.
> 
> The current situation is that those block format drivers only exist in 
> qemu.git or as patches.  Surely that's even more unhappiness.

The difference is that in the current situation these drivers will be
part of the next qemu release, so the patch may be obsolete, but you
don't even need it any more.

If you start keeping block drivers outside qemu and not even try
integrating them, they'll stay external.

> Confusion could be mitigated:
> 
>    $ qemu -module my-fancy-block-format-driver.so
>    my-fancy-block-format-driver.so does not support this version of qemu 
> (0.19.2).  Please contact my-fancy-block-format-driver-devel@example.org.
> 
> The question is how many such block format drivers we expect.  We now 
> have two in the pipeline (ceph, sheepdog), it's reasonable to assume 
> we'll want an lvm2 driver and btrfs driver.  This is an area with a lot 
> of activity and a relatively simple interface.

What's the reason for not having these drivers upstream? Do we gain
anything by hiding them from our users and requiring them to install the
drivers separately from somewhere else?

Kevin

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 13:57                                   ` Avi Kivity
@ 2010-05-25 14:02                                     ` Anthony Liguori
  2010-05-26  8:44                                       ` Avi Kivity
  0 siblings, 1 reply; 64+ messages in thread
From: Anthony Liguori @ 2010-05-25 14:02 UTC (permalink / raw)
  To: Avi Kivity
  Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner,
	MORITA Kazutaka

On 05/25/2010 08:57 AM, Avi Kivity wrote:
> On 05/25/2010 04:54 PM, Anthony Liguori wrote:
>> On 05/25/2010 08:36 AM, Avi Kivity wrote:
>>>
>>> We'd need a kernel-level generic snapshot API for this eventually.
>>>
>>>> or (2) implement BUSE to complement FUSE and CUSE to enable proper 
>>>> userspace block devices.
>>>
>>> Likely slow due to lots of copying.  Also needs a snapshot API.
>>
>> The kernel could use splice.
>
> Still can't make guest memory appear in (A)BUSE process memory without 
> either mmu tricks (vmsplice in reverse) or a copy.  May be workable 
> for an (A)BUSE driver that talks over a network, and thus can splice() 
> its way out.

splice() actually takes an offset parameter, so it may be possible to treat 
that offset parameter as a file offset.  That would essentially allow 
you to implement a splice() based thread pool where splice() replaces 
preadv/pwritev.
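
(Illustratively, a worker in such a pool could service a read request along
these lines -- splice() takes the file offset the way preadv() would, with a
pipe in the middle; partial transfers and error-path cleanup are ignored.)

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <unistd.h>

    /* Move 'len' bytes from 'offset' in the image file straight to the
     * requester's socket without a userspace buffer. */
    static int serve_read(int img_fd, int sock_fd, loff_t offset, size_t len)
    {
        int p[2];

        if (pipe(p) < 0)
            return -1;
        if (splice(img_fd, &offset, p[1], NULL, len, SPLICE_F_MOVE) < 0)
            return -1;                                  /* file -> pipe */
        if (splice(p[0], NULL, sock_fd, NULL, len, SPLICE_F_MOVE) < 0)
            return -1;                                  /* pipe -> socket */
        close(p[0]);
        close(p[1]);
        return 0;
    }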

It's not quite linux-aio, but it should take you pretty far.   I think 
the main point is that the problem of allowing block plugins in qemu is 
the same as allowing block plugins in the kernel.  The kernel doesn't provide a 
stable interface (and we probably can't for the same reasons) and it's 
generally discouraged from a code quality perspective.

That said, making an external program work well as a block backend is 
identical to making userspace block devices fast.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 13:55                 ` Avi Kivity
@ 2010-05-25 14:03                   ` Anthony Liguori
  2010-05-25 15:02                     ` Avi Kivity
  2010-05-25 14:09                   ` Kevin Wolf
  1 sibling, 1 reply; 64+ messages in thread
From: Anthony Liguori @ 2010-05-25 14:03 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Kevin Wolf, kvm, qemu-devel, Blue Swirl, ceph-devel,
	Christian Brunner

On 05/25/2010 08:55 AM, Avi Kivity wrote:
> On 05/25/2010 04:53 PM, Kevin Wolf wrote:
>>
>> I'm still not convinced that we need either. I share Christoph's concern
>> that we would make our life harder for almost no gain. It's probably a
>> very small group of users (if it exists at all) that wants to add new
>> block drivers themselves, but at the same time can't run upstream qemu.
>>
>
> The first part of your argument may be true, but the second isn't.  No 
> user can run upstream qemu.git.  It's not tested or supported, and has 
> no backwards compatibility guarantees.

Yes, it does have backwards compatibility guarantees.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 14:01                       ` Avi Kivity
@ 2010-05-25 14:05                         ` Anthony Liguori
  2010-05-25 15:00                           ` Avi Kivity
  0 siblings, 1 reply; 64+ messages in thread
From: Anthony Liguori @ 2010-05-25 14:05 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Kevin Wolf, kvm, qemu-devel, Blue Swirl, ceph-devel,
	Christian Brunner

On 05/25/2010 09:01 AM, Avi Kivity wrote:
> On 05/25/2010 04:55 PM, Anthony Liguori wrote:
>> On 05/25/2010 08:38 AM, Avi Kivity wrote:
>>> On 05/25/2010 04:35 PM, Anthony Liguori wrote:
>>>> On 05/25/2010 08:31 AM, Avi Kivity wrote:
>>>>>> A protocol based mechanism has the advantage of being more robust 
>>>>>> in the face of poorly written block backends so if it's possible 
>>>>>> to make it perform as well as a plugin, it's a preferable approach.
>>>>>
>>>>> May be hard due to difficulty of exposing guest memory.
>>>>
>>>> If someone did a series to add plugins, I would expect a very 
>>>> strong argument as to why a shared memory mechanism was not 
>>>> possible or at least plausible.
>>>>
>>>> I'm not sure I understand why shared memory is such a bad thing wrt 
>>>> KVM.  Can you elaborate?  Is it simply a matter of fork()?
>>>
>>> fork() doesn't work in the presence of memory hotplug.  What else is there?
>>>
>>
>> Is it that fork() doesn't work or is it that fork() is very expensive?
>
> It doesn't work, fork() is done at block device creation time, which 
> freezes the child memory map, while guest memory is allocated at 
> hotplug time.

Now I'm confused.  I thought you were saying shared memory somehow 
affects fork().  If you're talking about shared memory inheritance via 
fork(), that's less important.  You can also pass /dev/shm fd's via 
SCM_RIGHTs to establish shared memory segments dynamically.
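
(The fd passing itself is the standard SCM_RIGHTS dance over a unix domain
socket, roughly:)

    #include <string.h>
    #include <sys/socket.h>
    #include <sys/uio.h>

    /* Hand a file descriptor (e.g. a /dev/shm fd) to the process at the
     * other end of 'unix_sock'. */
    static int send_fd(int unix_sock, int fd_to_pass)
    {
        char dummy = 'm';
        struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
        union {
            char buf[CMSG_SPACE(sizeof(int))];
            struct cmsghdr align;                 /* for alignment only */
        } u;
        struct msghdr msg = {
            .msg_iov = &iov, .msg_iovlen = 1,
            .msg_control = u.buf, .msg_controllen = sizeof(u.buf),
        };
        struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));

        return sendmsg(unix_sock, &msg, 0) < 0 ? -1 : 0;
    }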

Regards,

Anthony Liguori

> fork() actually isn't very expensive since we use MADV_DONTFORK 
> (probably fast enough for everything except realtime).
>
> It may be possible to do a processfd() which can be mmap()ed by 
> another process to export anonymous memory using mmu notifiers, not 
> sure how easy or mergeable that is.
>

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 13:55                 ` Avi Kivity
  2010-05-25 14:03                   ` Anthony Liguori
@ 2010-05-25 14:09                   ` Kevin Wolf
  2010-05-25 15:01                     ` Avi Kivity
  1 sibling, 1 reply; 64+ messages in thread
From: Kevin Wolf @ 2010-05-25 14:09 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner

Am 25.05.2010 15:55, schrieb Avi Kivity:
> On 05/25/2010 04:53 PM, Kevin Wolf wrote:
>>
>> I'm still not convinced that we need either. I share Christoph's concern
>> that we would make our life harder for almost no gain. It's probably a
>> very small group of users (if it exists at all) that wants to add new
>> block drivers themselves, but at the same time can't run upstream qemu.
>>
>>    
> 
> The first part of your argument may be true, but the second isn't.  No 
> user can run upstream qemu.git.  It's not tested or supported, and has 
> no backwards compatibility guarantees.

The second part was basically meant to say "developers don't count here".

Kevin

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 14:05                         ` Anthony Liguori
@ 2010-05-25 15:00                           ` Avi Kivity
  2010-05-25 15:01                             ` Anthony Liguori
  0 siblings, 1 reply; 64+ messages in thread
From: Avi Kivity @ 2010-05-25 15:00 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Kevin Wolf, kvm, qemu-devel, Blue Swirl, ceph-devel,
	Christian Brunner

On 05/25/2010 05:05 PM, Anthony Liguori wrote:
> On 05/25/2010 09:01 AM, Avi Kivity wrote:
>> On 05/25/2010 04:55 PM, Anthony Liguori wrote:
>>> On 05/25/2010 08:38 AM, Avi Kivity wrote:
>>>> On 05/25/2010 04:35 PM, Anthony Liguori wrote:
>>>>> On 05/25/2010 08:31 AM, Avi Kivity wrote:
>>>>>>> A protocol based mechanism has the advantage of being more 
>>>>>>> robust in the face of poorly written block backends so if it's 
>>>>>>> possible to make it perform as well as a plugin, it's a 
>>>>>>> preferable approach.
>>>>>>
>>>>>> May be hard due to difficulty of exposing guest memory.
>>>>>
>>>>> If someone did a series to add plugins, I would expect a very 
>>>>> strong argument as to why a shared memory mechanism was not 
>>>>> possible or at least plausible.
>>>>>
>>>>> I'm not sure I understand why shared memory is such a bad thing 
>>>>> wrt KVM.  Can you elaborate?  Is it simply a matter of fork()?
>>>>
>>>> fork() doesn't work in the presence of memory hotplug.  What else is
>>>> there?
>>>>
>>>
>>> Is it that fork() doesn't work or is it that fork() is very expensive?
>>
>> It doesn't work, fork() is done at block device creation time, which 
>> freezes the child memory map, while guest memory is allocated at 
>> hotplug time.
>
> Now I'm confused.  I thought you were saying shared memory somehow 
> affects fork().  If you're talking about shared memory inheritance via 
> fork(), that's less important. 

The latter.  Why is it less important?  If you don't inherit the memory, 
you can't access it.

> You can also pass /dev/shm fd's via SCM_RIGHTs to establish shared 
> memory segments dynamically.

Doesn't work for anonymous memory.


-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 14:09                   ` Kevin Wolf
@ 2010-05-25 15:01                     ` Avi Kivity
  0 siblings, 0 replies; 64+ messages in thread
From: Avi Kivity @ 2010-05-25 15:01 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner

On 05/25/2010 05:09 PM, Kevin Wolf wrote:
>
>> The first part of your argument may be true, but the second isn't.  No
>> user can run upstream qemu.git.  It's not tested or supported, and has
>> no backwards compatibility guarantees.
>>      
> The second part was basically meant to say "developers don't count here".
>    

Agreed.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 15:00                           ` Avi Kivity
@ 2010-05-25 15:01                             ` Anthony Liguori
  2010-05-25 16:16                               ` Avi Kivity
  0 siblings, 1 reply; 64+ messages in thread
From: Anthony Liguori @ 2010-05-25 15:01 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Kevin Wolf, kvm, qemu-devel, Blue Swirl, ceph-devel,
	Christian Brunner

On 05/25/2010 10:00 AM, Avi Kivity wrote:
> The latter.  Why is it less important?  If you don't inherit the 
> memory, you can't access it.
>
>> You can also pass /dev/shm fd's via SCM_RIGHTs to establish shared 
>> memory segments dynamically.
>
> Doesn't work for anonymous memory.

What's wrong with /dev/shm memory?

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 14:03                   ` Anthony Liguori
@ 2010-05-25 15:02                     ` Avi Kivity
  0 siblings, 0 replies; 64+ messages in thread
From: Avi Kivity @ 2010-05-25 15:02 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Kevin Wolf, kvm, qemu-devel, Blue Swirl, ceph-devel,
	Christian Brunner

On 05/25/2010 05:03 PM, Anthony Liguori wrote:
> On 05/25/2010 08:55 AM, Avi Kivity wrote:
>> On 05/25/2010 04:53 PM, Kevin Wolf wrote:
>>>
>>> I'm still not convinced that we need either. I share Christoph's 
>>> concern
>>> that we would make our life harder for almost no gain. It's probably a
>>> very small group of users (if it exists at all) that wants to add new
>>> block drivers themselves, but at the same time can't run upstream qemu.
>>>
>>
>> The first part of your argument may be true, but the second isn't.  
>> No user can run upstream qemu.git.  It's not tested or supported, and 
>> has no backwards compatibility guarantees.
>
> Yes, it does have backwards compatibility guarantees.

I meant a random untagged qemu.git snapshot.  Do we guarantee anything 
about it, except that it's likely to be broken?

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 15:01                             ` Anthony Liguori
@ 2010-05-25 16:16                               ` Avi Kivity
  2010-05-25 16:21                                 ` Anthony Liguori
  0 siblings, 1 reply; 64+ messages in thread
From: Avi Kivity @ 2010-05-25 16:16 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Kevin Wolf, kvm, qemu-devel, Blue Swirl, ceph-devel,
	Christian Brunner

On 05/25/2010 06:01 PM, Anthony Liguori wrote:
> On 05/25/2010 10:00 AM, Avi Kivity wrote:
>> The latter.  Why is it less important?  If you don't inherit the 
>> memory, you can't access it.
>>
>>> You can also pass /dev/shm fd's via SCM_RIGHTs to establish shared 
>>> memory segments dynamically.
>>
>> Doesn't work for anonymous memory.
>
> What's wrong with /dev/shm memory?

The kernel treats anonymous and non-anonymous memory differently for swapping 
(see /proc/sys/vm/swappiness); transparent hugepages won't work for 
/dev/shm (though it may be argued that that's a problem with thp); setup 
(/dev/shm defaults to half memory IIRC, we want mem+swap); different 
cgroup handling; somewhat clunky (a minor concern to be sure).

Nothing is a killer, but we should prefer anonymous memory.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 16:16                               ` Avi Kivity
@ 2010-05-25 16:21                                 ` Anthony Liguori
  2010-05-25 16:27                                   ` Avi Kivity
  0 siblings, 1 reply; 64+ messages in thread
From: Anthony Liguori @ 2010-05-25 16:21 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Kevin Wolf, kvm, qemu-devel, Blue Swirl, ceph-devel,
	Christian Brunner

On 05/25/2010 11:16 AM, Avi Kivity wrote:
> On 05/25/2010 06:01 PM, Anthony Liguori wrote:
>> On 05/25/2010 10:00 AM, Avi Kivity wrote:
>>> The latter.  Why is it less important?  If you don't inherit the 
>>> memory, you can't access it.
>>>
>>>> You can also pass /dev/shm fd's via SCM_RIGHTs to establish shared 
>>>> memory segments dynamically.
>>>
>>> Doesn't work for anonymous memory.
>>
>> What's wrong with /dev/shm memory?
>
> The kernel treats anonymous and non-anonymous memory differently for 
> swapping (see /proc/sys/vm/swappiness); transparent hugepages won't 
> work for /dev/shm (though it may be argued that that's a problem with 
> thp); setup (/dev/shm defaults to half of memory IIRC, while we want 
> mem+swap); different cgroup handling; and it's somewhat clunky (a minor 
> concern, to be sure).

Surely, with mmu notifiers, it wouldn't be that hard to share anonymous 
memory via an fd though, no?
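
For reference, handing such an fd to another process with SCM_RIGHTS, as
mentioned above, is roughly the following sketch (the UNIX domain socket
is assumed to already be connected, and error handling is trimmed):

    #include <string.h>
    #include <sys/socket.h>
    #include <sys/uio.h>

    /* Sketch only: send one file descriptor over a connected UNIX socket. */
    static int send_fd(int sock, int fd)
    {
        char byte = 0;
        struct iovec iov = { .iov_base = &byte, .iov_len = 1 };
        union {
            struct cmsghdr hdr;
            char buf[CMSG_SPACE(sizeof(int))];
        } control;
        struct msghdr msg;
        struct cmsghdr *cmsg;

        memset(&msg, 0, sizeof(msg));
        memset(&control, 0, sizeof(control));
        msg.msg_iov = &iov;
        msg.msg_iovlen = 1;
        msg.msg_control = control.buf;
        msg.msg_controllen = sizeof(control.buf);

        cmsg = CMSG_FIRSTHDR(&msg);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));

        return sendmsg(sock, &msg, 0) < 0 ? -1 : 0;
    }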

Regards,

Anthony Liguori

>
> Nothing is a killer, but we should prefer anonymous memory.
>

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 14:01                             ` Kevin Wolf
@ 2010-05-25 16:21                               ` Avi Kivity
  2010-05-25 17:12                                 ` Sage Weil
  0 siblings, 1 reply; 64+ messages in thread
From: Avi Kivity @ 2010-05-25 16:21 UTC (permalink / raw)
  To: Kevin Wolf
  Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner,
	MORITA Kazutaka

On 05/25/2010 05:01 PM, Kevin Wolf wrote:
>
>> The current situation is that those block format drivers only exist in
>> qemu.git or as patches.  Surely that's even more unhappiness.
>>      
> The difference is that in the current situation these drivers will be
> part of the next qemu release, so the patch may be obsolete, but you
> don't even need it any more.
>    

The next qemu release may be six months in the future.  So if you're not 
happy with running qemu.git master or with patching a stable release, 
you have to wait.

> If you start keeping block drivers outside qemu and don't even try
> integrating them, they'll stay external.
>    

Which may or may not be a problem.

>> Confusion could be mitigated:
>>
>>     $ qemu -module my-fancy-block-format-driver.so
>>     my-fancy-block-format-driver.so does not support this version of qemu
>> (0.19.2).  Please contact my-fancy-block-format-driver-devel@example.org.
>>
>> The question is how many such block format drivers we expect.  We now
>> have two in the pipeline (ceph, sheepdog), and it's reasonable to assume
>> we'll want an lvm2 driver and a btrfs driver.  This is an area with a lot
>> of activity and a relatively simple interface.
>>      
> What's the reason for not having these drivers upstream? Do we gain
> anything by hiding them from our users and requiring them to install the
> drivers separately from somewhere else?
>    

Six months.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 16:21                                 ` Anthony Liguori
@ 2010-05-25 16:27                                   ` Avi Kivity
  0 siblings, 0 replies; 64+ messages in thread
From: Avi Kivity @ 2010-05-25 16:27 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Kevin Wolf, kvm, qemu-devel, Blue Swirl, ceph-devel,
	Christian Brunner

On 05/25/2010 07:21 PM, Anthony Liguori wrote:
> On 05/25/2010 11:16 AM, Avi Kivity wrote:
>> On 05/25/2010 06:01 PM, Anthony Liguori wrote:
>>> On 05/25/2010 10:00 AM, Avi Kivity wrote:
>>>> The latter.  Why is it less important?  If you don't inherit the 
>>>> memory, you can't access it.
>>>>
>>>>> You can also pass /dev/shm fd's via SCM_RIGHTs to establish shared 
>>>>> memory segments dynamically.
>>>>
>>>> Doesn't work for anonymous memory.
>>>
>>> What's wrong with /dev/shm memory?
>>
>> The kernel treats anonymous and non-anonymous memory differently for 
>> swapping (see /proc/sys/vm/swappiness); transparent hugepages won't 
>> work for /dev/shm (though it may be argued that that's a problem with 
>> thp); setup (/dev/shm defaults to half of memory IIRC, while we want 
>> mem+swap); different cgroup handling; and it's somewhat clunky (a minor 
>> concern, to be sure).
>
> Surely, with mmu notifiers, it wouldn't be that hard to share 
> anonymous memory via an fd though, no?

That's what I suggested with processfd().  I wouldn't call it easy but 
it's likely doable.  Whether it's mergable is a different issue.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 16:21                               ` Avi Kivity
@ 2010-05-25 17:12                                 ` Sage Weil
  2010-05-26  5:24                                   ` MORITA Kazutaka
  2010-05-26  8:46                                   ` Avi Kivity
  0 siblings, 2 replies; 64+ messages in thread
From: Sage Weil @ 2010-05-25 17:12 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Kevin Wolf, kvm, qemu-devel, Blue Swirl, ceph-devel,
	Christian Brunner, MORITA Kazutaka

On Tue, 25 May 2010, Avi Kivity wrote:
> > What's the reason for not having these drivers upstream? Do we gain
> > anything by hiding them from our users and requiring them to install the
> > drivers separately from somewhere else?
> >    
> 
> Six months.

FWIW, we (Ceph) aren't complaining about the 6 month lag time (and I don't 
think the Sheepdog guys are either).

From our perspective, the current BlockDriver abstraction is ideal, as it 
represents the reality of qemu's interaction with storage.  Any 'external' 
interface will be inferior to that in one way or another.  But either way, 
we are perfectly willing to work with you all to keep in sync with any 
future BlockDriver API improvements.  It is worth our time investment even 
if the API is less stable.

The ability to dynamically load a shared object using the existing api 
would make development a bit easier, but I'm not convinced it's better 
for users.  I think having ceph and sheepdog upstream with qemu will serve 
end users best, and we at least are willing to spend the time to help 
maintain that code in qemu.git.
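
Purely as an illustration of what such dynamic loading could look like
(qemu has no such interface today; the symbol name and version constant
below are invented for this sketch; link with -ldl):

    #include <dlfcn.h>
    #include <stdio.h>

    /* Invented for this sketch: an API version the module must match. */
    #define QEMU_BLOCK_API_VERSION 1

    static int load_block_module(const char *path)
    {
        void *handle;
        const int *mod_version;

        handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
        if (!handle) {
            fprintf(stderr, "%s: %s\n", path, dlerror());
            return -1;
        }
        mod_version = dlsym(handle, "qemu_block_module_version");
        if (!mod_version || *mod_version != QEMU_BLOCK_API_VERSION) {
            fprintf(stderr, "%s does not support this version of qemu\n",
                    path);
            dlclose(handle);
            return -1;
        }
        /* ... invoke the module's driver registration entry point ... */
        return 0;
    }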

sage

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-24  2:17       ` Yehuda Sadeh Weinraub
@ 2010-05-25 20:13         ` Blue Swirl
  0 siblings, 0 replies; 64+ messages in thread
From: Blue Swirl @ 2010-05-25 20:13 UTC (permalink / raw)
  To: Yehuda Sadeh Weinraub; +Cc: ceph-devel, Christian Brunner, kvm, qemu-devel

On Mon, May 24, 2010 at 2:17 AM, Yehuda Sadeh Weinraub
<yehudasa@gmail.com> wrote:
> On Sun, May 23, 2010 at 12:59 AM, Blue Swirl <blauwirbel@gmail.com> wrote:
>> On Thu, May 20, 2010 at 11:02 PM, Yehuda Sadeh Weinraub
>> <yehudasa@gmail.com> wrote:
>>> On Thu, May 20, 2010 at 1:31 PM, Blue Swirl <blauwirbel@gmail.com> wrote:
>>>> On Wed, May 19, 2010 at 7:22 PM, Christian Brunner <chb@muc.de> wrote:
>>>>> The attached patch is a block driver for the distributed file system
>>>>> Ceph (http://ceph.newdream.net/). This driver uses librados (which
>>>>> is part of the Ceph server) for direct access to the Ceph object
>>>>> store and is running entirely in userspace. Therefore it is
>>>>> called "rbd" - rados block device.
>>> ...
>>>>
>>>> IIRC underscores here may conflict with system header use. Please use
>>>> something like QEMU_BLOCK_RADOS_H.
>>>
>>> This header is shared between the linux kernel client and the ceph
>>> userspace servers and client. We can actually get rid of it, as we
>>> only need it to define CEPH_OSD_TMAP_SET. We can move this definition
>>> to librados.h.
>>>
>>>>> diff --git a/block/rbd_types.h b/block/rbd_types.h
>>>>> new file mode 100644
>>>>> index 0000000..dfd5aa0
>>>>> --- /dev/null
>>>>> +++ b/block/rbd_types.h
>>>>> @@ -0,0 +1,48 @@
>>>>> +#ifndef _FS_CEPH_RBD
>>>>> +#define _FS_CEPH_RBD
>>>>
>>>> QEMU_BLOCK_RBD?
>>>
>>> This header is shared between the ceph kernel client, between the qemu
>>> rbd module (and between other ceph utilities). It'd be much easier
>>> maintaining it without having to have a different implementation for
>>> each. The same goes to the use of __le32/64 and __u32/64 within these
>>> headers.
>>
>> This is user space, so identifiers must conform to C standards. The
>> identifiers beginning with underscores are reserved.
>>
>> Doesn't __le32/64 also depend on some GCC extension? Or sparse magic?
> It depends on gcc extension. If needed we can probably have a separate
> header for the qemu block device that uses alternative types. Though
> looking at the qemu code I see use of other gcc extensions so I'm not
> sure this is a real issue.

We use some (contained with macros where possible, for example), but in
earlier discussions, __le32 etc. were considered problematic. IIRC
it's hard to provide alternate versions for other compilers (or older
versions of gcc).
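
One portable option for the userspace copy of such a header (a sketch,
not what the patch does) is to use plain <stdint.h> types and decode the
little-endian on-disk bytes explicitly, e.g.:

    #include <stdint.h>

    /* Sketch: read a little-endian 64-bit on-disk field into host order
     * without the gcc/kernel __le64 type; qemu's le64_to_cpu()-style
     * helpers serve the same purpose. */
    static uint64_t get_le64(const void *p)
    {
        const uint8_t *b = p;
        uint64_t v = 0;
        int i;

        for (i = 7; i >= 0; i--) {
            v = (v << 8) | b[i];
        }
        return v;
    }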

>
>>
>>>
>>>>
>>>>> +
>>>>> +#include <linux/types.h>
>>>>
>>>> Can you use standard includes, like <sys/types.h> or <inttypes.h>? Are
>>>> Ceph libraries used in other systems than Linux?
>>>
>>> Not at the moment. I guess that we can take this include out.
>>>
>>>>
>>>>> +
>>>>> +/*
>>>>> + * rbd image 'foo' consists of objects
>>>>> + *   foo.rbd      - image metadata
>>>>> + *   foo.00000000
>>>>> + *   foo.00000001
>>>>> + *   ...          - data
>>>>> + */
>>>>> +
>>>>> +#define RBD_SUFFIX             ".rbd"
>>>>> +#define RBD_DIRECTORY           "rbd_directory"
>>>>> +
>>>>> +#define RBD_DEFAULT_OBJ_ORDER  22   /* 4MB */
>>>>> +
>>>>> +#define RBD_MAX_OBJ_NAME_SIZE  96
>>>>> +#define RBD_MAX_SEG_NAME_SIZE  128
>>>>> +
>>>>> +#define RBD_COMP_NONE          0
>>>>> +#define RBD_CRYPT_NONE         0
>>>>> +
>>>>> +static const char rbd_text[] = "<<< Rados Block Device Image >>>\n";
>>>>> +static const char rbd_signature[] = "RBD";
>>>>> +static const char rbd_version[] = "001.001";
>>>>> +
>>>>> +struct rbd_obj_snap_ondisk {
>>>>> +       __le64 id;
>>>>> +       __le64 image_size;
>>>>> +} __attribute__((packed));
>>>>> +
>>>>> +struct rbd_obj_header_ondisk {
>>>>> +       char text[64];
>>>>> +       char signature[4];
>>>>> +       char version[8];
>>>>> +       __le64 image_size;
>>>>
>>>> Unaligned? Is the disk format fixed?
>>>
>>> This is a packed structure that represents the on disk format.
>>> Operations on it are being done only to read from the disk header or
>>> to write to the disk header.
>>
>> That's clear. But what exactly is the alignment of field 'image_size'?
>> Could there be implicit padding to mod 8 between 'version' and
>> 'image_size' with some compilers?
>
> Obviously it's not 64 bit aligned. As it's an on-disk header, I don't
> see alignment as a real issue. As was said before, any operation on these
> fields has to go through endianness conversion anyway, and this
> structure should not be used directly. For such data structures I'd
> rather have the fields ordered in some logical order than maintain
> the alignment ourselves. That's why we have that __attribute__((packed))
> at the end, to let the compiler deal with those issues. Other
> compilers have their own syntax for packed structures, though (but I do
> see other uses of this packed syntax in the qemu code).

Packed structures are OK, but the padding should be explicit to avoid
compiler problems.

Eventually the disk format is read into a memory buffer, and aligned
fields should also be faster on all architectures, even on x86.
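
Concretely, explicit padding for the header quoted above could look like
the following sketch (the actual on-disk layout is of course the Ceph
project's call):

    #include <stdint.h>

    /* Sketch: the same leading fields with explicit padding so that
     * image_size starts on an 8-byte boundary (64 + 4 + 8 + 4 = 80). */
    struct rbd_obj_header_ondisk_sketch {
        char     text[64];
        char     signature[4];
        char     version[8];
        uint8_t  pad[4];        /* explicit padding, written as zero */
        uint64_t image_size;    /* little-endian on disk */
    } __attribute__((packed));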

>>
>> If there were no other constraints, I'd either make the padding
>> explicit, or rearrange/resize fields so that the field alignment is
>> natural. Thus my question, can you change the disk format or are there
>> already some deployments?
>
> We can certainly make changes to the disk format at this point. I'm
> not very happy with those 3 __u8 in the middle, and they can probably
> be changed to a 32 bit flags field. We can get it 64 bit aligned too.

I hope my comments helped you to avoid possible problems in the
future. From a purely QEMU code base point of view, any architecture
goes. Some architectures are faster to emulate, others are slower.

>>
>> Otherwise, I'd just add some warning comment so people don't try to
>> use clever pointer tricks which will crash on machines with enforced
>> alignment.
>>
> Any clever pointer tricks that'll work on one architecture will
> probably be wrong on another (different word
> size/alignment/endianness), so maybe crashing machines is a good
> indicator of a bad implementation. We shouldn't try to hide the
> problems.
>
> Thanks,
> Yehuda
>

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 17:12                                 ` Sage Weil
@ 2010-05-26  5:24                                   ` MORITA Kazutaka
  2010-05-26  8:46                                   ` Avi Kivity
  1 sibling, 0 replies; 64+ messages in thread
From: MORITA Kazutaka @ 2010-05-26  5:24 UTC (permalink / raw)
  To: Sage Weil
  Cc: Kevin Wolf, kvm, qemu-devel, Blue Swirl, Avi Kivity, ceph-devel,
	Christian Brunner, MORITA Kazutaka

At Tue, 25 May 2010 10:12:53 -0700 (PDT),
Sage Weil wrote:
> 
> On Tue, 25 May 2010, Avi Kivity wrote:
> > > What's the reason for not having these drivers upstream? Do we gain
> > > anything by hiding them from our users and requiring them to install the
> > > drivers separately from somewhere else?
> > >    
> > 
> > Six months.
> 
> FWIW, we (Ceph) aren't complaining about the 6 month lag time (and I don't 
> think the Sheepdog guys are either).
> 
I agree.  We aren't complaining about it.

> From our perspective, the current BlockDriver abstraction is ideal, as it 
> represents the reality of qemu's interaction with storage.  Any 'external' 
> interface will be inferior to that in one way or another.  But either way, 
> we are perfectly willing to work with you all to keep in sync with any 
> future BlockDriver API improvements.  It is worth our time investment even 
> if the API is less stable.
> 
I agree.

> The ability to dynamically load a shared object using the existing api 
> would make development a bit easier, but I'm not convinced it's better 
> for users.  I think having ceph and sheepdog upstream with qemu will serve 
> end users best, and we at least are willing to spend the time to help 
> maintain that code in qemu.git.
> 
I agree.

Regards,

Kazutaka

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 14:02                                     ` Anthony Liguori
@ 2010-05-26  8:44                                       ` Avi Kivity
  0 siblings, 0 replies; 64+ messages in thread
From: Avi Kivity @ 2010-05-26  8:44 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: kvm, qemu-devel, Blue Swirl, ceph-devel, Christian Brunner,
	MORITA Kazutaka

On 05/25/2010 05:02 PM, Anthony Liguori wrote:
> On 05/25/2010 08:57 AM, Avi Kivity wrote:
>> On 05/25/2010 04:54 PM, Anthony Liguori wrote:
>>> On 05/25/2010 08:36 AM, Avi Kivity wrote:
>>>>
>>>> We'd need a kernel-level generic snapshot API for this eventually.
>>>>
>>>>> or (2) implement BUSE to complement FUSE and CUSE to enable proper 
>>>>> userspace block devices.
>>>>
>>>> Likely slow due do lots of copying.  Also needs a snapshot API.
>>>
>>> The kernel could use splice.
>>
>> Still can't make guest memory appear in (A)BUSE process memory 
>> without either mmu tricks (vmsplice in reverse) or a copy.  May be 
>> workable for an (A)BUSE driver that talks over a network, and thus 
>> can splice() its way out.
>
> splice() actually takes offset parameter so it may be possible to 
> treat that offset parameter as a file offset.  That would essentially 
> allow you to implement a splice() based thread pool where splice() 
> replaces preadv/pwritev.

Right.

(note: need splicev() here)
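
A rough sketch of that splice()-based write path (one end of each
transfer must be a pipe; short transfers and errors are not handled, and
the fd names are made up):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <sys/types.h>
    #include <unistd.h>

    /* Sketch: move "len" bytes from a source fd (e.g. a socket) into an
     * image file at "file_offset", going through a pipe because splice()
     * requires one end to be a pipe. */
    static int splice_write(int src_fd, int img_fd, off_t file_offset,
                            size_t len)
    {
        int pipefd[2];
        loff_t off = file_offset;
        ssize_t n;

        if (pipe(pipefd) < 0) {
            return -1;
        }
        /* source -> pipe */
        n = splice(src_fd, NULL, pipefd[1], NULL, len, SPLICE_F_MOVE);
        if (n > 0) {
            /* pipe -> image file at the requested offset */
            n = splice(pipefd[0], NULL, img_fd, &off, n, SPLICE_F_MOVE);
        }
        close(pipefd[0]);
        close(pipefd[1]);
        return n < 0 ? -1 : 0;
    }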

>
> It's not quite linux-aio, but it should take you pretty far.  I think 
> the main point is that the problem of allowing block plugins in qemu 
> is the same as that of block plugins for the kernel.  The kernel doesn't 
> provide a stable interface (and we probably can't either, for the same 
> reasons), and it's generally discouraged from a code quality perspective.

The kernel does provide a stable interface for FUSE, and it could 
provide a stable interface for ABUSE.  Why can the kernel support these 
and qemu can't support essentially the same thing?
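
For comparison, the stable interface FUSE exposes to userspace is
essentially a callback table; a trivial sketch (one empty read-only
file, built against libfuse 2.x) looks like this:

    #define FUSE_USE_VERSION 26
    #include <fuse.h>
    #include <errno.h>
    #include <string.h>
    #include <sys/stat.h>

    /* Sketch of a minimal FUSE filesystem exporting "/hello". */
    static int sketch_getattr(const char *path, struct stat *st)
    {
        memset(st, 0, sizeof(*st));
        if (strcmp(path, "/") == 0) {
            st->st_mode = S_IFDIR | 0755;
            st->st_nlink = 2;
            return 0;
        }
        if (strcmp(path, "/hello") == 0) {
            st->st_mode = S_IFREG | 0444;
            st->st_nlink = 1;
            return 0;
        }
        return -ENOENT;
    }

    static struct fuse_operations sketch_ops = {
        .getattr = sketch_getattr,
    };

    int main(int argc, char *argv[])
    {
        /* build with: gcc sketch.c `pkg-config fuse --cflags --libs` */
        return fuse_main(argc, argv, &sketch_ops, NULL);
    }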

> That said, making an external program work well as a block backend is 
> identical to making userspace block devices fast.

More or less, yes.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm
  2010-05-25 17:12                                 ` Sage Weil
  2010-05-26  5:24                                   ` MORITA Kazutaka
@ 2010-05-26  8:46                                   ` Avi Kivity
  1 sibling, 0 replies; 64+ messages in thread
From: Avi Kivity @ 2010-05-26  8:46 UTC (permalink / raw)
  To: Sage Weil
  Cc: Kevin Wolf, kvm, qemu-devel, Blue Swirl, ceph-devel,
	Christian Brunner, MORITA Kazutaka

On 05/25/2010 08:12 PM, Sage Weil wrote:
> On Tue, 25 May 2010, Avi Kivity wrote:
>    
>>> What's the reason for not having these drivers upstream? Do we gain
>>> anything by hiding them from our users and requiring them to install the
>>> drivers separately from somewhere else?
>>>
>>>        
>> Six months.
>>      
> FWIW, we (Ceph) aren't complaining about the 6 month lag time (and I don't
> think the Sheepdog guys are either).
>
>    

In that case (and if there are no other potential users), there's 
no need for a plugin API.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 64+ messages in thread

end of thread, other threads:[~2010-05-26  9:17 UTC | newest]

Thread overview: 64+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-05-19 19:22 [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm Christian Brunner
2010-05-20 20:31 ` Blue Swirl
2010-05-20 21:18   ` Christian Brunner
2010-05-20 21:29     ` Anthony Liguori
2010-05-20 22:16       ` Christian Brunner
2010-05-21  5:28         ` Stefan Hajnoczi
2010-05-21  6:13           ` MORITA Kazutaka
2010-05-21  5:54         ` MORITA Kazutaka
2010-05-23 12:01       ` Avi Kivity
2010-05-24  7:12         ` MORITA Kazutaka
2010-05-24 11:05           ` Avi Kivity
2010-05-24 11:42             ` MORITA Kazutaka
2010-05-24 11:56               ` Avi Kivity
2010-05-24 12:07                 ` Cláudio Martins
2010-05-24 14:01                 ` MORITA Kazutaka
2010-05-24 19:07                   ` Christian Brunner
2010-05-24 19:38                     ` Anthony Liguori
2010-05-25  9:14                       ` Avi Kivity
2010-05-25 13:17                         ` Anthony Liguori
2010-05-25 13:25                           ` Avi Kivity
2010-05-25 13:29                             ` Anthony Liguori
2010-05-25 13:36                               ` Avi Kivity
2010-05-25 13:54                                 ` Anthony Liguori
2010-05-25 13:57                                   ` Avi Kivity
2010-05-25 14:02                                     ` Anthony Liguori
2010-05-26  8:44                                       ` Avi Kivity
2010-05-25 14:01                             ` Kevin Wolf
2010-05-25 16:21                               ` Avi Kivity
2010-05-25 17:12                                 ` Sage Weil
2010-05-26  5:24                                   ` MORITA Kazutaka
2010-05-26  8:46                                   ` Avi Kivity
2010-05-24 19:16                 ` Anthony Liguori
2010-05-25  9:19                   ` Avi Kivity
2010-05-25 13:26                   ` MORITA Kazutaka
2010-05-24  8:27         ` Stefan Hajnoczi
2010-05-24 11:03           ` Avi Kivity
2010-05-24 19:19             ` Anthony Liguori
2010-05-25  9:22               ` Avi Kivity
2010-05-25 11:02         ` Kevin Wolf
2010-05-25 11:25           ` Avi Kivity
2010-05-25 12:03             ` Christoph Hellwig
2010-05-25 12:13               ` Avi Kivity
2010-05-25 13:25             ` Anthony Liguori
2010-05-25 13:31               ` Avi Kivity
2010-05-25 13:35                 ` Anthony Liguori
2010-05-25 13:38                   ` Avi Kivity
2010-05-25 13:55                     ` Anthony Liguori
2010-05-25 14:01                       ` Avi Kivity
2010-05-25 14:05                         ` Anthony Liguori
2010-05-25 15:00                           ` Avi Kivity
2010-05-25 15:01                             ` Anthony Liguori
2010-05-25 16:16                               ` Avi Kivity
2010-05-25 16:21                                 ` Anthony Liguori
2010-05-25 16:27                                   ` Avi Kivity
2010-05-25 13:53               ` Kevin Wolf
2010-05-25 13:55                 ` Avi Kivity
2010-05-25 14:03                   ` Anthony Liguori
2010-05-25 15:02                     ` Avi Kivity
2010-05-25 14:09                   ` Kevin Wolf
2010-05-25 15:01                     ` Avi Kivity
2010-05-20 23:02   ` Yehuda Sadeh Weinraub
2010-05-23  7:59     ` Blue Swirl
2010-05-24  2:17       ` Yehuda Sadeh Weinraub
2010-05-25 20:13         ` Blue Swirl

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).