From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:46183) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1YL65R-00048G-Dg for qemu-devel@nongnu.org; Tue, 10 Feb 2015 03:22:26 -0500 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1YL65N-0005nX-B2 for qemu-devel@nongnu.org; Tue, 10 Feb 2015 03:22:25 -0500 Received: from tama50.ecl.ntt.co.jp ([129.60.39.147]:39986) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1YL65M-0005mn-Sz for qemu-devel@nongnu.org; Tue, 10 Feb 2015 03:22:21 -0500 Message-ID: <54D9BFAA.8050307@lab.ntt.co.jp> Date: Tue, 10 Feb 2015 17:22:02 +0900 From: Teruaki Ishizaki MIME-Version: 1.0 References: <1422347727-13006-1-git-send-email-ishizaki.teruaki@lab.ntt.co.jp> <20150210031051.GB3859@ubuntu-trusty> In-Reply-To: <20150210031051.GB3859@ubuntu-trusty> Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 7bit Subject: Re: [Qemu-devel] [PATCH v4] sheepdog: selectable object size support List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Liu Yuan Cc: kwolf@redhat.com, mitake.hitoshi@lab.ntt.co.jp, sheepdog@lists.wpkg.org, qemu-devel@nongnu.org, stefanha@redhat.com (2015/02/10 12:10), Liu Yuan wrote: > On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote: >> Previously, qemu block driver of sheepdog used hard-coded VDI object size. >> This patch enables users to handle "block_size_shift" value for >> calculating VDI object size. >> >> When you start qemu, you don't need to specify additional command option. >> >> But when you create the VDI which doesn't have default object size >> with qemu-img command, you specify block_size_shift option. >> >> If you want to create a VDI of 8MB(1 << 23) object size, >> you need to specify following command option. >> >> # qemu-img create -o block_size_shift=23 sheepdog:test1 100M >> >> In addition, when you don't specify qemu-img command option, >> a default value of sheepdog cluster is used for creating VDI. >> >> # qemu-img create sheepdog:test2 100M >> >> Signed-off-by: Teruaki Ishizaki >> --- >> V4: >> - Limit a read/write buffer size for creating a preallocated VDI. >> - Replace a parse function for the block_size_shift option. >> - Fix an error message. >> >> V3: >> - Delete the needless operation of buffer. >> - Delete the needless operations of request header. >> for SD_OP_GET_CLUSTER_DEFAULT. >> - Fix coding style problems. >> >> V2: >> - Fix coding style problem (white space). >> - Add members, store_policy and block_size_shift to struct SheepdogVdiReq. >> - Initialize request header to use block_size_shift specified by user. >> --- >> block/sheepdog.c | 138 ++++++++++++++++++++++++++++++++++++++------- >> include/block/block_int.h | 1 + >> 2 files changed, 119 insertions(+), 20 deletions(-) >> >> diff --git a/block/sheepdog.c b/block/sheepdog.c >> index be3176f..a43b947 100644 >> --- a/block/sheepdog.c >> +++ b/block/sheepdog.c >> @@ -37,6 +37,7 @@ >> #define SD_OP_READ_VDIS 0x15 >> #define SD_OP_FLUSH_VDI 0x16 >> #define SD_OP_DEL_VDI 0x17 >> +#define SD_OP_GET_CLUSTER_DEFAULT 0x18 >> >> #define SD_FLAG_CMD_WRITE 0x01 >> #define SD_FLAG_CMD_COW 0x02 >> @@ -167,7 +168,8 @@ typedef struct SheepdogVdiReq { >> uint32_t base_vdi_id; >> uint8_t copies; >> uint8_t copy_policy; >> - uint8_t reserved[2]; >> + uint8_t store_policy; >> + uint8_t block_size_shift; >> uint32_t snapid; >> uint32_t type; >> uint32_t pad[2]; >> @@ -186,6 +188,21 @@ typedef struct SheepdogVdiRsp { >> uint32_t pad[5]; >> } SheepdogVdiRsp; >> >> +typedef struct SheepdogClusterRsp { >> + uint8_t proto_ver; >> + uint8_t opcode; >> + uint16_t flags; >> + uint32_t epoch; >> + uint32_t id; >> + uint32_t data_length; >> + uint32_t result; >> + uint8_t nr_copies; >> + uint8_t copy_policy; >> + uint8_t block_size_shift; >> + uint8_t __pad1; >> + uint32_t __pad2[6]; >> +} SheepdogClusterRsp; >> + >> typedef struct SheepdogInode { >> char name[SD_MAX_VDI_LEN]; >> char tag[SD_MAX_VDI_TAG_LEN]; >> @@ -1544,6 +1561,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot, >> hdr.vdi_size = s->inode.vdi_size; >> hdr.copy_policy = s->inode.copy_policy; >> hdr.copies = s->inode.nr_copies; >> + hdr.block_size_shift = s->inode.block_size_shift; >> >> ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen); >> >> @@ -1569,9 +1587,12 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot, >> static int sd_prealloc(const char *filename, Error **errp) >> { >> BlockDriverState *bs = NULL; >> + BDRVSheepdogState *base = NULL; >> + unsigned long buf_size; >> uint32_t idx, max_idx; >> + uint32_t object_size; >> int64_t vdi_size; >> - void *buf = g_malloc0(SD_DATA_OBJ_SIZE); >> + void *buf = NULL; >> int ret; >> >> ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, >> @@ -1585,18 +1606,24 @@ static int sd_prealloc(const char *filename, Error **errp) >> ret = vdi_size; >> goto out; >> } >> - max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE); >> + >> + base = bs->opaque; >> + object_size = (UINT32_C(1) << base->inode.block_size_shift); >> + buf_size = MIN(object_size, SD_DATA_OBJ_SIZE); >> + buf = g_malloc0(buf_size); >> + >> + max_idx = DIV_ROUND_UP(vdi_size, buf_size); >> >> for (idx = 0; idx < max_idx; idx++) { >> /* >> * The created image can be a cloned image, so we need to read >> * a data from the source image. >> */ >> - ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE); >> + ret = bdrv_pread(bs, idx * buf_size, buf, buf_size); >> if (ret < 0) { >> goto out; >> } >> - ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE); >> + ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size); >> if (ret < 0) { >> goto out; >> } >> @@ -1669,6 +1696,21 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt) >> return 0; >> } >> >> +static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt) >> +{ >> + struct SheepdogInode *inode = &s->inode; >> + inode->block_size_shift = >> + (uint8_t)qemu_opt_get_number_del(opt, "block_size_shift", 0); >> + if (inode->block_size_shift == 0) { >> + /* block_size_shift is set for cluster default value by sheepdog */ >> + return 0; >> + } else if (inode->block_size_shift < 20 || inode->block_size_shift > 31) { >> + return -EINVAL; >> + } >> + >> + return 0; >> +} >> + >> static int sd_create(const char *filename, QemuOpts *opts, >> Error **errp) >> { >> @@ -1679,6 +1721,7 @@ static int sd_create(const char *filename, QemuOpts *opts, >> BDRVSheepdogState *s; >> char tag[SD_MAX_VDI_TAG_LEN]; >> uint32_t snapid; >> + uint64_t max_vdi_size; >> bool prealloc = false; >> >> s = g_new0(BDRVSheepdogState, 1); >> @@ -1718,9 +1761,10 @@ static int sd_create(const char *filename, QemuOpts *opts, >> } >> } >> >> - if (s->inode.vdi_size > SD_MAX_VDI_SIZE) { >> - error_setg(errp, "too big image size"); >> - ret = -EINVAL; >> + ret = parse_block_size_shift(s, opts); >> + if (ret < 0) { >> + error_setg(errp, "Invalid block_size_shift:" >> + " '%s'. please use 20-31", buf); >> goto out; >> } >> >> @@ -1757,6 +1801,47 @@ static int sd_create(const char *filename, QemuOpts *opts, >> } >> >> s->aio_context = qemu_get_aio_context(); >> + >> + /* if block_size_shift is not specified, get cluster default value */ >> + if (s->inode.block_size_shift == 0) { > > Seems that we don't keep backward compatibility for new QEMU and old sheep that > doesn't support SD_OP_GET_CLUSTER_DEFAULT. Then, I'll add the operation that if block_size_shift from SD_OP_GET_CLUSTER_DEFAULT is zero, block_size_shift is set to 22. Is it OK? > > What will happen if old QEMU with new sheep that support block_size_shift? Most > distributions will ship the old stable qemu that wouldn't be aware of > block_size_shift. If old QEMU with new sheep is used, VDI is created with cluster default block_size_shift and accessed as 4MB object_size. So I think that for backward compatibility, users must do cluster format command with default block_size_shift 22 equal to 4MB object_size. If users want to use for iSCSI target with changing block_size_shift, they must specify the value to create VDI. > > I'd suggest we have 0 in block_size_shift represent 4MB to keep backward > compatiblity. > Thanks, Teruaki Ishizaki