From: Fam Zheng <famz@redhat.com>
To: Peter Lieven <pl@kamp.de>
Cc: kwolf@redhat.com, qemu-devel@nongnu.org, armbru@redhat.com,
stefanha@redhat.com, pbonzini@redhat.com, mreitz@redhat.com
Subject: Re: [Qemu-devel] [PATCHv2] block: optimize zero writes with bdrv_write_zeroes
Date: Tue, 1 Apr 2014 15:33:19 +0800 [thread overview]
Message-ID: <20140401073319.GA22447@T430.nay.redhat.com> (raw)
In-Reply-To: <1396017982-7390-1-git-send-email-pl@kamp.de>
On Fri, 03/28 15:46, Peter Lieven wrote:
> this patch tries to optimize zero write requests
> by automatically using bdrv_write_zeroes if it is
> supported by the format.
>
> This significantly speeds up file system initialization and
> should speed zero write test used to test backend storage
> performance.
>
> I ran the following 2 tests on my internal SSD with a
> 50G QCOW2 container and on an attached iSCSI storage.
>
> a) mkfs.ext4 -E lazy_itable_init=0,lazy_journal_init=0 /dev/vdX
>
> QCOW2 [off] [on] [unmap]
> -----
> runtime: 14secs 1.1secs 1.1secs
> filesize: 937M 18M 18M
>
> iSCSI [off] [on] [unmap]
> ----
> runtime: 9.3s 0.9s 0.9s
>
> b) dd if=/dev/zero of=/dev/vdX bs=1M oflag=direct
>
> QCOW2 [off] [on] [unmap]
> -----
> runtime: 246secs 18secs 18secs
> filesize: 51G 192K 192K
> throughput: 203M/s 2.3G/s 2.3G/s
>
> iSCSI* [off] [on] [unmap]
> ----
> runtime: 8mins 45secs 33secs
> throughput: 106M/s 1.2G/s 1.6G/s
> allocated: 100% 100% 0%
>
> * The storage was connected via an 1Gbit interface.
> It seems to internally handle writing zeroes
> via WRITESAME16 very fast.
>
> Signed-off-by: Peter Lieven <pl@kamp.de>
> ---
> v1->v2: - added tests to commit message (Markus)
> RFCv2->v1: - fixed paramter parsing strncmp -> strcmp (Eric)
> - fixed typo (choosen->chosen) (Eric)
> - added second example to commit msg
>
> RFCv1->RFCv2: - add detect-zeroes=off|on|unmap knob to drive cmdline parameter
> - call zero detection only for format (bs->file != NULL)
>
> block.c | 39 ++++++++++++++++++++++++++++++++++++++-
> include/block/block_int.h | 12 ++++++++++++
> include/qemu-common.h | 1 +
> qemu-options.hx | 6 ++++++
> util/iov.c | 21 +++++++++++++++++++++
> 5 files changed, 78 insertions(+), 1 deletion(-)
>
> diff --git a/block.c b/block.c
> index acb70fd..3f61f76 100644
> --- a/block.c
> +++ b/block.c
> @@ -817,6 +817,25 @@ static int bdrv_assign_node_name(BlockDriverState *bs,
> return 0;
> }
>
> +static int bdrv_set_detect_zeroes(BlockDriverState *bs,
> + const char *detect_zeroes,
> + Error **errp)
> +{
> + if (!detect_zeroes || !strcmp(detect_zeroes, "off")) {
> + bs->detect_zeroes = BDRV_DETECT_ZEROES_OFF;
> + } else if (!strcmp(detect_zeroes, "on")) {
> + bs->detect_zeroes = BDRV_DETECT_ZEROES_ON;
> + } else if (!strcmp(detect_zeroes, "unmap")) {
> + bs->detect_zeroes = BDRV_DETECT_ZEROES_UNMAP;
> + } else {
> + error_setg(errp, "invalid value for detect-zeroes: %s",
> + detect_zeroes);
> + return -EINVAL;
> + }
> +
> + return 0;
> +}
> +
> /*
> * Common part for opening disk images and files
> *
> @@ -827,7 +846,7 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
> {
> int ret, open_flags;
> const char *filename;
> - const char *node_name = NULL;
> + const char *node_name = NULL, *detect_zeroes = NULL;
> Error *local_err = NULL;
>
> assert(drv != NULL);
> @@ -855,6 +874,13 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
> }
> qdict_del(options, "node-name");
>
> + detect_zeroes = qdict_get_try_str(options, "detect-zeroes");
> + ret = bdrv_set_detect_zeroes(bs, detect_zeroes, errp);
> + if (ret < 0) {
> + return ret;
> + }
> + qdict_del(options, "detect-zeroes");
> +
> /* bdrv_open() with directly using a protocol as drv. This layer is already
> * opened, so assign it to bs (while file becomes a closed BlockDriverState)
> * and return immediately. */
> @@ -3179,6 +3205,17 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
>
> ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
>
> + if (!ret && bs->detect_zeroes != BDRV_DETECT_ZEROES_OFF &&
> + !(flags & BDRV_REQ_ZERO_WRITE) && bs->file &&
> + drv->bdrv_co_write_zeroes && qemu_iovec_is_zero(qiov)) {
> + flags |= BDRV_REQ_ZERO_WRITE;
> + /* if the device was not opened with discard=on the below flag
> + * is immediately cleared again in bdrv_co_do_write_zeroes */
> + if (bs->detect_zeroes == BDRV_DETECT_ZEROES_UNMAP) {
> + flags |= BDRV_REQ_MAY_UNMAP;
> + }
> + }
> +
> if (ret < 0) {
> /* Do nothing, write notifier decided to fail this request */
> } else if (flags & BDRV_REQ_ZERO_WRITE) {
> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index cd5bc73..7a3013a 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -271,6 +271,17 @@ typedef struct BlockLimits {
> } BlockLimits;
>
> /*
> + * Different operation modes for automatic zero detection
> + * to speed the write operation up with bdrv_write_zeroes.
> + */
> +typedef enum {
> + BDRV_DETECT_ZEROES_OFF = 0x0,
> + BDRV_DETECT_ZEROES_ON = 0x1,
> + /* also set the BDRV_MAY_UNMAP flag with bdrv_write_zeroes */
> + BDRV_DETECT_ZEROES_UNMAP = 0x2,
> +} BdrvDetectZeroes;
> +
> +/*
> * Note: the function bdrv_append() copies and swaps contents of
> * BlockDriverStates, so if you add new fields to this struct, please
> * inspect bdrv_append() to determine if the new fields need to be
> @@ -365,6 +376,7 @@ struct BlockDriverState {
> BlockJob *job;
>
> QDict *options;
> + BdrvDetectZeroes detect_zeroes;
> };
>
> int get_tmp_filename(char *filename, int size);
> diff --git a/include/qemu-common.h b/include/qemu-common.h
> index c8a58a8..574da73 100644
> --- a/include/qemu-common.h
> +++ b/include/qemu-common.h
> @@ -330,6 +330,7 @@ void qemu_iovec_concat(QEMUIOVector *dst,
> void qemu_iovec_concat_iov(QEMUIOVector *dst,
> struct iovec *src_iov, unsigned int src_cnt,
> size_t soffset, size_t sbytes);
> +bool qemu_iovec_is_zero(QEMUIOVector *qiov);
> void qemu_iovec_destroy(QEMUIOVector *qiov);
> void qemu_iovec_reset(QEMUIOVector *qiov);
> size_t qemu_iovec_to_buf(QEMUIOVector *qiov, size_t offset,
> diff --git a/qemu-options.hx b/qemu-options.hx
> index ee5437b..f824d9b 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -410,6 +410,7 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive,
> " [,cache=writethrough|writeback|none|directsync|unsafe][,format=f]\n"
> " [,serial=s][,addr=A][,id=name][,aio=threads|native]\n"
> " [,readonly=on|off][,copy-on-read=on|off]\n"
> + " [,detect-zeroes=on|off|unmap]\n"
> " [[,bps=b]|[[,bps_rd=r][,bps_wr=w]]]\n"
> " [[,iops=i]|[[,iops_rd=r][,iops_wr=w]]]\n"
> " [[,bps_max=bm]|[[,bps_rd_max=rm][,bps_wr_max=wm]]]\n"
> @@ -470,6 +471,11 @@ Open drive @option{file} as read-only. Guest write attempts will fail.
> @item copy-on-read=@var{copy-on-read}
> @var{copy-on-read} is "on" or "off" and enables whether to copy read backing
> file sectors into the image file.
> +@item detect-zeroes=@var{detect-zeroes}
> +@var{detect-zeroes} is "off", "on" or "unmap" and enables the automatic
> +conversion of plain zero writes by the OS to driver specific optimized
> +zero write commands. If "unmap" is chosen and @var{discard} is "on"
> +a zero write may even be converted to an UNMAP operation.
> @end table
>
> By default, the @option{cache=writeback} mode is used. It will report data
> diff --git a/util/iov.c b/util/iov.c
> index 6569b5a..0b17392 100644
> --- a/util/iov.c
> +++ b/util/iov.c
> @@ -335,6 +335,27 @@ void qemu_iovec_concat(QEMUIOVector *dst,
> qemu_iovec_concat_iov(dst, src->iov, src->niov, soffset, sbytes);
> }
>
> +/*
> + * check if the contents of the iovecs is all zero
contents of ... are? content of ... is? Other than that,
Reviewed-by: Fam Zheng <famz@redhat.com>
> + */
> +bool qemu_iovec_is_zero(QEMUIOVector *qiov)
> +{
> + int i;
> + for (i = 0; i < qiov->niov; i++) {
> + size_t offs = qiov->iov[i].iov_len & ~(4 * sizeof(long) - 1);
> + uint8_t *ptr = qiov->iov[i].iov_base;
> + if (offs && !buffer_is_zero(qiov->iov[i].iov_base, offs)) {
> + return false;
> + }
> + for (; offs < qiov->iov[i].iov_len; offs++) {
> + if (ptr[offs]) {
> + return false;
> + }
> + }
> + }
> + return true;
> +}
> +
> void qemu_iovec_destroy(QEMUIOVector *qiov)
> {
> assert(qiov->nalloc != -1);
> --
> 1.7.9.5
>
prev parent reply other threads:[~2014-04-01 7:33 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2014-03-28 14:46 [Qemu-devel] [PATCHv2] block: optimize zero writes with bdrv_write_zeroes Peter Lieven
2014-03-28 15:53 ` Eric Blake
2014-04-01 20:38 ` Peter Lieven
2014-04-01 7:33 ` Fam Zheng [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20140401073319.GA22447@T430.nay.redhat.com \
--to=famz@redhat.com \
--cc=armbru@redhat.com \
--cc=kwolf@redhat.com \
--cc=mreitz@redhat.com \
--cc=pbonzini@redhat.com \
--cc=pl@kamp.de \
--cc=qemu-devel@nongnu.org \
--cc=stefanha@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).