All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Denis V. Lunev" <den@openvz.org>
To: Roman Kagan <rkagan@parallels.com>, Kevin Wolf <kwolf@redhat.com>,
	qemu-devel@nongnu.org, Stefan Hajnoczi <stefanha@redhat.com>
Subject: Re: [Qemu-devel] [PATCH 17/19] block/parallels: delay writing to BAT till bdrv_co_flush_to_os
Date: Wed, 14 Jan 2015 16:08:50 +0300	[thread overview]
Message-ID: <54B66A62.1050202@openvz.org> (raw)
In-Reply-To: <20150114130339.GB27418@rkaganb.sw.ru>

On 14/01/15 16:03, Roman Kagan wrote:
> On Tue, Dec 30, 2014 at 01:07:10PM +0300, Denis V. Lunev wrote:
>> The idea is that we do not need to immediately sync BAT to the image as
>> from the guest point of view there is a possibility that IO is lost
>> even in the physical controller until flush command was finished.
>> bdrv_co_flush_to_os is exactly the right place for this purpose.
>>
>> Technically the patch aligns writes into BAT to MAX(bdrv_align, 4096),
>> which elliminates read-modify-write transactions on BAT update and
>> cache ready-to-write content in a special buffer in BDRVParallelsState.
>>
>> This buffer possibly contains ParallelsHeader if the first page of the
>> image should be modified. The header occupies first 64 bytes of the image
>> and the BAT starts immediately after it.
>>
>> It is also possible that BAT end is not aligned to the cluster size.
>> ParallelsHeader->data_off is not specified for this case. We should write
>> only part of the cache in that case.
>>
>> This patch speed ups
>>    qemu-img create -f parallels -o cluster_size=64k ./1.hds 64G
>>    qemu-io -f parallels -c "write -P 0x11 0 1024k" 1.hds
>> writing from 50-60 Mb/sec to 80-90 Mb/sec on rotational media and
>> from 160 Mb/sec to 190 Mb/sec on SSD disk.
>>
>> Signed-off-by: Denis V. Lunev <den@openvz.org>
>> CC: Kevin Wolf <kwolf@redhat.com>
>> CC: Stefan Hajnoczi <stefanha@redhat.com>
>> ---
>>   block/parallels.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
>>   1 file changed, 94 insertions(+), 5 deletions(-)
>>
>> diff --git a/block/parallels.c b/block/parallels.c
>> index 46cf031..18b9267 100644
>> --- a/block/parallels.c
>> +++ b/block/parallels.c
>> @@ -62,6 +62,11 @@ typedef struct BDRVParallelsState {
>>       uint32_t *bat;
>>       unsigned int bat_size;
>>   
>> +    uint32_t *bat_cache;
>> +    unsigned bat_cache_size;
>> +    int bat_cache_off;
>> +    int data_off;
>> +
>>       unsigned int tracks;
>>   
>>       unsigned int off_multiplier;
>> @@ -148,6 +153,17 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
>>           le32_to_cpus(&s->bat[i]);
>>   
>>       qemu_co_mutex_init(&s->lock);
>> +
>> +    s->bat_cache_off = -1;
>> +    if (bs->open_flags & BDRV_O_RDWR) {
>> +        s->bat_cache_size = MAX(bdrv_opt_mem_align(bs->file), 4096);
>> +        s->bat_cache = qemu_blockalign(bs->file, s->bat_cache_size);
>> +    }
>> +    s->data_off = le32_to_cpu(s->ph.data_off) * BDRV_SECTOR_SIZE;
>> +    if (s->data_off == 0) {
>> +        s->data_off = ROUND_UP(bat_offset(s->bat_size), BDRV_SECTOR_SIZE);
>> +    }
>> +
>>       return 0;
>>   
>>   fail_format:
>> @@ -171,10 +187,71 @@ static int64_t seek_to_sector(BDRVParallelsState *s, int64_t sector_num)
>>       return (uint64_t)s->bat[index] * s->off_multiplier + offset;
>>   }
>>   
>> +static int write_bat_cache(BlockDriverState *bs)
>> +{
>> +    BDRVParallelsState *s = bs->opaque;
>> +    int size, off;
>> +
>> +    if (s->bat_cache_off == -1) {
>> +        /* no cached data */
>> +        return 0;
>> +    }
>> +
>> +    size = s->bat_cache_size;
>> +    if (size + s->bat_cache_off > s->data_off) {
>> +        /* avoid writing to the first data block */
>> +        size = s->data_off - s->bat_cache_off;
>> +    }
>> +
>> +    off = s->bat_cache_off;
>> +    s->bat_cache_off = -1;
>> +    return bdrv_pwrite(bs->file, off, s->bat_cache, size);
>> +}
>> +
>> +static int cache_bat(BlockDriverState *bs, uint32_t idx, uint32_t new_data_off)
>> +{
>> +    int ret, i, off, cache_off;
>> +    int64_t first_idx, last_idx;
>> +    BDRVParallelsState *s = bs->opaque;
>> +    uint32_t *cache = s->bat_cache;
>> +
>> +    off = bat_offset(idx);
>> +    cache_off = (off / s->bat_cache_size) * s->bat_cache_size;
>> +
>> +    if (s->bat_cache_off != -1 && s->bat_cache_off != cache_off) {
>> +        ret = write_bat_cache(bs);
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +    }
>> +
>> +    first_idx = idx - (off - cache_off) / sizeof(uint32_t);
>> +    last_idx = first_idx + s->bat_cache_size / sizeof(uint32_t);
>> +    if (first_idx < 0) {
>> +        memcpy(s->bat_cache, &s->ph, sizeof(s->ph));
>> +        first_idx = 0;
>> +        cache = s->bat_cache + sizeof(s->ph) / sizeof(uint32_t);
>> +    }
>> +
>> +    if (last_idx > s->bat_size) {
>> +        memset(cache + s->bat_size - first_idx, 0,
>> +               sizeof(uint32_t) * (last_idx - s->bat_size));
>> +    }
>> +
>> +    for (i = 0; i < last_idx - first_idx; i++) {
>> +        cache[i] = cpu_to_le32(s->bat[first_idx + i]);
>> +    }
> You're re-populating the bat_cache on every bat update.  Why?
>
> Shouldn't this be done only with empty cache, i.e. under
> if (s->bat_cache_off == -1)?
reasonable, but condition should be a bit different, namely

if (s->bat_cache_off != -1 && s->bat_cache_off != cache_off) {



>
>> +    cache[idx - first_idx] = cpu_to_le32(new_data_off);
>> +    s->bat[idx] = new_data_off;
>> +
>> +    s->bat_cache_off = cache_off;
>> +    return 0;
>> +}
>> +
>>   static int64_t allocate_sector(BlockDriverState *bs, int64_t sector_num)
>>   {
>>       BDRVParallelsState *s = bs->opaque;
>> -    uint32_t idx, offset, tmp;
>> +    uint32_t idx, offset;
>>       int64_t pos;
>>       int ret;
>>   
>> @@ -190,17 +267,27 @@ static int64_t allocate_sector(BlockDriverState *bs, int64_t sector_num)
>>   
>>       pos = bdrv_getlength(bs->file) >> BDRV_SECTOR_BITS;
>>       bdrv_truncate(bs->file, (pos + s->tracks) << BDRV_SECTOR_BITS);
>> -    s->bat[idx] = pos / s->off_multiplier;
>> -
>> -    tmp = cpu_to_le32(s->bat[idx]);
>>   
>> -    ret = bdrv_pwrite(bs->file, bat_offset(idx), &tmp, sizeof(tmp));
>> +    ret = cache_bat(bs, idx, pos / s->off_multiplier);
>>       if (ret < 0) {
>>           return ret;
>>       }
>>       return (uint64_t)s->bat[idx] * s->off_multiplier + offset;
>>   }
>>   
>> +static coroutine_fn int parallels_co_flush_to_os(BlockDriverState *bs)
>> +{
>> +    BDRVParallelsState *s = bs->opaque;
>> +    int ret;
>> +
>> +    qemu_co_mutex_lock(&s->lock);
>> +    ret = write_bat_cache(bs);
>> +    qemu_co_mutex_unlock(&s->lock);
>> +
>> +    return ret;
>> +}
>> +
>> +
>>   static int cluster_remainder(BDRVParallelsState *s, int64_t sector_num,
>>           int nb_sectors)
>>   {
>> @@ -387,6 +474,7 @@ exit:
>>   static void parallels_close(BlockDriverState *bs)
>>   {
>>       BDRVParallelsState *s = bs->opaque;
>> +    qemu_vfree(s->bat_cache);
> Don't you need to flush the bat cache here first?

parallels_co_flush_to_os should happen beforehand


void bdrv_close(BlockDriverState *bs)
{
     ....
     bdrv_flush(bs); <-- namely inside this


>>       g_free(s->bat);
>>   }
>>   
>> @@ -416,6 +504,7 @@ static BlockDriver bdrv_parallels = {
>>       .bdrv_open		= parallels_open,
>>       .bdrv_close		= parallels_close,
>>       .bdrv_co_get_block_status = parallels_co_get_block_status,
>> +    .bdrv_co_flush_to_os      = parallels_co_flush_to_os,
>>       .bdrv_co_readv  = parallels_co_readv,
>>       .bdrv_co_writev = parallels_co_writev,
> Roman.

  reply	other threads:[~2015-01-14 13:09 UTC|newest]

Thread overview: 38+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-12-30  9:28 [Qemu-devel] [PATCH v2 0/19] write/create for Parallels images with reasonable performance Denis V. Lunev
2014-12-30  9:28 ` [Qemu-devel] [PATCH 01/19] iotests, parallels: quote TEST_IMG in 076 test to be path-safe Denis V. Lunev
2014-12-30  9:28 ` [Qemu-devel] [PATCH 02/19] block/parallels: rename parallels_header to ParallelsHeader Denis V. Lunev
2014-12-30  9:28 ` [Qemu-devel] [PATCH 03/19] block/parallels: switch to bdrv_read Denis V. Lunev
2014-12-30  9:28 ` [Qemu-devel] [PATCH 04/19] block/parallels: read up to cluster end in one go Denis V. Lunev
2014-12-30  9:28 ` [Qemu-devel] [PATCH 05/19] block/parallels: add get_block_status Denis V. Lunev
2014-12-30  9:28 ` [Qemu-devel] [PATCH 06/19] block/parallels: provide _co_readv routine for parallels format driver Denis V. Lunev
2014-12-30  9:28 ` [Qemu-devel] [PATCH 07/19] block/parallels: replace magic constants 4, 64 with proper sizeofs Denis V. Lunev
2014-12-30  9:28 ` [Qemu-devel] [PATCH 08/19] block/parallels: _co_writev callback for Parallels format Denis V. Lunev
2014-12-30  9:28 ` [Qemu-devel] [PATCH 09/19] iotests, parallels: test for write into Parallels image Denis V. Lunev
2014-12-30 10:07 ` [Qemu-devel] [PATCH 10/19] block/parallels: support parallels image creation Denis V. Lunev
2014-12-30 10:07   ` [Qemu-devel] [PATCH 11/19] iotests, parallels: test for newly created parallels image via qemu-img Denis V. Lunev
2014-12-30 10:07   ` [Qemu-devel] [PATCH 12/19] parallels: change copyright information in the image header Denis V. Lunev
2014-12-30 10:07   ` [Qemu-devel] [PATCH 13/19] block/parallels: store ParallelsHeader to the BDRVParallelsState Denis V. Lunev
2015-01-13 14:13     ` Roman Kagan
2014-12-30 10:07   ` [Qemu-devel] [PATCH 14/19] block/parallels: create catalog_offset helper Denis V. Lunev
2015-01-13 14:14     ` Roman Kagan
2014-12-30 10:07   ` [Qemu-devel] [PATCH 15/19] block/parallels: rename catalog_ names to bat_ Denis V. Lunev
2015-01-13 14:15     ` Roman Kagan
2014-12-30 10:07   ` [Qemu-devel] [PATCH 16/19] block/parallels: no need to flush on each block allocation table update Denis V. Lunev
2015-01-13 14:50     ` Roman Kagan
2015-01-13 15:17       ` Denis V. Lunev
2015-01-13 20:16         ` Denis V. Lunev
2015-01-14 12:01           ` Roman Kagan
2015-01-14 12:30             ` Denis V. Lunev
2014-12-30 10:07   ` [Qemu-devel] [PATCH 17/19] block/parallels: delay writing to BAT till bdrv_co_flush_to_os Denis V. Lunev
2015-01-14 13:03     ` Roman Kagan
2015-01-14 13:08       ` Denis V. Lunev [this message]
2015-01-14 13:34         ` Roman Kagan
2015-01-14 14:29           ` Denis V. Lunev
2015-01-14 16:44             ` Denis V. Lunev
2014-12-30 10:07   ` [Qemu-devel] [PATCH 18/19] block/parallels: add prealloc-mode and prealloc-size open paramemets Denis V. Lunev
2015-01-14 14:26     ` Roman Kagan
2015-01-14 14:31       ` Denis V. Lunev
2015-01-14 17:22         ` Roman Kagan
2014-12-30 10:07   ` [Qemu-devel] [PATCH 19/19] block/parallels: optimize linear image expansion Denis V. Lunev
2015-01-14 17:56     ` Roman Kagan
2015-01-14 17:57       ` Denis V. Lunev

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=54B66A62.1050202@openvz.org \
    --to=den@openvz.org \
    --cc=kwolf@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=rkagan@parallels.com \
    --cc=stefanha@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.