From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:58405) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1YBNhJ-00033R-7k for qemu-devel@nongnu.org; Wed, 14 Jan 2015 08:09:22 -0500 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1YBNhG-0000d3-26 for qemu-devel@nongnu.org; Wed, 14 Jan 2015 08:09:21 -0500 Received: from mx2.parallels.com ([199.115.105.18]:58430) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1YBNhF-0000cz-PU for qemu-devel@nongnu.org; Wed, 14 Jan 2015 08:09:17 -0500 Message-ID: <54B66A62.1050202@openvz.org> Date: Wed, 14 Jan 2015 16:08:50 +0300 From: "Denis V. Lunev" MIME-Version: 1.0 References: <1419931701-19362-1-git-send-email-den@openvz.org> <1419934032-24216-1-git-send-email-den@openvz.org> <1419934032-24216-8-git-send-email-den@openvz.org> <20150114130339.GB27418@rkaganb.sw.ru> In-Reply-To: <20150114130339.GB27418@rkaganb.sw.ru> Content-Type: text/plain; charset="windows-1252"; format=flowed Content-Transfer-Encoding: 7bit Subject: Re: [Qemu-devel] [PATCH 17/19] block/parallels: delay writing to BAT till bdrv_co_flush_to_os List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Roman Kagan , Kevin Wolf , qemu-devel@nongnu.org, Stefan Hajnoczi On 14/01/15 16:03, Roman Kagan wrote: > On Tue, Dec 30, 2014 at 01:07:10PM +0300, Denis V. Lunev wrote: >> The idea is that we do not need to immediately sync BAT to the image as >> from the guest point of view there is a possibility that IO is lost >> even in the physical controller until flush command was finished. >> bdrv_co_flush_to_os is exactly the right place for this purpose. >> >> Technically the patch aligns writes into BAT to MAX(bdrv_align, 4096), >> which elliminates read-modify-write transactions on BAT update and >> cache ready-to-write content in a special buffer in BDRVParallelsState. >> >> This buffer possibly contains ParallelsHeader if the first page of the >> image should be modified. The header occupies first 64 bytes of the image >> and the BAT starts immediately after it. >> >> It is also possible that BAT end is not aligned to the cluster size. >> ParallelsHeader->data_off is not specified for this case. We should write >> only part of the cache in that case. >> >> This patch speed ups >> qemu-img create -f parallels -o cluster_size=64k ./1.hds 64G >> qemu-io -f parallels -c "write -P 0x11 0 1024k" 1.hds >> writing from 50-60 Mb/sec to 80-90 Mb/sec on rotational media and >> from 160 Mb/sec to 190 Mb/sec on SSD disk. >> >> Signed-off-by: Denis V. Lunev >> CC: Kevin Wolf >> CC: Stefan Hajnoczi >> --- >> block/parallels.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- >> 1 file changed, 94 insertions(+), 5 deletions(-) >> >> diff --git a/block/parallels.c b/block/parallels.c >> index 46cf031..18b9267 100644 >> --- a/block/parallels.c >> +++ b/block/parallels.c >> @@ -62,6 +62,11 @@ typedef struct BDRVParallelsState { >> uint32_t *bat; >> unsigned int bat_size; >> >> + uint32_t *bat_cache; >> + unsigned bat_cache_size; >> + int bat_cache_off; >> + int data_off; >> + >> unsigned int tracks; >> >> unsigned int off_multiplier; >> @@ -148,6 +153,17 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags, >> le32_to_cpus(&s->bat[i]); >> >> qemu_co_mutex_init(&s->lock); >> + >> + s->bat_cache_off = -1; >> + if (bs->open_flags & BDRV_O_RDWR) { >> + s->bat_cache_size = MAX(bdrv_opt_mem_align(bs->file), 4096); >> + s->bat_cache = qemu_blockalign(bs->file, s->bat_cache_size); >> + } >> + s->data_off = le32_to_cpu(s->ph.data_off) * BDRV_SECTOR_SIZE; >> + if (s->data_off == 0) { >> + s->data_off = ROUND_UP(bat_offset(s->bat_size), BDRV_SECTOR_SIZE); >> + } >> + >> return 0; >> >> fail_format: >> @@ -171,10 +187,71 @@ static int64_t seek_to_sector(BDRVParallelsState *s, int64_t sector_num) >> return (uint64_t)s->bat[index] * s->off_multiplier + offset; >> } >> >> +static int write_bat_cache(BlockDriverState *bs) >> +{ >> + BDRVParallelsState *s = bs->opaque; >> + int size, off; >> + >> + if (s->bat_cache_off == -1) { >> + /* no cached data */ >> + return 0; >> + } >> + >> + size = s->bat_cache_size; >> + if (size + s->bat_cache_off > s->data_off) { >> + /* avoid writing to the first data block */ >> + size = s->data_off - s->bat_cache_off; >> + } >> + >> + off = s->bat_cache_off; >> + s->bat_cache_off = -1; >> + return bdrv_pwrite(bs->file, off, s->bat_cache, size); >> +} >> + >> +static int cache_bat(BlockDriverState *bs, uint32_t idx, uint32_t new_data_off) >> +{ >> + int ret, i, off, cache_off; >> + int64_t first_idx, last_idx; >> + BDRVParallelsState *s = bs->opaque; >> + uint32_t *cache = s->bat_cache; >> + >> + off = bat_offset(idx); >> + cache_off = (off / s->bat_cache_size) * s->bat_cache_size; >> + >> + if (s->bat_cache_off != -1 && s->bat_cache_off != cache_off) { >> + ret = write_bat_cache(bs); >> + if (ret < 0) { >> + return ret; >> + } >> + } >> + >> + first_idx = idx - (off - cache_off) / sizeof(uint32_t); >> + last_idx = first_idx + s->bat_cache_size / sizeof(uint32_t); >> + if (first_idx < 0) { >> + memcpy(s->bat_cache, &s->ph, sizeof(s->ph)); >> + first_idx = 0; >> + cache = s->bat_cache + sizeof(s->ph) / sizeof(uint32_t); >> + } >> + >> + if (last_idx > s->bat_size) { >> + memset(cache + s->bat_size - first_idx, 0, >> + sizeof(uint32_t) * (last_idx - s->bat_size)); >> + } >> + >> + for (i = 0; i < last_idx - first_idx; i++) { >> + cache[i] = cpu_to_le32(s->bat[first_idx + i]); >> + } > You're re-populating the bat_cache on every bat update. Why? > > Shouldn't this be done only with empty cache, i.e. under > if (s->bat_cache_off == -1)? reasonable, but condition should be a bit different, namely if (s->bat_cache_off != -1 && s->bat_cache_off != cache_off) { > >> + cache[idx - first_idx] = cpu_to_le32(new_data_off); >> + s->bat[idx] = new_data_off; >> + >> + s->bat_cache_off = cache_off; >> + return 0; >> +} >> + >> static int64_t allocate_sector(BlockDriverState *bs, int64_t sector_num) >> { >> BDRVParallelsState *s = bs->opaque; >> - uint32_t idx, offset, tmp; >> + uint32_t idx, offset; >> int64_t pos; >> int ret; >> >> @@ -190,17 +267,27 @@ static int64_t allocate_sector(BlockDriverState *bs, int64_t sector_num) >> >> pos = bdrv_getlength(bs->file) >> BDRV_SECTOR_BITS; >> bdrv_truncate(bs->file, (pos + s->tracks) << BDRV_SECTOR_BITS); >> - s->bat[idx] = pos / s->off_multiplier; >> - >> - tmp = cpu_to_le32(s->bat[idx]); >> >> - ret = bdrv_pwrite(bs->file, bat_offset(idx), &tmp, sizeof(tmp)); >> + ret = cache_bat(bs, idx, pos / s->off_multiplier); >> if (ret < 0) { >> return ret; >> } >> return (uint64_t)s->bat[idx] * s->off_multiplier + offset; >> } >> >> +static coroutine_fn int parallels_co_flush_to_os(BlockDriverState *bs) >> +{ >> + BDRVParallelsState *s = bs->opaque; >> + int ret; >> + >> + qemu_co_mutex_lock(&s->lock); >> + ret = write_bat_cache(bs); >> + qemu_co_mutex_unlock(&s->lock); >> + >> + return ret; >> +} >> + >> + >> static int cluster_remainder(BDRVParallelsState *s, int64_t sector_num, >> int nb_sectors) >> { >> @@ -387,6 +474,7 @@ exit: >> static void parallels_close(BlockDriverState *bs) >> { >> BDRVParallelsState *s = bs->opaque; >> + qemu_vfree(s->bat_cache); > Don't you need to flush the bat cache here first? parallels_co_flush_to_os should happen beforehand void bdrv_close(BlockDriverState *bs) { .... bdrv_flush(bs); <-- namely inside this >> g_free(s->bat); >> } >> >> @@ -416,6 +504,7 @@ static BlockDriver bdrv_parallels = { >> .bdrv_open = parallels_open, >> .bdrv_close = parallels_close, >> .bdrv_co_get_block_status = parallels_co_get_block_status, >> + .bdrv_co_flush_to_os = parallels_co_flush_to_os, >> .bdrv_co_readv = parallels_co_readv, >> .bdrv_co_writev = parallels_co_writev, > Roman.