From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:47304) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1ZulsD-0003Mt-6I for qemu-devel@nongnu.org; Fri, 06 Nov 2015 13:36:30 -0500 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1ZulsB-0004V1-6y for qemu-devel@nongnu.org; Fri, 06 Nov 2015 13:36:29 -0500 From: Max Reitz References: <1446805353-3047-1-git-send-email-famz@redhat.com> Message-ID: <563CF31E.6030707@redhat.com> Date: Fri, 6 Nov 2015 19:36:14 +0100 MIME-Version: 1.0 In-Reply-To: <1446805353-3047-1-git-send-email-famz@redhat.com> Content-Type: multipart/signed; micalg=pgp-sha256; protocol="application/pgp-signature"; boundary="medb00G8TXlGFLEOFaCVtJSDdE7naLs8k" Subject: Re: [Qemu-devel] [Qemu-block] [PATCH] mirror: Improve zero-write and discard with fragmented image List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Fam Zheng , qemu-devel@nongnu.org Cc: Kevin Wolf , pbonzini@redhat.com, qemu-block@nongnu.org This is an OpenPGP/MIME signed message (RFC 4880 and 3156) --medb00G8TXlGFLEOFaCVtJSDdE7naLs8k Content-Type: text/plain; charset=iso-8859-15 Content-Transfer-Encoding: quoted-printable On 06.11.2015 11:22, Fam Zheng wrote: > The "pnum < nb_sectors" condition in deciding whether to actually copy > data is unnecessarily strict, and the qiov initialization is > unnecessarily too, for both bdrv_aio_write_zeroes and bdrv_aio_discard > branches. >=20 > Reorganize mirror_iteration flow so that we: >=20 > 1) Find the contiguous zero/discarded sectors with > bdrv_get_block_status_above() before deciding what to do. We query > s->buf_size sized blocks at a time. >=20 > 2) If the sectors in question are zeroed/discarded and aligned to > target cluster, issue zero write or discard accordingly. It's done > in mirror_do_zero_or_discard, where we don't add buffer to qiov. >=20 > 3) Otherwise, do the same loop as before in mirror_do_read. >=20 > Signed-off-by: Fam Zheng > --- > block/mirror.c | 161 +++++++++++++++++++++++++++++++++++++++++++++----= -------- > 1 file changed, 127 insertions(+), 34 deletions(-) Looks good overall, some comments below. Max > diff --git a/block/mirror.c b/block/mirror.c > index b1252a1..ac796b4 100644 > --- a/block/mirror.c > +++ b/block/mirror.c > @@ -157,28 +157,21 @@ static void mirror_read_complete(void *opaque, in= t ret) > mirror_write_complete, op); > } > =20 > -static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) > +static uint64_t mirror_do_read(MirrorBlockJob *s) > { > BlockDriverState *source =3D s->common.bs; > - int nb_sectors, sectors_per_chunk, nb_chunks; > - int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sec= tor; > + int sectors_per_chunk, nb_sectors, nb_chunks; > + int64_t end, next_chunk, next_sector, hbitmap_next_sector, sector_= num; > uint64_t delay_ns =3D 0; > MirrorOp *op; > - int pnum; > - int64_t ret; > =20 > - s->sector_num =3D hbitmap_iter_next(&s->hbi); > - if (s->sector_num < 0) { > - bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); > - s->sector_num =3D hbitmap_iter_next(&s->hbi); > - trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bit= map)); > - assert(s->sector_num >=3D 0); > - } > + sector_num =3D s->sector_num; > + sectors_per_chunk =3D s->granularity >> BDRV_SECTOR_BITS; > + end =3D s->bdev_length / BDRV_SECTOR_SIZE; > =20 > + next_sector =3D sector_num; > + next_chunk =3D sector_num / sectors_per_chunk; @next_sector and @next_chunk set here... > hbitmap_next_sector =3D s->sector_num; > - sector_num =3D s->sector_num; > - sectors_per_chunk =3D s->granularity >> BDRV_SECTOR_BITS; > - end =3D s->bdev_length / BDRV_SECTOR_SIZE; > =20 > /* Extend the QEMUIOVector to include all adjacent blocks that wil= l > * be copied in this operation. > @@ -198,14 +191,6 @@ static uint64_t coroutine_fn mirror_iteration(Mirr= orBlockJob *s) > next_sector =3D sector_num; > next_chunk =3D sector_num / sectors_per_chunk; =2E..and here already. So the above seems superfluous, considering that they are not read in between. (If you keep hbitmap_next_sector =3D s->sector_num; above the sector_num = =3D =2E.. block, that may reduce conflicts further) > =20 > - /* Wait for I/O to this cluster (from a previous iteration) to be = done. */ > - while (test_bit(next_chunk, s->in_flight_bitmap)) { > - trace_mirror_yield_in_flight(s, sector_num, s->in_flight); > - s->waiting_for_io =3D true; > - qemu_coroutine_yield(); > - s->waiting_for_io =3D false; > - } > - > do { > int added_sectors, added_chunks; > =20 > @@ -301,24 +286,132 @@ static uint64_t coroutine_fn mirror_iteration(Mi= rrorBlockJob *s) > s->sectors_in_flight +=3D nb_sectors; > trace_mirror_one_iteration(s, sector_num, nb_sectors); > =20 > - ret =3D bdrv_get_block_status_above(source, NULL, sector_num, > - nb_sectors, &pnum); > - if (ret < 0 || pnum < nb_sectors || > - (ret & BDRV_BLOCK_DATA && !(ret & BDRV_BLOCK_ZERO))) { > - bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors, > - mirror_read_complete, op); > - } else if (ret & BDRV_BLOCK_ZERO) { > + bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors, > + mirror_read_complete, op); > + return delay_ns; > +} > + > +static uint64_t mirror_do_zero_or_discard(MirrorBlockJob *s, > + int64_t sector_num, > + int nb_sectors, > + bool is_discard) > +{ > + int sectors_per_chunk, nb_chunks; > + int64_t next_chunk, next_sector, hbitmap_next_sector; > + uint64_t delay_ns =3D 0; > + MirrorOp *op; > + > + sectors_per_chunk =3D s->granularity >> BDRV_SECTOR_BITS; > + assert(nb_sectors >=3D sectors_per_chunk); > + next_chunk =3D sector_num / sectors_per_chunk; > + nb_chunks =3D DIV_ROUND_UP(nb_sectors, sectors_per_chunk); > + bitmap_set(s->in_flight_bitmap, next_chunk, nb_chunks); > + delay_ns =3D ratelimit_calculate_delay(&s->limit, nb_sectors); > + > + /* Allocate a MirrorOp that is used as an AIO callback. The qiov i= s zeroed > + * out so the freeing in iteration is nop. */ > + op =3D g_new0(MirrorOp, 1); > + op->s =3D s; > + op->sector_num =3D sector_num; > + op->nb_sectors =3D nb_sectors; > + > + /* Advance the HBitmapIter in parallel, so that we do not examine > + * the same sector twice. > + */ > + hbitmap_next_sector =3D sector_num; > + next_sector =3D sector_num + nb_sectors; > + while (next_sector > hbitmap_next_sector) { > + hbitmap_next_sector =3D hbitmap_iter_next(&s->hbi); > + if (hbitmap_next_sector < 0) { > + break; > + } > + } > + > + bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, nb_sectors); > + s->in_flight++; > + s->sectors_in_flight +=3D nb_sectors; > + if (is_discard) { > + bdrv_aio_discard(s->target, sector_num, op->nb_sectors, > + mirror_write_complete, op); > + } else { > bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors, > s->unmap ? BDRV_REQ_MAY_UNMAP : 0, > mirror_write_complete, op); > - } else { > - assert(!(ret & BDRV_BLOCK_DATA)); > - bdrv_aio_discard(s->target, sector_num, op->nb_sectors, > - mirror_write_complete, op); > } > + > return delay_ns; > } > =20 > +static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) > +{ > + BlockDriverState *source =3D s->common.bs; > + int sectors_per_chunk; > + int64_t sector_num, next_chunk; > + int ret; > + int contiguous_sectors =3D s->buf_size >> BDRV_SECTOR_BITS; > + enum MirrorMethod { > + MIRROR_METHOD_COPY, > + MIRROR_METHOD_ZERO, > + MIRROR_METHOD_DISCARD > + } mirror_method; > + > + s->sector_num =3D hbitmap_iter_next(&s->hbi); > + if (s->sector_num < 0) { > + bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); > + s->sector_num =3D hbitmap_iter_next(&s->hbi); > + trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bit= map)); > + assert(s->sector_num >=3D 0); > + } > + > + sector_num =3D s->sector_num; > + sectors_per_chunk =3D s->granularity >> BDRV_SECTOR_BITS; > + next_chunk =3D sector_num / sectors_per_chunk; > + > + /* Wait for I/O to this cluster (from a previous iteration) to be = done. */ > + while (test_bit(next_chunk, s->in_flight_bitmap)) { > + trace_mirror_yield_in_flight(s, sector_num, s->in_flight); > + s->waiting_for_io =3D true; > + qemu_coroutine_yield(); > + s->waiting_for_io =3D false; > + } > + > + mirror_method =3D MIRROR_METHOD_COPY; > + ret =3D bdrv_get_block_status_above(source, NULL, sector_num, > + contiguous_sectors, > + &contiguous_sectors); We did have NULL as the base before, but is that correct? Shouldn't we use backing_bs(source) or s->base if @sync is none or top? Examining this is fun, the short answer is probably "NULL is wrong, but it's the best we can do here". (1) with NULL here: $ ./qemu-img create -f qcow2 -b base.qcow2 top.qcow2 $ ./qemu-io -c 'write -P 42 0 64k' base.qcow2 $ ./qemu-img create -f qcow2 -o compat=3D0.10 -b base.qcow2 top.qcow2 # compat=3D0.10 is important because otherwise discard will create a # zero cluster instead of truly discarding $ (echo "{'execute':'qmp_capabilities'} {'execute':'drive-mirror','arguments': {'device':'drive0','target':'out.qcow2', 'format':'qcow2','sync':'top'}} {'execute':'human-monitor-command','arguments': {'command-line':'qemu-io drive0 \"discard 0 64k\"'}} {'execute':'block-job-complete','arguments': {'device':'drive0'}} {'execute':'quit'}") | \ x86_64-softmmu/qemu-system-x86_64 -qmp stdio \ -drive if=3Dnone,file=3Dtop.qcow2,id=3Ddrive0,discard=3Dunmap {"QMP": {"version": {"qemu": {"micro": 50, "minor": 4, "major": 2}, "package": ""}, "capabilities": []}} {"return": {}} Formatting 'out.qcow2', fmt=3Dqcow2 size=3D67108864 backing_file=3Dbase.q= cow2 backing_fmt=3Dqcow2 encryption=3Doff cluster_size=3D65536 lazy_refcounts=3D= off refcount_bits=3D16 {"return": {}} discard 65536/65536 bytes at offset 0 64 KiB, 1 ops; 0.0000 sec (3.815 GiB/sec and 62500.0000 ops/sec) {"return": ""} {"timestamp": {"seconds": 1446830775, "microseconds": 198114}, "event": "BLOCK_JOB_READY", "data": {"device": "drive0", "len": 0, "offset": 0, "speed": 0, "type": "mirror"}} {"return": {}} {"return": {}} {"timestamp": {"seconds": 1446830775, "microseconds": 198371}, "event": "SHUTDOWN"} {"timestamp": {"seconds": 1446830775, "microseconds": 205506}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "drive0", "len": 65536, "offset": 65536, "speed": 0, "type": "mirror"}} (Note that the discard must have been executed before BLOCK_JOB_READY, it's a bit racy) Now we check: $ ./qemu-io -c 'read -P 0 0 64k' out.qcow2 Pattern verification failed at offset 0, 65536 bytes read 65536/65536 bytes at offset 0 64 KiB, 1 ops; 0.0000 sec (856.164 MiB/sec and 13698.6301 ops/sec) $ ./qemu-io -c 'read -P 42 0 64k' out.qcow2 read 65536/65536 bytes at offset 0 64 KiB, 1 ops; 0.0000 sec (868.056 MiB/sec and 13888.8889 ops/sec) Well, that seems reasonable, considering that out.qcow2 has base.qcow2 as the backing file, but: $ ./qemu-img map out.qcow2 Offset Length Mapped to File 0 0x10000 0x50000 out.qcow2 Hm... Well, at least out.qcow2 and top.qcow2 return the same values when read, but we don't need to copy that from the backing file. Just leaving the cluster unallocated would have been enough. (2) So with s->base instead of NULL we get: $ ./qemu-io -c 'read -P 0 0 64k' out.qcow2 read 65536/65536 bytes at offset 0 64 KiB, 1 ops; 0.0000 sec (856.164 MiB/sec and 13698.6301 ops/sec) $ ./qemu-io -c 'read -P 42 0 64k' out.qcow2 Pattern verification failed at offset 0, 65536 bytes read 65536/65536 bytes at offset 0 64 KiB, 1 ops; 0.0000 sec (892.857 MiB/sec and 14285.7143 ops/sec) And: $ ./qemu-img map out.qcow2 Offset Length Mapped to File So the map output looks right, but now we don't read the backing file anymore. That's because for qcow2 v3, qemu writes zero clusters when asked to discard (that's why I had to use compat=3D0.10 for top.qcow2). Using a compat=3D0.10 image with @mode =3D existing fixes that. But well.= =2E. So the issue is: We don't have a reliable function to punch a hole into an overlay file. discard could do that (and does so for qcow2 v2), but we considered returning zeros the better option if possible because people might find that more reasonable. So while it would be best to copy holes in the overlay file created by discards to the mirrored file, there is no way for us to do that. Therefore, the best we can do is copy from the backing file. So, OK, NULL it is. > + > + contiguous_sectors -=3D contiguous_sectors % sectors_per_chunk; > + if (ret < 0 || contiguous_sectors < sectors_per_chunk) { > + contiguous_sectors =3D sectors_per_chunk; > + } else if (!(ret & BDRV_BLOCK_DATA)) { > + int64_t target_sector_num; > + int target_nb_sectors; > + bdrv_round_to_clusters(s->target, sector_num, contiguous_secto= rs, > + &target_sector_num, &target_nb_sectors)= ; > + if (target_sector_num =3D=3D sector_num && > + target_nb_sectors =3D=3D contiguous_sectors) { > + mirror_method =3D ret & BDRV_BLOCK_ZERO ? > + MIRROR_METHOD_ZERO : > + MIRROR_METHOD_DISCARD; > + } > + } > + > + switch (mirror_method) { > + case MIRROR_METHOD_COPY: > + return mirror_do_read(s); > + case MIRROR_METHOD_ZERO: > + return mirror_do_zero_or_discard(s, sector_num, > + contiguous_sectors, > + false); > + case MIRROR_METHOD_DISCARD: > + return mirror_do_zero_or_discard(s, sector_num, > + contiguous_sectors, > + false); s/false/true/? > + default: > + abort(); > + } > +} > + > static void mirror_free_init(MirrorBlockJob *s) > { > int granularity =3D s->granularity; >=20 --medb00G8TXlGFLEOFaCVtJSDdE7naLs8k Content-Type: application/pgp-signature; name="signature.asc" Content-Description: OpenPGP digital signature Content-Disposition: attachment; filename="signature.asc" -----BEGIN PGP SIGNATURE----- Version: GnuPG v2 iQEcBAEBCAAGBQJWPPMeAAoJEDuxQgLoOKytseUH/iTmToblFh5TgWHP7cYqmQ1g 7gh/iW8QYAYzt3oHZwkCcbQFudII3lqxtJwzV4XF2+YtguwdS4buOtdTqOUOwIXF Oto/ZadpxR0wyjcWXQS3cB1RabE1X3nWHg6iwYZFuK8IEYim/7zxmoL/uW9oA6z/ veILgcooQtrPGGYYtKRuPoQZqRKM1GSge0a5PdZku7Gv+9PGaS7Du7CE9Y1vUMu0 WUDNfOfAsc/RJrHpXNjSRVjILvvWXODMBMGuhzwSbh4IubhhweSVonLe/miAe5Zw h1PuV5a8CuAXTpNwnt2XGk/iyfpZQkZhno/saxdhgPQeGZYnZ/pYNmID3gbwHZ4= =cdiP -----END PGP SIGNATURE----- --medb00G8TXlGFLEOFaCVtJSDdE7naLs8k--