From: Max Reitz
Date: Mon, 16 Nov 2015 20:32:42 +0100
Subject: Re: [Qemu-devel] [PATCH v4] mirror: Rewrite mirror_iteration
To: Fam Zheng, qemu-devel@nongnu.org
Cc: Kevin Wolf, pbonzini@redhat.com, Jeff Cody, qemu-block@nongnu.org
Message-ID: <564A2F5A.2000801@redhat.com>
In-Reply-To: <1447299842-21954-1-git-send-email-famz@redhat.com>

On 12.11.2015 04:44, Fam Zheng wrote:
> The "pnum < nb_sectors" condition in deciding whether to actually copy
> data is unnecessarily strict, and the qiov initialization is
> unnecessarily for bdrv_aio_write_zeroes and bdrv_aio_discard.
>
> Rewrite mirror_iteration to fix both flaws.
>
> Signed-off-by: Fam Zheng
> ---
>  block/mirror.c | 198 ++++++++++++++++++++++++++++++++++++++------------------
>  1 file changed, 134 insertions(+), 64 deletions(-)

Some rather long comments below, but I still like this patch very much.
mirror_iteration() makes much more sense after this rewrite. Thanks!

> diff --git a/block/mirror.c b/block/mirror.c
> index b1252a1..5f22c65 100644
> --- a/block/mirror.c
> +++ b/block/mirror.c
> @@ -45,7 +45,6 @@ typedef struct MirrorBlockJob {
>      BlockdevOnError on_source_error, on_target_error;
>      bool synced;
>      bool should_complete;
> -    int64_t sector_num;
>      int64_t granularity;
>      size_t buf_size;
>      int64_t bdev_length;
> @@ -157,28 +156,16 @@ static void mirror_read_complete(void *opaque, int ret)
>                      mirror_write_complete, op);
>  }
>
> -static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
> +static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num,
> +                          int max_sectors)

I don't know if I like this parameter's name, because we actually copy
this much, or maybe even more (if sector_num is unaligned, but more
about that below). We never copy less, as far as I can see.
> {
>      BlockDriverState *source = s->common.bs;
> -    int nb_sectors, sectors_per_chunk, nb_chunks;
> -    int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector;
> -    uint64_t delay_ns = 0;
> +    int sectors_per_chunk, nb_chunks;
> +    int64_t next_chunk, next_sector;
> +    int nb_sectors;
>      MirrorOp *op;
> -    int pnum;
> -    int64_t ret;
>
> -    s->sector_num = hbitmap_iter_next(&s->hbi);
> -    if (s->sector_num < 0) {
> -        bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
> -        s->sector_num = hbitmap_iter_next(&s->hbi);
> -        trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
> -        assert(s->sector_num >= 0);
> -    }
> -
> -    hbitmap_next_sector = s->sector_num;
> -    sector_num = s->sector_num;
>      sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
> -    end = s->bdev_length / BDRV_SECTOR_SIZE;
>
>      /* Extend the QEMUIOVector to include all adjacent blocks that will
>       * be copied in this operation.
> @@ -198,23 +185,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
>      next_sector = sector_num;
>      next_chunk = sector_num / sectors_per_chunk;
>
> -    /* Wait for I/O to this cluster (from a previous iteration) to be done. */
> -    while (test_bit(next_chunk, s->in_flight_bitmap)) {
> -        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
> -        s->waiting_for_io = true;
> -        qemu_coroutine_yield();
> -        s->waiting_for_io = false;
> -    }
> -
>      do {
>          int added_sectors, added_chunks;
>
> -        if (!bdrv_get_dirty(source, s->dirty_bitmap, next_sector) ||
> -            test_bit(next_chunk, s->in_flight_bitmap)) {
> -            assert(nb_sectors > 0);
> -            break;
> -        }
> -
>          added_sectors = sectors_per_chunk;

(5)

>          if (s->cow_bitmap && !test_bit(next_chunk, s->cow_bitmap)) {
>              bdrv_round_to_clusters(s->target,
> @@ -226,12 +199,15 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
>               */
>              if (next_sector < sector_num) {

This can happen only if there are fewer sectors per chunk than there
are sectors per cluster, right? Because this function will only be
called with sector_num rounded to chunks, as far as I can see.

But if that is the case, then (5) sets added_sectors to a value not
aligned to clusters, and (6) may retain that value. Therefore, the
block (7) adjusts all offsets/indices by a value not aligned to
clusters.

Therefore, I think we only allow chunk sizes which are aligned to the
cluster size, and that should make this conditional block here
unnecessary.
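If we want to rely on that, we could state the assumption explicitly
when the job is set up. Just a rough, untested sketch of what I mean
(not part of this patch; bdi would come from bdrv_get_info() on the
target somewhere in mirror_start_job(), where both target and
granularity are available):

    BlockDriverInfo bdi;

    /* Sketch only: the invariant assumed above -- every chunk covers
     * whole target clusters, so bdrv_round_to_clusters() can never move
     * sector_num backwards in mirror_do_read(). */
    if (bdrv_get_info(target, &bdi) >= 0 && bdi.cluster_size > 0) {
        assert(granularity % bdi.cluster_size == 0);
    }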
>                  assert(nb_sectors == 0);
> +                /* We have to make sure [sector_num, sector_num + max_sectors)
> +                 * covers the original range. */
> +                max_sectors += sector_num - next_sector;
>                  sector_num = next_sector;
>                  next_chunk = next_sector / sectors_per_chunk;
>              }
>          }
>
> -        added_sectors = MIN(added_sectors, end - (sector_num + nb_sectors));
> +        added_sectors = MIN(added_sectors, max_sectors);

(6)

>          added_chunks = (added_sectors + sectors_per_chunk - 1) / sectors_per_chunk;
>
>          /* When doing COW, it may happen that there is not enough space for
> @@ -252,18 +228,13 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
>              break;
>          }
>
> -        /* We have enough free space to copy these sectors.  */
> -        bitmap_set(s->in_flight_bitmap, next_chunk, added_chunks);
> -
>          nb_sectors += added_sectors;
>          nb_chunks += added_chunks;
>          next_sector += added_sectors;
>          next_chunk += added_chunks;

(7)

> -        if (!s->synced && s->common.speed) {
> -            delay_ns = ratelimit_calculate_delay(&s->limit, added_sectors);
> -        }
> -    } while (delay_ns == 0 && next_sector < end);
> +    } while (next_sector < sector_num + max_sectors);

All in all, what is this loop used for now anyway? Can't we just pull
the COW waiting loops out and drop it? (I.e. just wait until we have
enough free space to copy all max_sectors.)
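Something like the following is roughly what I have in mind (untested
sketch; it glosses over the interaction with the COW rounding and just
reuses the existing buf_free_count wait):

    /* Size the whole request up front and wait for enough free buffers
     * once, instead of growing the request chunk by chunk in the
     * do/while loop. */
    nb_sectors = max_sectors;
    nb_chunks = DIV_ROUND_UP(nb_sectors, sectors_per_chunk);
    while (s->buf_free_count < nb_chunks) {
        s->waiting_for_io = true;
        qemu_coroutine_yield();
        s->waiting_for_io = false;
    }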
>
> +    assert(nb_sectors);
>      /* Allocate a MirrorOp that is used as an AIO callback. */
>      op = g_new(MirrorOp, 1);
>      op->s = s;
> @@ -274,7 +245,6 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
>       * from s->buf_free.
>       */
>      qemu_iovec_init(&op->qiov, nb_chunks);
> -    next_sector = sector_num;
>      while (nb_chunks-- > 0) {
>          MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
>          size_t remaining = (nb_sectors * BDRV_SECTOR_SIZE) - op->qiov.size;
> @@ -282,39 +252,139 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
>          QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
>          s->buf_free_count--;
>          qemu_iovec_add(&op->qiov, buf, MIN(s->granularity, remaining));
> -
> -        /* Advance the HBitmapIter in parallel, so that we do not examine
> -         * the same sector twice.
> -         */
> -        if (next_sector > hbitmap_next_sector
> -            && bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) {
> -            hbitmap_next_sector = hbitmap_iter_next(&s->hbi);
> -        }
> -
> -        next_sector += sectors_per_chunk;
>      }
>
> -    bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, nb_sectors);
> -
>      /* Copy the dirty cluster. */
>      s->in_flight++;
>      s->sectors_in_flight += nb_sectors;
>      trace_mirror_one_iteration(s, sector_num, nb_sectors);
>
> -    ret = bdrv_get_block_status_above(source, NULL, sector_num,
> -                                      nb_sectors, &pnum);
> -    if (ret < 0 || pnum < nb_sectors ||
> -        (ret & BDRV_BLOCK_DATA && !(ret & BDRV_BLOCK_ZERO))) {
> -        bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
> -                       mirror_read_complete, op);
> -    } else if (ret & BDRV_BLOCK_ZERO) {
> +    bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
> +                   mirror_read_complete, op);
> +    return nb_sectors;
> +}
> +
> +static void mirror_do_zero_or_discard(MirrorBlockJob *s,
> +                                      int64_t sector_num,
> +                                      int nb_sectors,
> +                                      bool is_discard)
> +{
> +    MirrorOp *op;
> +
> +    /* Allocate a MirrorOp that is used as an AIO callback. The qiov is zeroed
> +     * so the freeing in mirror_iteration_done is nop. */
> +    op = g_new0(MirrorOp, 1);
> +    op->s = s;
> +    op->sector_num = sector_num;
> +    op->nb_sectors = nb_sectors;
> +
> +    s->in_flight++;
> +    s->sectors_in_flight += nb_sectors;
> +    if (is_discard) {
> +        bdrv_aio_discard(s->target, sector_num, op->nb_sectors,
> +                         mirror_write_complete, op);
> +    } else {
>          bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors,
>                                s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
>                                mirror_write_complete, op);
> -    } else {
> -        assert(!(ret & BDRV_BLOCK_DATA));
> -        bdrv_aio_discard(s->target, sector_num, op->nb_sectors,
> -                         mirror_write_complete, op);
> +    }
> +}
> +
> +static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
> +{
> +    BlockDriverState *source = s->common.bs;
> +    int64_t sector_num;
> +    uint64_t delay_ns = 0;
> +    int nb_chunks = 0;
> +    int64_t end = s->bdev_length / BDRV_SECTOR_SIZE;
> +    int sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
> +
> +    sector_num = hbitmap_iter_next(&s->hbi);

(1)

> +    if (sector_num < 0) {
> +        bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
> +        sector_num = hbitmap_iter_next(&s->hbi);
> +        trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
> +        assert(sector_num >= 0);
> +    }
> +
> +
> +    while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) {
> +        int64_t next_sector = sector_num + nb_chunks * sectors_per_chunk;

I'm trying to wrap my head around what this loop does. I think I'd like
a comment with an explanation above it.

nb_chunks counts dirty chunks (4). However, the next_sector calculated
here is not necessarily related to the value returned by
hbitmap_iter_next(), so I don't know what this value is supposed to be.

Imagine the following dirty bitmap:

#---#--#

(# is dirty, - is clean)

Now, we call hbitmap_iter_next(&s->hbi) above (1). We are now here:

#---#--#
^

Therefore, sector_num is 0 and nb_chunks is 0, too (let's assume one
chunk is one sector in size).

Then, we call hbitmap_iter_next(&s->hbi) below at (3):

#---#--#
    ^

Now, nb_chunks is 1, and next_sector is consequently 1, too. That
sector/cluster is not dirty, so bdrv_get_dirty() at (2) will return
false and we will quit this loop.

So I guess this loop is trying to find consecutive dirty chunks? But
because of (3), won't we skip the first non-consecutive chunk in the
next iteration? So in the example above, the next time we do (1), won't
our state be:

----#--#
       ^

(Assuming we have cleaned the first dirty chunk.)

So we would have skipped the middle chunk. We will reset the iterator
later (if there are still dirty chunks), but it still isn't so nice.
Especially if we have something like ###-####.

Maybe initializing nb_chunks to 1 would help. We do know that the first
chunk is dirty, after all, so we don't have to check it.

> +        int64_t next_chunk = next_sector / sectors_per_chunk;
> +        if (!bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) {

(2)

Also: bdrv_get_dirty()/hbitmap_get() doesn't seem to do a bounds check.
I think we should make sure that next_sector is not beyond the end of
the BDS.

> +            break;
> +        }
> +        /* Wait for I/O to this cluster (from a previous iteration) to be
> +         * done.*/
> +        while (test_bit(next_chunk, s->in_flight_bitmap)) {
> +            trace_mirror_yield_in_flight(s, next_sector, s->in_flight);
> +            s->waiting_for_io = true;
> +            qemu_coroutine_yield();
> +            s->waiting_for_io = false;
> +        }
> +
> +        /* Advance the HBitmapIter in parallel, so that we do not examine
> +         * the same sector twice.
> +         */
> +        hbitmap_iter_next(&s->hbi);

(3): Here, s->hbi is brought to the next dirty cluster.

> +        nb_chunks++;

(4): nb_chunks now "counts" that dirty cluster.

And, by the way, hbitmap_iter_next() may return -1, in which case there
is no dirty chunk there, so nb_chunks should not be incremented. But
that would probably be fixed by initializing nb_chunks to 1 and adding
the range check at (2).
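Roughly like this, I mean (untested sketch, reusing the names from the
patch; the in-flight wait would stay where it is):

    /* sector_num itself is known to be dirty (it just came from the
     * iterator), so count it right away and only examine the chunks
     * after it. */
    nb_chunks = 1;
    while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) {
        int64_t next_sector = sector_num + nb_chunks * sectors_per_chunk;
        if (next_sector >= end ||
            !bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) {
            break;
        }
        /* ... wait for in-flight I/O to this chunk here, as above ... */
        hbitmap_iter_next(&s->hbi);
        nb_chunks++;
    }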
> +    }
> +    assert(nb_chunks);
> +
> +    /* Clear dirty bits before querying the block status, because
> +     * calling bdrv_reset_dirty_bitmap could yield - if some blocks are marked
> +     * dirty in this window, we need to know.
> +     */
> +    bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num,
> +                            nb_chunks * sectors_per_chunk);
> +    bitmap_set(s->in_flight_bitmap, sector_num / sectors_per_chunk, nb_chunks);
> +    while (nb_chunks > 0 && sector_num < end) {
> +        int io_sectors;
> +        enum MirrorMethod {
> +            MIRROR_METHOD_COPY,
> +            MIRROR_METHOD_ZERO,
> +            MIRROR_METHOD_DISCARD
> +        } mirror_method = MIRROR_METHOD_COPY;
> +        int ret = bdrv_get_block_status_above(source, NULL, sector_num,
> +                                              nb_chunks * sectors_per_chunk,
> +                                              &io_sectors);
> +        if (ret < 0) {
> +            io_sectors = nb_chunks * sectors_per_chunk;
> +        }
> +
> +        io_sectors -= io_sectors % sectors_per_chunk;
> +        if (io_sectors < sectors_per_chunk) {
> +            io_sectors = sectors_per_chunk;
> +        } else if (!(ret & BDRV_BLOCK_DATA)) {

If ret < 0, this may still evaluate to true. But we may only take this
path if ret >= 0.

Max

> +            int64_t target_sector_num;
> +            int target_nb_sectors;
> +            bdrv_round_to_clusters(s->target, sector_num, io_sectors,
> +                                   &target_sector_num, &target_nb_sectors);
> +            if (target_sector_num == sector_num &&
> +                target_nb_sectors == io_sectors) {
> +                mirror_method = ret & BDRV_BLOCK_ZERO ?
> +                                    MIRROR_METHOD_ZERO :
> +                                    MIRROR_METHOD_DISCARD;
> +            }
> +        }
> +
> +        switch (mirror_method) {
> +        case MIRROR_METHOD_COPY:
> +            io_sectors = mirror_do_read(s, sector_num, io_sectors);
> +            break;
> +        case MIRROR_METHOD_ZERO:
> +            mirror_do_zero_or_discard(s, sector_num, io_sectors, false);
> +            break;
> +        case MIRROR_METHOD_DISCARD:
> +            mirror_do_zero_or_discard(s, sector_num, io_sectors, true);
> +            break;
> +        default:
> +            abort();
> +        }
> +        assert(io_sectors);
> +        sector_num += io_sectors;
> +        nb_chunks -= io_sectors / sectors_per_chunk;
> +        delay_ns += ratelimit_calculate_delay(&s->limit, io_sectors);
>      }
>      return delay_ns;
>  }
>
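PS: Regarding the BDRV_BLOCK_DATA comment above, what I mean is simply
guarding that branch on a successful bdrv_get_block_status_above() call,
e.g.:

    } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) {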