From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:35769) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1XNRnq-0005yb-SU for qemu-devel@nongnu.org; Fri, 29 Aug 2014 15:25:50 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1XNRnf-0008KL-Rd for qemu-devel@nongnu.org; Fri, 29 Aug 2014 15:25:38 -0400 Received: from mx1.redhat.com ([209.132.183.28]:8551) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1XNRnf-0008Jp-ES for qemu-devel@nongnu.org; Fri, 29 Aug 2014 15:25:31 -0400 Message-ID: <5400D39E.9060107@redhat.com> Date: Fri, 29 Aug 2014 21:25:18 +0200 From: Max Reitz MIME-Version: 1.0 References: <1409170706-24465-1-git-send-email-mreitz@redhat.com> <1409170706-24465-8-git-send-email-mreitz@redhat.com> <20140828160852.GA31509@nodalink.com> In-Reply-To: <20140828160852.GA31509@nodalink.com> Content-Type: text/plain; charset=windows-1252; format=flowed Content-Transfer-Encoding: quoted-printable Subject: Re: [Qemu-devel] [PATCH v4 07/10] qcow2: Rebuild refcount structure during check List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: =?windows-1252?Q?Beno=EEt_Canet?= Cc: Kevin Wolf , qemu-devel@nongnu.org, Stefan Hajnoczi On 28.08.2014 18:08, Beno=EEt Canet wrote: > On Wed, Aug 27, 2014 at 10:18:23PM +0200, Max Reitz wrote: >> The previous commit introduced the "rebuild" variable to qcow2's >> implementation of the image consistency check. Now make use of this by >> adding a function which creates a completely new refcount structure >> based solely on the in-memory information gathered before. >> >> The old refcount structure will be leaked, however. >> >> Signed-off-by: Max Reitz >> --- >> block/qcow2-refcount.c | 287 +++++++++++++++++++++++++++++++++++++++= +++++++++- >> 1 file changed, 284 insertions(+), 3 deletions(-) >> >> diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c >> index 6300cec..2dca6d4 100644 >> --- a/block/qcow2-refcount.c >> +++ b/block/qcow2-refcount.c >> @@ -1603,6 +1603,267 @@ static void compare_refcounts(BlockDriverState= *bs, BdrvCheckResult *res, >> } >> =20 >> /* >> + * Allocates a cluster using an in-memory refcount table (IMRT) in co= ntrast to >> + * the on-disk refcount structures. >> + * >> + * *first_free_cluster does not necessarily point to the first free c= luster, but >> + * may point to one cluster as close as possible before it. The offse= t returned >> + * will never be before that cluster. >> + * >> + * Note that *first_free_cluster is a cluster index whereas the retur= n value is >> + * an offset. >> + */ >> +static int64_t alloc_clusters_imrt(BlockDriverState *bs, >> + int cluster_count, >> + uint16_t **refcount_table, >> + int64_t *nb_clusters, >> + int64_t *first_free_cluster) >> +{ >> + BDRVQcowState *s =3D bs->opaque; >> + int64_t cluster =3D *first_free_cluster, i; >> + bool first_gap =3D true; >> + int contiguous_free_clusters; >> + >> + /* Starting at *first_free_cluster, find a range of at least clus= ter_count >> + * continuously free clusters */ >> + for (contiguous_free_clusters =3D 0; >> + cluster < *nb_clusters && contiguous_free_clusters < cluster= _count; >> + cluster++) >> + { >> + if (!(*refcount_table)[cluster]) { >> + contiguous_free_clusters++; >> + if (first_gap) { >> + /* If this is the first free cluster found, update >> + * *first_free_cluster accordingly */ >> + *first_free_cluster =3D cluster; >> + first_gap =3D false; >> + } >> + } else if (contiguous_free_clusters) { >> + contiguous_free_clusters =3D 0; >> + } >> + } >> + >> + /* If contiguous_free_clusters is greater than zero, it contains = the number >> + * of continuously free clusters until the current cluster; the f= irst free >> + * cluster in the current "gap" is therefore >> + * cluster - contiguous_free_clusters */ >> + >> + /* If no such range could be found, grow the in-memory refcount t= able >> + * accordingly to append free clusters at the end of the image */ >> + if (contiguous_free_clusters < cluster_count) { >> + int64_t old_nb_clusters =3D *nb_clusters; >> + >> + /* There already is a gap of contiguous_free_clusters; we nee= d >> + * cluster_count clusters; therefore, we have to allocate >> + * cluster_count - contiguous_free_clusters new clusters at t= he end of >> + * the image (which is the current value of cluster; note tha= t cluster >> + * may exceed old_nb_clusters if *first_free_cluster pointed = beyond the >> + * image end) */ >> + *nb_clusters =3D cluster + cluster_count - contiguous_free_cl= usters; >> + *refcount_table =3D g_try_realloc(*refcount_table, >> + *nb_clusters * sizeof(uint16_= t)); >> + if (!*refcount_table) { >> + return -ENOMEM; >> + } >> + >> + memset(*refcount_table + old_nb_clusters, 0, >> + (*nb_clusters - old_nb_clusters) * sizeof(uint16_t)); >> + } >> + >> + /* Go back to the first free cluster */ >> + cluster -=3D contiguous_free_clusters; >> + for (i =3D 0; i < cluster_count; i++) { >> + (*refcount_table)[cluster + i] =3D 1; >> + } >> + >> + return cluster << s->cluster_bits; >> +} >> + >> +/* >> + * Creates a new refcount structure based solely on the in-memory inf= ormation >> + * given through *refcount_table. All necessary allocations will be r= eflected >> + * in that array. >> + * >> + * On success, the old refcount structure is leaked (it will be cover= ed by the >> + * new refcount structure). >> + */ >> +static int rebuild_refcount_structure(BlockDriverState *bs, >> + BdrvCheckResult *res, >> + uint16_t **refcount_table, >> + int64_t *nb_clusters) >> +{ >> + BDRVQcowState *s =3D bs->opaque; >> + int64_t first_free_cluster =3D 0, rt_ofs =3D -1, cluster =3D 0; >> + int64_t rb_ofs, rb_start, rb_index; >> + uint32_t reftable_size =3D 0; >> + uint64_t *reftable =3D NULL; >> + uint16_t *on_disk_rb; >> + uint8_t rt_offset_and_clusters[sizeof(uint64_t) + sizeof(uint32_t= )]; > Could this be a small struct ? Yes, it most certainly could, but considering there are other places=20 which use this form already (block/qcow2-refcount.c, line 406 (on=20 8b30301) or block/qcow2-cluster.c, line 40), I think it's not worth it. >> + int i, ret =3D 0; >> + >> + qcow2_cache_empty(bs, s->refcount_block_cache); >> + >> +write_refblocks: >> + for (; cluster < *nb_clusters; cluster++) { >> + if (!(*refcount_table)[cluster]) { >> + continue; >> + } >> + >> + rb_index =3D cluster >> (s->cluster_bits - 1); >> + rb_start =3D rb_index << (s->cluster_bits - 1); > Do we have an hardcoded hidden relationship to s->refcount_order / size= of(uint16_t) here ? You're right, we have. I guess I'll just add a new field to=20 BDRVQcowState named "refblock_shift" or something like that. >> + >> + /* Don't allocate a cluster in a refblock already written to = disk */ >> + if (first_free_cluster < rb_start) { >> + first_free_cluster =3D rb_start; >> + } >> + rb_ofs =3D alloc_clusters_imrt(bs, 1, refcount_table, nb_clus= ters, >> + &first_free_cluster); >> + if (rb_ofs < 0) { >> + fprintf(stderr, "ERROR allocating refblock: %s\n", strerr= or(-ret)); >> + res->check_errors++; >> + ret =3D rb_ofs; >> + goto fail; >> + } >> + >> + if (reftable_size <=3D rb_index) { >> + uint32_t old_rt_size =3D reftable_size; >> + reftable_size =3D ROUND_UP((rb_index + 1) * sizeof(uint64= _t), >> + s->cluster_size) / sizeof(uint64= _t); >> + reftable =3D g_try_realloc(reftable, >> + reftable_size * sizeof(uint64_t)= ); >> + if (!reftable) { >> + res->check_errors++; >> + ret =3D -ENOMEM; >> + goto fail; >> + } >> + >> + memset(reftable + old_rt_size, 0, >> + (reftable_size - old_rt_size) * sizeof(uint64_t)); >> + >> + /* The offset we have for the reftable is now no longer v= alid; >> + * this will leak that range, but we can easily fix that = by running >> + * a leak-fixing check after this rebuild operation */ >> + rt_ofs =3D -1; >> + } >> + reftable[rb_index] =3D rb_ofs; >> + >> + /* If this is apparently the last refblock (for now), try to = squeeze the >> + * reftable in; 1 << (s->cluster_bits - MAX(0, s->refcount_or= der - 3)) >> + * is the number of entries per refblock (1 << s->refcount_or= der is the >> + * number of bits per refblock entry, but we need the number = of bytes); >> + * therefore, (*nb_clusters - 1) >> ... is the index of the l= ast >> + * refblock */ >> + if (rb_index =3D=3D (*nb_clusters - 1) >> >> + (s->cluster_bits - MAX(0, s->refcount_order - 3)) && rt_o= fs < 0) >> + { >> + rt_ofs =3D alloc_clusters_imrt(bs, size_to_clusters(s, re= ftable_size * >> + sizeof(= uint64_t)), >> + refcount_table, nb_clusters, >> + &first_free_cluster); >> + if (rt_ofs < 0) { >> + fprintf(stderr, "ERROR allocating reftable: %s\n", >> + strerror(-ret)); >> + res->check_errors++; >> + ret =3D rt_ofs; >> + goto fail; >> + } >> + } >> + >> + ret =3D qcow2_pre_write_overlap_check(bs, 0, rb_ofs, s->clust= er_size); >> + if (ret < 0) { >> + fprintf(stderr, "ERROR writing refblock: %s\n", strerror(= -ret)); >> + goto fail; >> + } >> + > >> + on_disk_rb =3D g_malloc0(s->cluster_size); > Do we want g_try_malloc0 here ? There is no real reason not to, but there is also no real reason to.=20 s->cluster_size should be reasonably small so this should never fail. On=20 the other hand, if I'm respinning this anyway, I might as well. >> + for (i =3D 0; i < s->cluster_size / sizeof(uint16_t) && >> + rb_start + i < *nb_clusters; i++) >> + { >> + on_disk_rb[i] =3D cpu_to_be16((*refcount_table)[rb_start = + i]); >> + } >> + >> + ret =3D bdrv_write(bs->file, rb_ofs / BDRV_SECTOR_SIZE, >> + (void *)on_disk_rb, s->cluster_sectors); >> + g_free(on_disk_rb); >> + if (ret < 0) { >> + fprintf(stderr, "ERROR writing refblock: %s\n", strerror(= -ret)); >> + goto fail; >> + } >> + >> + /* Go to the end of this refblock */ >> + cluster =3D rb_start + s->cluster_size / sizeof(uint16_t) - 1= ; >> + } >> + >> + if (rt_ofs < 0) { >> + int64_t post_rb_start =3D ROUND_UP(*nb_clusters, >> + s->cluster_size / sizeof(uin= t16_t)); >> + >> + /* Not pretty but simple */ >> + if (first_free_cluster < post_rb_start) { >> + first_free_cluster =3D post_rb_start; >> + } >> + rt_ofs =3D alloc_clusters_imrt(bs, size_to_clusters(s, reftab= le_size * >> + sizeof(uint= 64_t)), >> + refcount_table, nb_clusters, >> + &first_free_cluster); >> + if (rt_ofs < 0) { >> + fprintf(stderr, "ERROR allocating reftable: %s\n", strerr= or(-ret)); >> + res->check_errors++; >> + ret =3D rt_ofs; >> + goto fail; >> + } >> + >> + goto write_refblocks; >> + } >> + >> + assert(reftable); >> + >> + for (rb_index =3D 0; rb_index < reftable_size; rb_index++) { >> + cpu_to_be64s(&reftable[rb_index]); >> + } >> + >> + ret =3D qcow2_pre_write_overlap_check(bs, 0, rt_ofs, >> + reftable_size * sizeof(uint64= _t)); >> + if (ret < 0) { >> + fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret= )); >> + goto fail; >> + } >> + >> + ret =3D bdrv_write(bs->file, rt_ofs / BDRV_SECTOR_SIZE, (void *)r= eftable, >> + reftable_size * sizeof(uint64_t) / BDRV_SECTOR_S= IZE); >> + if (ret < 0) { >> + fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret= )); >> + goto fail; >> + } >> + >> + /* Enter new reftable into the image header */ >> + cpu_to_be64w((uint64_t *)&rt_offset_and_clusters[0], rt_ofs); >> + cpu_to_be32w((uint32_t *)&rt_offset_and_clusters[sizeof(uint64_t)= ], >> + size_to_clusters(s, reftable_size * sizeof(uint64_t)= )); > Manipulating an utility struct would problably some casts. Hm... Well, I probably shouldn't use pre-existing bad code as an excuse.=20 Okay, I'll use an anonymous struct. >> + ret =3D bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, >> + refcount_table_offset), >> + rt_offset_and_clusters, >> + sizeof(rt_offset_and_clusters)); >> + if (ret < 0) { >> + fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret= )); >> + goto fail; >> + } >> + >> + for (rb_index =3D 0; rb_index < reftable_size; rb_index++) { >> + be64_to_cpus(&reftable[rb_index]); >> + } >> + s->refcount_table =3D reftable; >> + s->refcount_table_offset =3D rt_ofs; >> + s->refcount_table_size =3D reftable_size; >> + >> + return 0; >> + >> +fail: >> + g_free(reftable); >> + return ret; >> +} >> + >> +/* >> * Checks an image for refcount consistency. >> * >> * Returns 0 if no errors are found, the number of errors in case th= e image is >> @@ -1612,6 +1873,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, = BdrvCheckResult *res, >> BdrvCheckMode fix) >> { >> BDRVQcowState *s =3D bs->opaque; >> + BdrvCheckResult pre_compare_res; >> int64_t size, highest_cluster, nb_clusters; >> uint16_t *refcount_table =3D NULL; >> bool rebuild =3D false; >> @@ -1638,11 +1900,30 @@ int qcow2_check_refcounts(BlockDriverState *bs= , BdrvCheckResult *res, >> goto fail; >> } >> =20 >> - compare_refcounts(bs, res, fix, &rebuild, &highest_cluster, refco= unt_table, >> + /* In case we don't need to rebuild the refcount structure (but w= ant to fix >> + * something), this function is immediately called again, in whic= h case the >> + * result should be ignored */ >> + pre_compare_res =3D *res; >> + compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcoun= t_table, >> nb_clusters); >> =20 >> - if (rebuild) { >> - fprintf(stderr, "ERROR need to rebuild refcount structures\n"= ); >> + if (rebuild && (fix & BDRV_FIX_ERRORS)) { >> + fprintf(stderr, "Rebuilding refcount structure\n"); >> + ret =3D rebuild_refcount_structure(bs, res, &refcount_table, >> + &nb_clusters); >> + if (ret < 0) { >> + goto fail; >> + } >> + } else if (fix) { >> + if (rebuild) { >> + fprintf(stderr, "ERROR need to rebuild refcount structure= s\n"); >> + } >> + >> + if (res->leaks || res->corruptions) { >> + *res =3D pre_compare_res; >> + compare_refcounts(bs, res, fix, &rebuild, &highest_cluste= r, >> + refcount_table, nb_clusters); >> + } >> } >> =20 >> /* check OFLAG_COPIED */ >> --=20 >> 2.1.0 >> > I am nitpicking because I have trouble really understanding the code ;) Nitpicking is far better than a Reviewed-by without much meaning behind=20 it, so I'm glad about it. :-) Max