From: Max Reitz <mreitz@redhat.com>
To: Kevin Wolf <kwolf@redhat.com>
Cc: qemu-devel@nongnu.org, Stefan Hajnoczi <stefanha@redhat.com>
Subject: Re: [Qemu-devel] [PATCH v4 2/5] qcow2-cluster: Expand zero clusters
Date: Tue, 03 Sep 2013 09:11:40 +0200 [thread overview]
Message-ID: <52258BAC.3000606@redhat.com> (raw)
In-Reply-To: <20130902151341.GD3423@dhcp-200-207.str.redhat.com>
Am 02.09.2013 17:13, schrieb Kevin Wolf:
> Am 02.09.2013 um 12:04 hat Max Reitz geschrieben:
>> Add functionality for expanding zero clusters. This is necessary for
>> downgrading the image version to one without zero cluster support.
>>
>> For non-backed images, this function may also just discard zero clusters
>> instead of truly expanding them.
>>
>> Signed-off-by: Max Reitz <mreitz@redhat.com>
>> ---
>> block/qcow2-cluster.c | 228 +++++++++++++++++++++++++++++++++++++++++++++++++
>> block/qcow2-refcount.c | 29 ++++---
>> block/qcow2.h | 5 ++
>> 3 files changed, 248 insertions(+), 14 deletions(-)
>>
>> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
>> index 2d5aa92..c90fb51 100644
>> --- a/block/qcow2-cluster.c
>> +++ b/block/qcow2-cluster.c
>> @@ -1497,3 +1497,231 @@ fail:
>>
>> return ret;
>> }
>> +
>> +/*
>> + * Expands all zero clusters in a specific L1 table (or deallocates them, for
>> + * non-backed non-pre-allocated zero clusters).
>> + *
>> + * expanded_clusters is a bitmap where every bit corresponds to one cluster in
>> + * the image file; a bit gets set if the corresponding cluster has been used for
>> + * zero expansion (i.e., has been filled with zeroes and is referenced from an
>> + * L2 table). nb_clusters contains the total cluster count of the image file,
>> + * i.e., the number of bits in expanded_clusters.
>> + */
>> +static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
>> + int l1_size, uint8_t *expanded_clusters,
>> + uint64_t nb_clusters)
>> +{
>> + BDRVQcowState *s = bs->opaque;
>> + bool is_active_l1 = (l1_table == s->l1_table);
>> + uint64_t *l2_table = NULL;
>> + int ret;
>> + int i, j;
>> +
>> + if (!is_active_l1) {
>> + /* inactive L2 tables require a buffer to be stored in when loading
>> + * them from disk */
>> + l2_table = qemu_blockalign(bs, s->cluster_size);
>> + }
>> +
>> + for (i = 0; i < l1_size; i++) {
>> + uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
>> + bool l2_dirty = false;
>> +
>> + if (!l2_offset) {
>> + /* unallocated */
>> + continue;
>> + }
>> +
>> + if (is_active_l1) {
>> + /* get active L2 tables from cache */
>> + ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
>> + (void **)&l2_table);
>> + } else {
>> + /* load inactive L2 tables from disk */
>> + ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE,
>> + (void *)l2_table, s->cluster_sectors);
>> + }
>> + if (ret < 0) {
>> + goto fail;
>> + }
>> +
>> + for (j = 0; j < s->l2_size; j++) {
>> + uint64_t l2_entry = be64_to_cpu(l2_table[j]);
>> + int64_t offset = l2_entry & L2E_OFFSET_MASK, cluster_index;
>> + int cluster_type = qcow2_get_cluster_type(l2_entry);
>> +
>> + if (cluster_type == QCOW2_CLUSTER_NORMAL) {
>> + cluster_index = offset >> s->cluster_bits;
>> + assert((cluster_index >= 0) && (cluster_index < nb_clusters));
>> + if (expanded_clusters[cluster_index / 8] &
>> + (1 << (cluster_index % 8))) {
>> + /* Probably a shared L2 table; this cluster was a zero
>> + * cluster which has been expanded, its refcount
>> + * therefore most likely requires an update. */
>> + ret = qcow2_update_cluster_refcount(bs, cluster_index, 1,
>> + QCOW2_DISCARD_NEVER);
>> + if (ret < 0) {
>> + goto fail;
>> + }
>> + /* Since we just increased the refcount, the COPIED flag may
>> + * no longer be set. */
>> + l2_table[j] = cpu_to_be64(l2_entry & ~QCOW_OFLAG_COPIED);
>> + l2_dirty = true;
>> + }
>> + continue;
>> + }
>> + else if (qcow2_get_cluster_type(l2_entry) != QCOW2_CLUSTER_ZERO) {
>> + continue;
>> + }
>> +
>> + if (!offset) {
>> + /* not preallocated */
>> + if (!bs->backing_hd) {
>> + /* not backed; therefore we can simply deallocate the
>> + * cluster */
>> + l2_table[j] = 0;
>> + l2_dirty = true;
>> + continue;
>> + }
>> +
>> + offset = qcow2_alloc_clusters(bs, s->cluster_size);
>> + if (offset < 0) {
>> + ret = offset;
>> + goto fail;
>> + }
>> + }
>> +
>> + ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT,
>> + offset, s->cluster_size);
>> + if (ret < 0) {
>> + qcow2_free_clusters(bs, offset, s->cluster_size,
>> + QCOW2_DISCARD_ALWAYS);
>> + goto fail;
>> + }
>> +
>> + ret = bdrv_write_zeroes(bs->file, offset / BDRV_SECTOR_SIZE,
>> + s->cluster_sectors);
>> + if (ret < 0) {
>> + qcow2_free_clusters(bs, offset, s->cluster_size,
>> + QCOW2_DISCARD_ALWAYS);
>> + goto fail;
>> + }
>> +
>> + l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
>> + l2_dirty = true;
>> +
>> + cluster_index = offset >> s->cluster_bits;
>> + assert((cluster_index >= 0) && (cluster_index < nb_clusters));
>> + expanded_clusters[cluster_index / 8] |= 1 << (cluster_index % 8);
>> + }
>> +
>> + if (is_active_l1) {
>> + if (l2_dirty) {
>> + qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
>> + qcow2_cache_depends_on_flush(s->l2_table_cache);
>> + }
>> + ret = qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
>> + if (ret < 0) {
>> + l2_table = NULL;
>> + goto fail;
>> + }
>> + } else {
>> + if (l2_dirty) {
>> + ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT &
>> + ~(QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2), l2_offset,
>> + s->cluster_size);
>> + if (ret < 0) {
>> + goto fail;
>> + }
>> +
>> + ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE,
>> + (void *)l2_table, s->cluster_sectors);
>> + if (ret < 0) {
>> + goto fail;
>> + }
>> + }
>> + }
>> + }
>> +
>> + ret = 0;
>> +
>> +fail:
>> + if (l2_table) {
>> + if (!is_active_l1) {
>> + qemu_vfree(l2_table);
>> + } else {
>> + if (ret < 0) {
>> + qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
>> + } else {
>> + ret = qcow2_cache_put(bs, s->l2_table_cache,
>> + (void **)&l2_table);
>> + }
>> + }
>> + }
>> + return ret;
>> +}
>> +
>> +/*
>> + * For backed images, expands all zero clusters on the image. For non-backed
>> + * images, deallocates all non-pre-allocated zero clusters (and claims the
>> + * allocation for pre-allocated ones). This is important for downgrading to a
>> + * qcow2 version which doesn't yet support metadata zero clusters.
>> + */
>> +int qcow2_expand_zero_clusters(BlockDriverState *bs)
>> +{
>> + BDRVQcowState *s = bs->opaque;
>> + uint64_t *l1_table = NULL;
>> + uint64_t nb_clusters;
>> + uint8_t *expanded_clusters;
>> + int ret;
>> + int i, j;
>> +
>> + nb_clusters = bs->total_sectors >> (s->cluster_bits - BDRV_SECTOR_BITS);
> Shouldn't this value actually be rounded up?
Oh, yes, of course.
>> + expanded_clusters = g_malloc0(nb_clusters / 8);
> Just like the size here?
>
>> +
>> + ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
>> + expanded_clusters, nb_clusters);
>> + if (ret < 0) {
>> + goto fail;
>> + }
>> +
>> + /* Inactive L1 tables may point to active L2 tables - therefore it is
>> + * necessary to flush the L2 table cache before trying to access the L2
>> + * tables pointed to by inactive L1 entries (else we might try to expand
>> + * zero clusters that have already been expanded) */
>> + ret = qcow2_cache_flush(bs, s->l2_table_cache);
>> + if (ret < 0) {
>> + goto fail;
>> + }
> I'm afraid that this is not enough in fact. If you modify an active L2
> table through an inactive L1, you bypass the cache and use bdrv_write().
> The cache will still have a stale copy of the table.
Hm, yes, that didn't occur to me because the image isn't being written
to after this zero cluster expansion; at least not in the tests I ran.
> Can't we just access inactive L2 tables through the cache as well? Or
> can we get into trouble in other places if we do that? If so, we'd have
> to implement a function that flushes _and empties_ the cache. (And then
> we could always go the bdrv_read/write path, so there's some code to
> save either way)
I'd personally go for emptying the cache.
>> + for (i = 0; i < s->nb_snapshots; i++) {
>> + int l1_sectors = (s->snapshots[i].l1_size * sizeof(uint64_t) +
>> + BDRV_SECTOR_SIZE - 1) / BDRV_SECTOR_SIZE;
>> +
>> + l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE);
>> +
>> + ret = bdrv_read(bs->file, s->snapshots[i].l1_table_offset /
>> + BDRV_SECTOR_SIZE, (void *)l1_table, l1_sectors);
>> + if (ret < 0) {
>> + goto fail;
>> + }
>> +
>> + for (j = 0; j < s->snapshots[i].l1_size; j++) {
>> + be64_to_cpus(&l1_table[j]);
>> + }
>> +
>> + ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size,
>> + expanded_clusters, nb_clusters);
>> + if (ret < 0) {
>> + goto fail;
>> + }
>> + }
>> +
>> + ret = 0;
>> +
>> +fail:
>> + g_free(expanded_clusters);
>> + g_free(l1_table);
>> + return ret;
>> +}
> Kevin
Max
next prev parent reply other threads:[~2013-09-03 7:11 UTC|newest]
Thread overview: 11+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-09-02 10:04 [Qemu-devel] [PATCH v4 0/5] block/qcow2: Image file option amendment Max Reitz
2013-09-02 10:04 ` [Qemu-devel] [PATCH v4 1/5] block: " Max Reitz
2013-09-02 10:04 ` [Qemu-devel] [PATCH v4 2/5] qcow2-cluster: Expand zero clusters Max Reitz
2013-09-02 15:13 ` Kevin Wolf
2013-09-03 7:11 ` Max Reitz [this message]
2013-09-02 10:04 ` [Qemu-devel] [PATCH v4 3/5] qcow2: Save refcount order in BDRVQcowState Max Reitz
2013-09-02 10:04 ` [Qemu-devel] [PATCH v4 4/5] qcow2: Implement bdrv_amend_options Max Reitz
2013-09-02 10:04 ` [Qemu-devel] [PATCH v4 5/5] qemu-iotest: qcow2 image option amendment Max Reitz
2013-09-02 15:36 ` Kevin Wolf
2013-09-03 7:12 ` Max Reitz
2013-09-03 8:23 ` Kevin Wolf
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=52258BAC.3000606@redhat.com \
--to=mreitz@redhat.com \
--cc=kwolf@redhat.com \
--cc=qemu-devel@nongnu.org \
--cc=stefanha@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).