From: Kevin Wolf <kwolf@redhat.com>
To: Lan Tianyu <tianyu.lan@intel.com>
Cc: penberg@kernel.org, kvm@vger.kernel.org, asias.hejun@gmail.com,
levinsasha928@gmail.com, prasadjoshi124@gmail.com
Subject: Re: [RFC v2 PATCH] kvm tools, qcow: Add the support for copy-on-write clusters
Date: Fri, 18 Nov 2011 11:10:28 +0100 [thread overview]
Message-ID: <4EC62F14.6090505@redhat.com> (raw)
In-Reply-To: <1321606038-29989-1-git-send-email-tianyu.lan@intel.com>
Am 18.11.2011 09:47, schrieb Lan Tianyu:
> When meeting request to write the cluster without copied flag,
> allocate a new cluster and write original data with modification
> to the new cluster. This also adds support for the writing operation
> of the qcow2 compressed image. After testing, image file can pass
> through "qemu-img check".
>
> Please enter the commit message for your changes. Lines starting
>
> Signed-off-by: Lan Tianyu <tianyu.lan@intel.com>
> ---
> tools/kvm/disk/qcow.c | 366 +++++++++++++++++++++++++++++-------------
> tools/kvm/include/kvm/qcow.h | 2 +
> 2 files changed, 255 insertions(+), 113 deletions(-)
>
> diff --git a/tools/kvm/disk/qcow.c b/tools/kvm/disk/qcow.c
> index 680b37d..4d9125d 100644
> --- a/tools/kvm/disk/qcow.c
> +++ b/tools/kvm/disk/qcow.c
> @@ -122,9 +122,6 @@ static int cache_table(struct qcow *q, struct qcow_l2_table *c)
> */
> lru = list_first_entry(&l1t->lru_list, struct qcow_l2_table, list);
>
> - if (qcow_l2_cache_write(q, lru) < 0)
> - goto error;
> -
> /* Remove the node from the cache */
> rb_erase(&lru->node, r);
> list_del_init(&lru->list);
> @@ -618,9 +615,6 @@ static int cache_refcount_block(struct qcow *q, struct qcow_refcount_block *c)
> if (rft->nr_cached == MAX_CACHE_NODES) {
> lru = list_first_entry(&rft->lru_list, struct qcow_refcount_block, list);
>
> - if (write_refcount_block(q, lru) < 0)
> - goto error;
> -
> rb_erase(&lru->node, r);
> list_del_init(&lru->list);
> rft->nr_cached--;
> @@ -706,6 +700,11 @@ static struct qcow_refcount_block *qcow_read_refcount_block(struct qcow *q, u64
>
> rfb_offset = be64_to_cpu(rft->rf_table[rft_idx]);
>
> + if (!rfb_offset) {
> + pr_warning("Don't support to grow refcount table");
> + return NULL;
> + }
> +
> rfb = refcount_block_search(q, rfb_offset);
> if (rfb)
> return rfb;
> @@ -728,35 +727,121 @@ error_free_rfb:
> return NULL;
> }
>
> -/*
> - * QCOW file might grow during a write operation. Not only data but metadata is
> - * also written at the end of the file. Therefore it is necessary to ensure
> - * every write is committed to disk. Hence we use uses qcow_pwrite_sync() to
> - * synchronize the in-core state of QCOW image to disk.
> - *
> - * We also try to restore the image to a consistent state if the metdata
> - * operation fails. The two metadat operations are: level 1 and level 2 table
> - * update. If either of them fails the image is truncated to a consistent state.
> +static u16 qcow_get_refcount(struct qcow *q, u64 clust_idx)
> +{
> + struct qcow_refcount_block *rfb = NULL;
> + struct qcow_header *header = q->header;
> + u64 rfb_idx;
> +
> + rfb = qcow_read_refcount_block(q, clust_idx);
> + if (!rfb) {
> + pr_warning("Error while reading refcount table");
> + return -1;
> + }
> +
> + rfb_idx = clust_idx & (((1ULL <<
> + (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
> +
> + if (rfb_idx >= rfb->size) {
> + pr_warning("L1: refcount block index out of bounds");
> + return -1;
> + }
> +
> + return be16_to_cpu(rfb->entries[rfb_idx]);
> +}
> +
> +static int update_cluster_refcount(struct qcow *q, u64 clust_idx, u16 append)
> +{
> + struct qcow_refcount_block *rfb = NULL;
> + struct qcow_header *header = q->header;
> + u16 refcount;
> + u64 rfb_idx;
> +
> + rfb = qcow_read_refcount_block(q, clust_idx);
> + if (!rfb) {
> + pr_warning("error while reading refcount table");
> + return -1;
> + }
> +
> + rfb_idx = clust_idx & (((1ULL <<
> + (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
> + if (rfb_idx >= rfb->size) {
> + pr_warning("refcount block index out of bounds");
> + return -1;
> + }
> +
> + refcount = be16_to_cpu(rfb->entries[rfb_idx]) + append;
> + rfb->entries[rfb_idx] = cpu_to_be16(refcount);
> + rfb->dirty = 1;
> +
> + /*write refcount block*/
> + write_refcount_block(q, rfb);
Missing error handling.
> +
> + /*update free_clust_idx since refcount becomes zero*/
> + if (!refcount && clust_idx < q->free_clust_idx)
> + q->free_clust_idx = clust_idx;
> +
> + return 0;
> +}
> +
> +static void qcow_free_clusters(struct qcow *q, u64 clust_start, u64 size)
> +{
> + struct qcow_header *header = q->header;
> + u64 start, end, offset;
> +
> + start = clust_start & ~(q->cluster_size - 1);
> + end = (clust_start + size - 1) & ~(q->cluster_size - 1);
> + for (offset = start; offset <= end; offset += q->cluster_size)
> + update_cluster_refcount(q, offset >> header->cluster_bits, -1);
> +}
> +
> +/*Allocate clusters according to the size. Find a postion that
> + *can satisfy the size. free_clust_idx is initialized to zero and
> + *Record last position.
> +*/
> +static u64 qcow_alloc_clusters(struct qcow *q, u64 size)
> +{
> + struct qcow_header *header = q->header;
> + u16 clust_refcount;
> + u32 clust_idx, i;
> + u64 clust_num;
> +
> + clust_num = (size + (q->cluster_size - 1)) >> header->cluster_bits;
> +
> +again:
> + for (i = 0; i < clust_num; i++) {
> + clust_idx = q->free_clust_idx++;
> + clust_refcount = qcow_get_refcount(q, clust_idx);
> + if (clust_refcount < 0)
> + return -1;
> + else if (clust_refcount > 0)
> + goto again;
> + }
> +
> + for (i = 0; i < clust_num; i++)
> + update_cluster_refcount(q,
> + q->free_clust_idx - clust_num + i, 1);
Error handling again.
> +
> + return (q->free_clust_idx - clust_num) << header->cluster_bits;
> +}
> +
> +/*Get l2 table. If the table has been copied, read table directly.
> + *If the table exists, allocate a new cluster and copy the table
> + *to the new cluster.
> */
> -static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src_len)
> +static int get_cluster_table(struct qcow *q, u64 offset,
> + struct qcow_l2_table **result_l2t, u64 *result_l2_index)
> {
> struct qcow_header *header = q->header;
> struct qcow_l1_table *l1t = &q->table;
> struct qcow_l2_table *l2t;
> - u64 clust_start;
> - u64 clust_flags;
> - u64 l2t_offset;
> - u64 clust_off;
> - u64 l2t_size;
> - u64 clust_sz;
> u64 l1t_idx;
> + u64 l2t_offset;
> u64 l2t_idx;
> - u64 f_sz;
> - u64 len;
> + u64 l2t_size;
> + u64 l2t_new_offset;
>
> - l2t = NULL;
> - l2t_size = 1 << header->l2_bits;
> - clust_sz = 1 << header->cluster_bits;
> + l2t_size = 1 << header->l2_bits;
>
> l1t_idx = get_l1_index(q, offset);
> if (l1t_idx >= l1t->table_size)
> @@ -766,122 +851,166 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
> if (l2t_idx >= l2t_size)
> return -1;
>
> - clust_off = get_cluster_offset(q, offset);
> - if (clust_off >= clust_sz)
> - return -1;
> -
> - len = clust_sz - clust_off;
> - if (len > src_len)
> - len = src_len;
> -
> - mutex_lock(&q->mutex);
> -
> l2t_offset = be64_to_cpu(l1t->l1_table[l1t_idx]);
> - if (l2t_offset & QCOW2_OFLAG_COMPRESSED) {
> - pr_warning("compressed clusters are not supported");
> - goto error;
> - }
> - if (!(l2t_offset & QCOW2_OFLAG_COPIED)) {
> - pr_warning("L2 copy-on-write clusters are not supported");
> - goto error;
> - }
> -
> - l2t_offset &= QCOW2_OFFSET_MASK;
> - if (l2t_offset) {
> - /* read and cache l2 table */
> + if (l2t_offset & QCOW2_OFLAG_COPIED) {
> + l2t_offset &= ~QCOW2_OFLAG_COPIED;
> l2t = qcow_read_l2_table(q, l2t_offset);
> if (!l2t)
> goto error;
> } else {
> - l2t = new_cache_table(q, l2t_offset);
> - if (!l2t)
> + l2t_new_offset = qcow_alloc_clusters(q, l2t_size*sizeof(u64));
> + if (l2t_new_offset < 0)
> goto error;
>
> - /* Capture the state of the consistent QCOW image */
> - f_sz = file_size(q->fd);
> - if (!f_sz)
> - goto free_cache;
> + l2t = new_cache_table(q, l2t_new_offset);
> + if (!l2t)
> + goto free_cluster;
> +
> + if (l2t_offset)
> + qcow2_read_cluster(q, l2t_offset, l2t->table,
> + l2t_size*sizeof(u64));
There must be a system behind it. :-)
> + else
> + memset(l2t->table, 0x00, l2t_size * sizeof(u64));
>
> - /* Write the l2 table of 0's at the end of the file */
> - l2t_offset = qcow_write_l2_table(q, l2t->table);
> - if (!l2t_offset)
> + /*write l2 table*/
> + l2t->dirty = 1;
> + if (qcow_l2_cache_write(q, l2t) < 0)
> goto free_cache;
You need to make sure that the refcount update is written first (e.g.
with fsync), otherwise you risk corruption when the host crashes in the
middle.
>
> - if (cache_table(q, l2t) < 0) {
> - if (ftruncate(q->fd, f_sz) < 0)
> - goto free_cache;
> + /* Update the l1 talble */
> + l1t->l1_table[l1t_idx] = cpu_to_be64(l2t_new_offset
> + | QCOW2_OFLAG_COPIED);
>
> - goto free_cache;
> - }
> + if (pwrite_in_full(q->fd, l1t->l1_table,
> + l1t->table_size * sizeof(u64),
> + header->l1_table_offset) < 0)
> + goto error;
Likewise, the L1 table write must be ordered against the L2 write.
goto error is using the wrong label.
>
> - /* Update the in-core entry */
> - l1t->l1_table[l1t_idx] = cpu_to_be64(l2t_offset);
> + /*cache l2 table*/
> + cache_table(q, l2t);
After so many explicit comments, you can probably guess what's wrong here...
> +
> + /*free old cluster*/
> + qcow_free_clusters(q, l2t_offset, q->cluster_size);
> }
>
> - /* Capture the state of the consistent QCOW image */
> - f_sz = file_size(q->fd);
> - if (!f_sz)
> - goto error;
> + *result_l2t = l2t;
> + *result_l2_index = l2t_idx;
>
> - clust_start = be64_to_cpu(l2t->table[l2t_idx]);
> + return 0;
>
> - clust_flags = clust_start & QCOW2_OFLAGS_MASK;
> - if (clust_flags & QCOW2_OFLAG_COMPRESSED) {
> - pr_warning("compressed clusters are not supported");
> +free_cache:
> + free(l2t);
> +
> +free_cluster:
> + qcow_free_clusters(q, l2t_new_offset, q->cluster_size);
> +
> +error:
> + return -1;
> +}
> +
> +/*If the cluster has been copied, write data directly. If not,
> + *read the original data and write it to the new cluster with
> + *modification.
> +*/
> +static ssize_t qcow_write_cluster(struct qcow *q, u64 offset,
> + void *buf, u32 src_len)
> +{
> + struct qcow_header *header = q->header;
> + struct qcow_l2_table *l2t;
> + u64 clust_new_idx;
> + u64 clust_new_start;
> + u64 clust_start;
> + u64 clust_flags;
> + u64 clust_off;
> + u64 l2t_idx;
> + u64 len;
> +
> + l2t = NULL;
> +
> + clust_off = get_cluster_offset(q, offset);
> + if (clust_off >= q->cluster_size)
> + return -1;
> +
> + len = q->cluster_size - clust_off;
> + if (len > src_len)
> + len = src_len;
> +
> + mutex_lock(&q->mutex);
> +
> + if (get_cluster_table(q, offset, &l2t, &l2t_idx)) {
> + pr_warning("Get l2 table error");
> goto error;
> }
>
> - clust_start &= QCOW2_OFFSET_MASK;
> - if (!clust_start) {
> - clust_start = ALIGN(f_sz, clust_sz);
> - l2t->table[l2t_idx] = cpu_to_be64(clust_start | QCOW2_OFLAG_COPIED);
> - l2t->dirty = 1;
> - }
> + clust_start = be64_to_cpu(l2t->table[l2t_idx]);
> + clust_flags = clust_start & QCOW2_OFLAGS_MASK;
>
> + clust_start &= QCOW2_OFFSET_MASK;
> if (!(clust_flags & QCOW2_OFLAG_COPIED)) {
> - struct qcow_refcount_block *rfb = NULL;
> - u16 clust_refcount;
> - u64 clust_idx;
> - u64 rfb_idx;
> -
> - clust_idx = (clust_start & QCOW2_OFFSET_MASK)
> - >> (header->cluster_bits);
>
> - rfb = qcow_read_refcount_block(q, clust_idx);
> - if (!rfb) {
> - pr_warning("L1: error while reading refcount table");
> + clust_new_start = qcow_alloc_clusters(q, q->cluster_size);
> + if (clust_new_start < 0) {
> + pr_warning("Cluster alloc error!");
> goto error;
> }
>
> - rfb_idx = clust_idx & (((1ULL << (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
> - if (rfb_idx >= rfb->size) {
> - pr_warning("L1: refcount block index out of bounds");
> - goto error;
> - }
> + clust_new_idx = clust_new_start >> header->cluster_bits;
> + offset &= ~(q->cluster_size - 1);
> +
> + /*if clust_start is not zero, read the original data*/
> + if (clust_start) {
> + mutex_unlock(&q->mutex);
> + if (qcow2_read_cluster(q, offset, q->copy_buff,
> + q->cluster_size) < 0) {
> + pr_warning("Read copy cluster error");
> + qcow_free_clusters(q, clust_new_start,
> + q->cluster_size);
> + return -1;
> + }
> + mutex_lock(&q->mutex);
> + } else
> + memset(q->copy_buff, 0x00, q->cluster_size);
> +
> + memcpy(q->copy_buff + clust_off, buf, len);
> +
> + /* Write actual data */
> + if (pwrite_in_full(q->fd, q->copy_buff, q->cluster_size,
> + clust_new_start) < 0)
> + goto free_cluster;
> +
> + /*update l2 table*/
> + l2t->table[l2t_idx] = cpu_to_be64(clust_new_start
> + | QCOW2_OFLAG_COPIED);
> + l2t->dirty = 1;
> +
> + if (qcow_l2_cache_write(q, l2t))
> + goto free_cluster;
Cluster allocation must be ordered against L2 update.
> +
> + /*free old cluster*/
> + if (clust_flags & QCOW2_OFLAG_COMPRESSED) {
> + int size;
> + size = ((clust_start >> q->csize_shift) &
> + q->csize_mask) + 1;
> + size *= 512;
> + clust_start &= q->cluster_offset_mask;
> + clust_start &= ~511;
> +
> + qcow_free_clusters(q, clust_start, size);
> + } else if (clust_start)
> + qcow_free_clusters(q, clust_start, q->cluster_size);
>
> - clust_refcount = be16_to_cpu(rfb->entries[rfb_idx]);
> - if (!clust_refcount) {
> - clust_refcount = 1;
> - rfb->entries[rfb_idx] = cpu_to_be16(clust_refcount);
> - rfb->dirty = 1;
> - }
> -
> - if (clust_refcount > 1) {
> - pr_warning("L1 copy-on-write clusters are not supported");
> + } else {
> + /* Write actual data */
> + if (pwrite_in_full(q->fd, buf, len,
> + clust_start + clust_off) < 0)
> goto error;
> - }
> }
> -
> mutex_unlock(&q->mutex);
> -
> - /* Write actual data */
> - if (pwrite_in_full(q->fd, buf, len, clust_start + clust_off) < 0)
> - return -1;
> -
> return len;
>
> -free_cache:
> - free(l2t);
> +free_cluster:
> + qcow_free_clusters(q, clust_new_start, q->cluster_size);
> +
> error:
> mutex_unlock(&q->mutex);
> return -1;
> @@ -993,6 +1122,7 @@ static int qcow_disk_close(struct disk_image *disk)
>
> refcount_table_free_cache(&q->refcount_table);
> l1_table_free_cache(&q->table);
> + free(q->copy_buff);
> free(q->cluster_data);
> free(q->cluster_cache);
> free(q->refcount_table.rf_table);
> @@ -1117,10 +1247,16 @@ static struct disk_image *qcow2_probe(int fd, bool readonly)
> q->cluster_offset_mask = (1LL << q->csize_shift) - 1;
> q->cluster_size = 1 << q->header->cluster_bits;
>
> + q->copy_buff = malloc(q->cluster_size);
> + if (!q->copy_buff) {
> + pr_warning("copy buff malloc error!");
> + goto free_header;
> + }
> +
> q->cluster_data = malloc(q->cluster_size);
> if (!q->cluster_data) {
> pr_warning("cluster data malloc error!");
> - goto free_header;
> + goto free_copy_buff;
> }
>
> q->cluster_cache = malloc(q->cluster_size);
> @@ -1163,6 +1299,9 @@ free_cluster_cache:
> free_cluster_data:
> if (q->cluster_data)
> free(q->cluster_data);
> +free_copy_buff:
> + if (q->cluster_data)
> + free(q->cluster_data);
This looks like the wrong field.
> free_header:
> if (q->header)
> free(q->header);
> @@ -1252,6 +1391,7 @@ static struct disk_image *qcow1_probe(int fd, bool readonly)
> q->version = QCOW1_VERSION;
> q->cluster_size = 1 << q->header->cluster_bits;
> q->cluster_offset_mask = (1LL << (63 - q->header->cluster_bits)) - 1;
> + q->free_clust_idx = 0;
>
> q->cluster_data = malloc(q->cluster_size);
> if (!q->cluster_data) {
> diff --git a/tools/kvm/include/kvm/qcow.h b/tools/kvm/include/kvm/qcow.h
> index bbf7913..e032a1e 100644
> --- a/tools/kvm/include/kvm/qcow.h
> +++ b/tools/kvm/include/kvm/qcow.h
> @@ -84,8 +84,10 @@ struct qcow {
> u32 version;
> u64 cluster_size;
> u64 cluster_offset_mask;
> + u64 free_clust_idx;
> void *cluster_cache;
> void *cluster_data;
> + void *copy_buff;
> };
>
> struct qcow1_header_disk {
Kevin
next prev parent reply other threads:[~2011-11-18 10:07 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2011-11-18 8:47 [RFC v2 PATCH] kvm tools, qcow: Add the support for copy-on-write clusters Lan Tianyu
2011-11-18 10:10 ` Kevin Wolf [this message]
2011-11-19 2:09 ` Lan, Tianyu
2011-11-19 15:30 ` Lan, Tianyu
2011-11-19 16:26 ` Sasha Levin
2011-11-20 6:14 ` Lan, Tianyu
2011-11-20 6:23 ` Sasha Levin
2011-11-20 7:03 ` Lan, Tianyu
2011-11-20 10:59 ` Pekka Enberg
2011-11-21 8:30 ` Kevin Wolf
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4EC62F14.6090505@redhat.com \
--to=kwolf@redhat.com \
--cc=asias.hejun@gmail.com \
--cc=kvm@vger.kernel.org \
--cc=levinsasha928@gmail.com \
--cc=penberg@kernel.org \
--cc=prasadjoshi124@gmail.com \
--cc=tianyu.lan@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).