qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [Qemu-devel][PATCH,RFC] Zero cluster dedup
@ 2008-09-02 16:28 Shahar Frank
  2008-09-03  7:09 ` Laurent Vivier
  2008-09-03  9:38 ` Kevin Wolf
  0 siblings, 2 replies; 13+ messages in thread
From: Shahar Frank @ 2008-09-02 16:28 UTC (permalink / raw)
  To: qemu-devel

[-- Attachment #1: Type: text/plain, Size: 11705 bytes --]

Hi All,

The following patch is implementing a zero cluster dedup feature to the qcow2 image format.

The incentive for this feature is the image COW "inflation" problem (i.e. COW image growth). The causes for the COW image pollution are many; for example, Windows with NTFS do not reuse recently de-allocated space and by that pollute many blocks. A naïve solution would be to identify de-allocated space and de-allocate it from the image block space. The problems with this approach are that 1) this requires a windows side interface to the image, and 2) there is no de-allocate verb for the images.

The suggested solution is simple:
	1) Use windows side cleanup/wipe utilities such as "Erase" "Free Wipe Wizard" or "Disk Redactor" (or any other free/non free wipe utility) to periodically wipe free space and cleanup temporaries. Most utilities can be used to write *zeros* on the wiped blocks. 
	2) Make qcow2 to identify zero cluster writes and use them as de-allocation hints (or "make hole" verb). To implement such feature with minimal effort and minimal changes, I suggest to use the internal COW mechanism of the qcow2 to create a shared zero cluster. To avoid image format changes, such page is created on demand and then can be shared with all zero writes. A non zero write on the zero cluster will cause a normal COW operation (similar to the shared kernel zero page). 

The following patch is also suggesting an initial generic dedup infrastructure to allow future offline/batch/online dedup featues. Such dedup features can handle many other causes of image inflations such as Windows updates, applications updates, etc.

An additional included change is a small improvement for the alloc_clusters_noref function. The original version supported a very simple first fit algorithm, with a simple pointer to remember the last position that we got to searching for a free space. The main problem with the original function was that the pointer scheme is not optimized for different allocation sizes. If for example you allocate a large space of 64k (8 clusters) the pointer would skip many smaller free spaces such that any future allocation, even for one cluster will not be able to use the skipped space. The included improvement is very simple- it uses two pointers, one for single cluster allocations, and one for bigger allocations. This may be still too simple, but it should solve the simple one cluster allocation (which should be very common).

I tested this feature by some instrumentation that I added to the qemu-img to display the logical to physical mapping. I will post it very soon.

Limitations: this version is very naïve. Only aligned full zero clusters are supported. There is much room for improvement, to detect partial zero writes on zero clusters, and to detect that the result of a partial write is a full zero cluster.

Shahar

Signed-off-by: Shahar Frank <shaharf@qumranet.com>

diff --git a/block-qcow2.c b/block-qcow2.c
index b9f1688..ca3faf1 100644
--- a/block-qcow2.c
+++ b/block-qcow2.c
@@ -65,6 +65,14 @@
 #define offsetof(type, field) ((size_t) &((type *)0)->field)
 #endif
 
+int zero_dedup = 1;
+
+#if defined(DEBUG_ALLOC) || defined(DEBUG_ALLOC2)
+#define DEBUG(msg, args...)     fprintf(stderr, "(%d) %s: " msg "\n", getpid(),  __FUNCTION__, ##args)
+#else
+#define DEBUG(msg, args...)
+#endif
+
 typedef struct QCowHeader {
     uint32_t magic;
     uint32_t version;
@@ -140,8 +148,10 @@ typedef struct BDRVQcowState {
     uint32_t refcount_table_size;
     uint64_t refcount_block_cache_offset;
     uint16_t *refcount_block_cache;
-    int64_t free_cluster_index;
+    int64_t free_cluster_index1;
+    int64_t free_cluster_index2;
     int64_t free_byte_offset;
+    uint64_t zero_cluster;
 
     uint32_t crypt_method; /* current crypt method, 0 if no key yet */
     uint32_t crypt_method_header;
@@ -171,6 +181,8 @@ static int64_t alloc_clusters(BlockDriverState *bs, int64_t size);
 static int64_t alloc_bytes(BlockDriverState *bs, int size);
 static void free_clusters(BlockDriverState *bs,
                           int64_t offset, int64_t size);
+static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size);
+
 #ifdef DEBUG_ALLOC
 static void check_refcounts(BlockDriverState *bs);
 #endif
@@ -1123,20 +1135,19 @@ static int qcow_read(BlockDriverState *bs, int64_t sector_num,
     return 0;
 }
 
-static int qcow_write(BlockDriverState *bs, int64_t sector_num,
+static int write_cluster(BlockDriverState *bs, int64_t sector_num, uint64_t cluster_offset,
                      const uint8_t *buf, int nb_sectors)
 {
     BDRVQcowState *s = bs->opaque;
-    int ret, index_in_cluster, n;
-    uint64_t cluster_offset;
-    int n_end;
+    int index_in_cluster = sector_num & (s->cluster_sectors - 1);
+    int n_end = index_in_cluster + nb_sectors;
+    int n = nb_sectors;
+    int ret;
 
-    while (nb_sectors > 0) {
-        index_in_cluster = sector_num & (s->cluster_sectors - 1);
-        n_end = index_in_cluster + nb_sectors;
-        if (s->crypt_method &&
-            n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
+    if (s->crypt_method && n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
             n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
+
+    if (!cluster_offset)
         cluster_offset = alloc_cluster_offset(bs, sector_num << 9,
                                               index_in_cluster,
                                               n_end, &n);
@@ -1152,6 +1163,89 @@ static int qcow_write(BlockDriverState *bs, int64_t sector_num,
         }
         if (ret != n * 512)
             return -1;
+    return 0;
+}
+
+static int is_not_zero(const uint8_t *buf, int len)
+{
+    int i;
+    len >>= 2;
+    for(i = 0;i < len; i++) {
+        if (((uint32_t *)buf)[i] != 0)
+            return 1;
+    }
+    return 0;
+}
+
+static int qcow_remap(BlockDriverState *bs, uint64_t loffset, uint64_t poffset)
+{
+    uint64_t l2_offset, *l2_table;
+    BDRVQcowState *s = bs->opaque;
+    uint64_t ooffset;
+    int l2_index, ret;
+
+    ret = get_cluster_table(bs, loffset, &l2_table, &l2_offset, &l2_index);
+    if (ret == 0)
+        return 0;
+
+    ooffset = be64_to_cpu(l2_table[l2_index]);
+    if (ooffset == poffset)
+        return 1;	/* nothing to do, already mapped to poffset */
+
+    if (ooffset) {
+        DEBUG("free old offset %llx (%llx)", ooffset, ooffset & ~QCOW_OFLAG_COPIED);
+        free_any_clusters(bs, ooffset & ~QCOW_OFLAG_COPIED, 1);
+    }
+
+    /* update L2 table */
+    l2_table[l2_index] = cpu_to_be64(poffset);
+
+    if (bdrv_pwrite(s->hd,
+                    l2_offset + l2_index * sizeof(uint64_t),
+                    l2_table + l2_index, sizeof(uint64_t)) != sizeof(uint64_t))
+        return 0;
+
+    update_refcount(bs, poffset, s->cluster_size, 1);
+    return 1;
+}
+
+static int qcow_dedup(BlockDriverState *bs, int64_t sector_num,
+                             const uint8_t *buf, int *nb_sectors)
+{
+    BDRVQcowState *s = bs->opaque;
+    int cluster = s->cluster_sectors;
+    uint64_t cluster_offset = sector_num << 9, poffset;
+
+    DEBUG("%s 0x%llx", bs->filename, cluster_offset);
+    if (!zero_dedup || (sector_num & (s->cluster_sectors - 1)) ||
+        *nb_sectors < cluster || is_not_zero(buf, s->cluster_size))
+        return 0;		/* cannot/should not dedup */
+
+    DEBUG("%s zero dedup 0x%llx", bs->filename, cluster_offset);
+    *nb_sectors = cluster;	/* allocate exactly one cluster */
+
+    if (!s->zero_cluster) {
+        if (!(poffset = alloc_clusters_noref(bs, s->cluster_size)) ||
+            write_cluster(bs, sector_num, poffset, buf, cluster) < 0)
+                return 0;
+        s->zero_cluster = poffset;
+    }
+
+    return qcow_remap(bs, cluster_offset, s->zero_cluster);
+}
+
+static int qcow_write(BlockDriverState *bs, int64_t sector_num,
+                     const uint8_t *buf, int nb_sectors)
+{
+    BDRVQcowState *s = bs->opaque;
+    int n;
+
+    while (nb_sectors > 0) {
+        n = nb_sectors;
+        if (qcow_dedup(bs, sector_num, buf, &n) <= 0 &&
+            write_cluster(bs, sector_num, sector_num << 9, buf, n) < 0)
+            return -1;
+
         nb_sectors -= n;
         sector_num += n;
         buf += n * 512;
@@ -1298,6 +1392,7 @@ static void qcow_aio_write_cb(void *opaque, int ret)
 
     acb->hd_aiocb = NULL;
 
+    DEBUG("<<");
     if (ret < 0) {
     fail:
         acb->common.cb(acb->common.opaque, ret);
@@ -1305,6 +1400,7 @@ static void qcow_aio_write_cb(void *opaque, int ret)
         return;
     }
 
+    do {
     acb->nb_sectors -= acb->n;
     acb->sector_num += acb->n;
     acb->buf += acb->n * 512;
@@ -1315,6 +1411,13 @@ static void qcow_aio_write_cb(void *opaque, int ret)
         qemu_aio_release(acb);
         return;
     }
+        acb->n = acb->nb_sectors;
+    } while ((ret = qcow_dedup(bs, acb->sector_num, acb->buf, &acb->n)) > 0);
+
+    if (ret < 0) {
+        ret = -EIO;
+        goto fail;
+    }
 
     index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
     n_end = index_in_cluster + acb->nb_sectors;
@@ -2172,25 +2275,33 @@ static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size)
 {
     BDRVQcowState *s = bs->opaque;
     int i, nb_clusters;
+    uint64_t free_cluster_index = s->free_cluster_index1;
 
     nb_clusters = (size + s->cluster_size - 1) >> s->cluster_bits;
+    if (nb_clusters > 1)
+        free_cluster_index = s->free_cluster_index2;
+
     for(;;) {
-        if (get_refcount(bs, s->free_cluster_index) == 0) {
-            s->free_cluster_index++;
+        if (get_refcount(bs, free_cluster_index) == 0) {
+            free_cluster_index++;
             for(i = 1; i < nb_clusters; i++) {
-                if (get_refcount(bs, s->free_cluster_index) != 0)
+                if (get_refcount(bs, free_cluster_index) != 0)
                     goto not_found;
-                s->free_cluster_index++;
+                free_cluster_index++;
             }
 #ifdef DEBUG_ALLOC2
             printf("alloc_clusters: size=%lld -> %lld\n",
                    size,
-                   (s->free_cluster_index - nb_clusters) << s->cluster_bits);
+                   (free_cluster_index - nb_clusters) << s->cluster_bits);
 #endif
-            return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
+            if (nb_clusters > 1)
+                s->free_cluster_index2 = free_cluster_index;
+            else
+                s->free_cluster_index1 = free_cluster_index;
+            return (free_cluster_index - nb_clusters) << s->cluster_bits;
         } else {
         not_found:
-            s->free_cluster_index++;
+            free_cluster_index++;
         }
     }
 }
@@ -2374,8 +2485,13 @@ static int update_cluster_refcount(BlockDriverState *bs,
     refcount += addend;
     if (refcount < 0 || refcount > 0xffff)
         return -EINVAL;
-    if (refcount == 0 && cluster_index < s->free_cluster_index) {
-        s->free_cluster_index = cluster_index;
+    if (refcount == 0) {
+        if (cluster_index < s->free_cluster_index1)
+            s->free_cluster_index1 = cluster_index;
+        if (cluster_index < s->free_cluster_index2)
+            s->free_cluster_index2 = cluster_index;
+        if (cluster_index == s->zero_cluster)
+            s->zero_cluster = 0;
     }
     s->refcount_block_cache[block_index] = cpu_to_be16(refcount);
     if (bdrv_pwrite(s->hd,

[-- Attachment #2: zdedup.patch --]
[-- Type: application/octet-stream, Size: 8571 bytes --]

Signed-off-by: Shahar Frank <shaharf@qumranet.com>

diff --git a/block-qcow2.c b/block-qcow2.c
index b9f1688..ca3faf1 100644
--- a/block-qcow2.c
+++ b/block-qcow2.c
@@ -65,6 +65,14 @@
 #define offsetof(type, field) ((size_t) &((type *)0)->field)
 #endif
 
+int zero_dedup = 1;
+
+#if defined(DEBUG_ALLOC) || defined(DEBUG_ALLOC2)
+#define DEBUG(msg, args...)     fprintf(stderr, "(%d) %s: " msg "\n", getpid(),  __FUNCTION__, ##args)
+#else
+#define DEBUG(msg, args...)
+#endif
+
 typedef struct QCowHeader {
     uint32_t magic;
     uint32_t version;
@@ -140,8 +148,10 @@ typedef struct BDRVQcowState {
     uint32_t refcount_table_size;
     uint64_t refcount_block_cache_offset;
     uint16_t *refcount_block_cache;
-    int64_t free_cluster_index;
+    int64_t free_cluster_index1;
+    int64_t free_cluster_index2;
     int64_t free_byte_offset;
+    uint64_t zero_cluster;
 
     uint32_t crypt_method; /* current crypt method, 0 if no key yet */
     uint32_t crypt_method_header;
@@ -171,6 +181,8 @@ static int64_t alloc_clusters(BlockDriverState *bs, int64_t size);
 static int64_t alloc_bytes(BlockDriverState *bs, int size);
 static void free_clusters(BlockDriverState *bs,
                           int64_t offset, int64_t size);
+static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size);
+
 #ifdef DEBUG_ALLOC
 static void check_refcounts(BlockDriverState *bs);
 #endif
@@ -1123,20 +1135,19 @@ static int qcow_read(BlockDriverState *bs, int64_t sector_num,
     return 0;
 }
 
-static int qcow_write(BlockDriverState *bs, int64_t sector_num,
+static int write_cluster(BlockDriverState *bs, int64_t sector_num, uint64_t cluster_offset,
                      const uint8_t *buf, int nb_sectors)
 {
     BDRVQcowState *s = bs->opaque;
-    int ret, index_in_cluster, n;
-    uint64_t cluster_offset;
-    int n_end;
+    int index_in_cluster = sector_num & (s->cluster_sectors - 1);
+    int n_end = index_in_cluster + nb_sectors;
+    int n = nb_sectors;
+    int ret;
 
-    while (nb_sectors > 0) {
-        index_in_cluster = sector_num & (s->cluster_sectors - 1);
-        n_end = index_in_cluster + nb_sectors;
-        if (s->crypt_method &&
-            n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
+    if (s->crypt_method && n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
             n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
+
+    if (!cluster_offset)
         cluster_offset = alloc_cluster_offset(bs, sector_num << 9,
                                               index_in_cluster,
                                               n_end, &n);
@@ -1152,6 +1163,89 @@ static int qcow_write(BlockDriverState *bs, int64_t sector_num,
         }
         if (ret != n * 512)
             return -1;
+    return 0;
+}
+
+static int is_not_zero(const uint8_t *buf, int len)
+{
+    int i;
+    len >>= 2;
+    for(i = 0;i < len; i++) {
+        if (((uint32_t *)buf)[i] != 0)
+            return 1;
+    }
+    return 0;
+}
+
+static int qcow_remap(BlockDriverState *bs, uint64_t loffset, uint64_t poffset)
+{
+    uint64_t l2_offset, *l2_table;
+    BDRVQcowState *s = bs->opaque;
+    uint64_t ooffset;
+    int l2_index, ret;
+
+    ret = get_cluster_table(bs, loffset, &l2_table, &l2_offset, &l2_index);
+    if (ret == 0)
+        return 0;
+
+    ooffset = be64_to_cpu(l2_table[l2_index]);
+    if (ooffset == poffset)
+        return 1;	/* nothing to do, already mapped to poffset */
+
+    if (ooffset) {
+        DEBUG("free old offset %llx (%llx)", ooffset, ooffset & ~QCOW_OFLAG_COPIED);
+        free_any_clusters(bs, ooffset & ~QCOW_OFLAG_COPIED, 1);
+    }
+
+    /* update L2 table */
+    l2_table[l2_index] = cpu_to_be64(poffset);
+
+    if (bdrv_pwrite(s->hd,
+                    l2_offset + l2_index * sizeof(uint64_t),
+                    l2_table + l2_index, sizeof(uint64_t)) != sizeof(uint64_t))
+        return 0;
+
+    update_refcount(bs, poffset, s->cluster_size, 1);
+    return 1;
+}
+
+static int qcow_dedup(BlockDriverState *bs, int64_t sector_num,
+                             const uint8_t *buf, int *nb_sectors)
+{
+    BDRVQcowState *s = bs->opaque;
+    int cluster = s->cluster_sectors;
+    uint64_t cluster_offset = sector_num << 9, poffset;
+
+    DEBUG("%s 0x%llx", bs->filename, cluster_offset);
+    if (!zero_dedup || (sector_num & (s->cluster_sectors - 1)) ||
+        *nb_sectors < cluster || is_not_zero(buf, s->cluster_size))
+        return 0;		/* cannot/should not dedup */
+
+    DEBUG("%s zero dedup 0x%llx", bs->filename, cluster_offset);
+    *nb_sectors = cluster;	/* allocate exactly one cluster */
+
+    if (!s->zero_cluster) {
+        if (!(poffset = alloc_clusters_noref(bs, s->cluster_size)) ||
+            write_cluster(bs, sector_num, poffset, buf, cluster) < 0)
+                return 0;
+        s->zero_cluster = poffset;
+    }
+
+    return qcow_remap(bs, cluster_offset, s->zero_cluster);
+}
+
+static int qcow_write(BlockDriverState *bs, int64_t sector_num,
+                     const uint8_t *buf, int nb_sectors)
+{
+    BDRVQcowState *s = bs->opaque;
+    int n;
+
+    while (nb_sectors > 0) {
+        n = nb_sectors;
+        if (qcow_dedup(bs, sector_num, buf, &n) <= 0 &&
+            write_cluster(bs, sector_num, sector_num << 9, buf, n) < 0)
+            return -1;
+
         nb_sectors -= n;
         sector_num += n;
         buf += n * 512;
@@ -1298,6 +1392,7 @@ static void qcow_aio_write_cb(void *opaque, int ret)
 
     acb->hd_aiocb = NULL;
 
+    DEBUG("<<");
     if (ret < 0) {
     fail:
         acb->common.cb(acb->common.opaque, ret);
@@ -1305,6 +1400,7 @@ static void qcow_aio_write_cb(void *opaque, int ret)
         return;
     }
 
+    do {
     acb->nb_sectors -= acb->n;
     acb->sector_num += acb->n;
     acb->buf += acb->n * 512;
@@ -1315,6 +1411,13 @@ static void qcow_aio_write_cb(void *opaque, int ret)
         qemu_aio_release(acb);
         return;
     }
+        acb->n = acb->nb_sectors;
+    } while ((ret = qcow_dedup(bs, acb->sector_num, acb->buf, &acb->n)) > 0);
+
+    if (ret < 0) {
+        ret = -EIO;
+        goto fail;
+    }
 
     index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
     n_end = index_in_cluster + acb->nb_sectors;
@@ -2172,25 +2275,33 @@ static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size)
 {
     BDRVQcowState *s = bs->opaque;
     int i, nb_clusters;
+    uint64_t free_cluster_index = s->free_cluster_index1;
 
     nb_clusters = (size + s->cluster_size - 1) >> s->cluster_bits;
+    if (nb_clusters > 1)
+        free_cluster_index = s->free_cluster_index2;
+
     for(;;) {
-        if (get_refcount(bs, s->free_cluster_index) == 0) {
-            s->free_cluster_index++;
+        if (get_refcount(bs, free_cluster_index) == 0) {
+            free_cluster_index++;
             for(i = 1; i < nb_clusters; i++) {
-                if (get_refcount(bs, s->free_cluster_index) != 0)
+                if (get_refcount(bs, free_cluster_index) != 0)
                     goto not_found;
-                s->free_cluster_index++;
+                free_cluster_index++;
             }
 #ifdef DEBUG_ALLOC2
             printf("alloc_clusters: size=%lld -> %lld\n",
                    size,
-                   (s->free_cluster_index - nb_clusters) << s->cluster_bits);
+                   (free_cluster_index - nb_clusters) << s->cluster_bits);
 #endif
-            return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
+            if (nb_clusters > 1)
+                s->free_cluster_index2 = free_cluster_index;
+            else
+                s->free_cluster_index1 = free_cluster_index;
+            return (free_cluster_index - nb_clusters) << s->cluster_bits;
         } else {
         not_found:
-            s->free_cluster_index++;
+            free_cluster_index++;
         }
     }
 }
@@ -2374,8 +2485,13 @@ static int update_cluster_refcount(BlockDriverState *bs,
     refcount += addend;
     if (refcount < 0 || refcount > 0xffff)
         return -EINVAL;
-    if (refcount == 0 && cluster_index < s->free_cluster_index) {
-        s->free_cluster_index = cluster_index;
+    if (refcount == 0) {
+        if (cluster_index < s->free_cluster_index1)
+            s->free_cluster_index1 = cluster_index;
+        if (cluster_index < s->free_cluster_index2)
+            s->free_cluster_index2 = cluster_index;
+        if (cluster_index == s->zero_cluster)
+            s->zero_cluster = 0;
     }
     s->refcount_block_cache[block_index] = cpu_to_be16(refcount);
     if (bdrv_pwrite(s->hd,

^ permalink raw reply related	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2008-09-03 17:47 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-09-02 16:28 [Qemu-devel][PATCH,RFC] Zero cluster dedup Shahar Frank
2008-09-03  7:09 ` Laurent Vivier
2008-09-03  7:35   ` Shahar Frank
2008-09-03  7:59     ` Laurent Vivier
2008-09-03  8:13       ` Kevin Wolf
2008-09-03  8:25         ` Laurent Vivier
2008-09-03  9:38 ` Kevin Wolf
2008-09-03 12:05   ` Shahar Frank
2008-09-03 12:47     ` Kevin Wolf
2008-09-03 13:07       ` Shahar Frank
2008-09-03 13:12         ` Laurent Vivier
2008-09-03 17:44           ` Shahar Frank
2008-09-03 13:09       ` Laurent Vivier

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).