qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: "Benoît Canet" <benoit@irqsave.net>
To: qemu-devel@nongnu.org
Cc: kwolf@redhat.com, "Benoît Canet" <benoit@irqsave.net>,
	stefanha@redhat.com
Subject: [Qemu-devel] [RFC V8 02/24] qcow2: Add deduplication structures and fields.
Date: Thu, 20 Jun 2013 16:26:10 +0200	[thread overview]
Message-ID: <1371738392-9594-3-git-send-email-benoit@irqsave.net> (raw)
In-Reply-To: <1371738392-9594-1-git-send-email-benoit@irqsave.net>

Signed-off-by: Benoit Canet <benoit@irqsave.net>
---
 block/qcow2.h |  203 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 201 insertions(+), 2 deletions(-)

diff --git a/block/qcow2.h b/block/qcow2.h
index 9421843..953edfe 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -57,7 +57,182 @@
 #define REFCOUNT_CACHE_SIZE 4
 
 #define DEFAULT_CLUSTER_SIZE 65536
-
+#define DEFAULT_DEDUP_CLUSTER_SIZE 4096
+
+#define HASH_LENGTH 32
+
+/* indicate that this cluster hash has been deleted from the key value store */
+#define QCOW_DEDUP_DELETED (1LL << 61)
+/* indicate that the hash structure is empty and lacks an offset */
+#define QCOW_DEDUP_FLAG_EMPTY   (1LL << 62)
+
+#define SSD_ERASE_BLOCK_SIZE (512 * 1024) /* match SSD erase block size */
+#define JOURNAL_CLUSTER_SIZE 4096       /* used to read entries */
+#define HASH_STORE_CLUSTER_SIZE 4096
+
+#define QCOW_LOG_END_SIZE 2            /* size of an end block journal entry */
+#define QCOW_LOG_STORE_ENTRY_USED (1LL << 60) /* mark used entry in table */
+#define QCOW_LOG_STORE_BUCKET_SIZE 4   /* size of a cuckoo hash bucket */
+#define QCOW_LOG_STORE_MAX_KICKS 128   /* max number of cuckoo hash kicks */
+#define QCOW_LOG_STORE_JOURNAL_RATIO 2 /* the ratio to compute the extra
+                                        * room the journal will take based
+                                        * on the log store size
+                                        */
+#define QCOW2_NB_INCARNATION_GOAL  128 /* targeted number of incarnations */
+
+#define QCOW_DEDUP_DIRTY 1 /* dirty flag in the qcow2 header extension */
+
+typedef enum {
+    QCOW_LOG_NONE = 0xFF,     /* on SSD, erased clusters will be marked none */
+    QCOW_LOG_END = 1,         /* end a block and point to the next */
+    QCOW_LOG_HASH = 2,        /* used to journalize a QCowHashInfo */
+} QCowLogEntryType;
+
+typedef enum {
+    QCOW_HASH_SHA256 = 0,
+    QCOW_HASH_SHA3   = 1,
+    QCOW_HASH_SKEIN  = 2,
+} QCowHashAlgo;
+
+typedef struct {
+    uint8_t data[HASH_LENGTH]; /* 32 bytes hash of a given cluster */
+} __attribute__((packed)) QCowHash;
+
+/* deduplication info */
+typedef struct {
+    QCowHash hash;
+    uint64_t physical_sect;       /* where the cluster is stored on disk */
+    uint64_t first_logical_sect;  /* logical sector of the first occurrence of
+                                   * this cluster
+                                   */
+} __attribute__((packed)) QCowHashInfo;
+
+/* Used to keep a single precomputed hash between the calls of the dedup
+ * function
+ */
+typedef struct {
+    QCowHashInfo hash_info;
+    bool reuse;     /* The main deduplication function can set this field to
+                     * true before exiting to avoid computing the same hash
+                     * twice. It's a speed optimization.
+                     */
+} QcowPersistentHash;
+
+/* Undedupable hashes that must be written later to disk */
+typedef struct QCowHashElement {
+    QCowHashInfo hash_info;
+    QTAILQ_ENTRY(QCowHashElement) next;
+} QCowHashElement;
+
+typedef struct {
+    QcowPersistentHash phash;  /* contains a hash persisting between calls of
+                                * qcow2_dedup()
+                                */
+    QTAILQ_HEAD(, QCowHashElement) undedupables;
+    uint64_t nb_clusters_processed;
+    uint64_t nb_undedupable_sectors;
+} QCowDedupState;
+
+/* The code must take care that the maximum size field of a QCowJournalEntry
+ * will be no more than 254 bytes.
+ * It's required to reserve the 2 bytes of room for QCOW_LOG_END entries
+ * in every case
+ */
+typedef union {
+    QCowHashInfo hash_info;
+    uint8_t      padding[254]; /* note the extra two bytes of padding to avoid
+                                * read overflow.
+                                */
+} QCowJournalEntryUnion;
+
+typedef struct {
+    uint8_t size;            /* maximum size of a journal entry is 254 bytes */
+    uint8_t type;            /* contains a QCowLogEntryType for future usage */
+    QCowJournalEntryUnion u;
+} __attribute__((packed)) QCowJournalEntry;
+
+typedef struct {
+    uint64_t sector;                  /* the journal physical on disk sector */
+    uint64_t size;                    /* the size of the journal in bytes */
+    uint64_t index;                   /* index of next buf cluster to write */
+    uint8_t  *write_buf;              /* used to buffer written data */
+    uint64_t offset_in_buf;           /* the offset in the write buffer */
+    bool     flushed;                 /* true if the buffer reached disk */
+    uint8_t  *read_cache;             /* used to cache read data */
+    int64_t read_index;               /* index of the cached read cluster */
+    bool started;                     /* has the journal been resumed */
+} QCowJournal;
+
+typedef struct {
+    QCowJournal journal;          /* the journal this log store will use */
+    uint32_t order;               /* the number of bits used for the sub hashes
+                                   * as sub hashes will be used as an index for
+                                   * locating each bucket nb_bucket = 2^order
+                                   */
+    uint16_t nb_kicks;            /* the number of cuckoo hash kicks done */
+    bool *kick_map;               /* used to avoid kick path loops */
+    QCowHashInfo *hash_table;     /* nb_buckets * QCOW_LOG_STORE_BUCKET_SIZE */
+    QCowHashInfo *hash_table_copy; /* copy of the hash table for packing */
+
+    /* members required to freeze a log store into a hash store consumable
+     * hash table (incarnation)
+     */
+    uint8_t *write_buf;           /* the buffer used to write */
+    uint64_t write_buf_offset;    /* the on disk offset of the buffer */
+    uint32_t in_buf_offset;       /* the current offset in the write buffer */
+} QCowLogStore;
+
+/* a QCowIncarnation is a frozen QCowLogStore
+ * Freezes are read only and their in ram filters are queried from the youngest
+ * to the oldest in order to know if their hash table contains a given
+ * QCowHashInfo.
+ * If so a read is issued to retrieve the QCowHashInfo.
+ */
+typedef struct QCowIncarnation {
+    uint64_t filter_offset;      /* the on disk offset of the ram filter */
+    uint64_t hash_table_offset;  /* the on disk offset of the hash table
+                                  * (should be just after the end of the filter)
+                                  */
+    uint64_t size;               /* the on disk size of the incarnation */
+    uint8_t  *filter;            /* an in ram filter */
+    QTAILQ_ENTRY(QCowIncarnation) next;
+} QCowIncarnation;
+
+typedef struct QCowLimbo {
+    uint64_t offset;             /* the on disk offset of the disk space to
+                                  * reincarnate
+                                  */
+    QTAILQ_ENTRY(QCowLimbo) next;
+} QCowLimbo;
+
+typedef struct {
+    uint32_t order;               /* the number of bits used for the sub hashes
+                                   * as sub hashes will be used as an index for
+                                   * locating each bucket nb_bucket = 2^order
+                                   */
+    uint32_t nb_incarnations;     /* the current number of incarnations */
+    uint32_t nb_in_limbo;         /* the number of dead incarnations */
+    QTAILQ_HEAD(incarnations_head, QCowIncarnation) incarnations;
+                                               /* a list of frozen hash tables
+                                                * ordered from the youngest
+                                                * to the oldest
+                                                */
+    QTAILQ_HEAD(in_limbo_head, QCowLimbo) in_limbo;
+                                       /* a list of dead incarnations whose
+                                        * disk space can be recycled
+                                        */
+} QCowHashStore;
+
+typedef struct {
+    uint32_t      order;
+    QCowLogStore  log_store;         /* the current log store */
+    QCowLogStore  frozen_log_store;  /* the log store to incarnate */
+    bool          freezing;          /* are we incarnating a log store */
+    QCowHashStore hash_store;
+    CoMutex       insert_lock;       /* used to prevent multiple freeze attempts
+                                      * at the same time
+                                      */
+} QCowStore;
 
 #define QCOW2_OPT_LAZY_REFCOUNTS "lazy_refcounts"
 
@@ -117,8 +292,10 @@ enum {
 enum {
     QCOW2_INCOMPAT_DIRTY_BITNR   = 0,
     QCOW2_INCOMPAT_DIRTY         = 1 << QCOW2_INCOMPAT_DIRTY_BITNR,
+    QCOW2_INCOMPAT_DEDUP_BITNR   = 1,
+    QCOW2_INCOMPAT_DEDUP         = 1 << QCOW2_INCOMPAT_DEDUP_BITNR,
 
-    QCOW2_INCOMPAT_MASK          = QCOW2_INCOMPAT_DIRTY,
+    QCOW2_INCOMPAT_MASK          = QCOW2_INCOMPAT_DIRTY | QCOW2_INCOMPAT_DEDUP,
 };
 
 /* Compatible feature bits */
@@ -163,6 +340,28 @@ typedef struct BDRVQcowState {
     int64_t free_cluster_index;
     int64_t free_byte_offset;
 
+    bool has_dedup;                     /* indicate if this image has dedup */
+
+    /* the following fields are saved in QCOW2 dedup header extension */
+    bool dedup_dirty;                   /* mapped to the header dirty flag */
+    uint64_t dedup_conf_offset;         /* disk offset of the dedup config */
+    size_t dedup_conf_size;             /* disk size of the dedup config */
+    QCowHashAlgo dedup_hash_algo;       /* the cryptographic hash algo used */
+    uint32_t dedup_max_incarnations;    /* the maximum number of incarnations in
+                                         * the hash store: beyond it the oldest
+                                         * incarnation is dropped. It's harmless
+                                         * for the dedup and allows it to scale.
+                                         * Overall it acts as a FIFO or LRU.
+                                         * A value of 0 means there is no limit.
+                                         */
+
+    QCowStore key_value_store;          /* the key value store used to store
+                                         * dedup hashes
+                                         */
+    int freeze_errno;                   /* catch errors when incarnating */
+    Coroutine *freeze_co;               /* coroutine used when freezing */
+    Coroutine *load_filter_co;          /* used to load incarnation filters */
+
     CoMutex lock;
 
     uint32_t crypt_method; /* current crypt method, 0 if no key yet */
-- 
1.7.10.4

  parent reply	other threads:[~2013-06-20 14:25 UTC|newest]

Thread overview: 41+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-06-20 14:26 [Qemu-devel] [RFC V8 00/24] QCOW2 deduplication core functionality Benoît Canet
2013-06-20 14:26 ` [Qemu-devel] [RFC V8 01/24] qcow2: Add journal specification Benoît Canet
2013-07-02 14:42   ` Stefan Hajnoczi
2013-07-02 14:54     ` Kevin Wolf
2013-07-02 21:26       ` Benoît Canet
2013-07-03  8:08         ` Kevin Wolf
2013-07-03  7:51       ` Stefan Hajnoczi
2013-07-02 21:23     ` Benoît Canet
2013-07-03  8:01       ` Stefan Hajnoczi
2013-07-03 12:35         ` Benoît Canet
2013-07-03  8:04       ` Kevin Wolf
2013-07-03 12:30         ` Benoît Canet
2013-07-03  8:12       ` Stefan Hajnoczi
2013-07-03 12:53         ` Benoît Canet
2013-07-04  7:13           ` Stefan Hajnoczi
2013-07-04 10:01             ` Benoît Canet
2013-07-16 22:45               ` Benoît Canet
2013-07-17  8:20                 ` Kevin Wolf
2013-06-20 14:26 ` Benoît Canet [this message]
2013-06-20 14:26 ` [Qemu-devel] [RFC V8 03/24] qcow2: Add journal Benoît Canet
2013-06-20 14:26 ` [Qemu-devel] [RFC V8 04/24] qcow2: Create the log store Benoît Canet
2013-06-20 14:26 ` [Qemu-devel] [RFC V8 05/24] qcow2: Add the hash store Benoît Canet
2013-06-20 14:26 ` [Qemu-devel] [RFC V8 06/24] qcow2: Add the deduplication store Benoît Canet
2013-06-20 14:26 ` [Qemu-devel] [RFC V8 07/24] qcow2: Add qcow2_dedup_read_missing_and_concatenate Benoît Canet
2013-06-20 14:26 ` [Qemu-devel] [RFC V8 08/24] qcow2: Create a way to link to l2 tables when deduplicating Benoît Canet
2013-06-20 14:26 ` [Qemu-devel] [RFC V8 09/24] qcow2: Make qcow2_update_cluster_refcount public Benoît Canet
2013-06-20 14:26 ` [Qemu-devel] [RFC V8 10/24] qcow2: Add qcow2_dedup and related functions Benoît Canet
2013-06-20 14:26 ` [Qemu-devel] [RFC V8 11/24] qcow2: Add qcow2_dedup_store_new_hashes Benoît Canet
2013-06-20 14:26 ` [Qemu-devel] [RFC V8 12/24] qcow2: Do allocate on rewrite on the dedup case Benoît Canet
2013-06-20 14:26 ` [Qemu-devel] [RFC V8 13/24] qcow2: Implement qcow2_compute_cluster_hash Benoît Canet
2013-06-20 14:26 ` [Qemu-devel] [RFC V8 14/24] qcow2: Load and save deduplication table header extension Benoît Canet
2013-06-20 14:26 ` [Qemu-devel] [RFC V8 15/24] qcow2: Extract qcow2_set_incompat_feature and qcow2_clear_incompat_feature Benoît Canet
2013-06-20 14:26 ` [Qemu-devel] [RFC V8 16/24] block: Add qcow2_dedup format and image creation code Benoît Canet
2013-06-20 14:26 ` [Qemu-devel] [RFC V8 17/24] qcow2: Drop hash for a given cluster when dedup makes refcount > 2^16/2 Benoît Canet
2013-06-20 14:26 ` [Qemu-devel] [RFC V8 18/24] qcow2: Remove hash when cluster is deleted Benoît Canet
2013-06-20 14:26 ` [Qemu-devel] [RFC V8 19/24] qcow2: Integrate deduplication in qcow2_co_writev loop Benoît Canet
2013-06-20 14:26 ` [Qemu-devel] [RFC V8 20/24] qcow2: Serialize write requests when deduplication is activated Benoît Canet
2013-06-20 14:26 ` [Qemu-devel] [RFC V8 21/24] qcow2: Integrate SKEIN hash algorithm in deduplication Benoît Canet
2013-06-20 14:26 ` [Qemu-devel] [RFC V8 22/24] qcow2: Add qcow2_dedup_init and qcow2_dedup_close Benoît Canet
2013-06-20 14:26 ` [Qemu-devel] [RFC V8 23/24] qcow2: Enable the deduplication feature Benoît Canet
2013-06-20 14:26 ` [Qemu-devel] [RFC V8 24/24] qcow2: Enable deduplication tests Benoît Canet

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1371738392-9594-3-git-send-email-benoit@irqsave.net \
    --to=benoit@irqsave.net \
    --cc=kwolf@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=stefanha@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).