qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [Qemu-devel] [RFC] qcow2 journalling draft
@ 2013-09-03 13:45 Kevin Wolf
  2013-09-03 14:43 ` Benoît Canet
                   ` (4 more replies)
  0 siblings, 5 replies; 24+ messages in thread
From: Kevin Wolf @ 2013-09-03 13:45 UTC (permalink / raw)
  To: benoit.canet; +Cc: kwolf, famz, jcody, qemu-devel, mreitz, stefanha

This contains an extension of the qcow2 spec that introduces journalling
to the image format, plus some preliminary type definitions and
function prototypes in the qcow2 code.

Journalling functionality is a crucial feature for the design of data
deduplication, and it will improve the core part of qcow2 by avoiding
cluster leaks on crashes as well as provide an easier way to get a
reliable implementation of performance features like Delayed COW.

At this point of the RFC, it would be most important to review the
on-disk structure. Once we're confident that it can do everything we
want, we can start going into more detail on the qemu side of things.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/Makefile.objs   |   2 +-
 block/qcow2-journal.c |  55 ++++++++++++++
 block/qcow2.h         |  78 +++++++++++++++++++
 docs/specs/qcow2.txt  | 204 +++++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 337 insertions(+), 2 deletions(-)
 create mode 100644 block/qcow2-journal.c

diff --git a/block/Makefile.objs b/block/Makefile.objs
index 3bb85b5..59be314 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -1,5 +1,5 @@
 block-obj-y += raw_bsd.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
-block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
+block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o qcow2-journal.o
 block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-obj-y += qed-check.o
 block-obj-y += vhdx.o
diff --git a/block/qcow2-journal.c b/block/qcow2-journal.c
new file mode 100644
index 0000000..5b20239
--- /dev/null
+++ b/block/qcow2-journal.c
@@ -0,0 +1,55 @@
+/*
+ * qcow2 journalling functions
+ *
+ * Copyright (c) 2013 Kevin Wolf <kwolf@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "block/block_int.h"
+#include "qcow2.h"
+
+#define QCOW2_JOURNAL_MAGIC 0x716a6f75726e616cULL  /* "qjournal" */
+#define QCOW2_JOURNAL_BLOCK_MAGIC 0x716a626b  /* "qjbk" */
+
+typedef struct Qcow2JournalHeader {
+    uint64_t    magic;
+    uint32_t    journal_size;
+    uint32_t    block_size;
+    uint32_t    synced_index;
+    uint32_t    synced_seq;
+    uint32_t    committed_seq;
+    uint32_t    checksum;
+} QEMU_PACKED Qcow2JournalHeader;
+
+/*
+ * One big transaction per journal block. The transaction is committed either
+ * time based or when a microtransaction (single set of operations that must be
+ * performed atomically) doesn't fit in the same block any more.
+ */
+typedef struct Qcow2JournalBlock {
+    uint32_t    magic;
+    uint32_t    checksum;
+    uint32_t    seq;
+    uint32_t    desc_offset; /* Allow block header extensions */
+    uint32_t    desc_bytes;
+    uint32_t    nb_data_blocks;
+} QEMU_PACKED Qcow2JournalBlock;
+
diff --git a/block/qcow2.h b/block/qcow2.h
index 1000239..2aee1fd 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -157,6 +157,10 @@ typedef struct Qcow2DiscardRegion {
     QTAILQ_ENTRY(Qcow2DiscardRegion) next;
 } Qcow2DiscardRegion;
 
+typedef struct Qcow2Journal {
+
+} Qcow2Journal;
+
 typedef struct BDRVQcowState {
     int cluster_bits;
     int cluster_size;
@@ -479,4 +483,78 @@ int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
     void **table);
 int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table);
 
+/* qcow2-journal.c functions */
+
+typedef struct Qcow2JournalTransaction Qcow2JournalTransaction;
+
+enum Qcow2JournalEntryTypeID {
+    QJ_DESC_NOOP    = 0,
+    QJ_DESC_WRITE   = 1,
+    QJ_DESC_COPY    = 2,
+
+    /* required after a cluster is freed and used for other purposes, so that
+     * new (unjournalled) data won't be overwritten with stale metadata */
+    QJ_DESC_REVOKE  = 3,
+};
+
+typedef struct Qcow2JournalEntryType {
+    enum Qcow2JournalEntryTypeID id;
+    int (*sync)(void *buf, size_t size);
+} Qcow2JournalEntryType;
+
+typedef struct Qcow2JournalDesc {
+    uint16_t    type;
+    uint16_t    size;
+} QEMU_PACKED Qcow2JournalDesc;
+
+typedef struct Qcow2JournalDescWrite {
+    Qcow2JournalDesc common;
+    struct {
+        uint32_t length;
+        uint64_t target_offset;
+        uint32_t data_block_index;
+    } write[];
+} QEMU_PACKED Qcow2JournalDescWrite;
+
+typedef struct Qcow2JournalDescCopy {
+    Qcow2JournalDesc common;
+    struct {
+        uint32_t length;
+        uint64_t target_offset;
+        uint64_t source_offset;
+    } copy[];
+} QEMU_PACKED Qcow2JournalDescCopy;
+
+typedef struct Qcow2JournalDescRevoke {
+    Qcow2JournalDesc common;
+    struct {
+        uint32_t length;
+        uint64_t target_offset;
+    } revoke[];
+} QEMU_PACKED Qcow2JournalDescRevoke;
+
+void qcow2_journal_register_entry_type(Qcow2JournalEntryType *type);
+
+/* When commit_interval seconds have passed since the last commit, or
+ * uncommitted journal data of at least commit_datasize bytes has accumulated
+ * (whatever occurs first), transactions are committed. */
+int qcow2_journal_init(Qcow2Journal **journal, uint64_t start_offset,
+                       int commit_interval, size_t commit_datasize);
+int qcow2_journal_destroy(Qcow2Journal *journal);
+
+/* These functions create microtransactions, i.e. a set of operations that must
+ * be executed atomically. In general, qemu doesn't map this to one qcow2
+ * on-disk transaction (which would leave a lot of space unused), but handles
+ * multiple microtransactions with one on-disk transaction. */
+Qcow2JournalTransaction *qcow2_journal_begin_transaction(Qcow2Journal *journal);
+void qcow2_journal_add(Qcow2JournalTransaction *ta, Qcow2JournalDesc *desc);
+void qcow2_journal_end_transaction(Qcow2JournalTransaction *ta);
+
+/* Commits all completed microtransactions (i.e. qcow2_journal_end_transaction
+ * has already been called) */
+int qcow2_journal_commit(Qcow2Journal *journal);
+
+/* Syncs all committed transactions */
+int qcow2_journal_sync(Qcow2Journal *journal);
+
 #endif
diff --git a/docs/specs/qcow2.txt b/docs/specs/qcow2.txt
index 33eca36..7578a4b 100644
--- a/docs/specs/qcow2.txt
+++ b/docs/specs/qcow2.txt
@@ -85,6 +85,10 @@ in the description of a field.
                                 be written to (unless for regaining
                                 consistency).
 
+                    Bit 2:      Journal dirty. A replay of the main journal is
+                                needed in order to regain consistency before
+                                accessing the image.
+
                    Bits 3-63:  Reserved (set to 0)
 
          80 -  87:  compatible_features
@@ -103,7 +107,11 @@ in the description of a field.
                     write to an image with unknown auto-clear features if it
                     clears the respective bits from this field first.
 
-                    Bits 0-63:  Reserved (set to 0)
+                    Bit 0:      Journal valid bit. This bit indicates that the
+                                image contains a valid main journal starting at
+                                journal_offset.
+
+                    Bits 1-63:  Reserved (set to 0)
 
          96 -  99:  refcount_order
                     Describes the width of a reference count block entry (width
@@ -114,6 +122,16 @@ in the description of a field.
                     Length of the header structure in bytes. For version 2
                     images, the length is always assumed to be 72 bytes.
 
+        104 - 111:  journal_offset
+                    Offset into the image file at which the main image journal
+                    starts. Must be aligned to a cluster boundary. 0 means that
+                    no journal is used.
+
+                    This field is only valid if the journal feature bit in
+                    autoclear_features is set. If the field is invalid or the
+                    header is too short to contain the field, the field is
+                    assumed to be 0 (no journal is used).
+
 Directly after the image header, optional sections called header extensions can
 be stored. Each extension has a structure like the following:
 
@@ -355,3 +373,187 @@ Snapshot table entry:
         variable:   Unique ID string for the snapshot (not null terminated)
 
         variable:   Name of the snapshot (not null terminated)
+
+
+== Journal ==
+
+Journals are used to allow safe updates of metadata without impacting
+performance by requiring flushes to order updates to different parts of the
+metadata. They consist of transactions, which in turn contain operations that
+are effectively executed atomically. A qcow2 image can have a main image
+journal that deals with cluster management operations, and additional specific
+journals can be used by other features like data deduplication.
+
+
+As far as the on-disk format is concerned, a transaction is in one of the
+following states:
+
+    Incomplete:     This is the initial state of any transaction, while new
+                    operations can still be added. When opening an image with a
+                    dirty journal, incomplete transactions are discarded.
+
+    Committed:      When all operations that must be performed atomically
+                    during the transaction have been written and are stable on
+                    disk, the transaction can be committed by increasing the
+                    committed sequence number in the journal header. A
+                    transaction in this state may not be changed. When opening
+                    an image with a dirty journal, committed transactions should
+                    be replayed.
+
+    Synced:         A transaction is synced if all of its operations have been
+                    performed, all data written is stable on disk, and the
+                    synced sequence number is increased in the journal header.
+                    Synced transactions are no longer needed in the journal and
+                    can be overwritten. They are ignored during replay.
+
+The use of a sequence number implies that transactions are processed
+sequentially and an earlier transaction can never be unsynced/uncommitted if a
+later one is synced/committed.
+
+
+A journal is organised in journal blocks, all of which have a reference count
+of exactly 1. It starts with a block containing the following journal header:
+
+    Byte  0 -  7:   Magic ("qjournal" ASCII string)
+
+          8 - 11:   Journal size in bytes, including the header
+
+         12 - 15:   Journal block size order (block size in bytes = 1 << order)
+                    The block size must be at least 512 bytes and must not
+                    exceed the cluster size.
+
+         16 - 19:   Journal block index of the descriptor for the last
+                    transaction that has been synced, starting with 1 for the
+                    journal block after the header. 0 is used for empty
+                    journals.
+
+         20 - 23:   Sequence number of the last transaction that has been
+                    synced. 0 is recommended as the initial value.
+
+         24 - 27:   Sequence number of the last transaction that has been
+                    committed. When replaying a journal, all transactions
+                    after the last synced one up to the last committed one must be
+                    synced. Note that this may include a wraparound of sequence
+                    numbers.
+
+         28 -  31:  Checksum (one's complement of the sum of all bytes in the
+                    header journal block except those of the checksum field)
+
+         32 - 511:  Reserved (set to 0)
+
+
+The header is followed by journal blocks that are either descriptor or data
+blocks. The block index at byte 16 points to the first valid descriptor, except
+for completely empty journals, where it can be 0. The next descriptor can be
+found by skipping a descriptor and its associated data blocks. When the journal
+size is exceeded, a wraparound occurs, essentially forming a ring buffer.
+
+A wraparound may not occur in the middle of a single transaction, but only
+between two transactions. For the necessary padding an empty descriptor with
+any number of data blocks can be used as the last entry of the ring.
+
+The chain of valid descriptors ends if a descriptor is reached whose sequence
+number isn't the successor of the previous sequence number. This means in
+particular that the journal must be ordered chronologically and has ascending
+sequence numbers (except in the case of a sequence number wraparound).
+All blocks from the end of the descriptor chain until the starting point are
+unused.
+
+
+Descriptor blocks describe one transaction each and have the following
+structure:
+
+    Byte  0 -  3:   Magic ("qjbk" ASCII string)
+
+          4 -  7:   Checksum (one's complement of the sum of all bytes in the
+                    descriptor block except those of the checksum field, and
+                    all bytes in the associated data blocks)
+
+          8 - 11:   Sequence number of the transaction
+
+         12 - 15:   Byte offset into the descriptor block at which descriptors
+                    start
+
+         16 - 19:   Total length of descriptors in this block in bytes
+
+         20 - 23:   Number of following data blocks that are associated with
+                    this transaction.
+
+         24 -  n:   (Future extensions)
+
+          n -  m:   Array of descriptors as described below. The exact values
+                    of n and m are determined by the above fields.
+
+All descriptors start with a common part:
+
+    Byte  0 -  1:   Descriptor type
+                        0 - No-op descriptor
+                        1 - Write data block
+                        2 - Copy data
+                        3 - Revoke
+                        4 - Deduplication hash insertion
+                        5 - Deduplication hash deletion
+
+          2 -  3:   Size of the descriptor in bytes
+
+          4 -  n:   Type-specific data
+
+The following section specifies the purpose (i.e. the action that is to be
+performed when syncing) and type-specific data layout of each descriptor type:
+
+  * No-op descriptor: No action is to be performed when syncing this descriptor
+
+          4 -  n:   Ignored
+
+  * Write data block: Write literal data associated with this transaction from
+    the journal to a given offset.
+
+          4 -  7:   Length of the data to write in bytes
+
+          8 - 15:   Offset in the image file to write the data to
+
+         16 - 19:   Index of the journal block at which the data to write
+                    starts. The data must be stored sequentially and be fully
+                    contained in the data blocks associated with the
+                    transaction.
+
+    The type-specific data can be repeated, specifying multiple chunks of data
+    to be written in one operation. This means the size of the descriptor must
+    be 4 + 16 * n.
+
+  * Copy data: Copy data from one offset in the image to another one. This can
+    be used for journalling copy-on-write operations.
+
+          4 -  7:   Length of the data to write in bytes
+
+          8 - 15:   Target offset in the image file
+
+         16 - 23:   Source offset in the image file
+
+    The type-specific data can be repeated, specifying multiple chunks of data
+    to be copied in one operation. This means the size of the descriptor must
+    be 4 + 20 * n.
+
+  * Revoke: Marks operations on a given range in the image file invalid for all
+    earlier transactions (this does not include the transaction containing the
+    revoke). They must not be executed on a sync operation (e.g. because the
+    range in question has been freed and may have been reused for other, not
+    journalled data structures that must not be overwritten with stale data).
+    Note that this may mean that operations are to be executed partially.
+
+          4 -  7:   Length of the range in bytes
+
+          8 - 15:   Offset of the range in the image file
+
+    The type-specific data can be repeated, specifying multiple ranges for
+    which operations should be revoked. This means the size of the descriptor
+    must be 4 + 12 * n.
+
+  * Deduplication hash insertion: Associates a hash value with a cluster.
+
+    TODO
+
+  * Deduplication hash deletion: Marks a hash value invalid (e.g. because the
+    hashed data has changed)
+
+    TODO
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [RFC] qcow2 journalling draft
  2013-09-03 13:45 [Qemu-devel] [RFC] qcow2 journalling draft Kevin Wolf
@ 2013-09-03 14:43 ` Benoît Canet
  2013-09-04  8:03 ` Stefan Hajnoczi
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 24+ messages in thread
From: Benoît Canet @ 2013-09-03 14:43 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: benoit.canet, famz, jcody, qemu-devel, mreitz, stefanha

Le Tuesday 03 Sep 2013 à 15:45:52 (+0200), Kevin Wolf a écrit :
> This contains an extension of the qcow2 spec that introduces journalling
> to the image format, plus some preliminary type definitions and
> function prototypes in the qcow2 code.
> 
> Journalling functionality is a crucial feature for the design of data
> deduplication, and it will improve the core part of qcow2 by avoiding
> cluster leaks on crashes as well as provide an easier way to get a
> reliable implementation of performance features like Delayed COW.
> 
> At this point of the RFC, it would be most important to review the
> on-disk structure. Once we're confident that it can do everything we
> want, we can start going into more detail on the qemu side of things.
> 
> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> ---
>  block/Makefile.objs   |   2 +-
>  block/qcow2-journal.c |  55 ++++++++++++++
>  block/qcow2.h         |  78 +++++++++++++++++++
>  docs/specs/qcow2.txt  | 204 +++++++++++++++++++++++++++++++++++++++++++++++++-
>  4 files changed, 337 insertions(+), 2 deletions(-)
>  create mode 100644 block/qcow2-journal.c
> 
> diff --git a/block/Makefile.objs b/block/Makefile.objs
> index 3bb85b5..59be314 100644
> --- a/block/Makefile.objs
> +++ b/block/Makefile.objs
> @@ -1,5 +1,5 @@
>  block-obj-y += raw_bsd.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
> -block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
> +block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o qcow2-journal.o
>  block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>  block-obj-y += qed-check.o
>  block-obj-y += vhdx.o
> diff --git a/block/qcow2-journal.c b/block/qcow2-journal.c
> new file mode 100644
> index 0000000..5b20239
> --- /dev/null
> +++ b/block/qcow2-journal.c
> @@ -0,0 +1,55 @@
> +/*
> + * qcow2 journalling functions
> + *
> + * Copyright (c) 2013 Kevin Wolf <kwolf@redhat.com>
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a copy
> + * of this software and associated documentation files (the "Software"), to deal
> + * in the Software without restriction, including without limitation the rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> + * THE SOFTWARE.
> + */
> +
> +#include "qemu-common.h"
> +#include "block/block_int.h"
> +#include "qcow2.h"
> +
> +#define QCOW2_JOURNAL_MAGIC 0x716a6f75726e616cULL  /* "qjournal" */
> +#define QCOW2_JOURNAL_BLOCK_MAGIC 0x716a626b  /* "qjbk" */
> +
> +typedef struct Qcow2JournalHeader {
> +    uint64_t    magic;
> +    uint32_t    journal_size;
> +    uint32_t    block_size;
> +    uint32_t    synced_index;
> +    uint32_t    synced_seq;
> +    uint32_t    committed_seq;
> +    uint32_t    checksum;
> +} QEMU_PACKED Qcow2JournalHeader;
> +
> +/*
> + * One big transaction per journal block. The transaction is committed either
> + * time based or when a microtransaction (single set of operations that must be
> + * performed atomically) doesn't fit in the same block any more.
> + */
> +typedef struct Qcow2JournalBlock {
> +    uint32_t    magic;
> +    uint32_t    checksum;
> +    uint32_t    seq;
> +    uint32_t    desc_offset; /* Allow block header extensions */
> +    uint32_t    desc_bytes;
> +    uint32_t    nb_data_blocks;
> +} QEMU_PACKED Qcow2JournalBlock;
> +
> diff --git a/block/qcow2.h b/block/qcow2.h
> index 1000239..2aee1fd 100644
> --- a/block/qcow2.h
> +++ b/block/qcow2.h
> @@ -157,6 +157,10 @@ typedef struct Qcow2DiscardRegion {
>      QTAILQ_ENTRY(Qcow2DiscardRegion) next;
>  } Qcow2DiscardRegion;
>  
> +typedef struct Qcow2Journal {
> +
> +} Qcow2Journal;
> +
>  typedef struct BDRVQcowState {
>      int cluster_bits;
>      int cluster_size;
> @@ -479,4 +483,78 @@ int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
>      void **table);
>  int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table);
>  
> +/* qcow2-journal.c functions */
> +
> +typedef struct Qcow2JournalTransaction Qcow2JournalTransaction;
> +
> +enum Qcow2JournalEntryTypeID {
> +    QJ_DESC_NOOP    = 0,
> +    QJ_DESC_WRITE   = 1,
> +    QJ_DESC_COPY    = 2,
> +
> +    /* required after a cluster is freed and used for other purposes, so that
> +     * new (unjournalled) data won't be overwritten with stale metadata */
> +    QJ_DESC_REVOKE  = 3,
> +};
> +
> +typedef struct Qcow2JournalEntryType {
> +    enum Qcow2JournalEntryTypeID id;
> +    int (*sync)(void *buf, size_t size);
> +} Qcow2JournalEntryType;
> +
> +typedef struct Qcow2JournalDesc {
> +    uint16_t    type;
> +    uint16_t    size;
> +} QEMU_PACKED Qcow2JournalDesc;
> +
> +typedef struct Qcow2JournalDescWrite {
> +    Qcow2JournalDesc common;
> +    struct {
> +        uint32_t length;
> +        uint64_t target_offset;
> +        uint32_t data_block_index;
> +    } write[];
> +} QEMU_PACKED Qcow2JournalDescData;
> +
> +typedef struct Qcow2JournalDescCopy {
> +    Qcow2JournalDesc common;
> +    struct {
> +        uint32_t length;
> +        uint64_t target_offset;
> +        uint64_t source_offset;
> +    } copy[];
> +} QEMU_PACKED Qcow2JournalDescCopy;
> +
> +typedef struct Qcow2JournalRevoke {
> +    Qcow2JournalDesc common;
> +    struct {
> +        uint32_t length;
> +        uint64_t target_offset;
> +    } revoke[];
> +} QEMU_PACKED Qcow2JournalDescRevoke;
> +
> +void qcow2_journal_register_entry_type(Qcow2JournalEntryType *type);
> +
> +/* When commit_interval seconds have passed since the last commit, or
> + * uncommitted journal data of at least commit_datasize bytes has accumulated
> + * (whatever occurs first), transactions are committed. */
> +int qcow2_journal_init(Qcow2Journal **journal, uint64_t start_offset,
> +                       int commit_interval, size_t commit_datasize);
> +int qcow2_journal_destroy(Qcow2Journal *journal);
> +
> +/* These functions create microtransactions, i.e. a set of operations that must
> + * be executed atomically. In general, qemu doesn't map this to one qcow2
> + * on-disk transaction (which would leave a lot of space unused), but handles
> + * multiple microtransaction with one on-disk transaction. */
> +Qcow2JournalTransaction *qcow2_journal_begin_transaction(Qcow2Journal *journal);
> +void qcow2_journal_add(Qcow2JournalTransaction *ta, Qcow2JournalDesc *desc);
> +void qcow2_journal_end_transaction(Qcow2JournalTransaction *ta);
> +
> +/* Commits all completed microtransactions (i.e. qcow2_journal_end_transaction
> + * has already been called) */
> +int qcow2_journal_commit(Qcow2Journal *journal);
> +
> +/* Syncs all committed transactions */
> +int qcow2_journal_sync(Qcow2Journal *journal);
> +
>  #endif
> diff --git a/docs/specs/qcow2.txt b/docs/specs/qcow2.txt
> index 33eca36..7578a4b 100644
> --- a/docs/specs/qcow2.txt
> +++ b/docs/specs/qcow2.txt
> @@ -85,6 +85,10 @@ in the description of a field.
>                                  be written to (unless for regaining
>                                  consistency).
>  
> +                    Bit 2:      Journal dirty. A replay of the main journal is
> +                                needed in order to regain consistency before
> +                                accessing the image.
> +
>                      Bits 2-63:  Reserved (set to 0)
>  
>           80 -  87:  compatible_features
> @@ -103,7 +107,11 @@ in the description of a field.
>                      write to an image with unknown auto-clear features if it
>                      clears the respective bits from this field first.
>  
> -                    Bits 0-63:  Reserved (set to 0)
> +                    Bit 0:      Journal valid bit. This bit indicates that the
> +                                image contains a valid main journal starting at
> +                                journal_offset.
> +
> +                    Bits 1-63:  Reserved (set to 0)
>  
>           96 -  99:  refcount_order
>                      Describes the width of a reference count block entry (width
> @@ -114,6 +122,16 @@ in the description of a field.
>                      Length of the header structure in bytes. For version 2
>                      images, the length is always assumed to be 72 bytes.
>  
> +        104 - 111:  journal_offset
> +                    Offset into the image file at which the main image journal
> +                    starts. Must be aligned to a cluster boundary. 0 means that
> +                    no journal is used.
> +
> +                    This field is only valid if the journal feature bit in
> +                    autoclear_features is set. If the field is invalid or the
> +                    header is too short to contain the field, the field is
> +                    assumed to be 0 (no journal is used)
> +
>  Directly after the image header, optional sections called header extensions can
>  be stored. Each extension has a structure like the following:
>  
> @@ -355,3 +373,187 @@ Snapshot table entry:
>          variable:   Unique ID string for the snapshot (not null terminated)
>  
>          variable:   Name of the snapshot (not null terminated)
> +
> +
> +== Journal ==
> +
> +Journals are used to allow safe updates of metadata without impacting
> +performance by requiring flushes to order updates to different parts of the
> +metadata. They consist of transactions, which in turn contain operations that
> +are effectively executed atomically. A qcow2 image can have a main image
> +journal that deals with cluster management operations, and additional specific
> +journals can be used by other features like data deduplication.
> +
> +
> +As far as the on-disk format is concerned, a transaction is in one of the
> +following states:
> +
> +    Incomplete:     This is the initial state of any transaction, while new
> +                    operations can still be added. When opening an image with a
> +                    dirty journal, incomplete transactions are discarded.
> +
> +    Committed:      When all operations that must be performed atomically
> +                    during the transaction have been written and are stable on
> +                    disk, the transaction can be committed by increasing the
> +                    commited sequence number in the journal heder. A
s/heder/header/
> +                    transaction in this state may not be changed. When opening
> +                    an image with a dirty image, committed transactions should
> +                    be replayed.
> +
> +    Synced:         A transaction is synced if all of its operations have been
> +                    performed, all data written is stable on disk, and the
> +                    synced sequence number is increased in the journal header.
> +                    Synced transactions are no longer needed in the journal and
> +                    can be overwritten. They are ignored during replay.

Some NAND SSDs have a primitive FTL which will always write the header data to the
same erase block, leading to fast wear out.
(3000 erase cycles on a consumer NAND device)
Maybe creating a linked list of the transaction descriptors and storing the
synced/committed sequence numbers in the next transaction descriptor could help
to minimize this effect.
The first block descriptor in the journal header would then be updated only from
time to time, when really needed.
This would help spread the wear out.

> +
> +The use of a sequence number implies that transactions are processed
> +sequentially and an earlier transaction can never be unsynced/uncommitted if a
> +later one is synced/committed.
> +
> +
> +A journal is organised in journal blocks, all of which have a reference count
> +of exactly 1. It starts with a block containing the following journal header:
> +
> +    Byte  0 -  7:   Magic ("qjournal" ASCII string)
> +
> +          8 - 11:   Journal size in bytes, including the header
> +
> +         12 - 15:   Journal block size order (block size in bytes = 1 << order)
> +                    The block size must be at least 512 bytes and must not
> +                    exceed the cluster size.
> +
> +         16 - 19:   Journal block index of the descriptor for the last
> +                    transaction that has been synced, starting with 1 for the
> +                    journal block after the header. 0 is used for empty
> +                    journals.
> +
> +         20 - 23:   Sequence number of the last transaction that has been
> +                    synced. 0 is recommended as the initial value.
> +
> +         24 - 27:   Sequence number of the last transaction that has been
> +                    committed. When replaying a journal, all transactions
> +                    after the last synced one up to the last commit one must be
> +                    synced. Note that this may include a wraparound of sequence
> +                    numbers.
> +
> +         28 -  31:  Checksum (one's complement of the sum of all bytes in the
> +                    header journal block except those of the checksum field)
A checksum more resilient to similar bit flips would be useful.

> +
> +         32 - 511:  Reserved (set to 0)
> +
> +
> +The header is followed by journal blocks that are either descriptor or data
> +blocks. The block index at byte 16 points to the first valid descriptor, except
> +for completely empty journals, where it can be 0. The next descriptor can be
> +found by skipping a descriptor and its associated data blocks. When the journal
> +size is exceeded, a wraparound occurs, essentially forming a ring buffer.
> +
> +A wraparound may not occur in the middle of a single transaction, but only
> +between two transactions. For the necessary padding an empty descriptor with
> +any number of data blocks can be used as the last entry of the ring.
> +
> +The chain of valid descriptors ends if a descriptor is reached whose sequence
> +number isn't the successor of the previous sequence number. This means in
> +particular that the journal must be ordered chronologically and has ascending
> +sequence numbers (except in the case of a sequence number wraparound).
> +All blocks from the end of the descriptor chain until the starting point are
> +unused.
> +
> +
> +Descriptor blocks describe one transaction each and have the following
> +structure:
> +
> +    Byte  0 -  3:   Magic ("qjbk" ASCII string)
> +
> +          4 -  7:   Checksum (one's complement of the sum of all bytes in the
> +                    descriptor block except those of the checksum field, and
> +                    all bytes in the associated data blocks)

I am not sure I understand "and all bytes in the associated data blocks".
Does it mean that the whole array of descriptors is skipped?
Shouldn't they have their own individual checksums?

> +
> +          8 - 11:   Sequence number of the transaction
> +
> +         12 - 15:   Byte offset into the descriptor block at which descriptors
> +                    start
> +
> +         16 - 19:   Total length of descriptors in this block in bytes
> +
> +         20 - 23:   Number of following data blocks that are associated with
> +                    this transaction.
> +
> +         24 -  n:   (Future extensions)
> +
> +          n -  m:   Array of descriptors as described below. The exact values
> +                    of n and m are determined by the above fields.
> +
> +All descriptors start with a common part:
> +
> +    Byte  0 -  1:   Descriptor type
> +                        0 - No-op descriptor
> +                        1 - Write data block
> +                        2 - Copy data
> +                        3 - Revoke
> +                        4 - Deduplication hash insertion
> +                        5 - Deduplication hash deletion
> +
> +          2 -  3:   Size of the descriptor in bytes
> +
> +          4 -  n:   Type-specific data
> +
> +The following section specifies the purpose (i.e. the action that is to be
> +performed when syncing) and type-specific data layout of each descriptor type:
> +
> +  * No-op descriptor: No action is to be performed when syncing this descriptor
> +
> +          4 -  n:   Ignored
> +
> +  * Write data block: Write literal data associated with this transaction from
> +    the journal to a given offset.
> +
> +          4 -  7:   Length of the data to write in bytes
> +
> +          8 - 15:   Offset in the image file to write the data to
> +
> +         16 - 19:   Index of the journal block at which the data to write
> +                    starts. The data must be stored sequentially and be fully
> +                    contained in the data blocks associated with the
> +                    transaction.
> +
> +    The type-specific data can be repeated, specifying multiple chunks of data
> +    to be written in one operation. This means the size of the descriptor must
> +    be 4 + 16 * n.
> +
> +  * Copy data: Copy data from one offset in the image to another one. This can
> +    be used for journalling copy-on-write operations.
> +
> +          4 -  7:   Length of the data to write in bytes
> +
> +          8 - 15:   Target offset in the image file
> +
> +         16 - 23:   Source offset in the image file
> +
> +    The type-specific data can be repeated, specifying multiple chunks of data
> +    to be copied in one operation. This means the size of the descriptor must
> +    be 4 + 20 * n.
> +
> +  * Revoke: Marks operations on a given range in the imag file invalid for all
s/imag/image/
> +    earlier transactions (this does not include the transaction containing the
> +    revoke). They must not be executed on a sync operation (e.g. because the
> +    range in question has been freed and may have been reused for other, not
> +    journalled data structures that must not be overwritten with stale data).
> +    Note that this may mean that operations are to be executed partially.
> +
> +          4 -  7:   Length of the range in bytes
> +
> +          8 - 15:   Offset of the range in the image file
> +
> +    The type-specific data can be repeated, specifying multiple ranges for
> +    which operations should be revoked. This means the size of the descriptor
> +    must be 4 + 12 * n.
> +
> +  * Deduplication hash insertion: Associates a hash value with a cluster.
> +
I will take care of these :)

Best regards

Benoît
> +    TODO
> +
> +  * Deduplication hash deletion: Marks a hash value invalid (e.g. because the
> +    hashed data has changed)
> +
> +    TODO
> -- 
> 1.8.1.4
> 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [RFC] qcow2 journalling draft
  2013-09-03 13:45 [Qemu-devel] [RFC] qcow2 journalling draft Kevin Wolf
  2013-09-03 14:43 ` Benoît Canet
@ 2013-09-04  8:03 ` Stefan Hajnoczi
  2013-09-04  9:37   ` Benoît Canet
  2013-09-04  9:39   ` Kevin Wolf
  2013-09-04  8:32 ` Max Reitz
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 24+ messages in thread
From: Stefan Hajnoczi @ 2013-09-04  8:03 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: benoit.canet, jcody, famz, qemu-devel, mreitz

On Tue, Sep 03, 2013 at 03:45:52PM +0200, Kevin Wolf wrote:
> @@ -103,7 +107,11 @@ in the description of a field.
>                      write to an image with unknown auto-clear features if it
>                      clears the respective bits from this field first.
>  
> -                    Bits 0-63:  Reserved (set to 0)
> +                    Bit 0:      Journal valid bit. This bit indicates that the
> +                                image contains a valid main journal starting at
> +                                journal_offset.

Whether the journal is used can be determined from the journal_offset
value (header length must be large enough and journal offset must be
valid).

Why do we need this autoclear bit?

> +Journals are used to allow safe updates of metadata without impacting
> +performance by requiring flushes to order updates to different parts of the
> +metadata.

This sentence is hard to parse.  Maybe something shorter like this:

Journals allow safe metadata updates without the need for carefully
ordering and flushing between update steps.

> +They consist of transactions, which in turn contain operations that
> +are effectively executed atomically. A qcow2 image can have a main image
> +journal that deals with cluster management operations, and additional specific
> +journals can be used by other features like data deduplication.

I'm not sure if multiple journals will work in practice.  Doesn't this
re-introduce the need to order update steps and flush between them?

> +A journal is organised in journal blocks, all of which have a reference count
> +of exactly 1. It starts with a block containing the following journal header:
> +
> +    Byte  0 -  7:   Magic ("qjournal" ASCII string)
> +
> +          8 - 11:   Journal size in bytes, including the header
> +
> +         12 - 15:   Journal block size order (block size in bytes = 1 << order)
> +                    The block size must be at least 512 bytes and must not
> +                    exceed the cluster size.
> +
> +         16 - 19:   Journal block index of the descriptor for the last
> +                    transaction that has been synced, starting with 1 for the
> +                    journal block after the header. 0 is used for empty
> +                    journals.
> +
> +         20 - 23:   Sequence number of the last transaction that has been
> +                    synced. 0 is recommended as the initial value.
> +
> +         24 - 27:   Sequence number of the last transaction that has been
> +                    committed. When replaying a journal, all transactions
> +                    after the last synced one up to the last commit one must be
> +                    synced. Note that this may include a wraparound of sequence
> +                    numbers.
> +
> +         28 -  31:  Checksum (one's complement of the sum of all bytes in the
> +                    header journal block except those of the checksum field)
> +
> +         32 - 511:  Reserved (set to 0)

I'm not sure if these fields are necessary.  They require updates (and
maybe flush) after every commit and sync.

The fewer metadata updates, the better, not just for performance but
also to reduce the risk of data loss.  If any metadata required to
access the journal is corrupted, the image will be unavailable.

It should be possible to determine this information by scanning the
journal transactions.

> +A wraparound may not occur in the middle of a single transaction, but only
> +between two transactions. For the necessary padding an empty descriptor with
> +any number of data blocks can be used as the last entry of the ring.

Why have this limitation?

> +All descriptors start with a common part:
> +
> +    Byte  0 -  1:   Descriptor type
> +                        0 - No-op descriptor
> +                        1 - Write data block
> +                        2 - Copy data
> +                        3 - Revoke
> +                        4 - Deduplication hash insertion
> +                        5 - Deduplication hash deletion
> +
> +          2 -  3:   Size of the descriptor in bytes

Data blocks are not included in the descriptor size?  I just want to
make sure that we don't be limited to 64 KB for the actual data.

> +
> +          4 -  n:   Type-specific data
> +
> +The following section specifies the purpose (i.e. the action that is to be
> +performed when syncing) and type-specific data layout of each descriptor type:
> +
> +  * No-op descriptor: No action is to be performed when syncing this descriptor
> +
> +          4 -  n:   Ignored
> +
> +  * Write data block: Write literal data associated with this transaction from
> +    the journal to a given offset.
> +
> +          4 -  7:   Length of the data to write in bytes
> +
> +          8 - 15:   Offset in the image file to write the data to
> +
> +         16 - 19:   Index of the journal block at which the data to write
> +                    starts. The data must be stored sequentially and be fully
> +                    contained in the data blocks associated with the
> +                    transaction.
> +
> +    The type-specific data can be repeated, specifying multiple chunks of data
> +    to be written in one operation. This means the size of the descriptor must
> +    be 4 + 16 * n.

Why is the necessary?  Multiple data descriptors could be used, is it
worth the additional logic and testing?

> +
> +  * Copy data: Copy data from one offset in the image to another one. This can
> +    be used for journalling copy-on-write operations.

This reminds me to ask what the plan is for journal scope: metadata only
or also data?  For some operations like dedupe it seems that full data
journalling may be necessary.  But for an image without dedupe it would
not be necessary to journal the rewrites to an already allocated
cluster, for example.

> +          4 -  7:   Length of the data to write in bytes
> +
> +          8 - 15:   Target offset in the image file
> +
> +         16 - 23:   Source offset in the image file

Source and target cannot overlap?

> +
> +    The type-specific data can be repeated, specifying multiple chunks of data
> +    to be copied in one operation. This means the size of the descriptor must
> +    be 4 + 20 * n.
> +
> +  * Revoke: Marks operations on a given range in the imag file invalid for all

s/imag/image/

> +    earlier transactions (this does not include the transaction containing the
> +    revoke). They must not be executed on a sync operation (e.g. because the
> +    range in question has been freed and may have been reused for other, not
> +    journalled data structures that must not be overwritten with stale data).
> +    Note that this may mean that operations are to be executed partially.

Example scenario?

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [RFC] qcow2 journalling draft
  2013-09-03 13:45 [Qemu-devel] [RFC] qcow2 journalling draft Kevin Wolf
  2013-09-03 14:43 ` Benoît Canet
  2013-09-04  8:03 ` Stefan Hajnoczi
@ 2013-09-04  8:32 ` Max Reitz
  2013-09-04 10:12   ` Kevin Wolf
  2013-09-05  9:35 ` Stefan Hajnoczi
  2013-09-06  9:59 ` Fam Zheng
  4 siblings, 1 reply; 24+ messages in thread
From: Max Reitz @ 2013-09-04  8:32 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: benoit.canet, jcody, famz, qemu-devel, stefanha

On 2013-09-03 15:45, Kevin Wolf wrote:
> This contains an extension of the qcow2 spec that introduces journalling
> to the image format, plus some preliminary type definitions and
> function prototypes in the qcow2 code.
>
> Journalling functionality is a crucial feature for the design of data
> deduplication, and it will improve the core part of qcow2 by avoiding
> cluster leaks on crashes as well as provide an easier way to get a
> reliable implementation of performance features like Delayed COW.
>
> At this point of the RFC, it would be most important to review the
> on-disk structure. Once we're confident that it can do everything we
> want, we can start going into more detail on the qemu side of things.
>
> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> ---
>   block/Makefile.objs   |   2 +-
>   block/qcow2-journal.c |  55 ++++++++++++++
>   block/qcow2.h         |  78 +++++++++++++++++++
>   docs/specs/qcow2.txt  | 204 +++++++++++++++++++++++++++++++++++++++++++++++++-
>   4 files changed, 337 insertions(+), 2 deletions(-)
>   create mode 100644 block/qcow2-journal.c
>
> diff --git a/block/Makefile.objs b/block/Makefile.objs
> index 3bb85b5..59be314 100644
> --- a/block/Makefile.objs
> +++ b/block/Makefile.objs
> @@ -1,5 +1,5 @@
>   block-obj-y += raw_bsd.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
> -block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
> +block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o qcow2-journal.o
>   block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>   block-obj-y += qed-check.o
>   block-obj-y += vhdx.o
> diff --git a/block/qcow2-journal.c b/block/qcow2-journal.c
> new file mode 100644
> index 0000000..5b20239
> --- /dev/null
> +++ b/block/qcow2-journal.c
> @@ -0,0 +1,55 @@
> +/*
> + * qcow2 journalling functions
> + *
> + * Copyright (c) 2013 Kevin Wolf <kwolf@redhat.com>
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a copy
> + * of this software and associated documentation files (the "Software"), to deal
> + * in the Software without restriction, including without limitation the rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> + * THE SOFTWARE.
> + */
> +
> +#include "qemu-common.h"
> +#include "block/block_int.h"
> +#include "qcow2.h"
> +
> +#define QCOW2_JOURNAL_MAGIC 0x716a6f75726e616cULL  /* "qjournal" */
> +#define QCOW2_JOURNAL_BLOCK_MAGIC 0x716a626b  /* "qjbk" */
> +
> +typedef struct Qcow2JournalHeader {
> +    uint64_t    magic;
> +    uint32_t    journal_size;
> +    uint32_t    block_size;
> +    uint32_t    synced_index;
> +    uint32_t    synced_seq;
> +    uint32_t    committed_seq;
> +    uint32_t    checksum;
> +} QEMU_PACKED Qcow2JournalHeader;
> +
> +/*
> + * One big transaction per journal block. The transaction is committed either
> + * time based or when a microtransaction (single set of operations that must be
> + * performed atomically) doesn't fit in the same block any more.
> + */
> +typedef struct Qcow2JournalBlock {
> +    uint32_t    magic;
> +    uint32_t    checksum;
> +    uint32_t    seq;
> +    uint32_t    desc_offset; /* Allow block header extensions */
> +    uint32_t    desc_bytes;
> +    uint32_t    nb_data_blocks;
> +} QEMU_PACKED Qcow2JournalBlock;
> +
Why is this in the C file...

> diff --git a/block/qcow2.h b/block/qcow2.h
> index 1000239..2aee1fd 100644
> --- a/block/qcow2.h
> +++ b/block/qcow2.h
> @@ -157,6 +157,10 @@ typedef struct Qcow2DiscardRegion {
>       QTAILQ_ENTRY(Qcow2DiscardRegion) next;
>   } Qcow2DiscardRegion;
>   
> +typedef struct Qcow2Journal {
> +
> +} Qcow2Journal;
> +
>   typedef struct BDRVQcowState {
>       int cluster_bits;
>       int cluster_size;
> @@ -479,4 +483,78 @@ int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
>       void **table);
>   int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table);
>   
> +/* qcow2-journal.c functions */
> +
> +typedef struct Qcow2JournalTransaction Qcow2JournalTransaction;
> +
> +enum Qcow2JournalEntryTypeID {
> +    QJ_DESC_NOOP    = 0,
> +    QJ_DESC_WRITE   = 1,
> +    QJ_DESC_COPY    = 2,
> +
> +    /* required after a cluster is freed and used for other purposes, so that
> +     * new (unjournalled) data won't be overwritten with stale metadata */
> +    QJ_DESC_REVOKE  = 3,
> +};
> +
> +typedef struct Qcow2JournalEntryType {
> +    enum Qcow2JournalEntryTypeID id;
> +    int (*sync)(void *buf, size_t size);
> +} Qcow2JournalEntryType;
> +
> +typedef struct Qcow2JournalDesc {
> +    uint16_t    type;
> +    uint16_t    size;
> +} QEMU_PACKED Qcow2JournalDesc;
> +
> +typedef struct Qcow2JournalDescWrite {
> +    Qcow2JournalDesc common;
> +    struct {
> +        uint32_t length;
> +        uint64_t target_offset;
> +        uint32_t data_block_index;
> +    } write[];
> +} QEMU_PACKED Qcow2JournalDescData;
> +
> +typedef struct Qcow2JournalDescCopy {
> +    Qcow2JournalDesc common;
> +    struct {
> +        uint32_t length;
> +        uint64_t target_offset;
> +        uint64_t source_offset;
> +    } copy[];
> +} QEMU_PACKED Qcow2JournalDescCopy;
> +
> +typedef struct Qcow2JournalRevoke {
> +    Qcow2JournalDesc common;
> +    struct {
> +        uint32_t length;
> +        uint64_t target_offset;
> +    } revoke[];
> +} QEMU_PACKED Qcow2JournalDescRevoke;
And this in the header? All these structures are on-disk, therefore I'd 
personally move them all into the header or all into the C file.

> +
> +void qcow2_journal_register_entry_type(Qcow2JournalEntryType *type);
> +
> +/* When commit_interval seconds have passed since the last commit, or
> + * uncommitted journal data of at least commit_datasize bytes has accumulated
> + * (whatever occurs first), transactions are committed. */
> +int qcow2_journal_init(Qcow2Journal **journal, uint64_t start_offset,
> +                       int commit_interval, size_t commit_datasize);
> +int qcow2_journal_destroy(Qcow2Journal *journal);
> +
> +/* These functions create microtransactions, i.e. a set of operations that must
> + * be executed atomically. In general, qemu doesn't map this to one qcow2
> + * on-disk transaction (which would leave a lot of space unused), but handles
> + * multiple microtransaction with one on-disk transaction. */
> +Qcow2JournalTransaction *qcow2_journal_begin_transaction(Qcow2Journal *journal);
> +void qcow2_journal_add(Qcow2JournalTransaction *ta, Qcow2JournalDesc *desc);
> +void qcow2_journal_end_transaction(Qcow2JournalTransaction *ta);
> +
> +/* Commits all completed microtransactions (i.e. qcow2_journal_end_transaction
> + * has already been called) */
> +int qcow2_journal_commit(Qcow2Journal *journal);
> +
> +/* Syncs all committed transactions */
> +int qcow2_journal_sync(Qcow2Journal *journal);
> +
>   #endif
> diff --git a/docs/specs/qcow2.txt b/docs/specs/qcow2.txt
> index 33eca36..7578a4b 100644
> --- a/docs/specs/qcow2.txt
> +++ b/docs/specs/qcow2.txt
> @@ -85,6 +85,10 @@ in the description of a field.
>                                   be written to (unless for regaining
>                                   consistency).
>   
> +                    Bit 2:      Journal dirty. A replay of the main journal is
> +                                needed in order to regain consistency before
> +                                accessing the image.
> +
>                       Bits 2-63:  Reserved (set to 0)
>   
>            80 -  87:  compatible_features
> @@ -103,7 +107,11 @@ in the description of a field.
>                       write to an image with unknown auto-clear features if it
>                       clears the respective bits from this field first.
>   
> -                    Bits 0-63:  Reserved (set to 0)
> +                    Bit 0:      Journal valid bit. This bit indicates that the
> +                                image contains a valid main journal starting at
> +                                journal_offset.
I second Stefan's question here. An application which does not support 
journaling will not be able to open an image with a dirty journal 
anyway; and it simply will not make use of the journal if it's clean 
(but it also won't do any transactions which would require modifying the 
journal). Therefore, if any feature bit is introduced at all, I'd put it 
into the compatible section.

> +
> +                    Bits 1-63:  Reserved (set to 0)
>   
>            96 -  99:  refcount_order
>                       Describes the width of a reference count block entry (width
> @@ -114,6 +122,16 @@ in the description of a field.
>                       Length of the header structure in bytes. For version 2
>                       images, the length is always assumed to be 72 bytes.
>   
> +        104 - 111:  journal_offset
> +                    Offset into the image file at which the main image journal
> +                    starts. Must be aligned to a cluster boundary. 0 means that
> +                    no journal is used.
> +
> +                    This field is only valid if the journal feature bit in
> +                    autoclear_features is set. If the field is invalid or the
> +                    header is too short to contain the field, the field is
> +                    assumed to be 0 (no journal is used)
> +
>   Directly after the image header, optional sections called header extensions can
>   be stored. Each extension has a structure like the following:
>   
> @@ -355,3 +373,187 @@ Snapshot table entry:
>           variable:   Unique ID string for the snapshot (not null terminated)
>   
>           variable:   Name of the snapshot (not null terminated)
> +
> +
> +== Journal ==
> +
> +Journals are used to allow safe updates of metadata without impacting
> +performance by requiring flushes to order updates to different parts of the
> +metadata. They consist of transactions, which in turn contain operations that
> +are effectively executed atomically. A qcow2 image can have a main image
> +journal that deals with cluster management operations, and additional specific
> +journals can be used by other features like data deduplication.
> +
> +
> +As far as the on-disk format is concerned, a transaction is in one of the
> +following states:
> +
> +    Incomplete:     This is the initial state of any transaction, while new
> +                    operations can still be added. When opening an image with a
> +                    dirty journal, incomplete transactions are discarded.
> +
> +    Committed:      When all operations that must be performed atomically
> +                    during the transaction have been written and are stable on
> +                    disk, the transaction can be committed by increasing the
> +                    commited sequence number in the journal heder. A
> +                    transaction in this state may not be changed. When opening
> +                    an image with a dirty image, committed transactions should
s/dirty image/dirty journal/
> +                    be replayed.
> +
> +    Synced:         A transaction is synced if all of its operations have been
> +                    performed, all data written is stable on disk, and the
> +                    synced sequence number is increased in the journal header.
> +                    Synced transactions are no longer needed in the journal and
> +                    can be overwritten. They are ignored during replay.
> +
> +The use of a sequence number implies that transactions are processed
> +sequentially and an earlier transaction can never be unsynced/uncommitted if a
> +later one is synced/committed.
> +
> +
> +A journal is organised in journal blocks, all of which have a reference count
> +of exactly 1. It starts with a block containing the following journal header:
> +
> +    Byte  0 -  7:   Magic ("qjournal" ASCII string)
Why exactly is there a magic string? To recover the journal if the 
corresponding header field got corrupted?

> +
> +          8 - 11:   Journal size in bytes, including the header
> +
> +         12 - 15:   Journal block size order (block size in bytes = 1 << order)
> +                    The block size must be at least 512 bytes and must not
> +                    exceed the cluster size.
> +
> +         16 - 19:   Journal block index of the descriptor for the last
> +                    transaction that has been synced, starting with 1 for the
> +                    journal block after the header. 0 is used for empty
> +                    journals.
> +
> +         20 - 23:   Sequence number of the last transaction that has been
> +                    synced. 0 is recommended as the initial value.
> +
> +         24 - 27:   Sequence number of the last transaction that has been
> +                    committed. When replaying a journal, all transactions
> +                    after the last synced one up to the last commit one must be
> +                    synced. Note that this may include a wraparound of sequence
> +                    numbers.
> +
> +         28 -  31:  Checksum (one's complement of the sum of all bytes in the
> +                    header journal block except those of the checksum field)
> +
> +         32 - 511:  Reserved (set to 0)
> +
> +
> +The header is followed by journal blocks that are either descriptor or data
> +blocks. The block index at byte 16 points to the first valid descriptor, except
> +for completely empty journals, where it can be 0. The next descriptor can be
> +found by skipping a descriptor and its associated data blocks. When the journal
> +size is exceeded, a wraparound occurs, essentially forming a ring buffer.
> +
> +A wraparound may not occur in the middle of a single transaction, but only
> +between two transactions. For the necessary padding an empty descriptor with
> +any number of data blocks can be used as the last entry of the ring.
> +
> +The chain of valid descriptors ends if a descriptor is reached whose sequence
> +number isn't the successor of the previous sequence number. This means in
> +particular that the journal must be ordered chronologically and has ascending
> +sequence numbers (except in the case of a sequence number wraparound).
> +All blocks from the end of the descriptor chain until the starting point are
> +unused.
So the journal is contiguous on disk including all the descriptors? Is 
the journal size in this header fixed (and thus the journal will be 
preallocated) or how will that be achieved?

> +
> +
> +Descriptor blocks describe one transaction each and have the following
> +structure:
> +
> +    Byte  0 -  3:   Magic ("qjbk" ASCII string)
> +
> +          4 -  7:   Checksum (one's complement of the sum of all bytes in the
> +                    descriptor block except those of the checksum field, and
> +                    all bytes in the associated data blocks)
> +
> +          8 - 11:   Sequence number of the transaction
> +
> +         12 - 15:   Byte offset into the descriptor block at which descriptors
> +                    start
> +
> +         16 - 19:   Total length of descriptors in this block in bytes
> +
> +         20 - 23:   Number of following data blocks that are associated with
> +                    this transaction.
> +
> +         24 -  n:   (Future extensions)
> +
> +          n -  m:   Array of descriptors as described below. The exact values
> +                    of n and m are determined by the above fields.
> +
> +All descriptors start with a common part:
> +
> +    Byte  0 -  1:   Descriptor type
> +                        0 - No-op descriptor
> +                        1 - Write data block
> +                        2 - Copy data
> +                        3 - Revoke
> +                        4 - Deduplication hash insertion
> +                        5 - Deduplication hash deletion
> +
> +          2 -  3:   Size of the descriptor in bytes
> +
> +          4 -  n:   Type-specific data
> +
> +The following section specifies the purpose (i.e. the action that is to be
> +performed when syncing) and type-specific data layout of each descriptor type:
> +
> +  * No-op descriptor: No action is to be performed when syncing this descriptor
> +
> +          4 -  n:   Ignored
> +
> +  * Write data block: Write literal data associated with this transaction from
> +    the journal to a given offset.
> +
> +          4 -  7:   Length of the data to write in bytes
> +
> +          8 - 15:   Offset in the image file to write the data to
> +
> +         16 - 19:   Index of the journal block at which the data to write
> +                    starts. The data must be stored sequentially and be fully
> +                    contained in the data blocks associated with the
> +                    transaction.
> +
> +    The type-specific data can be repeated, specifying multiple chunks of data
> +    to be written in one operation. This means the size of the descriptor must
> +    be 4 + 16 * n.
> +
> +  * Copy data: Copy data from one offset in the image to another one. This can
> +    be used for journalling copy-on-write operations.
> +
> +          4 -  7:   Length of the data to write in bytes
> +
> +          8 - 15:   Target offset in the image file
> +
> +         16 - 23:   Source offset in the image file
> +
> +    The type-specific data can be repeated, specifying multiple chunks of data
> +    to be copied in one operation. This means the size of the descriptor must
> +    be 4 + 20 * n.
> +
> +  * Revoke: Marks operations on a given range in the imag file invalid for all
> +    earlier transactions (this does not include the transaction containing the
> +    revoke). They must not be executed on a sync operation (e.g. because the
> +    range in question has been freed and may have been reused for other, not
> +    journalled data structures that must not be overwritten with stale data).
> +    Note that this may mean that operations are to be executed partially.
> +
> +          4 -  7:   Length of the range in bytes
> +
> +          8 - 15:   Offset of the range in the image file
> +
> +    The type-specific data can be repeated, specifying multiple ranges for
> +    which operations should be revoked. This means the size of the descriptor
> +    must be 4 + 12 * n.
> +
> +  * Deduplication hash insertion: Associates a hash value with a cluster.
> +
> +    TODO
> +
> +  * Deduplication hash deletion: Marks a hash value invalid (e.g. because the
> +    hashed data has changed)
> +
> +    TODO

Max

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [RFC] qcow2 journalling draft
  2013-09-04  8:03 ` Stefan Hajnoczi
@ 2013-09-04  9:37   ` Benoît Canet
  2013-09-04  9:39   ` Kevin Wolf
  1 sibling, 0 replies; 24+ messages in thread
From: Benoît Canet @ 2013-09-04  9:37 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: Kevin Wolf, benoit.canet, famz, jcody, qemu-devel, mreitz

> > +They consist of transactions, which in turn contain operations that
> > +are effectively executed atomically. A qcow2 image can have a main image
> > +journal that deals with cluster management operations, and additional specific
> > +journals can be used by other features like data deduplication.
> 
> I'm not sure if multiple journals will work in practice.  Doesn't this
> re-introduce the need to order update steps and flush between them?

The "flush" and "data has reached stable storage" requirements of the deduplication
journal are very weak.
The deduplication code maintains an incompatible "dedup dirty" flag and flushes
the journal on exit, then clears the flag.
If the flag is set at startup, all deduplication metadata and journal content are
dropped; this does not harm the image file in any way.
The code just starts over.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [RFC] qcow2 journalling draft
  2013-09-04  8:03 ` Stefan Hajnoczi
  2013-09-04  9:37   ` Benoît Canet
@ 2013-09-04  9:39   ` Kevin Wolf
  2013-09-04  9:55     ` Benoît Canet
                       ` (2 more replies)
  1 sibling, 3 replies; 24+ messages in thread
From: Kevin Wolf @ 2013-09-04  9:39 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: benoit.canet, jcody, famz, qemu-devel, mreitz

First of all, excuse any inconsistencies in the following mail. I wrote
it from top to bottom, and there was some thought process involved in
almost every paragraph...

Am 04.09.2013 um 10:03 hat Stefan Hajnoczi geschrieben:
> On Tue, Sep 03, 2013 at 03:45:52PM +0200, Kevin Wolf wrote:
> > @@ -103,7 +107,11 @@ in the description of a field.
> >                      write to an image with unknown auto-clear features if it
> >                      clears the respective bits from this field first.
> >  
> > -                    Bits 0-63:  Reserved (set to 0)
> > +                    Bit 0:      Journal valid bit. This bit indicates that the
> > +                                image contains a valid main journal starting at
> > +                                journal_offset.
> 
> Whether the journal is used can be determined from the journal_offset
> value (header length must be large enough and journal offset must be
> valid).
> 
> Why do we need this autoclear bit?

Hm, I introduced this one first and the journal dirty incompatible bit
later, perhaps it's unnecessary now. Let's check...

The obvious thing we need to protect against is applying stale journal
data to an image that has been changed by an older version. As long as
the journal is clean, this can't happen, and the journal dirty bit will
ensure that the old version can only open the image if it is clean.

However, what if we run 'qemu-img check -r leaks' with an old qemu-img
version? It will reclaim the clusters used by the journal, and if we
continue using the journal we'll corrupt whatever new data is there
now.

Can we protect against this without using an autoclear bit?

> > +Journals are used to allow safe updates of metadata without impacting
> > +performance by requiring flushes to order updates to different parts of the
> > +metadata.
> 
> This sentence is hard to parse.  Maybe something shorter like this:
> 
> Journals allow safe metadata updates without the need for carefully
> ordering and flushing between update steps.

Okay, I'll update the text with your proposal.

> > +They consist of transactions, which in turn contain operations that
> > +are effectively executed atomically. A qcow2 image can have a main image
> > +journal that deals with cluster management operations, and additional specific
> > +journals can be used by other features like data deduplication.
> 
> I'm not sure if multiple journals will work in practice.  Doesn't this
> re-introduce the need to order update steps and flush between them?

This is a question for Benoît, who made this requirement. I asked him
the same a while ago and apparently his explanation made some sense to
me, or I would have remembered that I don't want it. ;-)

It might have something to do with the fact that deduplication uses the
journal more as a kind of cache for hash values that can be dropped and
rebuilt after a crash.

> > +A journal is organised in journal blocks, all of which have a reference count
> > +of exactly 1. It starts with a block containing the following journal header:
> > +
> > +    Byte  0 -  7:   Magic ("qjournal" ASCII string)
> > +
> > +          8 - 11:   Journal size in bytes, including the header
> > +
> > +         12 - 15:   Journal block size order (block size in bytes = 1 << order)
> > +                    The block size must be at least 512 bytes and must not
> > +                    exceed the cluster size.
> > +
> > +         16 - 19:   Journal block index of the descriptor for the last
> > +                    transaction that has been synced, starting with 1 for the
> > +                    journal block after the header. 0 is used for empty
> > +                    journals.
> > +
> > +         20 - 23:   Sequence number of the last transaction that has been
> > +                    synced. 0 is recommended as the initial value.
> > +
> > +         24 - 27:   Sequence number of the last transaction that has been
> > +                    committed. When replaying a journal, all transactions
> > +                    after the last synced one up to the last commit one must be
> > +                    synced. Note that this may include a wraparound of sequence
> > +                    numbers.
> > +
> > +         28 -  31:  Checksum (one's complement of the sum of all bytes in the
> > +                    header journal block except those of the checksum field)
> > +
> > +         32 - 511:  Reserved (set to 0)
> 
> I'm not sure if these fields are necessary.  They require updates (and
> maybe flush) after every commit and sync.
> 
> The fewer metadata updates, the better, not just for performance but
> also to reduce the risk of data loss.  If any metadata required to
> access the journal is corrupted, the image will be unavailable.
> 
> It should be possible to determine this information by scanning the
> journal transactions.

This is rather handwavy. Can you elaborate how this would work in detail?


For example, let's assume we get to read this journal (a journal can be
rather large, too, so I'm not sure if we want to read it in completely):

 - Descriptor, seq 42, 2 data blocks
 - Data block
 - Data block
 - Data block starting with "qjbk"
 - Data block
 - Descriptor, seq 7, 0 data blocks
 - Descriptor, seq 8, 1 data block
 - Data block

Which of these have already been synced? Which have been committed?


I guess we could introduce an is_committed flag in the descriptor, but
wouldn't correct operation look like this then:

1. Write out descriptor commit flag clear and any data blocks
2. Flush
3. Rewrite descriptor with commit flag set

This ensures that the commit flag is only set if all the required data
is indeed stable on disk. What has changed compared to this proposal is
just the offset at which you write in step 3 (header vs. descriptor).

For sync I suppose the same option exists.


One unrelated thing we need to take care of is this 'data block starting
with "qjbk"' thing I mentioned in the example. I can see two solutions:
The first is never creating such a journal, by always blanking the next
block after a transaction that we write, so that the descriptor chain is
terminated. The second one is never creating such a journal, by
zeroing an initial "qjbk" in data blocks and setting a flag in the
descriptor instead.

I was thinking of the first, but I guess it would be worth at least
mentioning the problem in the spec.

> > +A wraparound may not occur in the middle of a single transaction, but only
> > +between two transactions. For the necessary padding an empty descriptor with
> > +any number of data blocks can be used as the last entry of the ring.
> 
> Why have this limitation?

I thought it would make implementations easier if they didn't have to
read/write in data blocks in two parts in some cases.

> > +All descriptors start with a common part:
> > +
> > +    Byte  0 -  1:   Descriptor type
> > +                        0 - No-op descriptor
> > +                        1 - Write data block
> > +                        2 - Copy data
> > +                        3 - Revoke
> > +                        4 - Deduplication hash insertion
> > +                        5 - Deduplication hash deletion
> > +
> > +          2 -  3:   Size of the descriptor in bytes
> 
> Data blocks are not included in the descriptor size?  I just want to
> make sure that we don't be limited to 64 KB for the actual data.

Right, this only for the descriptors, without data. It implies a maximum
journal block size of 64k, which I think is okay.

> > +
> > +          4 -  n:   Type-specific data
> > +
> > +The following section specifies the purpose (i.e. the action that is to be
> > +performed when syncing) and type-specific data layout of each descriptor type:
> > +
> > +  * No-op descriptor: No action is to be performed when syncing this descriptor
> > +
> > +          4 -  n:   Ignored
> > +
> > +  * Write data block: Write literal data associated with this transaction from
> > +    the journal to a given offset.
> > +
> > +          4 -  7:   Length of the data to write in bytes
> > +
> > +          8 - 15:   Offset in the image file to write the data to
> > +
> > +         16 - 19:   Index of the journal block at which the data to write
> > +                    starts. The data must be stored sequentially and be fully
> > +                    contained in the data blocks associated with the
> > +                    transaction.
> > +
> > +    The type-specific data can be repeated, specifying multiple chunks of data
> > +    to be written in one operation. This means the size of the descriptor must
> > +    be 4 + 16 * n.
> 
> Why is the necessary?  Multiple data descriptors could be used, is it
> worth the additional logic and testing?

What does a typical journal transaction look like?

My assumption is that initially it has lots of L2 table and refcount
block updates and little else. All of these are data writes. Once we
implement Delayed COW using the journal, we get some data copy
operations as well. Assuming a stupid guest, we get two copies and two
writes for each cluster allocation.

The space required for these is 2*(4 + 16n) + 2*(4 + 20n) = 16 + 72n. In
one 4k descriptor block you can fit 56 cluster allocations.

If you used separate descriptors, you get 2*20n + 2*24n = 88n. In one 4k
descriptor block you could fit 46 cluster allocations.

Considering that in both cases the space used by descriptors is dwarfed
by the updated tables to be written, I guess it's not worth it for the
main journal. Things may look different for the deduplication journal,
where no data blocks are used and the space taken is determined only by
the descriptors.

And in fact... All of the above sounded great, but is in fact wrong.
Typically you'd get _one_ L2 update for all (sequential) allocations,
the COW entries without data should dominate in the main journal as well.

> > +
> > +  * Copy data: Copy data from one offset in the image to another one. This can
> > +    be used for journalling copy-on-write operations.
> 
> This reminds me to ask what the plan is for journal scope: metadata only
> or also data?  For some operations like dedupe it seems that full data
> journalling may be necessary.  But for an image without dedupe it would
> not be necessary to journal the rewrites to an already allocated
> cluster, for example.

Generally only metadata.

The COW case is a bit tricky. Here we really store only metadata in the
journal as well ("this COW is still pending"), but it directly affects
data and the qemu read/write paths for data must take such pending COWs
into consideration. This will require some ordering between the journal
and data (e.g. only free the COWed cluster after the journal is
committed), but I think that's okay.

> > +          4 -  7:   Length of the data to write in bytes
> > +
> > +          8 - 15:   Target offset in the image file
> > +
> > +         16 - 23:   Source offset in the image file
> 
> Source and target cannot overlap?

I don't think there's a use case for it, so let's forbid it.

> > +
> > +    The type-specific data can be repeated, specifying multiple chunks of data
> > +    to be copied in one operation. This means the size of the descriptor must
> > +    be 4 + 20 * n.
> > +
> > +  * Revoke: Marks operations on a given range in the imag file invalid for all
> 
> s/imag/image/
> 
> > +    earlier transactions (this does not include the transaction containing the
> > +    revoke). They must not be executed on a sync operation (e.g. because the
> > +    range in question has been freed and may have been reused for other, not
> > +    journalled data structures that must not be overwritten with stale data).
> > +    Note that this may mean that operations are to be executed partially.
> 
> Example scenario?

Once I had one... Well, let's try this:

1. Cluster allocation that allocates a new L2 table, i.e. L1 update gets
   journalled

2. Another cluster allocation, this time the L1 has to grow. The write
   to the new L1 position is journalled, the old table is freed.

3. Next allocation reuses the clusters where the old L1 table was stored

4. Journal sync. The L1 update from step 1 is written out to the
   clusters that are now used for data. Oops.

Kevin

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [RFC] qcow2 journalling draft
  2013-09-04  9:39   ` Kevin Wolf
@ 2013-09-04  9:55     ` Benoît Canet
  2013-09-05  9:24       ` Stefan Hajnoczi
  2013-09-05  9:21     ` Stefan Hajnoczi
  2013-09-06  9:20     ` Fam Zheng
  2 siblings, 1 reply; 24+ messages in thread
From: Benoît Canet @ 2013-09-04  9:55 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: benoit.canet, famz, jcody, qemu-devel, mreitz, Stefan Hajnoczi

> > I'm not sure if multiple journals will work in practice.  Doesn't this
> > re-introduce the need to order update steps and flush between them?
> 
> This is a question for Benoît, who made this requirement. I asked him
> the same a while ago and apparently his explanation made some sense to
> me, or I would have remembered that I don't want it. ;-)

The reason behind the multiple journal requirement is that if a block gets
created and deleted in a cyclic way it can generate cyclic insertion/deletion
journal entries.
The journal could easily be filled if this pathological corner case happens.
When it happens, the dedup code repacks the journal by writing only the non
redundant information into a new journal and then uses the new one.
It would not be easy to do so if non-dedup journal entries were present in the
journal, hence the multiple journal requirement.

The deduplication also needs two journals because when the first one is frozen it
takes some time to write the hash table to disk, and anyway new entries must be
stored somewhere at the same time. The code cannot block.

> It might have something to do with the fact that deduplication uses the
> journal more as a kind of cache for hash values that can be dropped and
> rebuilt after a crash.

For dedupe the journal is more a "resume after exit" tool.

Best regards

Benoît

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [RFC] qcow2 journalling draft
  2013-09-04  8:32 ` Max Reitz
@ 2013-09-04 10:12   ` Kevin Wolf
  0 siblings, 0 replies; 24+ messages in thread
From: Kevin Wolf @ 2013-09-04 10:12 UTC (permalink / raw)
  To: Max Reitz; +Cc: benoit.canet, jcody, famz, qemu-devel, stefanha

Am 04.09.2013 um 10:32 hat Max Reitz geschrieben:
> On 2013-09-03 15:45, Kevin Wolf wrote:
> >This contains an extension of the qcow2 spec that introduces journalling
> >to the image format, plus some preliminary type definitions and
> >function prototypes in the qcow2 code.
> >
> >Journalling functionality is a crucial feature for the design of data
> >deduplication, and it will improve the core part of qcow2 by avoiding
> >cluster leaks on crashes as well as provide an easier way to get a
> >reliable implementation of performance features like Delayed COW.
> >
> >At this point of the RFC, it would be most important to review the
> >on-disk structure. Once we're confident that it can do everything we
> >want, we can start going into more detail on the qemu side of things.
> >
> >Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> >---
> >  block/Makefile.objs   |   2 +-
> >  block/qcow2-journal.c |  55 ++++++++++++++
> >  block/qcow2.h         |  78 +++++++++++++++++++
> >  docs/specs/qcow2.txt  | 204 +++++++++++++++++++++++++++++++++++++++++++++++++-
> >  4 files changed, 337 insertions(+), 2 deletions(-)
> >  create mode 100644 block/qcow2-journal.c
> >
> >diff --git a/block/Makefile.objs b/block/Makefile.objs
> >index 3bb85b5..59be314 100644
> >--- a/block/Makefile.objs
> >+++ b/block/Makefile.objs
> >@@ -1,5 +1,5 @@
> >  block-obj-y += raw_bsd.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
> >-block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
> >+block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o qcow2-journal.o
> >  block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
> >  block-obj-y += qed-check.o
> >  block-obj-y += vhdx.o
> >diff --git a/block/qcow2-journal.c b/block/qcow2-journal.c
> >new file mode 100644
> >index 0000000..5b20239
> >--- /dev/null
> >+++ b/block/qcow2-journal.c
> >@@ -0,0 +1,55 @@
> >+/*
> >+ * qcow2 journalling functions
> >+ *
> >+ * Copyright (c) 2013 Kevin Wolf <kwolf@redhat.com>
> >+ *
> >+ * Permission is hereby granted, free of charge, to any person obtaining a copy
> >+ * of this software and associated documentation files (the "Software"), to deal
> >+ * in the Software without restriction, including without limitation the rights
> >+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> >+ * copies of the Software, and to permit persons to whom the Software is
> >+ * furnished to do so, subject to the following conditions:
> >+ *
> >+ * The above copyright notice and this permission notice shall be included in
> >+ * all copies or substantial portions of the Software.
> >+ *
> >+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> >+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> >+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> >+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> >+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> >+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> >+ * THE SOFTWARE.
> >+ */
> >+
> >+#include "qemu-common.h"
> >+#include "block/block_int.h"
> >+#include "qcow2.h"
> >+
> >+#define QCOW2_JOURNAL_MAGIC 0x716a6f75726e616cULL  /* "qjournal" */
> >+#define QCOW2_JOURNAL_BLOCK_MAGIC 0x716a626b  /* "qjbk" */
> >+
> >+typedef struct Qcow2JournalHeader {
> >+    uint64_t    magic;
> >+    uint32_t    journal_size;
> >+    uint32_t    block_size;
> >+    uint32_t    synced_index;
> >+    uint32_t    synced_seq;
> >+    uint32_t    committed_seq;
> >+    uint32_t    checksum;
> >+} QEMU_PACKED Qcow2JournalHeader;
> >+
> >+/*
> >+ * One big transaction per journal block. The transaction is committed either
> >+ * time based or when a microtransaction (single set of operations that must be
> >+ * performed atomically) doesn't fit in the same block any more.
> >+ */
> >+typedef struct Qcow2JournalBlock {
> >+    uint32_t    magic;
> >+    uint32_t    checksum;
> >+    uint32_t    seq;
> >+    uint32_t    desc_offset; /* Allow block header extensions */
> >+    uint32_t    desc_bytes;
> >+    uint32_t    nb_data_blocks;
> >+} QEMU_PACKED Qcow2JournalBlock;
> >+
> Why is this in the C file...
> 
> >diff --git a/block/qcow2.h b/block/qcow2.h
> >index 1000239..2aee1fd 100644
> >--- a/block/qcow2.h
> >+++ b/block/qcow2.h
> >@@ -157,6 +157,10 @@ typedef struct Qcow2DiscardRegion {
> >      QTAILQ_ENTRY(Qcow2DiscardRegion) next;
> >  } Qcow2DiscardRegion;
> >+typedef struct Qcow2Journal {
> >+
> >+} Qcow2Journal;
> >+
> >  typedef struct BDRVQcowState {
> >      int cluster_bits;
> >      int cluster_size;
> >@@ -479,4 +483,78 @@ int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
> >      void **table);
> >  int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table);
> >+/* qcow2-journal.c functions */
> >+
> >+typedef struct Qcow2JournalTransaction Qcow2JournalTransaction;
> >+
> >+enum Qcow2JournalEntryTypeID {
> >+    QJ_DESC_NOOP    = 0,
> >+    QJ_DESC_WRITE   = 1,
> >+    QJ_DESC_COPY    = 2,
> >+
> >+    /* required after a cluster is freed and used for other purposes, so that
> >+     * new (unjournalled) data won't be overwritten with stale metadata */
> >+    QJ_DESC_REVOKE  = 3,
> >+};
> >+
> >+typedef struct Qcow2JournalEntryType {
> >+    enum Qcow2JournalEntryTypeID id;
> >+    int (*sync)(void *buf, size_t size);
> >+} Qcow2JournalEntryType;
> >+
> >+typedef struct Qcow2JournalDesc {
> >+    uint16_t    type;
> >+    uint16_t    size;
> >+} QEMU_PACKED Qcow2JournalDesc;
> >+
> >+typedef struct Qcow2JournalDescWrite {
> >+    Qcow2JournalDesc common;
> >+    struct {
> >+        uint32_t length;
> >+        uint64_t target_offset;
> >+        uint32_t data_block_index;
> >+    } write[];
> >+} QEMU_PACKED Qcow2JournalDescData;
> >+
> >+typedef struct Qcow2JournalDescCopy {
> >+    Qcow2JournalDesc common;
> >+    struct {
> >+        uint32_t length;
> >+        uint64_t target_offset;
> >+        uint64_t source_offset;
> >+    } copy[];
> >+} QEMU_PACKED Qcow2JournalDescCopy;
> >+
> >+typedef struct Qcow2JournalRevoke {
> >+    Qcow2JournalDesc common;
> >+    struct {
> >+        uint32_t length;
> >+        uint64_t target_offset;
> >+    } revoke[];
> >+} QEMU_PACKED Qcow2JournalDescRevoke;
> And this in the header? All these structures are on-disk, therefore
> I'd personally move them all into the header or all into the C file.

I'll freely admit that the intentions of the code are yet
underdocumented...

The reason is that I was trying to follow the principle that everything
that can be private is in fact private. I can't see any code outside
qcow2-journal.c using the journal (block) header. However, code in other
places will have to add journal entries to a transaction, that's why
their definition is public.

If it turns out that some descriptor types are used only by one user
(likely to happen with deduplication), then their definition can be
moved to the respective source file.

> >+
> >+void qcow2_journal_register_entry_type(Qcow2JournalEntryType *type);
> >+
> >+/* When commit_interval seconds have passed since the last commit, or
> >+ * uncommitted journal data of at least commit_datasize bytes has accumulated
> >+ * (whatever occurs first), transactions are committed. */
> >+int qcow2_journal_init(Qcow2Journal **journal, uint64_t start_offset,
> >+                       int commit_interval, size_t commit_datasize);
> >+int qcow2_journal_destroy(Qcow2Journal *journal);
> >+
> >+/* These functions create microtransactions, i.e. a set of operations that must
> >+ * be executed atomically. In general, qemu doesn't map this to one qcow2
> >+ * on-disk transaction (which would leave a lot of space unused), but handles
> >+ * multiple microtransaction with one on-disk transaction. */
> >+Qcow2JournalTransaction *qcow2_journal_begin_transaction(Qcow2Journal *journal);
> >+void qcow2_journal_add(Qcow2JournalTransaction *ta, Qcow2JournalDesc *desc);
> >+void qcow2_journal_end_transaction(Qcow2JournalTransaction *ta);
> >+
> >+/* Commits all completed microtransactions (i.e. qcow2_journal_end_transaction
> >+ * has already been called) */
> >+int qcow2_journal_commit(Qcow2Journal *journal);
> >+
> >+/* Syncs all committed transactions */
> >+int qcow2_journal_sync(Qcow2Journal *journal);
> >+
> >  #endif
> >diff --git a/docs/specs/qcow2.txt b/docs/specs/qcow2.txt
> >index 33eca36..7578a4b 100644
> >--- a/docs/specs/qcow2.txt
> >+++ b/docs/specs/qcow2.txt
> >@@ -85,6 +85,10 @@ in the description of a field.
> >                                  be written to (unless for regaining
> >                                  consistency).
> >+                    Bit 2:      Journal dirty. A replay of the main journal is
> >+                                needed in order to regain consistency before
> >+                                accessing the image.
> >+
> >                      Bits 2-63:  Reserved (set to 0)
> >           80 -  87:  compatible_features
> >@@ -103,7 +107,11 @@ in the description of a field.
> >                      write to an image with unknown auto-clear features if it
> >                      clears the respective bits from this field first.
> >-                    Bits 0-63:  Reserved (set to 0)
> >+                    Bit 0:      Journal valid bit. This bit indicates that the
> >+                                image contains a valid main journal starting at
> >+                                journal_offset.
> I second Stefan's question here. An application which does not
> support journaling will not be able to open an image with a dirty
> journal anyway; and it simply will not make use of the journal if
> it's clean (but it also won't do any transactions which would
> require modifying the journal). Therefore, if any feature bit is
> introduced at all, I'd put it into the compatible section.

See reply to Stefan's mail (qemu-img check -r on old version breaks it)

> >+
> >+                    Bits 1-63:  Reserved (set to 0)
> >           96 -  99:  refcount_order
> >                      Describes the width of a reference count block entry (width
> >@@ -114,6 +122,16 @@ in the description of a field.
> >                      Length of the header structure in bytes. For version 2
> >                      images, the length is always assumed to be 72 bytes.
> >+        104 - 111:  journal_offset
> >+                    Offset into the image file at which the main image journal
> >+                    starts. Must be aligned to a cluster boundary. 0 means that
> >+                    no journal is used.
> >+
> >+                    This field is only valid if the journal feature bit in
> >+                    autoclear_features is set. If the field is invalid or the
> >+                    header is too short to contain the field, the field is
> >+                    assumed to be 0 (no journal is used)
> >+
> >  Directly after the image header, optional sections called header extensions can
> >  be stored. Each extension has a structure like the following:
> >@@ -355,3 +373,187 @@ Snapshot table entry:
> >          variable:   Unique ID string for the snapshot (not null terminated)
> >          variable:   Name of the snapshot (not null terminated)
> >+
> >+
> >+== Journal ==
> >+
> >+Journals are used to allow safe updates of metadata without impacting
> >+performance by requiring flushes to order updates to different parts of the
> >+metadata. They consist of transactions, which in turn contain operations that
> >+are effectively executed atomically. A qcow2 image can have a main image
> >+journal that deals with cluster management operations, and additional specific
> >+journals can be used by other features like data deduplication.
> >+
> >+
> >+As far as the on-disk format is concerned, a transaction is in one of the
> >+following states:
> >+
> >+    Incomplete:     This is the initial state of any transaction, while new
> >+                    operations can still be added. When opening an image with a
> >+                    dirty journal, incomplete transactions are discarded.
> >+
> >+    Committed:      When all operations that must be performed atomically
> >+                    during the transaction have been written and are stable on
> >+                    disk, the transaction can be committed by increasing the
> >+                    commited sequence number in the journal heder. A
> >+                    transaction in this state may not be changed. When opening
> >+                    an image with a dirty image, committed transactions should
> s/dirty image/dirty journal/
> >+                    be replayed.
> >+
> >+    Synced:         A transaction is synced if all of its operations have been
> >+                    performed, all data written is stable on disk, and the
> >+                    synced sequence number is increased in the journal header.
> >+                    Synced transactions are no longer needed in the journal and
> >+                    can be overwritten. They are ignored during replay.
> >+
> >+The use of a sequence number implies that transactions are processed
> >+sequentially and an earlier transaction can never be unsynced/uncommitted if a
> >+later one is synced/committed.
> >+
> >+
> >+A journal is organised in journal blocks, all of which have a reference count
> >+of exactly 1. It starts with a block containing the following journal header:
> >+
> >+    Byte  0 -  7:   Magic ("qjournal" ASCII string)
> Why exactly is there a magic string? To recover the journal if the
> corresponding header field got corrupted?

Possibly, or just generally debugging purposes. It's so much nicer if I
can tell from a hexdump what a specific block is doing. It doesn't cost
us anything to include it (because we need to use a full journal block
anyway), so I just did it.

> >+
> >+          8 - 11:   Journal size in bytes, including the header
> >+
> >+         12 - 15:   Journal block size order (block size in bytes = 1 << order)
> >+                    The block size must be at least 512 bytes and must not
> >+                    exceed the cluster size.
> >+
> >+         16 - 19:   Journal block index of the descriptor for the last
> >+                    transaction that has been synced, starting with 1 for the
> >+                    journal block after the header. 0 is used for empty
> >+                    journals.
> >+
> >+         20 - 23:   Sequence number of the last transaction that has been
> >+                    synced. 0 is recommended as the initial value.
> >+
> >+         24 - 27:   Sequence number of the last transaction that has been
> >+                    committed. When replaying a journal, all transactions
> >+                    after the last synced one up to the last commit one must be
> >+                    synced. Note that this may include a wraparound of sequence
> >+                    numbers.
> >+
> >+         28 -  31:  Checksum (one's complement of the sum of all bytes in the
> >+                    header journal block except those of the checksum field)
> >+
> >+         32 - 511:  Reserved (set to 0)
> >+
> >+
> >+The header is followed by journal blocks that are either descriptor or data
> >+blocks. The block index at byte 16 points to the first valid descriptor, except
> >+for completely empty journals, where it can be 0. The next descriptor can be
> >+found by skipping a descriptor and its associated data blocks. When the journal
> >+size is exceeded, a wraparound occurs, essentially forming a ring buffer.
> >+
> >+A wraparound may not occur in the middle of a single transaction, but only
> >+between two transactions. For the necessary padding an empty descriptor with
> >+any number of data blocks can be used as the last entry of the ring.
> >+
> >+The chain of valid descriptors ends if a descriptor is reached whose sequence
> >+number isn't the successor of the previous sequence number. This means in
> >+particular that the journal must be ordered chronologically and has ascending
> >+sequence numbers (except in the case of a sequence number wraparound).
> >+All blocks from the end of the descriptor chain until the starting point are
> >+unused.
> So the journal is contiguous on disk including all the descriptors?

Aye. Ought to be mentioned somewhere, I guess.

> Is the journal size in this header fixed (and thus the journal will
> be preallocated) or how will that be achieved?

Right, I was expecting a static, preallocated journal size. If you ever
wanted to resize the journal, the easiest way to achieve it is probably
to sync the journal, disable it and free its clusters, create a new
journal with a different size.

Kevin

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [RFC] qcow2 journalling draft
  2013-09-04  9:39   ` Kevin Wolf
  2013-09-04  9:55     ` Benoît Canet
@ 2013-09-05  9:21     ` Stefan Hajnoczi
  2013-09-05 11:18       ` Kevin Wolf
  2013-09-06  9:20     ` Fam Zheng
  2 siblings, 1 reply; 24+ messages in thread
From: Stefan Hajnoczi @ 2013-09-05  9:21 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: benoit.canet, jcody, famz, qemu-devel, mreitz

On Wed, Sep 04, 2013 at 11:39:51AM +0200, Kevin Wolf wrote:
> First of all, excuse any inconsistencies in the following mail. I wrote
> it from top to bottom, and there was some thought process involved in
> almost every paragraph...

I should add this disclaimer to all my emails ;-).

> Am 04.09.2013 um 10:03 hat Stefan Hajnoczi geschrieben:
> > On Tue, Sep 03, 2013 at 03:45:52PM +0200, Kevin Wolf wrote:
> > > @@ -103,7 +107,11 @@ in the description of a field.
> > >                      write to an image with unknown auto-clear features if it
> > >                      clears the respective bits from this field first.
> > >  
> > > -                    Bits 0-63:  Reserved (set to 0)
> > > +                    Bit 0:      Journal valid bit. This bit indicates that the
> > > +                                image contains a valid main journal starting at
> > > +                                journal_offset.
> > 
> > Whether the journal is used can be determined from the journal_offset
> > value (header length must be large enough and journal offset must be
> > valid).
> > 
> > Why do we need this autoclear bit?
> 
> Hm, I introduced this one first and the journal dirty incompatible bit
> later, perhaps it's unnecessary now. Let's check...
> 
> The obvious thing we need to protect against is applying stale journal
> data to an image that has been changed by an older version. As long as
> the journal is clean, this can't happen, and the journal dirty bit will
> ensure that the old version can only open the image if it is clean.
> 
> However, what if we run 'qemu-img check -r leaks' with an old qemu-img
> version? It will reclaim the clusters used by the journal, and if we
> continue using the journal we'll corrupt whatever new data is there
> now.
> 
> Can we protect against this without using an autoclear bit?

You are right.  It's a weird case I didn't think of but it could happen.
An autoclear bit sounds like the simplest solution.

Please document this scenario.

> > > +A journal is organised in journal blocks, all of which have a reference count
> > > +of exactly 1. It starts with a block containing the following journal header:
> > > +
> > > +    Byte  0 -  7:   Magic ("qjournal" ASCII string)
> > > +
> > > +          8 - 11:   Journal size in bytes, including the header
> > > +
> > > +         12 - 15:   Journal block size order (block size in bytes = 1 << order)
> > > +                    The block size must be at least 512 bytes and must not
> > > +                    exceed the cluster size.
> > > +
> > > +         16 - 19:   Journal block index of the descriptor for the last
> > > +                    transaction that has been synced, starting with 1 for the
> > > +                    journal block after the header. 0 is used for empty
> > > +                    journals.
> > > +
> > > +         20 - 23:   Sequence number of the last transaction that has been
> > > +                    synced. 0 is recommended as the initial value.
> > > +
> > > +         24 - 27:   Sequence number of the last transaction that has been
> > > +                    committed. When replaying a journal, all transactions
> > > +                    after the last synced one up to the last commit one must be
> > > +                    synced. Note that this may include a wraparound of sequence
> > > +                    numbers.
> > > +
> > > +         28 -  31:  Checksum (one's complement of the sum of all bytes in the
> > > +                    header journal block except those of the checksum field)
> > > +
> > > +         32 - 511:  Reserved (set to 0)
> > 
> > I'm not sure if these fields are necessary.  They require updates (and
> > maybe flush) after every commit and sync.
> > 
> > The fewer metadata updates, the better, not just for performance but
> > also to reduce the risk of data loss.  If any metadata required to
> > access the journal is corrupted, the image will be unavailable.
> > 
> > It should be possible to determine this information by scanning the
> > journal transactions.
> 
> This is rather handwavy. Can you elaborate how this would work in detail?
> 
> 
> For example, let's assume we get to read this journal (a journal can be
> rather large, too, so I'm not sure if we want to read it in completely):
> 
>  - Descriptor, seq 42, 2 data blocks
>  - Data block
>  - Data block
>  - Data block starting with "qjbk"
>  - Data block
>  - Descriptor, seq 7, 0 data blocks
>  - Descriptor, seq 8, 1 data block
>  - Data block
> 
> Which of these have already been synced? Which have been committed?
> 
> 
> I guess we could introduce an is_commited flag in the descriptor, but
> wouldn't correct operation look like this then:
> 
> 1. Write out descriptor commit flag clear and any data blocks
> 2. Flush
> 3. Rewrite descriptor with commit flag set
> 
> This ensures that the commit flag is only set if all the required data
> is indeed stable on disk. What has changed compared to this proposal is
> just the offset at which you write in step 3 (header vs. descriptor).

A commit flag cannot be relied upon.  A transaction can be corrupted
after being committed, or it can be corrupted due to power failure while
writing the transaction.  In both cases we have an invalid transaction
and we must discard it.

The checksum tells us whether the transaction is valid (i.e. committed).
I see no need for a commit flag since the checksum already tells us if
the transaction is valid or not.

(If the checksum is weak then corruptions can sneak through, so we need
to choose a reasonable algorithm.)

> For sync I suppose the same option exists.

The sync process is:

1. Apply descriptor operations to the image file
2. Flush - ensure transactions have been applied to disk
3. Mark transactions synced

Transactions can be marked synced by writing a new block to the journal.
Or we could even piggyback the next transaction by including a
last_sync_seq in the header.

> One unrelated thing we need to take care of is this 'data block starting
> with "qjbk"' thing I mentioned in the example. I can see two solutions:
> The first is never creating such a journal, by always blanking the next
> block after a transaction that we write, so that the descriptor chain is
> terminated. The second one is never creating such a journal, by
> zeroing an initial "qjbk" in data blocks and setting a flag in the
> descriptor instead.
> 
> I was thinking of the first, but I guess it would be worth at least
> mentioning the problem in the spec.

Agreed.

> > > +A wraparound may not occur in the middle of a single transaction, but only
> > > +between two transactions. For the necessary padding an empty descriptor with
> > > +any number of data blocks can be used as the last entry of the ring.
> > 
> > Why have this limitation?
> 
> I thought it would make implementations easier if they didn't have to
> read/write in data blocks in two parts in some cases.

Readers must detect wrapped/truncated transactions anyway since the
journal may be corrupted.  So I don't think we save anything by adding
the padding special case.

> > > +
> > > +  * Copy data: Copy data from one offset in the image to another one. This can
> > > +    be used for journalling copy-on-write operations.
> > 
> > This reminds me to ask what the plan is for journal scope: metadata only
> > or also data?  For some operations like dedupe it seems that full data
> > journalling may be necessary.  But for an image without dedupe it would
> > not be necessary to journal the rewrites to an already allocated
> > cluster, for example.
> 
> Generally only metadata.

Okay, that's what I was hoping since writing all data through the
journal would mean massive churn and definitely affect performance
(everything that goes via the journal will be written twice).

> > > +          4 -  7:   Length of the data to write in bytes
> > > +
> > > +          8 - 15:   Target offset in the image file
> > > +
> > > +         16 - 23:   Source offset in the image file
> > 
> > Source and target cannot overlap?
> 
> I don't think there's a use case for it, so let's forbid it.

Agreed.

> > > +    earlier transactions (this does not include the transaction containing the
> > > +    revoke). They must not be executed on a sync operation (e.g. because the
> > > +    range in question has been freed and may have been reused for other, not
> > > +    journalled data structures that must not be overwritten with stale data).
> > > +    Note that this may mean that operations are to be executed partially.
> > 
> > Example scenario?
> 
> Once I had one... Well, let's try this:
> 
> 1. Cluster allocation that allocates a new L2 table, i.e. L1 update gets
>    journalled
> 
> 2. Another cluster allocation, this time the L1 has to grow. The write
>    to the new L1 position is journalled, the old table is freed.

Another solution is to sync all transactions before moving metadata
around or updating the qcow2 header (e.g. L1 table offset).

Since 'revoke' itself requires writing to the journal, it may not be
much more expensive to simply sync transactions.

I'm not sure which approach is best but I wanted to mention it if it
helps us keep the journal design and implementation simpler.

> 3. Next allocation reuses the clusters where the old L1 table was stored
> 
> 4. Journal sync. The L1 update from step 1 is written out to the
>    clusters that are now used for data. Oops.

Stefan

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [RFC] qcow2 journalling draft
  2013-09-04  9:55     ` Benoît Canet
@ 2013-09-05  9:24       ` Stefan Hajnoczi
  2013-09-05 15:26         ` Benoît Canet
  0 siblings, 1 reply; 24+ messages in thread
From: Stefan Hajnoczi @ 2013-09-05  9:24 UTC (permalink / raw)
  To: Benoît Canet; +Cc: Kevin Wolf, jcody, famz, qemu-devel, mreitz

On Wed, Sep 04, 2013 at 11:55:23AM +0200, Benoît Canet wrote:
> > > I'm not sure if multiple journals will work in practice.  Doesn't this
> > > re-introduce the need to order update steps and flush between them?
> > 
> > This is a question for Benoît, who made this requirement. I asked him
> > the same a while ago and apparently his explanation made some sense to
> > me, or I would have remembered that I don't want it. ;-)
> 
> The reason behind the multiple journal requirement is that if a block get
> created and deleted in a cyclic way it can generate cyclic insertions/deletions
> journal entries.
> The journal could easilly be filled if this pathological corner case happen.
> When it happen the dedup code repack the journal by writting only the non
> redundant information into a new journal and then use the new one.
> It would not be easy to do so if non dedup journal entries are present in the
> journal hence the multiple journal requirement.
> 
> The deduplication also need two journals because when the first one is frozen it
> take some time to write the hash table to disk and anyway new entries must be
> stored somewhere at the same time. The code cannot block.
> 
> > It might have something to do with the fact that deduplication uses the
> > journal more as a kind of cache for hash values that can be dropped and
> > rebuilt after a crash.
> 
> For dedupe the journal is more a "resume after exit" tool.

I'm not sure anymore if dedupe needs the same kind of "journal" as a
metadata journal for qcow2.

Since you have a dirty flag to discard the "journal" on crash, the
journal is not used for data integrity.

That makes me wonder if the metadata journal is the right structure for
dedupe?  Maybe your original proposal was fine for dedupe and we just
misinterpreted it because we thought this needs to be a safe journal.

Stefan

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [RFC] qcow2 journalling draft
  2013-09-03 13:45 [Qemu-devel] [RFC] qcow2 journalling draft Kevin Wolf
                   ` (2 preceding siblings ...)
  2013-09-04  8:32 ` Max Reitz
@ 2013-09-05  9:35 ` Stefan Hajnoczi
  2013-09-05 11:50   ` Kevin Wolf
  2013-09-06  9:59 ` Fam Zheng
  4 siblings, 1 reply; 24+ messages in thread
From: Stefan Hajnoczi @ 2013-09-05  9:35 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: benoit.canet, jcody, famz, qemu-devel, mreitz

On Tue, Sep 03, 2013 at 03:45:52PM +0200, Kevin Wolf wrote:
> This contains an extension of the qcow2 spec that introduces journalling
> to the image format, plus some preliminary type definitions and
> function prototypes in the qcow2 code.
> 
> Journalling functionality is a crucial feature for the design of data
> deduplication, and it will improve the core part of qcow2 by avoiding
> cluster leaks on crashes as well as provide an easier way to get a
> reliable implementation of performance features like Delayed COW.
> 
> At this point of the RFC, it would be most important to review the
> on-disk structure. Once we're confident that it can do everything we
> want, we can start going into more detail on the qemu side of things.
> 
> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> ---
>  block/Makefile.objs   |   2 +-
>  block/qcow2-journal.c |  55 ++++++++++++++
>  block/qcow2.h         |  78 +++++++++++++++++++
>  docs/specs/qcow2.txt  | 204 +++++++++++++++++++++++++++++++++++++++++++++++++-
>  4 files changed, 337 insertions(+), 2 deletions(-)
>  create mode 100644 block/qcow2-journal.c

Although we are still discussing details of the on-disk layout, the
general design is clear enough to discuss how the journal will be used.

Today qcow2 uses Qcow2Cache to do lazy, ordered metadata updates.  The
performance is pretty good with two exceptions that I can think of:

1. The delayed CoW problem that Kevin has been working on.  Guests
   perform sequential writes that are smaller than a qcow2 cluster.  The
   first write triggers a copy-on-write of the full cluster.  Later
   writes then overwrite the copied data.  It would be more efficient to
   anticipate sequential writes and hold off on CoW where possible.

2. Lazy metadata updates lead to bursty behavior and expensive flushes.
   We do not take advantage of disk bandwidth since metadata updates
   stay in the Qcow2Cache until the last possible second.  When the
   guest issues a flush we must write out dirty Qcow2Cache entries and
   possibly fsync between them if dependencies have been set (e.g.
   refcount before L2).

How will the journal change this situation?  Writes that go through the
journal are doubled - they must first be journalled, fsync, and then
they can be applied to the actual image.

How do we benefit by using the journal?

Stefan

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [RFC] qcow2 journalling draft
  2013-09-05  9:21     ` Stefan Hajnoczi
@ 2013-09-05 11:18       ` Kevin Wolf
  2013-09-05 14:55         ` Stefan Hajnoczi
  0 siblings, 1 reply; 24+ messages in thread
From: Kevin Wolf @ 2013-09-05 11:18 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: benoit.canet, jcody, famz, qemu-devel, mreitz

Am 05.09.2013 um 11:21 hat Stefan Hajnoczi geschrieben:
> On Wed, Sep 04, 2013 at 11:39:51AM +0200, Kevin Wolf wrote:
> > However, what if we run 'qemu-img check -r leaks' with an old qemu-img
> > version? It will reclaim the clusters used by the journal, and if we
> > continue using the journal we'll corrupt whatever new data is there
> > now.
> > 
> > Can we protect against this without using an autoclear bit?
> 
> You are right.  It's a weird case I didn't think of but it could happen.
> An autoclear bit sounds like the simplest solution.
> 
> Please document this scenario.

Okay, I've updated the description as follows:

    Bit 0:      Journal valid bit. This bit indicates that the
                image contains a valid main journal starting at
                journal_offset; it is used to mark journals
                invalid if the image was opened by older
                implementations that may have "reclaimed" the
                journal clusters that would appear as leaked
                clusters to them.

> > > > +A journal is organised in journal blocks, all of which have a reference count
> > > > +of exactly 1. It starts with a block containing the following journal header:
> > > > +
> > > > +    Byte  0 -  7:   Magic ("qjournal" ASCII string)
> > > > +
> > > > +          8 - 11:   Journal size in bytes, including the header
> > > > +
> > > > +         12 - 15:   Journal block size order (block size in bytes = 1 << order)
> > > > +                    The block size must be at least 512 bytes and must not
> > > > +                    exceed the cluster size.
> > > > +
> > > > +         16 - 19:   Journal block index of the descriptor for the last
> > > > +                    transaction that has been synced, starting with 1 for the
> > > > +                    journal block after the header. 0 is used for empty
> > > > +                    journals.
> > > > +
> > > > +         20 - 23:   Sequence number of the last transaction that has been
> > > > +                    synced. 0 is recommended as the initial value.
> > > > +
> > > > +         24 - 27:   Sequence number of the last transaction that has been
> > > > +                    committed. When replaying a journal, all transactions
> > > > +                    after the last synced one up to the last commit one must be
> > > > +                    synced. Note that this may include a wraparound of sequence
> > > > +                    numbers.
> > > > +
> > > > +         28 -  31:  Checksum (one's complement of the sum of all bytes in the
> > > > +                    header journal block except those of the checksum field)
> > > > +
> > > > +         32 - 511:  Reserved (set to 0)
> > > 
> > > I'm not sure if these fields are necessary.  They require updates (and
> > > maybe flush) after every commit and sync.
> > > 
> > > The fewer metadata updates, the better, not just for performance but
> > > also to reduce the risk of data loss.  If any metadata required to
> > > access the journal is corrupted, the image will be unavailable.
> > > 
> > > It should be possible to determine this information by scanning the
> > > journal transactions.
> > 
> > This is rather handwavy. Can you elaborate how this would work in detail?
> > 
> > 
> > For example, let's assume we get to read this journal (a journal can be
> > rather large, too, so I'm not sure if we want to read it in completely):
> > 
> >  - Descriptor, seq 42, 2 data blocks
> >  - Data block
> >  - Data block
> >  - Data block starting with "qjbk"
> >  - Data block
> >  - Descriptor, seq 7, 0 data blocks
> >  - Descriptor, seq 8, 1 data block
> >  - Data block
> > 
> > Which of these have already been synced? Which have been committed?

So what's your algorithm for this?

> > I guess we could introduce an is_commited flag in the descriptor, but
> > wouldn't correct operation look like this then:
> > 
> > 1. Write out descriptor commit flag clear and any data blocks
> > 2. Flush
> > 3. Rewrite descriptor with commit flag set
> > 
> > This ensures that the commit flag is only set if all the required data
> > is indeed stable on disk. What has changed compared to this proposal is
> > just the offset at which you write in step 3 (header vs. descriptor).
> 
> A commit flag cannot be relied upon.  A transaction can be corrupted
> after being committed, or it can be corrupted due to power failure while
> writing the transaction.  In both cases we have an invalid transaction
> and we must discard it.

No, I believe it is vitally important to distinguish these two cases.

If a transaction was corrupted due to power failure while writing the
transaction, then we can simply discard it indeed.

If, however, a transaction was committed and gets corrupted after the
fact, then we have a problem because the data on the disk is laid out as
described by on-disk metadata (e.g. L2 tables) _with the journal fully
applied_. The replay and consequently bdrv_open() must fail in this case.

The first case is handled by any information that tells us whether the
transaction is already committed; the second should never happen, but
would be caught by a checksum.

> The checksum tells us whether the transaction is valid (i.e. committed).
> I see no need for a commit flag since the checksum already tells us if
> the transaction is valid or not.
> 
> (If the checksum is weak then corruptions can sneak through, so we need
> to choose a reasonable algorithm.)

So you're essentially replacing a flush by a checksum? This is an
interesting thought, but feels quite dangerous. It also can only work if
the only goal is to ensure that the transaction is fully written, but
not for ordering. Generally after committing the journal, we want to
rely on the journalled data for other operations. I don't think we can
save that flush.

> > For sync I suppose the same option exists.
> 
> The sync process is:
> 
> 1. Apply descriptor operations to the image file
> 2. Flush - ensure transactions have been applied to disk
> 3. Mark transactions synced
> 
> Transactions can be marked synced by writing a new block to the journal.
> Or we could even piggyback the next transaction by including a
> last_sync_seq in the header.

Would this mean that there is always at least one unsynced transaction?

> > > > +A wraparound may not occur in the middle of a single transaction, but only
> > > > +between two transactions. For the necessary padding an empty descriptor with
> > > > +any number of data blocks can be used as the last entry of the ring.
> > > 
> > > Why have this limitation?
> > 
> > I thought it would make implementations easier if they didn't have to
> > read/write in data blocks in two parts in some cases.
> 
> Readers must detect wrapped/truncated transactions anyway since the
> journal may be corrupted.  So I don't think we save anything by adding
> the padding special case.

Fair enough. I'll change it.

> > > > +    earlier transactions (this does not include the transaction containing the
> > > > +    revoke). They must not be executed on a sync operation (e.g. because the
> > > > +    range in question has been freed and may have been reused for other, not
> > > > +    journalled data structures that must not be overwritten with stale data).
> > > > +    Note that this may mean that operations are to be executed partially.
> > > 
> > > Example scenario?
> > 
> > Once I had one... Well, let's try this:
> > 
> > 1. Cluster allocation that allocates a new L2 table, i.e. L1 update gets
> >    journalled
> > 
> > 2. Another cluster allocation, this time the L1 has to grow. The write
> >    to the new L1 position is journalled, the old table is freed.
> 
> Another solution is to sync all transactions before moving metadata
> around or updating the qcow2 header (e.g. L1 table offset).
> 
> Since 'revoke' itself requires writing to the journal, it may not be
> much more expensive to simply sync transactions.
> 
> I'm not sure which approach is best but I wanted to mention it if it
> helps use keep the journal design and implementation simpler.
> 
> > 3. Next allocation reuses the clusters where the old L1 table was stored
> > 
> > 4. Journal sync. The L1 update from step 1 is written out to the
> >    clusters that are now used for data. Oops.

I think revoke would be an essential part of Delayed COW anyway (not
having to actually perform the journalled COW is the whole point of
Delayed COW), so using it for simpler cases as well shouldn't hurt.

Actually I believe there's potential for simpler code with revoke:
update_refcounts() could directly notify the journalling code about any
clusters that reach a refcount of 0, so changes to them can be revoked.

Kevin

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [RFC] qcow2 journalling draft
  2013-09-05  9:35 ` Stefan Hajnoczi
@ 2013-09-05 11:50   ` Kevin Wolf
  2013-09-05 12:08     ` Benoît Canet
  0 siblings, 1 reply; 24+ messages in thread
From: Kevin Wolf @ 2013-09-05 11:50 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: benoit.canet, jcody, famz, qemu-devel, mreitz

Am 05.09.2013 um 11:35 hat Stefan Hajnoczi geschrieben:
> Although we are still discussing details of the on-disk layout, the
> general design is clear enough to discuss how the journal will be used.
> 
> Today qcow2 uses Qcow2Cache to do lazy, ordered metadata updates.  The
> performance is pretty good with two exceptions that I can think of:
> 
> 1. The delayed CoW problem that Kevin has been working on.  Guests
>    perform sequential writes that are smaller than a qcow2 cluster.  The
>    first write triggers a copy-on-write of the full cluster.  Later
>    writes then overwrite the copied data.  It would be more efficient to
>    anticipate sequential writes and hold off on CoW where possible.

To be clear, "more efficient" can mean a plus of 50% and more. COW
overhead is the only major overhead compared to raw when looking at
normal cluster allocations. So this is something that is really
important for cluster allocation performance.

The patches that I posted a while ago showed that it's possible to do
this without a journal, however the flush operation became very complex
(which we all found rather scary) and required that the COW be completed
before signalling flush completion.

With a journal, the only thing that you need to do on a flush is to
commit all transactions, i.e. write them out and bdrv_flush(bs->file).
The actual data copy of the COW (i.e. the sync) can be further delayed
and doesn't have to happen at commit time as it would have without a
journal.

> 2. Lazy metadata updates lead to bursty behavior and expensive flushes.
>    We do not take advantage of disk bandwidth since metadata updates
>    stay in the Qcow2Cache until the last possible second.  When the
>    guest issues a flush we must write out dirty Qcow2Cache entries and
>    possibly fsync between them if dependencies have been set (e.g.
>    refcount before L2).

Hm, have we ever measured the impact of this?

I don't think a journal can make a fundamental difference here - either
you write only at the last possible second (today flush, with a journal
commit), or you write out more data than strictly necessary.

> How will the journal change this situation?  Writes that go through the
> journal are doubled - they must first be journalled, fsync, and then
> they can be applied to the actual image.
> 
> How do we benefit by using the journal?

I believe Delayed COW is a pretty strong one. But there are more cases
in which performance isn't that great.

I think you refer to the simple case with a normal empty image where new
clusters are allocated, which is pretty good indeed if we ignore COW.
Trouble starts when you also free clusters, which happens for example
with internal COW (internal snapshots, compressed images) or discard.
Deduplication as well in the future, I suppose.

Then you get very quickly alternating sequences of "L2 depends on
refcount update" (for allocation) and "refcount update depends on L2
update" (for freeing), which means that Qcow2Cache starts flushing all
the time without accumulating many requests. These are cases that would
benefit as well from the atomicity of journal transactions.

And then, of course, we still leak clusters on failed operations. With a
journal, this wouldn't happen any more and the image would always stay
consistent (instead of only corruption-free).

Kevin

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [RFC] qcow2 journalling draft
  2013-09-05 11:50   ` Kevin Wolf
@ 2013-09-05 12:08     ` Benoît Canet
  0 siblings, 0 replies; 24+ messages in thread
From: Benoît Canet @ 2013-09-05 12:08 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: benoit.canet, famz, jcody, qemu-devel, mreitz, Stefan Hajnoczi

> Then you get very quickly alternating sequences of "L2 depends on
> refcount update" (for allocation) and "refcount update depends on L2
> update" (for freeing), which means that Qcow2Cache starts flushing all
> the time without accumulating many requests. These are cases that would
> benefit as well from the atomicity of journal transactions.

True, deduplication can hit this case on delete if I remember correctly
and it slows down everything.

Best regards

Benoît

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [RFC] qcow2 journalling draft
  2013-09-05 11:18       ` Kevin Wolf
@ 2013-09-05 14:55         ` Stefan Hajnoczi
  2013-09-05 15:20           ` Kevin Wolf
  0 siblings, 1 reply; 24+ messages in thread
From: Stefan Hajnoczi @ 2013-09-05 14:55 UTC (permalink / raw)
  To: Kevin Wolf
  Cc: Benoît Canet, Fam Zheng, Jeff Cody, qemu-devel, Max Reitz,
	Stefan Hajnoczi

On Thu, Sep 5, 2013 at 1:18 PM, Kevin Wolf <kwolf@redhat.com> wrote:
> Am 05.09.2013 um 11:21 hat Stefan Hajnoczi geschrieben:
>> On Wed, Sep 04, 2013 at 11:39:51AM +0200, Kevin Wolf wrote:
>> > However, what if we run 'qemu-img check -r leaks' with an old qemu-img
>> > version? It will reclaim the clusters used by the journal, and if we
>> > continue using the journal we'll corrupt whatever new data is there
>> > now.
>> >
>> > Can we protect against this without using an autoclear bit?
>>
>> You are right.  It's a weird case I didn't think of but it could happen.
>> An autoclear bit sounds like the simplest solution.
>>
>> Please document this scenario.
>
> Okay, I've updated the description as follows:
>
>     Bit 0:      Journal valid bit. This bit indicates that the
>                 image contains a valid main journal starting at
>                 journal_offset; it is used to mark journals
>                 invalid if the image was opened by older
>                 implementations that may have "reclaimed" the
>                 journal clusters that would appear as leaked
>                 clusters to them.

Great, thanks.

>> > > > +A journal is organised in journal blocks, all of which have a reference count
>> > > > +of exactly 1. It starts with a block containing the following journal header:
>> > > > +
>> > > > +    Byte  0 -  7:   Magic ("qjournal" ASCII string)
>> > > > +
>> > > > +          8 - 11:   Journal size in bytes, including the header
>> > > > +
>> > > > +         12 - 15:   Journal block size order (block size in bytes = 1 << order)
>> > > > +                    The block size must be at least 512 bytes and must not
>> > > > +                    exceed the cluster size.
>> > > > +
>> > > > +         16 - 19:   Journal block index of the descriptor for the last
>> > > > +                    transaction that has been synced, starting with 1 for the
>> > > > +                    journal block after the header. 0 is used for empty
>> > > > +                    journals.
>> > > > +
>> > > > +         20 - 23:   Sequence number of the last transaction that has been
>> > > > +                    synced. 0 is recommended as the initial value.
>> > > > +
>> > > > +         24 - 27:   Sequence number of the last transaction that has been
>> > > > +                    committed. When replaying a journal, all transactions
>> > > > +                    after the last synced one up to the last commit one must be
>> > > > +                    synced. Note that this may include a wraparound of sequence
>> > > > +                    numbers.
>> > > > +
>> > > > +         28 -  31:  Checksum (one's complement of the sum of all bytes in the
>> > > > +                    header journal block except those of the checksum field)
>> > > > +
>> > > > +         32 - 511:  Reserved (set to 0)
>> > >
>> > > I'm not sure if these fields are necessary.  They require updates (and
>> > > maybe flush) after every commit and sync.
>> > >
>> > > The fewer metadata updates, the better, not just for performance but
>> > > also to reduce the risk of data loss.  If any metadata required to
>> > > access the journal is corrupted, the image will be unavailable.
>> > >
>> > > It should be possible to determine this information by scanning the
>> > > journal transactions.
>> >
>> > This is rather handwavy. Can you elaborate how this would work in detail?
>> >
>> >
>> > For example, let's assume we get to read this journal (a journal can be
>> > rather large, too, so I'm not sure if we want to read it in completely):
>> >
>> >  - Descriptor, seq 42, 2 data blocks
>> >  - Data block
>> >  - Data block
>> >  - Data block starting with "qjbk"
>> >  - Data block
>> >  - Descriptor, seq 7, 0 data blocks
>> >  - Descriptor, seq 8, 1 data block
>> >  - Data block
>> >
>> > Which of these have already been synced? Which have been committed?
>
> So what's your algorithm for this?

Scan the journal to find unsynced transactions, if they exist:

last_sync_seq = 0
last_seqno = 0
while True:
    block = journal[(i++) % journal_nblocks]
    if i >= journal_nblocks * 2:
        break # avoid infinite loop
    if block.magic != 'qjbk':
        continue
    if block.seqno < last_seqno:
        # Wrapped around to oldest transaction
        break
    elif block.seqno == seqno:
        # Corrupt journal, sequence number should be
        # monotonically increasing
        raise InvalidJournalException
    if block.last_sync_seq != last_sync_seq:
        last_sync_seq = block.last_sync_seq
    last_seqno = block.seqno

print 'First unsynced block seq no:', last_sync_seq
print 'Last block seq no:', last_seqno

This is broken pseudocode, but hopefully the idea makes sense.

>> > I guess we could introduce an is_commited flag in the descriptor, but
>> > wouldn't correct operation look like this then:
>> >
>> > 1. Write out descriptor commit flag clear and any data blocks
>> > 2. Flush
>> > 3. Rewrite descriptor with commit flag set
>> >
>> > This ensures that the commit flag is only set if all the required data
>> > is indeed stable on disk. What has changed compared to this proposal is
>> > just the offset at which you write in step 3 (header vs. descriptor).
>>
>> A commit flag cannot be relied upon.  A transaction can be corrupted
>> after being committed, or it can be corrupted due to power failure while
>> writing the transaction.  In both cases we have an invalid transaction
>> and we must discard it.
>
> No, I believe it is vitally important to distinguish these two cases.
>
> If a transaction was corrupted due to power failure while writing the
> transaction, then we can simply discard it indeed.

I agree that we need to detect whether a journal is corrupted partway
through or just the last unsynced transactions.

If there is a valid transaction after an invalid transaction we know
we're in trouble because the journal was corrupted partway through.

A committed bit isn't necessary for doing that.

> If, however, a transaction was committed and gets corrupted after the
> fact, then we have a problem because the data on the disk is laid out as
> described by on-disk metadata (e.g. L2 tables) _with the journal fully
> applied_. The replay and consequently bdrv_open() must fail in this case.
>
> The first case is handled by any information that tells us whether the
> transaction is already committed; the second should never happen, but
> would be caught by a checksum.
>
>> The checksum tells us whether the transaction is valid (i.e. committed).
>> I see no need for a commit flag since the checksum already tells us if
>> the transaction is valid or not.
>>
>> (If the checksum is weak then corruptions can sneak through, so we need
>> to choose a reasonable algorithm.)
>
> So you're essentially replacing a flush by a checksum? This is an
> interesting thought, but feels quite dangerous. It also can only work if
> the only goal is to ensure that the transaction is fully written, but
> not for ordering. Generally after committing the journal, we want to
> rely on the journalled data for other operations. I don't think we can
> save that flush.

We definitely need to flush if we're making a promise that the
transaction is safely in the journal.

My goal here is for the journal to be append-only instead of using
random accesses to modify the header.

Imagine we're adding a transaction to the journal and just updating
the journal header block when power fails.  This could corrupt the
journal header block.

If we have an append-only solution then the final transaction is
invalid but the rest of the journal is still accessible.

The journal is no good if the entire thing can be inaccessible due to
power failure.

>> > For sync I suppose the same option exists.
>>
>> The sync process is:
>>
>> 1. Apply descriptor operations to the image file
>> 2. Flush - ensure transactions have been applied to disk
>> 3. Mark transactions synced
>>
>> Transactions can be marked synced by writing a new block to the journal.
>> Or we could even piggyback the next transaction by including a
>> last_sync_seq in the header.
>
> Would this mean that there is always at least one unsynced transaction?

Yes.  If you want to sync all transactions you need to make a nop
transaction.  Kind of like a ring buffer with a dummy element so head
and tail don't overlap.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [RFC] qcow2 journalling draft
  2013-09-05 14:55         ` Stefan Hajnoczi
@ 2013-09-05 15:20           ` Kevin Wolf
  2013-09-05 15:56             ` Eric Blake
  0 siblings, 1 reply; 24+ messages in thread
From: Kevin Wolf @ 2013-09-05 15:20 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: Benoît Canet, Fam Zheng, Jeff Cody, qemu-devel, Max Reitz,
	Stefan Hajnoczi

Am 05.09.2013 um 16:55 hat Stefan Hajnoczi geschrieben:
> On Thu, Sep 5, 2013 at 1:18 PM, Kevin Wolf <kwolf@redhat.com> wrote:
> > Am 05.09.2013 um 11:21 hat Stefan Hajnoczi geschrieben:
> >> On Wed, Sep 04, 2013 at 11:39:51AM +0200, Kevin Wolf wrote:
> >> > > > +A journal is organised in journal blocks, all of which have a reference count
> >> > > > +of exactly 1. It starts with a block containing the following journal header:
> >> > > > +
> >> > > > +    Byte  0 -  7:   Magic ("qjournal" ASCII string)
> >> > > > +
> >> > > > +          8 - 11:   Journal size in bytes, including the header
> >> > > > +
> >> > > > +         12 - 15:   Journal block size order (block size in bytes = 1 << order)
> >> > > > +                    The block size must be at least 512 bytes and must not
> >> > > > +                    exceed the cluster size.
> >> > > > +
> >> > > > +         16 - 19:   Journal block index of the descriptor for the last
> >> > > > +                    transaction that has been synced, starting with 1 for the
> >> > > > +                    journal block after the header. 0 is used for empty
> >> > > > +                    journals.
> >> > > > +
> >> > > > +         20 - 23:   Sequence number of the last transaction that has been
> >> > > > +                    synced. 0 is recommended as the initial value.
> >> > > > +
> >> > > > +         24 - 27:   Sequence number of the last transaction that has been
> >> > > > +                    committed. When replaying a journal, all transactions
> >> > > > +                    after the last synced one up to the last commit one must be
> >> > > > +                    synced. Note that this may include a wraparound of sequence
> >> > > > +                    numbers.
> >> > > > +
> >> > > > +         28 -  31:  Checksum (one's complement of the sum of all bytes in the
> >> > > > +                    header journal block except those of the checksum field)
> >> > > > +
> >> > > > +         32 - 511:  Reserved (set to 0)
> >> > >
> >> > > I'm not sure if these fields are necessary.  They require updates (and
> >> > > maybe flush) after every commit and sync.
> >> > >
> >> > > The fewer metadata updates, the better, not just for performance but
> >> > > also to reduce the risk of data loss.  If any metadata required to
> >> > > access the journal is corrupted, the image will be unavailable.
> >> > >
> >> > > It should be possible to determine this information by scanning the
> >> > > journal transactions.
> >> >
> >> > This is rather handwavy. Can you elaborate how this would work in detail?
> >> >
> >> >
> >> > For example, let's assume we get to read this journal (a journal can be
> >> > rather large, too, so I'm not sure if we want to read it in completely):
> >> >
> >> >  - Descriptor, seq 42, 2 data blocks
> >> >  - Data block
> >> >  - Data block
> >> >  - Data block starting with "qjbk"
> >> >  - Data block
> >> >  - Descriptor, seq 7, 0 data blocks
> >> >  - Descriptor, seq 8, 1 data block
> >> >  - Data block
> >> >
> >> > Which of these have already been synced? Which have been committed?
> >
> > So what's your algorithm for this?
> 
> Scan the journal to find unsynced transactions, if they exist:
> 
> last_sync_seq = 0
> last_seqno = 0
> while True:
>     block = journal[(i++) % journal_nblocks]
>     if i >= journal_nblocks * 2:
>         break # avoid infinite loop
>     if block.magic != 'qjbk':
>         continue

Important implication: This doesn't allow data blocks starting with
'qjbk'. Otherwise you're not even guaranteed to find a descriptor block
to start your search with.

The second time you make this assumption is when there are stale data
blocks in the unused area between the head and tail of the journal.

>     if block.seqno < last_seqno:
>         # Wrapped around to oldest transaction
>         break

Why can you stop here? There might be transactions in the second half of
the journal that aren't synced yet.

>     elif block.seqno == seqno:
>         # Corrupt journal, sequence number should be
>         # monotonically increasing
>         raise InvalidJournalException
>     if block.last_sync_seq != last_sync_seq:
>         last_sync_seq = block.last_sync_seq

The 'if' doesn't add anything here, so you end up using the
last_sync_seq field of the last valid descriptor.

>     last_seqno = block.seqno
> 
> print 'First unsynced block seq no:', last_sync_seq
> print 'Last block seq no:', last_seqno
> 
> This is broken pseudocode, but hopefully the idea makes sense.

One additional thought that might make the thing a bit more interesting:
Sequence numbers can wrap around as well.

Kevin

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [RFC] qcow2 journalling draft
  2013-09-05  9:24       ` Stefan Hajnoczi
@ 2013-09-05 15:26         ` Benoît Canet
  2013-09-06  7:27           ` Kevin Wolf
  0 siblings, 1 reply; 24+ messages in thread
From: Benoît Canet @ 2013-09-05 15:26 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: Benoît Canet, Kevin Wolf, famz, jcody, qemu-devel, mreitz

Le Thursday 05 Sep 2013 à 11:24:40 (+0200), Stefan Hajnoczi a écrit :
> On Wed, Sep 04, 2013 at 11:55:23AM +0200, Benoît Canet wrote:
> > > > I'm not sure if multiple journals will work in practice.  Doesn't this
> > > > re-introduce the need to order update steps and flush between them?
> > > 
> > > This is a question for Benoît, who made this requirement. I asked him
> > > the same a while ago and apparently his explanation made some sense to
> > > me, or I would have remembered that I don't want it. ;-)
> > 
> > The reason behind the multiple journal requirement is that if a block gets
> > created and deleted in a cyclic way it can generate cyclic insertion/deletion
> > journal entries.
> > The journal could easily be filled if this pathological corner case happens.
> > When it happens the dedup code repacks the journal by writing only the
> > non-redundant information into a new journal and then uses the new one.
> > It would not be easy to do so if non dedup journal entries are present in the
> > journal hence the multiple journal requirement.
> > 
> > The deduplication also needs two journals because when the first one is frozen it
> > takes some time to write the hash table to disk and anyway new entries must be
> > stored somewhere at the same time. The code cannot block.
> > 
> > > It might have something to do with the fact that deduplication uses the
> > > journal more as a kind of cache for hash values that can be dropped and
> > > rebuilt after a crash.
> > 
> > For dedupe the journal is more a "resume after exit" tool.
> 
> I'm not sure anymore if dedupe needs the same kind of "journal" as a
> metadata journal for qcow2.
> 
> Since you have a dirty flag to discard the "journal" on crash, the
> journal is not used for data integrity.
> 
> That makes me wonder if the metadata journal is the right structure for
> dedupe?  Maybe your original proposal was fine for dedupe and we just
> misinterpreted it because we thought this needs to be a safe journal.

Kevin, what do you think of this?
I could strip down the dedupe journal code to specialize it.

Best regards

Benoît

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [RFC] qcow2 journalling draft
  2013-09-05 15:20           ` Kevin Wolf
@ 2013-09-05 15:56             ` Eric Blake
  0 siblings, 0 replies; 24+ messages in thread
From: Eric Blake @ 2013-09-05 15:56 UTC (permalink / raw)
  To: Kevin Wolf
  Cc: Benoît Canet, Fam Zheng, Stefan Hajnoczi, Jeff Cody,
	qemu-devel, Max Reitz, Stefan Hajnoczi

[-- Attachment #1: Type: text/plain, Size: 598 bytes --]

On 09/05/2013 09:20 AM, Kevin Wolf wrote:
> 
> One additional thought that might make the thing a bit more interesting:
> Sequence numbers can wrap around as well.

On the other hand, if sequence numbers are 64-bit, the number of
operations required to cause a wrap far exceeds the expected lifetime of
any of us on this list, and we can safely assume it to be a non-issue.
(There's other places in qemu where we intentionally have an abort() if
a 64-bit number would wrap...)

-- 
Eric Blake   eblake redhat com    +1-919-301-3266
Libvirt virtualization library http://libvirt.org


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 621 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [RFC] qcow2 journalling draft
  2013-09-05 15:26         ` Benoît Canet
@ 2013-09-06  7:27           ` Kevin Wolf
  2013-09-15 18:23             ` Benoît Canet
  0 siblings, 1 reply; 24+ messages in thread
From: Kevin Wolf @ 2013-09-06  7:27 UTC (permalink / raw)
  To: Benoît Canet; +Cc: jcody, famz, qemu-devel, Stefan Hajnoczi, mreitz

Am 05.09.2013 um 17:26 hat Benoît Canet geschrieben:
> Le Thursday 05 Sep 2013 à 11:24:40 (+0200), Stefan Hajnoczi a écrit :
> > On Wed, Sep 04, 2013 at 11:55:23AM +0200, Benoît Canet wrote:
> > > > > I'm not sure if multiple journals will work in practice.  Doesn't this
> > > > > re-introduce the need to order update steps and flush between them?
> > > > 
> > > > This is a question for Benoît, who made this requirement. I asked him
> > > > the same a while ago and apparently his explanation made some sense to
> > > > me, or I would have remembered that I don't want it. ;-)
> > > 
> > > The reason behind the multiple journal requirement is that if a block gets
> > > created and deleted in a cyclic way it can generate cyclic insertion/deletion
> > > journal entries.
> > > The journal could easily be filled if this pathological corner case happens.
> > > When it happens the dedup code repacks the journal by writing only the
> > > non-redundant information into a new journal and then uses the new one.
> > > It would not be easy to do so if non dedup journal entries are present in the
> > > journal hence the multiple journal requirement.
> > > 
> > > The deduplication also needs two journals because when the first one is frozen it
> > > takes some time to write the hash table to disk and anyway new entries must be
> > > stored somewhere at the same time. The code cannot block.
> > > 
> > > > It might have something to do with the fact that deduplication uses the
> > > > journal more as a kind of cache for hash values that can be dropped and
> > > > rebuilt after a crash.
> > > 
> > > For dedupe the journal is more a "resume after exit" tool.
> > 
> > I'm not sure anymore if dedupe needs the same kind of "journal" as a
> > metadata journal for qcow2.
> > 
> > Since you have a dirty flag to discard the "journal" on crash, the
> > journal is not used for data integrity.
> > 
> > That makes me wonder if the metadata journal is the right structure for
> > dedupe?  Maybe your original proposal was fine for dedupe and we just
> > misinterpreted it because we thought this needs to be a safe journal.
> 
> Kevin what do you think of this ?
> I could strip down the dedupe journal code to specialize it.

If you think it turns out easier than using the journalling
infrastructure that we're going to implement anyway, then why not.

Kevin

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [RFC] qcow2 journalling draft
  2013-09-04  9:39   ` Kevin Wolf
  2013-09-04  9:55     ` Benoît Canet
  2013-09-05  9:21     ` Stefan Hajnoczi
@ 2013-09-06  9:20     ` Fam Zheng
  2013-09-06  9:57       ` Kevin Wolf
  2 siblings, 1 reply; 24+ messages in thread
From: Fam Zheng @ 2013-09-06  9:20 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: benoit.canet, jcody, qemu-devel, Stefan Hajnoczi, mreitz

On Wed, 09/04 11:39, Kevin Wolf wrote:
> First of all, excuse any inconsistencies in the following mail. I wrote
> it from top to bottom, and there was some thought process involved in
> almost every paragraph...
> 
> Am 04.09.2013 um 10:03 hat Stefan Hajnoczi geschrieben:
> > On Tue, Sep 03, 2013 at 03:45:52PM +0200, Kevin Wolf wrote:
> > > @@ -103,7 +107,11 @@ in the description of a field.
> > >                      write to an image with unknown auto-clear features if it
> > >                      clears the respective bits from this field first.
> > >  
> > > -                    Bits 0-63:  Reserved (set to 0)
> > > +                    Bit 0:      Journal valid bit. This bit indicates that the
> > > +                                image contains a valid main journal starting at
> > > +                                journal_offset.
> > 
> > Whether the journal is used can be determined from the journal_offset
> > value (header length must be large enough and journal offset must be
> > valid).
> > 
> > Why do we need this autoclear bit?
> 
> Hm, I introduced this one first and the journal dirty incompatible bit
> later, perhaps it's unnecessary now. Let's check...
> 
> The obvious thing we need to protect against is applying stale journal
> data to an image that has been changed by an older version. As long as
> the journal is clean, this can't happen, and the journal dirty bit will
> ensure that the old version can only open the image if it is clean.
> 
> However, what if we run 'qemu-img check -r leaks' with an old qemu-img
> version? It will reclaim the clusters used by the journal, and if we
> continue using the journal we'll corrupt whatever new data is there
> now.
> 
Why can an old qemu-img version open the image with a dirty journal in the
first place? It's an incompatible bit.

> Can we protect against this without using an autoclear bit?
> 
> > > +Journals are used to allow safe updates of metadata without impacting
> > > +performance by requiring flushes to order updates to different parts of the
> > > +metadata.
> > 
> > This sentence is hard to parse.  Maybe something shorter like this:
> > 
> > Journals allow safe metadata updates without the need for carefully
> > ordering and flushing between update steps.
> 
> Okay, I'll update the text with your proposal.
> 
> > > +They consist of transactions, which in turn contain operations that
> > > +are effectively executed atomically. A qcow2 image can have a main image
> > > +journal that deals with cluster management operations, and additional specific
> > > +journals can be used by other features like data deduplication.
> > 
> > I'm not sure if multiple journals will work in practice.  Doesn't this
> > re-introduce the need to order update steps and flush between them?
> 
> This is a question for Benoît, who made this requirement. I asked him
> the same a while ago and apparently his explanation made some sense to
> me, or I would have remembered that I don't want it. ;-)
> 
> It might have something to do with the fact that deduplication uses the
> journal more as a kind of cache for hash values that can be dropped and
> rebuilt after a crash.
> 
> > > +A journal is organised in journal blocks, all of which have a reference count
> > > +of exactly 1. It starts with a block containing the following journal header:
> > > +
> > > +    Byte  0 -  7:   Magic ("qjournal" ASCII string)
> > > +
> > > +          8 - 11:   Journal size in bytes, including the header
> > > +
> > > +         12 - 15:   Journal block size order (block size in bytes = 1 << order)
> > > +                    The block size must be at least 512 bytes and must not
> > > +                    exceed the cluster size.
> > > +
> > > +         16 - 19:   Journal block index of the descriptor for the last
> > > +                    transaction that has been synced, starting with 1 for the
> > > +                    journal block after the header. 0 is used for empty
> > > +                    journals.
> > > +
> > > +         20 - 23:   Sequence number of the last transaction that has been
> > > +                    synced. 0 is recommended as the initial value.
> > > +
> > > +         24 - 27:   Sequence number of the last transaction that has been
> > > +                    committed. When replaying a journal, all transactions
> > > +                    after the last synced one up to the last commit one must be
> > > +                    synced. Note that this may include a wraparound of sequence
> > > +                    numbers.
> > > +
> > > +         28 -  31:  Checksum (one's complement of the sum of all bytes in the
> > > +                    header journal block except those of the checksum field)
> > > +
> > > +         32 - 511:  Reserved (set to 0)
> > 
> > I'm not sure if these fields are necessary.  They require updates (and
> > maybe flush) after every commit and sync.
> > 
> > The fewer metadata updates, the better, not just for performance but
> > also to reduce the risk of data loss.  If any metadata required to
> > access the journal is corrupted, the image will be unavailable.
> > 
> > It should be possible to determine this information by scanning the
> > journal transactions.
> 
> This is rather handwavy. Can you elaborate how this would work in detail?
> 
> 
> For example, let's assume we get to read this journal (a journal can be
> rather large, too, so I'm not sure if we want to read it in completely):
> 
>  - Descriptor, seq 42, 2 data blocks
>  - Data block
>  - Data block
>  - Data block starting with "qjbk"
>  - Data block
>  - Descriptor, seq 7, 0 data blocks
>  - Descriptor, seq 8, 1 data block
>  - Data block
> 
> Which of these have already been synced? Which have been committed?
> 
> 
> I guess we could introduce an is_commited flag in the descriptor, but
> wouldn't correct operation look like this then:
> 
> 1. Write out descriptor commit flag clear and any data blocks
> 2. Flush
> 3. Rewrite descriptor with commit flag set
> 
> This ensures that the commit flag is only set if all the required data
> is indeed stable on disk. What has changed compared to this proposal is
> just the offset at which you write in step 3 (header vs. descriptor).
> 
> For sync I suppose the same option exists.
> 
> 
> One unrelated thing we need to take care of is this 'data block starting
> with "qjbk"' thing I mentioned in the example. I can see two solutions:
> The first is never creating such a journal, by always blanking the next
> block after a transaction that we write, so that the descriptor chain is
> terminated. The second one is never creating such a journal, by
> zeroing an initial "qjbk" in data blocks and setting a flag in the
> descriptor instead.
> 
> I was thinking of the first, but I guess it would be worth at least
> mentioning the problem in the spec.
> 
> > > +A wraparound may not occur in the middle of a single transaction, but only
> > > +between two transactions. For the necessary padding an empty descriptor with
> > > +any number of data blocks can be used as the last entry of the ring.
> > 
> > Why have this limitation?
> 
> I thought it would make implementations easier if they didn't have to
> read/write in data blocks in two parts in some cases.
> 
> > > +All descriptors start with a common part:
> > > +
> > > +    Byte  0 -  1:   Descriptor type
> > > +                        0 - No-op descriptor
> > > +                        1 - Write data block
> > > +                        2 - Copy data
> > > +                        3 - Revoke
> > > +                        4 - Deduplication hash insertion
> > > +                        5 - Deduplication hash deletion
> > > +
> > > +          2 -  3:   Size of the descriptor in bytes
> > 
> > Data blocks are not included in the descriptor size?  I just want to
> > make sure that we don't be limited to 64 KB for the actual data.
> 
> Right, this only for the descriptors, without data. It implies a maximum
> journal block size of 64k, which I think is okay.
> 
> > > +
> > > +          4 -  n:   Type-specific data
> > > +
> > > +The following section specifies the purpose (i.e. the action that is to be
> > > +performed when syncing) and type-specific data layout of each descriptor type:
> > > +
> > > +  * No-op descriptor: No action is to be performed when syncing this descriptor
> > > +
> > > +          4 -  n:   Ignored
> > > +
> > > +  * Write data block: Write literal data associated with this transaction from
> > > +    the journal to a given offset.
> > > +
> > > +          4 -  7:   Length of the data to write in bytes
> > > +
> > > +          8 - 15:   Offset in the image file to write the data to
> > > +
> > > +         16 - 19:   Index of the journal block at which the data to write
> > > +                    starts. The data must be stored sequentially and be fully
> > > +                    contained in the data blocks associated with the
> > > +                    transaction.
> > > +
> > > +    The type-specific data can be repeated, specifying multiple chunks of data
> > > +    to be written in one operation. This means the size of the descriptor must
> > > +    be 4 + 16 * n.
> > 
> > Why is the necessary?  Multiple data descriptors could be used, is it
> > worth the additional logic and testing?
> 
> What does a typical journal transaction look like?
> 
> My assumption is that initially it has lots of L2 table and refcount
> block updates and little else. All of these are data writes. Once we
> implement Delayed COW using the journal, we get some data copy
> operations as well. Assuming a stupid guest, we get two copies and two
> writes for each cluster allocation.
> 
> The space required for these is 2*(4 + 16n) + 2*(4 + 20n) = 16 + 72n. In
> one 4k descriptor block you can fit 56 cluster allocations.
> 
> If you used separate descriptors, you get 2*20n + 2*24n = 88n. In one 4k
> descriptor block you could fit 46 cluster allocations.
> 
> Considering that in both cases the space used by descriptors is dwarfed
> by the updated tables to be written, I guess it's not worth it for the
> main journal. Things may look different for the deduplication journal,
> where no data blocks are used and the space taken is determined only by
> the descriptors.
> 
> And in fact... All of the above sounded great, but is in fact wrong.
> Typically you'd get _one_ L2 update for all (sequential) allocations,
> the COW entries without data should dominate in the main journal as well.
> 
> > > +
> > > +  * Copy data: Copy data from one offset in the image to another one. This can
> > > +    be used for journalling copy-on-write operations.
> > 
> > This reminds me to ask what the plan is for journal scope: metadata only
> > or also data?  For some operations like dedupe it seems that full data
> > journalling may be necessary.  But for an image without dedupe it would
> > not be necessary to journal the rewrites to an already allocated
> > cluster, for example.
> 
> Generally only metadata.
> 
> The COW case is a bit tricky. Here we really store only metadata in the
> journal as well ("this COW is still pending"), but it directly affects
> data and the qemu read/write paths for data must take such pending COWs
> into consideration. This will require some ordering between the journal
> and data (e.g. only free the COWed cluster after the journal is
> committed), but I think that's okay.
> 
> > > +          4 -  7:   Length of the data to write in bytes
> > > +
> > > +          8 - 15:   Target offset in the image file
> > > +
> > > +         16 - 23:   Source offset in the image file
> > 
> > Source and target cannot overlap?
> 
> I don't think there's a use case for it, so let's forbid it.
> 
> > > +
> > > +    The type-specific data can be repeated, specifying multiple chunks of data
> > > +    to be copied in one operation. This means the size of the descriptor must
> > > +    be 4 + 20 * n.
> > > +
> > > +  * Revoke: Marks operations on a given range in the imag file invalid for all
> > 
> > s/imag/image/
> > 
> > > +    earlier transactions (this does not include the transaction containing the
> > > +    revoke). They must not be executed on a sync operation (e.g. because the
> > > +    range in question has been freed and may have been reused for other, not
> > > +    journalled data structures that must not be overwritten with stale data).
> > > +    Note that this may mean that operations are to be executed partially.
> > 
> > Example scenario?
> 
> Once I had one... Well, let's try this:
> 
> 1. Cluster allocation that allocates a new L2 table, i.e. L1 update gets
>    journalled
> 
> 2. Another cluster allocation, this time the L1 has to grow. The write
>    to the new L1 position is journalled, the old table is freed.
> 
> 3. Next allocation reuses the clusters where the old L1 table was stored
> 
> 4. Journal sync. The L1 update from step 1 is written out to the
>    clusters that are now used for data. Oops.
> 
> Kevin

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [RFC] qcow2 journalling draft
  2013-09-06  9:20     ` Fam Zheng
@ 2013-09-06  9:57       ` Kevin Wolf
  2013-09-06 10:02         ` Fam Zheng
  0 siblings, 1 reply; 24+ messages in thread
From: Kevin Wolf @ 2013-09-06  9:57 UTC (permalink / raw)
  To: Fam Zheng; +Cc: benoit.canet, jcody, qemu-devel, Stefan Hajnoczi, mreitz

Am 06.09.2013 um 11:20 hat Fam Zheng geschrieben:
> On Wed, 09/04 11:39, Kevin Wolf wrote:
> > First of all, excuse any inconsistencies in the following mail. I wrote
> > it from top to bottom, and there was some thought process involved in
> > almost every paragraph...
> > 
> > Am 04.09.2013 um 10:03 hat Stefan Hajnoczi geschrieben:
> > > On Tue, Sep 03, 2013 at 03:45:52PM +0200, Kevin Wolf wrote:
> > > > @@ -103,7 +107,11 @@ in the description of a field.
> > > >                      write to an image with unknown auto-clear features if it
> > > >                      clears the respective bits from this field first.
> > > >  
> > > > -                    Bits 0-63:  Reserved (set to 0)
> > > > +                    Bit 0:      Journal valid bit. This bit indicates that the
> > > > +                                image contains a valid main journal starting at
> > > > +                                journal_offset.
> > > 
> > > Whether the journal is used can be determined from the journal_offset
> > > value (header length must be large enough and journal offset must be
> > > valid).
> > > 
> > > Why do we need this autoclear bit?
> > 
> > Hm, I introduced this one first and the journal dirty incompatible bit
> > later, perhaps it's unnecessary now. Let's check...
> > 
> > The obvious thing we need to protect against is applying stale journal
> > data to an image that has been changed by an older version. As long as
> > the journal is clean, this can't happen, and the journal dirty bit will
> > ensure that the old version can only open the image if it is clean.
> > 
> > However, what if we run 'qemu-img check -r leaks' with an old qemu-img
> > version? It will reclaim the clusters used by the journal, and if we
> > continue using the journal we'll corrupt whatever new data is there
> > now.
> > 
> Why can old version qemu-img open the image with dirty journal in the first
> place? It's incompatible bit.

This is about a clean journal.

Kevin

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [RFC] qcow2 journalling draft
  2013-09-03 13:45 [Qemu-devel] [RFC] qcow2 journalling draft Kevin Wolf
                   ` (3 preceding siblings ...)
  2013-09-05  9:35 ` Stefan Hajnoczi
@ 2013-09-06  9:59 ` Fam Zheng
  4 siblings, 0 replies; 24+ messages in thread
From: Fam Zheng @ 2013-09-06  9:59 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: benoit.canet, jcody, qemu-devel, stefanha, mreitz

On Tue, 09/03 15:45, Kevin Wolf wrote:
> This contains an extension of the qcow2 spec that introduces journalling
> to the image format, plus some preliminary type definitions and
> function prototypes in the qcow2 code.
> 
> Journalling functionality is a crucial feature for the design of data
> deduplication, and it will improve the core part of qcow2 by avoiding
> cluster leaks on crashes as well as provide an easier way to get a
> reliable implementation of performance features like Delayed COW.
> 
> At this point of the RFC, it would be most important to review the
> on-disk structure. Once we're confident that it can do everything we
> want, we can start going into more detail on the qemu side of things.
> 
> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> ---
>  block/Makefile.objs   |   2 +-
>  block/qcow2-journal.c |  55 ++++++++++++++
>  block/qcow2.h         |  78 +++++++++++++++++++
>  docs/specs/qcow2.txt  | 204 +++++++++++++++++++++++++++++++++++++++++++++++++-
>  4 files changed, 337 insertions(+), 2 deletions(-)
>  create mode 100644 block/qcow2-journal.c
> 
> diff --git a/block/Makefile.objs b/block/Makefile.objs
> index 3bb85b5..59be314 100644
> --- a/block/Makefile.objs
> +++ b/block/Makefile.objs
> @@ -1,5 +1,5 @@
>  block-obj-y += raw_bsd.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
> -block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
> +block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o qcow2-journal.o
>  block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>  block-obj-y += qed-check.o
>  block-obj-y += vhdx.o
> diff --git a/block/qcow2-journal.c b/block/qcow2-journal.c
> new file mode 100644
> index 0000000..5b20239
> --- /dev/null
> +++ b/block/qcow2-journal.c
> @@ -0,0 +1,55 @@
> +/*
> + * qcow2 journalling functions
> + *
> + * Copyright (c) 2013 Kevin Wolf <kwolf@redhat.com>
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a copy
> + * of this software and associated documentation files (the "Software"), to deal
> + * in the Software without restriction, including without limitation the rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> + * THE SOFTWARE.
> + */
> +
> +#include "qemu-common.h"
> +#include "block/block_int.h"
> +#include "qcow2.h"
> +
> +#define QCOW2_JOURNAL_MAGIC 0x716a6f75726e616cULL  /* "qjournal" */
> +#define QCOW2_JOURNAL_BLOCK_MAGIC 0x716a626b  /* "qjbk" */
> +
> +typedef struct Qcow2JournalHeader {
> +    uint64_t    magic;
> +    uint32_t    journal_size;
> +    uint32_t    block_size;
> +    uint32_t    synced_index;
> +    uint32_t    synced_seq;
> +    uint32_t    committed_seq;
> +    uint32_t    checksum;
> +} QEMU_PACKED Qcow2JournalHeader;
> +
> +/*
> + * One big transaction per journal block. The transaction is committed either
> + * time based or when a microtransaction (single set of operations that must be
> + * performed atomically) doesn't fit in the same block any more.
> + */
> +typedef struct Qcow2JournalBlock {
> +    uint32_t    magic;
> +    uint32_t    checksum;
> +    uint32_t    seq;
> +    uint32_t    desc_offset; /* Allow block header extensions */
> +    uint32_t    desc_bytes;
> +    uint32_t    nb_data_blocks;
> +} QEMU_PACKED Qcow2JournalBlock;
> +
> diff --git a/block/qcow2.h b/block/qcow2.h
> index 1000239..2aee1fd 100644
> --- a/block/qcow2.h
> +++ b/block/qcow2.h
> @@ -157,6 +157,10 @@ typedef struct Qcow2DiscardRegion {
>      QTAILQ_ENTRY(Qcow2DiscardRegion) next;
>  } Qcow2DiscardRegion;
>  
> +typedef struct Qcow2Journal {
> +
> +} Qcow2Journal;
> +
>  typedef struct BDRVQcowState {
>      int cluster_bits;
>      int cluster_size;
> @@ -479,4 +483,78 @@ int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
>      void **table);
>  int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table);
>  
> +/* qcow2-journal.c functions */
> +
> +typedef struct Qcow2JournalTransaction Qcow2JournalTransaction;
> +
> +enum Qcow2JournalEntryTypeID {
> +    QJ_DESC_NOOP    = 0,
> +    QJ_DESC_WRITE   = 1,
> +    QJ_DESC_COPY    = 2,
> +
> +    /* required after a cluster is freed and used for other purposes, so that
> +     * new (unjournalled) data won't be overwritten with stale metadata */
> +    QJ_DESC_REVOKE  = 3,
> +};
> +
> +typedef struct Qcow2JournalEntryType {
> +    enum Qcow2JournalEntryTypeID id;
> +    int (*sync)(void *buf, size_t size);
> +} Qcow2JournalEntryType;
> +
> +typedef struct Qcow2JournalDesc {
> +    uint16_t    type;
> +    uint16_t    size;
> +} QEMU_PACKED Qcow2JournalDesc;
> +
> +typedef struct Qcow2JournalDescWrite {
> +    Qcow2JournalDesc common;
> +    struct {
> +        uint32_t length;
> +        uint64_t target_offset;
> +        uint32_t data_block_index;
> +    } write[];
> +} QEMU_PACKED Qcow2JournalDescData;
> +
> +typedef struct Qcow2JournalDescCopy {
> +    Qcow2JournalDesc common;
> +    struct {
> +        uint32_t length;
> +        uint64_t target_offset;
> +        uint64_t source_offset;
> +    } copy[];
> +} QEMU_PACKED Qcow2JournalDescCopy;
> +
> +typedef struct Qcow2JournalRevoke {
> +    Qcow2JournalDesc common;
> +    struct {
> +        uint32_t length;
> +        uint64_t target_offset;
> +    } revoke[];
> +} QEMU_PACKED Qcow2JournalDescRevoke;
> +
> +void qcow2_journal_register_entry_type(Qcow2JournalEntryType *type);
> +
> +/* When commit_interval seconds have passed since the last commit, or
> + * uncommitted journal data of at least commit_datasize bytes has accumulated
> + * (whatever occurs first), transactions are committed. */
> +int qcow2_journal_init(Qcow2Journal **journal, uint64_t start_offset,
> +                       int commit_interval, size_t commit_datasize);
> +int qcow2_journal_destroy(Qcow2Journal *journal);
> +
> +/* These functions create microtransactions, i.e. a set of operations that must
> + * be executed atomically. In general, qemu doesn't map this to one qcow2
> + * on-disk transaction (which would leave a lot of space unused), but handles
> + * multiple microtransaction with one on-disk transaction. */
> +Qcow2JournalTransaction *qcow2_journal_begin_transaction(Qcow2Journal *journal);
> +void qcow2_journal_add(Qcow2JournalTransaction *ta, Qcow2JournalDesc *desc);
> +void qcow2_journal_end_transaction(Qcow2JournalTransaction *ta);
> +
> +/* Commits all completed microtransactions (i.e. qcow2_journal_end_transaction
> + * has already been called) */
> +int qcow2_journal_commit(Qcow2Journal *journal);
> +
> +/* Syncs all committed transactions */
> +int qcow2_journal_sync(Qcow2Journal *journal);
> +
>  #endif
> diff --git a/docs/specs/qcow2.txt b/docs/specs/qcow2.txt
> index 33eca36..7578a4b 100644
> --- a/docs/specs/qcow2.txt
> +++ b/docs/specs/qcow2.txt
> @@ -85,6 +85,10 @@ in the description of a field.
>                                  be written to (unless for regaining
>                                  consistency).
>  
> +                    Bit 2:      Journal dirty. A replay of the main journal is
> +                                needed in order to regain consistency before
> +                                accessing the image.
> +
>                      Bits 2-63:  Reserved (set to 0)

-EBUSY for bit 2, should continue from bit 3. :)

>  
>           80 -  87:  compatible_features
> @@ -103,7 +107,11 @@ in the description of a field.
>                      write to an image with unknown auto-clear features if it
>                      clears the respective bits from this field first.
>  
> -                    Bits 0-63:  Reserved (set to 0)
> +                    Bit 0:      Journal valid bit. This bit indicates that the
> +                                image contains a valid main journal starting at
> +                                journal_offset.
> +
> +                    Bits 1-63:  Reserved (set to 0)
>  
>           96 -  99:  refcount_order
>                      Describes the width of a reference count block entry (width
> @@ -114,6 +122,16 @@ in the description of a field.
>                      Length of the header structure in bytes. For version 2
>                      images, the length is always assumed to be 72 bytes.
>  
> +        104 - 111:  journal_offset
> +                    Offset into the image file at which the main image journal
> +                    starts. Must be aligned to a cluster boundary. 0 means that
> +                    no journal is used.
> +
> +                    This field is only valid if the journal feature bit in
> +                    autoclear_features is set. If the field is invalid or the
> +                    header is too short to contain the field, the field is
> +                    assumed to be 0 (no journal is used)
> +
>  Directly after the image header, optional sections called header extensions can
>  be stored. Each extension has a structure like the following:
>  
> @@ -355,3 +373,187 @@ Snapshot table entry:
>          variable:   Unique ID string for the snapshot (not null terminated)
>  
>          variable:   Name of the snapshot (not null terminated)
> +
> +
> +== Journal ==
> +
> +Journals are used to allow safe updates of metadata without impacting
> +performance by requiring flushes to order updates to different parts of the
> +metadata. They consist of transactions, which in turn contain operations that
> +are effectively executed atomically. A qcow2 image can have a main image
> +journal that deals with cluster management operations, and additional specific
> +journals can be used by other features like data deduplication.
> +
> +
> +As far as the on-disk format is concerned, a transaction is in one of the
> +following states:
> +
> +    Incomplete:     This is the initial state of any transaction, while new
> +                    operations can still be added. When opening an image with a
> +                    dirty journal, incomplete transactions are discarded.
> +
> +    Committed:      When all operations that must be performed atomically
> +                    during the transaction have been written and are stable on
> +                    disk, the transaction can be committed by increasing the
> +                    commited sequence number in the journal heder. A
> +                    transaction in this state may not be changed. When opening
> +                    an image with a dirty image, committed transactions should
> +                    be replayed.
> +
> +    Synced:         A transaction is synced if all of its operations have been
> +                    performed, all data written is stable on disk, and the
> +                    synced sequence number is increased in the journal header.
> +                    Synced transactions are no longer needed in the journal and
> +                    can be overwritten. They are ignored during replay.
> +
> +The use of a sequence number implies that transactions are processed
> +sequentially and an earlier transaction can never be unsynced/uncommitted if a
> +later one is synced/committed.
> +
> +
> +A journal is organised in journal blocks, all of which have a reference count
> +of exactly 1. It starts with a block containing the following journal header:
> +
> +    Byte  0 -  7:   Magic ("qjournal" ASCII string)
> +
> +          8 - 11:   Journal size in bytes, including the header
> +
> +         12 - 15:   Journal block size order (block size in bytes = 1 << order)
> +                    The block size must be at least 512 bytes and must not
> +                    exceed the cluster size.
> +
> +         16 - 19:   Journal block index of the descriptor for the last
> +                    transaction that has been synced, starting with 1 for the
> +                    journal block after the header. 0 is used for empty

I suggest s/header/journal header/, for less confusion with image header.

> +                    journals.
> +
> +         20 - 23:   Sequence number of the last transaction that has been
> +                    synced. 0 is recommended as the initial value.
> +
> +         24 - 27:   Sequence number of the last transaction that has been
> +                    committed. When replaying a journal, all transactions
> +                    after the last synced one up to the last commit one must be
> +                    synced. Note that this may include a wraparound of sequence
> +                    numbers.
> +
I'm not sure of the downside of relatively frequent wraparounds, but is there
any reason not to use 64-bit sequence numbers, which would make them much rarer?

> +         28 -  31:  Checksum (one's complement of the sum of all bytes in the
> +                    header journal block except those of the checksum field)
> +
> +         32 - 511:  Reserved (set to 0)
> +
> +
> +The header is followed by journal blocks that are either descriptor or data
> +blocks. The block index at byte 16 points to the first valid descriptor, except
> +for completely empty journals, where it can be 0. The next descriptor can be
> +found by skipping a descriptor and its associated data blocks. When the journal
> +size is exceeded, a wraparound occurs, essentially forming a ring buffer.
> +
> +A wraparound may not occur in the middle of a single transaction, but only
> +between two transactions. For the necessary padding an empty descriptor with
> +any number of data blocks can be used as the last entry of the ring.
> +
> +The chain of valid descriptors ends if a descriptor is reached whose sequence
> +number isn't the successor of the previous sequence number. This means in
> +particular that the journal must be ordered chronologically and has ascending
> +sequence numbers (except in the case of a sequence number wraparound).

Worth documenting the wraparound case ( (seq_num_t)-1 => 0x1)?

Fam
> +All blocks from the end of the descriptor chain until the starting point are
> +unused.
> +
> +
> +Descriptor blocks describe one transaction each and have the following
> +structure:
> +
> +    Byte  0 -  3:   Magic ("qjbk" ASCII string)
> +
> +          4 -  7:   Checksum (one's complement of the sum of all bytes in the
> +                    descriptor block except those of the checksum field, and
> +                    all bytes in the associated data blocks)
> +
> +          8 - 11:   Sequence number of the transaction
> +
> +         12 - 15:   Byte offset into the descriptor block at which descriptors
> +                    start
> +
> +         16 - 19:   Total length of descriptors in this block in bytes
> +
> +         20 - 23:   Number of following data blocks that are associated with
> +                    this transaction.
> +
> +         24 -  n:   (Future extensions)
> +
> +          n -  m:   Array of descriptors as described below. The exact values
> +                    of n and m are determined by the above fields.
> +
> +All descriptors start with a common part:
> +
> +    Byte  0 -  1:   Descriptor type
> +                        0 - No-op descriptor
> +                        1 - Write data block
> +                        2 - Copy data
> +                        3 - Revoke
> +                        4 - Deduplication hash insertion
> +                        5 - Deduplication hash deletion
> +
> +          2 -  3:   Size of the descriptor in bytes
> +
> +          4 -  n:   Type-specific data
> +
> +The following section specifies the purpose (i.e. the action that is to be
> +performed when syncing) and type-specific data layout of each descriptor type:
> +
> +  * No-op descriptor: No action is to be performed when syncing this descriptor
> +
> +          4 -  n:   Ignored
> +
> +  * Write data block: Write literal data associated with this transaction from
> +    the journal to a given offset.
> +
> +          4 -  7:   Length of the data to write in bytes
> +
> +          8 - 15:   Offset in the image file to write the data to
> +
> +         16 - 19:   Index of the journal block at which the data to write
> +                    starts. The data must be stored sequentially and be fully
> +                    contained in the data blocks associated with the
> +                    transaction.
> +
> +    The type-specific data can be repeated, specifying multiple chunks of data
> +    to be written in one operation. This means the size of the descriptor must
> +    be 4 + 16 * n.
> +
> +  * Copy data: Copy data from one offset in the image to another one. This can
> +    be used for journalling copy-on-write operations.
> +
> +          4 -  7:   Length of the data to write in bytes
> +
> +          8 - 15:   Target offset in the image file
> +
> +         16 - 23:   Source offset in the image file
> +
> +    The type-specific data can be repeated, specifying multiple chunks of data
> +    to be copied in one operation. This means the size of the descriptor must
> +    be 4 + 20 * n.
> +
> +  * Revoke: Marks operations on a given range in the imag file invalid for all
> +    earlier transactions (this does not include the transaction containing the
> +    revoke). They must not be executed on a sync operation (e.g. because the
> +    range in question has been freed and may have been reused for other, not
> +    journalled data structures that must not be overwritten with stale data).
> +    Note that this may mean that operations are to be executed partially.
> +
> +          4 -  7:   Length of the range in bytes
> +
> +          8 - 15:   Offset of the range in the image file
> +
> +    The type-specific data can be repeated, specifying multiple ranges for
> +    which operations should be revoked. This means the size of the descriptor
> +    must be 4 + 12 * n.
> +
> +  * Deduplication hash insertion: Associates a hash value with a cluster.
> +
> +    TODO
> +
> +  * Deduplication hash deletion: Marks a hash value invalid (e.g. because the
> +    hashed data has changed)
> +
> +    TODO
> -- 
> 1.8.1.4
> 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [RFC] qcow2 journalling draft
  2013-09-06  9:57       ` Kevin Wolf
@ 2013-09-06 10:02         ` Fam Zheng
  0 siblings, 0 replies; 24+ messages in thread
From: Fam Zheng @ 2013-09-06 10:02 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: benoit.canet, jcody, qemu-devel, Stefan Hajnoczi, mreitz

On Fri, 09/06 11:57, Kevin Wolf wrote:
> Am 06.09.2013 um 11:20 hat Fam Zheng geschrieben:
> > On Wed, 09/04 11:39, Kevin Wolf wrote:
> > > First of all, excuse any inconsistencies in the following mail. I wrote
> > > it from top to bottom, and there was some thought process involved in
> > > almost every paragraph...
> > > 
> > > Am 04.09.2013 um 10:03 hat Stefan Hajnoczi geschrieben:
> > > > On Tue, Sep 03, 2013 at 03:45:52PM +0200, Kevin Wolf wrote:
> > > > > @@ -103,7 +107,11 @@ in the description of a field.
> > > > >                      write to an image with unknown auto-clear features if it
> > > > >                      clears the respective bits from this field first.
> > > > >  
> > > > > -                    Bits 0-63:  Reserved (set to 0)
> > > > > +                    Bit 0:      Journal valid bit. This bit indicates that the
> > > > > +                                image contains a valid main journal starting at
> > > > > +                                journal_offset.
> > > > 
> > > > Whether the journal is used can be determined from the journal_offset
> > > > value (header length must be large enough and journal offset must be
> > > > valid).
> > > > 
> > > > Why do we need this autoclear bit?
> > > 
> > > Hm, I introduced this one first and the journal dirty incompatible bit
> > > later, perhaps it's unnecessary now. Let's check...
> > > 
> > > The obvious thing we need to protect against is applying stale journal
> > > data to an image that has been changed by an older version. As long as
> > > the journal is clean, this can't happen, and the journal dirty bit will
> > > ensure that the old version can only open the image if it is clean.
> > > 
> > > However, what if we run 'qemu-img check -r leaks' with an old qemu-img
> > > version? It will reclaim the clusters used by the journal, and if we
> > > continue using the journal we'll corrupt whatever new data is there
> > > now.
> > > 
> > Why can old version qemu-img open the image with dirty journal in the first
> > place? It's incompatible bit.
> 
> This is about a clean journal.
> 
Ah yes, I get it, thanks.

Fam

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [RFC] qcow2 journalling draft
  2013-09-06  7:27           ` Kevin Wolf
@ 2013-09-15 18:23             ` Benoît Canet
  0 siblings, 0 replies; 24+ messages in thread
From: Benoît Canet @ 2013-09-15 18:23 UTC (permalink / raw)
  To: Kevin Wolf
  Cc: Benoît Canet, famz, jcody, qemu-devel, mreitz,
	Stefan Hajnoczi

> > Kevin what do you think of this ?
> > I could strip down the dedupe journal code to specialize it.
> 
> If you think it turns out easier than using the journalling
> infrastructure that we're going to implement anyway, then why not.

This question needs to be thought through.
The good thing about the current dedup log is that the code is already written
and matches the dedup usage very well.

Aside from my personal interest in resuming the work on deduplication quickly,
I think we should consider the overall cost of having a specialized log for dedup.

Is the reviewing and maintenance of http://patchwork.ozlabs.org/patch/252955/
a reasonable extra cost?

Best regards

Benoît

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2013-09-15 18:22 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-09-03 13:45 [Qemu-devel] [RFC] qcow2 journalling draft Kevin Wolf
2013-09-03 14:43 ` Benoît Canet
2013-09-04  8:03 ` Stefan Hajnoczi
2013-09-04  9:37   ` Benoît Canet
2013-09-04  9:39   ` Kevin Wolf
2013-09-04  9:55     ` Benoît Canet
2013-09-05  9:24       ` Stefan Hajnoczi
2013-09-05 15:26         ` Benoît Canet
2013-09-06  7:27           ` Kevin Wolf
2013-09-15 18:23             ` Benoît Canet
2013-09-05  9:21     ` Stefan Hajnoczi
2013-09-05 11:18       ` Kevin Wolf
2013-09-05 14:55         ` Stefan Hajnoczi
2013-09-05 15:20           ` Kevin Wolf
2013-09-05 15:56             ` Eric Blake
2013-09-06  9:20     ` Fam Zheng
2013-09-06  9:57       ` Kevin Wolf
2013-09-06 10:02         ` Fam Zheng
2013-09-04  8:32 ` Max Reitz
2013-09-04 10:12   ` Kevin Wolf
2013-09-05  9:35 ` Stefan Hajnoczi
2013-09-05 11:50   ` Kevin Wolf
2013-09-05 12:08     ` Benoît Canet
2013-09-06  9:59 ` Fam Zheng

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).