qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH 1/2] Make cow_co_is_allocated and cow_update_bitmap more efficient
@ 2013-08-20 18:34 Charlie Shepherd
  2013-08-20 18:37 ` Charlie Shepherd
  2013-08-20 20:48 ` Paolo Bonzini
  0 siblings, 2 replies; 9+ messages in thread
From: Charlie Shepherd @ 2013-08-20 18:34 UTC (permalink / raw)
  To: qemu-devel; +Cc: kwolf, pbonzini, gabriel, Charlie Shepherd, stefanha

cow_co_is_allocated and cow_update_bitmap set bits by reading the relevant
word, setting the specific bit in it and writing it back. These functions set
a number of contiguous bits however, so this is an extremely inefficient way
of doing this. This patch converts them to read the whole bitmap they need in
one go, update it and then write it out, which should be much more
efficient.

This patch has a startling effect on the speed of reads and writes, as measured
by a simple benchmark. This performance increase could likely still
be improved quite easily.

Before:
$ qemu-io -c 'write 0 1M' test.cow
Average over 3 runs: 34Kb/s

$ qemu-io -c 'read 0 1M' test.cow
Average over 3 runs: 28Mb/s

$ qemu-io -c 'read 0 100M' test.cow
Average over 3 runs: 28Mb/s

After:
$ qemu-io -c 'write 0 1M' test.cow
Average over 3 runs: 10 Mb/s

$ qemu-io -c 'read 0 1M' test.cow
Average over 3 runs: 351 Mb/s

$ qemu-io -c 'read 0 100M' test.cow
Average over 3 runs: 426 Mb/s

Which is a ~300x increase in write speed and a ~10x increase in read speed.

Signed-off-by: Charlie Shepherd <charlie@ctshepherd.com>
---
 block/cow.c | 119 ++++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 68 insertions(+), 51 deletions(-)

diff --git a/block/cow.c b/block/cow.c
index 1cc2e89..cd411ec 100644
--- a/block/cow.c
+++ b/block/cow.c
@@ -102,42 +102,10 @@ static int cow_open(BlockDriverState *bs, QDict *options, int flags)
     return ret;
 }
 
-/*
- * XXX(hch): right now these functions are extremely inefficient.
- * We should just read the whole bitmap we'll need in one go instead.
- */
-static inline int cow_set_bit(BlockDriverState *bs, int64_t bitnum)
-{
-    uint64_t offset = sizeof(struct cow_header_v2) + bitnum / 8;
-    uint8_t bitmap;
-    int ret;
-
-    ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap));
-    if (ret < 0) {
-       return ret;
-    }
-
-    bitmap |= (1 << (bitnum % 8));
-
-    ret = bdrv_pwrite_sync(bs->file, offset, &bitmap, sizeof(bitmap));
-    if (ret < 0) {
-       return ret;
-    }
-    return 0;
-}
-
-static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
+/* Cannot use bitmap.c on big-endian machines.  */
+static int cow_test_bit(int64_t bitnum, const uint8_t *bitmap)
 {
-    uint64_t offset = sizeof(struct cow_header_v2) + bitnum / 8;
-    uint8_t bitmap;
-    int ret;
-
-    ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap));
-    if (ret < 0) {
-       return ret;
-    }
-
-    return !!(bitmap & (1 << (bitnum % 8)));
+    return (bitmap[bitnum / 8] & (1 << (bitnum & 7))) != 0;
 }
 
 /* Return true if first block has been changed (ie. current version is
@@ -146,40 +114,82 @@ static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
 static int coroutine_fn cow_co_is_allocated(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, int *num_same)
 {
-    int changed;
+    int ret, changed;
+    uint64_t offset = sizeof(struct cow_header_v2) + sector_num / 8;
+
+    int init_bits = (sector_num % 8) ? (8 - (sector_num % 8)) : 0;
+    int remaining = sector_num - init_bits;
+    int full_bytes = remaining / 8;
+    int trail = remaining % 8;
+
+    int len = !!init_bits + full_bytes + !!trail;
+    uint8_t bitmap[len];
 
     if (nb_sectors == 0) {
-	*num_same = nb_sectors;
-	return 0;
+        *num_same = nb_sectors;
+        return 0;
     }
 
-    changed = is_bit_set(bs, sector_num);
-    if (changed < 0) {
-        return 0; /* XXX: how to return I/O errors? */
+    ret = bdrv_pread(bs->file, offset, bitmap, len);
+    if (ret < 0) {
+        return ret;
     }
 
+    changed = cow_test_bit(sector_num, bitmap);
     for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) {
-	if (is_bit_set(bs, sector_num + *num_same) != changed)
-	    break;
+        if (cow_test_bit(sector_num + *num_same, bitmap) != changed) {
+            break;
+        }
     }
 
     return changed;
 }
 
+/* Set the bits from sector_num to sector_num + nb_sectors in the bitmap of
+ * bs->file. */
 static int cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
         int nb_sectors)
 {
-    int error = 0;
-    int i;
+    int ret;
+    uint64_t offset = sizeof(struct cow_header_v2) + sector_num / 8;
 
-    for (i = 0; i < nb_sectors; i++) {
-        error = cow_set_bit(bs, sector_num + i);
-        if (error) {
-            break;
-        }
+    int init_bits = (sector_num % 8) ? (8 - (sector_num % 8)) : 0;
+    int remaining = sector_num - init_bits;
+    int full_bytes = remaining / 8;
+    int trail = remaining % 8;
+
+    int len = !!init_bits + full_bytes + !!trail;
+    uint8_t buf[len];
+
+    ret = bdrv_pread(bs->file, offset, buf, len);
+    if (ret < 0) {
+        return ret;
+    }
+
+    /* Do sector_num -> nearest byte boundary */
+    if (init_bits) {
+        /* This sets the highest init_bits bits in the byte */
+        uint8_t bits = ((1 << init_bits) - 1) << (8 - init_bits);
+        buf[0] |= bits;
+    }
+
+    if (full_bytes) {
+        memset(&buf[!!init_bits], ~0, full_bytes);
+    }
+
+    /* Set the trailing bits in the final byte */
+    if (trail) {
+        /* This sets the lowest trail bits in the byte */
+        uint8_t bits = (1 << trail) - 1;
+        buf[len - 1] |= bits;
+    }
+
+    ret = bdrv_pwrite(bs->file, offset, buf, len);
+    if (ret < 0) {
+        return ret;
     }
 
-    return error;
+    return 0;
 }
 
 static int coroutine_fn cow_read(BlockDriverState *bs, int64_t sector_num,
@@ -237,6 +247,13 @@ static int cow_write(BlockDriverState *bs, int64_t sector_num,
         return ret;
     }
 
+    /* We need to flush the data before writing the metadata so that there is
+     * no chance of metadata referring to data that doesn't exist. */
+    ret = bdrv_flush(bs->file);
+    if (ret < 0) {
+        return ret;
+    }
+
     return cow_update_bitmap(bs, sector_num, nb_sectors);
 }
 
-- 
1.8.3.2

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [Qemu-devel] [PATCH 1/2] Make cow_co_is_allocated and cow_update_bitmap more efficient
  2013-08-20 18:34 [Qemu-devel] [PATCH 1/2] Make cow_co_is_allocated and cow_update_bitmap more efficient Charlie Shepherd
@ 2013-08-20 18:37 ` Charlie Shepherd
  2013-08-20 20:48 ` Paolo Bonzini
  1 sibling, 0 replies; 9+ messages in thread
From: Charlie Shepherd @ 2013-08-20 18:37 UTC (permalink / raw)
  To: Charlie Shepherd; +Cc: kwolf, pbonzini, gabriel, qemu-devel, stefanha

Sorry, this is not only NOT 1/2, but it should say v2, as this is also 
based on merging in Paolo's similar patch in his get_block_status series.

On 20/08/2013 19:34, Charlie Shepherd wrote:
> cow_co_is_allocated and cow_update_bitmap set bits by reading the relevant
> word, setting the specific bit in it and writing it back. These functions set
> a number of contiguous bits however, so this is an extremely inefficient way
> of doing this. This patch converts them to read the whole bitmap they need in
> one go, update it and then write it out, which should be much more more
> efficient.
>
> This patch has a startling effect on the speed reads and writes, as measured
> by a simple benchmark. This performance increase could likely still
> be improved quite easily.
>
> Before:
> $ qemu-io -c 'write 0 1M' test.cow
> Average over 3 runs: 34Kb/s
>
> $ qemu-io -c 'read 0 1M' test.cow
> Average over 3 runs:28Mb/s
>
> $ qemu-io -c 'read 0 100M' test.cow
> Average over 3 runs:28Mb/s
>
> After:
> $ qemu-io -c 'write 0 1M' test.cow
> Average over 3 runs: 10 Mb/s
>
> $ qemu-io -c 'read 0 1M' test.cow
> Average over 3 runs: 351 Mb/s
>
> $ qemu-io -c 'read 0 100M' test.cow
> Average over 3 runs: 426 Mb/s
>
> Which is a ~300x increase in write speed and a ~10x increase in read speed.
>
> Signed-off-by: Charlie Shepherd <charlie@ctshepherd.com>
> ---
>   block/cow.c | 119 ++++++++++++++++++++++++++++++++++--------------------------
>   1 file changed, 68 insertions(+), 51 deletions(-)
>
> diff --git a/block/cow.c b/block/cow.c
> index 1cc2e89..cd411ec 100644
> --- a/block/cow.c
> +++ b/block/cow.c
> @@ -102,42 +102,10 @@ static int cow_open(BlockDriverState *bs, QDict *options, int flags)
>       return ret;
>   }
>   
> -/*
> - * XXX(hch): right now these functions are extremely inefficient.
> - * We should just read the whole bitmap we'll need in one go instead.
> - */
> -static inline int cow_set_bit(BlockDriverState *bs, int64_t bitnum)
> -{
> -    uint64_t offset = sizeof(struct cow_header_v2) + bitnum / 8;
> -    uint8_t bitmap;
> -    int ret;
> -
> -    ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap));
> -    if (ret < 0) {
> -       return ret;
> -    }
> -
> -    bitmap |= (1 << (bitnum % 8));
> -
> -    ret = bdrv_pwrite_sync(bs->file, offset, &bitmap, sizeof(bitmap));
> -    if (ret < 0) {
> -       return ret;
> -    }
> -    return 0;
> -}
> -
> -static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
> +/* Cannot use bitmap.c on big-endian machines.  */
> +static int cow_test_bit(int64_t bitnum, const uint8_t *bitmap)
>   {
> -    uint64_t offset = sizeof(struct cow_header_v2) + bitnum / 8;
> -    uint8_t bitmap;
> -    int ret;
> -
> -    ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap));
> -    if (ret < 0) {
> -       return ret;
> -    }
> -
> -    return !!(bitmap & (1 << (bitnum % 8)));
> +    return (bitmap[bitnum / 8] & (1 << (bitnum & 7))) != 0;
>   }
>   
>   /* Return true if first block has been changed (ie. current version is
> @@ -146,40 +114,82 @@ static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
>   static int coroutine_fn cow_co_is_allocated(BlockDriverState *bs,
>           int64_t sector_num, int nb_sectors, int *num_same)
>   {
> -    int changed;
> +    int ret, changed;
> +    uint64_t offset = sizeof(struct cow_header_v2) + sector_num / 8;
> +
> +    int init_bits = (sector_num % 8) ? (8 - (sector_num % 8)) : 0;
> +    int remaining = sector_num - init_bits;
> +    int full_bytes = remaining / 8;
> +    int trail = remaining % 8;
> +
> +    int len = !!init_bits + full_bytes + !!trail;
> +    uint8_t bitmap[len];
>   
>       if (nb_sectors == 0) {
> -	*num_same = nb_sectors;
> -	return 0;
> +        *num_same = nb_sectors;
> +        return 0;
>       }
>   
> -    changed = is_bit_set(bs, sector_num);
> -    if (changed < 0) {
> -        return 0; /* XXX: how to return I/O errors? */
> +    ret = bdrv_pread(bs->file, offset, bitmap, len);
> +    if (ret < 0) {
> +        return ret;
>       }
>   
> +    changed = cow_test_bit(sector_num, bitmap);
>       for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) {
> -	if (is_bit_set(bs, sector_num + *num_same) != changed)
> -	    break;
> +        if (cow_test_bit(sector_num + *num_same, bitmap) != changed) {
> +            break;
> +        }
>       }
>   
>       return changed;
>   }
>   
> +/* Set the bits from sector_num to sector_num + nb_sectors in the bitmap of
> + * bs->file. */
>   static int cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
>           int nb_sectors)
>   {
> -    int error = 0;
> -    int i;
> +    int ret;
> +    uint64_t offset = sizeof(struct cow_header_v2) + sector_num / 8;
>   
> -    for (i = 0; i < nb_sectors; i++) {
> -        error = cow_set_bit(bs, sector_num + i);
> -        if (error) {
> -            break;
> -        }
> +    int init_bits = (sector_num % 8) ? (8 - (sector_num % 8)) : 0;
> +    int remaining = sector_num - init_bits;
> +    int full_bytes = remaining / 8;
> +    int trail = remaining % 8;
> +
> +    int len = !!init_bits + full_bytes + !!trail;
> +    uint8_t buf[len];
> +
> +    ret = bdrv_pread(bs->file, offset, buf, len);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    /* Do sector_num -> nearest byte boundary */
> +    if (init_bits) {
> +        /* This sets the highest init_bits bits in the byte */
> +        uint8_t bits = ((1 << init_bits) - 1) << (8 - init_bits);
> +        buf[0] |= bits;
> +    }
> +
> +    if (full_bytes) {
> +        memset(&buf[!!init_bits], ~0, full_bytes);
> +    }
> +
> +    /* Set the trailing bits in the final byte */
> +    if (trail) {
> +        /* This sets the lowest trail bits in the byte */
> +        uint8_t bits = (1 << trail) - 1;
> +        buf[len - 1] |= bits;
> +    }
> +
> +    ret = bdrv_pwrite(bs->file, offset, buf, len);
> +    if (ret < 0) {
> +        return ret;
>       }
>   
> -    return error;
> +    return 0;
>   }
>   
>   static int coroutine_fn cow_read(BlockDriverState *bs, int64_t sector_num,
> @@ -237,6 +247,13 @@ static int cow_write(BlockDriverState *bs, int64_t sector_num,
>           return ret;
>       }
>   
> +    /* We need to flush the data before writing the metadata so that there is
> +     * no chance of metadata referring to data that doesn't exist. */
> +    ret = bdrv_flush(bs->file);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
>       return cow_update_bitmap(bs, sector_num, nb_sectors);
>   }
>   

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Qemu-devel] [PATCH 1/2] Make cow_co_is_allocated and cow_update_bitmap more efficient
  2013-08-20 18:34 [Qemu-devel] [PATCH 1/2] Make cow_co_is_allocated and cow_update_bitmap more efficient Charlie Shepherd
  2013-08-20 18:37 ` Charlie Shepherd
@ 2013-08-20 20:48 ` Paolo Bonzini
  2013-08-20 22:53   ` Charlie Shepherd
  1 sibling, 1 reply; 9+ messages in thread
From: Paolo Bonzini @ 2013-08-20 20:48 UTC (permalink / raw)
  To: Charlie Shepherd; +Cc: kwolf, stefanha, gabriel, qemu-devel

Il 20/08/2013 20:34, Charlie Shepherd ha scritto:
> cow_co_is_allocated and cow_update_bitmap set bits by reading the relevant
> word, setting the specific bit in it and writing it back. These functions set
> a number of contiguous bits however, so this is an extremely inefficient way
> of doing this. This patch converts them to read the whole bitmap they need in
> one go, update it and then write it out, which should be much more more
> efficient.
> 
> This patch has a startling effect on the speed reads and writes, as measured
> by a simple benchmark. This performance increase could likely still
> be improved quite easily.
> 
> Before:
> $ qemu-io -c 'write 0 1M' test.cow
> Average over 3 runs: 34Kb/s
> 
> $ qemu-io -c 'read 0 1M' test.cow
> Average over 3 runs:28Mb/s
> 
> $ qemu-io -c 'read 0 100M' test.cow
> Average over 3 runs:28Mb/s
> 
> After:
> $ qemu-io -c 'write 0 1M' test.cow
> Average over 3 runs: 10 Mb/s
> 
> $ qemu-io -c 'read 0 1M' test.cow
> Average over 3 runs: 351 Mb/s
> 
> $ qemu-io -c 'read 0 100M' test.cow
> Average over 3 runs: 426 Mb/s
> 
> Which is a ~300x increase in write speed and a ~10x increase in read speed.
> 
> Signed-off-by: Charlie Shepherd <charlie@ctshepherd.com>
> ---
>  block/cow.c | 119 ++++++++++++++++++++++++++++++++++--------------------------
>  1 file changed, 68 insertions(+), 51 deletions(-)
> 
> diff --git a/block/cow.c b/block/cow.c
> index 1cc2e89..cd411ec 100644
> --- a/block/cow.c
> +++ b/block/cow.c
> @@ -102,42 +102,10 @@ static int cow_open(BlockDriverState *bs, QDict *options, int flags)
>      return ret;
>  }
>  
> -/*
> - * XXX(hch): right now these functions are extremely inefficient.
> - * We should just read the whole bitmap we'll need in one go instead.
> - */
> -static inline int cow_set_bit(BlockDriverState *bs, int64_t bitnum)
> -{
> -    uint64_t offset = sizeof(struct cow_header_v2) + bitnum / 8;
> -    uint8_t bitmap;
> -    int ret;
> -
> -    ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap));
> -    if (ret < 0) {
> -       return ret;
> -    }
> -
> -    bitmap |= (1 << (bitnum % 8));
> -
> -    ret = bdrv_pwrite_sync(bs->file, offset, &bitmap, sizeof(bitmap));
> -    if (ret < 0) {
> -       return ret;
> -    }
> -    return 0;
> -}
> -
> -static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
> +/* Cannot use bitmap.c on big-endian machines.  */
> +static int cow_test_bit(int64_t bitnum, const uint8_t *bitmap)
>  {
> -    uint64_t offset = sizeof(struct cow_header_v2) + bitnum / 8;
> -    uint8_t bitmap;
> -    int ret;
> -
> -    ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap));
> -    if (ret < 0) {
> -       return ret;
> -    }
> -
> -    return !!(bitmap & (1 << (bitnum % 8)));
> +    return (bitmap[bitnum / 8] & (1 << (bitnum & 7))) != 0;
>  }
>  
>  /* Return true if first block has been changed (ie. current version is
> @@ -146,40 +114,82 @@ static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
>  static int coroutine_fn cow_co_is_allocated(BlockDriverState *bs,
>          int64_t sector_num, int nb_sectors, int *num_same)
>  {
> -    int changed;
> +    int ret, changed;
> +    uint64_t offset = sizeof(struct cow_header_v2) + sector_num / 8;
> +
> +    int init_bits = (sector_num % 8) ? (8 - (sector_num % 8)) : 0;
> +    int remaining = sector_num - init_bits;
> +    int full_bytes = remaining / 8;
> +    int trail = remaining % 8;
> +
> +    int len = !!init_bits + full_bytes + !!trail;
> +    uint8_t bitmap[len];

This is a basically unbounded allocation on the stack.  You should split
this in smaller ranges using the "num_same" argument, which is what I
did in my patch.

>      if (nb_sectors == 0) {
> -	*num_same = nb_sectors;
> -	return 0;
> +        *num_same = nb_sectors;
> +        return 0;
>      }
>  
> -    changed = is_bit_set(bs, sector_num);
> -    if (changed < 0) {
> -        return 0; /* XXX: how to return I/O errors? */
> +    ret = bdrv_pread(bs->file, offset, bitmap, len);
> +    if (ret < 0) {
> +        return ret;
>      }
>  
> +    changed = cow_test_bit(sector_num, bitmap);
>      for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) {
> -	if (is_bit_set(bs, sector_num + *num_same) != changed)
> -	    break;
> +        if (cow_test_bit(sector_num + *num_same, bitmap) != changed) {
> +            break;
> +        }
>      }
>  
>      return changed;
>  }
>  
> +/* Set the bits from sector_num to sector_num + nb_sectors in the bitmap of
> + * bs->file. */
>  static int cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
>          int nb_sectors)
>  {
> -    int error = 0;
> -    int i;
> +    int ret;
> +    uint64_t offset = sizeof(struct cow_header_v2) + sector_num / 8;
>  
> -    for (i = 0; i < nb_sectors; i++) {
> -        error = cow_set_bit(bs, sector_num + i);
> -        if (error) {
> -            break;
> -        }
> +    int init_bits = (sector_num % 8) ? (8 - (sector_num % 8)) : 0;
> +    int remaining = sector_num - init_bits;
> +    int full_bytes = remaining / 8;
> +    int trail = remaining % 8;
> +
> +    int len = !!init_bits + full_bytes + !!trail;
> +    uint8_t buf[len];

Here your patch has indeed an improvement over mine.  However, this is
another basically unbounded allocation on the stack.  You should split
bitmap updates in smaller parts (doing 512-byte aligned writes is fine,
each covers 2MB in the file and writes this big are very rare!).

> +    ret = bdrv_pread(bs->file, offset, buf, len);
> +    if (ret < 0) { 
> +        return ret;
> +    }
> +
> +    /* Do sector_num -> nearest byte boundary */
> +    if (init_bits) {
> +        /* This sets the highest init_bits bits in the byte */
> +        uint8_t bits = ((1 << init_bits) - 1) << (8 - init_bits);
> +        buf[0] |= bits;
> +    }
> +
> +    if (full_bytes) {
> +        memset(&buf[!!init_bits], ~0, full_bytes);
> +    }
> +
> +    /* Set the trailing bits in the final byte */
> +    if (trail) {
> +        /* This sets the lowest trail bits in the byte */
> +        uint8_t bits = (1 << trail) - 1;
> +        buf[len - 1] |= bits;
> +    }

... and you should also check if there is a change in the bits, and skip
the flush if there is no change.  Flushing a multi-megabyte write is
very expensive.  It basically makes format=cow as slow as
format=raw,cache=writethrough.

> +    ret = bdrv_pwrite(bs->file, offset, buf, len);
> +    if (ret < 0) {
> +        return ret;
>      }
>  
> -    return error;
> +    return 0;
>  }
>  
>  static int coroutine_fn cow_read(BlockDriverState *bs, int64_t sector_num,
> @@ -237,6 +247,13 @@ static int cow_write(BlockDriverState *bs, int64_t sector_num,
>          return ret;
>      }
>  
> +    /* We need to flush the data before writing the metadata so that there is
> +     * no chance of metadata referring to data that doesn't exist. */
> +    ret = bdrv_flush(bs->file);
> +    if (ret < 0) {
> +        return ret;
> +    }

See above about this flush.

Paolo

>      return cow_update_bitmap(bs, sector_num, nb_sectors);
>  }
>  
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Qemu-devel] [PATCH 1/2] Make cow_co_is_allocated and cow_update_bitmap more efficient
  2013-08-20 20:48 ` Paolo Bonzini
@ 2013-08-20 22:53   ` Charlie Shepherd
  2013-08-21  8:14     ` Paolo Bonzini
  0 siblings, 1 reply; 9+ messages in thread
From: Charlie Shepherd @ 2013-08-20 22:53 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kwolf, stefanha, gabriel, qemu-devel

On 20/08/2013 21:48, Paolo Bonzini wrote:
> Il 20/08/2013 20:34, Charlie Shepherd ha scritto:
>>   /* Return true if first block has been changed (ie. current version is
>> @@ -146,40 +114,82 @@ static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
>>   static int coroutine_fn cow_co_is_allocated(BlockDriverState *bs,
>>           int64_t sector_num, int nb_sectors, int *num_same)
>>   {
>> -    int changed;
>> +    int ret, changed;
>> +    uint64_t offset = sizeof(struct cow_header_v2) + sector_num / 8;
>> +
>> +    int init_bits = (sector_num % 8) ? (8 - (sector_num % 8)) : 0;
>> +    int remaining = sector_num - init_bits;
>> +    int full_bytes = remaining / 8;
>> +    int trail = remaining % 8;
>> +
>> +    int len = !!init_bits + full_bytes + !!trail;
>> +    uint8_t bitmap[len];
> This is a basically unbounded allocation on the stack.  You should split
> this in smaller ranges using the "num_same" argument, which is what I
> did in my patch.

So if I understand your patch correctly, you read the next 512 bytes 
(ie, one BDRV_SECTOR_SIZE) after offset into bitmap? Is this guaranteed 
to be safe (like if the file isn't that long)? What if nb_sectors > 512 * 8?
I think it's best to use your version of cow_co_is_allocated(), but 
those are the questions that come to mind when trying to convert the 
stack allocation in cow_update_bitmap()

>>       if (nb_sectors == 0) {
>> -	*num_same = nb_sectors;
>> -	return 0;
>> +        *num_same = nb_sectors;
>> +        return 0;
>>       }
>>   
>> -    changed = is_bit_set(bs, sector_num);
>> -    if (changed < 0) {
>> -        return 0; /* XXX: how to return I/O errors? */
>> +    ret = bdrv_pread(bs->file, offset, bitmap, len);
>> +    if (ret < 0) {
>> +        return ret;
>>       }
>>   
>> +    changed = cow_test_bit(sector_num, bitmap);
>>       for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) {
>> -	if (is_bit_set(bs, sector_num + *num_same) != changed)
>> -	    break;
>> +        if (cow_test_bit(sector_num + *num_same, bitmap) != changed) {
>> +            break;
>> +        }
>>       }
>>   
>>       return changed;
>>   }
>>   
>> +/* Set the bits from sector_num to sector_num + nb_sectors in the bitmap of
>> + * bs->file. */
>>   static int cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
>>           int nb_sectors)
>>   {
>> -    int error = 0;
>> -    int i;
>> +    int ret;
>> +    uint64_t offset = sizeof(struct cow_header_v2) + sector_num / 8;
>>   
>> -    for (i = 0; i < nb_sectors; i++) {
>> -        error = cow_set_bit(bs, sector_num + i);
>> -        if (error) {
>> -            break;
>> -        }
>> +    int init_bits = (sector_num % 8) ? (8 - (sector_num % 8)) : 0;
>> +    int remaining = sector_num - init_bits;
>> +    int full_bytes = remaining / 8;
>> +    int trail = remaining % 8;
>> +
>> +    int len = !!init_bits + full_bytes + !!trail;
>> +    uint8_t buf[len];
> Here your patch has indeed an improvement over mine.  However, this is
> another basically unbounded allocation on the stack.  You should split
> bitmap updates in smaller parts (doing 512-byte aligned writes is fine,
> each covers 2MB in the file and writes this big are very rare!).
>
>> +    ret = bdrv_pread(bs->file, offset, buf, len);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    /* Do sector_num -> nearest byte boundary */
>> +    if (init_bits) {
>> +        /* This sets the highest init_bits bits in the byte */
>> +        uint8_t bits = ((1 << init_bits) - 1) << (8 - init_bits);
>> +        buf[0] |= bits;
>> +    }
>> +
>> +    if (full_bytes) {
>> +        memset(&buf[!!init_bits], ~0, full_bytes);
>> +    }
>> +
>> +    /* Set the trailing bits in the final byte */
>> +    if (trail) {
>> +        /* This sets the lowest trail bits in the byte */
>> +        uint8_t bits = (1 << trail) - 1;
>> +        buf[len - 1] |= bits;
>> +    }
> ... and you should also check if there is a change in the bits, and skip
> the flush if there is no change.  Flushing a multi-megabyte write is
> very expensive.  It basically makes format=cow as slow as
> format=raw,cache=writethrough.

So if ORing the allocation makes no difference, don't flush?


Charlie
>> +    ret = bdrv_pwrite(bs->file, offset, buf, len);
>> +    if (ret < 0) {
>> +        return ret;
>>       }
>>   
>> -    return error;
>> +    return 0;
>>   }
>>   
>>   static int coroutine_fn cow_read(BlockDriverState *bs, int64_t sector_num,
>> @@ -237,6 +247,13 @@ static int cow_write(BlockDriverState *bs, int64_t sector_num,
>>           return ret;
>>       }
>>   
>> +    /* We need to flush the data before writing the metadata so that there is
>> +     * no chance of metadata referring to data that doesn't exist. */
>> +    ret = bdrv_flush(bs->file);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
> See above about this flush.
>
> Paolo
>
>>       return cow_update_bitmap(bs, sector_num, nb_sectors);
>>   }
>>   
>>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Qemu-devel] [PATCH 1/2] Make cow_co_is_allocated and cow_update_bitmap more efficient
  2013-08-20 22:53   ` Charlie Shepherd
@ 2013-08-21  8:14     ` Paolo Bonzini
  2013-08-21  9:11       ` Charlie Shepherd
  0 siblings, 1 reply; 9+ messages in thread
From: Paolo Bonzini @ 2013-08-21  8:14 UTC (permalink / raw)
  To: Charlie Shepherd; +Cc: kwolf, stefanha, gabriel, qemu-devel

Il 21/08/2013 00:53, Charlie Shepherd ha scritto:
> On 20/08/2013 21:48, Paolo Bonzini wrote:
>> Il 20/08/2013 20:34, Charlie Shepherd ha scritto:
>>>   /* Return true if first block has been changed (ie. current version is
>>> @@ -146,40 +114,82 @@ static inline int is_bit_set(BlockDriverState
>>> *bs, int64_t bitnum)
>>>   static int coroutine_fn cow_co_is_allocated(BlockDriverState *bs,
>>>           int64_t sector_num, int nb_sectors, int *num_same)
>>>   {
>>> -    int changed;
>>> +    int ret, changed;
>>> +    uint64_t offset = sizeof(struct cow_header_v2) + sector_num / 8;
>>> +
>>> +    int init_bits = (sector_num % 8) ? (8 - (sector_num % 8)) : 0;
>>> +    int remaining = sector_num - init_bits;
>>> +    int full_bytes = remaining / 8;
>>> +    int trail = remaining % 8;
>>> +
>>> +    int len = !!init_bits + full_bytes + !!trail;
>>> +    uint8_t bitmap[len];
>> This is a basically unbounded allocation on the stack.  You should split
>> this in smaller ranges using the "num_same" argument, which is what I
>> did in my patch.
> 
> So if I understand your patch correctly, you read the next 512 bytes
> (ie, one BDRV_SECTOR_SIZE) after offset into bitmap? Is this guaranteed
> to be safe (like if the file isn't that long)?

Yes, because the bitmap is always before the data, and always rounded to
full sector size so that the data stays aligned:

    bitmap_size = ((bs->total_sectors + 7) >> 3) + sizeof(cow_header);
    s->cow_sectors_offset = (bitmap_size + 511) & ~511;

> What if nb_sectors > 512 * 8?

For cow_co_is_allocated, you have the luxury of returning information
only for the fewer than nb_sectors.  That is, you can set *num_same to a
smaller value than nb_sectors, even if sector_num + *num_same has the
same state as the [sector_num, sector_num + *num_same) range.  It will
cause extra calls to is_allocated in the callers, but that's it.

> I think it's best to use your version of cow_co_is_allocated(), but
> those are the questions that come to mind when trying to convert the
> stack allocation in cow_update_bitmap()

Good point.

>>> +    ret = bdrv_pread(bs->file, offset, buf, len);
>>> +    if (ret < 0) {
>>> +        return ret;
>>> +    }
>>> +
>>> +    /* Do sector_num -> nearest byte boundary */
>>> +    if (init_bits) {
>>> +        /* This sets the highest init_bits bits in the byte */
>>> +        uint8_t bits = ((1 << init_bits) - 1) << (8 - init_bits);
>>> +        buf[0] |= bits;
>>> +    }
>>> +
>>> +    if (full_bytes) {
>>> +        memset(&buf[!!init_bits], ~0, full_bytes);
>>> +    }
>>> +
>>> +    /* Set the trailing bits in the final byte */
>>> +    if (trail) {
>>> +        /* This sets the lowest trail bits in the byte */
>>> +        uint8_t bits = (1 << trail) - 1;
>>> +        buf[len - 1] |= bits;
>>> +    }
>> ... and you should also check if there is a change in the bits, and skip
>> the flush if there is no change.  Flushing a multi-megabyte write is
>> very expensive.  It basically makes format=cow as slow as
>> format=raw,cache=writethrough.
> 
> So if ORing the allocation makes no difference, don't flush?

Yep!  This means if an image is fully COW-ed, there will be no extra
flush (only extra reads).

I already did this, but in a very inefficient way because each bit would
be read separately.

Paolo

> 
> Charlie
>>> +    ret = bdrv_pwrite(bs->file, offset, buf, len);
>>> +    if (ret < 0) {
>>> +        return ret;
>>>       }
>>>   -    return error;
>>> +    return 0;
>>>   }
>>>     static int coroutine_fn cow_read(BlockDriverState *bs, int64_t
>>> sector_num,
>>> @@ -237,6 +247,13 @@ static int cow_write(BlockDriverState *bs,
>>> int64_t sector_num,
>>>           return ret;
>>>       }
>>>   +    /* We need to flush the data before writing the metadata so
>>> that there is
>>> +     * no chance of metadata referring to data that doesn't exist. */
>>> +    ret = bdrv_flush(bs->file);
>>> +    if (ret < 0) {
>>> +        return ret;
>>> +    }
>> See above about this flush.
>>
>> Paolo
>>
>>>       return cow_update_bitmap(bs, sector_num, nb_sectors);
>>>   }
>>>  
> 
> 
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Qemu-devel] [PATCH 1/2] Make cow_co_is_allocated and cow_update_bitmap more efficient
  2013-08-21  8:14     ` Paolo Bonzini
@ 2013-08-21  9:11       ` Charlie Shepherd
  2013-08-21  9:19         ` Paolo Bonzini
  0 siblings, 1 reply; 9+ messages in thread
From: Charlie Shepherd @ 2013-08-21  9:11 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kwolf, stefanha, gabriel, qemu-devel

On 21/08/2013 09:14, Paolo Bonzini wrote:
> Il 21/08/2013 00:53, Charlie Shepherd ha scritto:
>> What if nb_sectors > 512 * 8?
> For cow_co_is_allocated, you have the luxury of returning information
> only for the fewer than nb_sectors.  That is, you can set *num_same to a
> smaller value than nb_sectors, even if sector_num + *num_same has the
> same state as the [sector_num, sector_num + *num_same) range.  It will
> cause extra calls to is_allocated in the callers, but that's it.
So we can report a conservative estimate of *num_same? It still seems 
worthwhile to me to be as efficient as possible, I guess that means 
processing a sector's worth of metadata at a time?

Charlie

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Qemu-devel] [PATCH 1/2] Make cow_co_is_allocated and cow_update_bitmap more efficient
  2013-08-21  9:11       ` Charlie Shepherd
@ 2013-08-21  9:19         ` Paolo Bonzini
  2013-08-21 10:31           ` Charlie Shepherd
  0 siblings, 1 reply; 9+ messages in thread
From: Paolo Bonzini @ 2013-08-21  9:19 UTC (permalink / raw)
  To: Charlie Shepherd; +Cc: kwolf, stefanha, gabriel, qemu-devel

Il 21/08/2013 11:11, Charlie Shepherd ha scritto:
>>>
>> For cow_co_is_allocated, you have the luxury of returning information
>> for fewer than nb_sectors sectors.  That is, you can set *num_same to a
>> smaller value than nb_sectors, even if sector_num + *num_same has the
>> same state as the [sector_num, sector_num + *num_same) range.  It will
>> cause extra calls to is_allocated in the callers, but that's it.
> So we can report a conservative estimate of *num_same?

Yes.  We can set *num_same to the number of bits from sector_num to the
end of the bitmap sector.

> It still seems
> worthwhile to me to be as efficient as possible, I guess that means
> processing a sector's worth of metadata at a time?

Yes, that's what my patches do.  My is_allocated and flushing strategy +
something like your replacement of cow_set_bit (just without the
unbounded allocation) should be pretty good.

Perhaps you can use a cow_co_is_allocated loop after writing the data.
If it returns 0, you flush (the first time only) and call your
cow_update_bitmap.  Then you advance by num_same sectors and go on until
you did all the nb_sectors.  The disadvantage is that it does two reads
(one in cow_co_is_allocated, one in cow_update_bitmap).  The advantage
is that unbounded allocation goes away because cow_co_is_allocated will
never consider more than a sector of bitmap data.  And you can reuse all
your cow_update_bitmap code.

To be as efficient as possible, you could keep a memory copy of the
bitmap, so that you only have to do writes, not reads.  The memory copy
would be somewhat expensive of course but perhaps reasonable (256M of
memory for a 1T image).  The largest cost would be loading the bitmap
into memory at startup.  But that can be done later.

Paolo

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Qemu-devel] [PATCH 1/2] Make cow_co_is_allocated and cow_update_bitmap more efficient
  2013-08-21  9:19         ` Paolo Bonzini
@ 2013-08-21 10:31           ` Charlie Shepherd
  2013-08-21 10:38             ` Paolo Bonzini
  0 siblings, 1 reply; 9+ messages in thread
From: Charlie Shepherd @ 2013-08-21 10:31 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kwolf, stefanha, gabriel, qemu-devel

On 21/08/2013 10:19, Paolo Bonzini wrote:
> Il 21/08/2013 11:11, Charlie Shepherd ha scritto:
>> It still seems
>> worthwhile to me to be as efficient as possible, I guess that means
>> processing a sector's worth of metadata at a time?
> Yes, that's what my patches do.  My is_allocated and flushing strategy +
> something like your replacement of cow_set_bit (just without the
> unbounded allocation) should be pretty good.
>
> Perhaps you can use a cow_co_is_allocated loop after writing the data.
> If it returns 0, you flush (the first time only) and call your
> cow_update_bitmap.  Then you advance by num_same sectors and go on until
> you did all the nb_sectors.  The disadvantage is that it does two reads
> (one in cow_co_is_allocated, one in cow_update_bitmap).  The advantage
> is that unbounded allocation goes away because cow_co_is_allocated will
> never consider more than a sector of bitmap data.  And you can reuse all
> your cow_update_bitmap code.

Agreed. But can the two functions not share the same read data? As in:

cow_is_allocated(char *buf) {
   return test_bits(buf)
}

cow_co_is_allocated(Bdrv *b) {
   char sector[SECTOR_SIZE]
   bdrv_read(b, sector, sector_num)
   return cow_is_allocated(sector)
}

cow_update_bitmap(Bdrv *b) {
   while (sector) {
     char buf[SECTOR_SIZE]
     bdrv_read(b, buf, sector_num)
     if (cow_is_allocated(buf)) {
         if (!flushed)
             bdrv_flush()
         cow_update_bitmap_sector(buf)
         bdrv_write(buf)
     }
   }
}

> To be as efficient as possible, you could keep a memory copy of the
> bitmap, so that you only have to do writes, not reads.  The memory copy
> would be somewhat expensive of course but perhaps reasonable (256M of
> memory for a 1T image).  The largest cost would be loading the bitmap
> into memory at startup.  But that can be done later.

Yes I considered that option too. I guess it can be considered as
another patch, with the performance implications measured against the
results of this patch.


Charlie

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Qemu-devel] [PATCH 1/2] Make cow_co_is_allocated and cow_update_bitmap more efficient
  2013-08-21 10:31           ` Charlie Shepherd
@ 2013-08-21 10:38             ` Paolo Bonzini
  0 siblings, 0 replies; 9+ messages in thread
From: Paolo Bonzini @ 2013-08-21 10:38 UTC (permalink / raw)
  To: Charlie Shepherd; +Cc: kwolf, stefanha, gabriel, qemu-devel

Il 21/08/2013 12:31, Charlie Shepherd ha scritto:
> On 21/08/2013 10:19, Paolo Bonzini wrote:
>> Il 21/08/2013 11:11, Charlie Shepherd ha scritto:
>>> It still seems
>>> worthwhile to me to be as efficient as possible, I guess that means
>>> processing a sector's worth of metadata at a time?
>> Yes, that's what my patches do.  My is_allocated and flushing strategy +
>> something like your replacement of cow_set_bit (just without the
>> unbounded allocation) should be pretty good.
>>
>> Perhaps you can use a cow_co_is_allocated loop after writing the data.
>> If it returns 0, you flush (the first time only) and call your
>> cow_update_bitmap.  Then you advance by num_same sectors and go on until
>> you did all the nb_sectors.  The disadvantage is that it does two reads
>> (one in cow_co_is_allocated, one in cow_update_bitmap).  The advantage
>> is that unbounded allocation goes away because cow_co_is_allocated will
>> never consider more than a sector of bitmap data.  And you can reuse all
>> your cow_update_bitmap code.
> 
> Agreed. But can the two functions not share the same read data?

Yes, of course!  Good idea.

Paolo

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2013-08-21 10:39 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-08-20 18:34 [Qemu-devel] [PATCH 1/2] Make cow_co_is_allocated and cow_update_bitmap more efficient Charlie Shepherd
2013-08-20 18:37 ` Charlie Shepherd
2013-08-20 20:48 ` Paolo Bonzini
2013-08-20 22:53   ` Charlie Shepherd
2013-08-21  8:14     ` Paolo Bonzini
2013-08-21  9:11       ` Charlie Shepherd
2013-08-21  9:19         ` Paolo Bonzini
2013-08-21 10:31           ` Charlie Shepherd
2013-08-21 10:38             ` Paolo Bonzini

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).