* [PATCH 1/6] Pool locking code
2011-03-22 21:21 [PATCH 0/6] VG share v2 Zdenek Kabelac
@ 2011-03-22 21:21 ` Zdenek Kabelac
2011-03-23 14:03 ` Joe Thornber
2011-03-22 21:21 ` [PATCH 2/6] Code move vg_mark_partial up in stack Zdenek Kabelac
` (4 subsequent siblings)
5 siblings, 1 reply; 20+ messages in thread
From: Zdenek Kabelac @ 2011-03-22 21:21 UTC (permalink / raw)
To: lvm-devel
Adding specific functions to lock and unlock pool, to modify
specific uint64_t*, and to revert all modified uint64_t back.
Current API is the result of several iterations I've made.
To avoid some more rewrites in case this API would still require some
changes - only pool-fast are currently fully updated for this API.
2 ways to debug code:
mprotect - quite fast all the time - but requires more memory and
currently it is using posix_memalign - this would be
later modified to use dm_malloc() and align internally.
crc - checksum of locked pool - implemented via quick simple hash.
But even this gets quite slow when the pool has bigger size -
thus it's not very effiecient to check with every restore,
so the check is made only when reused VG is released.
Probably needs some disscusion where to move futher...
Signed-off-by: Zdenek Kabelac <zkabelac@redhat.com>
---
libdm/libdevmapper.h | 11 ++
libdm/mm/pool-debug.c | 9 ++
libdm/mm/pool-fast.c | 74 ++++++++++++++-
libdm/mm/pool.c | 246 ++++++++++++++++++++++++++++++++++++++++++++++++-
4 files changed, 337 insertions(+), 3 deletions(-)
diff --git a/libdm/libdevmapper.h b/libdm/libdevmapper.h
index 88bddb1..a0bed97 100644
--- a/libdm/libdevmapper.h
+++ b/libdm/libdevmapper.h
@@ -594,6 +594,17 @@ void *dm_pool_alloc_aligned(struct dm_pool *p, size_t s, unsigned alignment);
void dm_pool_empty(struct dm_pool *p);
void dm_pool_free(struct dm_pool *p, void *ptr);
+/* prevent modification of pool */
+int dm_pool_locked(struct dm_pool *p);
+int dm_pool_lock(struct dm_pool *p, unsigned count)
+ __attribute__((__warn_unused_result__));
+int dm_pool_unlock(struct dm_pool *p)
+ __attribute__((__warn_unused_result__));
+int dm_pool_set_uint64(struct dm_pool *p, uint64_t *addr, uint64_t new_value)
+ __attribute__((__warn_unused_result__));
+int dm_pool_restore(struct dm_pool *p, int check_crc)
+ __attribute__((__warn_unused_result__));
+
/*
* Object building routines:
*
diff --git a/libdm/mm/pool-debug.c b/libdm/mm/pool-debug.c
index fd931e4..206dd84 100644
--- a/libdm/mm/pool-debug.c
+++ b/libdm/mm/pool-debug.c
@@ -33,6 +33,7 @@ struct dm_pool {
struct dm_list list;
const char *name;
void *orig_pool; /* to pair it with first allocation call */
+ unsigned locked;
int begun;
struct block *object;
@@ -81,6 +82,8 @@ static void _free_blocks(struct dm_pool *p, struct block *b)
{
struct block *n;
+ assert(!p->locked);
+
while (b) {
p->stats.bytes -= b->size;
p->stats.blocks_allocated--;
@@ -94,6 +97,8 @@ static void _free_blocks(struct dm_pool *p, struct block *b)
static void _pool_stats(struct dm_pool *p, const char *action)
{
+ assert(!p->locked);
+
#ifdef DEBUG_POOL
log_debug("%s mempool %s: %u/%u bytes, %u/%u blocks, "
"%u allocations)", action, p->name, p->stats.bytes,
@@ -119,6 +124,8 @@ void *dm_pool_alloc(struct dm_pool *p, size_t s)
static void _append_block(struct dm_pool *p, struct block *b)
{
+ assert(!p->locked);
+
if (p->tail) {
p->tail->next = b;
p->tail = b;
@@ -226,6 +233,8 @@ int dm_pool_grow_object(struct dm_pool *p, const void *extra, size_t delta)
struct block *new;
size_t new_size;
+ assert(!p->locked);
+
if (!delta)
delta = strlen(extra);
diff --git a/libdm/mm/pool-fast.c b/libdm/mm/pool-fast.c
index 29d61a4..5d2643f 100644
--- a/libdm/mm/pool-fast.c
+++ b/libdm/mm/pool-fast.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved.
- * Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
*
* This file is part of the device-mapper userspace tools.
*
@@ -18,6 +18,8 @@
#endif
#include "dmlib.h"
+#include <sys/mman.h>
+#include <malloc.h>
struct chunk {
char *begin, *end;
@@ -32,6 +34,10 @@ struct dm_pool {
size_t chunk_size;
size_t object_len;
unsigned object_alignment;
+ int locked;
+ long crc;
+ uint64_t *volatile_mem;
+ unsigned volatile_count;
};
static void _align_chunk(struct chunk *c, unsigned alignment);
@@ -74,6 +80,7 @@ void dm_pool_destroy(struct dm_pool *p)
}
dm_list_del(&p->list);
+ dm_free(p->volatile_mem);
dm_free(p);
}
@@ -254,17 +261,37 @@ static struct chunk *_new_chunk(struct dm_pool *p, size_t s)
{
struct chunk *c;
+#ifdef DEBUG_USE_MPROTECT
+ if (!pagesize) {
+ pagesize = getpagesize(); // lvm_pagesize();
+ pagesize_mask = pagesize - 1;
+ }
+#endif
if (p->spare_chunk &&
((p->spare_chunk->end - p->spare_chunk->begin) >= s)) {
/* reuse old chunk */
c = p->spare_chunk;
p->spare_chunk = 0;
} else {
- if (!(c = dm_malloc(s))) {
+#ifdef DEBUG_USE_MPROTECT
+ /*
+ * Allocate page aligned size so malloc could work.
+ * Otherwise page fault would happen from pool unrelated
+ * memory writes of internal malloc pointers.
+ */
+#define aligned_malloc(s) (posix_memalign((void**)&c, pagesize, \
+ ALIGN_ON_PAGE(s)) == 0)
+#else
+#define aligned_malloc(s) (c = dm_malloc(s))
+#endif
+ /* mem for internals */
+ if (!aligned_malloc(s)) {
+#undef aligned_malloc
log_error("Out of memory. Requested %" PRIsize_t
" bytes.", s);
return NULL;
}
+ //log_error("RESET %p %p %d a:%d", c, c + s, (int)s, (int)ALIGN_ON_PAGE(s));
c->begin = (char *) (c + 1);
c->end = (char *) c + s;
@@ -283,3 +310,46 @@ static void _free_chunk(struct chunk *c)
{
dm_free(c);
}
+
+/*
+ * Calc hash from pool memory chunks with internal pointers
+ */
+static long _pool_crc(const struct dm_pool *p)
+{
+ long crc_hash = 0;
+#ifndef DEBUG_USE_MPROTECT
+ const struct chunk *c;
+ const long *ptr, *end;
+
+ for (c = p->chunk; c; c = c->prev) {
+ end = (const long *) (c->begin < c->end ? (long) c->begin & ~7: (long) c->end);
+ ptr = (const long *) c;
+#ifdef VALGRIND_POOL
+ VALGRIND_MAKE_MEM_DEFINED(ptr, (end - ptr) * sizeof(*end));
+#endif
+ while (ptr < end) {
+ crc_hash += *ptr++;
+ crc_hash += (crc_hash << 10);
+ crc_hash ^= (crc_hash >> 6);
+ }
+ }
+#endif
+
+ return crc_hash;
+}
+
+static int _pool_protect(struct dm_pool *p, int prot)
+{
+#ifdef DEBUG_USE_MPROTECT
+ struct chunk *c;
+
+ for (c = p->chunk; c; c = c->prev) {
+ //log_error("protecting %d %p 0x%x", prot, c, (int) ((char*)c->end - (char*)c) - 1);
+ if (mprotect(c, (size_t) ((c->end - (char *) c) - 1), prot) != 0) {
+ log_sys_error("mprotect", "");
+ return 0;
+ }
+ }
+#endif
+ return 1;
+}
diff --git a/libdm/mm/pool.c b/libdm/mm/pool.c
index 608826b..9f036a8 100644
--- a/libdm/mm/pool.c
+++ b/libdm/mm/pool.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved.
- * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
*
* This file is part of the device-mapper userspace tools.
*
@@ -15,10 +15,33 @@
#include "dmlib.h"
+/*
+ * Use mprotect system call to ensure all locked pages are not writeable
+ * Generates segmentation fault with write access to locked pool.
+ *
+ * 1. Implementation is using posix_memalign() to get page aligned
+ * memory blocks (could be implemented also through malloc)
+ *
+ * 2. Only pool-fast is properly handle for now
+ *
+ * 3. As all pools could be locked now - using this aligned malloc for
+ * each pool - this is not memory effiecient
+ *
+ * 4. Checksum is quite slow compared to mprotect if it would be performed
+ * with every restore operation.
+ */
+#define DEBUG_USE_MPROTECT 1
+
/* FIXME: thread unsafe */
static DM_LIST_INIT(_dm_pools);
void dm_pools_check_leaks(void);
+#ifdef DEBUG_USE_MPROTECT
+static size_t pagesize = 0;
+static size_t pagesize_mask = 0;
+#define ALIGN_ON_PAGE(size) (((size) + (pagesize_mask)) & ~(pagesize_mask))
+#endif
+
#ifdef DEBUG_POOL
#include "pool-debug.c"
#else
@@ -75,3 +98,224 @@ void dm_pools_check_leaks(void)
#endif
}
}
+
+/**
+ * Status of locked pool.
+ *
+ * \param p
+ * Pool to be locked.
+ *
+ * \return
+ * 1 (success) when the pool is locked, 0 otherwise.
+ */
+int dm_pool_locked(struct dm_pool *p)
+{
+ return p->locked;
+}
+
+/**
+ * Lock memory pool.
+ *
+ * \param p
+ * Pool to be locked.
+ *
+ * \param count
+ * Specifies the amount of 64bit pairs to be placed in
+ * volatile exception store. When count is 0, pool is
+ * locked without reallocation of exception store and
+ * crc recomputation. This is useful for temporal pool
+ * unlocking and locking.
+ *
+ * \return
+ * 1 (success) when the pool was preperly locked, 0 otherwise.
+ */
+int dm_pool_lock(struct dm_pool *p, unsigned count)
+{
+ if (p->locked) {
+ log_error(INTERNAL_ERROR "Pool is already locked.");
+ return 0;
+ }
+
+ if (count) {
+ if (count != p->volatile_count) {
+ dm_free(p->volatile_mem);
+
+ if (!(p->volatile_mem = dm_malloc(2 * sizeof(uint64_t) * (count + 1)))) {
+ log_error("Failed to allocate exception store for pool.");
+ return 0;
+ }
+
+ p->volatile_count = count;
+ }
+
+ p->volatile_mem[0] = 0; /* Reset volatile buffer */
+ p->crc = _pool_crc(p); /* Get crc for pool */
+ } else if (!p->volatile_count) {
+ log_error(INTERNAL_ERROR "Pool locking without exception store specified.");
+ return 0;
+ }
+
+ if (!_pool_protect(p, PROT_READ)) {
+ _pool_protect(p, PROT_READ | PROT_WRITE);
+ return_0;
+ }
+
+ p->locked = 1;
+
+ log_debug("Pool %p locked.", p);
+
+ return 1;
+}
+
+/**
+ * Unlock memory pool.
+ *
+ * \param p
+ * Pool to be unlocked
+ *
+ * \return
+ * 1 (success) when the pool was properly locked, 0 otherwise.
+ */
+int dm_pool_unlock(struct dm_pool *p)
+{
+ if (!p->locked) {
+ log_error(INTERNAL_ERROR "Pool is already unlocked.");
+ return 0;
+ }
+
+ p->locked = 0;
+
+ if (!_pool_protect(p, PROT_READ | PROT_WRITE))
+ return_0;
+
+ log_debug("Pool %p unlocked.", p);
+
+ return 1;
+}
+
+static int _mprotect_addr(uint64_t *addr, int prot)
+{
+#ifdef DEBUG_USE_MPROTECT
+ int bounds = (((long) addr & (pagesize_mask)) < (pagesize - 8))
+ ? pagesize_mask : pagesize_mask + pagesize;
+
+ if (mprotect((void *) ((long) addr & ~pagesize_mask), bounds, prot)
+ != 0) {
+ log_sys_error("mprotect", (prot == PROT_READ) ? "READ" : "WRITE");
+ return 0;
+ }
+#endif
+ return 1;
+}
+
+/**
+ * Modify uint64_t value in pool
+ *
+ * \param p
+ * Pool which constains modified address.
+ *
+ * \param addr
+ * Address of the modified value. Original content is saved in exception store.
+ *
+ * \param d
+ * New value to be set.
+ *
+ * \return
+ * 1 (success) if the value was updated, 0 otherwise.
+ *
+ * TODO:
+ * More advanced algorithm would be needed if the exception
+ * area size grows to so large number of pairs.
+ * Currently it also appears to be enough to handle just 64bit pairs.
+ */
+int dm_pool_set_uint64(struct dm_pool *p, uint64_t *addr, uint64_t d)
+{
+ unsigned i;
+
+ if (!p->locked) {
+ *addr = d;
+ return 1;
+ }
+
+ for (i = 0; p->volatile_mem[i] != (uint64_t) addr; i += 2)
+ if (!p->volatile_mem[i]) {
+ /* Add new address and original value to exception store */
+ if ((i + 2) >= (p->volatile_count * 2)) {
+ log_error(INTERNAL_ERROR
+ "Exception store with %u size is full.",
+ (unsigned) p->volatile_count);
+ return 0;
+ }
+
+ log_debug("Saving protected %u: %p %" PRIu64
+ " -> %" PRIu64 ".", (unsigned) i / 2,
+ addr, *addr, d);
+ p->volatile_mem[i] = (uint64_t) addr;
+ p->volatile_mem[i + 1] = *addr;
+ p->volatile_mem[i + 2] = 0;
+ break;
+ }
+
+ if (!_mprotect_addr(addr, PROT_READ | PROT_WRITE))
+ return_0;
+
+ log_debug("Changing protected: %p %" PRIu64 " -> %" PRIu64 ".",
+ addr, *addr, d);
+ *addr = d;
+
+ if (!_mprotect_addr(addr, PROT_READ))
+ return_0;
+
+ return 1;
+}
+
+/**
+ * Restore original content in pool and checks whether
+ * it is matching the crc calculated at pool lock.
+ *
+ * \param p
+ * Locked pool to be restored.
+ *
+ * \param check_crc
+ * Boolean whether to check crc after restore.
+ * Checksum of larger pool size is slow.
+ *
+ * \return
+ * 1 (success) if the restore was successful, 0 otherwise.
+ */
+int dm_pool_restore(struct dm_pool *p, int check_crc)
+{
+ unsigned i;
+
+ if (!p->locked) {
+ log_error(INTERNAL_ERROR "Restore of unlocked pool.");
+ return 0;
+ }
+
+ /* Unlock all pages for modified addresses */
+ for (i = 0; p->volatile_mem[i]; i += 2)
+ if (!_mprotect_addr((uint64_t *)(long)p->volatile_mem[i],
+ PROT_READ | PROT_WRITE))
+ return_0;
+
+ /* Restore saved values from exception store */
+ for (i = 0; p->volatile_mem[i]; i += 2) {
+ log_debug("Restoring %u 0x%" PRIx64 " = %" PRIu64 ".", i / 2,
+ p->volatile_mem[i], p->volatile_mem[i + 1]);
+ ((uint64_t *)(long)p->volatile_mem[i])[0] = p->volatile_mem[i + 1];
+ }
+
+ /* Lock again modified pages */
+ for (i = 0; p->volatile_mem[i]; i += 2)
+ if (!_mprotect_addr((uint64_t *)p->volatile_mem[i], PROT_READ))
+ return_0;
+
+ p->volatile_mem[0] = 0;
+
+ if (check_crc && p->crc != _pool_crc(p)) {
+ log_error(INTERNAL_ERROR "Pool content was modified.");
+ return 0;
+ }
+
+ return 1;
+}
--
1.7.4.1
^ permalink raw reply related [flat|nested] 20+ messages in thread* [PATCH 1/6] Pool locking code
2011-03-22 21:21 ` [PATCH 1/6] Pool locking code Zdenek Kabelac
@ 2011-03-23 14:03 ` Joe Thornber
2011-03-23 14:43 ` Zdenek Kabelac
2011-03-23 14:54 ` Zdenek Kabelac
0 siblings, 2 replies; 20+ messages in thread
From: Joe Thornber @ 2011-03-23 14:03 UTC (permalink / raw)
To: lvm-devel
On Tue, 2011-03-22 at 22:21 +0100, Zdenek Kabelac wrote:
> Adding specific functions to lock and unlock pool, to modify
> specific uint64_t*, and to revert all modified uint64_t back.
> +/* prevent modification of pool */
> +int dm_pool_locked(struct dm_pool *p);
> +int dm_pool_lock(struct dm_pool *p, unsigned count)
> + __attribute__((__warn_unused_result__));
> +int dm_pool_unlock(struct dm_pool *p)
> + __attribute__((__warn_unused_result__));
> +int dm_pool_set_uint64(struct dm_pool *p, uint64_t *addr, uint64_t new_value)
> + __attribute__((__warn_unused_result__));
> +int dm_pool_restore(struct dm_pool *p, int check_crc)
> + __attribute__((__warn_unused_result__));
> +
Hi Kabi,
I like what you're trying to do here. But I don't really like the above
interface, particularly the dm_pool_set_uint64() method which is going
to be ugly to use.
Ideally I guess I'd prefer something like:
struct dm_pool_snapshot;
struct dm_pool_snapshot *dm_pool_take_snapshot(struct dm_pool *p);
void dm_pool_destroy_snapshot(struct dm_pool_snapshot *snap);
int dm_pool_compare(struct dm_pool *p, struct dm_pool_snapshot *snap);
int dm_pool_restore_snapshot(struct dm_pool *p, struct dm_pool_snapshot *snap, int release_subsequently_allocated_data);
You'd use the compare function within assertions to check nobody has
changed a pool unexpectedly. There's no concept of a locked pool, you
don't have to change the existing pool code to check if you're locked.
Also you don't need a special interface for changing data within a
locked pool.
The simple way of implementing the above is to just make a copy of the
pool data within the snapshot. Which of course is a very slow op. So
could you give me an idea of the size of a typical pool when you are
locking it? How frequently do you lock?
- Joe
^ permalink raw reply [flat|nested] 20+ messages in thread* [PATCH 1/6] Pool locking code
2011-03-23 14:03 ` Joe Thornber
@ 2011-03-23 14:43 ` Zdenek Kabelac
2011-03-23 14:53 ` Joe Thornber
2011-03-23 14:54 ` Zdenek Kabelac
1 sibling, 1 reply; 20+ messages in thread
From: Zdenek Kabelac @ 2011-03-23 14:43 UTC (permalink / raw)
To: lvm-devel
Dne 23.3.2011 15:03, Joe Thornber napsal(a):
> On Tue, 2011-03-22 at 22:21 +0100, Zdenek Kabelac wrote:
>> Adding specific functions to lock and unlock pool, to modify
>> specific uint64_t*, and to revert all modified uint64_t back.
>
>> +/* prevent modification of pool */
>> +int dm_pool_locked(struct dm_pool *p);
>> +int dm_pool_lock(struct dm_pool *p, unsigned count)
>> + __attribute__((__warn_unused_result__));
>> +int dm_pool_unlock(struct dm_pool *p)
>> + __attribute__((__warn_unused_result__));
>> +int dm_pool_set_uint64(struct dm_pool *p, uint64_t *addr, uint64_t new_value)
>> + __attribute__((__warn_unused_result__));
>> +int dm_pool_restore(struct dm_pool *p, int check_crc)
>> + __attribute__((__warn_unused_result__));
>> +
>
> Hi Kabi,
>
> I like what you're trying to do here. But I don't really like the above
> interface, particularly the dm_pool_set_uint64() method which is going
> to be ugly to use.
>
> Ideally I guess I'd prefer something like:
>
> struct dm_pool_snapshot;
>
> struct dm_pool_snapshot *dm_pool_take_snapshot(struct dm_pool *p);
> void dm_pool_destroy_snapshot(struct dm_pool_snapshot *snap);
> int dm_pool_compare(struct dm_pool *p, struct dm_pool_snapshot *snap);
> int dm_pool_restore_snapshot(struct dm_pool *p, struct dm_pool_snapshot *snap, int release_subsequently_allocated_data);
>
> You'd use the compare function within assertions to check nobody has
> changed a pool unexpectedly. There's no concept of a locked pool, you
> don't have to change the existing pool code to check if you're locked.
> Also you don't need a special interface for changing data within a
> locked pool.
>
> The simple way of implementing the above is to just make a copy of the
> pool data within the snapshot. Which of course is a very slow op. So
> could you give me an idea of the size of a typical pool when you are
> locking it? How frequently do you lock?
>
It's been my first idea - to copy all chunks to separate buffer - but the VG
mempool size for large VGs is actually quite big - thus doing always a
snapshot isn't performance optimal. I'm doing my tests with ~7000LV - and it
would be quite noticable to copy all the pool chunks with each use of it.
That's why I came with keeping only few separate values - which is way more
efficient in this case.
As you can see in patch 4,5,6 only very few modifications are being made to
the pool itself - so the restore is usually only for few uin64_t in exception
area.
I'm aware the API isn't the best - but it seems the most efficient I could
think of for now.
We may probably trade of the speed with mirroring whole mempool structure.
But I wanted to show - that only few stucture members are usually modified
during the lifetime of VG structure (thought yes - using the knowledge of the
_lv_postoder() where I intentionally disable the locking - as then I'd get
status of each LV modified - not very efficient with current exception store)
So basically - lock is done when the VG is parsed for the first time.
(See Patch 3) - during the whole live of this shared VG (except for
_lv_postoder()) - it's kept locked. Thus with mprotect-ing enabled - it's
impossible to write to this pool without using dm_pool_set() as you get
immediately segfault. It gets unlocked when it's being released to free vginfo
cache - or when it's droped because RW usage is needed.
It would be nice to have 'COW' in user space - but have no idea how to
implement this.
Zdenek
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 1/6] Pool locking code
2011-03-23 14:43 ` Zdenek Kabelac
@ 2011-03-23 14:53 ` Joe Thornber
0 siblings, 0 replies; 20+ messages in thread
From: Joe Thornber @ 2011-03-23 14:53 UTC (permalink / raw)
To: lvm-devel
On Wed, 2011-03-23 at 15:43 +0100, Zdenek Kabelac wrote:
> It's been my first idea - to copy all chunks to separate buffer - but the VG
> mempool size for large VGs is actually quite big - thus doing always a
> snapshot isn't performance optimal. I'm doing my tests with ~7000LV - and it
> would be quite noticable to copy all the pool chunks with each use of it.
Can you define 'quite big' please?
- Joe
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 1/6] Pool locking code
2011-03-23 14:03 ` Joe Thornber
2011-03-23 14:43 ` Zdenek Kabelac
@ 2011-03-23 14:54 ` Zdenek Kabelac
2011-03-23 15:13 ` Joe Thornber
1 sibling, 1 reply; 20+ messages in thread
From: Zdenek Kabelac @ 2011-03-23 14:54 UTC (permalink / raw)
To: lvm-devel
Dne 23.3.2011 15:03, Joe Thornber napsal(a):
> On Tue, 2011-03-22 at 22:21 +0100, Zdenek Kabelac wrote:
>> Adding specific functions to lock and unlock pool, to modify
>> specific uint64_t*, and to revert all modified uint64_t back.
>
>> +/* prevent modification of pool */
>> +int dm_pool_locked(struct dm_pool *p);
>> +int dm_pool_lock(struct dm_pool *p, unsigned count)
>> + __attribute__((__warn_unused_result__));
>> +int dm_pool_unlock(struct dm_pool *p)
>> + __attribute__((__warn_unused_result__));
>> +int dm_pool_set_uint64(struct dm_pool *p, uint64_t *addr, uint64_t new_value)
>> + __attribute__((__warn_unused_result__));
>> +int dm_pool_restore(struct dm_pool *p, int check_crc)
>> + __attribute__((__warn_unused_result__));
>> +
>
> Hi Kabi,
>
> I like what you're trying to do here. But I don't really like the above
> interface, particularly the dm_pool_set_uint64() method which is going
> to be ugly to use.
>
> Ideally I guess I'd prefer something like:
>
> struct dm_pool_snapshot;
>
> struct dm_pool_snapshot *dm_pool_take_snapshot(struct dm_pool *p);
> void dm_pool_destroy_snapshot(struct dm_pool_snapshot *snap);
> int dm_pool_compare(struct dm_pool *p, struct dm_pool_snapshot *snap);
> int dm_pool_restore_snapshot(struct dm_pool *p, struct dm_pool_snapshot *snap, int release_subsequently_allocated_data);
>
> You'd use the compare function within assertions to check nobody has
> changed a pool unexpectedly. There's no concept of a locked pool, you
> don't have to change the existing pool code to check if you're locked.
> Also you don't need a special interface for changing data within a
> locked pool.
>
> The simple way of implementing the above is to just make a copy of the
> pool data within the snapshot. Which of course is a very slow op. So
> could you give me an idea of the size of a typical pool when you are
> locking it? How frequently do you lock?
>
Forget to answer the size question - >8MB is allocated by the VG mempool when
set of simple linear 7200LVs on 1 PV are in use.
Also that's why I mention in my patchset that doing even quick crc hash
checksum is quite slow - I except full memcpy would be even slower - and I do
that only with finally pool unlock).
Zdenek
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 1/6] Pool locking code
2011-03-23 14:54 ` Zdenek Kabelac
@ 2011-03-23 15:13 ` Joe Thornber
2011-03-23 15:57 ` Zdenek Kabelac
0 siblings, 1 reply; 20+ messages in thread
From: Joe Thornber @ 2011-03-23 15:13 UTC (permalink / raw)
To: lvm-devel
On Wed, 2011-03-23 at 15:54 +0100, Zdenek Kabelac wrote:
> Forget to answer the size question - >8MB is allocated by the VG
> mempool when
> set of simple linear 7200LVs on 1 PV are in use.
So I think each snapshot/restore of a VG pool of that size is going to
take ~0.1 seconds. So now my question is how many times are we
expecting to happen during a typical tool invocation? Your previous
reply suggested it happens just once?
- Joe
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 1/6] Pool locking code
2011-03-23 15:13 ` Joe Thornber
@ 2011-03-23 15:57 ` Zdenek Kabelac
2011-03-23 16:33 ` Zdenek Kabelac
2011-03-23 18:41 ` Joe Thornber
0 siblings, 2 replies; 20+ messages in thread
From: Zdenek Kabelac @ 2011-03-23 15:57 UTC (permalink / raw)
To: lvm-devel
Dne 23.3.2011 16:13, Joe Thornber napsal(a):
> On Wed, 2011-03-23 at 15:54 +0100, Zdenek Kabelac wrote:
>> Forget to answer the size question - >8MB is allocated by the VG
>> mempool when
>> set of simple linear 7200LVs on 1 PV are in use.
>
> So I think each snapshot/restore of a VG pool of that size is going to
> take ~0.1 seconds. So now my question is how many times are we
> expecting to happen during a typical tool invocation? Your previous
> reply suggested it happens just once?
If there would be just 1 vg read for 7200 activate call - obviously I'd not
need to play with this dm_pool locking :)
So there is 7200 vg reads - which will use this one locked pool with my patch set:
To give some time info here
(using my unoptimized gcc debug build on C2D 2.2GHz - but with few more
patches applied than current upstream)
Deactivate 7200LV (7200 already active)
real 1m16.140s
user 0m5.686s
sys 0m31.410s
Deactivate 7200LV (non active)
real 0m2.121s
user 0m1.083s
sys 0m1.032s
Activate 7200 (non active)
real 2m19.324s
user 0m6.503s
sys 1m30.785s
Activate 7200 (7200 already active)
real 0m10.945s
user 0m1.422s
sys 0m9.502s
Zdenek
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 1/6] Pool locking code
2011-03-23 15:57 ` Zdenek Kabelac
@ 2011-03-23 16:33 ` Zdenek Kabelac
2011-03-23 17:29 ` Zdenek Kabelac
2011-03-23 18:41 ` Joe Thornber
1 sibling, 1 reply; 20+ messages in thread
From: Zdenek Kabelac @ 2011-03-23 16:33 UTC (permalink / raw)
To: lvm-devel
Dne 23.3.2011 16:57, Zdenek Kabelac napsal(a):
> Dne 23.3.2011 16:13, Joe Thornber napsal(a):
>> On Wed, 2011-03-23 at 15:54 +0100, Zdenek Kabelac wrote:
>>> Forget to answer the size question - >8MB is allocated by the VG
>>> mempool when
>>> set of simple linear 7200LVs on 1 PV are in use.
>>
>> So I think each snapshot/restore of a VG pool of that size is going to
>> take ~0.1 seconds. So now my question is how many times are we
>> expecting to happen during a typical tool invocation? Your previous
>> reply suggested it happens just once?
>
>
> If there would be just 1 vg read for 7200 activate call - obviously I'd not
> need to play with this dm_pool locking :)
>
> So there is 7200 vg reads - which will use this one locked pool with my patch set:
>
> To give some time info here
> (using my unoptimized gcc debug build on C2D 2.2GHz - but with few more
> patches applied than current upstream)
>
> Deactivate 7200LV (7200 already active)
> real 1m16.140s
> user 0m5.686s
> sys 0m31.410s
>
>
> Deactivate 7200LV (non active)
> real 0m2.121s
> user 0m1.083s
> sys 0m1.032s
>
>
> Activate 7200 (non active)
> real 2m19.324s
> user 0m6.503s
> sys 1m30.785s
>
> Activate 7200 (7200 already active)
> real 0m10.945s
> user 0m1.422s
> sys 0m9.502s
>
I think good example could be this - when I disable mprotect-ing in my
patchset - and enable crc checking with each dm_pool_restore (which should be
similar to memcpy) (otherwise crc is checked only with final pool unlock)
Deactivate 7200LV
real 1m45.353s
user 0m38.128s
sys 0m27.395s
Deactivate 7200LV
real 0m36.948s
user 0m35.284s
sys 0m1.642s
Activate 7200 (non active)
real 2m31.065s
user 0m37.740s
sys 1m18.481s
(here could be observed - thet mprotecting takes 12s)
Activate 7200 (7200 already active)
real 0m52.188s
user 0m35.570s
sys 0m16.563s
So I'd estimate like 33seconds would be wasted.
--
And to complete the picture - crc only with unlock - and without pool
mprotect-ing - assuming this would be normal work.
Deactivate 7200LV
real 1m9.989s
user 0m4.584s
sys 0m26.882s
Deactivate 7200LV
real 0m2.119s
user 0m1.087s
sys 0m1.025s
Activate 7200 (non active)
real 2m11.363s
user 0m4.696s
sys 1m15.393s
Activate 7200 (7200 already active)
real 0m11.127s
user 0m1.392s
sys 0m9.669s
Zdenek
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 1/6] Pool locking code
2011-03-23 16:33 ` Zdenek Kabelac
@ 2011-03-23 17:29 ` Zdenek Kabelac
0 siblings, 0 replies; 20+ messages in thread
From: Zdenek Kabelac @ 2011-03-23 17:29 UTC (permalink / raw)
To: lvm-devel
Dne 23.3.2011 17:33, Zdenek Kabelac napsal(a):
> Dne 23.3.2011 16:57, Zdenek Kabelac napsal(a):
>> Dne 23.3.2011 16:13, Joe Thornber napsal(a):
>>> On Wed, 2011-03-23 at 15:54 +0100, Zdenek Kabelac wrote:
>>>> Forget to answer the size question - >8MB is allocated by the VG
>>>> mempool when
>>>> set of simple linear 7200LVs on 1 PV are in use.
>>>
>>> So I think each snapshot/restore of a VG pool of that size is going to
>>> take ~0.1 seconds. So now my question is how many times are we
>>> expecting to happen during a typical tool invocation? Your previous
>>> reply suggested it happens just once?
>>
>>
>> If there would be just 1 vg read for 7200 activate call - obviously I'd not
>> need to play with this dm_pool locking :)
>>
>> So there is 7200 vg reads - which will use this one locked pool with my patch set:
>>
>> To give some time info here
>> (using my unoptimized gcc debug build on C2D 2.2GHz - but with few more
>> patches applied than current upstream)
>>
>> Deactivate 7200LV (7200 already active)
>> real 1m16.140s
>> user 0m5.686s
>> sys 0m31.410s
>>
>>
>> Deactivate 7200LV (non active)
>> real 0m2.121s
>> user 0m1.083s
>> sys 0m1.032s
>>
>>
>> Activate 7200 (non active)
>> real 2m19.324s
>> user 0m6.503s
>> sys 1m30.785s
>>
>> Activate 7200 (7200 already active)
>> real 0m10.945s
>> user 0m1.422s
>> sys 0m9.502s
>>
>
> I think good example could be this - when I disable mprotect-ing in my
> patchset - and enable crc checking with each dm_pool_restore (which should be
> similar to memcpy) (otherwise crc is checked only with final pool unlock)
>
> Deactivate 7200LV
> real 1m45.353s
> user 0m38.128s
> sys 0m27.395s
>
> Deactivate 7200LV
> real 0m36.948s
> user 0m35.284s
> sys 0m1.642s
>
> Activate 7200 (non active)
> real 2m31.065s
> user 0m37.740s
> sys 1m18.481s
>
> (here could be observed - thet mprotecting takes 12s)
>
> Activate 7200 (7200 already active)
> real 0m52.188s
> user 0m35.570s
> sys 0m16.563s
>
>
>
> So I'd estimate like 33seconds would be wasted.
>
> --
>
> And to complete the picture - crc only with unlock - and without pool
> mprotect-ing - assuming this would be normal work.
>
> Deactivate 7200LV
> real 1m9.989s
> user 0m4.584s
> sys 0m26.882s
>
> Deactivate 7200LV
> real 0m2.119s
> user 0m1.087s
> sys 0m1.025s
>
> Activate 7200 (non active)
> real 2m11.363s
> user 0m4.696s
> sys 1m15.393s
>
> Activate 7200 (7200 already active)
> real 0m11.127s
> user 0m1.392s
> sys 0m9.669s
>
And as the implementation of memcpy instead of crc was rather very simple -
here are exact numbers with memcpy (memcpy being somewhat faster then crc):
Deactivate 7200LV (7200 already active)
real 1m32.725s
user 0m24.828s
sys 0m27.213s
Deactivate 7200LV (non active)
real 0m20.486s
user 0m18.239s
sys 0m2.236s
Activate 7200 (non active)
real 2m33.219s
user 0m24.777s
sys 1m19.996s
Activate 7200 (7200 already active)
real 0m36.627s
user 0m18.474s
sys 0m18.065s
Zdenek
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 1/6] Pool locking code
2011-03-23 15:57 ` Zdenek Kabelac
2011-03-23 16:33 ` Zdenek Kabelac
@ 2011-03-23 18:41 ` Joe Thornber
2011-03-23 19:01 ` Zdenek Kabelac
1 sibling, 1 reply; 20+ messages in thread
From: Joe Thornber @ 2011-03-23 18:41 UTC (permalink / raw)
To: lvm-devel
On Wed, 2011-03-23 at 16:57 +0100, Zdenek Kabelac wrote:
> If there would be just 1 vg read for 7200 activate call - obviously
> I'd not
> need to play with this dm_pool locking :)
>
> So there is 7200 vg reads - which will use this one locked pool with
> my patch set:
ok, so I'm playing catch-up; This locking functionality is debug only.
In the near term the set/restore api wont be needed at all, at which
point that interface can be removed. We should leave the locking side
for regression tests.
So I'll ACK the patch.
- Joe
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 1/6] Pool locking code
2011-03-23 18:41 ` Joe Thornber
@ 2011-03-23 19:01 ` Zdenek Kabelac
2011-03-24 12:21 ` Joe Thornber
0 siblings, 1 reply; 20+ messages in thread
From: Zdenek Kabelac @ 2011-03-23 19:01 UTC (permalink / raw)
To: lvm-devel
Dne 23.3.2011 19:41, Joe Thornber napsal(a):
> On Wed, 2011-03-23 at 16:57 +0100, Zdenek Kabelac wrote:
>> If there would be just 1 vg read for 7200 activate call - obviously
>> I'd not
>> need to play with this dm_pool locking :)
>>
>> So there is 7200 vg reads - which will use this one locked pool with
>> my patch set:
>
> ok, so I'm playing catch-up; This locking functionality is debug only.
> In the near term the set/restore api wont be needed at all, at which
> point that interface can be removed. We should leave the locking side
> for regression tests.
Yes - the locking has rather 'debug' semantic to catch writes to vg mempool.
But I'm not sure what do you mean here by 'set/restore api won't be needed' ?
As currently I need to restore the content of the modified part of mempool to
return always the same memory like it would be created 1st time.
(Unless you mean here, that you would rather change the code which modifies
lv/vg->status - so these flag would not need to be restored - which would be a
nice option - but would certainly require bigger modification of the code.)
Zdenek
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 1/6] Pool locking code
2011-03-23 19:01 ` Zdenek Kabelac
@ 2011-03-24 12:21 ` Joe Thornber
2011-03-24 12:45 ` Zdenek Kabelac
0 siblings, 1 reply; 20+ messages in thread
From: Joe Thornber @ 2011-03-24 12:21 UTC (permalink / raw)
To: lvm-devel
On Wed, 2011-03-23 at 20:01 +0100, Zdenek Kabelac wrote:
>
> Yes - the locking has rather 'debug' semantic to catch writes to vg
> mempool.
>
> But I'm not sure what do you mean here by 'set/restore api won't be
> needed' ?
I had a quick chat about this with Alasdair yesterday. The plan is to
change the code that updates the VGs so they don't touch the VG data in
the pool, but instead hold those small changes off to one side.
So the steps you seem to be following are:
i) debug lock/unlock api for pool to identify mutating code. (Useful in
perpetuity for regression tests.)
ii) set/restore api for pool (temporary).
iii) update the mutating code to use set.
iv) gradually get rid of the use of set/restore on a case by case basis.
v) remove the set/restore api
There is an argument that set/restore shouldn't ever go into HEAD, but
is instead something solely useful to you while you do this work. But
since this work is going to take some time I see no problem with this
going in on the understanding that it comes out again as soon as
possible.
- Joe
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 1/6] Pool locking code
2011-03-24 12:21 ` Joe Thornber
@ 2011-03-24 12:45 ` Zdenek Kabelac
2011-03-24 13:50 ` Zdenek Kabelac
0 siblings, 1 reply; 20+ messages in thread
From: Zdenek Kabelac @ 2011-03-24 12:45 UTC (permalink / raw)
To: lvm-devel
Dne 24.3.2011 13:21, Joe Thornber napsal(a):
> On Wed, 2011-03-23 at 20:01 +0100, Zdenek Kabelac wrote:
>>
>> Yes - the locking has rather 'debug' semantic to catch writes to vg
>> mempool.
>>
>> But I'm not sure what do you mean here by 'set/restore api won't be
>> needed' ?
>
> I had a quick chat about this with Alasdair yesterday. The plan is to
> change the code that updates the VGs so they don't touch the VG data in
> the pool, but instead hold those small changes off to one side.
>
> So the steps you seem to be following are:
>
> i) debug lock/unlock api for pool to identify mutating code. (Useful in
> perpetuity for regression tests.)
> ii) set/restore api for pool (temporary).
> iii) update the mutating code to use set.
> iv) gradually get rid of the use of set/restore on a case by case basis.
> v) remove the set/restore api
>
> There is an argument that set/restore shouldn't ever go into HEAD, but
> is instead something solely useful to you while you do this work. But
> since this work is going to take some time I see no problem with this
> going in on the understanding that it comes out again as soon as
> possible.
Yeah - I've been already thinking about this -
I could probably make even API for dm_pool_set & restore purely internal -
so only lvm would know about it - by copying internal structures.
So we would not need to expose this in public libdm API for a while.
And I've been also thinking about changing the places which are modifying
'read-only' VG structure members - but at this moment it's somewhat bigger patch.
My idea is - to use for this variable 'offset' inside volume_group structure,
and to access it for read/write - one must deference it through array access
over the exception store area (I'd stay with uint64_t data)
Depends on whether we accept this direction.
As for now it seems there is not so many accesses - so it looks like hopefully
'relatively' small amount of code needs to be changed.
If we would agree on the way we access them - it's probably easy to write
before it hits upstream - to avoid hassles with internal structures.
Zdenek
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 1/6] Pool locking code
2011-03-24 12:45 ` Zdenek Kabelac
@ 2011-03-24 13:50 ` Zdenek Kabelac
0 siblings, 0 replies; 20+ messages in thread
From: Zdenek Kabelac @ 2011-03-24 13:50 UTC (permalink / raw)
To: lvm-devel
Dne 24.3.2011 13:45, Zdenek Kabelac napsal(a):
> Dne 24.3.2011 13:21, Joe Thornber napsal(a):
>> On Wed, 2011-03-23 at 20:01 +0100, Zdenek Kabelac wrote:
>>>
>>> Yes - the locking has rather 'debug' semantic to catch writes to vg
>>> mempool.
>>>
>>> But I'm not sure what do you mean here by 'set/restore api won't be
>>> needed' ?
>>
>> I had a quick chat about this with Alasdair yesterday. The plan is to
>> change the code that updates the VGs so they don't touch the VG data in
>> the pool, but instead hold those small changes off to one side.
>>
>> So the steps you seem to be following are:
>>
>> i) debug lock/unlock api for pool to identify mutating code. (Useful in
>> perpetuity for regression tests.)
>> ii) set/restore api for pool (temporary).
>> iii) update the mutating code to use set.
>> iv) gradually get rid of the use of set/restore on a case by case basis.
>> v) remove the set/restore api
>>
>> There is an argument that set/restore shouldn't ever go into HEAD, but
>> is instead something solely useful to you while you do this work. But
>> since this work is going to take some time I see no problem with this
>> going in on the understanding that it comes out again as soon as
>> possible.
>
>
> Yeah - I've been already thinking about this -
> I could probably make even API for dm_pool_set & restore purely internal -
> so only lvm would know about it - by copying internal structures.
>
> So we would not need to expose this in public libdm API for a while.
>
>
> And I've been also thinking about changing the places which are modifying
> 'read-only' VG structure members - but at this moment it's somewhat bigger patch.
>
> My idea is - to use for this variable 'offset' inside volume_group structure,
> and to access it for read/write - one must deference it through array access
> over the exception store area (I'd stay with uint64_t data)
>
> Depends on whether we accept this direction.
>
> As for now it seems there is not so many accesses - so it looks like hopefully
> 'relatively' small amount of code needs to be changed.
>
> If we would agree on the way we access them - it's probably easy to write
> before it hits upstream - to avoid hassles with internal structures.
>
I should still mention here - that typical use-case of locked VG is to operate
on very limited set of LVs (for just linears - it's == 1)
- thus very short list of exceptions is usually restored (if some at all).
So the idea of restoring all fields of say 3 or more 8byte values of 7000LVs
would be surely more expensive (though it's obviously less then 8MB)
Also the code which would work on references instead of direct structure
members would look less fancy than it's now (As now only 'write operation'
requires 'wrapper').
So I think it's good to well specify what we really want to achieve - and what
are we going to sacrifice for what.
(I know the current API isn't the best - but as I said - it's been already
couple iteration on my own harddrive)
Zdenek
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 2/6] Code move vg_mark_partial up in stack
2011-03-22 21:21 [PATCH 0/6] VG share v2 Zdenek Kabelac
2011-03-22 21:21 ` [PATCH 1/6] Pool locking code Zdenek Kabelac
@ 2011-03-22 21:21 ` Zdenek Kabelac
2011-03-22 21:21 ` [PATCH 3/6] Share VG multiple times Zdenek Kabelac
` (3 subsequent siblings)
5 siblings, 0 replies; 20+ messages in thread
From: Zdenek Kabelac @ 2011-03-22 21:21 UTC (permalink / raw)
To: lvm-devel
It's useful to keep the partial flag cached - so just move the call
for vg_mark_partil_lvs into lvmcache.c
This patch should not present any functional change.
Signed-off-by: Zdenek Kabelac <zkabelac@redhat.com>
---
lib/cache/lvmcache.c | 7 +++++++
lib/metadata/metadata.c | 5 -----
2 files changed, 7 insertions(+), 5 deletions(-)
diff --git a/lib/cache/lvmcache.c b/lib/cache/lvmcache.c
index 037fc4a..d0de34f 100644
--- a/lib/cache/lvmcache.c
+++ b/lib/cache/lvmcache.c
@@ -639,6 +639,7 @@ struct volume_group *lvmcache_get_vg(const char *vgid, unsigned precommitted)
struct volume_group *vg = NULL;
struct format_instance *fid;
struct format_instance_ctx fic;
+ int vg_missing;
if (!vgid || !(vginfo = vginfo_from_vgid(vgid)) || !vginfo->vgmetadata)
return NULL;
@@ -678,6 +679,12 @@ struct volume_group *lvmcache_get_vg(const char *vgid, unsigned precommitted)
if (!(vg = import_vg_from_config_tree(vginfo->cft, fid)))
goto_bad;
+ if ((vg_missing = vg_missing_pv_count(vg))) {
+ log_verbose("There are %d physical volumes missing.",
+ vg_missing);
+ vg_mark_partial_lvs(vg);
+ }
+
log_debug("Using cached %smetadata for VG %s.",
vginfo->precommitted ? "pre-committed" : "", vginfo->vgname);
diff --git a/lib/metadata/metadata.c b/lib/metadata/metadata.c
index f4d4671..b6312a0 100644
--- a/lib/metadata/metadata.c
+++ b/lib/metadata/metadata.c
@@ -2819,11 +2819,6 @@ static struct volume_group *_vg_read(struct cmd_context *cmd,
else /* Inconsistent but we can't repair it */
correct_vg->status &= ~INCONSISTENT_VG;
- if (vg_missing_pv_count(correct_vg)) {
- log_verbose("There are %d physical volumes missing.",
- vg_missing_pv_count(correct_vg));
- vg_mark_partial_lvs(correct_vg);
- }
return correct_vg;
} else {
free_vg(correct_vg);
--
1.7.4.1
^ permalink raw reply related [flat|nested] 20+ messages in thread* [PATCH 3/6] Share VG multiple times
2011-03-22 21:21 [PATCH 0/6] VG share v2 Zdenek Kabelac
2011-03-22 21:21 ` [PATCH 1/6] Pool locking code Zdenek Kabelac
2011-03-22 21:21 ` [PATCH 2/6] Code move vg_mark_partial up in stack Zdenek Kabelac
@ 2011-03-22 21:21 ` Zdenek Kabelac
2011-03-22 21:21 ` [PATCH 4/6] Use dm_pool_set_uint64 Zdenek Kabelac
` (2 subsequent siblings)
5 siblings, 0 replies; 20+ messages in thread
From: Zdenek Kabelac @ 2011-03-22 21:21 UTC (permalink / raw)
To: lvm-devel
Saves time of the creation of the same volume_group structure.
Structure vginfo is extended with linked list of vg_pool_list elements.
It tracks the number of reuses of every cached VG. As especially clvmd
and few other situation requires more then 1 parsed VG - so using
this vg_pool implementation.
Sharing is not used when VG is opened RW recover mode.
As there is far more case when sharing is possible - use rather specific
lvmcache_drop_vg() call to drop parsed VG from cache - such VG is then
unlocked moved away from the vg pool and it is fully modifiable.
Signed-off-by: Zdenek Kabelac <zkabelac@redhat.com>
---
lib/cache/lvmcache.c | 104 +++++++++++++++++++++++++++++++++++++++++++++-
lib/cache/lvmcache.h | 3 +
lib/metadata/metadata.c | 20 ++++++++-
lib/metadata/vg.h | 2 +
4 files changed, 124 insertions(+), 5 deletions(-)
diff --git a/lib/cache/lvmcache.c b/lib/cache/lvmcache.c
index d0de34f..9ee009d 100644
--- a/lib/cache/lvmcache.c
+++ b/lib/cache/lvmcache.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved.
- * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
*
* This file is part of LVM2.
*
@@ -38,6 +38,13 @@ static int _has_scanned = 0;
static int _vgs_locked = 0;
static int _vg_global_lock_held = 0; /* Global lock held when cache wiped? */
+struct vg_pool_list {
+ struct dm_list list; /* vg_pool list */
+ struct volume_group *vg;
+ int in_use;
+ unsigned use_count; /* Reuse counter for statistics */
+};
+
int lvmcache_init(void)
{
/*
@@ -76,6 +83,10 @@ int lvmcache_init(void)
/* Volume Group metadata cache functions */
static void _free_cached_vgmetadata(struct lvmcache_vginfo *vginfo)
{
+ struct vg_pool_list *vgpl, *tvgpl;
+ struct volume_group *vg;
+ int in_use;
+
if (!vginfo || !vginfo->vgmetadata)
return;
@@ -89,6 +100,17 @@ static void _free_cached_vgmetadata(struct lvmcache_vginfo *vginfo)
vginfo->cft = NULL;
}
+ dm_list_iterate_items_safe(vgpl, tvgpl, &vginfo->vg_pool) {
+ vg = vgpl->vg;
+ in_use = vgpl->in_use;
+ if (!lvmcache_drop_vg(vg))
+ stack;
+ if (in_use)
+ log_debug("Unpooling referenced VG:%p %s", vg, vg->name);
+ else
+ free_vg(vg);
+ }
+
log_debug("Metadata cache: VG %s wiped.", vginfo->vgname);
}
@@ -637,6 +659,7 @@ struct volume_group *lvmcache_get_vg(const char *vgid, unsigned precommitted)
{
struct lvmcache_vginfo *vginfo;
struct volume_group *vg = NULL;
+ struct vg_pool_list *vgpl = NULL;
struct format_instance *fid;
struct format_instance_ctx fic;
int vg_missing;
@@ -663,12 +686,32 @@ struct volume_group *lvmcache_get_vg(const char *vgid, unsigned precommitted)
(!precommitted && vginfo->precommitted && !critical_section()))
return NULL;
+ /* Try to find some usable VG in pool - short list is expected */
+ dm_list_iterate_items(vgpl, &vginfo->vg_pool) {
+ if (!vgpl->in_use) {
+ if (!dm_pool_restore(vgpl->vg->vgmem, 0))
+ /* Should not happen, but ignore such VG */
+ stack;
+ else {
+ vg = vgpl->vg;
+ vgpl->in_use = 1;
+ vgpl->use_count++;
+ goto out;
+ }
+ }
+ }
+
fic.type = FMT_INSTANCE_VG | FMT_INSTANCE_MDAS | FMT_INSTANCE_AUX_MDAS;
fic.context.vg_ref.vg_name = vginfo->vgname;
fic.context.vg_ref.vg_id = vgid;
if (!(fid = vginfo->fmt->ops->create_instance(vginfo->fmt, &fic)))
return_NULL;
+ if (!(vgpl = dm_zalloc(sizeof(*vgpl)))) {
+ log_error("Failed to allocate memory for pool list.");
+ goto bad;
+ }
+
/* Build config tree from vgmetadata, if not yet cached */
if (!vginfo->cft &&
!(vginfo->cft =
@@ -685,17 +728,71 @@ struct volume_group *lvmcache_get_vg(const char *vgid, unsigned precommitted)
vg_mark_partial_lvs(vg);
}
- log_debug("Using cached %smetadata for VG %s.",
- vginfo->precommitted ? "pre-committed" : "", vginfo->vgname);
+ /* Add to VGs pool */
+ vg->vg_pool = vgpl;
+ vgpl->vg = vg;
+ vgpl->in_use = 1; /* Also for goto_bad fail path */
+ dm_list_add(&vginfo->vg_pool, &vgpl->list);
+ log_debug("Add VG:%p %s to the vginfo %p vg_pool (%d).",
+ vg, vg->name, vginfo, dm_list_size(&vginfo->vg_pool));
+ if (!dm_pool_lock(vg->vgmem, 8 + dm_list_size(&vg->lvs)))
+ goto_bad;
+out:
+ log_debug("Using cached (%u) %smetadata for VG %s.", vgpl->use_count,
+ vginfo->precommitted ? "pre-committed " : "", vginfo->vgname);
return vg;
bad:
free_vg(vg);
_free_cached_vgmetadata(vginfo);
+ dm_free(vgpl);
return NULL;
}
+/**
+ * Put back VG as reusable in vg_pool list
+ */
+int lvmcache_put_vg(struct volume_group *vg)
+{
+ if (!vg->vg_pool || !vg->vg_pool->in_use)
+ return 0;
+
+ log_debug("Putting back to cache list VG:%p %s (reused:%d).",
+ vg, vg->name, vg->vg_pool->use_count);
+ vg->vg_pool->in_use = 0;
+ return 1;
+}
+
+/**
+ * Drop VG from the vginfo vg_pool list so it can be used
+ * for modification.
+ */
+int lvmcache_drop_vg(struct volume_group *vg)
+{
+ if (!vg->vg_pool)
+ return 0;
+
+ log_debug("Dropping VG:%p %s from cache (reused:%d).",
+ vg, vg->name, vg->vg_pool->use_count);
+
+ /* Debug perform crc check */
+ if (!vg->vg_pool->in_use &&
+ vg->vg_pool->use_count &&
+ !dm_pool_restore(vg->vgmem, 1))
+ /* Should not happen, but ignore such VG */
+ stack;
+
+ if (!dm_pool_unlock(vg->vgmem))
+ stack;
+
+ dm_list_del(&vg->vg_pool->list);
+ dm_free(vg->vg_pool);
+ vg->vg_pool = NULL;
+
+ return 1;
+}
+
struct dm_list *lvmcache_get_vgids(struct cmd_context *cmd,
int include_internal)
{
@@ -1117,6 +1214,7 @@ static int _lvmcache_update_vgname(struct lvmcache_info *info,
return 0;
}
dm_list_init(&vginfo->infos);
+ dm_list_init(&vginfo->vg_pool);
/*
* If we're scanning and there's an invalidated entry, remove it.
diff --git a/lib/cache/lvmcache.h b/lib/cache/lvmcache.h
index f13cad2..6360f30 100644
--- a/lib/cache/lvmcache.h
+++ b/lib/cache/lvmcache.h
@@ -50,6 +50,7 @@ struct lvmcache_vginfo {
char *vgmetadata; /* Copy of VG metadata as format_text string */
struct config_tree *cft; /* Config tree created from vgmetadata */
/* Lifetime is directly tied to vgmetadata */
+ struct dm_list vg_pool; /* List of cached parsed struct volume_group */
unsigned precommitted; /* Is vgmetadata live or precommitted? */
};
@@ -122,6 +123,8 @@ struct dm_list *lvmcache_get_pvids(struct cmd_context *cmd, const char *vgname,
/* Returns cached volume group metadata. */
struct volume_group *lvmcache_get_vg(const char *vgid, unsigned precommitted);
+int lvmcache_put_vg(struct volume_group *vg);
+int lvmcache_drop_vg(struct volume_group *vg);
void lvmcache_drop_metadata(const char *vgname, int drop_precommitted);
void lvmcache_commit_metadata(const char *vgname);
diff --git a/lib/metadata/metadata.c b/lib/metadata/metadata.c
index b6312a0..e466821 100644
--- a/lib/metadata/metadata.c
+++ b/lib/metadata/metadata.c
@@ -855,6 +855,12 @@ static struct volume_group *_vg_make_handle(struct cmd_context *cmd,
struct volume_group *vg,
uint32_t failure)
{
+ if (vg && vg->vg_pool && (failure != SUCCESS)) {
+ /* Avoid overwrite of shared VG structure */
+ free_vg(vg);
+ vg = NULL;
+ }
+
if (!vg && !(vg = alloc_vg("vg_make_handle", cmd, NULL)))
return_NULL;
@@ -2816,8 +2822,11 @@ static struct volume_group *_vg_read(struct cmd_context *cmd,
(use_precommitted || !*consistent || !(correct_vg->status & INCONSISTENT_VG))) {
if (!(correct_vg->status & INCONSISTENT_VG))
*consistent = 1;
- else /* Inconsistent but we can't repair it */
+ else {
+ lvmcache_drop_vg(correct_vg);
+ /* Inconsistent but we can't repair it */
correct_vg->status &= ~INCONSISTENT_VG;
+ }
return correct_vg;
} else {
@@ -3243,6 +3252,11 @@ void free_vg(struct volume_group *vg)
if (!vg)
return;
+ log_debug("Releasing VG:%p %s.", vg, vg->name);
+
+ if (lvmcache_put_vg(vg))
+ return;
+
dm_list_iterate_items(pvl, &vg->pvs)
pvl->pv->fid->fmt->ops->destroy_instance(pvl->pv->fid);
@@ -3789,7 +3803,9 @@ static struct volume_group *_recover_vg(struct cmd_context *cmd,
return_NULL;
}
- return (struct volume_group *)vg;
+ lvmcache_drop_vg(vg);
+
+ return vg;
}
/*
diff --git a/lib/metadata/vg.h b/lib/metadata/vg.h
index 2cc6047..e153c76 100644
--- a/lib/metadata/vg.h
+++ b/lib/metadata/vg.h
@@ -20,6 +20,7 @@ struct dm_pool;
struct format_instance;
struct dm_list;
struct id;
+struct vg_pool_list;
typedef enum {
ALLOC_INVALID,
@@ -35,6 +36,7 @@ struct volume_group {
struct cmd_context *cmd;
struct dm_pool *vgmem;
struct format_instance *fid;
+ struct vg_pool_list *vg_pool; /* Backward reference to lvmcache */
struct dm_list *cmd_vgs;/* List of wanted/locked and opened VGs */
uint32_t cmd_missing_vgs;/* Flag marks missing VG */
uint32_t seqno; /* Metadata sequence number */
--
1.7.4.1
^ permalink raw reply related [flat|nested] 20+ messages in thread* [PATCH 4/6] Use dm_pool_set_uint64
2011-03-22 21:21 [PATCH 0/6] VG share v2 Zdenek Kabelac
` (2 preceding siblings ...)
2011-03-22 21:21 ` [PATCH 3/6] Share VG multiple times Zdenek Kabelac
@ 2011-03-22 21:21 ` Zdenek Kabelac
2011-03-22 21:21 ` [PATCH 5/6] lv_postorder using dm_pool_set_uint64 Zdenek Kabelac
2011-03-22 21:22 ` [PATCH 6/6] lv_postorder unlock and lock Zdenek Kabelac
5 siblings, 0 replies; 20+ messages in thread
From: Zdenek Kabelac @ 2011-03-22 21:21 UTC (permalink / raw)
To: lvm-devel
Use the new dm_pool API call for the update of content of the locked
shared VG mempool. Save the original value of the VG into an extra
allocated volatile area. Currently only simple preallocated memory
area is used - with expectation there should not be too many updates here.
Otherwisee more effective algorithm would be needed to get better perfomance.
(most probably some resizable heap??)
Original code is kept in comments to make it more obvious what it is
supposed to do.
Note: (snapshot is kept 64bit aligned in union)
Signed-off-by: Zdenek Kabelac <zkabelac@redhat.com>
---
lib/activate/activate.c | 5 ++++-
lib/metadata/lv.h | 5 ++++-
lib/metadata/snapshot_manip.c | 15 ++++++++++++---
3 files changed, 20 insertions(+), 5 deletions(-)
diff --git a/lib/activate/activate.c b/lib/activate/activate.c
index 1ec518d..4d257f1 100644
--- a/lib/activate/activate.c
+++ b/lib/activate/activate.c
@@ -1411,7 +1411,10 @@ static int _lv_activate(struct cmd_context *cmd, const char *lvid_s,
lv_calculate_readahead(lv, NULL);
if (exclusive)
- lv->status |= ACTIVATE_EXCL;
+ /* lv->status |= ACTIVATE_EXCL */
+ if (!dm_pool_set_uint64(lv->vg->vgmem, &lv->status,
+ lv->status | ACTIVATE_EXCL))
+ goto_out;
critical_section_inc(cmd);
if (!(r = _lv_activate_lv(lv, 0)))
diff --git a/lib/metadata/lv.h b/lib/metadata/lv.h
index 1c3640b..66db813 100644
--- a/lib/metadata/lv.h
+++ b/lib/metadata/lv.h
@@ -38,7 +38,10 @@ struct logical_volume {
uint32_t origin_count;
struct dm_list snapshot_segs;
- struct lv_segment *snapshot;
+ union {
+ struct lv_segment *snapshot;
+ uint64_t snapshot_64; /* Keep the size 64bit on 32bit arch */
+ };
struct replicator_device *rdevice;/* For replicator-devs, rimages, slogs - reference to rdevice */
struct dm_list rsites; /* For replicators - all sites */
diff --git a/lib/metadata/snapshot_manip.c b/lib/metadata/snapshot_manip.c
index cb5df6b..0dc8620 100644
--- a/lib/metadata/snapshot_manip.c
+++ b/lib/metadata/snapshot_manip.c
@@ -130,9 +130,18 @@ void init_snapshot_merge(struct lv_segment *cow_seg,
void clear_snapshot_merge(struct logical_volume *origin)
{
/* clear merge attributes */
- origin->snapshot->status &= ~MERGING;
- origin->snapshot = NULL;
- origin->status &= ~MERGING;
+
+ /* origin->snapshot->status &= ~MERGING */
+ /* origin->snapshot = NULL */
+ /* origin->status &= ~MERGING */
+ if (!dm_pool_set_uint64(origin->vg->vgmem,
+ &origin->snapshot->status,
+ origin->snapshot->status & ~MERGING) ||
+ !dm_pool_set_uint64(origin->vg->vgmem,
+ &origin->snapshot_64, 0) ||
+ !dm_pool_set_uint64(origin->vg->vgmem, &origin->status,
+ origin->status & ~MERGING))
+ stack; /* FIXME: change to return error value */
}
int vg_add_snapshot(struct logical_volume *origin,
--
1.7.4.1
^ permalink raw reply related [flat|nested] 20+ messages in thread* [PATCH 5/6] lv_postorder using dm_pool_set_uint64
2011-03-22 21:21 [PATCH 0/6] VG share v2 Zdenek Kabelac
` (3 preceding siblings ...)
2011-03-22 21:21 ` [PATCH 4/6] Use dm_pool_set_uint64 Zdenek Kabelac
@ 2011-03-22 21:21 ` Zdenek Kabelac
2011-03-22 21:22 ` [PATCH 6/6] lv_postorder unlock and lock Zdenek Kabelac
5 siblings, 0 replies; 20+ messages in thread
From: Zdenek Kabelac @ 2011-03-22 21:21 UTC (permalink / raw)
To: lvm-devel
We may use either this code to safe&restore lv_postorder and keep
the structure locked.
Next patch is optimization for this one - it unlocks VG during
postorder processing (thus it equals to the original setting).
When memory pool is unlocked - this patch presents just very small
overhead and potentially helps to better debugging if there ever would
done something more complex then current lv_postoder code which could
be easily checked there are no other side-effects.
More comments in following patch...
Signed-off-by: Zdenek Kabelac <zkabelac@redhat.com>
---
lib/metadata/metadata.c | 17 +++++++++++++----
1 files changed, 13 insertions(+), 4 deletions(-)
diff --git a/lib/metadata/metadata.c b/lib/metadata/metadata.c
index e466821..cec9c3c 100644
--- a/lib/metadata/metadata.c
+++ b/lib/metadata/metadata.c
@@ -2023,7 +2023,10 @@ static int _lv_postorder_cleanup(struct logical_volume *lv, void *data)
{
if (!(lv->status & POSTORDER_FLAG))
return 1;
- lv->status &= ~POSTORDER_FLAG;
+ /* lv->status &= ~POSTORDER_FLAG */
+ if (!dm_pool_set_uint64(lv->vg->vgmem, &lv->status,
+ lv->status & ~POSTORDER_FLAG))
+ return_0;
if (!_lv_each_dependency(lv, _lv_postorder_cleanup, data))
return_0;
@@ -2047,7 +2050,10 @@ static int _lv_postorder_visit(struct logical_volume *lv,
return 1;
if (lv->status & POSTORDER_OPEN_FLAG)
return 1; // a data structure loop has closed...
- lv->status |= POSTORDER_OPEN_FLAG;
+ /* lv->status |= POSTORDER_OPEN_FLAG */
+ if (!dm_pool_set_uint64(lv->vg->vgmem, &lv->status,
+ lv->status | POSTORDER_OPEN_FLAG))
+ return_0;
baton.fn = fn;
baton.data = data;
@@ -2056,8 +2062,11 @@ static int _lv_postorder_visit(struct logical_volume *lv,
if (r)
r = fn(lv, data);
- lv->status &= ~POSTORDER_OPEN_FLAG;
- lv->status |= POSTORDER_FLAG;
+ /* lv->status &= ~POSTORDER_OPEN_FLAG */
+ /* lv->status |= POSTORDER_FLAG */
+ if (!dm_pool_set_uint64(lv->vg->vgmem, &lv->status,
+ (lv->status & ~POSTORDER_OPEN_FLAG) | POSTORDER_FLAG))
+ return_0;
return r;
}
--
1.7.4.1
^ permalink raw reply related [flat|nested] 20+ messages in thread* [PATCH 6/6] lv_postorder unlock and lock
2011-03-22 21:21 [PATCH 0/6] VG share v2 Zdenek Kabelac
` (4 preceding siblings ...)
2011-03-22 21:21 ` [PATCH 5/6] lv_postorder using dm_pool_set_uint64 Zdenek Kabelac
@ 2011-03-22 21:22 ` Zdenek Kabelac
5 siblings, 0 replies; 20+ messages in thread
From: Zdenek Kabelac @ 2011-03-22 21:22 UTC (permalink / raw)
To: lvm-devel
Instead of unlocking and locking for every changed struct member
do it once when entering and leaving function.
Currently lv_postoder() does not modify other part of vg structure
then status flags of each LV with flags that are also removed.
However using standard dm_pool_set_uint64 would require to implement
more efective data structure for exception stor - as it is not
needed for now - avoid complexity and use this very simple workaround.
Signed-off-by: Zdenek Kabelac <zkabelac@redhat.com>
---
lib/metadata/metadata.c | 16 ++++++++++++++++
1 files changed, 16 insertions(+), 0 deletions(-)
diff --git a/lib/metadata/metadata.c b/lib/metadata/metadata.c
index cec9c3c..85bcb2d 100644
--- a/lib/metadata/metadata.c
+++ b/lib/metadata/metadata.c
@@ -2083,8 +2083,17 @@ static int _lv_postorder(struct logical_volume *lv,
void *data)
{
int r;
+ int lck = dm_pool_locked(lv->vg->vgmem);
+
+ if (lck && !dm_pool_unlock(lv->vg->vgmem))
+ return_0;
+
r = _lv_postorder_visit(lv, fn, data);
_lv_postorder_cleanup(lv, 0);
+
+ if (lck && !dm_pool_lock(lv->vg->vgmem, 0))
+ return_0;
+
return r;
}
@@ -2098,6 +2107,10 @@ static int _lv_postorder_vg(struct volume_group *vg,
{
struct lv_list *lvl;
int r = 1;
+ int lck = dm_pool_locked(vg->vgmem);
+
+ if (lck && !dm_pool_unlock(vg->vgmem))
+ return_0;
dm_list_iterate_items(lvl, &vg->lvs)
if (!_lv_postorder_visit(lvl->lv, fn, data)) {
@@ -2108,6 +2121,9 @@ static int _lv_postorder_vg(struct volume_group *vg,
dm_list_iterate_items(lvl, &vg->lvs)
_lv_postorder_cleanup(lvl->lv, 0);
+ if (lck && !dm_pool_lock(vg->vgmem, 0))
+ return_0;
+
return r;
}
--
1.7.4.1
^ permalink raw reply related [flat|nested] 20+ messages in thread