* [PATCH 01/11] Unlock the superblock on an error path for new metadata dev creation.
2012-02-02 16:39 [PATCH 00/11] Latest dm-thin patches Joe Thornber
@ 2012-02-02 16:39 ` Joe Thornber
2012-02-02 16:39 ` [PATCH 02/11] Remove redundant arg from value_ptr() Joe Thornber
` (9 subsequent siblings)
10 siblings, 0 replies; 20+ messages in thread
From: Joe Thornber @ 2012-02-02 16:39 UTC (permalink / raw)
To: dm-devel; +Cc: Joe Thornber
---
drivers/md/dm-thin-metadata.c | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 59c4f04..6741732 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -385,6 +385,7 @@ static int init_pmd(struct dm_pool_metadata *pmd,
data_sm = dm_sm_disk_create(tm, nr_blocks);
if (IS_ERR(data_sm)) {
DMERR("sm_disk_create failed");
+ dm_tm_unlock(tm, sblock);
r = PTR_ERR(data_sm);
goto bad;
}
--
1.7.5.4
^ permalink raw reply related [flat|nested] 20+ messages in thread* [PATCH 02/11] Remove redundant arg from value_ptr()
2012-02-02 16:39 [PATCH 00/11] Latest dm-thin patches Joe Thornber
2012-02-02 16:39 ` [PATCH 01/11] Unlock the superblock on an error path for new metadata dev creation Joe Thornber
@ 2012-02-02 16:39 ` Joe Thornber
2012-02-02 16:39 ` [PATCH 03/11] [PATCH 18/19] [dm-thin] [bio prison] Don't use the bi_next field for the holder of a cell Joe Thornber
` (8 subsequent siblings)
10 siblings, 0 replies; 20+ messages in thread
From: Joe Thornber @ 2012-02-02 16:39 UTC (permalink / raw)
To: dm-devel; +Cc: Joe Thornber
Now that the value_size is held within every node of the btrees we can
remove this argument from value_ptr().
For the last few months a BUG_ON has been checking this argument is
the same as that held in the node. No issues were reported. So this
is a safe change.
---
drivers/md/persistent-data/dm-btree-internal.h | 7 +----
drivers/md/persistent-data/dm-btree-remove.c | 28 ++++++++++++------------
drivers/md/persistent-data/dm-btree.c | 27 +++++++++++------------
3 files changed, 29 insertions(+), 33 deletions(-)
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
index d279c76..5709bfe 100644
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -108,12 +108,9 @@ static inline void *value_base(struct node *n)
return &n->keys[le32_to_cpu(n->header.max_entries)];
}
-/*
- * FIXME: Now that value size is stored in node we don't need the third parm.
- */
-static inline void *value_ptr(struct node *n, uint32_t index, size_t value_size)
+static inline void *value_ptr(struct node *n, uint32_t index)
{
- BUG_ON(value_size != le32_to_cpu(n->header.value_size));
+ uint32_t value_size = le32_to_cpu(n->header.value_size);
return value_base(n) + (value_size * index);
}
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index 023fbc2..8576d56 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -61,20 +61,20 @@ static void node_shift(struct node *n, int shift)
if (shift < 0) {
shift = -shift;
BUG_ON(shift > nr_entries);
- BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift, value_size));
+ BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
memmove(key_ptr(n, 0),
key_ptr(n, shift),
(nr_entries - shift) * sizeof(__le64));
- memmove(value_ptr(n, 0, value_size),
- value_ptr(n, shift, value_size),
+ memmove(value_ptr(n, 0),
+ value_ptr(n, shift),
(nr_entries - shift) * value_size);
} else {
BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
memmove(key_ptr(n, shift),
key_ptr(n, 0),
nr_entries * sizeof(__le64));
- memmove(value_ptr(n, shift, value_size),
- value_ptr(n, 0, value_size),
+ memmove(value_ptr(n, shift),
+ value_ptr(n, 0),
nr_entries * value_size);
}
}
@@ -91,16 +91,16 @@ static void node_copy(struct node *left, struct node *right, int shift)
memcpy(key_ptr(left, nr_left),
key_ptr(right, 0),
shift * sizeof(__le64));
- memcpy(value_ptr(left, nr_left, value_size),
- value_ptr(right, 0, value_size),
+ memcpy(value_ptr(left, nr_left),
+ value_ptr(right, 0),
shift * value_size);
} else {
BUG_ON(shift > le32_to_cpu(right->header.max_entries));
memcpy(key_ptr(right, 0),
key_ptr(left, nr_left - shift),
shift * sizeof(__le64));
- memcpy(value_ptr(right, 0, value_size),
- value_ptr(left, nr_left - shift, value_size),
+ memcpy(value_ptr(right, 0),
+ value_ptr(left, nr_left - shift),
shift * value_size);
}
}
@@ -120,8 +120,8 @@ static void delete_at(struct node *n, unsigned index)
key_ptr(n, index + 1),
nr_to_copy * sizeof(__le64));
- memmove(value_ptr(n, index, value_size),
- value_ptr(n, index + 1, value_size),
+ memmove(value_ptr(n, index),
+ value_ptr(n, index + 1),
nr_to_copy * value_size);
}
@@ -175,7 +175,7 @@ static int init_child(struct dm_btree_info *info, struct node *parent,
if (inc)
inc_children(info->tm, result->n, &le64_type);
- *((__le64 *) value_ptr(parent, index, sizeof(__le64))) =
+ *((__le64 *) value_ptr(parent, index)) =
cpu_to_le64(dm_block_location(result->block));
return 0;
@@ -496,7 +496,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
*/
if (shadow_has_parent(s)) {
__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
- memcpy(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(__le64)),
+ memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
&location, sizeof(__le64));
}
@@ -553,7 +553,7 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
if (info->value_type.dec)
info->value_type.dec(info->value_type.context,
- value_ptr(n, index, info->value_type.size));
+ value_ptr(n, index));
delete_at(n, index);
}
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index bd1e7ff..d12b2cc 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -74,8 +74,7 @@ void inc_children(struct dm_transaction_manager *tm, struct node *n,
dm_tm_inc(tm, value64(n, i));
else if (vt->inc)
for (i = 0; i < nr_entries; i++)
- vt->inc(vt->context,
- value_ptr(n, i, vt->size));
+ vt->inc(vt->context, value_ptr(n, i));
}
static int insert_at(size_t value_size, struct node *node, unsigned index,
@@ -281,7 +280,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
for (i = 0; i < f->nr_children; i++)
info->value_type.dec(info->value_type.context,
- value_ptr(f->n, i, info->value_type.size));
+ value_ptr(f->n, i));
}
f->current_child = f->nr_children;
}
@@ -320,7 +319,7 @@ static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
} while (!(flags & LEAF_NODE));
*result_key = le64_to_cpu(ro_node(s)->keys[i]);
- memcpy(v, value_ptr(ro_node(s), i, value_size), value_size);
+ memcpy(v, value_ptr(ro_node(s), i), value_size);
return 0;
}
@@ -432,7 +431,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
sizeof(uint64_t) : s->info->value_type.size;
- memcpy(value_ptr(rn, 0, size), value_ptr(ln, nr_left, size),
+ memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
size * nr_right);
/*
@@ -443,7 +442,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
pn = dm_block_data(parent);
location = cpu_to_le64(dm_block_location(left));
__dm_bless_for_disk(&location);
- memcpy_disk(value_ptr(pn, parent_index, sizeof(__le64)),
+ memcpy_disk(value_ptr(pn, parent_index),
&location, sizeof(__le64));
location = cpu_to_le64(dm_block_location(right));
@@ -529,8 +528,8 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
sizeof(__le64) : s->info->value_type.size;
- memcpy(value_ptr(ln, 0, size), value_ptr(pn, 0, size), nr_left * size);
- memcpy(value_ptr(rn, 0, size), value_ptr(pn, nr_left, size),
+ memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
+ memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
nr_right * size);
/* new_parent should just point to l and r now */
@@ -545,12 +544,12 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
val = cpu_to_le64(dm_block_location(left));
__dm_bless_for_disk(&val);
pn->keys[0] = ln->keys[0];
- memcpy_disk(value_ptr(pn, 0, sizeof(__le64)), &val, sizeof(__le64));
+ memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));
val = cpu_to_le64(dm_block_location(right));
__dm_bless_for_disk(&val);
pn->keys[1] = rn->keys[0];
- memcpy_disk(value_ptr(pn, 1, sizeof(__le64)), &val, sizeof(__le64));
+ memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));
/*
* rejig the spine. This is ugly, since it knows too
@@ -595,7 +594,7 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
__dm_bless_for_disk(&location);
- memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(uint64_t)),
+ memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
&location, sizeof(__le64));
}
@@ -710,12 +709,12 @@ static int insert(struct dm_btree_info *info, dm_block_t root,
(!info->value_type.equal ||
!info->value_type.equal(
info->value_type.context,
- value_ptr(n, index, info->value_type.size),
+ value_ptr(n, index),
value))) {
info->value_type.dec(info->value_type.context,
- value_ptr(n, index, info->value_type.size));
+ value_ptr(n, index));
}
- memcpy_disk(value_ptr(n, index, info->value_type.size),
+ memcpy_disk(value_ptr(n, index),
value, info->value_type.size);
}
--
1.7.5.4
^ permalink raw reply related [flat|nested] 20+ messages in thread* [PATCH 03/11] [PATCH 18/19] [dm-thin] [bio prison] Don't use the bi_next field for the holder of a cell.
2012-02-02 16:39 [PATCH 00/11] Latest dm-thin patches Joe Thornber
2012-02-02 16:39 ` [PATCH 01/11] Unlock the superblock on an error path for new metadata dev creation Joe Thornber
2012-02-02 16:39 ` [PATCH 02/11] Remove redundant arg from value_ptr() Joe Thornber
@ 2012-02-02 16:39 ` Joe Thornber
2012-02-07 23:18 ` Mike Snitzer
2012-02-10 18:03 ` [PATCH 04/12 v2] dm thin: don't " Mike Snitzer
2012-02-02 16:39 ` [PATCH 04/11] [PATCH 15/19] [dm-thin] dm_thin_remove_block() wasn't decrementing the mapped_blocks counter Joe Thornber
` (7 subsequent siblings)
10 siblings, 2 replies; 20+ messages in thread
From: Joe Thornber @ 2012-02-02 16:39 UTC (permalink / raw)
To: dm-devel; +Cc: Joe Thornber
The holder can be passed down to lower devices which may want to use
bi_next themselves. Also added BUG_ON checks to confirm fix.
---
drivers/md/dm-thin.c | 35 ++++++++++++++++++++++-------------
1 files changed, 22 insertions(+), 13 deletions(-)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index c308757..6ef03a2 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -124,7 +124,7 @@ struct cell {
struct hlist_node list;
struct bio_prison *prison;
struct cell_key key;
- unsigned count;
+ struct bio *holder;
struct bio_list bios;
};
@@ -220,8 +220,8 @@ static struct cell *__search_bucket(struct hlist_head *bucket,
* This may block if a new cell needs allocating. You must ensure that
* cells will be unlocked even if the calling thread is blocked.
*
- * Returns the number of entries in the cell prior to the new addition
- * or < 0 on failure.
+ * Returns 1 if the cell was already held, 0 if @inmate is the new holder,
+ * or < 0 on error.
*/
static int bio_detain(struct bio_prison *prison, struct cell_key *key,
struct bio *inmate, struct cell **ref)
@@ -256,21 +256,25 @@ static int bio_detain(struct bio_prison *prison, struct cell_key *key,
cell->prison = prison;
memcpy(&cell->key, key, sizeof(cell->key));
- cell->count = 0;
+ cell->holder = inmate;
bio_list_init(&cell->bios);
hlist_add_head(&cell->list, prison->cells + hash);
+ r = 0;
+
+ } else {
+ mempool_free(cell2, prison->cell_pool);
+ cell2 = NULL;
+ r = 1;
+ bio_list_add(&cell->bios, inmate);
}
- }
- r = cell->count++;
- bio_list_add(&cell->bios, inmate);
+ } else {
+ r = 1;
+ bio_list_add(&cell->bios, inmate);
+ }
spin_unlock_irqrestore(&prison->lock, flags);
- if (cell2)
- mempool_free(cell2, prison->cell_pool);
-
*ref = cell;
-
return r;
}
@@ -286,6 +290,7 @@ static void __cell_release(struct cell *cell, struct bio_list *inmates)
if (inmates)
bio_list_merge(inmates, &cell->bios);
+ bio_list_add_head(inmates, cell->holder);
mempool_free(cell, prison->cell_pool);
}
@@ -662,8 +667,10 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio,
spin_lock_irqsave(&pool->lock, flags);
bio_list_add(&pool->deferred_flush_bios, bio);
spin_unlock_irqrestore(&pool->lock, flags);
- } else
+ } else {
+ BUG_ON(bio->bi_next);
generic_make_request(bio);
+ }
}
/*
@@ -1302,8 +1309,10 @@ static void process_deferred_bios(struct pool *pool)
return;
}
- while ((bio = bio_list_pop(&bios)))
+ while ((bio = bio_list_pop(&bios))) {
+ BUG_ON(bio->bi_next);
generic_make_request(bio);
+ }
}
static void do_worker(struct work_struct *ws)
--
1.7.5.4
^ permalink raw reply related [flat|nested] 20+ messages in thread* Re: [PATCH 03/11] [PATCH 18/19] [dm-thin] [bio prison] Don't use the bi_next field for the holder of a cell.
2012-02-02 16:39 ` [PATCH 03/11] [PATCH 18/19] [dm-thin] [bio prison] Don't use the bi_next field for the holder of a cell Joe Thornber
@ 2012-02-07 23:18 ` Mike Snitzer
2012-02-10 14:55 ` Joe Thornber
2012-02-10 18:03 ` [PATCH 04/12 v2] dm thin: don't " Mike Snitzer
1 sibling, 1 reply; 20+ messages in thread
From: Mike Snitzer @ 2012-02-07 23:18 UTC (permalink / raw)
To: device-mapper development; +Cc: Joe Thornber
On Thu, Feb 02 2012 at 11:39am -0500,
Joe Thornber <ejt@redhat.com> wrote:
> The holder can be passed down to lower devices which may want to use
> bi_next themselves. Also added BUG_ON checks to confirm fix.
Aside from not (ab)using bi_next; do any other patches in the series
depend on this patch? I don't think so but figured I'd explicitly ask.
> diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
> index c308757..6ef03a2 100644
> --- a/drivers/md/dm-thin.c
> +++ b/drivers/md/dm-thin.c
> @@ -220,8 +220,8 @@ static struct cell *__search_bucket(struct hlist_head *bucket,
> * This may block if a new cell needs allocating. You must ensure that
> * cells will be unlocked even if the calling thread is blocked.
> *
> - * Returns the number of entries in the cell prior to the new addition
> - * or < 0 on failure.
> + * Returns 1 if the cell was already held, 0 if @inmate is the new holder,
> + * or < 0 on error.
> */
> static int bio_detain(struct bio_prison *prison, struct cell_key *key,
> struct bio *inmate, struct cell **ref)
bio_detain() never returns < 0, so I've updated the above comment block.
> @@ -256,21 +256,25 @@ static int bio_detain(struct bio_prison *prison, struct cell_key *key,
>
> cell->prison = prison;
> memcpy(&cell->key, key, sizeof(cell->key));
> - cell->count = 0;
> + cell->holder = inmate;
> bio_list_init(&cell->bios);
> hlist_add_head(&cell->list, prison->cells + hash);
> + r = 0;
So in this case where there is no existing inmate in the cell, and
you're allocating the cell, you aren't adding the lone inmate (the
holder) to the cell->bios. But you did previously [1].
> +
> + } else {
> + mempool_free(cell2, prison->cell_pool);
> + cell2 = NULL;
> + r = 1;
> + bio_list_add(&cell->bios, inmate);
> }
> - }
>
> - r = cell->count++;
> - bio_list_add(&cell->bios, inmate);
[1] So you only added it in all cases before because you were looking to
get bio->bi_next to reflect the holder? Thing is bio_list_add()
will set: bio->bi_next = NULL; -- so I'm not seeing how overloading
bi_next to imply holder was reliable.
I'll need to look closer at the bigger picture of how a cell is used;
but it is clear that cell->holder isn't on the cell->bios bio_list now.
> @@ -286,6 +290,7 @@ static void __cell_release(struct cell *cell, struct bio_list *inmates)
> if (inmates)
> bio_list_merge(inmates, &cell->bios);
>
> + bio_list_add_head(inmates, cell->holder);
> mempool_free(cell, prison->cell_pool);
> }
__cell_release() assumes @inmates is a properly initialized bio_list.
So why the check for inmates != NULL?
And why is this bio_list_add_head() outside that inmates != NULL check?
But most important question: why is it so important to put the
cell->holder at the head of the @inmates list?
Anyway, __cell_release() is the only place where cell->holder is
actually used -- would be worthwhile to to add a comment above it's
bio_list_add_head() call to document _why_ it is important to add the
holder to the head there.
Especially considering @inmates _could_ already be non-empty, so you're
appending the cell->bios to the end of the @inmates list
(e.g. pool->deferred_bios), but then putting the holder of the cell at
the head of the @inmates list.. leaving the cell's holder disjoint from
it's other cell members... why? Was this intended?
(Maybe I'm not reading the code right).
> @@ -1302,8 +1309,10 @@ static void process_deferred_bios(struct pool *pool)
> return;
> }
>
> - while ((bio = bio_list_pop(&bios)))
> + while ((bio = bio_list_pop(&bios))) {
> + BUG_ON(bio->bi_next);
> generic_make_request(bio);
> + }
bio_list_pop() will always: bio->bi_next = NULL;
so there is no need for the BUG_ON() here.
I've refreshed this patch here:
http://people.redhat.com/msnitzer/patches/upstream/dm-thinp-3.3/0004-dm-thin-don-t-use-the-bi_next-field-for-the-holder-o.patch
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 03/11] [PATCH 18/19] [dm-thin] [bio prison] Don't use the bi_next field for the holder of a cell.
2012-02-07 23:18 ` Mike Snitzer
@ 2012-02-10 14:55 ` Joe Thornber
0 siblings, 0 replies; 20+ messages in thread
From: Joe Thornber @ 2012-02-10 14:55 UTC (permalink / raw)
To: Mike Snitzer; +Cc: device-mapper development, Joe Thornber
On Tue, Feb 07, 2012 at 06:18:21PM -0500, Mike Snitzer wrote:
> On Thu, Feb 02 2012 at 11:39am -0500,
> Joe Thornber <ejt@redhat.com> wrote:
>
> > The holder can be passed down to lower devices which may want to use
> > bi_next themselves. Also added BUG_ON checks to confirm fix.
>
> Aside from not (ab)using bi_next; do any other patches in the series
> depend on this patch? I don't think so but figured I'd explicitly ask.
No. I found the issue while chasing a discard problem.
> bio_detain() never returns < 0, so I've updated the above comment block.
Fine.
>
> > @@ -256,21 +256,25 @@ static int bio_detain(struct bio_prison *prison, struct cell_key *key,
> >
> > cell->prison = prison;
> > memcpy(&cell->key, key, sizeof(cell->key));
> > - cell->count = 0;
> > + cell->holder = inmate;
> > bio_list_init(&cell->bios);
> > hlist_add_head(&cell->list, prison->cells + hash);
> > + r = 0;
>
> So in this case where there is no existing inmate in the cell, and
> you're allocating the cell, you aren't adding the lone inmate (the
> holder) to the cell->bios. But you did previously [1].
Correct, the bio that hold the cell previously went into the
cell->bios list. But these ios are sometimes issued before the cell
is released (eg, if the io totally overwrites a data block, we can use
it to 'zero' or 'copy'). The bio is still retained in the cell
however, just in the 'holder' field.
>
> > +
> > + } else {
> > + mempool_free(cell2, prison->cell_pool);
> > + cell2 = NULL;
> > + r = 1;
> > + bio_list_add(&cell->bios, inmate);
> > }
> > - }
> >
> > - r = cell->count++;
> > - bio_list_add(&cell->bios, inmate);
>
> [1] So you only added it in all cases before because you were looking to
> get bio->bi_next to reflect the holder? Thing is bio_list_add()
> will set: bio->bi_next = NULL; -- so I'm not seeing how overloading
> bi_next to imply holder was reliable.
Hmm, not really. The holder was known outside the cell (release_except ...)
> I'll need to look closer at the bigger picture of how a cell is used;
> but it is clear that cell->holder isn't on the cell->bios bio_list now.
Correct.
>
> > @@ -286,6 +290,7 @@ static void __cell_release(struct cell *cell, struct bio_list *inmates)
> > if (inmates)
> > bio_list_merge(inmates, &cell->bios);
> >
> > + bio_list_add_head(inmates, cell->holder);
> > mempool_free(cell, prison->cell_pool);
> > }
>
> __cell_release() assumes @inmates is a properly initialized bio_list.
> So why the check for inmates != NULL?
This is a bug, that bio_list_add_head() should be inside the if too, thx.
> And why is this bio_list_add_head() outside that inmates != NULL check?
exactly.
> But most important question: why is it so important to put the
> cell->holder at the head of the @inmates list?
I don't want to reorder the ios. The holder was definitely the first io originally.
> Especially considering @inmates _could_ already be non-empty, so you're
> appending the cell->bios to the end of the @inmates list
> (e.g. pool->deferred_bios), but then putting the holder of the cell at
> the head of the @inmates list.. leaving the cell's holder disjoint from
> it's other cell members... why? Was this intended?
Not really, I guess we should add the holder, then do the merge.
>
> (Maybe I'm not reading the code right).
>
> > @@ -1302,8 +1309,10 @@ static void process_deferred_bios(struct pool *pool)
> > return;
> > }
> >
> > - while ((bio = bio_list_pop(&bios)))
> > + while ((bio = bio_list_pop(&bios))) {
> > + BUG_ON(bio->bi_next);
> > generic_make_request(bio);
> > + }
>
> bio_list_pop() will always: bio->bi_next = NULL;
>
> so there is no need for the BUG_ON() here.
ok.
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 04/12 v2] dm thin: don't use the bi_next field for the holder of a cell
2012-02-02 16:39 ` [PATCH 03/11] [PATCH 18/19] [dm-thin] [bio prison] Don't use the bi_next field for the holder of a cell Joe Thornber
2012-02-07 23:18 ` Mike Snitzer
@ 2012-02-10 18:03 ` Mike Snitzer
1 sibling, 0 replies; 20+ messages in thread
From: Mike Snitzer @ 2012-02-10 18:03 UTC (permalink / raw)
To: dm-devel; +Cc: Joe Thornber, Mike Snitzer
From: Joe Thornber <ejt@redhat.com>
The holder can be passed down to lower devices which may want to use
bi_next themselves. Also add BUG_ON check to confirm fix.
When releasing bios that have been detained in a cell, fix the order in
which the holder and other cellmates are added to the inmates list:
append holder first followed by the other cellmates.
Clarify what is expected during each cell release by introducing
variants of __cell_release.
Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
drivers/md/dm-thin.c | 86 ++++++++++++++++++++++++++++++++-------------------
1 file changed, 54 insertions(+), 32 deletions(-)
Index: linux-2.6/drivers/md/dm-thin.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-thin.c
+++ linux-2.6/drivers/md/dm-thin.c
@@ -124,7 +124,7 @@ struct cell {
struct hlist_node list;
struct bio_prison *prison;
struct cell_key key;
- unsigned count;
+ struct bio *holder;
struct bio_list bios;
};
@@ -220,8 +220,7 @@ static struct cell *__search_bucket(stru
* This may block if a new cell needs allocating. You must ensure that
* cells will be unlocked even if the calling thread is blocked.
*
- * Returns the number of entries in the cell prior to the new addition
- * or < 0 on failure.
+ * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
*/
static int bio_detain(struct bio_prison *prison, struct cell_key *key,
struct bio *inmate, struct cell **ref)
@@ -235,7 +234,6 @@ static int bio_detain(struct bio_prison
spin_lock_irqsave(&prison->lock, flags);
cell = __search_bucket(prison->cells + hash, key);
-
if (!cell) {
/*
* Allocate a new cell
@@ -249,28 +247,30 @@ static int bio_detain(struct bio_prison
* nobody else has inserted this cell in the meantime.
*/
cell = __search_bucket(prison->cells + hash, key);
-
if (!cell) {
cell = cell2;
cell2 = NULL;
cell->prison = prison;
memcpy(&cell->key, key, sizeof(cell->key));
- cell->count = 0;
+ cell->holder = inmate;
bio_list_init(&cell->bios);
hlist_add_head(&cell->list, prison->cells + hash);
+ r = 0;
+ } else {
+ mempool_free(cell2, prison->cell_pool);
+ cell2 = NULL;
+ r = 1;
+ bio_list_add(&cell->bios, inmate);
}
- }
- r = cell->count++;
- bio_list_add(&cell->bios, inmate);
+ } else {
+ r = 1;
+ bio_list_add(&cell->bios, inmate);
+ }
spin_unlock_irqrestore(&prison->lock, flags);
- if (cell2)
- mempool_free(cell2, prison->cell_pool);
-
*ref = cell;
-
return r;
}
@@ -283,8 +283,10 @@ static void __cell_release(struct cell *
hlist_del(&cell->list);
- if (inmates)
+ if (inmates) {
+ bio_list_add(inmates, cell->holder);
bio_list_merge(inmates, &cell->bios);
+ }
mempool_free(cell, prison->cell_pool);
}
@@ -305,22 +307,45 @@ static void cell_release(struct cell *ce
* bio may be in the cell. This function releases the cell, and also does
* a sanity check.
*/
+static void __cell_release_singleton(struct cell *cell, struct bio *bio)
+{
+ hlist_del(&cell->list);
+ BUG_ON(cell->holder != bio);
+ BUG_ON(!bio_list_empty(&cell->bios));
+}
+
static void cell_release_singleton(struct cell *cell, struct bio *bio)
{
- struct bio_prison *prison = cell->prison;
- struct bio_list bios;
- struct bio *b;
unsigned long flags;
-
- bio_list_init(&bios);
+ struct bio_prison *prison = cell->prison;
spin_lock_irqsave(&prison->lock, flags);
- __cell_release(cell, &bios);
+ __cell_release_singleton(cell, bio);
spin_unlock_irqrestore(&prison->lock, flags);
+}
- b = bio_list_pop(&bios);
- BUG_ON(b != bio);
- BUG_ON(!bio_list_empty(&bios));
+/*
+ * Sometimes we don't want the holder, just the additional bios.
+ */
+static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
+{
+ struct bio_prison *prison = cell->prison;
+
+ hlist_del(&cell->list);
+ if (inmates)
+ bio_list_merge(inmates, &cell->bios);
+
+ mempool_free(cell, prison->cell_pool);
+}
+
+static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
+{
+ unsigned long flags;
+ struct bio_prison *prison = cell->prison;
+
+ spin_lock_irqsave(&prison->lock, flags);
+ __cell_release_no_holder(cell, inmates);
+ spin_unlock_irqrestore(&prison->lock, flags);
}
static void cell_error(struct cell *cell)
@@ -662,8 +687,10 @@ static void remap_and_issue(struct thin_
spin_lock_irqsave(&pool->lock, flags);
bio_list_add(&pool->deferred_flush_bios, bio);
spin_unlock_irqrestore(&pool->lock, flags);
- } else
+ } else {
+ BUG_ON(bio->bi_next);
generic_make_request(bio);
+ }
}
/*
@@ -800,21 +827,16 @@ static void cell_defer(struct thin_c *tc
* Same as cell_defer above, except it omits one particular detainee,
* a write bio that covers the block and has already been processed.
*/
-static void cell_defer_except(struct thin_c *tc, struct cell *cell,
- struct bio *exception)
+static void cell_defer_except(struct thin_c *tc, struct cell *cell)
{
struct bio_list bios;
- struct bio *bio;
struct pool *pool = tc->pool;
unsigned long flags;
bio_list_init(&bios);
- cell_release(cell, &bios);
spin_lock_irqsave(&pool->lock, flags);
- while ((bio = bio_list_pop(&bios)))
- if (bio != exception)
- bio_list_add(&pool->deferred_bios, bio);
+ cell_release_no_holder(cell, &pool->deferred_bios);
spin_unlock_irqrestore(&pool->lock, flags);
wake_worker(pool);
@@ -854,7 +876,7 @@ static void process_prepared_mapping(str
* the bios in the cell.
*/
if (bio) {
- cell_defer_except(tc, m->cell, bio);
+ cell_defer_except(tc, m->cell);
bio_endio(bio, 0);
} else
cell_defer(tc, m->cell, m->data_block);
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 04/11] [PATCH 15/19] [dm-thin] dm_thin_remove_block() wasn't decrementing the mapped_blocks counter.
2012-02-02 16:39 [PATCH 00/11] Latest dm-thin patches Joe Thornber
` (2 preceding siblings ...)
2012-02-02 16:39 ` [PATCH 03/11] [PATCH 18/19] [dm-thin] [bio prison] Don't use the bi_next field for the holder of a cell Joe Thornber
@ 2012-02-02 16:39 ` Joe Thornber
2012-02-02 16:39 ` [PATCH 05/11] [dm-thin] btree-remove - fix rebalancing of 3 nodes Joe Thornber
` (6 subsequent siblings)
10 siblings, 0 replies; 20+ messages in thread
From: Joe Thornber @ 2012-02-02 16:39 UTC (permalink / raw)
To: dm-devel; +Cc: Joe Thornber
---
drivers/md/dm-thin-metadata.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 6741732..1aeef4e 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1212,6 +1212,8 @@ static int __remove(struct dm_thin_device *td, dm_block_t block)
if (r)
return r;
+ td->mapped_blocks--;
+ td->changed = 1;
pmd->need_commit = 1;
return 0;
--
1.7.5.4
^ permalink raw reply related [flat|nested] 20+ messages in thread* [PATCH 05/11] [dm-thin] btree-remove - fix rebalancing of 3 nodes.
2012-02-02 16:39 [PATCH 00/11] Latest dm-thin patches Joe Thornber
` (3 preceding siblings ...)
2012-02-02 16:39 ` [PATCH 04/11] [PATCH 15/19] [dm-thin] dm_thin_remove_block() wasn't decrementing the mapped_blocks counter Joe Thornber
@ 2012-02-02 16:39 ` Joe Thornber
2012-02-02 16:39 ` [PATCH 06/11] Remove entries from the ref_count tree if they're no longer needed Joe Thornber
` (5 subsequent siblings)
10 siblings, 0 replies; 20+ messages in thread
From: Joe Thornber @ 2012-02-02 16:39 UTC (permalink / raw)
To: dm-devel; +Cc: Joe Thornber
When we remove an entry from a node we sometimes rebalance with it's
neighbours. This wasn't being done correctly.
---
drivers/md/persistent-data/dm-btree-remove.c | 174 +++++++++++++++-----------
1 files changed, 99 insertions(+), 75 deletions(-)
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index 8576d56..aa71e23 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -128,18 +128,9 @@ static void delete_at(struct node *n, unsigned index)
n->header.nr_entries = cpu_to_le32(nr_entries - 1);
}
-static unsigned del_threshold(struct node *n)
-{
- return le32_to_cpu(n->header.max_entries) / 3;
-}
-
static unsigned merge_threshold(struct node *n)
{
- /*
- * The extra one is because we know we're potentially going to
- * delete an entry.
- */
- return 2 * (le32_to_cpu(n->header.max_entries) / 3) + 1;
+ return le32_to_cpu(n->header.max_entries) / 3;
}
struct child {
@@ -188,6 +179,15 @@ static int exit_child(struct dm_btree_info *info, struct child *c)
static void shift(struct node *left, struct node *right, int count)
{
+ uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
+ uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
+ uint32_t max_entries = le32_to_cpu(left->header.max_entries);
+ uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);
+
+ BUG_ON(max_entries != r_max_entries);
+ BUG_ON(nr_left - count > max_entries);
+ BUG_ON(nr_right + count > max_entries);
+
if (!count)
return;
@@ -199,13 +199,8 @@ static void shift(struct node *left, struct node *right, int count)
node_shift(right, count);
}
- left->header.nr_entries =
- cpu_to_le32(le32_to_cpu(left->header.nr_entries) - count);
- BUG_ON(le32_to_cpu(left->header.nr_entries) > le32_to_cpu(left->header.max_entries));
-
- right->header.nr_entries =
- cpu_to_le32(le32_to_cpu(right->header.nr_entries) + count);
- BUG_ON(le32_to_cpu(right->header.nr_entries) > le32_to_cpu(right->header.max_entries));
+ left->header.nr_entries = cpu_to_le32(nr_left - count);
+ right->header.nr_entries = cpu_to_le32(nr_right + count);
}
static void __rebalance2(struct dm_btree_info *info, struct node *parent,
@@ -215,8 +210,9 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent,
struct node *right = r->n;
uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
+ unsigned threshold = 2 * merge_threshold(left) + 1;
- if (nr_left + nr_right <= merge_threshold(left)) {
+ if (nr_left + nr_right < threshold) {
/*
* Merge
*/
@@ -234,9 +230,6 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent,
* Rebalance.
*/
unsigned target_left = (nr_left + nr_right) / 2;
- unsigned shift_ = nr_left - target_left;
- BUG_ON(le32_to_cpu(left->header.max_entries) <= nr_left - shift_);
- BUG_ON(le32_to_cpu(right->header.max_entries) <= nr_right + shift_);
shift(left, right, nr_left - target_left);
*key_ptr(parent, r->index) = right->keys[0];
}
@@ -272,6 +265,84 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
return exit_child(info, &right);
}
+/*
+ * We dump as many entries from center as possible into left, then the rest
+ * in right, then rebalance2. This wastes some cpu, but I want something
+ * simple atm.
+ */
+static void delete_center_node(struct dm_btree_info *info, struct node *parent,
+ struct child *l, struct child *c, struct child *r,
+ struct node *left, struct node *center, struct node *right,
+ uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
+{
+ uint32_t max_entries = le32_to_cpu(left->header.max_entries);
+ unsigned shift = min(max_entries - nr_left, nr_center);
+
+ BUG_ON(nr_left + shift > max_entries);
+ node_copy(left, center, -shift);
+ left->header.nr_entries = cpu_to_le32(nr_left + shift);
+
+ if (shift != nr_center) {
+ shift = nr_center - shift;
+ BUG_ON((nr_right + shift) > max_entries);
+ node_shift(right, shift);
+ node_copy(center, right, shift);
+ right->header.nr_entries = cpu_to_le32(nr_right + shift);
+ }
+ *key_ptr(parent, r->index) = right->keys[0];
+
+ delete_at(parent, c->index);
+ r->index--;
+
+ dm_tm_dec(info->tm, dm_block_location(c->block));
+ __rebalance2(info, parent, l, r);
+}
+
+/*
+ * Redistributes entries among 3 sibling nodes.
+ */
+static void redistribute3(struct dm_btree_info *info, struct node *parent,
+ struct child *l, struct child *c, struct child *r,
+ struct node *left, struct node *center, struct node *right,
+ uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
+{
+ int s;
+ uint32_t max_entries = le32_to_cpu(left->header.max_entries);
+ unsigned target = (nr_left + nr_center + nr_right) / 3;
+ BUG_ON(target > max_entries);
+
+ if (nr_left < nr_right) {
+ s = nr_left - target;
+
+ if (s < 0 && nr_center < -s) {
+ /* not enough in central node */
+ shift(left, center, nr_center);
+ s = nr_center - target;
+ shift(left, right, s);
+ nr_right += s;
+ } else
+ shift(left, center, s);
+
+ shift(center, right, target - nr_right);
+
+ } else {
+ s = target - nr_right;
+ if (s > 0 && nr_center < s) {
+ /* not enough in central node */
+ shift(center, right, nr_center);
+ s = target - nr_center;
+ shift(left, right, s);
+ nr_left -= s;
+ } else
+ shift(center, right, s);
+
+ shift(left, center, nr_left - target);
+ }
+
+ *key_ptr(parent, c->index) = center->keys[0];
+ *key_ptr(parent, r->index) = right->keys[0];
+}
+
static void __rebalance3(struct dm_btree_info *info, struct node *parent,
struct child *l, struct child *c, struct child *r)
{
@@ -282,62 +353,18 @@ static void __rebalance3(struct dm_btree_info *info, struct node *parent,
uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
- uint32_t max_entries = le32_to_cpu(left->header.max_entries);
- unsigned target;
+ unsigned threshold = merge_threshold(left) * 4 + 1;
BUG_ON(left->header.max_entries != center->header.max_entries);
BUG_ON(center->header.max_entries != right->header.max_entries);
- if (((nr_left + nr_center + nr_right) / 2) < merge_threshold(center)) {
- /*
- * Delete center node:
- *
- * We dump as many entries from center as possible into
- * left, then the rest in right, then rebalance2. This
- * wastes some cpu, but I want something simple atm.
- */
- unsigned shift = min(max_entries - nr_left, nr_center);
-
- BUG_ON(nr_left + shift > max_entries);
- node_copy(left, center, -shift);
- left->header.nr_entries = cpu_to_le32(nr_left + shift);
-
- if (shift != nr_center) {
- shift = nr_center - shift;
- BUG_ON((nr_right + shift) >= max_entries);
- node_shift(right, shift);
- node_copy(center, right, shift);
- right->header.nr_entries = cpu_to_le32(nr_right + shift);
- }
- *key_ptr(parent, r->index) = right->keys[0];
-
- delete_at(parent, c->index);
- r->index--;
-
- dm_tm_dec(info->tm, dm_block_location(c->block));
- __rebalance2(info, parent, l, r);
-
- return;
- }
-
- /*
- * Rebalance
- */
- target = (nr_left + nr_center + nr_right) / 3;
- BUG_ON(target > max_entries);
-
- /*
- * Adjust the left node
- */
- shift(left, center, nr_left - target);
-
- /*
- * Adjust the right node
- */
- shift(center, right, target - nr_right);
- *key_ptr(parent, c->index) = center->keys[0];
- *key_ptr(parent, r->index) = right->keys[0];
+ if ((nr_left + nr_center + nr_right) < threshold)
+ delete_center_node(info, parent, l, c, r, left, center, right,
+ nr_left, nr_center, nr_right);
+ else
+ redistribute3(info, parent, l, c, r, left, center, right,
+ nr_left, nr_center, nr_right);
}
static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
@@ -441,9 +468,6 @@ static int rebalance_children(struct shadow_spine *s,
if (r)
return r;
- if (child_entries > del_threshold(n))
- return 0;
-
has_left_sibling = i > 0;
has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
--
1.7.5.4
^ permalink raw reply related [flat|nested] 20+ messages in thread* [PATCH 06/11] Remove entries from the ref_count tree if they're no longer needed.
2012-02-02 16:39 [PATCH 00/11] Latest dm-thin patches Joe Thornber
` (4 preceding siblings ...)
2012-02-02 16:39 ` [PATCH 05/11] [dm-thin] btree-remove - fix rebalancing of 3 nodes Joe Thornber
@ 2012-02-02 16:39 ` Joe Thornber
2012-02-02 16:39 ` [PATCH 07/11] [dm-thin] Commit every second to prevent too much of a position building up Joe Thornber
` (4 subsequent siblings)
10 siblings, 0 replies; 20+ messages in thread
From: Joe Thornber @ 2012-02-02 16:39 UTC (permalink / raw)
To: dm-devel; +Cc: Joe Thornber
Ref counts are stored in two places: a bitmap if the ref_count is
below 3, or a btreeof uint32_t if 3 or above.
When a ref_count that was above 3 drops below we can remove it from
the tree and save some metadata space. This removal was commented out
before because I was unsure why this was causing under populated btree
nodes. Earlier patches have fixed this issue.
---
drivers/md/persistent-data/dm-space-map-common.c | 3 ---
1 files changed, 0 insertions(+), 3 deletions(-)
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index df2494c..ff3beed 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -405,8 +405,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
if (r < 0)
return r;
-#if 0
- /* FIXME: dm_btree_remove doesn't handle this yet */
if (old > 2) {
r = dm_btree_remove(&ll->ref_count_info,
ll->ref_count_root,
@@ -414,7 +412,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
if (r)
return r;
}
-#endif
} else {
__le32 le_rc = cpu_to_le32(ref_count);
--
1.7.5.4
^ permalink raw reply related [flat|nested] 20+ messages in thread* [PATCH 07/11] [dm-thin] Commit every second to prevent too much of a position building up.
2012-02-02 16:39 [PATCH 00/11] Latest dm-thin patches Joe Thornber
` (5 preceding siblings ...)
2012-02-02 16:39 ` [PATCH 06/11] Remove entries from the ref_count tree if they're no longer needed Joe Thornber
@ 2012-02-02 16:39 ` Joe Thornber
2012-02-07 16:53 ` Mike Snitzer
2012-02-02 16:39 ` [PATCH 08/11] [dm-thin] Add support for external origins Joe Thornber
` (3 subsequent siblings)
10 siblings, 1 reply; 20+ messages in thread
From: Joe Thornber @ 2012-02-02 16:39 UTC (permalink / raw)
To: dm-devel; +Cc: Joe Thornber
---
drivers/md/dm-thin.c | 28 ++++++++++++++++++++++++++--
1 files changed, 26 insertions(+), 2 deletions(-)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 6ef03a2..19de11a 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -23,6 +23,7 @@
#define DEFERRED_SET_SIZE 64
#define MAPPING_POOL_SIZE 1024
#define PRISON_CELLS 1024
+#define COMMIT_PERIOD HZ
/*
* The block size of the device holding pool data must be
@@ -498,8 +499,10 @@ struct pool {
struct workqueue_struct *wq;
struct work_struct worker;
+ struct delayed_work waker;
unsigned ref_count;
+ unsigned long last_commit_jiffies;
spinlock_t lock;
struct bio_list deferred_bios;
@@ -1256,6 +1259,12 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
}
}
+static int need_commit_due_to_time(struct pool *pool)
+{
+ return jiffies < pool->last_commit_jiffies ||
+ jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
+}
+
static void process_deferred_bios(struct pool *pool)
{
unsigned long flags;
@@ -1297,7 +1306,7 @@ static void process_deferred_bios(struct pool *pool)
bio_list_init(&pool->deferred_flush_bios);
spin_unlock_irqrestore(&pool->lock, flags);
- if (bio_list_empty(&bios))
+ if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
return;
r = dm_pool_commit_metadata(pool->pmd);
@@ -1308,6 +1317,7 @@ static void process_deferred_bios(struct pool *pool)
bio_io_error(bio);
return;
}
+ pool->last_commit_jiffies = jiffies;
while ((bio = bio_list_pop(&bios))) {
BUG_ON(bio->bi_next);
@@ -1323,6 +1333,17 @@ static void do_worker(struct work_struct *ws)
process_deferred_bios(pool);
}
+/*
+ * We want to commit periodically so that not too much
+ * unwritten data builds up.
+ */
+static void do_waker(struct work_struct *ws)
+{
+ struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
+ wake_worker(pool);
+ queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
+}
+
/*----------------------------------------------------------------*/
/*
@@ -1532,6 +1553,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
}
INIT_WORK(&pool->worker, do_worker);
+ INIT_DELAYED_WORK(&pool->waker, do_waker);
spin_lock_init(&pool->lock);
bio_list_init(&pool->deferred_bios);
bio_list_init(&pool->deferred_flush_bios);
@@ -1558,6 +1580,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
goto bad_endio_hook_pool;
}
pool->ref_count = 1;
+ pool->last_commit_jiffies = jiffies;
pool->pool_md = pool_md;
pool->md_dev = metadata_dev;
__pool_table_insert(pool);
@@ -1887,7 +1910,7 @@ static void pool_resume(struct dm_target *ti)
__requeue_bios(pool);
spin_unlock_irqrestore(&pool->lock, flags);
- wake_worker(pool);
+ do_waker(&pool->waker.work);
}
static void pool_postsuspend(struct dm_target *ti)
@@ -1896,6 +1919,7 @@ static void pool_postsuspend(struct dm_target *ti)
struct pool_c *pt = ti->private;
struct pool *pool = pt->pool;
+ cancel_delayed_work(&pool->waker);
flush_workqueue(pool->wq);
r = dm_pool_commit_metadata(pool->pmd);
--
1.7.5.4
^ permalink raw reply related [flat|nested] 20+ messages in thread* Re: [PATCH 07/11] [dm-thin] Commit every second to prevent too much of a position building up.
2012-02-02 16:39 ` [PATCH 07/11] [dm-thin] Commit every second to prevent too much of a position building up Joe Thornber
@ 2012-02-07 16:53 ` Mike Snitzer
2012-02-07 23:00 ` Mike Snitzer
2012-02-10 14:48 ` Joe Thornber
0 siblings, 2 replies; 20+ messages in thread
From: Mike Snitzer @ 2012-02-07 16:53 UTC (permalink / raw)
To: device-mapper development; +Cc: Joe Thornber
On Thu, Feb 02 2012 at 11:39am -0500,
Joe Thornber <ejt@redhat.com> wrote:
> ---
> drivers/md/dm-thin.c | 28 ++++++++++++++++++++++++++--
> 1 files changed, 26 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
> index 6ef03a2..19de11a 100644
> --- a/drivers/md/dm-thin.c
> +++ b/drivers/md/dm-thin.c
> @@ -23,6 +23,7 @@
> #define DEFERRED_SET_SIZE 64
> #define MAPPING_POOL_SIZE 1024
> #define PRISON_CELLS 1024
> +#define COMMIT_PERIOD HZ
>
> /*
> * The block size of the device holding pool data must be
> @@ -498,8 +499,10 @@ struct pool {
>
> struct workqueue_struct *wq;
> struct work_struct worker;
> + struct delayed_work waker;
>
> unsigned ref_count;
> + unsigned long last_commit_jiffies;
>
> spinlock_t lock;
> struct bio_list deferred_bios;
> @@ -1256,6 +1259,12 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
> }
> }
>
> +static int need_commit_due_to_time(struct pool *pool)
> +{
> + return jiffies < pool->last_commit_jiffies ||
> + jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
> +}
> +
> static void process_deferred_bios(struct pool *pool)
> {
> unsigned long flags;
> @@ -1297,7 +1306,7 @@ static void process_deferred_bios(struct pool *pool)
> bio_list_init(&pool->deferred_flush_bios);
> spin_unlock_irqrestore(&pool->lock, flags);
>
> - if (bio_list_empty(&bios))
> + if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
> return;
Shouldn't this be:
if (bio_list_empty(&bios) || !need_commit_due_to_time(pool))
return;
?
Also, should we make the commit interval tunable (akin to ext[34]'s
'commit' mount option)? Or did you defer doing so until it proves
worthwhile?
Mike
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 07/11] [dm-thin] Commit every second to prevent too much of a position building up.
2012-02-07 16:53 ` Mike Snitzer
@ 2012-02-07 23:00 ` Mike Snitzer
2012-02-10 14:48 ` Joe Thornber
1 sibling, 0 replies; 20+ messages in thread
From: Mike Snitzer @ 2012-02-07 23:00 UTC (permalink / raw)
To: device-mapper development; +Cc: Joe Thornber
On Tue, Feb 07 2012 at 11:53am -0500,
Mike Snitzer <snitzer@redhat.com> wrote:
> On Thu, Feb 02 2012 at 11:39am -0500,
> Joe Thornber <ejt@redhat.com> wrote:
>
> > ---
> > drivers/md/dm-thin.c | 28 ++++++++++++++++++++++++++--
> > 1 files changed, 26 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
> > index 6ef03a2..19de11a 100644
> > --- a/drivers/md/dm-thin.c
> > +++ b/drivers/md/dm-thin.c
> > @@ -23,6 +23,7 @@
> > #define DEFERRED_SET_SIZE 64
> > #define MAPPING_POOL_SIZE 1024
> > #define PRISON_CELLS 1024
> > +#define COMMIT_PERIOD HZ
> >
> > /*
> > * The block size of the device holding pool data must be
> > @@ -498,8 +499,10 @@ struct pool {
> >
> > struct workqueue_struct *wq;
> > struct work_struct worker;
> > + struct delayed_work waker;
> >
> > unsigned ref_count;
> > + unsigned long last_commit_jiffies;
> >
> > spinlock_t lock;
> > struct bio_list deferred_bios;
> > @@ -1256,6 +1259,12 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
> > }
> > }
> >
> > +static int need_commit_due_to_time(struct pool *pool)
> > +{
> > + return jiffies < pool->last_commit_jiffies ||
> > + jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
> > +}
> > +
> > static void process_deferred_bios(struct pool *pool)
> > {
> > unsigned long flags;
> > @@ -1297,7 +1306,7 @@ static void process_deferred_bios(struct pool *pool)
> > bio_list_init(&pool->deferred_flush_bios);
> > spin_unlock_irqrestore(&pool->lock, flags);
> >
> > - if (bio_list_empty(&bios))
> > + if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
> > return;
>
> Shouldn't this be:
>
> if (bio_list_empty(&bios) || !need_commit_due_to_time(pool))
> return;
>
> ?
Hmm, looking closer at the code it is clear that if we'd use || then the
pool->deferred_flush_bios would get dropped in the floor by the
!need_commit_due_to_time(pool) early return.
So ignore that ;)
> Also, should we make the commit interval tunable (akin to ext[34]'s
> 'commit' mount option)? Or did you defer doing so until it proves
> worthwhile?
But this question still stands.
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 07/11] [dm-thin] Commit every second to prevent too much of a position building up.
2012-02-07 16:53 ` Mike Snitzer
2012-02-07 23:00 ` Mike Snitzer
@ 2012-02-10 14:48 ` Joe Thornber
2012-02-10 14:55 ` Mike Snitzer
1 sibling, 1 reply; 20+ messages in thread
From: Joe Thornber @ 2012-02-10 14:48 UTC (permalink / raw)
To: Mike Snitzer; +Cc: device-mapper development, Joe Thornber
On Tue, Feb 07, 2012 at 11:53:25AM -0500, Mike Snitzer wrote:
> Shouldn't this be:
>
> if (bio_list_empty(&bios) || !need_commit_due_to_time(pool))
> return;
>
> ?
I don't think so. It's saying 'if <there's no reason to commit>'. The reasons to commit are bios being non-empty or need_commit_due_to_time.
reason to commit = !bio_list_empty(&bios) || need_commit_due_to_time(pool)
!reason_to_commit = !(!bio_list_empty(&bios) || need_commit_due_to_time(pool))
!reason_to_commit = (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
Agree?
> Also, should we make the commit interval tunable (akin to ext[34]'s
> 'commit' mount option)? Or did you defer doing so until it proves
> worthwhile?
y, tunable is on the list. Every second will do for now though.
- Joe
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 07/11] [dm-thin] Commit every second to prevent too much of a position building up.
2012-02-10 14:48 ` Joe Thornber
@ 2012-02-10 14:55 ` Mike Snitzer
0 siblings, 0 replies; 20+ messages in thread
From: Mike Snitzer @ 2012-02-10 14:55 UTC (permalink / raw)
To: device-mapper development, Joe Thornber
On Fri, Feb 10 2012 at 9:48am -0500,
Joe Thornber <thornber@redhat.com> wrote:
> On Tue, Feb 07, 2012 at 11:53:25AM -0500, Mike Snitzer wrote:
> > Shouldn't this be:
> >
> > if (bio_list_empty(&bios) || !need_commit_due_to_time(pool))
> > return;
> >
> > ?
>
> I don't think so. It's saying 'if <there's no reason to commit>'. The reasons to commit are bios being non-empty or need_commit_due_to_time.
>
> reason to commit = !bio_list_empty(&bios) || need_commit_due_to_time(pool)
> !reason_to_commit = !(!bio_list_empty(&bios) || need_commit_due_to_time(pool))
> !reason_to_commit = (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
>
> Agree?
Yeap, I followed up saying something like nevermind.
> > Also, should we make the commit interval tunable (akin to ext[34]'s
> > 'commit' mount option)? Or did you defer doing so until it proves
> > worthwhile?
>
> y, tunable is on the list. Every second will do for now though.
OK.
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 08/11] [dm-thin] Add support for external origins.
2012-02-02 16:39 [PATCH 00/11] Latest dm-thin patches Joe Thornber
` (6 preceding siblings ...)
2012-02-02 16:39 ` [PATCH 07/11] [dm-thin] Commit every second to prevent too much of a position building up Joe Thornber
@ 2012-02-02 16:39 ` Joe Thornber
2012-02-02 16:39 ` [PATCH 09/11] [dm-thin] Discard support part 1 Joe Thornber
` (2 subsequent siblings)
10 siblings, 0 replies; 20+ messages in thread
From: Joe Thornber @ 2012-02-02 16:39 UTC (permalink / raw)
To: dm-devel; +Cc: Joe Thornber
---
Documentation/device-mapper/thin-provisioning.txt | 38 ++++++++++-
drivers/md/dm-thin.c | 81 +++++++++++++++++----
2 files changed, 105 insertions(+), 14 deletions(-)
diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt
index 801d9d1..60fc5cf 100644
--- a/Documentation/device-mapper/thin-provisioning.txt
+++ b/Documentation/device-mapper/thin-provisioning.txt
@@ -167,6 +167,38 @@ ii) Using an internal snapshot.
dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 1"
+External snapshots
+------------------
+
+You can use an external, _read only_, device as an origin for a thin
+device. Any read to an unprovisioned area of the thin device will be
+passed through to the origin. Writes trigger allocation of new blocks
+as usual.
+
+One possible use case for this would be VM hosts who want to run
+guests on thinp volumes, but have the base image on another device
+(possibly shared between many VMs).
+
+You must not write to the origin device if you use this technique! Of
+course you can write to the thin device, and take internal snapshots
+of the thin.
+
+i) Creating an external snapshot
+
+ Same as creating a thin device. You don't need to mention the
+ origin at this stage.
+
+ dmsetup message /dev/mapper/pool 0 "create_thin 0"
+
+ii) Using an external snapshot.
+
+ Add an extra parameter to the thin target specifying the origin:
+
+ dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 0 /dev/image"
+
+ All descendants (internal snapshots) of an external snapshot will
+ need the extra origin argument.
+
Deactivation
------------
@@ -262,7 +294,7 @@ iii) Messages
i) Constructor
- thin <pool dev> <dev id>
+ thin <pool dev> <dev id> [external origin id]
pool dev:
the thin-pool device, e.g. /dev/mapper/my_pool or 253:0
@@ -271,6 +303,10 @@ i) Constructor
the internal device identifier of the device to be
activated.
+ external origin dev:
+ a block device; reads to unprovisioned areas of the thin target
+ will be mapped to here.
+
The pool doesn't store any size against the thin devices. If you
load a thin target that is smaller than you've been using previously,
then you'll have no access to blocks mapped beyond the end. If you
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 19de11a..922ebf2 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -537,6 +537,7 @@ struct pool_c {
*/
struct thin_c {
struct dm_dev *pool_dev;
+ struct dm_dev *origin_dev;
dm_thin_id dev_id;
struct pool *pool;
@@ -654,14 +655,16 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
(bio->bi_sector & pool->offset_mask);
}
-static void remap_and_issue(struct thin_c *tc, struct bio *bio,
- dm_block_t block)
+static void remap_to_origin(struct thin_c *tc, struct bio *bio)
+{
+ bio->bi_bdev = tc->origin_dev->bdev;
+}
+
+static void issue(struct thin_c *tc, struct bio *bio)
{
struct pool *pool = tc->pool;
unsigned long flags;
- remap(tc, bio, block);
-
/*
* Batch together any FUA/FLUSH bios we find and then issue
* a single commit for them in process_deferred_bios().
@@ -676,6 +679,19 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio,
}
}
+static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
+{
+ remap_to_origin(tc, bio);
+ issue(tc, bio);
+}
+
+static void remap_and_issue(struct thin_c *tc, struct bio *bio,
+ dm_block_t block)
+{
+ remap(tc, bio, block);
+ issue(tc, bio);
+}
+
/*
* wake_worker() is used when new work is queued and when pool_resume is
* ready to continue deferred IO processing.
@@ -927,7 +943,8 @@ static struct new_mapping *get_next_mapping(struct pool *pool)
}
static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
- dm_block_t data_origin, dm_block_t data_dest,
+ struct dm_dev *origin, dm_block_t data_origin,
+ dm_block_t data_dest,
struct cell *cell, struct bio *bio)
{
int r;
@@ -959,7 +976,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
} else {
struct dm_io_region from, to;
- from.bdev = tc->pool_dev->bdev;
+ from.bdev = origin->bdev;
from.sector = data_origin * pool->sectors_per_block;
from.count = pool->sectors_per_block;
@@ -977,6 +994,22 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
}
}
+static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
+ dm_block_t data_origin, dm_block_t data_dest,
+ struct cell *cell, struct bio *bio)
+{
+ schedule_copy(tc, virt_block, tc->pool_dev,
+ data_origin, data_dest, cell, bio);
+}
+
+static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
+ dm_block_t data_dest,
+ struct cell *cell, struct bio *bio)
+{
+ schedule_copy(tc, virt_block, tc->origin_dev,
+ virt_block, data_dest, cell, bio);
+}
+
static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
dm_block_t data_block, struct cell *cell,
struct bio *bio)
@@ -1123,8 +1156,8 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
r = alloc_data_block(tc, &data_block);
switch (r) {
case 0:
- schedule_copy(tc, block, lookup_result->block,
- data_block, cell, bio);
+ schedule_internal_copy(tc, block, lookup_result->block,
+ data_block, cell, bio);
break;
case -ENOSPC:
@@ -1198,7 +1231,10 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
r = alloc_data_block(tc, &data_block);
switch (r) {
case 0:
- schedule_zero(tc, block, data_block, cell, bio);
+ if (tc->origin_dev)
+ schedule_external_copy(tc, block, data_block, cell, bio);
+ else
+ schedule_zero(tc, block, data_block, cell, bio);
break;
case -ENOSPC:
@@ -1249,7 +1285,11 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
break;
case -ENODATA:
- provision_block(tc, bio, block, cell);
+ if (bio_data_dir(bio) == READ && tc->origin_dev) {
+ cell_release_singleton(cell, bio);
+ remap_to_origin_and_issue(tc, bio);
+ } else
+ provision_block(tc, bio, block, cell);
break;
default:
@@ -2235,6 +2275,8 @@ static void thin_dtr(struct dm_target *ti)
__pool_dec(tc->pool);
dm_pool_close_thin_device(tc->td);
dm_put_device(ti, tc->pool_dev);
+ if (tc->origin_dev)
+ dm_put_device(ti, tc->origin_dev);
kfree(tc);
mutex_unlock(&dm_thin_pool_table.mutex);
@@ -2243,21 +2285,22 @@ static void thin_dtr(struct dm_target *ti)
/*
* Thin target parameters:
*
- * <pool_dev> <dev_id>
+ * <pool_dev> <dev_id> [origin_dev]
*
* pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
* dev_id: the internal device identifier
+ * origin_dev: a device external to the pool that should act as the origin
*/
static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
int r;
struct thin_c *tc;
- struct dm_dev *pool_dev;
+ struct dm_dev *pool_dev, *origin_dev;
struct mapped_device *pool_md;
mutex_lock(&dm_thin_pool_table.mutex);
- if (argc != 2) {
+ if (argc != 2 && argc != 3) {
ti->error = "Invalid argument count";
r = -EINVAL;
goto out_unlock;
@@ -2270,6 +2313,15 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto out_unlock;
}
+ if (argc == 3) {
+ r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
+ if (r) {
+ ti->error = "Error opening origin device";
+ goto bad_origin_dev;
+ }
+ tc->origin_dev = origin_dev;
+ }
+
r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
if (r) {
ti->error = "Error opening pool device";
@@ -2322,6 +2374,9 @@ bad_pool_lookup:
bad_common:
dm_put_device(ti, tc->pool_dev);
bad_pool_dev:
+ if (tc->origin_dev)
+ dm_put_device(ti, tc->origin_dev);
+bad_origin_dev:
kfree(tc);
out_unlock:
mutex_unlock(&dm_thin_pool_table.mutex);
--
1.7.5.4
^ permalink raw reply related [flat|nested] 20+ messages in thread* [PATCH 09/11] [dm-thin] Discard support part 1
2012-02-02 16:39 [PATCH 00/11] Latest dm-thin patches Joe Thornber
` (7 preceding siblings ...)
2012-02-02 16:39 ` [PATCH 08/11] [dm-thin] Add support for external origins Joe Thornber
@ 2012-02-02 16:39 ` Joe Thornber
2012-02-02 16:39 ` [PATCH 10/11] [dm-thin] Add support for REQ_DISCARD Joe Thornber
2012-02-02 16:39 ` [PATCH 11/11] [dm-thin] some tidy ups of the __open_device() error path (Mike Snitzer) Joe Thornber
10 siblings, 0 replies; 20+ messages in thread
From: Joe Thornber @ 2012-02-02 16:39 UTC (permalink / raw)
To: dm-devel; +Cc: Joe Thornber
This patch contains a lot of the ground work needed for supporting
discard.
- The thin target now has an endio function, that replaces
shared_read_endio.
- An explicit 'quiesced' flag has been introduced into the new_mapping
structure. Before, this was implicitly indicated by m->list being
empty.
- The map_info->ptr remains constant for the duration of a bio's trip
through thinp. Making it easier to reason about it.
---
drivers/md/dm-thin.c | 125 ++++++++++++++++++++++++++++----------------------
1 files changed, 70 insertions(+), 55 deletions(-)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 922ebf2..c5e3102 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -511,7 +511,7 @@ struct pool {
struct bio_list retry_on_resume_list;
- struct deferred_set ds; /* FIXME: move to thin_c */
+ struct deferred_set shared_read_ds;
struct new_mapping *next_mapping;
mempool_t *mapping_pool;
@@ -606,6 +606,12 @@ static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev
/*----------------------------------------------------------------*/
+struct endio_hook {
+ struct thin_c *tc;
+ struct deferred_entry *shared_read_entry;
+ struct new_mapping *overwrite_mapping;
+};
+
static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
{
struct bio *bio;
@@ -616,7 +622,8 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
bio_list_init(master);
while ((bio = bio_list_pop(&bios))) {
- if (dm_get_mapinfo(bio)->ptr == tc)
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ if (h->tc == tc)
bio_endio(bio, DM_ENDIO_REQUEUE);
else
bio_list_add(master, bio);
@@ -706,16 +713,11 @@ static void wake_worker(struct pool *pool)
/*
* Bio endio functions.
*/
-struct endio_hook {
- struct thin_c *tc;
- bio_end_io_t *saved_bi_end_io;
- struct deferred_entry *entry;
-};
-
struct new_mapping {
struct list_head list;
- int prepared;
+ unsigned quiesced:1;
+ unsigned prepared:1;
struct thin_c *tc;
dm_block_t virt_block;
@@ -737,7 +739,7 @@ static void __maybe_add_mapping(struct new_mapping *m)
{
struct pool *pool = m->tc->pool;
- if (list_empty(&m->list) && m->prepared) {
+ if (m->quiesced && m->prepared) {
list_add(&m->list, &pool->prepared_mappings);
wake_worker(pool);
}
@@ -760,7 +762,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
static void overwrite_endio(struct bio *bio, int err)
{
unsigned long flags;
- struct new_mapping *m = dm_get_mapinfo(bio)->ptr;
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ struct new_mapping *m = h->overwrite_mapping;
struct pool *pool = m->tc->pool;
m->err = err;
@@ -771,31 +774,6 @@ static void overwrite_endio(struct bio *bio, int err)
spin_unlock_irqrestore(&pool->lock, flags);
}
-static void shared_read_endio(struct bio *bio, int err)
-{
- struct list_head mappings;
- struct new_mapping *m, *tmp;
- struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
- unsigned long flags;
- struct pool *pool = h->tc->pool;
-
- bio->bi_end_io = h->saved_bi_end_io;
- bio_endio(bio, err);
-
- INIT_LIST_HEAD(&mappings);
- ds_dec(h->entry, &mappings);
-
- spin_lock_irqsave(&pool->lock, flags);
- list_for_each_entry_safe(m, tmp, &mappings, list) {
- list_del(&m->list);
- INIT_LIST_HEAD(&m->list);
- __maybe_add_mapping(m);
- }
- spin_unlock_irqrestore(&pool->lock, flags);
-
- mempool_free(h, pool->endio_hook_pool);
-}
-
/*----------------------------------------------------------------*/
/*
@@ -934,9 +912,7 @@ static int ensure_next_mapping(struct pool *pool)
static struct new_mapping *get_next_mapping(struct pool *pool)
{
struct new_mapping *r = pool->next_mapping;
-
BUG_ON(!pool->next_mapping);
-
pool->next_mapping = NULL;
return r;
@@ -952,6 +928,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
struct new_mapping *m = get_next_mapping(pool);
INIT_LIST_HEAD(&m->list);
+ m->quiesced = 0;
m->prepared = 0;
m->tc = tc;
m->virt_block = virt_block;
@@ -960,7 +937,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
m->err = 0;
m->bio = NULL;
- ds_add_work(&pool->ds, &m->list);
+ if (!ds_add_work(&pool->shared_read_ds, &m->list))
+ m->quiesced = 1;
/*
* IO to pool_dev remaps to the pool target's data_dev.
@@ -969,9 +947,10 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
* bio immediately. Otherwise we use kcopyd to clone the data first.
*/
if (io_overwrites_block(pool, bio)) {
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ h->overwrite_mapping = m;
m->bio = bio;
save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
- dm_get_mapinfo(bio)->ptr = m;
remap_and_issue(tc, bio, data_dest);
} else {
struct dm_io_region from, to;
@@ -1018,6 +997,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
struct new_mapping *m = get_next_mapping(pool);
INIT_LIST_HEAD(&m->list);
+ m->quiesced = 1;
m->prepared = 0;
m->tc = tc;
m->virt_block = virt_block;
@@ -1035,9 +1015,10 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
process_prepared_mapping(m);
else if (io_overwrites_block(pool, bio)) {
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ h->overwrite_mapping = m;
m->bio = bio;
save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
- dm_get_mapinfo(bio)->ptr = m;
remap_and_issue(tc, bio, data_block);
} else {
@@ -1124,7 +1105,8 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
*/
static void retry_on_resume(struct bio *bio)
{
- struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ struct thin_c *tc = h->tc;
struct pool *pool = tc->pool;
unsigned long flags;
@@ -1190,13 +1172,9 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
if (bio_data_dir(bio) == WRITE)
break_sharing(tc, bio, block, &key, lookup_result, cell);
else {
- struct endio_hook *h;
- h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
- h->tc = tc;
- h->entry = ds_inc(&pool->ds);
- save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio);
- dm_get_mapinfo(bio)->ptr = h;
+ h->shared_read_entry = ds_inc(&pool->shared_read_ds);
cell_release_singleton(cell, bio);
remap_and_issue(tc, bio, lookup_result->block);
@@ -1320,7 +1298,9 @@ static void process_deferred_bios(struct pool *pool)
spin_unlock_irqrestore(&pool->lock, flags);
while ((bio = bio_list_pop(&bios))) {
- struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ struct thin_c *tc = h->tc;
+
/*
* If we've got no free new_mapping structs, and processing
* this bio might require one, we pause until there are some
@@ -1405,6 +1385,18 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
wake_worker(pool);
}
+static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
+{
+ struct pool *pool = tc->pool;
+ struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
+
+ h->tc = tc;
+ h->shared_read_entry = NULL;
+ h->overwrite_mapping = NULL;
+
+ return h;
+}
+
/*
* Non-blocking function called from the thin target's map function.
*/
@@ -1417,11 +1409,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
struct dm_thin_device *td = tc->td;
struct dm_thin_lookup_result result;
- /*
- * Save the thin context for easy access from the deferred bio later.
- */
- map_context->ptr = tc;
-
+ map_context->ptr = thin_hook_bio(tc, bio);
if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
thin_defer_bio(tc, bio);
return DM_MAPIO_SUBMITTED;
@@ -1601,7 +1589,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
pool->low_water_triggered = 0;
pool->no_free_space = 0;
bio_list_init(&pool->retry_on_resume_list);
- ds_init(&pool->ds);
+ ds_init(&pool->shared_read_ds);
pool->next_mapping = NULL;
pool->mapping_pool =
@@ -2392,6 +2380,32 @@ static int thin_map(struct dm_target *ti, struct bio *bio,
return thin_bio_map(ti, bio, map_context);
}
+static int thin_endio(struct dm_target *ti,
+ struct bio *bio, int err,
+ union map_info *map_context)
+{
+ unsigned long flags;
+ struct endio_hook *h = map_context->ptr;
+ struct list_head work;
+ struct new_mapping *m, *tmp;
+ struct pool *pool = h->tc->pool;
+
+ if (h->shared_read_entry) {
+ INIT_LIST_HEAD(&work);
+ ds_dec(h->shared_read_entry, &work);
+
+ spin_lock_irqsave(&pool->lock, flags);
+ list_for_each_entry_safe(m, tmp, &work, list) {
+ list_del(&m->list);
+ m->quiesced = 1;
+ __maybe_add_mapping(m);
+ }
+ spin_unlock_irqrestore(&pool->lock, flags);
+ }
+
+ return 0;
+}
+
static void thin_postsuspend(struct dm_target *ti)
{
if (dm_noflush_suspending(ti))
@@ -2477,6 +2491,7 @@ static struct target_type thin_target = {
.ctr = thin_ctr,
.dtr = thin_dtr,
.map = thin_map,
+ .end_io = thin_endio,
.postsuspend = thin_postsuspend,
.status = thin_status,
.iterate_devices = thin_iterate_devices,
--
1.7.5.4
^ permalink raw reply related [flat|nested] 20+ messages in thread* [PATCH 10/11] [dm-thin] Add support for REQ_DISCARD
2012-02-02 16:39 [PATCH 00/11] Latest dm-thin patches Joe Thornber
` (8 preceding siblings ...)
2012-02-02 16:39 ` [PATCH 09/11] [dm-thin] Discard support part 1 Joe Thornber
@ 2012-02-02 16:39 ` Joe Thornber
2012-02-10 18:08 ` [PATCH 12/12 v2] dm thin: add discard support Mike Snitzer
2012-02-02 16:39 ` [PATCH 11/11] [dm-thin] some tidy ups of the __open_device() error path (Mike Snitzer) Joe Thornber
10 siblings, 1 reply; 20+ messages in thread
From: Joe Thornber @ 2012-02-02 16:39 UTC (permalink / raw)
To: dm-devel; +Cc: Joe Thornber
---
drivers/md/dm-thin.c | 173 ++++++++++++++++++++++++++++++++++++++++++++++----
1 files changed, 159 insertions(+), 14 deletions(-)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index c5e3102..304a934 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -508,10 +508,12 @@ struct pool {
struct bio_list deferred_bios;
struct bio_list deferred_flush_bios;
struct list_head prepared_mappings;
+ struct list_head prepared_discards;
struct bio_list retry_on_resume_list;
struct deferred_set shared_read_ds;
+ struct deferred_set all_io_ds;
struct new_mapping *next_mapping;
mempool_t *mapping_pool;
@@ -609,6 +611,7 @@ static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev
struct endio_hook {
struct thin_c *tc;
struct deferred_entry *shared_read_entry;
+ struct deferred_entry *all_io_entry;
struct new_mapping *overwrite_mapping;
};
@@ -718,11 +721,12 @@ struct new_mapping {
unsigned quiesced:1;
unsigned prepared:1;
+ unsigned pass_discard:1;
struct thin_c *tc;
dm_block_t virt_block;
dm_block_t data_block;
- struct cell *cell;
+ struct cell *cell, *cell2;
int err;
/*
@@ -867,7 +871,30 @@ static void process_prepared_mapping(struct new_mapping *m)
mempool_free(m, tc->pool->mapping_pool);
}
-static void process_prepared_mappings(struct pool *pool)
+static void process_prepared_discard(struct new_mapping *m)
+{
+ int r;
+ struct thin_c *tc = m->tc;
+
+ r = dm_thin_remove_block(tc->td, m->virt_block);
+ if (r)
+ DMERR("dm_thin_metadata_remove() failed");
+
+ /*
+ * Pass the discard down to the underlying device?
+ */
+ if (m->pass_discard)
+ remap_and_issue(tc, m->bio, m->data_block);
+ else
+ bio_endio(m->bio, 0);
+
+ cell_defer_except(tc, m->cell, m->bio);
+ cell_defer_except(tc, m->cell2, m->bio);
+ mempool_free(m, tc->pool->mapping_pool);
+}
+
+static void process_prepared(struct pool *pool, struct list_head *head,
+ void (*fn)(struct new_mapping *))
{
unsigned long flags;
struct list_head maps;
@@ -875,21 +902,27 @@ static void process_prepared_mappings(struct pool *pool)
INIT_LIST_HEAD(&maps);
spin_lock_irqsave(&pool->lock, flags);
- list_splice_init(&pool->prepared_mappings, &maps);
+ list_splice_init(head, &maps);
spin_unlock_irqrestore(&pool->lock, flags);
list_for_each_entry_safe(m, tmp, &maps, list)
- process_prepared_mapping(m);
+ fn(m);
}
/*
* Deferred bio jobs.
*/
-static int io_overwrites_block(struct pool *pool, struct bio *bio)
+static int io_overlaps_block(struct pool *pool, struct bio *bio)
{
- return ((bio_data_dir(bio) == WRITE) &&
- !(bio->bi_sector & pool->offset_mask)) &&
+ return !(bio->bi_sector & pool->offset_mask) &&
(bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
+
+}
+
+static int io_overwrites_block(struct pool *pool, struct bio *bio)
+{
+ return (bio_data_dir(bio) == WRITE) &&
+ io_overlaps_block(pool, bio);
}
static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
@@ -1127,6 +1160,86 @@ static void no_space(struct cell *cell)
retry_on_resume(bio);
}
+static void process_discard(struct thin_c *tc, struct bio *bio)
+{
+ int r;
+ struct pool *pool = tc->pool;
+ struct cell *cell, *cell2;
+ struct cell_key key, key2;
+ dm_block_t block = get_bio_block(tc, bio);
+ struct dm_thin_lookup_result lookup_result;
+ struct new_mapping *m;
+
+ build_virtual_key(tc->td, block, &key);
+ if (bio_detain(tc->pool->prison, &key, bio, &cell))
+ return;
+
+ r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
+ switch (r) {
+ case 0:
+ /*
+ * Check nobody is fiddling with this pool block. This can
+ * happen if someone's in the process of breaking sharing
+ * on this block.
+ */
+ build_data_key(tc->td, lookup_result.block, &key2);
+ if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
+ cell_release_singleton(cell, bio);
+ break;
+ }
+
+ if (io_overlaps_block(pool, bio)) {
+ /*
+ * IO may still be going to the destination block. We must
+ * quiesce before we can do the removal.
+ */
+ m = get_next_mapping(pool);
+ m->tc = tc;
+ m->pass_discard = !lookup_result.shared;
+ m->virt_block = block;
+ m->data_block = lookup_result.block;
+ m->cell = cell;
+ m->cell2 = cell2;
+ m->err = 0;
+ m->bio = bio;
+
+ if (!ds_add_work(&pool->all_io_ds, &m->list)) {
+ list_add(&m->list, &pool->prepared_discards);
+ wake_worker(pool);
+ }
+ } else {
+ /*
+ * This path is hit if people are ignoring
+ * limits->discard_granularity. It ignores any
+ * part of the discard that is in a subsequent
+ * block.
+ */
+ sector_t offset = bio->bi_sector - (block << pool->block_shift);
+ unsigned remaining = (pool->sectors_per_block - offset) << 9;
+ bio->bi_size = min(bio->bi_size, remaining);
+
+ cell_release_singleton(cell, bio);
+ cell_release_singleton(cell2, bio);
+ remap_and_issue(tc, bio, lookup_result.block);
+ }
+ break;
+
+ case -ENODATA:
+ /*
+ * It isn't provisioned, just forget it.
+ */
+ cell_release_singleton(cell, bio);
+ bio_endio(bio, 0);
+ break;
+
+ default:
+ DMERR("discard: find block unexpectedly returned %d\n", r);
+ cell_release_singleton(cell, bio);
+ bio_io_error(bio);
+ break;
+ }
+}
+
static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
struct cell_key *key,
struct dm_thin_lookup_result *lookup_result,
@@ -1272,6 +1385,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
default:
DMERR("dm_thin_find_block() failed, error = %d", r);
+ cell_release_singleton(cell, bio);
bio_io_error(bio);
break;
}
@@ -1313,7 +1427,11 @@ static void process_deferred_bios(struct pool *pool)
break;
}
- process_bio(tc, bio);
+
+ if (bio->bi_rw & REQ_DISCARD)
+ process_discard(tc, bio);
+ else
+ process_bio(tc, bio);
}
/*
@@ -1349,7 +1467,8 @@ static void do_worker(struct work_struct *ws)
{
struct pool *pool = container_of(ws, struct pool, worker);
- process_prepared_mappings(pool);
+ process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping);
+ process_prepared(pool, &pool->prepared_discards, process_prepared_discard);
process_deferred_bios(pool);
}
@@ -1392,6 +1511,7 @@ static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
h->tc = tc;
h->shared_read_entry = NULL;
+ h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds);
h->overwrite_mapping = NULL;
return h;
@@ -1410,7 +1530,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
struct dm_thin_lookup_result result;
map_context->ptr = thin_hook_bio(tc, bio);
- if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
+ if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
thin_defer_bio(tc, bio);
return DM_MAPIO_SUBMITTED;
}
@@ -1586,10 +1706,12 @@ static struct pool *pool_create(struct mapped_device *pool_md,
bio_list_init(&pool->deferred_bios);
bio_list_init(&pool->deferred_flush_bios);
INIT_LIST_HEAD(&pool->prepared_mappings);
+ INIT_LIST_HEAD(&pool->prepared_discards);
pool->low_water_triggered = 0;
pool->no_free_space = 0;
bio_list_init(&pool->retry_on_resume_list);
ds_init(&pool->shared_read_ds);
+ ds_init(&pool->all_io_ds);
pool->next_mapping = NULL;
pool->mapping_pool =
@@ -1830,7 +1952,8 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
pt->low_water_blocks = low_water_blocks;
pt->zero_new_blocks = pf.zero_new_blocks;
ti->num_flush_requests = 1;
- ti->num_discard_requests = 0;
+ ti->num_discard_requests = 1;
+ ti->discards_supported = 1;
ti->private = pt;
pt->callbacks.congested_fn = pool_is_congested;
@@ -2223,6 +2346,17 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}
+static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
+{
+ limits->max_discard_sectors = pool->sectors_per_block;
+
+ /*
+ * This is just a hint, and not enforced. We have to cope with
+ * bios that overlap 2 blocks.
+ */
+ limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
+}
+
static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct pool_c *pt = ti->private;
@@ -2230,6 +2364,7 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
blk_limits_io_min(limits, 0);
blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
+ set_discard_limits(pool, limits);
}
static struct target_type pool_target = {
@@ -2346,8 +2481,8 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->split_io = tc->pool->sectors_per_block;
ti->num_flush_requests = 1;
- ti->num_discard_requests = 0;
- ti->discards_supported = 0;
+ ti->num_discard_requests = 1;
+ ti->discards_supported = 1;
dm_put(pool_md);
@@ -2403,6 +2538,14 @@ static int thin_endio(struct dm_target *ti,
spin_unlock_irqrestore(&pool->lock, flags);
}
+ if (h->all_io_entry) {
+ INIT_LIST_HEAD(&work);
+ ds_dec(h->all_io_entry, &work);
+ list_for_each_entry_safe(m, tmp, &work, list)
+ list_add(&m->list, &pool->prepared_discards);
+ }
+
+ mempool_free(h, pool->endio_hook_pool);
return 0;
}
@@ -2479,9 +2622,11 @@ static int thin_iterate_devices(struct dm_target *ti,
static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct thin_c *tc = ti->private;
+ struct pool *pool = tc->pool;
blk_limits_io_min(limits, 0);
- blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT);
+ blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
+ set_discard_limits(pool, limits);
}
static struct target_type thin_target = {
--
1.7.5.4
^ permalink raw reply related [flat|nested] 20+ messages in thread* [PATCH 12/12 v2] dm thin: add discard support
2012-02-02 16:39 ` [PATCH 10/11] [dm-thin] Add support for REQ_DISCARD Joe Thornber
@ 2012-02-10 18:08 ` Mike Snitzer
0 siblings, 0 replies; 20+ messages in thread
From: Mike Snitzer @ 2012-02-10 18:08 UTC (permalink / raw)
To: dm-devel; +Cc: Joe Thornber
From: Joe Thornber <ejt@redhat.com>
FIXME - add descriptive header
Signed-off-by: Joe Thornber <ejt@redhat.com>
---
drivers/md/dm-thin.c | 173 ++++++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 159 insertions(+), 14 deletions(-)
Index: linux-2.6/drivers/md/dm-thin.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-thin.c
+++ linux-2.6/drivers/md/dm-thin.c
@@ -528,10 +528,12 @@ struct pool {
struct bio_list deferred_bios;
struct bio_list deferred_flush_bios;
struct list_head prepared_mappings;
+ struct list_head prepared_discards;
struct bio_list retry_on_resume_list;
struct deferred_set shared_read_ds;
+ struct deferred_set all_io_ds;
struct new_mapping *next_mapping;
mempool_t *mapping_pool;
@@ -629,6 +631,7 @@ static struct pool *__pool_table_lookup_
struct endio_hook {
struct thin_c *tc;
struct deferred_entry *shared_read_entry;
+ struct deferred_entry *all_io_entry;
struct new_mapping *overwrite_mapping;
};
@@ -738,11 +741,12 @@ struct new_mapping {
unsigned quiesced:1;
unsigned prepared:1;
+ unsigned pass_discard:1;
struct thin_c *tc;
dm_block_t virt_block;
dm_block_t data_block;
- struct cell *cell;
+ struct cell *cell, *cell2;
int err;
/*
@@ -882,7 +886,30 @@ static void process_prepared_mapping(str
mempool_free(m, tc->pool->mapping_pool);
}
-static void process_prepared_mappings(struct pool *pool)
+static void process_prepared_discard(struct new_mapping *m)
+{
+ int r;
+ struct thin_c *tc = m->tc;
+
+ r = dm_thin_remove_block(tc->td, m->virt_block);
+ if (r)
+ DMERR("dm_thin_metadata_remove() failed");
+
+ /*
+ * Pass the discard down to the underlying device?
+ */
+ if (m->pass_discard)
+ remap_and_issue(tc, m->bio, m->data_block);
+ else
+ bio_endio(m->bio, 0);
+
+ cell_defer_except(tc, m->cell);
+ cell_defer_except(tc, m->cell2);
+ mempool_free(m, tc->pool->mapping_pool);
+}
+
+static void process_prepared(struct pool *pool, struct list_head *head,
+ void (*fn)(struct new_mapping *))
{
unsigned long flags;
struct list_head maps;
@@ -890,21 +917,27 @@ static void process_prepared_mappings(st
INIT_LIST_HEAD(&maps);
spin_lock_irqsave(&pool->lock, flags);
- list_splice_init(&pool->prepared_mappings, &maps);
+ list_splice_init(head, &maps);
spin_unlock_irqrestore(&pool->lock, flags);
list_for_each_entry_safe(m, tmp, &maps, list)
- process_prepared_mapping(m);
+ fn(m);
}
/*
* Deferred bio jobs.
*/
-static int io_overwrites_block(struct pool *pool, struct bio *bio)
+static int io_overlaps_block(struct pool *pool, struct bio *bio)
{
- return ((bio_data_dir(bio) == WRITE) &&
- !(bio->bi_sector & pool->offset_mask)) &&
+ return !(bio->bi_sector & pool->offset_mask) &&
(bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
+
+}
+
+static int io_overwrites_block(struct pool *pool, struct bio *bio)
+{
+ return (bio_data_dir(bio) == WRITE) &&
+ io_overlaps_block(pool, bio);
}
static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
@@ -1142,6 +1175,86 @@ static void no_space(struct cell *cell)
retry_on_resume(bio);
}
+static void process_discard(struct thin_c *tc, struct bio *bio)
+{
+ int r;
+ struct pool *pool = tc->pool;
+ struct cell *cell, *cell2;
+ struct cell_key key, key2;
+ dm_block_t block = get_bio_block(tc, bio);
+ struct dm_thin_lookup_result lookup_result;
+ struct new_mapping *m;
+
+ build_virtual_key(tc->td, block, &key);
+ if (bio_detain(tc->pool->prison, &key, bio, &cell))
+ return;
+
+ r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
+ switch (r) {
+ case 0:
+ /*
+ * Check nobody is fiddling with this pool block. This can
+ * happen if someone's in the process of breaking sharing
+ * on this block.
+ */
+ build_data_key(tc->td, lookup_result.block, &key2);
+ if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
+ cell_release_singleton(cell, bio);
+ break;
+ }
+
+ if (io_overlaps_block(pool, bio)) {
+ /*
+ * IO may still be going to the destination block. We must
+ * quiesce before we can do the removal.
+ */
+ m = get_next_mapping(pool);
+ m->tc = tc;
+ m->pass_discard = !lookup_result.shared;
+ m->virt_block = block;
+ m->data_block = lookup_result.block;
+ m->cell = cell;
+ m->cell2 = cell2;
+ m->err = 0;
+ m->bio = bio;
+
+ if (!ds_add_work(&pool->all_io_ds, &m->list)) {
+ list_add(&m->list, &pool->prepared_discards);
+ wake_worker(pool);
+ }
+ } else {
+ /*
+ * This path is hit if people are ignoring
+ * limits->discard_granularity. It ignores any
+ * part of the discard that is in a subsequent
+ * block.
+ */
+ sector_t offset = bio->bi_sector - (block << pool->block_shift);
+ unsigned remaining = (pool->sectors_per_block - offset) << 9;
+ bio->bi_size = min(bio->bi_size, remaining);
+
+ cell_release_singleton(cell, bio);
+ cell_release_singleton(cell2, bio);
+ remap_and_issue(tc, bio, lookup_result.block);
+ }
+ break;
+
+ case -ENODATA:
+ /*
+ * It isn't provisioned, just forget it.
+ */
+ cell_release_singleton(cell, bio);
+ bio_endio(bio, 0);
+ break;
+
+ default:
+ DMERR("discard: find block unexpectedly returned %d\n", r);
+ cell_release_singleton(cell, bio);
+ bio_io_error(bio);
+ break;
+ }
+}
+
static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
struct cell_key *key,
struct dm_thin_lookup_result *lookup_result,
@@ -1287,6 +1400,7 @@ static void process_bio(struct thin_c *t
default:
DMERR("dm_thin_find_block() failed, error = %d", r);
+ cell_release_singleton(cell, bio);
bio_io_error(bio);
break;
}
@@ -1328,7 +1442,11 @@ static void process_deferred_bios(struct
break;
}
- process_bio(tc, bio);
+
+ if (bio->bi_rw & REQ_DISCARD)
+ process_discard(tc, bio);
+ else
+ process_bio(tc, bio);
}
/*
@@ -1362,7 +1480,8 @@ static void do_worker(struct work_struct
{
struct pool *pool = container_of(ws, struct pool, worker);
- process_prepared_mappings(pool);
+ process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping);
+ process_prepared(pool, &pool->prepared_discards, process_prepared_discard);
process_deferred_bios(pool);
}
@@ -1405,6 +1524,7 @@ static struct endio_hook *thin_hook_bio(
h->tc = tc;
h->shared_read_entry = NULL;
+ h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds);
h->overwrite_mapping = NULL;
return h;
@@ -1423,7 +1543,7 @@ static int thin_bio_map(struct dm_target
struct dm_thin_lookup_result result;
map_context->ptr = thin_hook_bio(tc, bio);
- if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
+ if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
thin_defer_bio(tc, bio);
return DM_MAPIO_SUBMITTED;
}
@@ -1599,10 +1719,12 @@ static struct pool *pool_create(struct m
bio_list_init(&pool->deferred_bios);
bio_list_init(&pool->deferred_flush_bios);
INIT_LIST_HEAD(&pool->prepared_mappings);
+ INIT_LIST_HEAD(&pool->prepared_discards);
pool->low_water_triggered = 0;
pool->no_free_space = 0;
bio_list_init(&pool->retry_on_resume_list);
ds_init(&pool->shared_read_ds);
+ ds_init(&pool->all_io_ds);
pool->next_mapping = NULL;
pool->mapping_pool =
@@ -1843,7 +1965,8 @@ static int pool_ctr(struct dm_target *ti
pt->low_water_blocks = low_water_blocks;
pt->zero_new_blocks = pf.zero_new_blocks;
ti->num_flush_requests = 1;
- ti->num_discard_requests = 0;
+ ti->num_discard_requests = 1;
+ ti->discards_supported = 1;
ti->private = pt;
pt->callbacks.congested_fn = pool_is_congested;
@@ -2236,6 +2359,17 @@ static int pool_merge(struct dm_target *
return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}
+static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
+{
+ limits->max_discard_sectors = pool->sectors_per_block;
+
+ /*
+ * This is just a hint, and not enforced. We have to cope with
+ * bios that overlap 2 blocks.
+ */
+ limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
+}
+
static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct pool_c *pt = ti->private;
@@ -2243,6 +2377,7 @@ static void pool_io_hints(struct dm_targ
blk_limits_io_min(limits, 0);
blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
+ set_discard_limits(pool, limits);
}
static struct target_type pool_target = {
@@ -2359,8 +2494,8 @@ static int thin_ctr(struct dm_target *ti
ti->split_io = tc->pool->sectors_per_block;
ti->num_flush_requests = 1;
- ti->num_discard_requests = 0;
- ti->discards_supported = 0;
+ ti->num_discard_requests = 1;
+ ti->discards_supported = 1;
dm_put(pool_md);
@@ -2416,6 +2551,14 @@ static int thin_endio(struct dm_target *
spin_unlock_irqrestore(&pool->lock, flags);
}
+ if (h->all_io_entry) {
+ INIT_LIST_HEAD(&work);
+ ds_dec(h->all_io_entry, &work);
+ list_for_each_entry_safe(m, tmp, &work, list)
+ list_add(&m->list, &pool->prepared_discards);
+ }
+
+ mempool_free(h, pool->endio_hook_pool);
return 0;
}
@@ -2492,9 +2635,11 @@ static int thin_iterate_devices(struct d
static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct thin_c *tc = ti->private;
+ struct pool *pool = tc->pool;
blk_limits_io_min(limits, 0);
- blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT);
+ blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
+ set_discard_limits(pool, limits);
}
static struct target_type thin_target = {
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 11/11] [dm-thin] some tidy ups of the __open_device() error path (Mike Snitzer)
2012-02-02 16:39 [PATCH 00/11] Latest dm-thin patches Joe Thornber
` (9 preceding siblings ...)
2012-02-02 16:39 ` [PATCH 10/11] [dm-thin] Add support for REQ_DISCARD Joe Thornber
@ 2012-02-02 16:39 ` Joe Thornber
10 siblings, 0 replies; 20+ messages in thread
From: Joe Thornber @ 2012-02-02 16:39 UTC (permalink / raw)
To: dm-devel; +Cc: Joe Thornber
---
drivers/md/dm-thin-metadata.c | 22 ++++++++++++++++++----
1 files changed, 18 insertions(+), 4 deletions(-)
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 1aeef4e..0bc3033 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -790,6 +790,11 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
return 0;
}
+/*
+ * __open_device: lookup @td if already open, allocate @td on first open.
+ * on success, return @td with an incremented reference count.
+ * on failure, @td is not initialized.
+ */
static int __open_device(struct dm_pool_metadata *pmd,
dm_thin_id dev, int create,
struct dm_thin_device **td)
@@ -804,6 +809,13 @@ static int __open_device(struct dm_pool_metadata *pmd,
*/
list_for_each_entry(td2, &pmd->thin_devices, list)
if (td2->id == dev) {
+ if (create) {
+ /*
+ * inconsistency if looking to create
+ * an already open device.
+ */
+ return -EINVAL;
+ }
td2->open_count++;
*td = td2;
return 0;
@@ -818,6 +830,9 @@ static int __open_device(struct dm_pool_metadata *pmd,
if (r != -ENODATA || !create)
return r;
+ /*
+ * Open is for device creation.
+ */
changed = 1;
details_le.mapped_blocks = 0;
details_le.transaction_id = cpu_to_le64(pmd->trans_id);
@@ -883,12 +898,10 @@ static int __create_thin(struct dm_pool_metadata *pmd,
r = __open_device(pmd, dev, 1, &td);
if (r) {
- __close_device(td);
dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
dm_btree_del(&pmd->bl_info, dev_root);
return r;
}
- td->changed = 1;
__close_device(td);
return r;
@@ -968,14 +981,15 @@ static int __create_snap(struct dm_pool_metadata *pmd,
goto bad;
r = __set_snapshot_details(pmd, td, origin, pmd->time);
- if (r)
+ if (r) {
+ __close_device(td);
goto bad;
+ }
__close_device(td);
return 0;
bad:
- __close_device(td);
dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
dm_btree_remove(&pmd->details_info, pmd->details_root,
&key, &pmd->details_root);
--
1.7.5.4
^ permalink raw reply related [flat|nested] 20+ messages in thread