* [PATCH 00/13] dm thin: updated patches for 3.4
@ 2012-02-29 14:50 Mike Snitzer
2012-02-29 14:50 ` [PATCH 01/13] dm thin: tidy up the cell_release functions Mike Snitzer
` (12 more replies)
0 siblings, 13 replies; 15+ messages in thread
From: Mike Snitzer @ 2012-02-29 14:50 UTC (permalink / raw)
To: dm-devel; +Cc: snitzer
I've reviewed all but one of these patches quite closely (didn't go
crazy on the held root patch).
I've taken care to make sure the patches include relevant fixes that
were made after Joe's previous submission to dm-devel.
This patchset will bring the upstream dm thinp code up to Joe's
thin-dev commit f1f8676ecdf366.
Heinz Mauelshagen (1):
dm thin: pre-commit in pool_status so it provides accurate free block counts
Joe Thornber (12):
dm thin: tidy up the cell_release functions
dm btree-remove: break up __rebalance3 function
dm btree remove: remove 2 BUG_ONs from __rebalance2 function
dm btree remove: fix bug that allowed the nr of entries in a btree node to drop below 1/3
dm btree remove: fix center node entry redistribution in redistribute3 function
dm space map: remove entries from the ref_count tree if no longer needed
dm thin: don't use the bi_next field for the holder of a cell
dm thin: add support for external origins
dm thin: held root support
dm thin: foundation for discard support
dm thin: add discard support
dm thin: add discard pool features
Documentation/device-mapper/thin-provisioning.txt | 51 ++-
drivers/md/dm-thin-metadata.c | 78 +++-
drivers/md/dm-thin-metadata.h | 1 +
drivers/md/dm-thin.c | 577 ++++++++++++++++-----
drivers/md/persistent-data/dm-btree-remove.c | 174 ++++---
drivers/md/persistent-data/dm-space-map-common.c | 3 -
6 files changed, 675 insertions(+), 209 deletions(-)
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH 01/13] dm thin: tidy up the cell_release functions
2012-02-29 14:50 [PATCH 00/13] dm thin: updated patches for 3.4 Mike Snitzer
@ 2012-02-29 14:50 ` Mike Snitzer
2012-02-29 14:50 ` [PATCH 02/13] dm btree-remove: break up __rebalance3 function Mike Snitzer
` (11 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Mike Snitzer @ 2012-02-29 14:50 UTC (permalink / raw)
To: dm-devel; +Cc: Joe Thornber, snitzer
From: Joe Thornber <ejt@redhat.com>
Clarify what is expected during each cell release by introducing
variants of __cell_release.
Signed-off-by: Joe Thornber <ejt@redhat.com>
Acked-by: Mike Snitzer <snitzer@redhat.com>
---
drivers/md/dm-thin.c | 52 +++++++++++++++++++++++++++++++++----------------
1 files changed, 35 insertions(+), 17 deletions(-)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 920304e..f796afb 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -306,22 +306,45 @@ static void cell_release(struct cell *cell, struct bio_list *bios)
* bio may be in the cell. This function releases the cell, and also does
* a sanity check.
*/
+static void __cell_release_singleton(struct cell *cell, struct bio *bio)
+{
+ hlist_del(&cell->list);
+ BUG_ON(cell->holder != bio);
+ BUG_ON(!bio_list_empty(&cell->bios));
+}
+
static void cell_release_singleton(struct cell *cell, struct bio *bio)
{
- struct bio_prison *prison = cell->prison;
- struct bio_list bios;
- struct bio *b;
unsigned long flags;
-
- bio_list_init(&bios);
+ struct bio_prison *prison = cell->prison;
spin_lock_irqsave(&prison->lock, flags);
- __cell_release(cell, &bios);
+ __cell_release_singleton(cell, bio);
spin_unlock_irqrestore(&prison->lock, flags);
+}
+
+/*
+ * Sometimes we don't want the holder, just the additional bios.
+ */
+static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
+{
+ struct bio_prison *prison = cell->prison;
- b = bio_list_pop(&bios);
- BUG_ON(b != bio);
- BUG_ON(!bio_list_empty(&bios));
+ hlist_del(&cell->list);
+ if (inmates)
+ bio_list_merge(inmates, &cell->bios);
+
+ mempool_free(cell, prison->cell_pool);
+}
+
+static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
+{
+ unsigned long flags;
+ struct bio_prison *prison = cell->prison;
+
+ spin_lock_irqsave(&prison->lock, flags);
+ __cell_release_no_holder(cell, inmates);
+ spin_unlock_irqrestore(&prison->lock, flags);
}
static void cell_error(struct cell *cell)
@@ -803,21 +826,16 @@ static void cell_defer(struct thin_c *tc, struct cell *cell,
* Same as cell_defer above, except it omits one particular detainee,
* a write bio that covers the block and has already been processed.
*/
-static void cell_defer_except(struct thin_c *tc, struct cell *cell,
- struct bio *exception)
+static void cell_defer_except(struct thin_c *tc, struct cell *cell)
{
struct bio_list bios;
- struct bio *bio;
struct pool *pool = tc->pool;
unsigned long flags;
bio_list_init(&bios);
- cell_release(cell, &bios);
spin_lock_irqsave(&pool->lock, flags);
- while ((bio = bio_list_pop(&bios)))
- if (bio != exception)
- bio_list_add(&pool->deferred_bios, bio);
+ cell_release_no_holder(cell, &pool->deferred_bios);
spin_unlock_irqrestore(&pool->lock, flags);
wake_worker(pool);
@@ -857,7 +875,7 @@ static void process_prepared_mapping(struct new_mapping *m)
* the bios in the cell.
*/
if (bio) {
- cell_defer_except(tc, m->cell, bio);
+ cell_defer_except(tc, m->cell);
bio_endio(bio, 0);
} else
cell_defer(tc, m->cell, m->data_block);
--
1.7.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 02/13] dm btree-remove: break up __rebalance3 function
2012-02-29 14:50 [PATCH 00/13] dm thin: updated patches for 3.4 Mike Snitzer
2012-02-29 14:50 ` [PATCH 01/13] dm thin: tidy up the cell_release functions Mike Snitzer
@ 2012-02-29 14:50 ` Mike Snitzer
2012-02-29 14:50 ` [PATCH 03/13] dm btree remove: remove 2 BUG_ONs from __rebalance2 function Mike Snitzer
` (10 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Mike Snitzer @ 2012-02-29 14:50 UTC (permalink / raw)
To: dm-devel; +Cc: Joe Thornber, snitzer
From: Joe Thornber <ejt@redhat.com>
This function was getting too big. Two functions are factored out:
delete_center_node() and redistribute3(). No functional changes.
Signed-off-by: Joe Thornber <ejt@redhat.com>
Acked-by: Mike Snitzer <snitzer@redhat.com>
---
drivers/md/persistent-data/dm-btree-remove.c | 109 +++++++++++++------------
1 files changed, 57 insertions(+), 52 deletions(-)
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index 8576d56..ebd2460 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -272,6 +272,57 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
return exit_child(info, &right);
}
+/*
+ * We dump as many entries from center as possible into left, then the rest
+ * in right, then rebalance2. This wastes some cpu, but I want something
+ * simple atm.
+ */
+static void delete_center_node(struct dm_btree_info *info, struct node *parent,
+ struct child *l, struct child *c, struct child *r,
+ struct node *left, struct node *center, struct node *right,
+ uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
+{
+ uint32_t max_entries = le32_to_cpu(left->header.max_entries);
+ unsigned shift = min(max_entries - nr_left, nr_center);
+
+ BUG_ON(nr_left + shift > max_entries);
+ node_copy(left, center, -shift);
+ left->header.nr_entries = cpu_to_le32(nr_left + shift);
+
+ if (shift != nr_center) {
+ shift = nr_center - shift;
+ BUG_ON((nr_right + shift) >= max_entries);
+ node_shift(right, shift);
+ node_copy(center, right, shift);
+ right->header.nr_entries = cpu_to_le32(nr_right + shift);
+ }
+ *key_ptr(parent, r->index) = right->keys[0];
+
+ delete_at(parent, c->index);
+ r->index--;
+
+ dm_tm_dec(info->tm, dm_block_location(c->block));
+ __rebalance2(info, parent, l, r);
+}
+
+/*
+ * Redistributes entries among 3 sibling nodes.
+ */
+static void redistribute3(struct dm_btree_info *info, struct node *parent,
+ struct child *l, struct child *c, struct child *r,
+ struct node *left, struct node *center, struct node *right,
+ uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
+{
+ uint32_t max_entries = le32_to_cpu(left->header.max_entries);
+ unsigned target = (nr_left + nr_center + nr_right) / 3;
+ BUG_ON(target > max_entries);
+
+ shift(left, center, nr_left - target);
+ shift(center, right, target - nr_right);
+ *key_ptr(parent, c->index) = center->keys[0];
+ *key_ptr(parent, r->index) = right->keys[0];
+}
+
static void __rebalance3(struct dm_btree_info *info, struct node *parent,
struct child *l, struct child *c, struct child *r)
{
@@ -282,62 +333,16 @@ static void __rebalance3(struct dm_btree_info *info, struct node *parent,
uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
- uint32_t max_entries = le32_to_cpu(left->header.max_entries);
-
- unsigned target;
BUG_ON(left->header.max_entries != center->header.max_entries);
BUG_ON(center->header.max_entries != right->header.max_entries);
- if (((nr_left + nr_center + nr_right) / 2) < merge_threshold(center)) {
- /*
- * Delete center node:
- *
- * We dump as many entries from center as possible into
- * left, then the rest in right, then rebalance2. This
- * wastes some cpu, but I want something simple atm.
- */
- unsigned shift = min(max_entries - nr_left, nr_center);
-
- BUG_ON(nr_left + shift > max_entries);
- node_copy(left, center, -shift);
- left->header.nr_entries = cpu_to_le32(nr_left + shift);
-
- if (shift != nr_center) {
- shift = nr_center - shift;
- BUG_ON((nr_right + shift) >= max_entries);
- node_shift(right, shift);
- node_copy(center, right, shift);
- right->header.nr_entries = cpu_to_le32(nr_right + shift);
- }
- *key_ptr(parent, r->index) = right->keys[0];
-
- delete_at(parent, c->index);
- r->index--;
-
- dm_tm_dec(info->tm, dm_block_location(c->block));
- __rebalance2(info, parent, l, r);
-
- return;
- }
-
- /*
- * Rebalance
- */
- target = (nr_left + nr_center + nr_right) / 3;
- BUG_ON(target > max_entries);
-
- /*
- * Adjust the left node
- */
- shift(left, center, nr_left - target);
-
- /*
- * Adjust the right node
- */
- shift(center, right, target - nr_right);
- *key_ptr(parent, c->index) = center->keys[0];
- *key_ptr(parent, r->index) = right->keys[0];
+ if (((nr_left + nr_center + nr_right) / 2) < merge_threshold(center))
+ delete_center_node(info, parent, l, c, r, left, center, right,
+ nr_left, nr_center, nr_right);
+ else
+ redistribute3(info, parent, l, c, r, left, center, right,
+ nr_left, nr_center, nr_right);
}
static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
--
1.7.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 03/13] dm btree remove: remove 2 BUG_ONs from __rebalance2 function
2012-02-29 14:50 [PATCH 00/13] dm thin: updated patches for 3.4 Mike Snitzer
2012-02-29 14:50 ` [PATCH 01/13] dm thin: tidy up the cell_release functions Mike Snitzer
2012-02-29 14:50 ` [PATCH 02/13] dm btree-remove: break up __rebalance3 function Mike Snitzer
@ 2012-02-29 14:50 ` Mike Snitzer
2012-02-29 14:50 ` [PATCH 04/13] dm btree remove: fix bug that allowed the nr of entries in a btree node to drop below 1/3 Mike Snitzer
` (9 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Mike Snitzer @ 2012-02-29 14:50 UTC (permalink / raw)
To: dm-devel; +Cc: Joe Thornber, snitzer
From: Joe Thornber <ejt@redhat.com>
These paranoia checks are no longer needed.
Signed-off-by: Joe Thornber <ejt@redhat.com>
Acked-by: Mike Snitzer <snitzer@redhat.com>
---
drivers/md/persistent-data/dm-btree-remove.c | 3 ---
1 files changed, 0 insertions(+), 3 deletions(-)
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index ebd2460..9480fcc 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -234,9 +234,6 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent,
* Rebalance.
*/
unsigned target_left = (nr_left + nr_right) / 2;
- unsigned shift_ = nr_left - target_left;
- BUG_ON(le32_to_cpu(left->header.max_entries) <= nr_left - shift_);
- BUG_ON(le32_to_cpu(right->header.max_entries) <= nr_right + shift_);
shift(left, right, nr_left - target_left);
*key_ptr(parent, r->index) = right->keys[0];
}
--
1.7.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 04/13] dm btree remove: fix bug that allowed the nr of entries in a btree node to drop below 1/3
2012-02-29 14:50 [PATCH 00/13] dm thin: updated patches for 3.4 Mike Snitzer
` (2 preceding siblings ...)
2012-02-29 14:50 ` [PATCH 03/13] dm btree remove: remove 2 BUG_ONs from __rebalance2 function Mike Snitzer
@ 2012-02-29 14:50 ` Mike Snitzer
2012-02-29 14:50 ` [PATCH 05/13] dm btree remove: fix center node entry redistribution in redistribute3 function Mike Snitzer
` (8 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Mike Snitzer @ 2012-02-29 14:50 UTC (permalink / raw)
To: dm-devel; +Cc: Joe Thornber, snitzer
From: Joe Thornber <ejt@redhat.com>
For performance reasons we try and keep all btree nodes at least 1/3
full.
Doing so requires spotting when we should delete a node and move its
entries to its neighbours. Sometimes this deletion wasn't occurring.
Signed-off-by: Joe Thornber <ejt@redhat.com>
Acked-by: Mike Snitzer <snitzer@redhat.com>
---
drivers/md/persistent-data/dm-btree-remove.c | 23 +++++++----------------
1 files changed, 7 insertions(+), 16 deletions(-)
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index 9480fcc..6c8e9a7 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -128,18 +128,9 @@ static void delete_at(struct node *n, unsigned index)
n->header.nr_entries = cpu_to_le32(nr_entries - 1);
}
-static unsigned del_threshold(struct node *n)
-{
- return le32_to_cpu(n->header.max_entries) / 3;
-}
-
static unsigned merge_threshold(struct node *n)
{
- /*
- * The extra one is because we know we're potentially going to
- * delete an entry.
- */
- return 2 * (le32_to_cpu(n->header.max_entries) / 3) + 1;
+ return le32_to_cpu(n->header.max_entries) / 3;
}
struct child {
@@ -215,8 +206,9 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent,
struct node *right = r->n;
uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
+ unsigned threshold = 2 * merge_threshold(left) + 1;
- if (nr_left + nr_right <= merge_threshold(left)) {
+ if (nr_left + nr_right < threshold) {
/*
* Merge
*/
@@ -288,7 +280,7 @@ static void delete_center_node(struct dm_btree_info *info, struct node *parent,
if (shift != nr_center) {
shift = nr_center - shift;
- BUG_ON((nr_right + shift) >= max_entries);
+ BUG_ON((nr_right + shift) > max_entries);
node_shift(right, shift);
node_copy(center, right, shift);
right->header.nr_entries = cpu_to_le32(nr_right + shift);
@@ -331,10 +323,12 @@ static void __rebalance3(struct dm_btree_info *info, struct node *parent,
uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
+ unsigned threshold = merge_threshold(left) * 4 + 1;
+
BUG_ON(left->header.max_entries != center->header.max_entries);
BUG_ON(center->header.max_entries != right->header.max_entries);
- if (((nr_left + nr_center + nr_right) / 2) < merge_threshold(center))
+ if ((nr_left + nr_center + nr_right) < threshold)
delete_center_node(info, parent, l, c, r, left, center, right,
nr_left, nr_center, nr_right);
else
@@ -443,9 +437,6 @@ static int rebalance_children(struct shadow_spine *s,
if (r)
return r;
- if (child_entries > del_threshold(n))
- return 0;
-
has_left_sibling = i > 0;
has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
--
1.7.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 05/13] dm btree remove: fix center node entry redistribution in redistribute3 function
2012-02-29 14:50 [PATCH 00/13] dm thin: updated patches for 3.4 Mike Snitzer
` (3 preceding siblings ...)
2012-02-29 14:50 ` [PATCH 04/13] dm btree remove: fix bug that allowed the nr of entries in a btree node to drop below 1/3 Mike Snitzer
@ 2012-02-29 14:50 ` Mike Snitzer
2012-02-29 14:50 ` [PATCH 06/13] dm space map: remove entries from the ref_count tree if no longer needed Mike Snitzer
` (7 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Mike Snitzer @ 2012-02-29 14:50 UTC (permalink / raw)
To: dm-devel; +Cc: Joe Thornber, snitzer
From: Joe Thornber <ejt@redhat.com>
The code to redistribute entries among 3 nodes wasn't taking into
consideration the case where the central node has so few entries that
some need to go from left to right, or vice versa.
Signed-off-by: Joe Thornber <ejt@redhat.com>
---
drivers/md/persistent-data/dm-btree-remove.c | 49 +++++++++++++++++++++-----
1 files changed, 40 insertions(+), 9 deletions(-)
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index 6c8e9a7..aa71e23 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -179,6 +179,15 @@ static int exit_child(struct dm_btree_info *info, struct child *c)
static void shift(struct node *left, struct node *right, int count)
{
+ uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
+ uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
+ uint32_t max_entries = le32_to_cpu(left->header.max_entries);
+ uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);
+
+ BUG_ON(max_entries != r_max_entries);
+ BUG_ON(nr_left - count > max_entries);
+ BUG_ON(nr_right + count > max_entries);
+
if (!count)
return;
@@ -190,13 +199,8 @@ static void shift(struct node *left, struct node *right, int count)
node_shift(right, count);
}
- left->header.nr_entries =
- cpu_to_le32(le32_to_cpu(left->header.nr_entries) - count);
- BUG_ON(le32_to_cpu(left->header.nr_entries) > le32_to_cpu(left->header.max_entries));
-
- right->header.nr_entries =
- cpu_to_le32(le32_to_cpu(right->header.nr_entries) + count);
- BUG_ON(le32_to_cpu(right->header.nr_entries) > le32_to_cpu(right->header.max_entries));
+ left->header.nr_entries = cpu_to_le32(nr_left - count);
+ right->header.nr_entries = cpu_to_le32(nr_right + count);
}
static void __rebalance2(struct dm_btree_info *info, struct node *parent,
@@ -302,12 +306,39 @@ static void redistribute3(struct dm_btree_info *info, struct node *parent,
struct node *left, struct node *center, struct node *right,
uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
{
+ int s;
uint32_t max_entries = le32_to_cpu(left->header.max_entries);
unsigned target = (nr_left + nr_center + nr_right) / 3;
BUG_ON(target > max_entries);
- shift(left, center, nr_left - target);
- shift(center, right, target - nr_right);
+ if (nr_left < nr_right) {
+ s = nr_left - target;
+
+ if (s < 0 && nr_center < -s) {
+ /* not enough in central node */
+ shift(left, center, nr_center);
+ s = nr_center - target;
+ shift(left, right, s);
+ nr_right += s;
+ } else
+ shift(left, center, s);
+
+ shift(center, right, target - nr_right);
+
+ } else {
+ s = target - nr_right;
+ if (s > 0 && nr_center < s) {
+ /* not enough in central node */
+ shift(center, right, nr_center);
+ s = target - nr_center;
+ shift(left, right, s);
+ nr_left -= s;
+ } else
+ shift(center, right, s);
+
+ shift(left, center, nr_left - target);
+ }
+
*key_ptr(parent, c->index) = center->keys[0];
*key_ptr(parent, r->index) = right->keys[0];
}
--
1.7.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 06/13] dm space map: remove entries from the ref_count tree if no longer needed
2012-02-29 14:50 [PATCH 00/13] dm thin: updated patches for 3.4 Mike Snitzer
` (4 preceding siblings ...)
2012-02-29 14:50 ` [PATCH 05/13] dm btree remove: fix center node entry redistribution in redistribute3 function Mike Snitzer
@ 2012-02-29 14:50 ` Mike Snitzer
2012-02-29 14:50 ` [PATCH 07/13] dm thin: don't use the bi_next field for the holder of a cell Mike Snitzer
` (6 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Mike Snitzer @ 2012-02-29 14:50 UTC (permalink / raw)
To: dm-devel; +Cc: Joe Thornber, snitzer
From: Joe Thornber <ejt@redhat.com>
Reference counts are stored in two places: a bitmap if the ref_count
is below 3, or a btree of uint32_t if 3 or above.
When a ref_count that was 3 or above drops below 3 we can remove it from
the tree and save some metadata space. This removal was commented out
before because I was unsure why this was causing under populated btree
nodes. An earlier patch fixed this issue.
Signed-off-by: Joe Thornber <ejt@redhat.com>
Acked-by: Mike Snitzer <snitzer@redhat.com>
---
drivers/md/persistent-data/dm-space-map-common.c | 3 ---
1 files changed, 0 insertions(+), 3 deletions(-)
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index df2494c..ff3beed 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -405,8 +405,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
if (r < 0)
return r;
-#if 0
- /* FIXME: dm_btree_remove doesn't handle this yet */
if (old > 2) {
r = dm_btree_remove(&ll->ref_count_info,
ll->ref_count_root,
@@ -414,7 +412,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
if (r)
return r;
}
-#endif
} else {
__le32 le_rc = cpu_to_le32(ref_count);
--
1.7.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 07/13] dm thin: don't use the bi_next field for the holder of a cell
2012-02-29 14:50 [PATCH 00/13] dm thin: updated patches for 3.4 Mike Snitzer
` (5 preceding siblings ...)
2012-02-29 14:50 ` [PATCH 06/13] dm space map: remove entries from the ref_count tree if no longer needed Mike Snitzer
@ 2012-02-29 14:50 ` Mike Snitzer
2012-02-29 14:50 ` [PATCH 08/13] dm thin: add support for external origins Mike Snitzer
` (5 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Mike Snitzer @ 2012-02-29 14:50 UTC (permalink / raw)
To: dm-devel; +Cc: Joe Thornber, snitzer
From: Joe Thornber <ejt@redhat.com>
The holder can be passed down to lower devices which may want to use
bi_next themselves. Also add BUG_ON check to confirm fix.
When releasing bios that have been detained in a cell, fix the order in
which the holder and other cellmates are added to the inmates list:
append holder first followed by the other cellmates.
Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
drivers/md/dm-thin.c | 29 +++++++++++++++++------------
1 files changed, 17 insertions(+), 12 deletions(-)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index f796afb..a148532 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -125,7 +125,7 @@ struct cell {
struct hlist_node list;
struct bio_prison *prison;
struct cell_key key;
- unsigned count;
+ struct bio *holder;
struct bio_list bios;
};
@@ -221,8 +221,7 @@ static struct cell *__search_bucket(struct hlist_head *bucket,
* This may block if a new cell needs allocating. You must ensure that
* cells will be unlocked even if the calling thread is blocked.
*
- * Returns the number of entries in the cell prior to the new addition
- * or < 0 on failure.
+ * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
*/
static int bio_detain(struct bio_prison *prison, struct cell_key *key,
struct bio *inmate, struct cell **ref)
@@ -257,21 +256,25 @@ static int bio_detain(struct bio_prison *prison, struct cell_key *key,
cell->prison = prison;
memcpy(&cell->key, key, sizeof(cell->key));
- cell->count = 0;
+ cell->holder = inmate;
bio_list_init(&cell->bios);
hlist_add_head(&cell->list, prison->cells + hash);
+ r = 0;
+
+ } else {
+ mempool_free(cell2, prison->cell_pool);
+ cell2 = NULL;
+ r = 1;
+ bio_list_add(&cell->bios, inmate);
}
- }
- r = cell->count++;
- bio_list_add(&cell->bios, inmate);
+ } else {
+ r = 1;
+ bio_list_add(&cell->bios, inmate);
+ }
spin_unlock_irqrestore(&prison->lock, flags);
- if (cell2)
- mempool_free(cell2, prison->cell_pool);
-
*ref = cell;
-
return r;
}
@@ -284,8 +287,10 @@ static void __cell_release(struct cell *cell, struct bio_list *inmates)
hlist_del(&cell->list);
- if (inmates)
+ if (inmates) {
+ bio_list_add(inmates, cell->holder);
bio_list_merge(inmates, &cell->bios);
+ }
mempool_free(cell, prison->cell_pool);
}
--
1.7.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 08/13] dm thin: add support for external origins
2012-02-29 14:50 [PATCH 00/13] dm thin: updated patches for 3.4 Mike Snitzer
` (6 preceding siblings ...)
2012-02-29 14:50 ` [PATCH 07/13] dm thin: don't use the bi_next field for the holder of a cell Mike Snitzer
@ 2012-02-29 14:50 ` Mike Snitzer
2012-03-12 22:14 ` [PATCH 08/13 v2] " Mike Snitzer
2012-02-29 14:50 ` [PATCH 09/13] dm thin: held root support Mike Snitzer
` (4 subsequent siblings)
12 siblings, 1 reply; 15+ messages in thread
From: Mike Snitzer @ 2012-02-29 14:50 UTC (permalink / raw)
To: dm-devel; +Cc: Joe Thornber, snitzer
From: Joe Thornber <ejt@redhat.com>
Allow use of an external, _read only_, device as an origin for a thin
device. Any read to an unprovisioned area of the thin device will be
passed through to the origin. Writes trigger allocation of new blocks
as usual.
One possible use case for this would be VM hosts who want to run
guests on thinp volumes, but have the base image on another device
(possibly shared between many VMs).
You must not write to the origin device if you use this technique! Of
course you can write to the thin device, and take internal snapshots
of the thin.
Signed-off-by: Joe Thornber <ejt@redhat.com>
Acked-by: Mike Snitzer <snitzer@redhat.com>
---
Documentation/device-mapper/thin-provisioning.txt | 38 ++++++++++-
drivers/md/dm-thin.c | 81 +++++++++++++++++----
2 files changed, 105 insertions(+), 14 deletions(-)
diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt
index 801d9d1..60fc5cf 100644
--- a/Documentation/device-mapper/thin-provisioning.txt
+++ b/Documentation/device-mapper/thin-provisioning.txt
@@ -167,6 +167,38 @@ ii) Using an internal snapshot.
dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 1"
+External snapshots
+------------------
+
+You can use an external, _read only_, device as an origin for a thin
+device. Any read to an unprovisioned area of the thin device will be
+passed through to the origin. Writes trigger allocation of new blocks
+as usual.
+
+One possible use case for this would be VM hosts who want to run
+guests on thinp volumes, but have the base image on another device
+(possibly shared between many VMs).
+
+You must not write to the origin device if you use this technique! Of
+course you can write to the thin device, and take internal snapshots
+of the thin.
+
+i) Creating an external snapshot
+
+ Same as creating a thin device. You don't need to mention the
+ origin at this stage.
+
+ dmsetup message /dev/mapper/pool 0 "create_thin 0"
+
+ii) Using an external snapshot.
+
+ Add an extra parameter to the thin target specifying the origin:
+
+ dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 0 /dev/image"
+
+ All descendants (internal snapshots) of an external snapshot will
+ need the extra origin argument.
+
Deactivation
------------
@@ -262,7 +294,7 @@ iii) Messages
i) Constructor
- thin <pool dev> <dev id>
+ thin <pool dev> <dev id> [external origin dev]
pool dev:
the thin-pool device, e.g. /dev/mapper/my_pool or 253:0
@@ -271,6 +303,10 @@ i) Constructor
the internal device identifier of the device to be
activated.
+ external origin dev:
+ a block device; reads to unprovisioned areas of the thin target
+ will be mapped to here.
+
The pool doesn't store any size against the thin devices. If you
load a thin target that is smaller than you've been using previously,
then you'll have no access to blocks mapped beyond the end. If you
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index a148532..c143cf1 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -560,6 +560,7 @@ struct pool_c {
*/
struct thin_c {
struct dm_dev *pool_dev;
+ struct dm_dev *origin_dev;
dm_thin_id dev_id;
struct pool *pool;
@@ -677,14 +678,16 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
(bio->bi_sector & pool->offset_mask);
}
-static void remap_and_issue(struct thin_c *tc, struct bio *bio,
- dm_block_t block)
+static void remap_to_origin(struct thin_c *tc, struct bio *bio)
+{
+ bio->bi_bdev = tc->origin_dev->bdev;
+}
+
+static void issue(struct thin_c *tc, struct bio *bio)
{
struct pool *pool = tc->pool;
unsigned long flags;
- remap(tc, bio, block);
-
/*
* Batch together any FUA/FLUSH bios we find and then issue
* a single commit for them in process_deferred_bios().
@@ -697,6 +700,19 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio,
generic_make_request(bio);
}
+static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
+{
+ remap_to_origin(tc, bio);
+ issue(tc, bio);
+}
+
+static void remap_and_issue(struct thin_c *tc, struct bio *bio,
+ dm_block_t block)
+{
+ remap(tc, bio, block);
+ issue(tc, bio);
+}
+
/*
* wake_worker() is used when new work is queued and when pool_resume is
* ready to continue deferred IO processing.
@@ -943,7 +959,8 @@ static struct new_mapping *get_next_mapping(struct pool *pool)
}
static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
- dm_block_t data_origin, dm_block_t data_dest,
+ struct dm_dev *origin, dm_block_t data_origin,
+ dm_block_t data_dest,
struct cell *cell, struct bio *bio)
{
int r;
@@ -975,7 +992,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
} else {
struct dm_io_region from, to;
- from.bdev = tc->pool_dev->bdev;
+ from.bdev = origin->bdev;
from.sector = data_origin * pool->sectors_per_block;
from.count = pool->sectors_per_block;
@@ -993,6 +1010,22 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
}
}
+static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
+ dm_block_t data_origin, dm_block_t data_dest,
+ struct cell *cell, struct bio *bio)
+{
+ schedule_copy(tc, virt_block, tc->pool_dev,
+ data_origin, data_dest, cell, bio);
+}
+
+static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
+ dm_block_t data_dest,
+ struct cell *cell, struct bio *bio)
+{
+ schedule_copy(tc, virt_block, tc->origin_dev,
+ virt_block, data_dest, cell, bio);
+}
+
static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
dm_block_t data_block, struct cell *cell,
struct bio *bio)
@@ -1139,8 +1172,8 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
r = alloc_data_block(tc, &data_block);
switch (r) {
case 0:
- schedule_copy(tc, block, lookup_result->block,
- data_block, cell, bio);
+ schedule_internal_copy(tc, block, lookup_result->block,
+ data_block, cell, bio);
break;
case -ENOSPC:
@@ -1214,7 +1247,10 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
r = alloc_data_block(tc, &data_block);
switch (r) {
case 0:
- schedule_zero(tc, block, data_block, cell, bio);
+ if (tc->origin_dev)
+ schedule_external_copy(tc, block, data_block, cell, bio);
+ else
+ schedule_zero(tc, block, data_block, cell, bio);
break;
case -ENOSPC:
@@ -1265,7 +1301,11 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
break;
case -ENODATA:
- provision_block(tc, bio, block, cell);
+ if (bio_data_dir(bio) == READ && tc->origin_dev) {
+ cell_release_singleton(cell, bio);
+ remap_to_origin_and_issue(tc, bio);
+ } else
+ provision_block(tc, bio, block, cell);
break;
default:
@@ -2249,6 +2289,8 @@ static void thin_dtr(struct dm_target *ti)
__pool_dec(tc->pool);
dm_pool_close_thin_device(tc->td);
dm_put_device(ti, tc->pool_dev);
+ if (tc->origin_dev)
+ dm_put_device(ti, tc->origin_dev);
kfree(tc);
mutex_unlock(&dm_thin_pool_table.mutex);
@@ -2257,21 +2299,22 @@ static void thin_dtr(struct dm_target *ti)
/*
* Thin target parameters:
*
- * <pool_dev> <dev_id>
+ * <pool_dev> <dev_id> [origin_dev]
*
* pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
* dev_id: the internal device identifier
+ * origin_dev: a device external to the pool that should act as the origin
*/
static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
int r;
struct thin_c *tc;
- struct dm_dev *pool_dev;
+ struct dm_dev *pool_dev, *origin_dev;
struct mapped_device *pool_md;
mutex_lock(&dm_thin_pool_table.mutex);
- if (argc != 2) {
+ if (argc != 2 && argc != 3) {
ti->error = "Invalid argument count";
r = -EINVAL;
goto out_unlock;
@@ -2284,6 +2327,15 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto out_unlock;
}
+ if (argc == 3) {
+ r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
+ if (r) {
+ ti->error = "Error opening origin device";
+ goto bad_origin_dev;
+ }
+ tc->origin_dev = origin_dev;
+ }
+
r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
if (r) {
ti->error = "Error opening pool device";
@@ -2336,6 +2388,9 @@ bad_pool_lookup:
bad_common:
dm_put_device(ti, tc->pool_dev);
bad_pool_dev:
+ if (tc->origin_dev)
+ dm_put_device(ti, tc->origin_dev);
+bad_origin_dev:
kfree(tc);
out_unlock:
mutex_unlock(&dm_thin_pool_table.mutex);
--
1.7.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 09/13] dm thin: held root support
2012-02-29 14:50 [PATCH 00/13] dm thin: updated patches for 3.4 Mike Snitzer
` (7 preceding siblings ...)
2012-02-29 14:50 ` [PATCH 08/13] dm thin: add support for external origins Mike Snitzer
@ 2012-02-29 14:50 ` Mike Snitzer
2012-02-29 14:50 ` [PATCH 10/13] dm thin: foundation for discard support Mike Snitzer
` (3 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Mike Snitzer @ 2012-02-29 14:50 UTC (permalink / raw)
To: dm-devel; +Cc: Joe Thornber, snitzer
From: Joe Thornber <ejt@redhat.com>
This allows userland access to the thinp data mappings.
Signed-off-by: Joe Thornber <ejt@redhat.com>
---
Documentation/device-mapper/thin-provisioning.txt | 11 +++
drivers/md/dm-thin-metadata.c | 78 ++++++++++++++++++++-
drivers/md/dm-thin-metadata.h | 1 +
drivers/md/dm-thin.c | 38 ++++++++++
4 files changed, 126 insertions(+), 2 deletions(-)
diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt
index 60fc5cf..13e42fb 100644
--- a/Documentation/device-mapper/thin-provisioning.txt
+++ b/Documentation/device-mapper/thin-provisioning.txt
@@ -289,6 +289,17 @@ iii) Messages
the current transaction id is when you change it with this
compare-and-swap message.
+ hold_root
+
+ Reserve a copy of the data mapping btree for use by userland.
+ This allows userland to inspect the mappings as they were when
+ this message was executed. Use the pool's status command to
+ get the root block.
+
+ release_root
+
+ Release a previously reserved copy of the data mapping btree.
+
'thin' target
-------------
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 0bc3033..f3ba61d 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1081,10 +1081,46 @@ int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
return 0;
}
-static int __get_held_metadata_root(struct dm_pool_metadata *pmd,
- dm_block_t *result)
+static int __hold_metadata_root(struct dm_pool_metadata *pmd)
+{
+ int r, r2;
+ struct thin_disk_superblock *disk_super;
+ struct dm_block *sblock;
+
+ r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
+ &sb_validator, &sblock);
+ if (r)
+ return r;
+
+ disk_super = dm_block_data(sblock);
+ if (le64_to_cpu(disk_super->held_root)) {
+ DMWARN("pool already has a held root");
+ r = -EBUSY;
+ } else {
+ __le64 root = disk_super->data_mapping_root;
+ dm_sm_inc_block(pmd->metadata_sm, le64_to_cpu(root));
+ disk_super->held_root = root;
+ pmd->need_commit = 1;
+ }
+
+ r2 = dm_bm_unlock(sblock);
+ return r ? r : r2;
+}
+
+int dm_pool_hold_metadata_root(struct dm_pool_metadata *pmd)
{
int r;
+
+ down_write(&pmd->root_lock);
+ r = __hold_metadata_root(pmd);
+ up_write(&pmd->root_lock);
+
+ return r;
+}
+
+static int __release_metadata_root(struct dm_pool_metadata *pmd)
+{
+ int r, r2;
struct thin_disk_superblock *disk_super;
struct dm_block *sblock;
@@ -1094,6 +1130,44 @@ static int __get_held_metadata_root(struct dm_pool_metadata *pmd,
return r;
disk_super = dm_block_data(sblock);
+ if (!le64_to_cpu(disk_super->held_root)) {
+ DMWARN("pool has no held root");
+ r = -EINVAL;
+ } else {
+ __le64 root = disk_super->held_root;
+ dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(root));
+ disk_super->held_root = cpu_to_le64(0ULL);
+ pmd->need_commit = 1;
+ }
+
+ r2 = dm_bm_unlock(sblock);
+ return r ? r : r2;
+}
+
+int dm_pool_release_metadata_root(struct dm_pool_metadata *pmd)
+{
+ int r;
+
+ down_write(&pmd->root_lock);
+ r = __release_metadata_root(pmd);
+ up_write(&pmd->root_lock);
+
+ return r;
+}
+
+static int __get_held_metadata_root(struct dm_pool_metadata *pmd,
+ dm_block_t *result)
+{
+ int r;
+ struct thin_disk_superblock *disk_super;
+ struct dm_block *sblock;
+
+ r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
+ &sb_validator, &sblock);
+ if (r)
+ return r;
+
+ disk_super = dm_block_data(sblock);
*result = le64_to_cpu(disk_super->held_root);
return dm_bm_unlock(sblock);
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 859c168..cfc7d0b 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -79,6 +79,7 @@ int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
* Hold/get root for userspace transaction.
*/
int dm_pool_hold_metadata_root(struct dm_pool_metadata *pmd);
+int dm_pool_release_metadata_root(struct dm_pool_metadata *pmd);
int dm_pool_get_held_metadata_root(struct dm_pool_metadata *pmd,
dm_block_t *result);
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index c143cf1..2e7c1bc 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -2107,6 +2107,36 @@ static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct po
return 0;
}
+static int process_hold_root_mesg(unsigned argc, char **argv, struct pool *pool)
+{
+ int r;
+
+ r = check_arg_count(argc, 1);
+ if (r)
+ return r;
+
+ r = dm_pool_hold_metadata_root(pool->pmd);
+ if (r)
+ DMWARN("hold root request failed");
+
+ return r;
+}
+
+static int process_release_root_mesg(unsigned argc, char **argv, struct pool *pool)
+{
+ int r;
+
+ r = check_arg_count(argc, 1);
+ if (r)
+ return r;
+
+ r = dm_pool_release_metadata_root(pool->pmd);
+ if (r)
+ DMWARN("release root request failed");
+
+ return r;
+}
+
/*
* Messages supported:
* create_thin <dev_id>
@@ -2114,6 +2144,8 @@ static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct po
* delete <dev_id>
* trim <dev_id> <new_size_in_sectors>
* set_transaction_id <current_trans_id> <new_trans_id>
+ * hold_root
+ * release_root
*/
static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
{
@@ -2133,6 +2165,12 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
else if (!strcasecmp(argv[0], "set_transaction_id"))
r = process_set_transaction_id_mesg(argc, argv, pool);
+ else if (!strcasecmp(argv[0], "hold_root"))
+ r = process_hold_root_mesg(argc, argv, pool);
+
+ else if (!strcasecmp(argv[0], "release_root"))
+ r = process_release_root_mesg(argc, argv, pool);
+
else
DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
--
1.7.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 10/13] dm thin: foundation for discard support
2012-02-29 14:50 [PATCH 00/13] dm thin: updated patches for 3.4 Mike Snitzer
` (8 preceding siblings ...)
2012-02-29 14:50 ` [PATCH 09/13] dm thin: held root support Mike Snitzer
@ 2012-02-29 14:50 ` Mike Snitzer
2012-02-29 14:50 ` [PATCH 11/13] dm thin: add " Mike Snitzer
` (2 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Mike Snitzer @ 2012-02-29 14:50 UTC (permalink / raw)
To: dm-devel; +Cc: Joe Thornber, snitzer
From: Joe Thornber <ejt@redhat.com>
This patch contains much of the groundwork needed for supporting
discard.
- The thin target now has an endio function (thin_endio) that replaces
shared_read_endio.
- An explicit 'quiesced' flag has been introduced into the new_mapping
structure. Before, this was implicitly indicated by m->list being
empty.
- The map_info->ptr remains constant for the duration of a bio's trip
through thinp, making it easier to reason about.
Signed-off-by: Joe Thornber <ejt@redhat.com>
Acked-by: Mike Snitzer <snitzer@redhat.com>
---
drivers/md/dm-thin.c | 125 ++++++++++++++++++++++++++++----------------------
1 files changed, 70 insertions(+), 55 deletions(-)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 2e7c1bc..bcbe86f 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -534,7 +534,7 @@ struct pool {
struct bio_list retry_on_resume_list;
- struct deferred_set ds; /* FIXME: move to thin_c */
+ struct deferred_set shared_read_ds;
struct new_mapping *next_mapping;
mempool_t *mapping_pool;
@@ -629,6 +629,12 @@ static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev
/*----------------------------------------------------------------*/
+struct endio_hook {
+ struct thin_c *tc;
+ struct deferred_entry *shared_read_entry;
+ struct new_mapping *overwrite_mapping;
+};
+
static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
{
struct bio *bio;
@@ -639,7 +645,8 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
bio_list_init(master);
while ((bio = bio_list_pop(&bios))) {
- if (dm_get_mapinfo(bio)->ptr == tc)
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ if (h->tc == tc)
bio_endio(bio, DM_ENDIO_REQUEUE);
else
bio_list_add(master, bio);
@@ -727,16 +734,11 @@ static void wake_worker(struct pool *pool)
/*
* Bio endio functions.
*/
-struct endio_hook {
- struct thin_c *tc;
- bio_end_io_t *saved_bi_end_io;
- struct deferred_entry *entry;
-};
-
struct new_mapping {
struct list_head list;
- int prepared;
+ unsigned quiesced:1;
+ unsigned prepared:1;
struct thin_c *tc;
dm_block_t virt_block;
@@ -758,7 +760,7 @@ static void __maybe_add_mapping(struct new_mapping *m)
{
struct pool *pool = m->tc->pool;
- if (list_empty(&m->list) && m->prepared) {
+ if (m->quiesced && m->prepared) {
list_add(&m->list, &pool->prepared_mappings);
wake_worker(pool);
}
@@ -781,7 +783,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
static void overwrite_endio(struct bio *bio, int err)
{
unsigned long flags;
- struct new_mapping *m = dm_get_mapinfo(bio)->ptr;
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ struct new_mapping *m = h->overwrite_mapping;
struct pool *pool = m->tc->pool;
m->err = err;
@@ -792,31 +795,6 @@ static void overwrite_endio(struct bio *bio, int err)
spin_unlock_irqrestore(&pool->lock, flags);
}
-static void shared_read_endio(struct bio *bio, int err)
-{
- struct list_head mappings;
- struct new_mapping *m, *tmp;
- struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
- unsigned long flags;
- struct pool *pool = h->tc->pool;
-
- bio->bi_end_io = h->saved_bi_end_io;
- bio_endio(bio, err);
-
- INIT_LIST_HEAD(&mappings);
- ds_dec(h->entry, &mappings);
-
- spin_lock_irqsave(&pool->lock, flags);
- list_for_each_entry_safe(m, tmp, &mappings, list) {
- list_del(&m->list);
- INIT_LIST_HEAD(&m->list);
- __maybe_add_mapping(m);
- }
- spin_unlock_irqrestore(&pool->lock, flags);
-
- mempool_free(h, pool->endio_hook_pool);
-}
-
/*----------------------------------------------------------------*/
/*
@@ -950,9 +928,7 @@ static int ensure_next_mapping(struct pool *pool)
static struct new_mapping *get_next_mapping(struct pool *pool)
{
struct new_mapping *r = pool->next_mapping;
-
BUG_ON(!pool->next_mapping);
-
pool->next_mapping = NULL;
return r;
@@ -968,6 +944,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
struct new_mapping *m = get_next_mapping(pool);
INIT_LIST_HEAD(&m->list);
+ m->quiesced = 0;
m->prepared = 0;
m->tc = tc;
m->virt_block = virt_block;
@@ -976,7 +953,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
m->err = 0;
m->bio = NULL;
- ds_add_work(&pool->ds, &m->list);
+ if (!ds_add_work(&pool->shared_read_ds, &m->list))
+ m->quiesced = 1;
/*
* IO to pool_dev remaps to the pool target's data_dev.
@@ -985,9 +963,10 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
* bio immediately. Otherwise we use kcopyd to clone the data first.
*/
if (io_overwrites_block(pool, bio)) {
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ h->overwrite_mapping = m;
m->bio = bio;
save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
- dm_get_mapinfo(bio)->ptr = m;
remap_and_issue(tc, bio, data_dest);
} else {
struct dm_io_region from, to;
@@ -1034,6 +1013,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
struct new_mapping *m = get_next_mapping(pool);
INIT_LIST_HEAD(&m->list);
+ m->quiesced = 1;
m->prepared = 0;
m->tc = tc;
m->virt_block = virt_block;
@@ -1051,9 +1031,10 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
process_prepared_mapping(m);
else if (io_overwrites_block(pool, bio)) {
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ h->overwrite_mapping = m;
m->bio = bio;
save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
- dm_get_mapinfo(bio)->ptr = m;
remap_and_issue(tc, bio, data_block);
} else {
@@ -1140,7 +1121,8 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
*/
static void retry_on_resume(struct bio *bio)
{
- struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ struct thin_c *tc = h->tc;
struct pool *pool = tc->pool;
unsigned long flags;
@@ -1206,13 +1188,9 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
if (bio_data_dir(bio) == WRITE)
break_sharing(tc, bio, block, &key, lookup_result, cell);
else {
- struct endio_hook *h;
- h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
- h->tc = tc;
- h->entry = ds_inc(&pool->ds);
- save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio);
- dm_get_mapinfo(bio)->ptr = h;
+ h->shared_read_entry = ds_inc(&pool->shared_read_ds);
cell_release_singleton(cell, bio);
remap_and_issue(tc, bio, lookup_result->block);
@@ -1336,7 +1314,9 @@ static void process_deferred_bios(struct pool *pool)
spin_unlock_irqrestore(&pool->lock, flags);
while ((bio = bio_list_pop(&bios))) {
- struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ struct thin_c *tc = h->tc;
+
/*
* If we've got no free new_mapping structs, and processing
* this bio might require one, we pause until there are some
@@ -1419,6 +1399,18 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
wake_worker(pool);
}
+static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
+{
+ struct pool *pool = tc->pool;
+ struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
+
+ h->tc = tc;
+ h->shared_read_entry = NULL;
+ h->overwrite_mapping = NULL;
+
+ return h;
+}
+
/*
* Non-blocking function called from the thin target's map function.
*/
@@ -1431,11 +1423,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
struct dm_thin_device *td = tc->td;
struct dm_thin_lookup_result result;
- /*
- * Save the thin context for easy access from the deferred bio later.
- */
- map_context->ptr = tc;
-
+ map_context->ptr = thin_hook_bio(tc, bio);
if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
thin_defer_bio(tc, bio);
return DM_MAPIO_SUBMITTED;
@@ -1615,7 +1603,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
pool->low_water_triggered = 0;
pool->no_free_space = 0;
bio_list_init(&pool->retry_on_resume_list);
- ds_init(&pool->ds);
+ ds_init(&pool->shared_read_ds);
pool->next_mapping = NULL;
pool->mapping_pool =
@@ -2444,6 +2432,32 @@ static int thin_map(struct dm_target *ti, struct bio *bio,
return thin_bio_map(ti, bio, map_context);
}
+static int thin_endio(struct dm_target *ti,
+ struct bio *bio, int err,
+ union map_info *map_context)
+{
+ unsigned long flags;
+ struct endio_hook *h = map_context->ptr;
+ struct list_head work;
+ struct new_mapping *m, *tmp;
+ struct pool *pool = h->tc->pool;
+
+ if (h->shared_read_entry) {
+ INIT_LIST_HEAD(&work);
+ ds_dec(h->shared_read_entry, &work);
+
+ spin_lock_irqsave(&pool->lock, flags);
+ list_for_each_entry_safe(m, tmp, &work, list) {
+ list_del(&m->list);
+ m->quiesced = 1;
+ __maybe_add_mapping(m);
+ }
+ spin_unlock_irqrestore(&pool->lock, flags);
+ }
+
+ return 0;
+}
+
static void thin_postsuspend(struct dm_target *ti)
{
if (dm_noflush_suspending(ti))
@@ -2529,6 +2543,7 @@ static struct target_type thin_target = {
.ctr = thin_ctr,
.dtr = thin_dtr,
.map = thin_map,
+ .end_io = thin_endio,
.postsuspend = thin_postsuspend,
.status = thin_status,
.iterate_devices = thin_iterate_devices,
--
1.7.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 11/13] dm thin: add discard support
2012-02-29 14:50 [PATCH 00/13] dm thin: updated patches for 3.4 Mike Snitzer
` (9 preceding siblings ...)
2012-02-29 14:50 ` [PATCH 10/13] dm thin: foundation for discard support Mike Snitzer
@ 2012-02-29 14:50 ` Mike Snitzer
2012-02-29 14:50 ` [PATCH 12/13] dm thin: add discard pool features Mike Snitzer
2012-02-29 14:50 ` [PATCH 13/13] dm thin: pre-commit in pool_status so it provides accurate free block counts Mike Snitzer
12 siblings, 0 replies; 15+ messages in thread
From: Mike Snitzer @ 2012-02-29 14:50 UTC (permalink / raw)
To: dm-devel; +Cc: Joe Thornber, snitzer
From: Joe Thornber <ejt@redhat.com>
On discard, the corresponding mapping(s) are removed from the thin
device. If the associated block(s) are no longer shared, the discard is
passed to the underlying device.
All bios other than discards now have an associated deferred_entry that
is saved to the 'all_io_entry' in endio_hook. When non-discard IO
completes and associated mappings are quiesced, any discards that were
deferred via ds_add_work() in process_discard() will be queued for
processing by the worker thread.
Signed-off-by: Joe Thornber <ejt@redhat.com>
Acked-by: Mike Snitzer <snitzer@redhat.com>
---
drivers/md/dm-thin.c | 176 +++++++++++++++++++++++++++++++++++++++++++++----
1 files changed, 161 insertions(+), 15 deletions(-)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index bcbe86f..fec7ddb 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -73,7 +73,7 @@
* missed out if the io covers the block. (schedule_copy).
*
* iv) insert the new mapping into the origin's btree
- * (process_prepared_mappings). This act of inserting breaks some
+ * (process_prepared_mapping). This act of inserting breaks some
* sharing of btree nodes between the two devices. Breaking sharing only
* effects the btree of that specific device. Btrees for the other
* devices that share the block never change. The btree for the origin
@@ -531,10 +531,12 @@ struct pool {
struct bio_list deferred_bios;
struct bio_list deferred_flush_bios;
struct list_head prepared_mappings;
+ struct list_head prepared_discards;
struct bio_list retry_on_resume_list;
struct deferred_set shared_read_ds;
+ struct deferred_set all_io_ds;
struct new_mapping *next_mapping;
mempool_t *mapping_pool;
@@ -632,6 +634,7 @@ static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev
struct endio_hook {
struct thin_c *tc;
struct deferred_entry *shared_read_entry;
+ struct deferred_entry *all_io_entry;
struct new_mapping *overwrite_mapping;
};
@@ -739,11 +742,12 @@ struct new_mapping {
unsigned quiesced:1;
unsigned prepared:1;
+ unsigned pass_discard:1;
struct thin_c *tc;
dm_block_t virt_block;
dm_block_t data_block;
- struct cell *cell;
+ struct cell *cell, *cell2;
int err;
/*
@@ -883,7 +887,30 @@ static void process_prepared_mapping(struct new_mapping *m)
mempool_free(m, tc->pool->mapping_pool);
}
-static void process_prepared_mappings(struct pool *pool)
+static void process_prepared_discard(struct new_mapping *m)
+{
+ int r;
+ struct thin_c *tc = m->tc;
+
+ r = dm_thin_remove_block(tc->td, m->virt_block);
+ if (r)
+ DMERR("dm_thin_remove_block() failed");
+
+ /*
+ * Pass the discard down to the underlying device?
+ */
+ if (m->pass_discard)
+ remap_and_issue(tc, m->bio, m->data_block);
+ else
+ bio_endio(m->bio, 0);
+
+ cell_defer_except(tc, m->cell);
+ cell_defer_except(tc, m->cell2);
+ mempool_free(m, tc->pool->mapping_pool);
+}
+
+static void process_prepared(struct pool *pool, struct list_head *head,
+ void (*fn)(struct new_mapping *))
{
unsigned long flags;
struct list_head maps;
@@ -891,21 +918,27 @@ static void process_prepared_mappings(struct pool *pool)
INIT_LIST_HEAD(&maps);
spin_lock_irqsave(&pool->lock, flags);
- list_splice_init(&pool->prepared_mappings, &maps);
+ list_splice_init(head, &maps);
spin_unlock_irqrestore(&pool->lock, flags);
list_for_each_entry_safe(m, tmp, &maps, list)
- process_prepared_mapping(m);
+ fn(m);
}
/*
* Deferred bio jobs.
*/
-static int io_overwrites_block(struct pool *pool, struct bio *bio)
+static int io_overlaps_block(struct pool *pool, struct bio *bio)
{
- return ((bio_data_dir(bio) == WRITE) &&
- !(bio->bi_sector & pool->offset_mask)) &&
+ return !(bio->bi_sector & pool->offset_mask) &&
(bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
+
+}
+
+static int io_overwrites_block(struct pool *pool, struct bio *bio)
+{
+ return (bio_data_dir(bio) == WRITE) &&
+ io_overlaps_block(pool, bio);
}
static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
@@ -1143,6 +1176,86 @@ static void no_space(struct cell *cell)
retry_on_resume(bio);
}
+static void process_discard(struct thin_c *tc, struct bio *bio)
+{
+ int r;
+ struct pool *pool = tc->pool;
+ struct cell *cell, *cell2;
+ struct cell_key key, key2;
+ dm_block_t block = get_bio_block(tc, bio);
+ struct dm_thin_lookup_result lookup_result;
+ struct new_mapping *m;
+
+ build_virtual_key(tc->td, block, &key);
+ if (bio_detain(tc->pool->prison, &key, bio, &cell))
+ return;
+
+ r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
+ switch (r) {
+ case 0:
+ /*
+ * Check nobody is fiddling with this pool block. This can
+ * happen if someone's in the process of breaking sharing
+ * on this block.
+ */
+ build_data_key(tc->td, lookup_result.block, &key2);
+ if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
+ cell_release_singleton(cell, bio);
+ break;
+ }
+
+ if (io_overlaps_block(pool, bio)) {
+ /*
+ * IO may still be going to the destination block. We must
+ * quiesce before we can do the removal.
+ */
+ m = get_next_mapping(pool);
+ m->tc = tc;
+ m->pass_discard = !lookup_result.shared;
+ m->virt_block = block;
+ m->data_block = lookup_result.block;
+ m->cell = cell;
+ m->cell2 = cell2;
+ m->err = 0;
+ m->bio = bio;
+
+ if (!ds_add_work(&pool->all_io_ds, &m->list)) {
+ list_add(&m->list, &pool->prepared_discards);
+ wake_worker(pool);
+ }
+ } else {
+ /*
+ * This path is hit if people are ignoring
+ * limits->discard_granularity. It ignores any
+ * part of the discard that is in a subsequent
+ * block.
+ */
+ sector_t offset = bio->bi_sector - (block << pool->block_shift);
+ unsigned remaining = (pool->sectors_per_block - offset) << 9;
+ bio->bi_size = min(bio->bi_size, remaining);
+
+ cell_release_singleton(cell, bio);
+ cell_release_singleton(cell2, bio);
+ remap_and_issue(tc, bio, lookup_result.block);
+ }
+ break;
+
+ case -ENODATA:
+ /*
+ * It isn't provisioned, just forget it.
+ */
+ cell_release_singleton(cell, bio);
+ bio_endio(bio, 0);
+ break;
+
+ default:
+ DMERR("discard: find block unexpectedly returned %d\n", r);
+ cell_release_singleton(cell, bio);
+ bio_io_error(bio);
+ break;
+ }
+}
+
static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
struct cell_key *key,
struct dm_thin_lookup_result *lookup_result,
@@ -1288,6 +1401,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
default:
DMERR("dm_thin_find_block() failed, error = %d", r);
+ cell_release_singleton(cell, bio);
bio_io_error(bio);
break;
}
@@ -1329,7 +1443,11 @@ static void process_deferred_bios(struct pool *pool)
break;
}
- process_bio(tc, bio);
+
+ if (bio->bi_rw & REQ_DISCARD)
+ process_discard(tc, bio);
+ else
+ process_bio(tc, bio);
}
/*
@@ -1363,7 +1481,8 @@ static void do_worker(struct work_struct *ws)
{
struct pool *pool = container_of(ws, struct pool, worker);
- process_prepared_mappings(pool);
+ process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping);
+ process_prepared(pool, &pool->prepared_discards, process_prepared_discard);
process_deferred_bios(pool);
}
@@ -1406,6 +1525,7 @@ static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
h->tc = tc;
h->shared_read_entry = NULL;
+ h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds);
h->overwrite_mapping = NULL;
return h;
@@ -1424,7 +1544,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
struct dm_thin_lookup_result result;
map_context->ptr = thin_hook_bio(tc, bio);
- if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
+ if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
thin_defer_bio(tc, bio);
return DM_MAPIO_SUBMITTED;
}
@@ -1600,10 +1720,12 @@ static struct pool *pool_create(struct mapped_device *pool_md,
bio_list_init(&pool->deferred_bios);
bio_list_init(&pool->deferred_flush_bios);
INIT_LIST_HEAD(&pool->prepared_mappings);
+ INIT_LIST_HEAD(&pool->prepared_discards);
pool->low_water_triggered = 0;
pool->no_free_space = 0;
bio_list_init(&pool->retry_on_resume_list);
ds_init(&pool->shared_read_ds);
+ ds_init(&pool->all_io_ds);
pool->next_mapping = NULL;
pool->mapping_pool =
@@ -1844,7 +1966,8 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
pt->low_water_blocks = low_water_blocks;
pt->zero_new_blocks = pf.zero_new_blocks;
ti->num_flush_requests = 1;
- ti->num_discard_requests = 0;
+ ti->num_discard_requests = 1;
+ ti->discards_supported = 1;
ti->private = pt;
pt->callbacks.congested_fn = pool_is_congested;
@@ -2275,6 +2398,18 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}
+static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
+{
+ limits->max_discard_sectors = pool->sectors_per_block;
+
+ /*
+ * This is just a hint, and not enforced. We have to cope with
+ * bios that overlap 2 blocks.
+ */
+ limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
+ limits->discard_zeroes_data = pool->zero_new_blocks;
+}
+
static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct pool_c *pt = ti->private;
@@ -2282,6 +2417,7 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
blk_limits_io_min(limits, 0);
blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
+ set_discard_limits(pool, limits);
}
static struct target_type pool_target = {
@@ -2398,8 +2534,8 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->split_io = tc->pool->sectors_per_block;
ti->num_flush_requests = 1;
- ti->num_discard_requests = 0;
- ti->discards_supported = 0;
+ ti->num_discard_requests = 1;
+ ti->discards_supported = 1;
dm_put(pool_md);
@@ -2455,6 +2591,14 @@ static int thin_endio(struct dm_target *ti,
spin_unlock_irqrestore(&pool->lock, flags);
}
+ if (h->all_io_entry) {
+ INIT_LIST_HEAD(&work);
+ ds_dec(h->all_io_entry, &work);
+ list_for_each_entry_safe(m, tmp, &work, list)
+ list_add(&m->list, &pool->prepared_discards);
+ }
+
+ mempool_free(h, pool->endio_hook_pool);
return 0;
}
@@ -2531,9 +2675,11 @@ static int thin_iterate_devices(struct dm_target *ti,
static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct thin_c *tc = ti->private;
+ struct pool *pool = tc->pool;
blk_limits_io_min(limits, 0);
- blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT);
+ blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
+ set_discard_limits(pool, limits);
}
static struct target_type thin_target = {
--
1.7.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 12/13] dm thin: add discard pool features
2012-02-29 14:50 [PATCH 00/13] dm thin: updated patches for 3.4 Mike Snitzer
` (10 preceding siblings ...)
2012-02-29 14:50 ` [PATCH 11/13] dm thin: add " Mike Snitzer
@ 2012-02-29 14:50 ` Mike Snitzer
2012-02-29 14:50 ` [PATCH 13/13] dm thin: pre-commit in pool_status so it provides accurate free block counts Mike Snitzer
12 siblings, 0 replies; 15+ messages in thread
From: Mike Snitzer @ 2012-02-29 14:50 UTC (permalink / raw)
To: dm-devel; +Cc: Joe Thornber, snitzer
From: Joe Thornber <ejt@redhat.com>
Make additional features available during pool creation:
'skip_discard': disable discard support
'skip_discard_passdown': don't pass discards down to the underlying data device
Signed-off-by: Joe Thornber <ejt@redhat.com>
Acked-by: Mike Snitzer <snitzer@redhat.com>
---
Documentation/device-mapper/thin-provisioning.txt | 2 +
drivers/md/dm-thin.c | 79 +++++++++++++++------
2 files changed, 59 insertions(+), 22 deletions(-)
diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt
index 13e42fb..57076dd 100644
--- a/Documentation/device-mapper/thin-provisioning.txt
+++ b/Documentation/device-mapper/thin-provisioning.txt
@@ -222,6 +222,8 @@ i) Constructor
Optional feature arguments:
- 'skip_block_zeroing': skips the zeroing of newly-provisioned blocks.
+ 'skip_discard': disable discard support
+ 'skip_discard_passdown': don't pass discards down to the underlying data device
Data block size must be between 64KB (128 sectors) and 1GB
(2097152 sectors) inclusive.
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index fec7ddb..2e16b3a 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -500,6 +500,13 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
* devices.
*/
struct new_mapping;
+
+struct pool_features {
+ unsigned zero_new_blocks:1;
+ unsigned discard_enabled:1;
+ unsigned discard_passdown:1;
+};
+
struct pool {
struct list_head list;
struct dm_target *ti; /* Only set if a pool target is bound */
@@ -513,7 +520,7 @@ struct pool {
dm_block_t offset_mask;
dm_block_t low_water_blocks;
- unsigned zero_new_blocks:1;
+ struct pool_features pf;
unsigned low_water_triggered:1; /* A dm event has been sent */
unsigned no_free_space:1; /* A -ENOSPC warning has been issued */
@@ -554,7 +561,7 @@ struct pool_c {
struct dm_target_callbacks callbacks;
dm_block_t low_water_blocks;
- unsigned zero_new_blocks:1;
+ struct pool_features pf;
};
/*
@@ -1060,7 +1067,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
* zeroing pre-existing data, we can issue the bio immediately.
* Otherwise we use kcopyd to zero the data first.
*/
- if (!pool->zero_new_blocks)
+ if (!pool->pf.zero_new_blocks)
process_prepared_mapping(m);
else if (io_overwrites_block(pool, bio)) {
@@ -1626,7 +1633,7 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
pool->ti = ti;
pool->low_water_blocks = pt->low_water_blocks;
- pool->zero_new_blocks = pt->zero_new_blocks;
+ pool->pf = pt->pf;
return 0;
}
@@ -1640,6 +1647,14 @@ static void unbind_control_target(struct pool *pool, struct dm_target *ti)
/*----------------------------------------------------------------
* Pool creation
*--------------------------------------------------------------*/
+/* Initialize pool features. */
+static void pool_features_init(struct pool_features *pf)
+{
+ pf->zero_new_blocks = 1;
+ pf->discard_enabled = 1;
+ pf->discard_passdown = 1;
+}
+
static void __pool_destroy(struct pool *pool)
{
__pool_table_remove(pool);
@@ -1687,7 +1702,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
pool->block_shift = ffs(block_size) - 1;
pool->offset_mask = block_size - 1;
pool->low_water_blocks = 0;
- pool->zero_new_blocks = 1;
+ pool_features_init(&pool->pf);
pool->prison = prison_create(PRISON_CELLS);
if (!pool->prison) {
*error = "Error creating pool's bio prison";
@@ -1825,10 +1840,6 @@ static void pool_dtr(struct dm_target *ti)
mutex_unlock(&dm_thin_pool_table.mutex);
}
-struct pool_features {
- unsigned zero_new_blocks:1;
-};
-
static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
struct dm_target *ti)
{
@@ -1837,7 +1848,7 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
const char *arg_name;
static struct dm_arg _args[] = {
- {0, 1, "Invalid number of pool feature arguments"},
+ {0, 3, "Invalid number of pool feature arguments"},
};
/*
@@ -1857,6 +1868,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
if (!strcasecmp(arg_name, "skip_block_zeroing")) {
pf->zero_new_blocks = 0;
continue;
+ } else if (!strcasecmp(arg_name, "skip_discard")) {
+ pf->discard_enabled = 0;
+ continue;
+ } else if (!strcasecmp(arg_name, "skip_discard_passdown")) {
+ pf->discard_passdown = 0;
+ continue;
}
ti->error = "Unrecognised pool feature requested";
@@ -1874,6 +1891,8 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
*
* Optional feature arguments are:
* skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
+ * skip_discard: disable discard
+ * skip_discard_passdown: don't pass discards down to the data device
*/
static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
@@ -1938,8 +1957,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
/*
* Set default pool features.
*/
- memset(&pf, 0, sizeof(pf));
- pf.zero_new_blocks = 1;
+ pool_features_init(&pf);
dm_consume_args(&as, 4);
r = parse_pool_features(&as, &pf, ti);
@@ -1964,10 +1982,12 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
pt->metadata_dev = metadata_dev;
pt->data_dev = data_dev;
pt->low_water_blocks = low_water_blocks;
- pt->zero_new_blocks = pf.zero_new_blocks;
+ pt->pf = pf;
ti->num_flush_requests = 1;
- ti->num_discard_requests = 1;
- ti->discards_supported = 1;
+ if (pf.discard_enabled && pf.discard_passdown) {
+ ti->discards_supported = 1;
+ ti->num_discard_requests = 1;
+ }
ti->private = pt;
pt->callbacks.congested_fn = pool_is_congested;
@@ -2303,7 +2323,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
static int pool_status(struct dm_target *ti, status_type_t type,
char *result, unsigned maxlen)
{
- int r;
+ int r, count;
unsigned sz = 0;
uint64_t transaction_id;
dm_block_t nr_free_blocks_data;
@@ -2366,10 +2386,18 @@ static int pool_status(struct dm_target *ti, status_type_t type,
(unsigned long)pool->sectors_per_block,
(unsigned long long)pt->low_water_blocks);
- DMEMIT("%u ", !pool->zero_new_blocks);
+ count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled + !pool->pf.discard_passdown;
+ DMEMIT("%u ", count);
- if (!pool->zero_new_blocks)
+ if (!pool->pf.zero_new_blocks)
DMEMIT("skip_block_zeroing ");
+
+ if (!pool->pf.discard_enabled)
+ DMEMIT("skip_discard ");
+
+ if (!pool->pf.discard_passdown)
+ DMEMIT("skip_discard_passdown");
+
break;
}
@@ -2407,7 +2435,7 @@ static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
* bios that overlap 2 blocks.
*/
limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
- limits->discard_zeroes_data = pool->zero_new_blocks;
+ limits->discard_zeroes_data = pool->pf.zero_new_blocks;
}
static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
@@ -2417,7 +2445,8 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
blk_limits_io_min(limits, 0);
blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
- set_discard_limits(pool, limits);
+ if (pool->pf.discard_enabled)
+ set_discard_limits(pool, limits);
}
static struct target_type pool_target = {
@@ -2466,6 +2495,8 @@ static void thin_dtr(struct dm_target *ti)
* pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
* dev_id: the internal device identifier
* origin_dev: a device external to the pool that should act as the origin
+ *
+ * If the pool has discards disabled, they get disabled for the thin as well.
*/
static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
@@ -2534,8 +2565,12 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->split_io = tc->pool->sectors_per_block;
ti->num_flush_requests = 1;
- ti->num_discard_requests = 1;
- ti->discards_supported = 1;
+
+ /* In case the pool supports discards, pass them on. */
+ if (tc->pool->pf.discard_enabled) {
+ ti->discards_supported = 1;
+ ti->num_discard_requests = 1;
+ }
dm_put(pool_md);
--
1.7.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 13/13] dm thin: pre-commit in pool_status so it provides accurate free block counts
2012-02-29 14:50 [PATCH 00/13] dm thin: updated patches for 3.4 Mike Snitzer
` (11 preceding siblings ...)
2012-02-29 14:50 ` [PATCH 12/13] dm thin: add discard pool features Mike Snitzer
@ 2012-02-29 14:50 ` Mike Snitzer
12 siblings, 0 replies; 15+ messages in thread
From: Mike Snitzer @ 2012-02-29 14:50 UTC (permalink / raw)
To: dm-devel; +Cc: Heinz Mauelshagen, Joe Thornber, snitzer
From: Heinz Mauelshagen <heinzm@redhat.com>
If we're in the middle of a transaction the free block counts can be
quite out of date, so do a quick commit to update them.
Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Joe Thornber <thornber@redhat.com>
Acked-by: Mike Snitzer <snitzer@redhat.com>
---
drivers/md/dm-thin.c | 9 +++++++++
1 files changed, 9 insertions(+), 0 deletions(-)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 2e16b3a..2b1d5bd 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -2343,6 +2343,15 @@ static int pool_status(struct dm_target *ti, status_type_t type,
if (r)
return r;
+ /*
+ * If we're in the middle of a transaction the free block
+ * counts can be quite out of date, so we do a quick
+ * commit.
+ */
+ r = dm_pool_commit_metadata(pool->pmd);
+ if (r)
+ return r;
+
r = dm_pool_get_free_metadata_block_count(pool->pmd,
&nr_free_blocks_metadata);
if (r)
--
1.7.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 08/13 v2] dm thin: add support for external origins
2012-02-29 14:50 ` [PATCH 08/13] dm thin: add support for external origins Mike Snitzer
@ 2012-03-12 22:14 ` Mike Snitzer
0 siblings, 0 replies; 15+ messages in thread
From: Mike Snitzer @ 2012-03-12 22:14 UTC (permalink / raw)
To: dm-devel; +Cc: Joe Thornber
Allow use of an external, _read only_, device as an origin for a thin
device. Any read to an unprovisioned area of the thin device will be
passed through to the origin. Writes trigger allocation of new blocks
as usual.
One possible use case for this would be VM hosts who want to run
guests on thinp volumes, but have the base image on another device
(possibly shared between many VMs).
You must not write to the origin device if you use this technique! Of
course you can write to the thin device, and take internal snapshots
of the thin.
Signed-off-by: Joe Thornber <ejt@redhat.com>
Acked-by: Mike Snitzer <snitzer@redhat.com>
---
Documentation/device-mapper/thin-provisioning.txt | 38 +++++++++
drivers/md/dm-thin.c | 84 ++++++++++++++++++----
2 files changed, 108 insertions(+), 14 deletions(-)
[v2: add external origin output to thin_status]
Index: linux-2.6/Documentation/device-mapper/thin-provisioning.txt
===================================================================
--- linux-2.6.orig/Documentation/device-mapper/thin-provisioning.txt
+++ linux-2.6/Documentation/device-mapper/thin-provisioning.txt
@@ -167,6 +167,38 @@ ii) Using an internal snapshot.
dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 1"
+External snapshots
+------------------
+
+You can use an external, _read only_, device as an origin for a thin
+device. Any read to an unprovisioned area of the thin device will be
+passed through to the origin. Writes trigger allocation of new blocks
+as usual.
+
+One possible use case for this would be VM hosts who want to run
+guests on thinp volumes, but have the base image on another device
+(possibly shared between many VMs).
+
+You must not write to the origin device if you use this technique! Of
+course you can write to the thin device, and take internal snapshots
+of the thin.
+
+i) Creating an external snapshot
+
+ Same as creating a thin device. You don't need to mention the
+ origin at this stage.
+
+ dmsetup message /dev/mapper/pool 0 "create_thin 0"
+
+ii) Using an external snapshot.
+
+ Add an extra parameter to the thin target specifying the origin:
+
+ dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 0 /dev/image"
+
+ All descendants (internal snapshots) of an external snapshot will
+ need the extra origin argument.
+
Deactivation
------------
@@ -262,7 +294,7 @@ iii) Messages
i) Constructor
- thin <pool dev> <dev id>
+ thin <pool dev> <dev id> [<external origin dev>]
pool dev:
the thin-pool device, e.g. /dev/mapper/my_pool or 253:0
@@ -271,6 +303,10 @@ i) Constructor
the internal device identifier of the device to be
activated.
+ external origin dev:
+ a block device; reads to unprovisioned areas of the thin target
+ will be mapped to this device.
+
The pool doesn't store any size against the thin devices. If you
load a thin target that is smaller than you've been using previously,
then you'll have no access to blocks mapped beyond the end. If you
Index: linux-2.6/drivers/md/dm-thin.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-thin.c
+++ linux-2.6/drivers/md/dm-thin.c
@@ -557,6 +557,7 @@ struct pool_c {
*/
struct thin_c {
struct dm_dev *pool_dev;
+ struct dm_dev *origin_dev;
dm_thin_id dev_id;
struct pool *pool;
@@ -674,14 +675,16 @@ static void remap(struct thin_c *tc, str
(bio->bi_sector & pool->offset_mask);
}
-static void remap_and_issue(struct thin_c *tc, struct bio *bio,
- dm_block_t block)
+static void remap_to_origin(struct thin_c *tc, struct bio *bio)
+{
+ bio->bi_bdev = tc->origin_dev->bdev;
+}
+
+static void issue(struct thin_c *tc, struct bio *bio)
{
struct pool *pool = tc->pool;
unsigned long flags;
- remap(tc, bio, block);
-
/*
* Batch together any FUA/FLUSH bios we find and then issue
* a single commit for them in process_deferred_bios().
@@ -694,6 +697,19 @@ static void remap_and_issue(struct thin_
generic_make_request(bio);
}
+static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
+{
+ remap_to_origin(tc, bio);
+ issue(tc, bio);
+}
+
+static void remap_and_issue(struct thin_c *tc, struct bio *bio,
+ dm_block_t block)
+{
+ remap(tc, bio, block);
+ issue(tc, bio);
+}
+
/*
* wake_worker() is used when new work is queued and when pool_resume is
* ready to continue deferred IO processing.
@@ -940,7 +956,8 @@ static struct new_mapping *get_next_mapp
}
static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
- dm_block_t data_origin, dm_block_t data_dest,
+ struct dm_dev *origin, dm_block_t data_origin,
+ dm_block_t data_dest,
struct cell *cell, struct bio *bio)
{
int r;
@@ -972,7 +989,7 @@ static void schedule_copy(struct thin_c
} else {
struct dm_io_region from, to;
- from.bdev = tc->pool_dev->bdev;
+ from.bdev = origin->bdev;
from.sector = data_origin * pool->sectors_per_block;
from.count = pool->sectors_per_block;
@@ -990,6 +1007,22 @@ static void schedule_copy(struct thin_c
}
}
+static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
+ dm_block_t data_origin, dm_block_t data_dest,
+ struct cell *cell, struct bio *bio)
+{
+ schedule_copy(tc, virt_block, tc->pool_dev,
+ data_origin, data_dest, cell, bio);
+}
+
+static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
+ dm_block_t data_dest,
+ struct cell *cell, struct bio *bio)
+{
+ schedule_copy(tc, virt_block, tc->origin_dev,
+ virt_block, data_dest, cell, bio);
+}
+
static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
dm_block_t data_block, struct cell *cell,
struct bio *bio)
@@ -1136,8 +1169,8 @@ static void break_sharing(struct thin_c
r = alloc_data_block(tc, &data_block);
switch (r) {
case 0:
- schedule_copy(tc, block, lookup_result->block,
- data_block, cell, bio);
+ schedule_internal_copy(tc, block, lookup_result->block,
+ data_block, cell, bio);
break;
case -ENOSPC:
@@ -1211,7 +1244,10 @@ static void provision_block(struct thin_
r = alloc_data_block(tc, &data_block);
switch (r) {
case 0:
- schedule_zero(tc, block, data_block, cell, bio);
+ if (tc->origin_dev)
+ schedule_external_copy(tc, block, data_block, cell, bio);
+ else
+ schedule_zero(tc, block, data_block, cell, bio);
break;
case -ENOSPC:
@@ -1262,7 +1298,11 @@ static void process_bio(struct thin_c *t
break;
case -ENODATA:
- provision_block(tc, bio, block, cell);
+ if (bio_data_dir(bio) == READ && tc->origin_dev) {
+ cell_release_singleton(cell, bio);
+ remap_to_origin_and_issue(tc, bio);
+ } else
+ provision_block(tc, bio, block, cell);
break;
default:
@@ -2225,6 +2265,8 @@ static void thin_dtr(struct dm_target *t
__pool_dec(tc->pool);
dm_pool_close_thin_device(tc->td);
dm_put_device(ti, tc->pool_dev);
+ if (tc->origin_dev)
+ dm_put_device(ti, tc->origin_dev);
kfree(tc);
mutex_unlock(&dm_thin_pool_table.mutex);
@@ -2233,21 +2275,22 @@ static void thin_dtr(struct dm_target *t
/*
* Thin target parameters:
*
- * <pool_dev> <dev_id>
+ * <pool_dev> <dev_id> [origin_dev]
*
* pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
* dev_id: the internal device identifier
+ * origin_dev: a device external to the pool that should act as the origin
*/
static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
int r;
struct thin_c *tc;
- struct dm_dev *pool_dev;
+ struct dm_dev *pool_dev, *origin_dev;
struct mapped_device *pool_md;
mutex_lock(&dm_thin_pool_table.mutex);
- if (argc != 2) {
+ if (argc != 2 && argc != 3) {
ti->error = "Invalid argument count";
r = -EINVAL;
goto out_unlock;
@@ -2260,6 +2303,15 @@ static int thin_ctr(struct dm_target *ti
goto out_unlock;
}
+ if (argc == 3) {
+ r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
+ if (r) {
+ ti->error = "Error opening origin device";
+ goto bad_origin_dev;
+ }
+ tc->origin_dev = origin_dev;
+ }
+
r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
if (r) {
ti->error = "Error opening pool device";
@@ -2312,6 +2364,9 @@ bad_pool_lookup:
bad_common:
dm_put_device(ti, tc->pool_dev);
bad_pool_dev:
+ if (tc->origin_dev)
+ dm_put_device(ti, tc->origin_dev);
+bad_origin_dev:
kfree(tc);
out_unlock:
mutex_unlock(&dm_thin_pool_table.mutex);
@@ -2343,6 +2398,7 @@ static int thin_status(struct dm_target
ssize_t sz = 0;
dm_block_t mapped, highest;
char buf[BDEVNAME_SIZE];
+ char buf2[BDEVNAME_SIZE];
struct thin_c *tc = ti->private;
if (!tc->td)
@@ -2370,6 +2426,8 @@ static int thin_status(struct dm_target
DMEMIT("%s %lu",
format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
(unsigned long) tc->dev_id);
+ if (tc->origin_dev)
+ DMEMIT(" %s", format_dev_t(buf2, tc->origin_dev->bdev->bd_dev));
break;
}
}
^ permalink raw reply [flat|nested] 15+ messages in thread
end of thread, other threads:[~2012-03-12 22:14 UTC | newest]
Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-02-29 14:50 [PATCH 00/13] dm thin: updated patches for 3.4 Mike Snitzer
2012-02-29 14:50 ` [PATCH 01/13] dm thin: tidy up the cell_release functions Mike Snitzer
2012-02-29 14:50 ` [PATCH 02/13] dm btree-remove: break up __rebalance3 function Mike Snitzer
2012-02-29 14:50 ` [PATCH 03/13] dm btree remove: remove 2 BUG_ONs from __rebalance2 function Mike Snitzer
2012-02-29 14:50 ` [PATCH 04/13] dm btree remove: fix bug that allowed the nr of entries in a btree node to drop below 1/3 Mike Snitzer
2012-02-29 14:50 ` [PATCH 05/13] dm btree remove: fix center node entry redistribution in redistribute3 function Mike Snitzer
2012-02-29 14:50 ` [PATCH 06/13] dm space map: remove entries from the ref_count tree if no longer needed Mike Snitzer
2012-02-29 14:50 ` [PATCH 07/13] dm thin: don't use the bi_next field for the holder of a cell Mike Snitzer
2012-02-29 14:50 ` [PATCH 08/13] dm thin: add support for external origins Mike Snitzer
2012-03-12 22:14 ` [PATCH 08/13 v2] " Mike Snitzer
2012-02-29 14:50 ` [PATCH 09/13] dm thin: held root support Mike Snitzer
2012-02-29 14:50 ` [PATCH 10/13] dm thin: foundation for discard support Mike Snitzer
2012-02-29 14:50 ` [PATCH 11/13] dm thin: add " Mike Snitzer
2012-02-29 14:50 ` [PATCH 12/13] dm thin: add discard pool features Mike Snitzer
2012-02-29 14:50 ` [PATCH 13/13] dm thin: pre-commit in pool_status so it provides accurate free block counts Mike Snitzer
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.