* [RFC PATCH 1/9] block: add block_device_operations methods to set and get reserved space
2016-03-17 14:30 [RFC PATCH 0/9] dm-thin/xfs: prototype a block reservation allocation model Brian Foster
@ 2016-03-17 14:30 ` Brian Foster
[not found] ` <20160321120829.GB25476@redhat.com>
2016-03-17 14:30 ` [RFC PATCH 2/9] dm: add " Brian Foster
` (8 subsequent siblings)
9 siblings, 1 reply; 12+ messages in thread
From: Brian Foster @ 2016-03-17 14:30 UTC (permalink / raw)
To: xfs; +Cc: dm-devel, linux-block, linux-fsdevel
From: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
fs/block_dev.c | 20 ++++++++++++++++++++
include/linux/blkdev.h | 5 +++++
2 files changed, 25 insertions(+)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 826b164..375a2e4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -497,6 +497,26 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
}
EXPORT_SYMBOL_GPL(bdev_direct_access);
+int blk_reserve_space(struct block_device *bdev, sector_t nr_sects)
+{
+ const struct block_device_operations *ops = bdev->bd_disk->fops;
+
+ if (!ops->reserve_space)
+ return -EOPNOTSUPP;
+ return ops->reserve_space(bdev, nr_sects);
+}
+EXPORT_SYMBOL_GPL(blk_reserve_space);
+
+int blk_get_reserved_space(struct block_device *bdev, sector_t *nr_sects)
+{
+ const struct block_device_operations *ops = bdev->bd_disk->fops;
+
+ if (!ops->get_reserved_space)
+ return -EOPNOTSUPP;
+ return ops->get_reserved_space(bdev, nr_sects);
+}
+EXPORT_SYMBOL_GPL(blk_get_reserved_space);
+
/*
* pseudo-fs
*/
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 413c84f..f212fe5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1664,6 +1664,8 @@ struct block_device_operations {
int (*getgeo)(struct block_device *, struct hd_geometry *);
/* this callback is with swap_lock and sometimes page table lock held */
void (*swap_slot_free_notify) (struct block_device *, unsigned long);
+ int (*reserve_space) (struct block_device *, sector_t);
+ int (*get_reserved_space) (struct block_device *, sector_t *);
struct module *owner;
const struct pr_ops *pr_ops;
};
@@ -1674,6 +1676,9 @@ extern int bdev_read_page(struct block_device *, sector_t, struct page *);
extern int bdev_write_page(struct block_device *, sector_t, struct page *,
struct writeback_control *);
extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *);
+
+extern int blk_reserve_space(struct block_device *, sector_t);
+extern int blk_get_reserved_space(struct block_device *, sector_t *);
#else /* CONFIG_BLOCK */
struct block_device;
--
2.4.3
^ permalink raw reply related [flat|nested] 12+ messages in thread* [RFC PATCH 2/9] dm: add methods to set and get reserved space
2016-03-17 14:30 [RFC PATCH 0/9] dm-thin/xfs: prototype a block reservation allocation model Brian Foster
2016-03-17 14:30 ` [RFC PATCH 1/9] block: add block_device_operations methods to set and get reserved space Brian Foster
@ 2016-03-17 14:30 ` Brian Foster
2016-03-17 14:30 ` [RFC PATCH 3/9] dm thin: " Brian Foster
` (7 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: Brian Foster @ 2016-03-17 14:30 UTC (permalink / raw)
To: xfs; +Cc: dm-devel, linux-block, linux-fsdevel
From: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
drivers/md/dm.c | 75 +++++++++++++++++++++++++++++++++++++++++++
include/linux/device-mapper.h | 5 +++
2 files changed, 80 insertions(+)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index be49057..4da5d3e 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -664,6 +664,79 @@ out:
return r;
}
+/*
+ * FIXME: factor out common helper that can be used by
+ * multiple block_device_operations -> target methods
+ * (including dm_blk_ioctl above)
+ */
+
+static int dm_blk_reserve_space(struct block_device *bdev, sector_t nr_sects)
+{
+ struct mapped_device *md = bdev->bd_disk->private_data;
+ int srcu_idx;
+ struct dm_table *map;
+ struct dm_target *tgt;
+ int r = -EINVAL;
+
+ map = dm_get_live_table(md, &srcu_idx);
+
+ if (!map || !dm_table_get_size(map))
+ goto out;
+
+ /* We only support devices that have a single target */
+ if (dm_table_get_num_targets(map) != 1)
+ goto out;
+
+ tgt = dm_table_get_target(map, 0);
+ if (!tgt->type->reserve_space)
+ goto out;
+
+ if (dm_suspended_md(md)) {
+ r = -EAGAIN;
+ goto out;
+ }
+
+ r = tgt->type->reserve_space(tgt, nr_sects);
+out:
+ dm_put_live_table(md, srcu_idx);
+
+ return r;
+}
+
+static int dm_blk_get_reserved_space(struct block_device *bdev,
+ sector_t *nr_sects)
+{
+ struct mapped_device *md = bdev->bd_disk->private_data;
+ int srcu_idx;
+ struct dm_table *map;
+ struct dm_target *tgt;
+ int r = -EINVAL;
+
+ map = dm_get_live_table(md, &srcu_idx);
+
+ if (!map || !dm_table_get_size(map))
+ goto out;
+
+ /* We only support devices that have a single target */
+ if (dm_table_get_num_targets(map) != 1)
+ goto out;
+
+ tgt = dm_table_get_target(map, 0);
+ if (!tgt->type->get_reserved_space)
+ goto out;
+
+ if (dm_suspended_md(md)) {
+ r = -EAGAIN;
+ goto out;
+ }
+
+ r = tgt->type->get_reserved_space(tgt, nr_sects);
+out:
+ dm_put_live_table(md, srcu_idx);
+
+ return r;
+}
+
static struct dm_io *alloc_io(struct mapped_device *md)
{
return mempool_alloc(md->io_pool, GFP_NOIO);
@@ -3723,6 +3796,8 @@ static const struct block_device_operations dm_blk_dops = {
.ioctl = dm_blk_ioctl,
.getgeo = dm_blk_getgeo,
.pr_ops = &dm_pr_ops,
+ .reserve_space = dm_blk_reserve_space,
+ .get_reserved_space = dm_blk_get_reserved_space,
.owner = THIS_MODULE
};
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 0830c9e..540c772 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -116,6 +116,9 @@ typedef void (*dm_io_hints_fn) (struct dm_target *ti,
*/
typedef int (*dm_busy_fn) (struct dm_target *ti);
+typedef int (*dm_reserve_space_fn) (struct dm_target *ti, sector_t nr_sects);
+typedef int (*dm_get_reserved_space_fn) (struct dm_target *ti, sector_t *nr_sects);
+
void dm_error(const char *message);
struct dm_dev {
@@ -162,6 +165,8 @@ struct target_type {
dm_busy_fn busy;
dm_iterate_devices_fn iterate_devices;
dm_io_hints_fn io_hints;
+ dm_reserve_space_fn reserve_space;
+ dm_get_reserved_space_fn get_reserved_space;
/* For internal device-mapper use. */
struct list_head list;
--
2.4.3
^ permalink raw reply related [flat|nested] 12+ messages in thread* [RFC PATCH 3/9] dm thin: add methods to set and get reserved space
2016-03-17 14:30 [RFC PATCH 0/9] dm-thin/xfs: prototype a block reservation allocation model Brian Foster
2016-03-17 14:30 ` [RFC PATCH 1/9] block: add block_device_operations methods to set and get reserved space Brian Foster
2016-03-17 14:30 ` [RFC PATCH 2/9] dm: add " Brian Foster
@ 2016-03-17 14:30 ` Brian Foster
2016-03-17 14:30 ` [RFC PATCH 4/9] dm thin: update reserve space func to allow reduction Brian Foster
` (6 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: Brian Foster @ 2016-03-17 14:30 UTC (permalink / raw)
To: xfs; +Cc: dm-devel, linux-block, linux-fsdevel
From: Joe Thornber <ejt@redhat.com>
Experimental reserve interface for XFS guys to play with.
I have big reservations (no pun intended) about this patch.
Not-Signed-off-by: Joe Thornber <ejt@redhat.com>
Not-Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
drivers/md/dm-thin.c | 142 +++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 132 insertions(+), 10 deletions(-)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 92237b6..31d36da 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -271,6 +271,8 @@ struct pool {
process_mapping_fn process_prepared_discard;
struct dm_bio_prison_cell **cell_sort_array;
+
+ dm_block_t reserve_count;
};
static enum pool_mode get_pool_mode(struct pool *pool);
@@ -318,6 +320,8 @@ struct thin_c {
*/
atomic_t refcount;
struct completion can_destroy;
+
+ dm_block_t reserve_count;
};
/*----------------------------------------------------------------*/
@@ -1359,24 +1363,19 @@ static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
}
}
-static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
+static int get_free_blocks(struct pool *pool, dm_block_t *free_blocks)
{
int r;
- dm_block_t free_blocks;
- struct pool *pool = tc->pool;
-
- if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
- return -EINVAL;
- r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
+ r = dm_pool_get_free_block_count(pool->pmd, free_blocks);
if (r) {
metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
return r;
}
- check_low_water_mark(pool, free_blocks);
+ check_low_water_mark(pool, *free_blocks);
- if (!free_blocks) {
+ if (!*free_blocks) {
/*
* Try to commit to see if that will free up some
* more space.
@@ -1385,7 +1384,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
if (r)
return r;
- r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
+ r = dm_pool_get_free_block_count(pool->pmd, free_blocks);
if (r) {
metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
return r;
@@ -1397,6 +1396,78 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
}
}
+ return r;
+}
+
+/*
+ * Returns true iff either:
+ * i) decrement succeeded (ie. there was reserve left)
+ * ii) there is extra space in the pool
+ */
+static bool dec_reserve_count(struct thin_c *tc, dm_block_t free_blocks)
+{
+ bool r = false;
+ unsigned long flags;
+
+ if (!free_blocks)
+ return false;
+
+ spin_lock_irqsave(&tc->pool->lock, flags);
+ if (tc->reserve_count > 0) {
+ tc->reserve_count--;
+ tc->pool->reserve_count--;
+ r = true;
+ } else {
+ if (free_blocks > tc->pool->reserve_count)
+ r = true;
+ }
+ spin_unlock_irqrestore(&tc->pool->lock, flags);
+
+ return r;
+}
+
+static int set_reserve_count(struct thin_c *tc, dm_block_t count)
+{
+ int r;
+ dm_block_t free_blocks, delta;
+ unsigned long flags;
+
+ r = get_free_blocks(tc->pool, &free_blocks);
+ if (r)
+ return r;
+
+ spin_lock_irqsave(&tc->pool->lock, flags);
+ if (count <= tc->reserve_count)
+ goto out_unlock; /* nothing to do */
+ delta = count - tc->reserve_count;
+ if (tc->pool->reserve_count + delta > free_blocks)
+ r = -ENOSPC;
+ else {
+ tc->reserve_count = count;
+ tc->pool->reserve_count += delta;
+ }
+out_unlock:
+ spin_unlock_irqrestore(&tc->pool->lock, flags);
+
+ return r;
+}
+
+static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
+{
+ int r;
+ dm_block_t free_blocks;
+ struct pool *pool = tc->pool;
+
+ if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
+ return -EINVAL;
+
+ r = get_free_blocks(tc->pool, &free_blocks);
+ if (r)
+ return r;
+
+ if (!dec_reserve_count(tc, free_blocks))
+ return -ENOSPC;
+
r = dm_pool_alloc_data_block(pool->pmd, result);
if (r) {
metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
@@ -2880,6 +2951,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
pool->last_commit_jiffies = jiffies;
pool->pool_md = pool_md;
pool->md_dev = metadata_dev;
+ pool->reserve_count = 0;
__pool_table_insert(pool);
return pool;
@@ -3895,6 +3967,19 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
*/
}
+static int pool_get_reserved_space(struct dm_target *ti, sector_t *nr_sects)
+{
+ unsigned long flags;
+ struct pool_c *pt = ti->private;
+ struct pool *pool = pt->pool;
+
+ spin_lock_irqsave(&pool->lock, flags);
+ *nr_sects = pool->reserve_count * pool->sectors_per_block;
+ spin_unlock_irqrestore(&pool->lock, flags);
+
+ return 0;
+}
+
static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
@@ -3913,6 +3998,7 @@ static struct target_type pool_target = {
.status = pool_status,
.iterate_devices = pool_iterate_devices,
.io_hints = pool_io_hints,
+ .get_reserved_space = pool_get_reserved_space,
};
/*----------------------------------------------------------------
@@ -3936,6 +4022,7 @@ static void thin_dtr(struct dm_target *ti)
spin_lock_irqsave(&tc->pool->lock, flags);
list_del_rcu(&tc->list);
+ tc->pool->reserve_count -= tc->reserve_count;
spin_unlock_irqrestore(&tc->pool->lock, flags);
synchronize_rcu();
@@ -4074,6 +4161,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
init_completion(&tc->can_destroy);
list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
spin_unlock_irqrestore(&tc->pool->lock, flags);
+ tc->reserve_count = 0;
/*
* This synchronize_rcu() call is needed here otherwise we risk a
* wake_worker() call finding no bios to process (because the newly
@@ -4271,6 +4359,38 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
}
+static int thin_reserve_space(struct dm_target *ti, sector_t nr_sects)
+{
+ struct thin_c *tc = ti->private;
+ struct pool *pool = tc->pool;
+ sector_t blocks;
+
+ /*
+ * @nr_sects must always be a factor of the pool's blocksize;
+ * upper layers can rely on the bdev's minimum_io_size for this.
+ */
+ if (!nr_sects || !is_factor(nr_sects, pool->sectors_per_block))
+ return -EINVAL;
+
+ blocks = nr_sects;
+ (void) sector_div(blocks, pool->sectors_per_block);
+
+ return set_reserve_count(tc, blocks);
+}
+
+static int thin_get_reserved_space(struct dm_target *ti, sector_t *nr_sects)
+{
+ unsigned long flags;
+ struct thin_c *tc = ti->private;
+ struct pool *pool = tc->pool;
+
+ spin_lock_irqsave(&tc->pool->lock, flags);
+ *nr_sects = tc->reserve_count * pool->sectors_per_block;
+ spin_unlock_irqrestore(&tc->pool->lock, flags);
+
+ return 0;
+}
+
static struct target_type thin_target = {
.name = "thin",
.version = {1, 18, 0},
@@ -4285,6 +4405,8 @@ static struct target_type thin_target = {
.status = thin_status,
.iterate_devices = thin_iterate_devices,
.io_hints = thin_io_hints,
+ .reserve_space = thin_reserve_space,
+ .get_reserved_space = thin_get_reserved_space,
};
/*----------------------------------------------------------------*/
--
2.4.3
^ permalink raw reply related [flat|nested] 12+ messages in thread* [RFC PATCH 4/9] dm thin: update reserve space func to allow reduction
2016-03-17 14:30 [RFC PATCH 0/9] dm-thin/xfs: prototype a block reservation allocation model Brian Foster
` (2 preceding siblings ...)
2016-03-17 14:30 ` [RFC PATCH 3/9] dm thin: " Brian Foster
@ 2016-03-17 14:30 ` Brian Foster
2016-03-17 14:30 ` [RFC PATCH 5/9] block: add a block_device_operations method to provision space Brian Foster
` (5 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: Brian Foster @ 2016-03-17 14:30 UTC (permalink / raw)
To: xfs; +Cc: dm-devel, linux-block, linux-fsdevel
The dm-thin set_reserve_count() function is designed to control the
current block reservation for a thin pool. It currently only provides
the ability to increase the reservation.
Clients such as XFS will rely on over-reservation and thus require the
ability to release excess reservation back to the pool. Update
set_reserve_count() to return reserved blocks back to the pool.
Signed-off-by: Brian Foster <bfoster@redhat.com>
---
drivers/md/dm-thin.c | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 31d36da..ac770d89 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -1429,7 +1429,8 @@ static bool dec_reserve_count(struct thin_c *tc, dm_block_t free_blocks)
static int set_reserve_count(struct thin_c *tc, dm_block_t count)
{
int r;
- dm_block_t free_blocks, delta;
+ dm_block_t free_blocks;
+ int64_t delta;
unsigned long flags;
r = get_free_blocks(tc->pool, &free_blocks);
@@ -1437,8 +1438,6 @@ static int set_reserve_count(struct thin_c *tc, dm_block_t count)
return r;
spin_lock_irqsave(&tc->pool->lock, flags);
- if (count <= tc->reserve_count)
- goto out_unlock; /* nothing to do */
delta = count - tc->reserve_count;
if (tc->pool->reserve_count + delta > free_blocks)
r = -ENOSPC;
@@ -1446,7 +1445,6 @@ static int set_reserve_count(struct thin_c *tc, dm_block_t count)
tc->reserve_count = count;
tc->pool->reserve_count += delta;
}
-out_unlock:
spin_unlock_irqrestore(&tc->pool->lock, flags);
return r;
@@ -4369,7 +4367,7 @@ static int thin_reserve_space(struct dm_target *ti, sector_t nr_sects)
* @nr_sects must always be a factor of the pool's blocksize;
* upper layers can rely on the bdev's minimum_io_size for this.
*/
- if (!nr_sects || !is_factor(nr_sects, pool->sectors_per_block))
+ if (!is_factor(nr_sects, pool->sectors_per_block))
return -EINVAL;
blocks = nr_sects;
--
2.4.3
^ permalink raw reply related [flat|nested] 12+ messages in thread* [RFC PATCH 5/9] block: add a block_device_operations method to provision space
2016-03-17 14:30 [RFC PATCH 0/9] dm-thin/xfs: prototype a block reservation allocation model Brian Foster
` (3 preceding siblings ...)
2016-03-17 14:30 ` [RFC PATCH 4/9] dm thin: update reserve space func to allow reduction Brian Foster
@ 2016-03-17 14:30 ` Brian Foster
2016-03-17 14:30 ` [RFC PATCH 6/9] dm: add " Brian Foster
` (4 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: Brian Foster @ 2016-03-17 14:30 UTC (permalink / raw)
To: xfs; +Cc: linux-block, linux-fsdevel, dm-devel
Signed-off-by: Brian Foster <bfoster@redhat.com>
---
fs/block_dev.c | 10 ++++++++++
include/linux/blkdev.h | 2 ++
2 files changed, 12 insertions(+)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 375a2e4..73e57b9 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -517,6 +517,16 @@ int blk_get_reserved_space(struct block_device *bdev, sector_t *nr_sects)
}
EXPORT_SYMBOL_GPL(blk_get_reserved_space);
+int blk_provision_space(struct block_device *bdev, sector_t offset, sector_t len)
+{
+ const struct block_device_operations *ops = bdev->bd_disk->fops;
+
+ if (!ops->provision_space)
+ return -EOPNOTSUPP;
+ return ops->provision_space(bdev, offset, len);
+}
+EXPORT_SYMBOL_GPL(blk_provision_space);
+
/*
* pseudo-fs
*/
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f212fe5..d0fa8a35 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1666,6 +1666,7 @@ struct block_device_operations {
void (*swap_slot_free_notify) (struct block_device *, unsigned long);
int (*reserve_space) (struct block_device *, sector_t);
int (*get_reserved_space) (struct block_device *, sector_t *);
+ int (*provision_space) (struct block_device *, sector_t, sector_t);
struct module *owner;
const struct pr_ops *pr_ops;
};
@@ -1679,6 +1680,7 @@ extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *);
extern int blk_reserve_space(struct block_device *, sector_t);
extern int blk_get_reserved_space(struct block_device *, sector_t *);
+extern int blk_provision_space(struct block_device *, sector_t, sector_t);
#else /* CONFIG_BLOCK */
struct block_device;
--
2.4.3
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 12+ messages in thread* [RFC PATCH 6/9] dm: add method to provision space
2016-03-17 14:30 [RFC PATCH 0/9] dm-thin/xfs: prototype a block reservation allocation model Brian Foster
` (4 preceding siblings ...)
2016-03-17 14:30 ` [RFC PATCH 5/9] block: add a block_device_operations method to provision space Brian Foster
@ 2016-03-17 14:30 ` Brian Foster
2016-03-17 14:30 ` [RFC PATCH 7/9] dm thin: " Brian Foster
` (3 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: Brian Foster @ 2016-03-17 14:30 UTC (permalink / raw)
To: xfs; +Cc: dm-devel, linux-block, linux-fsdevel
Signed-off-by: Brian Foster <bfoster@redhat.com>
---
drivers/md/dm.c | 35 +++++++++++++++++++++++++++++++++++
include/linux/device-mapper.h | 2 ++
2 files changed, 37 insertions(+)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4da5d3e..e1aec28 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -737,6 +737,40 @@ out:
return r;
}
+static int dm_blk_provision_space(struct block_device *bdev,
+ sector_t offset, sector_t len)
+{
+ struct mapped_device *md = bdev->bd_disk->private_data;
+ int srcu_idx;
+ struct dm_table *map;
+ struct dm_target *tgt;
+ int r = -EINVAL;
+
+ map = dm_get_live_table(md, &srcu_idx);
+
+ if (!map || !dm_table_get_size(map))
+ goto out;
+
+ /* We only support devices that have a single target */
+ if (dm_table_get_num_targets(map) != 1)
+ goto out;
+
+ tgt = dm_table_get_target(map, 0);
+ if (!tgt->type->provision_space)
+ goto out;
+
+ if (dm_suspended_md(md)) {
+ r = -EAGAIN;
+ goto out;
+ }
+
+ r = tgt->type->provision_space(tgt, offset, len);
+out:
+ dm_put_live_table(md, srcu_idx);
+
+ return r;
+}
+
static struct dm_io *alloc_io(struct mapped_device *md)
{
return mempool_alloc(md->io_pool, GFP_NOIO);
@@ -3798,6 +3832,7 @@ static const struct block_device_operations dm_blk_dops = {
.pr_ops = &dm_pr_ops,
.reserve_space = dm_blk_reserve_space,
.get_reserved_space = dm_blk_get_reserved_space,
+ .provision_space = dm_blk_provision_space,
.owner = THIS_MODULE
};
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 540c772..0ebf172 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -118,6 +118,7 @@ typedef int (*dm_busy_fn) (struct dm_target *ti);
typedef int (*dm_reserve_space_fn) (struct dm_target *ti, sector_t nr_sects);
typedef int (*dm_get_reserved_space_fn) (struct dm_target *ti, sector_t *nr_sects);
+typedef int (*dm_provision_space_fn) (struct dm_target *ti, sector_t offset, sector_t len);
void dm_error(const char *message);
@@ -167,6 +168,7 @@ struct target_type {
dm_io_hints_fn io_hints;
dm_reserve_space_fn reserve_space;
dm_get_reserved_space_fn get_reserved_space;
+ dm_provision_space_fn provision_space;
/* For internal device-mapper use. */
struct list_head list;
--
2.4.3
^ permalink raw reply related [flat|nested] 12+ messages in thread* [RFC PATCH 7/9] dm thin: add method to provision space
2016-03-17 14:30 [RFC PATCH 0/9] dm-thin/xfs: prototype a block reservation allocation model Brian Foster
` (5 preceding siblings ...)
2016-03-17 14:30 ` [RFC PATCH 6/9] dm: add " Brian Foster
@ 2016-03-17 14:30 ` Brian Foster
2016-03-17 14:30 ` [RFC PATCH 8/9] xfs: thin block device reservation mechanism Brian Foster
` (2 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: Brian Foster @ 2016-03-17 14:30 UTC (permalink / raw)
To: xfs; +Cc: dm-devel, linux-block, linux-fsdevel
This is a hacky interface added to support the filesystem experiment for
reserving blocks out of thin pools. The XFS POC uses this method to
provision space in the pool when blocks are allocated. The number of
actual blocks allocated is returned, which is used to update the
filesystem accounting.
Signed-off-by: Brian Foster <bfoster@redhat.com>
---
drivers/md/dm-thin.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 47 insertions(+)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index ac770d89..00b7322 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -4389,6 +4389,52 @@ static int thin_get_reserved_space(struct dm_target *ti, sector_t *nr_sects)
return 0;
}
+static int thin_provision_space(struct dm_target *ti, sector_t offset, sector_t len)
+{
+ struct thin_c *tc = ti->private;
+ struct pool *pool = tc->pool;
+ sector_t end;
+ dm_block_t pblock;
+ dm_block_t vblock;
+ int error;
+ struct dm_thin_lookup_result lookup;
+ sector_t count;
+
+ if (!is_factor(offset, pool->sectors_per_block))
+ return -EINVAL;
+
+ if (!len || !is_factor(len, pool->sectors_per_block))
+ return -EINVAL;
+
+ end = offset + len;
+
+ count = 0;
+ while (offset < end) {
+ vblock = offset / pool->sectors_per_block;
+
+ error = dm_thin_find_block(tc->td, vblock, true, &lookup);
+ if (error == 0)
+ goto next;
+ if (error != -ENODATA)
+ return error;
+
+ error = alloc_data_block(tc, &pblock);
+ if (error)
+ return error;
+
+ error = dm_thin_insert_block(tc->td, vblock, pblock);
+ if (error)
+ return error;
+
+ count += pool->sectors_per_block;
+next:
+ offset += pool->sectors_per_block;
+ }
+
+ /* XXX: int -> sector_t ! */
+ return count;
+}
+
static struct target_type thin_target = {
.name = "thin",
.version = {1, 18, 0},
@@ -4405,6 +4451,7 @@ static struct target_type thin_target = {
.io_hints = thin_io_hints,
.reserve_space = thin_reserve_space,
.get_reserved_space = thin_get_reserved_space,
+ .provision_space = thin_provision_space,
};
/*----------------------------------------------------------------*/
--
2.4.3
^ permalink raw reply related [flat|nested] 12+ messages in thread* [RFC PATCH 8/9] xfs: thin block device reservation mechanism
2016-03-17 14:30 [RFC PATCH 0/9] dm-thin/xfs: prototype a block reservation allocation model Brian Foster
` (6 preceding siblings ...)
2016-03-17 14:30 ` [RFC PATCH 7/9] dm thin: " Brian Foster
@ 2016-03-17 14:30 ` Brian Foster
2016-03-17 14:30 ` [RFC PATCH 9/9] xfs: adopt a reserved allocation model on dm-thin devices Brian Foster
[not found] ` <20160321133346.GD25476@redhat.com>
9 siblings, 0 replies; 12+ messages in thread
From: Brian Foster @ 2016-03-17 14:30 UTC (permalink / raw)
To: xfs; +Cc: dm-devel, linux-block, linux-fsdevel
Add block device reservation infrastructure to XFS. This primarily
consists of wrappers around the associated block device functions. This
mechanism provides the ability to reserve, release and provision a set
of blocks in the underlying block device.
The mechanism enables the filesystem to adopt a block reservation model
with the underlying device. In turn, this allows the filesystem to
identify when the underlying device is out of space and propagate an
error (-ENOSPC) gracefully before the device itself must handle the
condition. The latter typically involves a read-only state transition
and thus requires administrator intervention to resolve.
Signed-off-by: Brian Foster <bfoster@redhat.com>
---
fs/xfs/Makefile | 1 +
fs/xfs/xfs_mount.h | 5 +
fs/xfs/xfs_thin.c | 273 +++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/xfs/xfs_thin.h | 9 ++
fs/xfs/xfs_trace.h | 27 ++++++
5 files changed, 315 insertions(+)
create mode 100644 fs/xfs/xfs_thin.c
create mode 100644 fs/xfs/xfs_thin.h
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index f646391..92ea714 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -88,6 +88,7 @@ xfs-y += xfs_aops.o \
xfs_super.o \
xfs_symlink.o \
xfs_sysfs.o \
+ xfs_thin.o \
xfs_trans.o \
xfs_xattr.o \
kmem.o \
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index b570984..3696700 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -147,6 +147,11 @@ typedef struct xfs_mount {
* to various other kinds of pain inflicted on the pNFS server.
*/
__uint32_t m_generation;
+
+ bool m_thin_reserve;
+ struct mutex m_thin_res_lock;
+ uint32_t m_thin_sectpb;
+ sector_t m_thin_res;
} xfs_mount_t;
/*
diff --git a/fs/xfs/xfs_thin.c b/fs/xfs/xfs_thin.c
new file mode 100644
index 0000000..ce6f373
--- /dev/null
+++ b/fs/xfs/xfs_thin.c
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2016 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_ialloc.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_bmap.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_fsops.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_sysfs.h"
+/* XXX: above copied from xfs_mount.c */
+#include "xfs_thin.h"
+
+/*
+ * Notes/Issues:
+ *
+ * - Reservation support depends on the '-o discard' mount option so freed
+ * extents are returned to the pool.
+ * - The absolute reservation value API is potentially racy. We can cover our
+ * own reservations/provisions with a mutex, but a delta reservation API might
+ * be better.
+ * - Local reservation accounting is not necessarily correct/accurate.
+ * Reservation leakage has been reproduced, particularly in ENOSPC conditions.
+ * The discard mechanism to return blocks to dm-thin has not been totally
+ * reliable either, which means filling, removing and filling an fs causes
+ * some space to be lost. This can be worked around with fstrim for the time
+ * being.
+ * - The locking in xfs_mod_fdblocks() is not quite correct/safe. Sleeping from
+ * invalid context BUG()'s are expected. Needs to be reworked.
+ * - Worst case reservation means each XFS filesystem block is considered a new
+ * dm block allocation. This translates to a significant amount of space given
+ * larger dm block sizes. For example, 4k XFS blocks to 64k dm blocks means
+ * we'll hit ENOSPC sooner and more frequently than typically expected.
+ * - The above also means large fallocate requests are problematic. Need to find
+ * a workaround for this. Perhaps a reduced reservation is safe for known
+ * contiguous extents? E.g., xfs_bmapi_write() w/ nimaps = 1;
+ * - The xfs_mod_fdblocks() implementation means the XFS reserve pool blocks are
+ * also reserved from the thin pool. XFS defaults to 8192 reserve pool blocks
+ * in most cases, which translates to 512MB of reserved space. This can be
+ * tuned with: 'xfs_io -xc "resblks <blks>" <mnt>'. Note that insufficient
+ * reserves will result in errors in unexpected areas of code (e.g., page
+ * discards on writeback, inode unlinked list removal failures, etc.).
+ * - The existing xfs_reserve_blocks() implementation is flaky and does not
+ * correctly reserve in the event of xfs_mod_fdblocks() failure. This will
+ * likely require some fixes independent of this feature. It also may depend
+ * on some kind of (currently undefined) "query available reservation" or
+ * "perform partial reservation" API to support partial XFS reserved blocks
+ * allocation.
+ */
+
+/*
+ * Convert an fsb count to a sector reservation.
+ */
+static inline sector_t
+XFS_FSB_TO_SECT(
+ struct xfs_mount *mp,
+ xfs_fsblock_t fsb)
+{
+ sector_t bb;
+
+ bb = fsb * mp->m_thin_sectpb;
+ return bb;
+}
+
+/*
+ * Reserve blocks from the underlying block device.
+ */
+int
+xfs_thin_reserve(
+ struct xfs_mount *mp,
+ xfs_fsblock_t fsb)
+{
+ int error;
+ sector_t bb;
+
+ bb = XFS_FSB_TO_SECT(mp, fsb);
+
+ mutex_lock(&mp->m_thin_res_lock);
+
+ error = blk_reserve_space(mp->m_ddev_targp->bt_bdev,
+ mp->m_thin_res + bb);
+ if (error) {
+ if (error == -ENOSPC)
+ trace_xfs_thin_reserve_enospc(mp, mp->m_thin_res, bb);
+ goto out;
+ }
+
+ trace_xfs_thin_reserve(mp, mp->m_thin_res, bb);
+ mp->m_thin_res += bb;
+
+out:
+ mutex_unlock(&mp->m_thin_res_lock);
+ return error;
+}
+
+static int
+__xfs_thin_unreserve(
+ struct xfs_mount *mp,
+ sector_t bb)
+{
+ int error;
+
+ if (bb > mp->m_thin_res) {
+ WARN(1, "unres (%lu) exceeds current res (%lu)", bb,
+ mp->m_thin_res);
+ bb = mp->m_thin_res;
+ }
+
+ error = blk_reserve_space(mp->m_ddev_targp->bt_bdev,
+ mp->m_thin_res - bb);
+ if (error)
+ return error;
+
+ trace_xfs_thin_unreserve(mp, mp->m_thin_res, bb);
+ mp->m_thin_res -= bb;
+
+ return error;
+}
+
+/*
+ * Release a reservation back to the block device.
+ */
+int
+xfs_thin_unreserve(
+ struct xfs_mount *mp,
+ xfs_fsblock_t fsb)
+{
+ int error;
+ sector_t bb;
+
+ bb = XFS_FSB_TO_SECT(mp, fsb);
+
+ mutex_lock(&mp->m_thin_res_lock);
+ error = __xfs_thin_unreserve(mp, bb);
+ mutex_unlock(&mp->m_thin_res_lock);
+
+ return error;
+}
+
+/*
+ * Given a recently allocated extent, ask the block device to provision the
+ * underlying space.
+ */
+int
+xfs_thin_provision(
+ struct xfs_mount *mp,
+ xfs_fsblock_t offset,
+ xfs_fsblock_t len)
+{
+ sector_t bbres;
+ sector_t bbstart, bblen;
+ int count;
+ int error;
+
+ bbstart = XFS_FSB_TO_DADDR(mp, offset);
+ bbstart = round_down(bbstart, mp->m_thin_sectpb);
+ bblen = XFS_FSB_TO_BB(mp, len);
+ bblen = round_up(bblen, mp->m_thin_sectpb);
+
+ bbres = XFS_FSB_TO_SECT(mp, len);
+
+ mutex_lock(&mp->m_thin_res_lock);
+
+ WARN_ON(bblen > mp->m_thin_res);
+
+ /*
+ * XXX: alloc count here is kind of a hack. Need to find a local
+ * mechanism. Pass res to blk_provision_space?
+ */
+ count = blk_provision_space(mp->m_ddev_targp->bt_bdev, bbstart, bblen);
+ if (count < 0) {
+ error = count;
+ goto out;
+ }
+
+ trace_xfs_thin_provision(mp, count, bbres);
+
+ /*
+ * Update the local reservation based on the blocks that were actually
+ * allocated and release the rest of the unused reservation.
+ */
+ mp->m_thin_res -= count;
+ bbres -= count;
+ error = __xfs_thin_unreserve(mp, bbres);
+out:
+ mutex_unlock(&mp->m_thin_res_lock);
+ return error;
+}
+
+int
+xfs_thin_init(
+ struct xfs_mount *mp)
+{
+ sector_t res1 = 0, res2 = 0;
+ int error = 0;
+ unsigned int io_opt;
+
+ mp->m_thin_reserve = false;
+
+ if (!(mp->m_flags & XFS_MOUNT_DISCARD))
+ goto out;
+
+ mutex_init(&mp->m_thin_res_lock);
+
+ /* use optimal I/O size as dm-thin block size */
+ io_opt = bdev_io_opt(mp->m_super->s_bdev);
+ if ((io_opt % BBSIZE) || (io_opt < mp->m_sb.sb_blocksize))
+ goto out;
+ mp->m_thin_sectpb = io_opt / BBSIZE;
+
+ /*
+ * Run some test calls to determine whether the block device has
+ * support. Note: res is in 512b sector units.
+ */
+ error = xfs_thin_reserve(mp, 1);
+ if (error)
+ goto out;
+
+ error = blk_get_reserved_space(mp->m_ddev_targp->bt_bdev, &res1);
+ if (error)
+ goto out;
+
+ error = xfs_thin_unreserve(mp, 1);
+ if (error)
+ goto out;
+
+ error = blk_get_reserved_space(mp->m_ddev_targp->bt_bdev, &res2);
+ if (error)
+ goto out;
+
+ ASSERT(res1 >= 1 && res2 == 0);
+ mp->m_thin_reserve = true;
+out:
+ xfs_notice(mp, "Thin pool reservation %s", mp->m_thin_reserve ?
+ "enabled" : "disabled");
+ if (mp->m_thin_reserve)
+ xfs_notice(mp, "Thin reserve blocksize: %u sectors",
+ mp->m_thin_sectpb);
+ return 0;
+}
diff --git a/fs/xfs/xfs_thin.h b/fs/xfs/xfs_thin.h
new file mode 100644
index 0000000..ce5a019
--- /dev/null
+++ b/fs/xfs/xfs_thin.h
@@ -0,0 +1,9 @@
+#ifndef __XFS_THIN_H__
+#define __XFS_THIN_H__
+
+int xfs_thin_init(struct xfs_mount *);
+int xfs_thin_reserve(struct xfs_mount *, xfs_fsblock_t);
+int xfs_thin_unreserve(struct xfs_mount *, xfs_fsblock_t);
+int xfs_thin_provision(struct xfs_mount *, xfs_fsblock_t, xfs_fsblock_t);
+
+#endif /* __XFS_THIN_H__ */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 391d797..01b0702 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -2185,6 +2185,33 @@ DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
DEFINE_DISCARD_EVENT(xfs_discard_exclude);
DEFINE_DISCARD_EVENT(xfs_discard_busy);
+DECLARE_EVENT_CLASS(xfs_thin_class,
+ TP_PROTO(struct xfs_mount *mp, sector_t total, sector_t res),
+ TP_ARGS(mp, total, res),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(sector_t, total)
+ __field(sector_t, res)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->total = total;
+ __entry->res = res;
+ ),
+ TP_printk("dev %d:%d total %lu res %lu",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->total,
+ __entry->res)
+)
+
+#define DEFINE_THIN_EVENT(name) \
+DEFINE_EVENT(xfs_thin_class, name, \
+ TP_PROTO(struct xfs_mount *mp, sector_t total, sector_t res), \
+ TP_ARGS(mp, total, res))
+DEFINE_THIN_EVENT(xfs_thin_reserve);
+DEFINE_THIN_EVENT(xfs_thin_reserve_enospc);
+DEFINE_THIN_EVENT(xfs_thin_unreserve);
+DEFINE_THIN_EVENT(xfs_thin_provision);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
--
2.4.3
^ permalink raw reply related [flat|nested] 12+ messages in thread* [RFC PATCH 9/9] xfs: adopt a reserved allocation model on dm-thin devices
2016-03-17 14:30 [RFC PATCH 0/9] dm-thin/xfs: prototype a block reservation allocation model Brian Foster
` (7 preceding siblings ...)
2016-03-17 14:30 ` [RFC PATCH 8/9] xfs: thin block device reservation mechanism Brian Foster
@ 2016-03-17 14:30 ` Brian Foster
[not found] ` <20160321133346.GD25476@redhat.com>
9 siblings, 0 replies; 12+ messages in thread
From: Brian Foster @ 2016-03-17 14:30 UTC (permalink / raw)
To: xfs; +Cc: dm-devel, linux-block, linux-fsdevel
Adopt a reservation-based block allocation model when XFS runs on top of
a dm-thin device with accompanying support. As of today, the filesystem
has no indication of available space in the underlying device. If the
thin pool is depleted, the filesystem has no recourse but to handle the
read-only state change of the device. This results in unexpected
higher-level behavior and error returns, and can result in data loss if the
filesystem is ultimately shutdown before more space is provisioned to
the pool.
The reservation model enables XFS to manage thin pool space similar to
how delayed allocation blocks are managed today. Delalloc blocks are
reserved up front (e.g., at write time) to guarantee physical space is
available at writeback time and thus prevent data loss due to
overprovisioning. Similarly, block device reservation allows XFS to
reserve space for various operations in advance and thus guarantee an
operation will not fail for lack of space, or otherwise return an error
to the user.
To accomplish this, tie in the device block reservation calls to the
existing filesystem reservation mechanism. Each transaction now reserves
physical space in the underlying thin pool along with other such
reserved resources (e.g., filesystem blocks, log space). Delayed
allocation blocks are similarly reserved in the thin device when the
associated filesystem blocks are reserved. If a reservation cannot be
satisfied, the associated operation returns -ENOSPC precisely as if the
filesystem itself were out of space.
Note that this is proof-of-concept and highly experimental. The purpose
is to explore the potential effectiveness of such a scheme between the
filesystem and a thinly provisioned device. As such, the implementation
is hacky, broken and geared towards proof-of-concept over correctness or
completeness.
Signed-off-by: Brian Foster <bfoster@redhat.com>
---
fs/xfs/libxfs/xfs_alloc.c | 6 ++++
fs/xfs/xfs_mount.c | 81 +++++++++++++++++++++++++++++++++++++++++------
fs/xfs/xfs_mount.h | 2 ++
fs/xfs/xfs_trans.c | 26 +++++++++++++--
4 files changed, 103 insertions(+), 12 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index a708e38..2497fd3 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -35,6 +35,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
+#include "xfs_thin.h"
struct workqueue_struct *xfs_alloc_wq;
@@ -2650,6 +2651,11 @@ xfs_alloc_vextent(
goto error0;
}
+ if (mp->m_thin_reserve) {
+ error = xfs_thin_provision(mp, args->fsbno, args->len);
+ WARN_ON(error);
+ error = 0;
+ }
}
xfs_perag_put(args->pag);
return 0;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index bb753b3..98c437b 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -41,6 +41,7 @@
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_sysfs.h"
+#include "xfs_thin.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
@@ -947,6 +948,8 @@ xfs_mountfs(
xfs_qm_mount_quotas(mp);
}
+ xfs_thin_init(mp);
+
/*
* Now we are mounted, reserve a small amount of unused space for
* privileged transactions. This is needed so that transaction
@@ -1165,21 +1168,32 @@ xfs_mod_ifree(
*/
#define XFS_FDBLOCKS_BATCH 1024
int
-xfs_mod_fdblocks(
+__xfs_mod_fdblocks(
struct xfs_mount *mp,
int64_t delta,
- bool rsvd)
+ bool rsvd,
+ bool unres)
{
int64_t lcounter;
long long res_used;
s32 batch;
+ int error;
+ int64_t res_delta = 0;
if (delta > 0) {
/*
- * If the reserve pool is depleted, put blocks back into it
- * first. Most of the time the pool is full.
+ * If the reserve pool is full (the typical case), return the
+ * blocks to the general fs pool. Otherwise, return what we can
+ * to the reserve pool first.
*/
if (likely(mp->m_resblks == mp->m_resblks_avail)) {
+main_pool:
+ if (mp->m_thin_reserve && unres) {
+ error = xfs_thin_unreserve(mp, delta);
+ if (error)
+ return error;
+ }
+
percpu_counter_add(&mp->m_fdblocks, delta);
return 0;
}
@@ -1187,17 +1201,56 @@ xfs_mod_fdblocks(
spin_lock(&mp->m_sb_lock);
res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
- if (res_used > delta) {
- mp->m_resblks_avail += delta;
+ /*
+ * The reserve pool is not full. Blocks in the reserve pool must
+ * hold a bdev reservation which means we may need to re-reserve
+ * blocks depending on what the caller is giving us.
+ *
+ * If the blocks are already reserved (i.e., via a transaction
+ * reservation), simply update the reserve pool counter. If not,
+ * reserve as many blocks as we can, return those to the reserve
+ * pool, and then jump back above to return whatever is left
+ * back to the general filesystem pool.
+ */
+ if (!unres) {
+ while (!unres && delta) {
+ if (res_delta >= res_used)
+ break;
+
+ /* XXX: shouldn't call this w/ m_sb_lock */
+ error = xfs_thin_reserve(mp, 1);
+ if (error)
+ break;
+
+ res_delta++;
+ delta--;
+ }
} else {
- delta -= res_used;
- mp->m_resblks_avail = mp->m_resblks;
- percpu_counter_add(&mp->m_fdblocks, delta);
+ res_delta = min(delta, res_used);
+ delta -= res_delta;
}
+
+ if (res_used > res_delta)
+ mp->m_resblks_avail += res_delta;
+ else
+ mp->m_resblks_avail = mp->m_resblks;
spin_unlock(&mp->m_sb_lock);
+ if (delta)
+ goto main_pool;
return 0;
}
+ /* res calls take positive value */
+ if (mp->m_thin_reserve) {
+ error = xfs_thin_reserve(mp, -delta);
+ if (error == -ENOSPC && rsvd) {
+ spin_lock(&mp->m_sb_lock);
+ goto fdblocks_rsvd;
+ }
+ if (error)
+ return error;
+ }
+
/*
* Taking blocks away, need to be more accurate the closer we
* are to zero.
@@ -1228,6 +1281,7 @@ xfs_mod_fdblocks(
if (!rsvd)
goto fdblocks_enospc;
+fdblocks_rsvd:
lcounter = (long long)mp->m_resblks_avail + delta;
if (lcounter >= 0) {
mp->m_resblks_avail = lcounter;
@@ -1244,6 +1298,15 @@ fdblocks_enospc:
}
int
+xfs_mod_fdblocks(
+ struct xfs_mount *mp,
+ int64_t delta,
+ bool rsvd)
+{
+ return __xfs_mod_fdblocks(mp, delta, rsvd, true);
+}
+
+int
xfs_mod_frextents(
struct xfs_mount *mp,
int64_t delta)
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 3696700..2d43422 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -328,6 +328,8 @@ extern void xfs_unmountfs(xfs_mount_t *);
extern int xfs_mod_icount(struct xfs_mount *mp, int64_t delta);
extern int xfs_mod_ifree(struct xfs_mount *mp, int64_t delta);
+extern int __xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
+ bool reserved, bool unres);
extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
bool reserved);
extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 748b16a..729367c 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -548,11 +548,22 @@ xfs_trans_unreserve_and_mod_sb(
int64_t rtxdelta = 0;
int64_t idelta = 0;
int64_t ifreedelta = 0;
+ int64_t resdelta = 0;
int error;
/* calculate deltas */
- if (tp->t_blk_res > 0)
+ if (tp->t_blk_res > 0) {
+ /*
+ * Distinguish between what should be unreserved from an
+ * underlying thin pool and what is only returned to the
+ * free blocks counter.
+ */
blkdelta = tp->t_blk_res;
+ if (tp->t_blk_res > tp->t_blk_res_used) {
+ resdelta = tp->t_blk_res - tp->t_blk_res_used;
+ blkdelta -= resdelta;
+ }
+ }
if ((tp->t_fdblocks_delta != 0) &&
(xfs_sb_version_haslazysbcount(&mp->m_sb) ||
(tp->t_flags & XFS_TRANS_SB_DIRTY)))
@@ -571,12 +582,18 @@ xfs_trans_unreserve_and_mod_sb(
}
/* apply the per-cpu counters */
- if (blkdelta) {
- error = xfs_mod_fdblocks(mp, blkdelta, rsvd);
+ if (resdelta) {
+ error = __xfs_mod_fdblocks(mp, resdelta, rsvd, true);
if (error)
goto out;
}
+ if (blkdelta) {
+ error = __xfs_mod_fdblocks(mp, blkdelta, rsvd, false);
+ if (error)
+ goto out_undo_resblocks;
+ }
+
if (idelta) {
error = xfs_mod_icount(mp, idelta);
if (error)
@@ -681,6 +698,9 @@ out_undo_icount:
out_undo_fdblocks:
if (blkdelta)
xfs_mod_fdblocks(mp, -blkdelta, rsvd);
+out_undo_resblocks:
+ if (resdelta)
+ xfs_mod_fdblocks(mp, -resdelta, rsvd);
out:
ASSERT(error == 0);
return;
--
2.4.3
^ permalink raw reply related [flat|nested] 12+ messages in thread[parent not found: <20160321133346.GD25476@redhat.com>]