* Questions regarding request based dm-multipath and blk-layer/elevator
@ 2010-05-19 11:46 Nikanth Karthikesan
2010-05-19 18:31 ` Vivek Goyal
` (2 more replies)
0 siblings, 3 replies; 5+ messages in thread
From: Nikanth Karthikesan @ 2010-05-19 11:46 UTC (permalink / raw)
To: device-mapper development
Cc: Kiyoshi Ueda, Mike Snitzer, Alasdair G Kergon, Jens Axboe,
Jun'ichi Nomura, Vivek Goyal
Hi
I have couple of questions regd request based dm.
1. With request based multipath, we have 2 elevators in the path to the
device. Doesn't 2 idling I/O schedulers affect performance? Is it better to
use the noop elevator for the dm device? What is suggested?
I can send a patch to set noop as default for rq based dm, if it would be
better.
2. The blk-layer limit nr_requests is the maximum nr of requests per-queue.
Currently we set it to the default maximum(128) and leave it.
Instead would it be better to set it to a higher number based on the number of
paths(underlying devices) and their nr_requests? In bio-based dm-multipath, we
were queueing upto the sum of all the underlying queue's nr_requests.
For example the attached patch would set it to the sum of nr_requests of all
the targets. May be it is better to do it from the user-space daemon,
multipathd? Or just 128 requests is enough as in the end, it is going to be a
single LUN? Or should we simply use the nr_request from one of the underlying
device? Or the maximum nr_request amoung the underlying devices?
Thanks
Nikanth
diff --git a/block/blk-core.c b/block/blk-core.c
index 9fe174d..fc33c2d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -91,6 +91,44 @@ void blk_queue_congestion_threshold(struct request_queue *q)
q->nr_congestion_off = nr;
}
+unsigned long
+__blk_queue_set_nr_requests(struct request_queue *q, unsigned long nr)
+{
+ struct request_list *rl = &q->rq;
+
+ if (nr < BLKDEV_MIN_RQ)
+ nr = BLKDEV_MIN_RQ;
+
+ q->nr_requests = nr;
+ blk_queue_congestion_threshold(q);
+
+ if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
+ blk_set_queue_congested(q, BLK_RW_SYNC);
+ else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
+ blk_clear_queue_congested(q, BLK_RW_SYNC);
+
+ if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
+ blk_set_queue_congested(q, BLK_RW_ASYNC);
+ else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
+ blk_clear_queue_congested(q, BLK_RW_ASYNC);
+
+ if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
+ blk_set_queue_full(q, BLK_RW_SYNC);
+ } else if (rl->count[BLK_RW_SYNC]+1 <= q->nr_requests) {
+ blk_clear_queue_full(q, BLK_RW_SYNC);
+ wake_up(&rl->wait[BLK_RW_SYNC]);
+ }
+
+ if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
+ blk_set_queue_full(q, BLK_RW_ASYNC);
+ } else if (rl->count[BLK_RW_ASYNC]+1 <= q->nr_requests) {
+ blk_clear_queue_full(q, BLK_RW_ASYNC);
+ wake_up(&rl->wait[BLK_RW_ASYNC]);
+ }
+ return nr;
+}
+EXPORT_SYMBOL(__blk_queue_set_nr_requests);
+
/**
* blk_get_backing_dev_info - get the address of a queue's backing_dev_info
* @bdev: device
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 306759b..615198c 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -39,7 +39,6 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page)
static ssize_t
queue_requests_store(struct request_queue *q, const char *page, size_t count)
{
- struct request_list *rl = &q->rq;
unsigned long nr;
int ret;
@@ -47,37 +46,11 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
return -EINVAL;
ret = queue_var_store(&nr, page, count);
- if (nr < BLKDEV_MIN_RQ)
- nr = BLKDEV_MIN_RQ;
spin_lock_irq(q->queue_lock);
- q->nr_requests = nr;
- blk_queue_congestion_threshold(q);
-
- if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
- blk_set_queue_congested(q, BLK_RW_SYNC);
- else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
- blk_clear_queue_congested(q, BLK_RW_SYNC);
-
- if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
- blk_set_queue_congested(q, BLK_RW_ASYNC);
- else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
- blk_clear_queue_congested(q, BLK_RW_ASYNC);
-
- if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
- blk_set_queue_full(q, BLK_RW_SYNC);
- } else if (rl->count[BLK_RW_SYNC]+1 <= q->nr_requests) {
- blk_clear_queue_full(q, BLK_RW_SYNC);
- wake_up(&rl->wait[BLK_RW_SYNC]);
- }
-
- if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
- blk_set_queue_full(q, BLK_RW_ASYNC);
- } else if (rl->count[BLK_RW_ASYNC]+1 <= q->nr_requests) {
- blk_clear_queue_full(q, BLK_RW_ASYNC);
- wake_up(&rl->wait[BLK_RW_ASYNC]);
- }
+ __blk_queue_set_nr_requests(q, nr);
spin_unlock_irq(q->queue_lock);
+
return ret;
}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 9924ea2..c6deff9 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -526,6 +526,18 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
}
EXPORT_SYMBOL_GPL(dm_set_device_limits);
+int dm_set_device_nr_requests(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ unsigned long *nr_requests = data;
+ struct block_device *bdev = dev->bdev;
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ *nr_requests += q->nr_requests;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dm_set_device_nr_requests);
+
int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
struct dm_dev **result)
{
@@ -983,12 +995,13 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
* Establish the new table's queue_limits and validate them.
*/
int dm_calculate_queue_limits(struct dm_table *table,
- struct queue_limits *limits)
+ struct queue_limits *limits, unsigned long *nr_requests)
{
struct dm_target *uninitialized_var(ti);
struct queue_limits ti_limits;
unsigned i = 0;
+ *nr_requests = 0;
blk_set_default_limits(limits);
while (i < dm_table_get_num_targets(table)) {
@@ -1005,6 +1018,13 @@ int dm_calculate_queue_limits(struct dm_table *table,
ti->type->iterate_devices(ti, dm_set_device_limits,
&ti_limits);
+ /*
+ * Combine queue nr_requests limit of all the devices this
+ * target uses.
+ */
+ ti->type->iterate_devices(ti, dm_set_device_nr_requests,
+ nr_requests);
+
/* Set I/O hints portion of queue limits */
if (ti->type->io_hints)
ti->type->io_hints(ti, &ti_limits);
@@ -1074,7 +1094,7 @@ no_integrity:
}
void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
- struct queue_limits *limits)
+ struct queue_limits *limits, unsigned long nr_requests)
{
/*
* Copy table's limits to the DM device's request_queue
@@ -1098,8 +1118,10 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
* Those bios are passed to request-based dm at the resume time.
*/
smp_mb();
- if (dm_table_request_based(t))
+ if (dm_table_request_based(t)) {
queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q);
+ __blk_queue_set_nr_requests(q, nr_requests);
+ }
}
unsigned int dm_table_get_num_targets(struct dm_table *t)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index d21e128..2d45689 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2052,7 +2052,7 @@ static void __set_size(struct mapped_device *md, sector_t size)
* Returns old map, which caller must destroy.
*/
static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
- struct queue_limits *limits)
+ struct queue_limits *limits, unsigned long nr_requests)
{
struct dm_table *old_map;
struct request_queue *q = md->queue;
@@ -2083,11 +2083,13 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
__bind_mempools(md, t);
- write_lock_irqsave(&md->map_lock, flags);
+ spin_lock_irqsave(q->queue_lock, flags);
+ write_lock(&md->map_lock);
old_map = md->map;
md->map = t;
- dm_table_set_restrictions(t, q, limits);
- write_unlock_irqrestore(&md->map_lock, flags);
+ dm_table_set_restrictions(t, q, limits, nr_requests);
+ write_unlock(&md->map_lock);
+ spin_unlock_irqrestore(q->queue_lock, flags);
return old_map;
}
@@ -2390,6 +2392,7 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
struct dm_table *map = ERR_PTR(-EINVAL);
struct queue_limits limits;
int r;
+ unsigned long nr_requests;
mutex_lock(&md->suspend_lock);
@@ -2397,7 +2400,7 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
if (!dm_suspended_md(md))
goto out;
- r = dm_calculate_queue_limits(table, &limits);
+ r = dm_calculate_queue_limits(table, &limits, &nr_requests);
if (r) {
map = ERR_PTR(r);
goto out;
@@ -2410,7 +2413,7 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
goto out;
}
- map = __bind(md, table, &limits);
+ map = __bind(md, table, &limits, nr_requests);
out:
mutex_unlock(&md->suspend_lock);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index bad1724..9e25c48 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -50,9 +50,9 @@ void dm_table_event_callback(struct dm_table *t,
struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
int dm_calculate_queue_limits(struct dm_table *table,
- struct queue_limits *limits);
+ struct queue_limits *limits, unsigned long *nr_requests);
void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
- struct queue_limits *limits);
+ struct queue_limits *limits, unsigned long nr_requests);
struct list_head *dm_table_get_devices(struct dm_table *t);
void dm_table_presuspend_targets(struct dm_table *t);
void dm_table_postsuspend_targets(struct dm_table *t);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6690e8b..366bba5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -934,6 +934,9 @@ extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt);
+extern unsigned long __blk_queue_set_nr_requests(struct request_queue *q,
+ unsigned long nr);
+
extern void blk_set_default_limits(struct queue_limits *lim);
extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
sector_t offset);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 1381cd9..b0f9443 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -108,6 +108,11 @@ void dm_error(const char *message);
*/
int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data);
+/*
+ * Combine device nr_requests.
+ */
+int dm_set_device_nr_requests(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data);
struct dm_dev {
struct block_device *bdev;
^ permalink raw reply related [flat|nested] 5+ messages in thread* Re: Questions regarding request based dm-multipath and blk-layer/elevator
2010-05-19 11:46 Questions regarding request based dm-multipath and blk-layer/elevator Nikanth Karthikesan
@ 2010-05-19 18:31 ` Vivek Goyal
2010-05-20 5:51 ` Nikanth Karthikesan
2010-05-20 11:18 ` Kiyoshi Ueda
2010-05-23 15:54 ` Peter Grandi
2 siblings, 1 reply; 5+ messages in thread
From: Vivek Goyal @ 2010-05-19 18:31 UTC (permalink / raw)
To: Nikanth Karthikesan
Cc: Kiyoshi Ueda, Mike Snitzer, device-mapper development, Jens Axboe,
Jun'ichi Nomura, Alasdair G Kergon
On Wed, May 19, 2010 at 05:16:38PM +0530, Nikanth Karthikesan wrote:
> Hi
>
> I have couple of questions regd request based dm.
>
> 1. With request based multipath, we have 2 elevators in the path to the
> device. Doesn't 2 idling I/O schedulers affect performance? Is it better to
> use the noop elevator for the dm device? What is suggested?
> I can send a patch to set noop as default for rq based dm, if it would be
> better.
>
IIUC, we don't use the ioscheduler of unerlying device and directly insert
the request in request queue of underlying device. So double idling should
not be an issue.
Look at blk_insert_cloned_request(), which uses __elv_add_request() with
option ELEVATOR_INSERT_BACK.
Thanks
Vivek
> 2. The blk-layer limit nr_requests is the maximum nr of requests per-queue.
> Currently we set it to the default maximum(128) and leave it.
>
> Instead would it be better to set it to a higher number based on the number of
> paths(underlying devices) and their nr_requests? In bio-based dm-multipath, we
> were queueing upto the sum of all the underlying queue's nr_requests.
>
> For example the attached patch would set it to the sum of nr_requests of all
> the targets. May be it is better to do it from the user-space daemon,
> multipathd? Or just 128 requests is enough as in the end, it is going to be a
> single LUN? Or should we simply use the nr_request from one of the underlying
> device? Or the maximum nr_request amoung the underlying devices?
>
> Thanks
> Nikanth
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 9fe174d..fc33c2d 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -91,6 +91,44 @@ void blk_queue_congestion_threshold(struct request_queue *q)
> q->nr_congestion_off = nr;
> }
>
> +unsigned long
> +__blk_queue_set_nr_requests(struct request_queue *q, unsigned long nr)
> +{
> + struct request_list *rl = &q->rq;
> +
> + if (nr < BLKDEV_MIN_RQ)
> + nr = BLKDEV_MIN_RQ;
> +
> + q->nr_requests = nr;
> + blk_queue_congestion_threshold(q);
> +
> + if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
> + blk_set_queue_congested(q, BLK_RW_SYNC);
> + else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
> + blk_clear_queue_congested(q, BLK_RW_SYNC);
> +
> + if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
> + blk_set_queue_congested(q, BLK_RW_ASYNC);
> + else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
> + blk_clear_queue_congested(q, BLK_RW_ASYNC);
> +
> + if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
> + blk_set_queue_full(q, BLK_RW_SYNC);
> + } else if (rl->count[BLK_RW_SYNC]+1 <= q->nr_requests) {
> + blk_clear_queue_full(q, BLK_RW_SYNC);
> + wake_up(&rl->wait[BLK_RW_SYNC]);
> + }
> +
> + if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
> + blk_set_queue_full(q, BLK_RW_ASYNC);
> + } else if (rl->count[BLK_RW_ASYNC]+1 <= q->nr_requests) {
> + blk_clear_queue_full(q, BLK_RW_ASYNC);
> + wake_up(&rl->wait[BLK_RW_ASYNC]);
> + }
> + return nr;
> +}
> +EXPORT_SYMBOL(__blk_queue_set_nr_requests);
> +
> /**
> * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
> * @bdev: device
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index 306759b..615198c 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -39,7 +39,6 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page)
> static ssize_t
> queue_requests_store(struct request_queue *q, const char *page, size_t count)
> {
> - struct request_list *rl = &q->rq;
> unsigned long nr;
> int ret;
>
> @@ -47,37 +46,11 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
> return -EINVAL;
>
> ret = queue_var_store(&nr, page, count);
> - if (nr < BLKDEV_MIN_RQ)
> - nr = BLKDEV_MIN_RQ;
>
> spin_lock_irq(q->queue_lock);
> - q->nr_requests = nr;
> - blk_queue_congestion_threshold(q);
> -
> - if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
> - blk_set_queue_congested(q, BLK_RW_SYNC);
> - else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
> - blk_clear_queue_congested(q, BLK_RW_SYNC);
> -
> - if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
> - blk_set_queue_congested(q, BLK_RW_ASYNC);
> - else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
> - blk_clear_queue_congested(q, BLK_RW_ASYNC);
> -
> - if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
> - blk_set_queue_full(q, BLK_RW_SYNC);
> - } else if (rl->count[BLK_RW_SYNC]+1 <= q->nr_requests) {
> - blk_clear_queue_full(q, BLK_RW_SYNC);
> - wake_up(&rl->wait[BLK_RW_SYNC]);
> - }
> -
> - if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
> - blk_set_queue_full(q, BLK_RW_ASYNC);
> - } else if (rl->count[BLK_RW_ASYNC]+1 <= q->nr_requests) {
> - blk_clear_queue_full(q, BLK_RW_ASYNC);
> - wake_up(&rl->wait[BLK_RW_ASYNC]);
> - }
> + __blk_queue_set_nr_requests(q, nr);
> spin_unlock_irq(q->queue_lock);
> +
> return ret;
> }
>
> diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
> index 9924ea2..c6deff9 100644
> --- a/drivers/md/dm-table.c
> +++ b/drivers/md/dm-table.c
> @@ -526,6 +526,18 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
> }
> EXPORT_SYMBOL_GPL(dm_set_device_limits);
>
> +int dm_set_device_nr_requests(struct dm_target *ti, struct dm_dev *dev,
> + sector_t start, sector_t len, void *data)
> +{
> + unsigned long *nr_requests = data;
> + struct block_device *bdev = dev->bdev;
> + struct request_queue *q = bdev_get_queue(bdev);
> +
> + *nr_requests += q->nr_requests;
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(dm_set_device_nr_requests);
> +
> int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
> struct dm_dev **result)
> {
> @@ -983,12 +995,13 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
> * Establish the new table's queue_limits and validate them.
> */
> int dm_calculate_queue_limits(struct dm_table *table,
> - struct queue_limits *limits)
> + struct queue_limits *limits, unsigned long *nr_requests)
> {
> struct dm_target *uninitialized_var(ti);
> struct queue_limits ti_limits;
> unsigned i = 0;
>
> + *nr_requests = 0;
> blk_set_default_limits(limits);
>
> while (i < dm_table_get_num_targets(table)) {
> @@ -1005,6 +1018,13 @@ int dm_calculate_queue_limits(struct dm_table *table,
> ti->type->iterate_devices(ti, dm_set_device_limits,
> &ti_limits);
>
> + /*
> + * Combine queue nr_requests limit of all the devices this
> + * target uses.
> + */
> + ti->type->iterate_devices(ti, dm_set_device_nr_requests,
> + nr_requests);
> +
> /* Set I/O hints portion of queue limits */
> if (ti->type->io_hints)
> ti->type->io_hints(ti, &ti_limits);
> @@ -1074,7 +1094,7 @@ no_integrity:
> }
>
> void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
> - struct queue_limits *limits)
> + struct queue_limits *limits, unsigned long nr_requests)
> {
> /*
> * Copy table's limits to the DM device's request_queue
> @@ -1098,8 +1118,10 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
> * Those bios are passed to request-based dm at the resume time.
> */
> smp_mb();
> - if (dm_table_request_based(t))
> + if (dm_table_request_based(t)) {
> queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q);
> + __blk_queue_set_nr_requests(q, nr_requests);
> + }
> }
>
> unsigned int dm_table_get_num_targets(struct dm_table *t)
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index d21e128..2d45689 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -2052,7 +2052,7 @@ static void __set_size(struct mapped_device *md, sector_t size)
> * Returns old map, which caller must destroy.
> */
> static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
> - struct queue_limits *limits)
> + struct queue_limits *limits, unsigned long nr_requests)
> {
> struct dm_table *old_map;
> struct request_queue *q = md->queue;
> @@ -2083,11 +2083,13 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
>
> __bind_mempools(md, t);
>
> - write_lock_irqsave(&md->map_lock, flags);
> + spin_lock_irqsave(q->queue_lock, flags);
> + write_lock(&md->map_lock);
> old_map = md->map;
> md->map = t;
> - dm_table_set_restrictions(t, q, limits);
> - write_unlock_irqrestore(&md->map_lock, flags);
> + dm_table_set_restrictions(t, q, limits, nr_requests);
> + write_unlock(&md->map_lock);
> + spin_unlock_irqrestore(q->queue_lock, flags);
>
> return old_map;
> }
> @@ -2390,6 +2392,7 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
> struct dm_table *map = ERR_PTR(-EINVAL);
> struct queue_limits limits;
> int r;
> + unsigned long nr_requests;
>
> mutex_lock(&md->suspend_lock);
>
> @@ -2397,7 +2400,7 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
> if (!dm_suspended_md(md))
> goto out;
>
> - r = dm_calculate_queue_limits(table, &limits);
> + r = dm_calculate_queue_limits(table, &limits, &nr_requests);
> if (r) {
> map = ERR_PTR(r);
> goto out;
> @@ -2410,7 +2413,7 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
> goto out;
> }
>
> - map = __bind(md, table, &limits);
> + map = __bind(md, table, &limits, nr_requests);
>
> out:
> mutex_unlock(&md->suspend_lock);
> diff --git a/drivers/md/dm.h b/drivers/md/dm.h
> index bad1724..9e25c48 100644
> --- a/drivers/md/dm.h
> +++ b/drivers/md/dm.h
> @@ -50,9 +50,9 @@ void dm_table_event_callback(struct dm_table *t,
> struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
> struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
> int dm_calculate_queue_limits(struct dm_table *table,
> - struct queue_limits *limits);
> + struct queue_limits *limits, unsigned long *nr_requests);
> void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
> - struct queue_limits *limits);
> + struct queue_limits *limits, unsigned long nr_requests);
> struct list_head *dm_table_get_devices(struct dm_table *t);
> void dm_table_presuspend_targets(struct dm_table *t);
> void dm_table_postsuspend_targets(struct dm_table *t);
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 6690e8b..366bba5 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -934,6 +934,9 @@ extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
> extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
> extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
> extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt);
> +extern unsigned long __blk_queue_set_nr_requests(struct request_queue *q,
> + unsigned long nr);
> +
> extern void blk_set_default_limits(struct queue_limits *lim);
> extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
> sector_t offset);
> diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
> index 1381cd9..b0f9443 100644
> --- a/include/linux/device-mapper.h
> +++ b/include/linux/device-mapper.h
> @@ -108,6 +108,11 @@ void dm_error(const char *message);
> */
> int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
> sector_t start, sector_t len, void *data);
> +/*
> + * Combine device nr_requests.
> + */
> +int dm_set_device_nr_requests(struct dm_target *ti, struct dm_dev *dev,
> + sector_t start, sector_t len, void *data);
>
> struct dm_dev {
> struct block_device *bdev;
^ permalink raw reply [flat|nested] 5+ messages in thread* Re: Questions regarding request based dm-multipath and blk-layer/elevator
2010-05-19 18:31 ` Vivek Goyal
@ 2010-05-20 5:51 ` Nikanth Karthikesan
0 siblings, 0 replies; 5+ messages in thread
From: Nikanth Karthikesan @ 2010-05-20 5:51 UTC (permalink / raw)
To: Vivek Goyal
Cc: Kiyoshi Ueda, Mike Snitzer, device-mapper development, Jens Axboe,
Jun'ichi Nomura, Alasdair G Kergon
On Thursday 20 May 2010 00:01:47 Vivek Goyal wrote:
> On Wed, May 19, 2010 at 05:16:38PM +0530, Nikanth Karthikesan wrote:
> > Hi
> >
> > I have couple of questions regd request based dm.
> >
> > 1. With request based multipath, we have 2 elevators in the path to the
> > device. Doesn't 2 idling I/O schedulers affect performance? Is it better
> > to use the noop elevator for the dm device? What is suggested?
> > I can send a patch to set noop as default for rq based dm, if it would be
> > better.
>
> IIUC, we don't use the ioscheduler of unerlying device and directly insert
> the request in request queue of underlying device. So double idling should
> not be an issue.
>
> Look at blk_insert_cloned_request(), which uses __elv_add_request() with
> option ELEVATOR_INSERT_BACK.
>
Oh, yes. Thanks a lot for the answer.
So changing the ioscheduler of the underlying device of a dm-mulitpath device
will have no effect on I/O via the multipath device!
Thanks
Nikanth
> Thanks
> Vivek
>
> > 2. The blk-layer limit nr_requests is the maximum nr of requests
> > per-queue. Currently we set it to the default maximum(128) and leave it.
> >
> > Instead would it be better to set it to a higher number based on the
> > number of paths(underlying devices) and their nr_requests? In bio-based
> > dm-multipath, we were queueing upto the sum of all the underlying queue's
> > nr_requests.
> >
> > For example the attached patch would set it to the sum of nr_requests of
> > all the targets. May be it is better to do it from the user-space daemon,
> > multipathd? Or just 128 requests is enough as in the end, it is going to
> > be a single LUN? Or should we simply use the nr_request from one of the
> > underlying device? Or the maximum nr_request amoung the underlying
> > devices?
> >
> > Thanks
> > Nikanth
> >
> > diff --git a/block/blk-core.c b/block/blk-core.c
> > index 9fe174d..fc33c2d 100644
> > --- a/block/blk-core.c
> > +++ b/block/blk-core.c
> > @@ -91,6 +91,44 @@ void blk_queue_congestion_threshold(struct
> > request_queue *q) q->nr_congestion_off = nr;
> > }
> >
> > +unsigned long
> > +__blk_queue_set_nr_requests(struct request_queue *q, unsigned long nr)
> > +{
> > + struct request_list *rl = &q->rq;
> > +
> > + if (nr < BLKDEV_MIN_RQ)
> > + nr = BLKDEV_MIN_RQ;
> > +
> > + q->nr_requests = nr;
> > + blk_queue_congestion_threshold(q);
> > +
> > + if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
> > + blk_set_queue_congested(q, BLK_RW_SYNC);
> > + else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
> > + blk_clear_queue_congested(q, BLK_RW_SYNC);
> > +
> > + if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
> > + blk_set_queue_congested(q, BLK_RW_ASYNC);
> > + else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
> > + blk_clear_queue_congested(q, BLK_RW_ASYNC);
> > +
> > + if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
> > + blk_set_queue_full(q, BLK_RW_SYNC);
> > + } else if (rl->count[BLK_RW_SYNC]+1 <= q->nr_requests) {
> > + blk_clear_queue_full(q, BLK_RW_SYNC);
> > + wake_up(&rl->wait[BLK_RW_SYNC]);
> > + }
> > +
> > + if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
> > + blk_set_queue_full(q, BLK_RW_ASYNC);
> > + } else if (rl->count[BLK_RW_ASYNC]+1 <= q->nr_requests) {
> > + blk_clear_queue_full(q, BLK_RW_ASYNC);
> > + wake_up(&rl->wait[BLK_RW_ASYNC]);
> > + }
> > + return nr;
> > +}
> > +EXPORT_SYMBOL(__blk_queue_set_nr_requests);
> > +
> > /**
> > * blk_get_backing_dev_info - get the address of a queue's
> > backing_dev_info * @bdev: device
> > diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> > index 306759b..615198c 100644
> > --- a/block/blk-sysfs.c
> > +++ b/block/blk-sysfs.c
> > @@ -39,7 +39,6 @@ static ssize_t queue_requests_show(struct request_queue
> > *q, char *page) static ssize_t
> > queue_requests_store(struct request_queue *q, const char *page, size_t
> > count) {
> > - struct request_list *rl = &q->rq;
> > unsigned long nr;
> > int ret;
> >
> > @@ -47,37 +46,11 @@ queue_requests_store(struct request_queue *q, const
> > char *page, size_t count) return -EINVAL;
> >
> > ret = queue_var_store(&nr, page, count);
> > - if (nr < BLKDEV_MIN_RQ)
> > - nr = BLKDEV_MIN_RQ;
> >
> > spin_lock_irq(q->queue_lock);
> > - q->nr_requests = nr;
> > - blk_queue_congestion_threshold(q);
> > -
> > - if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
> > - blk_set_queue_congested(q, BLK_RW_SYNC);
> > - else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
> > - blk_clear_queue_congested(q, BLK_RW_SYNC);
> > -
> > - if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
> > - blk_set_queue_congested(q, BLK_RW_ASYNC);
> > - else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
> > - blk_clear_queue_congested(q, BLK_RW_ASYNC);
> > -
> > - if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
> > - blk_set_queue_full(q, BLK_RW_SYNC);
> > - } else if (rl->count[BLK_RW_SYNC]+1 <= q->nr_requests) {
> > - blk_clear_queue_full(q, BLK_RW_SYNC);
> > - wake_up(&rl->wait[BLK_RW_SYNC]);
> > - }
> > -
> > - if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
> > - blk_set_queue_full(q, BLK_RW_ASYNC);
> > - } else if (rl->count[BLK_RW_ASYNC]+1 <= q->nr_requests) {
> > - blk_clear_queue_full(q, BLK_RW_ASYNC);
> > - wake_up(&rl->wait[BLK_RW_ASYNC]);
> > - }
> > + __blk_queue_set_nr_requests(q, nr);
> > spin_unlock_irq(q->queue_lock);
> > +
> > return ret;
> > }
> >
> > diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
> > index 9924ea2..c6deff9 100644
> > --- a/drivers/md/dm-table.c
> > +++ b/drivers/md/dm-table.c
> > @@ -526,6 +526,18 @@ int dm_set_device_limits(struct dm_target *ti,
> > struct dm_dev *dev, }
> > EXPORT_SYMBOL_GPL(dm_set_device_limits);
> >
> > +int dm_set_device_nr_requests(struct dm_target *ti, struct dm_dev *dev,
> > + sector_t start, sector_t len, void *data)
> > +{
> > + unsigned long *nr_requests = data;
> > + struct block_device *bdev = dev->bdev;
> > + struct request_queue *q = bdev_get_queue(bdev);
> > +
> > + *nr_requests += q->nr_requests;
> > + return 0;
> > +}
> > +EXPORT_SYMBOL_GPL(dm_set_device_nr_requests);
> > +
> > int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
> > struct dm_dev **result)
> > {
> > @@ -983,12 +995,13 @@ struct dm_target *dm_table_find_target(struct
> > dm_table *t, sector_t sector) * Establish the new table's queue_limits
> > and validate them.
> > */
> > int dm_calculate_queue_limits(struct dm_table *table,
> > - struct queue_limits *limits)
> > + struct queue_limits *limits, unsigned long *nr_requests)
> > {
> > struct dm_target *uninitialized_var(ti);
> > struct queue_limits ti_limits;
> > unsigned i = 0;
> >
> > + *nr_requests = 0;
> > blk_set_default_limits(limits);
> >
> > while (i < dm_table_get_num_targets(table)) {
> > @@ -1005,6 +1018,13 @@ int dm_calculate_queue_limits(struct dm_table
> > *table, ti->type->iterate_devices(ti, dm_set_device_limits,
> > &ti_limits);
> >
> > + /*
> > + * Combine queue nr_requests limit of all the devices this
> > + * target uses.
> > + */
> > + ti->type->iterate_devices(ti, dm_set_device_nr_requests,
> > + nr_requests);
> > +
> > /* Set I/O hints portion of queue limits */
> > if (ti->type->io_hints)
> > ti->type->io_hints(ti, &ti_limits);
> > @@ -1074,7 +1094,7 @@ no_integrity:
> > }
> >
> > void dm_table_set_restrictions(struct dm_table *t, struct request_queue
> > *q, - struct queue_limits *limits)
> > + struct queue_limits *limits, unsigned long nr_requests)
> > {
> > /*
> > * Copy table's limits to the DM device's request_queue
> > @@ -1098,8 +1118,10 @@ void dm_table_set_restrictions(struct dm_table *t,
> > struct request_queue *q, * Those bios are passed to request-based dm at
> > the resume time. */
> > smp_mb();
> > - if (dm_table_request_based(t))
> > + if (dm_table_request_based(t)) {
> > queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q);
> > + __blk_queue_set_nr_requests(q, nr_requests);
> > + }
> > }
> >
> > unsigned int dm_table_get_num_targets(struct dm_table *t)
> > diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> > index d21e128..2d45689 100644
> > --- a/drivers/md/dm.c
> > +++ b/drivers/md/dm.c
> > @@ -2052,7 +2052,7 @@ static void __set_size(struct mapped_device *md,
> > sector_t size) * Returns old map, which caller must destroy.
> > */
> > static struct dm_table *__bind(struct mapped_device *md, struct dm_table
> > *t, - struct queue_limits *limits)
> > + struct queue_limits *limits, unsigned long nr_requests)
> > {
> > struct dm_table *old_map;
> > struct request_queue *q = md->queue;
> > @@ -2083,11 +2083,13 @@ static struct dm_table *__bind(struct
> > mapped_device *md, struct dm_table *t,
> >
> > __bind_mempools(md, t);
> >
> > - write_lock_irqsave(&md->map_lock, flags);
> > + spin_lock_irqsave(q->queue_lock, flags);
> > + write_lock(&md->map_lock);
> > old_map = md->map;
> > md->map = t;
> > - dm_table_set_restrictions(t, q, limits);
> > - write_unlock_irqrestore(&md->map_lock, flags);
> > + dm_table_set_restrictions(t, q, limits, nr_requests);
> > + write_unlock(&md->map_lock);
> > + spin_unlock_irqrestore(q->queue_lock, flags);
> >
> > return old_map;
> > }
> > @@ -2390,6 +2392,7 @@ struct dm_table *dm_swap_table(struct mapped_device
> > *md, struct dm_table *table) struct dm_table *map = ERR_PTR(-EINVAL);
> > struct queue_limits limits;
> > int r;
> > + unsigned long nr_requests;
> >
> > mutex_lock(&md->suspend_lock);
> >
> > @@ -2397,7 +2400,7 @@ struct dm_table *dm_swap_table(struct mapped_device
> > *md, struct dm_table *table) if (!dm_suspended_md(md))
> > goto out;
> >
> > - r = dm_calculate_queue_limits(table, &limits);
> > + r = dm_calculate_queue_limits(table, &limits, &nr_requests);
> > if (r) {
> > map = ERR_PTR(r);
> > goto out;
> > @@ -2410,7 +2413,7 @@ struct dm_table *dm_swap_table(struct mapped_device
> > *md, struct dm_table *table) goto out;
> > }
> >
> > - map = __bind(md, table, &limits);
> > + map = __bind(md, table, &limits, nr_requests);
> >
> > out:
> > mutex_unlock(&md->suspend_lock);
> > diff --git a/drivers/md/dm.h b/drivers/md/dm.h
> > index bad1724..9e25c48 100644
> > --- a/drivers/md/dm.h
> > +++ b/drivers/md/dm.h
> > @@ -50,9 +50,9 @@ void dm_table_event_callback(struct dm_table *t,
> > struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int
> > index); struct dm_target *dm_table_find_target(struct dm_table *t,
> > sector_t sector); int dm_calculate_queue_limits(struct dm_table *table,
> > - struct queue_limits *limits);
> > + struct queue_limits *limits, unsigned long *nr_requests);
> > void dm_table_set_restrictions(struct dm_table *t, struct request_queue
> > *q, - struct queue_limits *limits);
> > + struct queue_limits *limits, unsigned long nr_requests);
> > struct list_head *dm_table_get_devices(struct dm_table *t);
> > void dm_table_presuspend_targets(struct dm_table *t);
> > void dm_table_postsuspend_targets(struct dm_table *t);
> > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> > index 6690e8b..366bba5 100644
> > --- a/include/linux/blkdev.h
> > +++ b/include/linux/blkdev.h
> > @@ -934,6 +934,9 @@ extern void blk_limits_io_min(struct queue_limits
> > *limits, unsigned int min); extern void blk_queue_io_min(struct
> > request_queue *q, unsigned int min); extern void blk_limits_io_opt(struct
> > queue_limits *limits, unsigned int opt); extern void
> > blk_queue_io_opt(struct request_queue *q, unsigned int opt); +extern
> > unsigned long __blk_queue_set_nr_requests(struct request_queue *q,
> > + unsigned long nr);
> > +
> > extern void blk_set_default_limits(struct queue_limits *lim);
> > extern int blk_stack_limits(struct queue_limits *t, struct queue_limits
> > *b, sector_t offset);
> > diff --git a/include/linux/device-mapper.h
> > b/include/linux/device-mapper.h index 1381cd9..b0f9443 100644
> > --- a/include/linux/device-mapper.h
> > +++ b/include/linux/device-mapper.h
> > @@ -108,6 +108,11 @@ void dm_error(const char *message);
> > */
> > int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
> > sector_t start, sector_t len, void *data);
> > +/*
> > + * Combine device nr_requests.
> > + */
> > +int dm_set_device_nr_requests(struct dm_target *ti, struct dm_dev *dev,
> > + sector_t start, sector_t len, void *data);
> >
> > struct dm_dev {
> > struct block_device *bdev;
>
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: Questions regarding request based dm-multipath and blk-layer/elevator
2010-05-19 11:46 Questions regarding request based dm-multipath and blk-layer/elevator Nikanth Karthikesan
2010-05-19 18:31 ` Vivek Goyal
@ 2010-05-20 11:18 ` Kiyoshi Ueda
2010-05-23 15:54 ` Peter Grandi
2 siblings, 0 replies; 5+ messages in thread
From: Kiyoshi Ueda @ 2010-05-20 11:18 UTC (permalink / raw)
To: Nikanth Karthikesan
Cc: Mike Snitzer, device-mapper development, Alasdair G Kergon,
Jens Axboe, Jun'ichi Nomura, Vivek Goyal
Hi Nikanth,
On 05/19/2010 08:46 PM +0900, Nikanth Karthikesan wrote:
> Hi
>
> I have couple of questions regd request based dm.
>
> 1. With request based multipath, we have 2 elevators in the path to the
> device. Doesn't 2 idling I/O schedulers affect performance? Is it better to
> use the noop elevator for the dm device? What is suggested?
> I can send a patch to set noop as default for rq based dm, if it would be
> better.
Vivek answered perfectly.
> 2. The blk-layer limit nr_requests is the maximum nr of requests per-queue.
> Currently we set it to the default maximum(128) and leave it.
>
> Instead would it be better to set it to a higher number based on the number of
> paths(underlying devices) and their nr_requests? In bio-based dm-multipath, we
> were queueing upto the sum of all the underlying queue's nr_requests.
>
> For example the attached patch would set it to the sum of nr_requests of all
> the targets. May be it is better to do it from the user-space daemon,
> multipathd? Or just 128 requests is enough as in the end, it is going to be a
> single LUN? Or should we simply use the nr_request from one of the underlying
> device? Or the maximum nr_request amoung the underlying devices?
Thank you for working on this!
This has been on my TODO list for a long time and I have been having
the same concern.
My personal opinion is:
o q->nr_requests of underlying devices may not be of our interests,
since we don't use 'request' of underlying device's queue.
o Instead, we might have to consider queue_depth of bottom devices,
since queue_depth should affect I/O performance.
Propagating the sum of underlying device's queue_depth to upper
device using I/O topology framework or something may be an candidate
for that.
o On the other hand, which underlying devices are used depends on
multipath configuration. (e.g. For multibus, the sum of all
underlying devices' queue_depth should be appropriate. But for
failover, one of the underlying devices' queue_depth may be enough.)
o Considering above, the userspace daemon, which knows multipath
configuration in use, may be a good place to implement that.
(Although some help/interface in kernel should be needed.)
Thanks,
Kiyoshi Ueda
^ permalink raw reply [flat|nested] 5+ messages in thread* Re: Questions regarding request based dm-multipath and blk-layer/elevator
2010-05-19 11:46 Questions regarding request based dm-multipath and blk-layer/elevator Nikanth Karthikesan
2010-05-19 18:31 ` Vivek Goyal
2010-05-20 11:18 ` Kiyoshi Ueda
@ 2010-05-23 15:54 ` Peter Grandi
2 siblings, 0 replies; 5+ messages in thread
From: Peter Grandi @ 2010-05-23 15:54 UTC (permalink / raw)
To: Linux DM devel
> 1. With request based multipath, we have 2 elevators in the
> path to the device. Doesn't 2 idling I/O schedulers affect
> performance? Is it better to use the noop elevator for the
> dm device? [ ... ]
Usually "metadrivers" (DM, MD, 'loop', ...) in Linux use only
one elevator (and it should be that of the _physical device_).
But if you are concerned by that kind of stuff, consider also
the impact of the (short-sightedly idiotic, and IIRC it cannot
be disabled) plugging/unplugging logic.
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2010-05-23 15:54 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-05-19 11:46 Questions regarding request based dm-multipath and blk-layer/elevator Nikanth Karthikesan
2010-05-19 18:31 ` Vivek Goyal
2010-05-20 5:51 ` Nikanth Karthikesan
2010-05-20 11:18 ` Kiyoshi Ueda
2010-05-23 15:54 ` Peter Grandi
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.