[PATCH v1 5/9] block: loop: convert to blk-mq

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH v1 5/9] block: loop: convert to blk-mq
  2014-08-14 15:50 [PATCH v1 0/9] block & aio: kernel aio and loop mq conversion Ming Lei
@ 2014-08-14 15:50 ` Ming Lei
  2014-08-15 16:31   ` Christoph Hellwig
  0 siblings, 1 reply; 22+ messages in thread
From: Ming Lei @ 2014-08-14 15:50 UTC (permalink / raw)
  To: Jens Axboe, linux-kernel, Andrew Morton, Dave Kleikamp
  Cc: Zach Brown, Benjamin LaHaise, Christoph Hellwig, Kent Overstreet,
	linux-aio, linux-fsdevel, Dave Chinner, Ming Lei

The conversion is fairly straightforward and uses a workqueue to
dispatch requests of the loop block device, so scalability improves
a lot, and throughput also increases a lot in the case of concurrent
I/O requests.

Another benefit is that the loop driver code gets much simpler,
so the patch can be thought of as a cleanup too.

Signed-off-by: Ming Lei <ming.lei@canonical.com>
---
 drivers/block/loop.c |  294 ++++++++++++++++++++++----------------------------
 drivers/block/loop.h |   14 +--
 2 files changed, 137 insertions(+), 171 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 6cb1beb..1af5265 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -75,6 +75,7 @@
 #include <linux/sysfs.h>
 #include <linux/miscdevice.h>
 #include <linux/falloc.h>
+#include <linux/blk-mq.h>
 #include "loop.h"
 
 #include <asm/uaccess.h>
@@ -85,6 +86,8 @@ static DEFINE_MUTEX(loop_index_mutex);
 static int max_part;
 static int part_shift;
 
+static struct workqueue_struct *loop_wq;
+
 /*
  * Transfer functions
  */
@@ -466,109 +469,37 @@ out:
 	return ret;
 }
 
-/*
- * Add bio to back of pending list
- */
-static void loop_add_bio(struct loop_device *lo, struct bio *bio)
-{
-	lo->lo_bio_count++;
-	bio_list_add(&lo->lo_bio_list, bio);
-}
-
-/*
- * Grab first pending buffer
- */
-static struct bio *loop_get_bio(struct loop_device *lo)
-{
-	lo->lo_bio_count--;
-	return bio_list_pop(&lo->lo_bio_list);
-}
-
-static void loop_make_request(struct request_queue *q, struct bio *old_bio)
-{
-	struct loop_device *lo = q->queuedata;
-	int rw = bio_rw(old_bio);
-
-	if (rw == READA)
-		rw = READ;
-
-	BUG_ON(!lo || (rw != READ && rw != WRITE));
-
-	spin_lock_irq(&lo->lo_lock);
-	if (lo->lo_state != Lo_bound)
-		goto out;
-	if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY)))
-		goto out;
-	if (lo->lo_bio_count >= q->nr_congestion_on)
-		wait_event_lock_irq(lo->lo_req_wait,
-				    lo->lo_bio_count < q->nr_congestion_off,
-				    lo->lo_lock);
-	loop_add_bio(lo, old_bio);
-	wake_up(&lo->lo_event);
-	spin_unlock_irq(&lo->lo_lock);
-	return;
-
-out:
-	spin_unlock_irq(&lo->lo_lock);
-	bio_io_error(old_bio);
-}
-
 struct switch_request {
 	struct file *file;
 	struct completion wait;
 };
 
-static void do_loop_switch(struct loop_device *, struct switch_request *);
-
-static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio)
+static inline int loop_handle_bio(struct loop_device *lo, struct bio *bio)
 {
-	if (unlikely(!bio->bi_bdev)) {
-		do_loop_switch(lo, bio->bi_private);
-		bio_put(bio);
-	} else {
-		int ret = do_bio_filebacked(lo, bio);
-		bio_endio(bio, ret);
-	}
+	int ret = do_bio_filebacked(lo, bio);
+	return ret;
 }
 
 /*
- * worker thread that handles reads/writes to file backed loop devices,
- * to avoid blocking in our make_request_fn. it also does loop decrypting
- * on reads for block backed loop, as that is too heavy to do from
- * b_end_io context where irqs may be disabled.
- *
- * Loop explanation:  loop_clr_fd() sets lo_state to Lo_rundown before
- * calling kthread_stop().  Therefore once kthread_should_stop() is
- * true, make_request will not place any more requests.  Therefore
- * once kthread_should_stop() is true and lo_bio is NULL, we are
- * done with the loop.
+ * Do the actual switch; called from loop_switch() with the queue frozen
  */
-static int loop_thread(void *data)
+static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
 {
-	struct loop_device *lo = data;
-	struct bio *bio;
-
-	set_user_nice(current, MIN_NICE);
-
-	while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) {
-
-		wait_event_interruptible(lo->lo_event,
-				!bio_list_empty(&lo->lo_bio_list) ||
-				kthread_should_stop());
-
-		if (bio_list_empty(&lo->lo_bio_list))
-			continue;
-		spin_lock_irq(&lo->lo_lock);
-		bio = loop_get_bio(lo);
-		if (lo->lo_bio_count < lo->lo_queue->nr_congestion_off)
-			wake_up(&lo->lo_req_wait);
-		spin_unlock_irq(&lo->lo_lock);
+	struct file *file = p->file;
+	struct file *old_file = lo->lo_backing_file;
+	struct address_space *mapping;
 
-		BUG_ON(!bio);
-		loop_handle_bio(lo, bio);
-	}
+	/* if no new file, only flush of queued bios requested */
+	if (!file)
+		return;
 
-	return 0;
+	mapping = file->f_mapping;
+	mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
+	lo->lo_backing_file = file;
+	lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ?
+		mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
+	lo->old_gfp_mask = mapping_gfp_mask(mapping);
+	mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
 }
 
 /*
@@ -579,15 +510,18 @@ static int loop_thread(void *data)
 static int loop_switch(struct loop_device *lo, struct file *file)
 {
 	struct switch_request w;
-	struct bio *bio = bio_alloc(GFP_KERNEL, 0);
-	if (!bio)
-		return -ENOMEM;
-	init_completion(&w.wait);
+
 	w.file = file;
-	bio->bi_private = &w;
-	bio->bi_bdev = NULL;
-	loop_make_request(lo->lo_queue, bio);
-	wait_for_completion(&w.wait);
+
+	/* freeze queue and wait for completion of scheduled requests */
+	blk_mq_freeze_queue(lo->lo_queue);
+
+	/* do the switch action */
+	do_loop_switch(lo, &w);
+
+	/* unfreeze */
+	blk_mq_unfreeze_queue(lo->lo_queue);
+
 	return 0;
 }
 
@@ -596,39 +530,10 @@ static int loop_switch(struct loop_device *lo, struct file *file)
  */
 static int loop_flush(struct loop_device *lo)
 {
-	/* loop not yet configured, no running thread, nothing to flush */
-	if (!lo->lo_thread)
-		return 0;
-
 	return loop_switch(lo, NULL);
 }
 
 /*
- * Do the actual switch; called from the BIO completion routine
- */
-static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
-{
-	struct file *file = p->file;
-	struct file *old_file = lo->lo_backing_file;
-	struct address_space *mapping;
-
-	/* if no new file, only flush of queued bios requested */
-	if (!file)
-		goto out;
-
-	mapping = file->f_mapping;
-	mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
-	lo->lo_backing_file = file;
-	lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ?
-		mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
-	lo->old_gfp_mask = mapping_gfp_mask(mapping);
-	mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
-out:
-	complete(&p->wait);
-}
-
-
-/*
  * loop_change_fd switched the backing store of a loopback device to
  * a new file. This is useful for operating system installers to free up
  * the original file and in High Availability environments to switch to
@@ -889,12 +794,9 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	lo->transfer = transfer_none;
 	lo->ioctl = NULL;
 	lo->lo_sizelimit = 0;
-	lo->lo_bio_count = 0;
 	lo->old_gfp_mask = mapping_gfp_mask(mapping);
 	mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
 
-	bio_list_init(&lo->lo_bio_list);
-
 	if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
 		blk_queue_flush(lo->lo_queue, REQ_FLUSH);
 
@@ -906,14 +808,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 
 	set_blocksize(bdev, lo_blocksize);
 
-	lo->lo_thread = kthread_create(loop_thread, lo, "loop%d",
-						lo->lo_number);
-	if (IS_ERR(lo->lo_thread)) {
-		error = PTR_ERR(lo->lo_thread);
-		goto out_clr;
-	}
 	lo->lo_state = Lo_bound;
-	wake_up_process(lo->lo_thread);
 	if (part_shift)
 		lo->lo_flags |= LO_FLAGS_PARTSCAN;
 	if (lo->lo_flags & LO_FLAGS_PARTSCAN)
@@ -925,18 +820,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	bdgrab(bdev);
 	return 0;
 
-out_clr:
-	loop_sysfs_exit(lo);
-	lo->lo_thread = NULL;
-	lo->lo_device = NULL;
-	lo->lo_backing_file = NULL;
-	lo->lo_flags = 0;
-	set_capacity(lo->lo_disk, 0);
-	invalidate_bdev(bdev);
-	bd_set_size(bdev, 0);
-	kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
-	mapping_set_gfp_mask(mapping, lo->old_gfp_mask);
-	lo->lo_state = Lo_unbound;
  out_putf:
 	fput(file);
  out:
@@ -1012,11 +895,6 @@ static int loop_clr_fd(struct loop_device *lo)
 
 	spin_lock_irq(&lo->lo_lock);
 	lo->lo_state = Lo_rundown;
-	spin_unlock_irq(&lo->lo_lock);
-
-	kthread_stop(lo->lo_thread);
-
-	spin_lock_irq(&lo->lo_lock);
 	lo->lo_backing_file = NULL;
 	spin_unlock_irq(&lo->lo_lock);
 
@@ -1028,7 +906,6 @@ static int loop_clr_fd(struct loop_device *lo)
 	lo->lo_offset = 0;
 	lo->lo_sizelimit = 0;
 	lo->lo_encrypt_key_size = 0;
-	lo->lo_thread = NULL;
 	memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
 	memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
 	memset(lo->lo_file_name, 0, LO_NAME_SIZE);
@@ -1601,6 +1478,84 @@ int loop_unregister_transfer(int number)
 EXPORT_SYMBOL(loop_register_transfer);
 EXPORT_SYMBOL(loop_unregister_transfer);
 
+static int loop_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+	struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+	queue_work(loop_wq, &cmd->work);
+	return BLK_MQ_RQ_QUEUE_OK;
+}
+
+static int loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+			  unsigned int index)
+{
+	struct loop_device *lo = data;
+
+	hctx->driver_data = lo;
+	return 0;
+}
+
+static void loop_softirq_done_fn(struct request *rq)
+{
+	blk_mq_end_io(rq, rq->errors);
+}
+
+static void loop_queue_work(struct work_struct *work)
+{
+	struct loop_cmd *cmd =
+		container_of(work, struct loop_cmd, work);
+	const bool write = cmd->rq->cmd_flags & REQ_WRITE;
+	struct loop_device *lo = cmd->lo;
+	int ret = -EIO;
+	struct bio *bio;
+
+	if (lo->lo_state != Lo_bound)
+		goto failed;
+
+	if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY))
+		goto failed;
+
+	ret = 0;
+	__rq_for_each_bio(bio, cmd->rq)
+		ret |= loop_handle_bio(lo, bio);
+
+ failed:
+	if (ret)
+		cmd->rq->errors = -EIO;
+	blk_mq_complete_request(cmd->rq);
+}
+
+static int loop_init_request(void *data, struct request *rq,
+		unsigned int hctx_idx, unsigned int request_idx,
+		unsigned int numa_node)
+{
+	struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+	cmd->rq = rq;
+	cmd->lo = data;
+	INIT_WORK(&cmd->work, loop_queue_work);
+
+	return 0;
+}
+
+static int loop_init_flush_rq(void *data, struct request_queue *q,
+		struct request *flush_rq,
+		const struct request *src_rq)
+{
+	/* borrow initialization helper for common rq */
+	loop_init_request(data, flush_rq, 0, -1, NUMA_NO_NODE);
+	return 0;
+}
+
+static struct blk_mq_ops loop_mq_ops = {
+	.queue_rq       = loop_queue_rq,
+	.map_queue      = blk_mq_map_queue,
+	.init_hctx	= loop_init_hctx,
+	.init_request	= loop_init_request,
+	.init_flush_rq  = loop_init_flush_rq,
+	.complete	= loop_softirq_done_fn,
+};
+
 static int loop_add(struct loop_device **l, int i)
 {
 	struct loop_device *lo;
@@ -1627,15 +1582,20 @@ static int loop_add(struct loop_device **l, int i)
 	i = err;
 
 	err = -ENOMEM;
-	lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
-	if (!lo->lo_queue)
+	lo->tag_set.ops = &loop_mq_ops;
+	lo->tag_set.nr_hw_queues = 1;
+	lo->tag_set.queue_depth = 128;
+	lo->tag_set.numa_node = NUMA_NO_NODE;
+	lo->tag_set.cmd_size = sizeof(struct loop_cmd);
+	lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+	lo->tag_set.driver_data = lo;
+
+	if (blk_mq_alloc_tag_set(&lo->tag_set))
 		goto out_free_idr;
 
-	/*
-	 * set queue make_request_fn
-	 */
-	blk_queue_make_request(lo->lo_queue, loop_make_request);
-	lo->lo_queue->queuedata = lo;
+	lo->lo_queue = blk_mq_init_queue(&lo->tag_set);
+	if (!lo->lo_queue)
+		goto out_cleanup_tags;
 
 	disk = lo->lo_disk = alloc_disk(1 << part_shift);
 	if (!disk)
@@ -1664,9 +1624,6 @@ static int loop_add(struct loop_device **l, int i)
 	disk->flags |= GENHD_FL_EXT_DEVT;
 	mutex_init(&lo->lo_ctl_mutex);
 	lo->lo_number		= i;
-	lo->lo_thread		= NULL;
-	init_waitqueue_head(&lo->lo_event);
-	init_waitqueue_head(&lo->lo_req_wait);
 	spin_lock_init(&lo->lo_lock);
 	disk->major		= LOOP_MAJOR;
 	disk->first_minor	= i << part_shift;
@@ -1680,6 +1637,8 @@ static int loop_add(struct loop_device **l, int i)
 
 out_free_queue:
 	blk_cleanup_queue(lo->lo_queue);
+out_cleanup_tags:
+	blk_mq_free_tag_set(&lo->tag_set);
 out_free_idr:
 	idr_remove(&loop_index_idr, i);
 out_free_dev:
@@ -1692,6 +1651,7 @@ static void loop_remove(struct loop_device *lo)
 {
 	del_gendisk(lo->lo_disk);
 	blk_cleanup_queue(lo->lo_queue);
+	blk_mq_free_tag_set(&lo->tag_set);
 	put_disk(lo->lo_disk);
 	kfree(lo);
 }
@@ -1884,6 +1844,10 @@ static int __init loop_init(void)
 		loop_add(&lo, i);
 	mutex_unlock(&loop_index_mutex);
 
+	loop_wq = alloc_workqueue("kloopd", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+	if (!loop_wq)
+		panic("Failed to create kloopd\n");
+
 	printk(KERN_INFO "loop: module loaded\n");
 	return 0;
 
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 90df5d6..be796c7 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -13,6 +13,7 @@
 #include <linux/blkdev.h>
 #include <linux/spinlock.h>
 #include <linux/mutex.h>
+#include <linux/workqueue.h>
 #include <uapi/linux/loop.h>
 
 /* Possible states of device */
@@ -52,19 +53,20 @@ struct loop_device {
 	gfp_t		old_gfp_mask;
 
 	spinlock_t		lo_lock;
-	struct bio_list		lo_bio_list;
-	unsigned int		lo_bio_count;
 	int			lo_state;
 	struct mutex		lo_ctl_mutex;
-	struct task_struct	*lo_thread;
-	wait_queue_head_t	lo_event;
-	/* wait queue for incoming requests */
-	wait_queue_head_t	lo_req_wait;
 
 	struct request_queue	*lo_queue;
+	struct blk_mq_tag_set	tag_set;
 	struct gendisk		*lo_disk;
 };
 
+struct loop_cmd {
+	struct work_struct work;
+	struct request *rq;
+	struct loop_device *lo;
+};
+
 /* Support for loadable transfer modules */
 struct loop_func_table {
 	int number;	/* filter type */ 
-- 
1.7.9.5


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH v1 5/9] block: loop: convert to blk-mq
  2014-08-14 15:50 ` [PATCH v1 5/9] block: loop: convert to blk-mq Ming Lei
@ 2014-08-15 16:31   ` Christoph Hellwig
  2014-08-15 16:36     ` Jens Axboe
  0 siblings, 1 reply; 22+ messages in thread
From: Christoph Hellwig @ 2014-08-15 16:31 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, linux-kernel, Andrew Morton, Dave Kleikamp,
	Zach Brown, Benjamin LaHaise, Christoph Hellwig, Kent Overstreet,
	linux-aio, linux-fsdevel, Dave Chinner

> +
> +static int loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
> +			  unsigned int index)
> +{
> +	struct loop_device *lo = data;
> +
> +	hctx->driver_data = lo;

I don't think there is much of a point to store this in the hctx
instead of relying on the queue.

> +static void loop_softirq_done_fn(struct request *rq)
> +{
> +	blk_mq_end_io(rq, rq->errors);
> +}

no need for a noop softirq done function.

> +static void loop_queue_work(struct work_struct *work)

Offloading work straight to a workqueue doesn't make much sense
in the blk-mq model, as we'll usually be called from one.  If you
need to avoid the cases where we are called directly, a flag for
the blk-mq code to always schedule a workqueue sounds like a much
better plan.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v1 5/9] block: loop: convert to blk-mq
  2014-08-15 16:31   ` Christoph Hellwig
@ 2014-08-15 16:36     ` Jens Axboe
  2014-08-15 16:46       ` Jens Axboe
  0 siblings, 1 reply; 22+ messages in thread
From: Jens Axboe @ 2014-08-15 16:36 UTC (permalink / raw)
  To: Christoph Hellwig, Ming Lei
  Cc: linux-kernel, Andrew Morton, Dave Kleikamp, Zach Brown,
	Benjamin LaHaise, Kent Overstreet, linux-aio, linux-fsdevel,
	Dave Chinner

On 08/15/2014 10:31 AM, Christoph Hellwig wrote:
>> +static void loop_queue_work(struct work_struct *work)
> 
> Offloading work straight to a workqueue dosn't make much sense
> in the blk-mq model as we'll usually be called from one.  If you
> need to avoid the cases where we are called directly a flag for
> the blk-mq code to always schedule a workqueue sounds like a much
> better plan.

That's a good point - would clean up this bit, and be pretty close to a
one-liner to support in blk-mq for the drivers that always need blocking
context.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v1 5/9] block: loop: convert to blk-mq
  2014-08-15 16:36     ` Jens Axboe
@ 2014-08-15 16:46       ` Jens Axboe
  2014-08-16  8:06         ` Ming Lei
  0 siblings, 1 reply; 22+ messages in thread
From: Jens Axboe @ 2014-08-15 16:46 UTC (permalink / raw)
  To: Christoph Hellwig, Ming Lei
  Cc: linux-kernel, Andrew Morton, Dave Kleikamp, Zach Brown,
	Benjamin LaHaise, Kent Overstreet, linux-aio, linux-fsdevel,
	Dave Chinner

[-- Attachment #1: Type: text/plain, Size: 825 bytes --]

On 08/15/2014 10:36 AM, Jens Axboe wrote:
> On 08/15/2014 10:31 AM, Christoph Hellwig wrote:
>>> +static void loop_queue_work(struct work_struct *work)
>>
>> Offloading work straight to a workqueue dosn't make much sense
>> in the blk-mq model as we'll usually be called from one.  If you
>> need to avoid the cases where we are called directly a flag for
>> the blk-mq code to always schedule a workqueue sounds like a much
>> better plan.
> 
> That's a good point - would clean up this bit, and be pretty close to a
> one-liner to support in blk-mq for the drivers that always need blocking
> context.

Something like this should do the trick - totally untested. But with
that, loop would just need to add BLK_MQ_F_WQ_CONTEXT to its tag set
flags and it could always do the work inline from ->queue_rq().

-- 
Jens Axboe


[-- Attachment #2: blk-mq-wq.patch --]
[-- Type: text/x-patch, Size: 1230 bytes --]

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5189cb1e478a..a97eb9a4af60 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -803,6 +803,9 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
 		return;
 
+	if (hctx->flags & BLK_MQ_F_WQ_CONTEXT)
+		async = true;
+
 	if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
 		__blk_mq_run_hw_queue(hctx);
 	else if (hctx->queue->nr_hw_queues == 1)
@@ -1173,7 +1176,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		goto run_queue;
 	}
 
-	if (is_sync) {
+	if (is_sync && !(data.hctx->flags & BLK_MQ_F_WQ_CONTEXT)) {
 		int ret;
 
 		blk_mq_bio_to_request(rq, bio);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index eb726b9c5762..c7a8c5fdd380 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -127,7 +127,8 @@ enum {
 	BLK_MQ_RQ_QUEUE_ERROR	= 2,	/* end IO with error */
 
 	BLK_MQ_F_SHOULD_MERGE	= 1 << 0,
-	BLK_MQ_F_SHOULD_SORT	= 1 << 1,
+	BLK_MQ_F_WQ_CONTEXT	= 1 << 1,	/* ->queue_rq() must run from
+						 * a blocking context */
 	BLK_MQ_F_TAG_SHARED	= 1 << 2,
 	BLK_MQ_F_SG_MERGE	= 1 << 3,
 	BLK_MQ_F_SYSFS_UP	= 1 << 4,

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH v1 5/9] block: loop: convert to blk-mq
  2014-08-15 16:46       ` Jens Axboe
@ 2014-08-16  8:06         ` Ming Lei
  2014-08-17 17:48           ` Jens Axboe
  0 siblings, 1 reply; 22+ messages in thread
From: Ming Lei @ 2014-08-16  8:06 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Christoph Hellwig, linux-kernel, Andrew Morton, Dave Kleikamp,
	Zach Brown, Benjamin LaHaise, Kent Overstreet, linux-aio,
	linux-fsdevel, Dave Chinner

On 8/16/14, Jens Axboe <axboe@kernel.dk> wrote:
> On 08/15/2014 10:36 AM, Jens Axboe wrote:
>> On 08/15/2014 10:31 AM, Christoph Hellwig wrote:
>>>> +static void loop_queue_work(struct work_struct *work)
>>>
>>> Offloading work straight to a workqueue dosn't make much sense
>>> in the blk-mq model as we'll usually be called from one.  If you
>>> need to avoid the cases where we are called directly a flag for
>>> the blk-mq code to always schedule a workqueue sounds like a much
>>> better plan.
>>
>> That's a good point - would clean up this bit, and be pretty close to a
>> one-liner to support in blk-mq for the drivers that always need blocking
>> context.
>
> Something like this should do the trick - totally untested. But with
> that, loop would just need to add BLK_MQ_F_WQ_CONTEXT to it's tag set
> flags and it could always do the work inline from ->queue_rq().

I think it is a good idea.

But for loop, there may be two problems:

- the default max_active for a bound workqueue is 256, which means several
slow loop devices might slow down the whole block system. With kernel AIO, it
won't be a big deal, but some block/fs may not support direct I/O and still
fall back to the workqueue

- section 6 ("Guidelines") of Documentation/workqueue.txt says:
If there is dependency among multiple work items used during memory
reclaim, they should be queued to separate wq each with WQ_MEM_RECLAIM.


Thanks,
-- 
Ming Lei

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v1 5/9] block: loop: convert to blk-mq
  2014-08-16  8:06         ` Ming Lei
@ 2014-08-17 17:48           ` Jens Axboe
  2014-08-18  1:22             ` Ming Lei
  0 siblings, 1 reply; 22+ messages in thread
From: Jens Axboe @ 2014-08-17 17:48 UTC (permalink / raw)
  To: Ming Lei
  Cc: Christoph Hellwig, linux-kernel, Andrew Morton, Dave Kleikamp,
	Zach Brown, Benjamin LaHaise, Kent Overstreet, linux-aio,
	linux-fsdevel, Dave Chinner

On 2014-08-16 02:06, Ming Lei wrote:
> On 8/16/14, Jens Axboe <axboe@kernel.dk> wrote:
>> On 08/15/2014 10:36 AM, Jens Axboe wrote:
>>> On 08/15/2014 10:31 AM, Christoph Hellwig wrote:
>>>>> +static void loop_queue_work(struct work_struct *work)
>>>>
>>>> Offloading work straight to a workqueue dosn't make much sense
>>>> in the blk-mq model as we'll usually be called from one.  If you
>>>> need to avoid the cases where we are called directly a flag for
>>>> the blk-mq code to always schedule a workqueue sounds like a much
>>>> better plan.
>>>
>>> That's a good point - would clean up this bit, and be pretty close to a
>>> one-liner to support in blk-mq for the drivers that always need blocking
>>> context.
>>
>> Something like this should do the trick - totally untested. But with
>> that, loop would just need to add BLK_MQ_F_WQ_CONTEXT to it's tag set
>> flags and it could always do the work inline from ->queue_rq().
>
> I think it is a good idea.
>
> But for loop, there may be two problems:
>
> - default max_active for bound workqueue is 256, which means several slow
> loop devices might slow down whole block system. With kernel AIO, it won't
> be a big deal, but some block/fs may not support direct I/O and still
> fallback to
> workqueue
>
> - 6. Guidelines of Documentation/workqueue.txt
> If there is dependency among multiple work items used during memory
> reclaim, they should be queued to separate wq each with WQ_MEM_RECLAIM.

Both are good points. But I think this mainly means that we should 
support this through a potentially per-dispatch queue workqueue, 
separate from kblockd. There's no reason blk-mq can't support this with 
a per-hctx workqueue, for drivers that need it.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v1 5/9] block: loop: convert to blk-mq
  2014-08-17 17:48           ` Jens Axboe
@ 2014-08-18  1:22             ` Ming Lei
  2014-08-18 11:53               ` Ming Lei
  0 siblings, 1 reply; 22+ messages in thread
From: Ming Lei @ 2014-08-18  1:22 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Christoph Hellwig, Linux Kernel Mailing List, Andrew Morton,
	Dave Kleikamp, Zach Brown, Benjamin LaHaise, Kent Overstreet,
	open list:AIO, Linux FS Devel, Dave Chinner

On Mon, Aug 18, 2014 at 1:48 AM, Jens Axboe <axboe@kernel.dk> wrote:
> On 2014-08-16 02:06, Ming Lei wrote:
>>
>> On 8/16/14, Jens Axboe <axboe@kernel.dk> wrote:
>>>
>>> On 08/15/2014 10:36 AM, Jens Axboe wrote:
>>>>
>>>> On 08/15/2014 10:31 AM, Christoph Hellwig wrote:
>>>>>>
>>>>>> +static void loop_queue_work(struct work_struct *work)
>>>>>
>>>>>
>>>>> Offloading work straight to a workqueue dosn't make much sense
>>>>> in the blk-mq model as we'll usually be called from one.  If you
>>>>> need to avoid the cases where we are called directly a flag for
>>>>> the blk-mq code to always schedule a workqueue sounds like a much
>>>>> better plan.
>>>>
>>>>
>>>> That's a good point - would clean up this bit, and be pretty close to a
>>>> one-liner to support in blk-mq for the drivers that always need blocking
>>>> context.
>>>
>>>
>>> Something like this should do the trick - totally untested. But with
>>> that, loop would just need to add BLK_MQ_F_WQ_CONTEXT to it's tag set
>>> flags and it could always do the work inline from ->queue_rq().
>>
>>
>> I think it is a good idea.
>>
>> But for loop, there may be two problems:
>>
>> - default max_active for bound workqueue is 256, which means several slow
>> loop devices might slow down whole block system. With kernel AIO, it won't
>> be a big deal, but some block/fs may not support direct I/O and still
>> fallback to
>> workqueue
>>
>> - 6. Guidelines of Documentation/workqueue.txt
>> If there is dependency among multiple work items used during memory
>> reclaim, they should be queued to separate wq each with WQ_MEM_RECLAIM.
>
>
> Both are good points. But I think this mainly means that we should support
> this through a potentially per-dispatch queue workqueue, separate from
> kblockd. There's no reason blk-mq can't support this with a per-hctx
> workqueue, for drivers that need it.

Good idea, and a per-device workqueue should be enough if the
BLK_MQ_F_WQ_CONTEXT flag is set.

Thanks,

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v1 5/9] block: loop: convert to blk-mq
  2014-08-18  1:22             ` Ming Lei
@ 2014-08-18 11:53               ` Ming Lei
       [not found]                 ` <53F3B89D.6070703@kernel.dk>
  0 siblings, 1 reply; 22+ messages in thread
From: Ming Lei @ 2014-08-18 11:53 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Christoph Hellwig, Linux Kernel Mailing List, Andrew Morton,
	Dave Kleikamp, Zach Brown, Benjamin LaHaise, Kent Overstreet,
	open list:AIO, Linux FS Devel, Dave Chinner

On Mon, Aug 18, 2014 at 9:22 AM, Ming Lei <ming.lei@canonical.com> wrote:
> On Mon, Aug 18, 2014 at 1:48 AM, Jens Axboe <axboe@kernel.dk> wrote:
>> On 2014-08-16 02:06, Ming Lei wrote:
>>>
>>> On 8/16/14, Jens Axboe <axboe@kernel.dk> wrote:
>>>>
>>>> On 08/15/2014 10:36 AM, Jens Axboe wrote:
>>>>>
>>>>> On 08/15/2014 10:31 AM, Christoph Hellwig wrote:
>>>>>>>
>>>>>>> +static void loop_queue_work(struct work_struct *work)
>>>>>>
>>>>>>
>>>>>> Offloading work straight to a workqueue dosn't make much sense
>>>>>> in the blk-mq model as we'll usually be called from one.  If you
>>>>>> need to avoid the cases where we are called directly a flag for
>>>>>> the blk-mq code to always schedule a workqueue sounds like a much
>>>>>> better plan.
>>>>>
>>>>>
>>>>> That's a good point - would clean up this bit, and be pretty close to a
>>>>> one-liner to support in blk-mq for the drivers that always need blocking
>>>>> context.
>>>>
>>>>
>>>> Something like this should do the trick - totally untested. But with
>>>> that, loop would just need to add BLK_MQ_F_WQ_CONTEXT to it's tag set
>>>> flags and it could always do the work inline from ->queue_rq().
>>>
>>>
>>> I think it is a good idea.
>>>
>>> But for loop, there may be two problems:
>>>
>>> - default max_active for bound workqueue is 256, which means several slow
>>> loop devices might slow down whole block system. With kernel AIO, it won't
>>> be a big deal, but some block/fs may not support direct I/O and still
>>> fallback to
>>> workqueue
>>>
>>> - 6. Guidelines of Documentation/workqueue.txt
>>> If there is dependency among multiple work items used during memory
>>> reclaim, they should be queued to separate wq each with WQ_MEM_RECLAIM.
>>
>>
>> Both are good points. But I think this mainly means that we should support
>> this through a potentially per-dispatch queue workqueue, separate from
>> kblockd. There's no reason blk-mq can't support this with a per-hctx
>> workqueue, for drivers that need it.
>
> Good idea, and per-device workqueue should be enough if
> BLK_MQ_F_WQ_CONTEXT flag is set.

Maybe in most cases a per-device-class (driver) workqueue should be
enough, since dependencies between devices driven by the same driver
aren't common; for example, loop over loop is absolutely insane.

I will keep the work queue in loop-mq V2, and it should be easy to switch
to the mechanism once it is ready.

Thanks,

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v1 5/9] block: loop: convert to blk-mq
       [not found]                 ` <53F3B89D.6070703@kernel.dk>
@ 2014-08-20  1:23                   ` Ming Lei
       [not found]                     ` <53F4C835.7030407@kernel.dk>
  2014-08-21  5:44                   ` Ming Lei
  1 sibling, 1 reply; 22+ messages in thread
From: Ming Lei @ 2014-08-20  1:23 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Christoph Hellwig, Linux Kernel Mailing List, Andrew Morton,
	Dave Kleikamp, Zach Brown, Benjamin LaHaise, Kent Overstreet,
	open list:AIO <linux-aio@kvack.org>, Linux FS Devel <linux-fsdevel@vger.kernel.org>, Dave Chinner,
	Tejun Heo

On Wed, Aug 20, 2014 at 4:50 AM, Jens Axboe <axboe@kernel.dk> wrote:
> On 2014-08-18 06:53, Ming Lei wrote:
>>
>> On Mon, Aug 18, 2014 at 9:22 AM, Ming Lei <ming.lei@canonical.com> wrote:
>>>
>>> On Mon, Aug 18, 2014 at 1:48 AM, Jens Axboe <axboe@kernel.dk> wrote:
>>>>
>>>> On 2014-08-16 02:06, Ming Lei wrote:
>>>>>
>>>>>
>>>>> On 8/16/14, Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>
>>>>>>
>>>>>> On 08/15/2014 10:36 AM, Jens Axboe wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 08/15/2014 10:31 AM, Christoph Hellwig wrote:
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> +static void loop_queue_work(struct work_struct *work)
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> Offloading work straight to a workqueue dosn't make much sense
>>>>>>>> in the blk-mq model as we'll usually be called from one.  If you
>>>>>>>> need to avoid the cases where we are called directly a flag for
>>>>>>>> the blk-mq code to always schedule a workqueue sounds like a much
>>>>>>>> better plan.
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> That's a good point - would clean up this bit, and be pretty close to
>>>>>>> a
>>>>>>> one-liner to support in blk-mq for the drivers that always need
>>>>>>> blocking
>>>>>>> context.
>>>>>>
>>>>>>
>>>>>>
>>>>>> Something like this should do the trick - totally untested. But with
>>>>>> that, loop would just need to add BLK_MQ_F_WQ_CONTEXT to it's tag set
>>>>>> flags and it could always do the work inline from ->queue_rq().
>>>>>
>>>>>
>>>>>
>>>>> I think it is a good idea.
>>>>>
>>>>> But for loop, there may be two problems:
>>>>>
>>>>> - default max_active for bound workqueue is 256, which means several
>>>>> slow
>>>>> loop devices might slow down whole block system. With kernel AIO, it
>>>>> won't
>>>>> be a big deal, but some block/fs may not support direct I/O and still
>>>>> fallback to
>>>>> workqueue
>>>>>
>>>>> - 6. Guidelines of Documentation/workqueue.txt
>>>>> If there is dependency among multiple work items used during memory
>>>>> reclaim, they should be queued to separate wq each with WQ_MEM_RECLAIM.
>>>>
>>>>
>>>>
>>>> Both are good points. But I think this mainly means that we should
>>>> support
>>>> this through a potentially per-dispatch queue workqueue, separate from
>>>> kblockd. There's no reason blk-mq can't support this with a per-hctx
>>>> workqueue, for drivers that need it.
>>>
>>>
>>> Good idea, and per-device workqueue should be enough if
>>> BLK_MQ_F_WQ_CONTEXT flag is set.
>>
>>
>> Maybe for most of cases per-device class(driver) workqueue should be
>> enough since dependency between devices driven by same driver
>> isn't common, for example, loop over loop is absolutely insane.
>
>
> It's insane, but it can happen. And given how cheap it is to do a workqueue,

A workqueue with WQ_MEM_RECLAIM needs to create a standalone kthread
for the queue, so by default there will be 8 kthreads created even if no one
uses loop at all.  In the current implementation the per-device thread is
created only when a file or block device is attached to the loop device, which
may not be possible when blk-mq supports a per-device workqueue.

> I don't see a reason why we should not. Loop over loop might seem nutty, but
> it's not that far out into the realm of nutty things that people end up
> doing.

There is another reason I am still not sure whether a workqueue is good for
loop, though I do really like workqueues for the sake of simplicity :-)

- sequential read becomes a bit slower with a workqueue, especially for some
fast block devices (such as null_blk)

- random read becomes a bit slower too for some fast devices (such as null_blk)
in some environments (it is reproducible on my server, but not on my laptop),
even though it can improve throughput quite a lot for common devices (HDD, SSD, ...)

From my investigation, context switches increase by almost 50% with a
workqueue compared with a kthread in loop in a quad-core VM. With a
kthread, requests may be handled as a batch in cases which won't
block in read()/write() (like null_blk, tmpfs, ...), but that is no longer
possible with a workqueue.  Also, block plug & unplug could be used
with a kthread to optimize this case, especially when kernel AIO is applied,
which is impossible with a workqueue too.

So it looks like a kthread with kernel AIO is still not bad for the blk-mq
conversion, and it can improve throughput a lot too.  Any other ideas?


Thanks

>
>
>> I will keep the work queue in loop-mq V2, and it should be easy to switch
>> to the mechanism once it is ready.
>
>
> Reworked a bit more:
>
> http://git.kernel.dk/?p=linux-block.git;a=commit;h=a323185a761b9a54dc340d383695b4205ea258b6
>
> Lets base loop-mq on the blk-mq workqueues, it would simplify it quite a bit
> and I don't think there's much point in doing v1 and then ripping it out for
> v2. Especially since it isn't queued up for 3.18 yet.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v1 5/9] block: loop: convert to blk-mq
       [not found]                     ` <53F4C835.7030407@kernel.dk>
@ 2014-08-21  2:54                       ` Ming Lei
       [not found]                         ` <53F5605C.2010304@kernel.dk>
  0 siblings, 1 reply; 22+ messages in thread
From: Ming Lei @ 2014-08-21  2:54 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Christoph Hellwig, Linux Kernel Mailing List, Andrew Morton,
	Dave Kleikamp, Zach Brown, Benjamin LaHaise, Kent Overstreet,
	open list:AIO <linux-aio@kvack.org>, Linux FS Devel <linux-fsdevel@vger.kernel.org>, Dave Chinner <david@fromorbit.com>, Tejun Heo

On Thu, Aug 21, 2014 at 12:09 AM, Jens Axboe <axboe@kernel.dk> wrote:
> On 2014-08-19 20:23, Ming Lei wrote:
>>
>> On Wed, Aug 20, 2014 at 4:50 AM, Jens Axboe <axboe@kernel.dk> wrote:
>>>
>>> On 2014-08-18 06:53, Ming Lei wrote:
>>>>
>>>>
>>>> On Mon, Aug 18, 2014 at 9:22 AM, Ming Lei <ming.lei@canonical.com>
>>>> wrote:
>>>>>
>>>>>
>>>>> On Mon, Aug 18, 2014 at 1:48 AM, Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>
>>>>>>
>>>>>> On 2014-08-16 02:06, Ming Lei wrote:
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> On 8/16/14, Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> On 08/15/2014 10:36 AM, Jens Axboe wrote:
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> On 08/15/2014 10:31 AM, Christoph Hellwig wrote:
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> +static void loop_queue_work(struct work_struct *work)
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Offloading work straight to a workqueue dosn't make much sense
>>>>>>>>>> in the blk-mq model as we'll usually be called from one.  If you
>>>>>>>>>> need to avoid the cases where we are called directly a flag for
>>>>>>>>>> the blk-mq code to always schedule a workqueue sounds like a much
>>>>>>>>>> better plan.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> That's a good point - would clean up this bit, and be pretty close
>>>>>>>>> to
>>>>>>>>> a
>>>>>>>>> one-liner to support in blk-mq for the drivers that always need
>>>>>>>>> blocking
>>>>>>>>> context.
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> Something like this should do the trick - totally untested. But with
>>>>>>>> that, loop would just need to add BLK_MQ_F_WQ_CONTEXT to it's tag
>>>>>>>> set
>>>>>>>> flags and it could always do the work inline from ->queue_rq().
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> I think it is a good idea.
>>>>>>>
>>>>>>> But for loop, there may be two problems:
>>>>>>>
>>>>>>> - default max_active for bound workqueue is 256, which means several
>>>>>>> slow
>>>>>>> loop devices might slow down whole block system. With kernel AIO, it
>>>>>>> won't
>>>>>>> be a big deal, but some block/fs may not support direct I/O and still
>>>>>>> fallback to
>>>>>>> workqueue
>>>>>>>
>>>>>>> - 6. Guidelines of Documentation/workqueue.txt
>>>>>>> If there is dependency among multiple work items used during memory
>>>>>>> reclaim, they should be queued to separate wq each with
>>>>>>> WQ_MEM_RECLAIM.
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> Both are good points. But I think this mainly means that we should
>>>>>> support
>>>>>> this through a potentially per-dispatch queue workqueue, separate from
>>>>>> kblockd. There's no reason blk-mq can't support this with a per-hctx
>>>>>> workqueue, for drivers that need it.
>>>>>
>>>>>
>>>>>
>>>>> Good idea, and per-device workqueue should be enough if
>>>>> BLK_MQ_F_WQ_CONTEXT flag is set.
>>>>
>>>>
>>>>
>>>> Maybe for most of cases per-device class(driver) workqueue should be
>>>> enough since dependency between devices driven by same driver
>>>> isn't common, for example, loop over loop is absolutely insane.
>>>
>>>
>>>
>>> It's insane, but it can happen. And given how cheap it is to do a
>>> workqueue,
>>
>>
>> Workqueue with WQ_MEM_RECLAIM need to create a standalone kthread
>> for the queue, so at default there will be 8 kthreads created even no one
>> uses loop at all.  From current implementation the per-device thread is
>> created only when one file or blk device is attached to the loop device,
>> which
>> may not be possible when blk-mq supports per-device workqueue.
>
>
> That is true, but I don't see this as a huge problem. And idle kthread is
> pretty much free...

OK, I am fine with that too if no one complains about it, :-)

BTW, loop over loop won't be a problem since the loop driver can cut the
dependency and just use the original backing file, so one workqueue should
be enough for all loop devices.

>
>
>>> I don't see a reason why we should not. Loop over loop might seem nutty,
>>> but
>>> it's not that far out into the realm of nutty things that people end up
>>> doing.
>>
>>
>> Another reason I am still not sure if workqueue is good for loop, though I
>> do really like workqueue for sake of simplicity, :-)
>>
>> - sequential read becomes a bit slow with workqueue, especially for some
>> fast block(such as null_blk)
>>
>> - random read becomes a bit slow too for some fast devices(such as
>> null_blk)
>> in some environment(It is reproduced in my server, but can't in my laptop)
>> even
>> it can improve throughout quite much for common devices(HDD., SSD,..)
>
>
> Thread offloading will always slow down some use cases, like sync(ish) IO.
> Not sure this is a case against kthread vs workqueue, performance and
> behavior should be identical here?

It looks like no sync is involved because I just tested randread with fio, and
the cause should be the same as below.

>
>
>>  From my investigation, context switch increases almost 50% with
>> workqueue compared with kthread in loop in a quad-core VM. With
>> kthread, requests may be handled as batch in cases which won't be
>> blocked in read()/write()(like null_blk, tmpfs, ...), but it is impossible
>> with
>> workqueue any more.  Also block plug&unplug should have been used
>> with kthread to optimize the case, especially when kernel AIO is applied,
>> still impossible with work queue too.
>
>
> OK, that one is actually a good point, since one need not do per-item
> queueing. We could handle different units, though. And we should have proper
> marking of the last item in a chain of stuff, so we might even be able to
> offload based on that instead of doing single items. It wont help the sync
> case, but for that, workqueue and kthread would be identical.

We may do that by introducing a queue_rq_list callback in blk_mq_ops,
and I will work out a patch today to see if it can help the case.

> Or we could just provide a better alternative in blk-mq. Doing workqueues is
> just so damn easy, I'd be reluctant to add a kthread pool instead. It'd be
> much better to augment or fix workqueues to work well for this case as well.



Thanks,

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v1 5/9] block: loop: convert to blk-mq
       [not found]                         ` <53F5605C.2010304@kernel.dk>
@ 2014-08-21  3:13                           ` Ming Lei
  2014-08-21  3:15                             ` Ming Lei
  2014-08-21  3:34                           ` Ming Lei
  1 sibling, 1 reply; 22+ messages in thread
From: Ming Lei @ 2014-08-21  3:13 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Christoph Hellwig, Linux Kernel Mailing List, Andrew Morton,
	Dave Kleikamp, Zach Brown, Benjamin LaHaise, Kent Overstreet,
	open list:AIO <linux-aio@kvack.org>, Linux FS Devel <linux-fsdevel@vger.kernel.org>, Dave Chinner <david@fromorbit.com>, Tejun Heo

On Thu, Aug 21, 2014 at 10:58 AM, Jens Axboe <axboe@kernel.dk> wrote:
> On 2014-08-20 21:54, Ming Lei wrote:
>>>>
>>>>   From my investigation, context switch increases almost 50% with
>>>> workqueue compared with kthread in loop in a quad-core VM. With
>>>> kthread, requests may be handled as batch in cases which won't be
>>>> blocked in read()/write()(like null_blk, tmpfs, ...), but it is
>>>> impossible
>>>> with
>>>> workqueue any more.  Also block plug&unplug should have been used
>>>> with kthread to optimize the case, especially when kernel AIO is
>>>> applied,
>>>> still impossible with work queue too.
>>>
>>>
>>>
>>> OK, that one is actually a good point, since one need not do per-item
>>> queueing. We could handle different units, though. And we should have
>>> proper
>>> marking of the last item in a chain of stuff, so we might even be able to
>>> offload based on that instead of doing single items. It wont help the
>>> sync
>>> case, but for that, workqueue and kthread would be identical.
>>
>>
>> We may do that by introducing callback of queue_rq_list in blk_mq_ops,
>> and I will figure out one patch today to see if it can help the case.
>
>
> I don't think we should add to the interface, I prefer keeping it clean like
> it is right now. At least not if we can get around it. My point is that the
> driver already knows when the chain is complete, when REQ_LAST is set. So
> before that event triggers, it need not kick off IO, or at least i could do
> it in batches before that. That may not be fully reliable in case of
> queueing errors, but if REQ_LAST or 'error return' is used as the way to
> kick off pending IO, then that should be good enough. Haven't audited this
> in a while, but at least that is the intent of REQ_LAST.

Yes, I thought of that too, but the driver needs another context for handling
that, either workqueue or kthread, which may make the introduced per-device
workqueue useless.

thanks,

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v1 5/9] block: loop: convert to blk-mq
  2014-08-21  3:13                           ` Ming Lei
@ 2014-08-21  3:15                             ` Ming Lei
  0 siblings, 0 replies; 22+ messages in thread
From: Ming Lei @ 2014-08-21  3:15 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Christoph Hellwig, Linux Kernel Mailing List, Andrew Morton,
	Dave Kleikamp, Zach Brown, Benjamin LaHaise, Kent Overstreet,
	open list:AIO <linux-aio@kvack.org>, Linux FS Devel <linux-fsdevel@vger.kernel.org>, Dave Chinner <david@fromorbit.com>, Tejun Heo

On Thu, Aug 21, 2014 at 11:13 AM, Ming Lei <ming.lei@canonical.com> wrote:
> On Thu, Aug 21, 2014 at 10:58 AM, Jens Axboe <axboe@kernel.dk> wrote:
>> On 2014-08-20 21:54, Ming Lei wrote:
>>>>>
>>>>>   From my investigation, context switch increases almost 50% with
>>>>> workqueue compared with kthread in loop in a quad-core VM. With
>>>>> kthread, requests may be handled as batch in cases which won't be
>>>>> blocked in read()/write()(like null_blk, tmpfs, ...), but it is
>>>>> impossible
>>>>> with
>>>>> workqueue any more.  Also block plug&unplug should have been used
>>>>> with kthread to optimize the case, especially when kernel AIO is
>>>>> applied,
>>>>> still impossible with work queue too.
>>>>
>>>>
>>>>
>>>> OK, that one is actually a good point, since one need not do per-item
>>>> queueing. We could handle different units, though. And we should have
>>>> proper
>>>> marking of the last item in a chain of stuff, so we might even be able to
>>>> offload based on that instead of doing single items. It wont help the
>>>> sync
>>>> case, but for that, workqueue and kthread would be identical.
>>>
>>>
>>> We may do that by introducing callback of queue_rq_list in blk_mq_ops,
>>> and I will figure out one patch today to see if it can help the case.
>>
>>
>> I don't think we should add to the interface, I prefer keeping it clean like
>> it is right now. At least not if we can get around it. My point is that the
>> driver already knows when the chain is complete, when REQ_LAST is set. So
>> before that event triggers, it need not kick off IO, or at least i could do
>> it in batches before that. That may not be fully reliable in case of
>> queueing errors, but if REQ_LAST or 'error return' is used as the way to
>> kick off pending IO, then that should be good enough. Haven't audited this
>> in a while, but at least that is the intent of REQ_LAST.
>
> Yes, I thought of too, but driver need another context for handling that,
> either workqueue or kthread, which may cause the introduced per-device
> workqueue useless.

Hmmm, a list should be enough, will do that.

Thanks,

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v1 5/9] block: loop: convert to blk-mq
       [not found]                         ` <53F5605C.2010304@kernel.dk>
  2014-08-21  3:13                           ` Ming Lei
@ 2014-08-21  3:34                           ` Ming Lei
  1 sibling, 0 replies; 22+ messages in thread
From: Ming Lei @ 2014-08-21  3:34 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Christoph Hellwig, Linux Kernel Mailing List, Andrew Morton,
	Dave Kleikamp, Zach Brown, Benjamin LaHaise, Kent Overstreet,
	open list:AIO <linux-aio@kvack.org>, Linux FS Devel <linux-fsdevel@vger.kernel.org>, Dave Chinner <david@fromorbit.com>, Tejun Heo

On Thu, Aug 21, 2014 at 10:58 AM, Jens Axboe <axboe@kernel.dk> wrote:
> On 2014-08-20 21:54, Ming Lei wrote:
>>>>
>>>>   From my investigation, context switch increases almost 50% with
>>>> workqueue compared with kthread in loop in a quad-core VM. With
>>>> kthread, requests may be handled as batch in cases which won't be
>>>> blocked in read()/write()(like null_blk, tmpfs, ...), but it is
>>>> impossible
>>>> with
>>>> workqueue any more.  Also block plug&unplug should have been used
>>>> with kthread to optimize the case, especially when kernel AIO is
>>>> applied,
>>>> still impossible with work queue too.
>>>
>>>
>>>
>>> OK, that one is actually a good point, since one need not do per-item
>>> queueing. We could handle different units, though. And we should have
>>> proper
>>> marking of the last item in a chain of stuff, so we might even be able to
>>> offload based on that instead of doing single items. It wont help the
>>> sync
>>> case, but for that, workqueue and kthread would be identical.
>>
>>
>> We may do that by introducing callback of queue_rq_list in blk_mq_ops,
>> and I will figure out one patch today to see if it can help the case.
>
>
> I don't think we should add to the interface, I prefer keeping it clean like
> it is right now. At least not if we can get around it. My point is that the
> driver already knows when the chain is complete, when REQ_LAST is set. So
> before that event triggers, it need not kick off IO, or at least i could do
> it in batches before that. That may not be fully reliable in case of
> queueing errors, but if REQ_LAST or 'error return' is used as the way to
> kick off pending IO, then that should be good enough. Haven't audited this
> in a while, but at least that is the intent of REQ_LAST.

Another point is that running queue_work(rq) N times may cost more
than running queue_work(N rqs) once, since the context may still
switch back and forth when executing queue_work().

Anyway, I need to run tests first to see if it can bring back throughput
on sequential read by handling them as a batch.


Thanks,

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v1 5/9] block: loop: convert to blk-mq
       [not found]                 ` <53F3B89D.6070703@kernel.dk>
  2014-08-20  1:23                   ` Ming Lei
@ 2014-08-21  5:44                   ` Ming Lei
  2014-08-27 16:08                     ` Maxim Patlasov
  1 sibling, 1 reply; 22+ messages in thread
From: Ming Lei @ 2014-08-21  5:44 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Christoph Hellwig, Linux Kernel Mailing List, Andrew Morton,
	Dave Kleikamp, Zach Brown, Benjamin LaHaise, Kent Overstreet,
	open list:AIO <linux-aio@kvack.org>, Linux FS Devel <linux-fsdevel@vger.kernel.org>, Dave Chinner

On Wed, Aug 20, 2014 at 4:50 AM, Jens Axboe <axboe@kernel.dk> wrote:

>
>
> Reworked a bit more:
>
> http://git.kernel.dk/?p=linux-block.git;a=commit;h=a323185a761b9a54dc340d383695b4205ea258b6

One big problem with the commit is that it is basically a serialized workqueue
because of the single &hctx->run_work, and a per-req work_struct has to be
used for a concurrent implementation.  So it looks like the approach isn't
flexible enough compared with doing that in the driver. Or is there any idea
about how to fix that?

Thanks

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v1 5/9] block: loop: convert to blk-mq
  2014-08-21  5:44                   ` Ming Lei
@ 2014-08-27 16:08                     ` Maxim Patlasov
  2014-08-27 16:29                       ` Benjamin LaHaise
  2014-08-28  2:06                       ` Ming Lei
  0 siblings, 2 replies; 22+ messages in thread
From: Maxim Patlasov @ 2014-08-27 16:08 UTC (permalink / raw)
  To: Ming Lei, Jens Axboe
  Cc: Christoph Hellwig, Linux Kernel Mailing List, Andrew Morton,
	Dave Kleikamp, Zach Brown, Benjamin LaHaise, Kent Overstreet, AIO,
	Linux FS Devel, Dave Chinner

On 08/21/2014 09:44 AM, Ming Lei wrote:
> On Wed, Aug 20, 2014 at 4:50 AM, Jens Axboe <axboe@kernel.dk> wrote:
>
>>
>> Reworked a bit more:
>>
>> http://git.kernel.dk/?p=linux-block.git;a=commit;h=a323185a761b9a54dc340d383695b4205ea258b6
> One big problem of the commit is that it is basically a serialized workqueue
> because of single &hctx->run_work, and per-req work_struct has to be
> used for concurrent implementation.  So looks the approach isn't flexible
> enough compared with doing that in driver, or any idea about how to fix
> that?
>

I'm interested in what the price of handling requests in a separate
thread is at large. I used the following fio script:

     [global]
     direct=1
     bsrange=512-512
     timeout=10
     numjobs=1
     ioengine=sync

     filename=/dev/loop0 # or /dev/nullb0

     [f1]
     rw=randwrite

to compare the performance of:

1) /dev/loop0 of 3.17.0-rc1 with Ming's patches applied -- 11K iops
2) the same as above, but call loop_queue_work() directly from 
loop_queue_rq() -- 270K iops
3) /dev/nullb0 of 3.17.0-rc1 -- 380K iops

Taking into account such a big difference (11K vs. 270K), would it be
worthwhile to implement a pure non-blocking version of aio_kernel_submit()
that returns an error if blocking is needed? Then the loop driver (or any other
in-kernel user) might first try that non-blocking submit as a fast path, and,
only if it fails, fall back to queueing.

Thanks,
Maxim

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v1 5/9] block: loop: convert to blk-mq
  2014-08-27 16:08                     ` Maxim Patlasov
@ 2014-08-27 16:29                       ` Benjamin LaHaise
  2014-08-27 17:19                         ` Maxim Patlasov
  2014-08-28  2:06                       ` Ming Lei
  1 sibling, 1 reply; 22+ messages in thread
From: Benjamin LaHaise @ 2014-08-27 16:29 UTC (permalink / raw)
  To: Maxim Patlasov
  Cc: Ming Lei, Jens Axboe, Christoph Hellwig,
	Linux Kernel Mailing List, Andrew Morton, Dave Kleikamp,
	Zach Brown, Kent Overstreet, AIO, Linux FS Devel, Dave Chinner

On Wed, Aug 27, 2014 at 08:08:59PM +0400, Maxim Patlasov wrote:
...
> 1) /dev/loop0 of 3.17.0-rc1 with Ming's patches applied -- 11K iops
> 2) the same as above, but call loop_queue_work() directly from 
> loop_queue_rq() -- 270K iops
> 3) /dev/nullb0 of 3.17.0-rc1 -- 380K iops
> 
> Taking into account so big difference (11K vs. 270K), would it be worthy 
> to implement pure non-blocking version of aio_kernel_submit() returning 
> error if blocking needed? Then loop driver (or any other in-kernel user) 
> might firstly try that non-blocking submit as fast-path, and, only if 
> it's failed, fall back to queueing.

What filesystem is the backing file for loop0 on?  O_DIRECT access as 
Ming's patches use should be non-blocking, and if not, that's something 
to fix.

		-ben
-- 
"Thought is the essence of where you are now."

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v1 5/9] block: loop: convert to blk-mq
  2014-08-27 16:29                       ` Benjamin LaHaise
@ 2014-08-27 17:19                         ` Maxim Patlasov
  2014-08-27 17:56                           ` Zach Brown
  0 siblings, 1 reply; 22+ messages in thread
From: Maxim Patlasov @ 2014-08-27 17:19 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Ming Lei, Jens Axboe, Christoph Hellwig,
	Linux Kernel Mailing List, Andrew Morton, Dave Kleikamp,
	Zach Brown, Kent Overstreet, AIO, Linux FS Devel, Dave Chinner

On 08/27/2014 08:29 PM, Benjamin LaHaise wrote:
> On Wed, Aug 27, 2014 at 08:08:59PM +0400, Maxim Patlasov wrote:
> ...
>> 1) /dev/loop0 of 3.17.0-rc1 with Ming's patches applied -- 11K iops
>> 2) the same as above, but call loop_queue_work() directly from
>> loop_queue_rq() -- 270K iops
>> 3) /dev/nullb0 of 3.17.0-rc1 -- 380K iops
>>
>> Taking into account so big difference (11K vs. 270K), would it be worthy
>> to implement pure non-blocking version of aio_kernel_submit() returning
>> error if blocking needed? Then loop driver (or any other in-kernel user)
>> might firstly try that non-blocking submit as fast-path, and, only if
>> it's failed, fall back to queueing.
> What filesystem is the backing file for loop0 on?  O_DIRECT access as
> Ming's patches use should be non-blocking, and if not, that's something
> to fix.

I used loop0 directly on top of null_blk driver (because my goal was to 
measure the overhead of processing requests in a separate thread).

In case of real-life filesystems, e.g. ext4, aio_kernel_submit() may 
easily block on something like bh_submit_read(), when fs reads file 
metadata to calculate the offset on block device by position in the file.

Thanks,
Maxim

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v1 5/9] block: loop: convert to blk-mq
  2014-08-27 17:19                         ` Maxim Patlasov
@ 2014-08-27 17:56                           ` Zach Brown
  2014-08-28  2:10                             ` Ming Lei
  0 siblings, 1 reply; 22+ messages in thread
From: Zach Brown @ 2014-08-27 17:56 UTC (permalink / raw)
  To: Maxim Patlasov
  Cc: Benjamin LaHaise, Ming Lei, Jens Axboe, Christoph Hellwig,
	Linux Kernel Mailing List, Andrew Morton, Dave Kleikamp,
	Kent Overstreet, AIO, Linux FS Devel, Dave Chinner

On Wed, Aug 27, 2014 at 09:19:36PM +0400, Maxim Patlasov wrote:
> On 08/27/2014 08:29 PM, Benjamin LaHaise wrote:
> >On Wed, Aug 27, 2014 at 08:08:59PM +0400, Maxim Patlasov wrote:
> >...
> >>1) /dev/loop0 of 3.17.0-rc1 with Ming's patches applied -- 11K iops
> >>2) the same as above, but call loop_queue_work() directly from
> >>loop_queue_rq() -- 270K iops
> >>3) /dev/nullb0 of 3.17.0-rc1 -- 380K iops
> >>
> >>Taking into account so big difference (11K vs. 270K), would it be worthy
> >>to implement pure non-blocking version of aio_kernel_submit() returning
> >>error if blocking needed? Then loop driver (or any other in-kernel user)
> >>might firstly try that non-blocking submit as fast-path, and, only if
> >>it's failed, fall back to queueing.
> >What filesystem is the backing file for loop0 on?  O_DIRECT access as
> >Ming's patches use should be non-blocking, and if not, that's something
> >to fix.
> 
> I used loop0 directly on top of null_blk driver (because my goal was to
> measure the overhead of processing requests in a separate thread).

The relative overhead while doing nothing else.  While zooming way down
into micro benchmarks is fun and all, testing on an fs on brd might be
more representative and so more compelling.

(And you might start to stumble into the terrifying territory of
stacking fs write paths under fs write paths.. turn on lockdep! :))

> In case of real-life filesystems, e.g. ext4, aio_kernel_submit() may easily
> block on something like bh_submit_read(), when fs reads file metadata to
> calculate the offset on block device by position in the file.

Yeah, there are a lot of rare potential blocking points throughout the
fs aio submission paths.   In practice (aio+dio+block fs), I think
submission tends to block waiting for congested block queues most often.

- z

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v1 5/9] block: loop: convert to blk-mq
  2014-08-27 16:08                     ` Maxim Patlasov
  2014-08-27 16:29                       ` Benjamin LaHaise
@ 2014-08-28  2:06                       ` Ming Lei
  2014-08-29 11:14                         ` Maxim Patlasov
  1 sibling, 1 reply; 22+ messages in thread
From: Ming Lei @ 2014-08-28  2:06 UTC (permalink / raw)
  To: Maxim Patlasov
  Cc: Jens Axboe, Christoph Hellwig, Linux Kernel Mailing List,
	Andrew Morton, Dave Kleikamp, Zach Brown, Benjamin LaHaise,
	Kent Overstreet, AIO, Linux FS Devel, Dave Chinner

On 8/28/14, Maxim Patlasov <mpatlasov@parallels.com> wrote:
> On 08/21/2014 09:44 AM, Ming Lei wrote:
>> On Wed, Aug 20, 2014 at 4:50 AM, Jens Axboe <axboe@kernel.dk> wrote:
>>
>>>
>>> Reworked a bit more:
>>>
>>> http://git.kernel.dk/?p=linux-block.git;a=commit;h=a323185a761b9a54dc340d383695b4205ea258b6
>> One big problem of the commit is that it is basically a serialized
>> workqueue
>> because of single &hctx->run_work, and per-req work_struct has to be
>> used for concurrent implementation.  So looks the approach isn't flexible
>> enough compared with doing that in driver, or any idea about how to fix
>> that?
>>
>
> I'm interested what's the price of handling requests in a separate
> thread at large. I used the following fio script:
>
>      [global]
>      direct=1
>      bsrange=512-512
>      timeout=10
>      numjobs=1
>      ioengine=sync
>
>      filename=/dev/loop0 # or /dev/nullb0
>
>      [f1]
>      rw=randwrite
>
> to compare the performance of:
>
> 1) /dev/loop0 of 3.17.0-rc1 with Ming's patches applied -- 11K iops

If you enable BLK_MQ_F_WQ_CONTEXT, it isn't strange to see this
result since blk-mq implements a serialized workqueue.

> 2) the same as above, but call loop_queue_work() directly from
> loop_queue_rq() -- 270K iops
> 3) /dev/nullb0 of 3.17.0-rc1 -- 380K iops

In my recent investigation and discussion with Jens, using workqueue
may introduce some regression for cases like loop over null_blk, tmpfs.

And 270K vs. 380K is quite similar to my result, and it was observed that
context switches increase by more than 50% when introducing workqueue.

I will post V3, which will use the previous kthread with blk-mq & kernel aio;
that should make full use of blk-mq and kernel aio, and won't introduce
regressions for cases like the above.

> Taking into account so big difference (11K vs. 270K), would it be worthy
> to implement pure non-blocking version of aio_kernel_submit() returning
> error if blocking needed? Then loop driver (or any other in-kernel user)

The kernel aio submit is very similar to user space's implementation,
except for the block plug & unplug usage in the user space aio submit path.

If it is blocked in aio_kernel_submit(), you should observe a similar thing
with io_submit() too.

Thanks,
-- 
Ming Lei

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v1 5/9] block: loop: convert to blk-mq
  2014-08-27 17:56                           ` Zach Brown
@ 2014-08-28  2:10                             ` Ming Lei
  0 siblings, 0 replies; 22+ messages in thread
From: Ming Lei @ 2014-08-28  2:10 UTC (permalink / raw)
  To: Zach Brown
  Cc: Maxim Patlasov, Benjamin LaHaise, Jens Axboe, Christoph Hellwig,
	Linux Kernel Mailing List, Andrew Morton, Dave Kleikamp,
	Kent Overstreet, AIO, Linux FS Devel, Dave Chinner

On 8/28/14, Zach Brown <zab@zabbo.net> wrote:
> On Wed, Aug 27, 2014 at 09:19:36PM +0400, Maxim Patlasov wrote:
>> On 08/27/2014 08:29 PM, Benjamin LaHaise wrote:
>> >On Wed, Aug 27, 2014 at 08:08:59PM +0400, Maxim Patlasov wrote:
>> >...
>> >>1) /dev/loop0 of 3.17.0-rc1 with Ming's patches applied -- 11K iops
>> >>2) the same as above, but call loop_queue_work() directly from
>> >>loop_queue_rq() -- 270K iops
>> >>3) /dev/nullb0 of 3.17.0-rc1 -- 380K iops
>> >>
>> >>Taking into account so big difference (11K vs. 270K), would it be
>> >> worthy
>> >>to implement pure non-blocking version of aio_kernel_submit() returning
>> >>error if blocking needed? Then loop driver (or any other in-kernel
>> >> user)
>> >>might firstly try that non-blocking submit as fast-path, and, only if
>> >>it's failed, fall back to queueing.
>> >What filesystem is the backing file for loop0 on?  O_DIRECT access as
>> >Ming's patches use should be non-blocking, and if not, that's something
>> >to fix.
>>
>> I used loop0 directly on top of null_blk driver (because my goal was to
>> measure the overhead of processing requests in a separate thread).
>
> The relative overhead while doing nothing else.  While zooming way down
> in to micro benchmarks is fun and all, testing on an fs on brd might be
> more representitive and so more compelling.
>
> (And you might start to stumble into the terrifying territory of
> stacking fs write paths under fs write paths.. turn on lockdep! :))
>
>> In case of real-life filesystems, e.g. ext4, aio_kernel_submit() may
>> easily
>> block on something like bh_submit_read(), when fs reads file metadata to
>> calculate the offset on block device by position in the file.
>
> Yeah, there are a lot of rare potential blocking points throughout the
> fs aio submission paths.   In practice (aio+dio+block fs), I think
> submission tends to block waiting for congested block queues most often.

In the case of null_blk, it shouldn't have blocked here since the default tag
size is enough for the single-job test, if Maxim didn't change the default
parameters of null_blk.


Thanks,
-- 
Ming Lei

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v1 5/9] block: loop: convert to blk-mq
@ 2014-08-29 10:41 Maxim Patlasov
  0 siblings, 0 replies; 22+ messages in thread
From: Maxim Patlasov @ 2014-08-29 10:41 UTC (permalink / raw)
  To: Zach Brown
  Cc: Ming Lei, Benjamin LaHaise, axboe@kernel.dk, Christoph Hellwig,
	linux-kernel@vger.kernel.org, Andrew Morton, Dave Kleikamp,
	Kent Overstreet, AIO, linux-fsdevel@vger.kernel.org, Dave Chinner

On 8/28/14, Zach Brown<zab@zabbo.net>  wrote:

> On Wed, Aug 27, 2014 at 09:19:36PM +0400, Maxim Patlasov wrote:
>> On 08/27/2014 08:29 PM, Benjamin LaHaise wrote:
>>> On Wed, Aug 27, 2014 at 08:08:59PM +0400, Maxim Patlasov wrote:
>>> ...
>>>> 1) /dev/loop0 of 3.17.0-rc1 with Ming's patches applied -- 11K iops
>>>> 2) the same as above, but call loop_queue_work() directly from
>>>> loop_queue_rq() -- 270K iops
>>>> 3) /dev/nullb0 of 3.17.0-rc1 -- 380K iops
>>>>
>>>> Taking into account so big difference (11K vs. 270K), would it be
>>>> worthy
>>>> to implement pure non-blocking version of aio_kernel_submit() returning
>>>> error if blocking needed? Then loop driver (or any other in-kernel
>>>> user)
>>>> might firstly try that non-blocking submit as fast-path, and, only if
>>>> it's failed, fall back to queueing.
>>> What filesystem is the backing file for loop0 on?  O_DIRECT access as
>>> Ming's patches use should be non-blocking, and if not, that's something
>>> to fix.
>> I used loop0 directly on top of null_blk driver (because my goal was to
>> measure the overhead of processing requests in a separate thread).
> The relative overhead while doing nothing else.  While zooming way down
> in to micro benchmarks is fun and all, testing on an fs on brd might be
> more representitive and so more compelling.

The measurements on an fs on brd are even more outrageous (the same fio 
script I posted a few messages above):

1) Baseline. no loopback device involved.

fio on /dev/ram0:                           467K iops
fio on ext4 over /dev/ram0:                 378K iops

2) Loopback device from 3.17.0-rc1 with Ming's patches (v1) applied:

fio on /dev/loop0 over /dev/ram0:            10K iops
fio on ext4 over /dev/loop0 over /dev/ram0:   9K iops

3) the same as above, but avoid extra context switch (call 
loop_queue_work() directly from loop_queue_rq()):

fio on /dev/loop0 over /dev/ram0:           267K iops
fio on ext4 over /dev/loop0 over /dev/ram0: 223K iops

The problem is not about huge relative overhead while doing nothing 
else. It's rather about introducing extra latency (~100 microseconds on 
commodity h/w I used) which might be noticeable on modern SSDs (and h/w 
RAIDs with caching).

Thanks,
Maxim

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v1 5/9] block: loop: convert to blk-mq
  2014-08-28  2:06                       ` Ming Lei
@ 2014-08-29 11:14                         ` Maxim Patlasov
  0 siblings, 0 replies; 22+ messages in thread
From: Maxim Patlasov @ 2014-08-29 11:14 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, Christoph Hellwig, Linux Kernel Mailing List,
	Andrew Morton, Dave Kleikamp, Zach Brown, Benjamin LaHaise,
	Kent Overstreet, AIO, Linux FS Devel, Dave Chinner

On 08/28/2014 06:06 AM, Ming Lei wrote:
> On 8/28/14, Maxim Patlasov <mpatlasov@parallels.com> wrote:
>> On 08/21/2014 09:44 AM, Ming Lei wrote:
>>> On Wed, Aug 20, 2014 at 4:50 AM, Jens Axboe <axboe@kernel.dk> wrote:
>>>
>>>> Reworked a bit more:
>>>>
>>>> http://git.kernel.dk/?p=linux-block.git;a=commit;h=a323185a761b9a54dc340d383695b4205ea258b6
>>> One big problem of the commit is that it is basically a serialized
>>> workqueue
>>> because of single &hctx->run_work, and per-req work_struct has to be
>>> used for concurrent implementation.  So looks the approach isn't flexible
>>> enough compared with doing that in driver, or any idea about how to fix
>>> that?
>>>
>> I'm interested what's the price of handling requests in a separate
>> thread at large. I used the following fio script:
>>
>>       [global]
>>       direct=1
>>       bsrange=512-512
>>       timeout=10
>>       numjobs=1
>>       ioengine=sync
>>
>>       filename=/dev/loop0 # or /dev/nullb0
>>
>>       [f1]
>>       rw=randwrite
>>
>> to compare the performance of:
>>
>> 1) /dev/loop0 of 3.17.0-rc1 with Ming's patches applied -- 11K iops
> If you enable BLK_MQ_F_WQ_CONTEXT, it isn't strange to see this
> result since blk-mq implements a serialized workqueue.

BLK_MQ_F_WQ_CONTEXT is not in 3.17.0-rc1, so I couldn't enable it.

>
>> 2) the same as above, but call loop_queue_work() directly from
>> loop_queue_rq() -- 270K iops
>> 3) /dev/nullb0 of 3.17.0-rc1 -- 380K iops
> In my recent investigation and discussion with Jens, using workqueue
> may introduce some regression for cases like loop over null_blk, tmpfs.
>
> And 270K vs. 380K is a bit similar with my result, and it was observed that
> context switch is increased by more than 50% with introducing workqueue.

The figures are similar, but the comparison is not. Both 270K and 380K
refer to configurations where no extra context switch is involved.

>
> I will post V3 which will use previous kthread, with blk-mq & kernel aio, which
> should make full use of blk-mq and kernel aio, and won't introduce regression
> for cases like above.

That would be great!

>
>> Taking into account so big difference (11K vs. 270K), would it be worthy
>> to implement pure non-blocking version of aio_kernel_submit() returning
>> error if blocking needed? Then loop driver (or any other in-kernel user)
> The kernel aio submit is very similar with user space's implementation,
> except for block plug&unplug usage in user space aio submit path.
>
> If it is blocked in aio_kernel_submit(), you should observe similar thing
> with io_submit() too.

Yes, I agree. My point was that there is room for optimization, as my
experiments demonstrate. The question is whether it's worth
complicating kernel aio (and fs-specific code too) for the sake of that
optimization.

In fact, in a simple case like a block fs on top of a loopback device on
top of a file on another block fs, what kernel aio does for the loopback
driver is a subtle way of converting incoming bio-s to outgoing bio-s. In
case you know where the image file is placed (e.g. by fiemap), such a
conversion may be done with zero overhead, and anything that makes the
overhead noticeable is suspicious. And it is easy to imagine other
use-cases where that extra context switch is avoidable.

Thanks,
Maxim

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2014-08-29 11:14 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-08-29 10:41 [PATCH v1 5/9] block: loop: convert to blk-mq Maxim Patlasov
  -- strict thread matches above, loose matches on Subject: below --
2014-08-14 15:50 [PATCH v1 0/9] block & aio: kernel aio and loop mq conversion Ming Lei
2014-08-14 15:50 ` [PATCH v1 5/9] block: loop: convert to blk-mq Ming Lei
2014-08-15 16:31   ` Christoph Hellwig
2014-08-15 16:36     ` Jens Axboe
2014-08-15 16:46       ` Jens Axboe
2014-08-16  8:06         ` Ming Lei
2014-08-17 17:48           ` Jens Axboe
2014-08-18  1:22             ` Ming Lei
2014-08-18 11:53               ` Ming Lei
     [not found]                 ` <53F3B89D.6070703@kernel.dk>
2014-08-20  1:23                   ` Ming Lei
     [not found]                     ` <53F4C835.7030407@kernel.dk>
2014-08-21  2:54                       ` Ming Lei
     [not found]                         ` <53F5605C.2010304@kernel.dk>
2014-08-21  3:13                           ` Ming Lei
2014-08-21  3:15                             ` Ming Lei
2014-08-21  3:34                           ` Ming Lei
2014-08-21  5:44                   ` Ming Lei
2014-08-27 16:08                     ` Maxim Patlasov
2014-08-27 16:29                       ` Benjamin LaHaise
2014-08-27 17:19                         ` Maxim Patlasov
2014-08-27 17:56                           ` Zach Brown
2014-08-28  2:10                             ` Ming Lei
2014-08-28  2:06                       ` Ming Lei
2014-08-29 11:14                         ` Maxim Patlasov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox