linux-block.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RESEND PATCH 0/5] loop: improve loop aio perf by IOCB_NOWAIT
@ 2025-03-08 16:23 Ming Lei
  2025-03-08 16:23 ` [RESEND PATCH 1/5] loop: remove 'rw' parameter from lo_rw_aio() Ming Lei
                   ` (4 more replies)
  0 siblings, 5 replies; 18+ messages in thread
From: Ming Lei @ 2025-03-08 16:23 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: Ming Lei

Hello Jens,

This patchset improves loop aio perf by using IOCB_NOWAIT for avoiding to queue aio
command to workqueue context, meantime refactor lo_rw_aio() a bit.

The last patch adds MQ support, which improves perf a bit in case of multiple
IO jobs.

In my test VM, loop disk perf becomes very close to perf of the backing block
device(nvme/mq virtio-scsi).

Thanks,
Ming


Ming Lei (5):
  loop: remove 'rw' parameter from lo_rw_aio()
  loop: cleanup lo_rw_aio()
  loop: add helper loop_queue_work_prep
  loop: try to handle loop aio command via NOWAIT IO first
  loop: add module parameter of 'nr_hw_queues'

 drivers/block/loop.c | 225 ++++++++++++++++++++++++++++++-------------
 1 file changed, 156 insertions(+), 69 deletions(-)

-- 
2.47.0


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [RESEND PATCH 1/5] loop: remove 'rw' parameter from lo_rw_aio()
  2025-03-08 16:23 [RESEND PATCH 0/5] loop: improve loop aio perf by IOCB_NOWAIT Ming Lei
@ 2025-03-08 16:23 ` Ming Lei
  2025-03-09  7:30   ` kernel test robot
  2025-03-10 10:46   ` Christoph Hellwig
  2025-03-08 16:23 ` [RESEND PATCH 2/5] loop: cleanup lo_rw_aio() Ming Lei
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 18+ messages in thread
From: Ming Lei @ 2025-03-08 16:23 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: Ming Lei

lo_rw_aio() is only called for READ/WRITE operation, which can be
figured out from request directly, so remove 'rw' parameter from
lo_rw_aio(), meantime rename the local variable as 'dir' which matches
the actual use more.

Meantime merge lo_read_simple() and lo_write_simple() into
lo_rw_simple().

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/block/loop.c | 48 ++++++++++++++++++--------------------------
 1 file changed, 19 insertions(+), 29 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 657bf53decf3..6bbbaa4aaf2c 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -239,31 +239,25 @@ static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos)
 	return bw;
 }
 
-static int lo_write_simple(struct loop_device *lo, struct request *rq,
-		loff_t pos)
-{
-	struct bio_vec bvec;
-	struct req_iterator iter;
-	int ret = 0;
-
-	rq_for_each_segment(bvec, rq, iter) {
-		ret = lo_write_bvec(lo->lo_backing_file, &bvec, &pos);
-		if (ret < 0)
-			break;
-		cond_resched();
-	}
-
-	return ret;
-}
-
-static int lo_read_simple(struct loop_device *lo, struct request *rq,
-		loff_t pos)
+static int lo_rw_simple(struct loop_device *lo, struct request *rq, loff_t pos)
 {
 	struct bio_vec bvec;
 	struct req_iterator iter;
 	struct iov_iter i;
 	ssize_t len;
 
+	if (req_op(rq) == REQ_OP_WRITE) {
+		int ret;
+
+		rq_for_each_segment(bvec, rq, iter) {
+			ret = lo_write_bvec(lo->lo_backing_file, &bvec, &pos);
+			if (ret < 0)
+				break;
+			cond_resched();
+		}
+		return ret;
+	}
+
 	rq_for_each_segment(bvec, rq, iter) {
 		iov_iter_bvec(&i, ITER_DEST, &bvec, 1, bvec.bv_len);
 		len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0);
@@ -400,13 +394,13 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret)
 	lo_rw_aio_do_completion(cmd);
 }
 
-static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
-		     loff_t pos, int rw)
+static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos)
 {
 	struct iov_iter iter;
 	struct req_iterator rq_iter;
 	struct bio_vec *bvec;
 	struct request *rq = blk_mq_rq_from_pdu(cmd);
+	int dir = (req_op(rq) == REQ_OP_READ) ? ITER_DEST : ITER_SOURCE;
 	struct bio *bio = rq->bio;
 	struct file *file = lo->lo_backing_file;
 	struct bio_vec tmp;
@@ -448,7 +442,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 	}
 	atomic_set(&cmd->ref, 2);
 
-	iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq));
+	iov_iter_bvec(&iter, dir, bvec, nr_bvec, blk_rq_bytes(rq));
 	iter.iov_offset = offset;
 
 	cmd->iocb.ki_pos = pos;
@@ -457,7 +451,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 	cmd->iocb.ki_flags = IOCB_DIRECT;
 	cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
 
-	if (rw == ITER_SOURCE)
+	if (dir == ITER_SOURCE)
 		ret = file->f_op->write_iter(&cmd->iocb, &iter);
 	else
 		ret = file->f_op->read_iter(&cmd->iocb, &iter);
@@ -498,15 +492,11 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
 	case REQ_OP_DISCARD:
 		return lo_fallocate(lo, rq, pos, FALLOC_FL_PUNCH_HOLE);
 	case REQ_OP_WRITE:
-		if (cmd->use_aio)
-			return lo_rw_aio(lo, cmd, pos, ITER_SOURCE);
-		else
-			return lo_write_simple(lo, rq, pos);
 	case REQ_OP_READ:
 		if (cmd->use_aio)
-			return lo_rw_aio(lo, cmd, pos, ITER_DEST);
+			return lo_rw_aio(lo, cmd, pos);
 		else
-			return lo_read_simple(lo, rq, pos);
+			return lo_rw_simple(lo, rq, pos);
 	default:
 		WARN_ON_ONCE(1);
 		return -EIO;
-- 
2.47.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RESEND PATCH 2/5] loop: cleanup lo_rw_aio()
  2025-03-08 16:23 [RESEND PATCH 0/5] loop: improve loop aio perf by IOCB_NOWAIT Ming Lei
  2025-03-08 16:23 ` [RESEND PATCH 1/5] loop: remove 'rw' parameter from lo_rw_aio() Ming Lei
@ 2025-03-08 16:23 ` Ming Lei
  2025-03-10 11:07   ` Christoph Hellwig
  2025-03-08 16:23 ` [RESEND PATCH 3/5] loop: add helper loop_queue_work_prep Ming Lei
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 18+ messages in thread
From: Ming Lei @ 2025-03-08 16:23 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: Ming Lei

Cleanup lo_rw_aio() a bit by refactoring it into three parts:

- lo_cmd_nr_bvec(), for calculating how many bvecs in this request

- lo_rw_aio_prep(), for preparing loop command, which need to be called
once

- lo_submit_rw_aio(), for submitting this lo command, which can be
called multiple times

Prepare for trying to handle loop command by NOWAIT read/write IO
first.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/block/loop.c | 83 +++++++++++++++++++++++++++++---------------
 1 file changed, 55 insertions(+), 28 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 6bbbaa4aaf2c..eae38cd38b7b 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -394,24 +394,63 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret)
 	lo_rw_aio_do_completion(cmd);
 }
 
-static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos)
+static int lo_submit_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
+			    loff_t pos, int nr_bvec)
 {
-	struct iov_iter iter;
-	struct req_iterator rq_iter;
-	struct bio_vec *bvec;
 	struct request *rq = blk_mq_rq_from_pdu(cmd);
 	int dir = (req_op(rq) == REQ_OP_READ) ? ITER_DEST : ITER_SOURCE;
-	struct bio *bio = rq->bio;
 	struct file *file = lo->lo_backing_file;
-	struct bio_vec tmp;
+	struct iov_iter iter;
+	struct bio_vec *bvec;
 	unsigned int offset;
-	int nr_bvec = 0;
 	int ret;
 
+	if (rq->bio != rq->biotail) {
+		bvec = cmd->bvec;
+		offset = 0;
+	} else {
+		struct bio *bio = rq->bio;
+
+		offset = bio->bi_iter.bi_bvec_done;
+		bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+	}
+	iov_iter_bvec(&iter, dir, bvec, nr_bvec, blk_rq_bytes(rq));
+	iter.iov_offset = offset;
+	cmd->iocb.ki_pos = pos;
+
+	atomic_set(&cmd->ref, 2);
+	if (dir == ITER_SOURCE)
+		ret = file->f_op->write_iter(&cmd->iocb, &iter);
+	else
+		ret = file->f_op->read_iter(&cmd->iocb, &iter);
+	lo_rw_aio_do_completion(cmd);
+
+	return ret;
+}
+
+static inline unsigned lo_cmd_nr_bvec(struct loop_cmd *cmd)
+{
+	struct req_iterator rq_iter;
+	struct request *rq = blk_mq_rq_from_pdu(cmd);
+	struct bio_vec tmp;
+	int nr_bvec = 0;
+
 	rq_for_each_bvec(tmp, rq, rq_iter)
 		nr_bvec++;
 
+	return nr_bvec;
+}
+
+static int lo_rw_aio_prep(struct loop_device *lo, struct loop_cmd *cmd,
+			  unsigned nr_bvec)
+{
+	struct request *rq = blk_mq_rq_from_pdu(cmd);
+	struct file *file = lo->lo_backing_file;
+
 	if (rq->bio != rq->biotail) {
+		struct req_iterator rq_iter;
+		struct bio_vec *bvec;
+		struct bio_vec tmp;
 
 		bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
 				     GFP_NOIO);
@@ -429,35 +468,23 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos)
 			*bvec = tmp;
 			bvec++;
 		}
-		bvec = cmd->bvec;
-		offset = 0;
-	} else {
-		/*
-		 * Same here, this bio may be started from the middle of the
-		 * 'bvec' because of bio splitting, so offset from the bvec
-		 * must be passed to iov iterator
-		 */
-		offset = bio->bi_iter.bi_bvec_done;
-		bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
 	}
-	atomic_set(&cmd->ref, 2);
-
-	iov_iter_bvec(&iter, dir, bvec, nr_bvec, blk_rq_bytes(rq));
-	iter.iov_offset = offset;
-
-	cmd->iocb.ki_pos = pos;
 	cmd->iocb.ki_filp = file;
 	cmd->iocb.ki_complete = lo_rw_aio_complete;
 	cmd->iocb.ki_flags = IOCB_DIRECT;
 	cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
 
-	if (dir == ITER_SOURCE)
-		ret = file->f_op->write_iter(&cmd->iocb, &iter);
-	else
-		ret = file->f_op->read_iter(&cmd->iocb, &iter);
+	return 0;
+}
 
-	lo_rw_aio_do_completion(cmd);
+static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos)
+{
+	unsigned int nr_bvec = lo_cmd_nr_bvec(cmd);
+	int ret = lo_rw_aio_prep(lo, cmd, nr_bvec);
 
+	if (ret < 0)
+		return ret;
+	ret = lo_submit_rw_aio(lo, cmd, pos, nr_bvec);
 	if (ret != -EIOCBQUEUED)
 		lo_rw_aio_complete(&cmd->iocb, ret);
 	return 0;
-- 
2.47.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RESEND PATCH 3/5] loop: add helper loop_queue_work_prep
  2025-03-08 16:23 [RESEND PATCH 0/5] loop: improve loop aio perf by IOCB_NOWAIT Ming Lei
  2025-03-08 16:23 ` [RESEND PATCH 1/5] loop: remove 'rw' parameter from lo_rw_aio() Ming Lei
  2025-03-08 16:23 ` [RESEND PATCH 2/5] loop: cleanup lo_rw_aio() Ming Lei
@ 2025-03-08 16:23 ` Ming Lei
  2025-03-09  7:30   ` kernel test robot
  2025-03-10 11:11   ` Christoph Hellwig
  2025-03-08 16:23 ` [RESEND PATCH 4/5] loop: try to handle loop aio command via NOWAIT IO first Ming Lei
  2025-03-08 16:23 ` [RESEND PATCH 5/5] loop: add module parameter of 'nr_hw_queues' Ming Lei
  4 siblings, 2 replies; 18+ messages in thread
From: Ming Lei @ 2025-03-08 16:23 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: Ming Lei

Add helper loop_queue_work_prep() for making loop_queue_rq() more
readable.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/block/loop.c | 38 +++++++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index eae38cd38b7b..9f8d32d2dc4d 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -859,6 +859,27 @@ static inline int queue_on_root_worker(struct cgroup_subsys_state *css)
 }
 #endif
 
+static void loop_queue_work_prep(struct loop_cmd *cmd)
+{
+	struct request *rq = blk_mq_rq_from_pdu(cmd);
+
+	/* always use the first bio's css */
+	cmd->blkcg_css = NULL;
+	cmd->memcg_css = NULL;
+#ifdef CONFIG_BLK_CGROUP
+	if (rq->bio) {
+		cmd->blkcg_css = bio_blkcg_css(rq->bio);
+#ifdef CONFIG_MEMCG
+		if (cmd->blkcg_css) {
+			cmd->memcg_css =
+				cgroup_get_e_css(cmd->blkcg_css->cgroup,
+						&memory_cgrp_subsys);
+		}
+#endif
+	}
+#endif
+}
+
 static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
 {
 	struct rb_node **node, *parent = NULL;
@@ -866,6 +887,8 @@ static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
 	struct work_struct *work;
 	struct list_head *cmd_list;
 
+	loop_queue_work_prep(cmd);
+
 	spin_lock_irq(&lo->lo_work_lock);
 
 	if (queue_on_root_worker(cmd->blkcg_css))
@@ -1903,21 +1926,6 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 		break;
 	}
 
-	/* always use the first bio's css */
-	cmd->blkcg_css = NULL;
-	cmd->memcg_css = NULL;
-#ifdef CONFIG_BLK_CGROUP
-	if (rq->bio) {
-		cmd->blkcg_css = bio_blkcg_css(rq->bio);
-#ifdef CONFIG_MEMCG
-		if (cmd->blkcg_css) {
-			cmd->memcg_css =
-				cgroup_get_e_css(cmd->blkcg_css->cgroup,
-						&memory_cgrp_subsys);
-		}
-#endif
-	}
-#endif
 	loop_queue_work(lo, cmd);
 
 	return BLK_STS_OK;
-- 
2.47.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RESEND PATCH 4/5] loop: try to handle loop aio command via NOWAIT IO first
  2025-03-08 16:23 [RESEND PATCH 0/5] loop: improve loop aio perf by IOCB_NOWAIT Ming Lei
                   ` (2 preceding siblings ...)
  2025-03-08 16:23 ` [RESEND PATCH 3/5] loop: add helper loop_queue_work_prep Ming Lei
@ 2025-03-08 16:23 ` Ming Lei
  2025-03-10 11:14   ` Christoph Hellwig
  2025-03-08 16:23 ` [RESEND PATCH 5/5] loop: add module parameter of 'nr_hw_queues' Ming Lei
  4 siblings, 1 reply; 18+ messages in thread
From: Ming Lei @ 2025-03-08 16:23 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: Ming Lei

Try to handle loop aio command via NOWAIT IO first, then we can avoid to
queue the aio command into workqueue.

Fallback to workqueue in case of -EAGAIN.

BLK_MQ_F_BLOCKING has to be set for calling into .read_iter() or
.write_iter() which might sleep even though it is NOWAIT.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/block/loop.c | 47 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 44 insertions(+), 3 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 9f8d32d2dc4d..46be0c8e75a6 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -92,6 +92,8 @@ struct loop_cmd {
 #define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ)
 #define LOOP_DEFAULT_HW_Q_DEPTH 128
 
+static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd);
+
 static DEFINE_IDR(loop_index_idr);
 static DEFINE_MUTEX(loop_ctl_mutex);
 static DEFINE_MUTEX(loop_validate_mutex);
@@ -380,8 +382,17 @@ static void lo_rw_aio_do_completion(struct loop_cmd *cmd)
 
 	if (!atomic_dec_and_test(&cmd->ref))
 		return;
+
+	if (cmd->ret == -EAGAIN) {
+		struct loop_device *lo = rq->q->queuedata;
+
+		loop_queue_work(lo, cmd);
+		return;
+	}
+
 	kfree(cmd->bvec);
 	cmd->bvec = NULL;
+
 	if (likely(!blk_should_fake_timeout(rq->q)))
 		blk_mq_complete_request(rq);
 }
@@ -478,16 +489,34 @@ static int lo_rw_aio_prep(struct loop_device *lo, struct loop_cmd *cmd,
 }
 
 static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos)
+{
+	unsigned int nr_bvec = lo_cmd_nr_bvec(cmd);
+	int ret;
+
+	cmd->iocb.ki_flags &= ~IOCB_NOWAIT;
+	ret = lo_submit_rw_aio(lo, cmd, pos, nr_bvec);
+	if (ret != -EIOCBQUEUED)
+		lo_rw_aio_complete(&cmd->iocb, ret);
+	return 0;
+}
+
+static int lo_rw_aio_nowait(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos)
 {
 	unsigned int nr_bvec = lo_cmd_nr_bvec(cmd);
 	int ret = lo_rw_aio_prep(lo, cmd, nr_bvec);
 
 	if (ret < 0)
 		return ret;
+
+	cmd->iocb.ki_flags |= IOCB_NOWAIT;
 	ret = lo_submit_rw_aio(lo, cmd, pos, nr_bvec);
-	if (ret != -EIOCBQUEUED)
+	if (ret == -EIOCBQUEUED)
+		return 0;
+	if (ret != -EAGAIN) {
 		lo_rw_aio_complete(&cmd->iocb, ret);
-	return 0;
+		return 0;
+	}
+	return ret;
 }
 
 static int do_req_filebacked(struct loop_device *lo, struct request *rq)
@@ -1926,6 +1955,17 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 		break;
 	}
 
+	if (cmd->use_aio) {
+		loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
+		int ret = lo_rw_aio_nowait(lo, cmd, pos);
+
+		if (!ret)
+			return BLK_STS_OK;
+		if (ret != -EAGAIN)
+			return BLK_STS_IOERR;
+		/* fallback to workqueue for handling aio */
+	}
+
 	loop_queue_work(lo, cmd);
 
 	return BLK_STS_OK;
@@ -2076,7 +2116,8 @@ static int loop_add(int i)
 	lo->tag_set.queue_depth = hw_queue_depth;
 	lo->tag_set.numa_node = NUMA_NO_NODE;
 	lo->tag_set.cmd_size = sizeof(struct loop_cmd);
-	lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT;
+	lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT |
+		BLK_MQ_F_BLOCKING;
 	lo->tag_set.driver_data = lo;
 
 	err = blk_mq_alloc_tag_set(&lo->tag_set);
-- 
2.47.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RESEND PATCH 5/5] loop: add module parameter of 'nr_hw_queues'
  2025-03-08 16:23 [RESEND PATCH 0/5] loop: improve loop aio perf by IOCB_NOWAIT Ming Lei
                   ` (3 preceding siblings ...)
  2025-03-08 16:23 ` [RESEND PATCH 4/5] loop: try to handle loop aio command via NOWAIT IO first Ming Lei
@ 2025-03-08 16:23 ` Ming Lei
  2025-03-10 11:15   ` Christoph Hellwig
  4 siblings, 1 reply; 18+ messages in thread
From: Ming Lei @ 2025-03-08 16:23 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: Ming Lei

Add module parameter of 'nr_hw_queues' so that loop can support MQ,
which may reduce contention in case of too many io jobs.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/block/loop.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 46be0c8e75a6..6378dfee6681 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -91,6 +91,7 @@ struct loop_cmd {
 
 #define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ)
 #define LOOP_DEFAULT_HW_Q_DEPTH 128
+#define LOOP_DEFAULT_NR_HW_Q 1
 
 static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd);
 
@@ -1928,6 +1929,26 @@ static const struct kernel_param_ops loop_hw_qdepth_param_ops = {
 device_param_cb(hw_queue_depth, &loop_hw_qdepth_param_ops, &hw_queue_depth, 0444);
 MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: " __stringify(LOOP_DEFAULT_HW_Q_DEPTH));
 
+static int nr_hw_queues = LOOP_DEFAULT_NR_HW_Q;
+static int loop_set_nr_hw_queues(const char *s, const struct kernel_param *p)
+{
+	int nr, ret;
+
+	ret = kstrtoint(s, 0, &nr);
+	if (ret < 0)
+		return ret;
+	if (nr < 1)
+		return -EINVAL;
+	nr_hw_queues = nr;
+	return 0;
+}
+static const struct kernel_param_ops loop_nr_hw_q_param_ops = {
+	.set	= loop_set_nr_hw_queues,
+	.get	= param_get_int,
+};
+device_param_cb(nr_hw_queues, &loop_nr_hw_q_param_ops, &nr_hw_queues, 0444);
+MODULE_PARM_DESC(nr_hw_queues, "number of hardware queues. Default: " __stringify(LOOP_DEFAULT_NR_HW_Q));
+
 MODULE_DESCRIPTION("Loopback device support");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);
@@ -2112,7 +2133,7 @@ static int loop_add(int i)
 	i = err;
 
 	lo->tag_set.ops = &loop_mq_ops;
-	lo->tag_set.nr_hw_queues = 1;
+	lo->tag_set.nr_hw_queues = nr_hw_queues;
 	lo->tag_set.queue_depth = hw_queue_depth;
 	lo->tag_set.numa_node = NUMA_NO_NODE;
 	lo->tag_set.cmd_size = sizeof(struct loop_cmd);
-- 
2.47.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [RESEND PATCH 3/5] loop: add helper loop_queue_work_prep
  2025-03-08 16:23 ` [RESEND PATCH 3/5] loop: add helper loop_queue_work_prep Ming Lei
@ 2025-03-09  7:30   ` kernel test robot
  2025-03-10 11:11   ` Christoph Hellwig
  1 sibling, 0 replies; 18+ messages in thread
From: kernel test robot @ 2025-03-09  7:30 UTC (permalink / raw)
  To: Ming Lei, Jens Axboe, linux-block; +Cc: oe-kbuild-all, Ming Lei

Hi Ming,

kernel test robot noticed the following build warnings:

[auto build test WARNING on axboe-block/for-next]
[also build test WARNING on linus/master v6.14-rc5 next-20250307]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Ming-Lei/loop-remove-rw-parameter-from-lo_rw_aio/20250309-002548
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link:    https://lore.kernel.org/r/20250308162312.1640828-4-ming.lei%40redhat.com
patch subject: [RESEND PATCH 3/5] loop: add helper loop_queue_work_prep
config: arc-randconfig-001-20250309 (https://download.01.org/0day-ci/archive/20250309/202503091413.vbFFy32o-lkp@intel.com/config)
compiler: arceb-elf-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250309/202503091413.vbFFy32o-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202503091413.vbFFy32o-lkp@intel.com/

All warnings (new ones prefixed by >>):

   drivers/block/loop.c: In function 'loop_queue_work_prep':
>> drivers/block/loop.c:862:25: warning: unused variable 'rq' [-Wunused-variable]
     862 |         struct request *rq = blk_mq_rq_from_pdu(cmd);
         |                         ^~


vim +/rq +862 drivers/block/loop.c

   859	
   860	static void loop_queue_work_prep(struct loop_cmd *cmd)
   861	{
 > 862		struct request *rq = blk_mq_rq_from_pdu(cmd);
   863	
   864		/* always use the first bio's css */
   865		cmd->blkcg_css = NULL;
   866		cmd->memcg_css = NULL;
   867	#ifdef CONFIG_BLK_CGROUP
   868		if (rq->bio) {
   869			cmd->blkcg_css = bio_blkcg_css(rq->bio);
   870	#ifdef CONFIG_MEMCG
   871			if (cmd->blkcg_css) {
   872				cmd->memcg_css =
   873					cgroup_get_e_css(cmd->blkcg_css->cgroup,
   874							&memory_cgrp_subsys);
   875			}
   876	#endif
   877		}
   878	#endif
   879	}
   880	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RESEND PATCH 1/5] loop: remove 'rw' parameter from lo_rw_aio()
  2025-03-08 16:23 ` [RESEND PATCH 1/5] loop: remove 'rw' parameter from lo_rw_aio() Ming Lei
@ 2025-03-09  7:30   ` kernel test robot
  2025-03-10 10:46   ` Christoph Hellwig
  1 sibling, 0 replies; 18+ messages in thread
From: kernel test robot @ 2025-03-09  7:30 UTC (permalink / raw)
  To: Ming Lei, Jens Axboe, linux-block; +Cc: llvm, oe-kbuild-all, Ming Lei

Hi Ming,

kernel test robot noticed the following build warnings:

[auto build test WARNING on axboe-block/for-next]
[also build test WARNING on linus/master v6.14-rc5 next-20250307]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Ming-Lei/loop-remove-rw-parameter-from-lo_rw_aio/20250309-002548
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link:    https://lore.kernel.org/r/20250308162312.1640828-2-ming.lei%40redhat.com
patch subject: [RESEND PATCH 1/5] loop: remove 'rw' parameter from lo_rw_aio()
config: arm-randconfig-001-20250309 (https://download.01.org/0day-ci/archive/20250309/202503091516.7U64QwvF-lkp@intel.com/config)
compiler: clang version 21.0.0git (https://github.com/llvm/llvm-project e15545cad8297ec7555f26e5ae74a9f0511203e7)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250309/202503091516.7U64QwvF-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202503091516.7U64QwvF-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> drivers/block/loop.c:250:3: warning: variable 'ret' is used uninitialized whenever 'if' condition is false [-Wsometimes-uninitialized]
     250 |                 rq_for_each_segment(bvec, rq, iter) {
         |                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/blk-mq.h:1043:2: note: expanded from macro 'rq_for_each_segment'
    1043 |         __rq_for_each_bio(_iter.bio, _rq)                       \
         |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/blk-mq.h:1039:6: note: expanded from macro '__rq_for_each_bio'
    1039 |         if ((rq->bio))                  \
         |             ^~~~~~~~~
   drivers/block/loop.c:256:10: note: uninitialized use occurs here
     256 |                 return ret;
         |                        ^~~
   drivers/block/loop.c:250:3: note: remove the 'if' if its condition is always true
     250 |                 rq_for_each_segment(bvec, rq, iter) {
         |                 ^
   include/linux/blk-mq.h:1043:2: note: expanded from macro 'rq_for_each_segment'
    1043 |         __rq_for_each_bio(_iter.bio, _rq)                       \
         |         ^
   include/linux/blk-mq.h:1039:2: note: expanded from macro '__rq_for_each_bio'
    1039 |         if ((rq->bio))                  \
         |         ^
   drivers/block/loop.c:248:10: note: initialize the variable 'ret' to silence this warning
     248 |                 int ret;
         |                        ^
         |                         = 0
   1 warning generated.


vim +250 drivers/block/loop.c

^1da177e4c3f41 Linus Torvalds    2005-04-16  239  
e1d39dbf4716cd Ming Lei          2025-03-09  240  static int lo_rw_simple(struct loop_device *lo, struct request *rq, loff_t pos)
^1da177e4c3f41 Linus Torvalds    2005-04-16  241  {
aa4d86163e4e91 Christoph Hellwig 2015-04-07  242  	struct bio_vec bvec;
aa4d86163e4e91 Christoph Hellwig 2015-04-07  243  	struct req_iterator iter;
e1d39dbf4716cd Ming Lei          2025-03-09  244  	struct iov_iter i;
e1d39dbf4716cd Ming Lei          2025-03-09  245  	ssize_t len;
e1d39dbf4716cd Ming Lei          2025-03-09  246  
e1d39dbf4716cd Ming Lei          2025-03-09  247  	if (req_op(rq) == REQ_OP_WRITE) {
e1d39dbf4716cd Ming Lei          2025-03-09  248  		int ret;
aa4d86163e4e91 Christoph Hellwig 2015-04-07  249  
aa4d86163e4e91 Christoph Hellwig 2015-04-07 @250  		rq_for_each_segment(bvec, rq, iter) {
aa4d86163e4e91 Christoph Hellwig 2015-04-07  251  			ret = lo_write_bvec(lo->lo_backing_file, &bvec, &pos);
aa4d86163e4e91 Christoph Hellwig 2015-04-07  252  			if (ret < 0)
aa4d86163e4e91 Christoph Hellwig 2015-04-07  253  				break;
^1da177e4c3f41 Linus Torvalds    2005-04-16  254  			cond_resched();
^1da177e4c3f41 Linus Torvalds    2005-04-16  255  		}
aa4d86163e4e91 Christoph Hellwig 2015-04-07  256  		return ret;
aa4d86163e4e91 Christoph Hellwig 2015-04-07  257  	}
aa4d86163e4e91 Christoph Hellwig 2015-04-07  258  
aa4d86163e4e91 Christoph Hellwig 2015-04-07  259  	rq_for_each_segment(bvec, rq, iter) {
de4eda9de2d957 Al Viro           2022-09-15  260  		iov_iter_bvec(&i, ITER_DEST, &bvec, 1, bvec.bv_len);
18e9710ee59ce3 Christoph Hellwig 2017-05-27  261  		len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0);
aa4d86163e4e91 Christoph Hellwig 2015-04-07  262  		if (len < 0)
aa4d86163e4e91 Christoph Hellwig 2015-04-07  263  			return len;
aa4d86163e4e91 Christoph Hellwig 2015-04-07  264  
aa4d86163e4e91 Christoph Hellwig 2015-04-07  265  		flush_dcache_page(bvec.bv_page);
^1da177e4c3f41 Linus Torvalds    2005-04-16  266  
aa4d86163e4e91 Christoph Hellwig 2015-04-07  267  		if (len != bvec.bv_len) {
aa4d86163e4e91 Christoph Hellwig 2015-04-07  268  			struct bio *bio;
fd5821404e6823 Jens Axboe        2007-06-12  269  
aa4d86163e4e91 Christoph Hellwig 2015-04-07  270  			__rq_for_each_bio(bio, rq)
aa4d86163e4e91 Christoph Hellwig 2015-04-07  271  				zero_fill_bio(bio);
aa4d86163e4e91 Christoph Hellwig 2015-04-07  272  			break;
aa4d86163e4e91 Christoph Hellwig 2015-04-07  273  		}
aa4d86163e4e91 Christoph Hellwig 2015-04-07  274  		cond_resched();
^1da177e4c3f41 Linus Torvalds    2005-04-16  275  	}
^1da177e4c3f41 Linus Torvalds    2005-04-16  276  
aa4d86163e4e91 Christoph Hellwig 2015-04-07  277  	return 0;
fd5821404e6823 Jens Axboe        2007-06-12  278  }
fd5821404e6823 Jens Axboe        2007-06-12  279  

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RESEND PATCH 1/5] loop: remove 'rw' parameter from lo_rw_aio()
  2025-03-08 16:23 ` [RESEND PATCH 1/5] loop: remove 'rw' parameter from lo_rw_aio() Ming Lei
  2025-03-09  7:30   ` kernel test robot
@ 2025-03-10 10:46   ` Christoph Hellwig
  1 sibling, 0 replies; 18+ messages in thread
From: Christoph Hellwig @ 2025-03-10 10:46 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-block

On Sun, Mar 09, 2025 at 12:23:05AM +0800, Ming Lei wrote:
> lo_rw_aio() is only called for READ/WRITE operation, which can be
> figured out from request directly, so remove 'rw' parameter from
> lo_rw_aio(), meantime rename the local variable as 'dir' which matches
> the actual use more.
> 
> Meantime merge lo_read_simple() and lo_write_simple() into
> lo_rw_simple().

That's two entirely separate things, please split them into separate
patches.

static int lo_rw_simple(struct loop_device *lo, struct request *rq, loff_t pos)
>  {
>  	struct bio_vec bvec;
>  	struct req_iterator iter;
>  	struct iov_iter i;
>  	ssize_t len;
>  
> +	if (req_op(rq) == REQ_OP_WRITE) {
> +		int ret;
> +
> +		rq_for_each_segment(bvec, rq, iter) {
> +			ret = lo_write_bvec(lo->lo_backing_file, &bvec, &pos);
> +			if (ret < 0)
> +				break;
> +			cond_resched();
> +		}
> +		return ret;
> +	}
> +
>  	rq_for_each_segment(bvec, rq, iter) {

.. and nothing is really merged here.  So unless you actually merge
some code later this part doesn't seem particularly useful.

>  	struct request *rq = blk_mq_rq_from_pdu(cmd);
> +	int dir = (req_op(rq) == REQ_OP_READ) ? ITER_DEST : ITER_SOURCE;
>  	struct bio *bio = rq->bio;
>  	struct file *file = lo->lo_backing_file;
>  	struct bio_vec tmp;
> @@ -448,7 +442,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
>  	}
>  	atomic_set(&cmd->ref, 2);
>  
> -	iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq));
> +	iov_iter_bvec(&iter, dir, bvec, nr_bvec, blk_rq_bytes(rq));
>  	iter.iov_offset = offset;
>  
>  	cmd->iocb.ki_pos = pos;
> @@ -457,7 +451,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
>  	cmd->iocb.ki_flags = IOCB_DIRECT;
>  	cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
>  
> -	if (rw == ITER_SOURCE)
> +	if (dir == ITER_SOURCE)


i'd just use the request direction check here, and then open code
the iter source/dest in the iov_iter_bvec call.

>  	case REQ_OP_READ:
>  		if (cmd->use_aio)
> -			return lo_rw_aio(lo, cmd, pos, ITER_DEST);
> +			return lo_rw_aio(lo, cmd, pos);
>  		else
> -			return lo_read_simple(lo, rq, pos);
> +			return lo_rw_simple(lo, rq, pos);

Not entirely new here, but there is no reason to use an else after a
return.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RESEND PATCH 2/5] loop: cleanup lo_rw_aio()
  2025-03-08 16:23 ` [RESEND PATCH 2/5] loop: cleanup lo_rw_aio() Ming Lei
@ 2025-03-10 11:07   ` Christoph Hellwig
  0 siblings, 0 replies; 18+ messages in thread
From: Christoph Hellwig @ 2025-03-10 11:07 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-block

On Sun, Mar 09, 2025 at 12:23:06AM +0800, Ming Lei wrote:
> +	if (rq->bio != rq->biotail) {

This would probably be more self-explaining by checking for cmd->bdev
here.

> +		bvec = cmd->bvec;
> +		offset = 0;
> +	} else {
> +		struct bio *bio = rq->bio;
> +
> +		offset = bio->bi_iter.bi_bvec_done;
> +		bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
> +	}
> +	iov_iter_bvec(&iter, dir, bvec, nr_bvec, blk_rq_bytes(rq));
> +	iter.iov_offset = offset;

And given that bvec and offset are only used here I'd just move the
iov_iter_bvec into the branches and do away with the two variables,
and kill the bio variable as well while at it.

> +static inline unsigned lo_cmd_nr_bvec(struct loop_cmd *cmd)
> +{
> +	struct req_iterator rq_iter;
> +	struct request *rq = blk_mq_rq_from_pdu(cmd);
> +	struct bio_vec tmp;
> +	int nr_bvec = 0;
> +
>  	rq_for_each_bvec(tmp, rq, rq_iter)
>  		nr_bvec++;
>  
> +	return nr_bvec;
> +}
> +
> +static int lo_rw_aio_prep(struct loop_device *lo, struct loop_cmd *cmd,
> +			  unsigned nr_bvec)

The function order is a bit weird.  I would expect them to appear in
the rough order that they are called, e.g. lo_cmd_nr_bvec first, then
lo_rw_aio_prep, then the submit helper.

> -		/*
> -		 * Same here, this bio may be started from the middle of the
> -		 * 'bvec' because of bio splitting, so offset from the bvec
> -		 * must be passed to iov iterator
> -		 */

It would be good if this comment didn't get lost.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RESEND PATCH 3/5] loop: add helper loop_queue_work_prep
  2025-03-08 16:23 ` [RESEND PATCH 3/5] loop: add helper loop_queue_work_prep Ming Lei
  2025-03-09  7:30   ` kernel test robot
@ 2025-03-10 11:11   ` Christoph Hellwig
  2025-03-11  1:21     ` Ming Lei
  1 sibling, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2025-03-10 11:11 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-block

On Sun, Mar 09, 2025 at 12:23:07AM +0800, Ming Lei wrote:
> Add helper loop_queue_work_prep() for making loop_queue_rq() more
> readable.

Looking at this and the finaly result I don't really see any advantage
over just moving the code into loop_queue_work.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RESEND PATCH 4/5] loop: try to handle loop aio command via NOWAIT IO first
  2025-03-08 16:23 ` [RESEND PATCH 4/5] loop: try to handle loop aio command via NOWAIT IO first Ming Lei
@ 2025-03-10 11:14   ` Christoph Hellwig
  2025-03-11  1:33     ` Ming Lei
  0 siblings, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2025-03-10 11:14 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-block

On Sun, Mar 09, 2025 at 12:23:08AM +0800, Ming Lei wrote:
> Try to handle loop aio command via NOWAIT IO first, then we can avoid to
> queue the aio command into workqueue.
> 
> Fallback to workqueue in case of -EAGAIN.
> 
> BLK_MQ_F_BLOCKING has to be set for calling into .read_iter() or
> .write_iter() which might sleep even though it is NOWAIT.

This needs performance numbers (or other reasons) justifying the
change, especially as BLK_MQ_F_BLOCKING is a bit of an overhead.

>  static DEFINE_IDR(loop_index_idr);
>  static DEFINE_MUTEX(loop_ctl_mutex);
>  static DEFINE_MUTEX(loop_validate_mutex);
> @@ -380,8 +382,17 @@ static void lo_rw_aio_do_completion(struct loop_cmd *cmd)
>  
>  	if (!atomic_dec_and_test(&cmd->ref))
>  		return;
> +
> +	if (cmd->ret == -EAGAIN) {
> +		struct loop_device *lo = rq->q->queuedata;
> +
> +		loop_queue_work(lo, cmd);
> +		return;
> +	}

This looks like the wrong place for the rety, as -EAGAIN can only come from
the submissions path.  i.e. we should never make it to the full completion
path for that case.

>  static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos)
> +{
> +	unsigned int nr_bvec = lo_cmd_nr_bvec(cmd);
> +	int ret;
> +
> +	cmd->iocb.ki_flags &= ~IOCB_NOWAIT;
> +	ret = lo_submit_rw_aio(lo, cmd, pos, nr_bvec);
> +	if (ret != -EIOCBQUEUED)
> +		lo_rw_aio_complete(&cmd->iocb, ret);
> +	return 0;

This needs an explanation that it is for the fallback path and thus
clears the nowait flag.

> +}
> +
> +static int lo_rw_aio_nowait(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos)

Overly long line.

> @@ -1926,6 +1955,17 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
>  		break;
>  	}
>  
> +	if (cmd->use_aio) {
> +		loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
> +		int ret = lo_rw_aio_nowait(lo, cmd, pos);
> +
> +		if (!ret)
> +			return BLK_STS_OK;
> +		if (ret != -EAGAIN)
> +			return BLK_STS_IOERR;
> +		/* fallback to workqueue for handling aio */
> +	}

Why isn't all the logic in this branch in lo_rw_aio_nowait?


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RESEND PATCH 5/5] loop: add module parameter of 'nr_hw_queues'
  2025-03-08 16:23 ` [RESEND PATCH 5/5] loop: add module parameter of 'nr_hw_queues' Ming Lei
@ 2025-03-10 11:15   ` Christoph Hellwig
  0 siblings, 0 replies; 18+ messages in thread
From: Christoph Hellwig @ 2025-03-10 11:15 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-block

On Sun, Mar 09, 2025 at 12:23:09AM +0800, Ming Lei wrote:
> Add module parameter of 'nr_hw_queues' so that loop can support MQ,
> which may reduce contention in case of too many io jobs.

More details, please.  Also a module parameter is a really bad
interface - this needs to be per-device and have a good default.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RESEND PATCH 3/5] loop: add helper loop_queue_work_prep
  2025-03-10 11:11   ` Christoph Hellwig
@ 2025-03-11  1:21     ` Ming Lei
  2025-03-11  7:55       ` Christoph Hellwig
  0 siblings, 1 reply; 18+ messages in thread
From: Ming Lei @ 2025-03-11  1:21 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Jens Axboe, linux-block

On Mon, Mar 10, 2025 at 12:11:20PM +0100, Christoph Hellwig wrote:
> On Sun, Mar 09, 2025 at 12:23:07AM +0800, Ming Lei wrote:
> > Add helper loop_queue_work_prep() for making loop_queue_rq() more
> > readable.
> 
> Looking at this and the finaly result I don't really see any advantage
> over just moving the code into loop_queue_work.

loop_queue_work() is required for handling -EAGAIN, that is why I move
loop_queue_work_prep() into loop_queue_work().


Thanks,
Ming


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RESEND PATCH 4/5] loop: try to handle loop aio command via NOWAIT IO first
  2025-03-10 11:14   ` Christoph Hellwig
@ 2025-03-11  1:33     ` Ming Lei
  2025-03-11  7:58       ` Christoph Hellwig
  0 siblings, 1 reply; 18+ messages in thread
From: Ming Lei @ 2025-03-11  1:33 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Jens Axboe, linux-block

On Mon, Mar 10, 2025 at 12:14:44PM +0100, Christoph Hellwig wrote:
> On Sun, Mar 09, 2025 at 12:23:08AM +0800, Ming Lei wrote:
> > Try to handle loop aio command via NOWAIT IO first, then we can avoid to
> > queue the aio command into workqueue.
> > 
> > Fallback to workqueue in case of -EAGAIN.
> > 
> > BLK_MQ_F_BLOCKING has to be set for calling into .read_iter() or
> > .write_iter() which might sleep even though it is NOWAIT.
> 
> This needs performance numbers (or other reasons) justifying the
> change, especially as BLK_MQ_F_BLOCKING is a bit of an overhead.

The difference is just rcu_read_lock() vs. srcu_read_lock(), and not
see any difference in typical fio workload on loop device, and the gain
is pretty obvious, bandwidth is increased by > 4X in aio workloads:

https://lore.kernel.org/linux-block/f7c9d956-2b9b-8bb4-aa49-d57323fc8eb0@redhat.com/T/#md3a6154218cb6619d8af5432cf2dd3a4a7a3dcc6

> 
> >  static DEFINE_IDR(loop_index_idr);
> >  static DEFINE_MUTEX(loop_ctl_mutex);
> >  static DEFINE_MUTEX(loop_validate_mutex);
> > @@ -380,8 +382,17 @@ static void lo_rw_aio_do_completion(struct loop_cmd *cmd)
> >  
> >  	if (!atomic_dec_and_test(&cmd->ref))
> >  		return;
> > +
> > +	if (cmd->ret == -EAGAIN) {
> > +		struct loop_device *lo = rq->q->queuedata;
> > +
> > +		loop_queue_work(lo, cmd);
> > +		return;
> > +	}
> 
> This looks like the wrong place for the rety, as -EAGAIN can only come from
> the submissions path.  i.e. we should never make it to the full completion
> path for that case.

That is not true, at least for XFS:

[root@ktest-40 io]# bpftrace -e 'kretfunc:lo_rw_aio_complete /args->ret == -11/ { @eagain[kstack] = count() } '
Attaching 1 probe...
^C

@eagain[
    bpf_prog_6deef7357e7b4530_sd_fw_ingress+28250
    bpf_prog_6deef7357e7b4530_sd_fw_ingress+28250
    bpf_trampoline_367219848433+108
    lo_rw_aio_complete+9
    blkdev_bio_end_io_async+63
    bio_submit_split+347
    blk_mq_submit_bio+395
    __submit_bio+116
    submit_bio_noacct_nocheck+773
    submit_bio_wait+87
    xfs_rw_bdev+348
    xlog_do_io+131
    xlog_write_log_records+451
    xlog_find_tail+829
    xlog_recover+61
    xfs_log_mount+259
    xfs_mountfs+1232
    xfs_fs_fill_super+1507
    get_tree_bdev_flags+303
    vfs_get_tree+38
    vfs_cmd_create+89
    __do_sys_fsconfig+1286
    do_syscall_64+130
    entry_SYSCALL_64_after_hwframe+118
]: 2


> 
> >  static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos)
> > +{
> > +	unsigned int nr_bvec = lo_cmd_nr_bvec(cmd);
> > +	int ret;
> > +
> > +	cmd->iocb.ki_flags &= ~IOCB_NOWAIT;
> > +	ret = lo_submit_rw_aio(lo, cmd, pos, nr_bvec);
> > +	if (ret != -EIOCBQUEUED)
> > +		lo_rw_aio_complete(&cmd->iocb, ret);
> > +	return 0;
> 
> This needs an explanation that it is for the fallback path and thus
> clears the nowait flag.

OK.

> 
> > +}
> > +
> > +static int lo_rw_aio_nowait(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos)
> 
> Overly long line.
> 
> > @@ -1926,6 +1955,17 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
> >  		break;
> >  	}
> >  
> > +	if (cmd->use_aio) {
> > +		loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
> > +		int ret = lo_rw_aio_nowait(lo, cmd, pos);
> > +
> > +		if (!ret)
> > +			return BLK_STS_OK;
> > +		if (ret != -EAGAIN)
> > +			return BLK_STS_IOERR;
> > +		/* fallback to workqueue for handling aio */
> > +	}
> 
> Why isn't all the logic in this branch in lo_rw_aio_nowait?

Good catch, I just found we have BLK_STS_AGAIN.


Thanks,
Ming


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RESEND PATCH 3/5] loop: add helper loop_queue_work_prep
  2025-03-11  1:21     ` Ming Lei
@ 2025-03-11  7:55       ` Christoph Hellwig
  0 siblings, 0 replies; 18+ messages in thread
From: Christoph Hellwig @ 2025-03-11  7:55 UTC (permalink / raw)
  To: Ming Lei; +Cc: Christoph Hellwig, Jens Axboe, linux-block

On Tue, Mar 11, 2025 at 09:21:14AM +0800, Ming Lei wrote:
> On Mon, Mar 10, 2025 at 12:11:20PM +0100, Christoph Hellwig wrote:
> > On Sun, Mar 09, 2025 at 12:23:07AM +0800, Ming Lei wrote:
> > > Add helper loop_queue_work_prep() for making loop_queue_rq() more
> > > readable.
> > 
> > Looking at this and the finaly result I don't really see any advantage
> > over just moving the code into loop_queue_work.
> 
> loop_queue_work() is required for handling -EAGAIN, that is why I move
> loop_queue_work_prep() into loop_queue_work().

Yes, but please just add the code directly to loop_queue_work instead
of adding a not very clearly defined helper.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RESEND PATCH 4/5] loop: try to handle loop aio command via NOWAIT IO first
  2025-03-11  1:33     ` Ming Lei
@ 2025-03-11  7:58       ` Christoph Hellwig
  2025-03-11 10:55         ` Ming Lei
  0 siblings, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2025-03-11  7:58 UTC (permalink / raw)
  To: Ming Lei; +Cc: Christoph Hellwig, Jens Axboe, linux-block

On Tue, Mar 11, 2025 at 09:33:01AM +0800, Ming Lei wrote:
> On Mon, Mar 10, 2025 at 12:14:44PM +0100, Christoph Hellwig wrote:
> > On Sun, Mar 09, 2025 at 12:23:08AM +0800, Ming Lei wrote:
> > > Try to handle loop aio command via NOWAIT IO first, then we can avoid to
> > > queue the aio command into workqueue.
> > > 
> > > Fallback to workqueue in case of -EAGAIN.
> > > 
> > > BLK_MQ_F_BLOCKING has to be set for calling into .read_iter() or
> > > .write_iter() which might sleep even though it is NOWAIT.
> > 
> > This needs performance numbers (or other reasons) justifying the
> > change, especially as BLK_MQ_F_BLOCKING is a bit of an overhead.
> 
> The difference is just rcu_read_lock() vs. srcu_read_lock(), and not

Not, it also includes offloading to kblockd in more cases.

>
>
> see any difference in typical fio workload on loop device, and the gain
> is pretty obvious, bandwidth is increased by > 4X in aio workloads:
> 
> https://lore.kernel.org/linux-block/f7c9d956-2b9b-8bb4-aa49-d57323fc8eb0@redhat.com/T/#md3a6154218cb6619d8af5432cf2dd3a4a7a3dcc6

Please document that in the commit log.

> > > +	if (cmd->ret == -EAGAIN) {
> > > +		struct loop_device *lo = rq->q->queuedata;
> > > +
> > > +		loop_queue_work(lo, cmd);
> > > +		return;
> > > +	}
> > 
> > This looks like the wrong place for the rety, as -EAGAIN can only come from
> > the submissions path.  i.e. we should never make it to the full completion
> > path for that case.
> 
> That is not true, at least for XFS:

Your trace sees lo_rw_aio_complete called from the block layer
splitting called from loop, I see nothing about XFS there.  But yes,
this shows the issue discussed last week in the iomap IOCB_NOWAIT
thread.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RESEND PATCH 4/5] loop: try to handle loop aio command via NOWAIT IO first
  2025-03-11  7:58       ` Christoph Hellwig
@ 2025-03-11 10:55         ` Ming Lei
  0 siblings, 0 replies; 18+ messages in thread
From: Ming Lei @ 2025-03-11 10:55 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Jens Axboe, linux-block

On Tue, Mar 11, 2025 at 12:58:46AM -0700, Christoph Hellwig wrote:
> On Tue, Mar 11, 2025 at 09:33:01AM +0800, Ming Lei wrote:
> > On Mon, Mar 10, 2025 at 12:14:44PM +0100, Christoph Hellwig wrote:
> > > On Sun, Mar 09, 2025 at 12:23:08AM +0800, Ming Lei wrote:
> > > > Try to handle loop aio command via NOWAIT IO first, then we can avoid to
> > > > queue the aio command into workqueue.
> > > > 
> > > > Fallback to workqueue in case of -EAGAIN.
> > > > 
> > > > BLK_MQ_F_BLOCKING has to be set for calling into .read_iter() or
> > > > .write_iter() which might sleep even though it is NOWAIT.
> > > 
> > > This needs performance numbers (or other reasons) justifying the
> > > change, especially as BLK_MQ_F_BLOCKING is a bit of an overhead.
> > 
> > The difference is just rcu_read_lock() vs. srcu_read_lock(), and not
> 
> Not, it also includes offloading to kblockd in more cases.

But loop doesn't run into these cases:

blk_execute_rq_nowait():
	blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING);

blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
	blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING);

blk_mq_start_stopped_hw_queues
	...

> 
> >
> >
> > see any difference in typical fio workload on loop device, and the gain
> > is pretty obvious, bandwidth is increased by > 4X in aio workloads:
> > 
> > https://lore.kernel.org/linux-block/f7c9d956-2b9b-8bb4-aa49-d57323fc8eb0@redhat.com/T/#md3a6154218cb6619d8af5432cf2dd3a4a7a3dcc6
> 
> Please document that in the commit log.
> 
> > > > +	if (cmd->ret == -EAGAIN) {
> > > > +		struct loop_device *lo = rq->q->queuedata;
> > > > +
> > > > +		loop_queue_work(lo, cmd);
> > > > +		return;
> > > > +	}
> > > 
> > > This looks like the wrong place for the rety, as -EAGAIN can only come from
> > > the submissions path.  i.e. we should never make it to the full completion
> > > path for that case.
> > 
> > That is not true, at least for XFS:
> 
> Your trace sees lo_rw_aio_complete called from the block layer
> splitting called from loop, I see nothing about XFS there.  But yes,
> this shows the issue discussed last week in the iomap IOCB_NOWAIT
> thread.

Looks I mis-parse the stack, but it is still returned from blkdev's ->ki_complete(),
and need to be handled.



Thanks,
Ming


^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2025-03-11 10:55 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-03-08 16:23 [RESEND PATCH 0/5] loop: improve loop aio perf by IOCB_NOWAIT Ming Lei
2025-03-08 16:23 ` [RESEND PATCH 1/5] loop: remove 'rw' parameter from lo_rw_aio() Ming Lei
2025-03-09  7:30   ` kernel test robot
2025-03-10 10:46   ` Christoph Hellwig
2025-03-08 16:23 ` [RESEND PATCH 2/5] loop: cleanup lo_rw_aio() Ming Lei
2025-03-10 11:07   ` Christoph Hellwig
2025-03-08 16:23 ` [RESEND PATCH 3/5] loop: add helper loop_queue_work_prep Ming Lei
2025-03-09  7:30   ` kernel test robot
2025-03-10 11:11   ` Christoph Hellwig
2025-03-11  1:21     ` Ming Lei
2025-03-11  7:55       ` Christoph Hellwig
2025-03-08 16:23 ` [RESEND PATCH 4/5] loop: try to handle loop aio command via NOWAIT IO first Ming Lei
2025-03-10 11:14   ` Christoph Hellwig
2025-03-11  1:33     ` Ming Lei
2025-03-11  7:58       ` Christoph Hellwig
2025-03-11 10:55         ` Ming Lei
2025-03-08 16:23 ` [RESEND PATCH 5/5] loop: add module parameter of 'nr_hw_queues' Ming Lei
2025-03-10 11:15   ` Christoph Hellwig

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).