[PATCH] ublk_drv: don't call task_work_add for queueing io commands

linux-block.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] ublk_drv: don't call task_work_add for queueing io commands
@ 2022-10-23  9:38 Ming Lei
  2022-10-24  9:48 ` Ziyang Zhang
  0 siblings, 1 reply; 12+ messages in thread
From: Ming Lei @ 2022-10-23  9:38 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, ZiyangZhang, Ming Lei

task_work_add() is used for waking ubq daemon task with one batch
of io requests/commands queued. However, task_work_add() isn't
exported for module code, and it is still debatable if the symbol
should be exported.

Fortunately we still have io_uring_cmd_complete_in_task() which just
can't handle batched wakeup for us.

Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task()
via current command for running them via task work.

This way cleans up current code a lot, meantime allow us to wakeup
ubq daemon task after queueing batched requests/io commands.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/block/ublk_drv.c | 130 ++++++++++++++++-----------------------
 1 file changed, 52 insertions(+), 78 deletions(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 5afce6ffaadf..7963fba66dd1 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -41,7 +41,7 @@
 #include <linux/delay.h>
 #include <linux/mm.h>
 #include <asm/page.h>
-#include <linux/task_work.h>
+#include <linux/llist.h>
 #include <uapi/linux/ublk_cmd.h>
 
 #define UBLK_MINORS		(1U << MINORBITS)
@@ -56,12 +56,9 @@
 /* All UBLK_PARAM_TYPE_* should be included here */
 #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
 
-struct ublk_rq_data {
-	struct callback_head work;
-};
-
 struct ublk_uring_cmd_pdu {
 	struct request *req;
+	struct llist_node node;
 };
 
 /*
@@ -119,6 +116,8 @@ struct ublk_queue {
 	struct task_struct	*ubq_daemon;
 	char *io_cmd_buf;
 
+	struct llist_head	io_cmds;
+
 	unsigned long io_addr;	/* mapped vm address */
 	unsigned int max_io_sz;
 	bool force_abort;
@@ -268,14 +267,6 @@ static int ublk_apply_params(struct ublk_device *ub)
 	return 0;
 }
 
-static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq)
-{
-	if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) &&
-			!(ubq->flags & UBLK_F_URING_CMD_COMP_IN_TASK))
-		return true;
-	return false;
-}
-
 static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
 {
 	if (ubq->flags & UBLK_F_NEED_GET_DATA)
@@ -761,20 +752,16 @@ static inline void __ublk_rq_task_work(struct request *req)
 	ubq_complete_io_cmd(io, UBLK_IO_RES_OK);
 }
 
-static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd)
+static void ublk_rqs_task_work_cb(struct io_uring_cmd *cmd)
 {
 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+	struct ublk_queue *ubq = pdu->req->mq_hctx->driver_data;
+	struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds);
 
 	__ublk_rq_task_work(pdu->req);
-}
 
-static void ublk_rq_task_work_fn(struct callback_head *work)
-{
-	struct ublk_rq_data *data = container_of(work,
-			struct ublk_rq_data, work);
-	struct request *req = blk_mq_rq_from_pdu(data);
-
-	__ublk_rq_task_work(req);
+	llist_for_each_entry(pdu, io_cmds, node)
+		__ublk_rq_task_work(pdu->req);
 }
 
 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
@@ -782,12 +769,16 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
 {
 	struct ublk_queue *ubq = hctx->driver_data;
 	struct request *rq = bd->rq;
+	struct ublk_io *io = &ubq->ios[rq->tag];
+	struct io_uring_cmd *cmd = io->cmd;
+	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
 	blk_status_t res;
 
 	/* fill iod to slot in io cmd buffer */
 	res = ublk_setup_iod(ubq, rq);
 	if (unlikely(res != BLK_STS_OK))
 		return BLK_STS_IOERR;
+
 	/* With recovery feature enabled, force_abort is set in
 	 * ublk_stop_dev() before calling del_gendisk(). We have to
 	 * abort all requeued and new rqs here to let del_gendisk()
@@ -802,42 +793,38 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 	blk_mq_start_request(bd->rq);
 
-	if (unlikely(ubq_daemon_is_dying(ubq))) {
- fail:
+	/*
+	 * If the check pass, we know that this is a re-issued request aborted
+	 * previously in monitor_work because the ubq_daemon(cmd's task) is
+	 * PF_EXITING. We cannot call io_uring_cmd_complete_in_task() anymore
+	 * because this ioucmd's io_uring context may be freed now if no inflight
+	 * ioucmd exists. Otherwise we may cause null-deref in ctx->fallback_work.
+	 *
+	 * Note: monitor_work sets UBLK_IO_FLAG_ABORTED and ends this request(releasing
+	 * the tag). Then the request is re-started(allocating the tag) and we are here.
+	 * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED
+	 * guarantees that here is a re-issued request aborted previously.
+	 */
+	if (unlikely(ubq_daemon_is_dying(ubq) ||
+				(io->flags & UBLK_IO_FLAG_ABORTED))) {
 		__ublk_abort_rq(ubq, rq);
 		return BLK_STS_OK;
 	}
 
-	if (ublk_can_use_task_work(ubq)) {
-		struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
-		enum task_work_notify_mode notify_mode = bd->last ?
-			TWA_SIGNAL_NO_IPI : TWA_NONE;
-
-		if (task_work_add(ubq->ubq_daemon, &data->work, notify_mode))
-			goto fail;
-	} else {
-		struct ublk_io *io = &ubq->ios[rq->tag];
-		struct io_uring_cmd *cmd = io->cmd;
-		struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+	pdu->req = rq;
 
-		/*
-		 * If the check pass, we know that this is a re-issued request aborted
-		 * previously in monitor_work because the ubq_daemon(cmd's task) is
-		 * PF_EXITING. We cannot call io_uring_cmd_complete_in_task() anymore
-		 * because this ioucmd's io_uring context may be freed now if no inflight
-		 * ioucmd exists. Otherwise we may cause null-deref in ctx->fallback_work.
-		 *
-		 * Note: monitor_work sets UBLK_IO_FLAG_ABORTED and ends this request(releasing
-		 * the tag). Then the request is re-started(allocating the tag) and we are here.
-		 * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED
-		 * guarantees that here is a re-issued request aborted previously.
-		 */
-		if ((io->flags & UBLK_IO_FLAG_ABORTED))
-			goto fail;
-
-		pdu->req = rq;
-		io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
-	}
+	/*
+	 * Typical multiple producers and single consumer, it is just fine
+	 * to use llist_add() in producer side and llist_del_all() in
+	 * consumer side.
+	 *
+	 * The last command can't be added into list, otherwise it could
+	 * be handled twice
+	 */
+	if (bd->last)
+		io_uring_cmd_complete_in_task(cmd, ublk_rqs_task_work_cb);
+	else
+		llist_add(&pdu->node, &ubq->io_cmds);
 
 	return BLK_STS_OK;
 }
@@ -846,8 +833,8 @@ static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx)
 {
 	struct ublk_queue *ubq = hctx->driver_data;
 
-	if (ublk_can_use_task_work(ubq))
-		__set_notify_signal(ubq->ubq_daemon);
+	if (!test_and_set_tsk_thread_flag(ubq->ubq_daemon, TIF_NOTIFY_SIGNAL))
+		wake_up_process(ubq->ubq_daemon);
 }
 
 static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
@@ -860,20 +847,10 @@ static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
 	return 0;
 }
 
-static int ublk_init_rq(struct blk_mq_tag_set *set, struct request *req,
-		unsigned int hctx_idx, unsigned int numa_node)
-{
-	struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
-
-	init_task_work(&data->work, ublk_rq_task_work_fn);
-	return 0;
-}
-
 static const struct blk_mq_ops ublk_mq_ops = {
 	.queue_rq       = ublk_queue_rq,
 	.commit_rqs     = ublk_commit_rqs,
 	.init_hctx	= ublk_init_hctx,
-	.init_request   = ublk_init_rq,
 };
 
 static int ublk_ch_open(struct inode *inode, struct file *filp)
@@ -1163,23 +1140,21 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
 	mutex_unlock(&ub->mutex);
 }
 
+static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd)
+{
+	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+
+	__ublk_rq_task_work(pdu->req);
+}
+
 static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
 		int tag, struct io_uring_cmd *cmd)
 {
-	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
 	struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
+	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
 
-	if (ublk_can_use_task_work(ubq)) {
-		struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
-
-		/* should not fail since we call it just in ubq->ubq_daemon */
-		task_work_add(ubq->ubq_daemon, &data->work, TWA_SIGNAL_NO_IPI);
-	} else {
-		struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
-
-		pdu->req = req;
-		io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
-	}
+	pdu->req = req;
+	io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
 }
 
 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
@@ -1450,7 +1425,6 @@ static int ublk_add_tag_set(struct ublk_device *ub)
 	ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
 	ub->tag_set.queue_depth = ub->dev_info.queue_depth;
 	ub->tag_set.numa_node = NUMA_NO_NODE;
-	ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
 	ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
 	ub->tag_set.driver_data = ub;
 	return blk_mq_alloc_tag_set(&ub->tag_set);
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands
  2022-10-23  9:38 [PATCH] ublk_drv: don't call task_work_add for queueing io commands Ming Lei
@ 2022-10-24  9:48 ` Ziyang Zhang
  2022-10-24 13:20   ` Ming Lei
  0 siblings, 1 reply; 12+ messages in thread
From: Ziyang Zhang @ 2022-10-24  9:48 UTC (permalink / raw)
  To: Ming Lei; +Cc: linux-block, Jens Axboe

On 2022/10/23 17:38, Ming Lei wrote:
> task_work_add() is used for waking ubq daemon task with one batch
> of io requests/commands queued. However, task_work_add() isn't
> exported for module code, and it is still debatable if the symbol
> should be exported.
> 
> Fortunately we still have io_uring_cmd_complete_in_task() which just
> can't handle batched wakeup for us.
> 
> Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task()
> via current command for running them via task work.
> 
> This way cleans up current code a lot, meantime allow us to wakeup
> ubq daemon task after queueing batched requests/io commands.
> 


Hi, Ming

This patch works and I have run some tests to compare current version(ucmd)
with your patch(ucmd-batch).

iodepth=128 numjobs=1 direct=1 bs=4k

--------------------------------------------
ublk loop target, the backend is a file.
IOPS(k)

type		ucmd		ucmd-batch
seq-read	54.7		54.2	
rand-read	52.8		52.0

--------------------------------------------
ublk null target
IOPS(k)

type		ucmd		ucmd-batch
seq-read	257		257
rand-read	252		253


I find that io_req_task_work_add() puts task_work node into a llist
first, then it may call task_work_add() to run batched task_works. So do we really
need such llist in ublk_drv? I think io_uring has already considered task_work batch
optimization.

BTW, task_work_add() in ublk_drv achieves
higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task()
in ublk_drv.

Regards,
Zhang

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands
  2022-10-24  9:48 ` Ziyang Zhang
@ 2022-10-24 13:20   ` Ming Lei
  2022-10-25  3:15     ` Ziyang Zhang
  0 siblings, 1 reply; 12+ messages in thread
From: Ming Lei @ 2022-10-24 13:20 UTC (permalink / raw)
  To: Ziyang Zhang; +Cc: linux-block, Jens Axboe

Hello Ziyang,

On Mon, Oct 24, 2022 at 05:48:51PM +0800, Ziyang Zhang wrote:
> On 2022/10/23 17:38, Ming Lei wrote:
> > task_work_add() is used for waking ubq daemon task with one batch
> > of io requests/commands queued. However, task_work_add() isn't
> > exported for module code, and it is still debatable if the symbol
> > should be exported.
> > 
> > Fortunately we still have io_uring_cmd_complete_in_task() which just
> > can't handle batched wakeup for us.
> > 
> > Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task()
> > via current command for running them via task work.
> > 
> > This way cleans up current code a lot, meantime allow us to wakeup
> > ubq daemon task after queueing batched requests/io commands.
> > 
> 
> 
> Hi, Ming
> 
> This patch works and I have run some tests to compare current version(ucmd)
> with your patch(ucmd-batch).
> 
> iodepth=128 numjobs=1 direct=1 bs=4k
> 
> --------------------------------------------
> ublk loop target, the backend is a file.
> IOPS(k)
> 
> type		ucmd		ucmd-batch
> seq-read	54.7		54.2	
> rand-read	52.8		52.0
> 
> --------------------------------------------
> ublk null target
> IOPS(k)
> 
> type		ucmd		ucmd-batch
> seq-read	257		257
> rand-read	252		253
> 
> 
> I find that io_req_task_work_add() puts task_work node into a llist
> first, then it may call task_work_add() to run batched task_works. So do we really
> need such llist in ublk_drv? I think io_uring has already considered task_work batch
> optimization.
> 
> BTW, task_work_add() in ublk_drv achieves
> higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task()
> in ublk_drv.

Yeah, that is same with my observation, and motivation of this patch is
to get same performance with task_work_add by building ublk_drv as
module. One win of task_work_add() is that we get exact batching info
meantime only send TWA_SIGNAL_NO_IPI for whole batch, that is basically
what the patch is doing, but needs help of the following ublksrv patch:

https://github.com/ming1/ubdsrv/commit/dce6d1d222023c1641292713b311ced01e6dc548

which sets IORING_SETUP_COOP_TASKRUN for ublksrv's uring, then
io_uring_cmd_complete_in_task will notify via TWA_SIGNAL_NO_IPI, and 5+%
IOPS boost is observed on loop/001 by putting image on SSD in my test
VM.

Thanks, 
Ming


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands
  2022-10-24 13:20   ` Ming Lei
@ 2022-10-25  3:15     ` Ziyang Zhang
  2022-10-25  7:19       ` Ming Lei
  0 siblings, 1 reply; 12+ messages in thread
From: Ziyang Zhang @ 2022-10-25  3:15 UTC (permalink / raw)
  To: Ming Lei; +Cc: linux-block, Jens Axboe

On 2022/10/24 21:20, Ming Lei wrote:
> Hello Ziyang,
> 
> On Mon, Oct 24, 2022 at 05:48:51PM +0800, Ziyang Zhang wrote:
>> On 2022/10/23 17:38, Ming Lei wrote:
>>> task_work_add() is used for waking ubq daemon task with one batch
>>> of io requests/commands queued. However, task_work_add() isn't
>>> exported for module code, and it is still debatable if the symbol
>>> should be exported.
>>>
>>> Fortunately we still have io_uring_cmd_complete_in_task() which just
>>> can't handle batched wakeup for us.
>>>
>>> Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task()
>>> via current command for running them via task work.
>>>
>>> This way cleans up current code a lot, meantime allow us to wakeup
>>> ubq daemon task after queueing batched requests/io commands.
>>>
>>
>>
>> Hi, Ming
>>
>> This patch works and I have run some tests to compare current version(ucmd)
>> with your patch(ucmd-batch).
>>
>> iodepth=128 numjobs=1 direct=1 bs=4k
>>
>> --------------------------------------------
>> ublk loop target, the backend is a file.
>> IOPS(k)
>>
>> type		ucmd		ucmd-batch
>> seq-read	54.7		54.2	
>> rand-read	52.8		52.0
>>
>> --------------------------------------------
>> ublk null target
>> IOPS(k)
>>
>> type		ucmd		ucmd-batch
>> seq-read	257		257
>> rand-read	252		253
>>
>>
>> I find that io_req_task_work_add() puts task_work node into a llist
>> first, then it may call task_work_add() to run batched task_works. So do we really
>> need such llist in ublk_drv? I think io_uring has already considered task_work batch
>> optimization.
>>
>> BTW, task_work_add() in ublk_drv achieves
>> higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task()
>> in ublk_drv.
> 
> Yeah, that is same with my observation, and motivation of this patch is
> to get same performance with task_work_add by building ublk_drv as
> module. One win of task_work_add() is that we get exact batching info
> meantime only send TWA_SIGNAL_NO_IPI for whole batch, that is basically
> what the patch is doing, but needs help of the following ublksrv patch:
> 
> https://github.com/ming1/ubdsrv/commit/dce6d1d222023c1641292713b311ced01e6dc548
> 
> which sets IORING_SETUP_COOP_TASKRUN for ublksrv's uring, then
> io_uring_cmd_complete_in_task will notify via TWA_SIGNAL_NO_IPI, and 5+%
> IOPS boost is observed on loop/001 by putting image on SSD in my test
> VM.

Hi Ming,

I have added this ublksrv patch and run the above test again.
I have also run ublksrv test: loop/001. Please check them.

Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
64GB MEM, CentOS 8, kernel 6.0+

--------
fio test

iodepth=128 numjobs=1 direct=1 bs=4k

ucmd: without your kernel patch. Run io_uring_cmd_complete_in_task()
for each blk-mq rq.

ucmd-batch: with your kernel patch. Run io_uring_cmd_complete_in_task()
for the last blk-mq rq.

--------------------------------------------
ublk loop target, the backend is a file.

IOPS(k)

type		ucmd		ucmd-batch
seq-read	54.1		53.7
rand-read	52.0		52.0

--------------------------------------------
ublk null target
IOPS(k)

type		ucmd		ucmd-batch
seq-read	272		265
rand-read	262		260

------------
ublksrv test

-------------
ucmd

running loop/001
        fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
        randwrite: jobs 1, iops 66737
        randread: jobs 1, iops 64935
        randrw: jobs 1, iops read 32694 write 32710
        rw(512k): jobs 1, iops read 772 write 819

-------------
ucmd-batch

running loop/001
        fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_F56a3), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
        randwrite: jobs 1, iops 66720
        randread: jobs 1, iops 65015
        randrw: jobs 1, iops read 32743 write 32759
        rw(512k): jobs 1, iops read 771 write 817


It seems that manually putting rqs into llist and calling
io_uring_cmd_complete_in_task() while handling the last rq does
not improve IOPS much.

io_req_task_work_add() puts task_work node into a internal llist
first, then it may call task_work_add() to run batched task_works.
IMO, io_uring has already done such batch optimization and ublk_drv
does not need to add such llist.

Regards,
Zhang.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands
  2022-10-25  3:15     ` Ziyang Zhang
@ 2022-10-25  7:19       ` Ming Lei
  2022-10-25  7:46         ` Ziyang Zhang
  0 siblings, 1 reply; 12+ messages in thread
From: Ming Lei @ 2022-10-25  7:19 UTC (permalink / raw)
  To: Ziyang Zhang; +Cc: linux-block, Jens Axboe

On Tue, Oct 25, 2022 at 11:15:57AM +0800, Ziyang Zhang wrote:
> On 2022/10/24 21:20, Ming Lei wrote:
> > Hello Ziyang,
> > 
> > On Mon, Oct 24, 2022 at 05:48:51PM +0800, Ziyang Zhang wrote:
> >> On 2022/10/23 17:38, Ming Lei wrote:
> >>> task_work_add() is used for waking ubq daemon task with one batch
> >>> of io requests/commands queued. However, task_work_add() isn't
> >>> exported for module code, and it is still debatable if the symbol
> >>> should be exported.
> >>>
> >>> Fortunately we still have io_uring_cmd_complete_in_task() which just
> >>> can't handle batched wakeup for us.
> >>>
> >>> Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task()
> >>> via current command for running them via task work.
> >>>
> >>> This way cleans up current code a lot, meantime allow us to wakeup
> >>> ubq daemon task after queueing batched requests/io commands.
> >>>
> >>
> >>
> >> Hi, Ming
> >>
> >> This patch works and I have run some tests to compare current version(ucmd)
> >> with your patch(ucmd-batch).
> >>
> >> iodepth=128 numjobs=1 direct=1 bs=4k
> >>
> >> --------------------------------------------
> >> ublk loop target, the backend is a file.
> >> IOPS(k)
> >>
> >> type		ucmd		ucmd-batch
> >> seq-read	54.7		54.2	
> >> rand-read	52.8		52.0
> >>
> >> --------------------------------------------
> >> ublk null target
> >> IOPS(k)
> >>
> >> type		ucmd		ucmd-batch
> >> seq-read	257		257
> >> rand-read	252		253
> >>
> >>
> >> I find that io_req_task_work_add() puts task_work node into a llist
> >> first, then it may call task_work_add() to run batched task_works. So do we really
> >> need such llist in ublk_drv? I think io_uring has already considered task_work batch
> >> optimization.
> >>
> >> BTW, task_work_add() in ublk_drv achieves
> >> higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task()
> >> in ublk_drv.
> > 
> > Yeah, that is same with my observation, and motivation of this patch is
> > to get same performance with task_work_add by building ublk_drv as
> > module. One win of task_work_add() is that we get exact batching info
> > meantime only send TWA_SIGNAL_NO_IPI for whole batch, that is basically
> > what the patch is doing, but needs help of the following ublksrv patch:
> > 
> > https://github.com/ming1/ubdsrv/commit/dce6d1d222023c1641292713b311ced01e6dc548
> > 
> > which sets IORING_SETUP_COOP_TASKRUN for ublksrv's uring, then
> > io_uring_cmd_complete_in_task will notify via TWA_SIGNAL_NO_IPI, and 5+%
> > IOPS boost is observed on loop/001 by putting image on SSD in my test
> > VM.
> 
> Hi Ming,
> 
> I have added this ublksrv patch and run the above test again.
> I have also run ublksrv test: loop/001. Please check them.
> 
> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
> 64GB MEM, CentOS 8, kernel 6.0+
> 
> --------
> fio test
> 
> iodepth=128 numjobs=1 direct=1 bs=4k
> 
> ucmd: without your kernel patch. Run io_uring_cmd_complete_in_task()
> for each blk-mq rq.
> 
> ucmd-batch: with your kernel patch. Run io_uring_cmd_complete_in_task()
> for the last blk-mq rq.
> 
> --------------------------------------------
> ublk loop target, the backend is a file.
> 
> IOPS(k)
> 
> type		ucmd		ucmd-batch
> seq-read	54.1		53.7
> rand-read	52.0		52.0
> 
> --------------------------------------------
> ublk null target
> IOPS(k)
> 
> type		ucmd		ucmd-batch
> seq-read	272		265
> rand-read	262		260
> 
> ------------
> ublksrv test
> 
> -------------
> ucmd
> 
> running loop/001
>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>         randwrite: jobs 1, iops 66737
>         randread: jobs 1, iops 64935
>         randrw: jobs 1, iops read 32694 write 32710
>         rw(512k): jobs 1, iops read 772 write 819
> 
> -------------
> ucmd-batch
> 
> running loop/001
>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_F56a3), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>         randwrite: jobs 1, iops 66720
>         randread: jobs 1, iops 65015
>         randrw: jobs 1, iops read 32743 write 32759
>         rw(512k): jobs 1, iops read 771 write 817
> 
> 
> It seems that manually putting rqs into llist and calling
> io_uring_cmd_complete_in_task() while handling the last rq does
> not improve IOPS much.
> 
> io_req_task_work_add() puts task_work node into a internal llist
> first, then it may call task_work_add() to run batched task_works.
> IMO, io_uring has already done such batch optimization and ublk_drv
> does not need to add such llist.

The difference is just how batching is handled, looks blk-mq's batch info
doesn't matter any more. In my test, looks the perf improvement is mainly
made by enabling IORING_SETUP_COOP_TASKRUN in ublksrv.

Can you check if enabling IORING_SETUP_COOP_TASKRUN only can reach
same perf with task_work_add()(ublk_drv is builtin) when building
ublk_drv as module?


Thanks,
Ming


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands
  2022-10-25  7:19       ` Ming Lei
@ 2022-10-25  7:46         ` Ziyang Zhang
  2022-10-25  8:43           ` Ziyang Zhang
  0 siblings, 1 reply; 12+ messages in thread
From: Ziyang Zhang @ 2022-10-25  7:46 UTC (permalink / raw)
  To: Ming Lei; +Cc: linux-block, Jens Axboe

On 2022/10/25 15:19, Ming Lei wrote:
> On Tue, Oct 25, 2022 at 11:15:57AM +0800, Ziyang Zhang wrote:
>> On 2022/10/24 21:20, Ming Lei wrote:
>>> Hello Ziyang,
>>>
>>> On Mon, Oct 24, 2022 at 05:48:51PM +0800, Ziyang Zhang wrote:
>>>> On 2022/10/23 17:38, Ming Lei wrote:
>>>>> task_work_add() is used for waking ubq daemon task with one batch
>>>>> of io requests/commands queued. However, task_work_add() isn't
>>>>> exported for module code, and it is still debatable if the symbol
>>>>> should be exported.
>>>>>
>>>>> Fortunately we still have io_uring_cmd_complete_in_task() which just
>>>>> can't handle batched wakeup for us.
>>>>>
>>>>> Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task()
>>>>> via current command for running them via task work.
>>>>>
>>>>> This way cleans up current code a lot, meantime allow us to wakeup
>>>>> ubq daemon task after queueing batched requests/io commands.
>>>>>
>>>>
>>>>
>>>> Hi, Ming
>>>>
>>>> This patch works and I have run some tests to compare current version(ucmd)
>>>> with your patch(ucmd-batch).
>>>>
>>>> iodepth=128 numjobs=1 direct=1 bs=4k
>>>>
>>>> --------------------------------------------
>>>> ublk loop target, the backend is a file.
>>>> IOPS(k)
>>>>
>>>> type		ucmd		ucmd-batch
>>>> seq-read	54.7		54.2	
>>>> rand-read	52.8		52.0
>>>>
>>>> --------------------------------------------
>>>> ublk null target
>>>> IOPS(k)
>>>>
>>>> type		ucmd		ucmd-batch
>>>> seq-read	257		257
>>>> rand-read	252		253
>>>>
>>>>
>>>> I find that io_req_task_work_add() puts task_work node into a llist
>>>> first, then it may call task_work_add() to run batched task_works. So do we really
>>>> need such llist in ublk_drv? I think io_uring has already considered task_work batch
>>>> optimization.
>>>>
>>>> BTW, task_work_add() in ublk_drv achieves
>>>> higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task()
>>>> in ublk_drv.
>>>
>>> Yeah, that is same with my observation, and motivation of this patch is
>>> to get same performance with task_work_add by building ublk_drv as
>>> module. One win of task_work_add() is that we get exact batching info
>>> meantime only send TWA_SIGNAL_NO_IPI for whole batch, that is basically
>>> what the patch is doing, but needs help of the following ublksrv patch:
>>>
>>> https://github.com/ming1/ubdsrv/commit/dce6d1d222023c1641292713b311ced01e6dc548
>>>
>>> which sets IORING_SETUP_COOP_TASKRUN for ublksrv's uring, then
>>> io_uring_cmd_complete_in_task will notify via TWA_SIGNAL_NO_IPI, and 5+%
>>> IOPS boost is observed on loop/001 by putting image on SSD in my test
>>> VM.
>>
>> Hi Ming,
>>
>> I have added this ublksrv patch and run the above test again.
>> I have also run ublksrv test: loop/001. Please check them.
>>
>> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
>> 64GB MEM, CentOS 8, kernel 6.0+
>>
>> --------
>> fio test
>>
>> iodepth=128 numjobs=1 direct=1 bs=4k
>>
>> ucmd: without your kernel patch. Run io_uring_cmd_complete_in_task()
>> for each blk-mq rq.
>>
>> ucmd-batch: with your kernel patch. Run io_uring_cmd_complete_in_task()
>> for the last blk-mq rq.
>>
>> --------------------------------------------
>> ublk loop target, the backend is a file.
>>
>> IOPS(k)
>>
>> type		ucmd		ucmd-batch
>> seq-read	54.1		53.7
>> rand-read	52.0		52.0
>>
>> --------------------------------------------
>> ublk null target
>> IOPS(k)
>>
>> type		ucmd		ucmd-batch
>> seq-read	272		265
>> rand-read	262		260
>>
>> ------------
>> ublksrv test
>>
>> -------------
>> ucmd
>>
>> running loop/001
>>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>>         randwrite: jobs 1, iops 66737
>>         randread: jobs 1, iops 64935
>>         randrw: jobs 1, iops read 32694 write 32710
>>         rw(512k): jobs 1, iops read 772 write 819
>>
>> -------------
>> ucmd-batch
>>
>> running loop/001
>>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_F56a3), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>>         randwrite: jobs 1, iops 66720
>>         randread: jobs 1, iops 65015
>>         randrw: jobs 1, iops read 32743 write 32759
>>         rw(512k): jobs 1, iops read 771 write 817
>>
>>
>> It seems that manually putting rqs into llist and calling
>> io_uring_cmd_complete_in_task() while handling the last rq does
>> not improve IOPS much.
>>
>> io_req_task_work_add() puts task_work node into a internal llist
>> first, then it may call task_work_add() to run batched task_works.
>> IMO, io_uring has already done such batch optimization and ublk_drv
>> does not need to add such llist.
> 
> The difference is just how batching is handled, looks blk-mq's batch info
> doesn't matter any more. In my test, looks the perf improvement is mainly
> made by enabling IORING_SETUP_COOP_TASKRUN in ublksrv.

I guess only IORING_SETUP_COOP_TASKRUN helps improve IOPS. The llist in
ublk_drv does not improve IOPS.

> 
> Can you check if enabling IORING_SETUP_COOP_TASKRUN only can reach
> same perf with task_work_add()(ublk_drv is builtin) when building
> ublk_drv as module?
> 

OK.

Regards,
Zhang


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands
  2022-10-25  7:46         ` Ziyang Zhang
@ 2022-10-25  8:43           ` Ziyang Zhang
  2022-10-25 15:17             ` Ming Lei
  0 siblings, 1 reply; 12+ messages in thread
From: Ziyang Zhang @ 2022-10-25  8:43 UTC (permalink / raw)
  To: Ming Lei; +Cc: linux-block, Jens Axboe

On 2022/10/25 15:46, Ziyang Zhang wrote:
> On 2022/10/25 15:19, Ming Lei wrote:
>> On Tue, Oct 25, 2022 at 11:15:57AM +0800, Ziyang Zhang wrote:
>>> On 2022/10/24 21:20, Ming Lei wrote:
>>>> Hello Ziyang,
>>>>
>>>> On Mon, Oct 24, 2022 at 05:48:51PM +0800, Ziyang Zhang wrote:
>>>>> On 2022/10/23 17:38, Ming Lei wrote:
>>>>>> task_work_add() is used for waking ubq daemon task with one batch
>>>>>> of io requests/commands queued. However, task_work_add() isn't
>>>>>> exported for module code, and it is still debatable if the symbol
>>>>>> should be exported.
>>>>>>
>>>>>> Fortunately we still have io_uring_cmd_complete_in_task() which just
>>>>>> can't handle batched wakeup for us.
>>>>>>
>>>>>> Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task()
>>>>>> via current command for running them via task work.
>>>>>>
>>>>>> This way cleans up current code a lot, meantime allow us to wakeup
>>>>>> ubq daemon task after queueing batched requests/io commands.
>>>>>>
>>>>>
>>>>>
>>>>> Hi, Ming
>>>>>
>>>>> This patch works and I have run some tests to compare current version(ucmd)
>>>>> with your patch(ucmd-batch).
>>>>>
>>>>> iodepth=128 numjobs=1 direct=1 bs=4k
>>>>>
>>>>> --------------------------------------------
>>>>> ublk loop target, the backend is a file.
>>>>> IOPS(k)
>>>>>
>>>>> type		ucmd		ucmd-batch
>>>>> seq-read	54.7		54.2	
>>>>> rand-read	52.8		52.0
>>>>>
>>>>> --------------------------------------------
>>>>> ublk null target
>>>>> IOPS(k)
>>>>>
>>>>> type		ucmd		ucmd-batch
>>>>> seq-read	257		257
>>>>> rand-read	252		253
>>>>>
>>>>>
>>>>> I find that io_req_task_work_add() puts task_work node into a llist
>>>>> first, then it may call task_work_add() to run batched task_works. So do we really
>>>>> need such llist in ublk_drv? I think io_uring has already considered task_work batch
>>>>> optimization.
>>>>>
>>>>> BTW, task_work_add() in ublk_drv achieves
>>>>> higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task()
>>>>> in ublk_drv.
>>>>
>>>> Yeah, that is same with my observation, and motivation of this patch is
>>>> to get same performance with task_work_add by building ublk_drv as
>>>> module. One win of task_work_add() is that we get exact batching info
>>>> meantime only send TWA_SIGNAL_NO_IPI for whole batch, that is basically
>>>> what the patch is doing, but needs help of the following ublksrv patch:
>>>>
>>>> https://github.com/ming1/ubdsrv/commit/dce6d1d222023c1641292713b311ced01e6dc548
>>>>
>>>> which sets IORING_SETUP_COOP_TASKRUN for ublksrv's uring, then
>>>> io_uring_cmd_complete_in_task will notify via TWA_SIGNAL_NO_IPI, and 5+%
>>>> IOPS boost is observed on loop/001 by putting image on SSD in my test
>>>> VM.
>>>
>>> Hi Ming,
>>>
>>> I have added this ublksrv patch and run the above test again.
>>> I have also run ublksrv test: loop/001. Please check them.
>>>
>>> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
>>> 64GB MEM, CentOS 8, kernel 6.0+
>>>
>>> --------
>>> fio test
>>>
>>> iodepth=128 numjobs=1 direct=1 bs=4k
>>>
>>> ucmd: without your kernel patch. Run io_uring_cmd_complete_in_task()
>>> for each blk-mq rq.
>>>
>>> ucmd-batch: with your kernel patch. Run io_uring_cmd_complete_in_task()
>>> for the last blk-mq rq.
>>>
>>> --------------------------------------------
>>> ublk loop target, the backend is a file.
>>>
>>> IOPS(k)
>>>
>>> type		ucmd		ucmd-batch
>>> seq-read	54.1		53.7
>>> rand-read	52.0		52.0
>>>
>>> --------------------------------------------
>>> ublk null target
>>> IOPS(k)
>>>
>>> type		ucmd		ucmd-batch
>>> seq-read	272		265
>>> rand-read	262		260
>>>
>>> ------------
>>> ublksrv test
>>>
>>> -------------
>>> ucmd
>>>
>>> running loop/001
>>>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>>>         randwrite: jobs 1, iops 66737
>>>         randread: jobs 1, iops 64935
>>>         randrw: jobs 1, iops read 32694 write 32710
>>>         rw(512k): jobs 1, iops read 772 write 819
>>>
>>> -------------
>>> ucmd-batch
>>>
>>> running loop/001
>>>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_F56a3), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>>>         randwrite: jobs 1, iops 66720
>>>         randread: jobs 1, iops 65015
>>>         randrw: jobs 1, iops read 32743 write 32759
>>>         rw(512k): jobs 1, iops read 771 write 817
>>>
>>>
>>> It seems that manually putting rqs into llist and calling
>>> io_uring_cmd_complete_in_task() while handling the last rq does
>>> not improve IOPS much.
>>>
>>> io_req_task_work_add() puts task_work node into a internal llist
>>> first, then it may call task_work_add() to run batched task_works.
>>> IMO, io_uring has already done such batch optimization and ublk_drv
>>> does not need to add such llist.
>>
>> The difference is just how batching is handled, looks blk-mq's batch info
>> doesn't matter any more. In my test, looks the perf improvement is mainly
>> made by enabling IORING_SETUP_COOP_TASKRUN in ublksrv.
> 
> I guess only IORING_SETUP_COOP_TASKRUN helps improve IOPS. The llist in
> ublk_drv does not improve IOPS.
> 
>>
>> Can you check if enabling IORING_SETUP_COOP_TASKRUN only can reach
>> same perf with task_work_add()(ublk_drv is builtin) when building
>> ublk_drv as module?
>>
> 
> OK.
> 

Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
64GB MEM, CentOS 8, kernel 6.0+
with IORING_SETUP_COOP_TASKRUN, without this kernel patch

ucmd: io_uring_cmd_complete_in_task(), ublk_drv is a module
tw: task_work_add(), ublk is built-in.


--------
fio test

iodepth=128 numjobs=1 direct=1 bs=4k

--------------------------------------------
ublk loop target, the backend is a file.

IOPS(k)

type		ucmd		tw
seq-read	54.1		53.8
rand-read	52.0		52.0

--------------------------------------------
ublk null target
IOPS(k)

type		ucmd		tw
seq-read	272		286
rand-read	262		278


------------
ublksrv test

-------------
ucmd

running loop/001
        fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
        randwrite: jobs 1, iops 66737
        randread: jobs 1, iops 64935
        randrw: jobs 1, iops read 32694 write 32710
        rw(512k): jobs 1, iops read 772 write 819

running null/001
        fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
        randwrite: jobs 1, iops 715863
        randread: jobs 1, iops 758449
        randrw: jobs 1, iops read 357407 write 357183
        rw(512k): jobs 1, iops read 5895 write 5875

-------------
tw

running loop/001
        fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_pvLTL), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
        randwrite: jobs 1, iops 66856
        randread: jobs 1, iops 65015
        randrw: jobs 1, iops read 32751 write 32767
        rw(512k): jobs 1, iops read 776 write 823

running null/001
        fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
        randwrite: jobs 1, iops 739450
        randread: jobs 1, iops 787500
        randrw: jobs 1, iops read 372956 write 372831
        rw(512k): jobs 1, iops read 5798 write 5777


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands
  2022-10-25  8:43           ` Ziyang Zhang
@ 2022-10-25 15:17             ` Ming Lei
  2022-10-26 10:32               ` Ziyang Zhang
  0 siblings, 1 reply; 12+ messages in thread
From: Ming Lei @ 2022-10-25 15:17 UTC (permalink / raw)
  To: Ziyang Zhang; +Cc: linux-block, Jens Axboe

On Tue, Oct 25, 2022 at 04:43:56PM +0800, Ziyang Zhang wrote:
> On 2022/10/25 15:46, Ziyang Zhang wrote:
> > On 2022/10/25 15:19, Ming Lei wrote:
> >> On Tue, Oct 25, 2022 at 11:15:57AM +0800, Ziyang Zhang wrote:
> >>> On 2022/10/24 21:20, Ming Lei wrote:
> >>>> Hello Ziyang,
> >>>>
> >>>> On Mon, Oct 24, 2022 at 05:48:51PM +0800, Ziyang Zhang wrote:
> >>>>> On 2022/10/23 17:38, Ming Lei wrote:
> >>>>>> task_work_add() is used for waking ubq daemon task with one batch
> >>>>>> of io requests/commands queued. However, task_work_add() isn't
> >>>>>> exported for module code, and it is still debatable if the symbol
> >>>>>> should be exported.
> >>>>>>
> >>>>>> Fortunately we still have io_uring_cmd_complete_in_task() which just
> >>>>>> can't handle batched wakeup for us.
> >>>>>>
> >>>>>> Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task()
> >>>>>> via current command for running them via task work.
> >>>>>>
> >>>>>> This way cleans up current code a lot, meantime allow us to wakeup
> >>>>>> ubq daemon task after queueing batched requests/io commands.
> >>>>>>
> >>>>>
> >>>>>
> >>>>> Hi, Ming
> >>>>>
> >>>>> This patch works and I have run some tests to compare current version(ucmd)
> >>>>> with your patch(ucmd-batch).
> >>>>>
> >>>>> iodepth=128 numjobs=1 direct=1 bs=4k
> >>>>>
> >>>>> --------------------------------------------
> >>>>> ublk loop target, the backend is a file.
> >>>>> IOPS(k)
> >>>>>
> >>>>> type		ucmd		ucmd-batch
> >>>>> seq-read	54.7		54.2	
> >>>>> rand-read	52.8		52.0
> >>>>>
> >>>>> --------------------------------------------
> >>>>> ublk null target
> >>>>> IOPS(k)
> >>>>>
> >>>>> type		ucmd		ucmd-batch
> >>>>> seq-read	257		257
> >>>>> rand-read	252		253
> >>>>>
> >>>>>
> >>>>> I find that io_req_task_work_add() puts task_work node into a llist
> >>>>> first, then it may call task_work_add() to run batched task_works. So do we really
> >>>>> need such llist in ublk_drv? I think io_uring has already considered task_work batch
> >>>>> optimization.
> >>>>>
> >>>>> BTW, task_work_add() in ublk_drv achieves
> >>>>> higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task()
> >>>>> in ublk_drv.
> >>>>
> >>>> Yeah, that is same with my observation, and motivation of this patch is
> >>>> to get same performance with task_work_add by building ublk_drv as
> >>>> module. One win of task_work_add() is that we get exact batching info
> >>>> meantime only send TWA_SIGNAL_NO_IPI for whole batch, that is basically
> >>>> what the patch is doing, but needs help of the following ublksrv patch:
> >>>>
> >>>> https://github.com/ming1/ubdsrv/commit/dce6d1d222023c1641292713b311ced01e6dc548
> >>>>
> >>>> which sets IORING_SETUP_COOP_TASKRUN for ublksrv's uring, then
> >>>> io_uring_cmd_complete_in_task will notify via TWA_SIGNAL_NO_IPI, and 5+%
> >>>> IOPS boost is observed on loop/001 by putting image on SSD in my test
> >>>> VM.
> >>>
> >>> Hi Ming,
> >>>
> >>> I have added this ublksrv patch and run the above test again.
> >>> I have also run ublksrv test: loop/001. Please check them.
> >>>
> >>> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
> >>> 64GB MEM, CentOS 8, kernel 6.0+
> >>>
> >>> --------
> >>> fio test
> >>>
> >>> iodepth=128 numjobs=1 direct=1 bs=4k
> >>>
> >>> ucmd: without your kernel patch. Run io_uring_cmd_complete_in_task()
> >>> for each blk-mq rq.
> >>>
> >>> ucmd-batch: with your kernel patch. Run io_uring_cmd_complete_in_task()
> >>> for the last blk-mq rq.
> >>>
> >>> --------------------------------------------
> >>> ublk loop target, the backend is a file.
> >>>
> >>> IOPS(k)
> >>>
> >>> type		ucmd		ucmd-batch
> >>> seq-read	54.1		53.7
> >>> rand-read	52.0		52.0
> >>>
> >>> --------------------------------------------
> >>> ublk null target
> >>> IOPS(k)
> >>>
> >>> type		ucmd		ucmd-batch
> >>> seq-read	272		265
> >>> rand-read	262		260
> >>>
> >>> ------------
> >>> ublksrv test
> >>>
> >>> -------------
> >>> ucmd
> >>>
> >>> running loop/001
> >>>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >>>         randwrite: jobs 1, iops 66737
> >>>         randread: jobs 1, iops 64935
> >>>         randrw: jobs 1, iops read 32694 write 32710
> >>>         rw(512k): jobs 1, iops read 772 write 819
> >>>
> >>> -------------
> >>> ucmd-batch
> >>>
> >>> running loop/001
> >>>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_F56a3), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >>>         randwrite: jobs 1, iops 66720
> >>>         randread: jobs 1, iops 65015
> >>>         randrw: jobs 1, iops read 32743 write 32759
> >>>         rw(512k): jobs 1, iops read 771 write 817
> >>>
> >>>
> >>> It seems that manually putting rqs into llist and calling
> >>> io_uring_cmd_complete_in_task() while handling the last rq does
> >>> not improve IOPS much.
> >>>
> >>> io_req_task_work_add() puts task_work node into a internal llist
> >>> first, then it may call task_work_add() to run batched task_works.
> >>> IMO, io_uring has already done such batch optimization and ublk_drv
> >>> does not need to add such llist.
> >>
> >> The difference is just how batching is handled, looks blk-mq's batch info
> >> doesn't matter any more. In my test, looks the perf improvement is mainly
> >> made by enabling IORING_SETUP_COOP_TASKRUN in ublksrv.
> > 
> > I guess only IORING_SETUP_COOP_TASKRUN helps improve IOPS. The llist in
> > ublk_drv does not improve IOPS.
> > 
> >>
> >> Can you check if enabling IORING_SETUP_COOP_TASKRUN only can reach
> >> same perf with task_work_add()(ublk_drv is builtin) when building
> >> ublk_drv as module?
> >>
> > 
> > OK.
> > 
> 
> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
> 64GB MEM, CentOS 8, kernel 6.0+
> with IORING_SETUP_COOP_TASKRUN, without this kernel patch
> 
> ucmd: io_uring_cmd_complete_in_task(), ublk_drv is a module
> tw: task_work_add(), ublk is built-in.
> 
> 
> --------
> fio test
> 
> iodepth=128 numjobs=1 direct=1 bs=4k
> 
> --------------------------------------------
> ublk loop target, the backend is a file.
> 
> IOPS(k)
> 
> type		ucmd		tw
> seq-read	54.1		53.8
> rand-read	52.0		52.0
> 
> --------------------------------------------
> ublk null target
> IOPS(k)
> 
> type		ucmd		tw
> seq-read	272		286
> rand-read	262		278
> 
> 
> ------------
> ublksrv test
> 
> -------------
> ucmd
> 
> running loop/001
>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>         randwrite: jobs 1, iops 66737
>         randread: jobs 1, iops 64935
>         randrw: jobs 1, iops read 32694 write 32710
>         rw(512k): jobs 1, iops read 772 write 819
> 
> running null/001
>         fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>         randwrite: jobs 1, iops 715863
>         randread: jobs 1, iops 758449
>         randrw: jobs 1, iops read 357407 write 357183
>         rw(512k): jobs 1, iops read 5895 write 5875
> 
> -------------
> tw
> 
> running loop/001
>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_pvLTL), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>         randwrite: jobs 1, iops 66856
>         randread: jobs 1, iops 65015
>         randrw: jobs 1, iops read 32751 write 32767
>         rw(512k): jobs 1, iops read 776 write 823
> 
> running null/001
>         fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>         randwrite: jobs 1, iops 739450
>         randread: jobs 1, iops 787500
>         randrw: jobs 1, iops read 372956 write 372831
>         rw(512k): jobs 1, iops read 5798 write 5777

Looks the gap isn't big between ucmd and tw when running null/001, in
which the fio io process should saturate the CPU. Probably we
should avoid to touch 'cmd'/'pdu'/'io' in ublk_queue_rq() since these
data should be cold at that time.

Can you apply the following delta patch against the current patch(
"ublk_drv: don't call task_work_add for queueing io commands") and
compare with task_work_add()?

From ecbbf6d10dbc63e279ce0b55c45da6721947f18d Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Tue, 25 Oct 2022 11:01:25 -0400
Subject: [PATCH] ublk: follow up change

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/block/ublk_drv.c | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 7963fba66dd1..18db337094c1 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -56,9 +56,12 @@
 /* All UBLK_PARAM_TYPE_* should be included here */
 #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
 
+struct ublk_rq_data {
+	struct llist_node node;
+};
+
 struct ublk_uring_cmd_pdu {
 	struct request *req;
-	struct llist_node node;
 };
 
 /*
@@ -693,7 +696,8 @@ static inline void __ublk_rq_task_work(struct request *req)
 	 *
 	 * (2) current->flags & PF_EXITING.
 	 */
-	if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) {
+	if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING
+				|| (io->flags & UBLK_IO_FLAG_ABORTED))) {
 		__ublk_abort_rq(ubq, req);
 		return;
 	}
@@ -757,11 +761,12 @@ static void ublk_rqs_task_work_cb(struct io_uring_cmd *cmd)
 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
 	struct ublk_queue *ubq = pdu->req->mq_hctx->driver_data;
 	struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds);
+	struct ublk_rq_data *data;
 
 	__ublk_rq_task_work(pdu->req);
 
-	llist_for_each_entry(pdu, io_cmds, node)
-		__ublk_rq_task_work(pdu->req);
+	llist_for_each_entry(data, io_cmds, node)
+		__ublk_rq_task_work(blk_mq_rq_from_pdu(data));
 }
 
 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
@@ -769,9 +774,6 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
 {
 	struct ublk_queue *ubq = hctx->driver_data;
 	struct request *rq = bd->rq;
-	struct ublk_io *io = &ubq->ios[rq->tag];
-	struct io_uring_cmd *cmd = io->cmd;
-	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
 	blk_status_t res;
 
 	/* fill iod to slot in io cmd buffer */
@@ -805,14 +807,11 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
 	 * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED
 	 * guarantees that here is a re-issued request aborted previously.
 	 */
-	if (unlikely(ubq_daemon_is_dying(ubq) ||
-				(io->flags & UBLK_IO_FLAG_ABORTED))) {
+	if (unlikely(ubq_daemon_is_dying(ubq))) {
 		__ublk_abort_rq(ubq, rq);
 		return BLK_STS_OK;
 	}
 
-	pdu->req = rq;
-
 	/*
 	 * Typical multiple producers and single consumer, it is just fine
 	 * to use llist_add() in producer side and llist_del_all() in
@@ -821,10 +820,18 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
 	 * The last command can't be added into list, otherwise it could
 	 * be handled twice
 	 */
-	if (bd->last)
+	if (bd->last) {
+		struct ublk_io *io = &ubq->ios[rq->tag];
+		struct io_uring_cmd *cmd = io->cmd;
+		struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+
+		pdu->req = rq;
 		io_uring_cmd_complete_in_task(cmd, ublk_rqs_task_work_cb);
-	else
-		llist_add(&pdu->node, &ubq->io_cmds);
+	} else {
+		struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
+
+		llist_add(&data->node, &ubq->io_cmds);
+	}
 
 	return BLK_STS_OK;
 }
@@ -1426,6 +1433,7 @@ static int ublk_add_tag_set(struct ublk_device *ub)
 	ub->tag_set.queue_depth = ub->dev_info.queue_depth;
 	ub->tag_set.numa_node = NUMA_NO_NODE;
 	ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+	ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
 	ub->tag_set.driver_data = ub;
 	return blk_mq_alloc_tag_set(&ub->tag_set);
 }
-- 
2.31.1


Thanks,
Ming


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands
  2022-10-25 15:17             ` Ming Lei
@ 2022-10-26 10:32               ` Ziyang Zhang
  2022-10-26 11:29                 ` Ming Lei
  0 siblings, 1 reply; 12+ messages in thread
From: Ziyang Zhang @ 2022-10-26 10:32 UTC (permalink / raw)
  To: Ming Lei; +Cc: linux-block, Jens Axboe

On 2022/10/25 23:17, Ming Lei wrote:
> On Tue, Oct 25, 2022 at 04:43:56PM +0800, Ziyang Zhang wrote:
>> On 2022/10/25 15:46, Ziyang Zhang wrote:
>>> On 2022/10/25 15:19, Ming Lei wrote:
>>>> On Tue, Oct 25, 2022 at 11:15:57AM +0800, Ziyang Zhang wrote:
>>>>> On 2022/10/24 21:20, Ming Lei wrote:
>>>>>> Hello Ziyang,
>>>>>>
>>>>>> On Mon, Oct 24, 2022 at 05:48:51PM +0800, Ziyang Zhang wrote:
>>>>>>> On 2022/10/23 17:38, Ming Lei wrote:
>>>>>>>> task_work_add() is used for waking ubq daemon task with one batch
>>>>>>>> of io requests/commands queued. However, task_work_add() isn't
>>>>>>>> exported for module code, and it is still debatable if the symbol
>>>>>>>> should be exported.
>>>>>>>>
>>>>>>>> Fortunately we still have io_uring_cmd_complete_in_task() which just
>>>>>>>> can't handle batched wakeup for us.
>>>>>>>>
>>>>>>>> Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task()
>>>>>>>> via current command for running them via task work.
>>>>>>>>
>>>>>>>> This way cleans up current code a lot, meantime allow us to wakeup
>>>>>>>> ubq daemon task after queueing batched requests/io commands.
>>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> Hi, Ming
>>>>>>>
>>>>>>> This patch works and I have run some tests to compare current version(ucmd)
>>>>>>> with your patch(ucmd-batch).
>>>>>>>
>>>>>>> iodepth=128 numjobs=1 direct=1 bs=4k
>>>>>>>
>>>>>>> --------------------------------------------
>>>>>>> ublk loop target, the backend is a file.
>>>>>>> IOPS(k)
>>>>>>>
>>>>>>> type		ucmd		ucmd-batch
>>>>>>> seq-read	54.7		54.2	
>>>>>>> rand-read	52.8		52.0
>>>>>>>
>>>>>>> --------------------------------------------
>>>>>>> ublk null target
>>>>>>> IOPS(k)
>>>>>>>
>>>>>>> type		ucmd		ucmd-batch
>>>>>>> seq-read	257		257
>>>>>>> rand-read	252		253
>>>>>>>
>>>>>>>
>>>>>>> I find that io_req_task_work_add() puts task_work node into a llist
>>>>>>> first, then it may call task_work_add() to run batched task_works. So do we really
>>>>>>> need such llist in ublk_drv? I think io_uring has already considered task_work batch
>>>>>>> optimization.
>>>>>>>
>>>>>>> BTW, task_work_add() in ublk_drv achieves
>>>>>>> higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task()
>>>>>>> in ublk_drv.
>>>>>>
>>>>>> Yeah, that is same with my observation, and motivation of this patch is
>>>>>> to get same performance with task_work_add by building ublk_drv as
>>>>>> module. One win of task_work_add() is that we get exact batching info
>>>>>> meantime only send TWA_SIGNAL_NO_IPI for whole batch, that is basically
>>>>>> what the patch is doing, but needs help of the following ublksrv patch:
>>>>>>
>>>>>> https://github.com/ming1/ubdsrv/commit/dce6d1d222023c1641292713b311ced01e6dc548
>>>>>>
>>>>>> which sets IORING_SETUP_COOP_TASKRUN for ublksrv's uring, then
>>>>>> io_uring_cmd_complete_in_task will notify via TWA_SIGNAL_NO_IPI, and 5+%
>>>>>> IOPS boost is observed on loop/001 by putting image on SSD in my test
>>>>>> VM.
>>>>>
>>>>> Hi Ming,
>>>>>
>>>>> I have added this ublksrv patch and run the above test again.
>>>>> I have also run ublksrv test: loop/001. Please check them.
>>>>>
>>>>> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
>>>>> 64GB MEM, CentOS 8, kernel 6.0+
>>>>>
>>>>> --------
>>>>> fio test
>>>>>
>>>>> iodepth=128 numjobs=1 direct=1 bs=4k
>>>>>
>>>>> ucmd: without your kernel patch. Run io_uring_cmd_complete_in_task()
>>>>> for each blk-mq rq.
>>>>>
>>>>> ucmd-batch: with your kernel patch. Run io_uring_cmd_complete_in_task()
>>>>> for the last blk-mq rq.
>>>>>
>>>>> --------------------------------------------
>>>>> ublk loop target, the backend is a file.
>>>>>
>>>>> IOPS(k)
>>>>>
>>>>> type		ucmd		ucmd-batch
>>>>> seq-read	54.1		53.7
>>>>> rand-read	52.0		52.0
>>>>>
>>>>> --------------------------------------------
>>>>> ublk null target
>>>>> IOPS(k)
>>>>>
>>>>> type		ucmd		ucmd-batch
>>>>> seq-read	272		265
>>>>> rand-read	262		260
>>>>>
>>>>> ------------
>>>>> ublksrv test
>>>>>
>>>>> -------------
>>>>> ucmd
>>>>>
>>>>> running loop/001
>>>>>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>>>>>         randwrite: jobs 1, iops 66737
>>>>>         randread: jobs 1, iops 64935
>>>>>         randrw: jobs 1, iops read 32694 write 32710
>>>>>         rw(512k): jobs 1, iops read 772 write 819
>>>>>
>>>>> -------------
>>>>> ucmd-batch
>>>>>
>>>>> running loop/001
>>>>>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_F56a3), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>>>>>         randwrite: jobs 1, iops 66720
>>>>>         randread: jobs 1, iops 65015
>>>>>         randrw: jobs 1, iops read 32743 write 32759
>>>>>         rw(512k): jobs 1, iops read 771 write 817
>>>>>
>>>>>
>>>>> It seems that manually putting rqs into llist and calling
>>>>> io_uring_cmd_complete_in_task() while handling the last rq does
>>>>> not improve IOPS much.
>>>>>
>>>>> io_req_task_work_add() puts task_work node into a internal llist
>>>>> first, then it may call task_work_add() to run batched task_works.
>>>>> IMO, io_uring has already done such batch optimization and ublk_drv
>>>>> does not need to add such llist.
>>>>
>>>> The difference is just how batching is handled, looks blk-mq's batch info
>>>> doesn't matter any more. In my test, looks the perf improvement is mainly
>>>> made by enabling IORING_SETUP_COOP_TASKRUN in ublksrv.
>>>
>>> I guess only IORING_SETUP_COOP_TASKRUN helps improve IOPS. The llist in
>>> ublk_drv does not improve IOPS.
>>>
>>>>
>>>> Can you check if enabling IORING_SETUP_COOP_TASKRUN only can reach
>>>> same perf with task_work_add()(ublk_drv is builtin) when building
>>>> ublk_drv as module?
>>>>
>>>
>>> OK.
>>>
>>
>> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
>> 64GB MEM, CentOS 8, kernel 6.0+
>> with IORING_SETUP_COOP_TASKRUN, without this kernel patch
>>
>> ucmd: io_uring_cmd_complete_in_task(), ublk_drv is a module
>> tw: task_work_add(), ublk is built-in.
>>
>>
>> --------
>> fio test
>>
>> iodepth=128 numjobs=1 direct=1 bs=4k
>>
>> --------------------------------------------
>> ublk loop target, the backend is a file.
>>
>> IOPS(k)
>>
>> type		ucmd		tw
>> seq-read	54.1		53.8
>> rand-read	52.0		52.0
>>
>> --------------------------------------------
>> ublk null target
>> IOPS(k)
>>
>> type		ucmd		tw
>> seq-read	272		286
>> rand-read	262		278
>>
>>
>> ------------
>> ublksrv test
>>
>> -------------
>> ucmd
>>
>> running loop/001
>>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>>         randwrite: jobs 1, iops 66737
>>         randread: jobs 1, iops 64935
>>         randrw: jobs 1, iops read 32694 write 32710
>>         rw(512k): jobs 1, iops read 772 write 819
>>
>> running null/001
>>         fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>>         randwrite: jobs 1, iops 715863
>>         randread: jobs 1, iops 758449
>>         randrw: jobs 1, iops read 357407 write 357183
>>         rw(512k): jobs 1, iops read 5895 write 5875
>>
>> -------------
>> tw
>>
>> running loop/001
>>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_pvLTL), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>>         randwrite: jobs 1, iops 66856
>>         randread: jobs 1, iops 65015
>>         randrw: jobs 1, iops read 32751 write 32767
>>         rw(512k): jobs 1, iops read 776 write 823
>>
>> running null/001
>>         fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>>         randwrite: jobs 1, iops 739450
>>         randread: jobs 1, iops 787500
>>         randrw: jobs 1, iops read 372956 write 372831
>>         rw(512k): jobs 1, iops read 5798 write 5777
> 
> Looks the gap isn't big between ucmd and tw when running null/001, in
> which the fio io process should saturate the CPU. Probably we
> should avoid to touch 'cmd'/'pdu'/'io' in ublk_queue_rq() since these
> data should be cold at that time.
> 
> Can you apply the following delta patch against the current patch(
> "ublk_drv: don't call task_work_add for queueing io commands") and
> compare with task_work_add()?
> 
> From ecbbf6d10dbc63e279ce0b55c45da6721947f18d Mon Sep 17 00:00:00 2001
> From: Ming Lei <ming.lei@redhat.com>
> Date: Tue, 25 Oct 2022 11:01:25 -0400
> Subject: [PATCH] ublk: follow up change
> 
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  drivers/block/ublk_drv.c | 36 ++++++++++++++++++++++--------------
>  1 file changed, 22 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index 7963fba66dd1..18db337094c1 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -56,9 +56,12 @@
>  /* All UBLK_PARAM_TYPE_* should be included here */
>  #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
>  
> +struct ublk_rq_data {
> +	struct llist_node node;
> +};
> +
>  struct ublk_uring_cmd_pdu {
>  	struct request *req;
> -	struct llist_node node;
>  };
>  
>  /*
> @@ -693,7 +696,8 @@ static inline void __ublk_rq_task_work(struct request *req)
>  	 *
>  	 * (2) current->flags & PF_EXITING.
>  	 */
> -	if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) {
> +	if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING
> +				|| (io->flags & UBLK_IO_FLAG_ABORTED))) {
>  		__ublk_abort_rq(ubq, req);
>  		return;

We cannot check io->flags & UBLK_IO_FLAG_ABORTED in io_uring task_work. 
We have to check io->flags & UBLK_IO_FLAG_ABORTED in ublk_queue_rq() because
io_uring ctx may be freed at that time and ioucmd task_work is invalid(freed).
Please See comment around it for more detail.

>  	}
> @@ -757,11 +761,12 @@ static void ublk_rqs_task_work_cb(struct io_uring_cmd *cmd)
>  	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
>  	struct ublk_queue *ubq = pdu->req->mq_hctx->driver_data;
>  	struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds);
> +	struct ublk_rq_data *data;
>  
>  	__ublk_rq_task_work(pdu->req);
>  
> -	llist_for_each_entry(pdu, io_cmds, node)
> -		__ublk_rq_task_work(pdu->req);
> +	llist_for_each_entry(data, io_cmds, node)
> +		__ublk_rq_task_work(blk_mq_rq_from_pdu(data));
>  }
>  
>  static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
> @@ -769,9 +774,6 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
>  {
>  	struct ublk_queue *ubq = hctx->driver_data;
>  	struct request *rq = bd->rq;
> -	struct ublk_io *io = &ubq->ios[rq->tag];
> -	struct io_uring_cmd *cmd = io->cmd;
> -	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
>  	blk_status_t res;
>  
>  	/* fill iod to slot in io cmd buffer */
> @@ -805,14 +807,11 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
>  	 * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED
>  	 * guarantees that here is a re-issued request aborted previously.
>  	 */
> -	if (unlikely(ubq_daemon_is_dying(ubq) ||
> -				(io->flags & UBLK_IO_FLAG_ABORTED))) {
> +	if (unlikely(ubq_daemon_is_dying(ubq))) {
>  		__ublk_abort_rq(ubq, rq);
>  		return BLK_STS_OK;
>  	}
>  
> -	pdu->req = rq;
> -
>  	/*
>  	 * Typical multiple producers and single consumer, it is just fine
>  	 * to use llist_add() in producer side and llist_del_all() in
> @@ -821,10 +820,18 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
>  	 * The last command can't be added into list, otherwise it could
>  	 * be handled twice
>  	 */
> -	if (bd->last)
> +	if (bd->last) {
> +		struct ublk_io *io = &ubq->ios[rq->tag];
> +		struct io_uring_cmd *cmd = io->cmd;
> +		struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
> +
> +		pdu->req = rq;
>  		io_uring_cmd_complete_in_task(cmd, ublk_rqs_task_work_cb);
> -	else
> -		llist_add(&pdu->node, &ubq->io_cmds);
> +	} else {
> +		struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
> +
> +		llist_add(&data->node, &ubq->io_cmds);
> +	}
>  
>  	return BLK_STS_OK;
>  }
> @@ -1426,6 +1433,7 @@ static int ublk_add_tag_set(struct ublk_device *ub)
>  	ub->tag_set.queue_depth = ub->dev_info.queue_depth;
>  	ub->tag_set.numa_node = NUMA_NO_NODE;
>  	ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
> +	ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
>  	ub->tag_set.driver_data = ub;
>  	return blk_mq_alloc_tag_set(&ub->tag_set);
>  }


Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
64GB MEM, CentOS 8, kernel 6.0+
with IORING_SETUP_COOP_TASKRUN, without this kernel patch

ucmd: io_uring_cmd_complete_in_task(), ublk_drv is a module

ucmd-not-touch-pdu: use llist && do not touch 'cmd'/'pdu'/'io' in ublk_queue_rq()

tw: task_work_add(), ublk is built-in.


--------
fio test

iodepth=128 numjobs=1 direct=1 bs=4k

--------------------------------------------
ublk loop target, the backend is a file.

IOPS(k)

type		ucmd		tw		ucmd-not-touch-pdu
seq-read	54.1		53.8		53.6
rand-read	52.0		52.0		52.0

--------------------------------------------
ublk null target
IOPS(k)

type		ucmd		tw		ucmd-not-touch-pdu
seq-read	272		286		275
rand-read	262		278		269


------------
ublksrv test

-------------
ucmd

running loop/001
        fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
        randwrite: jobs 1, iops 66737
        randread: jobs 1, iops 64935
        randrw: jobs 1, iops read 32694 write 32710
        rw(512k): jobs 1, iops read 772 write 819

running null/001
        fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
        randwrite: jobs 1, iops 715863
        randread: jobs 1, iops 758449
        randrw: jobs 1, iops read 357407 write 357183
        rw(512k): jobs 1, iops read 5895 write 5875

-------------
tw

running loop/001
        fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_pvLTL), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
        randwrite: jobs 1, iops 66856
        randread: jobs 1, iops 65015
        randrw: jobs 1, iops read 32751 write 32767
        rw(512k): jobs 1, iops read 776 write 823

running null/001
        fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
        randwrite: jobs 1, iops 739450
        randread: jobs 1, iops 787500
        randrw: jobs 1, iops read 372956 write 372831
        rw(512k): jobs 1, iops read 5798 write 5777

-------------
ucmd-not-touch-pdu

running loop/001
        fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_oH0eG), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
        randwrite: jobs 1, iops 66754
        randread: jobs 1, iops 65032
        randrw: jobs 1, iops read 32776 write 32792
        rw(512k): jobs 1, iops read 772 write 818

running null/001
        fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
        randwrite: jobs 1, iops 725334
        randread: jobs 1, iops 741105
        randrw: jobs 1, iops read 360285 write 360047
        rw(512k): jobs 1, iops read 5770 write 5748

Not touching cmd/pdu/io in ublk_queue_rq() improves IOPS.
But it is worse than using task_work_add().

Regards,
Zhang



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands
  2022-10-26 10:32               ` Ziyang Zhang
@ 2022-10-26 11:29                 ` Ming Lei
  2022-10-27  3:00                   ` Ziyang Zhang
  0 siblings, 1 reply; 12+ messages in thread
From: Ming Lei @ 2022-10-26 11:29 UTC (permalink / raw)
  To: Ziyang Zhang; +Cc: linux-block, Jens Axboe

On Wed, Oct 26, 2022 at 06:32:46PM +0800, Ziyang Zhang wrote:
> On 2022/10/25 23:17, Ming Lei wrote:
> > On Tue, Oct 25, 2022 at 04:43:56PM +0800, Ziyang Zhang wrote:
> >> On 2022/10/25 15:46, Ziyang Zhang wrote:
> >>> On 2022/10/25 15:19, Ming Lei wrote:
> >>>> On Tue, Oct 25, 2022 at 11:15:57AM +0800, Ziyang Zhang wrote:
> >>>>> On 2022/10/24 21:20, Ming Lei wrote:
> >>>>>> Hello Ziyang,
> >>>>>>
> >>>>>> On Mon, Oct 24, 2022 at 05:48:51PM +0800, Ziyang Zhang wrote:
> >>>>>>> On 2022/10/23 17:38, Ming Lei wrote:
> >>>>>>>> task_work_add() is used for waking ubq daemon task with one batch
> >>>>>>>> of io requests/commands queued. However, task_work_add() isn't
> >>>>>>>> exported for module code, and it is still debatable if the symbol
> >>>>>>>> should be exported.
> >>>>>>>>
> >>>>>>>> Fortunately we still have io_uring_cmd_complete_in_task() which just
> >>>>>>>> can't handle batched wakeup for us.
> >>>>>>>>
> >>>>>>>> Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task()
> >>>>>>>> via current command for running them via task work.
> >>>>>>>>
> >>>>>>>> This way cleans up current code a lot, meantime allow us to wakeup
> >>>>>>>> ubq daemon task after queueing batched requests/io commands.
> >>>>>>>>
> >>>>>>>
> >>>>>>>
> >>>>>>> Hi, Ming
> >>>>>>>
> >>>>>>> This patch works and I have run some tests to compare current version(ucmd)
> >>>>>>> with your patch(ucmd-batch).
> >>>>>>>
> >>>>>>> iodepth=128 numjobs=1 direct=1 bs=4k
> >>>>>>>
> >>>>>>> --------------------------------------------
> >>>>>>> ublk loop target, the backend is a file.
> >>>>>>> IOPS(k)
> >>>>>>>
> >>>>>>> type		ucmd		ucmd-batch
> >>>>>>> seq-read	54.7		54.2	
> >>>>>>> rand-read	52.8		52.0
> >>>>>>>
> >>>>>>> --------------------------------------------
> >>>>>>> ublk null target
> >>>>>>> IOPS(k)
> >>>>>>>
> >>>>>>> type		ucmd		ucmd-batch
> >>>>>>> seq-read	257		257
> >>>>>>> rand-read	252		253
> >>>>>>>
> >>>>>>>
> >>>>>>> I find that io_req_task_work_add() puts task_work node into a llist
> >>>>>>> first, then it may call task_work_add() to run batched task_works. So do we really
> >>>>>>> need such llist in ublk_drv? I think io_uring has already considered task_work batch
> >>>>>>> optimization.
> >>>>>>>
> >>>>>>> BTW, task_work_add() in ublk_drv achieves
> >>>>>>> higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task()
> >>>>>>> in ublk_drv.
> >>>>>>
> >>>>>> Yeah, that is same with my observation, and motivation of this patch is
> >>>>>> to get same performance with task_work_add by building ublk_drv as
> >>>>>> module. One win of task_work_add() is that we get exact batching info
> >>>>>> meantime only send TWA_SIGNAL_NO_IPI for whole batch, that is basically
> >>>>>> what the patch is doing, but needs help of the following ublksrv patch:
> >>>>>>
> >>>>>> https://github.com/ming1/ubdsrv/commit/dce6d1d222023c1641292713b311ced01e6dc548
> >>>>>>
> >>>>>> which sets IORING_SETUP_COOP_TASKRUN for ublksrv's uring, then
> >>>>>> io_uring_cmd_complete_in_task will notify via TWA_SIGNAL_NO_IPI, and 5+%
> >>>>>> IOPS boost is observed on loop/001 by putting image on SSD in my test
> >>>>>> VM.
> >>>>>
> >>>>> Hi Ming,
> >>>>>
> >>>>> I have added this ublksrv patch and run the above test again.
> >>>>> I have also run ublksrv test: loop/001. Please check them.
> >>>>>
> >>>>> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
> >>>>> 64GB MEM, CentOS 8, kernel 6.0+
> >>>>>
> >>>>> --------
> >>>>> fio test
> >>>>>
> >>>>> iodepth=128 numjobs=1 direct=1 bs=4k
> >>>>>
> >>>>> ucmd: without your kernel patch. Run io_uring_cmd_complete_in_task()
> >>>>> for each blk-mq rq.
> >>>>>
> >>>>> ucmd-batch: with your kernel patch. Run io_uring_cmd_complete_in_task()
> >>>>> for the last blk-mq rq.
> >>>>>
> >>>>> --------------------------------------------
> >>>>> ublk loop target, the backend is a file.
> >>>>>
> >>>>> IOPS(k)
> >>>>>
> >>>>> type		ucmd		ucmd-batch
> >>>>> seq-read	54.1		53.7
> >>>>> rand-read	52.0		52.0
> >>>>>
> >>>>> --------------------------------------------
> >>>>> ublk null target
> >>>>> IOPS(k)
> >>>>>
> >>>>> type		ucmd		ucmd-batch
> >>>>> seq-read	272		265
> >>>>> rand-read	262		260
> >>>>>
> >>>>> ------------
> >>>>> ublksrv test
> >>>>>
> >>>>> -------------
> >>>>> ucmd
> >>>>>
> >>>>> running loop/001
> >>>>>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >>>>>         randwrite: jobs 1, iops 66737
> >>>>>         randread: jobs 1, iops 64935
> >>>>>         randrw: jobs 1, iops read 32694 write 32710
> >>>>>         rw(512k): jobs 1, iops read 772 write 819
> >>>>>
> >>>>> -------------
> >>>>> ucmd-batch
> >>>>>
> >>>>> running loop/001
> >>>>>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_F56a3), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >>>>>         randwrite: jobs 1, iops 66720
> >>>>>         randread: jobs 1, iops 65015
> >>>>>         randrw: jobs 1, iops read 32743 write 32759
> >>>>>         rw(512k): jobs 1, iops read 771 write 817
> >>>>>
> >>>>>
> >>>>> It seems that manually putting rqs into llist and calling
> >>>>> io_uring_cmd_complete_in_task() while handling the last rq does
> >>>>> not improve IOPS much.
> >>>>>
> >>>>> io_req_task_work_add() puts task_work node into a internal llist
> >>>>> first, then it may call task_work_add() to run batched task_works.
> >>>>> IMO, io_uring has already done such batch optimization and ublk_drv
> >>>>> does not need to add such llist.
> >>>>
> >>>> The difference is just how batching is handled, looks blk-mq's batch info
> >>>> doesn't matter any more. In my test, looks the perf improvement is mainly
> >>>> made by enabling IORING_SETUP_COOP_TASKRUN in ublksrv.
> >>>
> >>> I guess only IORING_SETUP_COOP_TASKRUN helps improve IOPS. The llist in
> >>> ublk_drv does not improve IOPS.
> >>>
> >>>>
> >>>> Can you check if enabling IORING_SETUP_COOP_TASKRUN only can reach
> >>>> same perf with task_work_add()(ublk_drv is builtin) when building
> >>>> ublk_drv as module?
> >>>>
> >>>
> >>> OK.
> >>>
> >>
> >> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
> >> 64GB MEM, CentOS 8, kernel 6.0+
> >> with IORING_SETUP_COOP_TASKRUN, without this kernel patch
> >>
> >> ucmd: io_uring_cmd_complete_in_task(), ublk_drv is a module
> >> tw: task_work_add(), ublk is built-in.
> >>
> >>
> >> --------
> >> fio test
> >>
> >> iodepth=128 numjobs=1 direct=1 bs=4k
> >>
> >> --------------------------------------------
> >> ublk loop target, the backend is a file.
> >>
> >> IOPS(k)
> >>
> >> type		ucmd		tw
> >> seq-read	54.1		53.8
> >> rand-read	52.0		52.0
> >>
> >> --------------------------------------------
> >> ublk null target
> >> IOPS(k)
> >>
> >> type		ucmd		tw
> >> seq-read	272		286
> >> rand-read	262		278
> >>
> >>
> >> ------------
> >> ublksrv test
> >>
> >> -------------
> >> ucmd
> >>
> >> running loop/001
> >>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >>         randwrite: jobs 1, iops 66737
> >>         randread: jobs 1, iops 64935
> >>         randrw: jobs 1, iops read 32694 write 32710
> >>         rw(512k): jobs 1, iops read 772 write 819
> >>
> >> running null/001
> >>         fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >>         randwrite: jobs 1, iops 715863
> >>         randread: jobs 1, iops 758449
> >>         randrw: jobs 1, iops read 357407 write 357183
> >>         rw(512k): jobs 1, iops read 5895 write 5875
> >>
> >> -------------
> >> tw
> >>
> >> running loop/001
> >>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_pvLTL), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >>         randwrite: jobs 1, iops 66856
> >>         randread: jobs 1, iops 65015
> >>         randrw: jobs 1, iops read 32751 write 32767
> >>         rw(512k): jobs 1, iops read 776 write 823
> >>
> >> running null/001
> >>         fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >>         randwrite: jobs 1, iops 739450
> >>         randread: jobs 1, iops 787500
> >>         randrw: jobs 1, iops read 372956 write 372831
> >>         rw(512k): jobs 1, iops read 5798 write 5777
> > 
> > Looks the gap isn't big between ucmd and tw when running null/001, in
> > which the fio io process should saturate the CPU. Probably we
> > should avoid to touch 'cmd'/'pdu'/'io' in ublk_queue_rq() since these
> > data should be cold at that time.
> > 
> > Can you apply the following delta patch against the current patch(
> > "ublk_drv: don't call task_work_add for queueing io commands") and
> > compare with task_work_add()?
> > 
> > From ecbbf6d10dbc63e279ce0b55c45da6721947f18d Mon Sep 17 00:00:00 2001
> > From: Ming Lei <ming.lei@redhat.com>
> > Date: Tue, 25 Oct 2022 11:01:25 -0400
> > Subject: [PATCH] ublk: follow up change
> > 
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> >  drivers/block/ublk_drv.c | 36 ++++++++++++++++++++++--------------
> >  1 file changed, 22 insertions(+), 14 deletions(-)
> > 
> > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> > index 7963fba66dd1..18db337094c1 100644
> > --- a/drivers/block/ublk_drv.c
> > +++ b/drivers/block/ublk_drv.c
> > @@ -56,9 +56,12 @@
> >  /* All UBLK_PARAM_TYPE_* should be included here */
> >  #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
> >  
> > +struct ublk_rq_data {
> > +	struct llist_node node;
> > +};
> > +
> >  struct ublk_uring_cmd_pdu {
> >  	struct request *req;
> > -	struct llist_node node;
> >  };
> >  
> >  /*
> > @@ -693,7 +696,8 @@ static inline void __ublk_rq_task_work(struct request *req)
> >  	 *
> >  	 * (2) current->flags & PF_EXITING.
> >  	 */
> > -	if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) {
> > +	if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING
> > +				|| (io->flags & UBLK_IO_FLAG_ABORTED))) {
> >  		__ublk_abort_rq(ubq, req);
> >  		return;
> 
> We cannot check io->flags & UBLK_IO_FLAG_ABORTED in io_uring task_work. 
> We have to check io->flags & UBLK_IO_FLAG_ABORTED in ublk_queue_rq() because
> io_uring ctx may be freed at that time and ioucmd task_work is invalid(freed).
> Please See comment around it for more detail.

The comment is obsolete, because all inflight requests are drained
before io_uring is dead, please see __ublk_quiesce_dev().

> 
> >  	}
> > @@ -757,11 +761,12 @@ static void ublk_rqs_task_work_cb(struct io_uring_cmd *cmd)
> >  	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
> >  	struct ublk_queue *ubq = pdu->req->mq_hctx->driver_data;
> >  	struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds);
> > +	struct ublk_rq_data *data;
> >  
> >  	__ublk_rq_task_work(pdu->req);
> >  
> > -	llist_for_each_entry(pdu, io_cmds, node)
> > -		__ublk_rq_task_work(pdu->req);
> > +	llist_for_each_entry(data, io_cmds, node)
> > +		__ublk_rq_task_work(blk_mq_rq_from_pdu(data));
> >  }
> >  
> >  static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
> > @@ -769,9 +774,6 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
> >  {
> >  	struct ublk_queue *ubq = hctx->driver_data;
> >  	struct request *rq = bd->rq;
> > -	struct ublk_io *io = &ubq->ios[rq->tag];
> > -	struct io_uring_cmd *cmd = io->cmd;
> > -	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
> >  	blk_status_t res;
> >  
> >  	/* fill iod to slot in io cmd buffer */
> > @@ -805,14 +807,11 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
> >  	 * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED
> >  	 * guarantees that here is a re-issued request aborted previously.
> >  	 */
> > -	if (unlikely(ubq_daemon_is_dying(ubq) ||
> > -				(io->flags & UBLK_IO_FLAG_ABORTED))) {
> > +	if (unlikely(ubq_daemon_is_dying(ubq))) {
> >  		__ublk_abort_rq(ubq, rq);
> >  		return BLK_STS_OK;
> >  	}
> >  
> > -	pdu->req = rq;
> > -
> >  	/*
> >  	 * Typical multiple producers and single consumer, it is just fine
> >  	 * to use llist_add() in producer side and llist_del_all() in
> > @@ -821,10 +820,18 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
> >  	 * The last command can't be added into list, otherwise it could
> >  	 * be handled twice
> >  	 */
> > -	if (bd->last)
> > +	if (bd->last) {
> > +		struct ublk_io *io = &ubq->ios[rq->tag];
> > +		struct io_uring_cmd *cmd = io->cmd;
> > +		struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
> > +
> > +		pdu->req = rq;
> >  		io_uring_cmd_complete_in_task(cmd, ublk_rqs_task_work_cb);
> > -	else
> > -		llist_add(&pdu->node, &ubq->io_cmds);
> > +	} else {
> > +		struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
> > +
> > +		llist_add(&data->node, &ubq->io_cmds);
> > +	}
> >  
> >  	return BLK_STS_OK;
> >  }
> > @@ -1426,6 +1433,7 @@ static int ublk_add_tag_set(struct ublk_device *ub)
> >  	ub->tag_set.queue_depth = ub->dev_info.queue_depth;
> >  	ub->tag_set.numa_node = NUMA_NO_NODE;
> >  	ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
> > +	ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
> >  	ub->tag_set.driver_data = ub;
> >  	return blk_mq_alloc_tag_set(&ub->tag_set);
> >  }
> 
> 
> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
> 64GB MEM, CentOS 8, kernel 6.0+
> with IORING_SETUP_COOP_TASKRUN, without this kernel patch
> 
> ucmd: io_uring_cmd_complete_in_task(), ublk_drv is a module
> 
> ucmd-not-touch-pdu: use llist && do not touch 'cmd'/'pdu'/'io' in ublk_queue_rq()
> 
> tw: task_work_add(), ublk is built-in.
> 
> 
> --------
> fio test
> 
> iodepth=128 numjobs=1 direct=1 bs=4k
> 
> --------------------------------------------
> ublk loop target, the backend is a file.
> 
> IOPS(k)
> 
> type		ucmd		tw		ucmd-not-touch-pdu
> seq-read	54.1		53.8		53.6
> rand-read	52.0		52.0		52.0
> 
> --------------------------------------------
> ublk null target
> IOPS(k)
> 
> type		ucmd		tw		ucmd-not-touch-pdu
> seq-read	272		286		275
> rand-read	262		278		269
> 
> 
> ------------
> ublksrv test
> 
> -------------
> ucmd
> 
> running loop/001
>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>         randwrite: jobs 1, iops 66737
>         randread: jobs 1, iops 64935
>         randrw: jobs 1, iops read 32694 write 32710
>         rw(512k): jobs 1, iops read 772 write 819
> 
> running null/001
>         fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>         randwrite: jobs 1, iops 715863
>         randread: jobs 1, iops 758449
>         randrw: jobs 1, iops read 357407 write 357183
>         rw(512k): jobs 1, iops read 5895 write 5875
> 
> -------------
> tw
> 
> running loop/001
>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_pvLTL), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>         randwrite: jobs 1, iops 66856
>         randread: jobs 1, iops 65015
>         randrw: jobs 1, iops read 32751 write 32767
>         rw(512k): jobs 1, iops read 776 write 823
> 
> running null/001
>         fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>         randwrite: jobs 1, iops 739450
>         randread: jobs 1, iops 787500
>         randrw: jobs 1, iops read 372956 write 372831
>         rw(512k): jobs 1, iops read 5798 write 5777
> 
> -------------
> ucmd-not-touch-pdu
> 
> running loop/001
>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_oH0eG), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>         randwrite: jobs 1, iops 66754
>         randread: jobs 1, iops 65032
>         randrw: jobs 1, iops read 32776 write 32792
>         rw(512k): jobs 1, iops read 772 write 818
> 
> running null/001
>         fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>         randwrite: jobs 1, iops 725334
>         randread: jobs 1, iops 741105
>         randrw: jobs 1, iops read 360285 write 360047
>         rw(512k): jobs 1, iops read 5770 write 5748
> 
> Not touching cmd/pdu/io in ublk_queue_rq() improves IOPS.
> But it is worse than using task_work_add().

Thanks for the test! It is better to not share ucmd between
ublk blk-mq io context and ubq daemon context, and we can
improve it for using io_uring_cmd_complete_in_task(), and I
have one patch by using the batch handing approach in
io_uring_cmd_complete_in_task().

Another reason could be the extra __set_notify_signal() in
__io_req_task_work_add() via task_work_add(). When task_work_add()
is available, we just need to call __set_notify_signal() once
for the whole batch, but it can't be done in case of using
io_uring_cmd_complete_in_task().

Also the patch of 'use llist' is actually wrong since we have to
call io_uring_cmd_complete_in_task() once in ->commit_rqs(), but
that couldn't be easy because ucmd isn't available at that time.

I think we may have to live with task_work_add() until the perf
number is improved to same basically with io_uring_cmd_complete_in_task().



thanks,
Ming


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands
  2022-10-26 11:29                 ` Ming Lei
@ 2022-10-27  3:00                   ` Ziyang Zhang
  2022-10-27 15:38                     ` Ming Lei
  0 siblings, 1 reply; 12+ messages in thread
From: Ziyang Zhang @ 2022-10-27  3:00 UTC (permalink / raw)
  To: Ming Lei; +Cc: linux-block, Jens Axboe

On 2022/10/26 19:29, Ming Lei wrote:

[...]
>>
>> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
>> 64GB MEM, CentOS 8, kernel 6.0+
>> with IORING_SETUP_COOP_TASKRUN, without this kernel patch
>>
>> ucmd: io_uring_cmd_complete_in_task(), ublk_drv is a module
>>
>> ucmd-not-touch-pdu: use llist && do not touch 'cmd'/'pdu'/'io' in ublk_queue_rq()
>>
>> tw: task_work_add(), ublk is built-in.
>>
>>
>> --------
>> fio test
>>
>> iodepth=128 numjobs=1 direct=1 bs=4k
>>
>> --------------------------------------------
>> ublk loop target, the backend is a file.
>>
>> IOPS(k)
>>
>> type		ucmd		tw		ucmd-not-touch-pdu
>> seq-read	54.1		53.8		53.6
>> rand-read	52.0		52.0		52.0
>>
>> --------------------------------------------
>> ublk null target
>> IOPS(k)
>>
>> type		ucmd		tw		ucmd-not-touch-pdu
>> seq-read	272		286		275
>> rand-read	262		278		269
>>
>>
>> ------------
>> ublksrv test
>>
>> -------------
>> ucmd
>>
>> running loop/001
>>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>>         randwrite: jobs 1, iops 66737
>>         randread: jobs 1, iops 64935
>>         randrw: jobs 1, iops read 32694 write 32710
>>         rw(512k): jobs 1, iops read 772 write 819
>>
>> running null/001
>>         fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>>         randwrite: jobs 1, iops 715863
>>         randread: jobs 1, iops 758449
>>         randrw: jobs 1, iops read 357407 write 357183
>>         rw(512k): jobs 1, iops read 5895 write 5875
>>
>> -------------
>> tw
>>
>> running loop/001
>>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_pvLTL), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>>         randwrite: jobs 1, iops 66856
>>         randread: jobs 1, iops 65015
>>         randrw: jobs 1, iops read 32751 write 32767
>>         rw(512k): jobs 1, iops read 776 write 823
>>
>> running null/001
>>         fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>>         randwrite: jobs 1, iops 739450
>>         randread: jobs 1, iops 787500
>>         randrw: jobs 1, iops read 372956 write 372831
>>         rw(512k): jobs 1, iops read 5798 write 5777
>>
>> -------------
>> ucmd-not-touch-pdu
>>
>> running loop/001
>>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_oH0eG), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>>         randwrite: jobs 1, iops 66754
>>         randread: jobs 1, iops 65032
>>         randrw: jobs 1, iops read 32776 write 32792
>>         rw(512k): jobs 1, iops read 772 write 818
>>
>> running null/001
>>         fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>>         randwrite: jobs 1, iops 725334
>>         randread: jobs 1, iops 741105
>>         randrw: jobs 1, iops read 360285 write 360047
>>         rw(512k): jobs 1, iops read 5770 write 5748
>>
>> Not touching cmd/pdu/io in ublk_queue_rq() improves IOPS.
>> But it is worse than using task_work_add().
> 
> Thanks for the test! It is better to not share ucmd between
> ublk blk-mq io context and ubq daemon context, and we can
> improve it for using io_uring_cmd_complete_in_task(), and I
> have one patch by using the batch handing approach in
> io_uring_cmd_complete_in_task().
> 
> Another reason could be the extra __set_notify_signal() in
> __io_req_task_work_add() via task_work_add(). When task_work_add()
> is available, we just need to call __set_notify_signal() once
> for the whole batch, but it can't be done in case of using
> io_uring_cmd_complete_in_task().
> 
> Also the patch of 'use llist' is actually wrong since we have to
> call io_uring_cmd_complete_in_task() once in ->commit_rqs(), but
> that couldn't be easy because ucmd isn't available at that time.

Yes, you are correct.

> 
> I think we may have to live with task_work_add() until the perf
> number is improved to same basically with io_uring_cmd_complete_in_task().

OK, we can keep task_work_add() && io_uring_cmd_complete_in_task().
BTW, from test:loop/001, I think with real backends, the performance
gap between them seems not too big.

Regards,
Zhang


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands
  2022-10-27  3:00                   ` Ziyang Zhang
@ 2022-10-27 15:38                     ` Ming Lei
  0 siblings, 0 replies; 12+ messages in thread
From: Ming Lei @ 2022-10-27 15:38 UTC (permalink / raw)
  To: Ziyang Zhang; +Cc: linux-block, Jens Axboe

On Thu, Oct 27, 2022 at 11:00:44AM +0800, Ziyang Zhang wrote:
> On 2022/10/26 19:29, Ming Lei wrote:
> 
> [...]
> >>
> >> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
> >> 64GB MEM, CentOS 8, kernel 6.0+
> >> with IORING_SETUP_COOP_TASKRUN, without this kernel patch
> >>
> >> ucmd: io_uring_cmd_complete_in_task(), ublk_drv is a module
> >>
> >> ucmd-not-touch-pdu: use llist && do not touch 'cmd'/'pdu'/'io' in ublk_queue_rq()
> >>
> >> tw: task_work_add(), ublk is built-in.
> >>
> >>
> >> --------
> >> fio test
> >>
> >> iodepth=128 numjobs=1 direct=1 bs=4k
> >>
> >> --------------------------------------------
> >> ublk loop target, the backend is a file.
> >>
> >> IOPS(k)
> >>
> >> type		ucmd		tw		ucmd-not-touch-pdu
> >> seq-read	54.1		53.8		53.6
> >> rand-read	52.0		52.0		52.0
> >>
> >> --------------------------------------------
> >> ublk null target
> >> IOPS(k)
> >>
> >> type		ucmd		tw		ucmd-not-touch-pdu
> >> seq-read	272		286		275
> >> rand-read	262		278		269
> >>
> >>
> >> ------------
> >> ublksrv test
> >>
> >> -------------
> >> ucmd
> >>
> >> running loop/001
> >>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >>         randwrite: jobs 1, iops 66737
> >>         randread: jobs 1, iops 64935
> >>         randrw: jobs 1, iops read 32694 write 32710
> >>         rw(512k): jobs 1, iops read 772 write 819
> >>
> >> running null/001
> >>         fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >>         randwrite: jobs 1, iops 715863
> >>         randread: jobs 1, iops 758449
> >>         randrw: jobs 1, iops read 357407 write 357183
> >>         rw(512k): jobs 1, iops read 5895 write 5875
> >>
> >> -------------
> >> tw
> >>
> >> running loop/001
> >>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_pvLTL), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >>         randwrite: jobs 1, iops 66856
> >>         randread: jobs 1, iops 65015
> >>         randrw: jobs 1, iops read 32751 write 32767
> >>         rw(512k): jobs 1, iops read 776 write 823
> >>
> >> running null/001
> >>         fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >>         randwrite: jobs 1, iops 739450
> >>         randread: jobs 1, iops 787500
> >>         randrw: jobs 1, iops read 372956 write 372831
> >>         rw(512k): jobs 1, iops read 5798 write 5777
> >>
> >> -------------
> >> ucmd-not-touch-pdu
> >>
> >> running loop/001
> >>         fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_oH0eG), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >>         randwrite: jobs 1, iops 66754
> >>         randread: jobs 1, iops 65032
> >>         randrw: jobs 1, iops read 32776 write 32792
> >>         rw(512k): jobs 1, iops read 772 write 818
> >>
> >> running null/001
> >>         fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >>         randwrite: jobs 1, iops 725334
> >>         randread: jobs 1, iops 741105
> >>         randrw: jobs 1, iops read 360285 write 360047
> >>         rw(512k): jobs 1, iops read 5770 write 5748
> >>
> >> Not touching cmd/pdu/io in ublk_queue_rq() improves IOPS.
> >> But it is worse than using task_work_add().
> > 
> > Thanks for the test! It is better to not share ucmd between
> > ublk blk-mq io context and ubq daemon context, and we can
> > improve it for using io_uring_cmd_complete_in_task(), and I
> > have one patch by using the batch handing approach in
> > io_uring_cmd_complete_in_task().
> > 
> > Another reason could be the extra __set_notify_signal() in
> > __io_req_task_work_add() via task_work_add(). When task_work_add()
> > is available, we just need to call __set_notify_signal() once
> > for the whole batch, but it can't be done in case of using
> > io_uring_cmd_complete_in_task().
> > 
> > Also the patch of 'use llist' is actually wrong since we have to
> > call io_uring_cmd_complete_in_task() once in ->commit_rqs(), but
> > that couldn't be easy because ucmd isn't available at that time.
> 
> Yes, you are correct.
> 
> > 
> > I think we may have to live with task_work_add() until the perf
> > number is improved to same basically with io_uring_cmd_complete_in_task().
> 
> OK, we can keep task_work_add() && io_uring_cmd_complete_in_task().
> BTW, from test:loop/001, I think with real backends, the performance
> gap between them seems not too big.

Actually in VM created in my laptop, the gap isn't small on ublk-loop:

1) ublk-null, single job, change nr_hw_queue to 2 for using none scheduler
[root@ktest-09 ublksrv]# make test T=null/001:null/004
make -C tests run T=null/001:null/004 R=10 D=tests/tmp/
make[1]: Entering directory '/root/git/ublksrv/tests'
./run_test.sh null/001:null/004 10 tests/tmp/
running null/001
	fio (ublk/null(), libaio, dio, hw queues:2, io jobs: 1, uring_comp: 0, get_data: 0)...
	randwrite(4k): jobs 1, iops 1028774, cpu_util(29% 69%)
	randread(4k): jobs 1, iops 958598, cpu_util(27% 72%)
	randrw(4k): jobs 1, iops read 453382 write 453239, cpu_util(29% 70%)
	rw(512k): jobs 1, iops read 5264 write 5242, cpu_util(7% 6%)

running null/004
	fio (ublk/null(), libaio, dio, hw queues:2, io jobs: 1, uring_comp: 1, get_data: 0)...
	randwrite(4k): jobs 1, iops 950755, cpu_util(30% 69%)
	randread(4k): jobs 1, iops 855984, cpu_util(26% 73%)
	randrw(4k): jobs 1, iops read 439369 write 439287, cpu_util(29% 70%)
	rw(512k): jobs 1, iops read 5225 write 5202, cpu_util(7% 5%)

2) ublk-loop, single job, change nr_hw_queue to 2 for using none scheduler
[root@ktest-09 ublksrv]# make test T=loop/001:loop/004
make -C tests run T=loop/001:loop/004 R=10 D=tests/tmp/
make[1]: Entering directory '/root/git/ublksrv/tests'
./run_test.sh loop/001:loop/004 10 tests/tmp/
running loop/001
	fio (ublk/loop( -f /root/git/ublksrv/tests/tmp/ublk_loop_2G_STbdZ), libaio, dio, hw queues:2, io jobs: 1, uring_comp: 0, get_data: 0)...
	randwrite(4k): jobs 1, iops 163177, cpu_util(7% 20%)
	randread(4k): jobs 1, iops 141584, cpu_util(6% 14%)
	randrw(4k): jobs 1, iops read 75034 write 75148, cpu_util(7% 17%)
	rw(512k): jobs 1, iops read 1574 write 1663, cpu_util(3% 4%)

running loop/004
	fio (ublk/loop( -f /root/git/ublksrv/tests/tmp/ublk_loop_2G_AdrHl), libaio, dio, hw queues:2, io jobs: 1, uring_comp: 1, get_data: 0)...
	randwrite(4k): jobs 1, iops 142353, cpu_util(7% 19%)
	randread(4k): jobs 1, iops 133883, cpu_util(6% 14%)
	randrw(4k): jobs 1, iops read 72581 write 72691, cpu_util(7% 18%)
	rw(512k): jobs 1, iops read 929 write 981, cpu_util(2% 2%)



Thanks,
Ming


^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2022-10-27 15:38 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2022-10-23  9:38 [PATCH] ublk_drv: don't call task_work_add for queueing io commands Ming Lei
2022-10-24  9:48 ` Ziyang Zhang
2022-10-24 13:20   ` Ming Lei
2022-10-25  3:15     ` Ziyang Zhang
2022-10-25  7:19       ` Ming Lei
2022-10-25  7:46         ` Ziyang Zhang
2022-10-25  8:43           ` Ziyang Zhang
2022-10-25 15:17             ` Ming Lei
2022-10-26 10:32               ` Ziyang Zhang
2022-10-26 11:29                 ` Ming Lei
2022-10-27  3:00                   ` Ziyang Zhang
2022-10-27 15:38                     ` Ming Lei

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).