* [PATCH] ublk_drv: don't call task_work_add for queueing io commands @ 2022-10-23 9:38 Ming Lei 2022-10-24 9:48 ` Ziyang Zhang 0 siblings, 1 reply; 12+ messages in thread From: Ming Lei @ 2022-10-23 9:38 UTC (permalink / raw) To: Jens Axboe; +Cc: linux-block, ZiyangZhang, Ming Lei task_work_add() is used for waking ubq daemon task with one batch of io requests/commands queued. However, task_work_add() isn't exported for module code, and it is still debatable if the symbol should be exported. Fortunately we still have io_uring_cmd_complete_in_task() which just can't handle batched wakeup for us. Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task() via current command for running them via task work. This way cleans up current code a lot, meantime allow us to wakeup ubq daemon task after queueing batched requests/io commands. Signed-off-by: Ming Lei <ming.lei@redhat.com> --- drivers/block/ublk_drv.c | 130 ++++++++++++++++----------------------- 1 file changed, 52 insertions(+), 78 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 5afce6ffaadf..7963fba66dd1 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -41,7 +41,7 @@ #include <linux/delay.h> #include <linux/mm.h> #include <asm/page.h> -#include <linux/task_work.h> +#include <linux/llist.h> #include <uapi/linux/ublk_cmd.h> #define UBLK_MINORS (1U << MINORBITS) @@ -56,12 +56,9 @@ /* All UBLK_PARAM_TYPE_* should be included here */ #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD) -struct ublk_rq_data { - struct callback_head work; -}; - struct ublk_uring_cmd_pdu { struct request *req; + struct llist_node node; }; /* @@ -119,6 +116,8 @@ struct ublk_queue { struct task_struct *ubq_daemon; char *io_cmd_buf; + struct llist_head io_cmds; + unsigned long io_addr; /* mapped vm address */ unsigned int max_io_sz; bool force_abort; @@ -268,14 +267,6 @@ static int ublk_apply_params(struct ublk_device *ub) return 0; } 
-static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq) -{ - if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) && - !(ubq->flags & UBLK_F_URING_CMD_COMP_IN_TASK)) - return true; - return false; -} - static inline bool ublk_need_get_data(const struct ublk_queue *ubq) { if (ubq->flags & UBLK_F_NEED_GET_DATA) @@ -761,20 +752,16 @@ static inline void __ublk_rq_task_work(struct request *req) ubq_complete_io_cmd(io, UBLK_IO_RES_OK); } -static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd) +static void ublk_rqs_task_work_cb(struct io_uring_cmd *cmd) { struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + struct ublk_queue *ubq = pdu->req->mq_hctx->driver_data; + struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds); __ublk_rq_task_work(pdu->req); -} -static void ublk_rq_task_work_fn(struct callback_head *work) -{ - struct ublk_rq_data *data = container_of(work, - struct ublk_rq_data, work); - struct request *req = blk_mq_rq_from_pdu(data); - - __ublk_rq_task_work(req); + llist_for_each_entry(pdu, io_cmds, node) + __ublk_rq_task_work(pdu->req); } static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, @@ -782,12 +769,16 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, { struct ublk_queue *ubq = hctx->driver_data; struct request *rq = bd->rq; + struct ublk_io *io = &ubq->ios[rq->tag]; + struct io_uring_cmd *cmd = io->cmd; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); blk_status_t res; /* fill iod to slot in io cmd buffer */ res = ublk_setup_iod(ubq, rq); if (unlikely(res != BLK_STS_OK)) return BLK_STS_IOERR; + /* With recovery feature enabled, force_abort is set in * ublk_stop_dev() before calling del_gendisk(). 
We have to * abort all requeued and new rqs here to let del_gendisk() @@ -802,42 +793,38 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(bd->rq); - if (unlikely(ubq_daemon_is_dying(ubq))) { - fail: + /* + * If the check pass, we know that this is a re-issued request aborted + * previously in monitor_work because the ubq_daemon(cmd's task) is + * PF_EXITING. We cannot call io_uring_cmd_complete_in_task() anymore + * because this ioucmd's io_uring context may be freed now if no inflight + * ioucmd exists. Otherwise we may cause null-deref in ctx->fallback_work. + * + * Note: monitor_work sets UBLK_IO_FLAG_ABORTED and ends this request(releasing + * the tag). Then the request is re-started(allocating the tag) and we are here. + * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED + * guarantees that here is a re-issued request aborted previously. + */ + if (unlikely(ubq_daemon_is_dying(ubq) || + (io->flags & UBLK_IO_FLAG_ABORTED))) { __ublk_abort_rq(ubq, rq); return BLK_STS_OK; } - if (ublk_can_use_task_work(ubq)) { - struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq); - enum task_work_notify_mode notify_mode = bd->last ? - TWA_SIGNAL_NO_IPI : TWA_NONE; - - if (task_work_add(ubq->ubq_daemon, &data->work, notify_mode)) - goto fail; - } else { - struct ublk_io *io = &ubq->ios[rq->tag]; - struct io_uring_cmd *cmd = io->cmd; - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + pdu->req = rq; - /* - * If the check pass, we know that this is a re-issued request aborted - * previously in monitor_work because the ubq_daemon(cmd's task) is - * PF_EXITING. We cannot call io_uring_cmd_complete_in_task() anymore - * because this ioucmd's io_uring context may be freed now if no inflight - * ioucmd exists. Otherwise we may cause null-deref in ctx->fallback_work. - * - * Note: monitor_work sets UBLK_IO_FLAG_ABORTED and ends this request(releasing - * the tag). 
Then the request is re-started(allocating the tag) and we are here. - * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED - * guarantees that here is a re-issued request aborted previously. - */ - if ((io->flags & UBLK_IO_FLAG_ABORTED)) - goto fail; - - pdu->req = rq; - io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb); - } + /* + * Typical multiple producers and single consumer, it is just fine + * to use llist_add() in producer side and llist_del_all() in + * consumer side. + * + * The last command can't be added into list, otherwise it could + * be handled twice + */ + if (bd->last) + io_uring_cmd_complete_in_task(cmd, ublk_rqs_task_work_cb); + else + llist_add(&pdu->node, &ubq->io_cmds); return BLK_STS_OK; } @@ -846,8 +833,8 @@ static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx) { struct ublk_queue *ubq = hctx->driver_data; - if (ublk_can_use_task_work(ubq)) - __set_notify_signal(ubq->ubq_daemon); + if (!test_and_set_tsk_thread_flag(ubq->ubq_daemon, TIF_NOTIFY_SIGNAL)) + wake_up_process(ubq->ubq_daemon); } static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, @@ -860,20 +847,10 @@ static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, return 0; } -static int ublk_init_rq(struct blk_mq_tag_set *set, struct request *req, - unsigned int hctx_idx, unsigned int numa_node) -{ - struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); - - init_task_work(&data->work, ublk_rq_task_work_fn); - return 0; -} - static const struct blk_mq_ops ublk_mq_ops = { .queue_rq = ublk_queue_rq, .commit_rqs = ublk_commit_rqs, .init_hctx = ublk_init_hctx, - .init_request = ublk_init_rq, }; static int ublk_ch_open(struct inode *inode, struct file *filp) @@ -1163,23 +1140,21 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq) mutex_unlock(&ub->mutex); } +static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd) +{ + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + + 
__ublk_rq_task_work(pdu->req); +} + static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id, int tag, struct io_uring_cmd *cmd) { - struct ublk_queue *ubq = ublk_get_queue(ub, q_id); struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag); + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); - if (ublk_can_use_task_work(ubq)) { - struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); - - /* should not fail since we call it just in ubq->ubq_daemon */ - task_work_add(ubq->ubq_daemon, &data->work, TWA_SIGNAL_NO_IPI); - } else { - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); - - pdu->req = req; - io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb); - } + pdu->req = req; + io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb); } static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) @@ -1450,7 +1425,6 @@ static int ublk_add_tag_set(struct ublk_device *ub) ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues; ub->tag_set.queue_depth = ub->dev_info.queue_depth; ub->tag_set.numa_node = NUMA_NO_NODE; - ub->tag_set.cmd_size = sizeof(struct ublk_rq_data); ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; ub->tag_set.driver_data = ub; return blk_mq_alloc_tag_set(&ub->tag_set); -- 2.31.1 ^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands 2022-10-23 9:38 [PATCH] ublk_drv: don't call task_work_add for queueing io commands Ming Lei @ 2022-10-24 9:48 ` Ziyang Zhang 2022-10-24 13:20 ` Ming Lei 0 siblings, 1 reply; 12+ messages in thread From: Ziyang Zhang @ 2022-10-24 9:48 UTC (permalink / raw) To: Ming Lei; +Cc: linux-block, Jens Axboe On 2022/10/23 17:38, Ming Lei wrote: > task_work_add() is used for waking ubq daemon task with one batch > of io requests/commands queued. However, task_work_add() isn't > exported for module code, and it is still debatable if the symbol > should be exported. > > Fortunately we still have io_uring_cmd_complete_in_task() which just > can't handle batched wakeup for us. > > Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task() > via current command for running them via task work. > > This way cleans up current code a lot, meantime allow us to wakeup > ubq daemon task after queueing batched requests/io commands. > Hi, Ming This patch works and I have run some tests to compare current version(ucmd) with your patch(ucmd-batch). iodepth=128 numjobs=1 direct=1 bs=4k -------------------------------------------- ublk loop target, the backend is a file. IOPS(k) type ucmd ucmd-batch seq-read 54.7 54.2 rand-read 52.8 52.0 -------------------------------------------- ublk null target IOPS(k) type ucmd ucmd-batch seq-read 257 257 rand-read 252 253 I find that io_req_task_work_add() puts task_work node into a llist first, then it may call task_work_add() to run batched task_works. So do we really need such llist in ublk_drv? I think io_uring has already considered task_work batch optimization. BTW, task_work_add() in ublk_drv achieves higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task() in ublk_drv. Regards, Zhang ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands 2022-10-24 9:48 ` Ziyang Zhang @ 2022-10-24 13:20 ` Ming Lei 2022-10-25 3:15 ` Ziyang Zhang 0 siblings, 1 reply; 12+ messages in thread From: Ming Lei @ 2022-10-24 13:20 UTC (permalink / raw) To: Ziyang Zhang; +Cc: linux-block, Jens Axboe Hello Ziyang, On Mon, Oct 24, 2022 at 05:48:51PM +0800, Ziyang Zhang wrote: > On 2022/10/23 17:38, Ming Lei wrote: > > task_work_add() is used for waking ubq daemon task with one batch > > of io requests/commands queued. However, task_work_add() isn't > > exported for module code, and it is still debatable if the symbol > > should be exported. > > > > Fortunately we still have io_uring_cmd_complete_in_task() which just > > can't handle batched wakeup for us. > > > > Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task() > > via current command for running them via task work. > > > > This way cleans up current code a lot, meantime allow us to wakeup > > ubq daemon task after queueing batched requests/io commands. > > > > > Hi, Ming > > This patch works and I have run some tests to compare current version(ucmd) > with your patch(ucmd-batch). > > iodepth=128 numjobs=1 direct=1 bs=4k > > -------------------------------------------- > ublk loop target, the backend is a file. > IOPS(k) > > type ucmd ucmd-batch > seq-read 54.7 54.2 > rand-read 52.8 52.0 > > -------------------------------------------- > ublk null target > IOPS(k) > > type ucmd ucmd-batch > seq-read 257 257 > rand-read 252 253 > > > I find that io_req_task_work_add() puts task_work node into a llist > first, then it may call task_work_add() to run batched task_works. So do we really > need such llist in ublk_drv? I think io_uring has already considered task_work batch > optimization. > > BTW, task_work_add() in ublk_drv achieves > higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task() > in ublk_drv. 
Yeah, that is same with my observation, and motivation of this patch is to get same performance with task_work_add by building ublk_drv as module. One win of task_work_add() is that we get exact batching info meantime only send TWA_SIGNAL_NO_IPI for whole batch, that is basically what the patch is doing, but needs help of the following ublksrv patch: https://github.com/ming1/ubdsrv/commit/dce6d1d222023c1641292713b311ced01e6dc548 which sets IORING_SETUP_COOP_TASKRUN for ublksrv's uring, then io_uring_cmd_complete_in_task will notify via TWA_SIGNAL_NO_IPI, and 5+% IOPS boost is observed on loop/001 by putting image on SSD in my test VM. Thanks, Ming ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands 2022-10-24 13:20 ` Ming Lei @ 2022-10-25 3:15 ` Ziyang Zhang 2022-10-25 7:19 ` Ming Lei 0 siblings, 1 reply; 12+ messages in thread From: Ziyang Zhang @ 2022-10-25 3:15 UTC (permalink / raw) To: Ming Lei; +Cc: linux-block, Jens Axboe On 2022/10/24 21:20, Ming Lei wrote: > Hello Ziyang, > > On Mon, Oct 24, 2022 at 05:48:51PM +0800, Ziyang Zhang wrote: >> On 2022/10/23 17:38, Ming Lei wrote: >>> task_work_add() is used for waking ubq daemon task with one batch >>> of io requests/commands queued. However, task_work_add() isn't >>> exported for module code, and it is still debatable if the symbol >>> should be exported. >>> >>> Fortunately we still have io_uring_cmd_complete_in_task() which just >>> can't handle batched wakeup for us. >>> >>> Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task() >>> via current command for running them via task work. >>> >>> This way cleans up current code a lot, meantime allow us to wakeup >>> ubq daemon task after queueing batched requests/io commands. >>> >> >> >> Hi, Ming >> >> This patch works and I have run some tests to compare current version(ucmd) >> with your patch(ucmd-batch). >> >> iodepth=128 numjobs=1 direct=1 bs=4k >> >> -------------------------------------------- >> ublk loop target, the backend is a file. >> IOPS(k) >> >> type ucmd ucmd-batch >> seq-read 54.7 54.2 >> rand-read 52.8 52.0 >> >> -------------------------------------------- >> ublk null target >> IOPS(k) >> >> type ucmd ucmd-batch >> seq-read 257 257 >> rand-read 252 253 >> >> >> I find that io_req_task_work_add() puts task_work node into a llist >> first, then it may call task_work_add() to run batched task_works. So do we really >> need such llist in ublk_drv? I think io_uring has already considered task_work batch >> optimization. 
>> >> BTW, task_work_add() in ublk_drv achieves >> higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task() >> in ublk_drv. > > Yeah, that is same with my observation, and motivation of this patch is > to get same performance with task_work_add by building ublk_drv as > module. One win of task_work_add() is that we get exact batching info > meantime only send TWA_SIGNAL_NO_IPI for whole batch, that is basically > what the patch is doing, but needs help of the following ublksrv patch: > > https://github.com/ming1/ubdsrv/commit/dce6d1d222023c1641292713b311ced01e6dc548 > > which sets IORING_SETUP_COOP_TASKRUN for ublksrv's uring, then > io_uring_cmd_complete_in_task will notify via TWA_SIGNAL_NO_IPI, and 5+% > IOPS boost is observed on loop/001 by putting image on SSD in my test > VM. Hi Ming, I have added this ublksrv patch and run the above test again. I have also run ublksrv test: loop/001. Please check them. Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores 64GB MEM, CentOS 8, kernel 6.0+ -------- fio test iodepth=128 numjobs=1 direct=1 bs=4k ucmd: without your kernel patch. Run io_uring_cmd_complete_in_task() for each blk-mq rq. ucmd-batch: with your kernel patch. Run io_uring_cmd_complete_in_task() for the last blk-mq rq. -------------------------------------------- ublk loop target, the backend is a file. IOPS(k) type ucmd ucmd-batch seq-read 54.1 53.7 rand-read 52.0 52.0 -------------------------------------------- ublk null target IOPS(k) type ucmd ucmd-batch seq-read 272 265 rand-read 262 260 ------------ ublksrv test ------------- ucmd running loop/001 fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... 
randwrite: jobs 1, iops 66737 > randread: jobs 1, iops 64935 > randrw: jobs 1, iops read 32694 write 32710 > rw(512k): jobs 1, iops read 772 write 819 > > ------------- > ucmd-batch > > running loop/001 > fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_F56a3), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... > randwrite: jobs 1, iops 66720 > randread: jobs 1, iops 65015 > randrw: jobs 1, iops read 32743 write 32759 > rw(512k): jobs 1, iops read 771 write 817 > > > It seems that manually putting rqs into llist and calling > io_uring_cmd_complete_in_task() while handling the last rq does > not improve IOPS much. > > io_req_task_work_add() puts task_work node into an internal llist > first, then it may call task_work_add() to run batched task_works. > IMO, io_uring has already done such batch optimization and ublk_drv > does not need to add such llist. Regards, Zhang. ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands 2022-10-25 3:15 ` Ziyang Zhang @ 2022-10-25 7:19 ` Ming Lei 2022-10-25 7:46 ` Ziyang Zhang 0 siblings, 1 reply; 12+ messages in thread From: Ming Lei @ 2022-10-25 7:19 UTC (permalink / raw) To: Ziyang Zhang; +Cc: linux-block, Jens Axboe On Tue, Oct 25, 2022 at 11:15:57AM +0800, Ziyang Zhang wrote: > On 2022/10/24 21:20, Ming Lei wrote: > > Hello Ziyang, > > > > On Mon, Oct 24, 2022 at 05:48:51PM +0800, Ziyang Zhang wrote: > >> On 2022/10/23 17:38, Ming Lei wrote: > >>> task_work_add() is used for waking ubq daemon task with one batch > >>> of io requests/commands queued. However, task_work_add() isn't > >>> exported for module code, and it is still debatable if the symbol > >>> should be exported. > >>> > >>> Fortunately we still have io_uring_cmd_complete_in_task() which just > >>> can't handle batched wakeup for us. > >>> > >>> Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task() > >>> via current command for running them via task work. > >>> > >>> This way cleans up current code a lot, meantime allow us to wakeup > >>> ubq daemon task after queueing batched requests/io commands. > >>> > >> > >> > >> Hi, Ming > >> > >> This patch works and I have run some tests to compare current version(ucmd) > >> with your patch(ucmd-batch). > >> > >> iodepth=128 numjobs=1 direct=1 bs=4k > >> > >> -------------------------------------------- > >> ublk loop target, the backend is a file. > >> IOPS(k) > >> > >> type ucmd ucmd-batch > >> seq-read 54.7 54.2 > >> rand-read 52.8 52.0 > >> > >> -------------------------------------------- > >> ublk null target > >> IOPS(k) > >> > >> type ucmd ucmd-batch > >> seq-read 257 257 > >> rand-read 252 253 > >> > >> > >> I find that io_req_task_work_add() puts task_work node into a llist > >> first, then it may call task_work_add() to run batched task_works. So do we really > >> need such llist in ublk_drv? 
I think io_uring has already considered task_work batch > >> optimization. > >> > >> BTW, task_work_add() in ublk_drv achieves > >> higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task() > >> in ublk_drv. > > > > Yeah, that is same with my observation, and motivation of this patch is > > to get same performance with task_work_add by building ublk_drv as > > module. One win of task_work_add() is that we get exact batching info > > meantime only send TWA_SIGNAL_NO_IPI for whole batch, that is basically > > what the patch is doing, but needs help of the following ublksrv patch: > > > > https://github.com/ming1/ubdsrv/commit/dce6d1d222023c1641292713b311ced01e6dc548 > > > > which sets IORING_SETUP_COOP_TASKRUN for ublksrv's uring, then > > io_uring_cmd_complete_in_task will notify via TWA_SIGNAL_NO_IPI, and 5+% > > IOPS boost is observed on loop/001 by putting image on SSD in my test > > VM. > > Hi Ming, > > I have added this ublksrv patch and run the above test again. > I have also run ublksrv test: loop/001. Please check them. > > Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores > 64GB MEM, CentOS 8, kernel 6.0+ > > -------- > fio test > > iodepth=128 numjobs=1 direct=1 bs=4k > > ucmd: without your kernel patch. Run io_uring_cmd_complete_in_task() > for each blk-mq rq. > > ucmd-batch: with your kernel patch. Run io_uring_cmd_complete_in_task() > for the last blk-mq rq. > > -------------------------------------------- > ublk loop target, the backend is a file. > > IOPS(k) > > type ucmd ucmd-batch > seq-read 54.1 53.7 > rand-read 52.0 52.0 > > -------------------------------------------- > ublk null target > IOPS(k) > > type ucmd ucmd-batch > seq-read 272 265 > rand-read 262 260 > > ------------ > ublksrv test > > ------------- > ucmd > > running loop/001 > fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... 
> randwrite: jobs 1, iops 66737 > randread: jobs 1, iops 64935 > randrw: jobs 1, iops read 32694 write 32710 > rw(512k): jobs 1, iops read 772 write 819 > > ------------- > ucmd-batch > > running loop/001 > fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_F56a3), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... > randwrite: jobs 1, iops 66720 > randread: jobs 1, iops 65015 > randrw: jobs 1, iops read 32743 write 32759 > rw(512k): jobs 1, iops read 771 write 817 > > > It seems that manually putting rqs into llist and calling > io_uring_cmd_complete_in_task() while handling the last rq does > not improve IOPS much. > > io_req_task_work_add() puts task_work node into a internal llist > first, then it may call task_work_add() to run batched task_works. > IMO, io_uring has already done such batch optimization and ublk_drv > does not need to add such llist. The difference is just how batching is handled, looks blk-mq's batch info doesn't matter any more. In my test, looks the perf improvement is mainly made by enabling IORING_SETUP_COOP_TASKRUN in ublksrv. Can you check if enabling IORING_SETUP_COOP_TASKRUN only can reach same perf with task_work_add()(ublk_drv is builtin) when building ublk_drv as module? Thanks, Ming ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands 2022-10-25 7:19 ` Ming Lei @ 2022-10-25 7:46 ` Ziyang Zhang 2022-10-25 8:43 ` Ziyang Zhang 0 siblings, 1 reply; 12+ messages in thread From: Ziyang Zhang @ 2022-10-25 7:46 UTC (permalink / raw) To: Ming Lei; +Cc: linux-block, Jens Axboe On 2022/10/25 15:19, Ming Lei wrote: > On Tue, Oct 25, 2022 at 11:15:57AM +0800, Ziyang Zhang wrote: >> On 2022/10/24 21:20, Ming Lei wrote: >>> Hello Ziyang, >>> >>> On Mon, Oct 24, 2022 at 05:48:51PM +0800, Ziyang Zhang wrote: >>>> On 2022/10/23 17:38, Ming Lei wrote: >>>>> task_work_add() is used for waking ubq daemon task with one batch >>>>> of io requests/commands queued. However, task_work_add() isn't >>>>> exported for module code, and it is still debatable if the symbol >>>>> should be exported. >>>>> >>>>> Fortunately we still have io_uring_cmd_complete_in_task() which just >>>>> can't handle batched wakeup for us. >>>>> >>>>> Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task() >>>>> via current command for running them via task work. >>>>> >>>>> This way cleans up current code a lot, meantime allow us to wakeup >>>>> ubq daemon task after queueing batched requests/io commands. >>>>> >>>> >>>> >>>> Hi, Ming >>>> >>>> This patch works and I have run some tests to compare current version(ucmd) >>>> with your patch(ucmd-batch). >>>> >>>> iodepth=128 numjobs=1 direct=1 bs=4k >>>> >>>> -------------------------------------------- >>>> ublk loop target, the backend is a file. >>>> IOPS(k) >>>> >>>> type ucmd ucmd-batch >>>> seq-read 54.7 54.2 >>>> rand-read 52.8 52.0 >>>> >>>> -------------------------------------------- >>>> ublk null target >>>> IOPS(k) >>>> >>>> type ucmd ucmd-batch >>>> seq-read 257 257 >>>> rand-read 252 253 >>>> >>>> >>>> I find that io_req_task_work_add() puts task_work node into a llist >>>> first, then it may call task_work_add() to run batched task_works. So do we really >>>> need such llist in ublk_drv? 
I think io_uring has already considered task_work batch >>>> optimization. >>>> >>>> BTW, task_work_add() in ublk_drv achieves >>>> higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task() >>>> in ublk_drv. >>> >>> Yeah, that is same with my observation, and motivation of this patch is >>> to get same performance with task_work_add by building ublk_drv as >>> module. One win of task_work_add() is that we get exact batching info >>> meantime only send TWA_SIGNAL_NO_IPI for whole batch, that is basically >>> what the patch is doing, but needs help of the following ublksrv patch: >>> >>> https://github.com/ming1/ubdsrv/commit/dce6d1d222023c1641292713b311ced01e6dc548 >>> >>> which sets IORING_SETUP_COOP_TASKRUN for ublksrv's uring, then >>> io_uring_cmd_complete_in_task will notify via TWA_SIGNAL_NO_IPI, and 5+% >>> IOPS boost is observed on loop/001 by putting image on SSD in my test >>> VM. >> >> Hi Ming, >> >> I have added this ublksrv patch and run the above test again. >> I have also run ublksrv test: loop/001. Please check them. >> >> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores >> 64GB MEM, CentOS 8, kernel 6.0+ >> >> -------- >> fio test >> >> iodepth=128 numjobs=1 direct=1 bs=4k >> >> ucmd: without your kernel patch. Run io_uring_cmd_complete_in_task() >> for each blk-mq rq. >> >> ucmd-batch: with your kernel patch. Run io_uring_cmd_complete_in_task() >> for the last blk-mq rq. >> >> -------------------------------------------- >> ublk loop target, the backend is a file. >> >> IOPS(k) >> >> type ucmd ucmd-batch >> seq-read 54.1 53.7 >> rand-read 52.0 52.0 >> >> -------------------------------------------- >> ublk null target >> IOPS(k) >> >> type ucmd ucmd-batch >> seq-read 272 265 >> rand-read 262 260 >> >> ------------ >> ublksrv test >> >> ------------- >> ucmd >> >> running loop/001 >> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... 
>> randwrite: jobs 1, iops 66737 >> randread: jobs 1, iops 64935 >> randrw: jobs 1, iops read 32694 write 32710 >> rw(512k): jobs 1, iops read 772 write 819 >> >> ------------- >> ucmd-batch >> >> running loop/001 >> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_F56a3), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... >> randwrite: jobs 1, iops 66720 >> randread: jobs 1, iops 65015 >> randrw: jobs 1, iops read 32743 write 32759 >> rw(512k): jobs 1, iops read 771 write 817 >> >> >> It seems that manually putting rqs into llist and calling >> io_uring_cmd_complete_in_task() while handling the last rq does >> not improve IOPS much. >> >> io_req_task_work_add() puts task_work node into a internal llist >> first, then it may call task_work_add() to run batched task_works. >> IMO, io_uring has already done such batch optimization and ublk_drv >> does not need to add such llist. > > The difference is just how batching is handled, looks blk-mq's batch info > doesn't matter any more. In my test, looks the perf improvement is mainly > made by enabling IORING_SETUP_COOP_TASKRUN in ublksrv. I guess only IORING_SETUP_COOP_TASKRUN helps improve IOPS. The llist in ublk_drv does not improve IOPS. > > Can you check if enabling IORING_SETUP_COOP_TASKRUN only can reach > same perf with task_work_add()(ublk_drv is builtin) when building > ublk_drv as module? > OK. Regards, Zhang ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands 2022-10-25 7:46 ` Ziyang Zhang @ 2022-10-25 8:43 ` Ziyang Zhang 2022-10-25 15:17 ` Ming Lei 0 siblings, 1 reply; 12+ messages in thread From: Ziyang Zhang @ 2022-10-25 8:43 UTC (permalink / raw) To: Ming Lei; +Cc: linux-block, Jens Axboe On 2022/10/25 15:46, Ziyang Zhang wrote: > On 2022/10/25 15:19, Ming Lei wrote: >> On Tue, Oct 25, 2022 at 11:15:57AM +0800, Ziyang Zhang wrote: >>> On 2022/10/24 21:20, Ming Lei wrote: >>>> Hello Ziyang, >>>> >>>> On Mon, Oct 24, 2022 at 05:48:51PM +0800, Ziyang Zhang wrote: >>>>> On 2022/10/23 17:38, Ming Lei wrote: >>>>>> task_work_add() is used for waking ubq daemon task with one batch >>>>>> of io requests/commands queued. However, task_work_add() isn't >>>>>> exported for module code, and it is still debatable if the symbol >>>>>> should be exported. >>>>>> >>>>>> Fortunately we still have io_uring_cmd_complete_in_task() which just >>>>>> can't handle batched wakeup for us. >>>>>> >>>>>> Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task() >>>>>> via current command for running them via task work. >>>>>> >>>>>> This way cleans up current code a lot, meantime allow us to wakeup >>>>>> ubq daemon task after queueing batched requests/io commands. >>>>>> >>>>> >>>>> >>>>> Hi, Ming >>>>> >>>>> This patch works and I have run some tests to compare current version(ucmd) >>>>> with your patch(ucmd-batch). >>>>> >>>>> iodepth=128 numjobs=1 direct=1 bs=4k >>>>> >>>>> -------------------------------------------- >>>>> ublk loop target, the backend is a file. 
>>>>> IOPS(k) >>>>> >>>>> type ucmd ucmd-batch >>>>> seq-read 54.7 54.2 >>>>> rand-read 52.8 52.0 >>>>> >>>>> -------------------------------------------- >>>>> ublk null target >>>>> IOPS(k) >>>>> >>>>> type ucmd ucmd-batch >>>>> seq-read 257 257 >>>>> rand-read 252 253 >>>>> >>>>> >>>>> I find that io_req_task_work_add() puts task_work node into a llist >>>>> first, then it may call task_work_add() to run batched task_works. So do we really >>>>> need such llist in ublk_drv? I think io_uring has already considered task_work batch >>>>> optimization. >>>>> >>>>> BTW, task_work_add() in ublk_drv achieves >>>>> higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task() >>>>> in ublk_drv. >>>> >>>> Yeah, that is same with my observation, and motivation of this patch is >>>> to get same performance with task_work_add by building ublk_drv as >>>> module. One win of task_work_add() is that we get exact batching info >>>> meantime only send TWA_SIGNAL_NO_IPI for whole batch, that is basically >>>> what the patch is doing, but needs help of the following ublksrv patch: >>>> >>>> https://github.com/ming1/ubdsrv/commit/dce6d1d222023c1641292713b311ced01e6dc548 >>>> >>>> which sets IORING_SETUP_COOP_TASKRUN for ublksrv's uring, then >>>> io_uring_cmd_complete_in_task will notify via TWA_SIGNAL_NO_IPI, and 5+% >>>> IOPS boost is observed on loop/001 by putting image on SSD in my test >>>> VM. >>> >>> Hi Ming, >>> >>> I have added this ublksrv patch and run the above test again. >>> I have also run ublksrv test: loop/001. Please check them. >>> >>> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores >>> 64GB MEM, CentOS 8, kernel 6.0+ >>> >>> -------- >>> fio test >>> >>> iodepth=128 numjobs=1 direct=1 bs=4k >>> >>> ucmd: without your kernel patch. Run io_uring_cmd_complete_in_task() >>> for each blk-mq rq. >>> >>> ucmd-batch: with your kernel patch. Run io_uring_cmd_complete_in_task() >>> for the last blk-mq rq. 
>>> >>> -------------------------------------------- >>> ublk loop target, the backend is a file. >>> >>> IOPS(k) >>> >>> type ucmd ucmd-batch >>> seq-read 54.1 53.7 >>> rand-read 52.0 52.0 >>> >>> -------------------------------------------- >>> ublk null target >>> IOPS(k) >>> >>> type ucmd ucmd-batch >>> seq-read 272 265 >>> rand-read 262 260 >>> >>> ------------ >>> ublksrv test >>> >>> ------------- >>> ucmd >>> >>> running loop/001 >>> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... >>> randwrite: jobs 1, iops 66737 >>> randread: jobs 1, iops 64935 >>> randrw: jobs 1, iops read 32694 write 32710 >>> rw(512k): jobs 1, iops read 772 write 819 >>> >>> ------------- >>> ucmd-batch >>> >>> running loop/001 >>> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_F56a3), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... >>> randwrite: jobs 1, iops 66720 >>> randread: jobs 1, iops 65015 >>> randrw: jobs 1, iops read 32743 write 32759 >>> rw(512k): jobs 1, iops read 771 write 817 >>> >>> >>> It seems that manually putting rqs into llist and calling >>> io_uring_cmd_complete_in_task() while handling the last rq does >>> not improve IOPS much. >>> >>> io_req_task_work_add() puts task_work node into a internal llist >>> first, then it may call task_work_add() to run batched task_works. >>> IMO, io_uring has already done such batch optimization and ublk_drv >>> does not need to add such llist. >> >> The difference is just how batching is handled, looks blk-mq's batch info >> doesn't matter any more. In my test, looks the perf improvement is mainly >> made by enabling IORING_SETUP_COOP_TASKRUN in ublksrv. > > I guess only IORING_SETUP_COOP_TASKRUN helps improve IOPS. The llist in > ublk_drv does not improve IOPS. 
> >> >> Can you check if enabling IORING_SETUP_COOP_TASKRUN only can reach >> same perf with task_work_add()(ublk_drv is builtin) when building >> ublk_drv as module? >> > > OK. > Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores 64GB MEM, CentOS 8, kernel 6.0+ with IORING_SETUP_COOP_TASKRUN, without this kernel patch ucmd: io_uring_cmd_complete_in_task(), ublk_drv is a module tw: task_work_add(), ublk is built-in. -------- fio test iodepth=128 numjobs=1 direct=1 bs=4k -------------------------------------------- ublk loop target, the backend is a file. IOPS(k) type ucmd tw seq-read 54.1 53.8 rand-read 52.0 52.0 -------------------------------------------- ublk null target IOPS(k) type ucmd tw seq-read 272 286 rand-read 262 278 ------------ ublksrv test ------------- ucmd running loop/001 fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... randwrite: jobs 1, iops 66737 randread: jobs 1, iops 64935 randrw: jobs 1, iops read 32694 write 32710 rw(512k): jobs 1, iops read 772 write 819 running null/001 fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... randwrite: jobs 1, iops 715863 randread: jobs 1, iops 758449 randrw: jobs 1, iops read 357407 write 357183 rw(512k): jobs 1, iops read 5895 write 5875 ------------- tw running loop/001 fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_pvLTL), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... randwrite: jobs 1, iops 66856 randread: jobs 1, iops 65015 randrw: jobs 1, iops read 32751 write 32767 rw(512k): jobs 1, iops read 776 write 823 running null/001 fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... randwrite: jobs 1, iops 739450 randread: jobs 1, iops 787500 randrw: jobs 1, iops read 372956 write 372831 rw(512k): jobs 1, iops read 5798 write 5777 ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands 2022-10-25 8:43 ` Ziyang Zhang @ 2022-10-25 15:17 ` Ming Lei 2022-10-26 10:32 ` Ziyang Zhang 0 siblings, 1 reply; 12+ messages in thread From: Ming Lei @ 2022-10-25 15:17 UTC (permalink / raw) To: Ziyang Zhang; +Cc: linux-block, Jens Axboe On Tue, Oct 25, 2022 at 04:43:56PM +0800, Ziyang Zhang wrote: > On 2022/10/25 15:46, Ziyang Zhang wrote: > > On 2022/10/25 15:19, Ming Lei wrote: > >> On Tue, Oct 25, 2022 at 11:15:57AM +0800, Ziyang Zhang wrote: > >>> On 2022/10/24 21:20, Ming Lei wrote: > >>>> Hello Ziyang, > >>>> > >>>> On Mon, Oct 24, 2022 at 05:48:51PM +0800, Ziyang Zhang wrote: > >>>>> On 2022/10/23 17:38, Ming Lei wrote: > >>>>>> task_work_add() is used for waking ubq daemon task with one batch > >>>>>> of io requests/commands queued. However, task_work_add() isn't > >>>>>> exported for module code, and it is still debatable if the symbol > >>>>>> should be exported. > >>>>>> > >>>>>> Fortunately we still have io_uring_cmd_complete_in_task() which just > >>>>>> can't handle batched wakeup for us. > >>>>>> > >>>>>> Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task() > >>>>>> via current command for running them via task work. > >>>>>> > >>>>>> This way cleans up current code a lot, meantime allow us to wakeup > >>>>>> ubq daemon task after queueing batched requests/io commands. > >>>>>> > >>>>> > >>>>> > >>>>> Hi, Ming > >>>>> > >>>>> This patch works and I have run some tests to compare current version(ucmd) > >>>>> with your patch(ucmd-batch). > >>>>> > >>>>> iodepth=128 numjobs=1 direct=1 bs=4k > >>>>> > >>>>> -------------------------------------------- > >>>>> ublk loop target, the backend is a file. 
> >>>>> IOPS(k) > >>>>> > >>>>> type ucmd ucmd-batch > >>>>> seq-read 54.7 54.2 > >>>>> rand-read 52.8 52.0 > >>>>> > >>>>> -------------------------------------------- > >>>>> ublk null target > >>>>> IOPS(k) > >>>>> > >>>>> type ucmd ucmd-batch > >>>>> seq-read 257 257 > >>>>> rand-read 252 253 > >>>>> > >>>>> > >>>>> I find that io_req_task_work_add() puts task_work node into a llist > >>>>> first, then it may call task_work_add() to run batched task_works. So do we really > >>>>> need such llist in ublk_drv? I think io_uring has already considered task_work batch > >>>>> optimization. > >>>>> > >>>>> BTW, task_work_add() in ublk_drv achieves > >>>>> higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task() > >>>>> in ublk_drv. > >>>> > >>>> Yeah, that is same with my observation, and motivation of this patch is > >>>> to get same performance with task_work_add by building ublk_drv as > >>>> module. One win of task_work_add() is that we get exact batching info > >>>> meantime only send TWA_SIGNAL_NO_IPI for whole batch, that is basically > >>>> what the patch is doing, but needs help of the following ublksrv patch: > >>>> > >>>> https://github.com/ming1/ubdsrv/commit/dce6d1d222023c1641292713b311ced01e6dc548 > >>>> > >>>> which sets IORING_SETUP_COOP_TASKRUN for ublksrv's uring, then > >>>> io_uring_cmd_complete_in_task will notify via TWA_SIGNAL_NO_IPI, and 5+% > >>>> IOPS boost is observed on loop/001 by putting image on SSD in my test > >>>> VM. > >>> > >>> Hi Ming, > >>> > >>> I have added this ublksrv patch and run the above test again. > >>> I have also run ublksrv test: loop/001. Please check them. > >>> > >>> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores > >>> 64GB MEM, CentOS 8, kernel 6.0+ > >>> > >>> -------- > >>> fio test > >>> > >>> iodepth=128 numjobs=1 direct=1 bs=4k > >>> > >>> ucmd: without your kernel patch. Run io_uring_cmd_complete_in_task() > >>> for each blk-mq rq. > >>> > >>> ucmd-batch: with your kernel patch. 
Run io_uring_cmd_complete_in_task() > >>> for the last blk-mq rq. > >>> > >>> -------------------------------------------- > >>> ublk loop target, the backend is a file. > >>> > >>> IOPS(k) > >>> > >>> type ucmd ucmd-batch > >>> seq-read 54.1 53.7 > >>> rand-read 52.0 52.0 > >>> > >>> -------------------------------------------- > >>> ublk null target > >>> IOPS(k) > >>> > >>> type ucmd ucmd-batch > >>> seq-read 272 265 > >>> rand-read 262 260 > >>> > >>> ------------ > >>> ublksrv test > >>> > >>> ------------- > >>> ucmd > >>> > >>> running loop/001 > >>> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... > >>> randwrite: jobs 1, iops 66737 > >>> randread: jobs 1, iops 64935 > >>> randrw: jobs 1, iops read 32694 write 32710 > >>> rw(512k): jobs 1, iops read 772 write 819 > >>> > >>> ------------- > >>> ucmd-batch > >>> > >>> running loop/001 > >>> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_F56a3), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... > >>> randwrite: jobs 1, iops 66720 > >>> randread: jobs 1, iops 65015 > >>> randrw: jobs 1, iops read 32743 write 32759 > >>> rw(512k): jobs 1, iops read 771 write 817 > >>> > >>> > >>> It seems that manually putting rqs into llist and calling > >>> io_uring_cmd_complete_in_task() while handling the last rq does > >>> not improve IOPS much. > >>> > >>> io_req_task_work_add() puts task_work node into a internal llist > >>> first, then it may call task_work_add() to run batched task_works. > >>> IMO, io_uring has already done such batch optimization and ublk_drv > >>> does not need to add such llist. > >> > >> The difference is just how batching is handled, looks blk-mq's batch info > >> doesn't matter any more. In my test, looks the perf improvement is mainly > >> made by enabling IORING_SETUP_COOP_TASKRUN in ublksrv. > > > > I guess only IORING_SETUP_COOP_TASKRUN helps improve IOPS. 
The llist in > > ublk_drv does not improve IOPS. > > > >> > >> Can you check if enabling IORING_SETUP_COOP_TASKRUN only can reach > >> same perf with task_work_add()(ublk_drv is builtin) when building > >> ublk_drv as module? > >> > > > > OK. > > > > Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores > 64GB MEM, CentOS 8, kernel 6.0+ > with IORING_SETUP_COOP_TASKRUN, without this kernel patch > > ucmd: io_uring_cmd_complete_in_task(), ublk_drv is a module > tw: task_work_add(), ublk is built-in. > > > -------- > fio test > > iodepth=128 numjobs=1 direct=1 bs=4k > > -------------------------------------------- > ublk loop target, the backend is a file. > > IOPS(k) > > type ucmd tw > seq-read 54.1 53.8 > rand-read 52.0 52.0 > > -------------------------------------------- > ublk null target > IOPS(k) > > type ucmd tw > seq-read 272 286 > rand-read 262 278 > > > ------------ > ublksrv test > > ------------- > ucmd > > running loop/001 > fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... > randwrite: jobs 1, iops 66737 > randread: jobs 1, iops 64935 > randrw: jobs 1, iops read 32694 write 32710 > rw(512k): jobs 1, iops read 772 write 819 > > running null/001 > fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... > randwrite: jobs 1, iops 715863 > randread: jobs 1, iops 758449 > randrw: jobs 1, iops read 357407 write 357183 > rw(512k): jobs 1, iops read 5895 write 5875 > > ------------- > tw > > running loop/001 > fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_pvLTL), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... > randwrite: jobs 1, iops 66856 > randread: jobs 1, iops 65015 > randrw: jobs 1, iops read 32751 write 32767 > rw(512k): jobs 1, iops read 776 write 823 > > running null/001 > fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... 
> randwrite: jobs 1, iops 739450 > randread: jobs 1, iops 787500 > randrw: jobs 1, iops read 372956 write 372831 > rw(512k): jobs 1, iops read 5798 write 5777 Looks the gap isn't big between ucmd and tw when running null/001, in which the fio io process should saturate the CPU. Probably we should avoid to touch 'cmd'/'pdu'/'io' in ublk_queue_rq() since these data should be cold at that time. Can you apply the following delta patch against the current patch( "ublk_drv: don't call task_work_add for queueing io commands") and compare with task_work_add()? From ecbbf6d10dbc63e279ce0b55c45da6721947f18d Mon Sep 17 00:00:00 2001 From: Ming Lei <ming.lei@redhat.com> Date: Tue, 25 Oct 2022 11:01:25 -0400 Subject: [PATCH] ublk: follow up change Signed-off-by: Ming Lei <ming.lei@redhat.com> --- drivers/block/ublk_drv.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 7963fba66dd1..18db337094c1 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -56,9 +56,12 @@ /* All UBLK_PARAM_TYPE_* should be included here */ #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD) +struct ublk_rq_data { + struct llist_node node; +}; + struct ublk_uring_cmd_pdu { struct request *req; - struct llist_node node; }; /* @@ -693,7 +696,8 @@ static inline void __ublk_rq_task_work(struct request *req) * * (2) current->flags & PF_EXITING. 
*/ - if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) { + if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING + || (io->flags & UBLK_IO_FLAG_ABORTED))) { __ublk_abort_rq(ubq, req); return; } @@ -757,11 +761,12 @@ static void ublk_rqs_task_work_cb(struct io_uring_cmd *cmd) struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); struct ublk_queue *ubq = pdu->req->mq_hctx->driver_data; struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds); + struct ublk_rq_data *data; __ublk_rq_task_work(pdu->req); - llist_for_each_entry(pdu, io_cmds, node) - __ublk_rq_task_work(pdu->req); + llist_for_each_entry(data, io_cmds, node) + __ublk_rq_task_work(blk_mq_rq_from_pdu(data)); } static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, @@ -769,9 +774,6 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, { struct ublk_queue *ubq = hctx->driver_data; struct request *rq = bd->rq; - struct ublk_io *io = &ubq->ios[rq->tag]; - struct io_uring_cmd *cmd = io->cmd; - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); blk_status_t res; /* fill iod to slot in io cmd buffer */ @@ -805,14 +807,11 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED * guarantees that here is a re-issued request aborted previously. 
*/ - if (unlikely(ubq_daemon_is_dying(ubq) || - (io->flags & UBLK_IO_FLAG_ABORTED))) { + if (unlikely(ubq_daemon_is_dying(ubq))) { __ublk_abort_rq(ubq, rq); return BLK_STS_OK; } - pdu->req = rq; - /* * Typical multiple producers and single consumer, it is just fine * to use llist_add() in producer side and llist_del_all() in @@ -821,10 +820,18 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, * The last command can't be added into list, otherwise it could * be handled twice */ - if (bd->last) + if (bd->last) { + struct ublk_io *io = &ubq->ios[rq->tag]; + struct io_uring_cmd *cmd = io->cmd; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + + pdu->req = rq; io_uring_cmd_complete_in_task(cmd, ublk_rqs_task_work_cb); - else - llist_add(&pdu->node, &ubq->io_cmds); + } else { + struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq); + + llist_add(&data->node, &ubq->io_cmds); + } return BLK_STS_OK; } @@ -1426,6 +1433,7 @@ static int ublk_add_tag_set(struct ublk_device *ub) ub->tag_set.queue_depth = ub->dev_info.queue_depth; ub->tag_set.numa_node = NUMA_NO_NODE; ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + ub->tag_set.cmd_size = sizeof(struct ublk_rq_data); ub->tag_set.driver_data = ub; return blk_mq_alloc_tag_set(&ub->tag_set); } -- 2.31.1 Thanks, Ming ^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands 2022-10-25 15:17 ` Ming Lei @ 2022-10-26 10:32 ` Ziyang Zhang 2022-10-26 11:29 ` Ming Lei 0 siblings, 1 reply; 12+ messages in thread From: Ziyang Zhang @ 2022-10-26 10:32 UTC (permalink / raw) To: Ming Lei; +Cc: linux-block, Jens Axboe On 2022/10/25 23:17, Ming Lei wrote: > On Tue, Oct 25, 2022 at 04:43:56PM +0800, Ziyang Zhang wrote: >> On 2022/10/25 15:46, Ziyang Zhang wrote: >>> On 2022/10/25 15:19, Ming Lei wrote: >>>> On Tue, Oct 25, 2022 at 11:15:57AM +0800, Ziyang Zhang wrote: >>>>> On 2022/10/24 21:20, Ming Lei wrote: >>>>>> Hello Ziyang, >>>>>> >>>>>> On Mon, Oct 24, 2022 at 05:48:51PM +0800, Ziyang Zhang wrote: >>>>>>> On 2022/10/23 17:38, Ming Lei wrote: >>>>>>>> task_work_add() is used for waking ubq daemon task with one batch >>>>>>>> of io requests/commands queued. However, task_work_add() isn't >>>>>>>> exported for module code, and it is still debatable if the symbol >>>>>>>> should be exported. >>>>>>>> >>>>>>>> Fortunately we still have io_uring_cmd_complete_in_task() which just >>>>>>>> can't handle batched wakeup for us. >>>>>>>> >>>>>>>> Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task() >>>>>>>> via current command for running them via task work. >>>>>>>> >>>>>>>> This way cleans up current code a lot, meantime allow us to wakeup >>>>>>>> ubq daemon task after queueing batched requests/io commands. >>>>>>>> >>>>>>> >>>>>>> >>>>>>> Hi, Ming >>>>>>> >>>>>>> This patch works and I have run some tests to compare current version(ucmd) >>>>>>> with your patch(ucmd-batch). >>>>>>> >>>>>>> iodepth=128 numjobs=1 direct=1 bs=4k >>>>>>> >>>>>>> -------------------------------------------- >>>>>>> ublk loop target, the backend is a file. 
>>>>>>> IOPS(k) >>>>>>> >>>>>>> type ucmd ucmd-batch >>>>>>> seq-read 54.7 54.2 >>>>>>> rand-read 52.8 52.0 >>>>>>> >>>>>>> -------------------------------------------- >>>>>>> ublk null target >>>>>>> IOPS(k) >>>>>>> >>>>>>> type ucmd ucmd-batch >>>>>>> seq-read 257 257 >>>>>>> rand-read 252 253 >>>>>>> >>>>>>> >>>>>>> I find that io_req_task_work_add() puts task_work node into a llist >>>>>>> first, then it may call task_work_add() to run batched task_works. So do we really >>>>>>> need such llist in ublk_drv? I think io_uring has already considered task_work batch >>>>>>> optimization. >>>>>>> >>>>>>> BTW, task_work_add() in ublk_drv achieves >>>>>>> higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task() >>>>>>> in ublk_drv. >>>>>> >>>>>> Yeah, that is same with my observation, and motivation of this patch is >>>>>> to get same performance with task_work_add by building ublk_drv as >>>>>> module. One win of task_work_add() is that we get exact batching info >>>>>> meantime only send TWA_SIGNAL_NO_IPI for whole batch, that is basically >>>>>> what the patch is doing, but needs help of the following ublksrv patch: >>>>>> >>>>>> https://github.com/ming1/ubdsrv/commit/dce6d1d222023c1641292713b311ced01e6dc548 >>>>>> >>>>>> which sets IORING_SETUP_COOP_TASKRUN for ublksrv's uring, then >>>>>> io_uring_cmd_complete_in_task will notify via TWA_SIGNAL_NO_IPI, and 5+% >>>>>> IOPS boost is observed on loop/001 by putting image on SSD in my test >>>>>> VM. >>>>> >>>>> Hi Ming, >>>>> >>>>> I have added this ublksrv patch and run the above test again. >>>>> I have also run ublksrv test: loop/001. Please check them. >>>>> >>>>> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores >>>>> 64GB MEM, CentOS 8, kernel 6.0+ >>>>> >>>>> -------- >>>>> fio test >>>>> >>>>> iodepth=128 numjobs=1 direct=1 bs=4k >>>>> >>>>> ucmd: without your kernel patch. Run io_uring_cmd_complete_in_task() >>>>> for each blk-mq rq. >>>>> >>>>> ucmd-batch: with your kernel patch. 
Run io_uring_cmd_complete_in_task() >>>>> for the last blk-mq rq. >>>>> >>>>> -------------------------------------------- >>>>> ublk loop target, the backend is a file. >>>>> >>>>> IOPS(k) >>>>> >>>>> type ucmd ucmd-batch >>>>> seq-read 54.1 53.7 >>>>> rand-read 52.0 52.0 >>>>> >>>>> -------------------------------------------- >>>>> ublk null target >>>>> IOPS(k) >>>>> >>>>> type ucmd ucmd-batch >>>>> seq-read 272 265 >>>>> rand-read 262 260 >>>>> >>>>> ------------ >>>>> ublksrv test >>>>> >>>>> ------------- >>>>> ucmd >>>>> >>>>> running loop/001 >>>>> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... >>>>> randwrite: jobs 1, iops 66737 >>>>> randread: jobs 1, iops 64935 >>>>> randrw: jobs 1, iops read 32694 write 32710 >>>>> rw(512k): jobs 1, iops read 772 write 819 >>>>> >>>>> ------------- >>>>> ucmd-batch >>>>> >>>>> running loop/001 >>>>> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_F56a3), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... >>>>> randwrite: jobs 1, iops 66720 >>>>> randread: jobs 1, iops 65015 >>>>> randrw: jobs 1, iops read 32743 write 32759 >>>>> rw(512k): jobs 1, iops read 771 write 817 >>>>> >>>>> >>>>> It seems that manually putting rqs into llist and calling >>>>> io_uring_cmd_complete_in_task() while handling the last rq does >>>>> not improve IOPS much. >>>>> >>>>> io_req_task_work_add() puts task_work node into a internal llist >>>>> first, then it may call task_work_add() to run batched task_works. >>>>> IMO, io_uring has already done such batch optimization and ublk_drv >>>>> does not need to add such llist. >>>> >>>> The difference is just how batching is handled, looks blk-mq's batch info >>>> doesn't matter any more. In my test, looks the perf improvement is mainly >>>> made by enabling IORING_SETUP_COOP_TASKRUN in ublksrv. >>> >>> I guess only IORING_SETUP_COOP_TASKRUN helps improve IOPS. 
The llist in >>> ublk_drv does not improve IOPS. >>> >>>> >>>> Can you check if enabling IORING_SETUP_COOP_TASKRUN only can reach >>>> same perf with task_work_add()(ublk_drv is builtin) when building >>>> ublk_drv as module? >>>> >>> >>> OK. >>> >> >> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores >> 64GB MEM, CentOS 8, kernel 6.0+ >> with IORING_SETUP_COOP_TASKRUN, without this kernel patch >> >> ucmd: io_uring_cmd_complete_in_task(), ublk_drv is a module >> tw: task_work_add(), ublk is built-in. >> >> >> -------- >> fio test >> >> iodepth=128 numjobs=1 direct=1 bs=4k >> >> -------------------------------------------- >> ublk loop target, the backend is a file. >> >> IOPS(k) >> >> type ucmd tw >> seq-read 54.1 53.8 >> rand-read 52.0 52.0 >> >> -------------------------------------------- >> ublk null target >> IOPS(k) >> >> type ucmd tw >> seq-read 272 286 >> rand-read 262 278 >> >> >> ------------ >> ublksrv test >> >> ------------- >> ucmd >> >> running loop/001 >> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... >> randwrite: jobs 1, iops 66737 >> randread: jobs 1, iops 64935 >> randrw: jobs 1, iops read 32694 write 32710 >> rw(512k): jobs 1, iops read 772 write 819 >> >> running null/001 >> fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... >> randwrite: jobs 1, iops 715863 >> randread: jobs 1, iops 758449 >> randrw: jobs 1, iops read 357407 write 357183 >> rw(512k): jobs 1, iops read 5895 write 5875 >> >> ------------- >> tw >> >> running loop/001 >> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_pvLTL), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... 
>> randwrite: jobs 1, iops 66856 >> randread: jobs 1, iops 65015 >> randrw: jobs 1, iops read 32751 write 32767 >> rw(512k): jobs 1, iops read 776 write 823 >> >> running null/001 >> fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... >> randwrite: jobs 1, iops 739450 >> randread: jobs 1, iops 787500 >> randrw: jobs 1, iops read 372956 write 372831 >> rw(512k): jobs 1, iops read 5798 write 5777 > > Looks the gap isn't big between ucmd and tw when running null/001, in > which the fio io process should saturate the CPU. Probably we > should avoid to touch 'cmd'/'pdu'/'io' in ublk_queue_rq() since these > data should be cold at that time. > > Can you apply the following delta patch against the current patch( > "ublk_drv: don't call task_work_add for queueing io commands") and > compare with task_work_add()? > > From ecbbf6d10dbc63e279ce0b55c45da6721947f18d Mon Sep 17 00:00:00 2001 > From: Ming Lei <ming.lei@redhat.com> > Date: Tue, 25 Oct 2022 11:01:25 -0400 > Subject: [PATCH] ublk: follow up change > > Signed-off-by: Ming Lei <ming.lei@redhat.com> > --- > drivers/block/ublk_drv.c | 36 ++++++++++++++++++++++-------------- > 1 file changed, 22 insertions(+), 14 deletions(-) > > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c > index 7963fba66dd1..18db337094c1 100644 > --- a/drivers/block/ublk_drv.c > +++ b/drivers/block/ublk_drv.c > @@ -56,9 +56,12 @@ > /* All UBLK_PARAM_TYPE_* should be included here */ > #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD) > > +struct ublk_rq_data { > + struct llist_node node; > +}; > + > struct ublk_uring_cmd_pdu { > struct request *req; > - struct llist_node node; > }; > > /* > @@ -693,7 +696,8 @@ static inline void __ublk_rq_task_work(struct request *req) > * > * (2) current->flags & PF_EXITING. 
> */ > - if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) { > + if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING > + || (io->flags & UBLK_IO_FLAG_ABORTED))) { > __ublk_abort_rq(ubq, req); > return; We cannot check io->flags & UBLK_IO_FLAG_ABORTED in io_uring task_work. We have to check io->flags & UBLK_IO_FLAG_ABORTED in ublk_queue_rq() because io_uring ctx may be freed at that time and ioucmd task_work is invalid(freed). Please See comment around it for more detail. > } > @@ -757,11 +761,12 @@ static void ublk_rqs_task_work_cb(struct io_uring_cmd *cmd) > struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); > struct ublk_queue *ubq = pdu->req->mq_hctx->driver_data; > struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds); > + struct ublk_rq_data *data; > > __ublk_rq_task_work(pdu->req); > > - llist_for_each_entry(pdu, io_cmds, node) > - __ublk_rq_task_work(pdu->req); > + llist_for_each_entry(data, io_cmds, node) > + __ublk_rq_task_work(blk_mq_rq_from_pdu(data)); > } > > static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, > @@ -769,9 +774,6 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, > { > struct ublk_queue *ubq = hctx->driver_data; > struct request *rq = bd->rq; > - struct ublk_io *io = &ubq->ios[rq->tag]; > - struct io_uring_cmd *cmd = io->cmd; > - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); > blk_status_t res; > > /* fill iod to slot in io cmd buffer */ > @@ -805,14 +807,11 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, > * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED > * guarantees that here is a re-issued request aborted previously. 
> */ > - if (unlikely(ubq_daemon_is_dying(ubq) || > - (io->flags & UBLK_IO_FLAG_ABORTED))) { > + if (unlikely(ubq_daemon_is_dying(ubq))) { > __ublk_abort_rq(ubq, rq); > return BLK_STS_OK; > } > > - pdu->req = rq; > - > /* > * Typical multiple producers and single consumer, it is just fine > * to use llist_add() in producer side and llist_del_all() in > @@ -821,10 +820,18 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, > * The last command can't be added into list, otherwise it could > * be handled twice > */ > - if (bd->last) > + if (bd->last) { > + struct ublk_io *io = &ubq->ios[rq->tag]; > + struct io_uring_cmd *cmd = io->cmd; > + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); > + > + pdu->req = rq; > io_uring_cmd_complete_in_task(cmd, ublk_rqs_task_work_cb); > - else > - llist_add(&pdu->node, &ubq->io_cmds); > + } else { > + struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq); > + > + llist_add(&data->node, &ubq->io_cmds); > + } > > return BLK_STS_OK; > } > @@ -1426,6 +1433,7 @@ static int ublk_add_tag_set(struct ublk_device *ub) > ub->tag_set.queue_depth = ub->dev_info.queue_depth; > ub->tag_set.numa_node = NUMA_NO_NODE; > ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; > + ub->tag_set.cmd_size = sizeof(struct ublk_rq_data); > ub->tag_set.driver_data = ub; > return blk_mq_alloc_tag_set(&ub->tag_set); > } Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores 64GB MEM, CentOS 8, kernel 6.0+ with IORING_SETUP_COOP_TASKRUN, without this kernel patch ucmd: io_uring_cmd_complete_in_task(), ublk_drv is a module ucmd-not-touch-pdu: use llist && do not touch 'cmd'/'pdu'/'io' in ublk_queue_rq() tw: task_work_add(), ublk is built-in. -------- fio test iodepth=128 numjobs=1 direct=1 bs=4k -------------------------------------------- ublk loop target, the backend is a file. 
IOPS(k) type ucmd tw ucmd-not-touch-pdu seq-read 54.1 53.8 53.6 rand-read 52.0 52.0 52.0 -------------------------------------------- ublk null target IOPS(k) type ucmd tw ucmd-not-touch-pdu seq-read 272 286 275 rand-read 262 278 269 ------------ ublksrv test ------------- ucmd running loop/001 fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... randwrite: jobs 1, iops 66737 randread: jobs 1, iops 64935 randrw: jobs 1, iops read 32694 write 32710 rw(512k): jobs 1, iops read 772 write 819 running null/001 fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... randwrite: jobs 1, iops 715863 randread: jobs 1, iops 758449 randrw: jobs 1, iops read 357407 write 357183 rw(512k): jobs 1, iops read 5895 write 5875 ------------- tw running loop/001 fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_pvLTL), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... randwrite: jobs 1, iops 66856 randread: jobs 1, iops 65015 randrw: jobs 1, iops read 32751 write 32767 rw(512k): jobs 1, iops read 776 write 823 running null/001 fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... randwrite: jobs 1, iops 739450 randread: jobs 1, iops 787500 randrw: jobs 1, iops read 372956 write 372831 rw(512k): jobs 1, iops read 5798 write 5777 ------------- ucmd-not-touch-pdu running loop/001 fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_oH0eG), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... randwrite: jobs 1, iops 66754 randread: jobs 1, iops 65032 randrw: jobs 1, iops read 32776 write 32792 rw(512k): jobs 1, iops read 772 write 818 running null/001 fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... 
randwrite: jobs 1, iops 725334 randread: jobs 1, iops 741105 randrw: jobs 1, iops read 360285 write 360047 rw(512k): jobs 1, iops read 5770 write 5748 Not touching cmd/pdu/io in ublk_queue_rq() improves IOPS. But it is worse than using task_work_add(). Regards, Zhang ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands 2022-10-26 10:32 ` Ziyang Zhang @ 2022-10-26 11:29 ` Ming Lei 2022-10-27 3:00 ` Ziyang Zhang 0 siblings, 1 reply; 12+ messages in thread From: Ming Lei @ 2022-10-26 11:29 UTC (permalink / raw) To: Ziyang Zhang; +Cc: linux-block, Jens Axboe On Wed, Oct 26, 2022 at 06:32:46PM +0800, Ziyang Zhang wrote: > On 2022/10/25 23:17, Ming Lei wrote: > > On Tue, Oct 25, 2022 at 04:43:56PM +0800, Ziyang Zhang wrote: > >> On 2022/10/25 15:46, Ziyang Zhang wrote: > >>> On 2022/10/25 15:19, Ming Lei wrote: > >>>> On Tue, Oct 25, 2022 at 11:15:57AM +0800, Ziyang Zhang wrote: > >>>>> On 2022/10/24 21:20, Ming Lei wrote: > >>>>>> Hello Ziyang, > >>>>>> > >>>>>> On Mon, Oct 24, 2022 at 05:48:51PM +0800, Ziyang Zhang wrote: > >>>>>>> On 2022/10/23 17:38, Ming Lei wrote: > >>>>>>>> task_work_add() is used for waking ubq daemon task with one batch > >>>>>>>> of io requests/commands queued. However, task_work_add() isn't > >>>>>>>> exported for module code, and it is still debatable if the symbol > >>>>>>>> should be exported. > >>>>>>>> > >>>>>>>> Fortunately we still have io_uring_cmd_complete_in_task() which just > >>>>>>>> can't handle batched wakeup for us. > >>>>>>>> > >>>>>>>> Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task() > >>>>>>>> via current command for running them via task work. > >>>>>>>> > >>>>>>>> This way cleans up current code a lot, meantime allow us to wakeup > >>>>>>>> ubq daemon task after queueing batched requests/io commands. > >>>>>>>> > >>>>>>> > >>>>>>> > >>>>>>> Hi, Ming > >>>>>>> > >>>>>>> This patch works and I have run some tests to compare current version(ucmd) > >>>>>>> with your patch(ucmd-batch). > >>>>>>> > >>>>>>> iodepth=128 numjobs=1 direct=1 bs=4k > >>>>>>> > >>>>>>> -------------------------------------------- > >>>>>>> ublk loop target, the backend is a file. 
> >>>>>>> IOPS(k) > >>>>>>> > >>>>>>> type ucmd ucmd-batch > >>>>>>> seq-read 54.7 54.2 > >>>>>>> rand-read 52.8 52.0 > >>>>>>> > >>>>>>> -------------------------------------------- > >>>>>>> ublk null target > >>>>>>> IOPS(k) > >>>>>>> > >>>>>>> type ucmd ucmd-batch > >>>>>>> seq-read 257 257 > >>>>>>> rand-read 252 253 > >>>>>>> > >>>>>>> > >>>>>>> I find that io_req_task_work_add() puts task_work node into a llist > >>>>>>> first, then it may call task_work_add() to run batched task_works. So do we really > >>>>>>> need such llist in ublk_drv? I think io_uring has already considered task_work batch > >>>>>>> optimization. > >>>>>>> > >>>>>>> BTW, task_work_add() in ublk_drv achieves > >>>>>>> higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task() > >>>>>>> in ublk_drv. > >>>>>> > >>>>>> Yeah, that is same with my observation, and motivation of this patch is > >>>>>> to get same performance with task_work_add by building ublk_drv as > >>>>>> module. One win of task_work_add() is that we get exact batching info > >>>>>> meantime only send TWA_SIGNAL_NO_IPI for whole batch, that is basically > >>>>>> what the patch is doing, but needs help of the following ublksrv patch: > >>>>>> > >>>>>> https://github.com/ming1/ubdsrv/commit/dce6d1d222023c1641292713b311ced01e6dc548 > >>>>>> > >>>>>> which sets IORING_SETUP_COOP_TASKRUN for ublksrv's uring, then > >>>>>> io_uring_cmd_complete_in_task will notify via TWA_SIGNAL_NO_IPI, and 5+% > >>>>>> IOPS boost is observed on loop/001 by putting image on SSD in my test > >>>>>> VM. > >>>>> > >>>>> Hi Ming, > >>>>> > >>>>> I have added this ublksrv patch and run the above test again. > >>>>> I have also run ublksrv test: loop/001. Please check them. > >>>>> > >>>>> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores > >>>>> 64GB MEM, CentOS 8, kernel 6.0+ > >>>>> > >>>>> -------- > >>>>> fio test > >>>>> > >>>>> iodepth=128 numjobs=1 direct=1 bs=4k > >>>>> > >>>>> ucmd: without your kernel patch. 
Run io_uring_cmd_complete_in_task() > >>>>> for each blk-mq rq. > >>>>> > >>>>> ucmd-batch: with your kernel patch. Run io_uring_cmd_complete_in_task() > >>>>> for the last blk-mq rq. > >>>>> > >>>>> -------------------------------------------- > >>>>> ublk loop target, the backend is a file. > >>>>> > >>>>> IOPS(k) > >>>>> > >>>>> type ucmd ucmd-batch > >>>>> seq-read 54.1 53.7 > >>>>> rand-read 52.0 52.0 > >>>>> > >>>>> -------------------------------------------- > >>>>> ublk null target > >>>>> IOPS(k) > >>>>> > >>>>> type ucmd ucmd-batch > >>>>> seq-read 272 265 > >>>>> rand-read 262 260 > >>>>> > >>>>> ------------ > >>>>> ublksrv test > >>>>> > >>>>> ------------- > >>>>> ucmd > >>>>> > >>>>> running loop/001 > >>>>> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... > >>>>> randwrite: jobs 1, iops 66737 > >>>>> randread: jobs 1, iops 64935 > >>>>> randrw: jobs 1, iops read 32694 write 32710 > >>>>> rw(512k): jobs 1, iops read 772 write 819 > >>>>> > >>>>> ------------- > >>>>> ucmd-batch > >>>>> > >>>>> running loop/001 > >>>>> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_F56a3), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... > >>>>> randwrite: jobs 1, iops 66720 > >>>>> randread: jobs 1, iops 65015 > >>>>> randrw: jobs 1, iops read 32743 write 32759 > >>>>> rw(512k): jobs 1, iops read 771 write 817 > >>>>> > >>>>> > >>>>> It seems that manually putting rqs into llist and calling > >>>>> io_uring_cmd_complete_in_task() while handling the last rq does > >>>>> not improve IOPS much. > >>>>> > >>>>> io_req_task_work_add() puts task_work node into a internal llist > >>>>> first, then it may call task_work_add() to run batched task_works. > >>>>> IMO, io_uring has already done such batch optimization and ublk_drv > >>>>> does not need to add such llist. 
> >>>> > >>>> The difference is just how batching is handled, looks blk-mq's batch info > >>>> doesn't matter any more. In my test, looks the perf improvement is mainly > >>>> made by enabling IORING_SETUP_COOP_TASKRUN in ublksrv. > >>> > >>> I guess only IORING_SETUP_COOP_TASKRUN helps improve IOPS. The llist in > >>> ublk_drv does not improve IOPS. > >>> > >>>> > >>>> Can you check if enabling IORING_SETUP_COOP_TASKRUN only can reach > >>>> same perf with task_work_add()(ublk_drv is builtin) when building > >>>> ublk_drv as module? > >>>> > >>> > >>> OK. > >>> > >> > >> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores > >> 64GB MEM, CentOS 8, kernel 6.0+ > >> with IORING_SETUP_COOP_TASKRUN, without this kernel patch > >> > >> ucmd: io_uring_cmd_complete_in_task(), ublk_drv is a module > >> tw: task_work_add(), ublk is built-in. > >> > >> > >> -------- > >> fio test > >> > >> iodepth=128 numjobs=1 direct=1 bs=4k > >> > >> -------------------------------------------- > >> ublk loop target, the backend is a file. > >> > >> IOPS(k) > >> > >> type ucmd tw > >> seq-read 54.1 53.8 > >> rand-read 52.0 52.0 > >> > >> -------------------------------------------- > >> ublk null target > >> IOPS(k) > >> > >> type ucmd tw > >> seq-read 272 286 > >> rand-read 262 278 > >> > >> > >> ------------ > >> ublksrv test > >> > >> ------------- > >> ucmd > >> > >> running loop/001 > >> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... > >> randwrite: jobs 1, iops 66737 > >> randread: jobs 1, iops 64935 > >> randrw: jobs 1, iops read 32694 write 32710 > >> rw(512k): jobs 1, iops read 772 write 819 > >> > >> running null/001 > >> fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... 
> >> randwrite: jobs 1, iops 715863 > >> randread: jobs 1, iops 758449 > >> randrw: jobs 1, iops read 357407 write 357183 > >> rw(512k): jobs 1, iops read 5895 write 5875 > >> > >> ------------- > >> tw > >> > >> running loop/001 > >> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_pvLTL), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... > >> randwrite: jobs 1, iops 66856 > >> randread: jobs 1, iops 65015 > >> randrw: jobs 1, iops read 32751 write 32767 > >> rw(512k): jobs 1, iops read 776 write 823 > >> > >> running null/001 > >> fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... > >> randwrite: jobs 1, iops 739450 > >> randread: jobs 1, iops 787500 > >> randrw: jobs 1, iops read 372956 write 372831 > >> rw(512k): jobs 1, iops read 5798 write 5777 > > > > Looks the gap isn't big between ucmd and tw when running null/001, in > > which the fio io process should saturate the CPU. Probably we > > should avoid to touch 'cmd'/'pdu'/'io' in ublk_queue_rq() since these > > data should be cold at that time. > > > > Can you apply the following delta patch against the current patch( > > "ublk_drv: don't call task_work_add for queueing io commands") and > > compare with task_work_add()? 
> > > > From ecbbf6d10dbc63e279ce0b55c45da6721947f18d Mon Sep 17 00:00:00 2001 > > From: Ming Lei <ming.lei@redhat.com> > > Date: Tue, 25 Oct 2022 11:01:25 -0400 > > Subject: [PATCH] ublk: follow up change > > > > Signed-off-by: Ming Lei <ming.lei@redhat.com> > > --- > > drivers/block/ublk_drv.c | 36 ++++++++++++++++++++++-------------- > > 1 file changed, 22 insertions(+), 14 deletions(-) > > > > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c > > index 7963fba66dd1..18db337094c1 100644 > > --- a/drivers/block/ublk_drv.c > > +++ b/drivers/block/ublk_drv.c > > @@ -56,9 +56,12 @@ > > /* All UBLK_PARAM_TYPE_* should be included here */ > > #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD) > > > > +struct ublk_rq_data { > > + struct llist_node node; > > +}; > > + > > struct ublk_uring_cmd_pdu { > > struct request *req; > > - struct llist_node node; > > }; > > > > /* > > @@ -693,7 +696,8 @@ static inline void __ublk_rq_task_work(struct request *req) > > * > > * (2) current->flags & PF_EXITING. > > */ > > - if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) { > > + if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING > > + || (io->flags & UBLK_IO_FLAG_ABORTED))) { > > __ublk_abort_rq(ubq, req); > > return; > > We cannot check io->flags & UBLK_IO_FLAG_ABORTED in io_uring task_work. > We have to check io->flags & UBLK_IO_FLAG_ABORTED in ublk_queue_rq() because > io_uring ctx may be freed at that time and ioucmd task_work is invalid(freed). > Please See comment around it for more detail. The comment is obsolete, because all inflight requests are drained before io_uring is dead, please see __ublk_quiesce_dev(). 
> > > } > > @@ -757,11 +761,12 @@ static void ublk_rqs_task_work_cb(struct io_uring_cmd *cmd) > > struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); > > struct ublk_queue *ubq = pdu->req->mq_hctx->driver_data; > > struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds); > > + struct ublk_rq_data *data; > > > > __ublk_rq_task_work(pdu->req); > > > > - llist_for_each_entry(pdu, io_cmds, node) > > - __ublk_rq_task_work(pdu->req); > > + llist_for_each_entry(data, io_cmds, node) > > + __ublk_rq_task_work(blk_mq_rq_from_pdu(data)); > > } > > > > static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, > > @@ -769,9 +774,6 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, > > { > > struct ublk_queue *ubq = hctx->driver_data; > > struct request *rq = bd->rq; > > - struct ublk_io *io = &ubq->ios[rq->tag]; > > - struct io_uring_cmd *cmd = io->cmd; > > - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); > > blk_status_t res; > > > > /* fill iod to slot in io cmd buffer */ > > @@ -805,14 +807,11 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, > > * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED > > * guarantees that here is a re-issued request aborted previously. 
> > */ > > - if (unlikely(ubq_daemon_is_dying(ubq) || > > - (io->flags & UBLK_IO_FLAG_ABORTED))) { > > + if (unlikely(ubq_daemon_is_dying(ubq))) { > > __ublk_abort_rq(ubq, rq); > > return BLK_STS_OK; > > } > > > > - pdu->req = rq; > > - > > /* > > * Typical multiple producers and single consumer, it is just fine > > * to use llist_add() in producer side and llist_del_all() in > > @@ -821,10 +820,18 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, > > * The last command can't be added into list, otherwise it could > > * be handled twice > > */ > > - if (bd->last) > > + if (bd->last) { > > + struct ublk_io *io = &ubq->ios[rq->tag]; > > + struct io_uring_cmd *cmd = io->cmd; > > + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); > > + > > + pdu->req = rq; > > io_uring_cmd_complete_in_task(cmd, ublk_rqs_task_work_cb); > > - else > > - llist_add(&pdu->node, &ubq->io_cmds); > > + } else { > > + struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq); > > + > > + llist_add(&data->node, &ubq->io_cmds); > > + } > > > > return BLK_STS_OK; > > } > > @@ -1426,6 +1433,7 @@ static int ublk_add_tag_set(struct ublk_device *ub) > > ub->tag_set.queue_depth = ub->dev_info.queue_depth; > > ub->tag_set.numa_node = NUMA_NO_NODE; > > ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; > > + ub->tag_set.cmd_size = sizeof(struct ublk_rq_data); > > ub->tag_set.driver_data = ub; > > return blk_mq_alloc_tag_set(&ub->tag_set); > > } > > > Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores > 64GB MEM, CentOS 8, kernel 6.0+ > with IORING_SETUP_COOP_TASKRUN, without this kernel patch > > ucmd: io_uring_cmd_complete_in_task(), ublk_drv is a module > > ucmd-not-touch-pdu: use llist && do not touch 'cmd'/'pdu'/'io' in ublk_queue_rq() > > tw: task_work_add(), ublk is built-in. > > > -------- > fio test > > iodepth=128 numjobs=1 direct=1 bs=4k > > -------------------------------------------- > ublk loop target, the backend is a file. 
> > IOPS(k) > > type ucmd tw ucmd-not-touch-pdu > seq-read 54.1 53.8 53.6 > rand-read 52.0 52.0 52.0 > > -------------------------------------------- > ublk null target > IOPS(k) > > type ucmd tw ucmd-not-touch-pdu > seq-read 272 286 275 > rand-read 262 278 269 > > > ------------ > ublksrv test > > ------------- > ucmd > > running loop/001 > fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... > randwrite: jobs 1, iops 66737 > randread: jobs 1, iops 64935 > randrw: jobs 1, iops read 32694 write 32710 > rw(512k): jobs 1, iops read 772 write 819 > > running null/001 > fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... > randwrite: jobs 1, iops 715863 > randread: jobs 1, iops 758449 > randrw: jobs 1, iops read 357407 write 357183 > rw(512k): jobs 1, iops read 5895 write 5875 > > ------------- > tw > > running loop/001 > fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_pvLTL), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... > randwrite: jobs 1, iops 66856 > randread: jobs 1, iops 65015 > randrw: jobs 1, iops read 32751 write 32767 > rw(512k): jobs 1, iops read 776 write 823 > > running null/001 > fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... > randwrite: jobs 1, iops 739450 > randread: jobs 1, iops 787500 > randrw: jobs 1, iops read 372956 write 372831 > rw(512k): jobs 1, iops read 5798 write 5777 > > ------------- > ucmd-not-touch-pdu > > running loop/001 > fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_oH0eG), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... > randwrite: jobs 1, iops 66754 > randread: jobs 1, iops 65032 > randrw: jobs 1, iops read 32776 write 32792 > rw(512k): jobs 1, iops read 772 write 818 > > running null/001 > fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... 
> randwrite: jobs 1, iops 725334 > randread: jobs 1, iops 741105 > randrw: jobs 1, iops read 360285 write 360047 > rw(512k): jobs 1, iops read 5770 write 5748 > > Not touching cmd/pdu/io in ublk_queue_rq() improves IOPS. > But it is worse than using task_work_add(). Thanks for the test! It is better to not share ucmd between ublk blk-mq io context and ubq daemon context, and we can improve it for using io_uring_cmd_complete_in_task(), and I have one patch by using the batch handling approach in io_uring_cmd_complete_in_task(). Another reason could be the extra __set_notify_signal() in __io_req_task_work_add() via task_work_add(). When task_work_add() is available, we just need to call __set_notify_signal() once for the whole batch, but it can't be done in case of using io_uring_cmd_complete_in_task(). Also the patch of 'use llist' is actually wrong since we have to call io_uring_cmd_complete_in_task() once in ->commit_rqs(), but that couldn't be easy because ucmd isn't available at that time. I think we may have to live with task_work_add() until the perf number is improved to basically the same as io_uring_cmd_complete_in_task(). thanks, Ming ^ permalink raw reply	[flat|nested] 12+ messages in thread
* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands 2022-10-26 11:29 ` Ming Lei @ 2022-10-27 3:00 ` Ziyang Zhang 2022-10-27 15:38 ` Ming Lei 0 siblings, 1 reply; 12+ messages in thread From: Ziyang Zhang @ 2022-10-27 3:00 UTC (permalink / raw) To: Ming Lei; +Cc: linux-block, Jens Axboe On 2022/10/26 19:29, Ming Lei wrote: [...] >> >> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores >> 64GB MEM, CentOS 8, kernel 6.0+ >> with IORING_SETUP_COOP_TASKRUN, without this kernel patch >> >> ucmd: io_uring_cmd_complete_in_task(), ublk_drv is a module >> >> ucmd-not-touch-pdu: use llist && do not touch 'cmd'/'pdu'/'io' in ublk_queue_rq() >> >> tw: task_work_add(), ublk is built-in. >> >> >> -------- >> fio test >> >> iodepth=128 numjobs=1 direct=1 bs=4k >> >> -------------------------------------------- >> ublk loop target, the backend is a file. >> >> IOPS(k) >> >> type ucmd tw ucmd-not-touch-pdu >> seq-read 54.1 53.8 53.6 >> rand-read 52.0 52.0 52.0 >> >> -------------------------------------------- >> ublk null target >> IOPS(k) >> >> type ucmd tw ucmd-not-touch-pdu >> seq-read 272 286 275 >> rand-read 262 278 269 >> >> >> ------------ >> ublksrv test >> >> ------------- >> ucmd >> >> running loop/001 >> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... >> randwrite: jobs 1, iops 66737 >> randread: jobs 1, iops 64935 >> randrw: jobs 1, iops read 32694 write 32710 >> rw(512k): jobs 1, iops read 772 write 819 >> >> running null/001 >> fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... >> randwrite: jobs 1, iops 715863 >> randread: jobs 1, iops 758449 >> randrw: jobs 1, iops read 357407 write 357183 >> rw(512k): jobs 1, iops read 5895 write 5875 >> >> ------------- >> tw >> >> running loop/001 >> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_pvLTL), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... 
>> randwrite: jobs 1, iops 66856 >> randread: jobs 1, iops 65015 >> randrw: jobs 1, iops read 32751 write 32767 >> rw(512k): jobs 1, iops read 776 write 823 >> >> running null/001 >> fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... >> randwrite: jobs 1, iops 739450 >> randread: jobs 1, iops 787500 >> randrw: jobs 1, iops read 372956 write 372831 >> rw(512k): jobs 1, iops read 5798 write 5777 >> >> ------------- >> ucmd-not-touch-pdu >> >> running loop/001 >> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_oH0eG), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... >> randwrite: jobs 1, iops 66754 >> randread: jobs 1, iops 65032 >> randrw: jobs 1, iops read 32776 write 32792 >> rw(512k): jobs 1, iops read 772 write 818 >> >> running null/001 >> fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... >> randwrite: jobs 1, iops 725334 >> randread: jobs 1, iops 741105 >> randrw: jobs 1, iops read 360285 write 360047 >> rw(512k): jobs 1, iops read 5770 write 5748 >> >> Not touching cmd/pdu/io in ublk_queue_rq() improves IOPS. >> But it is worse than using task_work_add(). > > Thanks for the test! It is better to not share ucmd between > ublk blk-mq io context and ubq daemon context, and we can > improve it for using io_uring_cmd_complete_in_task(), and I > have one patch by using the batch handing approach in > io_uring_cmd_complete_in_task(). > > Another reason could be the extra __set_notify_signal() in > __io_req_task_work_add() via task_work_add(). When task_work_add() > is available, we just need to call __set_notify_signal() once > for the whole batch, but it can't be done in case of using > io_uring_cmd_complete_in_task(). > > Also the patch of 'use llist' is actually wrong since we have to > call io_uring_cmd_complete_in_task() once in ->commit_rqs(), but > that couldn't be easy because ucmd isn't available at that time. Yes, you are correct. 
> > I think we may have to live with task_work_add() until the perf > number is improved to same basically with io_uring_cmd_complete_in_task(). OK, we can keep task_work_add() && io_uring_cmd_complete_in_task(). BTW, from test:loop/001, I think with real backends, the performance gap between them seems not too big. Regards, Zhang ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands 2022-10-27 3:00 ` Ziyang Zhang @ 2022-10-27 15:38 ` Ming Lei 0 siblings, 0 replies; 12+ messages in thread From: Ming Lei @ 2022-10-27 15:38 UTC (permalink / raw) To: Ziyang Zhang; +Cc: linux-block, Jens Axboe On Thu, Oct 27, 2022 at 11:00:44AM +0800, Ziyang Zhang wrote: > On 2022/10/26 19:29, Ming Lei wrote: > > [...] > >> > >> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores > >> 64GB MEM, CentOS 8, kernel 6.0+ > >> with IORING_SETUP_COOP_TASKRUN, without this kernel patch > >> > >> ucmd: io_uring_cmd_complete_in_task(), ublk_drv is a module > >> > >> ucmd-not-touch-pdu: use llist && do not touch 'cmd'/'pdu'/'io' in ublk_queue_rq() > >> > >> tw: task_work_add(), ublk is built-in. > >> > >> > >> -------- > >> fio test > >> > >> iodepth=128 numjobs=1 direct=1 bs=4k > >> > >> -------------------------------------------- > >> ublk loop target, the backend is a file. > >> > >> IOPS(k) > >> > >> type ucmd tw ucmd-not-touch-pdu > >> seq-read 54.1 53.8 53.6 > >> rand-read 52.0 52.0 52.0 > >> > >> -------------------------------------------- > >> ublk null target > >> IOPS(k) > >> > >> type ucmd tw ucmd-not-touch-pdu > >> seq-read 272 286 275 > >> rand-read 262 278 269 > >> > >> > >> ------------ > >> ublksrv test > >> > >> ------------- > >> ucmd > >> > >> running loop/001 > >> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... > >> randwrite: jobs 1, iops 66737 > >> randread: jobs 1, iops 64935 > >> randrw: jobs 1, iops read 32694 write 32710 > >> rw(512k): jobs 1, iops read 772 write 819 > >> > >> running null/001 > >> fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... 
> >> randwrite: jobs 1, iops 715863 > >> randread: jobs 1, iops 758449 > >> randrw: jobs 1, iops read 357407 write 357183 > >> rw(512k): jobs 1, iops read 5895 write 5875 > >> > >> ------------- > >> tw > >> > >> running loop/001 > >> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_pvLTL), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... > >> randwrite: jobs 1, iops 66856 > >> randread: jobs 1, iops 65015 > >> randrw: jobs 1, iops read 32751 write 32767 > >> rw(512k): jobs 1, iops read 776 write 823 > >> > >> running null/001 > >> fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... > >> randwrite: jobs 1, iops 739450 > >> randread: jobs 1, iops 787500 > >> randrw: jobs 1, iops read 372956 write 372831 > >> rw(512k): jobs 1, iops read 5798 write 5777 > >> > >> ------------- > >> ucmd-not-touch-pdu > >> > >> running loop/001 > >> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_oH0eG), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... > >> randwrite: jobs 1, iops 66754 > >> randread: jobs 1, iops 65032 > >> randrw: jobs 1, iops read 32776 write 32792 > >> rw(512k): jobs 1, iops read 772 write 818 > >> > >> running null/001 > >> fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)... > >> randwrite: jobs 1, iops 725334 > >> randread: jobs 1, iops 741105 > >> randrw: jobs 1, iops read 360285 write 360047 > >> rw(512k): jobs 1, iops read 5770 write 5748 > >> > >> Not touching cmd/pdu/io in ublk_queue_rq() improves IOPS. > >> But it is worse than using task_work_add(). > > > > Thanks for the test! It is better to not share ucmd between > > ublk blk-mq io context and ubq daemon context, and we can > > improve it for using io_uring_cmd_complete_in_task(), and I > > have one patch by using the batch handing approach in > > io_uring_cmd_complete_in_task(). 
> > > > Another reason could be the extra __set_notify_signal() in > > __io_req_task_work_add() via task_work_add(). When task_work_add() > > is available, we just need to call __set_notify_signal() once > > for the whole batch, but it can't be done in case of using > > io_uring_cmd_complete_in_task(). > > > > Also the patch of 'use llist' is actually wrong since we have to > > call io_uring_cmd_complete_in_task() once in ->commit_rqs(), but > > that couldn't be easy because ucmd isn't available at that time. > > Yes, you are correct. > > > > > I think we may have to live with task_work_add() until the perf > > number is improved to same basically with io_uring_cmd_complete_in_task(). > > OK, we can keep task_work_add() && io_uring_cmd_complete_in_task(). > BTW, from test:loop/001, I think with real backends, the performance > gap between them seems not too big. Actually in VM created in my laptop, the gap isn't small on ublk-loop: 1) ublk-null, single job, change nr_hw_queue to 2 for using none scheduler [root@ktest-09 ublksrv]# make test T=null/001:null/004 make -C tests run T=null/001:null/004 R=10 D=tests/tmp/ make[1]: Entering directory '/root/git/ublksrv/tests' ./run_test.sh null/001:null/004 10 tests/tmp/ running null/001 fio (ublk/null(), libaio, dio, hw queues:2, io jobs: 1, uring_comp: 0, get_data: 0)... randwrite(4k): jobs 1, iops 1028774, cpu_util(29% 69%) randread(4k): jobs 1, iops 958598, cpu_util(27% 72%) randrw(4k): jobs 1, iops read 453382 write 453239, cpu_util(29% 70%) rw(512k): jobs 1, iops read 5264 write 5242, cpu_util(7% 6%) running null/004 fio (ublk/null(), libaio, dio, hw queues:2, io jobs: 1, uring_comp: 1, get_data: 0)... 
randwrite(4k): jobs 1, iops 950755, cpu_util(30% 69%) randread(4k): jobs 1, iops 855984, cpu_util(26% 73%) randrw(4k): jobs 1, iops read 439369 write 439287, cpu_util(29% 70%) rw(512k): jobs 1, iops read 5225 write 5202, cpu_util(7% 5%) 2) ublk-loop, single job, change nr_hw_queue to 2 for using none scheduler [root@ktest-09 ublksrv]# make test T=loop/001:loop/004 make -C tests run T=loop/001:loop/004 R=10 D=tests/tmp/ make[1]: Entering directory '/root/git/ublksrv/tests' ./run_test.sh loop/001:loop/004 10 tests/tmp/ running loop/001 fio (ublk/loop( -f /root/git/ublksrv/tests/tmp/ublk_loop_2G_STbdZ), libaio, dio, hw queues:2, io jobs: 1, uring_comp: 0, get_data: 0)... randwrite(4k): jobs 1, iops 163177, cpu_util(7% 20%) randread(4k): jobs 1, iops 141584, cpu_util(6% 14%) randrw(4k): jobs 1, iops read 75034 write 75148, cpu_util(7% 17%) rw(512k): jobs 1, iops read 1574 write 1663, cpu_util(3% 4%) running loop/004 fio (ublk/loop( -f /root/git/ublksrv/tests/tmp/ublk_loop_2G_AdrHl), libaio, dio, hw queues:2, io jobs: 1, uring_comp: 1, get_data: 0)... randwrite(4k): jobs 1, iops 142353, cpu_util(7% 19%) randread(4k): jobs 1, iops 133883, cpu_util(6% 14%) randrw(4k): jobs 1, iops read 72581 write 72691, cpu_util(7% 18%) rw(512k): jobs 1, iops read 929 write 981, cpu_util(2% 2%) Thanks, Ming ^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2022-10-27 15:38 UTC | newest] Thread overview: 12+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2022-10-23 9:38 [PATCH] ublk_drv: don't call task_work_add for queueing io commands Ming Lei 2022-10-24 9:48 ` Ziyang Zhang 2022-10-24 13:20 ` Ming Lei 2022-10-25 3:15 ` Ziyang Zhang 2022-10-25 7:19 ` Ming Lei 2022-10-25 7:46 ` Ziyang Zhang 2022-10-25 8:43 ` Ziyang Zhang 2022-10-25 15:17 ` Ming Lei 2022-10-26 10:32 ` Ziyang Zhang 2022-10-26 11:29 ` Ming Lei 2022-10-27 3:00 ` Ziyang Zhang 2022-10-27 15:38 ` Ming Lei
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).