All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] nvme-multipath: introduce service-time iopolicy
@ 2024-11-07  6:32 Guixin Liu
  2024-11-07  6:39 ` Christoph Hellwig
  0 siblings, 1 reply; 3+ messages in thread
From: Guixin Liu @ 2024-11-07  6:32 UTC (permalink / raw)
  To: kbusch, axboe, hch, sagi, kch; +Cc: linux-nvme

The service-time policy can dispatch I/O to the path with the lowest
total amount of currently processed I/O, ensuring that new I/O can be
sent to less-loaded paths when some paths are overloaded, thereby
achieving lower latency.

Signed-off-by: Guixin Liu <kanie@linux.alibaba.com>
---
Changes from v1 to v2:
- Use atomic64_t instead of atomic_t (Keith Busch)

 drivers/nvme/host/multipath.c | 53 ++++++++++++++++++++++++++++++++++-
 drivers/nvme/host/nvme.h      |  3 ++
 2 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 6a15873055b9..fcd3b2108152 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -18,6 +18,7 @@ static const char *nvme_iopolicy_names[] = {
 	[NVME_IOPOLICY_NUMA]	= "numa",
 	[NVME_IOPOLICY_RR]	= "round-robin",
 	[NVME_IOPOLICY_QD]      = "queue-depth",
+	[NVME_IOPOLICY_ST]	= "service-time",
 };
 
 static int iopolicy = NVME_IOPOLICY_NUMA;
@@ -32,6 +33,8 @@ static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
 		iopolicy = NVME_IOPOLICY_RR;
 	else if (!strncmp(val, "queue-depth", 11))
 		iopolicy = NVME_IOPOLICY_QD;
+	else if (!strncmp(val, "service-time", 12))
+		iopolicy = NVME_IOPOLICY_ST;
 	else
 		return -EINVAL;
 
@@ -46,7 +49,7 @@ static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
 module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
 	&iopolicy, 0644);
 MODULE_PARM_DESC(iopolicy,
-	"Default multipath I/O policy; 'numa' (default), 'round-robin' or 'queue-depth'");
+	"Default multipath I/O policy; 'numa' (default), 'round-robin', 'queue-depth' or 'service-time'");
 
 void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
 {
@@ -136,6 +139,11 @@ void nvme_mpath_start_request(struct request *rq)
 		nvme_req(rq)->flags |= NVME_MPATH_CNT_ACTIVE;
 	}
 
+	if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_ST) {
+		atomic64_add(blk_rq_bytes(rq), &ns->ctrl->inflight_size);
+		nvme_req(rq)->flags |= NVME_MPATH_CNT_IOSIZE;
+	}
+
 	if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
 		return;
 
@@ -152,6 +160,9 @@ void nvme_mpath_end_request(struct request *rq)
 	if (nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE)
 		atomic_dec_if_positive(&ns->ctrl->nr_active);
 
+	if (nvme_req(rq)->flags & NVME_MPATH_CNT_IOSIZE)
+		atomic64_sub(blk_rq_bytes(rq), &ns->ctrl->inflight_size);
+
 	if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
 		return;
 	bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
@@ -405,9 +416,48 @@ static struct nvme_ns *nvme_numa_path(struct nvme_ns_head *head)
 	return ns;
 }
 
+static struct nvme_ns *nvme_service_time_path(struct nvme_ns_head *head)
+{
+	struct nvme_ns *opt = NULL, *nonopt = NULL, *ns;
+	unsigned int min_inflight_nonopt = UINT_MAX;
+	unsigned int min_inflight_opt = UINT_MAX;
+	unsigned int inflight;
+
+	list_for_each_entry_rcu(ns, &head->list, siblings) {
+		if (nvme_path_is_disabled(ns))
+			continue;
+
+		inflight = atomic64_read(&ns->ctrl->inflight_size);
+
+		switch (ns->ana_state) {
+		case NVME_ANA_OPTIMIZED:
+			if (inflight < min_inflight_opt) {
+				min_inflight_opt = inflight;
+				opt = ns;
+			}
+			break;
+		case NVME_ANA_NONOPTIMIZED:
+			if (inflight < min_inflight_nonopt) {
+				min_inflight_nonopt = inflight;
+				nonopt = ns;
+			}
+			break;
+		default:
+			break;
+		}
+
+		if (min_inflight_opt == 0)
+			return opt;
+	}
+
+	return opt ? opt : nonopt;
+}
+
 inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
 {
 	switch (READ_ONCE(head->subsys->iopolicy)) {
+	case NVME_IOPOLICY_ST:
+		return nvme_service_time_path(head);
 	case NVME_IOPOLICY_QD:
 		return nvme_queue_depth_path(head);
 	case NVME_IOPOLICY_RR:
@@ -1040,6 +1090,7 @@ int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
 
 	/* initialize this in the identify path to cover controller resets */
 	atomic_set(&ctrl->nr_active, 0);
+	atomic64_set(&ctrl->inflight_size, 0);
 
 	if (!ctrl->max_namespaces ||
 	    ctrl->max_namespaces > le32_to_cpu(id->nn)) {
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 093cb423f536..bf6c74fdc9ba 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -202,6 +202,7 @@ enum {
 	NVME_REQ_USERCMD		= (1 << 1),
 	NVME_MPATH_IO_STATS		= (1 << 2),
 	NVME_MPATH_CNT_ACTIVE		= (1 << 3),
+	NVME_MPATH_CNT_IOSIZE		= (1 << 4),
 };
 
 static inline struct nvme_request *nvme_req(struct request *req)
@@ -367,6 +368,7 @@ struct nvme_ctrl {
 	struct timer_list anatt_timer;
 	struct work_struct ana_work;
 	atomic_t nr_active;
+	atomic64_t inflight_size;
 #endif
 
 #ifdef CONFIG_NVME_HOST_AUTH
@@ -416,6 +418,7 @@ enum nvme_iopolicy {
 	NVME_IOPOLICY_NUMA,
 	NVME_IOPOLICY_RR,
 	NVME_IOPOLICY_QD,
+	NVME_IOPOLICY_ST,
 };
 
 struct nvme_subsystem {
-- 
2.43.0



^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH v2] nvme-multipath: introduce service-time iopolicy
  2024-11-07  6:32 [PATCH v2] nvme-multipath: introduce service-time iopolicy Guixin Liu
@ 2024-11-07  6:39 ` Christoph Hellwig
  2024-11-07  7:09   ` Guixin Liu
  0 siblings, 1 reply; 3+ messages in thread
From: Christoph Hellwig @ 2024-11-07  6:39 UTC (permalink / raw)
  To: Guixin Liu; +Cc: kbusch, axboe, hch, sagi, kch, linux-nvme

On Thu, Nov 07, 2024 at 02:32:49PM +0800, Guixin Liu wrote:
> The service-time policy can dispatch I/O to the path with the lowest
> total amount of currently processed I/O, ensuring that new I/O can be
> sent to less-loaded paths when some paths are overloaded, thereby
> achieving lower latency.

What is the exact use case for this?  The commit log is unfortunately
very sparse.  All these little policies add up, and we have absolutely
no documentation on them :(



^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH v2] nvme-multipath: introduce service-time iopolicy
  2024-11-07  6:39 ` Christoph Hellwig
@ 2024-11-07  7:09   ` Guixin Liu
  0 siblings, 0 replies; 3+ messages in thread
From: Guixin Liu @ 2024-11-07  7:09 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: kbusch, axboe, sagi, kch, linux-nvme


在 2024/11/7 14:39, Christoph Hellwig 写道:
> On Thu, Nov 07, 2024 at 02:32:49PM +0800, Guixin Liu wrote:
>> The service-time policy can dispatch I/O to the path with the lowest
>> total amount of currently processed I/O, ensuring that new I/O can be
>> sent to less-loaded paths when some paths are overloaded, thereby
>> achieving lower latency.
> What is the exact use case for this?  The commit log is unfortunately
> very sparse.

In scenarios with varying random I/O sizes, the different I/O sizes
being processed on each path can lead to slower processing and higher
latency on paths under heavy load. By distributing the I/O to other
paths with lighter loads, the I/O can be processed more quickly.

Sorry for the sparse commit log; I will send a v3 that explains this in detail.

>   All these little policies add up, and we have absolutely
> no documentation on them :(

I'd be glad to add a document introducing all of these nvme-multipath
policies as part of the v3 patchset.

Best Regards,

Guixin Liu




^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2024-11-07  7:09 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-11-07  6:32 [PATCH v2] nvme-multipath: introduce service-time iopolicy Guixin Liu
2024-11-07  6:39 ` Christoph Hellwig
2024-11-07  7:09   ` Guixin Liu

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.