All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH RFC 0/2] nvme-multipath: additional iopolicies
@ 2023-07-26 13:23 Hannes Reinecke
  2023-07-26 13:23 ` [PATCH 1/2] nvme-multipath: add " Hannes Reinecke
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Hannes Reinecke @ 2023-07-26 13:23 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Sagi Grimberg, Keith Busch, Ewan Milne, Uday Shankar,
	Randy Jennings, linux-nvme, Hannes Reinecke

Hi all,

as Ewan raised the issue that we are suffering from a buffer-bloat
problem when using the round-robin I/O scheduler here are additiona
I/O schedulers to improve the situation.
The schedulers are untested, and are compile-tested only.
I just putting them out here as I had several requests for them;
I'm happily improving them upon feedback.

Hannes Reinecke (2):
  nvme-multipath: add additional iopolicies
  nvme-multipath: add 'latency' iopolicy

 drivers/nvme/host/core.c      |  1 +
 drivers/nvme/host/multipath.c | 69 ++++++++++++++++++++++++++++++++++-
 drivers/nvme/host/nvme.h      |  8 ++++
 3 files changed, 76 insertions(+), 2 deletions(-)

-- 
2.35.3



^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH 1/2] nvme-multipath: add additional iopolicies
  2023-07-26 13:23 [PATCH RFC 0/2] nvme-multipath: additional iopolicies Hannes Reinecke
@ 2023-07-26 13:23 ` Hannes Reinecke
  2023-07-26 13:23 ` [PATCH 2/2] nvme-multipath: add 'latency' iopolicy Hannes Reinecke
  2023-07-26 13:25 ` [PATCH RFC 0/2] nvme-multipath: additional iopolicies Christoph Hellwig
  2 siblings, 0 replies; 5+ messages in thread
From: Hannes Reinecke @ 2023-07-26 13:23 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Sagi Grimberg, Keith Busch, Ewan Milne, Uday Shankar,
	Randy Jennings, linux-nvme, Hannes Reinecke

Add a several new iopolicies, 'bandwidth' and 'ewma',
to direct I/O better in case of complex fabrics.
The current 'round-robin' I/O policy suffers from the 'buffer bloat'
symptom, where I/O will be piling on the slow paths for
asymmetric fabrics.
With these iopolicies I/O is directed to the path with the
smallest 'load', where the load is calculated either as the
smallest number of bytes transmitted (the 'bandwidth' iopolicy),
or the least busy path (the 'ewma' iopolicy).

Signed-off-by: Hannes Reinecke <hare@suse.de>
---
 drivers/nvme/host/multipath.c | 54 +++++++++++++++++++++++++++++++++--
 drivers/nvme/host/nvme.h      |  3 ++
 2 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 0a88d7bdc5e3..7813608038bc 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -17,6 +17,8 @@ MODULE_PARM_DESC(multipath,
 static const char *nvme_iopolicy_names[] = {
 	[NVME_IOPOLICY_NUMA]	= "numa",
 	[NVME_IOPOLICY_RR]	= "round-robin",
+	[NVME_IOPOLICY_BW]	= "bandwidth",
+	[NVME_IOPOLICY_EWMA]	= "ewma",
 };
 
 static int iopolicy = NVME_IOPOLICY_NUMA;
@@ -29,6 +31,10 @@ static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
 		iopolicy = NVME_IOPOLICY_NUMA;
 	else if (!strncmp(val, "round-robin", 11))
 		iopolicy = NVME_IOPOLICY_RR;
+	else if (!strncmp(val, "bandwidth", 9))
+		iopolicy = NVME_IOPOLICY_BW;
+	else if (!strncmp(val, "ewma", 4))
+		iopolicy = NVME_IOPOLICY_EWMA;
 	else
 		return -EINVAL;
 
@@ -43,7 +49,7 @@ static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
 module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
 	&iopolicy, 0644);
 MODULE_PARM_DESC(iopolicy,
-	"Default multipath I/O policy; 'numa' (default) or 'round-robin'");
+	"Default multipath I/O policy; 'numa' (default), 'round-robin', 'bandwidth', or 'ewma'");
 
 void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
 {
@@ -237,17 +243,34 @@ static bool nvme_path_is_disabled(struct nvme_ns *ns)
 	return false;
 }
 
+static unsigned int nvme_path_ewma(struct nvme_ns *ns)
+{
+	struct blk_mq_hw_ctx *hctx;
+	unsigned long i;
+	unsigned int ewma = 0;
+
+	queue_for_each_hw_ctx(ns->queue, hctx, i)
+		ewma += hctx->dispatch_busy;
+
+	return ewma;
+}
+
 static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
 {
 	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
 	struct nvme_ns *found = NULL, *fallback = NULL, *ns;
+	int iopolicy = READ_ONCE(head->subsys->iopolicy);
 
 	list_for_each_entry_rcu(ns, &head->list, siblings) {
 		if (nvme_path_is_disabled(ns))
 			continue;
 
-		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
+		if (iopolicy == NVME_IOPOLICY_NUMA)
 			distance = node_distance(node, ns->ctrl->numa_node);
+		else if (iopolicy == NVME_IOPOLICY_BW)
+			distance = ns->path_weight;
+		else if (iopolicy == NVME_IOPOLICY_EWMA)
+			distance = nvme_path_ewma(ns);
 		else
 			distance = LOCAL_DISTANCE;
 
@@ -329,6 +352,26 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
 	return found;
 }
 
+static inline void nvme_path_weight(struct nvme_ns *ns, struct bio *bio)
+{
+	int iopolicy;
+
+	iopolicy = READ_ONCE(ns->head->subsys->iopolicy);
+	if (iopolicy == NVME_IOPOLICY_BW)
+		ns->path_weight += bio_sectors(bio);
+}
+
+static void nvme_reset_weight(struct nvme_ns_head *head)
+{
+	int srcu_idx;
+	struct nvme_ns *ns;
+
+	srcu_idx = srcu_read_lock(&head->srcu);
+	list_for_each_entry_rcu(ns, &head->list, siblings)
+		WRITE_ONCE(ns->path_weight, 0);
+	srcu_read_unlock(&head->srcu, srcu_idx);
+}
+
 static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
 {
 	return ns->ctrl->state == NVME_CTRL_LIVE &&
@@ -390,6 +433,7 @@ static void nvme_ns_head_submit_bio(struct bio *bio)
 	srcu_idx = srcu_read_lock(&head->srcu);
 	ns = nvme_find_path(head);
 	if (likely(ns)) {
+		nvme_path_weight(ns, bio);
 		bio_set_dev(bio, ns->disk->part0);
 		bio->bi_opf |= REQ_NVME_MPATH;
 		trace_block_bio_remap(bio, disk_devt(ns->head->disk),
@@ -657,6 +701,7 @@ static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
 	ns->ana_grpid = le32_to_cpu(desc->grpid);
 	ns->ana_state = desc->state;
 	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
+	ns->path_weight = 0;
 	/*
 	 * nvme_mpath_set_live() will trigger I/O to the multipath path device
 	 * and in turn to this path device.  However we cannot accept this I/O
@@ -805,7 +850,12 @@ static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
 
 	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
 		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
+			struct nvme_ns_head *h;
 			WRITE_ONCE(subsys->iopolicy, i);
+			mutex_lock(&subsys->lock);
+			list_for_each_entry(h, &subsys->nsheads, entry)
+				nvme_reset_weight(h);
+			mutex_unlock(&subsys->lock);
 			return count;
 		}
 	}
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 6fe7966f720b..4eabb6ee563a 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -390,6 +390,8 @@ struct nvme_ctrl {
 enum nvme_iopolicy {
 	NVME_IOPOLICY_NUMA,
 	NVME_IOPOLICY_RR,
+	NVME_IOPOLICY_BW,
+	NVME_IOPOLICY_EWMA,
 };
 
 struct nvme_subsystem {
@@ -482,6 +484,7 @@ struct nvme_ns {
 #ifdef CONFIG_NVME_MULTIPATH
 	enum nvme_ana_state ana_state;
 	u32 ana_grpid;
+	u32 path_weight;
 #endif
 	struct list_head siblings;
 	struct kref kref;
-- 
2.35.3



^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 2/2] nvme-multipath: add 'latency' iopolicy
  2023-07-26 13:23 [PATCH RFC 0/2] nvme-multipath: additional iopolicies Hannes Reinecke
  2023-07-26 13:23 ` [PATCH 1/2] nvme-multipath: add " Hannes Reinecke
@ 2023-07-26 13:23 ` Hannes Reinecke
  2023-07-26 13:25 ` [PATCH RFC 0/2] nvme-multipath: additional iopolicies Christoph Hellwig
  2 siblings, 0 replies; 5+ messages in thread
From: Hannes Reinecke @ 2023-07-26 13:23 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Sagi Grimberg, Keith Busch, Ewan Milne, Uday Shankar,
	Randy Jennings, linux-nvme, Hannes Reinecke

Add an I/O policy for selecting paths based upon the least latency.

Signed-off-by: Hannes Reinecke <hare@suse.de>
---
 drivers/nvme/host/core.c      |  1 +
 drivers/nvme/host/multipath.c | 19 +++++++++++++++++--
 drivers/nvme/host/nvme.h      |  5 +++++
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index b52e9c9bffd6..bd506284af1f 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -397,6 +397,7 @@ void nvme_complete_rq(struct request *req)
 
 	trace_nvme_complete_rq(req);
 	nvme_cleanup_cmd(req);
+	nvme_mpath_complete_request(req);
 
 	/*
 	 * Completions of long-running commands should not be able to
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 7813608038bc..8052ba8085f0 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -18,6 +18,7 @@ static const char *nvme_iopolicy_names[] = {
 	[NVME_IOPOLICY_NUMA]	= "numa",
 	[NVME_IOPOLICY_RR]	= "round-robin",
 	[NVME_IOPOLICY_BW]	= "bandwidth",
+	[NVME_IOPOLICY_LAT]	= "latency",
 	[NVME_IOPOLICY_EWMA]	= "ewma",
 };
 
@@ -33,6 +34,8 @@ static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
 		iopolicy = NVME_IOPOLICY_RR;
 	else if (!strncmp(val, "bandwidth", 9))
 		iopolicy = NVME_IOPOLICY_BW;
+	else if (!strncmp(val, "latency", 7))
+		iopolicy = NVME_IOPOLICY_LAT;
 	else if (!strncmp(val, "ewma", 4))
 		iopolicy = NVME_IOPOLICY_EWMA;
 	else
@@ -49,7 +52,7 @@ static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
 module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
 	&iopolicy, 0644);
 MODULE_PARM_DESC(iopolicy,
-	"Default multipath I/O policy; 'numa' (default), 'round-robin', 'bandwidth', or 'ewma'");
+	"Default multipath I/O policy; 'numa' (default), 'round-robin', 'bandwidth', 'latency', or 'ewma'");
 
 void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
 {
@@ -153,6 +156,17 @@ void nvme_mpath_end_request(struct request *rq)
 			 nvme_req(rq)->start_time);
 }
 
+void nvme_mpath_complete_request(struct request *req)
+{
+	struct nvme_ns *ns = req->q->queuedata;
+
+	if (iopolicy == NVME_IOPOLICY_LAT) {
+		u64 lat = ktime_get_ns() - req->start_time_ns;
+
+		ns->path_weight += lat >> 10;
+	}
+}
+
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
@@ -267,7 +281,8 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
 
 		if (iopolicy == NVME_IOPOLICY_NUMA)
 			distance = node_distance(node, ns->ctrl->numa_node);
-		else if (iopolicy == NVME_IOPOLICY_BW)
+		else if (iopolicy == NVME_IOPOLICY_LAT ||
+			 iopolicy == NVME_IOPOLICY_BW)
 			distance = ns->path_weight;
 		else if (iopolicy == NVME_IOPOLICY_EWMA)
 			distance = nvme_path_ewma(ns);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 4eabb6ee563a..d9e9c1b5236a 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -391,6 +391,7 @@ enum nvme_iopolicy {
 	NVME_IOPOLICY_NUMA,
 	NVME_IOPOLICY_RR,
 	NVME_IOPOLICY_BW,
+	NVME_IOPOLICY_LAT,
 	NVME_IOPOLICY_EWMA,
 };
 
@@ -904,6 +905,7 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl);
 void nvme_mpath_shutdown_disk(struct nvme_ns_head *head);
 void nvme_mpath_start_request(struct request *rq);
 void nvme_mpath_end_request(struct request *rq);
+void nvme_mpath_complete_request(struct request *rq);
 
 static inline void nvme_trace_bio_complete(struct request *req)
 {
@@ -995,6 +997,9 @@ static inline void nvme_mpath_start_request(struct request *rq)
 static inline void nvme_mpath_end_request(struct request *rq)
 {
 }
+static inline void nvme_mpath_complete_request(struct request *rq)
+{
+}
 #endif /* CONFIG_NVME_MULTIPATH */
 
 int nvme_revalidate_zones(struct nvme_ns *ns);
-- 
2.35.3



^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH RFC 0/2] nvme-multipath: additional iopolicies
  2023-07-26 13:23 [PATCH RFC 0/2] nvme-multipath: additional iopolicies Hannes Reinecke
  2023-07-26 13:23 ` [PATCH 1/2] nvme-multipath: add " Hannes Reinecke
  2023-07-26 13:23 ` [PATCH 2/2] nvme-multipath: add 'latency' iopolicy Hannes Reinecke
@ 2023-07-26 13:25 ` Christoph Hellwig
  2023-07-26 13:32   ` Hannes Reinecke
  2 siblings, 1 reply; 5+ messages in thread
From: Christoph Hellwig @ 2023-07-26 13:25 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Christoph Hellwig, Sagi Grimberg, Keith Busch, Ewan Milne,
	Uday Shankar, Randy Jennings, linux-nvme

On Wed, Jul 26, 2023 at 03:23:03PM +0200, Hannes Reinecke wrote:
> Hi all,
> 
> as Ewan raised the issue that we are suffering from a buffer-bloat
> problem when using the round-robin I/O scheduler here are additiona
> I/O schedulers to improve the situation.
> The schedulers are untested, and are compile-tested only.
> I just putting them out here as I had several requests for them;
> I'm happily improving them upon feedback.

FYI, the policy for nvme-multpath has mostly been that we want to
see very solid used cases.  So in addition to actually testing
the code, I'd like to see detailed and solid use cases.  And preferably
just one new policy and not three if possible..



^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH RFC 0/2] nvme-multipath: additional iopolicies
  2023-07-26 13:25 ` [PATCH RFC 0/2] nvme-multipath: additional iopolicies Christoph Hellwig
@ 2023-07-26 13:32   ` Hannes Reinecke
  0 siblings, 0 replies; 5+ messages in thread
From: Hannes Reinecke @ 2023-07-26 13:32 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Sagi Grimberg, Keith Busch, Ewan Milne, Uday Shankar,
	Randy Jennings, linux-nvme

On 7/26/23 15:25, Christoph Hellwig wrote:
> On Wed, Jul 26, 2023 at 03:23:03PM +0200, Hannes Reinecke wrote:
>> Hi all,
>>
>> as Ewan raised the issue that we are suffering from a buffer-bloat
>> problem when using the round-robin I/O scheduler here are additiona
>> I/O schedulers to improve the situation.
>> The schedulers are untested, and are compile-tested only.
>> I just putting them out here as I had several requests for them;
>> I'm happily improving them upon feedback.
> 
> FYI, the policy for nvme-multpath has mostly been that we want to
> see very solid used cases.  So in addition to actually testing
> the code, I'd like to see detailed and solid use cases.  And preferably
> just one new policy and not three if possible..
> 
I could not agree more.
That was precisely why I was reluctant to post them in the first place.
But I have been asked by several parties and really don't have time now 
to do a proper testing (what with me being in the middle of a server 
room move).
So I decided to put them out as a RFC, fully expecting things to change.
And also expecting to have some solid data from testing to check which 
I/O scheduler really improves the situation.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                Kernel Storage Architect
hare@suse.de                              +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Ivo Totev, Andrew
Myers, Andrew McDonald, Martje Boudien Moerman



^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2023-07-26 13:32 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-07-26 13:23 [PATCH RFC 0/2] nvme-multipath: additional iopolicies Hannes Reinecke
2023-07-26 13:23 ` [PATCH 1/2] nvme-multipath: add " Hannes Reinecke
2023-07-26 13:23 ` [PATCH 2/2] nvme-multipath: add 'latency' iopolicy Hannes Reinecke
2023-07-26 13:25 ` [PATCH RFC 0/2] nvme-multipath: additional iopolicies Christoph Hellwig
2023-07-26 13:32   ` Hannes Reinecke

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.