Linux-NVME Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Hannes Reinecke <hare@suse.de>
To: Christoph Hellwig <hch@lst.de>
Cc: Sagi Grimberg <sagi@grimberg.me>, Keith Busch <kbusch@kernel.org>,
	Ewan Milne <emilne@redhat.com>,
	Uday Shankar <ushankar@purestorage.com>,
	Randy Jennings <randyj@purestorage.com>,
	linux-nvme@lists.infradead.org, Hannes Reinecke <hare@suse.de>
Subject: [PATCH 1/2] nvme-multipath: add additional iopolicies
Date: Wed, 26 Jul 2023 15:23:04 +0200	[thread overview]
Message-ID: <20230726132305.33739-2-hare@suse.de> (raw)
In-Reply-To: <20230726132305.33739-1-hare@suse.de>

Add two new iopolicies, 'bandwidth' and 'ewma',
to direct I/O better in case of complex fabrics.
The current 'round-robin' I/O policy suffers from the 'buffer bloat'
symptom, where I/O piles up on the slow paths of
asymmetric fabrics.
With these iopolicies I/O is directed to the path with the
smallest 'load', where the load is calculated either as the
amount of data submitted to the path (the 'bandwidth' iopolicy),
or as how busy the path's hardware queues are (the 'ewma' iopolicy).

Signed-off-by: Hannes Reinecke <hare@suse.de>
---
 drivers/nvme/host/multipath.c | 54 +++++++++++++++++++++++++++++++++--
 drivers/nvme/host/nvme.h      |  3 ++
 2 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 0a88d7bdc5e3..7813608038bc 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -17,6 +17,8 @@ MODULE_PARM_DESC(multipath,
 static const char *nvme_iopolicy_names[] = {
 	[NVME_IOPOLICY_NUMA]	= "numa",
 	[NVME_IOPOLICY_RR]	= "round-robin",
+	[NVME_IOPOLICY_BW]	= "bandwidth",
+	[NVME_IOPOLICY_EWMA]	= "ewma",
 };
 
 static int iopolicy = NVME_IOPOLICY_NUMA;
@@ -29,6 +31,10 @@ static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
 		iopolicy = NVME_IOPOLICY_NUMA;
 	else if (!strncmp(val, "round-robin", 11))
 		iopolicy = NVME_IOPOLICY_RR;
+	else if (!strncmp(val, "bandwidth", 9))
+		iopolicy = NVME_IOPOLICY_BW;
+	else if (!strncmp(val, "ewma", 4))
+		iopolicy = NVME_IOPOLICY_EWMA;
 	else
 		return -EINVAL;
 
@@ -43,7 +49,7 @@ static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
 module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
 	&iopolicy, 0644);
 MODULE_PARM_DESC(iopolicy,
-	"Default multipath I/O policy; 'numa' (default) or 'round-robin'");
+	"Default multipath I/O policy; 'numa' (default), 'round-robin', 'bandwidth', or 'ewma'");
 
 void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
 {
@@ -237,17 +243,34 @@ static bool nvme_path_is_disabled(struct nvme_ns *ns)
 	return false;
 }
 
+static unsigned int nvme_path_ewma(struct nvme_ns *ns)
+{
+	struct blk_mq_hw_ctx *hctx;
+	unsigned long i;
+	unsigned int ewma = 0;
+
+	queue_for_each_hw_ctx(ns->queue, hctx, i)
+		ewma += hctx->dispatch_busy;
+
+	return ewma;
+}
+
 static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
 {
 	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
 	struct nvme_ns *found = NULL, *fallback = NULL, *ns;
+	int iopolicy = READ_ONCE(head->subsys->iopolicy);
 
 	list_for_each_entry_rcu(ns, &head->list, siblings) {
 		if (nvme_path_is_disabled(ns))
 			continue;
 
-		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
+		if (iopolicy == NVME_IOPOLICY_NUMA)
 			distance = node_distance(node, ns->ctrl->numa_node);
+		else if (iopolicy == NVME_IOPOLICY_BW)
+			distance = ns->path_weight;
+		else if (iopolicy == NVME_IOPOLICY_EWMA)
+			distance = nvme_path_ewma(ns);
 		else
 			distance = LOCAL_DISTANCE;
 
@@ -329,6 +352,26 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
 	return found;
 }
 
+static inline void nvme_path_weight(struct nvme_ns *ns, struct bio *bio)
+{
+	int iopolicy;
+
+	iopolicy = READ_ONCE(ns->head->subsys->iopolicy);
+	if (iopolicy == NVME_IOPOLICY_BW)
+		ns->path_weight += bio_sectors(bio);
+}
+
+static void nvme_reset_weight(struct nvme_ns_head *head)
+{
+	int srcu_idx;
+	struct nvme_ns *ns;
+
+	srcu_idx = srcu_read_lock(&head->srcu);
+	list_for_each_entry_rcu(ns, &head->list, siblings)
+		WRITE_ONCE(ns->path_weight, 0);
+	srcu_read_unlock(&head->srcu, srcu_idx);
+}
+
 static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
 {
 	return ns->ctrl->state == NVME_CTRL_LIVE &&
@@ -390,6 +433,7 @@ static void nvme_ns_head_submit_bio(struct bio *bio)
 	srcu_idx = srcu_read_lock(&head->srcu);
 	ns = nvme_find_path(head);
 	if (likely(ns)) {
+		nvme_path_weight(ns, bio);
 		bio_set_dev(bio, ns->disk->part0);
 		bio->bi_opf |= REQ_NVME_MPATH;
 		trace_block_bio_remap(bio, disk_devt(ns->head->disk),
@@ -657,6 +701,7 @@ static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
 	ns->ana_grpid = le32_to_cpu(desc->grpid);
 	ns->ana_state = desc->state;
 	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
+	ns->path_weight = 0;
 	/*
 	 * nvme_mpath_set_live() will trigger I/O to the multipath path device
 	 * and in turn to this path device.  However we cannot accept this I/O
@@ -805,7 +850,12 @@ static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
 
 	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
 		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
+			struct nvme_ns_head *h;
 			WRITE_ONCE(subsys->iopolicy, i);
+			mutex_lock(&subsys->lock);
+			list_for_each_entry(h, &subsys->nsheads, entry)
+				nvme_reset_weight(h);
+			mutex_unlock(&subsys->lock);
 			return count;
 		}
 	}
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 6fe7966f720b..4eabb6ee563a 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -390,6 +390,8 @@ struct nvme_ctrl {
 enum nvme_iopolicy {
 	NVME_IOPOLICY_NUMA,
 	NVME_IOPOLICY_RR,
+	NVME_IOPOLICY_BW,
+	NVME_IOPOLICY_EWMA,
 };
 
 struct nvme_subsystem {
@@ -482,6 +484,7 @@ struct nvme_ns {
 #ifdef CONFIG_NVME_MULTIPATH
 	enum nvme_ana_state ana_state;
 	u32 ana_grpid;
+	u32 path_weight;
 #endif
 	struct list_head siblings;
 	struct kref kref;
-- 
2.35.3



  reply	other threads:[~2023-07-26 13:23 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-07-26 13:23 [PATCH RFC 0/2] nvme-multipath: additional iopolicies Hannes Reinecke
2023-07-26 13:23 ` Hannes Reinecke [this message]
2023-07-26 13:23 ` [PATCH 2/2] nvme-multipath: add 'latency' iopolicy Hannes Reinecke
2023-07-26 13:25 ` [PATCH RFC 0/2] nvme-multipath: additional iopolicies Christoph Hellwig
2023-07-26 13:32   ` Hannes Reinecke

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230726132305.33739-2-hare@suse.de \
    --to=hare@suse.de \
    --cc=emilne@redhat.com \
    --cc=hch@lst.de \
    --cc=kbusch@kernel.org \
    --cc=linux-nvme@lists.infradead.org \
    --cc=randyj@purestorage.com \
    --cc=sagi@grimberg.me \
    --cc=ushankar@purestorage.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox