All of lore.kernel.org
 help / color / mirror / Atom feed
From: hare@suse.de (Hannes Reinecke)
Subject: [PATCHv2] nvme-multipath: round-robin I/O policy
Date: Fri, 21 Dec 2018 15:13:30 +0100	[thread overview]
Message-ID: <20181221141330.96599-1-hare@suse.de> (raw)

Implement a simple round-robin I/O policy for multipathing.
Path selection is done in two rounds: the first round iterates across
all optimized paths and, if that doesn't return any valid path, the
second round iterates over all optimized and non-optimized paths.
If no path is found we fall back to the existing algorithm.
This patch also implements a sysfs attribute 'iopolicy' to switch
between the current, NUMA-aware I/O policy and the 'round-robin'
I/O policy.
The original NUMA-aware I/O policy is kept as the default.

Signed-off-by: Hannes Reinecke <hare at suse.com>
---
 drivers/nvme/host/core.c      |   6 +++
 drivers/nvme/host/multipath.c | 100 +++++++++++++++++++++++++++++++++++++++++-
 drivers/nvme/host/nvme.h      |  12 +++++
 3 files changed, 117 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 08f2c92602f4..7603aaa8217e 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2275,6 +2275,9 @@ static struct attribute *nvme_subsys_attrs[] = {
 	&subsys_attr_serial.attr,
 	&subsys_attr_firmware_rev.attr,
 	&subsys_attr_subsysnqn.attr,
+#ifdef CONFIG_NVME_MULTIPATH
+	&subsys_attr_iopolicy.attr,
+#endif
 	NULL,
 };
 
@@ -2327,6 +2330,9 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
 	memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
 	subsys->vendor_id = le16_to_cpu(id->vid);
 	subsys->cmic = id->cmic;
+#ifdef CONFIG_NVME_MULTIPATH
+	subsys->iopolicy = NVME_IOPOLICY_NUMA;
+#endif
 
 	subsys->dev.class = nvme_subsys_class;
 	subsys->dev.release = nvme_release_subsystem;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 183ec17ba067..69cccdaea62e 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -141,7 +141,10 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
 		    test_bit(NVME_NS_ANA_PENDING, &ns->flags))
 			continue;
 
-		distance = node_distance(node, ns->ctrl->numa_node);
+		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
+			distance = node_distance(node, ns->ctrl->numa_node);
+		else
+			distance = LOCAL_DISTANCE;
 
 		switch (ns->ana_state) {
 		case NVME_ANA_OPTIMIZED:
@@ -168,6 +171,54 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
 	return found;
 }
 
+static struct nvme_ns *__nvme_rr_next_path(struct nvme_ns_head *head, int node,
+					   struct nvme_ns *old)
+{
+	struct nvme_ns *ns, *found = NULL;
+	bool try_nonoptimized = false;	/* second pass accepts non-optimized */
+
+	if (!old)
+		return NULL;
+retry:
+	ns = old;
+	do {
+		ns = list_next_or_null_rcu(&head->list, &ns->siblings,
+					   struct nvme_ns, siblings);
+		if (!ns) {
+			ns = list_first_or_null_rcu(&head->list, struct nvme_ns,
+						    siblings);
+			if (!ns)
+				return NULL;
+
+			if (ns == old)
+				/*
+				 * Wrapped to the list head, which is 'old':
+				 * no other sibling qualified in this pass.
+				 */
+				return old;
+		}
+		if (ns->disk && ns->ctrl->state == NVME_CTRL_LIVE) {
+			if (ns->ana_state == NVME_ANA_OPTIMIZED) {
+				found = ns;
+				break;
+			}
+			if (try_nonoptimized &&
+			    ns->ana_state == NVME_ANA_NONOPTIMIZED) {
+				found = ns;
+				break;
+			}
+		}
+	} while (ns != old);
+
+	if (found)
+		rcu_assign_pointer(head->current_path[node], found);
+	else if (!try_nonoptimized) {
+		try_nonoptimized = true;
+		goto retry;
+	}
+	return found;
+}
+
 static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
 {
 	return ns->ctrl->state == NVME_CTRL_LIVE &&
@@ -180,6 +231,8 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
 	struct nvme_ns *ns;
 
 	ns = srcu_dereference(head->current_path[node], &head->srcu);
+	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
+		ns = __nvme_rr_next_path(head, node, ns);
 	if (unlikely(!ns || !nvme_path_is_optimized(ns)))
 		ns = __nvme_find_path(head, node);
 	return ns;
@@ -471,6 +524,51 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl)
 	cancel_work_sync(&ctrl->ana_work);
 }
 
+#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
+	struct device_attribute subsys_attr_##_name =	\
+		__ATTR(_name, _mode, _show, _store)
+
+static const char *nvme_iopolicy_names[] = {
+	[NVME_IOPOLICY_UNKNOWN] = "unknown",
+	[NVME_IOPOLICY_NUMA] = "numa",
+	[NVME_IOPOLICY_RR] = "round-robin",
+};
+
+static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvme_subsystem *subsys =
+		container_of(dev, struct nvme_subsystem, dev);
+	unsigned int iopolicy = READ_ONCE(subsys->iopolicy);
+
+	if (iopolicy >= ARRAY_SIZE(nvme_iopolicy_names))
+		iopolicy = NVME_IOPOLICY_UNKNOWN; /* stay inside the table */
+	return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
+}
+
+static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	enum nvme_iopolicy iopolicy = NVME_IOPOLICY_UNKNOWN;
+	struct nvme_subsystem *subsys =
+		container_of(dev, struct nvme_subsystem, dev);
+
+	if (!strncmp(buf, nvme_iopolicy_names[NVME_IOPOLICY_NUMA],
+		     strlen(nvme_iopolicy_names[NVME_IOPOLICY_NUMA])))
+		iopolicy = NVME_IOPOLICY_NUMA;
+	else if (!strncmp(buf, nvme_iopolicy_names[NVME_IOPOLICY_RR],
+		     strlen(nvme_iopolicy_names[NVME_IOPOLICY_RR])))
+		iopolicy = NVME_IOPOLICY_RR;
+
+	if (iopolicy == NVME_IOPOLICY_UNKNOWN)
+		return -EINVAL;
+
+	WRITE_ONCE(subsys->iopolicy, iopolicy);
+	return count;
+}
+SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
+		      nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
+
 static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
 		char *buf)
 {
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 2b36ac922596..e24b51a608de 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -246,6 +246,14 @@ struct nvme_ctrl {
 	unsigned long discard_page_busy;
 };
 
+#ifdef CONFIG_NVME_MULTIPATH
+enum nvme_iopolicy {
+	NVME_IOPOLICY_UNKNOWN,
+	NVME_IOPOLICY_NUMA,
+	NVME_IOPOLICY_RR,
+};
+#endif
+
 struct nvme_subsystem {
 	int			instance;
 	struct device		dev;
@@ -265,6 +273,9 @@ struct nvme_subsystem {
 	u8			cmic;
 	u16			vendor_id;
 	struct ida		ns_ida;
+#ifdef CONFIG_NVME_MULTIPATH
+	enum nvme_iopolicy	iopolicy;
+#endif
 };
 
 /*
@@ -486,6 +497,7 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
 
 extern struct device_attribute dev_attr_ana_grpid;
 extern struct device_attribute dev_attr_ana_state;
+extern struct device_attribute subsys_attr_iopolicy;
 
 #else
 static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
-- 
2.16.4

             reply	other threads:[~2018-12-21 14:13 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-12-21 14:13 Hannes Reinecke [this message]
2019-01-03 20:24 ` [PATCHv2] nvme-multipath: round-robin I/O policy Ewan D. Milne
2019-01-08 12:01   ` Hannes Reinecke
2019-01-04 14:06 ` Martin Wilck
2019-01-04 14:24   ` Hannes Reinecke
2019-01-04 15:17     ` Martin Wilck
2019-01-29  8:21       ` Christoph Hellwig

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20181221141330.96599-1-hare@suse.de \
    --to=hare@suse.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.