From: John Garry <john.g.garry@oracle.com>
To: hch@lst.de, kbusch@kernel.org, sagi@grimberg.me, axboe@fb.com,
martin.petersen@oracle.com,
james.bottomley@hansenpartnership.com, hare@suse.com,
bmarzins@redhat.com, nilay@linux.ibm.com
Cc: jmeneghi@redhat.com, linux-nvme@lists.infradead.org,
linux-scsi@vger.kernel.org, michael.christie@oracle.com,
snitzer@kernel.org, dm-devel@lists.linux.dev,
linux-kernel@vger.kernel.org,
John Garry <john.g.garry@oracle.com>
Subject: [PATCH v2 03/13] libmultipath: Add path selection support
Date: Tue, 28 Apr 2026 11:10:55 +0000 [thread overview]
Message-ID: <20260428111105.1778008-4-john.g.garry@oracle.com> (raw)
In-Reply-To: <20260428111105.1778008-1-john.g.garry@oracle.com>
Add code for path selection.
NVMe ANA is abstracted into enum mpath_access_state. The motivation here is
so that SCSI ALUA can also be used. Callbacks .is_disabled and .is_optimized
are added to query the path access state, and .get_nr_active and
.get_iopolicy are added to support the path selection policies.
Path selection modes round-robin, NUMA, and queue-depth are added - the
same modes that NVMe supports.
NVMe has almost like-for-like equivalents here:
- __mpath_find_path() -> __nvme_find_path()
- mpath_find_path() -> nvme_find_path()
and similar for all introduced callee functions.
Functions mpath_set_iopolicy() and mpath_get_iopolicy() are added for
setting and reading the iopolicy, respectively.
A separate mpath_iopolicy structure is introduced. No iopolicy member is
included in the mpath_head structure, as that may not suit NVMe, where
iopolicy is per-subsystem and not per-namespace.
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
include/linux/multipath.h | 37 ++++++
lib/multipath.c | 248 ++++++++++++++++++++++++++++++++++++++
2 files changed, 285 insertions(+)
diff --git a/include/linux/multipath.h b/include/linux/multipath.h
index 3e2a513059cde..13d810148a96a 100644
--- a/include/linux/multipath.h
+++ b/include/linux/multipath.h
@@ -7,13 +7,36 @@
extern const struct block_device_operations mpath_ops;
+enum mpath_iopolicy_e {
+ MPATH_IOPOLICY_NUMA,
+ MPATH_IOPOLICY_RR,
+ MPATH_IOPOLICY_QD,
+};
+
+struct mpath_iopolicy {
+ enum mpath_iopolicy_e iopolicy;
+};
+
+enum mpath_access_state {
+ MPATH_STATE_OPTIMIZED,
+ MPATH_STATE_NONOPTIMIZED,
+ MPATH_STATE_OTHER
+};
+
struct mpath_device {
struct mpath_head *mpath_head;
struct list_head siblings;
struct gendisk *disk;
+ int numa_node;
+ enum mpath_access_state access_state;
};
struct mpath_head_template {
+ bool (*is_disabled)(struct mpath_device *);
+ bool (*is_optimized)(struct mpath_device *);
+ int (*get_nr_active)(struct mpath_device *);
+ enum mpath_iopolicy_e (*get_iopolicy)(struct mpath_head *);
+ const struct attribute_group **device_groups;
};
#define MPATH_HEAD_DISK_LIVE 0
@@ -45,6 +68,14 @@ static inline struct mpath_head *mpath_gendisk_to_head(struct gendisk *disk)
return mpath_bd_device_to_head(disk_to_dev(disk));
}
+static inline enum mpath_iopolicy_e mpath_read_iopolicy(
+ struct mpath_iopolicy *mpath_iopolicy)
+{
+ return READ_ONCE(mpath_iopolicy->iopolicy);
+}
+void mpath_synchronize(struct mpath_head *mpath_head);
+int mpath_set_iopolicy(const char *val, int *iopolicy);
+int mpath_get_iopolicy(char *buf, int iopolicy);
int mpath_get_head(struct mpath_head *mpath_head);
void mpath_put_head(struct mpath_head *mpath_head);
struct mpath_head *mpath_alloc_head(void);
@@ -63,4 +94,10 @@ static inline bool is_mpath_disk(struct gendisk *disk)
return false;
#endif
}
+
+static inline bool mpath_qd_iopolicy(struct mpath_iopolicy *mpath_iopolicy)
+{
+ return mpath_read_iopolicy(mpath_iopolicy) == MPATH_IOPOLICY_QD;
+}
+
#endif // _LIBMULTIPATH_H
diff --git a/lib/multipath.c b/lib/multipath.c
index 726d9bec13553..fa211420b72c3 100644
--- a/lib/multipath.c
+++ b/lib/multipath.c
@@ -6,8 +6,242 @@
#include <linux/module.h>
#include <linux/multipath.h>
+static struct mpath_device *mpath_find_path(struct mpath_head *mpath_head);
+
static struct workqueue_struct *mpath_wq;
+static const char *mpath_iopolicy_names[] = {
+ [MPATH_IOPOLICY_NUMA] = "numa",
+ [MPATH_IOPOLICY_RR] = "round-robin",
+ [MPATH_IOPOLICY_QD] = "queue-depth",
+};
+
+int mpath_set_iopolicy(const char *val, int *iopolicy)
+{
+ if (!val)
+ return -EINVAL;
+ if (!strncmp(val, "numa", 4))
+ *iopolicy = MPATH_IOPOLICY_NUMA;
+ else if (!strncmp(val, "round-robin", 11))
+ *iopolicy = MPATH_IOPOLICY_RR;
+ else if (!strncmp(val, "queue-depth", 11))
+ *iopolicy = MPATH_IOPOLICY_QD;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mpath_set_iopolicy);
+
+int mpath_get_iopolicy(char *buf, int iopolicy)
+{
+ return sprintf(buf, "%s\n", mpath_iopolicy_names[iopolicy]);
+}
+EXPORT_SYMBOL_GPL(mpath_get_iopolicy);
+
+
+void mpath_synchronize(struct mpath_head *mpath_head)
+{
+ synchronize_srcu(&mpath_head->srcu);
+}
+EXPORT_SYMBOL_GPL(mpath_synchronize);
+
+static bool mpath_path_is_disabled(struct mpath_head *mpath_head,
+ struct mpath_device *mpath_device)
+{
+ return mpath_head->mpdt->is_disabled(mpath_device);
+}
+
+static struct mpath_device *__mpath_find_path(struct mpath_head *mpath_head,
+ int node)
+{
+ int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
+ struct mpath_device *found = NULL, *fallback = NULL, *mpath_device;
+
+ list_for_each_entry_srcu(mpath_device, &mpath_head->dev_list, siblings,
+ srcu_read_lock_held(&mpath_head->srcu)) {
+ if (mpath_path_is_disabled(mpath_head, mpath_device))
+ continue;
+
+ if (mpath_device->numa_node != NUMA_NO_NODE &&
+ (mpath_head->mpdt->get_iopolicy(mpath_head) ==
+ MPATH_IOPOLICY_NUMA))
+ distance = node_distance(node,
+ mpath_device->numa_node);
+ else
+ distance = LOCAL_DISTANCE;
+
+ switch(mpath_device->access_state) {
+ case MPATH_STATE_OPTIMIZED:
+ if (distance < found_distance) {
+ found_distance = distance;
+ found = mpath_device;
+ }
+ break;
+ case MPATH_STATE_NONOPTIMIZED:
+ if (distance < fallback_distance) {
+ fallback_distance = distance;
+ fallback = mpath_device;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!found)
+ found = fallback;
+
+ if (found)
+ rcu_assign_pointer(mpath_head->current_path[node], found);
+
+ return found;
+}
+
+static struct mpath_device *mpath_next_dev(struct mpath_head *mpath_head,
+ struct mpath_device *mpath_dev)
+{
+ mpath_dev = list_next_or_null_rcu(&mpath_head->dev_list,
+ &mpath_dev->siblings, struct mpath_device,
+ siblings);
+
+ if (mpath_dev)
+ return mpath_dev;
+ return list_first_or_null_rcu(&mpath_head->dev_list,
+ struct mpath_device, siblings);
+}
+
+static struct mpath_device *mpath_round_robin_path(
+ struct mpath_head *mpath_head)
+{
+ struct mpath_device *mpath_device, *found = NULL;
+ int node = numa_node_id();
+ enum mpath_access_state access_state_old;
+ struct mpath_device *old =
+ srcu_dereference(mpath_head->current_path[node],
+ &mpath_head->srcu);
+
+ if (unlikely(!old))
+ return __mpath_find_path(mpath_head, node);
+
+ if (list_is_singular(&mpath_head->dev_list)) {
+ if (mpath_path_is_disabled(mpath_head, old))
+ return NULL;
+ return old;
+ }
+
+ for (mpath_device = mpath_next_dev(mpath_head, old);
+ mpath_device && mpath_device != old;
+ mpath_device = mpath_next_dev(mpath_head, mpath_device)) {
+
+ if (mpath_path_is_disabled(mpath_head, mpath_device))
+ continue;
+ if (mpath_device->access_state == MPATH_STATE_OPTIMIZED) {
+ found = mpath_device;
+ goto out;
+ }
+ if (mpath_device->access_state == MPATH_STATE_NONOPTIMIZED)
+ found = mpath_device;
+ }
+
+ /*
+ * The loop above skips the current path for round-robin semantics.
+ * Fall back to the current path if either:
+ * - no other optimized path found and current is optimized,
+ * - no other usable path found and current is usable.
+ */
+ access_state_old = old->access_state;
+ if (!mpath_path_is_disabled(mpath_head, old) &&
+ (access_state_old == MPATH_STATE_OPTIMIZED ||
+ (!found && access_state_old == MPATH_STATE_NONOPTIMIZED)))
+ return old;
+
+ if (!found)
+ return NULL;
+out:
+ rcu_assign_pointer(mpath_head->current_path[node], found);
+
+ return found;
+}
+
+static struct mpath_device *mpath_queue_depth_path(
+ struct mpath_head *mpath_head)
+{
+ struct mpath_device *best_opt = NULL, *mpath_device;
+ struct mpath_device *best_nonopt = NULL;
+ unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
+ unsigned int depth;
+ int (*get_nr_active)(struct mpath_device *) =
+ mpath_head->mpdt->get_nr_active;
+
+ list_for_each_entry_srcu(mpath_device, &mpath_head->dev_list, siblings,
+ srcu_read_lock_held(&mpath_head->srcu)) {
+
+ if (mpath_path_is_disabled(mpath_head, mpath_device))
+ continue;
+
+ depth = get_nr_active(mpath_device);
+
+ switch (mpath_device->access_state) {
+ case MPATH_STATE_OPTIMIZED:
+ if (depth < min_depth_opt) {
+ min_depth_opt = depth;
+ best_opt = mpath_device;
+ }
+ break;
+ case MPATH_STATE_NONOPTIMIZED:
+ if (depth < min_depth_nonopt) {
+ min_depth_nonopt = depth;
+ best_nonopt = mpath_device;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (min_depth_opt == 0)
+ return best_opt;
+ }
+
+ return best_opt ? best_opt : best_nonopt;
+}
+
+static inline bool mpath_path_is_optimized(struct mpath_head *mpath_head,
+ struct mpath_device *mpath_device)
+{
+ return mpath_head->mpdt->is_optimized(mpath_device);
+}
+
+static struct mpath_device *mpath_numa_path(struct mpath_head *mpath_head)
+{
+ int node = numa_node_id();
+ struct mpath_device *mpath_device;
+
+ mpath_device = srcu_dereference(mpath_head->current_path[node],
+ &mpath_head->srcu);
+ if (unlikely(!mpath_device))
+ return __mpath_find_path(mpath_head, node);
+ if (unlikely(!mpath_path_is_optimized(mpath_head, mpath_device)))
+ return __mpath_find_path(mpath_head, node);
+ return mpath_device;
+}
+
+__maybe_unused
+static struct mpath_device *mpath_find_path(struct mpath_head *mpath_head)
+{
+ enum mpath_iopolicy_e iopolicy =
+ mpath_head->mpdt->get_iopolicy(mpath_head);
+
+ switch (iopolicy) {
+ case MPATH_IOPOLICY_QD:
+ return mpath_queue_depth_path(mpath_head);
+ case MPATH_IOPOLICY_RR:
+ return mpath_round_robin_path(mpath_head);
+ default:
+ return mpath_numa_path(mpath_head);
+ }
+}
+
static void mpath_free_head(struct kref *ref)
{
struct mpath_head *mpath_head =
@@ -71,6 +305,7 @@ void mpath_remove_disk(struct mpath_head *mpath_head)
if (test_and_clear_bit(MPATH_HEAD_DISK_LIVE, &mpath_head->flags)) {
struct gendisk *disk = mpath_head->disk;
+ mpath_synchronize(mpath_head);
del_gendisk(disk);
}
}
@@ -121,6 +356,19 @@ void mpath_device_set_live(struct mpath_device *mpath_device)
}
queue_work(mpath_wq, &mpath_head->partition_scan_work);
}
+
+ mutex_lock(&mpath_head->lock);
+ if (mpath_path_is_optimized(mpath_head, mpath_device)) {
+ int node, srcu_idx;
+
+ srcu_idx = srcu_read_lock(&mpath_head->srcu);
+ for_each_online_node(node)
+ __mpath_find_path(mpath_head, node);
+ srcu_read_unlock(&mpath_head->srcu, srcu_idx);
+ }
+ mutex_unlock(&mpath_head->lock);
+
+ mpath_synchronize(mpath_head);
}
EXPORT_SYMBOL_GPL(mpath_device_set_live);
--
2.43.5
next prev parent reply other threads:[~2026-04-28 11:11 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-28 11:10 [PATCH v2 00/13] libmultipath: a generic multipath lib for block drivers John Garry
2026-04-28 11:10 ` [PATCH v2 01/13] libmultipath: Add initial framework John Garry
2026-04-28 11:10 ` [PATCH v2 02/13] libmultipath: Add basic gendisk support John Garry
2026-04-28 11:10 ` John Garry [this message]
2026-04-28 11:10 ` [PATCH v2 04/13] libmultipath: Add bio handling John Garry
2026-04-28 11:10 ` [PATCH v2 05/13] libmultipath: Add support for mpath_device management John Garry
2026-04-28 11:10 ` [PATCH v2 06/13] libmultipath: Add cdev support John Garry
2026-04-28 11:10 ` [PATCH v2 07/13] libmultipath: Add delayed removal support John Garry
2026-04-28 11:11 ` [PATCH v2 08/13] libmultipath: Add sysfs helpers John Garry
2026-04-28 11:11 ` [PATCH v2 09/13] libmultipath: Add PR support John Garry
2026-04-28 11:11 ` [PATCH v2 10/13] libmultipath: Add mpath_bdev_report_zones() John Garry
2026-04-28 11:11 ` [PATCH v2 11/13] libmultipath: Add support for block device IOCTL John Garry
2026-04-28 11:11 ` [PATCH v2 12/13] libmultipath: Add mpath_bdev_getgeo() John Garry
2026-04-28 11:11 ` [PATCH v2 13/13] libmultipath: Add mpath_bdev_get_unique_id() John Garry
2026-05-10 22:03 ` [PATCH v2 00/13] libmultipath: a generic multipath lib for block drivers Sagi Grimberg
2026-05-11 7:30 ` John Garry
2026-05-15 0:24 ` Mike Snitzer
2026-05-15 8:45 ` John Garry
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260428111105.1778008-4-john.g.garry@oracle.com \
--to=john.g.garry@oracle.com \
--cc=axboe@fb.com \
--cc=bmarzins@redhat.com \
--cc=dm-devel@lists.linux.dev \
--cc=hare@suse.com \
--cc=hch@lst.de \
--cc=james.bottomley@hansenpartnership.com \
--cc=jmeneghi@redhat.com \
--cc=kbusch@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-nvme@lists.infradead.org \
--cc=linux-scsi@vger.kernel.org \
--cc=martin.petersen@oracle.com \
--cc=michael.christie@oracle.com \
--cc=nilay@linux.ibm.com \
--cc=sagi@grimberg.me \
--cc=snitzer@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.