From: Nilay Shroff <nilay@linux.ibm.com>
To: linux-nvme@lists.infradead.org
Cc: hare@suse.de, hch@lst.de, kbusch@kernel.org, sagi@grimberg.me,
dwagner@suse.de, axboe@kernel.dk, kanie@linux.alibaba.com,
gjoyce@ibm.com
Subject: [RFC PATCHv5 4/7] nvme-multipath: add debugfs attribute adaptive_ewma_shift
Date: Wed, 5 Nov 2025 16:03:23 +0530 [thread overview]
Message-ID: <20251105103347.86059-5-nilay@linux.ibm.com> (raw)
In-Reply-To: <20251105103347.86059-1-nilay@linux.ibm.com>
By default, the EWMA (Exponentially Weighted Moving Average) shift
value, used for smoothing latency samples for the adaptive iopolicy, is
set to 3. The EWMA is calculated using the following formula:
ewma = (old * ((1 << ewma_shift) - 1) + new) >> ewma_shift;
The default value of 3 assigns ~87.5% weight to the existing EWMA value
and ~12.5% weight to the new latency sample. This provides a stable
average that smooths out short-term variations.
However, different workloads may require faster or slower adaptation to
changing conditions. This commit introduces a new debugfs attribute,
adaptive_ewma_shift, allowing users to tune the weighting factor.
For example:
- adaptive_ewma_shift = 2 => 75% old, 25% new
- adaptive_ewma_shift = 1 => 50% old, 50% new
- adaptive_ewma_shift = 0 => 0% old, 100% new
Valid values range from 0 to 8; larger values are rejected with -EINVAL.
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
---
drivers/nvme/host/core.c | 3 +++
drivers/nvme/host/debugfs.c | 46 +++++++++++++++++++++++++++++++++++
drivers/nvme/host/multipath.c | 8 +++---
drivers/nvme/host/nvme.h | 1 +
4 files changed, 54 insertions(+), 4 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c15dfcaf3de2..43b9b0d6cbdf 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3913,6 +3913,9 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
head->ids = info->ids;
head->shared = info->is_shared;
head->rotational = info->is_rotational;
+#ifdef CONFIG_NVME_MULTIPATH
+ head->adp_ewma_shift = NVME_DEFAULT_ADP_EWMA_SHIFT;
+#endif
ratelimit_state_init(&head->rs_nuse, 5 * HZ, 1);
ratelimit_set_flags(&head->rs_nuse, RATELIMIT_MSG_ON_RELEASE);
kref_init(&head->ref);
diff --git a/drivers/nvme/host/debugfs.c b/drivers/nvme/host/debugfs.c
index 6bb57c4b5c3b..e3c37041e8f2 100644
--- a/drivers/nvme/host/debugfs.c
+++ b/drivers/nvme/host/debugfs.c
@@ -105,8 +105,54 @@ static const struct file_operations nvme_debugfs_fops = {
.release = nvme_debugfs_release,
};
+#ifdef CONFIG_NVME_MULTIPATH
+static int nvme_adp_ewma_shift_show(void *data, struct seq_file *m)
+{
+ struct nvme_ns_head *head = data;
+
+ seq_printf(m, "%u\n", READ_ONCE(head->adp_ewma_shift));
+ return 0;
+}
+
+static ssize_t nvme_adp_ewma_shift_store(void *data, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct nvme_ns_head *head = data;
+ char kbuf[8];
+ u32 res;
+ int ret;
+ size_t len;
+ char *arg;
+
+ len = min(sizeof(kbuf) - 1, count);
+
+ if (copy_from_user(kbuf, ubuf, len))
+ return -EFAULT;
+
+ kbuf[len] = '\0';
+ arg = strstrip(kbuf);
+
+ ret = kstrtou32(arg, 0, &res);
+ if (ret)
+ return ret;
+
+ /*
+ * Values greater than 8 are nonsensical, as they effectively assign
+ * zero weight to new samples.
+ */
+ if (res > 8)
+ return -EINVAL;
+
+ WRITE_ONCE(head->adp_ewma_shift, res);
+ return count;
+}
+#endif
static const struct nvme_debugfs_attr nvme_mpath_debugfs_attrs[] = {
+#ifdef CONFIG_NVME_MULTIPATH
+ {"adaptive_ewma_shift", 0600, nvme_adp_ewma_shift_show,
+ nvme_adp_ewma_shift_store},
+#endif
{},
};
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 047dd9da9cbf..c7470cc8844e 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -294,10 +294,9 @@ static void nvme_mpath_weight_work(struct work_struct *weight_work)
* For instance, with EWMA_SHIFT = 3, this assigns 7/8 (~87.5 %) weight to
* the existing/old ewma and 1/8 (~12.5%) weight to the new sample.
*/
-static inline u64 ewma_update(u64 old, u64 new)
+static inline u64 ewma_update(u64 old, u64 new, u32 ewma_shift)
{
- return (old * ((1 << NVME_DEFAULT_ADP_EWMA_SHIFT) - 1)
- + new) >> NVME_DEFAULT_ADP_EWMA_SHIFT;
+ return (old * ((1 << ewma_shift) - 1) + new) >> ewma_shift;
}
static void nvme_mpath_add_sample(struct request *rq, struct nvme_ns *ns)
@@ -389,7 +388,8 @@ static void nvme_mpath_add_sample(struct request *rq, struct nvme_ns *ns)
if (unlikely(!stat->slat_ns))
WRITE_ONCE(stat->slat_ns, avg_lat_ns);
else {
- slat_ns = ewma_update(stat->slat_ns, avg_lat_ns);
+ slat_ns = ewma_update(stat->slat_ns, avg_lat_ns,
+ READ_ONCE(head->adp_ewma_shift));
WRITE_ONCE(stat->slat_ns, slat_ns);
}
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 1c1ec2a7f9ad..97de45634f08 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -545,6 +545,7 @@ struct nvme_ns_head {
unsigned int delayed_removal_secs;
struct nvme_ns * __percpu *adp_path;
+ u32 adp_ewma_shift;
#define NVME_NSHEAD_DISK_LIVE 0
#define NVME_NSHEAD_QUEUE_IF_NO_PATH 1
--
2.51.0
next prev parent reply other threads:[~2025-11-05 10:34 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-11-05 10:33 [RFC PATCHv5 0/7] nvme-multipath: introduce adaptive I/O policy Nilay Shroff
2025-11-05 10:33 ` [RFC PATCHv5 1/7] block: expose blk_stat_{enable,disable}_accounting() to drivers Nilay Shroff
2025-11-05 10:33 ` [RFC PATCHv5 2/7] nvme-multipath: add support for adaptive I/O policy Nilay Shroff
2025-11-05 10:33 ` [RFC PATCHv5 3/7] nvme: add generic debugfs support Nilay Shroff
2025-11-05 10:33 ` Nilay Shroff [this message]
2025-11-05 10:33 ` [RFC PATCHv5 5/7] nvme-multipath: add debugfs attribute adaptive_weight_timeout Nilay Shroff
2025-11-05 10:33 ` [RFC PATCHv5 6/7] nvme-multipath: add debugfs attribute adaptive_stat Nilay Shroff
2025-11-05 10:33 ` [RFC PATCHv5 7/7] nvme-multipath: add documentation for adaptive I/O policy Nilay Shroff
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20251105103347.86059-5-nilay@linux.ibm.com \
--to=nilay@linux.ibm.com \
--cc=axboe@kernel.dk \
--cc=dwagner@suse.de \
--cc=gjoyce@ibm.com \
--cc=hare@suse.de \
--cc=hch@lst.de \
--cc=kanie@linux.alibaba.com \
--cc=kbusch@kernel.org \
--cc=linux-nvme@lists.infradead.org \
--cc=sagi@grimberg.me \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).