* [PATCH 1/2] block: track per-node I/O latency
2024-03-26 15:35 [PATCH RFC 0/2] block,nvme: latency-based I/O scheduler Hannes Reinecke
@ 2024-03-26 15:35 ` Hannes Reinecke
2024-03-27 18:03 ` kernel test robot
2024-03-27 20:59 ` kernel test robot
0 siblings, 2 replies; 25+ messages in thread
From: Hannes Reinecke @ 2024-03-26 15:35 UTC (permalink / raw)
To: Jens Axboe
Cc: Keith Busch, Christoph Hellwig, Sagi Grimberg, linux-nvme,
linux-block, Hannes Reinecke
Add a new option 'BLK_NODE_LATENCY' to track per-node I/O latency.
This can be used by I/O scheduler to determine the 'best' queue
to send I/O to.
Signed-off-by: Hannes Reinecke <hare@kernel.org>
---
block/Kconfig | 7 +
block/Makefile | 1 +
block/blk-mq-debugfs.c | 2 +
block/blk-nodelat.c | 368 +++++++++++++++++++++++++++++++++++++++++
block/blk-rq-qos.h | 6 +
include/linux/blk-mq.h | 11 ++
6 files changed, 395 insertions(+)
create mode 100644 block/blk-nodelat.c
diff --git a/block/Kconfig b/block/Kconfig
index 1de4682d48cc..7ce60becfb1d 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -186,6 +186,13 @@ config BLK_CGROUP_IOPRIO
scheduler and block devices process requests. Only some I/O schedulers
and some block devices support I/O priorities.
+config BLK_NODE_LATENCY
+ bool "Track per-node I/O latency"
+ help
+ Enable the .nlat interface for tracking per-node I/O latency.
+ This can be used by I/O schedulers to determine the queue with the
+ least latency.
+
config BLK_DEBUG_FS
bool "Block layer debugging information in debugfs"
default y
diff --git a/block/Makefile b/block/Makefile
index 46ada9dc8bbf..e2683f55d15f 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -21,6 +21,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
obj-$(CONFIG_BLK_CGROUP_IOPRIO) += blk-ioprio.o
obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o
obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o
+obj-$(CONFIG_BLK_NODE_LATENCY) += blk-nodelat.o
obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o
obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 94668e72ab09..cb38228b95d8 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -762,6 +762,8 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id)
return "latency";
case RQ_QOS_COST:
return "cost";
+ case RQ_QOS_NLAT:
+ return "node-latency";
}
return "unknown";
}
diff --git a/block/blk-nodelat.c b/block/blk-nodelat.c
new file mode 100644
index 000000000000..45d7e622b147
--- /dev/null
+++ b/block/blk-nodelat.c
@@ -0,0 +1,368 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Per-node request latency tracking.
+ *
+ * Copyright (C) 2023 Hannes Reinecke
+ *
+ * A simple per-node latency tracker for use
+ * by I/O scheduler.
+ * Latencies are measures over 'win_usec' microseconds
+ * and stored per node.
+ * If the number of measurements falls below 'lowat'
+ * the measurement is assumed to be unreliable and
+ * will become 'stale'.
+ * These 'stale' latencies can be 'decayed', where
+ * during each measurement interval the 'stale'
+ * latency value is decreased by 'decay' percent.
+ * Once the 'stale' latency reaches zero it
+ * will be updated by the measured latency.
+ */
+#include <linux/kernel.h>
+#include <linux/blk_types.h>
+#include <linux/slab.h>
+
+#include "blk-stat.h"
+#include "blk-rq-qos.h"
+#include "blk.h"
+
+#define NLAT_DEFAULT_LOWAT 2
+#define NLAT_DEFAULT_DECAY 50
+
+struct rq_nlat {
+ struct rq_qos rqos;
+
+ u64 win_usec; /* latency measurement window */
+ unsigned int lowat; /* Low Watermark below which latency measurement is deemed unreliable */
+ unsigned int decay; /* Percentage for 'decaying' latencies */
+ bool enabled;
+
+ struct blk_stat_callback *cb;
+
+ unsigned int num;
+ u64 *latency;
+ unsigned int *samples;
+};
+
+static inline struct rq_nlat *RQNLAT(struct rq_qos *rqos)
+{
+ return container_of(rqos, struct rq_nlat, rqos);
+}
+
+static u64 nlat_default_latency_usec(struct request_queue *q)
+{
+ /*
+ * We default to 2msec for non-rotational storage, and 75msec
+ * for rotational storage.
+ */
+ if (blk_queue_nonrot(q))
+ return 2000ULL;
+ else
+ return 75000ULL;
+}
+
+static void nlat_timer_fn(struct blk_stat_callback *cb)
+{
+ struct rq_nlat *nlat = cb->data;
+ int n;
+
+ for (n = 0; n < cb->buckets; n++) {
+ if (cb->stat[n].nr_samples < nlat->lowat && nlat->latency[n]) {
+ /*
+ * 'decay' the latency by the specified
+ * percentage to ensure the nodes are
+ * being tested to balance out temporary
+ * latency spikes.
+ */
+ if (nlat->decay)
+ nlat->latency[n] =
+ div64_u64(nlat->latency[n] * nlat->decay, 100);
+ } else
+ nlat->latency[n] = cb->stat[n].mean;
+ nlat->samples[n] = cb->stat[n].nr_samples;
+ }
+ if (nlat->enabled)
+ blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
+}
+
+static int nlat_node(const struct request *rq)
+{
+ if (!rq->mq_ctx)
+ return -1;
+ return cpu_to_node(blk_mq_rq_cpu((struct request *)rq));
+}
+
+static void nlat_exit(struct rq_qos *rqos)
+{
+ struct rq_nlat *nlat = RQNLAT(rqos);
+
+ blk_stat_remove_callback(nlat->rqos.disk->queue, nlat->cb);
+ blk_stat_free_callback(nlat->cb);
+ kfree(nlat->samples);
+ kfree(nlat->latency);
+ kfree(nlat);
+}
+
+u64 blk_nodelat_latency(struct request_queue *q, int node)
+{
+ struct rq_qos *rqos;
+ struct rq_nlat *nlat;
+
+ rqos = nlat_rq_qos(q);
+ if (!rqos)
+ return 0;
+ nlat = RQNLAT(rqos);
+ if (node > nlat->num)
+ return 0;
+
+ return div64_u64(nlat->latency[node], 1000);
+}
+EXPORT_SYMBOL_GPL(blk_nodelat_latency);
+
+int blk_nodelat_enable(struct request_queue *q)
+{
+ struct rq_qos *rqos;
+ struct rq_nlat *nlat;
+
+ /* Throttling already enabled? */
+ rqos = nlat_rq_qos(q);
+ if (!rqos)
+ return -EINVAL;
+ nlat = RQNLAT(rqos);
+ if (nlat->enabled)
+ return 0;
+
+ /* Queue not registered? Maybe shutting down... */
+ if (!blk_queue_registered(q))
+ return -EAGAIN;
+
+ if (queue_is_mq(q)) {
+ nlat->enabled = true;
+ blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(blk_nodelat_enable);
+
+void blk_nodelat_disable(struct request_queue *q)
+{
+ struct rq_qos *rqos = nlat_rq_qos(q);
+ struct rq_nlat *nlat;
+ if (!rqos)
+ return;
+ nlat = RQNLAT(rqos);
+ if (nlat->enabled) {
+ blk_stat_deactivate(nlat->cb);
+ nlat->enabled = false;
+ }
+}
+EXPORT_SYMBOL_GPL(blk_nodelat_disable);
+
+#ifdef CONFIG_BLK_DEBUG_FS
+static int nlat_win_usec_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+
+ seq_printf(m, "%llu\n", nlat->win_usec);
+ return 0;
+}
+
+static ssize_t nlat_win_usec_write(void *data, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+ char val[16] = { };
+ u64 usec;
+ int err;
+
+ if (blk_queue_dying(nlat->rqos.disk->queue))
+ return -ENOENT;
+
+ if (count >= sizeof(val))
+ return -EINVAL;
+
+ if (copy_from_user(val, buf, count))
+ return -EFAULT;
+
+ err = kstrtoull(val, 10, &usec);
+ if (err)
+ return err;
+ blk_stat_deactivate(nlat->cb);
+ nlat->win_usec = usec;
+ blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
+
+ return count;
+}
+
+static int nlat_lowat_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+
+ seq_printf(m, "%u\n", nlat->lowat);
+ return 0;
+}
+
+static ssize_t nlat_lowat_write(void *data, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+ char val[16] = { };
+ unsigned int lowat;
+ int err;
+
+ if (blk_queue_dying(nlat->rqos.disk->queue))
+ return -ENOENT;
+
+ if (count >= sizeof(val))
+ return -EINVAL;
+
+ if (copy_from_user(val, buf, count))
+ return -EFAULT;
+
+ err = kstrtouint(val, 10, &lowat);
+ if (err)
+ return err;
+ blk_stat_deactivate(nlat->cb);
+ nlat->lowat = lowat;
+ blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
+
+ return count;
+}
+
+static int nlat_decay_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+
+ seq_printf(m, "%u\n", nlat->decay);
+ return 0;
+}
+
+static ssize_t nlat_decay_write(void *data, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+ char val[16] = { };
+ unsigned int decay;
+ int err;
+
+ if (blk_queue_dying(nlat->rqos.disk->queue))
+ return -ENOENT;
+
+ if (count >= sizeof(val))
+ return -EINVAL;
+
+ if (copy_from_user(val, buf, count))
+ return -EFAULT;
+
+ err = kstrtouint(val, 10, &decay);
+ if (err)
+ return err;
+ if (decay > 100)
+ return -EINVAL;
+ blk_stat_deactivate(nlat->cb);
+ nlat->decay = decay;
+ blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
+
+ return count;
+}
+
+static int nlat_enabled_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+
+ seq_printf(m, "%d\n", nlat->enabled);
+ return 0;
+}
+
+static int nlat_id_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+
+ seq_printf(m, "%u\n", rqos->id);
+ return 0;
+}
+
+static int nlat_latency_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+ int n;
+
+ for (n = 0; n < nlat->num; n++)
+ seq_printf(m, "%llu %u ", nlat->latency[n], nlat->samples[n]);
+ seq_printf(m, "\n");
+ return 0;
+}
+
+static const struct blk_mq_debugfs_attr nlat_debugfs_attrs[] = {
+ {"win_usec", 0600, nlat_win_usec_show, nlat_win_usec_write},
+ {"lowat", 0600, nlat_lowat_show, nlat_lowat_write},
+ {"decay", 0600, nlat_decay_show, nlat_decay_write},
+ {"enabled", 0400, nlat_enabled_show},
+ {"id", 0400, nlat_id_show},
+ {"latency", 0400, nlat_latency_show},
+ {},
+};
+#endif
+
+static const struct rq_qos_ops nlat_rqos_ops = {
+ .exit = nlat_exit,
+#ifdef CONFIG_BLK_DEBUG_FS
+ .debugfs_attrs = nlat_debugfs_attrs,
+#endif
+};
+
+int blk_nodelat_init(struct gendisk *disk)
+{
+ struct rq_nlat *nlat;
+ int nlat_num = num_possible_nodes();
+ int ret = -ENOMEM;
+
+ nlat = kzalloc(sizeof(*nlat), GFP_KERNEL);
+ if (!nlat)
+ return -ENOMEM;
+
+ nlat->num = nlat_num;
+ nlat->lowat = 2;
+ nlat->decay = 50;
+ nlat->latency = kzalloc(sizeof(u64) * nlat->num, GFP_KERNEL);
+ if (!nlat->latency)
+ goto err_free;
+ nlat->samples = kzalloc(sizeof(unsigned int) * nlat->num, GFP_KERNEL);
+ if (!nlat->samples)
+ goto err_free;
+ nlat->cb = blk_stat_alloc_callback(nlat_timer_fn, nlat_node,
+ nlat->num, nlat);
+ if (!nlat->cb)
+ goto err_free;
+
+ nlat->win_usec = nlat_default_latency_usec(disk->queue);
+
+ /*
+ * Assign rwb and add the stats callback.
+ */
+ mutex_lock(&disk->queue->rq_qos_mutex);
+ ret = rq_qos_add(&nlat->rqos, disk, RQ_QOS_NLAT, &nlat_rqos_ops);
+ mutex_unlock(&disk->queue->rq_qos_mutex);
+ if (ret)
+ goto err_free_cb;
+
+ blk_stat_add_callback(disk->queue, nlat->cb);
+
+ return 0;
+
+err_free_cb:
+ blk_stat_free_callback(nlat->cb);
+err_free:
+ kfree(nlat->samples);
+ kfree(nlat->latency);
+ kfree(nlat);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blk_nodelat_init);
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 37245c97ee61..2fc11ced0c00 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -17,6 +17,7 @@ enum rq_qos_id {
RQ_QOS_WBT,
RQ_QOS_LATENCY,
RQ_QOS_COST,
+ RQ_QOS_NLAT,
};
struct rq_wait {
@@ -79,6 +80,11 @@ static inline struct rq_qos *iolat_rq_qos(struct request_queue *q)
return rq_qos_id(q, RQ_QOS_LATENCY);
}
+static inline struct rq_qos *nlat_rq_qos(struct request_queue *q)
+{
+ return rq_qos_id(q, RQ_QOS_NLAT);
+}
+
static inline void rq_wait_init(struct rq_wait *rq_wait)
{
atomic_set(&rq_wait->inflight, 0);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 390d35fa0032..daeb837b9bc6 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -1229,4 +1229,15 @@ static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
}
#endif /* CONFIG_BLK_DEV_ZONED */
+#ifdef CONFIG_BLK_NODE_LATENCY
+int blk_nodelat_enable(struct request_queue *q);
+void blk_nodelat_disable(struct request_queue *q);
+u64 blk_nodelat_latency(struct request_queue *q, int node);
+int blk_nodelat_init(struct gendisk *disk);
+#else
+static inline in blk_nodelat_enable(struct request_queue *q) { return 0; }
+static inline void blk_nodelat_disable(struct request_queue *q) {}
+u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
+static inline in blk_nodelat_init(struct gendisk *disk) { return -ENOTSUPP; }
+#endif
#endif /* BLK_MQ_H */
--
2.35.3
^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [PATCH 1/2] block: track per-node I/O latency
2024-03-26 15:35 ` [PATCH 1/2] block: track per-node I/O latency Hannes Reinecke
@ 2024-03-27 18:03 ` kernel test robot
2024-03-27 20:59 ` kernel test robot
1 sibling, 0 replies; 25+ messages in thread
From: kernel test robot @ 2024-03-27 18:03 UTC (permalink / raw)
To: Hannes Reinecke, Jens Axboe
Cc: oe-kbuild-all, Keith Busch, Christoph Hellwig, Sagi Grimberg,
linux-nvme, linux-block, Hannes Reinecke
Hi Hannes,
kernel test robot noticed the following build errors:
[auto build test ERROR on axboe-block/for-next]
[also build test ERROR on linus/master v6.9-rc1 next-20240327]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Hannes-Reinecke/block-track-per-node-I-O-latency/20240326-234521
base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link: https://lore.kernel.org/r/20240326153529.75989-2-hare%40kernel.org
patch subject: [PATCH 1/2] block: track per-node I/O latency
config: openrisc-allnoconfig (https://download.01.org/0day-ci/archive/20240328/202403280137.o1GjQ6cI-lkp@intel.com/config)
compiler: or1k-linux-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240328/202403280137.o1GjQ6cI-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202403280137.o1GjQ6cI-lkp@intel.com/
All error/warnings (new ones prefixed by >>):
In file included from include/linux/blk-integrity.h:5,
from block/bdev.c:15:
>> include/linux/blk-mq.h:1240:15: error: unknown type name 'in'
1240 | static inline in blk_nodelat_enable(struct request_queue *q) { return 0; }
| ^~
>> include/linux/blk-mq.h:1242:5: warning: no previous prototype for 'blk_nodelat_latency' [-Wmissing-prototypes]
1242 | u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
| ^~~~~~~~~~~~~~~~~~~
include/linux/blk-mq.h:1243:15: error: unknown type name 'in'
1243 | static inline in blk_nodelat_init(struct gendisk *disk) { return -ENOTSUPP; }
| ^~
vim +/in +1240 include/linux/blk-mq.h
1233
1234 #ifdef CONFIG_BLK_NODE_LATENCY
1235 int blk_nodelat_enable(struct request_queue *q);
1236 void blk_nodelat_disable(struct request_queue *q);
1237 u64 blk_nodelat_latency(struct request_queue *q, int node);
1238 int blk_nodelat_init(struct gendisk *disk);
1239 #else
> 1240 static inline in blk_nodelat_enable(struct request_queue *q) { return 0; }
1241 static inline void blk_nodelat_disable(struct request_queue *q) {}
> 1242 u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 1/2] block: track per-node I/O latency
2024-03-26 15:35 ` [PATCH 1/2] block: track per-node I/O latency Hannes Reinecke
2024-03-27 18:03 ` kernel test robot
@ 2024-03-27 20:59 ` kernel test robot
1 sibling, 0 replies; 25+ messages in thread
From: kernel test robot @ 2024-03-27 20:59 UTC (permalink / raw)
To: Hannes Reinecke, Jens Axboe
Cc: llvm, oe-kbuild-all, Keith Busch, Christoph Hellwig,
Sagi Grimberg, linux-nvme, linux-block, Hannes Reinecke
Hi Hannes,
kernel test robot noticed the following build warnings:
[auto build test WARNING on axboe-block/for-next]
[also build test WARNING on linus/master v6.9-rc1 next-20240327]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Hannes-Reinecke/block-track-per-node-I-O-latency/20240326-234521
base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link: https://lore.kernel.org/r/20240326153529.75989-2-hare%40kernel.org
patch subject: [PATCH 1/2] block: track per-node I/O latency
config: arm-randconfig-001-20240327 (https://download.01.org/0day-ci/archive/20240328/202403280412.Ojp0tGKt-lkp@intel.com/config)
compiler: clang version 19.0.0git (https://github.com/llvm/llvm-project 23de3862dce582ce91c1aa914467d982cb1a73b4)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240328/202403280412.Ojp0tGKt-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202403280412.Ojp0tGKt-lkp@intel.com/
All warnings (new ones prefixed by >>):
In file included from drivers/scsi/aic7xxx/aic79xx_pci.c:44:
In file included from drivers/scsi/aic7xxx/aic79xx_osm.h:46:
In file included from include/linux/blkdev.h:9:
In file included from include/linux/blk_types.h:10:
In file included from include/linux/bvec.h:10:
In file included from include/linux/highmem.h:8:
In file included from include/linux/cacheflush.h:5:
In file included from arch/arm/include/asm/cacheflush.h:10:
In file included from include/linux/mm.h:2208:
include/linux/vmstat.h:522:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
522 | return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
| ~~~~~~~~~~~ ^ ~~~
In file included from drivers/scsi/aic7xxx/aic79xx_pci.c:44:
In file included from drivers/scsi/aic7xxx/aic79xx_osm.h:57:
In file included from include/scsi/scsi_cmnd.h:7:
In file included from include/linux/t10-pi.h:6:
include/linux/blk-mq.h:1240:15: error: unknown type name 'in'
1240 | static inline in blk_nodelat_enable(struct request_queue *q) { return 0; }
| ^
>> include/linux/blk-mq.h:1242:5: warning: no previous prototype for function 'blk_nodelat_latency' [-Wmissing-prototypes]
1242 | u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
| ^
include/linux/blk-mq.h:1242:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
1242 | u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
| ^
| static
include/linux/blk-mq.h:1243:15: error: unknown type name 'in'
1243 | static inline in blk_nodelat_init(struct gendisk *disk) { return -ENOTSUPP; }
| ^
2 warnings and 2 errors generated.
--
In file included from drivers/scsi/aic7xxx/aic79xx_core.c:43:
In file included from drivers/scsi/aic7xxx/aic79xx_osm.h:46:
In file included from include/linux/blkdev.h:9:
In file included from include/linux/blk_types.h:10:
In file included from include/linux/bvec.h:10:
In file included from include/linux/highmem.h:8:
In file included from include/linux/cacheflush.h:5:
In file included from arch/arm/include/asm/cacheflush.h:10:
In file included from include/linux/mm.h:2208:
include/linux/vmstat.h:522:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
522 | return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
| ~~~~~~~~~~~ ^ ~~~
In file included from drivers/scsi/aic7xxx/aic79xx_core.c:43:
In file included from drivers/scsi/aic7xxx/aic79xx_osm.h:57:
In file included from include/scsi/scsi_cmnd.h:7:
In file included from include/linux/t10-pi.h:6:
include/linux/blk-mq.h:1240:15: error: unknown type name 'in'
1240 | static inline in blk_nodelat_enable(struct request_queue *q) { return 0; }
| ^
>> include/linux/blk-mq.h:1242:5: warning: no previous prototype for function 'blk_nodelat_latency' [-Wmissing-prototypes]
1242 | u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
| ^
include/linux/blk-mq.h:1242:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
1242 | u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
| ^
| static
include/linux/blk-mq.h:1243:15: error: unknown type name 'in'
1243 | static inline in blk_nodelat_init(struct gendisk *disk) { return -ENOTSUPP; }
| ^
drivers/scsi/aic7xxx/aic79xx_core.c:5694:13: warning: variable 'data_addr' set but not used [-Wunused-but-set-variable]
5694 | uint64_t data_addr;
| ^
3 warnings and 2 errors generated.
--
In file included from drivers/scsi/aic7xxx/aic7xxx_core.c:43:
In file included from drivers/scsi/aic7xxx/aic7xxx_osm.h:63:
In file included from include/linux/blkdev.h:9:
In file included from include/linux/blk_types.h:10:
In file included from include/linux/bvec.h:10:
In file included from include/linux/highmem.h:8:
In file included from include/linux/cacheflush.h:5:
In file included from arch/arm/include/asm/cacheflush.h:10:
In file included from include/linux/mm.h:2208:
include/linux/vmstat.h:522:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
522 | return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
| ~~~~~~~~~~~ ^ ~~~
In file included from drivers/scsi/aic7xxx/aic7xxx_core.c:43:
In file included from drivers/scsi/aic7xxx/aic7xxx_osm.h:74:
In file included from include/scsi/scsi_cmnd.h:7:
In file included from include/linux/t10-pi.h:6:
include/linux/blk-mq.h:1240:15: error: unknown type name 'in'
1240 | static inline in blk_nodelat_enable(struct request_queue *q) { return 0; }
| ^
>> include/linux/blk-mq.h:1242:5: warning: no previous prototype for function 'blk_nodelat_latency' [-Wmissing-prototypes]
1242 | u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
| ^
include/linux/blk-mq.h:1242:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
1242 | u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
| ^
| static
include/linux/blk-mq.h:1243:15: error: unknown type name 'in'
1243 | static inline in blk_nodelat_init(struct gendisk *disk) { return -ENOTSUPP; }
| ^
drivers/scsi/aic7xxx/aic7xxx_core.c:4171:13: warning: variable 'data_addr' set but not used [-Wunused-but-set-variable]
4171 | uint32_t data_addr;
| ^
3 warnings and 2 errors generated.
--
In file included from drivers/scsi/aic7xxx/aic7xxx_osm.c:123:
In file included from drivers/scsi/aic7xxx/aic7xxx_osm.h:63:
In file included from include/linux/blkdev.h:9:
In file included from include/linux/blk_types.h:10:
In file included from include/linux/bvec.h:10:
In file included from include/linux/highmem.h:8:
In file included from include/linux/cacheflush.h:5:
In file included from arch/arm/include/asm/cacheflush.h:10:
In file included from include/linux/mm.h:2208:
include/linux/vmstat.h:522:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
522 | return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
| ~~~~~~~~~~~ ^ ~~~
In file included from drivers/scsi/aic7xxx/aic7xxx_osm.c:123:
In file included from drivers/scsi/aic7xxx/aic7xxx_osm.h:74:
In file included from include/scsi/scsi_cmnd.h:7:
In file included from include/linux/t10-pi.h:6:
include/linux/blk-mq.h:1240:15: error: unknown type name 'in'
1240 | static inline in blk_nodelat_enable(struct request_queue *q) { return 0; }
| ^
>> include/linux/blk-mq.h:1242:5: warning: no previous prototype for function 'blk_nodelat_latency' [-Wmissing-prototypes]
1242 | u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
| ^
include/linux/blk-mq.h:1242:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
1242 | u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
| ^
| static
include/linux/blk-mq.h:1243:15: error: unknown type name 'in'
1243 | static inline in blk_nodelat_init(struct gendisk *disk) { return -ENOTSUPP; }
| ^
drivers/scsi/aic7xxx/aic7xxx_osm.c:1435:24: warning: bitwise operation between different enumeration types ('ahc_feature' and 'ahc_flag') [-Wenum-enum-conversion]
1435 | && (ahc->features & AHC_SCB_BTT) == 0) {
| ~~~~~~~~~~~~~ ^ ~~~~~~~~~~~
3 warnings and 2 errors generated.
--
In file included from drivers/scsi/aic7xxx/aic79xx_osm_pci.c:42:
In file included from drivers/scsi/aic7xxx/aic79xx_osm.h:46:
In file included from include/linux/blkdev.h:9:
In file included from include/linux/blk_types.h:10:
In file included from include/linux/bvec.h:10:
In file included from include/linux/highmem.h:8:
In file included from include/linux/cacheflush.h:5:
In file included from arch/arm/include/asm/cacheflush.h:10:
In file included from include/linux/mm.h:2208:
include/linux/vmstat.h:522:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
522 | return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
| ~~~~~~~~~~~ ^ ~~~
In file included from drivers/scsi/aic7xxx/aic79xx_osm_pci.c:42:
In file included from drivers/scsi/aic7xxx/aic79xx_osm.h:57:
In file included from include/scsi/scsi_cmnd.h:7:
In file included from include/linux/t10-pi.h:6:
include/linux/blk-mq.h:1240:15: error: unknown type name 'in'
1240 | static inline in blk_nodelat_enable(struct request_queue *q) { return 0; }
| ^
>> include/linux/blk-mq.h:1242:5: warning: no previous prototype for function 'blk_nodelat_latency' [-Wmissing-prototypes]
1242 | u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
| ^
include/linux/blk-mq.h:1242:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
1242 | u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
| ^
| static
include/linux/blk-mq.h:1243:15: error: unknown type name 'in'
1243 | static inline in blk_nodelat_init(struct gendisk *disk) { return -ENOTSUPP; }
| ^
drivers/scsi/aic7xxx/aic79xx_osm_pci.c:177:25: warning: shift count >= width of type [-Wshift-count-overflow]
177 | dma_set_mask(dev, DMA_BIT_MASK(64)) == 0)
| ^~~~~~~~~~~~~~~~
include/linux/dma-mapping.h:77:54: note: expanded from macro 'DMA_BIT_MASK'
77 | #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1))
| ^ ~~~
3 warnings and 2 errors generated.
vim +/blk_nodelat_latency +1242 include/linux/blk-mq.h
1233
1234 #ifdef CONFIG_BLK_NODE_LATENCY
1235 int blk_nodelat_enable(struct request_queue *q);
1236 void blk_nodelat_disable(struct request_queue *q);
1237 u64 blk_nodelat_latency(struct request_queue *q, int node);
1238 int blk_nodelat_init(struct gendisk *disk);
1239 #else
> 1240 static inline in blk_nodelat_enable(struct request_queue *q) { return 0; }
1241 static inline void blk_nodelat_disable(struct request_queue *q) {}
> 1242 u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCHv2 0/2] block,nvme: latency-based I/O scheduler
@ 2024-04-03 14:17 Hannes Reinecke
2024-04-03 14:17 ` [PATCH 1/2] block: track per-node I/O latency Hannes Reinecke
` (6 more replies)
0 siblings, 7 replies; 25+ messages in thread
From: Hannes Reinecke @ 2024-04-03 14:17 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Keith Busch, Sagi Grimberg, Jens Axboe, linux-nvme, linux-block,
Hannes Reinecke
Hi all,
there had been several attempts to implement a latency-based I/O
scheduler for native nvme multipath, all of which had its issues.
So time to start afresh, this time using the QoS framework
already present in the block layer.
It consists of two parts:
- a new 'blk-nlatency' QoS module, which is just a simple per-node
latency tracker
- a 'latency' nvme I/O policy
Using the 'tiobench' fio script with 512 byte blocksize I'm getting
the following latencies (in usecs) as a baseline:
- seq write: avg 186 stddev 331
- rand write: avg 4598 stddev 7903
- seq read: avg 149 stddev 65
- rand read: avg 150 stddev 68
Enabling the 'latency' iopolicy:
- seq write: avg 178 stddev 113
- rand write: avg 3427 stddev 6703
- seq read: avg 140 stddev 59
- rand read: avg 141 stddev 58
Setting the 'decay' parameter to 10:
- seq write: avg 182 stddev 65
- rand write: avg 2619 stddev 5894
- seq read: avg 142 stddev 57
- rand read: avg 140 stddev 57
That's on a 32G FC testbed running against a brd target,
fio running with 48 threads. So promises are met: latency
goes down, and we're even able to control the standard
deviation via the 'decay' parameter.
As usual, comments and reviews are welcome.
Changes to the original version:
- split the rqos debugfs entries
- Modify commit message to indicate latency
- rename to blk-nlatency
Hannes Reinecke (2):
block: track per-node I/O latency
nvme: add 'latency' iopolicy
block/Kconfig | 6 +
block/Makefile | 1 +
block/blk-mq-debugfs.c | 2 +
block/blk-nlatency.c | 388 ++++++++++++++++++++++++++++++++++
block/blk-rq-qos.h | 6 +
drivers/nvme/host/multipath.c | 57 ++++-
drivers/nvme/host/nvme.h | 1 +
include/linux/blk-mq.h | 11 +
8 files changed, 465 insertions(+), 7 deletions(-)
create mode 100644 block/blk-nlatency.c
--
2.35.3
^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH 1/2] block: track per-node I/O latency
2024-04-03 14:17 [PATCHv2 0/2] block,nvme: latency-based I/O scheduler Hannes Reinecke
@ 2024-04-03 14:17 ` Hannes Reinecke
2024-04-04 2:22 ` kernel test robot
` (2 more replies)
2024-04-03 14:17 ` [PATCH 2/2] nvme: add 'latency' iopolicy Hannes Reinecke
` (5 subsequent siblings)
6 siblings, 3 replies; 25+ messages in thread
From: Hannes Reinecke @ 2024-04-03 14:17 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Keith Busch, Sagi Grimberg, Jens Axboe, linux-nvme, linux-block,
Hannes Reinecke
Add a new option 'BLK_NODE_LATENCY' to track per-node I/O latency.
This can be used by I/O schedulers to determine the 'best' queue
to send I/O to.
Signed-off-by: Hannes Reinecke <hare@kernel.org>
---
block/Kconfig | 6 +
block/Makefile | 1 +
block/blk-mq-debugfs.c | 2 +
block/blk-nlatency.c | 388 +++++++++++++++++++++++++++++++++++++++++
block/blk-rq-qos.h | 6 +
include/linux/blk-mq.h | 11 ++
6 files changed, 414 insertions(+)
create mode 100644 block/blk-nlatency.c
diff --git a/block/Kconfig b/block/Kconfig
index 1de4682d48cc..f8cef096a876 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -186,6 +186,12 @@ config BLK_CGROUP_IOPRIO
scheduler and block devices process requests. Only some I/O schedulers
and some block devices support I/O priorities.
+config BLK_NODE_LATENCY
+ bool "Track per-node I/O latency"
+ help
+ Enable per-node I/O latency tracking. This can be used by I/O schedulers
+ to determine the node with the least latency.
+
config BLK_DEBUG_FS
bool "Block layer debugging information in debugfs"
default y
diff --git a/block/Makefile b/block/Makefile
index 46ada9dc8bbf..9d2e71a3e36f 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -21,6 +21,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
obj-$(CONFIG_BLK_CGROUP_IOPRIO) += blk-ioprio.o
obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o
obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o
+obj-$(CONFIG_BLK_NODE_LATENCY) += blk-nlatency.o
obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o
obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 94668e72ab09..cb38228b95d8 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -762,6 +762,8 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id)
return "latency";
case RQ_QOS_COST:
return "cost";
+ case RQ_QOS_NLAT:
+ return "node-latency";
}
return "unknown";
}
diff --git a/block/blk-nlatency.c b/block/blk-nlatency.c
new file mode 100644
index 000000000000..037f5c64bbbf
--- /dev/null
+++ b/block/blk-nlatency.c
@@ -0,0 +1,388 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Per-node request latency tracking.
+ *
+ * Copyright (C) 2023 Hannes Reinecke
+ *
+ * A simple per-node latency tracker for use by I/O scheduler.
+ * Latencies are measures over 'win_usec' microseconds and stored per node.
+ * If the number of measurements falls below 'lowat' the measurement is
+ * assumed to be unreliable and will become 'stale'.
+ * These 'stale' latencies can be 'decayed', where during each measurement
+ * interval the 'stale' latency value is decreased by 'decay' percent.
+ * Once the 'stale' latency reaches zero it will be updated by the
+ * measured latency.
+ */
+#include <linux/kernel.h>
+#include <linux/blk_types.h>
+#include <linux/slab.h>
+
+#include "blk-stat.h"
+#include "blk-rq-qos.h"
+#include "blk.h"
+
+#define NLAT_DEFAULT_LOWAT 2
+#define NLAT_DEFAULT_DECAY 50
+
+struct rq_nlat {
+ struct rq_qos rqos;
+
+ u64 win_usec; /* latency measurement window in microseconds */
+ unsigned int lowat; /* Low Watermark below which latency measurement is deemed unreliable */
+ unsigned int decay; /* Percentage for 'decaying' latencies */
+ bool enabled;
+
+ struct blk_stat_callback *cb;
+
+ unsigned int num;
+ u64 *latency;
+ unsigned int *samples;
+};
+
+static inline struct rq_nlat *RQNLAT(struct rq_qos *rqos)
+{
+ return container_of(rqos, struct rq_nlat, rqos);
+}
+
+static u64 nlat_default_latency_usec(struct request_queue *q)
+{
+ /*
+ * We default to 2msec for non-rotational storage, and 75msec
+ * for rotational storage.
+ */
+ if (blk_queue_nonrot(q))
+ return 2000ULL;
+ else
+ return 75000ULL;
+}
+
+static void nlat_timer_fn(struct blk_stat_callback *cb)
+{
+ struct rq_nlat *nlat = cb->data;
+ int n;
+
+ for (n = 0; n < cb->buckets; n++) {
+ if (cb->stat[n].nr_samples < nlat->lowat) {
+ /*
+ * 'decay' the latency by the specified
+ * percentage to ensure the queues are
+ * being tested to balance out temporary
+ * latency spikes.
+ */
+ nlat->latency[n] =
+ div64_u64(nlat->latency[n] * nlat->decay, 100);
+ } else
+ nlat->latency[n] = cb->stat[n].mean;
+ nlat->samples[n] = cb->stat[n].nr_samples;
+ }
+ if (nlat->enabled)
+ blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
+}
+
+static int nlat_bucket_node(const struct request *rq)
+{
+ if (!rq->mq_ctx)
+ return -1;
+ return cpu_to_node(blk_mq_rq_cpu((struct request *)rq));
+}
+
+static void nlat_exit(struct rq_qos *rqos)
+{
+ struct rq_nlat *nlat = RQNLAT(rqos);
+
+ blk_stat_remove_callback(nlat->rqos.disk->queue, nlat->cb);
+ blk_stat_free_callback(nlat->cb);
+ kfree(nlat->samples);
+ kfree(nlat->latency);
+ kfree(nlat);
+}
+
+#ifdef CONFIG_BLK_DEBUG_FS
+static int nlat_win_usec_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+
+ seq_printf(m, "%llu\n", nlat->win_usec);
+ return 0;
+}
+
+static ssize_t nlat_win_usec_write(void *data, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+ char val[16] = { };
+ u64 usec;
+ int err;
+
+ if (blk_queue_dying(nlat->rqos.disk->queue))
+ return -ENOENT;
+
+ if (count >= sizeof(val))
+ return -EINVAL;
+
+ if (copy_from_user(val, buf, count))
+ return -EFAULT;
+
+ err = kstrtoull(val, 10, &usec);
+ if (err)
+ return err;
+ blk_stat_deactivate(nlat->cb);
+ nlat->win_usec = usec;
+ blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
+
+ return count;
+}
+
+static int nlat_lowat_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+
+ seq_printf(m, "%u\n", nlat->lowat);
+ return 0;
+}
+
+static ssize_t nlat_lowat_write(void *data, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+ char val[16] = { };
+ unsigned int lowat;
+ int err;
+
+ if (blk_queue_dying(nlat->rqos.disk->queue))
+ return -ENOENT;
+
+ if (count >= sizeof(val))
+ return -EINVAL;
+
+ if (copy_from_user(val, buf, count))
+ return -EFAULT;
+
+ err = kstrtouint(val, 10, &lowat);
+ if (err)
+ return err;
+ blk_stat_deactivate(nlat->cb);
+ nlat->lowat = lowat;
+ blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
+
+ return count;
+}
+
+static int nlat_decay_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+
+ seq_printf(m, "%u\n", nlat->decay);
+ return 0;
+}
+
+static ssize_t nlat_decay_write(void *data, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+ char val[16] = { };
+ unsigned int decay;
+ int err;
+
+ if (blk_queue_dying(nlat->rqos.disk->queue))
+ return -ENOENT;
+
+ if (count >= sizeof(val))
+ return -EINVAL;
+
+ if (copy_from_user(val, buf, count))
+ return -EFAULT;
+
+ err = kstrtouint(val, 10, &decay);
+ if (err)
+ return err;
+ if (decay > 100)
+ return -EINVAL;
+ blk_stat_deactivate(nlat->cb);
+ nlat->decay = decay;
+ blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
+
+ return count;
+}
+
+static int nlat_enabled_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+
+ seq_printf(m, "%d\n", nlat->enabled);
+ return 0;
+}
+
+static int nlat_id_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+
+ seq_printf(m, "%u\n", rqos->id);
+ return 0;
+}
+
+static int nlat_latency_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+ int n;
+
+ if (!nlat->enabled)
+ return 0;
+
+ for (n = 0; n < nlat->num; n++) {
+ if (n > 0)
+ seq_puts(m, " ");
+ seq_printf(m, "%llu", nlat->latency[n]);
+ }
+ seq_puts(m, "\n");
+ return 0;
+}
+
+static int nlat_samples_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+ int n;
+
+ if (!nlat->enabled)
+ return 0;
+
+ for (n = 0; n < nlat->num; n++) {
+ if (n > 0)
+ seq_puts(m, " ");
+ seq_printf(m, "%u", nlat->samples[n]);
+ }
+ seq_puts(m, "\n");
+ return 0;
+}
+
+static const struct blk_mq_debugfs_attr nlat_debugfs_attrs[] = {
+ {"win_usec", 0600, nlat_win_usec_show, nlat_win_usec_write},
+ {"lowat", 0600, nlat_lowat_show, nlat_lowat_write},
+ {"decay", 0600, nlat_decay_show, nlat_decay_write},
+ {"enabled", 0400, nlat_enabled_show},
+ {"id", 0400, nlat_id_show},
+ {"latency", 0400, nlat_latency_show},
+ {"samples", 0400, nlat_samples_show},
+ {},
+};
+#endif
+
+static const struct rq_qos_ops nlat_rqos_ops = {
+ .exit = nlat_exit,
+#ifdef CONFIG_BLK_DEBUG_FS
+ .debugfs_attrs = nlat_debugfs_attrs,
+#endif
+};
+
+u64 blk_nlat_latency(struct gendisk *disk, int node)
+{
+ struct rq_qos *rqos;
+ struct rq_nlat *nlat;
+
+ rqos = nlat_rq_qos(disk->queue);
+ if (!rqos)
+ return 0;
+ nlat = RQNLAT(rqos);
+ if (node > nlat->num)
+ return 0;
+
+ return div64_u64(nlat->latency[node], 1000);
+}
+EXPORT_SYMBOL_GPL(blk_nlat_latency);
+
+int blk_nlat_enable(struct gendisk *disk)
+{
+ struct rq_qos *rqos;
+ struct rq_nlat *nlat;
+
+ /* Latency tracking not enabled? */
+ rqos = nlat_rq_qos(disk->queue);
+ if (!rqos)
+ return -EINVAL;
+ nlat = RQNLAT(rqos);
+ if (nlat->enabled)
+ return 0;
+
+ /* Queue not registered? Maybe shutting down... */
+ if (!blk_queue_registered(disk->queue))
+ return -EAGAIN;
+
+ nlat->enabled = true;
+ memset(nlat->latency, 0, sizeof(u64) * nlat->num);
+ memset(nlat->samples, 0, sizeof(unsigned int) * nlat->num);
+ blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(blk_nlat_enable);
+
+void blk_nlat_disable(struct gendisk *disk)
+{
+ struct rq_qos *rqos = nlat_rq_qos(disk->queue);
+ struct rq_nlat *nlat;
+ if (!rqos)
+ return;
+ nlat = RQNLAT(rqos);
+ if (nlat->enabled) {
+ blk_stat_deactivate(nlat->cb);
+ nlat->enabled = false;
+ }
+}
+EXPORT_SYMBOL_GPL(blk_nlat_disable);
+
+int blk_nlat_init(struct gendisk *disk)
+{
+ struct rq_nlat *nlat;
+ int ret = -ENOMEM;
+
+ nlat = kzalloc(sizeof(*nlat), GFP_KERNEL);
+ if (!nlat)
+ return -ENOMEM;
+
+ nlat->num = num_possible_nodes();
+ nlat->lowat = NLAT_DEFAULT_LOWAT;
+ nlat->decay = NLAT_DEFAULT_DECAY;
+ nlat->win_usec = nlat_default_latency_usec(disk->queue);
+
+ nlat->latency = kzalloc(sizeof(u64) * nlat->num, GFP_KERNEL);
+ if (!nlat->latency)
+ goto err_free;
+ nlat->samples = kzalloc(sizeof(unsigned int) * nlat->num, GFP_KERNEL);
+ if (!nlat->samples)
+ goto err_free;
+ nlat->cb = blk_stat_alloc_callback(nlat_timer_fn, nlat_bucket_node,
+ nlat->num, nlat);
+ if (!nlat->cb)
+ goto err_free;
+
+ /*
+ * Assign rwb and add the stats callback.
+ */
+ mutex_lock(&disk->queue->rq_qos_mutex);
+ ret = rq_qos_add(&nlat->rqos, disk, RQ_QOS_NLAT, &nlat_rqos_ops);
+ mutex_unlock(&disk->queue->rq_qos_mutex);
+ if (ret)
+ goto err_free_cb;
+
+ blk_stat_add_callback(disk->queue, nlat->cb);
+
+ return 0;
+
+err_free_cb:
+ blk_stat_free_callback(nlat->cb);
+err_free:
+ kfree(nlat->samples);
+ kfree(nlat->latency);
+ kfree(nlat);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blk_nlat_init);
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 37245c97ee61..2fc11ced0c00 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -17,6 +17,7 @@ enum rq_qos_id {
RQ_QOS_WBT,
RQ_QOS_LATENCY,
RQ_QOS_COST,
+ RQ_QOS_NLAT,
};
struct rq_wait {
@@ -79,6 +80,11 @@ static inline struct rq_qos *iolat_rq_qos(struct request_queue *q)
return rq_qos_id(q, RQ_QOS_LATENCY);
}
+static inline struct rq_qos *nlat_rq_qos(struct request_queue *q)
+{
+ return rq_qos_id(q, RQ_QOS_NLAT);
+}
+
static inline void rq_wait_init(struct rq_wait *rq_wait)
{
atomic_set(&rq_wait->inflight, 0);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 390d35fa0032..4d88bec43316 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -1229,4 +1229,15 @@ static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
}
#endif /* CONFIG_BLK_DEV_ZONED */
+#ifdef CONFIG_BLK_NODE_LATENCY
+int blk_nlat_enable(struct gendisk *disk);
+void blk_nlat_disable(struct gendisk *disk);
+u64 blk_nlat_latency(struct gendisk *disk, int node);
+int blk_nlat_init(struct gendisk *disk);
+#else
+static inline int blk_nlat_enable(struct gendisk *disk) { return 0; }
+static inline void blk_nlat_disable(struct gendisk *disk) {}
+u64 blk_nlat_latency(struct gendisk *disk, int node) { return 0; }
+static inline in blk_nlat_init(struct gendisk *disk) { return -ENOTSUPP; }
+#endif
#endif /* BLK_MQ_H */
--
2.35.3
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH 2/2] nvme: add 'latency' iopolicy
2024-04-03 14:17 [PATCHv2 0/2] block,nvme: latency-based I/O scheduler Hannes Reinecke
2024-04-03 14:17 ` [PATCH 1/2] block: track per-node I/O latency Hannes Reinecke
@ 2024-04-03 14:17 ` Hannes Reinecke
2024-04-04 21:14 ` [PATCHv2 0/2] block,nvme: latency-based I/O scheduler Keith Busch
` (4 subsequent siblings)
6 siblings, 0 replies; 25+ messages in thread
From: Hannes Reinecke @ 2024-04-03 14:17 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Keith Busch, Sagi Grimberg, Jens Axboe, linux-nvme, linux-block,
Hannes Reinecke
Add a latency-based I/O policy for multipathing. It uses the blk-nodelat
latency tracker to provide latencies for each node, and schedules
I/O on the path with the least latency for the submitting node.
Signed-off-by: Hannes Reinecke <hare@kernel.org>
---
drivers/nvme/host/multipath.c | 57 ++++++++++++++++++++++++++++++-----
drivers/nvme/host/nvme.h | 1 +
2 files changed, 51 insertions(+), 7 deletions(-)
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 5397fb428b24..18e7fe45c2c1 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -17,6 +17,7 @@ MODULE_PARM_DESC(multipath,
static const char *nvme_iopolicy_names[] = {
[NVME_IOPOLICY_NUMA] = "numa",
[NVME_IOPOLICY_RR] = "round-robin",
+ [NVME_IOPOLICY_LAT] = "latency",
};
static int iopolicy = NVME_IOPOLICY_NUMA;
@@ -29,6 +30,10 @@ static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
iopolicy = NVME_IOPOLICY_NUMA;
else if (!strncmp(val, "round-robin", 11))
iopolicy = NVME_IOPOLICY_RR;
+#ifdef CONFIG_BLK_NODE_LATENCY
+ else if (!strncmp(val, "latency", 7))
+ iopolicy = NVME_IOPOLICY_LAT;
+#endif
else
return -EINVAL;
@@ -40,6 +45,28 @@ static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
}
+static int nvme_activate_iopolicy(struct nvme_subsystem *subsys, int iopolicy)
+{
+ struct nvme_ns_head *h;
+ struct nvme_ns *ns;
+ bool enable = iopolicy == NVME_IOPOLICY_LAT;
+ int ret = 0;
+
+ mutex_lock(&subsys->lock);
+ list_for_each_entry(h, &subsys->nsheads, entry) {
+ list_for_each_entry_rcu(ns, &h->list, siblings) {
+ if (enable) {
+ ret = blk_nlat_enable(ns->disk);
+ if (ret)
+ break;
+ } else
+ blk_nlat_disable(ns->disk);
+ }
+ }
+ mutex_unlock(&subsys->lock);
+ return ret;
+}
+
module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
&iopolicy, 0644);
MODULE_PARM_DESC(iopolicy,
@@ -242,13 +269,16 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
{
int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
struct nvme_ns *found = NULL, *fallback = NULL, *ns;
+ int iopolicy = READ_ONCE(head->subsys->iopolicy);
list_for_each_entry_rcu(ns, &head->list, siblings) {
if (nvme_path_is_disabled(ns))
continue;
- if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
+ if (iopolicy == NVME_IOPOLICY_NUMA)
distance = node_distance(node, ns->ctrl->numa_node);
+ else if (iopolicy == NVME_IOPOLICY_LAT)
+ distance = blk_nlat_latency(ns->disk, node);
else
distance = LOCAL_DISTANCE;
@@ -339,15 +369,17 @@ static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
int node = numa_node_id();
+ int iopolicy = READ_ONCE(head->subsys->iopolicy);
struct nvme_ns *ns;
ns = srcu_dereference(head->current_path[node], &head->srcu);
if (unlikely(!ns))
return __nvme_find_path(head, node);
- if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
+ if (iopolicy == NVME_IOPOLICY_RR)
return nvme_round_robin_path(head, node, ns);
- if (unlikely(!nvme_path_is_optimized(ns)))
+ if (iopolicy == NVME_IOPOLICY_LAT ||
+ unlikely(!nvme_path_is_optimized(ns)))
return __nvme_find_path(head, node);
return ns;
}
@@ -803,15 +835,18 @@ static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
{
struct nvme_subsystem *subsys =
container_of(dev, struct nvme_subsystem, dev);
- int i;
+ int i, ret;
for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
- WRITE_ONCE(subsys->iopolicy, i);
- return count;
+ ret = nvme_activate_iopolicy(subsys, i);
+ if (!ret) {
+ WRITE_ONCE(subsys->iopolicy, i);
+ return count;
+ }
+ return ret;
}
}
-
return -EINVAL;
}
SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
@@ -847,6 +882,14 @@ static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
{
+ if (!blk_nlat_init(ns->disk) &&
+ READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_LAT) {
+ int ret = blk_nlat_enable(ns->disk);
+ if (unlikely(ret))
+ pr_warn("%s: Failed to enable latency tracking, error %d\n",
+ ns->disk->disk_name, ret);
+ }
+
if (nvme_ctrl_use_ana(ns->ctrl)) {
struct nvme_ana_group_desc desc = {
.grpid = anagrpid,
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 27397f8404d6..b07afb1aa5bb 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -402,6 +402,7 @@ static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
enum nvme_iopolicy {
NVME_IOPOLICY_NUMA,
NVME_IOPOLICY_RR,
+ NVME_IOPOLICY_LAT,
};
struct nvme_subsystem {
--
2.35.3
^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [PATCH 1/2] block: track per-node I/O latency
2024-04-03 14:17 ` [PATCH 1/2] block: track per-node I/O latency Hannes Reinecke
@ 2024-04-04 2:22 ` kernel test robot
2024-04-04 2:55 ` kernel test robot
2024-04-04 18:47 ` kernel test robot
2 siblings, 0 replies; 25+ messages in thread
From: kernel test robot @ 2024-04-04 2:22 UTC (permalink / raw)
To: Hannes Reinecke, Christoph Hellwig
Cc: oe-kbuild-all, Keith Busch, Sagi Grimberg, Jens Axboe, linux-nvme,
linux-block, Hannes Reinecke
Hi Hannes,
kernel test robot noticed the following build warnings:
[auto build test WARNING on axboe-block/for-next]
[also build test WARNING on linus/master v6.9-rc2 next-20240403]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Hannes-Reinecke/block-track-per-node-I-O-latency/20240403-222254
base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link: https://lore.kernel.org/r/20240403141756.88233-2-hare%40kernel.org
patch subject: [PATCH 1/2] block: track per-node I/O latency
config: openrisc-allnoconfig (https://download.01.org/0day-ci/archive/20240404/202404041051.89LVIrNh-lkp@intel.com/config)
compiler: or1k-linux-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240404/202404041051.89LVIrNh-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202404041051.89LVIrNh-lkp@intel.com/
All warnings (new ones prefixed by >>):
In file included from include/linux/blk-integrity.h:5,
from block/bdev.c:15:
>> include/linux/blk-mq.h:1242:5: warning: no previous prototype for 'blk_nlat_latency' [-Wmissing-prototypes]
1242 | u64 blk_nlat_latency(struct gendisk *disk, int node) { return 0; }
| ^~~~~~~~~~~~~~~~
include/linux/blk-mq.h:1243:15: error: unknown type name 'in'
1243 | static inline in blk_nlat_init(struct gendisk *disk) { return -ENOTSUPP; }
| ^~
vim +/blk_nlat_latency +1242 include/linux/blk-mq.h
1233
1234 #ifdef CONFIG_BLK_NODE_LATENCY
1235 int blk_nlat_enable(struct gendisk *disk);
1236 void blk_nlat_disable(struct gendisk *disk);
1237 u64 blk_nlat_latency(struct gendisk *disk, int node);
1238 int blk_nlat_init(struct gendisk *disk);
1239 #else
1240 static inline int blk_nlat_enable(struct gendisk *disk) { return 0; }
1241 static inline void blk_nlat_disable(struct gendisk *disk) {}
> 1242 u64 blk_nlat_latency(struct gendisk *disk, int node) { return 0; }
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 1/2] block: track per-node I/O latency
2024-04-03 14:17 ` [PATCH 1/2] block: track per-node I/O latency Hannes Reinecke
2024-04-04 2:22 ` kernel test robot
@ 2024-04-04 2:55 ` kernel test robot
2024-04-04 18:47 ` kernel test robot
2 siblings, 0 replies; 25+ messages in thread
From: kernel test robot @ 2024-04-04 2:55 UTC (permalink / raw)
To: Hannes Reinecke, Christoph Hellwig
Cc: llvm, oe-kbuild-all, Keith Busch, Sagi Grimberg, Jens Axboe,
linux-nvme, linux-block, Hannes Reinecke
Hi Hannes,
kernel test robot noticed the following build warnings:
[auto build test WARNING on axboe-block/for-next]
[also build test WARNING on linus/master v6.9-rc2 next-20240403]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Hannes-Reinecke/block-track-per-node-I-O-latency/20240403-222254
base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link: https://lore.kernel.org/r/20240403141756.88233-2-hare%40kernel.org
patch subject: [PATCH 1/2] block: track per-node I/O latency
config: s390-allnoconfig (https://download.01.org/0day-ci/archive/20240404/202404041045.bLSpHDFH-lkp@intel.com/config)
compiler: clang version 19.0.0git (https://github.com/llvm/llvm-project 546dc2245ffc4cccd0b05b58b7a5955e355a3b27)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240404/202404041045.bLSpHDFH-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202404041045.bLSpHDFH-lkp@intel.com/
All warnings (new ones prefixed by >>):
In file included from block/bdev.c:9:
In file included from include/linux/mm.h:2208:
include/linux/vmstat.h:522:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
522 | return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
| ~~~~~~~~~~~ ^ ~~~
In file included from block/bdev.c:15:
In file included from include/linux/blk-integrity.h:5:
In file included from include/linux/blk-mq.h:8:
In file included from include/linux/scatterlist.h:9:
In file included from arch/s390/include/asm/io.h:78:
include/asm-generic/io.h:547:31: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
547 | val = __raw_readb(PCI_IOBASE + addr);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:560:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
560 | val = __le16_to_cpu((__le16 __force)__raw_readw(PCI_IOBASE + addr));
| ~~~~~~~~~~ ^
include/uapi/linux/byteorder/big_endian.h:37:59: note: expanded from macro '__le16_to_cpu'
37 | #define __le16_to_cpu(x) __swab16((__force __u16)(__le16)(x))
| ^
include/uapi/linux/swab.h:102:54: note: expanded from macro '__swab16'
102 | #define __swab16(x) (__u16)__builtin_bswap16((__u16)(x))
| ^
In file included from block/bdev.c:15:
In file included from include/linux/blk-integrity.h:5:
In file included from include/linux/blk-mq.h:8:
In file included from include/linux/scatterlist.h:9:
In file included from arch/s390/include/asm/io.h:78:
include/asm-generic/io.h:573:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
573 | val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + addr));
| ~~~~~~~~~~ ^
include/uapi/linux/byteorder/big_endian.h:35:59: note: expanded from macro '__le32_to_cpu'
35 | #define __le32_to_cpu(x) __swab32((__force __u32)(__le32)(x))
| ^
include/uapi/linux/swab.h:115:54: note: expanded from macro '__swab32'
115 | #define __swab32(x) (__u32)__builtin_bswap32((__u32)(x))
| ^
In file included from block/bdev.c:15:
In file included from include/linux/blk-integrity.h:5:
In file included from include/linux/blk-mq.h:8:
In file included from include/linux/scatterlist.h:9:
In file included from arch/s390/include/asm/io.h:78:
include/asm-generic/io.h:584:33: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
584 | __raw_writeb(value, PCI_IOBASE + addr);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:594:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
594 | __raw_writew((u16 __force)cpu_to_le16(value), PCI_IOBASE + addr);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:604:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
604 | __raw_writel((u32 __force)cpu_to_le32(value), PCI_IOBASE + addr);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:692:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
692 | readsb(PCI_IOBASE + addr, buffer, count);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:700:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
700 | readsw(PCI_IOBASE + addr, buffer, count);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:708:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
708 | readsl(PCI_IOBASE + addr, buffer, count);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:717:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
717 | writesb(PCI_IOBASE + addr, buffer, count);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:726:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
726 | writesw(PCI_IOBASE + addr, buffer, count);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:735:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
735 | writesl(PCI_IOBASE + addr, buffer, count);
| ~~~~~~~~~~ ^
In file included from block/bdev.c:15:
In file included from include/linux/blk-integrity.h:5:
>> include/linux/blk-mq.h:1242:5: warning: no previous prototype for function 'blk_nlat_latency' [-Wmissing-prototypes]
1242 | u64 blk_nlat_latency(struct gendisk *disk, int node) { return 0; }
| ^
include/linux/blk-mq.h:1242:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
1242 | u64 blk_nlat_latency(struct gendisk *disk, int node) { return 0; }
| ^
| static
include/linux/blk-mq.h:1243:15: error: unknown type name 'in'
1243 | static inline in blk_nlat_init(struct gendisk *disk) { return -ENOTSUPP; }
| ^
14 warnings and 1 error generated.
vim +/blk_nlat_latency +1242 include/linux/blk-mq.h
1233
1234 #ifdef CONFIG_BLK_NODE_LATENCY
1235 int blk_nlat_enable(struct gendisk *disk);
1236 void blk_nlat_disable(struct gendisk *disk);
1237 u64 blk_nlat_latency(struct gendisk *disk, int node);
1238 int blk_nlat_init(struct gendisk *disk);
1239 #else
1240 static inline int blk_nlat_enable(struct gendisk *disk) { return 0; }
1241 static inline void blk_nlat_disable(struct gendisk *disk) {}
> 1242 u64 blk_nlat_latency(struct gendisk *disk, int node) { return 0; }
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 1/2] block: track per-node I/O latency
2024-04-03 14:17 ` [PATCH 1/2] block: track per-node I/O latency Hannes Reinecke
2024-04-04 2:22 ` kernel test robot
2024-04-04 2:55 ` kernel test robot
@ 2024-04-04 18:47 ` kernel test robot
2 siblings, 0 replies; 25+ messages in thread
From: kernel test robot @ 2024-04-04 18:47 UTC (permalink / raw)
To: Hannes Reinecke, Christoph Hellwig
Cc: oe-kbuild-all, Keith Busch, Sagi Grimberg, Jens Axboe, linux-nvme,
linux-block, Hannes Reinecke
Hi Hannes,
kernel test robot noticed the following build errors:
[auto build test ERROR on axboe-block/for-next]
[also build test ERROR on linus/master v6.9-rc2 next-20240404]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Hannes-Reinecke/block-track-per-node-I-O-latency/20240403-222254
base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link: https://lore.kernel.org/r/20240403141756.88233-2-hare%40kernel.org
patch subject: [PATCH 1/2] block: track per-node I/O latency
config: x86_64-rhel-8.3 (https://download.01.org/0day-ci/archive/20240405/202404050222.vYNG4y3i-lkp@intel.com/config)
compiler: gcc-13 (Ubuntu 13.2.0-4ubuntu3) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240405/202404050222.vYNG4y3i-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202404050222.vYNG4y3i-lkp@intel.com/
All errors (new ones prefixed by >>):
In file included from drivers/block/loop.c:36:
include/linux/blk-mq.h:1242:5: warning: no previous prototype for 'blk_nlat_latency' [-Wmissing-prototypes]
1242 | u64 blk_nlat_latency(struct gendisk *disk, int node) { return 0; }
| ^~~~~~~~~~~~~~~~
>> include/linux/blk-mq.h:1243:15: error: unknown type name 'in'
1243 | static inline in blk_nlat_init(struct gendisk *disk) { return -ENOTSUPP; }
| ^~
vim +/in +1243 include/linux/blk-mq.h
1233
1234 #ifdef CONFIG_BLK_NODE_LATENCY
1235 int blk_nlat_enable(struct gendisk *disk);
1236 void blk_nlat_disable(struct gendisk *disk);
1237 u64 blk_nlat_latency(struct gendisk *disk, int node);
1238 int blk_nlat_init(struct gendisk *disk);
1239 #else
1240 static inline int blk_nlat_enable(struct gendisk *disk) { return 0; }
1241 static inline void blk_nlat_disable(struct gendisk *disk) {}
1242 u64 blk_nlat_latency(struct gendisk *disk, int node) { return 0; }
> 1243 static inline in blk_nlat_init(struct gendisk *disk) { return -ENOTSUPP; }
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCHv2 0/2] block,nvme: latency-based I/O scheduler
2024-04-03 14:17 [PATCHv2 0/2] block,nvme: latency-based I/O scheduler Hannes Reinecke
2024-04-03 14:17 ` [PATCH 1/2] block: track per-node I/O latency Hannes Reinecke
2024-04-03 14:17 ` [PATCH 2/2] nvme: add 'latency' iopolicy Hannes Reinecke
@ 2024-04-04 21:14 ` Keith Busch
2024-04-05 6:21 ` Hannes Reinecke
2024-05-09 20:43 ` [PATCH v3 0/3] " John Meneghini
` (3 subsequent siblings)
6 siblings, 1 reply; 25+ messages in thread
From: Keith Busch @ 2024-04-04 21:14 UTC (permalink / raw)
To: Hannes Reinecke
Cc: Christoph Hellwig, Sagi Grimberg, Jens Axboe, linux-nvme,
linux-block
On Wed, Apr 03, 2024 at 04:17:54PM +0200, Hannes Reinecke wrote:
> Hi all,
>
> there had been several attempts to implement a latency-based I/O
> scheduler for native nvme multipath, all of which had its issues.
>
> So time to start afresh, this time using the QoS framework
> already present in the block layer.
> It consists of two parts:
> - a new 'blk-nlatency' QoS module, which is just a simple per-node
> latency tracker
> - a 'latency' nvme I/O policy
Whatever happened with the io-depth based path selector? That should
naturally align with the lower latency path, and that metric is cheaper
to track.
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCHv2 0/2] block,nvme: latency-based I/O scheduler
2024-04-04 21:14 ` [PATCHv2 0/2] block,nvme: latency-based I/O scheduler Keith Busch
@ 2024-04-05 6:21 ` Hannes Reinecke
2024-04-05 15:03 ` Keith Busch
0 siblings, 1 reply; 25+ messages in thread
From: Hannes Reinecke @ 2024-04-05 6:21 UTC (permalink / raw)
To: Keith Busch, Hannes Reinecke
Cc: Christoph Hellwig, Sagi Grimberg, Jens Axboe, linux-nvme,
linux-block
On 4/4/24 23:14, Keith Busch wrote:
> On Wed, Apr 03, 2024 at 04:17:54PM +0200, Hannes Reinecke wrote:
>> Hi all,
>>
>> there had been several attempts to implement a latency-based I/O
>> scheduler for native nvme multipath, all of which had its issues.
>>
>> So time to start afresh, this time using the QoS framework
>> already present in the block layer.
>> It consists of two parts:
>> - a new 'blk-nlatency' QoS module, which is just a simple per-node
>> latency tracker
>> - a 'latency' nvme I/O policy
>
> Whatever happened with the io-depth based path selector? That should
> naturally align with the lower latency path, and that metric is cheaper
> to track.
Turns out that tracking queue depth (on the NVMe level) always requires
an atomic, and with that a performance impact.
The qos/blk-stat framework is already present, and as the numbers show
actually leads to a performance improvement.
So I'm not quite sure what the argument 'cheaper to track' buys us here...
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCHv2 0/2] block,nvme: latency-based I/O scheduler
2024-04-05 6:21 ` Hannes Reinecke
@ 2024-04-05 15:03 ` Keith Busch
2024-04-05 15:36 ` Hannes Reinecke
0 siblings, 1 reply; 25+ messages in thread
From: Keith Busch @ 2024-04-05 15:03 UTC (permalink / raw)
To: Hannes Reinecke
Cc: Hannes Reinecke, Christoph Hellwig, Sagi Grimberg, Jens Axboe,
linux-nvme, linux-block
On Fri, Apr 05, 2024 at 08:21:14AM +0200, Hannes Reinecke wrote:
> On 4/4/24 23:14, Keith Busch wrote:
> > On Wed, Apr 03, 2024 at 04:17:54PM +0200, Hannes Reinecke wrote:
> > > Hi all,
> > >
> > > there had been several attempts to implement a latency-based I/O
> > > scheduler for native nvme multipath, all of which had its issues.
> > >
> > > So time to start afresh, this time using the QoS framework
> > > already present in the block layer.
> > > It consists of two parts:
> > > - a new 'blk-nlatency' QoS module, which is just a simple per-node
> > > latency tracker
> > > - a 'latency' nvme I/O policy
> > Whatever happened with the io-depth based path selector? That should
> > naturally align with the lower latency path, and that metric is cheaper
> > to track.
>
> Turns out that tracking queue depth (on the NVMe level) always requires
> an atomic, and with that a performance impact.
> The qos/blk-stat framework is already present, and as the numbers show
> actually leads to a performance improvement.
>
> So I'm not quite sure what the argument 'cheaper to track' buys us here...
I was considering the blk_stat framework compared to those atomic
operations. I usually don't enable stats because all the extra
ktime_get_ns() and indirect calls are relatively costly. If you're
enabling stats anyway though, then yeah, I guess I don't really have a
point and your idea here seems pretty reasonable.
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCHv2 0/2] block,nvme: latency-based I/O scheduler
2024-04-05 15:03 ` Keith Busch
@ 2024-04-05 15:36 ` Hannes Reinecke
2024-04-07 19:55 ` Sagi Grimberg
0 siblings, 1 reply; 25+ messages in thread
From: Hannes Reinecke @ 2024-04-05 15:36 UTC (permalink / raw)
To: Keith Busch
Cc: Hannes Reinecke, Christoph Hellwig, Sagi Grimberg, Jens Axboe,
linux-nvme, linux-block
On 4/5/24 17:03, Keith Busch wrote:
> On Fri, Apr 05, 2024 at 08:21:14AM +0200, Hannes Reinecke wrote:
>> On 4/4/24 23:14, Keith Busch wrote:
>>> On Wed, Apr 03, 2024 at 04:17:54PM +0200, Hannes Reinecke wrote:
>>>> Hi all,
>>>>
>>>> there had been several attempts to implement a latency-based I/O
>>>> scheduler for native nvme multipath, all of which had its issues.
>>>>
>>>> So time to start afresh, this time using the QoS framework
>>>> already present in the block layer.
>>>> It consists of two parts:
>>>> - a new 'blk-nlatency' QoS module, which is just a simple per-node
>>>> latency tracker
>>>> - a 'latency' nvme I/O policy
>>> Whatever happened with the io-depth based path selector? That should
>>> naturally align with the lower latency path, and that metric is cheaper
>>> to track.
>>
>> Turns out that tracking queue depth (on the NVMe level) always requires
>> an atomic, and with that a performance impact.
>> The qos/blk-stat framework is already present, and as the numbers show
>> actually leads to a performance improvement.
>>
>> So I'm not quite sure what the argument 'cheaper to track' buys us here...
>
> I was considering the blk_stat framework compared to those atomic
> operations. I usually don't enable stats because all the extra
> ktime_get_ns() and indirect calls are relatively costly. If you're
> enabling stats anyway though, then yeah, I guess I don't really have a
> point and your idea here seems pretty reasonable.
Pretty much. Of course you need stats to be enabled.
And problem with the queue depth is that it's actually quite costly
to compute; the while sbitmap thingie is precisely there to _avoid_
having to track the queue depth.
I can't really see how one could track the queue depth efficiently;
the beauty of the blk_stat framework is that it's running async, and
only calculated after I/O is completed.
We could do a 'mock' queue depth by calculating the difference between
submitted and completed I/O, but even then you'd have to inject a call
in the hot path to track the number of submissions.
In the end, the latency tracker did what I wanted to achieve (namely
balance out uneven paths), _and_ got faster than round-robin, so I
didn't care about queue depth tracking.
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), GF: Ivo Totev, Andrew McDonald,
Werner Knoblich
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCHv2 0/2] block,nvme: latency-based I/O scheduler
2024-04-05 15:36 ` Hannes Reinecke
@ 2024-04-07 19:55 ` Sagi Grimberg
0 siblings, 0 replies; 25+ messages in thread
From: Sagi Grimberg @ 2024-04-07 19:55 UTC (permalink / raw)
To: Hannes Reinecke, Keith Busch
Cc: Hannes Reinecke, Christoph Hellwig, Jens Axboe, linux-nvme,
linux-block
On 05/04/2024 18:36, Hannes Reinecke wrote:
> On 4/5/24 17:03, Keith Busch wrote:
>> On Fri, Apr 05, 2024 at 08:21:14AM +0200, Hannes Reinecke wrote:
>>> On 4/4/24 23:14, Keith Busch wrote:
>>>> On Wed, Apr 03, 2024 at 04:17:54PM +0200, Hannes Reinecke wrote:
>>>>> Hi all,
>>>>>
>>>>> there had been several attempts to implement a latency-based I/O
>>>>> scheduler for native nvme multipath, all of which had its issues.
>>>>>
>>>>> So time to start afresh, this time using the QoS framework
>>>>> already present in the block layer.
>>>>> It consists of two parts:
>>>>> - a new 'blk-nlatency' QoS module, which is just a simple per-node
>>>>> latency tracker
>>>>> - a 'latency' nvme I/O policy
>>>> Whatever happened with the io-depth based path selector? That should
>>>> naturally align with the lower latency path, and that metric is
>>>> cheaper
>>>> to track.
>>>
>>> Turns out that tracking queue depth (on the NVMe level) always requires
>>> an atomic, and with that a performance impact.
>>> The qos/blk-stat framework is already present, and as the numbers show
>>> actually leads to a performance improvement.
>>>
>>> So I'm not quite sure what the argument 'cheaper to track' buys us
>>> here...
>>
>> I was considering the blk_stat framework compared to those atomic
>> operations. I usually don't enable stats because all the extra
>> ktime_get_ns() and indirect calls are relatively costly. If you're
>> enabling stats anyway though, then yeah, I guess I don't really have a
>> point and your idea here seems pretty reasonable.
>
> Pretty much. Of course you need stats to be enabled.
> And problem with the queue depth is that it's actually quite costly
> to compute; the while sbitmap thingie is precisely there to _avoid_
> having to track the queue depth.
> I can't really see how one could track the queue depth efficiently;
> the beauty of the blk_stat framework is that it's running async, and
> only calculated after I/O is completed.
> We could do a 'mock' queue depth by calculating the difference between
> submitted and completed I/O, but even then you'd have to inject a call
> in the hot path to track the number of submissions.
>
> In the end, the latency tracker did what I wanted to achieve (namely
> balance out uneven paths), _and_ got faster than round-robin, so I
> didn't care about queue depth tracking.
Hey Hannes,
I think its a fair claim that a latency tracker is a valid proxy for
io-depth tracker.
I think that we need Ewan to validate if this solves the original issue
he was trying
to solve with his io-depth mpath selector. If so, I don't see any major
issues with this
proposal.
^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH v3 0/3] block,nvme: latency-based I/O scheduler
2024-04-03 14:17 [PATCHv2 0/2] block,nvme: latency-based I/O scheduler Hannes Reinecke
` (2 preceding siblings ...)
2024-04-04 21:14 ` [PATCHv2 0/2] block,nvme: latency-based I/O scheduler Keith Busch
@ 2024-05-09 20:43 ` John Meneghini
2024-05-10 9:34 ` Niklas Cassel
2024-05-09 20:43 ` [PATCH v3 1/3] block: track per-node I/O latency John Meneghini
` (2 subsequent siblings)
6 siblings, 1 reply; 25+ messages in thread
From: John Meneghini @ 2024-05-09 20:43 UTC (permalink / raw)
To: tj, josef, axboe, kbusch, hch, sagi, emilne, hare
Cc: linux-block, cgroups, linux-nvme, linux-kernel, jmeneghi, jrani,
randyj, aviv.coro
I'm re-issuing Hannes's latency patches in preparation for LSFMM
Changes since V2:
I've done quite a bit of work cleaning up these patches. There were a
number of checkpatch.pl problems as well as some compile time errors
when config BLK_NODE_LATENCY was turned off. After the clean up I
rebased these patches onto Ewan's "nvme: queue-depth multipath iopolicy"
patches. This allowed me to test both iopolicy changes together.
All of my test results, together with the scripts I used to generate these
graphs, are available at:
https://github.com/johnmeneghini/iopolicy
Please use the scripts in this repository to do your own testing.
Changes since V1:
Hi all,
there had been several attempts to implement a latency-based I/O
scheduler for native nvme multipath, all of which had its issues.
So time to start afresh, this time using the QoS framework
already present in the block layer.
It consists of two parts:
- a new 'blk-nlatency' QoS module, which is just a simple per-node
latency tracker
- a 'latency' nvme I/O policy
Using the 'tiobench' fio script with 512 byte blocksize I'm getting
the following latencies (in usecs) as a baseline:
- seq write: avg 186 stddev 331
- rand write: avg 4598 stddev 7903
- seq read: avg 149 stddev 65
- rand read: avg 150 stddev 68
Enabling the 'latency' iopolicy:
- seq write: avg 178 stddev 113
- rand write: avg 3427 stddev 6703
- seq read: avg 140 stddev 59
- rand read: avg 141 stddev 58
Setting the 'decay' parameter to 10:
- seq write: avg 182 stddev 65
- rand write: avg 2619 stddev 5894
- seq read: avg 142 stddev 57
- rand read: avg 140 stddev 57
That's on a 32G FC testbed running against a brd target,
fio running with 48 threads. So promises are met: latency
goes down, and we're even able to control the standard
deviation via the 'decay' parameter.
As usual, comments and reviews are welcome.
Changes to the original version:
- split the rqos debugfs entries
- Modify commit message to indicate latency
- rename to blk-nlatency
Hannes Reinecke (2):
block: track per-node I/O latency
nvme: add 'latency' iopolicy
John Meneghini (1):
nvme: multipath: pr_notice when iopolicy changes
MAINTAINERS | 1 +
block/Kconfig | 9 +
block/Makefile | 1 +
block/blk-mq-debugfs.c | 2 +
block/blk-nlatency.c | 389 ++++++++++++++++++++++++++++++++++
block/blk-rq-qos.h | 6 +
drivers/nvme/host/multipath.c | 73 ++++++-
drivers/nvme/host/nvme.h | 1 +
include/linux/blk-mq.h | 11 +
9 files changed, 484 insertions(+), 9 deletions(-)
create mode 100644 block/blk-nlatency.c
--
2.39.3
^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH v3 1/3] block: track per-node I/O latency
2024-04-03 14:17 [PATCHv2 0/2] block,nvme: latency-based I/O scheduler Hannes Reinecke
` (3 preceding siblings ...)
2024-05-09 20:43 ` [PATCH v3 0/3] " John Meneghini
@ 2024-05-09 20:43 ` John Meneghini
2024-05-10 7:11 ` Damien Le Moal
2024-05-09 20:43 ` [PATCH v3 2/3] nvme: add 'latency' iopolicy John Meneghini
2024-05-09 20:43 ` [PATCH v3 3/3] nvme: multipath: pr_notice when iopolicy changes John Meneghini
6 siblings, 1 reply; 25+ messages in thread
From: John Meneghini @ 2024-05-09 20:43 UTC (permalink / raw)
To: tj, josef, axboe, kbusch, hch, sagi, emilne, hare
Cc: linux-block, cgroups, linux-nvme, linux-kernel, jmeneghi, jrani,
randyj, aviv.coro
From: Hannes Reinecke <hare@kernel.org>
Add a new option 'BLK_NODE_LATENCY' to track per-node I/O latency.
This can be used by I/O schedulers to determine the 'best' queue
to send I/O to.
Signed-off-by: Hannes Reinecke <hare@kernel.org>
Cleaned up checkpatch warnings and updated MAINTAINERS.
Signed-off-by: John Meneghini <jmeneghi@redhat.com>
---
MAINTAINERS | 1 +
block/Kconfig | 9 +
block/Makefile | 1 +
block/blk-mq-debugfs.c | 2 +
block/blk-nlatency.c | 389 +++++++++++++++++++++++++++++++++++++++++
block/blk-rq-qos.h | 6 +
include/linux/blk-mq.h | 11 ++
7 files changed, 419 insertions(+)
create mode 100644 block/blk-nlatency.c
diff --git a/MAINTAINERS b/MAINTAINERS
index 7c121493f43d..a4634365c82f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5405,6 +5405,7 @@ F: block/bfq-cgroup.c
F: block/blk-cgroup.c
F: block/blk-iocost.c
F: block/blk-iolatency.c
+F: block/blk-nlatency.c
F: block/blk-throttle.c
F: include/linux/blk-cgroup.h
diff --git a/block/Kconfig b/block/Kconfig
index 1de4682d48cc..641ed39d609c 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -186,6 +186,15 @@ config BLK_CGROUP_IOPRIO
scheduler and block devices process requests. Only some I/O schedulers
and some block devices support I/O priorities.
+config BLK_NODE_LATENCY
+ bool "Track per-node I/O latency"
+ help
+ Enable per-node I/O latency tracking for multipathing. This uses the
+ blk-nodelat latency tracker to provide latencies for each node, and schedules
+ I/O on the path with the least latency for the submitting node. This can be
+ used by I/O schedulers to determine the node with the least latency. Currently
+ only supports nvme over fabrics devices.
+
config BLK_DEBUG_FS
bool "Block layer debugging information in debugfs"
default y
diff --git a/block/Makefile b/block/Makefile
index 46ada9dc8bbf..9d2e71a3e36f 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -21,6 +21,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
obj-$(CONFIG_BLK_CGROUP_IOPRIO) += blk-ioprio.o
obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o
obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o
+obj-$(CONFIG_BLK_NODE_LATENCY) += blk-nlatency.o
obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o
obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 94668e72ab09..cb38228b95d8 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -762,6 +762,8 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id)
return "latency";
case RQ_QOS_COST:
return "cost";
+ case RQ_QOS_NLAT:
+ return "node-latency";
}
return "unknown";
}
diff --git a/block/blk-nlatency.c b/block/blk-nlatency.c
new file mode 100644
index 000000000000..219c3f636d76
--- /dev/null
+++ b/block/blk-nlatency.c
@@ -0,0 +1,389 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Per-node request latency tracking.
+ *
+ * Copyright (C) 2023 Hannes Reinecke
+ *
+ * A simple per-node latency tracker for use by I/O scheduler.
+ * Latencies are measures over 'win_usec' microseconds and stored per node.
+ * If the number of measurements falls below 'lowat' the measurement is
+ * assumed to be unreliable and will become 'stale'.
+ * These 'stale' latencies can be 'decayed', where during each measurement
+ * interval the 'stale' latency value is decreased by 'decay' percent.
+ * Once the 'stale' latency reaches zero it will be updated by the
+ * measured latency.
+ */
+#include <linux/kernel.h>
+#include <linux/blk_types.h>
+#include <linux/slab.h>
+
+#include "blk-stat.h"
+#include "blk-rq-qos.h"
+#include "blk.h"
+
+#define NLAT_DEFAULT_LOWAT 2
+#define NLAT_DEFAULT_DECAY 50
+
+struct rq_nlat {
+ struct rq_qos rqos;
+
+ u64 win_usec; /* latency measurement window in microseconds */
+ unsigned int lowat; /* Low Watermark latency measurement */
+ unsigned int decay; /* Percentage for 'decaying' latencies */
+ bool enabled;
+
+ struct blk_stat_callback *cb;
+
+ unsigned int num;
+ u64 *latency;
+ unsigned int *samples;
+};
+
+static inline struct rq_nlat *RQNLAT(struct rq_qos *rqos)
+{
+ return container_of(rqos, struct rq_nlat, rqos);
+}
+
+static u64 nlat_default_latency_usec(struct request_queue *q)
+{
+ /*
+ * We default to 2msec for non-rotational storage, and 75msec
+ * for rotational storage.
+ */
+ if (blk_queue_nonrot(q))
+ return 2000ULL;
+ else
+ return 75000ULL;
+}
+
+static void nlat_timer_fn(struct blk_stat_callback *cb)
+{
+ struct rq_nlat *nlat = cb->data;
+ int n;
+
+ for (n = 0; n < cb->buckets; n++) {
+ if (cb->stat[n].nr_samples < nlat->lowat) {
+ /*
+ * 'decay' the latency by the specified
+ * percentage to ensure the queues are
+ * being tested to balance out temporary
+ * latency spikes.
+ */
+ nlat->latency[n] =
+ div64_u64(nlat->latency[n] * nlat->decay, 100);
+ } else
+ nlat->latency[n] = cb->stat[n].mean;
+ nlat->samples[n] = cb->stat[n].nr_samples;
+ }
+ if (nlat->enabled)
+ blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
+}
+
+static int nlat_bucket_node(const struct request *rq)
+{
+ if (!rq->mq_ctx)
+ return -1;
+ return cpu_to_node(blk_mq_rq_cpu((struct request *)rq));
+}
+
+static void nlat_exit(struct rq_qos *rqos)
+{
+ struct rq_nlat *nlat = RQNLAT(rqos);
+
+ blk_stat_remove_callback(nlat->rqos.disk->queue, nlat->cb);
+ blk_stat_free_callback(nlat->cb);
+ kfree(nlat->samples);
+ kfree(nlat->latency);
+ kfree(nlat);
+}
+
+#ifdef CONFIG_BLK_DEBUG_FS
+static int nlat_win_usec_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+
+ seq_printf(m, "%llu\n", nlat->win_usec);
+ return 0;
+}
+
+static ssize_t nlat_win_usec_write(void *data, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+ char val[16] = { };
+ u64 usec;
+ int err;
+
+ if (blk_queue_dying(nlat->rqos.disk->queue))
+ return -ENOENT;
+
+ if (count >= sizeof(val))
+ return -EINVAL;
+
+ if (copy_from_user(val, buf, count))
+ return -EFAULT;
+
+ err = kstrtoull(val, 10, &usec);
+ if (err)
+ return err;
+ blk_stat_deactivate(nlat->cb);
+ nlat->win_usec = usec;
+ blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
+
+ return count;
+}
+
+static int nlat_lowat_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+
+ seq_printf(m, "%u\n", nlat->lowat);
+ return 0;
+}
+
+static ssize_t nlat_lowat_write(void *data, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+ char val[16] = { };
+ unsigned int lowat;
+ int err;
+
+ if (blk_queue_dying(nlat->rqos.disk->queue))
+ return -ENOENT;
+
+ if (count >= sizeof(val))
+ return -EINVAL;
+
+ if (copy_from_user(val, buf, count))
+ return -EFAULT;
+
+ err = kstrtouint(val, 10, &lowat);
+ if (err)
+ return err;
+ blk_stat_deactivate(nlat->cb);
+ nlat->lowat = lowat;
+ blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
+
+ return count;
+}
+
+static int nlat_decay_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+
+ seq_printf(m, "%u\n", nlat->decay);
+ return 0;
+}
+
+static ssize_t nlat_decay_write(void *data, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+ char val[16] = { };
+ unsigned int decay;
+ int err;
+
+ if (blk_queue_dying(nlat->rqos.disk->queue))
+ return -ENOENT;
+
+ if (count >= sizeof(val))
+ return -EINVAL;
+
+ if (copy_from_user(val, buf, count))
+ return -EFAULT;
+
+ err = kstrtouint(val, 10, &decay);
+ if (err)
+ return err;
+ if (decay > 100)
+ return -EINVAL;
+ blk_stat_deactivate(nlat->cb);
+ nlat->decay = decay;
+ blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
+
+ return count;
+}
+
+static int nlat_enabled_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+
+ seq_printf(m, "%d\n", nlat->enabled);
+ return 0;
+}
+
+static int nlat_id_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+
+ seq_printf(m, "%u\n", rqos->id);
+ return 0;
+}
+
+static int nlat_latency_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+ int n;
+
+ if (!nlat->enabled)
+ return 0;
+
+ for (n = 0; n < nlat->num; n++) {
+ if (n > 0)
+ seq_puts(m, " ");
+ seq_printf(m, "%llu", nlat->latency[n]);
+ }
+ seq_puts(m, "\n");
+ return 0;
+}
+
+static int nlat_samples_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+ int n;
+
+ if (!nlat->enabled)
+ return 0;
+
+ for (n = 0; n < nlat->num; n++) {
+ if (n > 0)
+ seq_puts(m, " ");
+ seq_printf(m, "%u", nlat->samples[n]);
+ }
+ seq_puts(m, "\n");
+ return 0;
+}
+
+static const struct blk_mq_debugfs_attr nlat_debugfs_attrs[] = {
+ {"win_usec", 0600, nlat_win_usec_show, nlat_win_usec_write},
+ {"lowat", 0600, nlat_lowat_show, nlat_lowat_write},
+ {"decay", 0600, nlat_decay_show, nlat_decay_write},
+ {"enabled", 0400, nlat_enabled_show},
+ {"id", 0400, nlat_id_show},
+ {"latency", 0400, nlat_latency_show},
+ {"samples", 0400, nlat_samples_show},
+ {},
+};
+#endif
+
+static const struct rq_qos_ops nlat_rqos_ops = {
+ .exit = nlat_exit,
+#ifdef CONFIG_BLK_DEBUG_FS
+ .debugfs_attrs = nlat_debugfs_attrs,
+#endif
+};
+
+u64 blk_nlat_latency(struct gendisk *disk, int node)
+{
+ struct rq_qos *rqos;
+ struct rq_nlat *nlat;
+
+ rqos = nlat_rq_qos(disk->queue);
+ if (!rqos)
+ return 0;
+ nlat = RQNLAT(rqos);
+ if (node > nlat->num)
+ return 0;
+
+ return div64_u64(nlat->latency[node], 1000);
+}
+EXPORT_SYMBOL_GPL(blk_nlat_latency);
+
+int blk_nlat_enable(struct gendisk *disk)
+{
+ struct rq_qos *rqos;
+ struct rq_nlat *nlat;
+
+ /* Latency tracking not enabled? */
+ rqos = nlat_rq_qos(disk->queue);
+ if (!rqos)
+ return -EINVAL;
+ nlat = RQNLAT(rqos);
+ if (nlat->enabled)
+ return 0;
+
+ /* Queue not registered? Maybe shutting down... */
+ if (!blk_queue_registered(disk->queue))
+ return -EAGAIN;
+
+ nlat->enabled = true;
+ memset(nlat->latency, 0, sizeof(u64) * nlat->num);
+ memset(nlat->samples, 0, sizeof(unsigned int) * nlat->num);
+ blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(blk_nlat_enable);
+
+void blk_nlat_disable(struct gendisk *disk)
+{
+ struct rq_qos *rqos = nlat_rq_qos(disk->queue);
+ struct rq_nlat *nlat;
+
+ if (!rqos)
+ return;
+ nlat = RQNLAT(rqos);
+ if (nlat->enabled) {
+ blk_stat_deactivate(nlat->cb);
+ nlat->enabled = false;
+ }
+}
+EXPORT_SYMBOL_GPL(blk_nlat_disable);
+
+int blk_nlat_init(struct gendisk *disk)
+{
+ struct rq_nlat *nlat;
+ int ret = -ENOMEM;
+
+ nlat = kzalloc(sizeof(*nlat), GFP_KERNEL);
+ if (!nlat)
+ return -ENOMEM;
+
+ nlat->num = num_possible_nodes();
+ nlat->lowat = NLAT_DEFAULT_LOWAT;
+ nlat->decay = NLAT_DEFAULT_DECAY;
+ nlat->win_usec = nlat_default_latency_usec(disk->queue);
+
+ nlat->latency = kcalloc(nlat->num, sizeof(u64), GFP_KERNEL);
+ if (!nlat->latency)
+ goto err_free;
+ nlat->samples = kcalloc(nlat->num, sizeof(unsigned int), GFP_KERNEL);
+ if (!nlat->samples)
+ goto err_free;
+ nlat->cb = blk_stat_alloc_callback(nlat_timer_fn, nlat_bucket_node,
+ nlat->num, nlat);
+ if (!nlat->cb)
+ goto err_free;
+
+ /*
+ * Assign rwb and add the stats callback.
+ */
+ mutex_lock(&disk->queue->rq_qos_mutex);
+ ret = rq_qos_add(&nlat->rqos, disk, RQ_QOS_NLAT, &nlat_rqos_ops);
+ mutex_unlock(&disk->queue->rq_qos_mutex);
+ if (ret)
+ goto err_free_cb;
+
+ blk_stat_add_callback(disk->queue, nlat->cb);
+
+ return 0;
+
+err_free_cb:
+ blk_stat_free_callback(nlat->cb);
+err_free:
+ kfree(nlat->samples);
+ kfree(nlat->latency);
+ kfree(nlat);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blk_nlat_init);
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 37245c97ee61..2fc11ced0c00 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -17,6 +17,7 @@ enum rq_qos_id {
RQ_QOS_WBT,
RQ_QOS_LATENCY,
RQ_QOS_COST,
+ RQ_QOS_NLAT,
};
struct rq_wait {
@@ -79,6 +80,11 @@ static inline struct rq_qos *iolat_rq_qos(struct request_queue *q)
return rq_qos_id(q, RQ_QOS_LATENCY);
}
+static inline struct rq_qos *nlat_rq_qos(struct request_queue *q)
+{
+ return rq_qos_id(q, RQ_QOS_NLAT);
+}
+
static inline void rq_wait_init(struct rq_wait *rq_wait)
{
atomic_set(&rq_wait->inflight, 0);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d3d8fd8e229b..1f3829627f1b 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -1231,4 +1231,15 @@ static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
}
#endif /* CONFIG_BLK_DEV_ZONED */
+#ifdef CONFIG_BLK_NODE_LATENCY
+int blk_nlat_enable(struct gendisk *disk);
+void blk_nlat_disable(struct gendisk *disk);
+u64 blk_nlat_latency(struct gendisk *disk, int node);
+int blk_nlat_init(struct gendisk *disk);
+#else
+static inline int blk_nlat_enable(struct gendisk *disk) { return 0; }
+static inline void blk_nlat_disable(struct gendisk *disk) {}
+static inline u64 blk_nlat_latency(struct gendisk *disk, int node) { return 0; }
+static inline int blk_nlat_init(struct gendisk *disk) { return -EOPNOTSUPP; }
+#endif
#endif /* BLK_MQ_H */
--
2.39.3
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH v3 2/3] nvme: add 'latency' iopolicy
2024-04-03 14:17 [PATCHv2 0/2] block,nvme: latency-based I/O scheduler Hannes Reinecke
` (4 preceding siblings ...)
2024-05-09 20:43 ` [PATCH v3 1/3] block: track per-node I/O latency John Meneghini
@ 2024-05-09 20:43 ` John Meneghini
2024-05-10 7:17 ` Damien Le Moal
2024-05-09 20:43 ` [PATCH v3 3/3] nvme: multipath: pr_notice when iopolicy changes John Meneghini
6 siblings, 1 reply; 25+ messages in thread
From: John Meneghini @ 2024-05-09 20:43 UTC (permalink / raw)
To: tj, josef, axboe, kbusch, hch, sagi, emilne, hare
Cc: linux-block, cgroups, linux-nvme, linux-kernel, jmeneghi, jrani,
randyj, aviv.coro
From: Hannes Reinecke <hare@kernel.org>
Add a latency-based I/O policy for multipathing. It uses the blk-nodelat
latency tracker to provide latencies for each node, and schedules
I/O on the path with the least latency for the submitting node.
Signed-off-by: Hannes Reinecke <hare@kernel.org>
Make this compile when CONFIG_BLK_NODE_LATENCY is not set.
Advertise the 'latency' iopolicy in modinfo.
Signed-off-by: John Meneghini <jmeneghi@redhat.com>
---
drivers/nvme/host/multipath.c | 63 ++++++++++++++++++++++++++++++-----
drivers/nvme/host/nvme.h | 1 +
2 files changed, 55 insertions(+), 9 deletions(-)
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index d916a5ddf5d4..e9330bb1990b 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -18,6 +18,7 @@ static const char *nvme_iopolicy_names[] = {
[NVME_IOPOLICY_NUMA] = "numa",
[NVME_IOPOLICY_RR] = "round-robin",
[NVME_IOPOLICY_QD] = "queue-depth",
+ [NVME_IOPOLICY_LAT] = "latency",
};
static int iopolicy = NVME_IOPOLICY_NUMA;
@@ -32,6 +33,10 @@ static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
iopolicy = NVME_IOPOLICY_RR;
else if (!strncmp(val, "queue-depth", 11))
iopolicy = NVME_IOPOLICY_QD;
+#ifdef CONFIG_BLK_NODE_LATENCY
+ else if (!strncmp(val, "latency", 7))
+ iopolicy = NVME_IOPOLICY_LAT;
+#endif
else
return -EINVAL;
@@ -43,10 +48,36 @@ static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
}
+static int nvme_activate_iopolicy(struct nvme_subsystem *subsys, int iopolicy)
+{
+ struct nvme_ns_head *h;
+ struct nvme_ns *ns;
+ bool enable = iopolicy == NVME_IOPOLICY_LAT;
+ int ret = 0;
+
+ mutex_lock(&subsys->lock);
+ list_for_each_entry(h, &subsys->nsheads, entry) {
+ list_for_each_entry_rcu(ns, &h->list, siblings) {
+ if (enable) {
+ ret = blk_nlat_enable(ns->disk);
+ if (ret)
+ break;
+ } else
+ blk_nlat_disable(ns->disk);
+ }
+ }
+ mutex_unlock(&subsys->lock);
+ return ret;
+}
+
module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
&iopolicy, 0644);
MODULE_PARM_DESC(iopolicy,
+#if defined(CONFIG_BLK_NODE_LATENCY)
+ "Default multipath I/O policy; 'numa' (default) , 'round-robin', 'queue-depth' or 'latency'");
+#else
"Default multipath I/O policy; 'numa' (default) , 'round-robin' or 'queue-depth'");
+#endif
void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
{
@@ -250,14 +281,16 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
{
int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
struct nvme_ns *found = NULL, *fallback = NULL, *ns;
+ int iopolicy = READ_ONCE(head->subsys->iopolicy);
list_for_each_entry_rcu(ns, &head->list, siblings) {
if (nvme_path_is_disabled(ns))
continue;
- if (ns->ctrl->numa_node != NUMA_NO_NODE &&
- READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
+ if (iopolicy == NVME_IOPOLICY_NUMA)
distance = node_distance(node, ns->ctrl->numa_node);
+ else if (iopolicy == NVME_IOPOLICY_LAT)
+ distance = blk_nlat_latency(ns->disk, node);
else
distance = LOCAL_DISTANCE;
@@ -381,8 +414,8 @@ static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
- int iopolicy = READ_ONCE(head->subsys->iopolicy);
int node;
+ int iopolicy = READ_ONCE(head->subsys->iopolicy);
struct nvme_ns *ns;
/*
@@ -401,8 +434,8 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
if (iopolicy == NVME_IOPOLICY_RR)
return nvme_round_robin_path(head, node, ns);
-
- if (unlikely(!nvme_path_is_optimized(ns)))
+ if (iopolicy == NVME_IOPOLICY_LAT ||
+ unlikely(!nvme_path_is_optimized(ns)))
return __nvme_find_path(head, node);
return ns;
}
@@ -872,15 +905,18 @@ static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
{
struct nvme_subsystem *subsys =
container_of(dev, struct nvme_subsystem, dev);
- int i;
+ int i, ret;
for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
- nvme_subsys_iopolicy_update(subsys, i);
- return count;
+ ret = nvme_activate_iopolicy(subsys, i);
+ if (!ret) {
+ nvme_subsys_iopolicy_update(subsys, i);
+ return count;
+ }
+ return ret;
}
}
-
return -EINVAL;
}
SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
@@ -916,6 +952,15 @@ static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
{
+ if (!blk_nlat_init(ns->disk) &&
+ READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_LAT) {
+ int ret = blk_nlat_enable(ns->disk);
+
+ if (unlikely(ret))
+ pr_warn("%s: Failed to enable latency tracking, error %d\n",
+ ns->disk->disk_name, ret);
+ }
+
if (nvme_ctrl_use_ana(ns->ctrl)) {
struct nvme_ana_group_desc desc = {
.grpid = anagrpid,
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index a557b4577c01..66bf003a6c48 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -411,6 +411,7 @@ enum nvme_iopolicy {
NVME_IOPOLICY_NUMA,
NVME_IOPOLICY_RR,
NVME_IOPOLICY_QD,
+ NVME_IOPOLICY_LAT,
};
struct nvme_subsystem {
--
2.39.3
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH v3 3/3] nvme: multipath: pr_notice when iopolicy changes
2024-04-03 14:17 [PATCHv2 0/2] block,nvme: latency-based I/O scheduler Hannes Reinecke
` (5 preceding siblings ...)
2024-05-09 20:43 ` [PATCH v3 2/3] nvme: add 'latency' iopolicy John Meneghini
@ 2024-05-09 20:43 ` John Meneghini
2024-05-10 7:19 ` Damien Le Moal
6 siblings, 1 reply; 25+ messages in thread
From: John Meneghini @ 2024-05-09 20:43 UTC (permalink / raw)
To: tj, josef, axboe, kbusch, hch, sagi, emilne, hare
Cc: linux-block, cgroups, linux-nvme, linux-kernel, jmeneghi, jrani,
randyj, aviv.coro
Send a pr_notice when ever the iopolicy on a subsystem
is changed. This is important for support reasons. It
is fully expected that users will be changing the iopolicy
with active IO in progress.
Signed-off-by: John Meneghini <jmeneghi@redhat.com>
---
drivers/nvme/host/multipath.c | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index e9330bb1990b..0286e44a081f 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -67,6 +67,10 @@ static int nvme_activate_iopolicy(struct nvme_subsystem *subsys, int iopolicy)
}
}
mutex_unlock(&subsys->lock);
+
+ pr_notice("%s: %s enable %d status %d for subsysnqn %s\n", __func__,
+ nvme_iopolicy_names[iopolicy], enable, ret, subsys->subnqn);
+
return ret;
}
@@ -890,6 +894,8 @@ void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
{
struct nvme_ctrl *ctrl;
+ int old_iopolicy = READ_ONCE(subsys->iopolicy);
+
WRITE_ONCE(subsys->iopolicy, iopolicy);
mutex_lock(&nvme_subsystems_lock);
@@ -898,6 +904,10 @@ void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
nvme_mpath_clear_ctrl_paths(ctrl);
}
mutex_unlock(&nvme_subsystems_lock);
+
+ pr_notice("%s: changed from %s to %s for subsysnqn %s\n", __func__,
+ nvme_iopolicy_names[old_iopolicy], nvme_iopolicy_names[iopolicy],
+ subsys->subnqn);
}
static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
--
2.39.3
^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [PATCH v3 1/3] block: track per-node I/O latency
2024-05-09 20:43 ` [PATCH v3 1/3] block: track per-node I/O latency John Meneghini
@ 2024-05-10 7:11 ` Damien Le Moal
2024-05-10 9:28 ` Niklas Cassel
2024-05-10 10:00 ` Hannes Reinecke
0 siblings, 2 replies; 25+ messages in thread
From: Damien Le Moal @ 2024-05-10 7:11 UTC (permalink / raw)
To: John Meneghini, tj, josef, axboe, kbusch, hch, sagi, emilne, hare
Cc: linux-block, cgroups, linux-nvme, linux-kernel, jrani, randyj,
aviv.coro
On 5/10/24 05:43, John Meneghini wrote:
> From: Hannes Reinecke <hare@kernel.org>
>
> Add a new option 'BLK_NODE_LATENCY' to track per-node I/O latency.
> This can be used by I/O schedulers to determine the 'best' queue
> to send I/O to.
>
> Signed-off-by: Hannes Reinecke <hare@kernel.org>
>
> Cleaned up checkpatch warnings and updated MAINTAINERS.
This note should be before Hannes SoB. E.g:
[John] Fixed checkpatch warnings and updated MAINTAINERS.
>
> Signed-off-by: John Meneghini <jmeneghi@redhat.com>
> ---
> MAINTAINERS | 1 +
> block/Kconfig | 9 +
> block/Makefile | 1 +
> block/blk-mq-debugfs.c | 2 +
> block/blk-nlatency.c | 389 +++++++++++++++++++++++++++++++++++++++++
> block/blk-rq-qos.h | 6 +
> include/linux/blk-mq.h | 11 ++
> 7 files changed, 419 insertions(+)
> create mode 100644 block/blk-nlatency.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 7c121493f43d..a4634365c82f 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -5405,6 +5405,7 @@ F: block/bfq-cgroup.c
> F: block/blk-cgroup.c
> F: block/blk-iocost.c
> F: block/blk-iolatency.c
> +F: block/blk-nlatency.c
> F: block/blk-throttle.c
> F: include/linux/blk-cgroup.h
>
> diff --git a/block/Kconfig b/block/Kconfig
> index 1de4682d48cc..641ed39d609c 100644
> --- a/block/Kconfig
> +++ b/block/Kconfig
> @@ -186,6 +186,15 @@ config BLK_CGROUP_IOPRIO
> scheduler and block devices process requests. Only some I/O schedulers
> and some block devices support I/O priorities.
>
> +config BLK_NODE_LATENCY
> + bool "Track per-node I/O latency"
> + help
> + Enable per-node I/O latency tracking for multipathing. This uses the
> + blk-nodelat latency tracker to provide latencies for each node, and schedules
> + I/O on the path with the least latency for the submitting node. This can be
> + used by I/O schedulers to determine the node with the least latency. Currently
> + only supports nvme over fabrics devices.
> +
> config BLK_DEBUG_FS
> bool "Block layer debugging information in debugfs"
> default y
> diff --git a/block/Makefile b/block/Makefile
> index 46ada9dc8bbf..9d2e71a3e36f 100644
> --- a/block/Makefile
> +++ b/block/Makefile
> @@ -21,6 +21,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
> obj-$(CONFIG_BLK_CGROUP_IOPRIO) += blk-ioprio.o
> obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o
> obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o
> +obj-$(CONFIG_BLK_NODE_LATENCY) += blk-nlatency.o
Let's keep the alignment please.
> obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o
> obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
> bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> index 94668e72ab09..cb38228b95d8 100644
> --- a/block/blk-mq-debugfs.c
> +++ b/block/blk-mq-debugfs.c
> @@ -762,6 +762,8 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id)
> return "latency";
> case RQ_QOS_COST:
> return "cost";
> + case RQ_QOS_NLAT:
> + return "node-latency";
> }
> return "unknown";
> }
> diff --git a/block/blk-nlatency.c b/block/blk-nlatency.c
> new file mode 100644
> index 000000000000..219c3f636d76
> --- /dev/null
> +++ b/block/blk-nlatency.c
> @@ -0,0 +1,389 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Per-node request latency tracking.
> + *
> + * Copyright (C) 2023 Hannes Reinecke
> + *
> + * A simple per-node latency tracker for use by I/O scheduler.
> + * Latencies are measures over 'win_usec' microseconds and stored per node.
> + * If the number of measurements falls below 'lowat' the measurement is
> + * assumed to be unreliable and will become 'stale'.
> + * These 'stale' latencies can be 'decayed', where during each measurement
> + * interval the 'stale' latency value is decreased by 'decay' percent.
> + * Once the 'stale' latency reaches zero it will be updated by the
> + * measured latency.
> + */
> +#include <linux/kernel.h>
> +#include <linux/blk_types.h>
> +#include <linux/slab.h>
> +
> +#include "blk-stat.h"
> +#include "blk-rq-qos.h"
> +#include "blk.h"
> +
> +#define NLAT_DEFAULT_LOWAT 2
> +#define NLAT_DEFAULT_DECAY 50
> +
> +struct rq_nlat {
> + struct rq_qos rqos;
> +
> + u64 win_usec; /* latency measurement window in microseconds */
Using microseconds forces you to do costly multiplications and divisions by
1000. Why not keep things in nanoseconds ?
> + unsigned int lowat; /* Low Watermark latency measurement */
> + unsigned int decay; /* Percentage for 'decaying' latencies */
> + bool enabled;
> +
> + struct blk_stat_callback *cb;
> +
> + unsigned int num;
> + u64 *latency;
> + unsigned int *samples;
> +};
> +
> +static inline struct rq_nlat *RQNLAT(struct rq_qos *rqos)
> +{
> + return container_of(rqos, struct rq_nlat, rqos);
> +}
> +
> +static u64 nlat_default_latency_usec(struct request_queue *q)
> +{
> + /*
> + * We default to 2msec for non-rotational storage, and 75msec
> + * for rotational storage.
> + */
> + if (blk_queue_nonrot(q))
> + return 2000ULL;
> + else
No need for this else.
> + return 75000ULL;
> +}
> +
> +static void nlat_timer_fn(struct blk_stat_callback *cb)
> +{
> + struct rq_nlat *nlat = cb->data;
> + int n;
> +
> + for (n = 0; n < cb->buckets; n++) {
> + if (cb->stat[n].nr_samples < nlat->lowat) {
> + /*
> + * 'decay' the latency by the specified
> + * percentage to ensure the queues are
> + * being tested to balance out temporary
> + * latency spikes.
> + */
> + nlat->latency[n] =
> + div64_u64(nlat->latency[n] * nlat->decay, 100);
> + } else
> + nlat->latency[n] = cb->stat[n].mean;
Missing the curly brackets around the else block.
Nit: n is a rather unusual name for a loop index. Why not the usual "i" ? Does
notmatter much though.
> + nlat->samples[n] = cb->stat[n].nr_samples;
> + }
> + if (nlat->enabled)
> + blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
> +}
> +
> +static int nlat_bucket_node(const struct request *rq)
> +{
> + if (!rq->mq_ctx)
> + return -1;
> + return cpu_to_node(blk_mq_rq_cpu((struct request *)rq));
> +}
> +
> +static void nlat_exit(struct rq_qos *rqos)
> +{
> + struct rq_nlat *nlat = RQNLAT(rqos);
> +
> + blk_stat_remove_callback(nlat->rqos.disk->queue, nlat->cb);
> + blk_stat_free_callback(nlat->cb);
> + kfree(nlat->samples);
> + kfree(nlat->latency);
> + kfree(nlat);
> +}
> +
> +#ifdef CONFIG_BLK_DEBUG_FS
> +static int nlat_win_usec_show(void *data, struct seq_file *m)
> +{
> + struct rq_qos *rqos = data;
> + struct rq_nlat *nlat = RQNLAT(rqos);
> +
> + seq_printf(m, "%llu\n", nlat->win_usec);
> + return 0;
> +}
> +
> +static ssize_t nlat_win_usec_write(void *data, const char __user *buf,
> + size_t count, loff_t *ppos)
> +{
> + struct rq_qos *rqos = data;
> + struct rq_nlat *nlat = RQNLAT(rqos);
> + char val[16] = { };
> + u64 usec;
> + int err;
> +
> + if (blk_queue_dying(nlat->rqos.disk->queue))
> + return -ENOENT;
> +
> + if (count >= sizeof(val))
> + return -EINVAL;
> +
> + if (copy_from_user(val, buf, count))
> + return -EFAULT;
> +
> + err = kstrtoull(val, 10, &usec);
> + if (err)
> + return err;
> + blk_stat_deactivate(nlat->cb);
> + nlat->win_usec = usec;
> + blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
> +
> + return count;
> +}
> +
> +static int nlat_lowat_show(void *data, struct seq_file *m)
> +{
> + struct rq_qos *rqos = data;
> + struct rq_nlat *nlat = RQNLAT(rqos);
> +
> + seq_printf(m, "%u\n", nlat->lowat);
> + return 0;
> +}
> +
> +static ssize_t nlat_lowat_write(void *data, const char __user *buf,
> + size_t count, loff_t *ppos)
> +{
> + struct rq_qos *rqos = data;
> + struct rq_nlat *nlat = RQNLAT(rqos);
> + char val[16] = { };
> + unsigned int lowat;
> + int err;
> +
> + if (blk_queue_dying(nlat->rqos.disk->queue))
> + return -ENOENT;
> +
> + if (count >= sizeof(val))
> + return -EINVAL;
> +
> + if (copy_from_user(val, buf, count))
> + return -EFAULT;
> +
> + err = kstrtouint(val, 10, &lowat);
> + if (err)
> + return err;
> + blk_stat_deactivate(nlat->cb);
> + nlat->lowat = lowat;
> + blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
> +
> + return count;
> +}
> +
> +static int nlat_decay_show(void *data, struct seq_file *m)
> +{
> + struct rq_qos *rqos = data;
> + struct rq_nlat *nlat = RQNLAT(rqos);
> +
> + seq_printf(m, "%u\n", nlat->decay);
> + return 0;
> +}
> +
> +static ssize_t nlat_decay_write(void *data, const char __user *buf,
> + size_t count, loff_t *ppos)
> +{
> + struct rq_qos *rqos = data;
> + struct rq_nlat *nlat = RQNLAT(rqos);
> + char val[16] = { };
> + unsigned int decay;
> + int err;
> +
> + if (blk_queue_dying(nlat->rqos.disk->queue))
> + return -ENOENT;
> +
> + if (count >= sizeof(val))
> + return -EINVAL;
> +
> + if (copy_from_user(val, buf, count))
> + return -EFAULT;
> +
> + err = kstrtouint(val, 10, &decay);
> + if (err)
> + return err;
> + if (decay > 100)
> + return -EINVAL;
> + blk_stat_deactivate(nlat->cb);
> + nlat->decay = decay;
> + blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
> +
> + return count;
> +}
> +
> +static int nlat_enabled_show(void *data, struct seq_file *m)
> +{
> + struct rq_qos *rqos = data;
> + struct rq_nlat *nlat = RQNLAT(rqos);
> +
> + seq_printf(m, "%d\n", nlat->enabled);
> + return 0;
> +}
> +
> +static int nlat_id_show(void *data, struct seq_file *m)
> +{
> + struct rq_qos *rqos = data;
> +
> + seq_printf(m, "%u\n", rqos->id);
> + return 0;
> +}
> +
> +static int nlat_latency_show(void *data, struct seq_file *m)
> +{
> + struct rq_qos *rqos = data;
> + struct rq_nlat *nlat = RQNLAT(rqos);
> + int n;
> +
> + if (!nlat->enabled)
> + return 0;
> +
> + for (n = 0; n < nlat->num; n++) {
> + if (n > 0)
> + seq_puts(m, " ");
> + seq_printf(m, "%llu", nlat->latency[n]);
> + }
> + seq_puts(m, "\n");
> + return 0;
> +}
> +
> +static int nlat_samples_show(void *data, struct seq_file *m)
> +{
> + struct rq_qos *rqos = data;
> + struct rq_nlat *nlat = RQNLAT(rqos);
> + int n;
> +
> + if (!nlat->enabled)
> + return 0;
> +
> + for (n = 0; n < nlat->num; n++) {
> + if (n > 0)
> + seq_puts(m, " ");
> + seq_printf(m, "%u", nlat->samples[n]);
> + }
> + seq_puts(m, "\n");
> + return 0;
> +}
> +
> +static const struct blk_mq_debugfs_attr nlat_debugfs_attrs[] = {
> + {"win_usec", 0600, nlat_win_usec_show, nlat_win_usec_write},
> + {"lowat", 0600, nlat_lowat_show, nlat_lowat_write},
> + {"decay", 0600, nlat_decay_show, nlat_decay_write},
> + {"enabled", 0400, nlat_enabled_show},
> + {"id", 0400, nlat_id_show},
> + {"latency", 0400, nlat_latency_show},
> + {"samples", 0400, nlat_samples_show},
> + {},
> +};
> +#endif
> +
> +static const struct rq_qos_ops nlat_rqos_ops = {
> + .exit = nlat_exit,
> +#ifdef CONFIG_BLK_DEBUG_FS
> + .debugfs_attrs = nlat_debugfs_attrs,
> +#endif
> +};
> +
> +u64 blk_nlat_latency(struct gendisk *disk, int node)
> +{
> + struct rq_qos *rqos;
> + struct rq_nlat *nlat;
> +
> + rqos = nlat_rq_qos(disk->queue);
> + if (!rqos)
> + return 0;
> + nlat = RQNLAT(rqos);
> + if (node > nlat->num)
> + return 0;
> +
> + return div64_u64(nlat->latency[node], 1000);
See comment at the top. Why not keep everything in nanoseconds to avoid this
costly division ?
> +}
> +EXPORT_SYMBOL_GPL(blk_nlat_latency);
> +
> +int blk_nlat_enable(struct gendisk *disk)
> +{
> + struct rq_qos *rqos;
> + struct rq_nlat *nlat;
> +
> + /* Latency tracking not enabled? */
> + rqos = nlat_rq_qos(disk->queue);
> + if (!rqos)
> + return -EINVAL;
> + nlat = RQNLAT(rqos);
> + if (nlat->enabled)
> + return 0;
> +
> + /* Queue not registered? Maybe shutting down... */
> + if (!blk_queue_registered(disk->queue))
> + return -EAGAIN;
> +
> + nlat->enabled = true;
> + memset(nlat->latency, 0, sizeof(u64) * nlat->num);
> + memset(nlat->samples, 0, sizeof(unsigned int) * nlat->num);
> + blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(blk_nlat_enable);
> +
> +void blk_nlat_disable(struct gendisk *disk)
> +{
> + struct rq_qos *rqos = nlat_rq_qos(disk->queue);
> + struct rq_nlat *nlat;
> +
> + if (!rqos)
> + return;
> + nlat = RQNLAT(rqos);
> + if (nlat->enabled) {
> + blk_stat_deactivate(nlat->cb);
> + nlat->enabled = false;
> + }
> +}
> +EXPORT_SYMBOL_GPL(blk_nlat_disable);
> +
> +int blk_nlat_init(struct gendisk *disk)
> +{
> + struct rq_nlat *nlat;
> + int ret = -ENOMEM;
> +
> + nlat = kzalloc(sizeof(*nlat), GFP_KERNEL);
> + if (!nlat)
> + return -ENOMEM;
> +
> + nlat->num = num_possible_nodes();
> + nlat->lowat = NLAT_DEFAULT_LOWAT;
> + nlat->decay = NLAT_DEFAULT_DECAY;
> + nlat->win_usec = nlat_default_latency_usec(disk->queue);
> +
> + nlat->latency = kcalloc(nlat->num, sizeof(u64), GFP_KERNEL);
> + if (!nlat->latency)
> + goto err_free;
> + nlat->samples = kcalloc(nlat->num, sizeof(unsigned int), GFP_KERNEL);
> + if (!nlat->samples)
> + goto err_free;
> + nlat->cb = blk_stat_alloc_callback(nlat_timer_fn, nlat_bucket_node,
> + nlat->num, nlat);
> + if (!nlat->cb)
> + goto err_free;
> +
> + /*
> + * Assign rwb and add the stats callback.
> + */
This can be a single line comment.
> + mutex_lock(&disk->queue->rq_qos_mutex);
> + ret = rq_qos_add(&nlat->rqos, disk, RQ_QOS_NLAT, &nlat_rqos_ops);
> + mutex_unlock(&disk->queue->rq_qos_mutex);
> + if (ret)
> + goto err_free_cb;
> +
> + blk_stat_add_callback(disk->queue, nlat->cb);
> +
> + return 0;
> +
> +err_free_cb:
> + blk_stat_free_callback(nlat->cb);
> +err_free:
> + kfree(nlat->samples);
> + kfree(nlat->latency);
> + kfree(nlat);
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(blk_nlat_init);
> diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
> index 37245c97ee61..2fc11ced0c00 100644
> --- a/block/blk-rq-qos.h
> +++ b/block/blk-rq-qos.h
> @@ -17,6 +17,7 @@ enum rq_qos_id {
> RQ_QOS_WBT,
> RQ_QOS_LATENCY,
> RQ_QOS_COST,
> + RQ_QOS_NLAT,
> };
>
> struct rq_wait {
> @@ -79,6 +80,11 @@ static inline struct rq_qos *iolat_rq_qos(struct request_queue *q)
> return rq_qos_id(q, RQ_QOS_LATENCY);
> }
>
> +static inline struct rq_qos *nlat_rq_qos(struct request_queue *q)
> +{
> + return rq_qos_id(q, RQ_QOS_NLAT);
> +}
> +
> static inline void rq_wait_init(struct rq_wait *rq_wait)
> {
> atomic_set(&rq_wait->inflight, 0);
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index d3d8fd8e229b..1f3829627f1b 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -1231,4 +1231,15 @@ static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
> }
> #endif /* CONFIG_BLK_DEV_ZONED */
>
> +#ifdef CONFIG_BLK_NODE_LATENCY
> +int blk_nlat_enable(struct gendisk *disk);
> +void blk_nlat_disable(struct gendisk *disk);
> +u64 blk_nlat_latency(struct gendisk *disk, int node);
> +int blk_nlat_init(struct gendisk *disk);
> +#else
> +static inline int blk_nlat_enable(struct gendisk *disk) { return 0; }
> +static inline void blk_nlat_disable(struct gendisk *disk) {}
> +static inline u64 blk_nlat_latency(struct gendisk *disk, int node) { return 0; }
> +static inline int blk_nlat_init(struct gendisk *disk) { return -EOPNOTSUPP; }
> +#endif
> #endif /* BLK_MQ_H */
--
Damien Le Moal
Western Digital Research
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH v3 2/3] nvme: add 'latency' iopolicy
2024-05-09 20:43 ` [PATCH v3 2/3] nvme: add 'latency' iopolicy John Meneghini
@ 2024-05-10 7:17 ` Damien Le Moal
2024-05-10 10:03 ` Hannes Reinecke
0 siblings, 1 reply; 25+ messages in thread
From: Damien Le Moal @ 2024-05-10 7:17 UTC (permalink / raw)
To: John Meneghini, tj, josef, axboe, kbusch, hch, sagi, emilne, hare
Cc: linux-block, cgroups, linux-nvme, linux-kernel, jrani, randyj,
aviv.coro
On 5/10/24 05:43, John Meneghini wrote:
> From: Hannes Reinecke <hare@kernel.org>
>
> Add a latency-based I/O policy for multipathing. It uses the blk-nodelat
> latency tracker to provide latencies for each node, and schedules
> I/O on the path with the least latency for the submitting node.
>
> Signed-off-by: Hannes Reinecke <hare@kernel.org>
>
> Make this compile when CONFIG_BLK_NODE_LATENCY is not set.
> Advertise the 'latency' iopolicy in modinfo.
>
> Signed-off-by: John Meneghini <jmeneghi@redhat.com>
> ---
> drivers/nvme/host/multipath.c | 63 ++++++++++++++++++++++++++++++-----
> drivers/nvme/host/nvme.h | 1 +
> 2 files changed, 55 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index d916a5ddf5d4..e9330bb1990b 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -18,6 +18,7 @@ static const char *nvme_iopolicy_names[] = {
> [NVME_IOPOLICY_NUMA] = "numa",
> [NVME_IOPOLICY_RR] = "round-robin",
> [NVME_IOPOLICY_QD] = "queue-depth",
> + [NVME_IOPOLICY_LAT] = "latency",
> };
>
> static int iopolicy = NVME_IOPOLICY_NUMA;
> @@ -32,6 +33,10 @@ static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
> iopolicy = NVME_IOPOLICY_RR;
> else if (!strncmp(val, "queue-depth", 11))
> iopolicy = NVME_IOPOLICY_QD;
> +#ifdef CONFIG_BLK_NODE_LATENCY
> + else if (!strncmp(val, "latency", 7))
> + iopolicy = NVME_IOPOLICY_LAT;
> +#endif
> else
> return -EINVAL;
>
> @@ -43,10 +48,36 @@ static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
> return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
> }
>
> +static int nvme_activate_iopolicy(struct nvme_subsystem *subsys, int iopolicy)
> +{
> + struct nvme_ns_head *h;
> + struct nvme_ns *ns;
> + bool enable = iopolicy == NVME_IOPOLICY_LAT;
> + int ret = 0;
> +
> + mutex_lock(&subsys->lock);
> + list_for_each_entry(h, &subsys->nsheads, entry) {
> + list_for_each_entry_rcu(ns, &h->list, siblings) {
> + if (enable) {
> + ret = blk_nlat_enable(ns->disk);
> + if (ret)
> + break;
> + } else
> + blk_nlat_disable(ns->disk);
Missing curly brackets for the else.
> + }
> + }
> + mutex_unlock(&subsys->lock);
> + return ret;
> +}
> +
> module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
> &iopolicy, 0644);
> MODULE_PARM_DESC(iopolicy,
> +#if defined(CONFIG_BLK_NODE_LATENCY)
What is so special about the latency policy that it needs to be conditionally
defined ? I missed that point. Why not drop CONFIG_BLK_NODE_LATENCY ?
> + "Default multipath I/O policy; 'numa' (default) , 'round-robin', 'queue-depth' or 'latency'");
> +#else
> "Default multipath I/O policy; 'numa' (default) , 'round-robin' or 'queue-depth'");
> +#endif
>
> void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
> {
> @@ -250,14 +281,16 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
> {
> int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
> struct nvme_ns *found = NULL, *fallback = NULL, *ns;
> + int iopolicy = READ_ONCE(head->subsys->iopolicy);
>
> list_for_each_entry_rcu(ns, &head->list, siblings) {
> if (nvme_path_is_disabled(ns))
> continue;
>
> - if (ns->ctrl->numa_node != NUMA_NO_NODE &&
> - READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
> + if (iopolicy == NVME_IOPOLICY_NUMA)
> distance = node_distance(node, ns->ctrl->numa_node);
> + else if (iopolicy == NVME_IOPOLICY_LAT)
> + distance = blk_nlat_latency(ns->disk, node);
> else
> distance = LOCAL_DISTANCE;
>
> @@ -381,8 +414,8 @@ static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
>
> inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
> {
> - int iopolicy = READ_ONCE(head->subsys->iopolicy);
> int node;
> + int iopolicy = READ_ONCE(head->subsys->iopolicy);
No need to move this line.
> struct nvme_ns *ns;
>
> /*
> @@ -401,8 +434,8 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
>
> if (iopolicy == NVME_IOPOLICY_RR)
> return nvme_round_robin_path(head, node, ns);
> -
> - if (unlikely(!nvme_path_is_optimized(ns)))
> + if (iopolicy == NVME_IOPOLICY_LAT ||
> + unlikely(!nvme_path_is_optimized(ns)))
> return __nvme_find_path(head, node);
> return ns;
> }
> @@ -872,15 +905,18 @@ static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
> {
> struct nvme_subsystem *subsys =
> container_of(dev, struct nvme_subsystem, dev);
> - int i;
> + int i, ret;
>
> for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
> if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
> - nvme_subsys_iopolicy_update(subsys, i);
> - return count;
> + ret = nvme_activate_iopolicy(subsys, i);
> + if (!ret) {
> + nvme_subsys_iopolicy_update(subsys, i);
> + return count;
> + }
> + return ret;
It would be nicer to have this as:
if (ret)
break
nvme_subsys_iopolicy_update(subsys, i);
return count;
> }
> }
> -
whiteline change.
> return -EINVAL;
And "return ret;" here with ret initialized to -EINVAL when declared.
> }
> SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
> @@ -916,6 +952,15 @@ static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
>
> void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
> {
> + if (!blk_nlat_init(ns->disk) &&
> + READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_LAT) {
> + int ret = blk_nlat_enable(ns->disk);
> +
> + if (unlikely(ret))
> + pr_warn("%s: Failed to enable latency tracking, error %d\n",
> + ns->disk->disk_name, ret);
> + }
> +
> if (nvme_ctrl_use_ana(ns->ctrl)) {
> struct nvme_ana_group_desc desc = {
> .grpid = anagrpid,
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index a557b4577c01..66bf003a6c48 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -411,6 +411,7 @@ enum nvme_iopolicy {
> NVME_IOPOLICY_NUMA,
> NVME_IOPOLICY_RR,
> NVME_IOPOLICY_QD,
> + NVME_IOPOLICY_LAT,
> };
>
> struct nvme_subsystem {
--
Damien Le Moal
Western Digital Research
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH v3 3/3] nvme: multipath: pr_notice when iopolicy changes
2024-05-09 20:43 ` [PATCH v3 3/3] nvme: multipath: pr_notice when iopolicy changes John Meneghini
@ 2024-05-10 7:19 ` Damien Le Moal
0 siblings, 0 replies; 25+ messages in thread
From: Damien Le Moal @ 2024-05-10 7:19 UTC (permalink / raw)
To: John Meneghini, tj, josef, axboe, kbusch, hch, sagi, emilne, hare
Cc: linux-block, cgroups, linux-nvme, linux-kernel, jrani, randyj,
aviv.coro
On 5/10/24 05:43, John Meneghini wrote:
> Send a pr_notice when ever the iopolicy on a subsystem
> is changed. This is important for support reasons. It
> is fully expected that users will be changing the iopolicy
> with active IO in progress.
>
> Signed-off-by: John Meneghini <jmeneghi@redhat.com>
> ---
> drivers/nvme/host/multipath.c | 10 ++++++++++
> 1 file changed, 10 insertions(+)
>
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index e9330bb1990b..0286e44a081f 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -67,6 +67,10 @@ static int nvme_activate_iopolicy(struct nvme_subsystem *subsys, int iopolicy)
> }
> }
> mutex_unlock(&subsys->lock);
> +
> + pr_notice("%s: %s enable %d status %d for subsysnqn %s\n", __func__,
> + nvme_iopolicy_names[iopolicy], enable, ret, subsys->subnqn);
> +
> return ret;
> }
>
> @@ -890,6 +894,8 @@ void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
> {
> struct nvme_ctrl *ctrl;
>
> + int old_iopolicy = READ_ONCE(subsys->iopolicy);
No need for the white line before this.
> +
> WRITE_ONCE(subsys->iopolicy, iopolicy);
>
> mutex_lock(&nvme_subsystems_lock);
> @@ -898,6 +904,10 @@ void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
> nvme_mpath_clear_ctrl_paths(ctrl);
> }
> mutex_unlock(&nvme_subsystems_lock);
> +
> + pr_notice("%s: changed from %s to %s for subsysnqn %s\n", __func__,
> + nvme_iopolicy_names[old_iopolicy], nvme_iopolicy_names[iopolicy],
> + subsys->subnqn);
Using dev_notice(&subsys->dev, "...", ...); may be better. Same for the other
messages.
> }
>
> static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
--
Damien Le Moal
Western Digital Research
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH v3 1/3] block: track per-node I/O latency
2024-05-10 7:11 ` Damien Le Moal
@ 2024-05-10 9:28 ` Niklas Cassel
2024-05-10 10:00 ` Hannes Reinecke
1 sibling, 0 replies; 25+ messages in thread
From: Niklas Cassel @ 2024-05-10 9:28 UTC (permalink / raw)
To: Damien Le Moal
Cc: John Meneghini, tj, josef, axboe, kbusch, hch, sagi, emilne, hare,
linux-block, cgroups, linux-nvme, linux-kernel, jrani, randyj,
aviv.coro
On Fri, May 10, 2024 at 04:11:10PM +0900, Damien Le Moal wrote:
> On 5/10/24 05:43, John Meneghini wrote:
> > From: Hannes Reinecke <hare@kernel.org>
> >
> > Add a new option 'BLK_NODE_LATENCY' to track per-node I/O latency.
> > This can be used by I/O schedulers to determine the 'best' queue
> > to send I/O to.
> >
> > Signed-off-by: Hannes Reinecke <hare@kernel.org>
> >
> > Cleaned up checkpatch warnings and updated MAINTAINERS.
>
> This note should be before Hannes SoB. E.g:
>
> [John] Fixed checkpatch warnings and updated MAINTAINERS.
Not before, it shoud be after Hannes SoB.
(Between Hannes' Signed-off-by and John's Signed-off-by)
See this example:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a37e12bcab22efa05802f87baa0692365ae0ab4d
Kind regards,
Niklas
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH v3 0/3] block,nvme: latency-based I/O scheduler
2024-05-09 20:43 ` [PATCH v3 0/3] " John Meneghini
@ 2024-05-10 9:34 ` Niklas Cassel
0 siblings, 0 replies; 25+ messages in thread
From: Niklas Cassel @ 2024-05-10 9:34 UTC (permalink / raw)
To: John Meneghini
Cc: tj, josef, axboe, kbusch, hch, sagi, emilne, hare, linux-block,
cgroups, linux-nvme, linux-kernel, jrani, randyj, aviv.coro
On Thu, May 09, 2024 at 04:43:21PM -0400, John Meneghini wrote:
> I'm re-issuing Hannes's latency patches in preparation for LSFMM
Hello John,
Just a small note.
Please don't reply-to the previous version of the series (v2), when sending
out a v3.
It creates "an unmanageable forest of references in email clients".
See:
https://www.kernel.org/doc/html/latest/process/submitting-patches.html#explicit-in-reply-to-headers
Instead just add the url to the v2 on lore.kernel.org.
See you at LSFMM!
Kind regards,
Niklas
>
> Changes since V2:
>
> I've done quite a bit of work cleaning up these patches. There were a
> number of checkpatch.pl problems as well as some compile time errors
> when config BLK_NODE_LATENCY was turned off. After the clean up I
> rebased these patches onto Ewan's "nvme: queue-depth multipath iopolicy"
> patches. This allowed me to test both iopolicy changes together.
>
> All of my test results, together with the scripts I used to generate these
> graphs, are available at:
>
> https://github.com/johnmeneghini/iopolicy
>
> Please use the scripts in this repository to do your own testing.
>
> Changes since V1:
>
> Hi all,
>
> there had been several attempts to implement a latency-based I/O
> scheduler for native nvme multipath, all of which had its issues.
>
> So time to start afresh, this time using the QoS framework
> already present in the block layer.
> It consists of two parts:
> - a new 'blk-nlatency' QoS module, which is just a simple per-node
> latency tracker
> - a 'latency' nvme I/O policy
>
> Using the 'tiobench' fio script with 512 byte blocksize I'm getting
> the following latencies (in usecs) as a baseline:
> - seq write: avg 186 stddev 331
> - rand write: avg 4598 stddev 7903
> - seq read: avg 149 stddev 65
> - rand read: avg 150 stddev 68
>
> Enabling the 'latency' iopolicy:
> - seq write: avg 178 stddev 113
> - rand write: avg 3427 stddev 6703
> - seq read: avg 140 stddev 59
> - rand read: avg 141 stddev 58
>
> Setting the 'decay' parameter to 10:
> - seq write: avg 182 stddev 65
> - rand write: avg 2619 stddev 5894
> - seq read: avg 142 stddev 57
> - rand read: avg 140 stddev 57
>
> That's on a 32G FC testbed running against a brd target,
> fio running with 48 threads. So promises are met: latency
> goes down, and we're even able to control the standard
> deviation via the 'decay' parameter.
>
> As usual, comments and reviews are welcome.
>
> Changes to the original version:
> - split the rqos debugfs entries
> - Modify commit message to indicate latency
> - rename to blk-nlatency
>
> Hannes Reinecke (2):
> block: track per-node I/O latency
> nvme: add 'latency' iopolicy
>
> John Meneghini (1):
> nvme: multipath: pr_notice when iopolicy changes
>
> MAINTAINERS | 1 +
> block/Kconfig | 9 +
> block/Makefile | 1 +
> block/blk-mq-debugfs.c | 2 +
> block/blk-nlatency.c | 389 ++++++++++++++++++++++++++++++++++
> block/blk-rq-qos.h | 6 +
> drivers/nvme/host/multipath.c | 73 ++++++-
> drivers/nvme/host/nvme.h | 1 +
> include/linux/blk-mq.h | 11 +
> 9 files changed, 484 insertions(+), 9 deletions(-)
> create mode 100644 block/blk-nlatency.c
>
> --
> 2.39.3
>
>
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH v3 1/3] block: track per-node I/O latency
2024-05-10 7:11 ` Damien Le Moal
2024-05-10 9:28 ` Niklas Cassel
@ 2024-05-10 10:00 ` Hannes Reinecke
1 sibling, 0 replies; 25+ messages in thread
From: Hannes Reinecke @ 2024-05-10 10:00 UTC (permalink / raw)
To: Damien Le Moal, John Meneghini, tj, josef, axboe, kbusch, hch,
sagi, emilne, hare
Cc: linux-block, cgroups, linux-nvme, linux-kernel, jrani, randyj,
aviv.coro
On 5/10/24 09:11, Damien Le Moal wrote:
> On 5/10/24 05:43, John Meneghini wrote:
>> From: Hannes Reinecke <hare@kernel.org>
>>
>> Add a new option 'BLK_NODE_LATENCY' to track per-node I/O latency.
>> This can be used by I/O schedulers to determine the 'best' queue
>> to send I/O to.
>>
>> Signed-off-by: Hannes Reinecke <hare@kernel.org>
>>
>> Cleaned up checkpatch warnings and updated MAINTAINERS.
>
> This note should be before Hannes SoB. E.g:
>
> [John] Fixed checkpatch warnings and updated MAINTAINERS.
>
>>
>> Signed-off-by: John Meneghini <jmeneghi@redhat.com>
>> ---
>> MAINTAINERS | 1 +
>> block/Kconfig | 9 +
>> block/Makefile | 1 +
>> block/blk-mq-debugfs.c | 2 +
>> block/blk-nlatency.c | 389 +++++++++++++++++++++++++++++++++++++++++
>> block/blk-rq-qos.h | 6 +
>> include/linux/blk-mq.h | 11 ++
>> 7 files changed, 419 insertions(+)
>> create mode 100644 block/blk-nlatency.c
>>
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index 7c121493f43d..a4634365c82f 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -5405,6 +5405,7 @@ F: block/bfq-cgroup.c
>> F: block/blk-cgroup.c
>> F: block/blk-iocost.c
>> F: block/blk-iolatency.c
>> +F: block/blk-nlatency.c
>> F: block/blk-throttle.c
>> F: include/linux/blk-cgroup.h
>>
>> diff --git a/block/Kconfig b/block/Kconfig
>> index 1de4682d48cc..641ed39d609c 100644
>> --- a/block/Kconfig
>> +++ b/block/Kconfig
>> @@ -186,6 +186,15 @@ config BLK_CGROUP_IOPRIO
>> scheduler and block devices process requests. Only some I/O schedulers
>> and some block devices support I/O priorities.
>>
>> +config BLK_NODE_LATENCY
>> + bool "Track per-node I/O latency"
>> + help
>> + Enable per-node I/O latency tracking for multipathing. This uses the
>> + blk-nodelat latency tracker to provide latencies for each node, and schedules
>> + I/O on the path with the least latency for the submitting node. This can be
>> + used by I/O schedulers to determine the node with the least latency. Currently
>> + only supports nvme over fabrics devices.
>> +
>> config BLK_DEBUG_FS
>> bool "Block layer debugging information in debugfs"
>> default y
>> diff --git a/block/Makefile b/block/Makefile
>> index 46ada9dc8bbf..9d2e71a3e36f 100644
>> --- a/block/Makefile
>> +++ b/block/Makefile
>> @@ -21,6 +21,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
>> obj-$(CONFIG_BLK_CGROUP_IOPRIO) += blk-ioprio.o
>> obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o
>> obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o
>> +obj-$(CONFIG_BLK_NODE_LATENCY) += blk-nlatency.o
>
> Let's keep the alignment please.
>
>> obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o
>> obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
>> bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
>> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
>> index 94668e72ab09..cb38228b95d8 100644
>> --- a/block/blk-mq-debugfs.c
>> +++ b/block/blk-mq-debugfs.c
>> @@ -762,6 +762,8 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id)
>> return "latency";
>> case RQ_QOS_COST:
>> return "cost";
>> + case RQ_QOS_NLAT:
>> + return "node-latency";
>> }
>> return "unknown";
>> }
>> diff --git a/block/blk-nlatency.c b/block/blk-nlatency.c
>> new file mode 100644
>> index 000000000000..219c3f636d76
>> --- /dev/null
>> +++ b/block/blk-nlatency.c
>> @@ -0,0 +1,389 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * Per-node request latency tracking.
>> + *
>> + * Copyright (C) 2023 Hannes Reinecke
>> + *
>> + * A simple per-node latency tracker for use by I/O scheduler.
>> + * Latencies are measures over 'win_usec' microseconds and stored per node.
>> + * If the number of measurements falls below 'lowat' the measurement is
>> + * assumed to be unreliable and will become 'stale'.
>> + * These 'stale' latencies can be 'decayed', where during each measurement
>> + * interval the 'stale' latency value is decreased by 'decay' percent.
>> + * Once the 'stale' latency reaches zero it will be updated by the
>> + * measured latency.
>> + */
>> +#include <linux/kernel.h>
>> +#include <linux/blk_types.h>
>> +#include <linux/slab.h>
>> +
>> +#include "blk-stat.h"
>> +#include "blk-rq-qos.h"
>> +#include "blk.h"
>> +
>> +#define NLAT_DEFAULT_LOWAT 2
>> +#define NLAT_DEFAULT_DECAY 50
>> +
>> +struct rq_nlat {
>> + struct rq_qos rqos;
>> +
>> + u64 win_usec; /* latency measurement window in microseconds */
>
> Using microseconds forces you to do costly multiplications and divisions by
> 1000. Why not keep things in nanoseconds ?
>
I wanted to keep the user interface simple; entering nanoseconds values
is tedious. But sure, I can change it.
>> + unsigned int lowat; /* Low Watermark latency measurement */
>> + unsigned int decay; /* Percentage for 'decaying' latencies */
>> + bool enabled;
>> +
>> + struct blk_stat_callback *cb;
>> +
>> + unsigned int num;
>> + u64 *latency;
>> + unsigned int *samples;
>> +};
>> +
>> +static inline struct rq_nlat *RQNLAT(struct rq_qos *rqos)
>> +{
>> + return container_of(rqos, struct rq_nlat, rqos);
>> +}
>> +
>> +static u64 nlat_default_latency_usec(struct request_queue *q)
>> +{
>> + /*
>> + * We default to 2msec for non-rotational storage, and 75msec
>> + * for rotational storage.
>> + */
>> + if (blk_queue_nonrot(q))
>> + return 2000ULL;
>> + else
>
> No need for this else.
>
OK.
>> + return 75000ULL;
>> +}
>> +
>> +static void nlat_timer_fn(struct blk_stat_callback *cb)
>> +{
>> + struct rq_nlat *nlat = cb->data;
>> + int n;
>> +
>> + for (n = 0; n < cb->buckets; n++) {
>> + if (cb->stat[n].nr_samples < nlat->lowat) {
>> + /*
>> + * 'decay' the latency by the specified
>> + * percentage to ensure the queues are
>> + * being tested to balance out temporary
>> + * latency spikes.
>> + */
>> + nlat->latency[n] =
>> + div64_u64(nlat->latency[n] * nlat->decay, 100);
>> + } else
>> + nlat->latency[n] = cb->stat[n].mean;
>
> Missing the curly brackets around the else block.
> Nit: n is a rather unusual name for a loop index. Why not the usual "i" ? Does
> notmatter much though.
>
There was a reason once ... but yeah, let's move to 'i'.
>> + nlat->samples[n] = cb->stat[n].nr_samples;
>> + }
>> + if (nlat->enabled)
>> + blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
>> +}
>> +
>> +static int nlat_bucket_node(const struct request *rq)
>> +{
>> + if (!rq->mq_ctx)
>> + return -1;
>> + return cpu_to_node(blk_mq_rq_cpu((struct request *)rq));
>> +}
>> +
>> +static void nlat_exit(struct rq_qos *rqos)
>> +{
>> + struct rq_nlat *nlat = RQNLAT(rqos);
>> +
>> + blk_stat_remove_callback(nlat->rqos.disk->queue, nlat->cb);
>> + blk_stat_free_callback(nlat->cb);
>> + kfree(nlat->samples);
>> + kfree(nlat->latency);
>> + kfree(nlat);
>> +}
>> +
>> +#ifdef CONFIG_BLK_DEBUG_FS
>> +static int nlat_win_usec_show(void *data, struct seq_file *m)
>> +{
>> + struct rq_qos *rqos = data;
>> + struct rq_nlat *nlat = RQNLAT(rqos);
>> +
>> + seq_printf(m, "%llu\n", nlat->win_usec);
>> + return 0;
>> +}
>> +
>> +static ssize_t nlat_win_usec_write(void *data, const char __user *buf,
>> + size_t count, loff_t *ppos)
>> +{
>> + struct rq_qos *rqos = data;
>> + struct rq_nlat *nlat = RQNLAT(rqos);
>> + char val[16] = { };
>> + u64 usec;
>> + int err;
>> +
>> + if (blk_queue_dying(nlat->rqos.disk->queue))
>> + return -ENOENT;
>> +
>> + if (count >= sizeof(val))
>> + return -EINVAL;
>> +
>> + if (copy_from_user(val, buf, count))
>> + return -EFAULT;
>> +
>> + err = kstrtoull(val, 10, &usec);
>> + if (err)
>> + return err;
>> + blk_stat_deactivate(nlat->cb);
>> + nlat->win_usec = usec;
>> + blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
>> +
>> + return count;
>> +}
>> +
>> +static int nlat_lowat_show(void *data, struct seq_file *m)
>> +{
>> + struct rq_qos *rqos = data;
>> + struct rq_nlat *nlat = RQNLAT(rqos);
>> +
>> + seq_printf(m, "%u\n", nlat->lowat);
>> + return 0;
>> +}
>> +
>> +static ssize_t nlat_lowat_write(void *data, const char __user *buf,
>> + size_t count, loff_t *ppos)
>> +{
>> + struct rq_qos *rqos = data;
>> + struct rq_nlat *nlat = RQNLAT(rqos);
>> + char val[16] = { };
>> + unsigned int lowat;
>> + int err;
>> +
>> + if (blk_queue_dying(nlat->rqos.disk->queue))
>> + return -ENOENT;
>> +
>> + if (count >= sizeof(val))
>> + return -EINVAL;
>> +
>> + if (copy_from_user(val, buf, count))
>> + return -EFAULT;
>> +
>> + err = kstrtouint(val, 10, &lowat);
>> + if (err)
>> + return err;
>> + blk_stat_deactivate(nlat->cb);
>> + nlat->lowat = lowat;
>> + blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
>> +
>> + return count;
>> +}
>> +
>> +static int nlat_decay_show(void *data, struct seq_file *m)
>> +{
>> + struct rq_qos *rqos = data;
>> + struct rq_nlat *nlat = RQNLAT(rqos);
>> +
>> + seq_printf(m, "%u\n", nlat->decay);
>> + return 0;
>> +}
>> +
>> +static ssize_t nlat_decay_write(void *data, const char __user *buf,
>> + size_t count, loff_t *ppos)
>> +{
>> + struct rq_qos *rqos = data;
>> + struct rq_nlat *nlat = RQNLAT(rqos);
>> + char val[16] = { };
>> + unsigned int decay;
>> + int err;
>> +
>> + if (blk_queue_dying(nlat->rqos.disk->queue))
>> + return -ENOENT;
>> +
>> + if (count >= sizeof(val))
>> + return -EINVAL;
>> +
>> + if (copy_from_user(val, buf, count))
>> + return -EFAULT;
>> +
>> + err = kstrtouint(val, 10, &decay);
>> + if (err)
>> + return err;
>> + if (decay > 100)
>> + return -EINVAL;
>> + blk_stat_deactivate(nlat->cb);
>> + nlat->decay = decay;
>> + blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
>> +
>> + return count;
>> +}
>> +
>> +static int nlat_enabled_show(void *data, struct seq_file *m)
>> +{
>> + struct rq_qos *rqos = data;
>> + struct rq_nlat *nlat = RQNLAT(rqos);
>> +
>> + seq_printf(m, "%d\n", nlat->enabled);
>> + return 0;
>> +}
>> +
>> +static int nlat_id_show(void *data, struct seq_file *m)
>> +{
>> + struct rq_qos *rqos = data;
>> +
>> + seq_printf(m, "%u\n", rqos->id);
>> + return 0;
>> +}
>> +
>> +static int nlat_latency_show(void *data, struct seq_file *m)
>> +{
>> + struct rq_qos *rqos = data;
>> + struct rq_nlat *nlat = RQNLAT(rqos);
>> + int n;
>> +
>> + if (!nlat->enabled)
>> + return 0;
>> +
>> + for (n = 0; n < nlat->num; n++) {
>> + if (n > 0)
>> + seq_puts(m, " ");
>> + seq_printf(m, "%llu", nlat->latency[n]);
>> + }
>> + seq_puts(m, "\n");
>> + return 0;
>> +}
>> +
>> +static int nlat_samples_show(void *data, struct seq_file *m)
>> +{
>> + struct rq_qos *rqos = data;
>> + struct rq_nlat *nlat = RQNLAT(rqos);
>> + int n;
>> +
>> + if (!nlat->enabled)
>> + return 0;
>> +
>> + for (n = 0; n < nlat->num; n++) {
>> + if (n > 0)
>> + seq_puts(m, " ");
>> + seq_printf(m, "%u", nlat->samples[n]);
>> + }
>> + seq_puts(m, "\n");
>> + return 0;
>> +}
>> +
>> +static const struct blk_mq_debugfs_attr nlat_debugfs_attrs[] = {
>> + {"win_usec", 0600, nlat_win_usec_show, nlat_win_usec_write},
>> + {"lowat", 0600, nlat_lowat_show, nlat_lowat_write},
>> + {"decay", 0600, nlat_decay_show, nlat_decay_write},
>> + {"enabled", 0400, nlat_enabled_show},
>> + {"id", 0400, nlat_id_show},
>> + {"latency", 0400, nlat_latency_show},
>> + {"samples", 0400, nlat_samples_show},
>> + {},
>> +};
>> +#endif
>> +
>> +static const struct rq_qos_ops nlat_rqos_ops = {
>> + .exit = nlat_exit,
>> +#ifdef CONFIG_BLK_DEBUG_FS
>> + .debugfs_attrs = nlat_debugfs_attrs,
>> +#endif
>> +};
>> +
>> +u64 blk_nlat_latency(struct gendisk *disk, int node)
>> +{
>> + struct rq_qos *rqos;
>> + struct rq_nlat *nlat;
>> +
>> + rqos = nlat_rq_qos(disk->queue);
>> + if (!rqos)
>> + return 0;
>> + nlat = RQNLAT(rqos);
>> + if (node > nlat->num)
>> + return 0;
>> +
>> + return div64_u64(nlat->latency[node], 1000);
>
> See comment at the top. Why not keep everything in nanoseconds to avoid this
> costly division ?
>
Agreed.
>> +}
>> +EXPORT_SYMBOL_GPL(blk_nlat_latency);
>> +
>> +int blk_nlat_enable(struct gendisk *disk)
>> +{
>> + struct rq_qos *rqos;
>> + struct rq_nlat *nlat;
>> +
>> + /* Latency tracking not enabled? */
>> + rqos = nlat_rq_qos(disk->queue);
>> + if (!rqos)
>> + return -EINVAL;
>> + nlat = RQNLAT(rqos);
>> + if (nlat->enabled)
>> + return 0;
>> +
>> + /* Queue not registered? Maybe shutting down... */
>> + if (!blk_queue_registered(disk->queue))
>> + return -EAGAIN;
>> +
>> + nlat->enabled = true;
>> + memset(nlat->latency, 0, sizeof(u64) * nlat->num);
>> + memset(nlat->samples, 0, sizeof(unsigned int) * nlat->num);
>> + blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
>> +
>> + return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(blk_nlat_enable);
>> +
>> +void blk_nlat_disable(struct gendisk *disk)
>> +{
>> + struct rq_qos *rqos = nlat_rq_qos(disk->queue);
>> + struct rq_nlat *nlat;
>> +
>> + if (!rqos)
>> + return;
>> + nlat = RQNLAT(rqos);
>> + if (nlat->enabled) {
>> + blk_stat_deactivate(nlat->cb);
>> + nlat->enabled = false;
>> + }
>> +}
>> +EXPORT_SYMBOL_GPL(blk_nlat_disable);
>> +
>> +int blk_nlat_init(struct gendisk *disk)
>> +{
>> + struct rq_nlat *nlat;
>> + int ret = -ENOMEM;
>> +
>> + nlat = kzalloc(sizeof(*nlat), GFP_KERNEL);
>> + if (!nlat)
>> + return -ENOMEM;
>> +
>> + nlat->num = num_possible_nodes();
>> + nlat->lowat = NLAT_DEFAULT_LOWAT;
>> + nlat->decay = NLAT_DEFAULT_DECAY;
>> + nlat->win_usec = nlat_default_latency_usec(disk->queue);
>> +
>> + nlat->latency = kcalloc(nlat->num, sizeof(u64), GFP_KERNEL);
>> + if (!nlat->latency)
>> + goto err_free;
>> + nlat->samples = kcalloc(nlat->num, sizeof(unsigned int), GFP_KERNEL);
>> + if (!nlat->samples)
>> + goto err_free;
>> + nlat->cb = blk_stat_alloc_callback(nlat_timer_fn, nlat_bucket_node,
>> + nlat->num, nlat);
>> + if (!nlat->cb)
>> + goto err_free;
>> +
>> + /*
>> + * Assign rwb and add the stats callback.
>> + */
>
> This can be a single line comment.
>
Ok.
>> + mutex_lock(&disk->queue->rq_qos_mutex);
>> + ret = rq_qos_add(&nlat->rqos, disk, RQ_QOS_NLAT, &nlat_rqos_ops);
>> + mutex_unlock(&disk->queue->rq_qos_mutex);
>> + if (ret)
>> + goto err_free_cb;
>> +
>> + blk_stat_add_callback(disk->queue, nlat->cb);
>> +
>> + return 0;
>> +
>> +err_free_cb:
>> + blk_stat_free_callback(nlat->cb);
>> +err_free:
>> + kfree(nlat->samples);
>> + kfree(nlat->latency);
>> + kfree(nlat);
>> + return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(blk_nlat_init);
>> diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
>> index 37245c97ee61..2fc11ced0c00 100644
>> --- a/block/blk-rq-qos.h
>> +++ b/block/blk-rq-qos.h
>> @@ -17,6 +17,7 @@ enum rq_qos_id {
>> RQ_QOS_WBT,
>> RQ_QOS_LATENCY,
>> RQ_QOS_COST,
>> + RQ_QOS_NLAT,
>> };
>>
>> struct rq_wait {
>> @@ -79,6 +80,11 @@ static inline struct rq_qos *iolat_rq_qos(struct request_queue *q)
>> return rq_qos_id(q, RQ_QOS_LATENCY);
>> }
>>
>> +static inline struct rq_qos *nlat_rq_qos(struct request_queue *q)
>> +{
>> + return rq_qos_id(q, RQ_QOS_NLAT);
>> +}
>> +
>> static inline void rq_wait_init(struct rq_wait *rq_wait)
>> {
>> atomic_set(&rq_wait->inflight, 0);
>> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
>> index d3d8fd8e229b..1f3829627f1b 100644
>> --- a/include/linux/blk-mq.h
>> +++ b/include/linux/blk-mq.h
>> @@ -1231,4 +1231,15 @@ static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
>> }
>> #endif /* CONFIG_BLK_DEV_ZONED */
>>
>> +#ifdef CONFIG_BLK_NODE_LATENCY
>> +int blk_nlat_enable(struct gendisk *disk);
>> +void blk_nlat_disable(struct gendisk *disk);
>> +u64 blk_nlat_latency(struct gendisk *disk, int node);
>> +int blk_nlat_init(struct gendisk *disk);
>> +#else
>> +static inline int blk_nlat_enable(struct gendisk *disk) { return 0; }
>> +static inline void blk_nlat_disable(struct gendisk *disk) {}
>> +static inline u64 blk_nlat_latency(struct gendisk *disk, int node) { return 0; }
>> +static inline int blk_nlat_init(struct gendisk *disk) { return -EOPNOTSUPP; }
>> +#endif
>> #endif /* BLK_MQ_H */
>
Thanks for the review!
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), GF: Ivo Totev, Andrew McDonald,
Werner Knoblich
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH v3 2/3] nvme: add 'latency' iopolicy
2024-05-10 7:17 ` Damien Le Moal
@ 2024-05-10 10:03 ` Hannes Reinecke
0 siblings, 0 replies; 25+ messages in thread
From: Hannes Reinecke @ 2024-05-10 10:03 UTC (permalink / raw)
To: Damien Le Moal, John Meneghini, tj, josef, axboe, kbusch, hch,
sagi, emilne, hare
Cc: linux-block, cgroups, linux-nvme, linux-kernel, jrani, randyj,
aviv.coro
On 5/10/24 09:17, Damien Le Moal wrote:
> On 5/10/24 05:43, John Meneghini wrote:
>> From: Hannes Reinecke <hare@kernel.org>
>>
>> Add a latency-based I/O policy for multipathing. It uses the blk-nodelat
>> latency tracker to provide latencies for each node, and schedules
>> I/O on the path with the least latency for the submitting node.
>>
>> Signed-off-by: Hannes Reinecke <hare@kernel.org>
>>
>> Make this compile when CONFIG_BLK_NODE_LATENCY is not set.
>> Advertise the 'latency' iopolicy in modinfo.
>>
>> Signed-off-by: John Meneghini <jmeneghi@redhat.com>
>> ---
>> drivers/nvme/host/multipath.c | 63 ++++++++++++++++++++++++++++++-----
>> drivers/nvme/host/nvme.h | 1 +
>> 2 files changed, 55 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
>> index d916a5ddf5d4..e9330bb1990b 100644
>> --- a/drivers/nvme/host/multipath.c
>> +++ b/drivers/nvme/host/multipath.c
>> @@ -18,6 +18,7 @@ static const char *nvme_iopolicy_names[] = {
>> [NVME_IOPOLICY_NUMA] = "numa",
>> [NVME_IOPOLICY_RR] = "round-robin",
>> [NVME_IOPOLICY_QD] = "queue-depth",
>> + [NVME_IOPOLICY_LAT] = "latency",
>> };
>>
>> static int iopolicy = NVME_IOPOLICY_NUMA;
>> @@ -32,6 +33,10 @@ static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
>> iopolicy = NVME_IOPOLICY_RR;
>> else if (!strncmp(val, "queue-depth", 11))
>> iopolicy = NVME_IOPOLICY_QD;
>> +#ifdef CONFIG_BLK_NODE_LATENCY
>> + else if (!strncmp(val, "latency", 7))
>> + iopolicy = NVME_IOPOLICY_LAT;
>> +#endif
>> else
>> return -EINVAL;
>>
>> @@ -43,10 +48,36 @@ static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
>> return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
>> }
>>
>> +static int nvme_activate_iopolicy(struct nvme_subsystem *subsys, int iopolicy)
>> +{
>> + struct nvme_ns_head *h;
>> + struct nvme_ns *ns;
>> + bool enable = iopolicy == NVME_IOPOLICY_LAT;
>> + int ret = 0;
>> +
>> + mutex_lock(&subsys->lock);
>> + list_for_each_entry(h, &subsys->nsheads, entry) {
>> + list_for_each_entry_rcu(ns, &h->list, siblings) {
>> + if (enable) {
>> + ret = blk_nlat_enable(ns->disk);
>> + if (ret)
>> + break;
>> + } else
>> + blk_nlat_disable(ns->disk);
>
> Missing curly brackets for the else.
>
Ok.
>> + }
>> + }
>> + mutex_unlock(&subsys->lock);
>> + return ret;
>> +}
>> +
>> module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
>> &iopolicy, 0644);
>> MODULE_PARM_DESC(iopolicy,
>> +#if defined(CONFIG_BLK_NODE_LATENCY)
>
> What is so special about the latency policy that it needs to be conditionally
> defined ? I missed that point. Why not drop CONFIG_BLK_NODE_LATENCY ?
>
The 'latency' policy is using the blk-rqos infrastructure, which in
itself might not be compiled in.
So we don't want the user to give a false impression here.
>> + "Default multipath I/O policy; 'numa' (default) , 'round-robin', 'queue-depth' or 'latency'");
>> +#else
>> "Default multipath I/O policy; 'numa' (default) , 'round-robin' or 'queue-depth'");
>> +#endif
>>
>> void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
>> {
>> @@ -250,14 +281,16 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
>> {
>> int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
>> struct nvme_ns *found = NULL, *fallback = NULL, *ns;
>> + int iopolicy = READ_ONCE(head->subsys->iopolicy);
>>
>> list_for_each_entry_rcu(ns, &head->list, siblings) {
>> if (nvme_path_is_disabled(ns))
>> continue;
>>
>> - if (ns->ctrl->numa_node != NUMA_NO_NODE &&
>> - READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
>> + if (iopolicy == NVME_IOPOLICY_NUMA)
>> distance = node_distance(node, ns->ctrl->numa_node);
>> + else if (iopolicy == NVME_IOPOLICY_LAT)
>> + distance = blk_nlat_latency(ns->disk, node);
>> else
>> distance = LOCAL_DISTANCE;
>>
>> @@ -381,8 +414,8 @@ static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
>>
>> inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
>> {
>> - int iopolicy = READ_ONCE(head->subsys->iopolicy);
>> int node;
>> + int iopolicy = READ_ONCE(head->subsys->iopolicy);
>
> No need to move this line.
>
Sure.
>> struct nvme_ns *ns;
>>
>> /*
>> @@ -401,8 +434,8 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
>>
>> if (iopolicy == NVME_IOPOLICY_RR)
>> return nvme_round_robin_path(head, node, ns);
>> -
>> - if (unlikely(!nvme_path_is_optimized(ns)))
>> + if (iopolicy == NVME_IOPOLICY_LAT ||
>> + unlikely(!nvme_path_is_optimized(ns)))
>> return __nvme_find_path(head, node);
>> return ns;
>> }
>> @@ -872,15 +905,18 @@ static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
>> {
>> struct nvme_subsystem *subsys =
>> container_of(dev, struct nvme_subsystem, dev);
>> - int i;
>> + int i, ret;
>>
>> for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
>> if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
>> - nvme_subsys_iopolicy_update(subsys, i);
>> - return count;
>> + ret = nvme_activate_iopolicy(subsys, i);
>> + if (!ret) {
>> + nvme_subsys_iopolicy_update(subsys, i);
>> + return count;
>> + }
>> + return ret;
>
> It would be nicer to have this as:
>
> if (ret)
> break
> nvme_subsys_iopolicy_update(subsys, i);
> return count;
>
Ok.
>> }
>> }
>> -
>
> whiteline change.
>
>> return -EINVAL;
>
> And "return ret;" here with ret initialized to -EINVAL when declared.
>
Ok.
>> }
>> SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
>> @@ -916,6 +952,15 @@ static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
>>
>> void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
>> {
>> + if (!blk_nlat_init(ns->disk) &&
>> + READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_LAT) {
>> + int ret = blk_nlat_enable(ns->disk);
>> +
>> + if (unlikely(ret))
>> + pr_warn("%s: Failed to enable latency tracking, error %d\n",
>> + ns->disk->disk_name, ret);
>> + }
>> +
>> if (nvme_ctrl_use_ana(ns->ctrl)) {
>> struct nvme_ana_group_desc desc = {
>> .grpid = anagrpid,
>> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
>> index a557b4577c01..66bf003a6c48 100644
>> --- a/drivers/nvme/host/nvme.h
>> +++ b/drivers/nvme/host/nvme.h
>> @@ -411,6 +411,7 @@ enum nvme_iopolicy {
>> NVME_IOPOLICY_NUMA,
>> NVME_IOPOLICY_RR,
>> NVME_IOPOLICY_QD,
>> + NVME_IOPOLICY_LAT,
>> };
>>
>> struct nvme_subsystem {
>
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), GF: Ivo Totev, Andrew McDonald,
Werner Knoblich
^ permalink raw reply [flat|nested] 25+ messages in thread
end of thread, other threads:[~2024-05-10 10:03 UTC | newest]
Thread overview: 25+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-04-03 14:17 [PATCHv2 0/2] block,nvme: latency-based I/O scheduler Hannes Reinecke
2024-04-03 14:17 ` [PATCH 1/2] block: track per-node I/O latency Hannes Reinecke
2024-04-04 2:22 ` kernel test robot
2024-04-04 2:55 ` kernel test robot
2024-04-04 18:47 ` kernel test robot
2024-04-03 14:17 ` [PATCH 2/2] nvme: add 'latency' iopolicy Hannes Reinecke
2024-04-04 21:14 ` [PATCHv2 0/2] block,nvme: latency-based I/O scheduler Keith Busch
2024-04-05 6:21 ` Hannes Reinecke
2024-04-05 15:03 ` Keith Busch
2024-04-05 15:36 ` Hannes Reinecke
2024-04-07 19:55 ` Sagi Grimberg
2024-05-09 20:43 ` [PATCH v3 0/3] " John Meneghini
2024-05-10 9:34 ` Niklas Cassel
2024-05-09 20:43 ` [PATCH v3 1/3] block: track per-node I/O latency John Meneghini
2024-05-10 7:11 ` Damien Le Moal
2024-05-10 9:28 ` Niklas Cassel
2024-05-10 10:00 ` Hannes Reinecke
2024-05-09 20:43 ` [PATCH v3 2/3] nvme: add 'latency' iopolicy John Meneghini
2024-05-10 7:17 ` Damien Le Moal
2024-05-10 10:03 ` Hannes Reinecke
2024-05-09 20:43 ` [PATCH v3 3/3] nvme: multipath: pr_notice when iopolicy changes John Meneghini
2024-05-10 7:19 ` Damien Le Moal
-- strict thread matches above, loose matches on Subject: below --
2024-03-26 15:35 [PATCH RFC 0/2] block,nvme: latency-based I/O scheduler Hannes Reinecke
2024-03-26 15:35 ` [PATCH 1/2] block: track per-node I/O latency Hannes Reinecke
2024-03-27 18:03 ` kernel test robot
2024-03-27 20:59 ` kernel test robot
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).