* [PATCH 1/3] blk-mq: don't redistribute hardware queues on a CPU hotplug event
2016-07-21 14:30 resend: pci: automatic interrupt affinity for MSI/MSI-X capable devices V2 Christoph Hellwig
@ 2016-07-21 14:30 ` Christoph Hellwig
2016-07-21 14:30 ` [PATCH 2/3] blk-mq: only allocate a single mq_map per tag_set Christoph Hellwig
` (2 subsequent siblings)
3 siblings, 0 replies; 6+ messages in thread
From: Christoph Hellwig @ 2016-07-21 14:30 UTC (permalink / raw)
To: linux-pci; +Cc: agordeev, linux-kernel
Currently blk-mq will totally remap hardware context when a CPU hotplug
event happens, which causes major havoc for drivers, as they are never
told about this remapping. E.g. any carefully sorted out CPU affinity
will just be completely messed up.
The rebuild also doesn't really help for the common case of cpu
hotplug, which is soft onlining / offlining of cpus - in this case we
should just leave the queue and irq mapping as is. If it actually
worked it would have helped in the case of physical cpu hotplug,
although for that we'd need a way to actually notify the driver.
Note that drivers may already be able to accommodate such a topology
change on their own, e.g. using the reset_controller sysfs file in NVMe
will cause the driver to get things right for this case.
With the rebuild removed we will simply retain the queue mapping for
a soft offlined CPU that will work when it comes back online, and will
map any newly onlined CPU to queue NULL until the driver initiates
a rebuild of the queue map.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
block/blk-mq.c | 2 --
1 file changed, 2 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 576e711..6e88e2c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2156,8 +2156,6 @@ static void blk_mq_queue_reinit(struct request_queue *q,
blk_mq_sysfs_unregister(q);
- blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask);
-
/*
* redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
* we should change hctx numa_node according to new topology (this
--
2.1.4
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH 2/3] blk-mq: only allocate a single mq_map per tag_set
2016-07-21 14:30 resend: pci: automatic interrupt affinity for MSI/MSI-X capable devices V2 Christoph Hellwig
2016-07-21 14:30 ` [PATCH 1/3] blk-mq: don't redistribute hardware queues on a CPU hotplug event Christoph Hellwig
@ 2016-07-21 14:30 ` Christoph Hellwig
2016-07-21 14:30 ` [PATCH 3/3] blk-mq: allow the driver to pass in an affinity mask Christoph Hellwig
2016-07-21 21:02 ` resend: pci: automatic interrupt affinity for MSI/MSI-X capable devices V2 Bjorn Helgaas
3 siblings, 0 replies; 6+ messages in thread
From: Christoph Hellwig @ 2016-07-21 14:30 UTC (permalink / raw)
To: linux-pci; +Cc: agordeev, linux-kernel
The mapping is identical for all queues in a tag_set, so stop wasting
memory by building multiple copies. Note that for now I've kept the mq_map
pointer in the request_queue, but we'll need to investigate if we can
remove it without suffering from the additional indirection. The same
would apply to the mq_ops pointer as well.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
block/blk-mq.c | 22 ++++++++++++++--------
include/linux/blk-mq.h | 1 +
2 files changed, 15 insertions(+), 8 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6e88e2c..c4adaa2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1972,7 +1972,6 @@ void blk_mq_release(struct request_queue *q)
kfree(hctx);
}
- kfree(q->mq_map);
q->mq_map = NULL;
kfree(q->queue_hw_ctx);
@@ -2071,9 +2070,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
if (!q->queue_hw_ctx)
goto err_percpu;
- q->mq_map = blk_mq_make_queue_map(set);
- if (!q->mq_map)
- goto err_map;
+ q->mq_map = set->mq_map;
blk_mq_realloc_hw_ctxs(set, q);
if (!q->nr_hw_queues)
@@ -2123,8 +2120,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
return q;
err_hctxs:
- kfree(q->mq_map);
-err_map:
kfree(q->queue_hw_ctx);
err_percpu:
free_percpu(q->queue_ctx);
@@ -2346,14 +2341,22 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (!set->tags)
return -ENOMEM;
+ set->mq_map = blk_mq_make_queue_map(set);
+ if (!set->mq_map)
+ goto out_free_tags;
+
if (blk_mq_alloc_rq_maps(set))
- goto enomem;
+ goto out_free_mq_map;
mutex_init(&set->tag_list_lock);
INIT_LIST_HEAD(&set->tag_list);
return 0;
-enomem:
+
+out_free_mq_map:
+ kfree(set->mq_map);
+ set->mq_map = NULL;
+out_free_tags:
kfree(set->tags);
set->tags = NULL;
return -ENOMEM;
@@ -2369,6 +2372,9 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
blk_mq_free_rq_map(set, set->tags[i], i);
}
+ kfree(set->mq_map);
+ set->mq_map = NULL;
+
kfree(set->tags);
set->tags = NULL;
}
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index e43bbff..a572227 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -65,6 +65,7 @@ struct blk_mq_hw_ctx {
};
struct blk_mq_tag_set {
+ unsigned int *mq_map;
struct blk_mq_ops *ops;
unsigned int nr_hw_queues;
unsigned int queue_depth; /* max hw supported */
--
2.1.4
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH 3/3] blk-mq: allow the driver to pass in an affinity mask
2016-07-21 14:30 resend: pci: automatic interrupt affinity for MSI/MSI-X capable devices V2 Christoph Hellwig
2016-07-21 14:30 ` [PATCH 1/3] blk-mq: don't redistribute hardware queues on a CPU hotplug event Christoph Hellwig
2016-07-21 14:30 ` [PATCH 2/3] blk-mq: only allocate a single mq_map per tag_set Christoph Hellwig
@ 2016-07-21 14:30 ` Christoph Hellwig
2016-07-21 21:02 ` resend: pci: automatic interrupt affinity for MSI/MSI-X capable devices V2 Bjorn Helgaas
3 siblings, 0 replies; 6+ messages in thread
From: Christoph Hellwig @ 2016-07-21 14:30 UTC (permalink / raw)
To: linux-pci; +Cc: agordeev, linux-kernel
Allow drivers to pass in the affinity mask from the generic interrupt
layer, and spread queues based on that. If the driver doesn't pass in
a mask we will create it using the genirq helper. As this helper was
modelled after the blk-mq algorithm there should be no change in
behavior.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
block/Makefile | 2 +-
block/blk-mq-cpumap.c | 120 -------------------------------------------------
block/blk-mq.c | 68 +++++++++++++++++++++++++---
block/blk-mq.h | 8 ----
include/linux/blk-mq.h | 1 +
5 files changed, 65 insertions(+), 134 deletions(-)
delete mode 100644 block/blk-mq-cpumap.c
diff --git a/block/Makefile b/block/Makefile
index 9eda232..aeb318d 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
blk-lib.o blk-mq.o blk-mq-tag.o \
- blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
+ blk-mq-sysfs.o blk-mq-cpu.o ioctl.o \
genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
badblocks.o partitions/
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
deleted file mode 100644
index d0634bc..0000000
--- a/block/blk-mq-cpumap.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * CPU <-> hardware queue mapping helpers
- *
- * Copyright (C) 2013-2014 Jens Axboe
- */
-#include <linux/kernel.h>
-#include <linux/threads.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/cpu.h>
-
-#include <linux/blk-mq.h>
-#include "blk.h"
-#include "blk-mq.h"
-
-static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues,
- const int cpu)
-{
- return cpu * nr_queues / nr_cpus;
-}
-
-static int get_first_sibling(unsigned int cpu)
-{
- unsigned int ret;
-
- ret = cpumask_first(topology_sibling_cpumask(cpu));
- if (ret < nr_cpu_ids)
- return ret;
-
- return cpu;
-}
-
-int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
- const struct cpumask *online_mask)
-{
- unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
- cpumask_var_t cpus;
-
- if (!alloc_cpumask_var(&cpus, GFP_ATOMIC))
- return 1;
-
- cpumask_clear(cpus);
- nr_cpus = nr_uniq_cpus = 0;
- for_each_cpu(i, online_mask) {
- nr_cpus++;
- first_sibling = get_first_sibling(i);
- if (!cpumask_test_cpu(first_sibling, cpus))
- nr_uniq_cpus++;
- cpumask_set_cpu(i, cpus);
- }
-
- queue = 0;
- for_each_possible_cpu(i) {
- if (!cpumask_test_cpu(i, online_mask)) {
- map[i] = 0;
- continue;
- }
-
- /*
- * Easy case - we have equal or more hardware queues. Or
- * there are no thread siblings to take into account. Do
- * 1:1 if enough, or sequential mapping if less.
- */
- if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) {
- map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue);
- queue++;
- continue;
- }
-
- /*
- * Less then nr_cpus queues, and we have some number of
- * threads per cores. Map sibling threads to the same
- * queue.
- */
- first_sibling = get_first_sibling(i);
- if (first_sibling == i) {
- map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues,
- queue);
- queue++;
- } else
- map[i] = map[first_sibling];
- }
-
- free_cpumask_var(cpus);
- return 0;
-}
-
-unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
-{
- unsigned int *map;
-
- /* If cpus are offline, map them to first hctx */
- map = kzalloc_node(sizeof(*map) * nr_cpu_ids, GFP_KERNEL,
- set->numa_node);
- if (!map)
- return NULL;
-
- if (!blk_mq_update_queue_map(map, set->nr_hw_queues, cpu_online_mask))
- return map;
-
- kfree(map);
- return NULL;
-}
-
-/*
- * We have no quick way of doing reverse lookups. This is only used at
- * queue init time, so runtime isn't important.
- */
-int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index)
-{
- int i;
-
- for_each_possible_cpu(i) {
- if (index == mq_map[i])
- return local_memory_node(cpu_to_node(i));
- }
-
- return NUMA_NO_NODE;
-}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c4adaa2..1053b7b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -22,6 +22,7 @@
#include <linux/sched/sysctl.h>
#include <linux/delay.h>
#include <linux/crash_dump.h>
+#include <linux/interrupt.h>
#include <trace/events/block.h>
@@ -1996,6 +1997,22 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
}
EXPORT_SYMBOL(blk_mq_init_queue);
+/*
+ * We have no quick way of doing reverse lookups. This is only used at
+ * queue init time, so runtime isn't important.
+ */
+static int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ if (index == mq_map[i])
+ return local_memory_node(cpu_to_node(i));
+ }
+
+ return NUMA_NO_NODE;
+}
+
static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
struct request_queue *q)
{
@@ -2295,6 +2312,30 @@ struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags)
}
EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask);
+static int blk_mq_create_mq_map(struct blk_mq_tag_set *set,
+ const struct cpumask *affinity_mask)
+{
+ int queue = -1, cpu = 0;
+
+ set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
+ GFP_KERNEL, set->numa_node);
+ if (!set->mq_map)
+ return -ENOMEM;
+
+ if (!affinity_mask)
+ return 0; /* map all cpus to queue 0 */
+
+ /* If cpus are offline, map them to first hctx */
+ for_each_online_cpu(cpu) {
+ if (cpumask_test_cpu(cpu, affinity_mask))
+ queue++;
+ if (queue >= 0)
+ set->mq_map[cpu] = queue;
+ }
+
+ return 0;
+}
+
/*
* Alloc a tag set to be associated with one or more request queues.
* May fail with EINVAL for various error conditions. May adjust the
@@ -2303,6 +2344,8 @@ EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask);
*/
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
{
+ int ret;
+
BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
if (!set->nr_hw_queues)
@@ -2341,11 +2384,26 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (!set->tags)
return -ENOMEM;
- set->mq_map = blk_mq_make_queue_map(set);
- if (!set->mq_map)
- goto out_free_tags;
+ /*
+ * Use the passed in affinity mask if the driver provided one.
+ */
+ if (set->affinity_mask) {
+ ret = blk_mq_create_mq_map(set, set->affinity_mask);
+ if (!set->mq_map)
+ goto out_free_tags;
+ } else {
+ struct cpumask *affinity_mask;
+
+ affinity_mask = irq_create_affinity_mask(&set->nr_hw_queues);
+ ret = blk_mq_create_mq_map(set, affinity_mask);
+ kfree(affinity_mask);
+
+ if (!set->mq_map)
+ goto out_free_tags;
+ }
- if (blk_mq_alloc_rq_maps(set))
+ ret = blk_mq_alloc_rq_maps(set);
+ if (ret)
goto out_free_mq_map;
mutex_init(&set->tag_list_lock);
@@ -2359,7 +2417,7 @@ out_free_mq_map:
out_free_tags:
kfree(set->tags);
set->tags = NULL;
- return -ENOMEM;
+ return ret;
}
EXPORT_SYMBOL(blk_mq_alloc_tag_set);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 9087b11..fe7e21f 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -45,14 +45,6 @@ void blk_mq_enable_hotplug(void);
void blk_mq_disable_hotplug(void);
/*
- * CPU -> queue mappings
- */
-extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
-extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
- const struct cpumask *online_mask);
-extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
-
-/*
* sysfs helpers
*/
extern int blk_mq_sysfs_register(struct request_queue *q);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index a572227..0809966 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -75,6 +75,7 @@ struct blk_mq_tag_set {
unsigned int timeout;
unsigned int flags; /* BLK_MQ_F_* */
void *driver_data;
+ struct cpumask *affinity_mask;
struct blk_mq_tags **tags;
--
2.1.4
^ permalink raw reply related [flat|nested] 6+ messages in thread