* [PATCH 1/3] blk-mq: introduce .map_changed to blk_mq_ops
2015-09-29 3:20 [PATCH 0/3] blk-mq & nvme: introduce .map_changed Ming Lei
@ 2015-09-29 3:20 ` Ming Lei
2015-09-29 3:20 ` [PATCH 2/3] block: nvme: use map_changed to set irq affinity hint Ming Lei
` (3 subsequent siblings)
4 siblings, 0 replies; 11+ messages in thread
From: Ming Lei @ 2015-09-29 3:20 UTC (permalink / raw)
This patch introduces .map_changed callback to blk_mq_ops
so that driver can get notified when the mapping between
sw queue and hw queue is changed. One use case is for
setting irq affinity hint of hardware queue.
Signed-off-by: Ming Lei <tom.leiming at gmail.com>
---
block/blk-mq.c | 5 +++++
include/linux/blk-mq.h | 7 +++++++
2 files changed, 12 insertions(+)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f2d67b4..181e438 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1829,6 +1829,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
set->tags[i] = NULL;
}
hctx->tags = NULL;
+ if (q->mq_ops->map_changed)
+ q->mq_ops->map_changed(hctx, i, false);
continue;
}
@@ -1850,6 +1852,9 @@ static void blk_mq_map_swqueue(struct request_queue *q)
*/
hctx->next_cpu = cpumask_first(hctx->cpumask);
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
+
+ if (q->mq_ops->map_changed)
+ q->mq_ops->map_changed(hctx, i, true);
}
}
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 37d1602..a4a7433 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -89,6 +89,7 @@ typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int);
typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
+typedef void (map_changed_fn)(struct blk_mq_hw_ctx *, unsigned int, bool);
typedef int (init_request_fn)(void *, struct request *, unsigned int,
unsigned int, unsigned int);
typedef void (exit_request_fn)(void *, struct request *, unsigned int,
@@ -125,6 +126,12 @@ struct blk_mq_ops {
exit_hctx_fn *exit_hctx;
/*
+ * Called when the mapping between sw queue and hw queue
+ * is setup or changed because of CPU topo changed
+ */
+ map_changed_fn *map_changed;
+
+ /*
* Called for every command allocated by the block layer to allow
* the driver to set up driver specific data.
*
--
1.9.1
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH 2/3] block: nvme: use map_changed to set irq affinity hint
2015-09-29 3:20 [PATCH 0/3] blk-mq & nvme: introduce .map_changed Ming Lei
2015-09-29 3:20 ` [PATCH 1/3] blk-mq: introduce .map_changed to blk_mq_ops Ming Lei
@ 2015-09-29 3:20 ` Ming Lei
2015-09-29 3:20 ` [PATCH 3/3] blk-mq: remove cpumask from 'struct blk_mq_tags' Ming Lei
` (2 subsequent siblings)
4 siblings, 0 replies; 11+ messages in thread
From: Ming Lei @ 2015-09-29 3:20 UTC (permalink / raw)
This patch uses the .map_changed callback to set irq affinity
hint, then the irq affinity can be updated when CPU topo
is changed.
Signed-off-by: Ming Lei <tom.leiming at gmail.com>
---
drivers/block/nvme-core.c | 53 ++++++++++++++++++++++++++++++-----------------
1 file changed, 34 insertions(+), 19 deletions(-)
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index b97fc3f..cac16a6f 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -105,6 +105,8 @@ struct nvme_queue {
struct device *q_dmadev;
struct nvme_dev *dev;
char irqname[24]; /* nvme4294967295-65535\0 */
+ unsigned long mapped:1;
+ unsigned long irq_affinity_set:1;
spinlock_t q_lock;
struct nvme_command *sq_cmds;
struct nvme_command __iomem *sq_cmds_io;
@@ -232,6 +234,37 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
return 0;
}
+/*
+ * Since namespaces shared tagset and the 'hctx' with same
+ * index shared one same nvme queue & tag, also the mapping
+ * between sw queue and hw queue is global and only depends
+ * on CPUs topo, this callback only sets irq affinity once
+ * by using the cpumask from one of hctx.
+ * */
+static void nvme_map_changed(struct blk_mq_hw_ctx *hctx,
+ unsigned int hctx_idx, bool mapped)
+{
+ struct nvme_queue *nvmeq = hctx->driver_data;
+ struct nvme_dev *dev = nvmeq->dev;
+ unsigned int irq;
+
+ if (nvmeq->mapped != mapped)
+ nvmeq->irq_affinity_set = 0;
+
+ nvmeq->mapped = mapped;
+
+ if (nvmeq->irq_affinity_set)
+ return;
+
+ irq = dev->entry[nvmeq->cq_vector].vector;
+ if (mapped)
+ irq_set_affinity_hint(irq, hctx->cpumask);
+ else
+ irq_set_affinity_hint(irq, NULL);
+
+ nvmeq->irq_affinity_set = 1;
+}
+
static int nvme_init_request(void *data, struct request *req,
unsigned int hctx_idx, unsigned int rq_idx,
unsigned int numa_node)
@@ -1664,6 +1697,7 @@ static struct blk_mq_ops nvme_mq_ops = {
.queue_rq = nvme_queue_rq,
.map_queue = blk_mq_map_queue,
.init_hctx = nvme_init_hctx,
+ .map_changed = nvme_map_changed,
.init_request = nvme_init_request,
.timeout = nvme_timeout,
};
@@ -2953,22 +2987,6 @@ static const struct file_operations nvme_dev_fops = {
.compat_ioctl = nvme_dev_ioctl,
};
-static void nvme_set_irq_hints(struct nvme_dev *dev)
-{
- struct nvme_queue *nvmeq;
- int i;
-
- for (i = 0; i < dev->online_queues; i++) {
- nvmeq = dev->queues[i];
-
- if (!nvmeq->tags || !(*nvmeq->tags))
- continue;
-
- irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
- blk_mq_tags_cpumask(*nvmeq->tags));
- }
-}
-
static int nvme_dev_start(struct nvme_dev *dev)
{
int result;
@@ -3010,8 +3028,6 @@ static int nvme_dev_start(struct nvme_dev *dev)
if (result)
goto free_tags;
- nvme_set_irq_hints(dev);
-
dev->event_limit = 1;
return result;
@@ -3062,7 +3078,6 @@ static int nvme_dev_resume(struct nvme_dev *dev)
} else {
nvme_unfreeze_queues(dev);
nvme_dev_add(dev);
- nvme_set_irq_hints(dev);
}
return 0;
}
--
1.9.1
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH 3/3] blk-mq: remove cpumask from 'struct blk_mq_tags'
2015-09-29 3:20 [PATCH 0/3] blk-mq & nvme: introduce .map_changed Ming Lei
2015-09-29 3:20 ` [PATCH 1/3] blk-mq: introduce .map_changed to blk_mq_ops Ming Lei
2015-09-29 3:20 ` [PATCH 2/3] block: nvme: use map_changed to set irq affinity hint Ming Lei
@ 2015-09-29 3:20 ` Ming Lei
2015-09-29 7:07 ` [PATCH 0/3] blk-mq & nvme: introduce .map_changed Christoph Hellwig
2015-09-29 14:26 ` Keith Busch
4 siblings, 0 replies; 11+ messages in thread
From: Ming Lei @ 2015-09-29 3:20 UTC (permalink / raw)
The 'cpumask' of 'struct blk_mq_tags' is introduced to set
the irq affinity of NVMe queue, but now we have introduced
.map_changed callback which can cover the use case and looks
more flexible.
Signed-off-by: Ming Lei <tom.leiming at gmail.com>
---
block/blk-mq-tag.c | 5 -----
block/blk-mq-tag.h | 1 -
block/blk-mq.c | 11 +----------
include/linux/blk-mq.h | 1 -
4 files changed, 1 insertion(+), 17 deletions(-)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 9115c6d..931b62d 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -613,11 +613,6 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
if (!tags)
return NULL;
- if (!zalloc_cpumask_var(&tags->cpumask, GFP_KERNEL)) {
- kfree(tags);
- return NULL;
- }
-
tags->nr_tags = total_tags;
tags->nr_reserved_tags = reserved_tags;
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 9eb2cf4..d5a9612 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -44,7 +44,6 @@ struct blk_mq_tags {
struct list_head page_list;
int alloc_policy;
- cpumask_var_t cpumask;
};
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 181e438..8196011 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1811,7 +1811,6 @@ static void blk_mq_map_swqueue(struct request_queue *q)
hctx = q->mq_ops->map_queue(q, i);
cpumask_set_cpu(i, hctx->cpumask);
- cpumask_set_cpu(i, hctx->tags->cpumask);
ctx->index_hw = hctx->nr_ctx;
hctx->ctxs[hctx->nr_ctx++] = ctx;
}
@@ -2198,12 +2197,6 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
return 0;
}
-struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags)
-{
- return tags->cpumask;
-}
-EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask);
-
/*
* Alloc a tag set to be associated with one or more request queues.
* May fail with EINVAL for various error conditions. May adjust the
@@ -2265,10 +2258,8 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
int i;
for (i = 0; i < set->nr_hw_queues; i++) {
- if (set->tags[i]) {
+ if (set->tags[i])
blk_mq_free_rq_map(set, set->tags[i], i);
- free_cpumask_var(set->tags[i]->cpumask);
- }
}
kfree(set->tags);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index a4a7433..79d4aaf 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -190,7 +190,6 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
gfp_t gfp, bool reserved);
struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
-struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags);
enum {
BLK_MQ_UNIQUE_TAG_BITS = 16,
--
1.9.1
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH 0/3] blk-mq & nvme: introduce .map_changed
2015-09-29 3:20 [PATCH 0/3] blk-mq & nvme: introduce .map_changed Ming Lei
` (2 preceding siblings ...)
2015-09-29 3:20 ` [PATCH 3/3] blk-mq: remove cpumask from 'struct blk_mq_tags' Ming Lei
@ 2015-09-29 7:07 ` Christoph Hellwig
2015-09-29 21:59 ` Ming Lei
2015-09-29 14:26 ` Keith Busch
4 siblings, 1 reply; 11+ messages in thread
From: Christoph Hellwig @ 2015-09-29 7:07 UTC (permalink / raw)
Hi Ming,
from a quick glance this looks very useful. Please split the callback
into two instead of overloading them with the bool mapped parameter.
Please keep Akinobu Mita in the loop, as this should go on top of his blk-mq
fixes.
^ permalink raw reply [flat|nested] 11+ messages in thread* [PATCH 0/3] blk-mq & nvme: introduce .map_changed
2015-09-29 7:07 ` [PATCH 0/3] blk-mq & nvme: introduce .map_changed Christoph Hellwig
@ 2015-09-29 21:59 ` Ming Lei
0 siblings, 0 replies; 11+ messages in thread
From: Ming Lei @ 2015-09-29 21:59 UTC (permalink / raw)
On Tue, Sep 29, 2015@3:07 PM, Christoph Hellwig <hch@lst.de> wrote:
> Hi Ming,
>
> from a quick glance this looks very useful. Please split the callback
> into two instead of overloading them with the bool mapped parameter.
The mapped parameter can be removed since it can be figured out by
checking 'hctx->tags'.
IMO, it isn't good to split it into two callbacks, such as
map_activate/map_deactivate or other names, since the two callbacks
can't be symmetrical at all. When CPU hotplug happens, it is just
the mapping that changes — neither setting up nor tearing down — since it is
a 1:N mapping between the hw queue and the per-CPU sw queues.
So I suggest to keep it as .map_changed(), or do you have better idea?
>
> Please keep Akinobu Mita in the loop, as this should go on top of his blk-mq
> fixes.
OK.
--
Ming Lei
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 0/3] blk-mq & nvme: introduce .map_changed
2015-09-29 3:20 [PATCH 0/3] blk-mq & nvme: introduce .map_changed Ming Lei
` (3 preceding siblings ...)
2015-09-29 7:07 ` [PATCH 0/3] blk-mq & nvme: introduce .map_changed Christoph Hellwig
@ 2015-09-29 14:26 ` Keith Busch
2015-09-29 14:47 ` Jens Axboe
4 siblings, 1 reply; 11+ messages in thread
From: Keith Busch @ 2015-09-29 14:26 UTC (permalink / raw)
On Mon, 28 Sep 2015, Ming Lei wrote:
> This patchset introduces .map_changed callback into 'struct blk_mq_ops',
> and use this callback to get NVMe notified about the mapping changed event,
> then NVMe can update the irq affinity hint for its queues.
I think this is going the wrong direction. Shouldn't we provide blk-mq
the vectors in the tag set so that layer can manage the irq hints?
This could lead to more cpu-queue assignment optimizations from using
that information. For example, two h/w contexts sharing the same vector
shouldn't be assigned to cpus on different NUMA nodes.
> Also the 'cpumask' in 'struct blk_mq_tags' isn't needed any more, so remove
> that and related kernel interface.
It was added to the tags because the cpu mask is an artifact of the
tags rather than duplicating it across all the h/w contexts sharing the
same set. It also doesn't let a h/w context from one namespace overwrite
another's cpu affinity mask when they share the same vector.
^ permalink raw reply [flat|nested] 11+ messages in thread* [PATCH 0/3] blk-mq & nvme: introduce .map_changed
2015-09-29 14:26 ` Keith Busch
@ 2015-09-29 14:47 ` Jens Axboe
2015-09-29 22:16 ` Ming Lei
0 siblings, 1 reply; 11+ messages in thread
From: Jens Axboe @ 2015-09-29 14:47 UTC (permalink / raw)
On 09/29/2015 08:26 AM, Keith Busch wrote:
> On Mon, 28 Sep 2015, Ming Lei wrote:
>> This patchset introduces .map_changed callback into 'struct blk_mq_ops',
>> and use this callback to get NVMe notified about the mapping changed
>> event,
>> then NVMe can update the irq affinity hint for its queues.
>
> I think this is going the wrong direction. Shouldn't we provide blk-mq
> the vectors in the tag set so that layer can manage the irq hints?
>
> This could lead to more cpu-queue assignment optimizations from using
> that information. For example, two h/w contexts sharing the same vector
> shouldn't be assigned to cpus on different NUMA nodes.
I agree, this is moving in the wrong direction. Currently the sw <->hw
queue mappings are in blk-mq, and this is the exact same information
base we need for IRQ affinity handling. We need to move in the direction
of having blk-mq helpers handle that part too, not pass notifications to
the lower level driver to update its IRQ mappings.
>> Also the 'cpumask' in 'struct blk_mq_tags' isn't needed any more, so
>> remove
>> that and related kernel interface.
>
> It was added to the tags because the cpu mask is an artifact of the
> tags rather than duplicating it across all the h/w contexts sharing the
> same set. It also doesn't let a h/w context from one namespace overwrite
> another's cpu affinity mask when they share the same vector.
So having the mask in the tags is really odd, it should be in some
per-device type data instead.
--
Jens Axboe
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 0/3] blk-mq & nvme: introduce .map_changed
2015-09-29 14:47 ` Jens Axboe
@ 2015-09-29 22:16 ` Ming Lei
2015-09-29 22:45 ` Keith Busch
0 siblings, 1 reply; 11+ messages in thread
From: Ming Lei @ 2015-09-29 22:16 UTC (permalink / raw)
On Tue, Sep 29, 2015@10:47 PM, Jens Axboe <axboe@kernel.dk> wrote:
> On 09/29/2015 08:26 AM, Keith Busch wrote:
>>
>> On Mon, 28 Sep 2015, Ming Lei wrote:
>>>
>>> This patchset introduces .map_changed callback into 'struct blk_mq_ops',
>>> and use this callback to get NVMe notified about the mapping changed
>>> event,
>>> then NVMe can update the irq affinity hint for its queues.
>>
>>
>> I think this is going the wrong direction. Shouldn't we provide blk-mq
>> the vectors in the tag set so that layer can manage the irq hints?
>>
>> This could lead to more cpu-queue assignment optimizations from using
>> that information. For example, two h/w contexts sharing the same vector
>> shouldn't be assigned to cpus on different NUMA nodes.
>
>
> I agree, this is moving in the wrong direction. Currently the sw <->hw queue
> mappings are in blk-mq, and this is the exact same information base we need
> for IRQ affinity handling. We need to move in the direction of having blk-mq
> helpers handle that part too, not pass notifications to the lower level
> driver to update its IRQ mappings.
Yes, I thought of that before, but it has the following cons:
- some drivers/devices may need different IRQ affinity policy, such as virtio
devices which has its own set affinity handler(see virtqueue_set_affinity()),
and it is offten not efficient to handle the virt queue's irq on more
than one CPU.
- block core has to get the irq vector information which has to be
setup/finalized
before blk-mq uses that for setting irq affinity, for example, in case
NVMe's admin
queue, its vector can be changed after admin queue's initialization.
That is why I said this approach is more flexible.
>
>>> Also the 'cpumask' in 'struct blk_mq_tags' isn't needed any more, so
>>> remove
>>> that and related kernel interface.
>>
>>
>> It was added to the tags because the cpu mask is an artifact of the
>> tags rather than duplicating it across all the h/w contexts sharing the
>> same set. It also doesn't let a h/w context from one namespace overwrite
>> another's cpu affinity mask when they share the same vector.
>
>
> So having the mask in the tags is really odd, it should be in some
> per-device type data instead.
Agree, removing the mask in tags is one of this patchset's motivation.
--
Ming Lei
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 0/3] blk-mq & nvme: introduce .map_changed
2015-09-29 22:16 ` Ming Lei
@ 2015-09-29 22:45 ` Keith Busch
2015-09-30 0:08 ` Ming Lei
0 siblings, 1 reply; 11+ messages in thread
From: Keith Busch @ 2015-09-29 22:45 UTC (permalink / raw)
On Tue, 29 Sep 2015, Ming Lei wrote:
> Yes, I thought of that before, but it has the following cons:
>
> - some drivers/devices may need different IRQ affinity policy, such as virtio
> devices which has its own set affinity handler(see virtqueue_set_affinity()),
That's not a very good example to support your cause; virtio_scsi's use
is a perfect example for one that would benefit from letting blk-mq
handle affinity. virtio_scsi sets affinity only when there is a 1:1
mapping of cpu's to queue's, but this driver doesn't know the mapping
that blk-mq used, creating a potentially less than optimal mapping.
> - block core has to get the irq vector information which has to be
> setup/finalized
> before blk-mq uses that for setting irq affinity, for example, in case
> NVMe's admin
> queue, its vector can be changed after admin queue's initialization.
Why do you want to put a hint on the admin queue's irq?
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 0/3] blk-mq & nvme: introduce .map_changed
2015-09-29 22:45 ` Keith Busch
@ 2015-09-30 0:08 ` Ming Lei
0 siblings, 0 replies; 11+ messages in thread
From: Ming Lei @ 2015-09-30 0:08 UTC (permalink / raw)
On Wed, Sep 30, 2015@6:45 AM, Keith Busch <keith.busch@intel.com> wrote:
> On Tue, 29 Sep 2015, Ming Lei wrote:
>>
>> Yes, I thought of that before, but it has the following cons:
>>
>> - some drivers/devices may need different IRQ affinity policy, such as
>> virtio
>> devices which has its own set affinity handler(see
>> virtqueue_set_affinity()),
>
>
> That's not a very good example to support your cause; virtio_scsi's use
> is a perfect example for one that would benefit from letting blk-mq
> handle affinity. virtio_scsi sets affinity only when there is a 1:1
> mapping of cpu's to queue's, but this driver doesn't know the mapping
> that blk-mq used, creating a potentially less than optimal mapping.
The 1:1 mapping is introduced before blk-mq, and that doesn't mean we
have to do that for blk-mq.
Actually, I mean virtio-scsi just lets the 1st CPU of the cpumask handle
the virt-queue's irq, instead of all CPUs mapped to the hw queue(virt-queue).
>
>> - block core has to get the irq vector information which has to be
>> setup/finalized
>> before blk-mq uses that for setting irq affinity, for example, in case
>> NVMe's admin
>> queue, its vector can be changed after admin queue's initialization.
>
>
> Why do you want to put a hint on the admin queue's irq?
No, I don't want, and it is just a example, I mean other drivers/devices
may have this kind of situation too.
--
Ming Lei
^ permalink raw reply [flat|nested] 11+ messages in thread