* blk-mq: allow passing in an external queue mapping V3
@ 2016-09-14 14:18 Christoph Hellwig
  2016-09-14 14:18 ` [PATCH 01/13] genirq/msi: Add cpumask allocation to alloc_msi_entry Christoph Hellwig
                   ` (13 more replies)
  0 siblings, 14 replies; 33+ messages in thread
From: Christoph Hellwig @ 2016-09-14 14:18 UTC (permalink / raw)
  To: axboe, tglx; +Cc: agordeev, keith.busch, linux-block, linux-kernel
This series is the remainder of the earlier "automatic interrupt affinity for
MSI/MSI-X capable devices" series, and makes use of the new irq-level
interrupt / queue mapping code in blk-mq, as well as allowing the driver
to pass in such a mask obtained from the (PCI) interrupt code.  To fully
support this feature in drivers the final third in the PCI layer will
be needed as well.
A git tree is available at:
   git://git.infradead.org/users/hch/block.git block-queue-mapping.2
Gitweb:
   http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/block-queue-mapping.2
Changes since V2:
 - major rework of the core IRQ affinity code to support sibling maps
 - reworked the block code to use the above
 
Changes since V1:
 - rebased on top of Linux 4.8-rc4
Changes since automatic interrupt affinity for MSI/MSI-X capable devices V3:
 - a trivial cleanup in blk_mq_create_mq_map pointed out by Alexander
^ permalink raw reply	[flat|nested] 33+ messages in thread
* [PATCH 01/13] genirq/msi: Add cpumask allocation to alloc_msi_entry
  2016-09-14 14:18 blk-mq: allow passing in an external queue mapping V3 Christoph Hellwig
@ 2016-09-14 14:18 ` Christoph Hellwig
  2016-09-19  7:30   ` Alexander Gordeev
  2016-09-14 14:18 ` [PATCH 02/13] genirq/affinity: Provide smarter irq spreading infrastructure Christoph Hellwig
                   ` (12 subsequent siblings)
  13 siblings, 1 reply; 33+ messages in thread
From: Christoph Hellwig @ 2016-09-14 14:18 UTC (permalink / raw)
  To: axboe, tglx; +Cc: agordeev, keith.busch, linux-block, linux-kernel
From: Thomas Gleixner <tglx@linutronix.de>
For irq spreading we want to store affinity masks in the msi_entry. Add the
infrastructure for it.
We allocate an array of cpumasks with an array size of the number of used
vectors in the entry, so we can hand in the information per linux interrupt
later.
As we hand in the number of used vectors, we assign them right
away. Convert all the call sites.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/base/platform-msi.c         |  3 +--
 drivers/pci/msi.c                   |  6 ++----
 drivers/staging/fsl-mc/bus/mc-msi.c |  3 +--
 include/linux/msi.h                 |  5 +++--
 kernel/irq/msi.c                    | 26 ++++++++++++++++++++++++--
 5 files changed, 31 insertions(+), 12 deletions(-)
diff --git a/drivers/base/platform-msi.c b/drivers/base/platform-msi.c
index 279e539..be6a599 100644
--- a/drivers/base/platform-msi.c
+++ b/drivers/base/platform-msi.c
@@ -142,13 +142,12 @@ static int platform_msi_alloc_descs_with_irq(struct device *dev, int virq,
 	}
 
 	for (i = 0; i < nvec; i++) {
-		desc = alloc_msi_entry(dev);
+		desc = alloc_msi_entry(dev, 1, NULL);
 		if (!desc)
 			break;
 
 		desc->platform.msi_priv_data = data;
 		desc->platform.msi_index = base + i;
-		desc->nvec_used = 1;
 		desc->irq = virq ? virq + i : 0;
 
 		list_add_tail(&desc->list, dev_to_msi_list(dev));
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 98f1222..0db72ba 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -555,7 +555,7 @@ static struct msi_desc *msi_setup_entry(struct pci_dev *dev, int nvec)
 	struct msi_desc *entry;
 
 	/* MSI Entry Initialization */
-	entry = alloc_msi_entry(&dev->dev);
+	entry = alloc_msi_entry(&dev->dev, nvec, NULL);
 	if (!entry)
 		return NULL;
 
@@ -568,7 +568,6 @@ static struct msi_desc *msi_setup_entry(struct pci_dev *dev, int nvec)
 	entry->msi_attrib.default_irq	= dev->irq;	/* Save IOAPIC IRQ */
 	entry->msi_attrib.multi_cap	= (control & PCI_MSI_FLAGS_QMASK) >> 1;
 	entry->msi_attrib.multiple	= ilog2(__roundup_pow_of_two(nvec));
-	entry->nvec_used		= nvec;
 	entry->affinity			= dev->irq_affinity;
 
 	if (control & PCI_MSI_FLAGS_64BIT)
@@ -693,7 +692,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
 			mask = cpumask_of(cpu);
 		}
 
-		entry = alloc_msi_entry(&dev->dev);
+		entry = alloc_msi_entry(&dev->dev, 1, NULL);
 		if (!entry) {
 			if (!i)
 				iounmap(base);
@@ -711,7 +710,6 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
 			entry->msi_attrib.entry_nr = i;
 		entry->msi_attrib.default_irq	= dev->irq;
 		entry->mask_base		= base;
-		entry->nvec_used		= 1;
 		entry->affinity			= mask;
 
 		list_add_tail(&entry->list, dev_to_msi_list(&dev->dev));
diff --git a/drivers/staging/fsl-mc/bus/mc-msi.c b/drivers/staging/fsl-mc/bus/mc-msi.c
index c7be156..4fd8e41 100644
--- a/drivers/staging/fsl-mc/bus/mc-msi.c
+++ b/drivers/staging/fsl-mc/bus/mc-msi.c
@@ -213,7 +213,7 @@ static int fsl_mc_msi_alloc_descs(struct device *dev, unsigned int irq_count)
 	struct msi_desc *msi_desc;
 
 	for (i = 0; i < irq_count; i++) {
-		msi_desc = alloc_msi_entry(dev);
+		msi_desc = alloc_msi_entry(dev, 1, NULL);
 		if (!msi_desc) {
 			dev_err(dev, "Failed to allocate msi entry\n");
 			error = -ENOMEM;
@@ -221,7 +221,6 @@ static int fsl_mc_msi_alloc_descs(struct device *dev, unsigned int irq_count)
 		}
 
 		msi_desc->fsl_mc.msi_index = i;
-		msi_desc->nvec_used = 1;
 		INIT_LIST_HEAD(&msi_desc->list);
 		list_add_tail(&msi_desc->list, dev_to_msi_list(dev));
 	}
diff --git a/include/linux/msi.h b/include/linux/msi.h
index e8c81fb..0db320b 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -68,7 +68,7 @@ struct msi_desc {
 	unsigned int			nvec_used;
 	struct device			*dev;
 	struct msi_msg			msg;
-	const struct cpumask		*affinity;
+	struct cpumask			*affinity;
 
 	union {
 		/* PCI MSI/X specific data */
@@ -123,7 +123,8 @@ static inline void *msi_desc_to_pci_sysdata(struct msi_desc *desc)
 }
 #endif /* CONFIG_PCI_MSI */
 
-struct msi_desc *alloc_msi_entry(struct device *dev);
+struct msi_desc *alloc_msi_entry(struct device *dev, int nvec,
+				 const struct cpumask *affinity);
 void free_msi_entry(struct msi_desc *entry);
 void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
 void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 19e9dfb..8a3e8727 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -18,20 +18,42 @@
 /* Temparory solution for building, will be removed later */
 #include <linux/pci.h>
 
-struct msi_desc *alloc_msi_entry(struct device *dev)
+/**
+ * alloc_msi_entry - Allocate and initialize an msi_entry
+ * @dev:	Pointer to the device for which this is allocated
+ * @nvec:	The number of vectors used in this entry
+ * @affinity:	Optional pointer to an affinity mask array of size @nvec
+ *
+ * If @affinity is not NULL then an affinity array[@nvec] is allocated
+ * and the affinity masks from @affinity are copied.
+ */
+struct msi_desc *
+alloc_msi_entry(struct device *dev, int nvec, const struct cpumask *affinity)
 {
-	struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+	struct msi_desc *desc;
+
+	desc = kzalloc(sizeof(*desc), GFP_KERNEL);
 	if (!desc)
 		return NULL;
 
 	INIT_LIST_HEAD(&desc->list);
 	desc->dev = dev;
+	desc->nvec_used = nvec;
+	if (affinity) {
+		desc->affinity = kmemdup(affinity,
+			nvec * sizeof(*desc->affinity), GFP_KERNEL);
+		if (!desc->affinity) {
+			kfree(desc);
+			return NULL;
+		}
+	}
 
 	return desc;
 }
 
 void free_msi_entry(struct msi_desc *entry)
 {
+	kfree(entry->affinity);
 	kfree(entry);
 }
 
-- 
2.1.4
^ permalink raw reply related	[flat|nested] 33+ messages in thread
* [PATCH 02/13] genirq/affinity: Provide smarter irq spreading infrastructure
  2016-09-14 14:18 blk-mq: allow passing in an external queue mapping V3 Christoph Hellwig
  2016-09-14 14:18 ` [PATCH 01/13] genirq/msi: Add cpumask allocation to alloc_msi_entry Christoph Hellwig
@ 2016-09-14 14:18 ` Christoph Hellwig
  2016-09-21 12:29   ` Alexander Gordeev
  2016-09-14 14:18 ` [PATCH 03/13] genirq/msi: Switch to new " Christoph Hellwig
                   ` (11 subsequent siblings)
  13 siblings, 1 reply; 33+ messages in thread
From: Christoph Hellwig @ 2016-09-14 14:18 UTC (permalink / raw)
  To: axboe, tglx; +Cc: agordeev, keith.busch, linux-block, linux-kernel
From: Thomas Gleixner <tglx@linutronix.de>
The current irq spreading infrastructure is just looking at a cpumask and
tries to spread the interrupts over the mask. That's suboptimal as it does
not take numa nodes into account.
Change the logic so the interrupts are spread across numa nodes and inside
the nodes. If there are more cpus than vectors per node, then we set the
affinity to several cpus. If HT siblings are available we take that into
account and try to set all siblings to a single vector.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/interrupt.h |  15 +++++
 kernel/irq/affinity.c     | 149 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 164 insertions(+)
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index b6683f0..4e59d12 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -279,6 +279,8 @@ extern int
 irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify);
 
 struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs);
+struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity, int nvec);
+int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec);
 
 #else /* CONFIG_SMP */
 
@@ -316,6 +318,19 @@ static inline struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
 	*nr_vecs = 1;
 	return NULL;
 }
+
+static inline struct cpumask *
+irq_create_affinity_masks(const struct cpumask *affinity, int nvec)
+{
+	return NULL;
+}
+
+static inline int
+irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec)
+{
+	return maxvec;
+}
+
 #endif /* CONFIG_SMP */
 
 /*
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 32f6cfc..7812fec 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -4,6 +4,155 @@
 #include <linux/slab.h>
 #include <linux/cpu.h>
 
+static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
+				int cpus_per_vec)
+{
+	const struct cpumask *siblmsk;
+	int cpu, sibl;
+
+	for ( ; cpus_per_vec > 0; ) {
+		cpu = cpumask_first(nmsk);
+
+		/* Should not happen, but I'm too lazy to think about it */
+		if (cpu >= nr_cpu_ids)
+			return;
+
+		cpumask_clear_cpu(cpu, nmsk);
+		cpumask_set_cpu(cpu, irqmsk);
+		cpus_per_vec--;
+
+		/* If the cpu has siblings, use them first */
+		siblmsk = topology_sibling_cpumask(cpu);
+		for (sibl = -1; cpus_per_vec > 0; ) {
+			sibl = cpumask_next(sibl, siblmsk);
+			if (sibl >= nr_cpu_ids)
+				break;
+			if (!cpumask_test_and_clear_cpu(sibl, nmsk))
+				continue;
+			cpumask_set_cpu(sibl, irqmsk);
+			cpus_per_vec--;
+		}
+	}
+}
+
+static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk)
+{
+	int n, nodes;
+
+	/* Calculate the number of nodes in the supplied affinity mask */
+	for (n = 0, nodes = 0; n < num_online_nodes(); n++) {
+		if (cpumask_intersects(mask, cpumask_of_node(n))) {
+			node_set(n, *nodemsk);
+			nodes++;
+		}
+	}
+	return nodes;
+}
+
+/**
+ * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
+ * @affinity:		The affinity mask to spread. If NULL cpu_online_mask
+ *			is used
+ * @nvec:		The number of vectors
+ *
+ * Returns the masks pointer or NULL if allocation failed.
+ */
+struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity,
+					  int nvec)
+{
+	int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec = 0;
+	nodemask_t nodemsk = NODE_MASK_NONE;
+	struct cpumask *masks;
+	cpumask_var_t nmsk;
+
+	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
+		return NULL;
+
+	masks = kzalloc(nvec * sizeof(*masks), GFP_KERNEL);
+	if (!masks)
+		goto out;
+
+	/* Stabilize the cpumasks */
+	get_online_cpus();
+	/* If the supplied affinity mask is NULL, use cpu online mask */
+	if (!affinity)
+		affinity = cpu_online_mask;
+
+	nodes = get_nodes_in_cpumask(affinity, &nodemsk);
+
+	/*
+	 * If the number of nodes in the mask is greater than or equal to the
+	 * number of vectors we just spread the vectors across the nodes.
+	 */
+	if (nvec <= nodes) {
+		for_each_node_mask(n, nodemsk) {
+			cpumask_copy(masks + curvec, cpumask_of_node(n));
+			if (++curvec == nvec)
+				break;
+		}
+		goto outonl;
+	}
+
+	/* Spread the vectors per node */
+	vecs_per_node = nvec / nodes;
+	/* Account for rounding errors */
+	extra_vecs = nvec - (nodes * vecs_per_node);
+
+	for_each_node_mask(n, nodemsk) {
+		int ncpus, v, vecs_to_assign = vecs_per_node;
+
+		/* Get the cpus on this node which are in the mask */
+		cpumask_and(nmsk, affinity, cpumask_of_node(n));
+
+		/* Calculate the number of cpus per vector */
+		ncpus = cpumask_weight(nmsk);
+
+		for (v = 0; curvec < nvec && v < vecs_to_assign; curvec++, v++) {
+			cpus_per_vec = ncpus / vecs_to_assign;
+
+			/* Account for extra vectors to compensate rounding errors */
+			if (extra_vecs) {
+				cpus_per_vec++;
+				if (!--extra_vecs)
+					vecs_per_node++;
+			}
+			irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec);
+		}
+
+		if (curvec >= nvec)
+			break;
+	}
+
+outonl:
+	put_online_cpus();
+out:
+	free_cpumask_var(nmsk);
+	return masks;
+}
+
+/**
+ * irq_calc_affinity_vectors - Calculate the optimal number of vectors for a given affinity mask
+ * @affinity:		The affinity mask to spread. If NULL cpu_online_mask
+ *			is used
+ * @maxvec:		The maximum number of vectors available
+ */
+int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec)
+{
+	int cpus, ret;
+
+	/* Stabilize the cpumasks */
+	get_online_cpus();
+	/* If the supplied affinity mask is NULL, use cpu online mask */
+	if (!affinity)
+		affinity = cpu_online_mask;
+
+	cpus = cpumask_weight(affinity);
+	ret = (cpus < maxvec) ? cpus : maxvec;
+
+	put_online_cpus();
+	return ret;
+}
+
 static int get_first_sibling(unsigned int cpu)
 {
 	unsigned int ret;
-- 
2.1.4
^ permalink raw reply related	[flat|nested] 33+ messages in thread
* [PATCH 03/13] genirq/msi: Switch to new irq spreading infrastructure
  2016-09-14 14:18 blk-mq: allow passing in an external queue mapping V3 Christoph Hellwig
  2016-09-14 14:18 ` [PATCH 01/13] genirq/msi: Add cpumask allocation to alloc_msi_entry Christoph Hellwig
  2016-09-14 14:18 ` [PATCH 02/13] genirq/affinity: Provide smarter irq spreading infrastructure Christoph Hellwig
@ 2016-09-14 14:18 ` Christoph Hellwig
  2016-09-21 12:23   ` Alexander Gordeev
  2016-09-22  8:51   ` Alexander Gordeev
  2016-09-14 14:18 ` [PATCH 04/13] genirq/affinity: Remove old irq spread infrastructure Christoph Hellwig
                   ` (10 subsequent siblings)
  13 siblings, 2 replies; 33+ messages in thread
From: Christoph Hellwig @ 2016-09-14 14:18 UTC (permalink / raw)
  To: axboe, tglx; +Cc: agordeev, keith.busch, linux-block, linux-kernel
From: Thomas Gleixner <tglx@linutronix.de>
Switch MSI over to the new spreading code. If a pci device contains a valid
pointer to a cpumask, then this mask is used for spreading; otherwise the
online cpu mask is used. This allows a driver to restrict the spread to a
subset of CPUs, e.g. cpus on a particular node.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/pci/msi.c    | 128 +++++++++++++++++++++++++++++----------------------
 kernel/irq/irqdesc.c |  31 ++++++-------
 2 files changed, 87 insertions(+), 72 deletions(-)
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 0db72ba..06100dd 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -549,15 +549,23 @@ error_attrs:
 	return ret;
 }
 
-static struct msi_desc *msi_setup_entry(struct pci_dev *dev, int nvec)
+static struct msi_desc *
+msi_setup_entry(struct pci_dev *dev, int nvec, bool affinity)
 {
-	u16 control;
+	struct cpumask *masks = NULL;
 	struct msi_desc *entry;
+	u16 control;
+
+	if (affinity) {
+		masks = irq_create_affinity_masks(dev->irq_affinity, nvec);
+		if (!masks)
+			pr_err("Unable to allocate affinity masks, ignoring\n");
+	}
 
 	/* MSI Entry Initialization */
-	entry = alloc_msi_entry(&dev->dev, nvec, NULL);
+	entry = alloc_msi_entry(&dev->dev, nvec, masks);
 	if (!entry)
-		return NULL;
+		goto out;
 
 	pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &control);
 
@@ -568,7 +576,6 @@ static struct msi_desc *msi_setup_entry(struct pci_dev *dev, int nvec)
 	entry->msi_attrib.default_irq	= dev->irq;	/* Save IOAPIC IRQ */
 	entry->msi_attrib.multi_cap	= (control & PCI_MSI_FLAGS_QMASK) >> 1;
 	entry->msi_attrib.multiple	= ilog2(__roundup_pow_of_two(nvec));
-	entry->affinity			= dev->irq_affinity;
 
 	if (control & PCI_MSI_FLAGS_64BIT)
 		entry->mask_pos = dev->msi_cap + PCI_MSI_MASK_64;
@@ -579,6 +586,8 @@ static struct msi_desc *msi_setup_entry(struct pci_dev *dev, int nvec)
 	if (entry->msi_attrib.maskbit)
 		pci_read_config_dword(dev, entry->mask_pos, &entry->masked);
 
+out:
+	kfree(masks);
 	return entry;
 }
 
@@ -607,7 +616,7 @@ static int msi_verify_entries(struct pci_dev *dev)
  * an error, and a positive return value indicates the number of interrupts
  * which could have been allocated.
  */
-static int msi_capability_init(struct pci_dev *dev, int nvec)
+static int msi_capability_init(struct pci_dev *dev, int nvec, bool affinity)
 {
 	struct msi_desc *entry;
 	int ret;
@@ -615,7 +624,7 @@ static int msi_capability_init(struct pci_dev *dev, int nvec)
 
 	pci_msi_set_enable(dev, 0);	/* Disable MSI during set up */
 
-	entry = msi_setup_entry(dev, nvec);
+	entry = msi_setup_entry(dev, nvec, affinity);
 	if (!entry)
 		return -ENOMEM;
 
@@ -678,28 +687,29 @@ static void __iomem *msix_map_region(struct pci_dev *dev, unsigned nr_entries)
 }
 
 static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
-			      struct msix_entry *entries, int nvec)
+			      struct msix_entry *entries, int nvec,
+			      bool affinity)
 {
-	const struct cpumask *mask = NULL;
+	struct cpumask *curmsk, *masks = NULL;
 	struct msi_desc *entry;
-	int cpu = -1, i;
-
-	for (i = 0; i < nvec; i++) {
-		if (dev->irq_affinity) {
-			cpu = cpumask_next(cpu, dev->irq_affinity);
-			if (cpu >= nr_cpu_ids)
-				cpu = cpumask_first(dev->irq_affinity);
-			mask = cpumask_of(cpu);
-		}
+	int ret, i;
 
-		entry = alloc_msi_entry(&dev->dev, 1, NULL);
+	if (affinity) {
+		masks = irq_create_affinity_masks(dev->irq_affinity, nvec);
+		if (!masks)
+			pr_err("Unable to allocate affinity masks, ignoring\n");
+	}
+
+	for (i = 0, curmsk = masks; i < nvec; i++) {
+		entry = alloc_msi_entry(&dev->dev, 1, curmsk);
 		if (!entry) {
 			if (!i)
 				iounmap(base);
 			else
 				free_msi_irqs(dev);
 			/* No enough memory. Don't try again */
-			return -ENOMEM;
+			ret = -ENOMEM;
+			goto out;
 		}
 
 		entry->msi_attrib.is_msix	= 1;
@@ -710,11 +720,14 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
 			entry->msi_attrib.entry_nr = i;
 		entry->msi_attrib.default_irq	= dev->irq;
 		entry->mask_base		= base;
-		entry->affinity			= mask;
 
 		list_add_tail(&entry->list, dev_to_msi_list(&dev->dev));
+		if (masks)
+			curmsk++;
 	}
-
+	ret = 0;
+out:
+	kfree(masks);
 	return 0;
 }
 
@@ -743,8 +756,8 @@ static void msix_program_entries(struct pci_dev *dev,
  * single MSI-X irq. A return of zero indicates the successful setup of
  * requested MSI-X entries with allocated irqs or non-zero for otherwise.
  **/
-static int msix_capability_init(struct pci_dev *dev,
-				struct msix_entry *entries, int nvec)
+static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
+				int nvec, bool affinity)
 {
 	int ret;
 	u16 control;
@@ -759,7 +772,7 @@ static int msix_capability_init(struct pci_dev *dev,
 	if (!base)
 		return -ENOMEM;
 
-	ret = msix_setup_entries(dev, base, entries, nvec);
+	ret = msix_setup_entries(dev, base, entries, nvec, affinity);
 	if (ret)
 		return ret;
 
@@ -939,22 +952,8 @@ int pci_msix_vec_count(struct pci_dev *dev)
 }
 EXPORT_SYMBOL(pci_msix_vec_count);
 
-/**
- * pci_enable_msix - configure device's MSI-X capability structure
- * @dev: pointer to the pci_dev data structure of MSI-X device function
- * @entries: pointer to an array of MSI-X entries (optional)
- * @nvec: number of MSI-X irqs requested for allocation by device driver
- *
- * Setup the MSI-X capability structure of device function with the number
- * of requested irqs upon its software driver call to request for
- * MSI-X mode enabled on its hardware device function. A return of zero
- * indicates the successful configuration of MSI-X capability structure
- * with new allocated MSI-X irqs. A return of < 0 indicates a failure.
- * Or a return of > 0 indicates that driver request is exceeding the number
- * of irqs or MSI-X vectors available. Driver should use the returned value to
- * re-send its request.
- **/
-int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec)
+static int __pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries,
+			     int nvec, bool affinity)
 {
 	int nr_entries;
 	int i, j;
@@ -986,7 +985,27 @@ int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec)
 		dev_info(&dev->dev, "can't enable MSI-X (MSI IRQ already assigned)\n");
 		return -EINVAL;
 	}
-	return msix_capability_init(dev, entries, nvec);
+	return msix_capability_init(dev, entries, nvec, affinity);
+}
+
+/**
+ * pci_enable_msix - configure device's MSI-X capability structure
+ * @dev: pointer to the pci_dev data structure of MSI-X device function
+ * @entries: pointer to an array of MSI-X entries (optional)
+ * @nvec: number of MSI-X irqs requested for allocation by device driver
+ *
+ * Setup the MSI-X capability structure of device function with the number
+ * of requested irqs upon its software driver call to request for
+ * MSI-X mode enabled on its hardware device function. A return of zero
+ * indicates the successful configuration of MSI-X capability structure
+ * with new allocated MSI-X irqs. A return of < 0 indicates a failure.
+ * Or a return of > 0 indicates that driver request is exceeding the number
+ * of irqs or MSI-X vectors available. Driver should use the returned value to
+ * re-send its request.
+ **/
+int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec)
+{
+	return __pci_enable_msix(dev, entries, nvec, false);
 }
 EXPORT_SYMBOL(pci_enable_msix);
 
@@ -1039,6 +1058,7 @@ EXPORT_SYMBOL(pci_msi_enabled);
 static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
 		unsigned int flags)
 {
+	bool affinity = flags & PCI_IRQ_AFFINITY;
 	int nvec;
 	int rc;
 
@@ -1067,19 +1087,17 @@ static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
 		nvec = maxvec;
 
 	for (;;) {
-		if (flags & PCI_IRQ_AFFINITY) {
-			dev->irq_affinity = irq_create_affinity_mask(&nvec);
+		if (affinity) {
+			nvec = irq_calc_affinity_vectors(dev->irq_affinity,
+					nvec);
 			if (nvec < minvec)
 				return -ENOSPC;
 		}
 
-		rc = msi_capability_init(dev, nvec);
+		rc = msi_capability_init(dev, nvec, affinity);
 		if (rc == 0)
 			return nvec;
 
-		kfree(dev->irq_affinity);
-		dev->irq_affinity = NULL;
-
 		if (rc < 0)
 			return rc;
 		if (rc < minvec)
@@ -1111,26 +1129,24 @@ static int __pci_enable_msix_range(struct pci_dev *dev,
 		struct msix_entry *entries, int minvec, int maxvec,
 		unsigned int flags)
 {
-	int nvec = maxvec;
-	int rc;
+	bool affinity = flags & PCI_IRQ_AFFINITY;
+	int rc, nvec = maxvec;
 
 	if (maxvec < minvec)
 		return -ERANGE;
 
 	for (;;) {
-		if (flags & PCI_IRQ_AFFINITY) {
-			dev->irq_affinity = irq_create_affinity_mask(&nvec);
+		if (affinity) {
+			nvec = irq_calc_affinity_vectors(dev->irq_affinity,
+					nvec);
 			if (nvec < minvec)
 				return -ENOSPC;
 		}
 
-		rc = pci_enable_msix(dev, entries, nvec);
+		rc = __pci_enable_msix(dev, entries, nvec, affinity);
 		if (rc == 0)
 			return nvec;
 
-		kfree(dev->irq_affinity);
-		dev->irq_affinity = NULL;
-
 		if (rc < 0)
 			return rc;
 		if (rc < minvec)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index a623b44..5a5a685 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -236,25 +236,24 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
 	const struct cpumask *mask = NULL;
 	struct irq_desc *desc;
 	unsigned int flags;
-	int i, cpu = -1;
+	int i;
 
-	if (affinity && cpumask_empty(affinity))
-		return -EINVAL;
+	/* Validate affinity mask(s) */
+	if (affinity) {
+		for (i = 0, mask = affinity; i < cnt; i++, mask++) {
+			if (cpumask_empty(mask))
+				return -EINVAL;
+		}
+	}
 
 	flags = affinity ? IRQD_AFFINITY_MANAGED : 0;
+	mask = NULL;
 
 	for (i = 0; i < cnt; i++) {
 		if (affinity) {
-			cpu = cpumask_next(cpu, affinity);
-			if (cpu >= nr_cpu_ids)
-				cpu = cpumask_first(affinity);
-			node = cpu_to_node(cpu);
-
-			/*
-			 * For single allocations we use the caller provided
-			 * mask otherwise we use the mask of the target cpu
-			 */
-			mask = cnt == 1 ? affinity : cpumask_of(cpu);
+			node = cpu_to_node(cpumask_first(affinity));
+			mask = affinity;
+			affinity++;
 		}
 		desc = alloc_desc(start + i, node, flags, mask, owner);
 		if (!desc)
@@ -481,9 +480,9 @@ EXPORT_SYMBOL_GPL(irq_free_descs);
  * @cnt:	Number of consecutive irqs to allocate.
  * @node:	Preferred node on which the irq descriptor should be allocated
  * @owner:	Owning module (can be NULL)
- * @affinity:	Optional pointer to an affinity mask which hints where the
- *		irq descriptors should be allocated and which default
- *		affinities to use
+ * @affinity:	Optional pointer to an affinity mask array of size @cnt which
+ *		hints where the irq descriptors should be allocated and which
+ *		default affinities to use
  *
  * Returns the first irq number or error code
  */
-- 
2.1.4
^ permalink raw reply related	[flat|nested] 33+ messages in thread
* [PATCH 04/13] genirq/affinity: Remove old irq spread infrastructure
  2016-09-14 14:18 blk-mq: allow passing in an external queue mapping V3 Christoph Hellwig
                   ` (2 preceding siblings ...)
  2016-09-14 14:18 ` [PATCH 03/13] genirq/msi: Switch to new " Christoph Hellwig
@ 2016-09-14 14:18 ` Christoph Hellwig
  2016-09-14 14:18 ` [PATCH 05/13] pci/msi: Retrieve affinity for a vector Christoph Hellwig
                   ` (9 subsequent siblings)
  13 siblings, 0 replies; 33+ messages in thread
From: Christoph Hellwig @ 2016-09-14 14:18 UTC (permalink / raw)
  To: axboe, tglx; +Cc: agordeev, keith.busch, linux-block, linux-kernel
From: Thomas Gleixner <tglx@linutronix.de>
No more users.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/interrupt.h |  7 ------
 kernel/irq/affinity.c     | 58 -----------------------------------------------
 2 files changed, 65 deletions(-)
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 4e59d12..72f0721 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -278,7 +278,6 @@ extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m);
 extern int
 irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify);
 
-struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs);
 struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity, int nvec);
 int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec);
 
@@ -313,12 +312,6 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
 	return 0;
 }
 
-static inline struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
-{
-	*nr_vecs = 1;
-	return NULL;
-}
-
 static inline struct cpumask *
 irq_create_affinity_masks(const struct cpumask *affinity, int nvec)
 {
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 7812fec..17f51d63 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -152,61 +152,3 @@ int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec)
 	put_online_cpus();
 	return ret;
 }
-
-static int get_first_sibling(unsigned int cpu)
-{
-	unsigned int ret;
-
-	ret = cpumask_first(topology_sibling_cpumask(cpu));
-	if (ret < nr_cpu_ids)
-		return ret;
-	return cpu;
-}
-
-/*
- * Take a map of online CPUs and the number of available interrupt vectors
- * and generate an output cpumask suitable for spreading MSI/MSI-X vectors
- * so that they are distributed as good as possible around the CPUs.  If
- * more vectors than CPUs are available we'll map one to each CPU,
- * otherwise we map one to the first sibling of each socket.
- *
- * If there are more vectors than CPUs we will still only have one bit
- * set per CPU, but interrupt code will keep on assigning the vectors from
- * the start of the bitmap until we run out of vectors.
- */
-struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
-{
-	struct cpumask *affinity_mask;
-	unsigned int max_vecs = *nr_vecs;
-
-	if (max_vecs == 1)
-		return NULL;
-
-	affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL);
-	if (!affinity_mask) {
-		*nr_vecs = 1;
-		return NULL;
-	}
-
-	get_online_cpus();
-	if (max_vecs >= num_online_cpus()) {
-		cpumask_copy(affinity_mask, cpu_online_mask);
-		*nr_vecs = num_online_cpus();
-	} else {
-		unsigned int vecs = 0, cpu;
-
-		for_each_online_cpu(cpu) {
-			if (cpu == get_first_sibling(cpu)) {
-				cpumask_set_cpu(cpu, affinity_mask);
-				vecs++;
-			}
-
-			if (--max_vecs == 0)
-				break;
-		}
-		*nr_vecs = vecs;
-	}
-	put_online_cpus();
-
-	return affinity_mask;
-}
-- 
2.1.4
^ permalink raw reply related	[flat|nested] 33+ messages in thread
* [PATCH 05/13] pci/msi: Retrieve affinity for a vector
  2016-09-14 14:18 blk-mq: allow passing in an external queue mapping V3 Christoph Hellwig
                   ` (3 preceding siblings ...)
  2016-09-14 14:18 ` [PATCH 04/13] genirq/affinity: Remove old irq spread infrastructure Christoph Hellwig
@ 2016-09-14 14:18 ` Christoph Hellwig
  2016-09-14 14:18 ` [PATCH 06/13] blk-mq: don't redistribute hardware queues on a CPU hotplug event Christoph Hellwig
                   ` (8 subsequent siblings)
  13 siblings, 0 replies; 33+ messages in thread
From: Christoph Hellwig @ 2016-09-14 14:18 UTC (permalink / raw)
  To: axboe, tglx; +Cc: agordeev, keith.busch, linux-block, linux-kernel
From: Thomas Gleixner <tglx@linutronix.de>
Add a helper to get the affinity mask for a given PCI irq vector.  For MSI or
MSI-X vectors these are stored by the IRQ core, while for legacy interrupts
we will always return cpu_possible_mask.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
[hch: updated to follow the style of pci_irq_vector()]
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/pci/msi.c   | 31 +++++++++++++++++++++++++++++++
 include/linux/pci.h |  6 ++++++
 2 files changed, 37 insertions(+)
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 06100dd..9da5ecb 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1270,6 +1270,37 @@ int pci_irq_vector(struct pci_dev *dev, unsigned int nr)
 }
 EXPORT_SYMBOL(pci_irq_vector);
 
+/**
+ * pci_irq_get_affinity - return the affinity of a particular msi vector
+ * @dev:	PCI device to operate on
+ * @nr:		device-relative interrupt vector index (0-based).
+ */
+const struct cpumask *pci_irq_get_affinity(struct pci_dev *dev, int nr)
+{
+	if (dev->msix_enabled) {
+		struct msi_desc *entry;
+		int i = 0;
+
+		for_each_pci_msi_entry(entry, dev) {
+			if (i == nr)
+				return entry->affinity;
+			i++;
+		}
+		WARN_ON_ONCE(1);
+		return NULL;
+	} else if (dev->msi_enabled) {
+		struct msi_desc *entry = first_pci_msi_entry(dev);
+
+		if (WARN_ON_ONCE(!entry || nr >= entry->nvec_used))
+			return NULL;
+
+		return &entry->affinity[nr];
+	} else {
+		return cpu_possible_mask;
+	}
+}
+EXPORT_SYMBOL(pci_irq_get_affinity);
+
 struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc)
 {
 	return to_pci_dev(desc->dev);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 0ab8359..3b0a800 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1300,6 +1300,7 @@ int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs,
 		unsigned int max_vecs, unsigned int flags);
 void pci_free_irq_vectors(struct pci_dev *dev);
 int pci_irq_vector(struct pci_dev *dev, unsigned int nr);
+const struct cpumask *pci_irq_get_affinity(struct pci_dev *pdev, int vec);
 
 #else
 static inline int pci_msi_vec_count(struct pci_dev *dev) { return -ENOSYS; }
@@ -1342,6 +1343,11 @@ static inline int pci_irq_vector(struct pci_dev *dev, unsigned int nr)
 		return -EINVAL;
 	return dev->irq;
 }
+static inline const struct cpumask *pci_irq_get_affinity(struct pci_dev *pdev,
+		int vec)
+{
+	return cpu_possible_mask;
+}
 #endif
 
 #ifdef CONFIG_PCIEPORTBUS
-- 
2.1.4
^ permalink raw reply related	[flat|nested] 33+ messages in thread
* [PATCH 06/13] blk-mq: don't redistribute hardware queues on a CPU hotplug event
  2016-09-14 14:18 blk-mq: allow passing in an external queue mapping V3 Christoph Hellwig
                   ` (4 preceding siblings ...)
  2016-09-14 14:18 ` [PATCH 05/13] pci/msi: Retrieve affinity for a vector Christoph Hellwig
@ 2016-09-14 14:18 ` Christoph Hellwig
  2016-09-14 14:18 ` [PATCH 07/13] blk-mq: only allocate a single mq_map per tag_set Christoph Hellwig
                   ` (7 subsequent siblings)
  13 siblings, 0 replies; 33+ messages in thread
From: Christoph Hellwig @ 2016-09-14 14:18 UTC (permalink / raw)
  To: axboe, tglx; +Cc: agordeev, keith.busch, linux-block, linux-kernel
Currently blk-mq will totally remap hardware contexts when a CPU hotplug
event happens, which causes major havoc for drivers, as they are never
told about this remapping.  E.g. any carefully sorted out CPU affinity
will just be completely messed up.
The rebuild also doesn't really help for the common case of cpu
hotplug, which is soft onlining / offlining of cpus - in this case we
should just leave the queue and irq mapping as is.  If it actually
worked it would have helped in the case of physical cpu hotplug,
although for that we'd need a way to actually notify the driver.
Note that drivers may already be able to accommodate such a topology
change on their own, e.g. using the reset_controller sysfs file in NVMe
will cause the driver to get things right for this case.
With the rebuild removed we will simply retain the queue mapping for
a soft offlined CPU that will work when it comes back online, and will
map any newly onlined CPU to queue 0 until the driver initiates
a rebuild of the queue map.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-mq.c | 2 --
 1 file changed, 2 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 13f5a6c..b29e7b2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2147,8 +2147,6 @@ static void blk_mq_queue_reinit(struct request_queue *q,
 
 	blk_mq_sysfs_unregister(q);
 
-	blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask);
-
 	/*
 	 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
 	 * we should change hctx numa_node according to new topology (this
-- 
2.1.4
^ permalink raw reply related	[flat|nested] 33+ messages in thread
* [PATCH 07/13] blk-mq: only allocate a single mq_map per tag_set
  2016-09-14 14:18 blk-mq: allow passing in an external queue mapping V3 Christoph Hellwig
                   ` (5 preceding siblings ...)
  2016-09-14 14:18 ` [PATCH 06/13] blk-mq: don't redistribute hardware queues on a CPU hotplug event Christoph Hellwig
@ 2016-09-14 14:18 ` Christoph Hellwig
  2016-09-14 14:18 ` [PATCH 08/13] blk-mq: remove ->map_queue Christoph Hellwig
                   ` (6 subsequent siblings)
  13 siblings, 0 replies; 33+ messages in thread
From: Christoph Hellwig @ 2016-09-14 14:18 UTC (permalink / raw)
  To: axboe, tglx; +Cc: agordeev, keith.busch, linux-block, linux-kernel
The mapping is identical for all queues in a tag_set, so stop wasting
memory for building multiple.  Note that for now I've kept the mq_map
pointer in the request_queue, but we'll need to investigate if we can
remove it without suffering too much from the additional pointer chasing.
The same would apply to the mq_ops pointer as well.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-mq.c         | 22 ++++++++++++++--------
 include/linux/blk-mq.h |  1 +
 2 files changed, 15 insertions(+), 8 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b29e7b2..15c36c1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1963,7 +1963,6 @@ void blk_mq_release(struct request_queue *q)
 		kfree(hctx);
 	}
 
-	kfree(q->mq_map);
 	q->mq_map = NULL;
 
 	kfree(q->queue_hw_ctx);
@@ -2062,9 +2061,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	if (!q->queue_hw_ctx)
 		goto err_percpu;
 
-	q->mq_map = blk_mq_make_queue_map(set);
-	if (!q->mq_map)
-		goto err_map;
+	q->mq_map = set->mq_map;
 
 	blk_mq_realloc_hw_ctxs(set, q);
 	if (!q->nr_hw_queues)
@@ -2114,8 +2111,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	return q;
 
 err_hctxs:
-	kfree(q->mq_map);
-err_map:
 	kfree(q->queue_hw_ctx);
 err_percpu:
 	free_percpu(q->queue_ctx);
@@ -2337,14 +2332,22 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 	if (!set->tags)
 		return -ENOMEM;
 
+	set->mq_map = blk_mq_make_queue_map(set);
+	if (!set->mq_map)
+		goto out_free_tags;
+
 	if (blk_mq_alloc_rq_maps(set))
-		goto enomem;
+		goto out_free_mq_map;
 
 	mutex_init(&set->tag_list_lock);
 	INIT_LIST_HEAD(&set->tag_list);
 
 	return 0;
-enomem:
+
+out_free_mq_map:
+	kfree(set->mq_map);
+	set->mq_map = NULL;
+out_free_tags:
 	kfree(set->tags);
 	set->tags = NULL;
 	return -ENOMEM;
@@ -2360,6 +2363,9 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 			blk_mq_free_rq_map(set, set->tags[i], i);
 	}
 
+	kfree(set->mq_map);
+	set->mq_map = NULL;
+
 	kfree(set->tags);
 	set->tags = NULL;
 }
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index e43bbff..a572227 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -65,6 +65,7 @@ struct blk_mq_hw_ctx {
 };
 
 struct blk_mq_tag_set {
+	unsigned int		*mq_map;
 	struct blk_mq_ops	*ops;
 	unsigned int		nr_hw_queues;
 	unsigned int		queue_depth;	/* max hw supported */
-- 
2.1.4
^ permalink raw reply related	[flat|nested] 33+ messages in thread
* [PATCH 08/13] blk-mq: remove ->map_queue
  2016-09-14 14:18 blk-mq: allow passing in an external queue mapping V3 Christoph Hellwig
                   ` (6 preceding siblings ...)
  2016-09-14 14:18 ` [PATCH 07/13] blk-mq: only allocate a single mq_map per tag_set Christoph Hellwig
@ 2016-09-14 14:18 ` Christoph Hellwig
  2016-09-14 14:18 ` [PATCH 09/13] blk-mq: allow the driver to pass in a queue mapping Christoph Hellwig
                   ` (5 subsequent siblings)
  13 siblings, 0 replies; 33+ messages in thread
From: Christoph Hellwig @ 2016-09-14 14:18 UTC (permalink / raw)
  To: axboe, tglx; +Cc: agordeev, keith.busch, linux-block, linux-kernel
All drivers use the default, so provide an inline version of it.  If we
ever need another queue mapping we can add an optional method back,
although supporting it will also require major changes to the queue setup
code.
This provides better code generation, and better debuggability as well.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-flush.c                 |  6 +++---
 block/blk-mq-tag.c                |  5 ++---
 block/blk-mq.c                    | 40 +++++++++++----------------------------
 block/blk-mq.h                    |  6 ++++++
 block/blk.h                       | 11 +++--------
 drivers/block/loop.c              |  1 -
 drivers/block/mtip32xx/mtip32xx.c |  1 -
 drivers/block/null_blk.c          |  1 -
 drivers/block/rbd.c               |  1 -
 drivers/block/virtio_blk.c        |  1 -
 drivers/block/xen-blkfront.c      |  1 -
 drivers/md/dm-rq.c                |  1 -
 drivers/mtd/ubi/block.c           |  1 -
 drivers/nvme/host/pci.c           |  2 --
 drivers/nvme/host/rdma.c          |  2 --
 drivers/nvme/target/loop.c        |  2 --
 drivers/scsi/scsi_lib.c           |  1 -
 include/linux/blk-mq.h            |  7 -------
 18 files changed, 25 insertions(+), 65 deletions(-)
diff --git a/block/blk-flush.c b/block/blk-flush.c
index d308def8..6a14b68 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -232,7 +232,7 @@ static void flush_end_io(struct request *flush_rq, int error)
 
 		/* release the tag's ownership to the req cloned from */
 		spin_lock_irqsave(&fq->mq_flush_lock, flags);
-		hctx = q->mq_ops->map_queue(q, flush_rq->mq_ctx->cpu);
+		hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu);
 		blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
 		flush_rq->tag = -1;
 	}
@@ -325,7 +325,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
 		flush_rq->tag = first_rq->tag;
 		fq->orig_rq = first_rq;
 
-		hctx = q->mq_ops->map_queue(q, first_rq->mq_ctx->cpu);
+		hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu);
 		blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
 	}
 
@@ -358,7 +358,7 @@ static void mq_flush_data_end_io(struct request *rq, int error)
 	unsigned long flags;
 	struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx);
 
-	hctx = q->mq_ops->map_queue(q, ctx->cpu);
+	hctx = blk_mq_map_queue(q, ctx->cpu);
 
 	/*
 	 * After populating an empty queue, kick it to avoid stall.  Read
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 729bac3..1602813 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -301,8 +301,7 @@ static int bt_get(struct blk_mq_alloc_data *data,
 		io_schedule();
 
 		data->ctx = blk_mq_get_ctx(data->q);
-		data->hctx = data->q->mq_ops->map_queue(data->q,
-				data->ctx->cpu);
+		data->hctx = blk_mq_map_queue(data->q, data->ctx->cpu);
 		if (data->flags & BLK_MQ_REQ_RESERVED) {
 			bt = &data->hctx->tags->breserved_tags;
 		} else {
@@ -726,7 +725,7 @@ u32 blk_mq_unique_tag(struct request *rq)
 	int hwq = 0;
 
 	if (q->mq_ops) {
-		hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu);
+		hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
 		hwq = hctx->queue_num;
 	}
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 15c36c1..434df39 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -244,7 +244,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
 		return ERR_PTR(ret);
 
 	ctx = blk_mq_get_ctx(q);
-	hctx = q->mq_ops->map_queue(q, ctx->cpu);
+	hctx = blk_mq_map_queue(q, ctx->cpu);
 	blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
 
 	rq = __blk_mq_alloc_request(&alloc_data, rw, 0);
@@ -253,7 +253,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
 		blk_mq_put_ctx(ctx);
 
 		ctx = blk_mq_get_ctx(q);
-		hctx = q->mq_ops->map_queue(q, ctx->cpu);
+		hctx = blk_mq_map_queue(q, ctx->cpu);
 		blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
 		rq =  __blk_mq_alloc_request(&alloc_data, rw, 0);
 		ctx = alloc_data.ctx;
@@ -337,11 +337,7 @@ EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);
 
 void blk_mq_free_request(struct request *rq)
 {
-	struct blk_mq_hw_ctx *hctx;
-	struct request_queue *q = rq->q;
-
-	hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu);
-	blk_mq_free_hctx_request(hctx, rq);
+	blk_mq_free_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
 }
 EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
@@ -1064,9 +1060,7 @@ void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
 {
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
 	struct request_queue *q = rq->q;
-	struct blk_mq_hw_ctx *hctx;
-
-	hctx = q->mq_ops->map_queue(q, ctx->cpu);
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
 
 	spin_lock(&ctx->lock);
 	__blk_mq_insert_request(hctx, rq, at_head);
@@ -1083,12 +1077,10 @@ static void blk_mq_insert_requests(struct request_queue *q,
 				     bool from_schedule)
 
 {
-	struct blk_mq_hw_ctx *hctx;
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
 
 	trace_block_unplug(q, depth, !from_schedule);
 
-	hctx = q->mq_ops->map_queue(q, ctx->cpu);
-
 	/*
 	 * preemption doesn't flush plug list, so it's possible ctx->cpu is
 	 * offline now
@@ -1222,7 +1214,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 
 	blk_queue_enter_live(q);
 	ctx = blk_mq_get_ctx(q);
-	hctx = q->mq_ops->map_queue(q, ctx->cpu);
+	hctx = blk_mq_map_queue(q, ctx->cpu);
 
 	if (rw_is_sync(bio_op(bio), bio->bi_opf))
 		op_flags |= REQ_SYNC;
@@ -1236,7 +1228,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 		trace_block_sleeprq(q, bio, op);
 
 		ctx = blk_mq_get_ctx(q);
-		hctx = q->mq_ops->map_queue(q, ctx->cpu);
+		hctx = blk_mq_map_queue(q, ctx->cpu);
 		blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx);
 		rq = __blk_mq_alloc_request(&alloc_data, op, op_flags);
 		ctx = alloc_data.ctx;
@@ -1253,8 +1245,7 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
 {
 	int ret;
 	struct request_queue *q = rq->q;
-	struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q,
-			rq->mq_ctx->cpu);
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
 	struct blk_mq_queue_data bd = {
 		.rq = rq,
 		.list = NULL,
@@ -1458,15 +1449,6 @@ run_queue:
 	return cookie;
 }
 
-/*
- * Default mapping to a software queue, since we use one per CPU.
- */
-struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
-{
-	return q->queue_hw_ctx[q->mq_map[cpu]];
-}
-EXPORT_SYMBOL(blk_mq_map_queue);
-
 static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
 		struct blk_mq_tags *tags, unsigned int hctx_idx)
 {
@@ -1800,7 +1782,7 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 		if (!cpu_online(i))
 			continue;
 
-		hctx = q->mq_ops->map_queue(q, i);
+		hctx = blk_mq_map_queue(q, i);
 
 		/*
 		 * Set local node, IFF we have more than one hw queue. If
@@ -1838,7 +1820,7 @@ static void blk_mq_map_swqueue(struct request_queue *q,
 			continue;
 
 		ctx = per_cpu_ptr(q->queue_ctx, i);
-		hctx = q->mq_ops->map_queue(q, i);
+		hctx = blk_mq_map_queue(q, i);
 
 		cpumask_set_cpu(i, hctx->cpumask);
 		ctx->index_hw = hctx->nr_ctx;
@@ -2303,7 +2285,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 	if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
 		return -EINVAL;
 
-	if (!set->ops->queue_rq || !set->ops->map_queue)
+	if (!set->ops->queue_rq)
 		return -EINVAL;
 
 	if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 9087b11..ec774bf 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -52,6 +52,12 @@ extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
 				   const struct cpumask *online_mask);
 extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
 
+static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
+		int cpu)
+{
+	return q->queue_hw_ctx[q->mq_map[cpu]];
+}
+
 /*
  * sysfs helpers
  */
diff --git a/block/blk.h b/block/blk.h
index c37492f..74444c4 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -39,14 +39,9 @@ extern struct ida blk_queue_ida;
 static inline struct blk_flush_queue *blk_get_flush_queue(
 		struct request_queue *q, struct blk_mq_ctx *ctx)
 {
-	struct blk_mq_hw_ctx *hctx;
-
-	if (!q->mq_ops)
-		return q->fq;
-
-	hctx = q->mq_ops->map_queue(q, ctx->cpu);
-
-	return hctx->fq;
+	if (q->mq_ops)
+		return blk_mq_map_queue(q, ctx->cpu)->fq;
+	return q->fq;
 }
 
 static inline void __blk_get_queue(struct request_queue *q)
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index c9f2107..cbdb3b1 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1703,7 +1703,6 @@ static int loop_init_request(void *data, struct request *rq,
 
 static struct blk_mq_ops loop_mq_ops = {
 	.queue_rq       = loop_queue_rq,
-	.map_queue      = blk_mq_map_queue,
 	.init_request	= loop_init_request,
 };
 
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 2aca98e..3cc92e9 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3895,7 +3895,6 @@ exit_handler:
 
 static struct blk_mq_ops mtip_mq_ops = {
 	.queue_rq	= mtip_queue_rq,
-	.map_queue	= blk_mq_map_queue,
 	.init_request	= mtip_init_cmd,
 	.exit_request	= mtip_free_cmd,
 	.complete	= mtip_softirq_done_fn,
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 75a7f88..7d3b7d6 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -393,7 +393,6 @@ static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 
 static struct blk_mq_ops null_mq_ops = {
 	.queue_rq       = null_queue_rq,
-	.map_queue      = blk_mq_map_queue,
 	.init_hctx	= null_init_hctx,
 	.complete	= null_softirq_done_fn,
 };
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 6c6519f..c1f84df 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3621,7 +3621,6 @@ static int rbd_init_request(void *data, struct request *rq,
 
 static struct blk_mq_ops rbd_mq_ops = {
 	.queue_rq	= rbd_queue_rq,
-	.map_queue	= blk_mq_map_queue,
 	.init_request	= rbd_init_request,
 };
 
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 93b1aaa..2dc5c96 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -542,7 +542,6 @@ static int virtblk_init_request(void *data, struct request *rq,
 
 static struct blk_mq_ops virtio_mq_ops = {
 	.queue_rq	= virtio_queue_rq,
-	.map_queue	= blk_mq_map_queue,
 	.complete	= virtblk_request_done,
 	.init_request	= virtblk_init_request,
 };
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 88ef6d4..9908597 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -909,7 +909,6 @@ out_busy:
 
 static struct blk_mq_ops blkfront_mq_ops = {
 	.queue_rq = blkif_queue_rq,
-	.map_queue = blk_mq_map_queue,
 };
 
 static void blkif_set_queue_limits(struct blkfront_info *info)
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 1ca7463..d1c3645 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -908,7 +908,6 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 static struct blk_mq_ops dm_mq_ops = {
 	.queue_rq = dm_mq_queue_rq,
-	.map_queue = blk_mq_map_queue,
 	.complete = dm_softirq_done,
 	.init_request = dm_mq_init_request,
 };
diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c
index ebf46ad..d1e6931 100644
--- a/drivers/mtd/ubi/block.c
+++ b/drivers/mtd/ubi/block.c
@@ -351,7 +351,6 @@ static int ubiblock_init_request(void *data, struct request *req,
 static struct blk_mq_ops ubiblock_mq_ops = {
 	.queue_rq       = ubiblock_queue_rq,
 	.init_request	= ubiblock_init_request,
-	.map_queue      = blk_mq_map_queue,
 };
 
 static DEFINE_IDR(ubiblock_minor_idr);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8dcf5a9..086fd7e 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1131,7 +1131,6 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
 static struct blk_mq_ops nvme_mq_admin_ops = {
 	.queue_rq	= nvme_queue_rq,
 	.complete	= nvme_complete_rq,
-	.map_queue	= blk_mq_map_queue,
 	.init_hctx	= nvme_admin_init_hctx,
 	.exit_hctx      = nvme_admin_exit_hctx,
 	.init_request	= nvme_admin_init_request,
@@ -1141,7 +1140,6 @@ static struct blk_mq_ops nvme_mq_admin_ops = {
 static struct blk_mq_ops nvme_mq_ops = {
 	.queue_rq	= nvme_queue_rq,
 	.complete	= nvme_complete_rq,
-	.map_queue	= blk_mq_map_queue,
 	.init_hctx	= nvme_init_hctx,
 	.init_request	= nvme_init_request,
 	.timeout	= nvme_timeout,
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index ab545fb..9bbd886 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1531,7 +1531,6 @@ static void nvme_rdma_complete_rq(struct request *rq)
 static struct blk_mq_ops nvme_rdma_mq_ops = {
 	.queue_rq	= nvme_rdma_queue_rq,
 	.complete	= nvme_rdma_complete_rq,
-	.map_queue	= blk_mq_map_queue,
 	.init_request	= nvme_rdma_init_request,
 	.exit_request	= nvme_rdma_exit_request,
 	.reinit_request	= nvme_rdma_reinit_request,
@@ -1543,7 +1542,6 @@ static struct blk_mq_ops nvme_rdma_mq_ops = {
 static struct blk_mq_ops nvme_rdma_admin_mq_ops = {
 	.queue_rq	= nvme_rdma_queue_rq,
 	.complete	= nvme_rdma_complete_rq,
-	.map_queue	= blk_mq_map_queue,
 	.init_request	= nvme_rdma_init_admin_request,
 	.exit_request	= nvme_rdma_exit_admin_request,
 	.reinit_request	= nvme_rdma_reinit_request,
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 395e60d..d5df77d 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -273,7 +273,6 @@ static int nvme_loop_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 static struct blk_mq_ops nvme_loop_mq_ops = {
 	.queue_rq	= nvme_loop_queue_rq,
 	.complete	= nvme_loop_complete_rq,
-	.map_queue	= blk_mq_map_queue,
 	.init_request	= nvme_loop_init_request,
 	.init_hctx	= nvme_loop_init_hctx,
 	.timeout	= nvme_loop_timeout,
@@ -282,7 +281,6 @@ static struct blk_mq_ops nvme_loop_mq_ops = {
 static struct blk_mq_ops nvme_loop_admin_mq_ops = {
 	.queue_rq	= nvme_loop_queue_rq,
 	.complete	= nvme_loop_complete_rq,
-	.map_queue	= blk_mq_map_queue,
 	.init_request	= nvme_loop_init_admin_request,
 	.init_hctx	= nvme_loop_init_admin_hctx,
 	.timeout	= nvme_loop_timeout,
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index c71344a..2cca9cf 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -2077,7 +2077,6 @@ struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
 }
 
 static struct blk_mq_ops scsi_mq_ops = {
-	.map_queue	= blk_mq_map_queue,
 	.queue_rq	= scsi_queue_rq,
 	.complete	= scsi_softirq_done,
 	.timeout	= scsi_timeout,
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index a572227..d4d8bc8 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -89,7 +89,6 @@ struct blk_mq_queue_data {
 };
 
 typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *);
-typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int);
 typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
 typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
 typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
@@ -112,11 +111,6 @@ struct blk_mq_ops {
 	queue_rq_fn		*queue_rq;
 
 	/*
-	 * Map to specific hardware queue
-	 */
-	map_queue_fn		*map_queue;
-
-	/*
 	 * Called on request timeout
 	 */
 	timeout_fn		*timeout;
@@ -221,7 +215,6 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
 	return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
 }
 
-struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
 struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
 
 int blk_mq_request_started(struct request *rq);
-- 
2.1.4
^ permalink raw reply related	[flat|nested] 33+ messages in thread
* [PATCH 09/13] blk-mq: allow the driver to pass in a queue mapping
  2016-09-14 14:18 blk-mq: allow passing in an external queue mapping V3 Christoph Hellwig
                   ` (7 preceding siblings ...)
  2016-09-14 14:18 ` [PATCH 08/13] blk-mq: remove ->map_queue Christoph Hellwig
@ 2016-09-14 14:18 ` Christoph Hellwig
  2016-09-14 14:18 ` [PATCH 10/13] blk-mq: provide a default queue mapping for PCI device Christoph Hellwig
                   ` (4 subsequent siblings)
  13 siblings, 0 replies; 33+ messages in thread
From: Christoph Hellwig @ 2016-09-14 14:18 UTC (permalink / raw)
  To: axboe, tglx; +Cc: agordeev, keith.busch, linux-block, linux-kernel
This allows drivers to specify their own queue mapping by overriding the
setup-time function that builds the mq_map.  This can be used for
example to build the map based on the MSI-X vector mapping provided
by the core interrupt layer for PCI devices.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-mq-cpumap.c  | 25 +++++--------------------
 block/blk-mq.c         | 18 +++++++++++++++---
 block/blk-mq.h         |  4 +---
 include/linux/blk-mq.h |  3 +++
 4 files changed, 24 insertions(+), 26 deletions(-)
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index d0634bc..19b1d9c 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -31,14 +31,16 @@ static int get_first_sibling(unsigned int cpu)
 	return cpu;
 }
 
-int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
-			    const struct cpumask *online_mask)
+int blk_mq_map_queues(struct blk_mq_tag_set *set)
 {
+	unsigned int *map = set->mq_map;
+	unsigned int nr_queues = set->nr_hw_queues;
+	const struct cpumask *online_mask = cpu_online_mask;
 	unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
 	cpumask_var_t cpus;
 
 	if (!alloc_cpumask_var(&cpus, GFP_ATOMIC))
-		return 1;
+		return -ENOMEM;
 
 	cpumask_clear(cpus);
 	nr_cpus = nr_uniq_cpus = 0;
@@ -86,23 +88,6 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
 	return 0;
 }
 
-unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
-{
-	unsigned int *map;
-
-	/* If cpus are offline, map them to first hctx */
-	map = kzalloc_node(sizeof(*map) * nr_cpu_ids, GFP_KERNEL,
-				set->numa_node);
-	if (!map)
-		return NULL;
-
-	if (!blk_mq_update_queue_map(map, set->nr_hw_queues, cpu_online_mask))
-		return map;
-
-	kfree(map);
-	return NULL;
-}
-
 /*
  * We have no quick way of doing reverse lookups. This is only used at
  * queue init time, so runtime isn't important.
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 434df39..f3ef898 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2276,6 +2276,8 @@ EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask);
  */
 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 {
+	int ret;
+
 	BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
 
 	if (!set->nr_hw_queues)
@@ -2314,11 +2316,21 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 	if (!set->tags)
 		return -ENOMEM;
 
-	set->mq_map = blk_mq_make_queue_map(set);
+	ret = -ENOMEM;
+	set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
+			GFP_KERNEL, set->numa_node);
 	if (!set->mq_map)
 		goto out_free_tags;
 
-	if (blk_mq_alloc_rq_maps(set))
+	if (set->ops->map_queues)
+		ret = set->ops->map_queues(set);
+	else
+		ret = blk_mq_map_queues(set);
+	if (ret)
+		goto out_free_mq_map;
+
+	ret = blk_mq_alloc_rq_maps(set);
+	if (ret)
 		goto out_free_mq_map;
 
 	mutex_init(&set->tag_list_lock);
@@ -2332,7 +2344,7 @@ out_free_mq_map:
 out_free_tags:
 	kfree(set->tags);
 	set->tags = NULL;
-	return -ENOMEM;
+	return ret;
 }
 EXPORT_SYMBOL(blk_mq_alloc_tag_set);
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index ec774bf..c92bb7d 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -47,9 +47,7 @@ void blk_mq_disable_hotplug(void);
 /*
  * CPU -> queue mappings
  */
-extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
-extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
-				   const struct cpumask *online_mask);
+int blk_mq_map_queues(struct blk_mq_tag_set *set);
 extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
 
 static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d4d8bc8..ead450a 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -102,6 +102,7 @@ typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
 		bool);
 typedef void (busy_tag_iter_fn)(struct request *, void *, bool);
 typedef int (poll_fn)(struct blk_mq_hw_ctx *, unsigned int);
+typedef int (map_queues_fn)(struct blk_mq_tag_set *set);
 
 
 struct blk_mq_ops {
@@ -142,6 +143,8 @@ struct blk_mq_ops {
 	init_request_fn		*init_request;
 	exit_request_fn		*exit_request;
 	reinit_request_fn	*reinit_request;
+
+	map_queues_fn		*map_queues;
 };
 
 enum {
-- 
2.1.4
^ permalink raw reply related	[flat|nested] 33+ messages in thread
* [PATCH 10/13] blk-mq: provide a default queue mapping for PCI device
  2016-09-14 14:18 blk-mq: allow passing in an external queue mapping V3 Christoph Hellwig
                   ` (8 preceding siblings ...)
  2016-09-14 14:18 ` [PATCH 09/13] blk-mq: allow the driver to pass in a queue mapping Christoph Hellwig
@ 2016-09-14 14:18 ` Christoph Hellwig
  2016-09-19  7:33   ` Alexander Gordeev
  2016-09-14 14:18 ` [PATCH 11/13] nvme: switch to use pci_alloc_irq_vectors Christoph Hellwig
                   ` (3 subsequent siblings)
  13 siblings, 1 reply; 33+ messages in thread
From: Christoph Hellwig @ 2016-09-14 14:18 UTC (permalink / raw)
  To: axboe, tglx; +Cc: agordeev, keith.busch, linux-block, linux-kernel
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/Makefile             |  2 +-
 block/blk-mq-pci.c         | 45 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/blk-mq-pci.h |  9 +++++++++
 3 files changed, 55 insertions(+), 1 deletion(-)
 create mode 100644 block/blk-mq-pci.c
 create mode 100644 include/linux/blk-mq-pci.h
diff --git a/block/Makefile b/block/Makefile
index 9eda232..2447a0b 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -22,4 +22,4 @@ obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
 obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
 obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o
 obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o
-
+obj-$(CONFIG_PCI)		+= blk-mq-pci.o
diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c
new file mode 100644
index 0000000..33c7bd7
--- /dev/null
+++ b/block/blk-mq-pci.c
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2016 Christoph Hellwig.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#include <linux/blk-mq.h>
+#include <linux/blk-mq-pci.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+
+/**
+ * blk_mq_pci_map_queues - provide a default queue mapping for PCI device
+ * @set:	tagset to provide the mapping for
+ * @pdev:	PCI device associated with @set.
+ *
+ * This function assumes the PCI device @pdev has at least as many available
+ * interrupt vectors as @set has queues.  It will then query the vector
+ * corresponding to each queue for its affinity mask and build a queue mapping
+ * that maps a queue to the CPUs that have irq affinity for the corresponding
+ * vector.
+ */
+int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev)
+{
+	const struct cpumask *mask;
+	unsigned int queue, cpu;
+
+	for (queue = 0; queue < set->nr_hw_queues; queue++) {
+		mask = pci_irq_get_affinity(pdev, queue);
+		if (!mask)
+			return -EINVAL;
+
+		for_each_cpu(cpu, mask)
+			set->mq_map[cpu] = queue;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues);
diff --git a/include/linux/blk-mq-pci.h b/include/linux/blk-mq-pci.h
new file mode 100644
index 0000000..6ab5952
--- /dev/null
+++ b/include/linux/blk-mq-pci.h
@@ -0,0 +1,9 @@
+#ifndef _LINUX_BLK_MQ_PCI_H
+#define _LINUX_BLK_MQ_PCI_H
+
+struct blk_mq_tag_set;
+struct pci_dev;
+
+int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev);
+
+#endif /* _LINUX_BLK_MQ_PCI_H */
-- 
2.1.4
^ permalink raw reply related	[flat|nested] 33+ messages in thread
* [PATCH 11/13] nvme: switch to use pci_alloc_irq_vectors
  2016-09-14 14:18 blk-mq: allow passing in an external queue mapping V3 Christoph Hellwig
                   ` (9 preceding siblings ...)
  2016-09-14 14:18 ` [PATCH 10/13] blk-mq: provide a default queue mapping for PCI device Christoph Hellwig
@ 2016-09-14 14:18 ` Christoph Hellwig
  2016-09-23 22:21   ` Sagi Grimberg
  2016-09-14 14:18 ` [PATCH 12/13] nvme: remove the post_scan callout Christoph Hellwig
                   ` (2 subsequent siblings)
  13 siblings, 1 reply; 33+ messages in thread
From: Christoph Hellwig @ 2016-09-14 14:18 UTC (permalink / raw)
  To: axboe, tglx; +Cc: agordeev, keith.busch, linux-block, linux-kernel
Use the new helper to automatically select the right interrupt type, as
well as to use the automatic interrupt affinity assignment.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/pci.c | 107 ++++++++++++++++--------------------------------
 1 file changed, 36 insertions(+), 71 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 086fd7e..47a44e9 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -16,6 +16,7 @@
 #include <linux/bitops.h>
 #include <linux/blkdev.h>
 #include <linux/blk-mq.h>
+#include <linux/blk-mq-pci.h>
 #include <linux/cpu.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
@@ -88,7 +89,6 @@ struct nvme_dev {
 	unsigned max_qid;
 	int q_depth;
 	u32 db_stride;
-	struct msix_entry *entry;
 	void __iomem *bar;
 	struct work_struct reset_work;
 	struct work_struct remove_work;
@@ -201,6 +201,11 @@ static unsigned int nvme_cmd_size(struct nvme_dev *dev)
 		nvme_iod_alloc_size(dev, NVME_INT_BYTES(dev), NVME_INT_PAGES);
 }
 
+static int nvmeq_irq(struct nvme_queue *nvmeq)
+{
+	return pci_irq_vector(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector);
+}
+
 static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 				unsigned int hctx_idx)
 {
@@ -263,6 +268,13 @@ static int nvme_init_request(void *data, struct request *req,
 	return 0;
 }
 
+static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
+{
+	struct nvme_dev *dev = set->driver_data;
+
+	return blk_mq_pci_map_queues(set, to_pci_dev(dev->dev));
+}
+
 /**
  * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
  * @nvmeq: The queue to use
@@ -960,7 +972,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
 		spin_unlock_irq(&nvmeq->q_lock);
 		return 1;
 	}
-	vector = nvmeq->dev->entry[nvmeq->cq_vector].vector;
+	vector = nvmeq_irq(nvmeq);
 	nvmeq->dev->online_queues--;
 	nvmeq->cq_vector = -1;
 	spin_unlock_irq(&nvmeq->q_lock);
@@ -968,7 +980,6 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
 	if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
 		blk_mq_stop_hw_queues(nvmeq->dev->ctrl.admin_q);
 
-	irq_set_affinity_hint(vector, NULL);
 	free_irq(vector, nvmeq);
 
 	return 0;
@@ -1075,15 +1086,14 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
 	return NULL;
 }
 
-static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
-							const char *name)
+static int queue_request_irq(struct nvme_queue *nvmeq)
 {
 	if (use_threaded_interrupts)
-		return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
-					nvme_irq_check, nvme_irq, IRQF_SHARED,
-					name, nvmeq);
-	return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
-				IRQF_SHARED, name, nvmeq);
+		return request_threaded_irq(nvmeq_irq(nvmeq), nvme_irq_check,
+				nvme_irq, IRQF_SHARED, nvmeq->irqname, nvmeq);
+	else
+		return request_irq(nvmeq_irq(nvmeq), nvme_irq, IRQF_SHARED,
+				nvmeq->irqname, nvmeq);
 }
 
 static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
@@ -1114,7 +1124,7 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
 	if (result < 0)
 		goto release_cq;
 
-	result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
+	result = queue_request_irq(nvmeq);
 	if (result < 0)
 		goto release_sq;
 
@@ -1142,6 +1152,7 @@ static struct blk_mq_ops nvme_mq_ops = {
 	.complete	= nvme_complete_rq,
 	.init_hctx	= nvme_init_hctx,
 	.init_request	= nvme_init_request,
+	.map_queues	= nvme_pci_map_queues,
 	.timeout	= nvme_timeout,
 	.poll		= nvme_poll,
 };
@@ -1232,7 +1243,7 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
 		goto free_nvmeq;
 
 	nvmeq->cq_vector = 0;
-	result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
+	result = queue_request_irq(nvmeq);
 	if (result) {
 		nvmeq->cq_vector = -1;
 		goto free_nvmeq;
@@ -1380,7 +1391,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 {
 	struct nvme_queue *adminq = dev->queues[0];
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
-	int result, i, vecs, nr_io_queues, size;
+	int result, nr_io_queues, size;
 
 	nr_io_queues = num_online_cpus();
 	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
@@ -1415,29 +1426,18 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	}
 
 	/* Deregister the admin queue's interrupt */
-	free_irq(dev->entry[0].vector, adminq);
+	free_irq(pci_irq_vector(pdev, 0), adminq);
 
 	/*
 	 * If we enable msix early due to not intx, disable it again before
 	 * setting up the full range we need.
 	 */
-	if (pdev->msi_enabled)
-		pci_disable_msi(pdev);
-	else if (pdev->msix_enabled)
-		pci_disable_msix(pdev);
-
-	for (i = 0; i < nr_io_queues; i++)
-		dev->entry[i].entry = i;
-	vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues);
-	if (vecs < 0) {
-		vecs = pci_enable_msi_range(pdev, 1, min(nr_io_queues, 32));
-		if (vecs < 0) {
-			vecs = 1;
-		} else {
-			for (i = 0; i < vecs; i++)
-				dev->entry[i].vector = i + pdev->irq;
-		}
-	}
+	pci_free_irq_vectors(pdev);
+	nr_io_queues = pci_alloc_irq_vectors(pdev, 1, nr_io_queues,
+			PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY);
+	if (nr_io_queues <= 0)
+		return -EIO;
+	dev->max_qid = nr_io_queues;
 
 	/*
 	 * Should investigate if there's a performance win from allocating
@@ -1445,10 +1445,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	 * path to scale better, even if the receive path is limited by the
 	 * number of interrupts.
 	 */
-	nr_io_queues = vecs;
-	dev->max_qid = nr_io_queues;
 
-	result = queue_request_irq(dev, adminq, adminq->irqname);
+	result = queue_request_irq(adminq);
 	if (result) {
 		adminq->cq_vector = -1;
 		goto free_queues;
@@ -1460,23 +1458,6 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	return result;
 }
 
-static void nvme_pci_post_scan(struct nvme_ctrl *ctrl)
-{
-	struct nvme_dev *dev = to_nvme_dev(ctrl);
-	struct nvme_queue *nvmeq;
-	int i;
-
-	for (i = 0; i < dev->online_queues; i++) {
-		nvmeq = dev->queues[i];
-
-		if (!nvmeq->tags || !(*nvmeq->tags))
-			continue;
-
-		irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
-					blk_mq_tags_cpumask(*nvmeq->tags));
-	}
-}
-
 static void nvme_del_queue_end(struct request *req, int error)
 {
 	struct nvme_queue *nvmeq = req->end_io_data;
@@ -1613,15 +1594,9 @@ static int nvme_pci_enable(struct nvme_dev *dev)
 	 * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll
 	 * adjust this later.
 	 */
-	if (pci_enable_msix(pdev, dev->entry, 1)) {
-		pci_enable_msi(pdev);
-		dev->entry[0].vector = pdev->irq;
-	}
-
-	if (!dev->entry[0].vector) {
-		result = -ENODEV;
-		goto disable;
-	}
+	result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
+	if (result < 0)
+		return result;
 
 	cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
 
@@ -1663,10 +1638,7 @@ static void nvme_pci_disable(struct nvme_dev *dev)
 {
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
-	if (pdev->msi_enabled)
-		pci_disable_msi(pdev);
-	else if (pdev->msix_enabled)
-		pci_disable_msix(pdev);
+	pci_free_irq_vectors(pdev);
 
 	if (pci_is_enabled(pdev)) {
 		pci_disable_pcie_error_reporting(pdev);
@@ -1736,7 +1708,6 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
 	if (dev->ctrl.admin_q)
 		blk_put_queue(dev->ctrl.admin_q);
 	kfree(dev->queues);
-	kfree(dev->entry);
 	kfree(dev);
 }
 
@@ -1880,7 +1851,6 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
 	.reg_read64		= nvme_pci_reg_read64,
 	.reset_ctrl		= nvme_pci_reset_ctrl,
 	.free_ctrl		= nvme_pci_free_ctrl,
-	.post_scan		= nvme_pci_post_scan,
 	.submit_async_event	= nvme_pci_submit_async_event,
 };
 
@@ -1913,10 +1883,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
 	if (!dev)
 		return -ENOMEM;
-	dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry),
-							GFP_KERNEL, node);
-	if (!dev->entry)
-		goto free;
 	dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
 							GFP_KERNEL, node);
 	if (!dev->queues)
@@ -1957,7 +1923,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	nvme_dev_unmap(dev);
  free:
 	kfree(dev->queues);
-	kfree(dev->entry);
 	kfree(dev);
 	return result;
 }
-- 
2.1.4
^ permalink raw reply related	[flat|nested] 33+ messages in thread
* [PATCH 12/13] nvme: remove the post_scan callout
  2016-09-14 14:18 blk-mq: allow passing in an external queue mapping V3 Christoph Hellwig
                   ` (10 preceding siblings ...)
  2016-09-14 14:18 ` [PATCH 11/13] nvme: switch to use pci_alloc_irq_vectors Christoph Hellwig
@ 2016-09-14 14:18 ` Christoph Hellwig
  2016-09-14 14:18 ` [PATCH 13/13] blk-mq: get rid of the cpumask in struct blk_mq_tags Christoph Hellwig
  2016-09-15 14:40 ` blk-mq: allow passing in an external queue mapping V3 Keith Busch
  13 siblings, 0 replies; 33+ messages in thread
From: Christoph Hellwig @ 2016-09-14 14:18 UTC (permalink / raw)
  To: axboe, tglx; +Cc: agordeev, keith.busch, linux-block, linux-kernel
No need now that we don't have to reverse engineer the irq affinity.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 3 ---
 drivers/nvme/host/nvme.h | 1 -
 2 files changed, 4 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2feacc7..b245616 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1826,9 +1826,6 @@ static void nvme_scan_work(struct work_struct *work)
 	list_sort(NULL, &ctrl->namespaces, ns_cmp);
 	mutex_unlock(&ctrl->namespaces_mutex);
 	kfree(id);
-
-	if (ctrl->ops->post_scan)
-		ctrl->ops->post_scan(ctrl);
 }
 
 void nvme_queue_scan(struct nvme_ctrl *ctrl)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ab18b78..99e4c16 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -184,7 +184,6 @@ struct nvme_ctrl_ops {
 	int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val);
 	int (*reset_ctrl)(struct nvme_ctrl *ctrl);
 	void (*free_ctrl)(struct nvme_ctrl *ctrl);
-	void (*post_scan)(struct nvme_ctrl *ctrl);
 	void (*submit_async_event)(struct nvme_ctrl *ctrl, int aer_idx);
 	int (*delete_ctrl)(struct nvme_ctrl *ctrl);
 	const char *(*get_subsysnqn)(struct nvme_ctrl *ctrl);
-- 
2.1.4
^ permalink raw reply related	[flat|nested] 33+ messages in thread
* [PATCH 13/13] blk-mq: get rid of the cpumask in struct blk_mq_tags
  2016-09-14 14:18 blk-mq: allow passing in an external queue mapping V3 Christoph Hellwig
                   ` (11 preceding siblings ...)
  2016-09-14 14:18 ` [PATCH 12/13] nvme: remove the post_scan callout Christoph Hellwig
@ 2016-09-14 14:18 ` Christoph Hellwig
  2016-09-15 14:44   ` Christoph Hellwig
  2016-09-15 14:40 ` blk-mq: allow passing in an external queue mapping V3 Keith Busch
  13 siblings, 1 reply; 33+ messages in thread
From: Christoph Hellwig @ 2016-09-14 14:18 UTC (permalink / raw)
  To: axboe, tglx; +Cc: agordeev, keith.busch, linux-block, linux-kernel
Unused now that NVMe sets up irq affinity before calling into blk-mq.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-mq-tag.c     |  6 ------
 block/blk-mq-tag.h     |  1 -
 block/blk-mq.c         | 25 +++++++++++++++++++++----
 include/linux/blk-mq.h |  1 -
 4 files changed, 21 insertions(+), 12 deletions(-)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 1602813..2eae3d5 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -665,11 +665,6 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
 	if (!tags)
 		return NULL;
 
-	if (!zalloc_cpumask_var(&tags->cpumask, GFP_KERNEL)) {
-		kfree(tags);
-		return NULL;
-	}
-
 	tags->nr_tags = total_tags;
 	tags->nr_reserved_tags = reserved_tags;
 
@@ -680,7 +675,6 @@ void blk_mq_free_tags(struct blk_mq_tags *tags)
 {
 	bt_free(&tags->bitmap_tags);
 	bt_free(&tags->breserved_tags);
-	free_cpumask_var(tags->cpumask);
 	kfree(tags);
 }
 
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index d468a79..5569641 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -44,7 +44,6 @@ struct blk_mq_tags {
 	struct list_head page_list;
 
 	int alloc_policy;
-	cpumask_var_t cpumask;
 };
 
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f3ef898..4411dbb 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1851,7 +1851,6 @@ static void blk_mq_map_swqueue(struct request_queue *q,
 		hctx->tags = set->tags[i];
 		WARN_ON(!hctx->tags);
 
-		cpumask_copy(hctx->tags->cpumask, hctx->cpumask);
 		/*
 		 * Set the map size to the number of mapped software queues.
 		 * This is more accurate and more efficient than looping
@@ -2262,11 +2261,29 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 	return 0;
 }
 
-struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags)
+static int blk_mq_create_mq_map(struct blk_mq_tag_set *set,
+		const struct cpumask *affinity_mask)
 {
-	return tags->cpumask;
+	int queue = -1, cpu = 0;
+
+	set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
+			GFP_KERNEL, set->numa_node);
+	if (!set->mq_map)
+		return -ENOMEM;
+
+	if (!affinity_mask)
+		return 0;	/* map all cpus to queue 0 */
+
+	/* If cpus are offline, map them to first hctx */
+	for_each_online_cpu(cpu) {
+		if (cpumask_test_cpu(cpu, affinity_mask))
+			queue++;
+		if (queue >= 0)
+			set->mq_map[cpu] = queue;
+	}
+
+	return 0;
 }
-EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask);
 
 /*
  * Alloc a tag set to be associated with one or more request queues.
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ead450a..07589a2 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -199,7 +199,6 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
 struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int op,
 		unsigned int flags, unsigned int hctx_idx);
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
-struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags);
 
 enum {
 	BLK_MQ_UNIQUE_TAG_BITS = 16,
-- 
2.1.4
^ permalink raw reply related	[flat|nested] 33+ messages in thread
* Re: blk-mq: allow passing in an external queue mapping V3
  2016-09-15 14:40 ` blk-mq: allow passing in an external queue mapping V3 Keith Busch
@ 2016-09-15 14:32   ` Christoph Hellwig
  2016-09-15 14:34     ` Jens Axboe
  0 siblings, 1 reply; 33+ messages in thread
From: Christoph Hellwig @ 2016-09-15 14:32 UTC (permalink / raw)
  To: Keith Busch
  Cc: Christoph Hellwig, axboe, tglx, agordeev, linux-block,
	linux-kernel
Thanks for all the testing and the review Keith, as well as the
fixes earlier.
Jens, what do you think of the series?
Thomas has added the first 5 patches to
https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git/log/?h=irq/for-block
so it would be great if we could pull that into a block branch and
get the rest into the block tree sooner or later.
^ permalink raw reply	[flat|nested] 33+ messages in thread
* Re: blk-mq: allow passing in an external queue mapping V3
  2016-09-15 14:32   ` Christoph Hellwig
@ 2016-09-15 14:34     ` Jens Axboe
  2016-09-15 14:42       ` Christoph Hellwig
  0 siblings, 1 reply; 33+ messages in thread
From: Jens Axboe @ 2016-09-15 14:34 UTC (permalink / raw)
  To: Christoph Hellwig, Keith Busch; +Cc: tglx, agordeev, linux-block, linux-kernel
On 09/15/2016 08:32 AM, Christoph Hellwig wrote:
> Thanks for all the testing and the review Keith, as well as the
> fixes earlier.
>
> Jens, what do you think of the series?
>
> Thomas has added the first 5 patches to
>
> https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git/log/?h=irq/for-block
>
> so it would be great if we could pull that into a block branch and
> get the rest into the block tree sooner or later.
I was going to ask about splitting it, but that looks fine, I can pull
that in.
The series looks fine to me. My only real concern is giving drivers the
flexibility to define mappings, I don't want that to evolve into drivers
(again) doing stupid things wrt mappings. As long as we keep it strictly
as a tunnel for passing mappings defined by the (previous blk-mq) core
code, then that's fine.
-- 
Jens Axboe
^ permalink raw reply	[flat|nested] 33+ messages in thread
* Re: blk-mq: allow passing in an external queue mapping V3
  2016-09-14 14:18 blk-mq: allow passing in an external queue mapping V3 Christoph Hellwig
                   ` (12 preceding siblings ...)
  2016-09-14 14:18 ` [PATCH 13/13] blk-mq: get rid of the cpumask in struct blk_mq_tags Christoph Hellwig
@ 2016-09-15 14:40 ` Keith Busch
  2016-09-15 14:32   ` Christoph Hellwig
  13 siblings, 1 reply; 33+ messages in thread
From: Keith Busch @ 2016-09-15 14:40 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: axboe, tglx, agordeev, linux-block, linux-kernel
On Wed, Sep 14, 2016 at 04:18:46PM +0200, Christoph Hellwig wrote:
> This series is the remainder of the earlier "automatic interrupt affinity for
> MSI/MSI-X capable devices" series, and make uses of the new irq-level
> interrupt / queue mapping code in blk-mq, as well as allowing the driver
> to pass in such a mask obtained from the (PCI) interrupt code.  To fully
> support this feature in drivers the final third in the PCI layer will
> be needed as well.
Thanks, this looks good and tests successfully on my hardware.
For the series:
Reviewed-by: Keith Busch <keith.busch@intel.com>
^ permalink raw reply	[flat|nested] 33+ messages in thread
* Re: blk-mq: allow passing in an external queue mapping V3
  2016-09-15 14:34     ` Jens Axboe
@ 2016-09-15 14:42       ` Christoph Hellwig
  2016-09-15 14:44         ` Jens Axboe
  0 siblings, 1 reply; 33+ messages in thread
From: Christoph Hellwig @ 2016-09-15 14:42 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Christoph Hellwig, Keith Busch, tglx, agordeev, linux-block,
	linux-kernel
On Thu, Sep 15, 2016 at 08:34:42AM -0600, Jens Axboe wrote:
> I was going to ask about splitting it, but that looks fine, I can pull
> that in.
>
> The series looks fine to me. My only real concern is giving drivers the
> flexibility to define mappings, I don't want that to evolve into drivers
> (again) doing stupid things wrt mappings. As long as we keep it strictly
> as a tunnel for passing mappings defined by the (previous blk-mq) core
> code, then that's fine.
So my earlier versions just passed in the affinity mask and left
all the mapping in the core.  This doesn't really work anymore with
the sibling aware code so I had to add a method.  That being said there
are some drivers that might want slightly different mappings.
For example skd (if converted to blk-mq) has MSI-X vectors for its up
to four queues, but it also has MSI-X vectors for misc bookkeeping
before those, so we'd need a version of our PCI mapping that adds an
offset to the queue number when assigning the MSI-X vectors.
That being said I structured the map_queues interface so it can't do
anything crazy - it can just build up the cpu to queue mapping array
so there isn't exactly a whole lot of crazy things a driver could do.
^ permalink raw reply	[flat|nested] 33+ messages in thread
* Re: [PATCH 13/13] blk-mq: get rid of the cpumask in struct blk_mq_tags
  2016-09-14 14:18 ` [PATCH 13/13] blk-mq: get rid of the cpumask in struct blk_mq_tags Christoph Hellwig
@ 2016-09-15 14:44   ` Christoph Hellwig
  2016-09-15 14:46     ` Jens Axboe
  0 siblings, 1 reply; 33+ messages in thread
From: Christoph Hellwig @ 2016-09-15 14:44 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe, tglx, agordeev, keith.busch, linux-block, linux-kernel
> +static int blk_mq_create_mq_map(struct blk_mq_tag_set *set,
> +		const struct cpumask *affinity_mask)
>  {
> +	int queue = -1, cpu = 0;
> +
> +	set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
> +			GFP_KERNEL, set->numa_node);
> +	if (!set->mq_map)
> +		return -ENOMEM;
> +
> +	if (!affinity_mask)
> +		return 0;	/* map all cpus to queue 0 */
> +
> +	/* If cpus are offline, map them to first hctx */
> +	for_each_online_cpu(cpu) {
> +		if (cpumask_test_cpu(cpu, affinity_mask))
> +			queue++;
> +		if (queue >= 0)
> +			set->mq_map[cpu] = queue;
> +	}
> +
> +	return 0;
>  }
I just noticed that the patch adds this unused function due to a rebase
error.  Jens, do you just want to fix this up while applying or
should I resend?
^ permalink raw reply	[flat|nested] 33+ messages in thread
* Re: blk-mq: allow passing in an external queue mapping V3
  2016-09-15 14:42       ` Christoph Hellwig
@ 2016-09-15 14:44         ` Jens Axboe
  0 siblings, 0 replies; 33+ messages in thread
From: Jens Axboe @ 2016-09-15 14:44 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Keith Busch, tglx, agordeev, linux-block, linux-kernel
On 09/15/2016 08:42 AM, Christoph Hellwig wrote:
> On Thu, Sep 15, 2016 at 08:34:42AM -0600, Jens Axboe wrote:
>> I was going to ask about splitting it, but that looks fine, I can pull
>> that in.
>>
>> The series looks fine to me. My only real concern is giving drivers the
>> flexibility to define mappings, I don't want that to evolve into drivers
>> (again) doing stupid things wrt mappings. As long as we keep it strictly
>> as a tunnel for passing mappings defined by the (previous blk-mq) core
>> code, then that's fine.
>
> So my earlier versions just passed in the affinity mask and left
> all the mapping in the core.  This doesn't really work anymore with
> the sibling aware code so I had to add a method.  That being said there
> are some drivers that might want slightly different mappings.
>
> For example skd (if converted to blk-mq) has MSI-X vectors for it's up
> to four queues, but it also has MSI-X vectors for misc book keeping
> before those, so we'd need a version of our PCI mapping that adds an
> offset to add to queue number when assining the MSI-X vectors.
>
> That being said I structured the map_queues interface so it can't do
> anything crazy - it can just build up the cpu to queue mapping array
> so there isn't exactly a whole lot of crazy things a driver could do.
By crazy, I mean even things like thinking it knows better and defining
mappings differently just because it can. But I'm not too worried about
it, it's just something to watch for that we didn't have to care about
before, as the mappings were completely mandated by the core code.
-- 
Jens Axboe
^ permalink raw reply	[flat|nested] 33+ messages in thread
* Re: [PATCH 13/13] blk-mq: get rid of the cpumask in struct blk_mq_tags
  2016-09-15 14:44   ` Christoph Hellwig
@ 2016-09-15 14:46     ` Jens Axboe
  0 siblings, 0 replies; 33+ messages in thread
From: Jens Axboe @ 2016-09-15 14:46 UTC (permalink / raw)
  To: Christoph Hellwig, Christoph Hellwig
  Cc: tglx, agordeev, keith.busch, linux-block, linux-kernel
On 09/15/2016 08:44 AM, Christoph Hellwig wrote:
>> +static int blk_mq_create_mq_map(struct blk_mq_tag_set *set,
>> +		const struct cpumask *affinity_mask)
>>  {
>> +	int queue = -1, cpu = 0;
>> +
>> +	set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
>> +			GFP_KERNEL, set->numa_node);
>> +	if (!set->mq_map)
>> +		return -ENOMEM;
>> +
>> +	if (!affinity_mask)
>> +		return 0;	/* map all cpus to queue 0 */
>> +
>> +	/* If cpus are offline, map them to first hctx */
>> +	for_each_online_cpu(cpu) {
>> +		if (cpumask_test_cpu(cpu, affinity_mask))
>> +			queue++;
>> +		if (queue >= 0)
>> +			set->mq_map[cpu] = queue;
>> +	}
>> +
>> +	return 0;
>>  }
>
> I just noticed that the patch adds this unused function due to a rebase
> error.  Jens, do you you just want to fix this up while applying or
> should I resend?
Killed it off manually, I already applied and pushed it out.
-- 
Jens Axboe
^ permalink raw reply	[flat|nested] 33+ messages in thread
* Re: [PATCH 01/13] genirq/msi: Add cpumask allocation to alloc_msi_entry
  2016-09-14 14:18 ` [PATCH 01/13] genirq/msi: Add cpumask allocation to alloc_msi_entry Christoph Hellwig
@ 2016-09-19  7:30   ` Alexander Gordeev
  2016-09-19 13:50     ` Christoph Hellwig
  0 siblings, 1 reply; 33+ messages in thread
From: Alexander Gordeev @ 2016-09-19  7:30 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: axboe, tglx, keith.busch, linux-block, linux-kernel
On Wed, Sep 14, 2016 at 04:18:47PM +0200, Christoph Hellwig wrote:
> diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
> index 19e9dfb..8a3e8727 100644
> --- a/kernel/irq/msi.c
> +++ b/kernel/irq/msi.c
> @@ -18,20 +18,42 @@
>  /* Temparory solution for building, will be removed later */
>  #include <linux/pci.h>
>  
> -struct msi_desc *alloc_msi_entry(struct device *dev)
> +/**
> + * alloc_msi_entry - Allocate an initialize msi_entry
> + * @dev:	Pointer to the device for which this is allocated
> + * @nvec:	The number of vectors used in this entry
> + * @affinity:	Optional pointer to an affinity mask array size of @nvec
> + *
> + * If @affinity is not NULL then a an affinity array[@nvec] is allocated
> + * and the affinity masks from @affinity are copied.
> + */
> +struct msi_desc *
> +alloc_msi_entry(struct device *dev, int nvec, const struct cpumask *affinity)
>  {
> -	struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
> +	struct msi_desc *desc;
> +
> +	desc = kzalloc(sizeof(*desc), GFP_KERNEL);
>  	if (!desc)
>  		return NULL;
>  
>  	INIT_LIST_HEAD(&desc->list);
>  	desc->dev = dev;
> +	desc->nvec_used = nvec;
> +	if (affinity) {
> +		desc->affinity = kmemdup(affinity,
> +			nvec * sizeof(*desc->affinity), GFP_KERNEL);
> +		if (!desc->affinity) {
> +			kfree(desc);
> +			return NULL;
> +		}
> +	}
nit - should not "desc" initialization follow "desc->affinity" allocation?
>  	return desc;
>  }
>  
>  void free_msi_entry(struct msi_desc *entry)
>  {
> +	kfree(entry->affinity);
>  	kfree(entry);
>  }
>  
> -- 
> 2.1.4
> 
^ permalink raw reply	[flat|nested] 33+ messages in thread
* Re: [PATCH 10/13] blk-mq: provide a default queue mapping for PCI device
  2016-09-14 14:18 ` [PATCH 10/13] blk-mq: provide a default queue mapping for PCI device Christoph Hellwig
@ 2016-09-19  7:33   ` Alexander Gordeev
  2016-09-19 13:49     ` Christoph Hellwig
  0 siblings, 1 reply; 33+ messages in thread
From: Alexander Gordeev @ 2016-09-19  7:33 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: axboe, tglx, keith.busch, linux-block, linux-kernel
On Wed, Sep 14, 2016 at 04:18:56PM +0200, Christoph Hellwig wrote:
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  block/Makefile             |  2 +-
>  block/blk-mq-pci.c         | 45 +++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/blk-mq-pci.h |  9 +++++++++
>  3 files changed, 55 insertions(+), 1 deletion(-)
>  create mode 100644 block/blk-mq-pci.c
>  create mode 100644 include/linux/blk-mq-pci.h
> 
> diff --git a/block/Makefile b/block/Makefile
> index 9eda232..2447a0b 100644
> --- a/block/Makefile
> +++ b/block/Makefile
> @@ -22,4 +22,4 @@ obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
>  obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
>  obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o
>  obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o
> -
> +obj-$(CONFIG_PCI)		+= blk-mq-pci.o
> diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c
> new file mode 100644
> index 0000000..33c7bd7
> --- /dev/null
> +++ b/block/blk-mq-pci.c
> @@ -0,0 +1,45 @@
> +/*
> + * Copyright (c) 2016 Christoph Hellwig.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + */
> +#include <linux/blk-mq.h>
> +#include <linux/blk-mq-pci.h>
> +#include <linux/pci.h>
> +#include <linux/module.h>
> +
> +/**
> + * blk_mq_pci_map_queues - provide a default queue mapping for PCI device
> + * @set:	tagset to provide the mapping for
> + * @pdev:	PCI device associated with @set.
> + *
> + * This function assumes the PCI device @pdev has at least as many available
> + * interrupt vetors as @set has queues.  It will then queuery the vector
> + * corresponding to each queue for it's affinity mask and built queue mapping
> + * that maps a queue to the CPUs that have irq affinity for the corresponding
> + * vector.
> + */
> +int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev)
> +{
> +	const struct cpumask *mask;
> +	unsigned int queue, cpu;
> +
> +	for (queue = 0; queue < set->nr_hw_queues; queue++) {
> +		mask = pci_irq_get_affinity(pdev, queue);
> +		if (!mask)
> +			return -EINVAL;
> +
> +		for_each_cpu(cpu, mask)
> +			set->mq_map[cpu] = queue;
Considering this code is to be used by any device - an assumption
that queue number and interrupt vector are always equal seems
questionable. I.e. what about non-contiguous MSI-Xs?
> +	}
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues);
> diff --git a/include/linux/blk-mq-pci.h b/include/linux/blk-mq-pci.h
> new file mode 100644
> index 0000000..6ab5952
> --- /dev/null
> +++ b/include/linux/blk-mq-pci.h
> @@ -0,0 +1,9 @@
> +#ifndef _LINUX_BLK_MQ_PCI_H
> +#define _LINUX_BLK_MQ_PCI_H
> +
> +struct blk_mq_tag_set;
> +struct pci_dev;
> +
> +int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev);
> +
> +#endif /* _LINUX_BLK_MQ_PCI_H */
> -- 
> 2.1.4
> 
^ permalink raw reply	[flat|nested] 33+ messages in thread
* Re: [PATCH 10/13] blk-mq: provide a default queue mapping for PCI device
  2016-09-19  7:33   ` Alexander Gordeev
@ 2016-09-19 13:49     ` Christoph Hellwig
  0 siblings, 0 replies; 33+ messages in thread
From: Christoph Hellwig @ 2016-09-19 13:49 UTC (permalink / raw)
  To: Alexander Gordeev
  Cc: Christoph Hellwig, axboe, tglx, keith.busch, linux-block,
	linux-kernel
On Mon, Sep 19, 2016 at 09:33:14AM +0200, Alexander Gordeev wrote:
> Considering this code is to be used by any device - an assumption
> that queue number and interrupt vector are always equal seems
> questionable. I.e. what about non-contiguous MSI-Xs?
It's just a generic default - see the discussion with Jens in the
intro mail for the thread for another example that can't be handled
easily.  If we get enough devices for a given scheme we can add another
common helper, in the worst case we'll need driver code for the scheme.
^ permalink raw reply	[flat|nested] 33+ messages in thread
* Re: [PATCH 01/13] genirq/msi: Add cpumask allocation to alloc_msi_entry
  2016-09-19  7:30   ` Alexander Gordeev
@ 2016-09-19 13:50     ` Christoph Hellwig
  2016-09-20  7:06       ` Alexander Gordeev
  0 siblings, 1 reply; 33+ messages in thread
From: Christoph Hellwig @ 2016-09-19 13:50 UTC (permalink / raw)
  To: Alexander Gordeev
  Cc: Christoph Hellwig, axboe, tglx, keith.busch, linux-block,
	linux-kernel
On Mon, Sep 19, 2016 at 09:30:58AM +0200, Alexander Gordeev wrote:
> >  	INIT_LIST_HEAD(&desc->list);
> >  	desc->dev = dev;
> > +	desc->nvec_used = nvec;
> > +	if (affinity) {
> > +		desc->affinity = kmemdup(affinity,
> > +			nvec * sizeof(*desc->affinity), GFP_KERNEL);
> > +		if (!desc->affinity) {
> > +			kfree(desc);
> > +			return NULL;
> > +		}
> > +	}
> 
> nit - should not "desc" initialization follow "desc->affinity" allocation?
I can't parse that sentence.  Do you mean the desc->nvec_used setup?
^ permalink raw reply	[flat|nested] 33+ messages in thread
* Re: [PATCH 01/13] genirq/msi: Add cpumask allocation to alloc_msi_entry
  2016-09-19 13:50     ` Christoph Hellwig
@ 2016-09-20  7:06       ` Alexander Gordeev
  2016-09-20  8:58         ` Thomas Gleixner
  0 siblings, 1 reply; 33+ messages in thread
From: Alexander Gordeev @ 2016-09-20  7:06 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: axboe, tglx, keith.busch, linux-block, linux-kernel
On Mon, Sep 19, 2016 at 03:50:07PM +0200, Christoph Hellwig wrote:
> On Mon, Sep 19, 2016 at 09:30:58AM +0200, Alexander Gordeev wrote:
> > >  	INIT_LIST_HEAD(&desc->list);
> > >  	desc->dev = dev;
> > > +	desc->nvec_used = nvec;
(*)
> > > +	if (affinity) {
> > > +		desc->affinity = kmemdup(affinity,
> > > +			nvec * sizeof(*desc->affinity), GFP_KERNEL);
> > > +		if (!desc->affinity) {
> > > +			kfree(desc);
> > > +			return NULL;
> > > +		}
> > > +	}
> > 
> > nit - should not "desc" initialization follow "desc->affinity" allocation?
> 
> I can't parse that sentence.  Do you mean the desc->nvec_used setup?
Yes, the inits above (*) would be useless if desc->affinity allocation failed.
^ permalink raw reply	[flat|nested] 33+ messages in thread
* Re: [PATCH 01/13] genirq/msi: Add cpumask allocation to alloc_msi_entry
  2016-09-20  7:06       ` Alexander Gordeev
@ 2016-09-20  8:58         ` Thomas Gleixner
  0 siblings, 0 replies; 33+ messages in thread
From: Thomas Gleixner @ 2016-09-20  8:58 UTC (permalink / raw)
  To: Alexander Gordeev
  Cc: Christoph Hellwig, axboe, keith.busch, linux-block, linux-kernel
On Tue, 20 Sep 2016, Alexander Gordeev wrote:
> On Mon, Sep 19, 2016 at 03:50:07PM +0200, Christoph Hellwig wrote:
> > On Mon, Sep 19, 2016 at 09:30:58AM +0200, Alexander Gordeev wrote:
> > > >  	INIT_LIST_HEAD(&desc->list);
> > > >  	desc->dev = dev;
> > > > +	desc->nvec_used = nvec;
> 
> (*)
> 
> > > > +	if (affinity) {
> > > > +		desc->affinity = kmemdup(affinity,
> > > > +			nvec * sizeof(*desc->affinity), GFP_KERNEL);
> > > > +		if (!desc->affinity) {
> > > > +			kfree(desc);
> > > > +			return NULL;
> > > > +		}
> > > > +	}
> > > 
> > > nit - should not "desc" initialization follow "desc->affinity" allocation?
> > 
> > I can't parse that sentence.  Do you mean the desc->nvec_used setup?
> 
> Yes, the inits above (*) would be useless if desc->affinity allocation failed.
And that matters how?
Thanks,
	tglx
 
^ permalink raw reply	[flat|nested] 33+ messages in thread
* Re: [PATCH 03/13] genirq/msi: Switch to new irq spreading infrastructure
  2016-09-14 14:18 ` [PATCH 03/13] genirq/msi: Switch to new " Christoph Hellwig
@ 2016-09-21 12:23   ` Alexander Gordeev
  2016-09-22  8:51   ` Alexander Gordeev
  1 sibling, 0 replies; 33+ messages in thread
From: Alexander Gordeev @ 2016-09-21 12:23 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: axboe, tglx, keith.busch, linux-block, linux-kernel
On Wed, Sep 14, 2016 at 04:18:49PM +0200, Christoph Hellwig wrote:
> @@ -1039,6 +1058,7 @@ EXPORT_SYMBOL(pci_msi_enabled);
>  static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
>  		unsigned int flags)
>  {
> +	bool affinity = flags & PCI_IRQ_AFFINITY;
>  	int nvec;
>  	int rc;
The below notes apply to __pci_enable_msi_range() obviously.
> @@ -1111,26 +1129,24 @@ static int __pci_enable_msix_range(struct pci_dev *dev,
>  		struct msix_entry *entries, int minvec, int maxvec,
>  		unsigned int flags)
>  {
> -	int nvec = maxvec;
> -	int rc;
> +	bool affinity = flags & PCI_IRQ_AFFINITY;
> +	int rc, nvec = maxvec;
>  
>  	if (maxvec < minvec)
>  		return -ERANGE;
A sanity check is missing in case dev->irq_affinity/cpu_online_mask
weight is less than minvec. We want to throw -EINVAL in this case,
not -ENOSPC.
>  	for (;;) {
> -		if (flags & PCI_IRQ_AFFINITY) {
> -			dev->irq_affinity = irq_create_affinity_mask(&nvec);
> +		if (affinity) {
> +			nvec = irq_calc_affinity_vectors(dev->irq_affinity,
> +					nvec);
>  			if (nvec < minvec)
>  				return -ENOSPC;
>  		}
The affinity mask weight might change and fall below minvec before
__pci_enable_msix() is called. I guess, get/put_online_cpus() calls
need to protect the loop iterations, not just irq_calc_affinity_vectors()
function alone.
But throwing -ENOSPC due to lack of dedicated CPUs for interrupt
handling looks like overkill in the general case, since we can still
distribute interrupts to a lower cpumask. Sorry if I forgot or missed
a discussion on this case.
> -		rc = pci_enable_msix(dev, entries, nvec);
> +		rc = __pci_enable_msix(dev, entries, nvec, affinity);
>  		if (rc == 0)
>  			return nvec;
>  
> -		kfree(dev->irq_affinity);
> -		dev->irq_affinity = NULL;
> -
>  		if (rc < 0)
>  			return rc;
>  		if (rc < minvec)
^ permalink raw reply	[flat|nested] 33+ messages in thread
* Re: [PATCH 02/13] genirq/affinity: Provide smarter irq spreading infrastructure
  2016-09-14 14:18 ` [PATCH 02/13] genirq/affinity: Provide smarter irq spreading infrastructure Christoph Hellwig
@ 2016-09-21 12:29   ` Alexander Gordeev
  2016-09-22 21:14     ` Thomas Gleixner
  0 siblings, 1 reply; 33+ messages in thread
From: Alexander Gordeev @ 2016-09-21 12:29 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: axboe, tglx, keith.busch, linux-block, linux-kernel
On Wed, Sep 14, 2016 at 04:18:48PM +0200, Christoph Hellwig wrote:
> +/**
> + * irq_calc_affinity_vectors - Calculate to optimal number of vectors for a given affinity mask
> + * @affinity:		The affinity mask to spread. If NULL cpu_online_mask
> + *			is used
> + * @maxvec:		The maximum number of vectors available
> + */
> +int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec)
> +{
> +	int cpus, ret;
> +
> +	/* Stabilize the cpumasks */
> +	get_online_cpus();
> +	/* If the supplied affinity mask is NULL, use cpu online mask */
> +	if (!affinity)
> +		affinity = cpu_online_mask;
> +
> +	cpus = cpumask_weight(affinity);
Should not we consider the result of AND of affinity and cpu_online_mask?
> +	ret = (cpus < maxvec) ? cpus : maxvec;
> +
> +	put_online_cpus();
> +	return ret;
> +}
> +
>  static int get_first_sibling(unsigned int cpu)
>  {
>  	unsigned int ret;
> -- 
> 2.1.4
> 
^ permalink raw reply	[flat|nested] 33+ messages in thread
* Re: [PATCH 03/13] genirq/msi: Switch to new irq spreading infrastructure
  2016-09-14 14:18 ` [PATCH 03/13] genirq/msi: Switch to new " Christoph Hellwig
  2016-09-21 12:23   ` Alexander Gordeev
@ 2016-09-22  8:51   ` Alexander Gordeev
  1 sibling, 0 replies; 33+ messages in thread
From: Alexander Gordeev @ 2016-09-22  8:51 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: axboe, tglx, keith.busch, linux-block, linux-kernel
On Wed, Sep 14, 2016 at 04:18:49PM +0200, Christoph Hellwig wrote:
>  static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
> -			      struct msix_entry *entries, int nvec)
> +			      struct msix_entry *entries, int nvec,
> +			      bool affinity)
>  {
> -	const struct cpumask *mask = NULL;
> +	struct cpumask *curmsk, *masks = NULL;
>  	struct msi_desc *entry;
> -	int cpu = -1, i;
> -
> -	for (i = 0; i < nvec; i++) {
> -		if (dev->irq_affinity) {
> -			cpu = cpumask_next(cpu, dev->irq_affinity);
> -			if (cpu >= nr_cpu_ids)
> -				cpu = cpumask_first(dev->irq_affinity);
> -			mask = cpumask_of(cpu);
> -		}
> +	int ret, i;
>  
> -		entry = alloc_msi_entry(&dev->dev, 1, NULL);
> +	if (affinity) {
> +		masks = irq_create_affinity_masks(dev->irq_affinity, nvec);
> +		if (!masks)
> +			pr_err("Unable to allocate affinity masks, ignoring\n");
Okay, so if we can tolerate affinity mask failure here, then we should be
able to tolerate it everywhere. Therefore, this piece of code (which I pointed
out in my other mail) in __pci_enable_msi_range() should not bail out:
		if (affinity) {
			nvec = irq_calc_affinity_vectors(dev->irq_affinity,
					nvec);
			if (nvec < minvec)
				return -ENOSPC;
		}
> +	}
> +
> +	for (i = 0, curmsk = masks; i < nvec; i++) {
> +		entry = alloc_msi_entry(&dev->dev, 1, curmsk);
>  		if (!entry) {
>  			if (!i)
>  				iounmap(base);
>  			else
>  				free_msi_irqs(dev);
>  			/* No enough memory. Don't try again */
> -			return -ENOMEM;
> +			ret = -ENOMEM;
> +			goto out;
>  		}
>  
>  		entry->msi_attrib.is_msix	= 1;
> @@ -710,11 +720,14 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
>  			entry->msi_attrib.entry_nr = i;
>  		entry->msi_attrib.default_irq	= dev->irq;
>  		entry->mask_base		= base;
> -		entry->affinity			= mask;
>  
>  		list_add_tail(&entry->list, dev_to_msi_list(&dev->dev));
> +		if (masks)
> +			curmsk++;
>  	}
> -
> +	ret = 0;
> +out:
> +	kfree(masks);
>  	return 0;
	return ret;
>  }
^ permalink raw reply	[flat|nested] 33+ messages in thread
* Re: [PATCH 02/13] genirq/affinity: Provide smarter irq spreading infrastructure
  2016-09-21 12:29   ` Alexander Gordeev
@ 2016-09-22 21:14     ` Thomas Gleixner
  0 siblings, 0 replies; 33+ messages in thread
From: Thomas Gleixner @ 2016-09-22 21:14 UTC (permalink / raw)
  To: Alexander Gordeev
  Cc: Christoph Hellwig, axboe, keith.busch, linux-block, linux-kernel
Alexander,
On Wed, 21 Sep 2016, Alexander Gordeev wrote:
> On Wed, Sep 14, 2016 at 04:18:48PM +0200, Christoph Hellwig wrote:
> > +/**
> > + * irq_calc_affinity_vectors - Calculate to optimal number of vectors for a given affinity mask
> > + * @affinity:		The affinity mask to spread. If NULL cpu_online_mask
> > + *			is used
> > + * @maxvec:		The maximum number of vectors available
> > + */
> > +int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec)
> > +{
> > +	int cpus, ret;
> > +
> > +	/* Stabilize the cpumasks */
> > +	get_online_cpus();
> > +	/* If the supplied affinity mask is NULL, use cpu online mask */
> > +	if (!affinity)
> > +		affinity = cpu_online_mask;
> > +
> > +	cpus = cpumask_weight(affinity);
> 
> Should not we consider the result of AND of affinity and cpu_online_mask?
That's a good question.
The argument against it is the increased usage of cpu (soft)hotplug for
power-management. The driver might well want to set the mapping even for an
offline cpu and as long as the interrupt is not requested for that
particular queue, it will stay (in software) associated to that cpu. So
once the CPU is brought up again the driver can request the interrupt and
work with the associated queue.
I'm aware that there are arguments against it, but let's see how it works
out.
Thanks,
	tglx
^ permalink raw reply	[flat|nested] 33+ messages in thread
* Re: [PATCH 11/13] nvme: switch to use pci_alloc_irq_vectors
  2016-09-14 14:18 ` [PATCH 11/13] nvme: switch to use pci_alloc_irq_vectors Christoph Hellwig
@ 2016-09-23 22:21   ` Sagi Grimberg
  2016-09-26 15:09     ` Christoph Hellwig
  0 siblings, 1 reply; 33+ messages in thread
From: Sagi Grimberg @ 2016-09-23 22:21 UTC (permalink / raw)
  To: Christoph Hellwig, axboe, tglx
  Cc: agordeev, keith.busch, linux-block, linux-kernel
On 14/09/16 07:18, Christoph Hellwig wrote:
> Use the new helper to automatically select the right interrupt type, as
> well as to use the automatic interrupt affinity assignment.
Patch title and the change description are a little short IMO to
describe what is going on here (need the blk-mq side too).
I'd also think it would be better to split this to 2 patches but
really not a must...
> +static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
> +{
> +	struct nvme_dev *dev = set->driver_data;
> +
> +	return blk_mq_pci_map_queues(set, to_pci_dev(dev->dev));
> +}
> +
Question: is using pci_alloc_irq_vectors() obligated for
supplying blk-mq with the device affinity mask?
If I do this completely-untested [1] what will happen?
[1]:
--
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 8d2875b4c56d..76693d406efe 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1518,6 +1518,14 @@ static void nvme_rdma_complete_rq(struct request *rq)
         blk_mq_end_request(rq, error);
  }
+static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
+{
+       struct nvme_rdma_ctrl *ctrl = set->driver_data;
+       struct device *dev = ctrl->device->dev.dma_device;
+
+       return blk_mq_pci_map_queues(set, to_pci_dev(dev));
+}
+
  static struct blk_mq_ops nvme_rdma_mq_ops = {
         .queue_rq       = nvme_rdma_queue_rq,
         .complete       = nvme_rdma_complete_rq,
@@ -1528,6 +1536,7 @@ static struct blk_mq_ops nvme_rdma_mq_ops = {
         .init_hctx      = nvme_rdma_init_hctx,
         .poll           = nvme_rdma_poll,
         .timeout        = nvme_rdma_timeout,
+       .map_queues     = nvme_rdma_map_queues,
  };
  static struct blk_mq_ops nvme_rdma_admin_mq_ops = {
--
^ permalink raw reply related	[flat|nested] 33+ messages in thread
* Re: [PATCH 11/13] nvme: switch to use pci_alloc_irq_vectors
  2016-09-23 22:21   ` Sagi Grimberg
@ 2016-09-26 15:09     ` Christoph Hellwig
  0 siblings, 0 replies; 33+ messages in thread
From: Christoph Hellwig @ 2016-09-26 15:09 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Christoph Hellwig, axboe, tglx, agordeev, keith.busch,
	linux-block, linux-kernel
On Fri, Sep 23, 2016 at 03:21:14PM -0700, Sagi Grimberg wrote:
> Question: is using pci_alloc_irq_vectors() obligated for
> supplying blk-mq with the device affinity mask?
No, but it's very useful.  We'll need equivalents for other busses
that provide multiple vectors and vector spreading.
> If I do this completely-untested [1] what will happen?
Everything will be crashing and burning because you call to_pci_dev on
something that's not a PCI dev?
For the next merge window I plan to wire up the affinity information
for the RDMA code, and I will add a counterpart to blk_mq_pci_map_queues
that spreads the queues over the completion vectors.
^ permalink raw reply	[flat|nested] 33+ messages in thread
end of thread, other threads:[~2016-09-26 15:09 UTC | newest]
Thread overview: 33+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-09-14 14:18 blk-mq: allow passing in an external queue mapping V3 Christoph Hellwig
2016-09-14 14:18 ` [PATCH 01/13] genirq/msi: Add cpumask allocation to alloc_msi_entry Christoph Hellwig
2016-09-19  7:30   ` Alexander Gordeev
2016-09-19 13:50     ` Christoph Hellwig
2016-09-20  7:06       ` Alexander Gordeev
2016-09-20  8:58         ` Thomas Gleixner
2016-09-14 14:18 ` [PATCH 02/13] genirq/affinity: Provide smarter irq spreading infrastructure Christoph Hellwig
2016-09-21 12:29   ` Alexander Gordeev
2016-09-22 21:14     ` Thomas Gleixner
2016-09-14 14:18 ` [PATCH 03/13] genirq/msi: Switch to new " Christoph Hellwig
2016-09-21 12:23   ` Alexander Gordeev
2016-09-22  8:51   ` Alexander Gordeev
2016-09-14 14:18 ` [PATCH 04/13] genirq/affinity: Remove old irq spread infrastructure Christoph Hellwig
2016-09-14 14:18 ` [PATCH 05/13] pci/msi: Retrieve affinity for a vector Christoph Hellwig
2016-09-14 14:18 ` [PATCH 06/13] blk-mq: don't redistribute hardware queues on a CPU hotplug event Christoph Hellwig
2016-09-14 14:18 ` [PATCH 07/13] blk-mq: only allocate a single mq_map per tag_set Christoph Hellwig
2016-09-14 14:18 ` [PATCH 08/13] blk-mq: remove ->map_queue Christoph Hellwig
2016-09-14 14:18 ` [PATCH 09/13] blk-mq: allow the driver to pass in a queue mapping Christoph Hellwig
2016-09-14 14:18 ` [PATCH 10/13] blk-mq: provide a default queue mapping for PCI device Christoph Hellwig
2016-09-19  7:33   ` Alexander Gordeev
2016-09-19 13:49     ` Christoph Hellwig
2016-09-14 14:18 ` [PATCH 11/13] nvme: switch to use pci_alloc_irq_vectors Christoph Hellwig
2016-09-23 22:21   ` Sagi Grimberg
2016-09-26 15:09     ` Christoph Hellwig
2016-09-14 14:18 ` [PATCH 12/13] nvme: remove the post_scan callout Christoph Hellwig
2016-09-14 14:18 ` [PATCH 13/13] blk-mq: get rid of the cpumask in struct blk_mq_tags Christoph Hellwig
2016-09-15 14:44   ` Christoph Hellwig
2016-09-15 14:46     ` Jens Axboe
2016-09-15 14:40 ` blk-mq: allow passing in an external queue mapping V3 Keith Busch
2016-09-15 14:32   ` Christoph Hellwig
2016-09-15 14:34     ` Jens Axboe
2016-09-15 14:42       ` Christoph Hellwig
2016-09-15 14:44         ` Jens Axboe
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).