From: Keith Busch <keith.busch@intel.com>
To: Christoph Hellwig <hch@lst.de>
Cc: axboe@fb.com, linux-block@vger.kernel.org,
	linux-nvme@lists.infradead.org
Subject: Re: [PATCH 4/7] blk-mq: allow the driver to pass in an affinity mask
Date: Thu, 1 Sep 2016 19:30:10 -0400
Message-ID: <20160901233010.GC10903@localhost.localdomain>
In-Reply-To: <20160901142410.GA10903@localhost.localdomain>

On Thu, Sep 01, 2016 at 10:24:10AM -0400, Keith Busch wrote:
> Yeah, I gathered that's what it was providing, but that's just barely
> not enough information to do something useful. The CPUs that aren't set
> have to use a previously assigned vector/queue, but which one?

Unless I'm totally missing how to infer paired CPUs, I think we need
arrays.

Here's a stab at that. I'm using the "old" algorithm the NVMe driver used
to pair vectors and CPUs. It's not the most efficient pairing scheme I
know of, but it is relatively easy to follow, and it actually utilizes
every hardware resource available, so I get very good CPU <-> queue
mappings.
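
To make the counting concrete, here is a standalone userspace sketch of
just the cpus_per_vec/remainder arithmetic from the irq/affinity.c hunk
below (the 8-CPU, 3-vector numbers are made up for illustration):

#include <stdio.h>

int main(void)
{
	unsigned int num_online = 8, max_vecs = 3;
	unsigned int cpus_per_vec = num_online / max_vecs;
	unsigned int remainder = max_vecs - (num_online - max_vecs * cpus_per_vec);
	unsigned int i;

	/* the last (num_online % max_vecs) vectors get one extra CPU */
	for (i = 0; i < max_vecs; i++) {
		printf("vector %u gets %u CPUs\n", i, cpus_per_vec);
		if (remainder && !--remainder)
			cpus_per_vec++;
	}
	return 0;	/* prints 2, 3, 3 */
}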

---
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9cc08c6..c5c038e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2283,7 +2283,7 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 static int blk_mq_create_mq_map(struct blk_mq_tag_set *set,
 		const struct cpumask *affinity_mask)
 {
-	int queue = -1, cpu = 0;
+	int queue;
 
 	set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
 			GFP_KERNEL, set->numa_node);
@@ -2293,11 +2293,10 @@ static int blk_mq_create_mq_map(struct blk_mq_tag_set *set,
 	if (!affinity_mask)
 		return 0;	/* map all cpus to queue 0 */
 
-	/* If cpus are offline, map them to first hctx */
-	for_each_online_cpu(cpu) {
-		if (cpumask_test_cpu(cpu, affinity_mask))
-			queue++;
-		if (queue >= 0)
+	for (queue = 0; queue < set->nr_hw_queues; queue++) {
+		int cpu;
+
+		for_each_cpu(cpu, &affinity_mask[queue])
 			set->mq_map[cpu] = queue;
 	}
 
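(With this change the affinity_mask argument is interpreted as an array
of nr_hw_queues per-queue masks, and the new loop simply inverts it into
the per-CPU mq_map lookup table. A made-up userspace sketch of that
inversion, using plain bitmasks in place of struct cpumask:)

#include <stdio.h>

int main(void)
{
	/* made-up example: 4 CPUs, 2 queues; bit n == CPU n */
	unsigned int masks[2] = { 0x3, 0xc };	/* queue 0: {0,1}, queue 1: {2,3} */
	unsigned int mq_map[4];
	unsigned int queue, cpu;

	for (queue = 0; queue < 2; queue++)
		for (cpu = 0; cpu < 4; cpu++)
			if (masks[queue] & (1u << cpu))
				mq_map[cpu] = queue;

	for (cpu = 0; cpu < 4; cpu++)
		printf("cpu %u -> queue %u\n", cpu, mq_map[cpu]);
	return 0;	/* cpus 0,1 -> queue 0; cpus 2,3 -> queue 1 */
}
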
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 98f1222..03a1ffc 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -683,15 +683,11 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
 {
 	const struct cpumask *mask = NULL;
 	struct msi_desc *entry;
-	int cpu = -1, i;
+	int i;
 
 	for (i = 0; i < nvec; i++) {
-		if (dev->irq_affinity) {
-			cpu = cpumask_next(cpu, dev->irq_affinity);
-			if (cpu >= nr_cpu_ids)
-				cpu = cpumask_first(dev->irq_affinity);
-			mask = cpumask_of(cpu);
-		}
+		if (dev->irq_affinity)
+			mask = &dev->irq_affinity[i];
 
 		entry = alloc_msi_entry(&dev->dev);
 		if (!entry) {
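
(The msi.c hunk follows from the same reinterpretation: instead of
walking dev->irq_affinity one CPU at a time and pinning each vector to a
single CPU, entry i now takes the whole per-vector mask
dev->irq_affinity[i], so a vector's IRQ can be affinitized to every CPU
its queue serves.)
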
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 32f6cfc..9fe548b 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -4,14 +4,47 @@
 #include <linux/slab.h>
 #include <linux/cpu.h>
 
-static int get_first_sibling(unsigned int cpu)
+static int find_closest_node(int node)
 {
-	unsigned int ret;
+	int n, val, min_val = INT_MAX, best_node = node;
+
+	for_each_online_node(n) {
+		if (n == node)
+			continue;
+		val = node_distance(node, n);
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+	return best_node;
+}
+
+static void set_vec_cpus(const cpumask_t *qmask, struct cpumask *affinity_mask,
+								int count)
+{
+	int cpu;
+
+	for_each_cpu(cpu, qmask) {
+		if (cpumask_weight(affinity_mask) >= count)
+			break;
+		cpumask_set_cpu(cpu, affinity_mask);
+	}
+}
+
+static void add_cpus(cpumask_t *mask, const cpumask_t *unassigned_cpus,
+	const cpumask_t *new_mask, struct cpumask *affinity_mask,
+	int cpus_per_queue)
+{
+	int next_cpu;
+
+	for_each_cpu(next_cpu, new_mask) {
+		cpumask_or(mask, mask, get_cpu_mask(next_cpu));
+		cpumask_or(mask, mask, topology_sibling_cpumask(next_cpu));
+		cpumask_and(mask, mask, unassigned_cpus);
+	}
+	set_vec_cpus(mask, affinity_mask, cpus_per_queue);
 
-	ret = cpumask_first(topology_sibling_cpumask(cpu));
-	if (ret < nr_cpu_ids)
-		return ret;
-	return cpu;
 }
 
 /*
@@ -27,37 +60,88 @@ static int get_first_sibling(unsigned int cpu)
  */
 struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
 {
-	struct cpumask *affinity_mask;
-	unsigned int max_vecs = *nr_vecs;
+	struct cpumask *affinity_mask, *masks;
+	unsigned int max_vecs = *nr_vecs, cpu, cpus_per_vec, remainder, i;
+	cpumask_var_t unassigned_cpus;
 
 	if (max_vecs == 1)
 		return NULL;
 
-	affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL);
-	if (!affinity_mask) {
+	masks = kcalloc(max_vecs, sizeof(*affinity_mask), GFP_KERNEL);
+	if (!masks) {
 		*nr_vecs = 1;
 		return NULL;
 	}
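+
+	/*
+	 * Note: cpumask_var_t must be allocated before use; without this
+	 * the cpumask_copy() below writes through an uninitialized
+	 * pointer when CONFIG_CPUMASK_OFFSTACK=y.
+	 */
+	if (!zalloc_cpumask_var(&unassigned_cpus, GFP_KERNEL)) {
+		kfree(masks);
+		*nr_vecs = 1;
+		return NULL;
+	}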
 
 	get_online_cpus();
-	if (max_vecs >= num_online_cpus()) {
-		cpumask_copy(affinity_mask, cpu_online_mask);
-		*nr_vecs = num_online_cpus();
-	} else {
-		unsigned int vecs = 0, cpu;
-
-		for_each_online_cpu(cpu) {
-			if (cpu == get_first_sibling(cpu)) {
-				cpumask_set_cpu(cpu, affinity_mask);
-				vecs++;
-			}
-
-			if (--max_vecs == 0)
-				break;
-		}
-		*nr_vecs = vecs;
+
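+	/* the last (num_online_cpus() % max_vecs) vectors get one extra CPU */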
+	cpus_per_vec = num_online_cpus() / max_vecs;
+	remainder = max_vecs - (num_online_cpus() - max_vecs * cpus_per_vec);
+
+	cpumask_copy(unassigned_cpus, cpu_online_mask);
+	cpu = cpumask_first(unassigned_cpus);
+
+	for (i = 0; i < max_vecs; i++) {
+		cpumask_t mask;
+
+		if (!cpumask_weight(unassigned_cpus))
+			break;
+
+		affinity_mask = &masks[i];
+
+		mask = *get_cpu_mask(cpu);
+		set_vec_cpus(&mask, affinity_mask, cpus_per_vec);
+
+		if (cpumask_weight(&mask) < cpus_per_vec)
+			add_cpus(&mask, unassigned_cpus,
+				topology_sibling_cpumask(cpu),
+				affinity_mask, cpus_per_vec);
+		if (cpumask_weight(&mask) < cpus_per_vec)
+			add_cpus(&mask, unassigned_cpus,
+				topology_core_cpumask(cpu),
+				affinity_mask, cpus_per_vec);
+		if (cpumask_weight(&mask) < cpus_per_vec)
+			add_cpus(&mask, unassigned_cpus,
+				cpumask_of_node(cpu_to_node(cpu)),
+				affinity_mask, cpus_per_vec);
+		if (cpumask_weight(&mask) < cpus_per_vec)
+			add_cpus(&mask, unassigned_cpus,
+				cpumask_of_node(
+					find_closest_node(
+						cpu_to_node(cpu))),
+				affinity_mask, cpus_per_vec);
+		if (cpumask_weight(&mask) < cpus_per_vec)
+			add_cpus(&mask, unassigned_cpus,
+				unassigned_cpus, affinity_mask,
+				cpus_per_vec);
+
+		cpumask_andnot(unassigned_cpus, unassigned_cpus, affinity_mask);
+		cpu = cpumask_next(cpu, unassigned_cpus);
+
+		if (remainder && !--remainder)
+			cpus_per_vec++;
 	}
 	put_online_cpus();
 
-	return affinity_mask;
+	i = 0;
+	cpumask_andnot(unassigned_cpus, cpu_possible_mask, cpu_online_mask);
+	for_each_cpu(cpu, unassigned_cpus) {
+		set_vec_cpus(get_cpu_mask(cpu), &masks[i], ~0);
+		i = (i + 1) % max_vecs;
+	}
+	free_cpumask_var(unassigned_cpus);
+
+	return masks;
 }
--

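For context, a hypothetical consumer of the returned array (the
driver-side name below is invented; in this series, patch 5/7 wires
nvme up through pci_alloc_irq_vectors, which would sit underneath
something like this):

/* Hypothetical caller, for illustration only. */
static struct cpumask *hypo_alloc_queue_masks(unsigned int *nr_queues)
{
	struct cpumask *masks = irq_create_affinity_mask(nr_queues);

	/*
	 * NULL with *nr_queues == 1 means "map everything to queue 0".
	 * Otherwise masks[i] is both vector i's IRQ affinity and hw
	 * queue i's CPU set, so the same array can feed both
	 * msix_setup_entries() and blk_mq_create_mq_map().
	 */
	return masks;
}
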

Thread overview: 19+ messages
2016-08-29 10:53 blk-mq: allow passing in an external queue mapping V2 Christoph Hellwig
2016-08-29 10:53 ` [PATCH 1/7] blk-mq: don't redistribute hardware queues on a CPU hotplug event Christoph Hellwig
2016-08-29 10:53 ` [PATCH 2/7] blk-mq: only allocate a single mq_map per tag_set Christoph Hellwig
2016-08-29 10:53 ` [PATCH 3/7] blk-mq: remove ->map_queue Christoph Hellwig
2016-08-29 10:53 ` [PATCH 4/7] blk-mq: allow the driver to pass in an affinity mask Christoph Hellwig
2016-08-31 16:38   ` Keith Busch
2016-09-01  8:46     ` Christoph Hellwig
2016-09-01 14:24       ` Keith Busch
2016-09-01 23:30         ` Keith Busch [this message]
2016-09-05 19:48         ` Christoph Hellwig
2016-09-06 14:39           ` Keith Busch
2016-09-06 16:50             ` Christoph Hellwig
2016-09-06 17:30               ` Keith Busch
2016-09-07 15:38               ` Thomas Gleixner
2016-08-29 10:53 ` [PATCH 5/7] nvme: switch to use pci_alloc_irq_vectors Christoph Hellwig
2016-08-29 10:53 ` [PATCH 6/7] nvme: remove the post_scan callout Christoph Hellwig
2016-08-29 10:53 ` [PATCH 7/7] blk-mq: get rid of the cpumask in struct blk_mq_tags Christoph Hellwig
2016-08-30 23:28 ` blk-mq: allow passing in an external queue mapping V2 Keith Busch
2016-09-01  8:45   ` Christoph Hellwig
