All of lore.kernel.org
 help / color / mirror / Atom feed
From: Keith Busch <keith.busch@intel.com>
To: Christoph Hellwig <hch@lst.de>
Cc: axboe@fb.com, linux-block@vger.kernel.org,
	linux-nvme@lists.infradead.org
Subject: Re: [PATCH 4/7] blk-mq: allow the driver to pass in an affinity mask
Date: Thu, 1 Sep 2016 19:30:10 -0400	[thread overview]
Message-ID: <20160901233010.GC10903@localhost.localdomain> (raw)
In-Reply-To: <20160901142410.GA10903@localhost.localdomain>

On Thu, Sep 01, 2016 at 10:24:10AM -0400, Keith Busch wrote:
> Yeah, I gathered that's what it was providing, but that's just barely
> not enough information to do something useful. The CPUs that aren't set
> have to use a previously assigned vector/queue, but which one?

Unless I'm totally missing how to infer paired CPUs, I think we need
arrays.

Here's a stab at that. I'm using the "old" algorithm the NVMe driver used
to pair vectors and cpus. It's not the most efficient way of pairing
that I know of, but it is easy to follow (relatively speaking), and it
actually utilizes every hardware resource available so I get very good
CPU <-> Queue mappings.

---
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9cc08c6..c5c038e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2283,7 +2283,7 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 static int blk_mq_create_mq_map(struct blk_mq_tag_set *set,
 		const struct cpumask *affinity_mask)
 {
-	int queue = -1, cpu = 0;
+	int queue;
 
 	set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
 			GFP_KERNEL, set->numa_node);
@@ -2293,11 +2293,10 @@ static int blk_mq_create_mq_map(struct blk_mq_tag_set *set,
 	if (!affinity_mask)
 		return 0;	/* map all cpus to queue 0 */
 
-	/* If cpus are offline, map them to first hctx */
-	for_each_online_cpu(cpu) {
-		if (cpumask_test_cpu(cpu, affinity_mask))
-			queue++;
-		if (queue >= 0)
+	for (queue = 0; queue < set->nr_hw_queues; queue++) {
+		int cpu;
+
+		for_each_cpu(cpu, &affinity_mask[queue])
 			set->mq_map[cpu] = queue;
 	}
 
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 98f1222..03a1ffc 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -683,15 +683,11 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
 {
 	const struct cpumask *mask = NULL;
 	struct msi_desc *entry;
-	int cpu = -1, i;
+	int i;
 
 	for (i = 0; i < nvec; i++) {
-		if (dev->irq_affinity) {
-			cpu = cpumask_next(cpu, dev->irq_affinity);
-			if (cpu >= nr_cpu_ids)
-				cpu = cpumask_first(dev->irq_affinity);
-			mask = cpumask_of(cpu);
-		}
+		if (dev->irq_affinity)
+			mask = &dev->irq_affinity[i];
 
 		entry = alloc_msi_entry(&dev->dev);
 		if (!entry) {
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 32f6cfc..9fe548b 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -4,14 +4,47 @@
 #include <linux/slab.h>
 #include <linux/cpu.h>
 
-static int get_first_sibling(unsigned int cpu)
+static int find_closest_node(int node)
 {
-	unsigned int ret;
+	int n, val, min_val = INT_MAX, best_node = node;
+
+	for_each_online_node(n) {
+		if (n == node)
+			continue;
+		val = node_distance(node, n);
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+	return best_node;
+}
+
+static void set_vec_cpus(const cpumask_t *qmask, struct cpumask *affinity_mask,
+								int count)
+{
+	int cpu;
+
+	for_each_cpu(cpu, qmask) {
+		if (cpumask_weight(affinity_mask) >= count)
+			break;
+		cpumask_set_cpu(cpu, affinity_mask);
+	}
+}
+
+static void add_cpus(cpumask_t *mask, const cpumask_t *unassigned_cpus,
+	const cpumask_t *new_mask, struct cpumask *affinity_mask,
+	int cpus_per_queue)
+{
+	int next_cpu;
+
+	for_each_cpu(next_cpu, new_mask) {
+		cpumask_or(mask, mask, get_cpu_mask(next_cpu));
+		cpumask_or(mask, mask, topology_sibling_cpumask(next_cpu));
+		cpumask_and(mask, mask, unassigned_cpus);
+	}
+	set_vec_cpus(mask, affinity_mask, cpus_per_queue);
 
-	ret = cpumask_first(topology_sibling_cpumask(cpu));
-	if (ret < nr_cpu_ids)
-		return ret;
-	return cpu;
 }
 
 /*
@@ -27,37 +60,76 @@ static int get_first_sibling(unsigned int cpu)
  */
 struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
 {
-	struct cpumask *affinity_mask;
-	unsigned int max_vecs = *nr_vecs;
+	struct cpumask *affinity_mask, *masks;
+	unsigned int max_vecs = *nr_vecs, cpu, cpus_per_vec, remainder, i;
+	cpumask_var_t unassigned_cpus;
 
 	if (max_vecs == 1)
 		return NULL;
 
-	affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL);
-	if (!affinity_mask) {
+	masks = kcalloc(max_vecs, sizeof(*affinity_mask), GFP_KERNEL);
+	if (!masks) {
 		*nr_vecs = 1;
 		return NULL;
 	}
 
 	get_online_cpus();
-	if (max_vecs >= num_online_cpus()) {
-		cpumask_copy(affinity_mask, cpu_online_mask);
-		*nr_vecs = num_online_cpus();
-	} else {
-		unsigned int vecs = 0, cpu;
-
-		for_each_online_cpu(cpu) {
-			if (cpu == get_first_sibling(cpu)) {
-				cpumask_set_cpu(cpu, affinity_mask);
-				vecs++;
-			}
-
-			if (--max_vecs == 0)
-				break;
-		}
-		*nr_vecs = vecs;
+
+	cpus_per_vec = num_online_cpus() / max_vecs;
+	remainder = max_vecs - (num_online_cpus() - max_vecs * cpus_per_vec);
+
+	cpumask_copy(unassigned_cpus, cpu_online_mask);
+	cpu = cpumask_first(unassigned_cpus);
+
+	for (i = 0; i < max_vecs; i++) {
+		cpumask_t mask;
+
+		if (!cpumask_weight(unassigned_cpus))
+			break;
+
+		affinity_mask = &masks[i];
+
+		mask = *get_cpu_mask(cpu);
+		set_vec_cpus(&mask, affinity_mask, cpus_per_vec);
+
+		if (cpumask_weight(&mask) < cpus_per_vec)
+			add_cpus(&mask, unassigned_cpus,
+				topology_sibling_cpumask(cpu),
+				affinity_mask, cpus_per_vec);
+		if (cpumask_weight(&mask) < cpus_per_vec)
+			add_cpus(&mask, unassigned_cpus,
+				topology_core_cpumask(cpu),
+				affinity_mask, cpus_per_vec);
+		if (cpumask_weight(&mask) < cpus_per_vec)
+			add_cpus(&mask, unassigned_cpus,
+				cpumask_of_node(cpu_to_node(cpu)),
+				affinity_mask, cpus_per_vec);
+		if (cpumask_weight(&mask) < cpus_per_vec)
+			add_cpus(&mask, unassigned_cpus,
+				cpumask_of_node(
+					find_closest_node(
+						cpu_to_node(cpu))),
+				affinity_mask, cpus_per_vec);
+		if (cpumask_weight(&mask) < cpus_per_vec)
+			add_cpus(&mask, unassigned_cpus,
+				unassigned_cpus, affinity_mask,
+				cpus_per_vec);
+
+		cpumask_andnot(unassigned_cpus, unassigned_cpus, affinity_mask);
+		cpu = cpumask_next(cpu, unassigned_cpus);
+
+		if (remainder && !--remainder)
+			cpus_per_vec++;
 	}
 	put_online_cpus();
 
-	return affinity_mask;
+	i = 0;
+	cpumask_andnot(unassigned_cpus, cpu_possible_mask, cpu_online_mask);
+	for_each_cpu(cpu, unassigned_cpus) {
+		set_vec_cpus(get_cpu_mask(cpu), &masks[i], ~0);
+		i = (i + 1) % max_vecs;
+	}
+	free_cpumask_var(unassigned_cpus);
+
+	return masks;
 }
--

WARNING: multiple messages have this Message-ID (diff)
From: keith.busch@intel.com (Keith Busch)
Subject: [PATCH 4/7] blk-mq: allow the driver to pass in an affinity mask
Date: Thu, 1 Sep 2016 19:30:10 -0400	[thread overview]
Message-ID: <20160901233010.GC10903@localhost.localdomain> (raw)
In-Reply-To: <20160901142410.GA10903@localhost.localdomain>

On Thu, Sep 01, 2016 at 10:24:10AM -0400, Keith Busch wrote:
> Yeah, I gathered that's what it was providing, but that's just barely
> not enough information to do something useful. The CPUs that aren't set
> have to use a previously assigned vector/queue, but which one?

Unless I'm totally missing how to infer paired CPUs, I think we need
arrays.

Here's a stab at that. I'm using the "old" algorithm the NVMe driver used
to pair vectors and cpus. It's not the most efficient way of pairing
that I know of, but it is easy to follow (relatively speaking), and it
actually utilizes every hardware resource available so I get very good
CPU <-> Queue mappings.

---
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9cc08c6..c5c038e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2283,7 +2283,7 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 static int blk_mq_create_mq_map(struct blk_mq_tag_set *set,
 		const struct cpumask *affinity_mask)
 {
-	int queue = -1, cpu = 0;
+	int queue;
 
 	set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
 			GFP_KERNEL, set->numa_node);
@@ -2293,11 +2293,10 @@ static int blk_mq_create_mq_map(struct blk_mq_tag_set *set,
 	if (!affinity_mask)
 		return 0;	/* map all cpus to queue 0 */
 
-	/* If cpus are offline, map them to first hctx */
-	for_each_online_cpu(cpu) {
-		if (cpumask_test_cpu(cpu, affinity_mask))
-			queue++;
-		if (queue >= 0)
+	for (queue = 0; queue < set->nr_hw_queues; queue++) {
+		int cpu;
+
+		for_each_cpu(cpu, &affinity_mask[queue])
 			set->mq_map[cpu] = queue;
 	}
 
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 98f1222..03a1ffc 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -683,15 +683,11 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
 {
 	const struct cpumask *mask = NULL;
 	struct msi_desc *entry;
-	int cpu = -1, i;
+	int i;
 
 	for (i = 0; i < nvec; i++) {
-		if (dev->irq_affinity) {
-			cpu = cpumask_next(cpu, dev->irq_affinity);
-			if (cpu >= nr_cpu_ids)
-				cpu = cpumask_first(dev->irq_affinity);
-			mask = cpumask_of(cpu);
-		}
+		if (dev->irq_affinity)
+			mask = &dev->irq_affinity[i];
 
 		entry = alloc_msi_entry(&dev->dev);
 		if (!entry) {
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 32f6cfc..9fe548b 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -4,14 +4,47 @@
 #include <linux/slab.h>
 #include <linux/cpu.h>
 
-static int get_first_sibling(unsigned int cpu)
+static int find_closest_node(int node)
 {
-	unsigned int ret;
+	int n, val, min_val = INT_MAX, best_node = node;
+
+	for_each_online_node(n) {
+		if (n == node)
+			continue;
+		val = node_distance(node, n);
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+	return best_node;
+}
+
+static void set_vec_cpus(const cpumask_t *qmask, struct cpumask *affinity_mask,
+								int count)
+{
+	int cpu;
+
+	for_each_cpu(cpu, qmask) {
+		if (cpumask_weight(affinity_mask) >= count)
+			break;
+		cpumask_set_cpu(cpu, affinity_mask);
+	}
+}
+
+static void add_cpus(cpumask_t *mask, const cpumask_t *unassigned_cpus,
+	const cpumask_t *new_mask, struct cpumask *affinity_mask,
+	int cpus_per_queue)
+{
+	int next_cpu;
+
+	for_each_cpu(next_cpu, new_mask) {
+		cpumask_or(mask, mask, get_cpu_mask(next_cpu));
+		cpumask_or(mask, mask, topology_sibling_cpumask(next_cpu));
+		cpumask_and(mask, mask, unassigned_cpus);
+	}
+	set_vec_cpus(mask, affinity_mask, cpus_per_queue);
 
-	ret = cpumask_first(topology_sibling_cpumask(cpu));
-	if (ret < nr_cpu_ids)
-		return ret;
-	return cpu;
 }
 
 /*
@@ -27,37 +60,76 @@ static int get_first_sibling(unsigned int cpu)
  */
 struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
 {
-	struct cpumask *affinity_mask;
-	unsigned int max_vecs = *nr_vecs;
+	struct cpumask *affinity_mask, *masks;
+	unsigned int max_vecs = *nr_vecs, cpu, cpus_per_vec, remainder, i;
+	cpumask_var_t unassigned_cpus;
 
 	if (max_vecs == 1)
 		return NULL;
 
-	affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL);
-	if (!affinity_mask) {
+	masks = kcalloc(max_vecs, sizeof(*affinity_mask), GFP_KERNEL);
+	if (!masks) {
 		*nr_vecs = 1;
 		return NULL;
 	}
 
 	get_online_cpus();
-	if (max_vecs >= num_online_cpus()) {
-		cpumask_copy(affinity_mask, cpu_online_mask);
-		*nr_vecs = num_online_cpus();
-	} else {
-		unsigned int vecs = 0, cpu;
-
-		for_each_online_cpu(cpu) {
-			if (cpu == get_first_sibling(cpu)) {
-				cpumask_set_cpu(cpu, affinity_mask);
-				vecs++;
-			}
-
-			if (--max_vecs == 0)
-				break;
-		}
-		*nr_vecs = vecs;
+
+	cpus_per_vec = num_online_cpus() / max_vecs;
+	remainder = max_vecs - (num_online_cpus() - max_vecs * cpus_per_vec);
+
+	cpumask_copy(unassigned_cpus, cpu_online_mask);
+	cpu = cpumask_first(unassigned_cpus);
+
+	for (i = 0; i < max_vecs; i++) {
+		cpumask_t mask;
+
+		if (!cpumask_weight(unassigned_cpus))
+			break;
+
+		affinity_mask = &masks[i];
+
+		mask = *get_cpu_mask(cpu);
+		set_vec_cpus(&mask, affinity_mask, cpus_per_vec);
+
+		if (cpumask_weight(&mask) < cpus_per_vec)
+			add_cpus(&mask, unassigned_cpus,
+				topology_sibling_cpumask(cpu),
+				affinity_mask, cpus_per_vec);
+		if (cpumask_weight(&mask) < cpus_per_vec)
+			add_cpus(&mask, unassigned_cpus,
+				topology_core_cpumask(cpu),
+				affinity_mask, cpus_per_vec);
+		if (cpumask_weight(&mask) < cpus_per_vec)
+			add_cpus(&mask, unassigned_cpus,
+				cpumask_of_node(cpu_to_node(cpu)),
+				affinity_mask, cpus_per_vec);
+		if (cpumask_weight(&mask) < cpus_per_vec)
+			add_cpus(&mask, unassigned_cpus,
+				cpumask_of_node(
+					find_closest_node(
+						cpu_to_node(cpu))),
+				affinity_mask, cpus_per_vec);
+		if (cpumask_weight(&mask) < cpus_per_vec)
+			add_cpus(&mask, unassigned_cpus,
+				unassigned_cpus, affinity_mask,
+				cpus_per_vec);
+
+		cpumask_andnot(unassigned_cpus, unassigned_cpus, affinity_mask);
+		cpu = cpumask_next(cpu, unassigned_cpus);
+
+		if (remainder && !--remainder)
+			cpus_per_vec++;
 	}
 	put_online_cpus();
 
-	return affinity_mask;
+	i = 0;
+	cpumask_andnot(unassigned_cpus, cpu_possible_mask, cpu_online_mask);
+	for_each_cpu(cpu, unassigned_cpus) {
+		set_vec_cpus(get_cpu_mask(cpu), &masks[i], ~0);
+		i = (i + 1) % max_vecs;
+	}
+	free_cpumask_var(unassigned_cpus);
+
+	return masks;
 }
--

  reply	other threads:[~2016-09-01 23:19 UTC|newest]

Thread overview: 38+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-08-29 10:53 blk-mq: allow passing in an external queue mapping V2 Christoph Hellwig
2016-08-29 10:53 ` Christoph Hellwig
2016-08-29 10:53 ` [PATCH 1/7] blk-mq: don't redistribute hardware queues on a CPU hotplug event Christoph Hellwig
2016-08-29 10:53   ` Christoph Hellwig
2016-08-29 10:53 ` [PATCH 2/7] blk-mq: only allocate a single mq_map per tag_set Christoph Hellwig
2016-08-29 10:53   ` Christoph Hellwig
2016-08-29 10:53 ` [PATCH 3/7] blk-mq: remove ->map_queue Christoph Hellwig
2016-08-29 10:53   ` Christoph Hellwig
2016-08-29 10:53 ` [PATCH 4/7] blk-mq: allow the driver to pass in an affinity mask Christoph Hellwig
2016-08-29 10:53   ` Christoph Hellwig
2016-08-31 16:38   ` Keith Busch
2016-08-31 16:38     ` Keith Busch
2016-09-01  8:46     ` Christoph Hellwig
2016-09-01  8:46       ` Christoph Hellwig
2016-09-01 14:24       ` Keith Busch
2016-09-01 14:24         ` Keith Busch
2016-09-01 23:30         ` Keith Busch [this message]
2016-09-01 23:30           ` Keith Busch
2016-09-05 19:48         ` Christoph Hellwig
2016-09-05 19:48           ` Christoph Hellwig
2016-09-06 14:39           ` Keith Busch
2016-09-06 14:39             ` Keith Busch
2016-09-06 16:50             ` Christoph Hellwig
2016-09-06 16:50               ` Christoph Hellwig
2016-09-06 17:30               ` Keith Busch
2016-09-06 17:30                 ` Keith Busch
2016-09-07 15:38               ` Thomas Gleixner
2016-09-07 15:38                 ` Thomas Gleixner
2016-08-29 10:53 ` [PATCH 5/7] nvme: switch to use pci_alloc_irq_vectors Christoph Hellwig
2016-08-29 10:53   ` Christoph Hellwig
2016-08-29 10:53 ` [PATCH 6/7] nvme: remove the post_scan callout Christoph Hellwig
2016-08-29 10:53   ` Christoph Hellwig
2016-08-29 10:53 ` [PATCH 7/7] blk-mq: get rid of the cpumask in struct blk_mq_tags Christoph Hellwig
2016-08-29 10:53   ` Christoph Hellwig
2016-08-30 23:28 ` blk-mq: allow passing in an external queue mapping V2 Keith Busch
2016-08-30 23:28   ` Keith Busch
2016-09-01  8:45   ` Christoph Hellwig
2016-09-01  8:45     ` Christoph Hellwig

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20160901233010.GC10903@localhost.localdomain \
    --to=keith.busch@intel.com \
    --cc=axboe@fb.com \
    --cc=hch@lst.de \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-nvme@lists.infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.