All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCHv2 0/4] IO Queue fixes rewrite
@ 2014-01-31 23:53 Keith Busch
  2014-01-31 23:53 ` [PATCHv2 1/4] NVMe: Namespace use after free on surprise removal Keith Busch
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: Keith Busch @ 2014-01-31 23:53 UTC (permalink / raw)


New set replacing this one:

http://merlin.infradead.org/pipermail/linux-nvme/2014-January/000656.html

I narrowed this set down to only the surprise removal fixes and the io
queue/cpu mapping. The cosmetic and irq vector patches from the previous
set can wait.

The only patch different in v2 from the previous is the "Per-cpu IO
queues". The feedback on using the per cpu variable to point to an index
rather than a struct nvme_queue pointer was a great idea!

I only included all 4 in this set because I wanted to make sure they
apply cleanly against the new tree.

I also ran sparse this time; I've exceeded my fair share of sparse errors
working on this driver. :)

Keith Busch (4):
  NVMe: Namespace use after free on surprise removal
  NVMe: RCU access to nvme_queue
  NVMe: Per-cpu IO queues
  NVMe: CPU hot plug notification

 drivers/block/nvme-core.c |  314 ++++++++++++++++++++++++++++++++++-----------
 include/linux/nvme.h      |    9 +-
 2 files changed, 247 insertions(+), 76 deletions(-)

-- 
1.7.10.4

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCHv2 1/4] NVMe: Namespace use after free on surprise removal
  2014-01-31 23:53 [PATCHv2 0/4] IO Queue fixes rewrite Keith Busch
@ 2014-01-31 23:53 ` Keith Busch
  2014-01-31 23:53 ` [PATCHv2 2/4] NVMe: RCU access to nvme_queue Keith Busch
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Keith Busch @ 2014-01-31 23:53 UTC (permalink / raw)


An nvme block device may have open references when the device is
removed. New commands may still be sent on the removed device, so we
need to ref count the opens, return errors for new commands, and not
free the namespace and nvme_dev until all references are closed.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/block/nvme-core.c |   55 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 42 insertions(+), 13 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 2372809..3c8f7f2 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -1716,10 +1716,31 @@ static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
 #define nvme_compat_ioctl	NULL
 #endif
 
+static int nvme_open(struct block_device *bdev, fmode_t mode)
+{
+	struct nvme_ns *ns = bdev->bd_disk->private_data;
+	struct nvme_dev *dev = ns->dev;
+
+	kref_get(&dev->kref);
+	return 0;
+}
+
+static void nvme_free_dev(struct kref *kref);
+
+static void nvme_release(struct gendisk *disk, fmode_t mode)
+{
+	struct nvme_ns *ns = disk->private_data;
+	struct nvme_dev *dev = ns->dev;
+
+	kref_put(&dev->kref, nvme_free_dev);
+}
+
 static const struct block_device_operations nvme_fops = {
 	.owner		= THIS_MODULE,
 	.ioctl		= nvme_ioctl,
 	.compat_ioctl	= nvme_compat_ioctl,
+	.open		= nvme_open,
+	.release	= nvme_release,
 };
 
 static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
@@ -1849,13 +1870,6 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
 	return NULL;
 }
 
-static void nvme_ns_free(struct nvme_ns *ns)
-{
-	put_disk(ns->disk);
-	blk_cleanup_queue(ns->queue);
-	kfree(ns);
-}
-
 static int set_queue_count(struct nvme_dev *dev, int count)
 {
 	int status;
@@ -2287,12 +2301,13 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
 
 static void nvme_dev_remove(struct nvme_dev *dev)
 {
-	struct nvme_ns *ns, *next;
+	struct nvme_ns *ns;
 
-	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
-		list_del(&ns->list);
-		del_gendisk(ns->disk);
-		nvme_ns_free(ns);
+	list_for_each_entry(ns, &dev->namespaces, list) {
+		if (ns->disk->flags & GENHD_FL_UP)
+			del_gendisk(ns->disk);
+		if (!blk_queue_dying(ns->queue))
+			blk_cleanup_queue(ns->queue);
 	}
 }
 
@@ -2349,9 +2364,22 @@ static void nvme_release_instance(struct nvme_dev *dev)
 	spin_unlock(&dev_list_lock);
 }
 
+static void nvme_free_namespaces(struct nvme_dev *dev)
+{
+	struct nvme_ns *ns, *next;
+
+	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
+		list_del(&ns->list);
+		put_disk(ns->disk);
+		kfree(ns);
+	}
+}
+
 static void nvme_free_dev(struct kref *kref)
 {
 	struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
+
+	nvme_free_namespaces(dev);
 	kfree(dev->queues);
 	kfree(dev->entry);
 	kfree(dev);
@@ -2525,6 +2553,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		goto release_pools;
 	}
 
+	kref_init(&dev->kref);
 	result = nvme_dev_add(dev);
 	if (result)
 		goto shutdown;
@@ -2540,11 +2569,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		goto remove;
 
 	dev->initialized = 1;
-	kref_init(&dev->kref);
 	return 0;
 
  remove:
 	nvme_dev_remove(dev);
+	nvme_free_namespaces(dev);
  shutdown:
 	nvme_dev_shutdown(dev);
  release_pools:
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCHv2 2/4] NVMe: RCU access to nvme_queue
  2014-01-31 23:53 [PATCHv2 0/4] IO Queue fixes rewrite Keith Busch
  2014-01-31 23:53 ` [PATCHv2 1/4] NVMe: Namespace use after free on surprise removal Keith Busch
@ 2014-01-31 23:53 ` Keith Busch
  2014-01-31 23:53 ` [PATCHv2 3/4] NVMe: Per-cpu IO queues Keith Busch
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Keith Busch @ 2014-01-31 23:53 UTC (permalink / raw)


This adds rcu protected access to nvme_queue to fix a potential race
between a surprise removal freeing the queue and a thread with open
reference on a NVMe block device using that queue.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/block/nvme-core.c |   53 ++++++++++++++++++++-------------------------
 include/linux/nvme.h      |    2 +-
 2 files changed, 25 insertions(+), 30 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 3c8f7f2..4ef748a 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -74,6 +74,7 @@ struct async_cmd_info {
  * commands and one for I/O commands).
  */
 struct nvme_queue {
+	struct rcu_head r_head;
 	struct device *q_dmadev;
 	struct nvme_dev *dev;
 	char irqname[24];	/* nvme4294967295-65535\0 */
@@ -264,12 +265,16 @@ static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
 
 struct nvme_queue *get_nvmeq(struct nvme_dev *dev)
 {
-	return dev->queues[get_cpu() + 1];
+	int queue;
+	rcu_read_lock();
+	queue = get_cpu() + 1;
+	return rcu_dereference(dev->queues[queue]);
 }
 
 void put_nvmeq(struct nvme_queue *nvmeq)
 {
 	put_cpu();
+	rcu_read_unlock();
 }
 
 /**
@@ -819,9 +824,9 @@ static void nvme_make_request(struct request_queue *q, struct bio *bio)
 	struct nvme_queue *nvmeq = get_nvmeq(ns->dev);
 	int result = -EBUSY;
 
-	if (!nvmeq) {
+	if (unlikely(!nvmeq)) {
 		put_nvmeq(NULL);
-		bio_endio(bio, -EIO);
+		bio_endio(bio, -ENXIO);
 		return;
 	}
 
@@ -1137,8 +1142,10 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
 	}
 }
 
-static void nvme_free_queue(struct nvme_queue *nvmeq)
+static void nvme_free_queue(struct rcu_head *r)
 {
+	struct nvme_queue *nvmeq = container_of(r, struct nvme_queue, r_head);
+
 	spin_lock_irq(&nvmeq->q_lock);
 	while (bio_list_peek(&nvmeq->sq_cong)) {
 		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
@@ -1157,10 +1164,13 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
 {
 	int i;
 
+	for (i = num_possible_cpus(); i > dev->queue_count - 1; i--)
+		rcu_assign_pointer(dev->queues[i], NULL);
 	for (i = dev->queue_count - 1; i >= lowest; i--) {
-		nvme_free_queue(dev->queues[i]);
+		struct nvme_queue *nvmeq = dev->queues[i];
+		rcu_assign_pointer(dev->queues[i], NULL);
+		call_rcu(&nvmeq->r_head, nvme_free_queue);
 		dev->queue_count--;
-		dev->queues[i] = NULL;
 	}
 }
 
@@ -1783,8 +1793,11 @@ static int nvme_kthread(void *data)
 				queue_work(nvme_workq, &dev->reset_work);
 				continue;
 			}
+
+			rcu_read_lock();
 			for (i = 0; i < dev->queue_count; i++) {
-				struct nvme_queue *nvmeq = dev->queues[i];
+				struct nvme_queue *nvmeq =
+						rcu_dereference(dev->queues[i]);
 				if (!nvmeq)
 					continue;
 				spin_lock_irq(&nvmeq->q_lock);
@@ -1796,6 +1809,7 @@ static int nvme_kthread(void *data)
  unlock:
 				spin_unlock_irq(&nvmeq->q_lock);
 			}
+			rcu_read_unlock();
 		}
 		spin_unlock(&dev_list_lock);
 		schedule_timeout(round_jiffies_relative(HZ));
@@ -1962,19 +1976,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	}
 
 	/* Free previously allocated queues that are no longer usable */
-	spin_lock(&dev_list_lock);
-	for (i = dev->queue_count - 1; i > nr_io_queues; i--) {
-		struct nvme_queue *nvmeq = dev->queues[i];
-
-		spin_lock_irq(&nvmeq->q_lock);
-		nvme_cancel_ios(nvmeq, false);
-		spin_unlock_irq(&nvmeq->q_lock);
-
-		nvme_free_queue(nvmeq);
-		dev->queue_count--;
-		dev->queues[i] = NULL;
-	}
-	spin_unlock(&dev_list_lock);
+	nvme_free_queues(dev, nr_io_queues + 1);
 
 	cpu = cpumask_first(cpu_online_mask);
 	for (i = 0; i < nr_io_queues; i++) {
@@ -2465,18 +2467,10 @@ static int nvme_remove_dead_ctrl(void *arg)
 
 static void nvme_remove_disks(struct work_struct *ws)
 {
-	int i;
 	struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
 
 	nvme_dev_remove(dev);
-	spin_lock(&dev_list_lock);
-	for (i = dev->queue_count - 1; i > 0; i--) {
-		BUG_ON(!dev->queues[i] || !dev->queues[i]->q_suspended);
-		nvme_free_queue(dev->queues[i]);
-		dev->queue_count--;
-		dev->queues[i] = NULL;
-	}
-	spin_unlock(&dev_list_lock);
+	nvme_free_queues(dev, 1);
 }
 
 static int nvme_dev_resume(struct nvme_dev *dev)
@@ -2608,6 +2602,7 @@ static void nvme_remove(struct pci_dev *pdev)
 	nvme_dev_remove(dev);
 	nvme_dev_shutdown(dev);
 	nvme_free_queues(dev, 0);
+	rcu_barrier();
 	nvme_release_instance(dev);
 	nvme_release_prp_pools(dev);
 	kref_put(&dev->kref, nvme_free_dev);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 69ae03f..98d367b 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -73,7 +73,7 @@ enum {
  */
 struct nvme_dev {
 	struct list_head node;
-	struct nvme_queue **queues;
+	struct nvme_queue __rcu **queues;
 	u32 __iomem *dbs;
 	struct pci_dev *pci_dev;
 	struct dma_pool *prp_page_pool;
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCHv2 3/4] NVMe: Per-cpu IO queues
  2014-01-31 23:53 [PATCHv2 0/4] IO Queue fixes rewrite Keith Busch
  2014-01-31 23:53 ` [PATCHv2 1/4] NVMe: Namespace use after free on surprise removal Keith Busch
  2014-01-31 23:53 ` [PATCHv2 2/4] NVMe: RCU access to nvme_queue Keith Busch
@ 2014-01-31 23:53 ` Keith Busch
  2014-01-31 23:53 ` [PATCHv2 4/4] NVMe: CPU hot plug notification Keith Busch
  2014-02-02 18:28 ` [PATCHv2 0/4] IO Queue fixes rewrite Matthew Wilcox
  4 siblings, 0 replies; 6+ messages in thread
From: Keith Busch @ 2014-01-31 23:53 UTC (permalink / raw)


NVMe IO queues are associated with CPUs, and Linux provides a handy
per-cpu implementation. This gives us a convenient way to optimally
assign queues to multiple cpus when the device supports fewer queues
than the host has cpus. The previous implementation did not share these
optimally and may have shared very poorly in some situations. This new
way will share queues among cpus that are "close" together and should
have the lowest penalty for lock contention.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/block/nvme-core.c |  205 +++++++++++++++++++++++++++++++++++----------
 include/linux/nvme.h      |    6 +-
 2 files changed, 168 insertions(+), 43 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 4ef748a..acea1ee 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -20,6 +20,7 @@
 #include <linux/bio.h>
 #include <linux/bitops.h>
 #include <linux/blkdev.h>
+#include <linux/cpu.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
@@ -35,6 +36,7 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/pci.h>
+#include <linux/percpu.h>
 #include <linux/poison.h>
 #include <linux/ptrace.h>
 #include <linux/sched.h>
@@ -96,6 +98,7 @@ struct nvme_queue {
 	u8 cq_phase;
 	u8 cqe_seen;
 	u8 q_suspended;
+	cpumask_t cpu_mask;
 	struct async_cmd_info cmdinfo;
 	unsigned long cmdid_data[];
 };
@@ -263,18 +266,17 @@ static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
 	return ctx;
 }
 
-struct nvme_queue *get_nvmeq(struct nvme_dev *dev)
+struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU)
 {
-	int queue;
+	unsigned i = get_cpu_var(*dev->io_queue);
 	rcu_read_lock();
-	queue = get_cpu() + 1;
-	return rcu_dereference(dev->queues[queue]);
+	return rcu_dereference(dev->queues[i]);
 }
 
-void put_nvmeq(struct nvme_queue *nvmeq)
+void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
 {
-	put_cpu();
 	rcu_read_unlock();
+	put_cpu_var(nvmeq->dev->io_queue);
 }
 
 /**
@@ -1164,10 +1166,9 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
 {
 	int i;
 
-	for (i = num_possible_cpus(); i > dev->queue_count - 1; i--)
-		rcu_assign_pointer(dev->queues[i], NULL);
 	for (i = dev->queue_count - 1; i >= lowest; i--) {
 		struct nvme_queue *nvmeq = dev->queues[i];
+
 		rcu_assign_pointer(dev->queues[i], NULL);
 		call_rcu(&nvmeq->r_head, nvme_free_queue);
 		dev->queue_count--;
@@ -1259,6 +1260,8 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
 	nvmeq->cq_vector = vector;
 	nvmeq->qid = qid;
 	nvmeq->q_suspended = 1;
+	cpumask_clear(&nvmeq->cpu_mask);
+	rcu_assign_pointer(dev->queues[qid], nvmeq);
 	dev->queue_count++;
 
 	return nvmeq;
@@ -1295,6 +1298,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
 	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
 	nvme_cancel_ios(nvmeq, false);
 	nvmeq->q_suspended = 0;
+	dev->online_queues++;
 }
 
 static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
@@ -1884,6 +1888,144 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
 	return NULL;
 }
 
+static int nvme_find_closest_node(int node)
+{
+	int n, val, min_val = INT_MAX, best_node = node;
+
+	for_each_online_node(n) {
+		if (n == node)
+			continue;
+		val = node_distance(node, n);
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+	return best_node;
+}
+
+static void nvme_set_queue_cpus(cpumask_t *qmask, struct nvme_queue *nvmeq,
+								int count)
+{
+	int cpu;
+	for_each_cpu(cpu, qmask) {
+		if (cpus_weight(nvmeq->cpu_mask) >= count)
+			break;
+		if (!cpumask_test_and_set_cpu(cpu, &nvmeq->cpu_mask))
+			*per_cpu_ptr(nvmeq->dev->io_queue, cpu) = nvmeq->qid;
+	}
+}
+
+static void nvme_add_cpus(cpumask_t *mask, const cpumask_t *unassigned_cpus,
+	const cpumask_t *new_mask, struct nvme_queue *nvmeq, int cpus_per_queue)
+{
+	int next_cpu;
+	for_each_cpu(next_cpu, new_mask) {
+		cpumask_or(mask, mask, get_cpu_mask(next_cpu));
+		cpumask_or(mask, mask, topology_thread_cpumask(next_cpu));
+		cpumask_and(mask, mask, unassigned_cpus);
+		nvme_set_queue_cpus(mask, nvmeq, cpus_per_queue);
+	}
+}
+
+static void nvme_create_io_queues(struct nvme_dev *dev)
+{
+	unsigned i, max;
+
+	max = min(dev->max_qid, num_online_cpus());
+	for (i = dev->queue_count; i <= max; i++)
+		if (!nvme_alloc_queue(dev, i, dev->q_depth, i - 1))
+			break;
+
+	max = min(dev->queue_count - 1, num_online_cpus());
+	for (i = dev->online_queues; i <= max; i++)
+		if (nvme_create_queue(dev->queues[i], i))
+			break;
+}
+
+/*
+ * If there are fewer queues than online cpus, this will try to optimally
+ * assign a queue to multiple cpus by grouping cpus that are "close" together:
+ * thread siblings, core, socket, closest node, then whatever else is
+ * available.
+ */
+static void nvme_assign_io_queues(struct nvme_dev *dev)
+{
+	unsigned cpu, cpus_per_queue, queues, remainder, i;
+	cpumask_t unassigned_cpus;
+
+	nvme_create_io_queues(dev);
+
+	queues = min(dev->online_queues - 1, num_online_cpus());
+	if (!queues)
+		return;
+
+	cpus_per_queue = num_online_cpus() / queues;
+	remainder = queues - (num_online_cpus() - queues * cpus_per_queue);
+
+	unassigned_cpus = *cpu_online_mask;
+	cpu = cpumask_first(&unassigned_cpus);
+	for (i = 1; i <= queues; i++) {
+		struct nvme_queue *nvmeq = dev->queues[i];
+		cpumask_t mask;
+
+		cpumask_clear(&nvmeq->cpu_mask);
+		if (!cpus_weight(unassigned_cpus))
+			break;
+
+		mask = *get_cpu_mask(cpu);
+		nvme_set_queue_cpus(&mask, nvmeq, cpus_per_queue);
+		if (cpus_weight(mask) < cpus_per_queue)
+			nvme_add_cpus(&mask, &unassigned_cpus,
+				topology_thread_cpumask(cpu),
+				nvmeq, cpus_per_queue);
+		if (cpus_weight(mask) < cpus_per_queue)
+			nvme_add_cpus(&mask, &unassigned_cpus,
+				topology_core_cpumask(cpu),
+				nvmeq, cpus_per_queue);
+		if (cpus_weight(mask) < cpus_per_queue)
+			nvme_add_cpus(&mask, &unassigned_cpus,
+				cpumask_of_node(cpu_to_node(cpu)),
+				nvmeq, cpus_per_queue);
+		if (cpus_weight(mask) < cpus_per_queue)
+			nvme_add_cpus(&mask, &unassigned_cpus,
+				cpumask_of_node(
+					nvme_find_closest_node(
+						cpu_to_node(cpu))),
+				nvmeq, cpus_per_queue);
+		if (cpus_weight(mask) < cpus_per_queue)
+			nvme_add_cpus(&mask, &unassigned_cpus,
+				&unassigned_cpus,
+				nvmeq, cpus_per_queue);
+
+		WARN(cpus_weight(nvmeq->cpu_mask) != cpus_per_queue,
+			"nvme%d qid:%d mis-matched queue-to-cpu assignment\n",
+			dev->instance, i);
+
+		irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
+							&nvmeq->cpu_mask);
+
+		cpumask_andnot(&unassigned_cpus, &unassigned_cpus,
+						&nvmeq->cpu_mask);
+
+		cpu = cpumask_next(cpu, &unassigned_cpus);
+		if (remainder && !--remainder)
+			cpus_per_queue++;
+	}
+	WARN(cpus_weight(unassigned_cpus), "nvme%d unassigned online cpus\n",
+								dev->instance);
+
+	/*
+	 * All possible cpus must point to a valid queue. We don't have thread
+	 * sibling info on offline cpus, so no sharing optimization on these
+	 * cpus.
+	 */
+	cpumask_andnot(&unassigned_cpus, cpu_possible_mask, cpu_online_mask);
+	i = 0;
+	for_each_cpu(cpu, &unassigned_cpus)
+		*per_cpu_ptr(dev->io_queue, cpu) = (i++ % queues) + 1;
+}
+
 static int set_queue_count(struct nvme_dev *dev, int count)
 {
 	int status;
@@ -1906,9 +2048,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 {
 	struct nvme_queue *adminq = dev->queues[0];
 	struct pci_dev *pdev = dev->pci_dev;
-	int result, cpu, i, vecs, nr_io_queues, size, q_depth;
+	int result, i, vecs, nr_io_queues, size;
 
-	nr_io_queues = num_online_cpus();
+	nr_io_queues = num_possible_cpus();
 	result = set_queue_count(dev, nr_io_queues);
 	if (result < 0)
 		return result;
@@ -1968,6 +2110,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	 * number of interrupts.
 	 */
 	nr_io_queues = vecs;
+	dev->max_qid = nr_io_queues;
 
 	result = queue_request_irq(dev, adminq, adminq->irqname);
 	if (result) {
@@ -1977,37 +2120,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 
 	/* Free previously allocated queues that are no longer usable */
 	nvme_free_queues(dev, nr_io_queues + 1);
-
-	cpu = cpumask_first(cpu_online_mask);
-	for (i = 0; i < nr_io_queues; i++) {
-		irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu));
-		cpu = cpumask_next(cpu, cpu_online_mask);
-	}
-
-	q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1,
-								NVME_Q_DEPTH);
-	for (i = dev->queue_count - 1; i < nr_io_queues; i++) {
-		dev->queues[i + 1] = nvme_alloc_queue(dev, i + 1, q_depth, i);
-		if (!dev->queues[i + 1]) {
-			result = -ENOMEM;
-			goto free_queues;
-		}
-	}
-
-	for (; i < num_possible_cpus(); i++) {
-		int target = i % rounddown_pow_of_two(dev->queue_count - 1);
-		dev->queues[i + 1] = dev->queues[target + 1];
-	}
-
-	for (i = 1; i < dev->queue_count; i++) {
-		result = nvme_create_queue(dev->queues[i], i);
-		if (result) {
-			for (--i; i > 0; i--)
-				nvme_disable_queue(dev, i);
-			goto free_queues;
-		}
-	}
-
+	nvme_assign_io_queues(dev);
 	return 0;
 
  free_queues:
@@ -2085,6 +2198,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
 
 static int nvme_dev_map(struct nvme_dev *dev)
 {
+	u64 cap;
 	int bars, result = -ENOMEM;
 	struct pci_dev *pdev = dev->pci_dev;
 
@@ -2108,7 +2222,9 @@ static int nvme_dev_map(struct nvme_dev *dev)
 		result = -ENODEV;
 		goto unmap;
 	}
-	dev->db_stride = 1 << NVME_CAP_STRIDE(readq(&dev->bar->cap));
+	cap = readq(&dev->bar->cap);
+	dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
+	dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
 	dev->dbs = ((void __iomem *)dev->bar) + 4096;
 
 	return 0;
@@ -2382,6 +2498,7 @@ static void nvme_free_dev(struct kref *kref)
 	struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
 
 	nvme_free_namespaces(dev);
+	free_percpu(dev->io_queue);
 	kfree(dev->queues);
 	kfree(dev->entry);
 	kfree(dev);
@@ -2527,6 +2644,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 								GFP_KERNEL);
 	if (!dev->queues)
 		goto free;
+	dev->io_queue = alloc_percpu(unsigned short);
+	if (!dev->io_queue)
+		goto free;
 
 	INIT_LIST_HEAD(&dev->namespaces);
 	INIT_WORK(&dev->reset_work, nvme_reset_failed_dev);
@@ -2576,6 +2696,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
  release:
 	nvme_release_instance(dev);
  free:
+	free_percpu(dev->io_queue);
 	kfree(dev->queues);
 	kfree(dev->entry);
 	kfree(dev);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 98d367b..d574acd 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -74,12 +74,16 @@ enum {
 struct nvme_dev {
 	struct list_head node;
 	struct nvme_queue __rcu **queues;
+	unsigned short __percpu *io_queue;
 	u32 __iomem *dbs;
 	struct pci_dev *pci_dev;
 	struct dma_pool *prp_page_pool;
 	struct dma_pool *prp_small_pool;
 	int instance;
-	int queue_count;
+	unsigned queue_count;
+	unsigned online_queues;
+	unsigned max_qid;
+	int q_depth;
 	u32 db_stride;
 	u32 ctrl_config;
 	struct msix_entry *entry;
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCHv2 4/4] NVMe: CPU hot plug notification
  2014-01-31 23:53 [PATCHv2 0/4] IO Queue fixes rewrite Keith Busch
                   ` (2 preceding siblings ...)
  2014-01-31 23:53 ` [PATCHv2 3/4] NVMe: Per-cpu IO queues Keith Busch
@ 2014-01-31 23:53 ` Keith Busch
  2014-02-02 18:28 ` [PATCHv2 0/4] IO Queue fixes rewrite Matthew Wilcox
  4 siblings, 0 replies; 6+ messages in thread
From: Keith Busch @ 2014-01-31 23:53 UTC (permalink / raw)


Registers with hot cpu notification to rebalance - and potentially
allocate additional - io queues among cpus.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/block/nvme-core.c |   23 ++++++++++++++++++++++-
 include/linux/nvme.h      |    1 +
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index acea1ee..c68016d 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -2018,7 +2018,8 @@ static void nvme_assign_io_queues(struct nvme_dev *dev)
 	/*
 	 * All possible cpus must point to a valid queue. We don't have thread
 	 * sibling info on offline cpus, so no sharing optimization on these
-	 * cpus.
+	 * cpus. These should automatically be rebalanced from hot plug
+	 * notification.
 	 */
 	cpumask_andnot(&unassigned_cpus, cpu_possible_mask, cpu_online_mask);
 	i = 0;
@@ -2044,6 +2045,19 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
 	return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
 }
 
+static int nvme_cpu_notify(struct notifier_block *self,
+				unsigned long action, void *hcpu)
+{
+	struct nvme_dev *dev = container_of(self, struct nvme_dev, nb);
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_DEAD:
+		nvme_assign_io_queues(dev);
+		break;
+	}
+	return NOTIFY_OK;
+}
+
 static int nvme_setup_io_queues(struct nvme_dev *dev)
 {
 	struct nvme_queue *adminq = dev->queues[0];
@@ -2121,6 +2135,12 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	/* Free previously allocated queues that are no longer usable */
 	nvme_free_queues(dev, nr_io_queues + 1);
 	nvme_assign_io_queues(dev);
+
+	dev->nb.notifier_call = &nvme_cpu_notify;
+	result = register_hotcpu_notifier(&dev->nb);
+	if (result)
+		goto free_queues;
+
 	return 0;
 
  free_queues:
@@ -2398,6 +2418,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
 	int i;
 
 	dev->initialized = 0;
+	unregister_hotcpu_notifier(&dev->nb);
 
 	spin_lock(&dev_list_lock);
 	list_del_init(&dev->node);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index d574acd..fb5911d 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -92,6 +92,7 @@ struct nvme_dev {
 	struct kref kref;
 	struct miscdevice miscdev;
 	struct work_struct reset_work;
+	struct notifier_block nb;
 	char name[12];
 	char serial[20];
 	char model[40];
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCHv2 0/4] IO Queue fixes rewrite
  2014-01-31 23:53 [PATCHv2 0/4] IO Queue fixes rewrite Keith Busch
                   ` (3 preceding siblings ...)
  2014-01-31 23:53 ` [PATCHv2 4/4] NVMe: CPU hot plug notification Keith Busch
@ 2014-02-02 18:28 ` Matthew Wilcox
  4 siblings, 0 replies; 6+ messages in thread
From: Matthew Wilcox @ 2014-02-02 18:28 UTC (permalink / raw)


On Fri, Jan 31, 2014 at 04:53:38PM -0700, Keith Busch wrote:
> The only patch different in v2 from the previous is the "Per-cpu IO
> queues". The feedback on using the per cpu variable to point to an index
> rather than a struct nvme_queue pointer was a great idea!
> 
> I only included all 4 in this set because I wanted to make sure they
> apply cleanly against the new tree.
> 
> I also ran sparse this time; I've exceeded my fair share of sparse errors
> working on this driver. :)

Unfortunately, you missed a CONFIG option to enable checking of __rcu.
You need CONFIG_SPARSE_RCU_POINTER=y

With that enabled, I get a bunch of warnings like this:

drivers/block/nvme-core.c:942:48: warning: incorrect type in argument 1 (different address spaces)
drivers/block/nvme-core.c:942:48:    expected struct nvme_queue *nvmeq
drivers/block/nvme-core.c:942:48:    got struct nvme_queue [noderef] <asn:4>*<noident>
drivers/block/nvme-core.c:948:49: warning: incorrect type in argument 1 (different address spaces)
drivers/block/nvme-core.c:948:49:    expected struct nvme_queue *nvmeq
drivers/block/nvme-core.c:948:49:    got struct nvme_queue [noderef] <asn:4>*<noident>

The problem is that only the IO queues need to be RCU-protected, but every
time we access the admin queue, we get a warning, because clearly there's
no way to tell sparse "element 0 of this array isn't RCU protected,
only elements 1-n are".

So ... I think I'll just include patch 1 out of this set and send a pull
request to Linus today.

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2014-02-02 18:28 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-01-31 23:53 [PATCHv2 0/4] IO Queue fixes rewrite Keith Busch
2014-01-31 23:53 ` [PATCHv2 1/4] NVMe: Namespace use after free on surprise removal Keith Busch
2014-01-31 23:53 ` [PATCHv2 2/4] NVMe: RCU access to nvme_queue Keith Busch
2014-01-31 23:53 ` [PATCHv2 3/4] NVMe: Per-cpu IO queues Keith Busch
2014-01-31 23:53 ` [PATCHv2 4/4] NVMe: CPU hot plug notification Keith Busch
2014-02-02 18:28 ` [PATCHv2 0/4] IO Queue fixes rewrite Matthew Wilcox

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.