* [PATCH] NVMe: SQ/CQ NUMA locality
From: Keith Busch @ 2013-01-29 1:20 UTC
Allocates the memory for the SQ/CQ pairs based on the NUMA node for the
CPU which the queue is associated with.
Signed-off-by: Keith Busch <keith.busch@intel.com>
This is related to an item on the "TODO" list that suggests experimenting
with NUMA locality. There is no DMA allocation routine that takes a NUMA node
id, so the allocations are done a bit differently here. I am not sure whether
this is the correct way to use dma_map_single/dma_unmap_single, but it seems
to work fine.
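For context, the substitution boils down to the pattern below (a simplified
sketch, not the exact hunks that follow; the dma_mapping_error() check shown
here is something the real patch should probably grow as well). DMA_FROM_DEVICE
is used for the CQ; the SQ side uses DMA_TO_DEVICE.

	/* before: coherent allocation, no control over the NUMA node */
	buf = dma_alloc_coherent(dev, size, &dma_addr, GFP_KERNEL);

	/* after: node-local pages plus a streaming DMA mapping */
	buf = alloc_pages_exact_nid(nid, size, GFP_KERNEL);
	if (buf) {
		memset(buf, 0, size);
		dma_addr = dma_map_single(dev, buf, size, DMA_FROM_DEVICE);
		if (dma_mapping_error(dev, dma_addr)) {
			free_pages_exact(buf, size);
			buf = NULL;
		}
	}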
I tested this on an Intel SC2600C0 server with two E5-2600 Xeons (32 CPU
threads total), all memory sockets fully populated, giving two NUMA domains.
The only NVMe device I can test with is a pre-alpha-level part implemented on
an FPGA, so it doesn't run as fast as production hardware would, but I could
still measure a small, if not very significant, difference with fio.
With NUMA:
READ: io=65534MB, aggrb=262669KB/s, minb=8203KB/s, maxb=13821KB/s, mint=152006msec, maxt=255482msec
WRITE: io=65538MB, aggrb=262681KB/s, minb=8213KB/s, maxb=13792KB/s, mint=152006msec, maxt=255482msec
Without NUMA:
READ: io=65535MB, aggrb=257995KB/s, minb=8014KB/s, maxb=13217KB/s, mint=159122msec, maxt=264339msec
WRITE: io=65537MB, aggrb=258001KB/s, minb=8035KB/s, maxb=13198KB/s, mint=159122msec, maxt=264339msec
---
drivers/block/nvme.c | 60 +++++++++++++++++++++++++++----------------------
1 files changed, 33 insertions(+), 27 deletions(-)
diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index 993c014..af9bf2e 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -898,10 +898,12 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
static void nvme_free_queue_mem(struct nvme_queue *nvmeq)
{
- dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
- (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
- dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
- nvmeq->sq_cmds, nvmeq->sq_dma_addr);
+ dma_unmap_single(nvmeq->q_dmadev, nvmeq->cq_dma_addr,
+ CQ_SIZE(nvmeq->q_depth), DMA_FROM_DEVICE);
+ free_pages_exact((void *)nvmeq->cqes, CQ_SIZE(nvmeq->q_depth));
+ dma_unmap_single(nvmeq->q_dmadev, nvmeq->sq_dma_addr,
+ SQ_SIZE(nvmeq->q_depth), DMA_TO_DEVICE);
+ free_pages_exact((void *)nvmeq->sq_cmds, SQ_SIZE(nvmeq->q_depth));
kfree(nvmeq);
}
@@ -931,25 +933,28 @@ static void nvme_free_queue(struct nvme_dev *dev, int qid)
}
static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
- int depth, int vector)
+ int depth, int vector, int nid)
{
struct device *dmadev = &dev->pci_dev->dev;
unsigned extra = DIV_ROUND_UP(depth, 8) + (depth *
sizeof(struct nvme_cmd_info));
- struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
+ struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq) + extra,
+ GFP_KERNEL, nid);
if (!nvmeq)
return NULL;
- nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth),
- &nvmeq->cq_dma_addr, GFP_KERNEL);
+ nvmeq->cqes = alloc_pages_exact_nid(nid, CQ_SIZE(depth), GFP_KERNEL);
if (!nvmeq->cqes)
goto free_nvmeq;
memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth));
+ nvmeq->cq_dma_addr = dma_map_single(dmadev, (void *)nvmeq->cqes,
+ CQ_SIZE(depth), DMA_FROM_DEVICE);
- nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
- &nvmeq->sq_dma_addr, GFP_KERNEL);
+ nvmeq->sq_cmds = alloc_pages_exact_nid(nid, SQ_SIZE(depth), GFP_KERNEL);
if (!nvmeq->sq_cmds)
goto free_cqdma;
+ nvmeq->sq_dma_addr = dma_map_single(dmadev, (void *)nvmeq->sq_cmds,
+ SQ_SIZE(depth), DMA_TO_DEVICE);
nvmeq->q_dmadev = dmadev;
nvmeq->dev = dev;
@@ -966,8 +971,8 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
return nvmeq;
free_cqdma:
- dma_free_coherent(dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes,
- nvmeq->cq_dma_addr);
+ dma_unmap_single(dmadev, nvmeq->cq_dma_addr, CQ_SIZE(depth), DMA_FROM_DEVICE);
+ free_pages_exact((void *)nvmeq->cqes, CQ_SIZE(depth));
free_nvmeq:
kfree(nvmeq);
return NULL;
@@ -986,10 +991,10 @@ static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
}
static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev,
- int qid, int cq_size, int vector)
+ int qid, int cq_size, int vector, int nid)
{
int result;
- struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector);
+ struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector, nid);
if (!nvmeq)
return ERR_PTR(-ENOMEM);
@@ -1013,10 +1018,12 @@ static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev,
release_cq:
adapter_delete_cq(dev, qid);
free_nvmeq:
- dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
- (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
- dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
- nvmeq->sq_cmds, nvmeq->sq_dma_addr);
+ dma_unmap_single(nvmeq->q_dmadev, nvmeq->cq_dma_addr,
+ CQ_SIZE(nvmeq->q_depth), DMA_FROM_DEVICE);
+ free_pages_exact((void *)nvmeq->cqes, CQ_SIZE(nvmeq->q_depth));
+ dma_unmap_single(nvmeq->q_dmadev, nvmeq->sq_dma_addr,
+ SQ_SIZE(nvmeq->q_depth), DMA_TO_DEVICE);
+ free_pages_exact((void *)nvmeq->sq_cmds, SQ_SIZE(nvmeq->q_depth));
kfree(nvmeq);
return ERR_PTR(result);
}
@@ -1031,7 +1038,7 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev)
dev->dbs = ((void __iomem *)dev->bar) + 4096;
- nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
+ nvmeq = nvme_alloc_queue(dev, 0, 64, 0, -1);
if (!nvmeq)
return -ENOMEM;
@@ -1425,7 +1432,7 @@ static int set_queue_count(struct nvme_dev *dev, int count)
static int __devinit nvme_setup_io_queues(struct nvme_dev *dev)
{
- int result, cpu, i, nr_io_queues, db_bar_size, q_depth;
+ int result, cpu, nid, i, nr_io_queues, db_bar_size, q_depth;
nr_io_queues = num_online_cpus();
result = set_queue_count(dev, nr_io_queues);
@@ -1465,21 +1472,20 @@ static int __devinit nvme_setup_io_queues(struct nvme_dev *dev)
result = queue_request_irq(dev, dev->queues[0], "nvme admin");
/* XXX: handle failure here */
- cpu = cpumask_first(cpu_online_mask);
- for (i = 0; i < nr_io_queues; i++) {
- irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu));
- cpu = cpumask_next(cpu, cpu_online_mask);
- }
-
q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1,
NVME_Q_DEPTH);
+ cpu = cpumask_first(cpu_online_mask);
for (i = 0; i < nr_io_queues; i++) {
- dev->queues[i + 1] = nvme_create_queue(dev, i + 1, q_depth, i);
+ irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu));
+ nid = cpu_to_node(cpu);
+ dev->queues[i + 1] = nvme_create_queue(dev, i + 1, q_depth, i, nid);
if (IS_ERR(dev->queues[i + 1]))
return PTR_ERR(dev->queues[i + 1]);
dev->queue_count++;
+ cpu = cpumask_next(cpu, cpu_online_mask);
}
+ /* XXX: use NUMA node for optimal queue wrapping */
for (; i < num_possible_cpus(); i++) {
int target = i % rounddown_pow_of_two(dev->queue_count - 1);
dev->queues[i + 1] = dev->queues[target + 1];
--
1.7.0.4
* [PATCH] NVMe: SQ/CQ NUMA locality
From: Matthew Wilcox @ 2013-03-13 6:48 UTC
On Mon, Jan 28, 2013 at 06:20:41PM -0700, Keith Busch wrote:
> This is related to an item on the "TODO" list that suggests experimenting
> with NUMA locality. There is no DMA allocation routine that takes a NUMA node
> id, so the allocations are done a bit differently here. I am not sure whether
> this is the correct way to use dma_map_single/dma_unmap_single, but it seems
> to work fine.
Ah ... works fine on Intel architectures ... not so fine on
other architectures. We'd have to add in explicit calls to
dma_sync_single_for_cpu() and dma_sync_single_for_device(), and that's
just not going to be efficient.
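Roughly, every pass over the completion queue would have to be bracketed like
this (a sketch only, placement approximate, using the fields the patch already
touches):

	/* make the device's writes to the CQ visible to the CPU */
	dma_sync_single_for_cpu(nvmeq->q_dmadev, nvmeq->cq_dma_addr,
				CQ_SIZE(nvmeq->q_depth), DMA_FROM_DEVICE);

	/* ... read and process completion entries ... */

	/* hand ownership of the CQ memory back to the device */
	dma_sync_single_for_device(nvmeq->q_dmadev, nvmeq->cq_dma_addr,
				   CQ_SIZE(nvmeq->q_depth), DMA_FROM_DEVICE);

and similarly around every submission queue write.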
> I tested this on an Intel SC2600C0 server with two E5-2600 Xeons (32 CPU
> threads total), all memory sockets fully populated, giving two NUMA domains.
> The only NVMe device I can test with is a pre-alpha-level part implemented on
> an FPGA, so it doesn't run as fast as production hardware would, but I could
> still measure a small, if not very significant, difference with fio.
>
> With NUMA:
>
> READ: io=65534MB, aggrb=262669KB/s, minb=8203KB/s, maxb=13821KB/s, mint=152006msec, maxt=255482msec
> WRITE: io=65538MB, aggrb=262681KB/s, minb=8213KB/s, maxb=13792KB/s, mint=152006msec, maxt=255482msec
>
> Without NUMA:
>
> READ: io=65535MB, aggrb=257995KB/s, minb=8014KB/s, maxb=13217KB/s, mint=159122msec, maxt=264339msec
> WRITE: io=65537MB, aggrb=258001KB/s, minb=8035KB/s, maxb=13198KB/s, mint=159122msec, maxt=264339msec
I think we can get in trouble for posting raw numbers ... so let's
pretend you simply said "About a 2% performance improvement". Now, OK,
that doesn't sound like much, but that's significant enough to make this
worth pursuing.
So ... I think we need to add a dma_alloc_attrs_node() or something,
and pass the nid all the way down to the ->alloc routine.
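Something along these lines, perhaps (entirely hypothetical, no such helper
exists today):

	/*
	 * Hypothetical helper: plumb a node id through dma_alloc_attrs()
	 * down to the dma_map_ops ->alloc callback so architectures that
	 * can honour it allocate the coherent memory on the requested node.
	 */
	void *dma_alloc_attrs_node(struct device *dev, size_t size,
				   dma_addr_t *dma_handle, gfp_t gfp,
				   struct dma_attrs *attrs, int nid);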
Another thing I'd like you to try is allocating *only* the completion queue
local to the CPU's node, i.e. allocate the submission queue on the node local
to the device and the completion queue on the node local to the CPU that is
using it.
My reason for thinking this is a good idea is the assumption that
cross-node writes are cheaper than reads. So having the CPU write to
remote memory, the device read from local memory, then the device write
to remote memory and the CPU read from local memory should work out
better than either allocating both the submission & completion queues
local to the CPU or local to the device.
I think that dma_alloc_coherent currently allocates memory local to the
device, so all you need to do to test this theory is revert the half of
your patch which allocates the submission queue local to the CPU.
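In nvme_alloc_queue that would look something like this (untested sketch,
error handling omitted):

	/* submission queue: keep the coherent allocation, which should
	 * come out local to the device */
	nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
					&nvmeq->sq_dma_addr, GFP_KERNEL);

	/* completion queue: node-local to the CPU that services it */
	nvmeq->cqes = alloc_pages_exact_nid(nid, CQ_SIZE(depth), GFP_KERNEL);
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth));
	nvmeq->cq_dma_addr = dma_map_single(dmadev, (void *)nvmeq->cqes,
					CQ_SIZE(depth), DMA_FROM_DEVICE);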
Thanks for trying this out!