Linux-NVME Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: axboe@kernel.dk (Jens Axboe)
Subject: [PATCH] nvme: use __GFP_NOWARN for iod allocation
Date: Thu, 21 Jun 2018 09:11:28 -0600	[thread overview]
Message-ID: <b142d5e6-e7cd-fa41-776e-2acd18691bae@kernel.dk> (raw)
In-Reply-To: <20180621150624.GA1718@infradead.org>

On 6/21/18 9:06 AM, Christoph Hellwig wrote:
> On Thu, Jun 21, 2018@09:02:18AM -0600, Jens Axboe wrote:
>> >From a quick look, the fabric code goes through the same nvme_queue_rq()
>> and ends up doing the same iod->sg allocation. I don't see any use of
>> chained sg lists, we're allocating one big sg table for the command.
> 
> nvme_queue_rq is only used for PCIe.
> 
> E.g. for RDMA the call is:
> 
> nvme_rdma_queue_rq
>  -> nvme_rdma_map_data
>   -> sg_alloc_table_chained
> 
> similar for FC and loop.

Gotcha. So we can safely ignore limiting segments and max size for
fabrics for now.

How about the below?


diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 21710a7460c8..f1af08171d38 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1799,6 +1799,16 @@ int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);
 
+/*
+ * For non-fabrics, limit the max IO size and segments so that we know our
+ * iod->sg allocation can fall within a single page. Fabrics is more sensitive
+ * to command size, and it also does sg chaining to avoid large allocations.
+ */
+static bool nvme_ctrl_limit_io_size(struct nvme_ctrl *ctrl)
+{
+	return (ctrl->ops->flags & NVME_F_FABRICS) == 0;
+}
+
 static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
 		struct request_queue *q)
 {
@@ -1808,8 +1818,11 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
 		u32 max_segments =
 			(ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;
 
+		if (nvme_ctrl_limit_io_size(ctrl))
+			max_segments = min_not_zero(max_segments,
+							(u32)NVME_MAX_SEGS);
 		blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
-		blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
+		blk_queue_max_segments(q, max_segments);
 	}
 	if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
 	    is_power_of_2(ctrl->max_hw_sectors))
@@ -2377,8 +2390,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 	ctrl->cntlid = le16_to_cpup(&id->cntlid);
 	if (id->mdts)
 		max_hw_sectors = 1 << (id->mdts + page_shift - 9);
-	else
+	else if (!nvme_ctrl_limit_io_size(ctrl))
 		max_hw_sectors = UINT_MAX;
+	else
+		max_hw_sectors = NVME_MAX_KB_SZ << 1;
 	ctrl->max_hw_sectors =
 		min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 231807cbc849..efce853a6638 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -33,6 +33,13 @@ extern unsigned int admin_timeout;
 #define NVME_DEFAULT_KATO	5
 #define NVME_KATO_GRACE		10
 
+/*
+ * These can be higher, but we need to ensure that any command doesn't
+ * require an sg allocation that needs more than a page of data.
+ */
+#define NVME_MAX_KB_SZ	4096
+#define NVME_MAX_SEGS	127
+
 extern struct workqueue_struct *nvme_wq;
 extern struct workqueue_struct *nvme_reset_wq;
 extern struct workqueue_struct *nvme_delete_wq;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index fc33804662e7..663446d55a49 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -100,6 +100,8 @@ struct nvme_dev {
 	struct nvme_ctrl ctrl;
 	struct completion ioq_wait;
 
+	mempool_t *iod_mempool;
+
 	/* shadow doorbell buffer support: */
 	u32 *dbbuf_dbs;
 	dma_addr_t dbbuf_dbs_dma_addr;
@@ -477,10 +479,7 @@ static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
 	iod->use_sgl = nvme_pci_use_sgls(dev, rq);
 
 	if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
-		size_t alloc_size = nvme_pci_iod_alloc_size(dev, size, nseg,
-				iod->use_sgl);
-
-		iod->sg = kmalloc(alloc_size, GFP_ATOMIC);
+		iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
 		if (!iod->sg)
 			return BLK_STS_RESOURCE;
 	} else {
@@ -526,7 +525,7 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
 	}
 
 	if (iod->sg != iod->inline_sg)
-		kfree(iod->sg);
+		mempool_free(iod->sg, dev->iod_mempool);
 }
 
 #ifdef CONFIG_BLK_DEV_INTEGRITY
@@ -2280,6 +2279,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
 		blk_put_queue(dev->ctrl.admin_q);
 	kfree(dev->queues);
 	free_opal_dev(dev->ctrl.opal_dev);
+	mempool_destroy(dev->iod_mempool);
 	kfree(dev);
 }
 
@@ -2509,6 +2509,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	int node, result = -ENOMEM;
 	struct nvme_dev *dev;
 	unsigned long quirks = id->driver_data;
+	size_t alloc_size;
 
 	node = dev_to_node(&pdev->dev);
 	if (node == NUMA_NO_NODE)
@@ -2546,6 +2547,23 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (result)
 		goto release_pools;
 
+	/*
+	 * Double check that our mempool alloc size will cover the biggest
+	 * command we support.
+	 */
+	alloc_size = nvme_pci_iod_alloc_size(dev, NVME_MAX_KB_SZ,
+						NVME_MAX_SEGS, true);
+	WARN_ON_ONCE(alloc_size > PAGE_SIZE);
+
+	dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
+						mempool_kfree,
+						(void *) alloc_size,
+						GFP_KERNEL, node);
+	if (!dev->iod_mempool) {
+		result = -ENOMEM;
+		goto release_pools;
+	}
+
 	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
 
 	nvme_get_ctrl(&dev->ctrl);

-- 
Jens Axboe

      reply	other threads:[~2018-06-21 15:11 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-06-20 19:10 [PATCH] nvme: use __GFP_NOWARN for iod allocation Jens Axboe
2018-06-20 19:16 ` Jens Axboe
2018-06-20 20:09   ` Jens Axboe
2018-06-20 20:43     ` Keith Busch
2018-06-20 20:54       ` Jens Axboe
2018-06-20 22:30         ` Keith Busch
2018-06-20 22:30           ` Jens Axboe
2018-06-21  7:10             ` Christoph Hellwig
2018-06-21 14:47               ` Jens Axboe
2018-06-21 15:01                 ` Christoph Hellwig
2018-06-21 15:04                   ` Jens Axboe
2018-06-21 15:06                     ` Jens Axboe
2018-06-21 15:24                       ` Christoph Hellwig
2018-06-21 15:26                         ` Jens Axboe
2018-06-21 15:24                     ` Christoph Hellwig
2018-06-21 15:34                       ` Jens Axboe
2018-06-21 15:45                         ` Christoph Hellwig
2018-06-21 15:49                           ` Jens Axboe
2018-06-21 16:27                             ` Keith Busch
2018-06-21 17:20                               ` Jens Axboe
2018-06-21 17:54                                 ` Christoph Hellwig
2018-06-21 19:18                                   ` Jens Axboe
2018-06-20 20:51     ` Jens Axboe
2018-06-21  7:15       ` Christoph Hellwig
2018-06-21 14:37         ` Jens Axboe
2018-06-21 14:56           ` Christoph Hellwig
2018-06-21 15:01             ` Jens Axboe
2018-06-21  7:09     ` Christoph Hellwig
2018-06-21 14:39       ` Jens Axboe
2018-06-21 14:58         ` Christoph Hellwig
2018-06-21 15:02           ` Jens Axboe
2018-06-21 15:06             ` Christoph Hellwig
2018-06-21 15:11               ` Jens Axboe [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=b142d5e6-e7cd-fa41-776e-2acd18691bae@kernel.dk \
    --to=axboe@kernel.dk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox