All of lore.kernel.org
 help / color / mirror / Atom feed
From: axboe@kernel.dk (Jens Axboe)
Subject: [PATCH] nvme: use __GFP_NOWARN for iod allocation
Date: Thu, 21 Jun 2018 09:11:28 -0600	[thread overview]
Message-ID: <b142d5e6-e7cd-fa41-776e-2acd18691bae@kernel.dk> (raw)
In-Reply-To: <20180621150624.GA1718@infradead.org>

On 6/21/18 9:06 AM, Christoph Hellwig wrote:
> On Thu, Jun 21, 2018@09:02:18AM -0600, Jens Axboe wrote:
>> >From a quick look, the fabric code goes through the same nvme_queue_rq()
>> and ends up doing the same iod->sg allocation. I don't see any use of
>> chained sg lists, we're allocating one big sg table for the command.
> 
> nvme_queue_rq is only used for PCIe.
> 
> E.g. for RDMA the call is:
> 
> nvme_rdma_queue_rq
>  -> nvme_rdma_map_data
>   -> sg_alloc_table_chained
> 
> similar for FC and loop.

Gotcha. So we can safely ignore limiting segments and max size for
fabrics for now.

How about the below?


diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 21710a7460c8..f1af08171d38 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1799,6 +1799,16 @@ int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);
 
+/*
+ * For non-fabrics, limit the max IO size and segments so that we know our
+ * iod->sg allocation can fall within a single page. Fabrics is more sensitive
+ * to command size, and it also does sg chaining to avoid large allocations.
+ */
+static bool nvme_ctrl_limit_io_size(struct nvme_ctrl *ctrl)
+{
+	return (ctrl->ops->flags & NVME_F_FABRICS) == 0;
+}
+
 static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
 		struct request_queue *q)
 {
@@ -1808,8 +1818,11 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
 		u32 max_segments =
 			(ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;
 
+		if (nvme_ctrl_limit_io_size(ctrl))
+			max_segments = min_not_zero(max_segments,
+							(u32)NVME_MAX_SEGS);
 		blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
-		blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
+		blk_queue_max_segments(q, max_segments);
 	}
 	if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
 	    is_power_of_2(ctrl->max_hw_sectors))
@@ -2377,8 +2390,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 	ctrl->cntlid = le16_to_cpup(&id->cntlid);
 	if (id->mdts)
 		max_hw_sectors = 1 << (id->mdts + page_shift - 9);
-	else
+	else if (!nvme_ctrl_limit_io_size(ctrl))
 		max_hw_sectors = UINT_MAX;
+	else
+		max_hw_sectors = NVME_MAX_KB_SZ << 1;
 	ctrl->max_hw_sectors =
 		min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 231807cbc849..efce853a6638 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -33,6 +33,13 @@ extern unsigned int admin_timeout;
 #define NVME_DEFAULT_KATO	5
 #define NVME_KATO_GRACE		10
 
+/*
+ * These can be higher, but we need to ensure that any command doesn't
+ * require an sg allocation that needs more than a page of data.
+ */
+#define NVME_MAX_KB_SZ	4096
+#define NVME_MAX_SEGS	127
+
 extern struct workqueue_struct *nvme_wq;
 extern struct workqueue_struct *nvme_reset_wq;
 extern struct workqueue_struct *nvme_delete_wq;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index fc33804662e7..663446d55a49 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -100,6 +100,8 @@ struct nvme_dev {
 	struct nvme_ctrl ctrl;
 	struct completion ioq_wait;
 
+	mempool_t *iod_mempool;
+
 	/* shadow doorbell buffer support: */
 	u32 *dbbuf_dbs;
 	dma_addr_t dbbuf_dbs_dma_addr;
@@ -477,10 +479,7 @@ static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
 	iod->use_sgl = nvme_pci_use_sgls(dev, rq);
 
 	if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
-		size_t alloc_size = nvme_pci_iod_alloc_size(dev, size, nseg,
-				iod->use_sgl);
-
-		iod->sg = kmalloc(alloc_size, GFP_ATOMIC);
+		iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
 		if (!iod->sg)
 			return BLK_STS_RESOURCE;
 	} else {
@@ -526,7 +525,7 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
 	}
 
 	if (iod->sg != iod->inline_sg)
-		kfree(iod->sg);
+		mempool_free(iod->sg, dev->iod_mempool);
 }
 
 #ifdef CONFIG_BLK_DEV_INTEGRITY
@@ -2280,6 +2279,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
 		blk_put_queue(dev->ctrl.admin_q);
 	kfree(dev->queues);
 	free_opal_dev(dev->ctrl.opal_dev);
+	mempool_destroy(dev->iod_mempool);
 	kfree(dev);
 }
 
@@ -2509,6 +2509,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	int node, result = -ENOMEM;
 	struct nvme_dev *dev;
 	unsigned long quirks = id->driver_data;
+	size_t alloc_size;
 
 	node = dev_to_node(&pdev->dev);
 	if (node == NUMA_NO_NODE)
@@ -2546,6 +2547,23 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (result)
 		goto release_pools;
 
+	/*
+	 * Double check that our mempool alloc size will cover the biggest
+	 * command we support.
+	 */
+	alloc_size = nvme_pci_iod_alloc_size(dev, NVME_MAX_KB_SZ,
+						NVME_MAX_SEGS, true);
+	WARN_ON_ONCE(alloc_size > PAGE_SIZE);
+
+	dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
+						mempool_kfree,
+						(void *) alloc_size,
+						GFP_KERNEL, node);
+	if (!dev->iod_mempool) {
+		result = -ENOMEM;
+		goto release_pools;
+	}
+
 	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
 
 	nvme_get_ctrl(&dev->ctrl);

-- 
Jens Axboe

      reply	other threads:[~2018-06-21 15:11 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-06-20 19:10 [PATCH] nvme: use __GFP_NOWARN for iod allocation Jens Axboe
2018-06-20 19:16 ` Jens Axboe
2018-06-20 20:09   ` Jens Axboe
2018-06-20 20:43     ` Keith Busch
2018-06-20 20:54       ` Jens Axboe
2018-06-20 22:30         ` Keith Busch
2018-06-20 22:30           ` Jens Axboe
2018-06-21  7:10             ` Christoph Hellwig
2018-06-21 14:47               ` Jens Axboe
2018-06-21 15:01                 ` Christoph Hellwig
2018-06-21 15:04                   ` Jens Axboe
2018-06-21 15:06                     ` Jens Axboe
2018-06-21 15:24                       ` Christoph Hellwig
2018-06-21 15:26                         ` Jens Axboe
2018-06-21 15:24                     ` Christoph Hellwig
2018-06-21 15:34                       ` Jens Axboe
2018-06-21 15:45                         ` Christoph Hellwig
2018-06-21 15:49                           ` Jens Axboe
2018-06-21 16:27                             ` Keith Busch
2018-06-21 17:20                               ` Jens Axboe
2018-06-21 17:54                                 ` Christoph Hellwig
2018-06-21 19:18                                   ` Jens Axboe
2018-06-20 20:51     ` Jens Axboe
2018-06-21  7:15       ` Christoph Hellwig
2018-06-21 14:37         ` Jens Axboe
2018-06-21 14:56           ` Christoph Hellwig
2018-06-21 15:01             ` Jens Axboe
2018-06-21  7:09     ` Christoph Hellwig
2018-06-21 14:39       ` Jens Axboe
2018-06-21 14:58         ` Christoph Hellwig
2018-06-21 15:02           ` Jens Axboe
2018-06-21 15:06             ` Christoph Hellwig
2018-06-21 15:11               ` Jens Axboe [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=b142d5e6-e7cd-fa41-776e-2acd18691bae@kernel.dk \
    --to=axboe@kernel.dk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.