From: axboe@kernel.dk (Jens Axboe)
Subject: [PATCH] nvme: use __GFP_NOWARN for iod allocation
Date: Thu, 21 Jun 2018 09:11:28 -0600 [thread overview]
Message-ID: <b142d5e6-e7cd-fa41-776e-2acd18691bae@kernel.dk> (raw)
In-Reply-To: <20180621150624.GA1718@infradead.org>
On 6/21/18 9:06 AM, Christoph Hellwig wrote:
> On Thu, Jun 21, 2018@09:02:18AM -0600, Jens Axboe wrote:
>> >From a quick look, the fabric code goes through the same nvme_queue_rq()
>> and ends up doing the same iod->sg allocation. I don't see any use of
>> chained sg lists, we're allocating one big sg table for the command.
>
> nvme_queue_rq is only used for PCIe.
>
> E.g. for RDMA the call is:
>
> nvme_rdma_queue_rq
> -> nvme_rdma_map_data
> -> sg_alloc_table_chained
>
> similar for FC and loop.
Gotcha. So we can safely ignore limiting segments and max size for
fabrics for now.
How about the below?
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 21710a7460c8..f1af08171d38 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1799,6 +1799,16 @@ int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);
+/*
+ * For non-fabrics, limit the max IO size and segments so that we know our
+ * iod->sg allocation can fall within a single page. Fabrics is more sensitive
+ * to command size, and it also does sg chaining to avoid large allocations.
+ */
+static bool nvme_ctrl_limit_io_size(struct nvme_ctrl *ctrl)
+{
+ return (ctrl->ops->flags & NVME_F_FABRICS) == 0;
+}
+
static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
struct request_queue *q)
{
@@ -1808,8 +1818,11 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
u32 max_segments =
(ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;
+ if (nvme_ctrl_limit_io_size(ctrl))
+ max_segments = min_not_zero(max_segments,
+ (u32)NVME_MAX_SEGS);
blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
- blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
+ blk_queue_max_segments(q, max_segments);
}
if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
is_power_of_2(ctrl->max_hw_sectors))
@@ -2377,8 +2390,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
ctrl->cntlid = le16_to_cpup(&id->cntlid);
if (id->mdts)
max_hw_sectors = 1 << (id->mdts + page_shift - 9);
- else
+ else if (!nvme_ctrl_limit_io_size(ctrl))
max_hw_sectors = UINT_MAX;
+ else
+ max_hw_sectors = NVME_MAX_KB_SZ << 1;
ctrl->max_hw_sectors =
min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 231807cbc849..efce853a6638 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -33,6 +33,13 @@ extern unsigned int admin_timeout;
#define NVME_DEFAULT_KATO 5
#define NVME_KATO_GRACE 10
+/*
+ * These can be higher, but we need to ensure that any command doesn't
+ * require an sg allocation that needs more than a page of data.
+ */
+#define NVME_MAX_KB_SZ 4096
+#define NVME_MAX_SEGS 127
+
extern struct workqueue_struct *nvme_wq;
extern struct workqueue_struct *nvme_reset_wq;
extern struct workqueue_struct *nvme_delete_wq;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index fc33804662e7..663446d55a49 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -100,6 +100,8 @@ struct nvme_dev {
struct nvme_ctrl ctrl;
struct completion ioq_wait;
+ mempool_t *iod_mempool;
+
/* shadow doorbell buffer support: */
u32 *dbbuf_dbs;
dma_addr_t dbbuf_dbs_dma_addr;
@@ -477,10 +479,7 @@ static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
iod->use_sgl = nvme_pci_use_sgls(dev, rq);
if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
- size_t alloc_size = nvme_pci_iod_alloc_size(dev, size, nseg,
- iod->use_sgl);
-
- iod->sg = kmalloc(alloc_size, GFP_ATOMIC);
+ iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
if (!iod->sg)
return BLK_STS_RESOURCE;
} else {
@@ -526,7 +525,7 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
}
if (iod->sg != iod->inline_sg)
- kfree(iod->sg);
+ mempool_free(iod->sg, dev->iod_mempool);
}
#ifdef CONFIG_BLK_DEV_INTEGRITY
@@ -2280,6 +2279,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
blk_put_queue(dev->ctrl.admin_q);
kfree(dev->queues);
free_opal_dev(dev->ctrl.opal_dev);
+ mempool_destroy(dev->iod_mempool);
kfree(dev);
}
@@ -2509,6 +2509,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
int node, result = -ENOMEM;
struct nvme_dev *dev;
unsigned long quirks = id->driver_data;
+ size_t alloc_size;
node = dev_to_node(&pdev->dev);
if (node == NUMA_NO_NODE)
@@ -2546,6 +2547,23 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (result)
goto release_pools;
+ /*
+ * Double check that our mempool alloc size will cover the biggest
+ * command we support.
+ */
+ alloc_size = nvme_pci_iod_alloc_size(dev, NVME_MAX_KB_SZ,
+ NVME_MAX_SEGS, true);
+ WARN_ON_ONCE(alloc_size > PAGE_SIZE);
+
+ dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
+ mempool_kfree,
+ (void *) alloc_size,
+ GFP_KERNEL, node);
+ if (!dev->iod_mempool) {
+ result = -ENOMEM;
+ goto release_pools;
+ }
+
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
nvme_get_ctrl(&dev->ctrl);
--
Jens Axboe
prev parent reply other threads:[~2018-06-21 15:11 UTC|newest]
Thread overview: 33+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-06-20 19:10 [PATCH] nvme: use __GFP_NOWARN for iod allocation Jens Axboe
2018-06-20 19:16 ` Jens Axboe
2018-06-20 20:09 ` Jens Axboe
2018-06-20 20:43 ` Keith Busch
2018-06-20 20:54 ` Jens Axboe
2018-06-20 22:30 ` Keith Busch
2018-06-20 22:30 ` Jens Axboe
2018-06-21 7:10 ` Christoph Hellwig
2018-06-21 14:47 ` Jens Axboe
2018-06-21 15:01 ` Christoph Hellwig
2018-06-21 15:04 ` Jens Axboe
2018-06-21 15:06 ` Jens Axboe
2018-06-21 15:24 ` Christoph Hellwig
2018-06-21 15:26 ` Jens Axboe
2018-06-21 15:24 ` Christoph Hellwig
2018-06-21 15:34 ` Jens Axboe
2018-06-21 15:45 ` Christoph Hellwig
2018-06-21 15:49 ` Jens Axboe
2018-06-21 16:27 ` Keith Busch
2018-06-21 17:20 ` Jens Axboe
2018-06-21 17:54 ` Christoph Hellwig
2018-06-21 19:18 ` Jens Axboe
2018-06-20 20:51 ` Jens Axboe
2018-06-21 7:15 ` Christoph Hellwig
2018-06-21 14:37 ` Jens Axboe
2018-06-21 14:56 ` Christoph Hellwig
2018-06-21 15:01 ` Jens Axboe
2018-06-21 7:09 ` Christoph Hellwig
2018-06-21 14:39 ` Jens Axboe
2018-06-21 14:58 ` Christoph Hellwig
2018-06-21 15:02 ` Jens Axboe
2018-06-21 15:06 ` Christoph Hellwig
2018-06-21 15:11 ` Jens Axboe [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=b142d5e6-e7cd-fa41-776e-2acd18691bae@kernel.dk \
--to=axboe@kernel.dk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox