From: Keith Busch <kbusch@fb.com>
To: <linux-nvme@lists.infradead.org>, <linux-block@vger.kernel.org>,
<io-uring@vger.kernel.org>, <linux-fsdevel@vger.kernel.org>
Cc: <axboe@kernel.dk>, <hch@lst.de>,
Alexander Viro <viro@zeniv.linux.org.uk>,
Kernel Team <Kernel-team@fb.com>, Keith Busch <kbusch@kernel.org>
Subject: [PATCHv3 7/7] nvme-pci: implement dma_map support
Date: Fri, 5 Aug 2022 09:24:44 -0700 [thread overview]
Message-ID: <20220805162444.3985535-8-kbusch@fb.com> (raw)
In-Reply-To: <20220805162444.3985535-1-kbusch@fb.com>
From: Keith Busch <kbusch@kernel.org>
Implement callbacks to convert a registered bio_vec to a prp list, and
use this for each IO that uses the returned tag. This saves repeated IO
conversions and dma mapping/unmapping. In many cases, the driver can
skip per-IO pool allocations entirely, potentially saving a significant
number of CPU cycles.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
drivers/nvme/host/pci.c | 314 ++++++++++++++++++++++++++++++++++++++--
1 file changed, 303 insertions(+), 11 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 71a4f26ba476..d42b00c6e041 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -104,12 +104,23 @@ static bool noacpi;
module_param(noacpi, bool, 0444);
MODULE_PARM_DESC(noacpi, "disable acpi bios quirks");
+/*
+ * Index of the last __le64 slot in a PRP list of NVME_CTRL_PAGE_SIZE bytes.
+ * nvme_premapped_slow() stores the address of the next chained list there.
+ */
+static const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
+
struct nvme_dev;
struct nvme_queue;
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
+/*
+ * Driver-private state for a pre-registered DMA mapping; this is what
+ * nvme_pci_dma_map() returns and nvme_pci_dma_unmap() tears down.
+ */
+struct nvme_dma_mapping {
+ /* number of entries allocated in prps[] (nr_vecs * nvme pages per page) */
+ int nr_pages;
+ /* bv_offset of the first registered bio_vec */
+ u16 offset;
+ /* set when dma_need_sync() reported true for any mapped segment */
+ bool needs_sync;
+ u8 rsvd;
+ /* DMA address of the prps[] array itself */
+ dma_addr_t prp_dma_addr;
+ /* dma_alloc_coherent() array of little-endian page DMA addresses */
+ __le64 *prps;
+};
+
/*
* Represents an NVM Express device. Each nvme_dev is a PCI function.
*/
@@ -544,9 +555,30 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
return true;
}
+/*
+ * Sync the data pages of a completed request for CPU access when the request
+ * used a pre-registered mapping.
+ *
+ * The request's data starts blk_rq_dma_offset(req) + mapping->offset bytes
+ * into the registered buffer. The first (possibly partial) controller page is
+ * synced on its own, followed by every remaining controller page the payload
+ * touches.
+ */
+static void nvme_sync_dma(struct nvme_dev *dev, struct request *req,
+		struct nvme_dma_mapping *mapping)
+{
+	int offset, i, j, length, nprps;
+
+	offset = blk_rq_dma_offset(req) + mapping->offset;
+	i = offset >> NVME_CTRL_PAGE_SHIFT;
+
+	offset = offset & (NVME_CTRL_PAGE_SIZE - 1);
+	/* bytes remaining after the first controller page; may be <= 0 */
+	length = blk_rq_payload_bytes(req) - (NVME_CTRL_PAGE_SIZE - offset);
+	nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
+
+	/* the data begins @offset bytes into the first page, not at its base */
+	dma_sync_single_for_cpu(dev->dev,
+			le64_to_cpu(mapping->prps[i++]) + offset,
+			NVME_CTRL_PAGE_SIZE - offset, DMA_FROM_DEVICE);
+
+	/*
+	 * nprps counts only the pages after the first one, so iterate from 0;
+	 * starting at 1 would leave the final page unsynced.
+	 */
+	for (j = 0; j < nprps; j++) {
+		dma_sync_single_for_cpu(dev->dev,
+				le64_to_cpu(mapping->prps[i++]),
+				NVME_CTRL_PAGE_SIZE, DMA_FROM_DEVICE);
+	}
+}
+
static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
{
- const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
dma_addr_t dma_addr = iod->first_dma;
int i;
@@ -576,10 +608,24 @@ static void nvme_free_sgls(struct nvme_dev *dev, struct request *req)
}
}
+/*
+ * Release the PRP/SGL descriptor memory attached to a request and return
+ * iod->sg to the iod mempool.
+ */
+static void nvme_free_prp_chain(struct nvme_dev *dev, struct request *req,
+		struct nvme_iod *iod)
+{
+	if (iod->npages != 0) {
+		if (iod->use_sgl)
+			nvme_free_sgls(dev, req);
+		else
+			nvme_free_prps(dev, req);
+	} else {
+		/* a single descriptor taken from the small pool */
+		dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
+			      iod->first_dma);
+	}
+
+	mempool_free(iod->sg, dev->iod_mempool);
+}
+
static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ WARN_ON_ONCE(!iod->nents);
if (is_pci_p2pdma_page(sg_page(iod->sg)))
pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents,
rq_dma_dir(req));
@@ -589,25 +635,25 @@ static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req)
static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
{
+ struct nvme_dma_mapping *mapping = blk_rq_dma_tag(req);
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ if (mapping) {
+ if (mapping->needs_sync && rq_data_dir(req) == READ)
+ nvme_sync_dma(dev, req, mapping);
+ if (iod->npages >= 0)
+ nvme_free_prp_chain(dev, req, iod);
+ return;
+ }
+
if (iod->dma_len) {
dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len,
rq_dma_dir(req));
return;
}
- WARN_ON_ONCE(!iod->nents);
-
nvme_unmap_sg(dev, req);
- if (iod->npages == 0)
- dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
- iod->first_dma);
- else if (iod->use_sgl)
- nvme_free_sgls(dev, req);
- else
- nvme_free_prps(dev, req);
- mempool_free(iod->sg, dev->iod_mempool);
+ nvme_free_prp_chain(dev, req, iod);
}
static void nvme_print_sgl(struct scatterlist *sgl, int nents)
@@ -835,13 +881,145 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
return BLK_STS_OK;
}
+/*
+ * Build a private PRP list chain for a premapped request.
+ *
+ * Copies @nprps entries from mapping->prps[], starting at the entry after the
+ * one that describes the request's first controller page (that page goes in
+ * PRP1), into lists allocated from the device's dma pools. When more than one
+ * list is needed, each list's final slot holds the DMA address of the next.
+ * On success iod->first_dma is the DMA address of the first list.
+ *
+ * Returns BLK_STS_OK, or BLK_STS_RESOURCE if an allocation fails.
+ */
+static blk_status_t nvme_premapped_slow(struct nvme_dev *dev,
+		struct request *req, struct nvme_iod *iod,
+		struct nvme_dma_mapping *mapping, int nprps)
+{
+	struct dma_pool *pool;
+	dma_addr_t prp_dma;
+	__le64 *prp_list;
+	void **list;
+	int i;
+
+	/*
+	 * Index of the first entry to copy. The controller page holding the
+	 * start of the data is carried in PRP1, so the list begins one entry
+	 * past it. (Previously this index was read without being initialised.)
+	 */
+	i = ((blk_rq_dma_offset(req) + mapping->offset) >>
+			NVME_CTRL_PAGE_SHIFT) + 1;
+
+	/* nvme_pci_iod_list() needs iod->sg, so it is allocated even though no sg list is built here */
+	iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
+	if (!iod->sg)
+		return BLK_STS_RESOURCE;
+
+	if (nprps <= (256 / 8)) {
+		pool = dev->prp_small_pool;
+		iod->npages = 0;
+	} else {
+		pool = dev->prp_page_pool;
+		iod->npages = 1;
+	}
+
+	prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
+	if (!prp_list) {
+		iod->npages = -1;
+		goto out_free_sg;
+	}
+
+	list = nvme_pci_iod_list(req);
+	list[0] = prp_list;
+	iod->first_dma = prp_dma;
+
+	for (;;) {
+		dma_addr_t next_prp_dma;
+		__le64 *next_prp_list;
+
+		/* the remaining entries fit in the current list */
+		if (nprps <= last_prp + 1) {
+			memcpy(prp_list, &mapping->prps[i], nprps * 8);
+			break;
+		}
+
+		/* fill all but the last slot, which is reserved for the chain pointer */
+		memcpy(prp_list, &mapping->prps[i], NVME_CTRL_PAGE_SIZE - 8);
+		nprps -= last_prp;
+		i += last_prp;
+
+		next_prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &next_prp_dma);
+		if (!next_prp_list)
+			goto free_prps;
+
+		prp_list[last_prp] = cpu_to_le64(next_prp_dma);
+		prp_list = next_prp_list;
+		prp_dma = next_prp_dma;
+		list[iod->npages++] = prp_list;
+	}
+	return BLK_STS_OK;
+
+free_prps:
+	nvme_free_prps(dev, req);
+out_free_sg:
+	mempool_free(iod->sg, dev->iod_mempool);
+	return BLK_STS_RESOURCE;
+}
+
+/*
+ * Fill in the data pointer of a read/write command from a pre-registered
+ * mapping instead of mapping the request's pages.
+ *
+ * PRP1 points at the request's first byte. If the payload continues into
+ * exactly one more controller page, PRP2 is that page. Otherwise PRP2 points
+ * at a PRP list: directly into mapping->prps[] when the needed entries do not
+ * cross a controller page boundary, else at a copy built by
+ * nvme_premapped_slow().
+ *
+ * For writes on mappings that need it, the touched pages are synced for the
+ * device.
+ *
+ * Returns BLK_STS_OK, or the error from nvme_premapped_slow().
+ */
+static blk_status_t nvme_premapped(struct nvme_dev *dev, struct request *req,
+		struct nvme_dma_mapping *mapping,
+		struct nvme_rw_command *cmnd,
+		struct nvme_iod *iod)
+{
+	bool needs_sync = mapping->needs_sync && rq_data_dir(req) == WRITE;
+	dma_addr_t prp_list_start, prp_list_end;
+	int i, offset, j, length, nprps;
+	blk_status_t ret;
+
+	offset = blk_rq_dma_offset(req) + mapping->offset;
+	i = offset >> NVME_CTRL_PAGE_SHIFT;
+	/*
+	 * Reduce to the offset within the first controller page before it is
+	 * used as a length; the unmasked value can exceed the page size.
+	 */
+	offset = offset & (NVME_CTRL_PAGE_SIZE - 1);
+
+	if (needs_sync)
+		dma_sync_single_for_device(dev->dev,
+				le64_to_cpu(mapping->prps[i]) + offset,
+				NVME_CTRL_PAGE_SIZE - offset, DMA_TO_DEVICE);
+
+	cmnd->dptr.prp1 = cpu_to_le64(le64_to_cpu(mapping->prps[i++]) + offset);
+
+	/* bytes remaining after the first controller page */
+	length = blk_rq_payload_bytes(req) - (NVME_CTRL_PAGE_SIZE - offset);
+	if (length <= 0)
+		return BLK_STS_OK;
+
+	if (length <= NVME_CTRL_PAGE_SIZE) {
+		if (needs_sync)
+			dma_sync_single_for_device(dev->dev,
+					le64_to_cpu(mapping->prps[i]),
+					NVME_CTRL_PAGE_SIZE, DMA_TO_DEVICE);
+		cmnd->dptr.prp2 = mapping->prps[i];
+		return BLK_STS_OK;
+	}
+
+	nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
+	prp_list_start = mapping->prp_dma_addr + 8 * i;
+	prp_list_end = prp_list_start + 8 * nprps;
+
+	/* Optimization when remaining list fits in one nvme page */
+	if ((prp_list_start >> NVME_CTRL_PAGE_SHIFT) ==
+	    (prp_list_end >> NVME_CTRL_PAGE_SHIFT)) {
+		cmnd->dptr.prp2 = cpu_to_le64(prp_list_start);
+		goto sync;
+	}
+
+	ret = nvme_premapped_slow(dev, req, iod, mapping, nprps);
+	if (ret != BLK_STS_OK)
+		return ret;
+
+	cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
+sync:
+	if (!needs_sync)
+		return BLK_STS_OK;
+
+	/*
+	 * i still indexes the first entry after PRP1 here. It must not be
+	 * recomputed from the masked offset, which would always yield 0.
+	 */
+	for (j = 0; j < nprps; j++)
+		dma_sync_single_for_device(dev->dev,
+				le64_to_cpu(mapping->prps[i++]),
+				NVME_CTRL_PAGE_SIZE, DMA_TO_DEVICE);
+	return BLK_STS_OK;
+}
+
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
struct nvme_command *cmnd)
{
+ struct nvme_dma_mapping *mapping = blk_rq_dma_tag(req);
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
blk_status_t ret = BLK_STS_RESOURCE;
int nr_mapped;
+ if (mapping) {
+ iod->dma_len = 0;
+ iod->use_sgl = false;
+ return nvme_premapped(dev, req, mapping, &cmnd->rw, iod);
+ }
+
if (blk_rq_nr_phys_segments(req) == 1) {
struct bio_vec bv = req_bvec(req);
@@ -1732,6 +1910,116 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
return result;
}
+#ifdef CONFIG_HAS_DMA
+/*
+ * Pre-map a registered buffer for DMA and record one PRP entry per
+ * controller page.
+ *
+ * Important: bvec must be describing a virtually contiguous buffer. Only the
+ * first bio_vec may have a non-zero offset, and every bio_vec but the last
+ * must end on a PAGE_SIZE boundary; anything else is rejected with -EINVAL.
+ *
+ * On success a device reference is taken and the new mapping is returned; it
+ * is released by nvme_pci_dma_unmap(). On failure an ERR_PTR() is returned:
+ * -EINVAL for a bad layout or nr_vecs == 0, -ENOMEM for allocation failure,
+ * -EIO if a page could not be mapped.
+ */
+static void *nvme_pci_dma_map(struct request_queue *q,
+		struct bio_vec *bvec, int nr_vecs)
+{
+	/*
+	 * Controller pages per CPU page. This must be derived from the page
+	 * shifts: shifting by the difference of the page sizes is only right
+	 * when both sizes happen to be equal.
+	 */
+	const int nvme_pages = 1 << (PAGE_SHIFT - NVME_CTRL_PAGE_SHIFT);
+	struct nvme_ns *ns = q->queuedata;
+	struct nvme_dev *dev = to_nvme_dev(ns->ctrl);
+	struct nvme_dma_mapping *mapping;
+	int i, j, k, size, ppv, ret = -ENOMEM;
+
+	if (!nr_vecs)
+		return ERR_PTR(-EINVAL);
+
+	mapping = kzalloc(sizeof(*mapping), GFP_KERNEL);
+	if (!mapping)
+		return ERR_PTR(-ENOMEM);
+
+	mapping->nr_pages = nr_vecs * nvme_pages;
+	size = sizeof(*mapping->prps) * mapping->nr_pages;
+	mapping->prps = dma_alloc_coherent(dev->dev, size,
+			&mapping->prp_dma_addr, GFP_KERNEL);
+	if (!mapping->prps)
+		goto free_mapping;
+
+	mapping->needs_sync = false;
+	for (i = 0, k = 0; i < nr_vecs; i++) {
+		struct bio_vec *bv = bvec + i;
+		dma_addr_t dma_addr;
+
+		ppv = nvme_pages;
+		if (i == 0) {
+			/*
+			 * NOTE(review): when the first offset spans whole
+			 * controller pages, fewer than nr_pages entries get
+			 * filled - confirm the I/O path indexing and the
+			 * teardown loops account for that.
+			 */
+			mapping->offset = bv->bv_offset;
+			ppv -= mapping->offset >> NVME_CTRL_PAGE_SHIFT;
+		} else if (bv->bv_offset) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		if (bv->bv_offset + bv->bv_len != PAGE_SIZE &&
+		    i < nr_vecs - 1) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		dma_addr = dma_map_bvec(dev->dev, bv, DMA_BIDIRECTIONAL, 0);
+		if (dma_mapping_error(dev->dev, dma_addr)) {
+			ret = -EIO;
+			goto err;
+		}
+
+		/* store page-aligned addresses; the offset is re-applied per IO */
+		if (i == 0)
+			dma_addr -= mapping->offset;
+
+		if (dma_need_sync(dev->dev, dma_addr))
+			mapping->needs_sync = true;
+
+		for (j = 0; j < ppv; j++)
+			mapping->prps[k++] = cpu_to_le64(dma_addr +
+						j * NVME_CTRL_PAGE_SIZE);
+	}
+
+	get_device(dev->dev);
+	return mapping;
+
+err:
+	/* unmap the pages mapped so far; k is the number of entries filled */
+	for (i = 0; i < k; i += ppv) {
+		__u64 dma_addr = le64_to_cpu(mapping->prps[i]);
+		ppv = nvme_pages;
+
+		if (i == 0)
+			ppv -= mapping->offset >> NVME_CTRL_PAGE_SHIFT;
+		dma_unmap_page(dev->dev, dma_addr,
+			       PAGE_SIZE - offset_in_page(dma_addr),
+			       DMA_BIDIRECTIONAL);
+	}
+
+	dma_free_coherent(dev->dev, size, (void *)mapping->prps,
+			  mapping->prp_dma_addr);
+free_mapping:
+	kfree(mapping);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Tear down a mapping created by nvme_pci_dma_map(): unmap each CPU page,
+ * free the PRP array and the mapping itself, then drop the device reference
+ * taken at map time.
+ */
+static void nvme_pci_dma_unmap(struct request_queue *q, void *dma_tag)
+{
+	/*
+	 * Controller pages per CPU page. This must be derived from the page
+	 * shifts: shifting by the difference of the page sizes is only right
+	 * when both sizes happen to be equal.
+	 */
+	const int nvme_pages = 1 << (PAGE_SHIFT - NVME_CTRL_PAGE_SHIFT);
+	struct nvme_ns *ns = q->queuedata;
+	struct nvme_dev *dev = to_nvme_dev(ns->ctrl);
+	struct nvme_dma_mapping *mapping = dma_tag;
+	int i, ppv;
+
+	/* step through prps[] one CPU page (ppv entries) at a time */
+	for (i = 0; i < mapping->nr_pages; i += ppv) {
+		__u64 dma_addr = le64_to_cpu(mapping->prps[i]);
+		ppv = nvme_pages;
+
+		if (i == 0)
+			ppv -= mapping->offset >> NVME_CTRL_PAGE_SHIFT;
+		/*
+		 * NOTE(review): the address and length passed here are the
+		 * page-aligned values, not necessarily those given to
+		 * dma_map_bvec() - confirm this is acceptable to the DMA API.
+		 */
+		dma_unmap_page(dev->dev, dma_addr,
+			       PAGE_SIZE - offset_in_page(dma_addr),
+			       DMA_BIDIRECTIONAL);
+	}
+
+	dma_free_coherent(dev->dev, mapping->nr_pages * sizeof(*mapping->prps),
+			  (void *)mapping->prps, mapping->prp_dma_addr);
+	kfree(mapping);
+	put_device(dev->dev);
+}
+#endif
+
static const struct blk_mq_ops nvme_mq_admin_ops = {
.queue_rq = nvme_queue_rq,
.complete = nvme_pci_complete_rq,
@@ -1750,6 +2038,10 @@ static const struct blk_mq_ops nvme_mq_ops = {
.map_queues = nvme_pci_map_queues,
.timeout = nvme_timeout,
.poll = nvme_poll,
+#ifdef CONFIG_HAS_DMA
+ .dma_map = nvme_pci_dma_map,
+ .dma_unmap = nvme_pci_dma_unmap,
+#endif
};
static void nvme_dev_remove_admin(struct nvme_dev *dev)
--
2.30.2
next prev parent reply other threads:[~2022-08-05 16:26 UTC|newest]
Thread overview: 23+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-08-05 16:24 [PATCHv3 0/7] dma mapping optimisations Keith Busch
2022-08-05 16:24 ` [PATCHv3 1/7] blk-mq: add ops to dma map bvec Keith Busch
2022-08-05 16:24 ` [PATCHv3 2/7] file: " Keith Busch
2022-08-08 0:21 ` Dave Chinner
2022-08-08 1:13 ` Matthew Wilcox
2022-08-08 2:15 ` Dave Chinner
2022-08-08 2:49 ` Matthew Wilcox
2022-08-08 7:31 ` Dave Chinner
2022-08-08 15:28 ` Keith Busch
2022-08-08 10:14 ` Pavel Begunkov
2022-08-05 16:24 ` [PATCHv3 3/7] iov_iter: introduce type for preregistered dma tags Keith Busch
2022-08-05 16:24 ` [PATCHv3 4/7] block: add dma tag bio type Keith Busch
2022-08-05 16:24 ` [PATCHv3 5/7] io_uring: introduce file slot release helper Keith Busch
2022-08-05 16:24 ` [PATCHv3 6/7] io_uring: add support for dma pre-mapping Keith Busch
2022-08-05 16:24 ` Keith Busch [this message]
2022-08-09 6:46 ` [PATCHv3 0/7] dma mapping optimisations Christoph Hellwig
2022-08-09 14:18 ` Keith Busch
2022-08-09 18:39 ` Christoph Hellwig
2022-08-09 16:46 ` Keith Busch
2022-08-09 18:41 ` Christoph Hellwig
2022-08-10 18:05 ` Keith Busch
2022-08-11 7:22 ` Christoph Hellwig
2022-08-31 21:19 ` Keith Busch
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20220805162444.3985535-8-kbusch@fb.com \
--to=kbusch@fb.com \
--cc=Kernel-team@fb.com \
--cc=axboe@kernel.dk \
--cc=hch@lst.de \
--cc=io-uring@vger.kernel.org \
--cc=kbusch@kernel.org \
--cc=linux-block@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-nvme@lists.infradead.org \
--cc=viro@zeniv.linux.org.uk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.