* Re: [PATCH] nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping
2025-07-07 12:52 [PATCH] nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping Christoph Hellwig
@ 2025-07-07 14:29 ` Keith Busch
2025-07-07 15:01 ` Christoph Hellwig
2025-07-07 15:13 ` Keith Busch
` (2 subsequent siblings)
3 siblings, 1 reply; 10+ messages in thread
From: Keith Busch @ 2025-07-07 14:29 UTC (permalink / raw)
To: Christoph Hellwig
Cc: axboe, sagi, ben.copeland, leon, linux-nvme, linux-block
On Mon, Jul 07, 2025 at 02:52:23PM +0200, Christoph Hellwig wrote:
> While the reconstruction is easy and works fine for the SGL path, where
> the on the wire representation maps 1:1 to DMA mappings, the code to
> reconstruct the DMA mapping ranges from PRPs can't always work, as a
> given PRP layout can come from different DMA mappings, and the current
> code doesn't even always get that right.
Given it works fine for SGL, how do you feel about unconditionally
creating an NVMe SGL, and then call some "nvme_sgl_to_prp()" helper only
when needed? This means we only have one teardown path that matches the
setup.
This is something I hacked up over the weekend. It's only lightly
tested, and I know there's a bug somewhere here with the chainging, but
it's a start.
---
drivers/nvme/host/pci.c | 342 +++++++++++++++++++++++++--------------------------------
1 file changed, 150 insertions(+), 192 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 10aa04879d6996..7ef56775e3be9 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -41,7 +41,7 @@
* Arbitrary upper bound.
*/
#define NVME_MAX_BYTES SZ_8M
-#define NVME_MAX_NR_DESCRIPTORS 5
+#define NVME_MAX_NR_DESCRIPTORS 6
/*
* For data SGLs we support a single descriptors worth of SGL entries.
@@ -70,7 +70,7 @@
(NVME_MAX_BYTES + 2 * (NVME_CTRL_PAGE_SIZE - 1))
static_assert(MAX_PRP_RANGE / NVME_CTRL_PAGE_SIZE <=
- (1 /* prp1 */ + NVME_MAX_NR_DESCRIPTORS * PRPS_PER_PAGE));
+ (1 /* prp1 */ + (NVME_MAX_NR_DESCRIPTORS - 1) * PRPS_PER_PAGE));
static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0444);
@@ -273,6 +273,7 @@ struct nvme_iod {
unsigned int total_len;
struct dma_iova_state dma_state;
void *descriptors[NVME_MAX_NR_DESCRIPTORS];
+ struct nvme_sgl_desc sgl;
dma_addr_t meta_dma;
struct nvme_sgl_desc *meta_descriptor;
@@ -644,99 +645,61 @@ static inline dma_addr_t nvme_pci_first_desc_dma_addr(struct nvme_command *cmd)
return le64_to_cpu(cmd->common.dptr.prp2);
}
-static void nvme_free_descriptors(struct request *req)
+static void nvme_free_descriptors(struct request *req, struct nvme_iod *iod)
{
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- dma_addr_t dma_addr = nvme_pci_first_desc_dma_addr(&iod->cmd);
- int i;
+ struct dma_pool *pool;
+ dma_addr_t dma_addr;
+ int i = 0;
- if (iod->nr_descriptors == 1) {
- dma_pool_free(nvme_dma_pool(nvmeq, iod), iod->descriptors[0],
- dma_addr);
- return;
+ if (iod->sgl.type == NVME_SGL_FMT_LAST_SEG_DESC << 4) {
+ struct nvme_sgl_desc *sg_list = iod->descriptors[0];
+ unsigned int dma_len = le32_to_cpu(iod->sgl.length);
+ unsigned int nr_entries = dma_len / sizeof(*sg_list);
+
+ if (nr_entries <= NVME_SMALL_POOL_SIZE / sizeof(__le64))
+ pool = nvmeq->descriptor_pools.small;
+ else
+ pool = nvmeq->descriptor_pools.large;
+
+ dma_addr = le64_to_cpu(iod->sgl.addr);
+ dma_pool_free(pool, iod->descriptors[i++], dma_addr);
}
- for (i = 0; i < iod->nr_descriptors; i++) {
+ pool = nvme_dma_pool(nvmeq, iod);
+ for (; i < iod->nr_descriptors; i++) {
__le64 *prp_list = iod->descriptors[i];
dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);
- dma_pool_free(nvmeq->descriptor_pools.large, prp_list,
- dma_addr);
+ dma_pool_free(pool, prp_list, dma_addr);
dma_addr = next_dma_addr;
}
}
-static void nvme_free_prps(struct request *req)
+static void nvme_free_sgls(struct request *req, struct nvme_iod *iod)
{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
- struct device *dma_dev = nvmeq->dev->dev;
+ unsigned int dma_len = le32_to_cpu(iod->sgl.length);
enum dma_data_direction dir = rq_dma_dir(req);
- int length = iod->total_len;
- dma_addr_t dma_addr;
- int i, desc;
- __le64 *prp_list;
- u32 dma_len;
-
- dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp1);
- dma_len = min_t(u32, length,
- NVME_CTRL_PAGE_SIZE - (dma_addr & (NVME_CTRL_PAGE_SIZE - 1)));
- length -= dma_len;
- if (!length) {
- dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
- return;
- }
-
- if (length <= NVME_CTRL_PAGE_SIZE) {
- dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
- dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp2);
- dma_unmap_page(dma_dev, dma_addr, length, dir);
- return;
- }
-
- i = 0;
- desc = 0;
- prp_list = iod->descriptors[desc];
- do {
- dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
- if (i == NVME_CTRL_PAGE_SIZE >> 3) {
- prp_list = iod->descriptors[++desc];
- i = 0;
- }
-
- dma_addr = le64_to_cpu(prp_list[i++]);
- dma_len = min(length, NVME_CTRL_PAGE_SIZE);
- length -= dma_len;
- } while (length);
-}
-
-static void nvme_free_sgls(struct request *req)
-{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct device *dma_dev = nvmeq->dev->dev;
- dma_addr_t sqe_dma_addr = le64_to_cpu(iod->cmd.common.dptr.sgl.addr);
- unsigned int sqe_dma_len = le32_to_cpu(iod->cmd.common.dptr.sgl.length);
- struct nvme_sgl_desc *sg_list = iod->descriptors[0];
- enum dma_data_direction dir = rq_dma_dir(req);
- if (iod->nr_descriptors) {
- unsigned int nr_entries = sqe_dma_len / sizeof(*sg_list), i;
+ if (iod->sgl.type == NVME_SGL_FMT_LAST_SEG_DESC << 4) {
+ struct nvme_sgl_desc *sg_list = iod->descriptors[0];
+ unsigned int i, nr_entries = dma_len / sizeof(*sg_list);
for (i = 0; i < nr_entries; i++)
dma_unmap_page(dma_dev, le64_to_cpu(sg_list[i].addr),
le32_to_cpu(sg_list[i].length), dir);
} else {
- dma_unmap_page(dma_dev, sqe_dma_addr, sqe_dma_len, dir);
+ dma_unmap_page(dma_dev, le64_to_cpu(iod->sgl.addr), dma_len, dir);
}
}
static void nvme_unmap_data(struct request *req)
{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct device *dma_dev = nvmeq->dev->dev;
if (iod->flags & IOD_SINGLE_SEGMENT) {
@@ -747,137 +710,116 @@ static void nvme_unmap_data(struct request *req)
return;
}
- if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) {
- if (nvme_pci_cmd_use_sgl(&iod->cmd))
- nvme_free_sgls(req);
- else
- nvme_free_prps(req);
+ if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len))
+ nvme_free_sgls(req, iod);
+ if (iod->nr_descriptors)
+ nvme_free_descriptors(req, iod);
+}
+
+static blk_status_t nvme_pci_setup_prp_list(dma_addr_t dma_addr,
+ unsigned int dma_len, unsigned int *index,
+ __le64 **pprp_list, struct nvme_iod *iod,
+ struct nvme_queue *nvmeq)
+{
+ __le64 *prp_list = *pprp_list;
+ int i = *index;
+
+ for (;;) {
+ unsigned int prp_len = min(dma_len, NVME_CTRL_PAGE_SIZE);
+
+ if (i == NVME_CTRL_PAGE_SIZE >> 3) {
+ __le64 *old_prp_list = prp_list;
+ dma_addr_t prp_list_dma;
+
+ BUG_ON(iod->nr_descriptors >= NVME_MAX_NR_DESCRIPTORS);
+ prp_list = dma_pool_alloc(nvmeq->descriptor_pools.large,
+ GFP_ATOMIC, &prp_list_dma);
+ if (!prp_list)
+ return BLK_STS_RESOURCE;
+
+ iod->descriptors[iod->nr_descriptors++] = prp_list;
+ prp_list[0] = old_prp_list[i - 1];
+ old_prp_list[i - 1] = cpu_to_le64(prp_list_dma);
+ i = 1;
+ }
+
+ prp_list[i++] = cpu_to_le64(dma_addr);
+ dma_len -= prp_len;
+ if (dma_len == 0)
+ break;
+ dma_addr += prp_len;
}
- if (iod->nr_descriptors)
- nvme_free_descriptors(req);
+ *index = i;
+ *pprp_list = prp_list;
+ return BLK_STS_OK;
}
-static blk_status_t nvme_pci_setup_data_prp(struct request *req,
- struct blk_dma_iter *iter)
+static blk_status_t nvme_pci_sgl_to_prp(struct request *req,
+ struct nvme_iod *iod)
{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
- unsigned int length = blk_rq_payload_bytes(req);
+ unsigned int dma_len, prp_len, entries, j, i = 0;
dma_addr_t prp1_dma, prp2_dma = 0;
- unsigned int prp_len, i;
+ struct nvme_sgl_desc *sg_list;
+ struct dma_pool *pool;
+ dma_addr_t dma_addr;
__le64 *prp_list;
+ blk_status_t ret;
- /*
- * PRP1 always points to the start of the DMA transfers.
- *
- * This is the only PRP (except for the list entries) that could be
- * non-aligned.
- */
- prp1_dma = iter->addr;
- prp_len = min(length, NVME_CTRL_PAGE_SIZE -
- (iter->addr & (NVME_CTRL_PAGE_SIZE - 1)));
- iod->total_len += prp_len;
- iter->addr += prp_len;
- iter->len -= prp_len;
- length -= prp_len;
- if (!length)
- goto done;
-
- if (!iter->len) {
- if (!blk_rq_dma_map_iter_next(req, nvmeq->dev->dev,
- &iod->dma_state, iter)) {
- if (WARN_ON_ONCE(!iter->status))
- goto bad_sgl;
- goto done;
- }
+ if (iod->sgl.type == NVME_SGL_FMT_LAST_SEG_DESC << 4) {
+ entries = iod->sgl.length / sizeof(*sg_list);
+ sg_list = iod->descriptors[0];
+ } else {
+ entries = 1;
+ sg_list = &iod->sgl;
+ }
+
+ dma_addr = le64_to_cpu(sg_list[i].addr);
+ dma_len = le32_to_cpu(sg_list[i].length);
+ prp1_dma = dma_addr;
+ prp_len = min(dma_len, NVME_CTRL_PAGE_SIZE -
+ (dma_addr & (NVME_CTRL_PAGE_SIZE - 1)));
+ dma_len -= prp_len;
+ if (!dma_len) {
+ if (++i == entries)
+ return BLK_STS_OK;
+ dma_addr = le64_to_cpu(sg_list[i].addr);
+ dma_len = le32_to_cpu(sg_list[i].length);
+ } else {
+ dma_addr += prp_len;
}
- /*
- * PRP2 is usually a list, but can point to data if all data to be
- * transferred fits into PRP1 + PRP2:
- */
- if (length <= NVME_CTRL_PAGE_SIZE) {
- prp2_dma = iter->addr;
- iod->total_len += length;
+ if (i + 1 == entries && dma_len <= NVME_CTRL_PAGE_SIZE) {
+ prp2_dma = dma_addr;
goto done;
}
- if (DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE) <=
+ if (DIV_ROUND_UP(iod->total_len - prp_len, NVME_CTRL_PAGE_SIZE) <=
NVME_SMALL_POOL_SIZE / sizeof(__le64))
iod->flags |= IOD_SMALL_DESCRIPTOR;
- prp_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
- &prp2_dma);
- if (!prp_list) {
- iter->status = BLK_STS_RESOURCE;
- goto done;
- }
- iod->descriptors[iod->nr_descriptors++] = prp_list;
+ pool = nvme_dma_pool(nvmeq, iod);
+ prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp2_dma);
+ if (!prp_list)
+ return BLK_STS_RESOURCE;
- i = 0;
+ j = 0;
for (;;) {
- prp_list[i++] = cpu_to_le64(iter->addr);
- prp_len = min(length, NVME_CTRL_PAGE_SIZE);
- if (WARN_ON_ONCE(iter->len < prp_len))
- goto bad_sgl;
-
- iod->total_len += prp_len;
- iter->addr += prp_len;
- iter->len -= prp_len;
- length -= prp_len;
- if (!length)
- break;
-
- if (iter->len == 0) {
- if (!blk_rq_dma_map_iter_next(req, nvmeq->dev->dev,
- &iod->dma_state, iter)) {
- if (WARN_ON_ONCE(!iter->status))
- goto bad_sgl;
- goto done;
- }
- }
-
- /*
- * If we've filled the entire descriptor, allocate a new that is
- * pointed to be the last entry in the previous PRP list. To
- * accommodate for that move the last actual entry to the new
- * descriptor.
- */
- if (i == NVME_CTRL_PAGE_SIZE >> 3) {
- __le64 *old_prp_list = prp_list;
- dma_addr_t prp_list_dma;
-
- prp_list = dma_pool_alloc(nvmeq->descriptor_pools.large,
- GFP_ATOMIC, &prp_list_dma);
- if (!prp_list) {
- iter->status = BLK_STS_RESOURCE;
- goto done;
- }
- iod->descriptors[iod->nr_descriptors++] = prp_list;
+ ret = nvme_pci_setup_prp_list(dma_addr, dma_len, &j, &prp_list,
+ iod, nvmeq);
+ if (ret)
+ return ret;
- prp_list[0] = old_prp_list[i - 1];
- old_prp_list[i - 1] = cpu_to_le64(prp_list_dma);
- i = 1;
- }
+ if (++i == entries)
+ break;
+ dma_addr = le64_to_cpu(sg_list[i].addr);
+ dma_len = le32_to_cpu(sg_list[i].length);
}
-
done:
- /*
- * nvme_unmap_data uses the DPT field in the SQE to tear down the
- * mapping, so initialize it even for failures.
- */
iod->cmd.common.dptr.prp1 = cpu_to_le64(prp1_dma);
iod->cmd.common.dptr.prp2 = cpu_to_le64(prp2_dma);
- if (unlikely(iter->status))
- nvme_unmap_data(req);
- return iter->status;
-
-bad_sgl:
- dev_err_once(nvmeq->dev->dev,
- "Incorrectly formed request for payload:%d nents:%d\n",
- blk_rq_payload_bytes(req), blk_rq_nr_phys_segments(req));
- return BLK_STS_IOERR;
+ return ret;
}
static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
@@ -897,49 +839,50 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
}
static blk_status_t nvme_pci_setup_data_sgl(struct request *req,
- struct blk_dma_iter *iter)
+ struct nvme_iod *iod, struct blk_dma_iter *iter)
+
{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ unsigned int entries = blk_rq_nr_phys_segments(req), mapped = 0;
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
- unsigned int entries = blk_rq_nr_phys_segments(req);
struct nvme_sgl_desc *sg_list;
+ struct dma_pool *pool;
dma_addr_t sgl_dma;
- unsigned int mapped = 0;
-
- /* set the transfer type as SGL */
- iod->cmd.common.flags = NVME_CMD_SGL_METABUF;
if (entries == 1 || blk_rq_dma_map_coalesce(&iod->dma_state)) {
- nvme_pci_sgl_set_data(&iod->cmd.common.dptr.sgl, iter);
+ nvme_pci_sgl_set_data(&iod->sgl, iter);
iod->total_len += iter->len;
return BLK_STS_OK;
}
if (entries <= NVME_SMALL_POOL_SIZE / sizeof(*sg_list))
- iod->flags |= IOD_SMALL_DESCRIPTOR;
+ pool = nvmeq->descriptor_pools.small;
+ else
+ pool = nvmeq->descriptor_pools.large;
- sg_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
- &sgl_dma);
+ sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
if (!sg_list)
return BLK_STS_RESOURCE;
- iod->descriptors[iod->nr_descriptors++] = sg_list;
+ iod->descriptors[iod->nr_descriptors++] = sg_list;
do {
if (WARN_ON_ONCE(mapped == entries)) {
iter->status = BLK_STS_IOERR;
break;
}
+
nvme_pci_sgl_set_data(&sg_list[mapped++], iter);
iod->total_len += iter->len;
} while (blk_rq_dma_map_iter_next(req, nvmeq->dev->dev, &iod->dma_state,
iter));
- nvme_pci_sgl_set_seg(&iod->cmd.common.dptr.sgl, sgl_dma, mapped);
if (unlikely(iter->status))
- nvme_free_sgls(req);
+ nvme_free_sgls(req, iod);
+ else
+ nvme_pci_sgl_set_seg(&iod->sgl, sgl_dma, mapped);
return iter->status;
}
+
static blk_status_t nvme_pci_setup_data_simple(struct request *req,
enum nvme_use_sgl use_sgl)
{
@@ -979,6 +922,13 @@ static blk_status_t nvme_pci_setup_data_simple(struct request *req,
return BLK_STS_OK;
}
+static inline bool nvme_check_sgl_threshold(struct request *req,
+ enum nvme_use_sgl use_sgl)
+{
+ return use_sgl == SGL_SUPPORTED && sgl_threshold &&
+ nvme_pci_avg_seg_size(req) >= sgl_threshold;
+}
+
static blk_status_t nvme_map_data(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1001,11 +951,18 @@ static blk_status_t nvme_map_data(struct request *req)
if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter))
return iter.status;
- if (use_sgl == SGL_FORCED ||
- (use_sgl == SGL_SUPPORTED &&
- (sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold)))
- return nvme_pci_setup_data_sgl(req, &iter);
- return nvme_pci_setup_data_prp(req, &iter);
+ ret = nvme_pci_setup_data_sgl(req, iod, &iter);
+ if (ret)
+ return ret;
+
+ if (use_sgl == SGL_FORCED || nvme_check_sgl_threshold(req, use_sgl)) {
+ iod->cmd.common.flags = NVME_CMD_SGL_METABUF;
+ iod->cmd.common.dptr.sgl = iod->sgl;
+ } else {
+ ret = nvme_pci_sgl_to_prp(req, iod);
+ }
+
+ return ret;
}
static blk_status_t nvme_pci_setup_meta_sgls(struct request *req)
@@ -1075,6 +1032,7 @@ static blk_status_t nvme_prep_rq(struct request *req)
iod->flags = 0;
iod->nr_descriptors = 0;
iod->total_len = 0;
+ iod->sgl.type = 0;
ret = nvme_setup_cmd(req->q->queuedata, req);
if (ret)
--
^ permalink raw reply related [flat|nested] 10+ messages in thread* Re: [PATCH] nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping
2025-07-07 12:52 [PATCH] nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping Christoph Hellwig
` (2 preceding siblings ...)
2025-07-08 12:55 ` Jens Axboe
@ 2025-07-11 0:25 ` Klara Modin
2025-07-11 0:37 ` Keith Busch
2025-07-11 8:32 ` Christoph Hellwig
3 siblings, 2 replies; 10+ messages in thread
From: Klara Modin @ 2025-07-11 0:25 UTC (permalink / raw)
To: Christoph Hellwig
Cc: kbusch, axboe, sagi, ben.copeland, leon, linux-nvme, linux-block
Hi,
On 2025-07-07 14:52:23 +0200, Christoph Hellwig wrote:
> The current version of the blk_rq_dma_map support in nvme-pci tries to
> reconstruct the DMA mappings from the on the wire descriptors if they
> are needed for unmapping. While this is not the case for the direct
> mapping fast path and the IOVA path, it is needed for the non-IOVA slow
> path, e.g. when using the interconnect is not dma coherent, when using
> swiotlb bounce buffering, or a IOMMU mapping that can't coalesce.
>
> While the reconstruction is easy and works fine for the SGL path, where
> the on the wire representation maps 1:1 to DMA mappings, the code to
> reconstruct the DMA mapping ranges from PRPs can't always work, as a
> given PRP layout can come from different DMA mappings, and the current
> code doesn't even always get that right.
>
> Give up on this approach and track the actual DMA mapping when actually
> needed again.
>
> Fixes: 7ce3c1dd78fc ("nvme-pci: convert the data mapping to blk_rq_dma_map")
> Reported-by: Ben Copeland <ben.copeland@linaro.org>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
This patch seems to introduce a memory leak, causing the slab to grow
continuously. The easiest way I found to trigger it is along the lines
of
# dd if=/dev/nvme0n1 of=/dev/null bs=8M
It also seems to happen during normal use but much more slowly.
Reverting fixes the issue.
# cat /proc/allocinfo | sort -n | numfmt --invalid=ignore --to=iec-i | tail -n5
146Mi 5285 mm/memory.c:4977 func:alloc_anon_folio
182Mi 24395 mm/shmem.c:1847 func:shmem_alloc_folio
420Mi 38746 mm/readahead.c:186 func:ractl_alloc_folio
29Gi 7383484 drivers/nvme/host/pci.c:767 func:nvme_pci_setup_data_prp
29Gi 937347 mm/slub.c:2487 func:alloc_slab_page
# slabtop --human --once | head
Active / Total Objects (% used) : 8966529 / 9031691 (99.3%)
Active / Total Slabs (% used) : 971257 / 971257 (100.0%)
Active / Total Caches (% used) : 128 / 211 (60.7%)
Active / Total Size (% used) : 28Gi / 28Gi (99.9%)
Minimum / Average / Maximum Object : 0.01K / 3.32K / 8.00K
OBJS ACTIVE USE OBJ SIZE SLABS OBJ/SLAB CACHE SIZE NAME
7410112 7410112 100% 4.00K 926264 8 28Gi kmalloc-4k
929920 929920 100% 0.12K 29060 32 113Mi kmalloc-128
81744 80996 99% 0.10K 2096 39 8.2Mi btrfs_free_space
My machine has 32 GiB of memory and if I leave it be it locks up and
sometimes panics.
# nvme id-ctrl /dev/nvme0 | grep sg
sgls : 0
Let me know if there's anything else you need.
Regards,
Klara Modin
> ---
> drivers/nvme/host/pci.c | 114 ++++++++++++++++++++++------------------
> 1 file changed, 62 insertions(+), 52 deletions(-)
>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 6a7a8bdbf385..6af184f2b73b 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -173,6 +173,7 @@ struct nvme_dev {
> bool hmb;
> struct sg_table *hmb_sgt;
>
> + mempool_t *dmavec_mempool;
> mempool_t *iod_meta_mempool;
>
> /* shadow doorbell buffer support: */
> @@ -262,6 +263,11 @@ enum nvme_iod_flags {
> IOD_SINGLE_SEGMENT = 1U << 2,
> };
>
> +struct nvme_dma_vec {
> + dma_addr_t addr;
> + unsigned int len;
> +};
> +
> /*
> * The nvme_iod describes the data in an I/O.
> */
> @@ -274,6 +280,8 @@ struct nvme_iod {
> unsigned int total_len;
> struct dma_iova_state dma_state;
> void *descriptors[NVME_MAX_NR_DESCRIPTORS];
> + struct nvme_dma_vec *dma_vecs;
> + unsigned int nr_dma_vecs;
>
> dma_addr_t meta_dma;
> struct sg_table meta_sgt;
> @@ -674,44 +682,12 @@ static void nvme_free_prps(struct request *req)
> {
> struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
> - struct device *dma_dev = nvmeq->dev->dev;
> - enum dma_data_direction dir = rq_dma_dir(req);
> - int length = iod->total_len;
> - dma_addr_t dma_addr;
> - int i, desc;
> - __le64 *prp_list;
> - u32 dma_len;
> -
> - dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp1);
> - dma_len = min_t(u32, length,
> - NVME_CTRL_PAGE_SIZE - (dma_addr & (NVME_CTRL_PAGE_SIZE - 1)));
> - length -= dma_len;
> - if (!length) {
> - dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
> - return;
> - }
> -
> - if (length <= NVME_CTRL_PAGE_SIZE) {
> - dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
> - dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp2);
> - dma_unmap_page(dma_dev, dma_addr, length, dir);
> - return;
> - }
> -
> - i = 0;
> - desc = 0;
> - prp_list = iod->descriptors[desc];
> - do {
> - dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
> - if (i == NVME_CTRL_PAGE_SIZE >> 3) {
> - prp_list = iod->descriptors[++desc];
> - i = 0;
> - }
> + unsigned int i;
>
> - dma_addr = le64_to_cpu(prp_list[i++]);
> - dma_len = min(length, NVME_CTRL_PAGE_SIZE);
> - length -= dma_len;
> - } while (length);
> + for (i = 0; i < iod->nr_dma_vecs; i++)
> + dma_unmap_page(nvmeq->dev->dev, iod->dma_vecs[i].addr,
> + iod->dma_vecs[i].len, rq_dma_dir(req));
> + mempool_free(iod->dma_vecs, nvmeq->dev->dmavec_mempool);
> }
>
> static void nvme_free_sgls(struct request *req)
> @@ -760,6 +736,23 @@ static void nvme_unmap_data(struct request *req)
> nvme_free_descriptors(req);
> }
>
> +static bool nvme_pci_prp_iter_next(struct request *req, struct device *dma_dev,
> + struct blk_dma_iter *iter)
> +{
> + struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> +
> + if (iter->len)
> + return true;
> + if (!blk_rq_dma_map_iter_next(req, dma_dev, &iod->dma_state, iter))
> + return false;
> + if (dma_need_unmap(dma_dev)) {
> + iod->dma_vecs[iod->nr_dma_vecs].addr = iter->addr;
> + iod->dma_vecs[iod->nr_dma_vecs].len = iter->len;
> + iod->nr_dma_vecs++;
> + }
> + return true;
> +}
> +
> static blk_status_t nvme_pci_setup_data_prp(struct request *req,
> struct blk_dma_iter *iter)
> {
> @@ -770,6 +763,16 @@ static blk_status_t nvme_pci_setup_data_prp(struct request *req,
> unsigned int prp_len, i;
> __le64 *prp_list;
>
> + if (dma_need_unmap(nvmeq->dev->dev)) {
> + iod->dma_vecs = mempool_alloc(nvmeq->dev->dmavec_mempool,
> + GFP_ATOMIC);
> + if (!iod->dma_vecs)
> + return BLK_STS_RESOURCE;
> + iod->dma_vecs[0].addr = iter->addr;
> + iod->dma_vecs[0].len = iter->len;
> + iod->nr_dma_vecs = 1;
> + }
> +
> /*
> * PRP1 always points to the start of the DMA transfers.
> *
> @@ -786,13 +789,10 @@ static blk_status_t nvme_pci_setup_data_prp(struct request *req,
> if (!length)
> goto done;
>
> - if (!iter->len) {
> - if (!blk_rq_dma_map_iter_next(req, nvmeq->dev->dev,
> - &iod->dma_state, iter)) {
> - if (WARN_ON_ONCE(!iter->status))
> - goto bad_sgl;
> - goto done;
> - }
> + if (!nvme_pci_prp_iter_next(req, nvmeq->dev->dev, iter)) {
> + if (WARN_ON_ONCE(!iter->status))
> + goto bad_sgl;
> + goto done;
> }
>
> /*
> @@ -831,13 +831,10 @@ static blk_status_t nvme_pci_setup_data_prp(struct request *req,
> if (!length)
> break;
>
> - if (iter->len == 0) {
> - if (!blk_rq_dma_map_iter_next(req, nvmeq->dev->dev,
> - &iod->dma_state, iter)) {
> - if (WARN_ON_ONCE(!iter->status))
> - goto bad_sgl;
> - goto done;
> - }
> + if (!nvme_pci_prp_iter_next(req, nvmeq->dev->dev, iter)) {
> + if (WARN_ON_ONCE(!iter->status))
> + goto bad_sgl;
> + goto done;
> }
>
> /*
> @@ -3025,14 +3022,25 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
> static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
> {
> size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
> + size_t alloc_size = sizeof(struct nvme_dma_vec) * NVME_MAX_SEGS;
> +
> + dev->dmavec_mempool = mempool_create_node(1,
> + mempool_kmalloc, mempool_kfree,
> + (void *)alloc_size, GFP_KERNEL,
> + dev_to_node(dev->dev));
> + if (!dev->dmavec_mempool)
> + return -ENOMEM;
>
> dev->iod_meta_mempool = mempool_create_node(1,
> mempool_kmalloc, mempool_kfree,
> (void *)meta_size, GFP_KERNEL,
> dev_to_node(dev->dev));
> if (!dev->iod_meta_mempool)
> - return -ENOMEM;
> + goto free;
> return 0;
> +free:
> + mempool_destroy(dev->dmavec_mempool);
> + return -ENOMEM;
> }
>
> static void nvme_free_tagset(struct nvme_dev *dev)
> @@ -3477,6 +3485,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> nvme_dbbuf_dma_free(dev);
> nvme_free_queues(dev, 0);
> out_release_iod_mempool:
> + mempool_destroy(dev->dmavec_mempool);
> mempool_destroy(dev->iod_meta_mempool);
> out_dev_unmap:
> nvme_dev_unmap(dev);
> @@ -3540,6 +3549,7 @@ static void nvme_remove(struct pci_dev *pdev)
> nvme_dev_remove_admin(dev);
> nvme_dbbuf_dma_free(dev);
> nvme_free_queues(dev, 0);
> + mempool_destroy(dev->dmavec_mempool);
> mempool_destroy(dev->iod_meta_mempool);
> nvme_release_descriptor_pools(dev);
> nvme_dev_unmap(dev);
> --
> 2.47.2
>
^ permalink raw reply [flat|nested] 10+ messages in thread