[PATCH] nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping

Linux block layer
 help / color / mirror / Atom feed

* [PATCH] nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping
@ 2025-07-07 12:52 Christoph Hellwig
  2025-07-07 14:29 ` Keith Busch
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: Christoph Hellwig @ 2025-07-07 12:52 UTC (permalink / raw)
  To: kbusch, axboe, sagi; +Cc: ben.copeland, leon, linux-nvme, linux-block

The current version of the blk_rq_dma_map support in nvme-pci tries to
reconstruct the DMA mappings from the on the wire descriptors if they
are needed for unmapping.  While this is not the case for the direct
mapping fast path and the IOVA path, it is needed for the non-IOVA slow
path, e.g. when using the interconnect is not dma coherent, when using
swiotlb bounce buffering, or a IOMMU mapping that can't coalesce.

While the reconstruction is easy and works fine for the SGL path, where
the on the wire representation maps 1:1 to DMA mappings, the code to
reconstruct the DMA mapping ranges from PRPs can't always work, as a
given PRP layout can come from different DMA mappings, and the current
code doesn't even always get that right.

Give up on this approach and track the actual DMA mapping when actually
needed again.

Fixes: 7ce3c1dd78fc ("nvme-pci: convert the data mapping to blk_rq_dma_map")
Reported-by: Ben Copeland <ben.copeland@linaro.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/pci.c | 114 ++++++++++++++++++++++------------------
 1 file changed, 62 insertions(+), 52 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 6a7a8bdbf385..6af184f2b73b 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -173,6 +173,7 @@ struct nvme_dev {
 	bool hmb;
 	struct sg_table *hmb_sgt;
 
+	mempool_t *dmavec_mempool;
 	mempool_t *iod_meta_mempool;
 
 	/* shadow doorbell buffer support: */
@@ -262,6 +263,11 @@ enum nvme_iod_flags {
 	IOD_SINGLE_SEGMENT	= 1U << 2,
 };
 
+struct nvme_dma_vec {
+	dma_addr_t addr;
+	unsigned int len;
+};
+
 /*
  * The nvme_iod describes the data in an I/O.
  */
@@ -274,6 +280,8 @@ struct nvme_iod {
 	unsigned int total_len;
 	struct dma_iova_state dma_state;
 	void *descriptors[NVME_MAX_NR_DESCRIPTORS];
+	struct nvme_dma_vec *dma_vecs;
+	unsigned int nr_dma_vecs;
 
 	dma_addr_t meta_dma;
 	struct sg_table meta_sgt;
@@ -674,44 +682,12 @@ static void nvme_free_prps(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
-	struct device *dma_dev = nvmeq->dev->dev;
-	enum dma_data_direction dir = rq_dma_dir(req);
-	int length = iod->total_len;
-	dma_addr_t dma_addr;
-	int i, desc;
-	__le64 *prp_list;
-	u32 dma_len;
-
-	dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp1);
-	dma_len = min_t(u32, length,
-		NVME_CTRL_PAGE_SIZE - (dma_addr & (NVME_CTRL_PAGE_SIZE - 1)));
-	length -= dma_len;
-	if (!length) {
-		dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
-		return;
-	}
-
-	if (length <= NVME_CTRL_PAGE_SIZE) {
-		dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
-		dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp2);
-		dma_unmap_page(dma_dev, dma_addr, length, dir);
-		return;
-	}
-
-	i = 0;
-	desc = 0;
-	prp_list = iod->descriptors[desc];
-	do {
-		dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
-		if (i == NVME_CTRL_PAGE_SIZE >> 3) {
-			prp_list = iod->descriptors[++desc];
-			i = 0;
-		}
+	unsigned int i;
 
-		dma_addr = le64_to_cpu(prp_list[i++]);
-		dma_len = min(length, NVME_CTRL_PAGE_SIZE);
-		length -= dma_len;
-	} while (length);
+	for (i = 0; i < iod->nr_dma_vecs; i++)
+		dma_unmap_page(nvmeq->dev->dev, iod->dma_vecs[i].addr,
+				iod->dma_vecs[i].len, rq_dma_dir(req));
+	mempool_free(iod->dma_vecs, nvmeq->dev->dmavec_mempool);
 }
 
 static void nvme_free_sgls(struct request *req)
@@ -760,6 +736,23 @@ static void nvme_unmap_data(struct request *req)
 		nvme_free_descriptors(req);
 }
 
+static bool nvme_pci_prp_iter_next(struct request *req, struct device *dma_dev,
+		struct blk_dma_iter *iter)
+{
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+
+	if (iter->len)
+		return true;
+	if (!blk_rq_dma_map_iter_next(req, dma_dev, &iod->dma_state, iter))
+		return false;
+	if (dma_need_unmap(dma_dev)) {
+		iod->dma_vecs[iod->nr_dma_vecs].addr = iter->addr;
+		iod->dma_vecs[iod->nr_dma_vecs].len = iter->len;
+		iod->nr_dma_vecs++;
+	}
+	return true;
+}
+
 static blk_status_t nvme_pci_setup_data_prp(struct request *req,
 		struct blk_dma_iter *iter)
 {
@@ -770,6 +763,16 @@ static blk_status_t nvme_pci_setup_data_prp(struct request *req,
 	unsigned int prp_len, i;
 	__le64 *prp_list;
 
+	if (dma_need_unmap(nvmeq->dev->dev)) {
+		iod->dma_vecs = mempool_alloc(nvmeq->dev->dmavec_mempool,
+				GFP_ATOMIC);
+		if (!iod->dma_vecs)
+			return BLK_STS_RESOURCE;
+		iod->dma_vecs[0].addr = iter->addr;
+		iod->dma_vecs[0].len = iter->len;
+		iod->nr_dma_vecs = 1;
+	}
+
 	/*
 	 * PRP1 always points to the start of the DMA transfers.
 	 *
@@ -786,13 +789,10 @@ static blk_status_t nvme_pci_setup_data_prp(struct request *req,
 	if (!length)
 		goto done;
 
-	if (!iter->len) {
-		if (!blk_rq_dma_map_iter_next(req, nvmeq->dev->dev,
-				&iod->dma_state, iter)) {
-			if (WARN_ON_ONCE(!iter->status))
-				goto bad_sgl;
-			goto done;
-		}
+	if (!nvme_pci_prp_iter_next(req, nvmeq->dev->dev, iter)) {
+		if (WARN_ON_ONCE(!iter->status))
+			goto bad_sgl;
+		goto done;
 	}
 
 	/*
@@ -831,13 +831,10 @@ static blk_status_t nvme_pci_setup_data_prp(struct request *req,
 		if (!length)
 			break;
 
-		if (iter->len == 0) {
-			if (!blk_rq_dma_map_iter_next(req, nvmeq->dev->dev,
-					&iod->dma_state, iter)) {
-				if (WARN_ON_ONCE(!iter->status))
-					goto bad_sgl;
-				goto done;
-			}
+		if (!nvme_pci_prp_iter_next(req, nvmeq->dev->dev, iter)) {
+			if (WARN_ON_ONCE(!iter->status))
+				goto bad_sgl;
+			goto done;
 		}
 
 		/*
@@ -3025,14 +3022,25 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
 static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
 {
 	size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
+	size_t alloc_size = sizeof(struct nvme_dma_vec) * NVME_MAX_SEGS;
+
+	dev->dmavec_mempool = mempool_create_node(1,
+			mempool_kmalloc, mempool_kfree,
+			(void *)alloc_size, GFP_KERNEL,
+			dev_to_node(dev->dev));
+	if (!dev->dmavec_mempool)
+		return -ENOMEM;
 
 	dev->iod_meta_mempool = mempool_create_node(1,
 			mempool_kmalloc, mempool_kfree,
 			(void *)meta_size, GFP_KERNEL,
 			dev_to_node(dev->dev));
 	if (!dev->iod_meta_mempool)
-		return -ENOMEM;
+		goto free;
 	return 0;
+free:
+	mempool_destroy(dev->dmavec_mempool);
+	return -ENOMEM;
 }
 
 static void nvme_free_tagset(struct nvme_dev *dev)
@@ -3477,6 +3485,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	nvme_dbbuf_dma_free(dev);
 	nvme_free_queues(dev, 0);
 out_release_iod_mempool:
+	mempool_destroy(dev->dmavec_mempool);
 	mempool_destroy(dev->iod_meta_mempool);
 out_dev_unmap:
 	nvme_dev_unmap(dev);
@@ -3540,6 +3549,7 @@ static void nvme_remove(struct pci_dev *pdev)
 	nvme_dev_remove_admin(dev);
 	nvme_dbbuf_dma_free(dev);
 	nvme_free_queues(dev, 0);
+	mempool_destroy(dev->dmavec_mempool);
 	mempool_destroy(dev->iod_meta_mempool);
 	nvme_release_descriptor_pools(dev);
 	nvme_dev_unmap(dev);
-- 
2.47.2


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping
  2025-07-07 12:52 [PATCH] nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping Christoph Hellwig
@ 2025-07-07 14:29 ` Keith Busch
  2025-07-07 15:01   ` Christoph Hellwig
  2025-07-07 15:13 ` Keith Busch
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 10+ messages in thread
From: Keith Busch @ 2025-07-07 14:29 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe, sagi, ben.copeland, leon, linux-nvme, linux-block

On Mon, Jul 07, 2025 at 02:52:23PM +0200, Christoph Hellwig wrote:
> While the reconstruction is easy and works fine for the SGL path, where
> the on the wire representation maps 1:1 to DMA mappings, the code to
> reconstruct the DMA mapping ranges from PRPs can't always work, as a
> given PRP layout can come from different DMA mappings, and the current
> code doesn't even always get that right.

Given it works fine for SGL, how do you feel about unconditionally
creating an NVMe SGL, and then call some "nvme_sgl_to_prp()" helper only
when needed? This means we only have one teardown path that matches the
setup. 

This is something I hacked up over the weekend. It's only lightly
tested, and I know there's a bug somewhere here with the chainging, but
it's a start.

---
 drivers/nvme/host/pci.c | 342 +++++++++++++++++++++++++--------------------------------
 1 file changed, 150 insertions(+), 192 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 10aa04879d6996..7ef56775e3be9 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -41,7 +41,7 @@
  * Arbitrary upper bound.
  */
 #define NVME_MAX_BYTES		SZ_8M
-#define NVME_MAX_NR_DESCRIPTORS	5
+#define NVME_MAX_NR_DESCRIPTORS	6
 
 /*
  * For data SGLs we support a single descriptors worth of SGL entries.
@@ -70,7 +70,7 @@
 	(NVME_MAX_BYTES + 2 * (NVME_CTRL_PAGE_SIZE - 1))
 
 static_assert(MAX_PRP_RANGE / NVME_CTRL_PAGE_SIZE <=
-	(1 /* prp1 */ + NVME_MAX_NR_DESCRIPTORS * PRPS_PER_PAGE));
+	(1 /* prp1 */ + (NVME_MAX_NR_DESCRIPTORS - 1) * PRPS_PER_PAGE));
 
 static int use_threaded_interrupts;
 module_param(use_threaded_interrupts, int, 0444);
@@ -273,6 +273,7 @@ struct nvme_iod {
 	unsigned int total_len;
 	struct dma_iova_state dma_state;
 	void *descriptors[NVME_MAX_NR_DESCRIPTORS];
+	struct nvme_sgl_desc sgl;
 
 	dma_addr_t meta_dma;
 	struct nvme_sgl_desc *meta_descriptor;
@@ -644,99 +645,61 @@ static inline dma_addr_t nvme_pci_first_desc_dma_addr(struct nvme_command *cmd)
 	return le64_to_cpu(cmd->common.dptr.prp2);
 }
 
-static void nvme_free_descriptors(struct request *req)
+static void nvme_free_descriptors(struct request *req, struct nvme_iod *iod)
 {
 	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
 	const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
-	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-	dma_addr_t dma_addr = nvme_pci_first_desc_dma_addr(&iod->cmd);
-	int i;
+	struct dma_pool *pool;
+	dma_addr_t dma_addr;
+	int i = 0;
 
-	if (iod->nr_descriptors == 1) {
-		dma_pool_free(nvme_dma_pool(nvmeq, iod), iod->descriptors[0],
-				dma_addr);
-		return;
+	if (iod->sgl.type == NVME_SGL_FMT_LAST_SEG_DESC << 4) {
+		struct nvme_sgl_desc *sg_list = iod->descriptors[0];
+		unsigned int dma_len = le32_to_cpu(iod->sgl.length);
+		unsigned int nr_entries = dma_len / sizeof(*sg_list);
+
+		if (nr_entries <= NVME_SMALL_POOL_SIZE / sizeof(__le64))
+			pool = nvmeq->descriptor_pools.small;
+		else
+			pool = nvmeq->descriptor_pools.large;
+
+		dma_addr = le64_to_cpu(iod->sgl.addr);
+		dma_pool_free(pool, iod->descriptors[i++], dma_addr);
 	}
 
-	for (i = 0; i < iod->nr_descriptors; i++) {
+	pool = nvme_dma_pool(nvmeq, iod);
+	for (; i < iod->nr_descriptors; i++) {
 		__le64 *prp_list = iod->descriptors[i];
 		dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);
 
-		dma_pool_free(nvmeq->descriptor_pools.large, prp_list,
-				dma_addr);
+		dma_pool_free(pool, prp_list, dma_addr);
 		dma_addr = next_dma_addr;
 	}
 }
 
-static void nvme_free_prps(struct request *req)
+static void nvme_free_sgls(struct request *req, struct nvme_iod *iod)
 {
-	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
-	struct device *dma_dev = nvmeq->dev->dev;
+	unsigned int dma_len = le32_to_cpu(iod->sgl.length);
 	enum dma_data_direction dir = rq_dma_dir(req);
-	int length = iod->total_len;
-	dma_addr_t dma_addr;
-	int i, desc;
-	__le64 *prp_list;
-	u32 dma_len;
-
-	dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp1);
-	dma_len = min_t(u32, length,
-		NVME_CTRL_PAGE_SIZE - (dma_addr & (NVME_CTRL_PAGE_SIZE - 1)));
-	length -= dma_len;
-	if (!length) {
-		dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
-		return;
-	}
-
-	if (length <= NVME_CTRL_PAGE_SIZE) {
-		dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
-		dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp2);
-		dma_unmap_page(dma_dev, dma_addr, length, dir);
-		return;
-	}
-
-	i = 0;
-	desc = 0;
-	prp_list = iod->descriptors[desc];
-	do {
-		dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
-		if (i == NVME_CTRL_PAGE_SIZE >> 3) {
-			prp_list = iod->descriptors[++desc];
-			i = 0;
-		}
-
-		dma_addr = le64_to_cpu(prp_list[i++]);
-		dma_len = min(length, NVME_CTRL_PAGE_SIZE);
-		length -= dma_len;
-	} while (length);
-}
-
-static void nvme_free_sgls(struct request *req)
-{
-	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
 	struct device *dma_dev = nvmeq->dev->dev;
-	dma_addr_t sqe_dma_addr = le64_to_cpu(iod->cmd.common.dptr.sgl.addr);
-	unsigned int sqe_dma_len = le32_to_cpu(iod->cmd.common.dptr.sgl.length);
-	struct nvme_sgl_desc *sg_list = iod->descriptors[0];
-	enum dma_data_direction dir = rq_dma_dir(req);
 
-	if (iod->nr_descriptors) {
-		unsigned int nr_entries = sqe_dma_len / sizeof(*sg_list), i;
+	if (iod->sgl.type == NVME_SGL_FMT_LAST_SEG_DESC << 4) {
+		struct nvme_sgl_desc *sg_list = iod->descriptors[0];
+		unsigned int i, nr_entries = dma_len / sizeof(*sg_list);
 
 		for (i = 0; i < nr_entries; i++)
 			dma_unmap_page(dma_dev, le64_to_cpu(sg_list[i].addr),
 				le32_to_cpu(sg_list[i].length), dir);
 	} else {
-		dma_unmap_page(dma_dev, sqe_dma_addr, sqe_dma_len, dir);
+		dma_unmap_page(dma_dev, le64_to_cpu(iod->sgl.addr), dma_len, dir);
 	}
 }
 
 static void nvme_unmap_data(struct request *req)
 {
-	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct device *dma_dev = nvmeq->dev->dev;
 
 	if (iod->flags & IOD_SINGLE_SEGMENT) {
@@ -747,137 +710,116 @@ static void nvme_unmap_data(struct request *req)
 		return;
 	}
 
-	if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) {
-		if (nvme_pci_cmd_use_sgl(&iod->cmd))
-			nvme_free_sgls(req);
-		else
-			nvme_free_prps(req);
+	if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len))
+		nvme_free_sgls(req, iod);
+	if (iod->nr_descriptors)
+		nvme_free_descriptors(req, iod);
+}
+
+static blk_status_t nvme_pci_setup_prp_list(dma_addr_t dma_addr,
+				unsigned int dma_len, unsigned int *index,
+				__le64 **pprp_list, struct nvme_iod *iod,
+				struct nvme_queue *nvmeq)
+{
+	__le64 *prp_list = *pprp_list;
+	int i = *index;
+
+	for (;;) {
+		unsigned int prp_len = min(dma_len, NVME_CTRL_PAGE_SIZE);
+
+		if (i == NVME_CTRL_PAGE_SIZE >> 3) {
+			__le64 *old_prp_list = prp_list;
+			dma_addr_t prp_list_dma;
+
+			BUG_ON(iod->nr_descriptors >= NVME_MAX_NR_DESCRIPTORS);
+			prp_list = dma_pool_alloc(nvmeq->descriptor_pools.large,
+						  GFP_ATOMIC, &prp_list_dma);
+			if (!prp_list)
+				return BLK_STS_RESOURCE;
+
+			iod->descriptors[iod->nr_descriptors++] = prp_list;
+			prp_list[0] = old_prp_list[i - 1];
+			old_prp_list[i - 1] = cpu_to_le64(prp_list_dma);
+			i = 1;
+		}
+
+		prp_list[i++] = cpu_to_le64(dma_addr);
+		dma_len -= prp_len;
+		if (dma_len == 0)
+			break;
+		dma_addr += prp_len;
 	}
 
-	if (iod->nr_descriptors)
-		nvme_free_descriptors(req);
+	*index = i;
+	*pprp_list = prp_list;
+	return BLK_STS_OK;
 }
 
-static blk_status_t nvme_pci_setup_data_prp(struct request *req,
-		struct blk_dma_iter *iter)
+static blk_status_t nvme_pci_sgl_to_prp(struct request *req,
+					struct nvme_iod *iod)
 {
-	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
-	unsigned int length = blk_rq_payload_bytes(req);
+	unsigned int dma_len, prp_len, entries, j, i = 0;
 	dma_addr_t prp1_dma, prp2_dma = 0;
-	unsigned int prp_len, i;
+	struct nvme_sgl_desc *sg_list;
+	struct dma_pool *pool;
+	dma_addr_t dma_addr;
 	__le64 *prp_list;
+	blk_status_t ret;
 
-	/*
-	 * PRP1 always points to the start of the DMA transfers.
-	 *
-	 * This is the only PRP (except for the list entries) that could be
-	 * non-aligned.
-	 */
-	prp1_dma = iter->addr;
-	prp_len = min(length, NVME_CTRL_PAGE_SIZE -
-			(iter->addr & (NVME_CTRL_PAGE_SIZE - 1)));
-	iod->total_len += prp_len;
-	iter->addr += prp_len;
-	iter->len -= prp_len;
-	length -= prp_len;
-	if (!length)
-		goto done;
-
-	if (!iter->len) {
-		if (!blk_rq_dma_map_iter_next(req, nvmeq->dev->dev,
-				&iod->dma_state, iter)) {
-			if (WARN_ON_ONCE(!iter->status))
-				goto bad_sgl;
-			goto done;
-		}
+	if (iod->sgl.type == NVME_SGL_FMT_LAST_SEG_DESC << 4) {
+		entries = iod->sgl.length / sizeof(*sg_list);
+		sg_list = iod->descriptors[0];
+	} else {
+		entries = 1;
+		sg_list = &iod->sgl;
+	}
+
+	dma_addr = le64_to_cpu(sg_list[i].addr);
+	dma_len = le32_to_cpu(sg_list[i].length);
+	prp1_dma = dma_addr;
+	prp_len = min(dma_len, NVME_CTRL_PAGE_SIZE -
+			(dma_addr & (NVME_CTRL_PAGE_SIZE - 1)));
+	dma_len -= prp_len;
+	if (!dma_len) {
+		if (++i == entries)
+			return BLK_STS_OK;
+		dma_addr = le64_to_cpu(sg_list[i].addr);
+		dma_len = le32_to_cpu(sg_list[i].length);
+	} else {
+		dma_addr += prp_len;
 	}
 
-	/*
-	 * PRP2 is usually a list, but can point to data if all data to be
-	 * transferred fits into PRP1 + PRP2:
-	 */
-	if (length <= NVME_CTRL_PAGE_SIZE) {
-		prp2_dma = iter->addr;
-		iod->total_len += length;
+	if (i + 1 == entries && dma_len <= NVME_CTRL_PAGE_SIZE) {
+		prp2_dma = dma_addr;
 		goto done;
 	}
 
-	if (DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE) <=
+	if (DIV_ROUND_UP(iod->total_len - prp_len, NVME_CTRL_PAGE_SIZE) <=
 	    NVME_SMALL_POOL_SIZE / sizeof(__le64))
 		iod->flags |= IOD_SMALL_DESCRIPTOR;
 
-	prp_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
-			&prp2_dma);
-	if (!prp_list) {
-		iter->status = BLK_STS_RESOURCE;
-		goto done;
-	}
-	iod->descriptors[iod->nr_descriptors++] = prp_list;
+	pool = nvme_dma_pool(nvmeq, iod);
+	prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp2_dma);
+	if (!prp_list)
+		return BLK_STS_RESOURCE;
 
-	i = 0;
+	j = 0;
 	for (;;) {
-		prp_list[i++] = cpu_to_le64(iter->addr);
-		prp_len = min(length, NVME_CTRL_PAGE_SIZE);
-		if (WARN_ON_ONCE(iter->len < prp_len))
-			goto bad_sgl;
-
-		iod->total_len += prp_len;
-		iter->addr += prp_len;
-		iter->len -= prp_len;
-		length -= prp_len;
-		if (!length)
-			break;
-
-		if (iter->len == 0) {
-			if (!blk_rq_dma_map_iter_next(req, nvmeq->dev->dev,
-					&iod->dma_state, iter)) {
-				if (WARN_ON_ONCE(!iter->status))
-					goto bad_sgl;
-				goto done;
-			}
-		}
-
-		/*
-		 * If we've filled the entire descriptor, allocate a new that is
-		 * pointed to be the last entry in the previous PRP list.  To
-		 * accommodate for that move the last actual entry to the new
-		 * descriptor.
-		 */
-		if (i == NVME_CTRL_PAGE_SIZE >> 3) {
-			__le64 *old_prp_list = prp_list;
-			dma_addr_t prp_list_dma;
-
-			prp_list = dma_pool_alloc(nvmeq->descriptor_pools.large,
-					GFP_ATOMIC, &prp_list_dma);
-			if (!prp_list) {
-				iter->status = BLK_STS_RESOURCE;
-				goto done;
-			}
-			iod->descriptors[iod->nr_descriptors++] = prp_list;
+		ret = nvme_pci_setup_prp_list(dma_addr, dma_len, &j, &prp_list,
+						iod, nvmeq);
+		if (ret)
+			return ret;
 
-			prp_list[0] = old_prp_list[i - 1];
-			old_prp_list[i - 1] = cpu_to_le64(prp_list_dma);
-			i = 1;
-		}
+		if (++i == entries)
+			break;
+		dma_addr = le64_to_cpu(sg_list[i].addr);
+		dma_len = le32_to_cpu(sg_list[i].length);
 	}
-
 done:
-	/*
-	 * nvme_unmap_data uses the DPT field in the SQE to tear down the
-	 * mapping, so initialize it even for failures.
-	 */
 	iod->cmd.common.dptr.prp1 = cpu_to_le64(prp1_dma);
 	iod->cmd.common.dptr.prp2 = cpu_to_le64(prp2_dma);
-	if (unlikely(iter->status))
-		nvme_unmap_data(req);
-	return iter->status;
-
-bad_sgl:
-	dev_err_once(nvmeq->dev->dev,
-		"Incorrectly formed request for payload:%d nents:%d\n",
-		blk_rq_payload_bytes(req), blk_rq_nr_phys_segments(req));
-	return BLK_STS_IOERR;
+	return ret;
 }
 
 static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
@@ -897,49 +839,50 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
 }
 
 static blk_status_t nvme_pci_setup_data_sgl(struct request *req,
-		struct blk_dma_iter *iter)
+		struct nvme_iod *iod, struct blk_dma_iter *iter)
+
 {
-	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	unsigned int entries = blk_rq_nr_phys_segments(req), mapped = 0;
 	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
-	unsigned int entries = blk_rq_nr_phys_segments(req);
 	struct nvme_sgl_desc *sg_list;
+	struct dma_pool *pool;
 	dma_addr_t sgl_dma;
-	unsigned int mapped = 0;
-
-	/* set the transfer type as SGL */
-	iod->cmd.common.flags = NVME_CMD_SGL_METABUF;
 
 	if (entries == 1 || blk_rq_dma_map_coalesce(&iod->dma_state)) {
-		nvme_pci_sgl_set_data(&iod->cmd.common.dptr.sgl, iter);
+		nvme_pci_sgl_set_data(&iod->sgl, iter);
 		iod->total_len += iter->len;
 		return BLK_STS_OK;
 	}
 
 	if (entries <= NVME_SMALL_POOL_SIZE / sizeof(*sg_list))
-		iod->flags |= IOD_SMALL_DESCRIPTOR;
+		pool = nvmeq->descriptor_pools.small;
+	else
+		pool = nvmeq->descriptor_pools.large;
 
-	sg_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
-			&sgl_dma);
+	sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
 	if (!sg_list)
 		return BLK_STS_RESOURCE;
-	iod->descriptors[iod->nr_descriptors++] = sg_list;
 
+	iod->descriptors[iod->nr_descriptors++] = sg_list;
 	do {
 		if (WARN_ON_ONCE(mapped == entries)) {
 			iter->status = BLK_STS_IOERR;
 			break;
 		}
+
 		nvme_pci_sgl_set_data(&sg_list[mapped++], iter);
 		iod->total_len += iter->len;
 	} while (blk_rq_dma_map_iter_next(req, nvmeq->dev->dev, &iod->dma_state,
 				iter));
 
-	nvme_pci_sgl_set_seg(&iod->cmd.common.dptr.sgl, sgl_dma, mapped);
 	if (unlikely(iter->status))
-		nvme_free_sgls(req);
+		nvme_free_sgls(req, iod);
+	else
+		nvme_pci_sgl_set_seg(&iod->sgl, sgl_dma, mapped);
 	return iter->status;
 }
 
+
 static blk_status_t nvme_pci_setup_data_simple(struct request *req,
 		enum nvme_use_sgl use_sgl)
 {
@@ -979,6 +922,13 @@ static blk_status_t nvme_pci_setup_data_simple(struct request *req,
 	return BLK_STS_OK;
 }
 
+static inline bool nvme_check_sgl_threshold(struct request *req,
+					enum nvme_use_sgl use_sgl)
+{
+	return use_sgl == SGL_SUPPORTED && sgl_threshold &&
+		nvme_pci_avg_seg_size(req) >= sgl_threshold;
+}
+
 static blk_status_t nvme_map_data(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1001,11 +951,18 @@ static blk_status_t nvme_map_data(struct request *req)
 	if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter))
 		return iter.status;
 
-	if (use_sgl == SGL_FORCED ||
-	    (use_sgl == SGL_SUPPORTED &&
-	     (sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold)))
-		return nvme_pci_setup_data_sgl(req, &iter);
-	return nvme_pci_setup_data_prp(req, &iter);
+	ret = nvme_pci_setup_data_sgl(req, iod, &iter);
+	if (ret)
+		return ret;
+
+	if (use_sgl == SGL_FORCED || nvme_check_sgl_threshold(req, use_sgl)) {
+		iod->cmd.common.flags = NVME_CMD_SGL_METABUF;
+		iod->cmd.common.dptr.sgl = iod->sgl;
+	} else {
+		ret = nvme_pci_sgl_to_prp(req, iod);
+	}
+
+	return ret;
 }
 
 static blk_status_t nvme_pci_setup_meta_sgls(struct request *req)
@@ -1075,6 +1032,7 @@ static blk_status_t nvme_prep_rq(struct request *req)
 	iod->flags = 0;
 	iod->nr_descriptors = 0;
 	iod->total_len = 0;
+	iod->sgl.type = 0;
 
 	ret = nvme_setup_cmd(req->q->queuedata, req);
 	if (ret)
--

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping
  2025-07-07 14:29 ` Keith Busch
@ 2025-07-07 15:01   ` Christoph Hellwig
  2025-07-07 15:13     ` Keith Busch
  0 siblings, 1 reply; 10+ messages in thread
From: Christoph Hellwig @ 2025-07-07 15:01 UTC (permalink / raw)
  To: Keith Busch
  Cc: Christoph Hellwig, axboe, sagi, ben.copeland, leon, linux-nvme,
	linux-block

On Mon, Jul 07, 2025 at 08:29:57AM -0600, Keith Busch wrote:
> Given it works fine for SGL, how do you feel about unconditionally
> creating an NVMe SGL, and then call some "nvme_sgl_to_prp()" helper only
> when needed? This means we only have one teardown path that matches the
> setup.

I briefly thought about it and discard it because:

  1) I thought it was too complex
  2) the need to allocate relatively expensive data dma coherent
     memory for something that is never DMAed to.

> This is something I hacked up over the weekend. It's only lightly
> tested, and I know there's a bug somewhere here with the chainging, but
> it's a start.

This looks a lot less complex than I feared.  I still don't think having
to allocate DMA memory for storing the ranges is all that great, mostly
because this is exactly the path we're going to hit for non-coherent
attachment where the DMA coherent memory will have a performance impact
because it is marked uncachable.


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping
  2025-07-07 15:01   ` Christoph Hellwig
@ 2025-07-07 15:13     ` Keith Busch
  0 siblings, 0 replies; 10+ messages in thread
From: Keith Busch @ 2025-07-07 15:13 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe, sagi, ben.copeland, leon, linux-nvme, linux-block

On Mon, Jul 07, 2025 at 05:01:16PM +0200, Christoph Hellwig wrote:
> This looks a lot less complex than I feared.  I still don't think having
> to allocate DMA memory for storing the ranges is all that great, mostly
> because this is exactly the path we're going to hit for non-coherent
> attachment where the DMA coherent memory will have a performance impact
> because it is marked uncachable.

Okay. I was hoping to avoid bringing back a mempool, but yes, I can see
how abusing the dma pool allocation to hold the driver's sgl (that won't
be DMA'd) may be harmful on non-coherent machines. Let's go with your
patch now; we can always continue improving on this if needed later.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping
  2025-07-07 12:52 [PATCH] nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping Christoph Hellwig
  2025-07-07 14:29 ` Keith Busch
@ 2025-07-07 15:13 ` Keith Busch
  2025-07-08 12:55 ` Jens Axboe
  2025-07-11  0:25 ` Klara Modin
  3 siblings, 0 replies; 10+ messages in thread
From: Keith Busch @ 2025-07-07 15:13 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe, sagi, ben.copeland, leon, linux-nvme, linux-block

On Mon, Jul 07, 2025 at 02:52:23PM +0200, Christoph Hellwig wrote:
> The current version of the blk_rq_dma_map support in nvme-pci tries to
> reconstruct the DMA mappings from the on the wire descriptors if they
> are needed for unmapping.  While this is not the case for the direct
> mapping fast path and the IOVA path, it is needed for the non-IOVA slow
> path, e.g. when using the interconnect is not dma coherent, when using
> swiotlb bounce buffering, or a IOMMU mapping that can't coalesce.
> 
> While the reconstruction is easy and works fine for the SGL path, where
> the on the wire representation maps 1:1 to DMA mappings, the code to
> reconstruct the DMA mapping ranges from PRPs can't always work, as a
> given PRP layout can come from different DMA mappings, and the current
> code doesn't even always get that right.
> 
> Give up on this approach and track the actual DMA mapping when actually
> needed again.

Looks good.

Reviewed-by: Keith Busch <kbusch@kernel.org>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping
  2025-07-07 12:52 [PATCH] nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping Christoph Hellwig
  2025-07-07 14:29 ` Keith Busch
  2025-07-07 15:13 ` Keith Busch
@ 2025-07-08 12:55 ` Jens Axboe
  2025-07-11  0:25 ` Klara Modin
  3 siblings, 0 replies; 10+ messages in thread
From: Jens Axboe @ 2025-07-08 12:55 UTC (permalink / raw)
  To: kbusch, sagi, Christoph Hellwig
  Cc: ben.copeland, leon, linux-nvme, linux-block


On Mon, 07 Jul 2025 14:52:23 +0200, Christoph Hellwig wrote:
> The current version of the blk_rq_dma_map support in nvme-pci tries to
> reconstruct the DMA mappings from the on the wire descriptors if they
> are needed for unmapping.  While this is not the case for the direct
> mapping fast path and the IOVA path, it is needed for the non-IOVA slow
> path, e.g. when using the interconnect is not dma coherent, when using
> swiotlb bounce buffering, or a IOMMU mapping that can't coalesce.
> 
> [...]

Applied, thanks!

[1/1] nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping
      commit: b8b7570a7ec872f2a27b775c4f8710ca8a357adf

Best regards,
-- 
Jens Axboe




^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping
  2025-07-07 12:52 [PATCH] nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping Christoph Hellwig
                   ` (2 preceding siblings ...)
  2025-07-08 12:55 ` Jens Axboe
@ 2025-07-11  0:25 ` Klara Modin
  2025-07-11  0:37   ` Keith Busch
  2025-07-11  8:32   ` Christoph Hellwig
  3 siblings, 2 replies; 10+ messages in thread
From: Klara Modin @ 2025-07-11  0:25 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: kbusch, axboe, sagi, ben.copeland, leon, linux-nvme, linux-block

Hi,

On 2025-07-07 14:52:23 +0200, Christoph Hellwig wrote:
> The current version of the blk_rq_dma_map support in nvme-pci tries to
> reconstruct the DMA mappings from the on the wire descriptors if they
> are needed for unmapping.  While this is not the case for the direct
> mapping fast path and the IOVA path, it is needed for the non-IOVA slow
> path, e.g. when using the interconnect is not dma coherent, when using
> swiotlb bounce buffering, or a IOMMU mapping that can't coalesce.
> 
> While the reconstruction is easy and works fine for the SGL path, where
> the on the wire representation maps 1:1 to DMA mappings, the code to
> reconstruct the DMA mapping ranges from PRPs can't always work, as a
> given PRP layout can come from different DMA mappings, and the current
> code doesn't even always get that right.
> 
> Give up on this approach and track the actual DMA mapping when actually
> needed again.
> 
> Fixes: 7ce3c1dd78fc ("nvme-pci: convert the data mapping to blk_rq_dma_map")
> Reported-by: Ben Copeland <ben.copeland@linaro.org>
> Signed-off-by: Christoph Hellwig <hch@lst.de>

This patch seems to introduce a memory leak, causing the slab to grow
continuously. The easiest way I found to trigger it is along the lines
of

# dd if=/dev/nvme0n1 of=/dev/null bs=8M

It also seems to happen during normal use but much more slowly.
Reverting fixes the issue.

# cat /proc/allocinfo | sort -n | numfmt --invalid=ignore --to=iec-i | tail -n5
       146Mi     5285 mm/memory.c:4977 func:alloc_anon_folio 
       182Mi    24395 mm/shmem.c:1847 func:shmem_alloc_folio 
       420Mi    38746 mm/readahead.c:186 func:ractl_alloc_folio 
        29Gi  7383484 drivers/nvme/host/pci.c:767 func:nvme_pci_setup_data_prp 
        29Gi   937347 mm/slub.c:2487 func:alloc_slab_page

# slabtop --human --once | head
 Active / Total Objects (% used)    : 8966529 / 9031691 (99.3%)
 Active / Total Slabs (% used)      : 971257 / 971257 (100.0%)
 Active / Total Caches (% used)     : 128 / 211 (60.7%)
 Active / Total Size (% used)       : 28Gi / 28Gi (99.9%)
 Minimum / Average / Maximum Object : 0.01K / 3.32K / 8.00K

  OBJS ACTIVE  USE OBJ SIZE  SLABS OBJ/SLAB CACHE SIZE NAME                   
7410112 7410112 100%    4.00K 926264        8       28Gi kmalloc-4k             
929920 929920 100%    0.12K  29060       32      113Mi kmalloc-128            
 81744  80996  99%    0.10K   2096       39      8.2Mi btrfs_free_space

My machine has 32 GiB of memory and if I leave it be it locks up and
sometimes panics.

# nvme id-ctrl /dev/nvme0 | grep sg
sgls      : 0

Let me know if there's anything else you need.

Regards,
Klara Modin

> ---
>  drivers/nvme/host/pci.c | 114 ++++++++++++++++++++++------------------
>  1 file changed, 62 insertions(+), 52 deletions(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 6a7a8bdbf385..6af184f2b73b 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -173,6 +173,7 @@ struct nvme_dev {
>  	bool hmb;
>  	struct sg_table *hmb_sgt;
>  
> +	mempool_t *dmavec_mempool;
>  	mempool_t *iod_meta_mempool;
>  
>  	/* shadow doorbell buffer support: */
> @@ -262,6 +263,11 @@ enum nvme_iod_flags {
>  	IOD_SINGLE_SEGMENT	= 1U << 2,
>  };
>  
> +struct nvme_dma_vec {
> +	dma_addr_t addr;
> +	unsigned int len;
> +};
> +
>  /*
>   * The nvme_iod describes the data in an I/O.
>   */
> @@ -274,6 +280,8 @@ struct nvme_iod {
>  	unsigned int total_len;
>  	struct dma_iova_state dma_state;
>  	void *descriptors[NVME_MAX_NR_DESCRIPTORS];
> +	struct nvme_dma_vec *dma_vecs;
> +	unsigned int nr_dma_vecs;
>  
>  	dma_addr_t meta_dma;
>  	struct sg_table meta_sgt;
> @@ -674,44 +682,12 @@ static void nvme_free_prps(struct request *req)
>  {
>  	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>  	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
> -	struct device *dma_dev = nvmeq->dev->dev;
> -	enum dma_data_direction dir = rq_dma_dir(req);
> -	int length = iod->total_len;
> -	dma_addr_t dma_addr;
> -	int i, desc;
> -	__le64 *prp_list;
> -	u32 dma_len;
> -
> -	dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp1);
> -	dma_len = min_t(u32, length,
> -		NVME_CTRL_PAGE_SIZE - (dma_addr & (NVME_CTRL_PAGE_SIZE - 1)));
> -	length -= dma_len;
> -	if (!length) {
> -		dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
> -		return;
> -	}
> -
> -	if (length <= NVME_CTRL_PAGE_SIZE) {
> -		dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
> -		dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp2);
> -		dma_unmap_page(dma_dev, dma_addr, length, dir);
> -		return;
> -	}
> -
> -	i = 0;
> -	desc = 0;
> -	prp_list = iod->descriptors[desc];
> -	do {
> -		dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
> -		if (i == NVME_CTRL_PAGE_SIZE >> 3) {
> -			prp_list = iod->descriptors[++desc];
> -			i = 0;
> -		}
> +	unsigned int i;
>  
> -		dma_addr = le64_to_cpu(prp_list[i++]);
> -		dma_len = min(length, NVME_CTRL_PAGE_SIZE);
> -		length -= dma_len;
> -	} while (length);
> +	for (i = 0; i < iod->nr_dma_vecs; i++)
> +		dma_unmap_page(nvmeq->dev->dev, iod->dma_vecs[i].addr,
> +				iod->dma_vecs[i].len, rq_dma_dir(req));
> +	mempool_free(iod->dma_vecs, nvmeq->dev->dmavec_mempool);
>  }
>  
>  static void nvme_free_sgls(struct request *req)
> @@ -760,6 +736,23 @@ static void nvme_unmap_data(struct request *req)
>  		nvme_free_descriptors(req);
>  }
>  
> +static bool nvme_pci_prp_iter_next(struct request *req, struct device *dma_dev,
> +		struct blk_dma_iter *iter)
> +{
> +	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> +
> +	if (iter->len)
> +		return true;
> +	if (!blk_rq_dma_map_iter_next(req, dma_dev, &iod->dma_state, iter))
> +		return false;
> +	if (dma_need_unmap(dma_dev)) {
> +		iod->dma_vecs[iod->nr_dma_vecs].addr = iter->addr;
> +		iod->dma_vecs[iod->nr_dma_vecs].len = iter->len;
> +		iod->nr_dma_vecs++;
> +	}
> +	return true;
> +}
> +
>  static blk_status_t nvme_pci_setup_data_prp(struct request *req,
>  		struct blk_dma_iter *iter)
>  {
> @@ -770,6 +763,16 @@ static blk_status_t nvme_pci_setup_data_prp(struct request *req,
>  	unsigned int prp_len, i;
>  	__le64 *prp_list;
>  
> +	if (dma_need_unmap(nvmeq->dev->dev)) {
> +		iod->dma_vecs = mempool_alloc(nvmeq->dev->dmavec_mempool,
> +				GFP_ATOMIC);
> +		if (!iod->dma_vecs)
> +			return BLK_STS_RESOURCE;
> +		iod->dma_vecs[0].addr = iter->addr;
> +		iod->dma_vecs[0].len = iter->len;
> +		iod->nr_dma_vecs = 1;
> +	}
> +
>  	/*
>  	 * PRP1 always points to the start of the DMA transfers.
>  	 *
> @@ -786,13 +789,10 @@ static blk_status_t nvme_pci_setup_data_prp(struct request *req,
>  	if (!length)
>  		goto done;
>  
> -	if (!iter->len) {
> -		if (!blk_rq_dma_map_iter_next(req, nvmeq->dev->dev,
> -				&iod->dma_state, iter)) {
> -			if (WARN_ON_ONCE(!iter->status))
> -				goto bad_sgl;
> -			goto done;
> -		}
> +	if (!nvme_pci_prp_iter_next(req, nvmeq->dev->dev, iter)) {
> +		if (WARN_ON_ONCE(!iter->status))
> +			goto bad_sgl;
> +		goto done;
>  	}
>  
>  	/*
> @@ -831,13 +831,10 @@ static blk_status_t nvme_pci_setup_data_prp(struct request *req,
>  		if (!length)
>  			break;
>  
> -		if (iter->len == 0) {
> -			if (!blk_rq_dma_map_iter_next(req, nvmeq->dev->dev,
> -					&iod->dma_state, iter)) {
> -				if (WARN_ON_ONCE(!iter->status))
> -					goto bad_sgl;
> -				goto done;
> -			}
> +		if (!nvme_pci_prp_iter_next(req, nvmeq->dev->dev, iter)) {
> +			if (WARN_ON_ONCE(!iter->status))
> +				goto bad_sgl;
> +			goto done;
>  		}
>  
>  		/*
> @@ -3025,14 +3022,25 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
>  static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
>  {
>  	size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
> +	size_t alloc_size = sizeof(struct nvme_dma_vec) * NVME_MAX_SEGS;
> +
> +	dev->dmavec_mempool = mempool_create_node(1,
> +			mempool_kmalloc, mempool_kfree,
> +			(void *)alloc_size, GFP_KERNEL,
> +			dev_to_node(dev->dev));
> +	if (!dev->dmavec_mempool)
> +		return -ENOMEM;
>  
>  	dev->iod_meta_mempool = mempool_create_node(1,
>  			mempool_kmalloc, mempool_kfree,
>  			(void *)meta_size, GFP_KERNEL,
>  			dev_to_node(dev->dev));
>  	if (!dev->iod_meta_mempool)
> -		return -ENOMEM;
> +		goto free;
>  	return 0;
> +free:
> +	mempool_destroy(dev->dmavec_mempool);
> +	return -ENOMEM;
>  }
>  
>  static void nvme_free_tagset(struct nvme_dev *dev)
> @@ -3477,6 +3485,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  	nvme_dbbuf_dma_free(dev);
>  	nvme_free_queues(dev, 0);
>  out_release_iod_mempool:
> +	mempool_destroy(dev->dmavec_mempool);
>  	mempool_destroy(dev->iod_meta_mempool);
>  out_dev_unmap:
>  	nvme_dev_unmap(dev);
> @@ -3540,6 +3549,7 @@ static void nvme_remove(struct pci_dev *pdev)
>  	nvme_dev_remove_admin(dev);
>  	nvme_dbbuf_dma_free(dev);
>  	nvme_free_queues(dev, 0);
> +	mempool_destroy(dev->dmavec_mempool);
>  	mempool_destroy(dev->iod_meta_mempool);
>  	nvme_release_descriptor_pools(dev);
>  	nvme_dev_unmap(dev);
> -- 
> 2.47.2
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping
  2025-07-11  0:25 ` Klara Modin
@ 2025-07-11  0:37   ` Keith Busch
  2025-07-11  8:32   ` Christoph Hellwig
  1 sibling, 0 replies; 10+ messages in thread
From: Keith Busch @ 2025-07-11  0:37 UTC (permalink / raw)
  To: Klara Modin
  Cc: Christoph Hellwig, axboe, sagi, ben.copeland, leon, linux-nvme,
	linux-block

On Fri, Jul 11, 2025 at 02:25:12AM +0200, Klara Modin wrote:
> This patch seems to introduce a memory leak, causing the slab to grow
> continuously. The easiest way I found to trigger it is along the lines
> of
> 
> # dd if=/dev/nvme0n1 of=/dev/null bs=8M

Doh! Thank you for the report, I appreciate notification for regressions
on the -next branches. I'll look into it tomorrow if hch doesn't beat me
to it.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping
  2025-07-11  0:25 ` Klara Modin
  2025-07-11  0:37   ` Keith Busch
@ 2025-07-11  8:32   ` Christoph Hellwig
  2025-07-11  9:07     ` Klara Modin
  1 sibling, 1 reply; 10+ messages in thread
From: Christoph Hellwig @ 2025-07-11  8:32 UTC (permalink / raw)
  To: Klara Modin
  Cc: Christoph Hellwig, kbusch, axboe, sagi, ben.copeland, leon,
	linux-nvme, linux-block

Hi Klara,

can you try the attached patch?

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 6af184f2b73b..ac3b90d31380 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -745,7 +745,7 @@ static bool nvme_pci_prp_iter_next(struct request *req, struct device *dma_dev,
 		return true;
 	if (!blk_rq_dma_map_iter_next(req, dma_dev, &iod->dma_state, iter))
 		return false;
-	if (dma_need_unmap(dma_dev)) {
+	if (!dma_use_iova(&iod->dma_state) && dma_need_unmap(dma_dev)) {
 		iod->dma_vecs[iod->nr_dma_vecs].addr = iter->addr;
 		iod->dma_vecs[iod->nr_dma_vecs].len = iter->len;
 		iod->nr_dma_vecs++;
@@ -763,7 +763,7 @@ static blk_status_t nvme_pci_setup_data_prp(struct request *req,
 	unsigned int prp_len, i;
 	__le64 *prp_list;
 
-	if (dma_need_unmap(nvmeq->dev->dev)) {
+	if (!dma_use_iova(&iod->dma_state) && dma_need_unmap(nvmeq->dev->dev)) {
 		iod->dma_vecs = mempool_alloc(nvmeq->dev->dmavec_mempool,
 				GFP_ATOMIC);
 		if (!iod->dma_vecs)

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping
  2025-07-11  8:32   ` Christoph Hellwig
@ 2025-07-11  9:07     ` Klara Modin
  0 siblings, 0 replies; 10+ messages in thread
From: Klara Modin @ 2025-07-11  9:07 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: kbusch, axboe, sagi, ben.copeland, leon, linux-nvme, linux-block

Hi,

On 2025-07-11 10:32:51 +0200, Christoph Hellwig wrote:
> Hi Klara,
> 
> can you try the attached patch?
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 6af184f2b73b..ac3b90d31380 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -745,7 +745,7 @@ static bool nvme_pci_prp_iter_next(struct request *req, struct device *dma_dev,
>  		return true;
>  	if (!blk_rq_dma_map_iter_next(req, dma_dev, &iod->dma_state, iter))
>  		return false;
> -	if (dma_need_unmap(dma_dev)) {
> +	if (!dma_use_iova(&iod->dma_state) && dma_need_unmap(dma_dev)) {
>  		iod->dma_vecs[iod->nr_dma_vecs].addr = iter->addr;
>  		iod->dma_vecs[iod->nr_dma_vecs].len = iter->len;
>  		iod->nr_dma_vecs++;
> @@ -763,7 +763,7 @@ static blk_status_t nvme_pci_setup_data_prp(struct request *req,
>  	unsigned int prp_len, i;
>  	__le64 *prp_list;
>  
> -	if (dma_need_unmap(nvmeq->dev->dev)) {
> +	if (!dma_use_iova(&iod->dma_state) && dma_need_unmap(nvmeq->dev->dev)) {
>  		iod->dma_vecs = mempool_alloc(nvmeq->dev->dmavec_mempool,
>  				GFP_ATOMIC);
>  		if (!iod->dma_vecs)

Yes, this fixes it for me.

Thanks,
Tested-by: Klara Modin <klarasmodin@gmail.com>

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2025-07-11  9:07 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-07-07 12:52 [PATCH] nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping Christoph Hellwig
2025-07-07 14:29 ` Keith Busch
2025-07-07 15:01   ` Christoph Hellwig
2025-07-07 15:13     ` Keith Busch
2025-07-07 15:13 ` Keith Busch
2025-07-08 12:55 ` Jens Axboe
2025-07-11  0:25 ` Klara Modin
2025-07-11  0:37   ` Keith Busch
2025-07-11  8:32   ` Christoph Hellwig
2025-07-11  9:07     ` Klara Modin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox