Date: Wed, 29 Apr 2026 18:07:47 +0200
From: "Maurizio Lombardi"
To: "Pavel Begunkov", "Jens Axboe", "Keith Busch", "Christoph Hellwig",
 "Sagi Grimberg", "Alexander Viro", "Christian Brauner", "Andrew Morton",
 "Sumit Semwal", "Christian König"
Cc: "Nitesh Shetty", "Kanchan Joshi", "Anuj Gupta", "Tushar Gohad",
 "William Power", "Phil Cayton", "Jason Gunthorpe"
Subject: Re: [PATCH v3 07/10] nvme-pci: implement dma_token backed requests
References: <5cecb1157ab784f9f303a91449fdf11b03aa6002.1777475843.git.asml.silence@gmail.com>
In-Reply-To: <5cecb1157ab784f9f303a91449fdf11b03aa6002.1777475843.git.asml.silence@gmail.com>

On Wed Apr 29, 2026 at 5:25 PM CEST, Pavel Begunkov wrote:
> Enable BIO_DMABUF_MAP backed requests. It creates a prp list for the
> dmabuf when it's mapped, which is then used to initialise requests.
>
> Suggested-by: Keith Busch
> Signed-off-by: Pavel Begunkov
> ---
>  drivers/nvme/host/pci.c | 282 ++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 282 insertions(+)
>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index db5fc9bf6627..d2629853a972 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -27,6 +27,8 @@
>  #include
>  #include
>  #include
> +#include
> +#include
>
>  #include "trace.h"
>  #include "nvme.h"
> @@ -393,6 +395,17 @@ struct nvme_queue {
>          struct completion delete_done;
>  };
>
> +struct nvme_dmabuf_token {
> +        struct dma_buf_attachment *attach;
> +};
> +
> +struct nvme_dmabuf_map {
> +        struct io_dmabuf_map base;
> +        dma_addr_t *dma_list;
> +        struct sg_table *sgt;
> +        unsigned nr_entries;
> +};
> +
>  /* bits for iod->flags */
>  enum nvme_iod_flags {
>          /* this command has been aborted by the timeout handler */
> @@ -854,6 +867,134 @@ static void nvme_free_descriptors(struct request *req)
>          }
>  }
>
> +static void nvme_dmabuf_map_sync(struct nvme_dev *nvme_dev, struct request *req,
> +                                 bool for_cpu)
> +{
> +        int length = blk_rq_payload_bytes(req);
> +        struct device *dev = nvme_dev->dev;
> +        enum dma_data_direction dma_dir;
> +        struct bio *bio = req->bio;
> +        struct nvme_dmabuf_map *map;
> +        dma_addr_t *dma_list;
> +        int offset, map_idx;
> +
> +        dma_dir = rq_data_dir(req) == READ ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
> +        map = container_of(bio->dmabuf_map, struct nvme_dmabuf_map, base);
> +        dma_list = map->dma_list;
> +
> +        offset = bio->bi_iter.bi_bvec_done;
> +        map_idx = offset / NVME_CTRL_PAGE_SIZE;
> +        length += offset & (NVME_CTRL_PAGE_SIZE - 1);
> +
> +        while (length > 0) {
> +                u64 dma_addr = dma_list[map_idx++];
> +
> +                if (for_cpu)
> +                        __dma_sync_single_for_cpu(dev, dma_addr,
> +                                        NVME_CTRL_PAGE_SIZE, dma_dir);
> +                else
> +                        __dma_sync_single_for_device(dev, dma_addr,
> +                                        NVME_CTRL_PAGE_SIZE,
> +                                        dma_dir);
> +                length -= NVME_CTRL_PAGE_SIZE;
> +        }
> +}
> +
> +static void nvme_rq_clean_dmabuf_map(struct nvme_dev *dev,
> +                                     struct request *req)
> +{
> +        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> +
> +        nvme_dmabuf_map_sync(dev, req, true);
> +
> +        if (!(iod->flags & IOD_SINGLE_SEGMENT))
> +                nvme_free_descriptors(req);
> +}
> +
> +static blk_status_t nvme_rq_setup_dmabuf_map(struct request *req,
> +                                             struct nvme_queue *nvmeq)
> +{
> +        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> +        int length = blk_rq_payload_bytes(req);
> +        u64 dma_addr, prp1_dma, prp2_dma;
> +        struct bio *bio = req->bio;
> +        struct nvme_dmabuf_map *map;
> +        dma_addr_t *dma_list;
> +        dma_addr_t prp_dma;
> +        __le64 *prp_list;
> +        int i, map_idx;
> +        int offset;
> +
> +        nvme_dmabuf_map_sync(nvmeq->dev, req, false);
> +
> +        map = container_of(bio->dmabuf_map, struct nvme_dmabuf_map, base);
> +        dma_list = map->dma_list;
> +
> +        offset = bio->bi_iter.bi_bvec_done;
> +        map_idx = offset / NVME_CTRL_PAGE_SIZE;
> +        offset &= (NVME_CTRL_PAGE_SIZE - 1);
> +        prp1_dma = dma_list[map_idx++] + offset;
> +
> +        length -= (NVME_CTRL_PAGE_SIZE - offset);
> +        if (length <= 0) {
> +                prp2_dma = 0;
> +                goto done;
> +        }
> +
> +        if (length <= NVME_CTRL_PAGE_SIZE) {
> +                prp2_dma = dma_list[map_idx];
> +                goto done;
> +        }
> +
> +        if (DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE) <=
> +            NVME_SMALL_POOL_SIZE / sizeof(__le64))
> +                iod->flags |= IOD_SMALL_DESCRIPTOR;
> +
> +        prp_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
> +                                  &prp_dma);
> +        if (!prp_list)
> +                return BLK_STS_RESOURCE;
> +        iod->descriptors[iod->nr_descriptors++] = prp_list;
> +        prp2_dma = prp_dma;
> +        i = 0;
> +        for (;;) {
> +                if (i == NVME_CTRL_PAGE_SIZE >> 3) {
> +                        __le64 *old_prp_list = prp_list;
> +
> +                        prp_list = dma_pool_alloc(nvmeq->descriptor_pools.large,
> +                                                  GFP_ATOMIC, &prp_dma);
> +                        if (!prp_list)
> +                                goto free_prps;
> +                        iod->descriptors[iod->nr_descriptors++] = prp_list;
> +                        prp_list[0] = old_prp_list[i - 1];
> +                        old_prp_list[i - 1] = cpu_to_le64(prp_dma);
> +                        i = 1;
> +                }
> +
> +                dma_addr = dma_list[map_idx++];
> +                prp_list[i++] = cpu_to_le64(dma_addr);
> +
> +                length -= NVME_CTRL_PAGE_SIZE;
> +                if (length <= 0)
> +                        break;
> +        }
> +done:
> +        iod->cmd.common.dptr.prp1 = cpu_to_le64(prp1_dma);
> +        iod->cmd.common.dptr.prp2 = cpu_to_le64(prp2_dma);
> +        return BLK_STS_OK;
> +free_prps:
> +        nvme_free_descriptors(req);
> +        return BLK_STS_RESOURCE;
> +}
> +
> +static inline bool nvme_rq_is_dmabuf_attached(struct request *req)
> +{
> +        if (!IS_ENABLED(CONFIG_DMABUF_TOKEN))
> +                return false;
> +        return req->bio && bio_flagged(req->bio, BIO_DMABUF_MAP);
> +}
> +
>  static void nvme_free_prps(struct request *req, unsigned int attrs)
>  {
>          struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> @@ -932,6 +1073,11 @@ static void nvme_unmap_data(struct request *req)
>          struct device *dma_dev = nvmeq->dev->dev;
>          unsigned int attrs = 0;
>
> +        if (nvme_rq_is_dmabuf_attached(req)) {
> +                nvme_rq_clean_dmabuf_map(nvmeq->dev, req);
> +                return;
> +        }
> +
>          if (iod->flags & IOD_SINGLE_SEGMENT) {
>                  static_assert(offsetof(union nvme_data_ptr, prp1) ==
>                                offsetof(union nvme_data_ptr, sgl.addr));
> @@ -1222,6 +1368,9 @@ static blk_status_t nvme_map_data(struct request *req)
>          struct blk_dma_iter iter;
>          blk_status_t ret;
>
> +        if (nvme_rq_is_dmabuf_attached(req))
> +                return nvme_rq_setup_dmabuf_map(req, nvmeq);
> +
>          /*
>           * Try to skip the DMA iterator for single segment requests, as that
>           * significantly improves performances for small I/O sizes.
> @@ -2238,6 +2387,134 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
>          return result;
>  }
>
> +#ifdef CONFIG_DMABUF_TOKEN
> +static void nvme_dmabuf_invalidate_mappings(struct dma_buf_attachment *attach)
> +{
> +        struct io_dmabuf_token *token = attach->importer_priv;
> +
> +        io_dmabuf_token_invalidate_mappings(token);
> +}
> +
> +const struct dma_buf_attach_ops nvme_dmabuf_importer_ops = {
> +        .invalidate_mappings = nvme_dmabuf_invalidate_mappings,
> +        .allow_peer2peer = true,
> +};
> +
> +static struct io_dmabuf_map *nvme_dmabuf_token_map(struct io_dmabuf_token *token)
> +{
> +        struct nvme_dmabuf_token *data = token->dev_priv;
> +        struct dma_buf_attachment *attach = data->attach;
> +        dma_addr_t *dma_list = NULL;
> +        unsigned long tmp, i = 0;
> +        struct nvme_dmabuf_map *map;
> +        struct scatterlist *sg;
> +        struct sg_table *sgt;
> +        unsigned nr_entries;
> +        int ret;
> +
> +        dma_resv_assert_held(token->dmabuf->resv);
> +
> +        map = kmalloc(sizeof(*map), GFP_KERNEL);
> +        if (!map)
> +                return ERR_PTR(-ENOMEM);
> +
> +        nr_entries = token->dmabuf->size / NVME_CTRL_PAGE_SIZE;
> +        dma_list = kmalloc_array(nr_entries, sizeof(dma_list[0]), GFP_KERNEL);
> +        if (!dma_list) {
> +                ret = -ENOMEM;
> +                goto err;
> +        }
> +
> +        sgt = dma_buf_map_attachment(attach, token->dir);
> +        if (IS_ERR(sgt)) {
> +                ret = PTR_ERR(sgt);
> +                sgt = NULL;
> +                goto err;
> +        }
> +
> +        for_each_sgtable_dma_sg(sgt, sg, tmp) {
> +                dma_addr_t dma_addr = sg_dma_address(sg);
> +                unsigned long sg_len = sg_dma_len(sg);
> +
> +                if (sg_len % NVME_CTRL_PAGE_SIZE) {
> +                        ret = -EINVAL;
> +                        goto err;
> +                }
> +
> +                while (sg_len) {
> +                        dma_list[i++] = dma_addr;
> +                        dma_addr += NVME_CTRL_PAGE_SIZE;
> +                        sg_len -= NVME_CTRL_PAGE_SIZE;
> +                }
> +        }
> +
> +        ret = io_dmabuf_init_map(token, &map->base);
> +        if (ret)
> +                goto err;
> +        map->nr_entries = nr_entries;
> +        map->dma_list = dma_list;
> +        map->sgt = sgt;
> +        return &map->base;
> +err:
> +        if (sgt)
> +                dma_buf_unmap_attachment(attach, sgt, token->dir);
> +        kfree(map);
> +        kfree(dma_list);
> +        return ERR_PTR(ret);
> +}
> +
> +static void nvme_dmabuf_token_unmap(struct io_dmabuf_token *token,
> +                                    struct io_dmabuf_map *map_base)
> +{
> +        struct nvme_dmabuf_token *data = token->dev_priv;
> +        struct nvme_dmabuf_map *map = container_of(map_base,
> +                                        struct nvme_dmabuf_map, base);
> +
> +        dma_resv_assert_held(token->dmabuf->resv);
> +
> +        dma_buf_unmap_attachment(data->attach, map->sgt, token->dir);
> +        kfree(map->dma_list);
> +}
> +
> +static void nvme_dmabuf_token_release(struct io_dmabuf_token *token)
> +{
> +        struct nvme_dmabuf_token *data = token->dev_priv;
> +
> +        dma_buf_detach(token->dmabuf, data->attach);
> +        kfree(data);
> +}
> +
> +const struct io_dmabuf_token_dev_ops nvme_dma_token_ops = {
> +        .map = nvme_dmabuf_token_map,
> +        .unmap = nvme_dmabuf_token_unmap,
> +        .release = nvme_dmabuf_token_release,
> +};
> +
> +static int nvme_create_dmabuf_token(struct request_queue *q,
> +                                    struct io_dmabuf_token *token)
> +{
> +        struct nvme_dmabuf_token *data;
> +        struct dma_buf_attachment *attach;
> +        struct nvme_ns *ns = q->queuedata;
> +        struct nvme_dev *dev = to_nvme_dev(ns->ctrl);
> +        struct dma_buf *dmabuf = token->dmabuf;
> +
> +        data = kzalloc(sizeof(data), GFP_KERNEL);
> +        if (!data)
> +                return -ENOMEM;

Shouldn't this be kzalloc(sizeof(*data)...) ?
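I.e., untested, just to spell out what I mean:

        data = kzalloc(sizeof(*data), GFP_KERNEL);
        if (!data)
                return -ENOMEM;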
Also, checkpatch generates a warning here: kzalloc_obj() should be
preferred over kzalloc() for this kind of memory allocation.

> +
> +        token->dev_priv = data;
> +        token->dev_ops = &nvme_dma_token_ops;
> +
> +        attach = dma_buf_dynamic_attach(dmabuf, dev->dev,
> +                                        &nvme_dmabuf_importer_ops, token);
> +        if (IS_ERR(attach))
> +                return PTR_ERR(attach);

Supposing dma_buf_dynamic_attach() returns an error, won't the 'data'
pointer be leaked?

Maurizio
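P.S. For the error path, something along these lines (untested, and the
exact cleanup may need to match however the token core unwinds a failed
setup) is what I have in mind, so that 'data' is not leaked:

        attach = dma_buf_dynamic_attach(dmabuf, dev->dev,
                                        &nvme_dmabuf_importer_ops, token);
        if (IS_ERR(attach)) {
                /* undo the allocation done above before bailing out */
                kfree(data);
                token->dev_priv = NULL;
                return PTR_ERR(attach);
        }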