public inbox for linux-pci@vger.kernel.org
 help / color / mirror / Atom feed
From: Alex Williamson <alex@shazbot.org>
To: Zhiping Zhang <zhipingz@meta.com>
Cc: Stanislav Fomichev <sdf@meta.com>,
	Keith Busch <kbusch@kernel.org>, Jason Gunthorpe <jgg@ziepe.ca>,
	Leon Romanovsky <leon@kernel.org>,
	Bjorn Helgaas <helgaas@kernel.org>, <linux-rdma@vger.kernel.org>,
	<linux-pci@vger.kernel.org>, <netdev@vger.kernel.org>,
	<dri-devel@lists.freedesktop.org>,
	Yochai Cohen <yochai@nvidia.com>,
	Yishai Hadas <yishaih@nvidia.com>,
	alex@shazbot.org
Subject: Re: [PATCH v1 1/2] vfio: add callback to get tph info for dma-buf
Date: Wed, 22 Apr 2026 09:23:27 -0600	[thread overview]
Message-ID: <20260422092327.3f629ad6@shazbot.org> (raw)
In-Reply-To: <20260420183920.3626389-2-zhipingz@meta.com>

On Mon, 20 Apr 2026 11:39:15 -0700
Zhiping Zhang <zhipingz@meta.com> wrote:

> Add a dma-buf callback that returns raw TPH metadata from the exporter
> so peer devices can reuse the steering tag and processing hint
> associated with a VFIO-exported buffer.
> 
> Keep the existing VFIO_DEVICE_FEATURE_DMA_BUF uAPI layout intact by
> using a flag plus one extra trailing entries[] object for the optional
> TPH metadata. Rename the uAPI field dma_ranges to entries. The
> nr_ranges field remains the DMA range count; when VFIO_DMABUF_FLAG_TPH
> is set the kernel reads one extra entry beyond nr_ranges for the TPH
> metadata.
> 
> Add an st_width parameter to get_tph() so the exporter can reject
> steering tags that exceed the consumer's supported width (8 vs 16 bit).
> When no TPH metadata was supplied, make get_tph() return -EOPNOTSUPP.
> 
> Signed-off-by: Zhiping Zhang <zhipingz@meta.com>
> ---
>  drivers/vfio/pci/vfio_pci_dmabuf.c | 62 +++++++++++++++++++++++-------
>  include/linux/dma-buf.h            | 17 ++++++++
>  include/uapi/linux/vfio.h          | 28 ++++++++++++--
>  3 files changed, 89 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
> index b1d658b8f7b5..fdc05f9ab3ae 100644
> --- a/drivers/vfio/pci/vfio_pci_dmabuf.c
> +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> @@ -17,6 +17,9 @@ struct vfio_pci_dma_buf {
>  	struct phys_vec *phys_vec;
>  	struct p2pdma_provider *provider;
>  	u32 nr_ranges;
> +	u16 steering_tag;
> +	u8 ph;
> +	u8 tph_present : 1;
>  	u8 revoked : 1;
>  };
>  
> @@ -60,6 +63,22 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
>  				       priv->size, dir);
>  }
>  
> +static int vfio_pci_dma_buf_get_tph(struct dma_buf *dmabuf, u16 *steering_tag,
> +				    u8 *ph, u8 st_width)
> +{
> +	struct vfio_pci_dma_buf *priv = dmabuf->priv;
> +
> +	if (!priv->tph_present)
> +		return -EOPNOTSUPP;
> +
> +	if (st_width < 16 && priv->steering_tag > ((1U << st_width) - 1))
> +		return -EINVAL;
> +
> +	*steering_tag = priv->steering_tag;
> +	*ph = priv->ph;
> +	return 0;
> +}
> +
>  static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
>  				   struct sg_table *sgt,
>  				   enum dma_data_direction dir)
> @@ -89,6 +108,7 @@ static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
>  	.pin = vfio_pci_dma_buf_pin,
>  	.unpin = vfio_pci_dma_buf_unpin,
>  	.attach = vfio_pci_dma_buf_attach,
> +	.get_tph = vfio_pci_dma_buf_get_tph,
>  	.map_dma_buf = vfio_pci_dma_buf_map,
>  	.unmap_dma_buf = vfio_pci_dma_buf_unmap,
>  	.release = vfio_pci_dma_buf_release,
> @@ -211,7 +231,9 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
>  				  size_t argsz)
>  {
>  	struct vfio_device_feature_dma_buf get_dma_buf = {};
> -	struct vfio_region_dma_range *dma_ranges;
> +	bool tph_supplied;
> +	u32 tph_index;
> +	struct vfio_region_dma_range *entries;
>  	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
>  	struct vfio_pci_dma_buf *priv;
>  	size_t length;
> @@ -228,7 +250,10 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
>  	if (copy_from_user(&get_dma_buf, arg, sizeof(get_dma_buf)))
>  		return -EFAULT;
>  
> -	if (!get_dma_buf.nr_ranges || get_dma_buf.flags)
> +	tph_supplied = !!(get_dma_buf.flags & VFIO_DMABUF_FLAG_TPH);
> +	tph_index = get_dma_buf.nr_ranges;
> +	if (!get_dma_buf.nr_ranges ||
> +	    (get_dma_buf.flags & ~VFIO_DMABUF_FLAG_TPH))
>  		return -EINVAL;
>  
>  	/*
> @@ -237,19 +262,21 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
>  	if (get_dma_buf.region_index >= VFIO_PCI_ROM_REGION_INDEX)
>  		return -ENODEV;
>  
> -	dma_ranges = memdup_array_user(&arg->dma_ranges, get_dma_buf.nr_ranges,
> -				       sizeof(*dma_ranges));
> -	if (IS_ERR(dma_ranges))
> -		return PTR_ERR(dma_ranges);
> +	entries = memdup_array_user(&arg->entries,
> +				    get_dma_buf.nr_ranges +
> +					(tph_supplied ? 1 : 0),
> +				    sizeof(*entries));
> +	if (IS_ERR(entries))
> +		return PTR_ERR(entries);
>  
> -	ret = validate_dmabuf_input(&get_dma_buf, dma_ranges, &length);
> +	ret = validate_dmabuf_input(&get_dma_buf, entries, &length);
>  	if (ret)
> -		goto err_free_ranges;
> +		goto err_free_entries;
>  
>  	priv = kzalloc_obj(*priv);
>  	if (!priv) {
>  		ret = -ENOMEM;
> -		goto err_free_ranges;
> +		goto err_free_entries;
>  	}
>  	priv->phys_vec = kzalloc_objs(*priv->phys_vec, get_dma_buf.nr_ranges);
>  	if (!priv->phys_vec) {
> @@ -260,15 +287,22 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
>  	priv->vdev = vdev;
>  	priv->nr_ranges = get_dma_buf.nr_ranges;
>  	priv->size = length;
> +
> +	if (tph_supplied) {
> +		priv->steering_tag = entries[tph_index].tph.steering_tag;
> +		priv->ph = entries[tph_index].tph.ph;
> +		priv->tph_present = 1;
> +	}
> +
>  	ret = vdev->pci_ops->get_dmabuf_phys(vdev, &priv->provider,
>  					     get_dma_buf.region_index,
> -					     priv->phys_vec, dma_ranges,
> +					     priv->phys_vec, entries,
>  					     priv->nr_ranges);
>  	if (ret)
>  		goto err_free_phys;
>  
> -	kfree(dma_ranges);
> -	dma_ranges = NULL;
> +	kfree(entries);
> +	entries = NULL;
>  
>  	if (!vfio_device_try_get_registration(&vdev->vdev)) {
>  		ret = -ENODEV;
> @@ -311,8 +345,8 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
>  	kfree(priv->phys_vec);
>  err_free_priv:
>  	kfree(priv);
> -err_free_ranges:
> -	kfree(dma_ranges);
> +err_free_entries:
> +	kfree(entries);
>  	return ret;
>  }
>  
> diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
> index 133b9e637b55..b0a79ccbe100 100644
> --- a/include/linux/dma-buf.h
> +++ b/include/linux/dma-buf.h
> @@ -113,6 +113,23 @@ struct dma_buf_ops {
>  	 */
>  	void (*unpin)(struct dma_buf_attachment *attach);
>  
> +	/**
> +	 * @get_tph:
> +	 * @dmabuf: DMA buffer for which to retrieve TPH metadata
> +	 * @steering_tag: Returns the raw TPH steering tag
> +	 * @ph: Returns the TPH processing hint
> +	 * @st_width: Consumer's supported steering tag width in bits (8 or 16)
> +	 *
> +	 * Return the TPH (TLP Processing Hints) metadata associated with this
> +	 * DMA buffer. Exporters that do not provide TPH metadata should return
> +	 * -EOPNOTSUPP. If the steering tag exceeds @st_width bits, return
> +	 * -EINVAL.
> +	 *
> +	 * This callback is optional.
> +	 */
> +	int (*get_tph)(struct dma_buf *dmabuf, u16 *steering_tag, u8 *ph,
> +		       u8 st_width);
> +
>  	/**
>  	 * @map_dma_buf:
>  	 *
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index bb7b89330d35..a0bd24623c52 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1490,16 +1490,36 @@ struct vfio_device_feature_bus_master {
>   * open_flags are the typical flags passed to open(2), eg O_RDWR, O_CLOEXEC,
>   * etc. offset/length specify a slice of the region to create the dmabuf from.
>   * nr_ranges is the total number of (P2P DMA) ranges that comprise the dmabuf.
> + * When VFIO_DMABUF_FLAG_TPH is set, entries[] contains one extra trailing
> + * object after the nr_ranges DMA ranges carrying the TPH steering tag and
> + * processing hint.

I really don't think we want to design an API where entries is
implicitly off by one from what's actually there.  This feeds back into
the below removal of the __counted_by attribute, which is a red flag
that this is the wrong approach.

In general though, I'm really hoping that someone interested in
enabling TPH as an interface through vfio actually decides to take
resource targeting and revocation seriously.  There's no validation of
the steering tag here relative to what the user has access to and no
mechanism to revoke those tags if access changes.  In fact, there's not
even a proposed mechanism allowing the user to derive valid steering
tags.  Does the user implicitly know the value and the kernel just
allows it because... yolo?  Thanks,

Alex

>   *
> - * flags should be 0.
> + * flags should be 0 or VFIO_DMABUF_FLAG_TPH.
>   *
>   * Return: The fd number on success, -1 and errno is set on failure.
>   */
>  #define VFIO_DEVICE_FEATURE_DMA_BUF 11
>  
> +enum vfio_device_feature_dma_buf_flags {
> +	VFIO_DMABUF_FLAG_TPH = 1 << 0,
> +};
> +
> +struct vfio_region_dma_tph {
> +	__u16 steering_tag;
> +	__u8 ph;
> +	__u8 reserved;
> +	__u32 reserved2;
> +};
> +
>  struct vfio_region_dma_range {
> -	__u64 offset;
> -	__u64 length;
> +	union {
> +		__u64 offset;
> +		struct vfio_region_dma_tph tph;
> +	};
> +	union {
> +		__u64 length;
> +		__u64 reserved;
> +	};
>  };
>  
>  struct vfio_device_feature_dma_buf {
> @@ -1507,7 +1527,7 @@ struct vfio_device_feature_dma_buf {
>  	__u32	open_flags;
>  	__u32   flags;
>  	__u32   nr_ranges;
> -	struct vfio_region_dma_range dma_ranges[] __counted_by(nr_ranges);
> +	struct vfio_region_dma_range entries[];
>  };
>  
>  /* -------- API for Type1 VFIO IOMMU -------- */


  reply	other threads:[~2026-04-22 15:23 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-20 18:39 [PATCH v1 0/2] Retrieve TPH from dma-buf for PCIe P2P memory access Zhiping Zhang
2026-04-20 18:39 ` [PATCH v1 1/2] vfio: add callback to get tph info for dma-buf Zhiping Zhang
2026-04-22 15:23   ` Alex Williamson [this message]
2026-04-22 16:29     ` Jason Gunthorpe
2026-04-22 19:27       ` Alex Williamson
2026-04-20 18:39 ` [PATCH v1 2/2] RDMA/mlx5: get tph for p2p access when registering dma-buf mr Zhiping Zhang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260422092327.3f629ad6@shazbot.org \
    --to=alex@shazbot.org \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=helgaas@kernel.org \
    --cc=jgg@ziepe.ca \
    --cc=kbusch@kernel.org \
    --cc=leon@kernel.org \
    --cc=linux-pci@vger.kernel.org \
    --cc=linux-rdma@vger.kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=sdf@meta.com \
    --cc=yishaih@nvidia.com \
    --cc=yochai@nvidia.com \
    --cc=zhipingz@meta.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox