Linux RDMA and InfiniBand development
 help / color / mirror / Atom feed
* Re: [PATCH v9 3/4] vfio/pci: implement get_pci_tph and DMA_BUF_TPH feature
       [not found] ` <20260622184211.2229399-4-zhipingz@meta.com>
@ 2026-06-23 18:17   ` Alex Williamson
  0 siblings, 0 replies; only message in thread
From: Alex Williamson @ 2026-06-23 18:17 UTC (permalink / raw)
  To: Zhiping Zhang
  Cc: Jason Gunthorpe, Leon Romanovsky, Michael Guralnik, Sumit Semwal,
	Christian Konig, Bjorn Helgaas, kvm, linux-rdma, linux-pci,
	dri-devel, alex

On Mon, 22 Jun 2026 11:41:36 -0700
Zhiping Zhang <zhipingz@meta.com> wrote:

> Implement dma-buf get_pci_tph for vfio-pci exported dma-bufs and add
> VFIO_DEVICE_FEATURE_DMA_BUF_TPH so userspace can publish TPH metadata
> for a VFIO-owned device.
> 
> 8-bit ST and 16-bit Extended ST are distinct PCIe TPH namespaces; the
> uAPI carries both with explicit validity flags, and get_pci_tph()
> returns the value matching the importer's requested namespace or
> -EOPNOTSUPP.
> 
> Publish and read the TPH descriptor under dmabuf->resv, matching the
> locking used for other importer-visible dma-buf state. The SET ioctl
> takes dma_resv_lock_interruptible(), while the callback runs under
> DMA-buf's asserted resv lock.
> 
> Reject requests the device cannot consume as a completer:
> pcie_tph_completer_type() must report at least
> PCI_EXP_DEVCAP2_TPH_COMP_TPH_ONLY, and Extended ST requires
> PCI_EXP_DEVCAP2_TPH_COMP_EXT_TPH. Make PROBE follow the same hardware
> gate so the feature only probes as supported when the device can really
> consume it.
> 
> Signed-off-by: Zhiping Zhang <zhipingz@meta.com>
> ---
>  drivers/vfio/pci/vfio_pci_core.c   |  3 +
>  drivers/vfio/pci/vfio_pci_dmabuf.c | 97 +++++++++++++++++++++++++++++-
>  drivers/vfio/pci/vfio_pci_priv.h   | 12 ++++
>  include/uapi/linux/vfio.h          | 37 ++++++++++++
>  4 files changed, 148 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index a28f1e99362c..c7d6902bc61b 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -1572,6 +1572,9 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
>  		return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
>  	case VFIO_DEVICE_FEATURE_DMA_BUF:
>  		return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
> +	case VFIO_DEVICE_FEATURE_DMA_BUF_TPH:
> +		return vfio_pci_core_feature_dma_buf_tph(vdev, flags, arg,
> +							 argsz);
>  	default:
>  		return -ENOTTY;
>  	}
> diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
> index c16f460c01d6..d6f5dd321000 100644
> --- a/drivers/vfio/pci/vfio_pci_dmabuf.c
> +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> @@ -3,6 +3,7 @@
>   */
>  #include <linux/dma-buf-mapping.h>
>  #include <linux/pci-p2pdma.h>
> +#include <linux/pci-tph.h>
>  #include <linux/dma-resv.h>
>  
>  #include "vfio_pci_priv.h"
> @@ -19,7 +20,14 @@ struct vfio_pci_dma_buf {
>  	u32 nr_ranges;
>  	struct kref kref;
>  	struct completion comp;
> -	u8 revoked : 1;
> +
> +	/* Protected by dmabuf->resv. */

Nit, it would be more accurate to say:

	/*
	 * Updates protected by dmabuf->resv, @revoked additionally
	 * protected by memory_lock.
	 */

revoked also has an unprotected read, but it's previously existing and
benign, and likely just needs a READ_ONCE() annotation.

> +	u16 tph_st_ext;
> +	u8 tph_st;
> +	u8 revoked:1;
> +	u8 tph_st_valid:1;
> +	u8 tph_st_ext_valid:1;
> +	u8 tph_ph:2;
>  };
>  
>  static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
> @@ -69,6 +77,26 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
>  	return ret;
>  }
>  
> +static int vfio_pci_dma_buf_get_pci_tph(struct dma_buf *dmabuf, bool extended,
> +					u16 *steering_tag, u8 *ph)
> +{
> +	struct vfio_pci_dma_buf *priv = dmabuf->priv;
> +
> +	dma_resv_assert_held(dmabuf->resv);
> +
> +	if (extended) {
> +		if (!priv->tph_st_ext_valid)
> +			return -EOPNOTSUPP;
> +		*steering_tag = priv->tph_st_ext;
> +	} else {
> +		if (!priv->tph_st_valid)
> +			return -EOPNOTSUPP;
> +		*steering_tag = priv->tph_st;
> +	}
> +	*ph = priv->tph_ph;
> +	return 0;
> +}
> +
>  static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
>  				   struct sg_table *sgt,
>  				   enum dma_data_direction dir)
> @@ -101,6 +129,7 @@ static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
>  
>  static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
>  	.attach = vfio_pci_dma_buf_attach,
> +	.get_pci_tph = vfio_pci_dma_buf_get_pci_tph,
>  	.map_dma_buf = vfio_pci_dma_buf_map,
>  	.unmap_dma_buf = vfio_pci_dma_buf_unmap,
>  	.release = vfio_pci_dma_buf_release,
> @@ -333,6 +362,72 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
>  	return ret;
>  }
>  
> +int vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev,
> +				      u32 flags,
> +				      struct vfio_device_feature_dma_buf_tph __user *arg,
> +				      size_t argsz)
> +{
> +	struct vfio_device_feature_dma_buf_tph set_tph;
> +	struct vfio_pci_dma_buf *priv;
> +	struct dma_buf *dmabuf;
> +	u8 comp;
> +	int ret;
> +
> +	comp = pcie_tph_completer_type(vdev->pdev);
> +	if (comp == PCI_EXP_DEVCAP2_TPH_COMP_NONE)
> +		return -EOPNOTSUPP;
> +
> +	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
> +				 sizeof(set_tph));
> +	if (ret != 1)
> +		return ret;
> +
> +	if (copy_from_user(&set_tph, arg, sizeof(set_tph)))
> +		return -EFAULT;
> +
> +	if (set_tph.flags & ~(VFIO_DMA_BUF_TPH_ST | VFIO_DMA_BUF_TPH_ST_EXT))
> +		return -EINVAL;
> +
> +	if (set_tph.ph & ~0x3)
> +		return -EINVAL;
> +
> +	if ((set_tph.flags & VFIO_DMA_BUF_TPH_ST_EXT) &&
> +	    comp != PCI_EXP_DEVCAP2_TPH_COMP_EXT_TPH)
> +		return -EOPNOTSUPP;
> +
> +	dmabuf = dma_buf_get(set_tph.dmabuf_fd);
> +	if (IS_ERR(dmabuf))
> +		return PTR_ERR(dmabuf);
> +
> +	if (dmabuf->ops != &vfio_pci_dmabuf_ops) {
> +		ret = -EINVAL;
> +		goto out_put;
> +	}
> +
> +	priv = dmabuf->priv;
> +	if (priv->vdev != vdev) {
> +		ret = -EINVAL;
> +		goto out_put;
> +	}
> +
> +	ret = dma_resv_lock_interruptible(dmabuf->resv, NULL);
> +	if (ret)
> +		goto out_put;
> +
> +	priv->tph_st         = set_tph.steering_tag;
> +	priv->tph_st_ext     = set_tph.steering_tag_ext;
> +	priv->tph_ph         = set_tph.ph;
> +	priv->tph_st_valid   = !!(set_tph.flags & VFIO_DMA_BUF_TPH_ST);
> +	priv->tph_st_ext_valid =
> +		!!(set_tph.flags & VFIO_DMA_BUF_TPH_ST_EXT);
> +	dma_resv_unlock(dmabuf->resv);
> +	ret = 0;
> +
> +out_put:
> +	dma_buf_put(dmabuf);
> +	return ret;
> +}
> +
>  void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
>  {
>  	struct vfio_pci_dma_buf *priv;
> diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
> index fca9d0dfac90..c58f369be4b3 100644
> --- a/drivers/vfio/pci/vfio_pci_priv.h
> +++ b/drivers/vfio/pci/vfio_pci_priv.h
> @@ -118,6 +118,10 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
>  int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
>  				  struct vfio_device_feature_dma_buf __user *arg,
>  				  size_t argsz);
> +int vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev,
> +				      u32 flags,
> +				      struct vfio_device_feature_dma_buf_tph __user *arg,
> +				      size_t argsz);
>  void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev);
>  void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked);
>  #else
> @@ -128,6 +132,14 @@ vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
>  {
>  	return -ENOTTY;
>  }
> +
> +static inline int
> +vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev, u32 flags,
> +				  struct vfio_device_feature_dma_buf_tph __user *arg,
> +				  size_t argsz)
> +{
> +	return -ENOTTY;
> +}
>  static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
>  {
>  }
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 5de618a3a5ee..2d30ba43e2cf 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1534,6 +1534,43 @@ struct vfio_device_feature_dma_buf {
>   */
>  #define VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2  12
>  
> +/**
> + * Upon VFIO_DEVICE_FEATURE_SET associate TPH (TLP Processing Hints) metadata
> + * with a vfio-exported dma-buf. The dma-buf must have been created by
> + * VFIO_DEVICE_FEATURE_DMA_BUF on this device, and the device must report
> + * TPH Completer support in Device Capabilities 2 (bits 13:12); requests
> + * carrying VFIO_DMA_BUF_TPH_ST_EXT additionally require the device to
> + * report the Extended TPH Completer encoding. Otherwise the ioctl
> + * returns -EOPNOTSUPP.
> + *
> + * dmabuf_fd is the file descriptor returned by VFIO_DEVICE_FEATURE_DMA_BUF.
> + *
> + * 8-bit ST (steering_tag) and 16-bit Extended ST (steering_tag_ext) are
> + * distinct namespaces. Userspace supplies whichever values are valid and sets
> + * the matching VFIO_DMA_BUF_TPH_ST / VFIO_DMA_BUF_TPH_ST_EXT bits in @flags;
> + * an importer requests one namespace and receives the matching value.
> + *
> + * @flags == 0 marks any previously published ST / Extended-ST as invalid
> + * for future PCI TPH queries on this dma-buf.

I think we should avoid "published" as it still suggests we're somehow
able to invalidate what was previously reported and consumed.  It's
offset by the trailing clause here, but that's absent below.

Also, we're noting @flags == 0 as if it's a special case, but we really
need the clarity that if either flag bit is not set, the corresponding
field is marked invalid for future queries.  Perhaps something like:

   * @flags is the authoritative validity for each namespace: when
   * VFIO_DMA_BUF_TPH_ST is set, @steering_tag becomes the valid 8-bit ST; when
   * VFIO_DMA_BUF_TPH_ST_EXT is set, @steering_tag_ext becomes the valid 16-bit
   * Extended ST.  A namespace whose bit is clear is marked invalid and
   * reported as unsupported to importers requesting it.
   *
   * Each SET fully replaces the dma-buf's TPH state: any namespace not selected
   * in @flags is left invalid, so @flags == 0 marks both ST and Extended ST
   * invalid.  This only affects TPH queries made after the SET completes; an
   * importer that has already retrieved a value is unaffected.  Userspace must
   * therefore configure TPH before handing the dma-buf fd to an importer.

> + *
> + * ph is the 2-bit TLP Processing Hint and must be in the range [0, 3].

Slight inconsistency here, @ph.  We might also help to silence the
Sashiko warning to note:

   * Undefined @flags and @ph bits must always be zero.

> + *
> + * Userspace must publish TPH before handing the dma-buf fd to an importer.
> + * Calling SET again replaces the published values.

The above suggestion is meant to replace this as well.

> + */
> +#define VFIO_DEVICE_FEATURE_DMA_BUF_TPH 13

There are several series in-flight contending for device feature
indexes, so this should really go in through the vfio tree to reduce
the risk of duplicates.  We also still need acks from the relevant
maintainers for PCI, dma-buf, and mlx5 before this can be queued for
v7.3.  Thanks,

Alex

> +
> +#define VFIO_DMA_BUF_TPH_ST		(1 << 0)
> +#define VFIO_DMA_BUF_TPH_ST_EXT		(1 << 1)
> +
> +struct vfio_device_feature_dma_buf_tph {
> +	__s32	dmabuf_fd;
> +	__u32	flags;
> +	__u16	steering_tag_ext;
> +	__u8	steering_tag;
> +	__u8	ph;
> +};
> +
>  /* -------- API for Type1 VFIO IOMMU -------- */
>  
>  /**


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2026-06-23 18:17 UTC | newest]

Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <20260622184211.2229399-1-zhipingz@meta.com>
     [not found] ` <20260622184211.2229399-4-zhipingz@meta.com>
2026-06-23 18:17   ` [PATCH v9 3/4] vfio/pci: implement get_pci_tph and DMA_BUF_TPH feature Alex Williamson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox