* Re: [PATCH v6 4/5] vfio/pci: implement get_tph and DMA_BUF_TPH feature
[not found] ` <20260608185646.4085127-5-zhipingz@meta.com>
@ 2026-06-09 8:12 ` Christian König
2026-06-09 14:39 ` Zhiping Zhang
2026-06-09 21:46 ` Alex Williamson
1 sibling, 1 reply; 6+ messages in thread
From: Christian König @ 2026-06-09 8:12 UTC (permalink / raw)
To: Zhiping Zhang, Alex Williamson, Jason Gunthorpe, Leon Romanovsky,
Sumit Semwal
Cc: Bjorn Helgaas, kvm, linux-rdma, linux-pci, netdev, dri-devel,
Keith Busch, Yochai Cohen, Yishai Hadas
On 6/8/26 20:56, Zhiping Zhang wrote:
> Implement the dma-buf get_tph callback for vfio-pci-exported dma-bufs
> and add VFIO_DEVICE_FEATURE_DMA_BUF_TPH so userspace can attach TPH
> metadata to such a dma-buf.
>
> 8-bit ST and 16-bit Extended ST are distinct PCIe TPH namespaces; the
> uAPI carries both with explicit validity flags, and get_tph() returns
> the value matching the importer's requested width (or -EOPNOTSUPP).
>
> The TPH descriptor is published and read under a new per-dma-buf mutex
> priv->tph_lock so a SET racing with a get_tph reader sees consistent
> fields. The mutex's only role is serialising the TPH state; priv->vdev
> and dmabuf lifetime are managed by the existing ioctl reference and
> dma_buf_get() ref, so the cleanup path does not need to take this
> mutex.
>
> The SET ioctl returns -EOPNOTSUPP if the underlying device does not
> expose the PCIe TPH Extended Capability (pdev->tph_cap == 0); setting
> ST metadata on a device that cannot act as a TPH completer is
> nonsensical and rejecting it early gives userspace a clear signal.
>
> The uAPI itself is not device-specific. It publishes the PCI SIG-defined
> ST/PH tuple for a VFIO-owned PCIe completer and keeps the tuple opaque
> to dma-buf; any importer simply requests the namespace it supports and
> places the returned value on generated TLPs. Any other userspace driver
> using vfio-pci for an endpoint that accepts inbound TPH can reuse the
> same interface.
>
> Signed-off-by: Zhiping Zhang <zhipingz@meta.com>
> ---
> Test plan: verified the kernel-side behavior by checking that an
> importer such as mlx5 emits the programmed ST/PH on outbound P2P TLPs
> after a successful VFIO_DEVICE_FEATURE_DMA_BUF_TPH set.
>
> drivers/vfio/pci/vfio_pci_core.c | 3 +
> drivers/vfio/pci/vfio_pci_dmabuf.c | 92 +++++++++++++++++++++++++++++-
> drivers/vfio/pci/vfio_pci_priv.h | 12 ++++
> include/uapi/linux/vfio.h | 45 +++++++++++++++
> 4 files changed, 151 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 050e7542952e..4fa36f2f7555 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -1569,6 +1569,9 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
> return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
> case VFIO_DEVICE_FEATURE_DMA_BUF:
> return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
> + case VFIO_DEVICE_FEATURE_DMA_BUF_TPH:
> + return vfio_pci_core_feature_dma_buf_tph(vdev, flags, arg,
> + argsz);
> default:
> return -ENOTTY;
> }
> diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
> index 1a177ce7de54..dd11a7db6b41 100644
> --- a/drivers/vfio/pci/vfio_pci_dmabuf.c
> +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> @@ -2,7 +2,9 @@
> /* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
> */
> #include <linux/dma-buf-mapping.h>
> +#include <linux/mutex.h>
> #include <linux/pci-p2pdma.h>
> +#include <linux/pci-tph.h>
> #include <linux/dma-resv.h>
>
> #include "vfio_pci_priv.h"
> @@ -19,7 +21,14 @@ struct vfio_pci_dma_buf {
> u32 nr_ranges;
> struct kref kref;
> struct completion comp;
> - u8 revoked : 1;
> + /* @tph_lock serializes TPH SET vs get_tph on the TPH fields below. */
> + struct mutex tph_lock;
Clear NO-GO.
When that info is exposed through DMA-buf it must be protected by the DMA-buf resv lock.
Christian.
> + u8 tph_st_valid:1;
> + u8 tph_st_ext_valid:1;
> + u8 tph_ph:2;
> + u8 tph_st;
> + u16 tph_st_ext;
> + u8 revoked:1;
> };
>
> static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
> @@ -69,6 +78,25 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
> return ret;
> }
>
> +static int vfio_pci_dma_buf_get_tph(struct dma_buf *dmabuf, bool extended,
> + u16 *steering_tag, u8 *ph)
> +{
> + struct vfio_pci_dma_buf *priv = dmabuf->priv;
> +
> + guard(mutex)(&priv->tph_lock);
> + if (extended) {
> + if (!priv->tph_st_ext_valid)
> + return -EOPNOTSUPP;
> + *steering_tag = priv->tph_st_ext;
> + } else {
> + if (!priv->tph_st_valid)
> + return -EOPNOTSUPP;
> + *steering_tag = priv->tph_st;
> + }
> + *ph = priv->tph_ph;
> + return 0;
> +}
> +
> static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
> struct sg_table *sgt,
> enum dma_data_direction dir)
> @@ -95,12 +123,14 @@ static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
> up_write(&priv->vdev->memory_lock);
> vfio_device_put_registration(&priv->vdev->vdev);
> }
> + mutex_destroy(&priv->tph_lock);
> kfree(priv->phys_vec);
> kfree(priv);
> }
>
> static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
> .attach = vfio_pci_dma_buf_attach,
> + .get_tph = vfio_pci_dma_buf_get_tph,
> .map_dma_buf = vfio_pci_dma_buf_map,
> .unmap_dma_buf = vfio_pci_dma_buf_unmap,
> .release = vfio_pci_dma_buf_release,
> @@ -265,6 +295,7 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> ret = -ENOMEM;
> goto err_free_ranges;
> }
> + mutex_init(&priv->tph_lock);
> priv->phys_vec = kzalloc_objs(*priv->phys_vec, get_dma_buf.nr_ranges);
> if (!priv->phys_vec) {
> ret = -ENOMEM;
> @@ -327,12 +358,71 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> err_free_phys:
> kfree(priv->phys_vec);
> err_free_priv:
> + mutex_destroy(&priv->tph_lock);
> kfree(priv);
> err_free_ranges:
> kfree(dma_ranges);
> return ret;
> }
>
> +int vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev,
> + u32 flags,
> + struct vfio_device_feature_dma_buf_tph __user *arg,
> + size_t argsz)
> +{
> + struct vfio_device_feature_dma_buf_tph set_tph;
> + struct vfio_pci_dma_buf *priv;
> + struct dma_buf *dmabuf;
> + int ret;
> +
> + if (!pcie_tph_supported(vdev->pdev))
> + return -EOPNOTSUPP;
> +
> + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
> + sizeof(set_tph));
> + if (ret != 1)
> + return ret;
> +
> + if (copy_from_user(&set_tph, arg, sizeof(set_tph)))
> + return -EFAULT;
> +
> + if (set_tph.flags & ~(VFIO_DMA_BUF_TPH_ST | VFIO_DMA_BUF_TPH_ST_EXT))
> + return -EINVAL;
> +
> + /* PCIe TLP Processing Hint is a 2-bit field. */
> + if (set_tph.ph & ~0x3)
> + return -EINVAL;
> +
> + dmabuf = dma_buf_get(set_tph.dmabuf_fd);
> + if (IS_ERR(dmabuf))
> + return PTR_ERR(dmabuf);
> +
> + if (dmabuf->ops != &vfio_pci_dmabuf_ops) {
> + ret = -EINVAL;
> + goto out_put;
> + }
> +
> + priv = dmabuf->priv;
> + if (priv->vdev != vdev) {
> + ret = -EINVAL;
> + goto out_put;
> + }
> +
> + scoped_guard(mutex, &priv->tph_lock) {
> + priv->tph_st = set_tph.steering_tag;
> + priv->tph_st_ext = set_tph.steering_tag_ext;
> + priv->tph_ph = set_tph.ph;
> + priv->tph_st_valid = !!(set_tph.flags & VFIO_DMA_BUF_TPH_ST);
> + priv->tph_st_ext_valid =
> + !!(set_tph.flags & VFIO_DMA_BUF_TPH_ST_EXT);
> + }
> + ret = 0;
> +
> +out_put:
> + dma_buf_put(dmabuf);
> + return ret;
> +}
> +
> void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
> {
> struct vfio_pci_dma_buf *priv;
> diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
> index fca9d0dfac90..c58f369be4b3 100644
> --- a/drivers/vfio/pci/vfio_pci_priv.h
> +++ b/drivers/vfio/pci/vfio_pci_priv.h
> @@ -118,6 +118,10 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
> int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> struct vfio_device_feature_dma_buf __user *arg,
> size_t argsz);
> +int vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev,
> + u32 flags,
> + struct vfio_device_feature_dma_buf_tph __user *arg,
> + size_t argsz);
> void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev);
> void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked);
> #else
> @@ -128,6 +132,14 @@ vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> {
> return -ENOTTY;
> }
> +
> +static inline int
> +vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev, u32 flags,
> + struct vfio_device_feature_dma_buf_tph __user *arg,
> + size_t argsz)
> +{
> + return -ENOTTY;
> +}
> static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
> {
> }
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 5de618a3a5ee..0ca26721849b 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1534,6 +1534,51 @@ struct vfio_device_feature_dma_buf {
> */
> #define VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2 12
>
> +/**
> + * Upon VFIO_DEVICE_FEATURE_SET associate TPH (TLP Processing Hints) metadata
> + * with a vfio-exported dma-buf. The dma-buf must have been created by
> + * VFIO_DEVICE_FEATURE_DMA_BUF on this device, and the device must expose the
> + * TPH Extended Capability (otherwise the ioctl returns -EOPNOTSUPP).
> + *
> + * dmabuf_fd is the file descriptor returned by VFIO_DEVICE_FEATURE_DMA_BUF.
> + *
> + * 8-bit ST (steering_tag) and 16-bit Extended ST (steering_tag_ext) are
> + * distinct namespaces in the PCIe TPH ST table and may both be present with
> + * different values. Userspace should populate the value(s) it has from the
> + * firmware ST table for this device and set the matching VFIO_DMA_BUF_TPH_ST /
> + * VFIO_DMA_BUF_TPH_ST_EXT bit in @flags. An importer requests a specific
> + * width and receives the matching value; if the requested width is not
> + * present, the importer is told TPH is unavailable for this dma-buf.
> + *
> + * This publishes the PCI SIG-defined ST/PH tuple for a VFIO-owned PCIe
> + * completer. The dma-buf core treats the tuple as opaque completer-owned
> + * metadata; an importer simply requests the namespace it supports and places
> + * the returned value on generated TLPs.
> + *
> + * @flags == 0 clears any previously published metadata.
> + *
> + * ph is the 2-bit TLP Processing Hint and must be in the range [0, 3].
> + *
> + * Userspace is responsible for setting TPH on the dma-buf before handing the
> + * fd to the importer. Calling SET again replaces the previously published
> + * values; racing a SET against an importer that is already consuming the
> + * dma-buf is a userspace ordering problem.
> + *
> + * Return: 0 on success, -errno on failure.
> + */
> +#define VFIO_DEVICE_FEATURE_DMA_BUF_TPH 13
> +
> +#define VFIO_DMA_BUF_TPH_ST (1 << 0) /* steering_tag valid */
> +#define VFIO_DMA_BUF_TPH_ST_EXT (1 << 1) /* steering_tag_ext valid */
> +
> +struct vfio_device_feature_dma_buf_tph {
> + __s32 dmabuf_fd;
> + __u32 flags;
> + __u16 steering_tag_ext;
> + __u8 steering_tag;
> + __u8 ph;
> +};
> +
> /* -------- API for Type1 VFIO IOMMU -------- */
>
> /**
^ permalink raw reply [flat|nested] 6+ messages in thread* Re: [PATCH v6 4/5] vfio/pci: implement get_tph and DMA_BUF_TPH feature
[not found] ` <20260608185646.4085127-5-zhipingz@meta.com>
2026-06-09 8:12 ` [PATCH v6 4/5] vfio/pci: implement get_tph and DMA_BUF_TPH feature Christian König
@ 2026-06-09 21:46 ` Alex Williamson
2026-06-09 23:48 ` Zhiping Zhang
1 sibling, 1 reply; 6+ messages in thread
From: Alex Williamson @ 2026-06-09 21:46 UTC (permalink / raw)
To: Zhiping Zhang
Cc: Jason Gunthorpe, Leon Romanovsky, Sumit Semwal, Christian Konig,
Bjorn Helgaas, kvm, linux-rdma, linux-pci, netdev, dri-devel,
Keith Busch, Yochai Cohen, Yishai Hadas, alex
On Mon, 8 Jun 2026 11:56:41 -0700
Zhiping Zhang <zhipingz@meta.com> wrote:
> @@ -327,12 +358,71 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> err_free_phys:
> kfree(priv->phys_vec);
> err_free_priv:
> + mutex_destroy(&priv->tph_lock);
> kfree(priv);
> err_free_ranges:
> kfree(dma_ranges);
> return ret;
> }
>
> +int vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev,
> + u32 flags,
> + struct vfio_device_feature_dma_buf_tph __user *arg,
> + size_t argsz)
> +{
> + struct vfio_device_feature_dma_buf_tph set_tph;
> + struct vfio_pci_dma_buf *priv;
> + struct dma_buf *dmabuf;
> + int ret;
> +
> + if (!pcie_tph_supported(vdev->pdev))
> + return -EOPNOTSUPP;
This tests for the TPH capability, but the TPH capability is only a
requirement for functions that generate TLPs with TPH, ie. a requester.
This feature is about providing TPH steering tags when the device is a
completer. Bits 13:12 of the Device Capabilities 2 register indicate
if the device is supported as a TPH completer.
Additionally these bits indicate if the device supports standard and
extended TPH, which means we should not only fail if the device reports
00b, but should reject extended steering tags unless the device reports
11b.
> +
> + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
> + sizeof(set_tph));
> + if (ret != 1)
> + return ret;
> +
> + if (copy_from_user(&set_tph, arg, sizeof(set_tph)))
> + return -EFAULT;
> +
> + if (set_tph.flags & ~(VFIO_DMA_BUF_TPH_ST | VFIO_DMA_BUF_TPH_ST_EXT))
> + return -EINVAL;
> +
> + /* PCIe TLP Processing Hint is a 2-bit field. */
> + if (set_tph.ph & ~0x3)
> + return -EINVAL;
Sashiko notes what appears to be a false positive here, the uAPI states
ph is to be in the range [0, 3] and nowhere else says that it's allowed
to be garbage for a clear operation.
> +
> + dmabuf = dma_buf_get(set_tph.dmabuf_fd);
> + if (IS_ERR(dmabuf))
> + return PTR_ERR(dmabuf);
> +
> + if (dmabuf->ops != &vfio_pci_dmabuf_ops) {
> + ret = -EINVAL;
> + goto out_put;
> + }
> +
> + priv = dmabuf->priv;
> + if (priv->vdev != vdev) {
> + ret = -EINVAL;
> + goto out_put;
> + }
Sashiko notes this may need READ_ONCE()/WRITE_ONCE() semantics, but
that may get fixed as part of the resv lock usage.
> +
> + scoped_guard(mutex, &priv->tph_lock) {
> + priv->tph_st = set_tph.steering_tag;
> + priv->tph_st_ext = set_tph.steering_tag_ext;
> + priv->tph_ph = set_tph.ph;
> + priv->tph_st_valid = !!(set_tph.flags & VFIO_DMA_BUF_TPH_ST);
> + priv->tph_st_ext_valid =
> + !!(set_tph.flags & VFIO_DMA_BUF_TPH_ST_EXT);
> + }
> + ret = 0;
> +
> +out_put:
> + dma_buf_put(dmabuf);
> + return ret;
> +}
> +
> void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
> {
> struct vfio_pci_dma_buf *priv;
...
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 5de618a3a5ee..0ca26721849b 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1534,6 +1534,51 @@ struct vfio_device_feature_dma_buf {
> */
> #define VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2 12
>
> +/**
> + * Upon VFIO_DEVICE_FEATURE_SET associate TPH (TLP Processing Hints) metadata
> + * with a vfio-exported dma-buf. The dma-buf must have been created by
> + * VFIO_DEVICE_FEATURE_DMA_BUF on this device, and the device must expose the
> + * TPH Extended Capability (otherwise the ioctl returns -EOPNOTSUPP).
> + *
> + * dmabuf_fd is the file descriptor returned by VFIO_DEVICE_FEATURE_DMA_BUF.
> + *
> + * 8-bit ST (steering_tag) and 16-bit Extended ST (steering_tag_ext) are
> + * distinct namespaces in the PCIe TPH ST table and may both be present with
> + * different values. Userspace should populate the value(s) it has from the
> + * firmware ST table for this device and set the matching VFIO_DMA_BUF_TPH_ST /
> + * VFIO_DMA_BUF_TPH_ST_EXT bit in @flags. An importer requests a specific
> + * width and receives the matching value; if the requested width is not
> + * present, the importer is told TPH is unavailable for this dma-buf.
> + *
> + * This publishes the PCI SIG-defined ST/PH tuple for a VFIO-owned PCIe
> + * completer. The dma-buf core treats the tuple as opaque completer-owned
> + * metadata; an importer simply requests the namespace it supports and places
> + * the returned value on generated TLPs.
> + *
> + * @flags == 0 clears any previously published metadata.
This is overselling the invalidation. It only flags the fields as
invalid for future get_tph() requests, it does nothing to clear
previously published metadata from importers. Thanks,
Alex
> + *
> + * ph is the 2-bit TLP Processing Hint and must be in the range [0, 3].
> + *
> + * Userspace is responsible for setting TPH on the dma-buf before handing the
> + * fd to the importer. Calling SET again replaces the previously published
> + * values; racing a SET against an importer that is already consuming the
> + * dma-buf is a userspace ordering problem.
> + *
> + * Return: 0 on success, -errno on failure.
> + */
> +#define VFIO_DEVICE_FEATURE_DMA_BUF_TPH 13
> +
> +#define VFIO_DMA_BUF_TPH_ST (1 << 0) /* steering_tag valid */
> +#define VFIO_DMA_BUF_TPH_ST_EXT (1 << 1) /* steering_tag_ext valid */
> +
> +struct vfio_device_feature_dma_buf_tph {
> + __s32 dmabuf_fd;
> + __u32 flags;
> + __u16 steering_tag_ext;
> + __u8 steering_tag;
> + __u8 ph;
> +};
> +
> /* -------- API for Type1 VFIO IOMMU -------- */
>
> /**
^ permalink raw reply [flat|nested] 6+ messages in thread