From: Alex Williamson <alex@shazbot.org>
To: Zhiping Zhang <zhipingz@meta.com>
Cc: <netdev@vger.kernel.org>, <kvm@vger.kernel.org>,
<linux-rdma@vger.kernel.org>, <linux-pci@vger.kernel.org>,
<dri-devel@lists.freedesktop.org>,
alex@shazbot.org
Subject: Re: [PATCH v7 4/5] vfio/pci: implement get_tph and DMA_BUF_TPH feature
Date: Fri, 12 Jun 2026 11:10:01 -0600 [thread overview]
Message-ID: <20260612111001.5b7206eb@shazbot.org> (raw)
In-Reply-To: <20260611161546.4075580-5-zhipingz@meta.com>
On Thu, 11 Jun 2026 09:11:19 -0700
Zhiping Zhang <zhipingz@meta.com> wrote:
> Implement dma-buf get_tph for vfio-pci exported dma-bufs and add
> VFIO_DEVICE_FEATURE_DMA_BUF_TPH so userspace can publish TPH metadata
> for a VFIO-owned device.
>
> 8-bit ST and 16-bit Extended ST are distinct PCIe TPH namespaces; the
> uAPI carries both with explicit validity flags, and get_tph() returns
> the value matching the importer's requested namespace or -EOPNOTSUPP.
>
> Publish and read the TPH descriptor under dmabuf->resv, matching the
> locking used for other importer-visible dma-buf state. The SET ioctl
> takes dma_resv_lock_interruptible(), while the callback runs under
> DMA-buf's asserted resv lock.
>
> Reject requests the device cannot consume as a completer:
> pcie_tph_completer_type() must report at least
> PCI_EXP_DEVCAP2_TPH_COMP_TPH_ONLY, and Extended ST requires
> PCI_EXP_DEVCAP2_TPH_COMP_EXT_TPH. Validate fields before the completer
> check so userspace gets the narrowest errno.
>
> Signed-off-by: Zhiping Zhang <zhipingz@meta.com>
> ---
> drivers/vfio/pci/vfio_pci_core.c | 3 +
> drivers/vfio/pci/vfio_pci_dmabuf.c | 94 +++++++++++++++++++++++++++++-
> drivers/vfio/pci/vfio_pci_priv.h | 12 ++++
> include/uapi/linux/vfio.h | 37 ++++++++++++
> 4 files changed, 145 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 050e7542952e..4fa36f2f7555 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -1569,6 +1569,9 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
> return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
> case VFIO_DEVICE_FEATURE_DMA_BUF:
> return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
> + case VFIO_DEVICE_FEATURE_DMA_BUF_TPH:
> + return vfio_pci_core_feature_dma_buf_tph(vdev, flags, arg,
> + argsz);
> default:
> return -ENOTTY;
> }
> diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
> index 1a177ce7de54..0a0705c8dbea 100644
> --- a/drivers/vfio/pci/vfio_pci_dmabuf.c
> +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> @@ -3,6 +3,7 @@
> */
> #include <linux/dma-buf-mapping.h>
> #include <linux/pci-p2pdma.h>
> +#include <linux/pci-tph.h>
> #include <linux/dma-resv.h>
>
> #include "vfio_pci_priv.h"
> @@ -19,7 +20,12 @@ struct vfio_pci_dma_buf {
> u32 nr_ranges;
> struct kref kref;
> struct completion comp;
> - u8 revoked : 1;
> + u8 tph_st_valid:1;
> + u8 tph_st_ext_valid:1;
> + u8 tph_ph:2;
> + u8 tph_st;
> + u16 tph_st_ext;
> + u8 revoked:1;
If these bitfields are now all protected under dma_resv_lock they
should be grouped together with a comment to that effect, no need for
revoked to get kicked out to its own storage unit. In [1] I'm
proposing runtime modified flags each get their own storage unit, but
for more isolated cases, so long as we keep track and enforce serialized
updates, I'm ok with runtime bitfields. Thanks,
Alex
[1]https://lore.kernel.org/all/20260611213539.4100590-1-alex.williamson@nvidia.com/
> };
>
> static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
> @@ -69,6 +75,26 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
> return ret;
> }
>
> +static int vfio_pci_dma_buf_get_tph(struct dma_buf *dmabuf, bool extended,
> + u16 *steering_tag, u8 *ph)
> +{
> + struct vfio_pci_dma_buf *priv = dmabuf->priv;
> +
> + dma_resv_assert_held(dmabuf->resv);
> +
> + if (extended) {
> + if (!priv->tph_st_ext_valid)
> + return -EOPNOTSUPP;
> + *steering_tag = priv->tph_st_ext;
> + } else {
> + if (!priv->tph_st_valid)
> + return -EOPNOTSUPP;
> + *steering_tag = priv->tph_st;
> + }
> + *ph = priv->tph_ph;
> + return 0;
> +}
> +
> static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
> struct sg_table *sgt,
> enum dma_data_direction dir)
> @@ -101,6 +127,7 @@ static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
>
> static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
> .attach = vfio_pci_dma_buf_attach,
> + .get_tph = vfio_pci_dma_buf_get_tph,
> .map_dma_buf = vfio_pci_dma_buf_map,
> .unmap_dma_buf = vfio_pci_dma_buf_unmap,
> .release = vfio_pci_dma_buf_release,
> @@ -333,6 +360,71 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> return ret;
> }
>
> +int vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev,
> + u32 flags,
> + struct vfio_device_feature_dma_buf_tph __user *arg,
> + size_t argsz)
> +{
> + struct vfio_device_feature_dma_buf_tph set_tph;
> + struct vfio_pci_dma_buf *priv;
> + struct dma_buf *dmabuf;
> + u8 comp;
> + int ret;
> +
> + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
> + sizeof(set_tph));
> + if (ret != 1)
> + return ret;
> +
> + if (copy_from_user(&set_tph, arg, sizeof(set_tph)))
> + return -EFAULT;
> +
> + if (set_tph.flags & ~(VFIO_DMA_BUF_TPH_ST | VFIO_DMA_BUF_TPH_ST_EXT))
> + return -EINVAL;
> +
> + if (set_tph.ph & ~0x3)
> + return -EINVAL;
> +
> + comp = pcie_tph_completer_type(vdev->pdev);
> + if (comp == PCI_EXP_DEVCAP2_TPH_COMP_NONE)
> + return -EOPNOTSUPP;
> + if ((set_tph.flags & VFIO_DMA_BUF_TPH_ST_EXT) &&
> + comp != PCI_EXP_DEVCAP2_TPH_COMP_EXT_TPH)
> + return -EOPNOTSUPP;
> +
> + dmabuf = dma_buf_get(set_tph.dmabuf_fd);
> + if (IS_ERR(dmabuf))
> + return PTR_ERR(dmabuf);
> +
> + if (dmabuf->ops != &vfio_pci_dmabuf_ops) {
> + ret = -EINVAL;
> + goto out_put;
> + }
> +
> + priv = dmabuf->priv;
> + if (priv->vdev != vdev) {
> + ret = -EINVAL;
> + goto out_put;
> + }
> +
> + ret = dma_resv_lock_interruptible(dmabuf->resv, NULL);
> + if (ret)
> + goto out_put;
> +
> + priv->tph_st = set_tph.steering_tag;
> + priv->tph_st_ext = set_tph.steering_tag_ext;
> + priv->tph_ph = set_tph.ph;
> + priv->tph_st_valid = !!(set_tph.flags & VFIO_DMA_BUF_TPH_ST);
> + priv->tph_st_ext_valid =
> + !!(set_tph.flags & VFIO_DMA_BUF_TPH_ST_EXT);
> + dma_resv_unlock(dmabuf->resv);
> + ret = 0;
> +
> +out_put:
> + dma_buf_put(dmabuf);
> + return ret;
> +}
> +
> void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
> {
> struct vfio_pci_dma_buf *priv;
> diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
> index fca9d0dfac90..c58f369be4b3 100644
> --- a/drivers/vfio/pci/vfio_pci_priv.h
> +++ b/drivers/vfio/pci/vfio_pci_priv.h
> @@ -118,6 +118,10 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
> int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> struct vfio_device_feature_dma_buf __user *arg,
> size_t argsz);
> +int vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev,
> + u32 flags,
> + struct vfio_device_feature_dma_buf_tph __user *arg,
> + size_t argsz);
> void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev);
> void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked);
> #else
> @@ -128,6 +132,14 @@ vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> {
> return -ENOTTY;
> }
> +
> +static inline int
> +vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev, u32 flags,
> + struct vfio_device_feature_dma_buf_tph __user *arg,
> + size_t argsz)
> +{
> + return -ENOTTY;
> +}
> static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
> {
> }
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 5de618a3a5ee..5dd693220a0d 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1534,6 +1534,43 @@ struct vfio_device_feature_dma_buf {
> */
> #define VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2 12
>
> +/**
> + * Upon VFIO_DEVICE_FEATURE_SET associate TPH (TLP Processing Hints) metadata
> + * with a vfio-exported dma-buf. The dma-buf must have been created by
> + * VFIO_DEVICE_FEATURE_DMA_BUF on this device, and the device must report
> + * TPH Completer support in Device Capabilities 2 (bits 13:12); requests
> + * carrying VFIO_DMA_BUF_TPH_ST_EXT additionally require the device to
> + * report the Extended TPH Completer encoding. Otherwise the ioctl
> + * returns -EOPNOTSUPP.
> + *
> + * dmabuf_fd is the file descriptor returned by VFIO_DEVICE_FEATURE_DMA_BUF.
> + *
> + * 8-bit ST (steering_tag) and 16-bit Extended ST (steering_tag_ext) are
> + * distinct namespaces. Userspace supplies whichever values are valid and sets
> + * the matching VFIO_DMA_BUF_TPH_ST / VFIO_DMA_BUF_TPH_ST_EXT bits in @flags;
> + * an importer requests one namespace and receives the matching value.
> + *
> + * @flags == 0 marks any previously published ST / Extended-ST as invalid
> + * for future get_tph() requests on this dma-buf.
> + *
> + * ph is the 2-bit TLP Processing Hint and must be in the range [0, 3].
> + *
> + * Userspace must publish TPH before handing the dma-buf fd to an importer.
> + * Calling SET again replaces the published values.
> + */
> +#define VFIO_DEVICE_FEATURE_DMA_BUF_TPH 13
> +
> +#define VFIO_DMA_BUF_TPH_ST (1 << 0)
> +#define VFIO_DMA_BUF_TPH_ST_EXT (1 << 1)
> +
> +struct vfio_device_feature_dma_buf_tph {
> + __s32 dmabuf_fd;
> + __u32 flags;
> + __u16 steering_tag_ext;
> + __u8 steering_tag;
> + __u8 ph;
> +};
> +
> /* -------- API for Type1 VFIO IOMMU -------- */
>
> /**
next prev parent reply other threads:[~2026-06-12 17:10 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-11 16:11 [PATCH v7 0/5] vfio/dma-buf: add TPH support for peer-to-peer access Zhiping Zhang
2026-06-11 16:11 ` [PATCH v7 1/5] net/mlx5: free mlx5_st_idx_data on final dealloc Zhiping Zhang
2026-06-11 16:11 ` [PATCH v7 2/5] PCI/TPH: Add requester/completer type helpers Zhiping Zhang
2026-06-12 16:52 ` Alex Williamson
2026-06-11 16:11 ` [PATCH v7 3/5] dma-buf: add optional get_tph() callback Zhiping Zhang
2026-06-11 16:11 ` [PATCH v7 4/5] vfio/pci: implement get_tph and DMA_BUF_TPH feature Zhiping Zhang
2026-06-12 17:10 ` Alex Williamson [this message]
2026-06-11 16:11 ` [PATCH v7 5/5] RDMA/mlx5: get tph for p2p access when registering dma-buf mr Zhiping Zhang
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260612111001.5b7206eb@shazbot.org \
--to=alex@shazbot.org \
--cc=dri-devel@lists.freedesktop.org \
--cc=kvm@vger.kernel.org \
--cc=linux-pci@vger.kernel.org \
--cc=linux-rdma@vger.kernel.org \
--cc=netdev@vger.kernel.org \
--cc=zhipingz@meta.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox