* [PATCH v8 1/4] PCI/TPH: Add requester/completer type helpers
2026-06-15 6:58 [PATCH v8 0/4] vfio/dma-buf: add TPH support for peer-to-peer access Zhiping Zhang
@ 2026-06-15 6:58 ` Zhiping Zhang
2026-06-15 6:58 ` [PATCH v8 2/4] dma-buf: add optional get_pci_tph() callback Zhiping Zhang
` (2 subsequent siblings)
3 siblings, 0 replies; 7+ messages in thread
From: Zhiping Zhang @ 2026-06-15 6:58 UTC (permalink / raw)
To: Jason Gunthorpe, Leon Romanovsky, Michael Guralnik, Sumit Semwal,
Christian Konig, Alex Williamson, Bjorn Helgaas
Cc: kvm, linux-rdma, linux-pci, dri-devel, Zhiping Zhang
Add pcie_tph_enabled_req_type() so drivers can query the enabled TPH
requester mode without reaching into pci_dev internals.
Add pcie_tph_completer_type() so drivers that publish TPH metadata for
a device acting as a completer can gate on the "TPH Completer
Supported" field of Device Capabilities 2 (bits 13:12,
PCI_EXP_DEVCAP2_TPH_COMP_MASK) rather than reusing requester-side
state. Fold the reserved 0b10 encoding into NONE so callers only see
the defined values.
This keeps pci_dev::tph_req_type and the completer-capability decode
inside the PCI/TPH code and provides !CONFIG_PCIE_TPH stubs for
callers.
Signed-off-by: Zhiping Zhang <zhipingz@meta.com>
---
drivers/pci/tph.c | 45 +++++++++++++++++++++++++++++++++++++++++
include/linux/pci-tph.h | 8 ++++++++
2 files changed, 53 insertions(+)
diff --git a/drivers/pci/tph.c b/drivers/pci/tph.c
index 91145e8d9d95..717b47d1d9ca 100644
--- a/drivers/pci/tph.c
+++ b/drivers/pci/tph.c
@@ -174,6 +174,51 @@ u32 pcie_tph_get_st_table_loc(struct pci_dev *pdev)
}
EXPORT_SYMBOL(pcie_tph_get_st_table_loc);
+/**
+ * pcie_tph_enabled_req_type - Return the device's enabled TPH requester type
+ * @pdev: PCI device to query
+ *
+ * Return: PCI_TPH_REQ_DISABLE, PCI_TPH_REQ_TPH_ONLY or PCI_TPH_REQ_EXT_TPH.
+ */
+u8 pcie_tph_enabled_req_type(struct pci_dev *pdev)
+{
+ return pdev->tph_req_type;
+}
+EXPORT_SYMBOL(pcie_tph_enabled_req_type);
+
+/**
+ * pcie_tph_completer_type - Return the device's TPH Completer support
+ * @pdev: PCI device to query
+ *
+ * Reads the "TPH Completer Supported" field (bits 13:12) of Device
+ * Capabilities 2. The reserved 0b10 encoding is folded into
+ * "not supported" so callers only need to compare against the three
+ * defined values.
+ *
+ * Return: one of %PCI_EXP_DEVCAP2_TPH_COMP_NONE,
+ * %PCI_EXP_DEVCAP2_TPH_COMP_TPH_ONLY or
+ * %PCI_EXP_DEVCAP2_TPH_COMP_EXT_TPH.
+ */
+u8 pcie_tph_completer_type(struct pci_dev *pdev)
+{
+ u32 reg;
+
+ if (pcie_capability_read_dword(pdev, PCI_EXP_DEVCAP2, &reg))
+ return PCI_EXP_DEVCAP2_TPH_COMP_NONE;
+ if (PCI_POSSIBLE_ERROR(reg))
+ return PCI_EXP_DEVCAP2_TPH_COMP_NONE;
+
+ switch (FIELD_GET(PCI_EXP_DEVCAP2_TPH_COMP_MASK, reg)) {
+ case PCI_EXP_DEVCAP2_TPH_COMP_TPH_ONLY:
+ return PCI_EXP_DEVCAP2_TPH_COMP_TPH_ONLY;
+ case PCI_EXP_DEVCAP2_TPH_COMP_EXT_TPH:
+ return PCI_EXP_DEVCAP2_TPH_COMP_EXT_TPH;
+ default:
+ return PCI_EXP_DEVCAP2_TPH_COMP_NONE;
+ }
+}
+EXPORT_SYMBOL(pcie_tph_completer_type);
+
/*
* Return the size of ST table. If ST table is not in TPH Requester Extended
* Capability space, return 0. Otherwise return the ST Table Size + 1.
diff --git a/include/linux/pci-tph.h b/include/linux/pci-tph.h
index be68cd17f2f8..7743af6fe432 100644
--- a/include/linux/pci-tph.h
+++ b/include/linux/pci-tph.h
@@ -9,6 +9,8 @@
#ifndef LINUX_PCI_TPH_H
#define LINUX_PCI_TPH_H
+#include <linux/pci_regs.h>
+
/*
* According to the ECN for PCI Firmware Spec, Steering Tag can be different
* depending on the memory type: Volatile Memory or Persistent Memory. When a
@@ -30,6 +32,8 @@ void pcie_disable_tph(struct pci_dev *pdev);
int pcie_enable_tph(struct pci_dev *pdev, int mode);
u16 pcie_tph_get_st_table_size(struct pci_dev *pdev);
u32 pcie_tph_get_st_table_loc(struct pci_dev *pdev);
+u8 pcie_tph_enabled_req_type(struct pci_dev *pdev);
+u8 pcie_tph_completer_type(struct pci_dev *pdev);
#else
static inline int pcie_tph_set_st_entry(struct pci_dev *pdev,
unsigned int index, u16 tag)
@@ -41,6 +45,10 @@ static inline int pcie_tph_get_cpu_st(struct pci_dev *dev,
static inline void pcie_disable_tph(struct pci_dev *pdev) { }
static inline int pcie_enable_tph(struct pci_dev *pdev, int mode)
{ return -EINVAL; }
+static inline u8 pcie_tph_enabled_req_type(struct pci_dev *pdev)
+{ return PCI_TPH_REQ_DISABLE; }
+static inline u8 pcie_tph_completer_type(struct pci_dev *pdev)
+{ return PCI_EXP_DEVCAP2_TPH_COMP_NONE; }
#endif
#endif /* LINUX_PCI_TPH_H */
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH v8 2/4] dma-buf: add optional get_pci_tph() callback
2026-06-15 6:58 [PATCH v8 0/4] vfio/dma-buf: add TPH support for peer-to-peer access Zhiping Zhang
2026-06-15 6:58 ` [PATCH v8 1/4] PCI/TPH: Add requester/completer type helpers Zhiping Zhang
@ 2026-06-15 6:58 ` Zhiping Zhang
2026-06-17 9:41 ` Leon Romanovsky
2026-06-15 6:59 ` [PATCH v8 3/4] vfio/pci: implement get_pci_tph and DMA_BUF_TPH feature Zhiping Zhang
2026-06-15 6:59 ` [PATCH v8 4/4] RDMA/mlx5: get tph for p2p access when registering dma-buf mr Zhiping Zhang
3 siblings, 1 reply; 7+ messages in thread
From: Zhiping Zhang @ 2026-06-15 6:58 UTC (permalink / raw)
To: Jason Gunthorpe, Leon Romanovsky, Michael Guralnik, Sumit Semwal,
Christian Konig, Alex Williamson, Bjorn Helgaas
Cc: kvm, linux-rdma, linux-pci, dri-devel, Zhiping Zhang
Add an optional dma_buf_ops.get_pci_tph callback and a
DMA-buf importer wrapper, dma_buf_get_pci_tph().
TPH stands for PCIe TLP Processing Hints. 8-bit ST and 16-bit Extended ST are
distinct PCIe TPH namespaces, so the importer requests the namespace it
can emit and the exporter returns the matching ST/PH tuple or
-EOPNOTSUPP.
dma_buf_get_pci_tph() is the importer entry point. It requires
&dmabuf->resv to be held while the callback runs and returns
-EOPNOTSUPP when the exporter does not provide PCI TPH metadata.
The first user is VFIO_DEVICE_FEATURE_DMA_BUF_TPH in vfio-pci, with
mlx5 as the first importer.
Signed-off-by: Zhiping Zhang <zhipingz@meta.com>
---
drivers/dma-buf/dma-buf.c | 25 +++++++++++++++++++++++++
include/linux/dma-buf.h | 16 ++++++++++++++++
2 files changed, 41 insertions(+)
diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index d504c636dc29..7a4c9b0d5dab 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -1144,6 +1144,31 @@ void dma_buf_unpin(struct dma_buf_attachment *attach)
}
EXPORT_SYMBOL_NS_GPL(dma_buf_unpin, "DMA_BUF");
+/**
+ * dma_buf_get_pci_tph - Retrieve PCIe TLP Processing Hint (TPH) metadata
+ * @dmabuf: DMA buffer to query
+ * @extended: false for 8-bit ST, true for 16-bit Extended ST
+ * @steering_tag: returns the raw steering tag for the requested namespace
+ * @ph: returns the TPH processing hint
+ *
+ * Wrapper for the optional &dma_buf_ops.get_pci_tph callback.
+ *
+ * Must be called with &dma_buf.resv held. Returns -EOPNOTSUPP if the
+ * exporter does not implement the callback or has no metadata for the
+ * requested namespace.
+ */
+int dma_buf_get_pci_tph(struct dma_buf *dmabuf, bool extended,
+ u16 *steering_tag, u8 *ph)
+{
+ dma_resv_assert_held(dmabuf->resv);
+
+ if (!dmabuf->ops->get_pci_tph)
+ return -EOPNOTSUPP;
+
+ return dmabuf->ops->get_pci_tph(dmabuf, extended, steering_tag, ph);
+}
+EXPORT_SYMBOL_NS_GPL(dma_buf_get_pci_tph, "DMA_BUF");
+
/**
* dma_buf_map_attachment - Returns the scatterlist table of the attachment;
* mapped into _device_ address space. Is a wrapper for map_dma_buf() of the
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index d1203da56fc5..5e7b69a40f3d 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -113,6 +113,20 @@ struct dma_buf_ops {
*/
void (*unpin)(struct dma_buf_attachment *attach);
+ /**
+ * @get_pci_tph:
+ * @dmabuf: DMA buffer for which to retrieve TPH metadata
+ * @extended: false for 8-bit ST, true for 16-bit Extended ST
+ * @steering_tag: Returns the raw TPH steering tag for the requested
+ * namespace
+ * @ph: Returns the TPH processing hint (2-bit value)
+ *
+ * Optional callback for dma_buf_get_pci_tph(). Called with
+ * &dma_buf.resv held.
+ */
+ int (*get_pci_tph)(struct dma_buf *dmabuf, bool extended,
+ u16 *steering_tag, u8 *ph);
+
/**
* @map_dma_buf:
*
@@ -563,6 +577,8 @@ void dma_buf_detach(struct dma_buf *dmabuf,
struct dma_buf_attachment *attach);
int dma_buf_pin(struct dma_buf_attachment *attach);
void dma_buf_unpin(struct dma_buf_attachment *attach);
+int dma_buf_get_pci_tph(struct dma_buf *dmabuf, bool extended,
+ u16 *steering_tag, u8 *ph);
struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info);
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH v8 3/4] vfio/pci: implement get_pci_tph and DMA_BUF_TPH feature
2026-06-15 6:58 [PATCH v8 0/4] vfio/dma-buf: add TPH support for peer-to-peer access Zhiping Zhang
2026-06-15 6:58 ` [PATCH v8 1/4] PCI/TPH: Add requester/completer type helpers Zhiping Zhang
2026-06-15 6:58 ` [PATCH v8 2/4] dma-buf: add optional get_pci_tph() callback Zhiping Zhang
@ 2026-06-15 6:59 ` Zhiping Zhang
2026-06-15 6:59 ` [PATCH v8 4/4] RDMA/mlx5: get tph for p2p access when registering dma-buf mr Zhiping Zhang
3 siblings, 0 replies; 7+ messages in thread
From: Zhiping Zhang @ 2026-06-15 6:59 UTC (permalink / raw)
To: Jason Gunthorpe, Leon Romanovsky, Michael Guralnik, Sumit Semwal,
Christian Konig, Alex Williamson, Bjorn Helgaas
Cc: kvm, linux-rdma, linux-pci, dri-devel, Zhiping Zhang
Implement dma-buf get_pci_tph for vfio-pci exported dma-bufs and add
VFIO_DEVICE_FEATURE_DMA_BUF_TPH so userspace can publish TPH metadata
for a VFIO-owned device.
8-bit ST and 16-bit Extended ST are distinct PCIe TPH namespaces; the
uAPI carries both with explicit validity flags, and get_pci_tph()
returns the value matching the importer's requested namespace or
-EOPNOTSUPP.
Publish and read the TPH descriptor under dmabuf->resv, matching the
locking used for other importer-visible dma-buf state. The SET ioctl
takes dma_resv_lock_interruptible(), while the callback runs under
DMA-buf's asserted resv lock.
Reject requests the device cannot consume as a completer:
pcie_tph_completer_type() must report at least
PCI_EXP_DEVCAP2_TPH_COMP_TPH_ONLY, and Extended ST requires
PCI_EXP_DEVCAP2_TPH_COMP_EXT_TPH. Make PROBE follow the same hardware
gate so the feature only probes as supported when the device can really
consume it.
Signed-off-by: Zhiping Zhang <zhipingz@meta.com>
---
drivers/vfio/pci/vfio_pci_core.c | 3 +
drivers/vfio/pci/vfio_pci_dmabuf.c | 97 +++++++++++++++++++++++++++++-
drivers/vfio/pci/vfio_pci_priv.h | 12 ++++
include/uapi/linux/vfio.h | 37 ++++++++++++
4 files changed, 148 insertions(+), 1 deletion(-)
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 050e7542952e..4fa36f2f7555 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1569,6 +1569,9 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
case VFIO_DEVICE_FEATURE_DMA_BUF:
return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
+ case VFIO_DEVICE_FEATURE_DMA_BUF_TPH:
+ return vfio_pci_core_feature_dma_buf_tph(vdev, flags, arg,
+ argsz);
default:
return -ENOTTY;
}
diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
index 1a177ce7de54..6284ce208a6a 100644
--- a/drivers/vfio/pci/vfio_pci_dmabuf.c
+++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
@@ -3,6 +3,7 @@
*/
#include <linux/dma-buf-mapping.h>
#include <linux/pci-p2pdma.h>
+#include <linux/pci-tph.h>
#include <linux/dma-resv.h>
#include "vfio_pci_priv.h"
@@ -19,7 +20,14 @@ struct vfio_pci_dma_buf {
u32 nr_ranges;
struct kref kref;
struct completion comp;
- u8 revoked : 1;
+
+ /* Protected by dmabuf->resv. */
+ u16 tph_st_ext;
+ u8 tph_st;
+ u8 revoked:1;
+ u8 tph_st_valid:1;
+ u8 tph_st_ext_valid:1;
+ u8 tph_ph:2;
};
static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
@@ -69,6 +77,26 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
return ret;
}
+static int vfio_pci_dma_buf_get_pci_tph(struct dma_buf *dmabuf, bool extended,
+ u16 *steering_tag, u8 *ph)
+{
+ struct vfio_pci_dma_buf *priv = dmabuf->priv;
+
+ dma_resv_assert_held(dmabuf->resv);
+
+ if (extended) {
+ if (!priv->tph_st_ext_valid)
+ return -EOPNOTSUPP;
+ *steering_tag = priv->tph_st_ext;
+ } else {
+ if (!priv->tph_st_valid)
+ return -EOPNOTSUPP;
+ *steering_tag = priv->tph_st;
+ }
+ *ph = priv->tph_ph;
+ return 0;
+}
+
static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
struct sg_table *sgt,
enum dma_data_direction dir)
@@ -101,6 +129,7 @@ static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
.attach = vfio_pci_dma_buf_attach,
+ .get_pci_tph = vfio_pci_dma_buf_get_pci_tph,
.map_dma_buf = vfio_pci_dma_buf_map,
.unmap_dma_buf = vfio_pci_dma_buf_unmap,
.release = vfio_pci_dma_buf_release,
@@ -333,6 +362,72 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
return ret;
}
+int vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev,
+ u32 flags,
+ struct vfio_device_feature_dma_buf_tph __user *arg,
+ size_t argsz)
+{
+ struct vfio_device_feature_dma_buf_tph set_tph;
+ struct vfio_pci_dma_buf *priv;
+ struct dma_buf *dmabuf;
+ u8 comp;
+ int ret;
+
+ comp = pcie_tph_completer_type(vdev->pdev);
+ if (comp == PCI_EXP_DEVCAP2_TPH_COMP_NONE)
+ return -EOPNOTSUPP;
+
+ ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
+ sizeof(set_tph));
+ if (ret != 1)
+ return ret;
+
+ if (copy_from_user(&set_tph, arg, sizeof(set_tph)))
+ return -EFAULT;
+
+ if (set_tph.flags & ~(VFIO_DMA_BUF_TPH_ST | VFIO_DMA_BUF_TPH_ST_EXT))
+ return -EINVAL;
+
+ if (set_tph.ph & ~0x3)
+ return -EINVAL;
+
+ if ((set_tph.flags & VFIO_DMA_BUF_TPH_ST_EXT) &&
+ comp != PCI_EXP_DEVCAP2_TPH_COMP_EXT_TPH)
+ return -EOPNOTSUPP;
+
+ dmabuf = dma_buf_get(set_tph.dmabuf_fd);
+ if (IS_ERR(dmabuf))
+ return PTR_ERR(dmabuf);
+
+ if (dmabuf->ops != &vfio_pci_dmabuf_ops) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ priv = dmabuf->priv;
+ if (priv->vdev != vdev) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ ret = dma_resv_lock_interruptible(dmabuf->resv, NULL);
+ if (ret)
+ goto out_put;
+
+ priv->tph_st = set_tph.steering_tag;
+ priv->tph_st_ext = set_tph.steering_tag_ext;
+ priv->tph_ph = set_tph.ph;
+ priv->tph_st_valid = !!(set_tph.flags & VFIO_DMA_BUF_TPH_ST);
+ priv->tph_st_ext_valid =
+ !!(set_tph.flags & VFIO_DMA_BUF_TPH_ST_EXT);
+ dma_resv_unlock(dmabuf->resv);
+ ret = 0;
+
+out_put:
+ dma_buf_put(dmabuf);
+ return ret;
+}
+
void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
{
struct vfio_pci_dma_buf *priv;
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index fca9d0dfac90..c58f369be4b3 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -118,6 +118,10 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
struct vfio_device_feature_dma_buf __user *arg,
size_t argsz);
+int vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev,
+ u32 flags,
+ struct vfio_device_feature_dma_buf_tph __user *arg,
+ size_t argsz);
void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev);
void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked);
#else
@@ -128,6 +132,14 @@ vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
{
return -ENOTTY;
}
+
+static inline int
+vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev, u32 flags,
+ struct vfio_device_feature_dma_buf_tph __user *arg,
+ size_t argsz)
+{
+ return -ENOTTY;
+}
static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
{
}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 5de618a3a5ee..2d30ba43e2cf 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1534,6 +1534,43 @@ struct vfio_device_feature_dma_buf {
*/
#define VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2 12
+/**
+ * Upon VFIO_DEVICE_FEATURE_SET associate TPH (TLP Processing Hints) metadata
+ * with a vfio-exported dma-buf. The dma-buf must have been created by
+ * VFIO_DEVICE_FEATURE_DMA_BUF on this device, and the device must report
+ * TPH Completer support in Device Capabilities 2 (bits 13:12); requests
+ * carrying VFIO_DMA_BUF_TPH_ST_EXT additionally require the device to
+ * report the Extended TPH Completer encoding. Otherwise the ioctl
+ * returns -EOPNOTSUPP.
+ *
+ * dmabuf_fd is the file descriptor returned by VFIO_DEVICE_FEATURE_DMA_BUF.
+ *
+ * 8-bit ST (steering_tag) and 16-bit Extended ST (steering_tag_ext) are
+ * distinct namespaces. Userspace supplies whichever values are valid and sets
+ * the matching VFIO_DMA_BUF_TPH_ST / VFIO_DMA_BUF_TPH_ST_EXT bits in @flags;
+ * an importer requests one namespace and receives the matching value.
+ *
+ * @flags == 0 marks any previously published ST / Extended-ST as invalid
+ * for future PCI TPH queries on this dma-buf.
+ *
+ * ph is the 2-bit TLP Processing Hint and must be in the range [0, 3].
+ *
+ * Userspace must publish TPH before handing the dma-buf fd to an importer.
+ * Calling SET again replaces the published values.
+ */
+#define VFIO_DEVICE_FEATURE_DMA_BUF_TPH 13
+
+#define VFIO_DMA_BUF_TPH_ST (1 << 0)
+#define VFIO_DMA_BUF_TPH_ST_EXT (1 << 1)
+
+struct vfio_device_feature_dma_buf_tph {
+ __s32 dmabuf_fd;
+ __u32 flags;
+ __u16 steering_tag_ext;
+ __u8 steering_tag;
+ __u8 ph;
+};
+
/* -------- API for Type1 VFIO IOMMU -------- */
/**
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH v8 4/4] RDMA/mlx5: get tph for p2p access when registering dma-buf mr
2026-06-15 6:58 [PATCH v8 0/4] vfio/dma-buf: add TPH support for peer-to-peer access Zhiping Zhang
` (2 preceding siblings ...)
2026-06-15 6:59 ` [PATCH v8 3/4] vfio/pci: implement get_pci_tph and DMA_BUF_TPH feature Zhiping Zhang
@ 2026-06-15 6:59 ` Zhiping Zhang
2026-06-17 9:25 ` Leon Romanovsky
3 siblings, 1 reply; 7+ messages in thread
From: Zhiping Zhang @ 2026-06-15 6:59 UTC (permalink / raw)
To: Jason Gunthorpe, Leon Romanovsky, Michael Guralnik, Sumit Semwal,
Christian Konig, Alex Williamson, Bjorn Helgaas
Cc: kvm, linux-rdma, linux-pci, dri-devel, Zhiping Zhang
Query dma-buf PCI TPH metadata when registering a dma-buf MR for
peer-to-peer access to a PCIe endpoint and use it to program
requester-side TPH on the outbound mkey. If the exporter has no
metadata, fall back to the existing no-TPH path.
Use mlx5_st_alloc_index_by_tag() to translate exporter-provided
steering tags into local ST entries when table mode is active, and add
mlx5_st_get_index() for DMAH-backed flows that already carry an ST
index.
For TPH-backed FRMRs, keep the extra ST-table reference tied to MR
lifetime rather than pooled mkey lifetime. Acquire the ref before MR
creation and release it again when the MR is returned to the pool or
the backing mkey is destroyed, while leaving the generic FRMR pool core
unchanged.
Import the DMA_BUF namespace for the new dma_buf_get_pci_tph() call so
modular mlx5_ib builds link cleanly.
Signed-off-by: Zhiping Zhang <zhipingz@meta.com>
---
drivers/infiniband/hw/mlx5/main.c | 1 +
drivers/infiniband/hw/mlx5/mr.c | 105 +++++++++++++++++-
.../net/ethernet/mellanox/mlx5/core/lib/st.c | 49 ++++++--
include/linux/mlx5/driver.h | 12 ++
4 files changed, 157 insertions(+), 10 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 61078281953d..6d14909c8a2c 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -60,6 +60,7 @@
MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) IB driver");
MODULE_LICENSE("Dual BSD/GPL");
+MODULE_IMPORT_NS("DMA_BUF");
struct mlx5_ib_event_work {
struct work_struct work;
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 8984b31b9429..d73005bd32a6 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -39,6 +39,7 @@
#include <linux/delay.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
+#include <linux/pci-tph.h>
#include <rdma/frmr_pools.h>
#include <rdma/ib_umem_odp.h>
#include "dm.h"
@@ -167,6 +168,32 @@ static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev,
#define MLX5_FRMR_POOLS_KERNEL_KEY_PH_MASK GENMASK_ULL(23, 16)
#define MLX5_FRMR_POOLS_KERNEL_KEY_ST_INDEX_MASK GENMASK_ULL(15, 0)
+static int mlx5_ib_get_frmr_st_handle_ref(struct mlx5_ib_dev *dev,
+ u16 st_index)
+{
+ if (st_index == MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX)
+ return 0;
+
+ return mlx5_st_get_index(dev->mdev, st_index);
+}
+
+static void mlx5_ib_put_st_index_ref(struct mlx5_ib_dev *dev, u16 st_index)
+{
+ if (st_index == MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX)
+ return;
+
+ mlx5_st_dealloc_index(dev->mdev, st_index);
+}
+
+static void mlx5_ib_put_frmr_st_handle_ref(struct mlx5_ib_dev *dev,
+ u64 kernel_vendor_key)
+{
+ u16 st_index = FIELD_GET(MLX5_FRMR_POOLS_KERNEL_KEY_ST_INDEX_MASK,
+ kernel_vendor_key);
+
+ mlx5_ib_put_st_index_ref(dev, st_index);
+}
+
static struct mlx5_ib_mr *
_mlx5_frmr_pool_alloc(struct mlx5_ib_dev *dev, struct ib_umem *umem,
int access_flags, int access_mode,
@@ -218,7 +245,9 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
0 :
MLX5_FRMR_POOLS_KEY_ACCESS_MODE_KSM_MASK,
.num_dma_blocks = ndescs,
- .kernel_vendor_key = 0, /* no PH and no ST index */
+ .kernel_vendor_key =
+ FIELD_PREP(MLX5_FRMR_POOLS_KERNEL_KEY_ST_INDEX_MASK,
+ MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX),
};
struct mlx5_ib_mr *mr;
int ret;
@@ -335,6 +364,7 @@ static int mlx5r_build_frmr_key(struct ib_device *device,
get_unchangeable_access_flags(dev, in->access_flags);
out->vendor_key = in->vendor_key;
out->num_dma_blocks = in->num_dma_blocks;
+ out->kernel_vendor_key = in->kernel_vendor_key;
return 0;
}
@@ -557,6 +587,9 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
mr->ibmr.pd = pd;
mr->access_flags = access_flags;
mr->page_shift = order_base_2(page_size);
+ mr->ibmr.frmr.key.kernel_vendor_key =
+ FIELD_PREP(MLX5_FRMR_POOLS_KERNEL_KEY_ST_INDEX_MASK, st_index) |
+ FIELD_PREP(MLX5_FRMR_POOLS_KERNEL_KEY_PH_MASK, ph);
inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
if (populate)
@@ -755,6 +788,12 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length);
if (xlt_with_umr) {
+ err = mlx5_ib_get_frmr_st_handle_ref(dev, st_index);
+ if (err) {
+ ib_umem_release(umem);
+ return ERR_PTR(err);
+ }
+
mr = alloc_cacheable_mr(pd, umem, iova, access_flags,
MLX5_MKC_ACCESS_MODE_MTT,
st_index, ph);
@@ -769,6 +808,8 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
mutex_unlock(&dev->slow_path_mutex);
}
if (IS_ERR(mr)) {
+ if (xlt_with_umr)
+ mlx5_ib_put_st_index_ref(dev, st_index);
ib_umem_release(umem);
return ERR_CAST(mr);
}
@@ -901,6 +942,52 @@ static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
.invalidate_mappings = mlx5_ib_dmabuf_invalidate_cb,
};
+static void get_pci_tph_mr_dmabuf(struct mlx5_ib_dev *dev, struct dma_buf *dmabuf,
+ u16 *st_index, u8 *ph)
+{
+ u16 local_st_index;
+ u16 steering_tag;
+ u8 local_ph;
+ bool extended;
+ int ret;
+
+ switch (pcie_tph_enabled_req_type(dev->mdev->pdev)) {
+ case PCI_TPH_REQ_TPH_ONLY:
+ extended = false;
+ break;
+ case PCI_TPH_REQ_EXT_TPH:
+ extended = true;
+ break;
+ default:
+ return;
+ }
+
+ dma_resv_lock(dmabuf->resv, NULL);
+ ret = dma_buf_get_pci_tph(dmabuf, extended, &steering_tag, &local_ph);
+ dma_resv_unlock(dmabuf->resv);
+ if (ret) {
+ if (ret != -EOPNOTSUPP)
+ mlx5_ib_dbg(dev, "get_pci_tph failed (%d)\n", ret);
+ return;
+ }
+
+ ret = mlx5_st_alloc_index_by_tag(dev->mdev, steering_tag,
+ &local_st_index);
+ if (ret) {
+ mlx5_ib_dbg(dev, "st_alloc_index_by_tag failed (%d)\n", ret);
+ return;
+ }
+
+ *st_index = local_st_index;
+ *ph = local_ph;
+}
+
+static void mlx5_ib_mr_put_frmr_st_handle_ref(struct mlx5_ib_mr *mr)
+{
+ mlx5_ib_put_frmr_st_handle_ref(mr_to_mdev(mr),
+ mr->ibmr.frmr.key.kernel_vendor_key);
+}
+
static struct ib_mr *
reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
u64 offset, u64 length, u64 virt_addr,
@@ -943,12 +1030,22 @@ reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
ph = dmah->ph;
if (dmah->valid_fields & BIT(IB_DMAH_CPU_ID_EXISTS))
st_index = mdmah->st_index;
+
+ err = mlx5_ib_get_frmr_st_handle_ref(dev, st_index);
+ if (err) {
+ ib_umem_release(&umem_dmabuf->umem);
+ return ERR_PTR(err);
+ }
+ } else {
+ get_pci_tph_mr_dmabuf(dev, umem_dmabuf->attach->dmabuf,
+ &st_index, &ph);
}
mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
access_flags, access_mode,
st_index, ph);
if (IS_ERR(mr)) {
+ mlx5_ib_put_st_index_ref(dev, st_index);
ib_umem_release(&umem_dmabuf->umem);
return ERR_CAST(mr);
}
@@ -1384,8 +1481,10 @@ static int mlx5r_handle_mkey_cleanup(struct mlx5_ib_mr *mr)
int ret;
if (mr->ibmr.frmr.pool && !mlx5_umr_revoke_mr_with_lock(mr) &&
- !ib_frmr_pool_push(mr->ibmr.device, &mr->ibmr))
+ !ib_frmr_pool_push(mr->ibmr.device, &mr->ibmr)) {
+ mlx5_ib_mr_put_frmr_st_handle_ref(mr);
return 0;
+ }
if (is_odp)
mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);
@@ -1406,6 +1505,8 @@ static int mlx5r_handle_mkey_cleanup(struct mlx5_ib_mr *mr)
dma_resv_unlock(
to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
}
+ if (!ret)
+ mlx5_ib_mr_put_frmr_st_handle_ref(mr);
return ret;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c
index 7cedc348790d..877b37b4e639 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c
@@ -92,23 +92,18 @@ void mlx5_st_destroy(struct mlx5_core_dev *dev)
kfree(st);
}
-int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
- unsigned int cpu_uid, u16 *st_index)
+int mlx5_st_alloc_index_by_tag(struct mlx5_core_dev *dev, u16 tag,
+ u16 *st_index)
{
struct mlx5_st_idx_data *idx_data;
struct mlx5_st *st = dev->st;
unsigned long index;
u32 xa_id;
- u16 tag;
- int ret;
+ int ret = 0;
if (!st)
return -EOPNOTSUPP;
- ret = pcie_tph_get_cpu_st(dev->pdev, mem_type, cpu_uid, &tag);
- if (ret)
- return ret;
-
if (st->direct_mode) {
*st_index = tag;
return 0;
@@ -152,8 +147,46 @@ int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
mutex_unlock(&st->lock);
return ret;
}
+EXPORT_SYMBOL_GPL(mlx5_st_alloc_index_by_tag);
+
+int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
+ unsigned int cpu_uid, u16 *st_index)
+{
+ u16 tag;
+ int ret;
+
+ ret = pcie_tph_get_cpu_st(dev->pdev, mem_type, cpu_uid, &tag);
+ if (ret)
+ return ret;
+
+ return mlx5_st_alloc_index_by_tag(dev, tag, st_index);
+}
EXPORT_SYMBOL_GPL(mlx5_st_alloc_index);
+int mlx5_st_get_index(struct mlx5_core_dev *dev, u16 st_index)
+{
+ struct mlx5_st_idx_data *idx_data;
+ struct mlx5_st *st = dev->st;
+ int ret = 0;
+
+ if (!st)
+ return -EOPNOTSUPP;
+
+ if (st->direct_mode)
+ return 0;
+
+ mutex_lock(&st->lock);
+ idx_data = xa_load(&st->idx_xa, st_index);
+ if (WARN_ON_ONCE(!idx_data))
+ ret = -EINVAL;
+ else
+ refcount_inc(&idx_data->usecount);
+ mutex_unlock(&st->lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(mlx5_st_get_index);
+
int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index)
{
struct mlx5_st_idx_data *idx_data;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 04b96c5abb57..0480b5c4f189 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1166,10 +1166,22 @@ int mlx5_dm_sw_icm_dealloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type
u64 length, u16 uid, phys_addr_t addr, u32 obj_id);
#ifdef CONFIG_PCIE_TPH
+int mlx5_st_alloc_index_by_tag(struct mlx5_core_dev *dev, u16 tag,
+ u16 *st_index);
+int mlx5_st_get_index(struct mlx5_core_dev *dev, u16 st_index);
int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
unsigned int cpu_uid, u16 *st_index);
int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index);
#else
+static inline int mlx5_st_alloc_index_by_tag(struct mlx5_core_dev *dev,
+ u16 tag, u16 *st_index)
+{
+ return -EOPNOTSUPP;
+}
+static inline int mlx5_st_get_index(struct mlx5_core_dev *dev, u16 st_index)
+{
+ return -EOPNOTSUPP;
+}
static inline int mlx5_st_alloc_index(struct mlx5_core_dev *dev,
enum tph_mem_type mem_type,
unsigned int cpu_uid, u16 *st_index)
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 7+ messages in thread