All of lore.kernel.org
 help / color / mirror / Atom feed
From: Alex Williamson <alex@shazbot.org>
To: Zhiping Zhang <zhipingz@meta.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>, Leon Romanovsky <leon@kernel.org>,
	Sumit Semwal <sumit.semwal@linaro.org>,
	Christian Konig <christian.koenig@amd.com>,
	Bjorn Helgaas <helgaas@kernel.org>, <kvm@vger.kernel.org>,
	<linux-rdma@vger.kernel.org>, <linux-pci@vger.kernel.org>,
	<netdev@vger.kernel.org>, <dri-devel@lists.freedesktop.org>,
	Keith Busch <kbusch@kernel.org>, Yochai Cohen <yochai@nvidia.com>,
	Yishai Hadas <yishaih@nvidia.com>,
	alex@shazbot.org
Subject: Re: [PATCH v5 4/4] RDMA/mlx5: get tph for p2p access when registering dma-buf mr
Date: Wed, 27 May 2026 13:00:45 -0600	[thread overview]
Message-ID: <20260527130045.704a4502@shazbot.org> (raw)
In-Reply-To: <20260526144401.1485788-5-zhipingz@meta.com>

On Tue, 26 May 2026 07:43:56 -0700
Zhiping Zhang <zhipingz@meta.com> wrote:

> Query dma-buf TPH metadata when registering a dma-buf MR for peer-to-
> peer access and translate the returned steering tag into an mlx5 ST
> index. Keep the DMAH path as the first priority and only fall back to
> DMA-buf metadata when no DMAH is supplied.
> 
> Track per-MR ownership of the allocated ST index and release it on MR
> setup failure, destroy, and FRMR-pool reuse. Release the ST index before
> the MR is pushed back into the FRMR pool, and free mlx5_st_idx_data when
> its refcount reaches zero so repeated allocation/deallocation does not
> leak memory.
> 
> Signed-off-by: Zhiping Zhang <zhipingz@meta.com>
> ---
>  drivers/infiniband/hw/mlx5/mlx5_ib.h          |  6 ++
>  drivers/infiniband/hw/mlx5/mr.c               | 86 ++++++++++++++++++-
>  .../net/ethernet/mellanox/mlx5/core/lib/st.c  | 28 ++++--
>  include/linux/mlx5/driver.h                   |  7 ++
>  4 files changed, 115 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
> index e156dc4d7529..4ab867392267 100644
> --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
> +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
> @@ -721,6 +721,12 @@ struct mlx5_ib_mr {
>  			u8 revoked :1;
>  			/* Indicates previous dmabuf page fault occurred */
>  			u8 dmabuf_faulted:1;
> +			/* Set when the MR owns dmabuf_st_index and must
> +			 * release it via mlx5_st_dealloc_index() once the
> +			 * firmware mkey is no longer referencing it.
> +			 */
> +			u8 dmabuf_st_owned:1;
> +			u16 dmabuf_st_index;
>  			struct mlx5_ib_mkey null_mmkey;
>  		};
>  	};
> diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
> index 3b6da45061a5..8059b5e4da97 100644
> --- a/drivers/infiniband/hw/mlx5/mr.c
> +++ b/drivers/infiniband/hw/mlx5/mr.c
> @@ -38,6 +38,7 @@
>  #include <linux/delay.h>
>  #include <linux/dma-buf.h>
>  #include <linux/dma-resv.h>
> +#include <linux/pci-tph.h>
>  #include <rdma/frmr_pools.h>
>  #include <rdma/ib_umem_odp.h>
>  #include "dm.h"
> @@ -46,6 +47,8 @@
>  #include "data_direct.h"
>  #include "dmah.h"
>  
> +MODULE_IMPORT_NS("DMA_BUF");
> +

This doesn't appear to add any dma-buf namespace dependencies.

>  static int mkey_max_umr_order(struct mlx5_ib_dev *dev)
>  {
>  	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
> @@ -899,6 +902,63 @@ static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
>  	.invalidate_mappings = mlx5_ib_dmabuf_invalidate_cb,
>  };
>  
> +/*
> + * Query TPH metadata from @dmabuf and translate the raw steering tag into
> + * an mlx5 ST index. On success, returns 0 and the caller becomes the
> + * owner of *@st_index (must be released with mlx5_st_dealloc_index()
> + * once the firmware mkey no longer references it). On any failure
> + * *@st_index and *@ph are left as the no-TPH defaults set by the caller.
> + *
> + * @dmabuf must already be referenced by the caller (e.g. via the umem's
> + * attachment) so we don't re-resolve the user's fd here and avoid a
> + * dup2() TOCTOU between umem creation and TPH lookup.
> + */
> +static void get_tph_mr_dmabuf(struct mlx5_ib_dev *dev, struct dma_buf *dmabuf,
> +			      u16 *st_index, u8 *ph)
> +{
> +	u8 req_type;
> +	u16 steering_tag;
> +	u8 st_width;
> +	int ret;
> +
> +	if (!dmabuf->ops->get_tph)
> +		return;
> +
> +	req_type = pcie_tph_enabled_req_type(dev->mdev->pdev);
> +	switch (req_type) {
> +	case PCI_TPH_REQ_TPH_ONLY:
> +		st_width = 8;
> +		break;
> +	case PCI_TPH_REQ_EXT_TPH:
> +		st_width = 16;
> +		break;
> +	default:
> +		return;
> +	}
> +
> +	ret = dmabuf->ops->get_tph(dmabuf, &steering_tag, ph, st_width);
> +	if (ret) {
> +		mlx5_ib_dbg(dev, "get_tph failed (%d)\n", ret);
> +		*ph = MLX5_IB_NO_PH;
> +		return;
> +	}
> +
> +	ret = mlx5_st_alloc_index_by_tag(dev->mdev, steering_tag, st_index);
> +	if (ret) {
> +		*ph = MLX5_IB_NO_PH;
> +		mlx5_ib_dbg(dev, "st_alloc_index_by_tag failed (%d)\n", ret);
> +	}
> +}

ph handling is inconsistent, why not use a local variable and only set
the caller's pointer on success?

> +
> +static void mlx5_ib_mr_put_dmabuf_st(struct mlx5_ib_mr *mr)
> +{
> +	if (mr->umem && mr->dmabuf_st_owned) {
> +		mlx5_st_dealloc_index(mr_to_mdev(mr)->mdev,
> +				      mr->dmabuf_st_index);
> +		mr->dmabuf_st_owned = 0;
> +	}
> +}
> +
>  static struct ib_mr *
>  reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
>  		   u64 offset, u64 length, u64 virt_addr,
> @@ -941,16 +1001,26 @@ reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
>  		ph = dmah->ph;
>  		if (dmah->valid_fields & BIT(IB_DMAH_CPU_ID_EXISTS))
>  			st_index = mdmah->st_index;
> +	} else {
> +		get_tph_mr_dmabuf(dev, umem_dmabuf->attach->dmabuf,
> +				  &st_index, &ph);
>  	}
>  
>  	mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
>  				access_flags, access_mode,
>  				st_index, ph);
>  	if (IS_ERR(mr)) {
> +		if (!dmah && st_index != MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX)
> +			mlx5_st_dealloc_index(dev->mdev, st_index);
>  		ib_umem_release(&umem_dmabuf->umem);
>  		return ERR_CAST(mr);
>  	}
>  
> +	if (!dmah && st_index != MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX) {
> +		mr->dmabuf_st_index = st_index;
> +		mr->dmabuf_st_owned = 1;
> +	}
> +
>  	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
>  
>  	atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
> @@ -1377,9 +1447,17 @@ static int mlx5r_handle_mkey_cleanup(struct mlx5_ib_mr *mr)
>  	bool is_odp = is_odp_mr(mr);
>  	int ret;
>  
> -	if (mr->ibmr.frmr.pool && !mlx5_umr_revoke_mr_with_lock(mr) &&
> -	    !ib_frmr_pool_push(mr->ibmr.device, &mr->ibmr))
> -		return 0;
> +	if (mr->ibmr.frmr.pool && !mlx5_umr_revoke_mr_with_lock(mr)) {
> +		/*
> +		 * The mkey has been revoked: firmware no longer references
> +		 * dmabuf_st_index, so release it before this mr can re-enter
> +		 * the FRMR cache for reuse by another registration.
> +		 */
> +		mlx5_ib_mr_put_dmabuf_st(mr);
> +
> +		if (!ib_frmr_pool_push(mr->ibmr.device, &mr->ibmr))
> +			return 0;
> +	}
>  
>  	if (is_odp)
>  		mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);
> @@ -1400,6 +1478,8 @@ static int mlx5r_handle_mkey_cleanup(struct mlx5_ib_mr *mr)
>  		dma_resv_unlock(
>  			to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
>  	}
> +	if (!ret)
> +		mlx5_ib_mr_put_dmabuf_st(mr);
>  	return ret;
>  }
>  
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c
> index 997be91f0a13..8929c17c88bc 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c
> @@ -29,7 +29,7 @@ struct mlx5_st *mlx5_st_create(struct mlx5_core_dev *dev)
>  	u8 direct_mode = 0;
>  	u16 num_entries;
>  	u32 tbl_loc;
> -	int ret;
> +	int ret = 0;

Unnecessary change.

>  
>  	if (!MLX5_CAP_GEN(dev, mkey_pcie_tph))
>  		return NULL;
> @@ -92,23 +92,18 @@ void mlx5_st_destroy(struct mlx5_core_dev *dev)
>  	kfree(st);
>  }
>  
> -int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
> -			unsigned int cpu_uid, u16 *st_index)
> +int mlx5_st_alloc_index_by_tag(struct mlx5_core_dev *dev, u16 tag,
> +			       u16 *st_index)
>  {
>  	struct mlx5_st_idx_data *idx_data;
>  	struct mlx5_st *st = dev->st;
>  	unsigned long index;
>  	u32 xa_id;
> -	u16 tag;
> -	int ret;
> +	int ret = 0;
>  
>  	if (!st)
>  		return -EOPNOTSUPP;
>  
> -	ret = pcie_tph_get_cpu_st(dev->pdev, mem_type, cpu_uid, &tag);
> -	if (ret)
> -		return ret;
> -
>  	if (st->direct_mode) {
>  		*st_index = tag;
>  		return 0;
> @@ -152,6 +147,20 @@ int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
>  	mutex_unlock(&st->lock);
>  	return ret;
>  }
> +EXPORT_SYMBOL_GPL(mlx5_st_alloc_index_by_tag);
> +
> +int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
> +			unsigned int cpu_uid, u16 *st_index)
> +{
> +	u16 tag;
> +	int ret;
> +
> +	ret = pcie_tph_get_cpu_st(dev->pdev, mem_type, cpu_uid, &tag);
> +	if (ret)
> +		return ret;
> +
> +	return mlx5_st_alloc_index_by_tag(dev, tag, st_index);
> +}
>  EXPORT_SYMBOL_GPL(mlx5_st_alloc_index);
>  
>  int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index)
> @@ -175,6 +184,7 @@ int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index)
>  
>  	if (refcount_dec_and_test(&idx_data->usecount)) {
>  		xa_erase(&st->idx_xa, st_index);
> +		kfree(idx_data);
>  		/* We leave PCI config space as was before, no mkey will refer to it */
>  	}

Should this be pulled out as a fix separate from the feature added
here?  Thanks,

Alex

>  
> diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
> index 04b96c5abb57..523a9ab0ae1e 100644
> --- a/include/linux/mlx5/driver.h
> +++ b/include/linux/mlx5/driver.h
> @@ -1166,10 +1166,17 @@ int mlx5_dm_sw_icm_dealloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type
>  			   u64 length, u16 uid, phys_addr_t addr, u32 obj_id);
>  
>  #ifdef CONFIG_PCIE_TPH
> +int mlx5_st_alloc_index_by_tag(struct mlx5_core_dev *dev, u16 tag,
> +			       u16 *st_index);
>  int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
>  			unsigned int cpu_uid, u16 *st_index);
>  int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index);
>  #else
> +static inline int mlx5_st_alloc_index_by_tag(struct mlx5_core_dev *dev,
> +					     u16 tag, u16 *st_index)
> +{
> +	return -EOPNOTSUPP;
> +}
>  static inline int mlx5_st_alloc_index(struct mlx5_core_dev *dev,
>  				      enum tph_mem_type mem_type,
>  				      unsigned int cpu_uid, u16 *st_index)


  reply	other threads:[~2026-05-27 19:00 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-26 14:43 [PATCH v5 0/4] vfio/dma-buf: add TPH support for peer-to-peer access Zhiping Zhang
2026-05-26 14:43 ` [PATCH v5 1/4] PCI/TPH: expose the enabled TPH requester type Zhiping Zhang
2026-05-27 20:53   ` Alex Williamson
2026-05-28  5:35     ` Zhiping Zhang
2026-05-28  8:04       ` fengchengwen
2026-05-29  6:41         ` Zhiping Zhang
2026-05-26 14:43 ` [PATCH v5 2/4] dma-buf: add optional get_tph() callback Zhiping Zhang
2026-05-27  6:57   ` Christian König
2026-05-27 17:03   ` Alex Williamson
2026-05-26 14:43 ` [PATCH v5 3/4] vfio/pci: implement get_tph and DMA_BUF_TPH feature Zhiping Zhang
2026-05-27 18:06   ` Alex Williamson
2026-05-28  5:34     ` Zhiping Zhang
2026-05-26 14:43 ` [PATCH v5 4/4] RDMA/mlx5: get tph for p2p access when registering dma-buf mr Zhiping Zhang
2026-05-27 19:00   ` Alex Williamson [this message]
2026-05-28  5:54     ` Zhiping Zhang
2026-05-27 22:55   ` Michael Gur
2026-05-28  6:07     ` Zhiping Zhang
2026-05-27  6:55 ` [PATCH v5 0/4] vfio/dma-buf: add TPH support for peer-to-peer access Christian König
2026-05-27 12:14   ` Jason Gunthorpe
2026-05-27 12:23     ` Christian König
2026-05-27 12:36       ` Jason Gunthorpe
2026-05-27 12:53         ` Christian König
2026-05-28  4:55           ` Zhiping Zhang
2026-05-28  7:46             ` Christian König
2026-05-29  6:34               ` Zhiping Zhang
2026-05-29  7:36                 ` Christian König
2026-05-29 20:11                   ` Jason Gunthorpe
2026-06-01  9:59                     ` Christian König
2026-06-01 17:47                       ` Jason Gunthorpe
2026-06-01 18:17                         ` Christian König
2026-06-01 18:48                           ` Jason Gunthorpe
2026-06-02  7:14                             ` Zhiping Zhang
2026-05-29 20:31                   ` Keith Busch
2026-06-01 10:03                     ` Christian König
2026-06-01 17:50                     ` Jason Gunthorpe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260527130045.704a4502@shazbot.org \
    --to=alex@shazbot.org \
    --cc=christian.koenig@amd.com \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=helgaas@kernel.org \
    --cc=jgg@ziepe.ca \
    --cc=kbusch@kernel.org \
    --cc=kvm@vger.kernel.org \
    --cc=leon@kernel.org \
    --cc=linux-pci@vger.kernel.org \
    --cc=linux-rdma@vger.kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=sumit.semwal@linaro.org \
    --cc=yishaih@nvidia.com \
    --cc=yochai@nvidia.com \
    --cc=zhipingz@meta.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.