Linux RDMA and InfiniBand development

Linux RDMA and InfiniBand development
 help / color / mirror / Atom feed

* Re: [PATCH net-next v10 2/2] net: mana: force full-page RX buffers via ethtool private flag
From: Dipayaan Roy @ 2026-06-09  4:32 UTC (permalink / raw)
  To: Jacob Keller
  Cc: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
	kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
	ssengar, ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
	linux-rdma, stephen, dipayanroy, leitao, kees, john.fastabend,
	hawk, bpf, daniel, ast, sdf, yury.norov, pavan.chebbi
In-Reply-To: <c3b2ab74-754d-4d09-b7a2-d274343d0936@intel.com>

On Thu, Jun 04, 2026 at 11:40:30AM -0700, Jacob Keller wrote:
> On 6/2/2026 1:24 PM, Dipayaan Roy wrote:
> > On some ARM64 platforms with 4K PAGE_SIZE, page_pool fragment
> > allocation in the RX refill path can cause 15-20% throughput
> > regression under high connection counts (>16 TCP streams).
> > 
> > Add an ethtool private flag "full-page-rx" that allows the user to
> > force one RX buffer per page, bypassing the page_pool fragment path.
> > This restores line-rate (180+ Gbps) performance on affected platforms.
> > 
> > Usage:
> >   ethtool --set-priv-flags eth0 full-page-rx on
> > 
> > There is no behavioral change by default. The flag must be explicitly
> > enabled by the user or udev rule.
> > 
> > The existing single-buffer-per-page logic for XDP and jumbo frames is
> > consolidated into a new helper mana_use_single_rxbuf_per_page() which
> > is now the single decision point for both the automatic and
> > user-controlled paths.
> > 
> > Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
> > ---
> 
> I had one or two minor nits, but nothing that I think really deserves a
> v11. The only real comment is a future "gotcha" that could happen if you
> ever added a second private flag, which seems unlikely and maybe not
> worth dealing with until it matters.
> 
> Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
>

Hi Jacob,

Thank you for the review.
I will keep this patch as is, since there are no plans for any new private flags.

Regards
Dipayaan Roy

> >  drivers/net/ethernet/microsoft/mana/mana_en.c |  22 +++-
> >  .../ethernet/microsoft/mana/mana_ethtool.c    | 103 ++++++++++++++++++
> >  include/net/mana/mana.h                       |   8 ++
> >  3 files changed, 131 insertions(+), 2 deletions(-)
> > 
> > diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
> > index db14357d3732..447cecfd3f67 100644
> > --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> > +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> > @@ -744,6 +744,25 @@ static void *mana_get_rxbuf_pre(struct mana_rxq *rxq, dma_addr_t *da)
> >  	return va;
> >  }
> >  
> > +static bool
> > +mana_use_single_rxbuf_per_page(struct mana_port_context *apc, u32 mtu)
> > +{
> > +	/* On some platforms with 4K PAGE_SIZE, page_pool fragment allocation
> > +	 * in the RX refill path (~2kB buffer) can cause significant throughput
> > +	 * regression under high connection counts. Allow user to force one RX
> > +	 * buffer per page via ethtool private flag to bypass the fragment
> > +	 * path.
> > +	 */
> > +	if (apc->priv_flags & BIT(MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF))
> > +		return true;
> > +
> > +	/* For xdp and jumbo frames make sure only one packet fits per page. */
> > +	if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc))
> > +		return true;
> 
> Technically you could combine all three into one if, but I agree that
> clarity and space for the comment about why the private flag exists
> makes sense.
> 
> > +
> > +	return false;
> > +}
> > +
> >  /* Get RX buffer's data size, alloc size, XDP headroom based on MTU */
> >  static void mana_get_rxbuf_cfg(struct mana_port_context *apc,
> >  			       int mtu, u32 *datasize, u32 *alloc_size,
> > @@ -754,8 +773,7 @@ static void mana_get_rxbuf_cfg(struct mana_port_context *apc,
> >  	/* Calculate datasize first (consistent across all cases) */
> >  	*datasize = mtu + ETH_HLEN;
> >  
> > -	/* For xdp and jumbo frames make sure only one packet fits per page */
> > -	if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc)) {
> > +	if (mana_use_single_rxbuf_per_page(apc, mtu)) {
> >  		if (mana_xdp_get(apc)) {
> >  			*headroom = XDP_PACKET_HEADROOM;
> >  			*alloc_size = PAGE_SIZE;
> > diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> > index 7e79681634db..f22bbb325948 100644
> > --- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> > +++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> > @@ -133,6 +133,10 @@ static const struct mana_stats_desc mana_phy_stats[] = {
> >  	{ "hc_tc7_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc7_phy) },
> >  };
> >  
> > +static const char mana_priv_flags[MANA_PRIV_FLAG_MAX][ETH_GSTRING_LEN] = {
> > +	[MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF] = "full-page-rx"
> > +};
> > +
> >  static int mana_get_sset_count(struct net_device *ndev, int stringset)
> >  {
> >  	struct mana_port_context *apc = netdev_priv(ndev);
> > @@ -144,6 +148,10 @@ static int mana_get_sset_count(struct net_device *ndev, int stringset)
> >  		       ARRAY_SIZE(mana_phy_stats) +
> >  		       ARRAY_SIZE(mana_hc_stats)  +
> >  		       num_queues * (MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT);
> > +
> > +	case ETH_SS_PRIV_FLAGS:
> > +		return MANA_PRIV_FLAG_MAX;
> > +
> >  	default:
> >  		return -EINVAL;
> >  	}
> > @@ -192,6 +200,14 @@ static void mana_get_strings_stats(struct mana_port_context *apc, u8 **data)
> >  	}
> >  }
> >  
> > +static void mana_get_strings_priv_flags(u8 **data)
> > +{
> > +	int i;
> > +
> > +	for (i = 0; i < MANA_PRIV_FLAG_MAX; i++)
> > +		ethtool_puts(data, mana_priv_flags[i]);
> > +}
> > +
> >  static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
> >  {
> >  	struct mana_port_context *apc = netdev_priv(ndev);
> > @@ -200,6 +216,9 @@ static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
> >  	case ETH_SS_STATS:
> >  		mana_get_strings_stats(apc, &data);
> >  		break;
> > +	case ETH_SS_PRIV_FLAGS:
> > +		mana_get_strings_priv_flags(&data);
> > +		break;
> >  	default:
> >  		break;
> >  	}
> > @@ -590,6 +609,88 @@ static int mana_get_link_ksettings(struct net_device *ndev,
> >  	return 0;
> >  }
> >  
> > +static u32 mana_get_priv_flags(struct net_device *ndev)
> > +{
> > +	struct mana_port_context *apc = netdev_priv(ndev);
> > +
> > +	return apc->priv_flags;
> > +}
> > +
> > +static int mana_set_priv_flags(struct net_device *ndev, u32 priv_flags)
> > +{
> > +	struct mana_port_context *apc = netdev_priv(ndev);
> > +	u32 changed = apc->priv_flags ^ priv_flags;
> > +	u32 old_priv_flags = apc->priv_flags;
> > +	bool schedule_port_reset = false;
> > +	int err = 0;
> > +
> > +	if (!changed)
> > +		return 0;
> > +
> > +	/* Reject unknown bits */
> > +	if (priv_flags & ~GENMASK(MANA_PRIV_FLAG_MAX - 1, 0))
> > +		return -EINVAL;
> 
> Good. Explicit rejection ensures that there's no risk of a bad value. I
> think this is only required for the legacy ioctl interface, since the
> netlink interface won't be able to have a bit set that isn't in your
> accepted list. However, the legacy ioctl interface looks like it doesn't
> do that double checking, so it's good to have this.
> 
> > +
> > +	if (changed & BIT(MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF)) {
> > +		apc->priv_flags = priv_flags;
> > +
> 
> In the (unlikely) event that you need another private flag in the
> future, this bit seems like it shouldn't be inside the if block here. It
> seems like you'd want to either do this at the end or up front. Of
> course it doesn't matter as long as this is the only private flag you have.
> 
> > +		if (!apc->port_is_up) {
> > +			/* Port is down, flag updated to apply on next up
> > +			 * so just return.
> > +			 */
> > +			return 0;
> > +		}
> > +
> > +		/* Pre-allocate buffers to prevent failure in mana_attach
> > +		 * later
> > +		 */
> > +		err = mana_pre_alloc_rxbufs(apc, ndev->mtu, apc->num_queues);
> > +		if (err) {
> > +			netdev_err(ndev,
> > +				   "Insufficient memory for new allocations\n");
> > +			apc->priv_flags = old_priv_flags;
> > +			return err;
> > +		}
> > +
> > +		err = mana_detach(ndev, false);
> > +		if (err) {
> > +			netdev_err(ndev, "mana_detach failed: %d\n", err);
> > +			apc->priv_flags = old_priv_flags;
> > +
> > +			/* Port is in an inconsistent state. Restore
> > +			 * 'port_is_up' so that queue reset work handler
> > +			 * can properly detach and re-attach.
> > +			 */
> > +			apc->port_is_up = true;
> > +			schedule_port_reset = true;
> > +			goto out;
> > +		}
> > +
> > +		err = mana_attach(ndev);
> > +		if (err) {
> > +			netdev_err(ndev, "mana_attach failed: %d\n", err);
> > +			apc->priv_flags = old_priv_flags;
> > +
> > +			/* Restore 'port_is_up' so the reset work handler
> > +			 * can properly detach/attach. Without this,
> > +			 * the handler sees port_is_up=false and skips
> > +			 * queue allocation, leaving the port dead.
> > +			 */
> > +			apc->port_is_up = true;
> > +			schedule_port_reset = true;
> > +		}
> 
> I might have made this bit a separate function, but that comes from
> history of working with older drivers which accumulated a larger number
> of private flags. Given that we frown on adding new ones except in more
> rare cases these days, this is probably fine.
> 
> > +	}
> > +
> > +out:
> > +	mana_pre_dealloc_rxbufs(apc);
> > +
> > +	if (schedule_port_reset)
> > +		queue_work(apc->ac->per_port_queue_reset_wq,
> > +			   &apc->queue_reset_work);
> > +
> > +	return err;
> > +}
> > +
> >  const struct ethtool_ops mana_ethtool_ops = {
> >  	.supported_coalesce_params = ETHTOOL_COALESCE_RX_CQE_FRAMES,
> >  	.get_ethtool_stats	= mana_get_ethtool_stats,
> > @@ -608,4 +709,6 @@ const struct ethtool_ops mana_ethtool_ops = {
> >  	.set_ringparam          = mana_set_ringparam,
> >  	.get_link_ksettings	= mana_get_link_ksettings,
> >  	.get_link		= ethtool_op_get_link,
> > +	.get_priv_flags		= mana_get_priv_flags,
> > +	.set_priv_flags		= mana_set_priv_flags,
> >  };
> > diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
> > index d9c27310fd04..26fd5e041a47 100644
> > --- a/include/net/mana/mana.h
> > +++ b/include/net/mana/mana.h
> > @@ -30,6 +30,12 @@ enum TRI_STATE {
> >  	TRI_STATE_TRUE = 1
> >  };
> >  
> > +/* MANA ethtool private flag bit positions */
> > +enum mana_priv_flag_bits {
> > +	MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF = 0,
> > +	MANA_PRIV_FLAG_MAX,
> 
> For cases like this, I find it helpful to add a comment indicating this
> must be the last entry. (and in that case, drop the trailing comma).
> 
> > +};
> > +
> >  /* Number of entries for hardware indirection table must be in power of 2 */
> >  #define MANA_INDIRECT_TABLE_MAX_SIZE 512
> >  #define MANA_INDIRECT_TABLE_DEF_SIZE 64
> > @@ -531,6 +537,8 @@ struct mana_port_context {
> >  	u32 rxbpre_headroom;
> >  	u32 rxbpre_frag_count;
> >  
> > +	u32 priv_flags;
> > +
> >  	struct bpf_prog *bpf_prog;
> >  
> >  	/* Create num_queues EQs, SQs, SQ-CQs, RQs and RQ-CQs, respectively. */
> 

^ permalink raw reply

* Re: [PATCH net v3] net/mlx5: Use effective affinity mask for IRQ selection
From: patchwork-bot+netdevbpf @ 2026-06-09  2:10 UTC (permalink / raw)
  To: Fushuai Wang
  Cc: saeedm, leon, tariqt, mbloch, andrew+netdev, davem, edumazet,
	kuba, pabeni, shayd, parav, moshe, netdev, linux-rdma,
	linux-kernel, wangfushuai
In-Reply-To: <20260605102112.91772-1-fushuai.wang@linux.dev>

Hello:

This patch was applied to netdev/net.git (main)
by Jakub Kicinski <kuba@kernel.org>:

On Fri,  5 Jun 2026 18:21:12 +0800 you wrote:
> From: Fushuai Wang <wangfushuai@baidu.com>
> 
> When a sf is created after a CPU has been taken offline, the IRQ pool may
> contain IRQs with affinity masks that include the offline CPU. Since only
> online CPUs should be considered for IRQ placement, cpumask_subset() check
> would fail because the iter_mask contains offline CPUs that are not present
> in req_mask, causing sf creation to fail.
> 
> [...]

Here is the summary with links:
  - [net,v3] net/mlx5: Use effective affinity mask for IRQ selection
    https://git.kernel.org/netdev/net/c/a7767290e77c

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* Re: [PATCH net-next v3] net/mlx5: Simplify cpumask operations in comp_irq_request_sf()
From: patchwork-bot+netdevbpf @ 2026-06-09  2:00 UTC (permalink / raw)
  To: Fushuai Wang
  Cc: saeedm, leon, tariqt, mbloch, andrew+netdev, davem, edumazet,
	kuba, pabeni, shayd, netdev, linux-rdma, linux-kernel,
	wangfushuai
In-Reply-To: <20260605101756.91275-1-fushuai.wang@linux.dev>

Hello:

This patch was applied to netdev/net-next.git (main)
by Jakub Kicinski <kuba@kernel.org>:

On Fri,  5 Jun 2026 18:17:56 +0800 you wrote:
> From: Fushuai Wang <wangfushuai@baidu.com>
> 
> Combine cpumask_copy() and cpumask_andnot() into a single
> cpumask_andnot() since the function can take cpu_online_mask
> directly as the source.
> 
> Signed-off-by: Fushuai Wang <wangfushuai@baidu.com>
> Reviewed-by: Shay Drory <shayd@nvidia.com>
> 
> [...]

Here is the summary with links:
  - [net-next,v3] net/mlx5: Simplify cpumask operations in comp_irq_request_sf()
    https://git.kernel.org/netdev/net-next/c/32fbe56b3f8a

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* Re: [PATCH net] net/mlx5: Fix slab-out-of-bounds in mlx5_query_nic_vport_mac_list
From: patchwork-bot+netdevbpf @ 2026-06-09  2:00 UTC (permalink / raw)
  To: Tariq Toukan
  Cc: edumazet, kuba, pabeni, andrew+netdev, davem, saeedm, leon,
	mbloch, ogerlitz, saeedm, netdev, linux-rdma, linux-kernel, gal,
	dtatulea, cjubran
In-Reply-To: <20260604135849.458060-1-tariqt@nvidia.com>

Hello:

This patch was applied to netdev/net.git (main)
by Jakub Kicinski <kuba@kernel.org>:

On Thu, 4 Jun 2026 16:58:49 +0300 you wrote:
> From: Dragos Tatulea <dtatulea@nvidia.com>
> 
> mlx5_query_nic_vport_mac_list() sizes its firmware command buffer using
> the PF's log_max_current_uc/mc_list capabilities. When querying a VF
> vport with a larger configured max (via devlink), the firmware response
> can overflow this buffer:
> 
> [...]

Here is the summary with links:
  - [net] net/mlx5: Fix slab-out-of-bounds in mlx5_query_nic_vport_mac_list
    https://git.kernel.org/netdev/net/c/894e036a24a2

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* Re: [PATCH net] net/mlx5e: xsk: Fix DMA and xdp_frame leak on XDP_TX xmit failure
From: patchwork-bot+netdevbpf @ 2026-06-09  2:00 UTC (permalink / raw)
  To: Tariq Toukan
  Cc: edumazet, kuba, pabeni, andrew+netdev, davem, ast, daniel, hawk,
	john.fastabend, sdf, saeedm, leon, mbloch, saeedm, tariqt,
	maxtram95, netdev, bpf, linux-rdma, linux-kernel, gal, dtatulea
In-Reply-To: <20260604135446.456119-1-tariqt@nvidia.com>

Hello:

This patch was applied to netdev/net.git (main)
by Jakub Kicinski <kuba@kernel.org>:

On Thu, 4 Jun 2026 16:54:46 +0300 you wrote:
> From: Dragos Tatulea <dtatulea@nvidia.com>
> 
> In the XSK branch of mlx5e_xmit_xdp_buff(), when sq->xmit_xdp_frame()
> returns false (e.g. XDPSQ is full), the function returns without
> unmapping the DMA address or freeing the xdp_frame allocated by
> xdp_convert_zc_to_xdp_frame(). The xdpi_fifo push only happens on
> success, so the completion path cannot recover these entries.
> 
> [...]

Here is the summary with links:
  - [net] net/mlx5e: xsk: Fix DMA and xdp_frame leak on XDP_TX xmit failure
    https://git.kernel.org/netdev/net/c/b69004f5a6ad

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* Re: [PATCH net 3/4] net/mlx5e: Bounds-check stats_nch in mlx5e_get_queue_stats_rx()
From: Jakub Kicinski @ 2026-06-09  1:54 UTC (permalink / raw)
  To: Tariq Toukan
  Cc: Eric Dumazet, Paolo Abeni, Andrew Lunn, David S. Miller,
	Saeed Mahameed, Leon Romanovsky, Mark Bloch, Eran Ben Elisha,
	Feng Liu, Cosmin Ratiu, Gal Pressman, Simon Horman, Alexei Lazar,
	Nimrod Oren, Carolina Jubran, Kees Cook, Lama Kayal,
	Eran Ben Elisha, Saeed Mahameed, Haiyang Zhang, Joe Damato,
	netdev, linux-rdma, linux-kernel
In-Reply-To: <20260604135041.455754-4-tariqt@nvidia.com>

On Thu, 4 Jun 2026 16:50:40 +0300 Tariq Toukan wrote:
> mlx5e_get_queue_stats_rx() is invoked by the netdev stats core with
> an RX queue index 'i' from real_num_rx_queues. Today it only guards
> against priv->stats_nch == 0 and then dereferences
> priv->channel_stats[i] unconditionally.
> 
> During interface bring-up channel_stats[] is populated incrementally
> by mlx5e_channel_stats_alloc(), so a concurrent QSTATS netlink dump
> can call into the helper with i >= stats_nch. The non-zero check
> passes, channel_stats[i] is NULL, and the dereference panics.
> 
> Replace the non-zero check with an upper-bound check against
> stats_nch, which subsumes the zero check and prevents the
> out-of-bounds dereference.

I don't think there can be any race here?
The open/close paths and the queue stats readers are both under netdev->lock.
Your description makes it sound as if we could access half-initialized
state?

Sure, the ndo path is tricky since it's lockless, but please don't
add unnecessary checks in the locked paths.
-- 
pw-bot: cr

^ permalink raw reply

* Re: [PATCH 6.6] RDMA/rxe: Fix "trying to register non-static key in rxe_qp_do_cleanup" bug
From: Sasha Levin @ 2026-06-09  0:51 UTC (permalink / raw)
  To: stable, Greg Kroah-Hartman
  Cc: Sasha Levin, Vladislav Nikolaev, Zhu Yanjun, Doug Ledford,
	Jason Gunthorpe, Haggai Eran, Kamal Heib, Amir Vadai, Moni Shoua,
	Yonatan Cohen, Leon Romanovsky, linux-rdma, linux-kernel,
	Zhu Yanjun, lvc-project, syzbot+4edb496c3cad6e953a31, Zhu Yanjun
In-Reply-To: <20260605165556.1082-1-vlad102nikolaev@gmail.com>

> [PATCH 6.6] RDMA/rxe: Fix "trying to register non-static key in
> rxe_qp_do_cleanup" bug

Queued for 6.6, thanks.

--
Thanks,
Sasha

^ permalink raw reply

* Re: [PATCH net v2] net/mlx4: avoid GCC 10 __bad_copy_from() false positive
From: patchwork-bot+netdevbpf @ 2026-06-09  0:30 UTC (permalink / raw)
  To: Yao Sang
  Cc: tariqt, davem, kuba, pabeni, andrew+netdev, edumazet, gustavoars,
	netdev, linux-rdma
In-Reply-To: <20260603061044.2055155-1-sangyao@kylinos.cn>

Hello:

This patch was applied to netdev/net.git (main)
by Jakub Kicinski <kuba@kernel.org>:

On Wed,  3 Jun 2026 14:10:44 +0800 you wrote:
> mlx4_init_user_cqes() fills a scratch buffer with the CQE
> initialization pattern and then copies from that buffer to userspace.
> 
> In the single-copy path, the copy length is array_size(entries,
> cqe_size), but the scratch buffer is allocated with PAGE_SIZE. GCC 10
> does not carry the branch invariant strongly enough through the object
> size checks and falsely triggers __bad_copy_from().
> 
> [...]

Here is the summary with links:
  - [net,v2] net/mlx4: avoid GCC 10 __bad_copy_from() false positive
    https://git.kernel.org/netdev/net/c/2365343f4aad

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* [PATCH net-next] net/mlx5e: Report link down on administrative close
From: Manjunath Patil @ 2026-06-08 23:42 UTC (permalink / raw)
  To: netdev
  Cc: saeedm, tariqt, mbloch, leon, andrew+netdev, davem, edumazet,
	kuba, pabeni, linux-rdma, linux-kernel, manjunath.b.patil

mlx5e_update_carrier() reports both link-up and link-down carrier
changes, but an administrative down does not reach it in practice. The
close path first changes the port admin state and then clears
MLX5E_STATE_OPENED and drops carrier silently in mlx5e_close_locked().
Any queued carrier worker will skip update_carrier() once the device is
no longer opened.

This leaves "ip link set dev <dev> down" without a matching netdev
"Link down" message, while reopening the device still reports "Link up".

Report the link-down transition in mlx5e_close() before the common close
helper clears the opened state and drops carrier. Guard the message with
the current opened and carrier state to avoid duplicates when the netdev
is already closed or carrier is already down.

Assisted-by: Codex:gpt-5
Signed-off-by: Manjunath Patil <manjunath.b.patil@oracle.com>
---
Validation:
- Built an OL8 mainline test kernel from this change.
- Booted 7.1.0-rc6.bug123456.el8.v1.x86_64 on an mlx5-backed VM.
- Confirmed `ip link set dev re0 down/up` and `re1 down/up` now emit
  netdev `Link down` and `Link up` messages, alongside the existing RDMA
  port state notifications.

 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 8f2b3abe0092..a04a89f0eddf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3628,6 +3628,9 @@ int mlx5e_close(struct net_device *netdev)

 	mutex_lock(&priv->state_lock);
 	mlx5e_modify_admin_state(priv->mdev, MLX5_PORT_DOWN);
+	if (test_bit(MLX5E_STATE_OPENED, &priv->state) &&
+	    netif_carrier_ok(netdev))
+		netdev_info(netdev, "Link down\n");
 	err = mlx5e_close_locked(netdev);
 	mutex_unlock(&priv->state_lock);

base-commit: e43ffb69e0438cddd72aaa30898b4dc446f664f8
-- 
2.47.3

^ permalink raw reply related

* Re: [PATCH] RDMA: During rereg_mr ensure that REREG_ACCESS is compatible
From: Jason Gunthorpe @ 2026-06-08 23:25 UTC (permalink / raw)
  To: yanjun.zhu
  Cc: Junxian Huang, Krzysztof Czurylo, linux-rdma, Chengchang Tang,
	Tatyana Nikolova, Yishai Hadas, Zhu Yanjun, Andrew Morton,
	David Hildenbrand, Leon Romanovsky, patches, Philip Tsukerman,
	stable
In-Reply-To: <22629c63-ca98-4af7-9e3b-480b89be6ce1@linux.dev>

On Mon, Jun 08, 2026 at 03:38:32PM -0700, yanjun.zhu wrote:

> But I found the following problem. I am not sure if we fix this problem in
> this commit or file a new commit.

The core code does all of that

Jason

^ permalink raw reply

* Re: [pull-request] mlx5-next updates 2026-06-07
From: patchwork-bot+netdevbpf @ 2026-06-08 22:40 UTC (permalink / raw)
  To: Tariq Toukan
  Cc: edumazet, kuba, pabeni, andrew+netdev, davem, saeedm, leon,
	mbloch, netdev, linux-rdma, linux-kernel, gal, dtatulea, moshe,
	shayd
In-Reply-To: <20260607111157.470978-1-tariqt@nvidia.com>

Hello:

This pull request was applied to netdev/net-next.git (main)
by Jakub Kicinski <kuba@kernel.org>:

On Sun, 7 Jun 2026 14:11:57 +0300 you wrote:
> Hi,
> 
> The following pull-request contains common mlx5 updates
> for your *net-next* tree.
> Please pull and let me know of any problem.
> 
> Regards,
> Tariq
> 
> [...]

Here is the summary with links:
  - [pull-request] mlx5-next updates 2026-06-07
    https://git.kernel.org/netdev/net-next/c/199f6b9a1603

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* Re: [PATCH] RDMA: During rereg_mr ensure that REREG_ACCESS is compatible
From: yanjun.zhu @ 2026-06-08 22:38 UTC (permalink / raw)
  To: Jason Gunthorpe, Junxian Huang, Krzysztof Czurylo, linux-rdma,
	Chengchang Tang, Tatyana Nikolova, Yishai Hadas, Zhu Yanjun,
	Zhu Yanjun
  Cc: Andrew Morton, David Hildenbrand, Leon Romanovsky, patches,
	Philip Tsukerman, stable
In-Reply-To: <0-v1-06fb1a2d6cf5+107-rereg_access_jgg@nvidia.com>

On 6/8/26 9:44 AM, Jason Gunthorpe wrote:
> If IB_MR_REREG_ACCESS changes from RO to RW then the umem has to be
> re-evaluated to ensure it is properly pinned as RW. Since the umem is
> hidden inside each driver's mr struct add a ib_umem_check_rereg() function
> that each driver has to call before processing IB_MR_REREG_ACCESS.
> 
> mlx4 has to retain its duplicate ib_access_writable check because it
> implements IB_MR_REREG_ACCESS | IB_MR_REREG_TRANS by changing both items
> in place sequentially while the MR is live, so it will continue to not
> support this combination.
> 
> Cc: stable@vger.kernel.org
> Fixes: b40656aa7d55 ("RDMA/umem: remove FOLL_FORCE usage")
> Reported-by: Philip Tsukerman <philiptsukerman@gmail.com>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>   drivers/infiniband/core/umem.c          | 16 ++++++++++++++++
>   drivers/infiniband/hw/hns/hns_roce_mr.c |  4 ++++
>   drivers/infiniband/hw/irdma/verbs.c     |  4 ++++
>   drivers/infiniband/hw/mlx4/mr.c         |  4 ++++
>   drivers/infiniband/hw/mlx5/mr.c         |  4 ++++
>   drivers/infiniband/sw/rxe/rxe_verbs.c   |  5 +++++
>   include/rdma/ib_umem.h                  |  8 ++++++++
>   7 files changed, 45 insertions(+)
> 
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> index 786fa1aa8e552b..4b055712b0d0db 100644
> --- a/drivers/infiniband/core/umem.c
> +++ b/drivers/infiniband/core/umem.c
> @@ -332,3 +332,19 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
>   		return 0;
>   }
>   EXPORT_SYMBOL(ib_umem_copy_from);
> +
> +/*
> + * Called during rereg mr if the driver is able to re-use a umem for
> + * IB_MR_REREG_ACCESS.
> + */
> +int ib_umem_check_rereg(struct ib_umem *umem, int flags, int new_access_flags)
> +{
> +	if (!umem)
> +		return 0;
> +
> +	if ((flags & IB_MR_REREG_ACCESS) && !(flags & IB_MR_REREG_TRANS))
> +		if (ib_access_writable(new_access_flags) && !umem->writable)
> +			return -EACCES;
> +	return 0;
> +}
> +EXPORT_SYMBOL(ib_umem_check_rereg);
> diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
> index 896af1828a38de..25bfd3970f5b6e 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_mr.c
> +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
> @@ -300,6 +300,10 @@ struct ib_mr *hns_roce_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start,
>   		goto err_out;
>   	}
>   
> +	ret = ib_umem_check_rereg(mr->pbl_mtr.umem, flags, mr_access_flags);
> +	if (ret)
> +		goto err_out;
> +
>   	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
>   	ret = PTR_ERR_OR_ZERO(mailbox);
>   	if (ret)
> diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c
> index 17086048d2d7fc..8cd4275328052e 100644
> --- a/drivers/infiniband/hw/irdma/verbs.c
> +++ b/drivers/infiniband/hw/irdma/verbs.c
> @@ -3803,6 +3803,10 @@ static struct ib_mr *irdma_rereg_user_mr(struct ib_mr *ib_mr, int flags,
>   	if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
>   		return ERR_PTR(-EOPNOTSUPP);
>   
> +	ret = ib_umem_check_rereg(iwmr->region, flags, new_access);
> +	if (ret)
> +		return ERR_PTR(ret);
> +
>   	if (dmabuf_revocable) {
>   		umem_dmabuf = to_ib_umem_dmabuf(iwmr->region);
>   
> diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
> index 650b4a9121ff6d..6747bca3067770 100644
> --- a/drivers/infiniband/hw/mlx4/mr.c
> +++ b/drivers/infiniband/hw/mlx4/mr.c
> @@ -209,6 +209,10 @@ struct ib_mr *mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, u64 start,
>   	struct mlx4_mpt_entry **pmpt_entry = &mpt_entry;
>   	int err;
>   
> +	err = ib_umem_check_rereg(mmr->umem, flags, mr_access_flags);
> +	if (err)
> +		return ERR_PTR(err);
> +
>   	/* Since we synchronize this call and mlx4_ib_dereg_mr via uverbs,
>   	 * we assume that the calls can't run concurrently. Otherwise, a
>   	 * race exists.
> diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
> index 3b6da45061a552..fb40b44496f47a 100644
> --- a/drivers/infiniband/hw/mlx5/mr.c
> +++ b/drivers/infiniband/hw/mlx5/mr.c
> @@ -1179,6 +1179,10 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
>   	if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
>   		return ERR_PTR(-EOPNOTSUPP);
>   
> +	err = ib_umem_check_rereg(mr->umem, flags, new_access_flags);
> +	if (err)
> +		return ERR_PTR(err);
> +
>   	if (!(flags & IB_MR_REREG_ACCESS))
>   		new_access_flags = mr->access_flags;
>   	if (!(flags & IB_MR_REREG_PD))
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
> index 4d4891dc28846b..4cf04a44189c64 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.c
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
> @@ -1319,6 +1319,7 @@ static struct ib_mr *rxe_rereg_user_mr(struct ib_mr *ibmr, int flags,
>   	struct rxe_mr *mr = to_rmr(ibmr);
>   	struct rxe_pd *old_pd = to_rpd(ibmr->pd);
>   	struct rxe_pd *pd = to_rpd(ibpd);
> +	int err;
>   
>   	/* for now only support the two easy cases:
>   	 * rereg_pd and rereg_access
> @@ -1328,6 +1329,10 @@ static struct ib_mr *rxe_rereg_user_mr(struct ib_mr *ibmr, int flags,
>   		return ERR_PTR(-EOPNOTSUPP);
>   	}
>   
> +	err = ib_umem_check_rereg(mr->umem, flags, access);
> +	if (err)
> +		return ERR_PTR(err);
> +

Thanks a lot. I am fine with this.

Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>

However, I found the following problem. I am not sure whether we should fix
it in this commit or in a separate one.

diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 4d4891dc2884..3b99649c342d 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -1319,6 +1319,7 @@ static struct ib_mr *rxe_rereg_user_mr(struct ib_mr *ibmr, int flags,
         struct rxe_mr *mr = to_rmr(ibmr);
         struct rxe_pd *old_pd = to_rpd(ibmr->pd);
         struct rxe_pd *pd = to_rpd(ibpd);
+       struct ib_pd *old_ibpd;

         /* for now only support the two easy cases:
          * rereg_pd and rereg_access
@@ -1331,12 +1332,18 @@ static struct ib_mr *rxe_rereg_user_mr(struct 
ib_mr *ibmr, int flags,
         if (flags & IB_MR_REREG_PD) {
                 rxe_put(old_pd);
                 rxe_get(pd);
+               old_ibpd = mr->ibmr.pd;
                 mr->ibmr.pd = ibpd;
         }

         if (flags & IB_MR_REREG_ACCESS) {
                 if (access & ~RXE_ACCESS_SUPPORTED_MR) {
                         rxe_err_mr(mr, "access = %#x not supported\n", 
access);
+                       if (flags & IB_MR_REREG_PD) {
+                               rxe_get(old_pd);
+                               rxe_put(pd);
+                               mr->ibmr.pd = old_ibpd;
+                       }
                         return ERR_PTR(-EOPNOTSUPP);
                 }
                 mr->access = access;

Zhu Yanjun

>   	if (flags & IB_MR_REREG_PD) {
>   		rxe_put(old_pd);
>   		rxe_get(pd);
> diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
> index 2ad52cc1d52bdd..49172098a8de14 100644
> --- a/include/rdma/ib_umem.h
> +++ b/include/rdma/ib_umem.h
> @@ -156,6 +156,8 @@ void ib_umem_dmabuf_revoke_lock(struct ib_umem_dmabuf *umem_dmabuf);
>   void ib_umem_dmabuf_revoke_unlock(struct ib_umem_dmabuf *umem_dmabuf);
>   void ib_umem_dmabuf_revoke(struct ib_umem_dmabuf *umem_dmabuf);
>   
> +int ib_umem_check_rereg(struct ib_umem *umem, int flags, int new_access_flags);
> +
>   #else /* CONFIG_INFINIBAND_USER_MEM */
>   
>   #include <linux/err.h>
> @@ -230,5 +232,11 @@ static inline void ib_umem_dmabuf_revoke_lock(struct ib_umem_dmabuf *umem_dmabuf
>   static inline void ib_umem_dmabuf_revoke_unlock(struct ib_umem_dmabuf *umem_dmabuf) {}
>   static inline void ib_umem_dmabuf_revoke(struct ib_umem_dmabuf *umem_dmabuf) {}
>   
> +static inline int ib_umem_check_rereg(struct ib_umem *umem, int flags,
> +				      int new_access_flags)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
>   #endif /* CONFIG_INFINIBAND_USER_MEM */
>   #endif /* IB_UMEM_H */
> 
> base-commit: 323c98a4ff06aa28114f2bf658fb43eb3b536bbc


^ permalink raw reply related

* Re: [for-next v3 0/5] ionic: RDMA completion timestamping support
From: Jakub Kicinski @ 2026-06-08 21:20 UTC (permalink / raw)
  To: Abhijit Gangurde
  Cc: jgg, leon, brett.creeley, andrew+netdev, davem, edumazet, pabeni,
	allen.hubbe, nikhil.agarwal, linux-rdma, netdev, linux-kernel,
	David Woodhouse
In-Reply-To: <20260606050003.3648306-1-abhijit.gangurde@amd.com>

On Sat, 6 Jun 2026 10:29:58 +0530 Abhijit Gangurde wrote:
> This series adds RDMA completion timestamp support for ionic.
> 
> It enables PHC registration for RDMA timestamp capability, exposes a PHC
> state page for safe user-space reads, maps that PHC state through RDMA
> ucontext mmap, and extends the RDMA CQE format to carry completion
> timestamps.
> 
> With this, user space can read completion timestamps and convert them to
> wall time with low overhead.

please CC David Woodhouse <dwmw2@infradead.org> on future versions.
Since David is working on uAPI for clocks he may have some thoughts.

^ permalink raw reply

* Re: [PATCH net-next V5 00/12] devlink: add per-port resource support
From: Jakub Kicinski @ 2026-06-08 20:30 UTC (permalink / raw)
  To: Tariq Toukan
  Cc: Eric Dumazet, Paolo Abeni, Andrew Lunn, David S. Miller,
	Simon Horman, Donald Hunter, Jiri Pirko, Jonathan Corbet,
	Shuah Khan, Saeed Mahameed, Leon Romanovsky, Mark Bloch,
	Shuah Khan, Matthieu Baerts (NGI0), Chuck Lever, Carolina Jubran,
	Or Har-Toov, Moshe Shemesh, Dragos Tatulea, Daniel Zahka,
	Shahar Shitrit, Cosmin Ratiu, Jacob Keller, Parav Pandit,
	Adithya Jayachandran, Shay Drori, Kees Cook, Daniel Jurgens,
	netdev, linux-kernel, linux-doc, linux-rdma, linux-kselftest,
	Gal Pressman
In-Reply-To: <20260407194107.148063-1-tariqt@nvidia.com>

On Tue, 7 Apr 2026 22:40:55 +0300 Tariq Toukan wrote:
> Userspace patches for iproute2:
> https://github.com/ohartoov/iproute2/tree/port_resources

Hi! As far as I can tell these iproute2 patches have not even been
posted? Please try to get them upstream ASAP, we have to manually 
carry them in the CI right now.

^ permalink raw reply

* Re: [PATCH] RDMA/iwpm: fix kref bypass in iwpm_register_pid() error path
From: Jason Gunthorpe @ 2026-06-08 18:34 UTC (permalink / raw)
  To: Wentao Liang; +Cc: leon, linux-rdma, linux-kernel, stable
In-Reply-To: <20260608103001.142648-1-vulab@iscas.ac.cn>

On Mon, Jun 08, 2026 at 10:30:01AM +0000, Wentao Liang wrote:
> iwpm_get_nlmsg_request() returns a request with kref_init() +
> kref_get() (refcount=2, one for the caller and one for the
> iwpm_nlmsg_req_list). On the error path, iwpm_register_pid()
> calls iwpm_free_nlmsg_request() directly instead of using
> kref_put(), bypassing the kref mechanism and freeing the object
> while the refcount is still non-zero.
> 
> Replace the direct iwpm_free_nlmsg_request() call with
> kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request).
> 
> Cc: stable@vger.kernel.org
> Fixes: 30dc5e63d6a5 ("RDMA/core: Add support for iWARP Port Mapper user space service")
> Signed-off-by: Wentao Liang <vulab@iscas.ac.cn>
> ---
>  drivers/infiniband/core/iwpm_msg.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c
> index 4625abd29ac0..672b0c33a6de 100644
> --- a/drivers/infiniband/core/iwpm_msg.c
> +++ b/drivers/infiniband/core/iwpm_msg.c
> @@ -122,7 +122,7 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client)
>  	pr_info("%s: %s (client = %u)\n", __func__, err_str, nl_client);
>  	dev_kfree_skb(skb);
>  	if (nlmsg_request)
> -		iwpm_free_nlmsg_request(&nlmsg_request->kref);
> +		kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
>  	return ret;

Sashiko doesn't like any of these changes.

https://patchwork.kernel.org/project/linux-rdma/patch/20260608103001.142648-1-vulab@iscas.ac.cn/

Does this change introduce a stack memory corruption risk?

Because iwpm_get_nlmsg_request() initializes the request with a refcount of
2, this kref_put() only decrements it to 1. The object is not freed or
removed from the global iwpm_nlmsg_req_list.

Jason

^ permalink raw reply

* Re: [PATCH] RDMA/core: fix refcount leak in __ib_alloc_pd()
From: Jason Gunthorpe @ 2026-06-08 18:32 UTC (permalink / raw)
  To: Wentao Liang; +Cc: leon, linux-rdma, linux-kernel, stable
In-Reply-To: <20260608085625.138331-1-vulab@iscas.ac.cn>

On Mon, Jun 08, 2026 at 08:56:25AM +0000, Wentao Liang wrote:
> The error handling in __ib_alloc_pd() has a refcount leak.

No it doesn't, the error handling is as designed.

> When get_dma_mr() fails it calls ib_dealloc_pd() which invokes
> ib_dealloc_pd_user().  If the driver's dealloc_pd operation returns
> an error, ib_dealloc_pd_user() returns early and skips both
> rdma_restrack_del() and kfree(pd).  This leaves the resource
> tracking kref held and the pd memory unfreed.  Because
> ib_dealloc_pd() has a void return, __ib_alloc_pd() cannot detect the
> failure, so the leak persists.

Yes, we can't handle failures on this path, the kernel verb
ib_dealloc_pd() could gain a WARN_ON to make these clearer, but the
correct thing for the user facing API is to just leave it as is
untouched.

If you mess with this you break the uverbs stuff pretty badly.

Jason

^ permalink raw reply

* Re: [PATCH] RDMA/rtrs-srv: Fix integer underflow in process_read and process_write
From: Jason Gunthorpe @ 2026-06-08 18:27 UTC (permalink / raw)
  To: Aurelien DESBRIERES; +Cc: linux-rdma, leon, haris.iqbal, jinpu.wang, gregkh
In-Reply-To: <20260608134802.5019-1-aurelien@hackers.camp>

On Mon, Jun 08, 2026 at 03:47:15PM +0200, Aurelien DESBRIERES wrote:
> usr_len is read from a network-supplied message field (le16_to_cpu)
> and used to compute data_len = off - usr_len without validating that
> usr_len <= off. A malicious RDMA client can send usr_len > off causing
> an integer underflow, resulting in data_len wrapping to a huge size_t
> value which is then passed to the rdma_ev callback as a memory length,
> leading to out-of-bounds memory access.
> 
> Fix by reading and validating usr_len <= off before rtrs_srv_get_ops_ids()
> in both process_read() and process_write(), ensuring the early return
> path acquires no reference and has no resource leak.
> 
> Reported-by: Aurelien DESBRIERES <aurelien@hackers.camp>
> Reviewed-by: Md Haris Iqbal <haris.iqbal@ionos.com>
> Signed-off-by: Aurelien DESBRIERES <aurelien@hackers.camp>
> Assisted-by: Claude <claude-sonnet-4-6>
> Acked-by: Md Haris Iqbal <haris.iqbal@ionos.com>
> ---
>  drivers/infiniband/ulp/rtrs/rtrs-srv.c | 12 ++++++++++--
>  1 file changed, 10 insertions(+), 2 deletions(-)

Applied, thanks

Jason

^ permalink raw reply

* Re: [PATCH 00/10] Fix races around IB_MR_REREG_PD and mr->pd
From: Jason Gunthorpe @ 2026-06-08 17:52 UTC (permalink / raw)
  To: Leon Romanovsky, linux-rdma
  Cc: Doug Ledford, Edward Srouji, Leon Romanovsky, Leon Romanovsky,
	Matan Barak, Michael Guralnik, Noa Osherovich, patches,
	Steve Wise
In-Reply-To: <0-v1-29ebd2c229b5+fd5-ib_mr_pd_jgg@nvidia.com>

On Wed, Jun 03, 2026 at 10:27:39PM -0300, Jason Gunthorpe wrote:
> Sashiko pointed out an existing bug related to mr->pd: when IB_MR_REREG_PD
> is used the mr->pd is changed while only holding the write side of the
> MR's uobject lock.
> 
> Effectively, because IB_MR_REREG_PD is usually implemented by changing the
> MR in-place, the mr->pd becomes unreadable outside an MR-specific system
> call that holds the uobject lock. All the readers in this series could
> race with an IB_MR_REREG_PD and potentially UAF the mr->pd.
> 
>  https://sashiko.dev/#/patchset/20260427-security-bug-fixes-v3-0-4621fa52de0e%40nvidia.com?part=4
> 
> This was presented as a simple 'oh look it can race with nldev' which is
> correct. However, asking AI to fully audit mr->pd touches also revealed a
> much more convoluted issue inside mlx5 ODP that is also using mr->pd from
> the page fault work queue, advise mr work queue and advise mr system call
> without any locking.
> 
> It turns out this mlx5 problem is entirely unnecessary since outside
> implicit mr there are only three cases where the UMR actually flags the
> PDN to be read by HW, umr_rereg_pas(), mlx5_ib_init_odp_mr() and
> mlx5_ib_init_dmabuf_mr(). umr_rereg_pas() is particularly tricky because
> it illegally updates mr->pd inside the driver.  Reorganize the giant call
> chain from mlx5r_umr_set_update_xlt_mkey_seg() upward so that pdn is
> passed down from those three functions instead of unconditionally picked
> out at the bottom.
> 
> nldev however is trickier to fix. To avoid disturbing the happy paths build
> a synchronize barrier by removing the mr from the xarray and then putting
> it right back. The kref completion acts as a positive signal that the
> mr->pd is no longer being used.
> 
> Jason Gunthorpe (10):
>   IB/mlx5: Don't take the rereg_mr fallback without a new translation
>   RDMA/mlx5: Create ODP EQ for non-pinned dmabuf MRs
>   IB/mlx5: Properly support implicit ODP rereg_mr
>   RDMA/nldev: Fix locking when accessing mr->pd
>   IB/mlx5: Remove unused mkc bits in mlx5r_umr_update_mr_page_shift()
>   IB/mlx5: Pull the pdn out of the depths of the umr machinery
>   IB/mlx5: Don't mangle the mr->pd inside the rereg callback
>   IB/mlx5: Push pdn above mlx5r_umr_update_xlt()
>   IB/mlx5: Push pdn above pagfault_real_mr()
>   IB/mlx5: Push pdn above pagefault_dmabuf_mr()

Applied to for-next

Thanks,
Jason

^ permalink raw reply

* Re: [for-next v3 5/5] RDMA/mlx5: move mlx5 clock info to common struct ib_uverbs_clock_info
From: Jason Gunthorpe @ 2026-06-08 17:30 UTC (permalink / raw)
  To: Abhijit Gangurde
  Cc: leon, brett.creeley, andrew+netdev, davem, edumazet, kuba, pabeni,
	allen.hubbe, nikhil.agarwal, linux-rdma, netdev, linux-kernel
In-Reply-To: <20260606050003.3648306-6-abhijit.gangurde@amd.com>

On Sat, Jun 06, 2026 at 10:30:03AM +0530, Abhijit Gangurde wrote:
> Use struct ib_uverbs_clock_info from ib_user_verbs.h for clock info.
> 
> The original struct mlx5_ib_clock_info remains in mlx5-abi.h for
> userspace ABI compatibility.

Please don't do that, we generally don't do compatibility for
compilation only, especially in this case where rdma-core is the only
consumer and it has its own version linked copies of all these
headers.

Though it seems like the contents are the same so you could just do:

> +/*
> + * deprecated, see struct ib_uverbs_clock_info from ib_user_verbs.h
> + */

#define mlx5_ib_clock_info ib_uverbs_clock_info

Jason

^ permalink raw reply

* [PATCH] RDMA: During rereg_mr ensure that REREG_ACCESS is compatible
From: Jason Gunthorpe @ 2026-06-08 16:44 UTC (permalink / raw)
  To: Junxian Huang, Krzysztof Czurylo, linux-rdma, Chengchang Tang,
	Tatyana Nikolova, Yishai Hadas, Zhu Yanjun
  Cc: Andrew Morton, David Hildenbrand, Leon Romanovsky, patches,
	Philip Tsukerman, stable

If IB_MR_REREG_ACCESS changes from RO to RW then the umem has to be
re-evaluated to ensure it is properly pinned as RW. Since the umem is
hidden inside each driver's mr struct add a ib_umem_check_rereg() function
that each driver has to call before processing IB_MR_REREG_ACCESS.

mlx4 has to retain its duplicate ib_access_writable check because it
implements IB_MR_REREG_ACCESS | IB_MR_REREG_TRANS by changing both items
in place sequentially while the MR is live, so it will continue to not
support this combination.

Cc: stable@vger.kernel.org
Fixes: b40656aa7d55 ("RDMA/umem: remove FOLL_FORCE usage")
Reported-by: Philip Tsukerman <philiptsukerman@gmail.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/infiniband/core/umem.c          | 16 ++++++++++++++++
 drivers/infiniband/hw/hns/hns_roce_mr.c |  4 ++++
 drivers/infiniband/hw/irdma/verbs.c     |  4 ++++
 drivers/infiniband/hw/mlx4/mr.c         |  4 ++++
 drivers/infiniband/hw/mlx5/mr.c         |  4 ++++
 drivers/infiniband/sw/rxe/rxe_verbs.c   |  5 +++++
 include/rdma/ib_umem.h                  |  8 ++++++++
 7 files changed, 45 insertions(+)

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 786fa1aa8e552b..4b055712b0d0db 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -332,3 +332,19 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
 		return 0;
 }
 EXPORT_SYMBOL(ib_umem_copy_from);
+
+/*
+ * Called during rereg mr if the driver is able to re-use a umem for
+ * IB_MR_REREG_ACCESS.
+ */
+int ib_umem_check_rereg(struct ib_umem *umem, int flags, int new_access_flags)
+{
+	if (!umem)
+		return 0;
+
+	if ((flags & IB_MR_REREG_ACCESS) && !(flags & IB_MR_REREG_TRANS))
+		if (ib_access_writable(new_access_flags) && !umem->writable)
+			return -EACCES;
+	return 0;
+}
+EXPORT_SYMBOL(ib_umem_check_rereg);
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index 896af1828a38de..25bfd3970f5b6e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -300,6 +300,10 @@ struct ib_mr *hns_roce_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start,
 		goto err_out;
 	}
 
+	ret = ib_umem_check_rereg(mr->pbl_mtr.umem, flags, mr_access_flags);
+	if (ret)
+		goto err_out;
+
 	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
 	ret = PTR_ERR_OR_ZERO(mailbox);
 	if (ret)
diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c
index 17086048d2d7fc..8cd4275328052e 100644
--- a/drivers/infiniband/hw/irdma/verbs.c
+++ b/drivers/infiniband/hw/irdma/verbs.c
@@ -3803,6 +3803,10 @@ static struct ib_mr *irdma_rereg_user_mr(struct ib_mr *ib_mr, int flags,
 	if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
 		return ERR_PTR(-EOPNOTSUPP);
 
+	ret = ib_umem_check_rereg(iwmr->region, flags, new_access);
+	if (ret)
+		return ERR_PTR(ret);
+
 	if (dmabuf_revocable) {
 		umem_dmabuf = to_ib_umem_dmabuf(iwmr->region);
 
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
index 650b4a9121ff6d..6747bca3067770 100644
--- a/drivers/infiniband/hw/mlx4/mr.c
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -209,6 +209,10 @@ struct ib_mr *mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, u64 start,
 	struct mlx4_mpt_entry **pmpt_entry = &mpt_entry;
 	int err;
 
+	err = ib_umem_check_rereg(mmr->umem, flags, mr_access_flags);
+	if (err)
+		return ERR_PTR(err);
+
 	/* Since we synchronize this call and mlx4_ib_dereg_mr via uverbs,
 	 * we assume that the calls can't run concurrently. Otherwise, a
 	 * race exists.
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 3b6da45061a552..fb40b44496f47a 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1179,6 +1179,10 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 	if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
 		return ERR_PTR(-EOPNOTSUPP);
 
+	err = ib_umem_check_rereg(mr->umem, flags, new_access_flags);
+	if (err)
+		return ERR_PTR(err);
+
 	if (!(flags & IB_MR_REREG_ACCESS))
 		new_access_flags = mr->access_flags;
 	if (!(flags & IB_MR_REREG_PD))
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 4d4891dc28846b..4cf04a44189c64 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -1319,6 +1319,7 @@ static struct ib_mr *rxe_rereg_user_mr(struct ib_mr *ibmr, int flags,
 	struct rxe_mr *mr = to_rmr(ibmr);
 	struct rxe_pd *old_pd = to_rpd(ibmr->pd);
 	struct rxe_pd *pd = to_rpd(ibpd);
+	int err;
 
 	/* for now only support the two easy cases:
 	 * rereg_pd and rereg_access
@@ -1328,6 +1329,10 @@ static struct ib_mr *rxe_rereg_user_mr(struct ib_mr *ibmr, int flags,
 		return ERR_PTR(-EOPNOTSUPP);
 	}
 
+	err = ib_umem_check_rereg(mr->umem, flags, access);
+	if (err)
+		return ERR_PTR(err);
+
 	if (flags & IB_MR_REREG_PD) {
 		rxe_put(old_pd);
 		rxe_get(pd);
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 2ad52cc1d52bdd..49172098a8de14 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -156,6 +156,8 @@ void ib_umem_dmabuf_revoke_lock(struct ib_umem_dmabuf *umem_dmabuf);
 void ib_umem_dmabuf_revoke_unlock(struct ib_umem_dmabuf *umem_dmabuf);
 void ib_umem_dmabuf_revoke(struct ib_umem_dmabuf *umem_dmabuf);
 
+int ib_umem_check_rereg(struct ib_umem *umem, int flags, int new_access_flags);
+
 #else /* CONFIG_INFINIBAND_USER_MEM */
 
 #include <linux/err.h>
@@ -230,5 +232,11 @@ static inline void ib_umem_dmabuf_revoke_lock(struct ib_umem_dmabuf *umem_dmabuf
 static inline void ib_umem_dmabuf_revoke_unlock(struct ib_umem_dmabuf *umem_dmabuf) {}
 static inline void ib_umem_dmabuf_revoke(struct ib_umem_dmabuf *umem_dmabuf) {}
 
+static inline int ib_umem_check_rereg(struct ib_umem *umem, int flags,
+				      int new_access_flags)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif /* CONFIG_INFINIBAND_USER_MEM */
 #endif /* IB_UMEM_H */

base-commit: 323c98a4ff06aa28114f2bf658fb43eb3b536bbc
-- 
2.43.0


^ permalink raw reply related

* [PATCH] RDMA/iwpm: fix kref bypass in iwpm_add_and_query_mapping() error path
From: Wentao Liang @ 2026-06-08 15:42 UTC (permalink / raw)
  To: jgg, leon; +Cc: fw, kees, linux-rdma, linux-kernel, Wentao Liang, stable

iwpm_get_nlmsg_request() returns with kref_init() + kref_get()
(refcount=2). iwpm_add_and_query_mapping() calls
iwpm_free_nlmsg_request() directly on the error path instead of
using kref_put(), freeing the object while the refcount is still
non-zero. The success path correctly uses kref_put() via
iwpm_wait_complete_req().

Replace the direct iwpm_free_nlmsg_request() call with
kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request).

Fixes: 30dc5e63d6a5 ("RDMA/core: Add support for iWARP Port Mapper user space service")
Cc: stable@vger.kernel.org
Signed-off-by: Wentao Liang <vulab@iscas.ac.cn>
---
 drivers/infiniband/core/iwpm_msg.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c
index 854c974d6586..bac3d1f321ab 100644
--- a/drivers/infiniband/core/iwpm_msg.c
+++ b/drivers/infiniband/core/iwpm_msg.c
@@ -296,7 +296,7 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
 query_mapping_error_nowarn:
 	dev_kfree_skb(skb);
 	if (nlmsg_request)
-		iwpm_free_nlmsg_request(&nlmsg_request->kref);
+		kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
 	return ret;
 }
 
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH for-rc] RDMA/efa: Propagate destroy AH error
From: Jason Gunthorpe @ 2026-06-08 15:26 UTC (permalink / raw)
  To: tom sela
  Cc: mrgolin, leon, linux-rdma, sleybo, matua, gal.pressman,
	Yonatan Nachum
In-Reply-To: <20260608145738.GA43925@dev-dsk-tomsela-1c-ce9cc34e.eu-west-1.amazon.com>

On Mon, Jun 08, 2026 at 02:57:38PM +0000, tom sela wrote:
> On Mon, Jun 01, 2026 at 09:22:23PM -0300, Jason Gunthorpe wrote:
> > On Tue, May 26, 2026 at 07:33:34AM +0000, Tom Sela wrote:
> > > AH destruction currently always returns success, ignoring any error
> > > from the device. Propagate the actual device error so the caller can
> > > handle failures appropriately.
> > 
> > Callers don't handle failures. Drivers are not permitted to fail
> > destroy, if they do it probably will trigger a WARN_ON.
> > 
> > You can make some of an argument to allow failing destroy for user
> > objects only, but not like this in general for kernel objects.
> > 
> > If your FW fails destroying a kernel object then the device is busted,
> > you should reset it and succeed to destroy the kernel object anyhow.
> > 
> > Jason
> 
> 
> This code is for user objects only. When destroy is called for a
> user object, the core code handles the failure gracefully and can
> retry cleanup at a later stage.
> 
> Currently we don't have a code path where destroy_ah actually fails
> in device, but we'd like the error propagation in place for
> completeness so that if a future FW change can return a transient
> error, we handle it correctly rather than silently ignoring it.
> 
> Would you prefer we explicitly guard this with a check for
> ibah->uobject (i.e., only propagate the error when it's a user
> object)?

Do you ever plan to support kverbs on efa?

It is still not Ok to propagate all failures even on uobjects, you will
still trigger a WARN_ON eventually.. It has to succeed under the retry
logic.

Jason

^ permalink raw reply

* [PATCH] RDMA/iwpm: fix kref bypass in iwpm_add_mapping() error path
From: Wentao Liang @ 2026-06-08 15:17 UTC (permalink / raw)
  To: jgg, leon; +Cc: fw, kees, linux-rdma, linux-kernel, Wentao Liang, stable

iwpm_get_nlmsg_request() returns with kref_init() + kref_get()
(refcount=2). iwpm_add_mapping() calls iwpm_free_nlmsg_request()
directly on the error path instead of using kref_put(), bypassing
the kref mechanism and freeing the object with a non-zero refcount.

Replace the direct iwpm_free_nlmsg_request() call with
kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request).

Cc: stable@vger.kernel.org
Fixes: 30dc5e63d6a5 ("RDMA/core: Add support for iWARP Port Mapper user space service")
Signed-off-by: Wentao Liang <vulab@iscas.ac.cn>
---
 drivers/infiniband/core/iwpm_msg.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c
index 672b0c33a6de..854c974d6586 100644
--- a/drivers/infiniband/core/iwpm_msg.c
+++ b/drivers/infiniband/core/iwpm_msg.c
@@ -207,7 +207,7 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
 add_mapping_error_nowarn:
 	dev_kfree_skb(skb);
 	if (nlmsg_request)
-		iwpm_free_nlmsg_request(&nlmsg_request->kref);
+		kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
 	return ret;
 }
 
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH for-rc] RDMA/efa: Propagate destroy AH error
From: tom sela @ 2026-06-08 14:57 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: mrgolin, leon, linux-rdma, sleybo, matua, gal.pressman,
	Yonatan Nachum
In-Reply-To: <20260602002223.GA644685@nvidia.com>

On Mon, Jun 01, 2026 at 09:22:23PM -0300, Jason Gunthorpe wrote:
> On Tue, May 26, 2026 at 07:33:34AM +0000, Tom Sela wrote:
> > AH destruction currently always returns success, ignoring any error
> > from the device. Propagate the actual device error so the caller can
> > handle failures appropriately.
> 
> Callers don't handle failures. Drivers are not permitted to fail
> destroy, if they do it probably will trigger a WARN_ON.
> 
> You can make some of an argument to allow failing destroy for user
> objects only, but not like this in general for kernel objects.
> 
> If your FW fails destroying a kernel object then the device is busted,
> you should reset it and succeed to destroy the kernel object anyhow.
> 
> Jason

This code is for user objects only. When destroy is called for a user object, the core code handles the failure gracefully and can retry cleanup at a later stage.

Currently we don't have a code path where destroy_ah actually fails in device, but we'd like the error propagation in place for completeness so that if a future FW change can return a transient error, we handle it correctly rather than silently ignoring it.

Would you prefer we explicitly guard this with a check for ibah->uobject
(i.e., only propagate the error when it's a user object)?

Tom

^ permalink raw reply

* Re: [PATCH] RDMA/rtrs-srv: Fix integer underflow in process_read and process_write
From: Haris Iqbal @ 2026-06-08 14:21 UTC (permalink / raw)
  To: Aurelien DESBRIERES; +Cc: linux-rdma, jgg, leon, jinpu.wang, gregkh
In-Reply-To: <20260608134802.5019-1-aurelien@hackers.camp>

On Mon, Jun 8, 2026 at 3:48 PM Aurelien DESBRIERES
<aurelien@hackers.camp> wrote:
>
> usr_len is read from a network-supplied message field (le16_to_cpu)
> and used to compute data_len = off - usr_len without validating that
> usr_len <= off. A malicious RDMA client can send usr_len > off causing
> an integer underflow, resulting in data_len wrapping to a huge size_t
> value which is then passed to the rdma_ev callback as a memory length,
> leading to out-of-bounds memory access.
>
> Fix by reading and validating usr_len <= off before rtrs_srv_get_ops_ids()
> in both process_read() and process_write(), ensuring the early return
> path acquires no reference and has no resource leak.
>
> Reported-by: Aurelien DESBRIERES <aurelien@hackers.camp>
> Reviewed-by: Md Haris Iqbal <haris.iqbal@ionos.com>
> Signed-off-by: Aurelien DESBRIERES <aurelien@hackers.camp>
> Assisted-by: Claude <claude-sonnet-4-6>

Acked-by: Md Haris Iqbal <haris.iqbal@ionos.com>

> ---
>  drivers/infiniband/ulp/rtrs/rtrs-srv.c | 12 ++++++++++--
>  1 file changed, 10 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c
> index 6482ad859..f2fd80c8a 100644
> --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c
> +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c
> @@ -1059,6 +1059,11 @@ static void process_read(struct rtrs_srv_con *con,
>                             "Processing read request failed, invalid message\n");
>                 return;
>         }
> +       usr_len = le16_to_cpu(msg->usr_len);
> +       if (usr_len > off) {
> +               pr_debug("rtrs-srv: Invalid usr_len %zu > off %u\n", usr_len, off);
> +               return;
> +       }
>         rtrs_srv_get_ops_ids(srv_path);
>         rtrs_srv_update_rdma_stats(srv_path->stats, off, READ);
>         id = srv_path->ops_ids[buf_id];
> @@ -1066,7 +1071,6 @@ static void process_read(struct rtrs_srv_con *con,
>         id->dir         = READ;
>         id->msg_id      = buf_id;
>         id->rd_msg      = msg;
> -       usr_len = le16_to_cpu(msg->usr_len);
>         data_len = off - usr_len;
>         data = page_address(srv->chunks[buf_id]);
>         ret = ctx->ops.rdma_ev(srv->priv, id, data, data_len,
> @@ -1112,6 +1116,11 @@ static void process_write(struct rtrs_srv_con *con,
>                              rtrs_srv_state_str(srv_path->state));
>                 return;
>         }
> +       usr_len = le16_to_cpu(req->usr_len);
> +       if (usr_len > off) {
> +               pr_debug("rtrs-srv: Invalid usr_len %zu > off %u\n", usr_len, off);
> +               return;
> +       }
>         rtrs_srv_get_ops_ids(srv_path);
>         rtrs_srv_update_rdma_stats(srv_path->stats, off, WRITE);
>         id = srv_path->ops_ids[buf_id];
> @@ -1119,7 +1128,6 @@ static void process_write(struct rtrs_srv_con *con,
>         id->dir    = WRITE;
>         id->msg_id = buf_id;
>
> -       usr_len = le16_to_cpu(req->usr_len);
>         data_len = off - usr_len;
>         data = page_address(srv->chunks[buf_id]);
>         ret = ctx->ops.rdma_ev(srv->priv, id, data, data_len,
> --
> 2.43.0
>

^ permalink raw reply

page: next (older)
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox