Linux RDMA and InfiniBand development
 help / color / mirror / Atom feed
From: Shay Drori <shayd@nvidia.com>
To: Tariq Toukan <tariqt@nvidia.com>,
	Eric Dumazet <edumazet@google.com>,
	Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
	"Andrew Lunn" <andrew+netdev@lunn.ch>,
	"David S. Miller" <davem@davemloft.net>
Cc: Saeed Mahameed <saeedm@nvidia.com>,
	Leon Romanovsky <leon@kernel.org>, Mark Bloch <mbloch@nvidia.com>,
	Or Har-Toov <ohartoov@nvidia.com>,
	"Edward Srouji" <edwards@nvidia.com>,
	Simon Horman <horms@kernel.org>,
	Maher Sanalla <msanalla@nvidia.com>,
	Parav Pandit <parav@nvidia.com>, Kees Cook <kees@kernel.org>,
	Moshe Shemesh <moshe@nvidia.com>,
	Patrisious Haddad <phaddad@nvidia.com>, <netdev@vger.kernel.org>,
	<linux-rdma@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
	Gal Pressman <gal@nvidia.com>
Subject: Re: [PATCH net-next 12/15] net/mlx5: LAG, add MPESW over SD LAG support
Date: Sun, 7 Jun 2026 14:20:39 +0300	[thread overview]
Message-ID: <2f076488-3d1a-4ae1-aa24-e9b80be08ec7@nvidia.com> (raw)
In-Reply-To: <20260604114455.434711-13-tariqt@nvidia.com>



On 04/06/2026 14:44, Tariq Toukan wrote:
> From: Shay Drory <shayd@nvidia.com>
> 
> Enable MPESW LAG creation over SD LAG members, forming a composite LAG
> hierarchy. This allows bonding multiple SD groups together under a
> single MPESW configuration with shared FDB.
> 
> When enabling composite MPESW, the individual SD LAG shared FDB
> configurations are temporarily torn down and recreated when the
> composite LAG is disabled.
> 
> Signed-off-by: Shay Drory <shayd@nvidia.com>
> Reviewed-by: Mark Bloch <mbloch@nvidia.com>
> Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
> ---
>   .../net/ethernet/mellanox/mlx5/core/lag/lag.c |  6 ++
>   .../net/ethernet/mellanox/mlx5/core/lag/lag.h |  8 ++
>   .../ethernet/mellanox/mlx5/core/lag/mpesw.c   | 95 +++++++++++++++++--
>   .../ethernet/mellanox/mlx5/core/lag/mpesw.h   |  4 +
>   4 files changed, 105 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
> index 9566fbf59fdb..25a9012e3014 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
> @@ -2545,6 +2545,7 @@ void mlx5_lag_disable_change(struct mlx5_core_dev *dev)
>   	struct mlx5_core_dev *primary;
>   	struct mlx5_lag *ldev;
>   	struct lag_func *pf;
> +	bool mpesw;
>   	int i;
>   
>   	ldev = mlx5_lag_dev(dev);
> @@ -2553,6 +2554,9 @@ void mlx5_lag_disable_change(struct mlx5_core_dev *dev)
>   
>   	primary = mlx5_sd_get_primary(dev) ?: dev;
>   	mlx5_devcom_comp_lock(primary->priv.hca_devcom_comp);
> +	mpesw = ldev->mode == MLX5_LAG_MODE_MPESW;
> +	if (mpesw)
> +		mlx5_mpesw_sd_devcoms_lock(ldev);
>   	mutex_lock(&ldev->lock);
>   
>   	ldev->mode_changes_in_progress++;
> @@ -2564,6 +2568,8 @@ void mlx5_lag_disable_change(struct mlx5_core_dev *dev)
>   	}
>   
>   	mutex_unlock(&ldev->lock);
> +	if (mpesw)
> +		mlx5_mpesw_sd_devcoms_unlock(ldev);
>   	mlx5_devcom_comp_unlock(primary->priv.hca_devcom_comp);
>   
>   	if (!sd_devcom)
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
> index 34350b0a7307..3a90d360d724 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
> @@ -157,6 +157,14 @@ __mlx5_lag_is_sd(struct mlx5_lag *ldev, struct mlx5_core_dev *dev)
>   	return pf && pf->group_id != 0;
>   }
>   
> +static inline bool
> +__mlx5_lag_dev_is_port(struct mlx5_lag *ldev, struct mlx5_core_dev *dev)
> +{
> +	struct lag_func *pf = mlx5_lag_pf_by_dev(ldev, dev);
> +
> +	return pf && xa_get_mark(&ldev->pfs, pf->idx, MLX5_LAG_XA_MARK_PORT);
> +}
> +
>   static inline bool
>   __mlx5_lag_is_active(struct mlx5_lag *ldev)
>   {
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
> index 2cb44084e239..50bfb450c71e 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
> @@ -15,7 +15,7 @@ static void mlx5_mpesw_metadata_cleanup(struct mlx5_lag *ldev)
>   	u32 pf_metadata;
>   	int i;
>   
> -	mlx5_ldev_for_each(i, 0, ldev) {
> +	mlx5_lag_for_each(i, 0, ldev, MLX5_LAG_FILTER_ALL) {
>   		dev = mlx5_lag_pf(ldev, i)->dev;
>   		esw = dev->priv.eswitch;
>   		pf_metadata = ldev->lag_mpesw.pf_metadata[i];
> @@ -36,7 +36,7 @@ static int mlx5_mpesw_metadata_set(struct mlx5_lag *ldev)
>   	u32 pf_metadata;
>   	int i, err;
>   
> -	mlx5_ldev_for_each(i, 0, ldev) {
> +	mlx5_lag_for_each(i, 0, ldev, MLX5_LAG_FILTER_ALL) {
>   		dev = mlx5_lag_pf(ldev, i)->dev;
>   		esw = dev->priv.eswitch;
>   		pf_metadata = mlx5_esw_match_metadata_alloc(esw);
> @@ -52,7 +52,7 @@ static int mlx5_mpesw_metadata_set(struct mlx5_lag *ldev)
>   			goto err_metadata;
>   	}
>   
> -	mlx5_ldev_for_each(i, 0, ldev) {
> +	mlx5_lag_for_each(i, 0, ldev, MLX5_LAG_FILTER_ALL) {
>   		dev = mlx5_lag_pf(ldev, i)->dev;
>   		mlx5_notifier_call_chain(dev->priv.events, MLX5_DEV_EVENT_MULTIPORT_ESW,
>   					 (void *)0);
> @@ -65,6 +65,48 @@ static int mlx5_mpesw_metadata_set(struct mlx5_lag *ldev)
>   	return err;
>   }
>   
> +static void mlx5_mpesw_restore_sd_fdb(struct mlx5_lag *ldev)
> +{
> +	struct lag_func *pf;
> +	int err, i;
> +
> +	mlx5_ldev_for_each(i, 0, ldev) {
> +		pf = mlx5_lag_pf(ldev, i);
> +		err = mlx5_lag_shared_fdb_create(ldev, NULL, 0, pf->group_id);

sashiko.dev says:

Does this accidentally activate global FW LAG for non-SD ports?
This loop unconditionally calls mlx5_lag_shared_fdb_create() using
pf->group_id. For standalone non-SD ports, pf->group_id is 0.
Inside mlx5_lag_shared_fdb_create(), passing group_id = 0 sets the filter
to MLX5_LAG_FILTER_ALL, which bypasses the SD-specific single-FDB path and
incorrectly triggers a full global LAG activation via mlx5_activate_lag().
Should there be a check for pf->group_id != 0 or pf->sd_fdb_active here,
similar to the check in mlx5_mpesw_teardown_sd_fdb()?


pf->sd_fdb_active is false here by definition-we are re-creating it.
And this API is already gated by pf->group_id check.
When MPESW is destroyed, all PFs have group ID (if this is SD NIC)

> +		if (err)
> +			mlx5_core_warn(pf->dev,
> +				       "Failed to restore SD shared FDB (%d)\n",
> +				       err);
> +	}
> +}
> +
> +static int mlx5_mpesw_teardown_sd_fdb(struct mlx5_lag *ldev)
> +{
> +	struct lag_func *pf;
> +	int i;
> +
> +	mlx5_ldev_for_each(i, 0, ldev) {
> +		pf = mlx5_lag_pf(ldev, i);
> +		if (!pf->sd_fdb_active)
> +			continue;
> +		mlx5_lag_shared_fdb_destroy(ldev, pf->group_id);
> +	}
> +	return 0;
> +}
> +
> +static bool mlx5_lag_has_sd_group(struct mlx5_lag *ldev)
> +{
> +	struct lag_func *pf;
> +	int i;
> +
> +	mlx5_ldev_for_each(i, 0, ldev) {
> +		pf = mlx5_lag_pf(ldev, i);
> +		if (pf->group_id)
> +			return true;
> +	}
> +	return false;
> +}
> +
>   static int mlx5_lag_enable_mpesw(struct mlx5_lag *ldev)
>   {
>   	int idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1);
> @@ -92,10 +134,17 @@ static int mlx5_lag_enable_mpesw(struct mlx5_lag *ldev)
>   	if (err)
>   		return err;
>   
> +	if (mlx5_lag_has_sd_group(ldev))
> +		mlx5_mpesw_teardown_sd_fdb(ldev);
> +
>   	err = mlx5_lag_shared_fdb_create(ldev, NULL, MLX5_LAG_MODE_MPESW,
>   					 MLX5_LAG_FILTER_ALL);
>   	if (err) {
> -		mlx5_core_warn(dev0, "Failed to create LAG in MPESW mode (%d)\n", err);
> +		mlx5_core_warn(dev0,
> +			       "Failed to create LAG in MPESW mode (%d)\n",
> +			       err);
> +		if (mlx5_lag_has_sd_group(ldev))
> +			mlx5_mpesw_restore_sd_fdb(ldev);
>   		mlx5_mpesw_metadata_cleanup(ldev);
>   		return err;
>   	}
> @@ -105,9 +154,36 @@ static int mlx5_lag_enable_mpesw(struct mlx5_lag *ldev)
>   
>   void mlx5_lag_disable_mpesw(struct mlx5_lag *ldev)
>   {
> -	if (ldev->mode == MLX5_LAG_MODE_MPESW) {
> -		mlx5_mpesw_metadata_cleanup(ldev);
> -		mlx5_lag_shared_fdb_destroy(ldev, MLX5_LAG_FILTER_ALL);
> +	if (ldev->mode != MLX5_LAG_MODE_MPESW)
> +		return;
> +
> +	mlx5_mpesw_metadata_cleanup(ldev);
> +	mlx5_lag_shared_fdb_destroy(ldev, MLX5_LAG_FILTER_ALL);
> +	if (mlx5_lag_has_sd_group(ldev))
> +		mlx5_mpesw_restore_sd_fdb(ldev);
> +}
> +
> +void mlx5_mpesw_sd_devcoms_lock(struct mlx5_lag *ldev)
> +{
> +	struct mlx5_devcom_comp_dev *sd_devcom;
> +	int i;
> +
> +	mlx5_ldev_for_each(i, 0, ldev) {
> +		sd_devcom = mlx5_sd_get_devcom(mlx5_lag_pf(ldev, i)->dev);

sashiko.dev says:
Can this lead to a use-after-free on the pf pointer?
The mlx5_ldev_for_each() loop iterates over the ldev->pfs xarray, 
fetching pf
pointers via mlx5_lag_pf() and dereferencing pf->dev.
However, this helper is called in mlx5_lag_disable_change() before acquiring
ldev->lock and without an explicit rcu_read_lock(). Because xa_load() drops
its internal RCU locks before returning, the returned pf pointer is 
completely
unprotected.
If a concurrent device unload via mlx5_lag_remove_mdev() acquires 
ldev->lock,
calls xa_erase(), and frees the pf object with kfree(pf), would this access
to pf->dev trigger a use-after-free?


No. mlx5_mpesw_sd_devcoms_lock() runs only when ldev->mode == MPESW
(checked under the held HCA devcom_comp). mlx5_lag_remove_mdev() is
reachable only after mlx5_eswitch_disable()/mlx5_lag_disable_change()
has torn down MPESW for that LAG (setting ldev->mode = NONE).
The shared HCA devcom_comp serializes the two paths, so any thread
that reaches remove_mdev's xa_erase has already moved ldev->mode out
of MPESW, and any concurrent disable_change waiting on HCA devcom
will then see mpesw=false and skip the iteration.


> +		if (sd_devcom)
> +			mlx5_devcom_comp_lock(sd_devcom);
> +	}
> +}
> +
> +void mlx5_mpesw_sd_devcoms_unlock(struct mlx5_lag *ldev)
> +{
> +	struct mlx5_devcom_comp_dev *sd_devcom;
> +	int i;
> +
> +	mlx5_ldev_for_each_reverse(i, MLX5_MAX_PORTS, 0, ldev) {
> +		sd_devcom = mlx5_sd_get_devcom(mlx5_lag_pf(ldev, i)->dev);
> +		if (sd_devcom)
> +			mlx5_devcom_comp_unlock(sd_devcom);
>   	}
>   }
>   
> @@ -122,6 +198,7 @@ static void mlx5_mpesw_work(struct work_struct *work)
>   		return;
>   
>   	mlx5_devcom_comp_lock(devcom);
> +	mlx5_mpesw_sd_devcoms_lock(ldev);
>   	mutex_lock(&ldev->lock);
>   	if (ldev->mode_changes_in_progress) {
>   		mpesww->result = -EAGAIN;
> @@ -134,6 +211,7 @@ static void mlx5_mpesw_work(struct work_struct *work)
>   		mlx5_lag_disable_mpesw(ldev);
>   unlock:
>   	mutex_unlock(&ldev->lock);
> +	mlx5_mpesw_sd_devcoms_unlock(ldev);
>   	mlx5_devcom_comp_unlock(devcom);
>   	complete(&mpesww->comp);
>   }
> @@ -199,7 +277,8 @@ bool mlx5_lag_is_mpesw(struct mlx5_core_dev *dev)
>   {
>   	struct mlx5_lag *ldev = mlx5_lag_dev(dev);
>   
> -	return ldev && ldev->mode == MLX5_LAG_MODE_MPESW;
> +	return ldev && ldev->mode == MLX5_LAG_MODE_MPESW &&
> +	       __mlx5_lag_dev_is_port(ldev, dev);
>   }
>   EXPORT_SYMBOL(mlx5_lag_is_mpesw);
>   
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h
> index b767dbb4f457..5099723ba0f7 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h
> @@ -33,8 +33,12 @@ void mlx5_lag_mpesw_disable(struct mlx5_core_dev *dev);
>   int mlx5_lag_mpesw_enable(struct mlx5_core_dev *dev);
>   #ifdef CONFIG_MLX5_ESWITCH
>   void mlx5_lag_disable_mpesw(struct mlx5_lag *ldev);
> +void mlx5_mpesw_sd_devcoms_lock(struct mlx5_lag *ldev);
> +void mlx5_mpesw_sd_devcoms_unlock(struct mlx5_lag *ldev);
>   #else
>   static inline void mlx5_lag_disable_mpesw(struct mlx5_lag *ldev) {}
> +static inline void mlx5_mpesw_sd_devcoms_lock(struct mlx5_lag *ldev) {}
> +static inline void mlx5_mpesw_sd_devcoms_unlock(struct mlx5_lag *ldev) {}
>   #endif /* CONFIG_MLX5_ESWITCH */
>   
>   #ifdef CONFIG_MLX5_ESWITCH


  reply	other threads:[~2026-06-07 11:21 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-04 11:44 [PATCH net-next 00/15] net/mlx5: Add switchdev mode support for Socket Direct single netdev, part 2/2 Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 01/15] net/mlx5: E-Switch, skip uplink IB rep load for SD secondary devices Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 02/15] net/mlx5: devcom, expose locked variant of send_event Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 03/15] net/mlx5: devcom, add DEVCOM_CANT_FAIL for non-rollback events Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 04/15] net/mlx5: SD, make primary/secondary role determination more robust Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 05/15] net/mlx5: SD, add L2 table silent mode query support Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 06/15] net/mlx5: SD, expend vport metadata for SD secondary devices Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 07/15] net/mlx5: SD, support switchdev mode transition with shared FDB Tariq Toukan
2026-06-07 11:03   ` Shay Drori
2026-06-04 11:44 ` [PATCH net-next 08/15] net/mlx5: E-Switch, notify SD on eswitch disable Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 09/15] net/mlx5: LAG, store demux resources per master lag_func Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 10/15] net/mlx5: LAG, disable both regular and SD LAG on lag_disable_change Tariq Toukan
2026-06-07 11:07   ` Shay Drori
2026-06-04 11:44 ` [PATCH net-next 11/15] net/mlx5: LAG, introduce software vport LAG implementation Tariq Toukan
2026-06-07 11:12   ` Shay Drori
2026-06-04 11:44 ` [PATCH net-next 12/15] net/mlx5: LAG, add MPESW over SD LAG support Tariq Toukan
2026-06-07 11:20   ` Shay Drori [this message]
2026-06-04 11:44 ` [PATCH net-next 13/15] net/mlx5: E-Switch, defer rep load while SD LAG is not active Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 14/15] net/mlx5: SD, defer vport metadata init until SD is ready Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 15/15] net/mlx5: SD, enable SD over ECPF and allow switchdev transition Tariq Toukan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=2f076488-3d1a-4ae1-aa24-e9b80be08ec7@nvidia.com \
    --to=shayd@nvidia.com \
    --cc=andrew+netdev@lunn.ch \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=edwards@nvidia.com \
    --cc=gal@nvidia.com \
    --cc=horms@kernel.org \
    --cc=kees@kernel.org \
    --cc=kuba@kernel.org \
    --cc=leon@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-rdma@vger.kernel.org \
    --cc=mbloch@nvidia.com \
    --cc=moshe@nvidia.com \
    --cc=msanalla@nvidia.com \
    --cc=netdev@vger.kernel.org \
    --cc=ohartoov@nvidia.com \
    --cc=pabeni@redhat.com \
    --cc=parav@nvidia.com \
    --cc=phaddad@nvidia.com \
    --cc=saeedm@nvidia.com \
    --cc=tariqt@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox