From: Tariq Toukan <tariqt@nvidia.com>
To: Eric Dumazet <edumazet@google.com>,
Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
Andrew Lunn <andrew+netdev@lunn.ch>,
"David S. Miller" <davem@davemloft.net>
Cc: Saeed Mahameed <saeedm@nvidia.com>,
Leon Romanovsky <leon@kernel.org>,
Tariq Toukan <tariqt@nvidia.com>, Mark Bloch <mbloch@nvidia.com>,
Shay Drory <shayd@nvidia.com>, Or Har-Toov <ohartoov@nvidia.com>,
Edward Srouji <edwards@nvidia.com>,
Simon Horman <horms@kernel.org>,
Maher Sanalla <msanalla@nvidia.com>,
Parav Pandit <parav@nvidia.com>, Kees Cook <kees@kernel.org>,
Moshe Shemesh <moshe@nvidia.com>,
Patrisious Haddad <phaddad@nvidia.com>, <netdev@vger.kernel.org>,
<linux-rdma@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
Gal Pressman <gal@nvidia.com>
Subject: [PATCH net-next 09/15] net/mlx5: LAG, store demux resources per master lag_func
Date: Thu, 4 Jun 2026 14:44:49 +0300 [thread overview]
Message-ID: <20260604114455.434711-10-tariqt@nvidia.com> (raw)
In-Reply-To: <20260604114455.434711-1-tariqt@nvidia.com>
From: Shay Drory <shayd@nvidia.com>
The lag demux resources (flow table, flow group, and rules xarray)
are stored on the shared ldev. With Socket Direct, multiple SD groups
each create their own demux FT/FG during their master's IB device
initialization. Since they all write to the same ldev fields, the
second group's init overwrites the first group's pointers, leaking
the first group's FT/FG.
During teardown, the cleanup uses the overwritten pointers, destroying
the wrong group's resources and leaving leaked flow tables in the LAG
namespace. These leaked tables can interfere with subsequently created
demux tables.
Move the demux resources from the shared ldev to per-master lag_func
instances, so that each master device owns its own independent demux
state. Init and cleanup operate on the lag_func of the device passed
in (resolved with mlx5_lag_pf_by_dev()), while the rule_add and
rule_del helpers look up the appropriate master's lag_func via the
existing filter/group infrastructure.
Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
.../net/ethernet/mellanox/mlx5/core/lag/lag.c | 95 +++++++++++++------
.../net/ethernet/mellanox/mlx5/core/lag/lag.h | 7 +-
2 files changed, 68 insertions(+), 34 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
index dd3f18f85466..e23c1e81b98f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
@@ -1590,7 +1590,7 @@ struct mlx5_devcom_comp_dev *mlx5_lag_get_devcom_comp(struct mlx5_lag *ldev)
static int mlx5_lag_demux_ft_fg_init(struct mlx5_core_dev *dev,
struct mlx5_flow_table_attr *ft_attr,
- struct mlx5_lag *ldev)
+ struct lag_func *pf)
{
#ifdef CONFIG_MLX5_ESWITCH
struct mlx5_flow_namespace *ns;
@@ -1601,20 +1601,20 @@ static int mlx5_lag_demux_ft_fg_init(struct mlx5_core_dev *dev,
if (!ns)
return 0;
- ldev->lag_demux_ft = mlx5_create_flow_table(ns, ft_attr);
- if (IS_ERR(ldev->lag_demux_ft))
- return PTR_ERR(ldev->lag_demux_ft);
+ pf->lag_demux_ft = mlx5_create_flow_table(ns, ft_attr);
+ if (IS_ERR(pf->lag_demux_ft))
+ return PTR_ERR(pf->lag_demux_ft);
fg = mlx5_esw_lag_demux_fg_create(dev->priv.eswitch,
- ldev->lag_demux_ft);
+ pf->lag_demux_ft);
if (IS_ERR(fg)) {
err = PTR_ERR(fg);
- mlx5_destroy_flow_table(ldev->lag_demux_ft);
- ldev->lag_demux_ft = NULL;
+ mlx5_destroy_flow_table(pf->lag_demux_ft);
+ pf->lag_demux_ft = NULL;
return err;
}
- ldev->lag_demux_fg = fg;
+ pf->lag_demux_fg = fg;
return 0;
#else
return -EOPNOTSUPP;
@@ -1623,7 +1623,7 @@ static int mlx5_lag_demux_ft_fg_init(struct mlx5_core_dev *dev,
static int mlx5_lag_demux_fw_init(struct mlx5_core_dev *dev,
struct mlx5_flow_table_attr *ft_attr,
- struct mlx5_lag *ldev)
+ struct lag_func *pf)
{
struct mlx5_flow_namespace *ns;
int err;
@@ -1632,12 +1632,12 @@ static int mlx5_lag_demux_fw_init(struct mlx5_core_dev *dev,
if (!ns)
return 0;
- ldev->lag_demux_fg = NULL;
+ pf->lag_demux_fg = NULL;
ft_attr->max_fte = 1;
- ldev->lag_demux_ft = mlx5_create_lag_demux_flow_table(ns, ft_attr);
- if (IS_ERR(ldev->lag_demux_ft)) {
- err = PTR_ERR(ldev->lag_demux_ft);
- ldev->lag_demux_ft = NULL;
+ pf->lag_demux_ft = mlx5_create_lag_demux_flow_table(ns, ft_attr);
+ if (IS_ERR(pf->lag_demux_ft)) {
+ err = PTR_ERR(pf->lag_demux_ft);
+ pf->lag_demux_ft = NULL;
return err;
}
@@ -1648,6 +1648,7 @@ int mlx5_lag_demux_init(struct mlx5_core_dev *dev,
struct mlx5_flow_table_attr *ft_attr)
{
struct mlx5_lag *ldev;
+ struct lag_func *pf;
if (!ft_attr)
return -EINVAL;
@@ -1656,12 +1657,16 @@ int mlx5_lag_demux_init(struct mlx5_core_dev *dev,
if (!ldev)
return -ENODEV;
- xa_init(&ldev->lag_demux_rules);
+ pf = mlx5_lag_pf_by_dev(ldev, dev);
+ if (!pf)
+ return -ENODEV;
+
+ xa_init(&pf->lag_demux_rules);
if (mlx5_get_sd(dev))
- return mlx5_lag_demux_ft_fg_init(dev, ft_attr, ldev);
+ return mlx5_lag_demux_ft_fg_init(dev, ft_attr, pf);
- return mlx5_lag_demux_fw_init(dev, ft_attr, ldev);
+ return mlx5_lag_demux_fw_init(dev, ft_attr, pf);
}
EXPORT_SYMBOL(mlx5_lag_demux_init);
@@ -1670,40 +1675,63 @@ void mlx5_lag_demux_cleanup(struct mlx5_core_dev *dev)
struct mlx5_flow_handle *rule;
struct mlx5_lag *ldev;
unsigned long vport_num;
+ struct lag_func *pf;
ldev = mlx5_lag_dev(dev);
if (!ldev)
return;
- xa_for_each(&ldev->lag_demux_rules, vport_num, rule)
+ pf = mlx5_lag_pf_by_dev(ldev, dev);
+ if (!pf)
+ return;
+
+ xa_for_each(&pf->lag_demux_rules, vport_num, rule)
mlx5_del_flow_rules(rule);
- xa_destroy(&ldev->lag_demux_rules);
+ xa_destroy(&pf->lag_demux_rules);
- if (ldev->lag_demux_fg)
- mlx5_destroy_flow_group(ldev->lag_demux_fg);
- if (ldev->lag_demux_ft)
- mlx5_destroy_flow_table(ldev->lag_demux_ft);
- ldev->lag_demux_fg = NULL;
- ldev->lag_demux_ft = NULL;
+ if (pf->lag_demux_fg)
+ mlx5_destroy_flow_group(pf->lag_demux_fg);
+ if (pf->lag_demux_ft)
+ mlx5_destroy_flow_table(pf->lag_demux_ft);
+ pf->lag_demux_fg = NULL;
+ pf->lag_demux_ft = NULL;
}
EXPORT_SYMBOL(mlx5_lag_demux_cleanup);
+static struct lag_func *mlx5_lag_dev_get_master_pf(struct mlx5_lag *ldev,
+ struct mlx5_core_dev *dev)
+{
+ u32 filter = mlx5_lag_get_filter(ldev, dev);
+ int idx;
+
+ idx = mlx5_lag_get_dev_index_by_seq_filter(ldev, MLX5_LAG_P1, filter);
+ if (idx < 0)
+ return NULL;
+
+ return mlx5_lag_pf(ldev, idx);
+}
+
int mlx5_lag_demux_rule_add(struct mlx5_core_dev *vport_dev, u16 vport_num,
int index)
{
struct mlx5_flow_handle *rule;
+ struct lag_func *master;
struct mlx5_lag *ldev;
int err;
ldev = mlx5_lag_dev(vport_dev);
- if (!ldev || !ldev->lag_demux_fg)
+ if (!ldev)
return 0;
- if (xa_load(&ldev->lag_demux_rules, index))
+ master = mlx5_lag_dev_get_master_pf(ldev, vport_dev);
+ if (!master || !master->lag_demux_fg)
+ return 0;
+
+ if (xa_load(&master->lag_demux_rules, index))
return 0;
rule = mlx5_esw_lag_demux_rule_create(vport_dev->priv.eswitch,
- vport_num, ldev->lag_demux_ft);
+ vport_num, master->lag_demux_ft);
if (IS_ERR(rule)) {
err = PTR_ERR(rule);
mlx5_core_warn(vport_dev,
@@ -1712,7 +1740,7 @@ int mlx5_lag_demux_rule_add(struct mlx5_core_dev *vport_dev, u16 vport_num,
return err;
}
- err = xa_err(xa_store(&ldev->lag_demux_rules, index, rule,
+ err = xa_err(xa_store(&master->lag_demux_rules, index, rule,
GFP_KERNEL));
if (err) {
mlx5_del_flow_rules(rule);
@@ -1728,13 +1756,18 @@ EXPORT_SYMBOL(mlx5_lag_demux_rule_add);
void mlx5_lag_demux_rule_del(struct mlx5_core_dev *dev, int index)
{
struct mlx5_flow_handle *rule;
+ struct lag_func *master_pf;
struct mlx5_lag *ldev;
ldev = mlx5_lag_dev(dev);
- if (!ldev || !ldev->lag_demux_fg)
+ if (!ldev)
+ return;
+
+ master_pf = mlx5_lag_dev_get_master_pf(ldev, dev);
+ if (!master_pf || !master_pf->lag_demux_fg)
return;
- rule = xa_erase(&ldev->lag_demux_rules, index);
+ rule = xa_erase(&master_pf->lag_demux_rules, index);
if (rule)
mlx5_del_flow_rules(rule);
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
index 0296f752bb4c..c689f1951cd8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
@@ -59,6 +59,10 @@ struct lag_func {
struct mlx5_nb port_change_nb;
u32 group_id; /* SD group ID, 0 = not SD */
bool sd_fdb_active; /* set on all SD group members */
+ /* Lag demux resources - only populated on master devices */
+ struct mlx5_flow_table *lag_demux_ft;
+ struct mlx5_flow_group *lag_demux_fg;
+ struct xarray lag_demux_rules;
};
/* Used for collection of netdev event info. */
@@ -95,9 +99,6 @@ struct mlx5_lag {
/* Protect lag fields/state changes */
struct mutex lock;
struct lag_mpesw lag_mpesw;
- struct mlx5_flow_table *lag_demux_ft;
- struct mlx5_flow_group *lag_demux_fg;
- struct xarray lag_demux_rules;
};
static inline struct mlx5_lag *
--
2.44.0
next prev parent reply other threads:[~2026-06-04 11:46 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-04 11:44 [PATCH net-next 00/15] net/mlx5: Add switchdev mode support for Socket Direct single netdev, part 2/2 Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 01/15] net/mlx5: E-Switch, skip uplink IB rep load for SD secondary devices Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 02/15] net/mlx5: devcom, expose locked variant of send_event Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 03/15] net/mlx5: devcom, add DEVCOM_CANT_FAIL for non-rollback events Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 04/15] net/mlx5: SD, make primary/secondary role determination more robust Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 05/15] net/mlx5: SD, add L2 table silent mode query support Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 06/15] net/mlx5: SD, expend vport metadata for SD secondary devices Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 07/15] net/mlx5: SD, support switchdev mode transition with shared FDB Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 08/15] net/mlx5: E-Switch, notify SD on eswitch disable Tariq Toukan
2026-06-04 11:44 ` Tariq Toukan [this message]
2026-06-04 11:44 ` [PATCH net-next 10/15] net/mlx5: LAG, disable both regular and SD LAG on lag_disable_change Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 11/15] net/mlx5: LAG, introduce software vport LAG implementation Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 12/15] net/mlx5: LAG, add MPESW over SD LAG support Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 13/15] net/mlx5: E-Switch, defer rep load while SD LAG is not active Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 14/15] net/mlx5: SD, defer vport metadata init until SD is ready Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 15/15] net/mlx5: SD, enable SD over ECPF and allow switchdev transition Tariq Toukan
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260604114455.434711-10-tariqt@nvidia.com \
--to=tariqt@nvidia.com \
--cc=andrew+netdev@lunn.ch \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=edwards@nvidia.com \
--cc=gal@nvidia.com \
--cc=horms@kernel.org \
--cc=kees@kernel.org \
--cc=kuba@kernel.org \
--cc=leon@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-rdma@vger.kernel.org \
--cc=mbloch@nvidia.com \
--cc=moshe@nvidia.com \
--cc=msanalla@nvidia.com \
--cc=netdev@vger.kernel.org \
--cc=ohartoov@nvidia.com \
--cc=pabeni@redhat.com \
--cc=parav@nvidia.com \
--cc=phaddad@nvidia.com \
--cc=saeedm@nvidia.com \
--cc=shayd@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox