Netdev List
 help / color / mirror / Atom feed
From: Tariq Toukan <tariqt@nvidia.com>
To: Eric Dumazet <edumazet@google.com>,
	Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
	Andrew Lunn <andrew+netdev@lunn.ch>,
	"David S. Miller" <davem@davemloft.net>
Cc: Saeed Mahameed <saeedm@nvidia.com>,
	Leon Romanovsky <leon@kernel.org>,
	Tariq Toukan <tariqt@nvidia.com>, Mark Bloch <mbloch@nvidia.com>,
	Shay Drory <shayd@nvidia.com>, Or Har-Toov <ohartoov@nvidia.com>,
	Edward Srouji <edwards@nvidia.com>,
	Simon Horman <horms@kernel.org>,
	Maher Sanalla <msanalla@nvidia.com>,
	Parav Pandit <parav@nvidia.com>, Kees Cook <kees@kernel.org>,
	Moshe Shemesh <moshe@nvidia.com>,
	Patrisious Haddad <phaddad@nvidia.com>, <netdev@vger.kernel.org>,
	<linux-rdma@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
	Gal Pressman <gal@nvidia.com>
Subject: [PATCH net-next 07/15] net/mlx5: SD, support switchdev mode transition with shared FDB
Date: Thu, 4 Jun 2026 14:44:47 +0300	[thread overview]
Message-ID: <20260604114455.434711-8-tariqt@nvidia.com> (raw)
In-Reply-To: <20260604114455.434711-1-tariqt@nvidia.com>

From: Shay Drory <shayd@nvidia.com>

When the eswitch mode changes, propagate the change to SD: secondaries
get their TX flow table root reconfigured for the new mode, and once
all devices in the group have moved to switchdev, the per-group shared
FDB is activated.

Shared FDB activation is best-effort: a failure does not block the
eswitch mode transition, and activation is retried on the next one.

Note: the existing mlx5_get_sd() guard that blocks switchdev for SD
devices is intentionally retained. It will be removed once all
supporting patches are in place.

Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
 .../mellanox/mlx5/core/eswitch_offloads.c     |  24 +++-
 .../net/ethernet/mellanox/mlx5/core/lib/sd.c  | 133 +++++++++++++++++-
 .../net/ethernet/mellanox/mlx5/core/lib/sd.h  |   7 +
 3 files changed, 156 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 366531d8ef02..1133267a53fb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -46,6 +46,7 @@
 #include "fs_core.h"
 #include "lib/mlx5.h"
 #include "lib/devcom.h"
+#include "lib/sd.h"
 #include "lib/eq.h"
 #include "lib/fs_chains.h"
 #include "en_tc.h"
@@ -3164,6 +3165,9 @@ static void esw_unset_master_egress_rule(struct mlx5_core_dev *dev,
 	vport = mlx5_eswitch_get_vport(dev->priv.eswitch,
 				       dev->priv.eswitch->manager_vport);
 
+	if (!vport->egress.acl)
+		return;
+
 	esw_acl_egress_ofld_bounce_rule_destroy(vport, MLX5_CAP_GEN(slave_dev, vhca_id));
 
 	if (xa_empty(&vport->egress.offloads.bounce_rules)) {
@@ -3182,6 +3186,9 @@ int mlx5_eswitch_offloads_single_fdb_add_one(struct mlx5_eswitch *master_esw,
 	if (err)
 		return err;
 
+	if (!mlx5_sd_is_primary(slave_esw->dev))
+		return 0;
+
 	err = esw_set_master_egress_rule(master_esw->dev,
 					 slave_esw->dev, max_slaves);
 	if (err)
@@ -3401,7 +3408,7 @@ void mlx5_esw_offloads_devcom_init(struct mlx5_eswitch *esw,
 		return;
 
 	if ((MLX5_VPORT_MANAGER(esw->dev) || mlx5_core_is_ecpf_esw_manager(esw->dev)) &&
-	    !mlx5_lag_is_supported(esw->dev))
+	    (!mlx5_lag_is_supported(esw->dev) && !mlx5_get_sd(esw->dev)))
 		return;
 
 	xa_init(&esw->paired);
@@ -4219,11 +4226,6 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
 	if (IS_ERR(esw))
 		return PTR_ERR(esw);
 
-	if (mlx5_fw_reset_in_progress(esw->dev)) {
-		NL_SET_ERR_MSG_MOD(extack, "Can't change eswitch mode during firmware reset");
-		return -EBUSY;
-	}
-
 	if (esw_mode_from_devlink(mode, &mlx5_mode))
 		return -EINVAL;
 
@@ -4233,11 +4235,18 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
 		return -EPERM;
 	}
 
+	if (mlx5_fw_reset_in_progress(esw->dev)) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Can't change eswitch mode during firmware reset");
+		return -EBUSY;
+	}
+
 	/* Avoid try_lock, active/inactive mode change is not restricted */
 	if (mlx5_devlink_switchdev_active_mode_change(esw, mode))
 		return 0;
 
 	mlx5_lag_disable_change(esw->dev);
+
 	err = mlx5_esw_try_lock(esw);
 	if (err < 0) {
 		NL_SET_ERR_MSG_MOD(extack, "Can't change mode, E-Switch is busy");
@@ -4304,6 +4313,9 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
 	esw->eswitch_operation_in_progress = false;
 unlock:
 	mlx5_esw_unlock(esw);
+	/* Shared FDB activation creates a LAG, which changes the reps. */
+	if (!err)
+		mlx5_sd_eswitch_mode_set(esw->dev, mlx5_mode);
 enable_lag:
 	mlx5_lag_enable_change(esw->dev);
 	return err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
index 8b1f3a25d80d..d2ed156ed1c6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
@@ -5,6 +5,8 @@
 #include "../lag/lag.h"
 #include "mlx5_core.h"
 #include "lib/mlx5.h"
+#include "devlink.h"
+#include "eswitch.h"
 #include "fs_cmd.h"
 #include <linux/mlx5/eswitch.h>
 #include <linux/mlx5/vport.h>
@@ -33,6 +35,8 @@ struct mlx5_sd {
 		struct { /* secondary */
 			struct mlx5_core_dev *primary_dev;
 			u32 alias_obj_id;
+			/* TX flow table root in switchdev (silent) config */
+			bool tx_root_silent;
 		};
 	};
 };
@@ -669,6 +673,29 @@ static void sd_secondary_destroy_alias_ft(struct mlx5_core_dev *secondary)
 				   MLX5_GENERAL_OBJECT_TYPES_FLOW_TABLE_ALIAS);
 }
 
+static int mlx5_sd_secondary_conf_tx_root(struct mlx5_core_dev *secondary,
+					  bool disconnect)
+{
+	struct mlx5_sd *sd = mlx5_get_sd(secondary);
+	int err;
+
+	/* Idempotent: skip if TX root is already in the requested state. */
+	if (sd->tx_root_silent == disconnect)
+		return 0;
+
+	if (disconnect)
+		err = mlx5_fs_cmd_set_tx_flow_table_root(secondary, 0, true);
+	else
+		err = mlx5_fs_cmd_set_tx_flow_table_root(secondary,
+							 sd->alias_obj_id,
+							 false);
+	if (err)
+		return err;
+
+	sd->tx_root_silent = disconnect;
+	return 0;
+}
+
 static int sd_cmd_set_secondary(struct mlx5_core_dev *secondary,
 				struct mlx5_core_dev *primary,
 				u8 *alias_key)
@@ -688,7 +715,8 @@ static int sd_cmd_set_secondary(struct mlx5_core_dev *secondary,
 	if (err)
 		goto err_unset_silent;
 
-	err = mlx5_fs_cmd_set_tx_flow_table_root(secondary, sd->alias_obj_id, false);
+	err = mlx5_fs_cmd_set_tx_flow_table_root(secondary, sd->alias_obj_id,
+						 false);
 	if (err)
 		goto err_destroy_alias_ft;
 
@@ -707,7 +735,7 @@ static void sd_cmd_unset_secondary(struct mlx5_core_dev *secondary)
 	struct mlx5_sd *primary_sd;
 
 	primary_sd = mlx5_get_sd(mlx5_sd_get_primary(secondary));
-	mlx5_fs_cmd_set_tx_flow_table_root(secondary, 0, true);
+	mlx5_sd_secondary_conf_tx_root(secondary, true);
 	sd_secondary_destroy_alias_ft(secondary);
 	if (!primary_sd->fw_silents_secondaries)
 		mlx5_fs_cmd_set_l2table_entry_silent(secondary, 0);
@@ -936,6 +964,107 @@ struct auxiliary_device *mlx5_sd_get_adev(struct mlx5_core_dev *dev,
 	return &primary_adev->adev;
 }
 
+#ifdef CONFIG_MLX5_ESWITCH
+/* All SD members must have completed esw_offloads_enable (i.e., reached
+ * mlx5_esw_offloads_devcom_init) and become eswitch peers of the primary.
+ * Until then, mlx5_eswitch_is_peer() returns false for the not-yet-paired
+ * member and mlx5_lag_shared_fdb_supported_filter() would reject the group.
+ * When all PFs transition in parallel, only the last one to finish passes
+ * this gate; for the earlier ones it fails and activation is skipped.
+ */
+static bool mlx5_sd_all_paired(struct mlx5_core_dev *primary)
+{
+	struct mlx5_eswitch *primary_esw = primary->priv.eswitch;
+	struct mlx5_core_dev *pos;
+	int i;
+
+	mlx5_sd_for_each_secondary(i, primary, pos) {
+		if (!mlx5_eswitch_is_peer(primary_esw, pos->priv.eswitch))
+			return false;
+	}
+	return true;
+}
+
+static void mlx5_sd_activate_shared_fdb(struct mlx5_core_dev *primary)
+{
+	struct mlx5_sd *sd = mlx5_get_sd(primary);
+	struct mlx5_lag *ldev;
+	struct lag_func *pf;
+	int err;
+	int i;
+
+	if (!mlx5_sd_all_paired(primary))
+		return;
+
+	ldev = mlx5_lag_dev(primary);
+	if (!ldev) {
+		sd_warn(primary, "Shared FDB MUST have ldev\n");
+		return;
+	}
+
+	mutex_lock(&ldev->lock);
+	/* Check if SD FDB is already active for this group */
+	mlx5_lag_for_each(i, 0, ldev, sd->group_id) {
+		pf = mlx5_lag_pf(ldev, i);
+		if (pf->sd_fdb_active)
+			goto unlock;
+		break;
+	}
+
+	if (!mlx5_lag_shared_fdb_supported_filter(ldev, sd->group_id)) {
+		sd_warn(primary, "Shared FDB not supported\n");
+		goto unlock;
+	}
+
+	err = mlx5_lag_shared_fdb_create(ldev, NULL, 0, sd->group_id);
+	if (err)
+		sd_warn(primary, "Failed to create shared FDB: %d\n", err);
+	else
+		sd_info(primary, "Shared FDB created\n");
+
+unlock:
+	mutex_unlock(&ldev->lock);
+}
+
+void mlx5_sd_eswitch_mode_set(struct mlx5_core_dev *dev, u16 mlx5_mode)
+{
+	struct mlx5_core_dev *primary;
+	struct mlx5_sd *sd;
+	int err;
+
+	sd = mlx5_get_sd(dev);
+	if (!sd || !mlx5_devcom_comp_is_ready(sd->devcom))
+		return;
+
+	mlx5_devcom_comp_lock(sd->devcom);
+	if (!mlx5_devcom_comp_is_ready(sd->devcom))
+		goto unlock;
+
+	primary = mlx5_sd_get_primary(dev);
+
+	/* Secondary devices need TX root reconfiguration */
+	if (dev != primary) {
+		bool disconnect = (mlx5_mode == MLX5_ESWITCH_OFFLOADS);
+
+		err = mlx5_sd_secondary_conf_tx_root(dev, disconnect);
+		if (err) {
+			sd_warn(dev, "Failed to set TX root: %d\n", err);
+			goto unlock;
+		}
+	}
+
+	/* Try to activate shared FDB when all devices are in switchdev.
+	 * Shared FDB is optional - failure here doesn't fail the transition.
+	 */
+	if (mlx5_mode == MLX5_ESWITCH_OFFLOADS)
+		mlx5_sd_activate_shared_fdb(primary);
+
+unlock:
+	mlx5_devcom_comp_unlock(sd->devcom);
+}
+
+#endif /* CONFIG_MLX5_ESWITCH */
+
 void mlx5_sd_put_adev(struct auxiliary_device *actual_adev,
 		      struct auxiliary_device *adev)
 {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h
index 7a41adbcee71..cb88bf34079a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h
@@ -45,6 +45,13 @@ mlx5_sd_get_devcom(struct mlx5_core_dev *dev)
 }
 #endif
 
+#ifdef CONFIG_MLX5_ESWITCH
+void mlx5_sd_eswitch_mode_set(struct mlx5_core_dev *dev, u16 mlx5_mode);
+#else
+static inline void
+mlx5_sd_eswitch_mode_set(struct mlx5_core_dev *dev, u16 mlx5_mode) { return; }
+#endif
+
 #define mlx5_sd_for_each_dev_from_to(i, primary, ix_from, to, pos)	\
 	for (i = ix_from;							\
 	     (pos = mlx5_sd_primary_get_peer(primary, i)) && pos != (to); i++)
-- 
2.44.0


  parent reply	other threads:[~2026-06-04 11:46 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-04 11:44 [PATCH net-next 00/15] net/mlx5: Add switchdev mode support for Socket Direct single netdev, part 2/2 Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 01/15] net/mlx5: E-Switch, skip uplink IB rep load for SD secondary devices Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 02/15] net/mlx5: devcom, expose locked variant of send_event Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 03/15] net/mlx5: devcom, add DEVCOM_CANT_FAIL for non-rollback events Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 04/15] net/mlx5: SD, make primary/secondary role determination more robust Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 05/15] net/mlx5: SD, add L2 table silent mode query support Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 06/15] net/mlx5: SD, expend vport metadata for SD secondary devices Tariq Toukan
2026-06-04 11:44 ` Tariq Toukan [this message]
2026-06-04 11:44 ` [PATCH net-next 08/15] net/mlx5: E-Switch, notify SD on eswitch disable Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 09/15] net/mlx5: LAG, store demux resources per master lag_func Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 10/15] net/mlx5: LAG, disable both regular and SD LAG on lag_disable_change Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 11/15] net/mlx5: LAG, introduce software vport LAG implementation Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 12/15] net/mlx5: LAG, add MPESW over SD LAG support Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 13/15] net/mlx5: E-Switch, defer rep load while SD LAG is not active Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 14/15] net/mlx5: SD, defer vport metadata init until SD is ready Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 15/15] net/mlx5: SD, enable SD over ECPF and allow switchdev transition Tariq Toukan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260604114455.434711-8-tariqt@nvidia.com \
    --to=tariqt@nvidia.com \
    --cc=andrew+netdev@lunn.ch \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=edwards@nvidia.com \
    --cc=gal@nvidia.com \
    --cc=horms@kernel.org \
    --cc=kees@kernel.org \
    --cc=kuba@kernel.org \
    --cc=leon@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-rdma@vger.kernel.org \
    --cc=mbloch@nvidia.com \
    --cc=moshe@nvidia.com \
    --cc=msanalla@nvidia.com \
    --cc=netdev@vger.kernel.org \
    --cc=ohartoov@nvidia.com \
    --cc=pabeni@redhat.com \
    --cc=parav@nvidia.com \
    --cc=phaddad@nvidia.com \
    --cc=saeedm@nvidia.com \
    --cc=shayd@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox