Netdev List
 help / color / mirror / Atom feed
From: Tariq Toukan <tariqt@nvidia.com>
To: Eric Dumazet <edumazet@google.com>,
	Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
	Andrew Lunn <andrew+netdev@lunn.ch>,
	"David S. Miller" <davem@davemloft.net>
Cc: Saeed Mahameed <saeedm@nvidia.com>,
	Leon Romanovsky <leon@kernel.org>,
	Tariq Toukan <tariqt@nvidia.com>, Mark Bloch <mbloch@nvidia.com>,
	Shay Drory <shayd@nvidia.com>, Or Har-Toov <ohartoov@nvidia.com>,
	Edward Srouji <edwards@nvidia.com>,
	Simon Horman <horms@kernel.org>,
	Maher Sanalla <msanalla@nvidia.com>,
	Parav Pandit <parav@nvidia.com>, Kees Cook <kees@kernel.org>,
	Moshe Shemesh <moshe@nvidia.com>,
	Patrisious Haddad <phaddad@nvidia.com>, <netdev@vger.kernel.org>,
	<linux-rdma@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
	Gal Pressman <gal@nvidia.com>
Subject: [PATCH net-next 04/15] net/mlx5: SD, make primary/secondary role determination more robust
Date: Thu, 4 Jun 2026 14:44:44 +0300	[thread overview]
Message-ID: <20260604114455.434711-5-tariqt@nvidia.com> (raw)
In-Reply-To: <20260604114455.434711-1-tariqt@nvidia.com>

From: Shay Drory <shayd@nvidia.com>

Refactor SD group registration to determine the primary/secondary roles
through devcom events. SD is now marked as ready only after the roles are
fully assigned and the group state is consistent, so that outside accessors,
which will be added in subsequent patches, are safe to use without races.

The devcom events:
- SD_PRIMARY_SET event: each device compares bus numbers with peers
  to determine which should be primary
- SD_SECONDARIES_SET event: secondaries register themselves with the
  elected primary device

Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
 .../net/ethernet/mellanox/mlx5/core/lib/sd.c  | 137 +++++++++++++-----
 1 file changed, 102 insertions(+), 35 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
index 25286ecd724e..41979bf6a615 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
@@ -26,6 +26,8 @@ struct mlx5_sd {
 		struct { /* primary */
 			struct mlx5_core_dev *secondaries[MLX5_SD_MAX_GROUP_SZ - 1];
 			struct mlx5_flow_table *tx_ft;
+			/* Next index for secondary registration */
+			u8 next_secondary_idx;
 		};
 		struct { /* secondary */
 			struct mlx5_core_dev *primary_dev;
@@ -374,62 +376,125 @@ static void sd_lag_cleanup(struct mlx5_core_dev *dev)
 	mutex_unlock(&ldev->lock);
 }
 
+enum {
+	SD_PRIMARY_SET,
+	SD_SECONDARIES_SET,
+};
+
+static void sd_handle_primary_set(struct mlx5_core_dev *dev,
+				  struct mlx5_core_dev *peer)
+{
+	struct mlx5_sd *peer_sd = mlx5_get_sd(peer);
+	struct mlx5_sd *sd = mlx5_get_sd(dev);
+	struct mlx5_core_dev *candidate;
+	struct mlx5_sd *candidate_sd;
+
+	/* Peer is the device that is being sent to all the other devices in
+	 * the group. Hence, use peer to get the candidate device.
+	 */
+	candidate = peer_sd->primary ? peer : peer_sd->primary_dev;
+
+	if (dev->pdev->bus->number >= candidate->pdev->bus->number)
+		return;
+
+	candidate_sd = mlx5_get_sd(candidate);
+
+	sd->primary = true;
+	candidate_sd->primary = false;
+	candidate_sd->primary_dev = dev;
+	peer_sd->primary = false;
+	peer_sd->primary_dev = dev;
+}
+
+static void sd_handle_secondaries_set(struct mlx5_core_dev *dev,
+				      struct mlx5_core_dev *peer)
+{
+	struct mlx5_sd *peer_sd = mlx5_get_sd(peer);
+	struct mlx5_sd *sd = mlx5_get_sd(dev);
+	u8 idx;
+
+	/* Primary has nothing to register with itself. */
+	if (sd->primary)
+		return;
+
+	/* dev is a secondary device, peer is the primary device.
+	 * Secondary registers itself with the primary.
+	 */
+	idx = peer_sd->next_secondary_idx++;
+	peer_sd->secondaries[idx] = dev;
+	sd->primary_dev = peer;
+}
+
+static int mlx5_sd_devcom_event(int event, void *my_data, void *event_data)
+{
+	struct mlx5_core_dev *peer = event_data;
+	struct mlx5_core_dev *dev = my_data;
+
+	switch (event) {
+	case SD_PRIMARY_SET:
+		sd_handle_primary_set(dev, peer);
+		break;
+	case SD_SECONDARIES_SET:
+		sd_handle_secondaries_set(dev, peer);
+		break;
+	}
+
+	return 0;
+}
+
 static int sd_register(struct mlx5_core_dev *dev)
 {
-	struct mlx5_devcom_comp_dev *devcom, *pos;
 	struct mlx5_devcom_match_attr attr = {};
-	struct mlx5_core_dev *peer, *primary;
-	struct mlx5_sd *sd, *primary_sd;
-	int err, i;
+	struct mlx5_devcom_comp_dev *devcom;
+	struct mlx5_core_dev *primary;
+	struct mlx5_sd *sd;
+	int err;
 
 	sd = mlx5_get_sd(dev);
 	attr.key.val = sd->group_id;
 	attr.flags = MLX5_DEVCOM_MATCH_FLAGS_NS;
 	attr.net = mlx5_core_net(dev);
-	devcom = mlx5_devcom_register_component(dev->priv.devc, MLX5_DEVCOM_SD_GROUP,
-						&attr, NULL, dev);
+	devcom = mlx5_devcom_register_component(dev->priv.devc,
+						MLX5_DEVCOM_SD_GROUP,
+						&attr, mlx5_sd_devcom_event,
+						dev);
 	if (!devcom)
 		return -EINVAL;
 
 	sd->devcom = devcom;
 
-	if (mlx5_devcom_comp_get_size(devcom) != sd->host_buses)
-		return 0;
-
 	mlx5_devcom_comp_lock(devcom);
-	mlx5_devcom_comp_set_ready(devcom, true);
-	mlx5_devcom_comp_unlock(devcom);
+	if (mlx5_devcom_comp_get_size(devcom) != sd->host_buses ||
+	    mlx5_devcom_comp_is_ready(devcom))
+		goto out;
 
-	if (!mlx5_devcom_for_each_peer_begin(devcom)) {
-		err = -ENODEV;
+	/* Send SD_PRIMARY_SET event with this device.
+	 * All peers will receive this event and compare to this device.
+	 * The one with lowest bus number will be marked as primary.
+	 */
+	sd->primary = true;
+	err = mlx5_devcom_locked_send_event(devcom, SD_PRIMARY_SET,
+					    SD_PRIMARY_SET, dev);
+	if (err)
 		goto err_devcom_unreg;
-	}
-
-	primary = dev;
-	mlx5_devcom_for_each_peer_entry(devcom, peer, pos)
-		if (peer->pdev->bus->number < primary->pdev->bus->number)
-			primary = peer;
 
-	primary_sd = mlx5_get_sd(primary);
-	primary_sd->primary = true;
-	i = 0;
-	/* loop the secondaries */
-	mlx5_devcom_for_each_peer_entry(primary_sd->devcom, peer, pos) {
-		struct mlx5_sd *peer_sd = mlx5_get_sd(peer);
-
-		primary_sd->secondaries[i++] = peer;
-		peer_sd->primary = false;
-		peer_sd->primary_dev = primary;
-	}
+	/* Broadcast SD_SECONDARIES_SET. Each non-sender peer's handler runs;
+	 * the primary's handler returns early so only secondaries register.
+	 */
+	primary = sd->primary ? dev : sd->primary_dev;
+	if (!sd->primary)
+		sd_handle_secondaries_set(dev, primary);
+	mlx5_devcom_locked_send_event(devcom, SD_SECONDARIES_SET,
+				      DEVCOM_CANT_FAIL, primary);
 
-	mlx5_devcom_for_each_peer_end(devcom);
+	mlx5_devcom_comp_set_ready(devcom, true);
+out:
+	mlx5_devcom_comp_unlock(devcom);
 	return 0;
 
 err_devcom_unreg:
-	mlx5_devcom_comp_lock(sd->devcom);
-	mlx5_devcom_comp_set_ready(sd->devcom, false);
-	mlx5_devcom_comp_unlock(sd->devcom);
-	mlx5_devcom_unregister_component(sd->devcom);
+	mlx5_devcom_comp_unlock(devcom);
+	mlx5_devcom_unregister_component(devcom);
 	return err;
 }
 
@@ -672,6 +737,7 @@ int mlx5_sd_init(struct mlx5_core_dev *dev)
 		peer_sd->primary_dev = NULL;
 	}
 	primary_sd->primary = false;
+	primary_sd->next_secondary_idx = 0;
 	mlx5_devcom_comp_set_ready(sd->devcom, false);
 	mlx5_devcom_comp_unlock(sd->devcom);
 	sd_unregister(dev);
@@ -719,6 +785,7 @@ void mlx5_sd_cleanup(struct mlx5_core_dev *dev)
 		peer_sd->primary_dev = NULL;
 	}
 	primary_sd->primary = false;
+	primary_sd->next_secondary_idx = 0;
 out_ready_false:
 	mlx5_devcom_comp_set_ready(sd->devcom, false);
 out_unlock:
-- 
2.44.0


  parent reply	other threads:[~2026-06-04 11:46 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-04 11:44 [PATCH net-next 00/15] net/mlx5: Add switchdev mode support for Socket Direct single netdev, part 2/2 Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 01/15] net/mlx5: E-Switch, skip uplink IB rep load for SD secondary devices Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 02/15] net/mlx5: devcom, expose locked variant of send_event Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 03/15] net/mlx5: devcom, add DEVCOM_CANT_FAIL for non-rollback events Tariq Toukan
2026-06-04 11:44 ` Tariq Toukan [this message]
2026-06-04 11:44 ` [PATCH net-next 05/15] net/mlx5: SD, add L2 table silent mode query support Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 06/15] net/mlx5: SD, expend vport metadata for SD secondary devices Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 07/15] net/mlx5: SD, support switchdev mode transition with shared FDB Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 08/15] net/mlx5: E-Switch, notify SD on eswitch disable Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 09/15] net/mlx5: LAG, store demux resources per master lag_func Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 10/15] net/mlx5: LAG, disable both regular and SD LAG on lag_disable_change Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 11/15] net/mlx5: LAG, introduce software vport LAG implementation Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 12/15] net/mlx5: LAG, add MPESW over SD LAG support Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 13/15] net/mlx5: E-Switch, defer rep load while SD LAG is not active Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 14/15] net/mlx5: SD, defer vport metadata init until SD is ready Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 15/15] net/mlx5: SD, enable SD over ECPF and allow switchdev transition Tariq Toukan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260604114455.434711-5-tariqt@nvidia.com \
    --to=tariqt@nvidia.com \
    --cc=andrew+netdev@lunn.ch \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=edwards@nvidia.com \
    --cc=gal@nvidia.com \
    --cc=horms@kernel.org \
    --cc=kees@kernel.org \
    --cc=kuba@kernel.org \
    --cc=leon@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-rdma@vger.kernel.org \
    --cc=mbloch@nvidia.com \
    --cc=moshe@nvidia.com \
    --cc=msanalla@nvidia.com \
    --cc=netdev@vger.kernel.org \
    --cc=ohartoov@nvidia.com \
    --cc=pabeni@redhat.com \
    --cc=parav@nvidia.com \
    --cc=phaddad@nvidia.com \
    --cc=saeedm@nvidia.com \
    --cc=shayd@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox