netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Saeed Mahameed <saeed@kernel.org>
To: "David S. Miller" <davem@davemloft.net>,
	Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
	Eric Dumazet <edumazet@google.com>
Cc: Saeed Mahameed <saeedm@nvidia.com>,
	netdev@vger.kernel.org, Tariq Toukan <tariqt@nvidia.com>,
	Shay Drory <shayd@nvidia.com>, Parav Pandit <parav@nvidia.com>
Subject: [net 09/16] net/mlx5: SF: Fix probing active SFs during driver probe phase
Date: Tue, 18 Oct 2022 23:38:06 -0700	[thread overview]
Message-ID: <20221019063813.802772-10-saeed@kernel.org> (raw)
In-Reply-To: <20221019063813.802772-1-saeed@kernel.org>

From: Shay Drory <shayd@nvidia.com>

When SF devices and SF port representors are located on different
functions, unloading and reloading of SF parent driver doesn't recreate
the existing SF present in the device.
Fix it by querying SFs and probe active SFs during driver probe phase.

Fixes: 90d010b8634b ("net/mlx5: SF, Add auxiliary device support")
Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Parav Pandit <parav@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 .../ethernet/mellanox/mlx5/core/sf/dev/dev.c  | 86 +++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/sf/sf.h   | 10 +++
 2 files changed, 96 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c
index 7da012ff0d41..4fa4ed36b3ef 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c
@@ -18,6 +18,10 @@ struct mlx5_sf_dev_table {
 	phys_addr_t base_address;
 	u64 sf_bar_length;
 	struct notifier_block nb;
+	struct mutex table_lock; /* Serializes sf life cycle and vhca state change handler */
+	struct workqueue_struct *active_wq;
+	struct work_struct work;
+	u8 stop_active_wq:1;
 	struct mlx5_core_dev *dev;
 };
 
@@ -168,6 +172,7 @@ mlx5_sf_dev_state_change_handler(struct notifier_block *nb, unsigned long event_
 		return 0;
 
 	sf_index = event->function_id - base_id;
+	mutex_lock(&table->table_lock);
 	sf_dev = xa_load(&table->devices, sf_index);
 	switch (event->new_vhca_state) {
 	case MLX5_VHCA_STATE_INVALID:
@@ -191,6 +196,7 @@ mlx5_sf_dev_state_change_handler(struct notifier_block *nb, unsigned long event_
 	default:
 		break;
 	}
+	mutex_unlock(&table->table_lock);
 	return 0;
 }
 
@@ -215,6 +221,78 @@ static int mlx5_sf_dev_vhca_arm_all(struct mlx5_sf_dev_table *table)
 	return 0;
 }
 
+static void mlx5_sf_dev_add_active_work(struct work_struct *work)
+{
+	struct mlx5_sf_dev_table *table = container_of(work, struct mlx5_sf_dev_table, work);
+	u32 out[MLX5_ST_SZ_DW(query_vhca_state_out)] = {};
+	struct mlx5_core_dev *dev = table->dev;
+	u16 max_functions;
+	u16 function_id;
+	u16 sw_func_id;
+	int err = 0;
+	u8 state;
+	int i;
+
+	max_functions = mlx5_sf_max_functions(dev);
+	function_id = MLX5_CAP_GEN(dev, sf_base_id);
+	for (i = 0; i < max_functions; i++, function_id++) {
+		if (table->stop_active_wq)
+			return;
+		err = mlx5_cmd_query_vhca_state(dev, function_id, out, sizeof(out));
+		if (err)
+			/* A failure of specific vhca doesn't mean others will
+			 * fail as well.
+			 */
+			continue;
+		state = MLX5_GET(query_vhca_state_out, out, vhca_state_context.vhca_state);
+		sw_func_id = MLX5_GET(query_vhca_state_out, out, vhca_state_context.sw_function_id);
+		if (state == MLX5_VHCA_STATE_ACTIVE) {
+			mutex_lock(&table->table_lock);
+			/* Don't probe device which is already probe */
+			if (!xa_load(&table->devices, i))
+				mlx5_sf_dev_add(dev, i, function_id, sw_func_id);
+			/* There is a race where SF got inactive after the query
+			 * above. e.g.: the query returns that the state of the
+			 * SF is active, and after that the eswitch manager set it to
+			 * inactive.
+			 * This case cannot be managed in SW, since the probing of the
+			 * SF is on one system, and the inactivation is on a different
+			 * system.
+			 * If the inactive is done after the SF perform init_hca(),
+			 * the SF will fully probe and then removed. If it was
+			 * done before init_hca(), the SF probe will fail.
+			 */
+			mutex_unlock(&table->table_lock);
+		}
+	}
+}
+
+/* In case SFs are generated externally, probe active SFs */
+int mlx5_sf_dev_queue_active_work(struct mlx5_sf_dev_table *table)
+{
+	if (!mlx5_sf_table_external(table->dev))
+		return 0;
+	/* Use a workqueue to probe active SFs, which are in large
+	 * quantity and may take up to minutes to probe.
+	 */
+	table->active_wq = create_singlethread_workqueue("mlx5_active_sf");
+	if (!table->active_wq)
+		return -ENOMEM;
+	INIT_WORK(&table->work, &mlx5_sf_dev_add_active_work);
+	mutex_init(&table->table_lock);
+	queue_work(table->active_wq, &table->work);
+	return 0;
+}
+
+void mlx5_sf_dev_destroy_active_work(struct mlx5_sf_dev_table *table)
+{
+	if (table->active_wq) {
+		table->stop_active_wq = true;
+		destroy_workqueue(table->active_wq);
+		mutex_destroy(&table->table_lock);
+	}
+}
+
 void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev)
 {
 	struct mlx5_sf_dev_table *table;
@@ -245,6 +323,11 @@ void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev)
 	err = mlx5_vhca_event_notifier_register(dev, &table->nb);
 	if (err)
 		goto vhca_err;
+
+	err = mlx5_sf_dev_queue_active_work(table);
+	if (err)
+		goto add_active_err;
+
 	err = mlx5_sf_dev_vhca_arm_all(table);
 	if (err)
 		goto arm_err;
@@ -252,6 +335,8 @@ void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev)
 	return;
 
 arm_err:
+	mlx5_sf_dev_destroy_active_work(table);
+add_active_err:
 	mlx5_vhca_event_notifier_unregister(dev, &table->nb);
 vhca_err:
 	table->max_sfs = 0;
@@ -279,6 +364,7 @@ void mlx5_sf_dev_table_destroy(struct mlx5_core_dev *dev)
 	if (!table)
 		return;
 
+	mlx5_sf_dev_destroy_active_work(table);
 	mlx5_vhca_event_notifier_unregister(dev, &table->nb);
 
 	/* Now that event handler is not running, it is safe to destroy
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h
index 3a480e06ecc0..be7aac4e47b9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h
@@ -18,6 +18,11 @@ void mlx5_sf_hw_table_destroy(struct mlx5_core_dev *dev);
 int mlx5_sf_table_init(struct mlx5_core_dev *dev);
 void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev);
 
+static inline bool mlx5_sf_table_external(const struct mlx5_core_dev *dev)
+{
+	return !MLX5_CAP_GEN(dev, eswitch_manager);
+}
+
 int mlx5_devlink_sf_port_new(struct devlink *devlink,
 			     const struct devlink_port_new_attrs *add_attr,
 			     struct netlink_ext_ack *extack,
@@ -60,6 +65,11 @@ static inline void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev)
 {
 }
 
+static inline bool mlx5_sf_table_external(const struct mlx5_core_dev *dev)
+{
+	return true;
+}
+
 #endif
 
 #endif
-- 
2.37.3


  parent reply	other threads:[~2022-10-19  6:39 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-10-19  6:37 [pull request][RESEND net 00/16] mlx5 fixes 2022-10-14 Saeed Mahameed
2022-10-19  6:37 ` [net 01/16] net/mlx5e: Do not increment ESN when updating IPsec ESN state Saeed Mahameed
2022-10-19  6:37 ` [net 02/16] net/mlx5: Wait for firmware to enable CRS before pci_restore_state Saeed Mahameed
2022-10-20 10:01   ` Paolo Abeni
2022-10-19  6:38 ` [net 03/16] net/mlx5: DR, Fix matcher disconnect error flow Saeed Mahameed
2022-10-19  6:38 ` [net 04/16] net/mlx5e: Extend SKB room check to include PTP-SQ Saeed Mahameed
2022-10-19  6:38 ` [net 05/16] net/mlx5e: Update restore chain id for slow path packets Saeed Mahameed
2022-10-19  6:38 ` [net 06/16] net/mlx5: ASO, Create the ASO SQ with the correct timestamp format Saeed Mahameed
2022-10-19  6:38 ` [net 07/16] net/mlx5: Fix possible use-after-free in async command interface Saeed Mahameed
2022-10-19  6:38 ` [net 08/16] net/mlx5e: TC, Reject forwarding from internal port to internal port Saeed Mahameed
2022-10-19  6:38 ` Saeed Mahameed [this message]
2022-10-19  6:38 ` [net 10/16] net/mlx5e: TC, Fix cloned flow attr instance dests are not zeroed Saeed Mahameed
2022-10-19  6:38 ` [net 11/16] net/mlx5: Update fw fatal reporter state on PCI handlers successful recover Saeed Mahameed
2022-10-19  6:38 ` [net 12/16] net/mlx5: Fix crash during sync firmware reset Saeed Mahameed
2022-10-19  6:38 ` [net 13/16] net/mlx5e: Fix macsec coverity issue at rx sa update Saeed Mahameed
2022-10-19  6:38 ` [net 14/16] net/mlx5e: Fix macsec rx security association (SA) update/delete Saeed Mahameed
2022-10-19  6:38 ` [net 15/16] net/mlx5e: Fix wrong bitwise comparison usage in macsec_fs_rx_add_rule function Saeed Mahameed
2022-10-19  6:38 ` [net 16/16] net/mlx5e: Fix macsec sci endianness at rx sa update Saeed Mahameed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20221019063813.802772-10-saeed@kernel.org \
    --to=saeed@kernel.org \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=kuba@kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=parav@nvidia.com \
    --cc=saeedm@nvidia.com \
    --cc=shayd@nvidia.com \
    --cc=tariqt@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).