From: Saeed Mahameed <saeed@kernel.org>
To: "David S. Miller" <davem@davemloft.net>,
Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>
Cc: netdev@vger.kernel.org, Shay Drory <shayd@nvidia.com>,
Moshe Shemesh <moshe@nvidia.com>,
Saeed Mahameed <saeedm@nvidia.com>
Subject: [net 11/11] net/mlx5: Drain fw_reset when removing device
Date: Tue, 17 May 2022 23:34:27 -0700 [thread overview]
Message-ID: <20220518063427.123758-12-saeed@kernel.org> (raw)
In-Reply-To: <20220518063427.123758-1-saeed@kernel.org>
From: Shay Drory <shayd@nvidia.com>
In case fw sync reset is called in parallel to device removal, device
might stuck in the following deadlock:
CPU 0 CPU 1
----- -----
remove_one
uninit_one (locks intf_state_mutex)
mlx5_sync_reset_now_event()
work in fw_reset->wq.
mlx5_enter_error_state()
mutex_lock (intf_state_mutex)
cleanup_once
fw_reset_cleanup()
destroy_workqueue(fw_reset->wq)
Drain the fw_reset WQ, and make sure no new work is being queued, before
entering uninit_one().
The Drain is done before devlink_unregister() since fw_reset, in some
flows, is using devlink API devlink_remote_reload_actions_performed().
Fixes: 38b9f903f22b ("net/mlx5: Handle sync reset request event")
Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
.../ethernet/mellanox/mlx5/core/fw_reset.c | 25 ++++++++++++++++---
.../ethernet/mellanox/mlx5/core/fw_reset.h | 1 +
.../net/ethernet/mellanox/mlx5/core/main.c | 4 +++
3 files changed, 27 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
index ca1aba845dd6..81eb67fb95b0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
@@ -8,7 +8,8 @@
enum {
MLX5_FW_RESET_FLAGS_RESET_REQUESTED,
MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST,
- MLX5_FW_RESET_FLAGS_PENDING_COMP
+ MLX5_FW_RESET_FLAGS_PENDING_COMP,
+ MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS
};
struct mlx5_fw_reset {
@@ -208,7 +209,10 @@ static void poll_sync_reset(struct timer_list *t)
if (fatal_error) {
mlx5_core_warn(dev, "Got Device Reset\n");
- queue_work(fw_reset->wq, &fw_reset->reset_reload_work);
+ if (!test_bit(MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, &fw_reset->reset_flags))
+ queue_work(fw_reset->wq, &fw_reset->reset_reload_work);
+ else
+ mlx5_core_err(dev, "Device is being removed, Drop new reset work\n");
return;
}
@@ -433,9 +437,12 @@ static int fw_reset_event_notifier(struct notifier_block *nb, unsigned long acti
struct mlx5_fw_reset *fw_reset = mlx5_nb_cof(nb, struct mlx5_fw_reset, nb);
struct mlx5_eqe *eqe = data;
+ if (test_bit(MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, &fw_reset->reset_flags))
+ return NOTIFY_DONE;
+
switch (eqe->sub_type) {
case MLX5_GENERAL_SUBTYPE_FW_LIVE_PATCH_EVENT:
- queue_work(fw_reset->wq, &fw_reset->fw_live_patch_work);
+ queue_work(fw_reset->wq, &fw_reset->fw_live_patch_work);
break;
case MLX5_GENERAL_SUBTYPE_PCI_SYNC_FOR_FW_UPDATE_EVENT:
mlx5_sync_reset_events_handle(fw_reset, eqe);
@@ -479,6 +486,18 @@ void mlx5_fw_reset_events_stop(struct mlx5_core_dev *dev)
mlx5_eq_notifier_unregister(dev, &dev->priv.fw_reset->nb);
}
+void mlx5_drain_fw_reset(struct mlx5_core_dev *dev)
+{
+ struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
+
+ set_bit(MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, &fw_reset->reset_flags);
+ cancel_work_sync(&fw_reset->fw_live_patch_work);
+ cancel_work_sync(&fw_reset->reset_request_work);
+ cancel_work_sync(&fw_reset->reset_reload_work);
+ cancel_work_sync(&fw_reset->reset_now_work);
+ cancel_work_sync(&fw_reset->reset_abort_work);
+}
+
int mlx5_fw_reset_init(struct mlx5_core_dev *dev)
{
struct mlx5_fw_reset *fw_reset = kzalloc(sizeof(*fw_reset), GFP_KERNEL);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h
index 694fc7cb2684..dc141c7e641a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h
@@ -16,6 +16,7 @@ int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev);
int mlx5_fw_reset_wait_reset_done(struct mlx5_core_dev *dev);
void mlx5_fw_reset_events_start(struct mlx5_core_dev *dev);
void mlx5_fw_reset_events_stop(struct mlx5_core_dev *dev);
+void mlx5_drain_fw_reset(struct mlx5_core_dev *dev);
int mlx5_fw_reset_init(struct mlx5_core_dev *dev);
void mlx5_fw_reset_cleanup(struct mlx5_core_dev *dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index c6737720bf3a..ef196cb764e2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1627,6 +1627,10 @@ static void remove_one(struct pci_dev *pdev)
struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
struct devlink *devlink = priv_to_devlink(dev);
+ /* mlx5_drain_fw_reset() is using devlink APIs. Hence, we must drain
+ * fw_reset before unregistering the devlink.
+ */
+ mlx5_drain_fw_reset(dev);
devlink_unregister(devlink);
mlx5_sriov_disable(pdev);
mlx5_crdump_disable(dev);
--
2.36.1
prev parent reply other threads:[~2022-05-18 6:35 UTC|newest]
Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-05-18 6:34 [pull request][net 00/11] mlx5 fixes 2022-05-17 Saeed Mahameed
2022-05-18 6:34 ` [net 01/11] net/mlx5: DR, Fix missing flow_source when creating multi-destination FW table Saeed Mahameed
2022-05-18 11:10 ` patchwork-bot+netdevbpf
2022-05-18 6:34 ` [net 02/11] net/mlx5: Initialize flow steering during driver probe Saeed Mahameed
2022-05-18 6:34 ` [net 03/11] net/mlx5: DR, Ignore modify TTL on RX if device doesn't support it Saeed Mahameed
2022-05-18 6:34 ` [net 04/11] net/mlx5e: Wrap mlx5e_trap_napi_poll into rcu_read_lock Saeed Mahameed
2022-05-18 6:34 ` [net 05/11] net/mlx5e: Block rx-gro-hw feature in switchdev mode Saeed Mahameed
2022-05-18 6:34 ` [net 06/11] net/mlx5e: Properly block LRO when XDP is enabled Saeed Mahameed
2022-05-18 6:34 ` [net 07/11] net/mlx5e: Properly block HW GRO " Saeed Mahameed
2022-05-18 6:34 ` [net 08/11] net/mlx5e: Remove HW-GRO from reported features Saeed Mahameed
2022-05-18 6:34 ` [net 09/11] net/mlx5e: CT: Fix support for GRE tuples Saeed Mahameed
2022-05-18 6:34 ` [net 10/11] net/mlx5e: CT: Fix setting flow_source for smfs ct tuples Saeed Mahameed
2022-05-18 6:34 ` Saeed Mahameed [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20220518063427.123758-12-saeed@kernel.org \
--to=saeed@kernel.org \
--cc=davem@davemloft.net \
--cc=kuba@kernel.org \
--cc=moshe@nvidia.com \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=saeedm@nvidia.com \
--cc=shayd@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).