From: Saeed Mahameed <saeed@kernel.org>
To: "David S. Miller" <davem@davemloft.net>,
Jakub Kicinski <kuba@kernel.org>
Cc: netdev@vger.kernel.org, Saeed Mahameed <saeedm@nvidia.com>,
Moshe Shemesh <moshe@nvidia.com>
Subject: [net-next v0 06/14] net/mlx5: Print more info on pci error handlers
Date: Thu, 2 Dec 2021 16:56:14 -0800 [thread overview]
Message-ID: <20211203005622.183325-7-saeed@kernel.org> (raw)
In-Reply-To: <20211203005622.183325-1-saeed@kernel.org>
From: Saeed Mahameed <saeedm@nvidia.com>
If mlx5_pci_err_detected() is called with state equal to
pci_channel_io_perm_failure, the driver will never come back up.
It is nice to know why the driver went to zombie land, so print some
useful information in the PCI error handlers.
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
---
.../net/ethernet/mellanox/mlx5/core/main.c | 51 ++++++++++++++-----
1 file changed, 37 insertions(+), 14 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 7df9c7f8d9c8..d97c9e86d7b3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1604,12 +1604,28 @@ static void remove_one(struct pci_dev *pdev)
mlx5_devlink_free(devlink);
}
+#define mlx5_pci_trace(dev, fmt, ...) ({ \
+ struct mlx5_core_dev *__dev = (dev); \
+ mlx5_core_info(__dev, "%s Device state = %d health sensors: %d pci_status: %d. " fmt, \
+ __func__, __dev->state, mlx5_health_check_fatal_sensors(__dev), \
+ __dev->pci_status, ##__VA_ARGS__); \
+})
+
+static const char *result2str(enum pci_ers_result result)
+{
+ return result == PCI_ERS_RESULT_NEED_RESET ? "need reset" :
+ result == PCI_ERS_RESULT_DISCONNECT ? "disconnect" :
+ result == PCI_ERS_RESULT_RECOVERED ? "recovered" :
+ "unknown";
+}
+
static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
pci_channel_state_t state)
{
struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
+ enum pci_ers_result res;
- mlx5_core_info(dev, "%s was called\n", __func__);
+ mlx5_pci_trace(dev, "Enter, pci channel state = %d\n", state);
mlx5_enter_error_state(dev, false);
mlx5_error_sw_reset(dev);
@@ -1617,8 +1633,11 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
mlx5_drain_health_wq(dev);
mlx5_pci_disable_device(dev);
- return state == pci_channel_io_perm_failure ?
+ res = state == pci_channel_io_perm_failure ?
PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
+
+ mlx5_pci_trace(dev, "Exit, result = %d, %s\n", res, result2str(res));
+ return res;
}
/* wait for the device to show vital signs by waiting
@@ -1652,28 +1671,34 @@ static int wait_vital(struct pci_dev *pdev)
static pci_ers_result_t mlx5_pci_slot_reset(struct pci_dev *pdev)
{
+ enum pci_ers_result res = PCI_ERS_RESULT_DISCONNECT;
struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
int err;
- mlx5_core_info(dev, "%s was called\n", __func__);
+ mlx5_pci_trace(dev, "Enter\n");
err = mlx5_pci_enable_device(dev);
if (err) {
mlx5_core_err(dev, "%s: mlx5_pci_enable_device failed with error code: %d\n",
__func__, err);
- return PCI_ERS_RESULT_DISCONNECT;
+ goto out;
}
pci_set_master(pdev);
pci_restore_state(pdev);
pci_save_state(pdev);
- if (wait_vital(pdev)) {
- mlx5_core_err(dev, "%s: wait_vital timed out\n", __func__);
- return PCI_ERS_RESULT_DISCONNECT;
+ err = wait_vital(pdev);
+ if (err) {
+ mlx5_core_err(dev, "%s: wait vital failed with error code: %d\n",
+ __func__, err);
+ goto out;
}
- return PCI_ERS_RESULT_RECOVERED;
+ res = PCI_ERS_RESULT_RECOVERED;
+out:
+ mlx5_pci_trace(dev, "Exit, err = %d, result = %d, %s\n", err, res, result2str(res));
+ return res;
}
static void mlx5_pci_resume(struct pci_dev *pdev)
@@ -1681,14 +1706,12 @@ static void mlx5_pci_resume(struct pci_dev *pdev)
struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
int err;
- mlx5_core_info(dev, "%s was called\n", __func__);
+ mlx5_pci_trace(dev, "Enter, loading driver..\n");
err = mlx5_load_one(dev);
- if (err)
- mlx5_core_err(dev, "%s: mlx5_load_one failed with error code: %d\n",
- __func__, err);
- else
- mlx5_core_info(dev, "%s: device recovered\n", __func__);
+
+ mlx5_pci_trace(dev, "Done, err = %d, device %s\n", err,
+ !err ? "recovered" : "Failed");
}
static const struct pci_error_handlers mlx5_err_handler = {
--
2.31.1
next prev parent reply other threads:[~2021-12-03 0:56 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-12-03 0:56 [pull request][net-next v0 00/14] mlx5 updates 2021-12-02 Saeed Mahameed
2021-12-03 0:56 ` [net-next v0 01/14] mlx5: fix psample_sample_packet link error Saeed Mahameed
2021-12-03 3:20 ` patchwork-bot+netdevbpf
2021-12-03 0:56 ` [net-next v0 02/14] mlx5: fix mlx5i_grp_sw_update_stats() stack usage Saeed Mahameed
2021-12-03 0:56 ` [net-next v0 03/14] net/mlx5: Fix error return code in esw_qos_create() Saeed Mahameed
2021-12-03 0:56 ` [net-next v0 04/14] net/mlx5: Fix some error handling paths in 'mlx5e_tc_add_fdb_flow()' Saeed Mahameed
2021-12-03 0:56 ` [net-next v0 05/14] net/mlx5: SF, silence an uninitialized variable warning Saeed Mahameed
2021-12-03 0:56 ` Saeed Mahameed [this message]
2021-12-03 0:56 ` [net-next v0 07/14] net/mlx5e: SHAMPO, clean MLX5E_MAX_KLM_PER_WQE macro Saeed Mahameed
2021-12-03 0:56 ` [net-next v0 08/14] net/mlx5e: Hide function mlx5e_num_channels_changed Saeed Mahameed
2021-12-03 0:56 ` [net-next v0 09/14] net/mlx5e: TC, Remove redundant action stack var Saeed Mahameed
2021-12-03 0:56 ` [net-next v0 10/14] net/mlx5e: Remove redundant actions arg from validate_goto_chain() Saeed Mahameed
2021-12-03 0:56 ` [net-next v0 11/14] net/mlx5e: Remove redundant actions arg from vlan push/pop funcs Saeed Mahameed
2021-12-03 0:56 ` [net-next v0 12/14] net/mlx5e: TC, Move common flow_action checks into function Saeed Mahameed
2021-12-03 0:56 ` [net-next v0 13/14] net/mlx5e: TC, Set flow attr ip_version earlier Saeed Mahameed
2021-12-03 0:56 ` [net-next v0 14/14] net/mlx5: Dynamically resize flow counters query buffer Saeed Mahameed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20211203005622.183325-7-saeed@kernel.org \
--to=saeed@kernel.org \
--cc=davem@davemloft.net \
--cc=kuba@kernel.org \
--cc=moshe@nvidia.com \
--cc=netdev@vger.kernel.org \
--cc=saeedm@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).