netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Saeed Mahameed <saeedm@mellanox.com>
To: "David S. Miller" <davem@davemloft.net>
Cc: "netdev@vger.kernel.org" <netdev@vger.kernel.org>,
	Jiri Pirko <jiri@mellanox.com>,
	Moshe Shemesh <moshe@mellanox.com>,
	Eran Ben Elisha <eranbe@mellanox.com>,
	Saeed Mahameed <saeedm@mellanox.com>
Subject: [net-next 13/15] net/mlx5: Add fw fatal devlink health reporter
Date: Sun, 5 May 2019 00:33:31 +0000	[thread overview]
Message-ID: <20190505003207.1353-14-saeedm@mellanox.com> (raw)
In-Reply-To: <20190505003207.1353-1-saeedm@mellanox.com>

From: Moshe Shemesh <moshe@mellanox.com>

Create mlx5_devlink_health_reporter for fw fatal reporter.
The fw fatal reporter is added in addition to the fw reporter and
implements the recover callback.
The point of having two reporters for FW issues, is that we
don't want to run FW recover on any issue, but only fatal ones.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/health.c  | 70 ++++++++++++++-----
 include/linux/mlx5/driver.h                   |  1 +
 2 files changed, 54 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 03b9fc9ebd6e..e64f0e32cd67 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -301,31 +301,43 @@ static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
 
 /* How much time to wait until health resetting the driver (in msecs) */
 #define MLX5_RECOVERY_WAIT_MSECS 60000
-static void health_care(struct work_struct *work)
+static int mlx5_health_care(struct mlx5_core_dev *dev)
 {
-	struct mlx5_core_health *health;
-	struct mlx5_core_dev *dev;
-	struct mlx5_priv *priv;
 	unsigned long end;
 
-	health = container_of(work, struct mlx5_core_health, work);
-	priv = container_of(health, struct mlx5_priv, health);
-	dev = container_of(priv, struct mlx5_core_dev, priv);
 	mlx5_core_warn(dev, "handling bad device here\n");
 	mlx5_handle_bad_state(dev);
-
 	end = jiffies + msecs_to_jiffies(MLX5_RECOVERY_WAIT_MSECS);
 	while (sensor_pci_not_working(dev)) {
 		if (time_after(jiffies, end)) {
 			mlx5_core_err(dev,
 				      "health recovery flow aborted, PCI reads still not working\n");
-			return;
+			return -EIO;
 		}
 		msleep(100);
 	}
 
 	mlx5_core_err(dev, "starting health recovery flow\n");
 	mlx5_recover_device(dev);
+	if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state) ||
+	    check_fatal_sensors(dev)) {
+		mlx5_core_err(dev, "health recovery failed\n");
+		return -EIO;
+	}
+	return 0;
+}
+
+static void health_care_work(struct work_struct *work)
+{
+	struct mlx5_core_health *health;
+	struct mlx5_core_dev *dev;
+	struct mlx5_priv *priv;
+
+	health = container_of(work, struct mlx5_core_health, work);
+	priv = container_of(health, struct mlx5_priv, health);
+	dev = container_of(priv, struct mlx5_core_dev, priv);
+
+	mlx5_health_care(dev);
 }
 
 static const char *hsynd_str(u8 synd)
@@ -526,7 +538,22 @@ static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = {
 		.dump = mlx5_fw_reporter_dump,
 };
 
-static void mlx5_fw_reporter_create(struct mlx5_core_dev *dev)
+static int
+mlx5_fw_fatal_reporter_recover(struct devlink_health_reporter *reporter,
+			       void *priv_ctx)
+{
+	struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter);
+
+	return mlx5_health_care(dev);
+}
+
+static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = {
+		.name = "fw_fatal",
+		.recover = mlx5_fw_fatal_reporter_recover,
+};
+
+#define MLX5_REPORTER_FW_GRACEFUL_PERIOD 1200000
+static void mlx5_fw_reporters_create(struct mlx5_core_dev *dev)
 {
 	struct mlx5_core_health *health = &dev->priv.health;
 	struct devlink *devlink = priv_to_devlink(dev);
@@ -537,16 +564,25 @@ static void mlx5_fw_reporter_create(struct mlx5_core_dev *dev)
 	if (IS_ERR(health->fw_reporter))
 		mlx5_core_warn(dev, "Failed to create fw reporter, err = %ld\n",
 			       PTR_ERR(health->fw_reporter));
+
+	health->fw_fatal_reporter =
+		devlink_health_reporter_create(devlink, &mlx5_fw_fatal_reporter_ops,
+					       MLX5_REPORTER_FW_GRACEFUL_PERIOD,
+					       true, dev);
+	if (IS_ERR(health->fw_fatal_reporter))
+		mlx5_core_warn(dev, "Failed to create fw fatal reporter, err = %ld\n",
+			       PTR_ERR(health->fw_fatal_reporter));
 }
 
-static void mlx5_fw_reporter_destroy(struct mlx5_core_dev *dev)
+static void mlx5_fw_reporters_destroy(struct mlx5_core_dev *dev)
 {
 	struct mlx5_core_health *health = &dev->priv.health;
 
-	if (IS_ERR_OR_NULL(health->fw_reporter))
-		return;
+	if (!IS_ERR_OR_NULL(health->fw_reporter))
+		devlink_health_reporter_destroy(health->fw_reporter);
 
-	devlink_health_reporter_destroy(health->fw_reporter);
+	if (!IS_ERR_OR_NULL(health->fw_fatal_reporter))
+		devlink_health_reporter_destroy(health->fw_fatal_reporter);
 }
 
 static unsigned long get_next_poll_jiffies(void)
@@ -669,7 +705,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev)
 
 	kfree(health->info_buf);
 	destroy_workqueue(health->wq);
-	mlx5_fw_reporter_destroy(dev);
+	mlx5_fw_reporters_destroy(dev);
 }
 
 int mlx5_health_init(struct mlx5_core_dev *dev)
@@ -689,7 +725,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
 	if (!health->wq)
 		return -ENOMEM;
 	spin_lock_init(&health->wq_lock);
-	INIT_WORK(&health->work, health_care);
+	INIT_WORK(&health->work, health_care_work);
 	INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work);
 	health->crdump = NULL;
 	health->info_buf = kmalloc(HEALTH_INFO_MAX_BUFF, GFP_KERNEL);
@@ -697,7 +733,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
 		goto err_alloc_buff;
 	mutex_init(&health->info_buf_lock);
 
-	mlx5_fw_reporter_create(dev);
+	mlx5_fw_reporters_create(dev);
 
 	return 0;
 
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 604079b4706c..6f65787bf91b 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -452,6 +452,7 @@ struct mlx5_core_health {
 	/* protect info buf access */
 	struct mutex			info_buf_lock;
 	struct devlink_health_reporter *fw_reporter;
+	struct devlink_health_reporter *fw_fatal_reporter;
 };
 
 struct mlx5_qp_table {
-- 
2.20.1


  parent reply	other threads:[~2019-05-05  0:34 UTC|newest]

Thread overview: 34+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-05-05  0:32 [pull request][net-next 00/15] Mellanox, mlx5 Firmware devlink health and sw reset Saeed Mahameed
2019-05-05  0:32 ` [net-next 01/15] net/mlx5: Move all devlink related functions calls to devlink.c Saeed Mahameed
2019-05-05  0:32 ` [net-next 02/15] net/mlx5: Add Vendor Specific Capability access gateway Saeed Mahameed
2019-05-05  0:33 ` [net-next 03/15] net/mlx5: Add Crdump FW snapshot support Saeed Mahameed
2019-05-05 15:36   ` Jiri Pirko
2019-05-05  0:33 ` [net-next 04/15] net/mlx5: Add support for devlink region_snapshot parameter Saeed Mahameed
2019-05-05  0:33 ` [net-next 05/15] net/mlx5: Handle SW reset of FW in error flow Saeed Mahameed
2019-05-05  0:33 ` [net-next 06/15] net/mlx5: Control CR-space access by different PFs Saeed Mahameed
2019-05-05  0:33 ` [net-next 07/15] net/mlx5: Issue SW reset on FW assert Saeed Mahameed
2019-05-05 15:38   ` Jiri Pirko
2019-05-06 10:44     ` Moshe Shemesh
2019-05-05  0:33 ` [net-next 08/15] net/mlx5: Refactor print health info Saeed Mahameed
2019-05-05 15:42   ` Jiri Pirko
2019-05-05  0:33 ` [net-next 09/15] net/mlx5: Create FW devlink health reporter Saeed Mahameed
2019-05-05 15:42   ` Jiri Pirko
2019-05-06 10:45     ` Moshe Shemesh
2019-05-06 11:38       ` Jiri Pirko
2019-05-06 19:52         ` Saeed Mahameed
2019-05-06 21:46           ` Alexei Starovoitov
2019-05-07  5:59             ` Jiri Pirko
2019-05-07  6:01           ` Jiri Pirko
2019-05-07  0:11   ` Jakub Kicinski
2019-05-05  0:33 ` [net-next 10/15] net/mlx5: Add core dump register access functions Saeed Mahameed
2019-05-05  0:33 ` [net-next 11/15] net/mlx5: Add support for FW reporter dump Saeed Mahameed
2019-05-05 15:49   ` Jiri Pirko
2019-05-06 10:51     ` Moshe Shemesh
2019-05-06 11:37       ` Jiri Pirko
2019-05-05  0:33 ` [net-next 12/15] net/mlx5: Report devlink health on FW issues Saeed Mahameed
2019-05-05  0:33 ` Saeed Mahameed [this message]
2019-05-05  0:33 ` [net-next 14/15] net/mlx5: Add support for FW fatal reporter dump Saeed Mahameed
2019-05-05 15:52   ` Jiri Pirko
2019-05-06 10:54     ` Moshe Shemesh
2019-05-06 11:42       ` Jiri Pirko
2019-05-05  0:33 ` [net-next 15/15] net/mlx5: Report devlink health on FW fatal issues Saeed Mahameed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190505003207.1353-14-saeedm@mellanox.com \
    --to=saeedm@mellanox.com \
    --cc=davem@davemloft.net \
    --cc=eranbe@mellanox.com \
    --cc=jiri@mellanox.com \
    --cc=moshe@mellanox.com \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).