From: Jiri Pirko <jiri@resnulli.us>
To: Saeed Mahameed <saeedm@mellanox.com>
Cc: "David S. Miller" <davem@davemloft.net>,
"netdev@vger.kernel.org" <netdev@vger.kernel.org>,
Jiri Pirko <jiri@mellanox.com>, Alex Vesker <valex@mellanox.com>,
Moshe Shemesh <moshe@mellanox.com>,
Feras Daoud <ferasda@mellanox.com>
Subject: Re: [net-next 03/15] net/mlx5: Add Crdump FW snapshot support
Date: Sun, 5 May 2019 17:36:05 +0200 [thread overview]
Message-ID: <20190505153605.GA31501@nanopsycho.orion> (raw)
In-Reply-To: <20190505003207.1353-4-saeedm@mellanox.com>
Sun, May 05, 2019 at 02:33:00AM CEST, saeedm@mellanox.com wrote:
>From: Alex Vesker <valex@mellanox.com>
>
>Crdump allows the driver to create a snapshot of the FW PCI crspace.
>This is useful in case of catastrophic issues which may require FW
>reset. The snapshot can be used for later debug.
>
>The snapshot is exposed using devlink region_snapshot in downstream patch,
>cr-space address regions are registered on init and snapshots are attached
>once a new snapshot is collected by the driver.
>
>Signed-off-by: Alex Vesker <valex@mellanox.com>
>Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
>Reviewed-by: Feras Daoud <ferasda@mellanox.com>
>Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
>---
> .../net/ethernet/mellanox/mlx5/core/Makefile | 2 +-
> .../ethernet/mellanox/mlx5/core/diag/crdump.c | 179 ++++++++++++++++++
> .../net/ethernet/mellanox/mlx5/core/health.c | 1 +
> .../ethernet/mellanox/mlx5/core/lib/mlx5.h | 4 +
> .../net/ethernet/mellanox/mlx5/core/main.c | 5 +
> include/linux/mlx5/driver.h | 4 +
> 6 files changed, 194 insertions(+), 1 deletion(-)
> create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c
>
>diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
>index 34d9a079b608..5feed9e1bec7 100644
>--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
>+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
>@@ -16,7 +16,7 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
> transobj.o vport.o sriov.o fs_cmd.o fs_core.o \
> fs_counters.o rl.o lag.o dev.o events.o wq.o lib/gid.o \
> lib/devcom.o lib/pci_vsc.o diag/fs_tracepoint.o \
>- diag/fw_tracer.o devlink.o
>+ diag/fw_tracer.o diag/crdump.o devlink.o
>
> #
> # Netdev basic
>diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c
>new file mode 100644
>index 000000000000..6430ceeefb53
>--- /dev/null
>+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c
>@@ -0,0 +1,179 @@
>+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
>+/* Copyright (c) 2019 Mellanox Technologies */
>+
>+#include <linux/proc_fs.h>
>+#include <linux/mlx5/driver.h>
>+#include <net/devlink.h>
>+#include "mlx5_core.h"
>+#include "lib/pci_vsc.h"
>+#include "lib/mlx5.h"
>+
>+#define BAD_ACCESS 0xBADACCE5
>+#define MLX5_PROTECTED_CR_SCAN_CRSPACE 0x7
>+#define MAX_NUM_OF_DUMPS_TO_STORE (8)
Please make your local defines prefixed with "mlx5_". For example
"BAD_ACCESS" sounds way too generic.
>+
>+static const char *region_cr_space_str = "cr-space";
>+
>+struct mlx5_fw_crdump {
>+ u32 size;
>+ struct devlink_region *region_crspace;
>+};
>+
>+static bool mlx5_crdump_enbaled(struct mlx5_core_dev *dev)
Typo: s/enbaled/enabled/
>+{
>+ struct mlx5_priv *priv = &dev->priv;
>+
>+ return (!!priv->health.crdump);
Poitlesss brackets. Please remove.
>+}
>+
>+static int mlx5_crdump_fill(struct mlx5_core_dev *dev,
>+ char *crdump_region, u32 *snapshot_id)
>+{
>+ struct devlink *devlink = priv_to_devlink(dev);
>+ struct mlx5_priv *priv = &dev->priv;
>+ struct mlx5_fw_crdump *crdump = priv->health.crdump;
>+ int i, ret = 0;
>+ u32 *cr_data;
>+ u32 id;
>+
>+ cr_data = kvmalloc(crdump->size, GFP_KERNEL);
>+ if (!cr_data)
>+ return -ENOMEM;
>+
>+ for (i = 0; i < (crdump->size / 4); i++)
>+ cr_data[i] = BAD_ACCESS;
>+
>+ ret = mlx5_vsc_gw_read_block_fast(dev, cr_data, crdump->size);
>+ if (ret <= 0) {
>+ if (ret == 0)
>+ ret = -EIO;
>+ goto free_data;
>+ }
>+
>+ if (crdump->size != ret) {
>+ mlx5_core_warn(dev, "failed to read full dump, read %d out of %u\n",
>+ ret, crdump->size);
>+ ret = -EINVAL;
>+ goto free_data;
>+ }
>+
>+ /* Get the available snapshot ID for the dumps */
>+ id = devlink_region_shapshot_id_get(devlink);
>+ ret = devlink_region_snapshot_create(crdump->region_crspace,
>+ crdump->size, (u8 *)cr_data,
>+ id, &kvfree);
>+ if (ret) {
>+ mlx5_core_warn(dev, "crdump: devlink create %s snapshot id %d err %d\n",
>+ region_cr_space_str, id, ret);
>+ goto free_data;
>+ } else {
>+ *snapshot_id = id;
>+ strcpy(crdump_region, region_cr_space_str);
>+ }
>+ return 0;
>+
>+free_data:
>+ kvfree(cr_data);
>+ return ret;
>+}
>+
>+int mlx5_crdump_collect(struct mlx5_core_dev *dev,
>+ char *crdump_region, u32 *snapshot_id)
>+{
>+ int ret = 0;
>+
>+ if (!mlx5_crdump_enbaled(dev))
>+ return -ENODEV;
>+
>+ ret = mlx5_vsc_gw_lock(dev);
>+ if (ret) {
>+ mlx5_core_warn(dev, "crdump: failed to lock vsc gw err %d\n",
>+ ret);
>+ return ret;
>+ }
>+
>+ ret = mlx5_vsc_gw_set_space(dev, MLX5_VSC_SPACE_SCAN_CRSPACE, NULL);
>+ if (ret)
>+ goto unlock;
>+
>+ ret = mlx5_crdump_fill(dev, crdump_region, snapshot_id);
>+
>+unlock:
>+ mlx5_vsc_gw_unlock(dev);
>+ return ret;
>+}
>+
>+int mlx5_crdump_init(struct mlx5_core_dev *dev)
>+{
>+ struct devlink *devlink = priv_to_devlink(dev);
>+ struct mlx5_priv *priv = &dev->priv;
>+ struct mlx5_fw_crdump *crdump;
>+ u32 space_size;
>+ int ret;
>+
>+ if (!mlx5_core_is_pf(dev) || !mlx5_vsc_accessible(dev) ||
>+ mlx5_crdump_enbaled(dev))
>+ return 0;
>+
>+ ret = mlx5_vsc_gw_lock(dev);
>+ if (ret)
>+ return ret;
>+
>+ /* Check if space is supported and get space size */
>+ ret = mlx5_vsc_gw_set_space(dev, MLX5_VSC_SPACE_SCAN_CRSPACE,
>+ &space_size);
>+ if (ret) {
>+ /* Unlock and mask error since space is not supported */
>+ mlx5_vsc_gw_unlock(dev);
>+ return 0;
>+ }
>+
>+ if (!space_size) {
>+ mlx5_core_warn(dev, "Invalid Crspace size, zero\n");
>+ mlx5_vsc_gw_unlock(dev);
>+ return -EINVAL;
>+ }
>+
>+ ret = mlx5_vsc_gw_unlock(dev);
>+ if (ret)
>+ return ret;
>+
>+ crdump = kzalloc(sizeof(*crdump), GFP_KERNEL);
>+ if (!crdump)
>+ return -ENOMEM;
>+
>+ /* Create cr-space region */
>+ crdump->size = space_size;
>+ crdump->region_crspace =
>+ devlink_region_create(devlink,
>+ region_cr_space_str,
>+ MAX_NUM_OF_DUMPS_TO_STORE,
>+ space_size);
Unnecessary wraps.
>+ if (IS_ERR(crdump->region_crspace)) {
>+ mlx5_core_warn(dev,
Unnecessary wrap.
>+ "crdump: create devlink region %s err %ld\n",
>+ region_cr_space_str,
>+ PTR_ERR(crdump->region_crspace));
>+ ret = PTR_ERR(crdump->region_crspace);
>+ goto free_crdump;
>+ }
>+ priv->health.crdump = crdump;
>+ return 0;
>+
>+free_crdump:
>+ kfree(crdump);
>+ return ret;
>+}
>+
>+void mlx5_crdump_cleanup(struct mlx5_core_dev *dev)
>+{
>+ struct mlx5_priv *priv = &dev->priv;
>+ struct mlx5_fw_crdump *crdump = priv->health.crdump;
>+
>+ if (!crdump)
>+ return;
>+
>+ devlink_region_destroy(crdump->region_crspace);
>+ kfree(crdump);
>+ priv->health.crdump = NULL;
Why do you need to have this NULL. Where do you check for NULL?
>+}
>diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
>index a2656f4008d9..90f3da6da7f9 100644
>--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
>+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
>@@ -388,6 +388,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
> spin_lock_init(&health->wq_lock);
> INIT_WORK(&health->work, health_care);
> INIT_DELAYED_WORK(&health->recover_work, health_recover);
>+ health->crdump = NULL;
Same here.
>
> return 0;
> }
>diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
>index 397a2847867a..3c9a6dedccaa 100644
>--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
>+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
>@@ -41,6 +41,10 @@ int mlx5_core_reserve_gids(struct mlx5_core_dev *dev, unsigned int count);
> void mlx5_core_unreserve_gids(struct mlx5_core_dev *dev, unsigned int count);
> int mlx5_core_reserved_gid_alloc(struct mlx5_core_dev *dev, int *gid_index);
> void mlx5_core_reserved_gid_free(struct mlx5_core_dev *dev, int gid_index);
>+int mlx5_crdump_init(struct mlx5_core_dev *dev);
>+void mlx5_crdump_cleanup(struct mlx5_core_dev *dev);
>+int mlx5_crdump_collect(struct mlx5_core_dev *dev,
>+ char *crdump_region, u32 *snapshot_id);
>
> /* TODO move to lib/events.h */
>
>diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
>index 64eb2a558b30..43f5487de4c3 100644
>--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
>+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
>@@ -1320,6 +1320,10 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *id)
> if (err)
> goto clean_load;
>
>+ err = mlx5_crdump_init(dev);
>+ if (err)
>+ dev_err(&pdev->dev, "mlx5_crdump_init failed with error code %d\n", err);
>+
> pci_save_state(pdev);
> return 0;
>
>@@ -1341,6 +1345,7 @@ static void remove_one(struct pci_dev *pdev)
> struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
> struct devlink *devlink = priv_to_devlink(dev);
>
>+ mlx5_crdump_cleanup(dev);
> mlx5_devlink_unregister(devlink);
> mlx5_unregister_device(dev);
>
>diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
>index 56d0a116f575..ddf6f41a75d3 100644
>--- a/include/linux/mlx5/driver.h
>+++ b/include/linux/mlx5/driver.h
>@@ -53,6 +53,7 @@
> #include <linux/mlx5/eq.h>
> #include <linux/timecounter.h>
> #include <linux/ptp_clock_kernel.h>
>+#include <net/devlink.h>
>
> enum {
> MLX5_BOARD_ID_LEN = 64,
>@@ -427,6 +428,8 @@ struct mlx5_sq_bfreg {
> unsigned int offset;
> };
>
>+struct mlx5_fw_crdump;
>+
> struct mlx5_core_health {
> struct health_buffer __iomem *health;
> __be32 __iomem *health_counter;
>@@ -440,6 +443,7 @@ struct mlx5_core_health {
> unsigned long flags;
> struct work_struct work;
> struct delayed_work recover_work;
>+ struct mlx5_fw_crdump *crdump;
> };
>
> struct mlx5_qp_table {
>--
>2.20.1
>
next prev parent reply other threads:[~2019-05-05 15:36 UTC|newest]
Thread overview: 34+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-05-05 0:32 [pull request][net-next 00/15] Mellanox, mlx5 Firmware devlink health and sw reset Saeed Mahameed
2019-05-05 0:32 ` [net-next 01/15] net/mlx5: Move all devlink related functions calls to devlink.c Saeed Mahameed
2019-05-05 0:32 ` [net-next 02/15] net/mlx5: Add Vendor Specific Capability access gateway Saeed Mahameed
2019-05-05 0:33 ` [net-next 03/15] net/mlx5: Add Crdump FW snapshot support Saeed Mahameed
2019-05-05 15:36 ` Jiri Pirko [this message]
2019-05-05 0:33 ` [net-next 04/15] net/mlx5: Add support for devlink region_snapshot parameter Saeed Mahameed
2019-05-05 0:33 ` [net-next 05/15] net/mlx5: Handle SW reset of FW in error flow Saeed Mahameed
2019-05-05 0:33 ` [net-next 06/15] net/mlx5: Control CR-space access by different PFs Saeed Mahameed
2019-05-05 0:33 ` [net-next 07/15] net/mlx5: Issue SW reset on FW assert Saeed Mahameed
2019-05-05 15:38 ` Jiri Pirko
2019-05-06 10:44 ` Moshe Shemesh
2019-05-05 0:33 ` [net-next 08/15] net/mlx5: Refactor print health info Saeed Mahameed
2019-05-05 15:42 ` Jiri Pirko
2019-05-05 0:33 ` [net-next 09/15] net/mlx5: Create FW devlink health reporter Saeed Mahameed
2019-05-05 15:42 ` Jiri Pirko
2019-05-06 10:45 ` Moshe Shemesh
2019-05-06 11:38 ` Jiri Pirko
2019-05-06 19:52 ` Saeed Mahameed
2019-05-06 21:46 ` Alexei Starovoitov
2019-05-07 5:59 ` Jiri Pirko
2019-05-07 6:01 ` Jiri Pirko
2019-05-07 0:11 ` Jakub Kicinski
2019-05-05 0:33 ` [net-next 10/15] net/mlx5: Add core dump register access functions Saeed Mahameed
2019-05-05 0:33 ` [net-next 11/15] net/mlx5: Add support for FW reporter dump Saeed Mahameed
2019-05-05 15:49 ` Jiri Pirko
2019-05-06 10:51 ` Moshe Shemesh
2019-05-06 11:37 ` Jiri Pirko
2019-05-05 0:33 ` [net-next 12/15] net/mlx5: Report devlink health on FW issues Saeed Mahameed
2019-05-05 0:33 ` [net-next 13/15] net/mlx5: Add fw fatal devlink health reporter Saeed Mahameed
2019-05-05 0:33 ` [net-next 14/15] net/mlx5: Add support for FW fatal reporter dump Saeed Mahameed
2019-05-05 15:52 ` Jiri Pirko
2019-05-06 10:54 ` Moshe Shemesh
2019-05-06 11:42 ` Jiri Pirko
2019-05-05 0:33 ` [net-next 15/15] net/mlx5: Report devlink health on FW fatal issues Saeed Mahameed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20190505153605.GA31501@nanopsycho.orion \
--to=jiri@resnulli.us \
--cc=davem@davemloft.net \
--cc=ferasda@mellanox.com \
--cc=jiri@mellanox.com \
--cc=moshe@mellanox.com \
--cc=netdev@vger.kernel.org \
--cc=saeedm@mellanox.com \
--cc=valex@mellanox.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).