From: Michael Chan <michael.chan@broadcom.com>
To: davem@davemloft.net
Cc: netdev@vger.kernel.org, kuba@kernel.org, edwin.peer@broadcom.com,
gospo@broadcom.com, jiri@nvidia.com
Subject: [PATCH net-next v2 09/19] bnxt_en: improve fw diagnose devlink health messages
Date: Fri, 29 Oct 2021 03:47:46 -0400 [thread overview]
Message-ID: <1635493676-10767-10-git-send-email-michael.chan@broadcom.com> (raw)
In-Reply-To: <1635493676-10767-1-git-send-email-michael.chan@broadcom.com>
[-- Attachment #1: Type: text/plain, Size: 10818 bytes --]
From: Edwin Peer <edwin.peer@broadcom.com>
Add firmware event counters as well as health state severity. In
the unhealthy state, recommend a remedy and inform the user as to
its impact.
Readability of the devlink tool's output is negatively impacted by
adding these fields to the diagnosis. The single line of text, as
rendered by devlink health diagnose, benefits from more terse
descriptions, which can be substituted without loss of clarity, even
in pretty printed JSON mode.
Signed-off-by: Edwin Peer <edwin.peer@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 19 ++-
drivers/net/ethernet/broadcom/bnxt/bnxt.h | 25 ++++
.../net/ethernet/broadcom/bnxt/bnxt_devlink.c | 131 ++++++++++++++----
3 files changed, 148 insertions(+), 27 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 1251d78ffd46..b4d9374548f8 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -2138,10 +2138,12 @@ static int bnxt_async_event_process(struct bnxt *bp,
set_bit(BNXT_STATE_FW_ACTIVATE_RESET, &bp->state);
} else if (EVENT_DATA1_RESET_NOTIFY_FATAL(data1)) {
type_str = "Fatal";
+ bp->fw_health->fatalities++;
set_bit(BNXT_STATE_FW_FATAL_COND, &bp->state);
} else if (data2 && BNXT_FW_STATUS_HEALTHY !=
EVENT_DATA2_RESET_NOTIFY_FW_STATUS_CODE(data2)) {
type_str = "Non-fatal";
+ bp->fw_health->survivals++;
set_bit(BNXT_STATE_FW_NON_FATAL_COND, &bp->state);
}
netif_warn(bp, hw, bp->dev,
@@ -7604,6 +7606,7 @@ static int __bnxt_alloc_fw_health(struct bnxt *bp)
if (!bp->fw_health)
return -ENOMEM;
+ mutex_init(&bp->fw_health->lock);
return 0;
}
@@ -7650,12 +7653,16 @@ static void bnxt_inv_fw_health_reg(struct bnxt *bp)
struct bnxt_fw_health *fw_health = bp->fw_health;
u32 reg_type;
- if (!fw_health || !fw_health->status_reliable)
+ if (!fw_health)
return;
reg_type = BNXT_FW_HEALTH_REG_TYPE(fw_health->regs[BNXT_FW_HEALTH_REG]);
if (reg_type == BNXT_FW_HEALTH_REG_TYPE_GRC)
fw_health->status_reliable = false;
+
+ reg_type = BNXT_FW_HEALTH_REG_TYPE(fw_health->regs[BNXT_FW_RESET_CNT_REG]);
+ if (reg_type == BNXT_FW_HEALTH_REG_TYPE_GRC)
+ fw_health->resets_reliable = false;
}
static void bnxt_try_map_fw_health_reg(struct bnxt *bp)
@@ -7712,6 +7719,7 @@ static int bnxt_map_fw_health_regs(struct bnxt *bp)
int i;
bp->fw_health->status_reliable = false;
+ bp->fw_health->resets_reliable = false;
/* Only pre-map the monitoring GRC registers using window 3 */
for (i = 0; i < 4; i++) {
u32 reg = fw_health->regs[i];
@@ -7725,6 +7733,7 @@ static int bnxt_map_fw_health_regs(struct bnxt *bp)
fw_health->mapped_regs[i] = BNXT_FW_HEALTH_WIN_OFF(reg);
}
bp->fw_health->status_reliable = true;
+ bp->fw_health->resets_reliable = true;
if (reg_base == 0xffffffff)
return 0;
@@ -11264,14 +11273,18 @@ static void bnxt_fw_health_check(struct bnxt *bp)
}
val = bnxt_fw_health_readl(bp, BNXT_FW_HEARTBEAT_REG);
- if (val == fw_health->last_fw_heartbeat)
+ if (val == fw_health->last_fw_heartbeat) {
+ fw_health->arrests++;
goto fw_reset;
+ }
fw_health->last_fw_heartbeat = val;
val = bnxt_fw_health_readl(bp, BNXT_FW_RESET_CNT_REG);
- if (val != fw_health->last_fw_reset_cnt)
+ if (val != fw_health->last_fw_reset_cnt) {
+ fw_health->discoveries++;
goto fw_reset;
+ }
fw_health->tmr_counter = fw_health->tmr_multiplier;
return;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 2873f600a7dd..bbbc63e882d1 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1523,6 +1523,21 @@ struct bnxt_ctx_mem_info {
struct bnxt_mem_init mem_init[BNXT_CTX_MEM_INIT_MAX];
};
+enum bnxt_health_severity {
+ SEVERITY_NORMAL = 0,
+ SEVERITY_WARNING,
+ SEVERITY_RECOVERABLE,
+ SEVERITY_FATAL,
+};
+
+enum bnxt_health_remedy {
+ REMEDY_DEVLINK_RECOVER,
+ REMEDY_POWER_CYCLE_DEVICE,
+ REMEDY_POWER_CYCLE_HOST,
+ REMEDY_FW_UPDATE,
+ REMEDY_HW_REPLACE,
+};
+
struct bnxt_fw_health {
u32 flags;
u32 polling_dsecs;
@@ -1542,6 +1557,7 @@ struct bnxt_fw_health {
u8 enabled:1;
u8 primary:1;
u8 status_reliable:1;
+ u8 resets_reliable:1;
u8 tmr_multiplier;
u8 tmr_counter;
u8 fw_reset_seq_cnt;
@@ -1551,6 +1567,15 @@ struct bnxt_fw_health {
u32 echo_req_data1;
u32 echo_req_data2;
struct devlink_health_reporter *fw_reporter;
+ /* Protects severity and remedy */
+ struct mutex lock;
+ enum bnxt_health_severity severity;
+ enum bnxt_health_remedy remedy;
+ u32 arrests;
+ u32 discoveries;
+ u32 survivals;
+ u32 fatalities;
+ u32 diagnoses;
};
#define BNXT_FW_HEALTH_REG_TYPE_MASK 3
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
index a295d2042b6e..930cbf1ca4e0 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
@@ -71,43 +71,110 @@ static int bnxt_hwrm_remote_dev_reset_set(struct bnxt *bp, bool remote_reset)
return hwrm_req_send(bp, req);
}
+static char *bnxt_health_severity_str(enum bnxt_health_severity severity)
+{
+ switch (severity) {
+ case SEVERITY_NORMAL: return "normal";
+ case SEVERITY_WARNING: return "warning";
+ case SEVERITY_RECOVERABLE: return "recoverable";
+ case SEVERITY_FATAL: return "fatal";
+ default: return "unknown";
+ }
+}
+
+static char *bnxt_health_remedy_str(enum bnxt_health_remedy remedy)
+{
+ switch (remedy) {
+ case REMEDY_DEVLINK_RECOVER: return "devlink recover";
+ case REMEDY_POWER_CYCLE_DEVICE: return "device power cycle";
+ case REMEDY_POWER_CYCLE_HOST: return "host power cycle";
+ case REMEDY_FW_UPDATE: return "update firmware";
+ case REMEDY_HW_REPLACE: return "replace hardware";
+ default: return "unknown";
+ }
+}
+
static int bnxt_fw_diagnose(struct devlink_health_reporter *reporter,
struct devlink_fmsg *fmsg,
struct netlink_ext_ack *extack)
{
struct bnxt *bp = devlink_health_reporter_priv(reporter);
- u32 val;
+ struct bnxt_fw_health *h = bp->fw_health;
+ u32 fw_status, fw_resets;
int rc;
if (test_bit(BNXT_STATE_IN_FW_RESET, &bp->state))
- return 0;
+ return devlink_fmsg_string_pair_put(fmsg, "Status", "recovering");
- val = bnxt_fw_health_readl(bp, BNXT_FW_HEALTH_REG);
+ if (!h->status_reliable)
+ return devlink_fmsg_string_pair_put(fmsg, "Status", "unknown");
- if (BNXT_FW_IS_BOOTING(val)) {
- rc = devlink_fmsg_string_pair_put(fmsg, "Description",
- "Not yet completed initialization");
+ mutex_lock(&h->lock);
+ fw_status = bnxt_fw_health_readl(bp, BNXT_FW_HEALTH_REG);
+ if (BNXT_FW_IS_BOOTING(fw_status)) {
+ rc = devlink_fmsg_string_pair_put(fmsg, "Status", "initializing");
if (rc)
- return rc;
- } else if (BNXT_FW_IS_ERR(val)) {
- rc = devlink_fmsg_string_pair_put(fmsg, "Description",
- "Encountered fatal error and cannot recover");
+ goto unlock;
+ } else if (h->severity || fw_status != BNXT_FW_STATUS_HEALTHY) {
+ if (!h->severity) {
+ h->severity = SEVERITY_FATAL;
+ h->remedy = REMEDY_POWER_CYCLE_DEVICE;
+ h->diagnoses++;
+ devlink_health_report(h->fw_reporter,
+ "FW error diagnosed", h);
+ }
+ rc = devlink_fmsg_string_pair_put(fmsg, "Status", "error");
if (rc)
- return rc;
+ goto unlock;
+ rc = devlink_fmsg_u32_pair_put(fmsg, "Syndrome", fw_status);
+ if (rc)
+ goto unlock;
+ } else {
+ rc = devlink_fmsg_string_pair_put(fmsg, "Status", "healthy");
+ if (rc)
+ goto unlock;
}
- if (val >> 16) {
- rc = devlink_fmsg_u32_pair_put(fmsg, "Error code", val >> 16);
+ rc = devlink_fmsg_string_pair_put(fmsg, "Severity",
+ bnxt_health_severity_str(h->severity));
+ if (rc)
+ goto unlock;
+
+ if (h->severity) {
+ rc = devlink_fmsg_string_pair_put(fmsg, "Remedy",
+ bnxt_health_remedy_str(h->remedy));
if (rc)
- return rc;
+ goto unlock;
+ if (h->remedy == REMEDY_DEVLINK_RECOVER) {
+ rc = devlink_fmsg_string_pair_put(fmsg, "Impact",
+ "traffic+ntuple_cfg");
+ if (rc)
+ goto unlock;
+ }
}
- val = bnxt_fw_health_readl(bp, BNXT_FW_RESET_CNT_REG);
- rc = devlink_fmsg_u32_pair_put(fmsg, "Reset count", val);
- if (rc)
+unlock:
+ mutex_unlock(&h->lock);
+ if (rc || !h->resets_reliable)
return rc;
- return 0;
+ fw_resets = bnxt_fw_health_readl(bp, BNXT_FW_RESET_CNT_REG);
+ rc = devlink_fmsg_u32_pair_put(fmsg, "Resets", fw_resets);
+ if (rc)
+ return rc;
+ rc = devlink_fmsg_u32_pair_put(fmsg, "Arrests", h->arrests);
+ if (rc)
+ return rc;
+ rc = devlink_fmsg_u32_pair_put(fmsg, "Survivals", h->survivals);
+ if (rc)
+ return rc;
+ rc = devlink_fmsg_u32_pair_put(fmsg, "Discoveries", h->discoveries);
+ if (rc)
+ return rc;
+ rc = devlink_fmsg_u32_pair_put(fmsg, "Fatalities", h->fatalities);
+ if (rc)
+ return rc;
+ return devlink_fmsg_u32_pair_put(fmsg, "Diagnoses", h->diagnoses);
}
static int bnxt_fw_recover(struct devlink_health_reporter *reporter,
@@ -116,6 +183,9 @@ static int bnxt_fw_recover(struct devlink_health_reporter *reporter,
{
struct bnxt *bp = devlink_health_reporter_priv(reporter);
+ if (bp->fw_health->severity == SEVERITY_FATAL)
+ return -ENODEV;
+
set_bit(BNXT_STATE_RECOVER, &bp->state);
__bnxt_fw_recover(bp);
@@ -165,6 +235,7 @@ void bnxt_dl_fw_reporters_destroy(struct bnxt *bp, bool all)
void bnxt_devlink_health_fw_report(struct bnxt *bp)
{
struct bnxt_fw_health *fw_health = bp->fw_health;
+ int rc;
if (!fw_health)
return;
@@ -174,20 +245,32 @@ void bnxt_devlink_health_fw_report(struct bnxt *bp)
return;
}
- devlink_health_report(fw_health->fw_reporter, "FW error reported", NULL);
+ mutex_lock(&fw_health->lock);
+ fw_health->severity = SEVERITY_RECOVERABLE;
+ fw_health->remedy = REMEDY_DEVLINK_RECOVER;
+ mutex_unlock(&fw_health->lock);
+ rc = devlink_health_report(fw_health->fw_reporter, "FW error reported",
+ fw_health);
+ if (rc == -ECANCELED)
+ __bnxt_fw_recover(bp);
}
void bnxt_dl_health_fw_status_update(struct bnxt *bp, bool healthy)
{
- struct bnxt_fw_health *health = bp->fw_health;
+ struct bnxt_fw_health *fw_health = bp->fw_health;
u8 state;
- if (healthy)
+ mutex_lock(&fw_health->lock);
+ if (healthy) {
+ fw_health->severity = SEVERITY_NORMAL;
state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY;
- else
+ } else {
+ fw_health->severity = SEVERITY_FATAL;
+ fw_health->remedy = REMEDY_POWER_CYCLE_DEVICE;
state = DEVLINK_HEALTH_REPORTER_STATE_ERROR;
-
- devlink_health_reporter_state_update(health->fw_reporter, state);
+ }
+ mutex_unlock(&fw_health->lock);
+ devlink_health_reporter_state_update(fw_health->fw_reporter, state);
}
void bnxt_dl_health_fw_recovery_done(struct bnxt *bp)
--
2.18.1
[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 4209 bytes --]
next prev parent reply other threads:[~2021-10-29 7:48 UTC|newest]
Thread overview: 22+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-10-29 7:47 [PATCH net-next v2 00/19] bnxt_en: devlink enhancements Michael Chan
2021-10-29 7:47 ` [PATCH net-next v2 01/19] bnxt_en: refactor printing of device info Michael Chan
2021-10-29 7:47 ` [PATCH net-next v2 02/19] bnxt_en: refactor cancellation of resource reservations Michael Chan
2021-10-29 7:47 ` [PATCH net-next v2 03/19] bnxt_en: implement devlink dev reload driver_reinit Michael Chan
2021-10-29 17:06 ` Leon Romanovsky
2021-10-29 7:47 ` [PATCH net-next v2 04/19] bnxt_en: implement devlink dev reload fw_activate Michael Chan
2021-10-29 7:47 ` [PATCH net-next v2 05/19] bnxt_en: add enable_remote_dev_reset devlink parameter Michael Chan
2021-10-29 7:47 ` [PATCH net-next v2 06/19] bnxt_en: improve error recovery information messages Michael Chan
2021-10-29 7:47 ` [PATCH net-next v2 07/19] bnxt_en: remove fw_reset devlink health reporter Michael Chan
2021-10-29 7:47 ` [PATCH net-next v2 08/19] bnxt_en: consolidate fw devlink health reporters Michael Chan
2021-10-29 7:47 ` Michael Chan [this message]
2021-10-29 7:47 ` [PATCH net-next v2 10/19] bnxt_en: Refactor coredump functions Michael Chan
2021-10-29 7:47 ` [PATCH net-next v2 11/19] bnxt_en: move coredump functions into dedicated file Michael Chan
2021-10-29 7:47 ` [PATCH net-next v2 12/19] bnxt_en: Add compression flags information in coredump segment header Michael Chan
2021-10-29 7:47 ` [PATCH net-next v2 13/19] bnxt_en: Retrieve coredump and crashdump size via FW command Michael Chan
2021-10-29 7:47 ` [PATCH net-next v2 14/19] bnxt_en: extract coredump command line from current task Michael Chan
2021-10-29 7:47 ` [PATCH net-next v2 15/19] bnxt_en: implement dump callback for fw health reporter Michael Chan
2021-10-29 7:47 ` [PATCH net-next v2 16/19] bnxt_en: Update firmware interface to 1.10.2.63 Michael Chan
2021-10-29 7:47 ` [PATCH net-next v2 17/19] bnxt_en: implement firmware live patching Michael Chan
2021-10-29 7:47 ` [PATCH net-next v2 18/19] bnxt_en: Provide stored devlink "fw" version on older firmware Michael Chan
2021-10-29 7:47 ` [PATCH net-next v2 19/19] bnxt_en: Update bnxt.rst devlink documentation Michael Chan
2021-10-29 11:30 ` [PATCH net-next v2 00/19] bnxt_en: devlink enhancements patchwork-bot+netdevbpf
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1635493676-10767-10-git-send-email-michael.chan@broadcom.com \
--to=michael.chan@broadcom.com \
--cc=davem@davemloft.net \
--cc=edwin.peer@broadcom.com \
--cc=gospo@broadcom.com \
--cc=jiri@nvidia.com \
--cc=kuba@kernel.org \
--cc=netdev@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.