netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Jakub Kicinski <kuba@kernel.org>
To: netdev@vger.kernel.org
Cc: jiri@resnulli.us, saeedm@nvidia.com,
	andrew.gospodarek@broadcom.com, jacob.e.keller@intel.com,
	guglielmo.morandin@broadcom.com, eugenem@fb.com,
	eranbe@mellanox.com, Jakub Kicinski <kuba@kernel.org>
Subject: [RFC net-next v2 2/3] devlink: health: add remediation type
Date: Wed, 10 Mar 2021 19:26:12 -0800	[thread overview]
Message-ID: <20210311032613.1533100-2-kuba@kernel.org> (raw)
In-Reply-To: <20210311032613.1533100-1-kuba@kernel.org>

Currently devlink health does not give the user any clear information
about what kind of remediation the ->recover callback will perform. This
makes it difficult to understand the impact of enabling
auto-remediation, and the severity of the error itself.

To allow users to make a more informed decision, add a new remediation
type attribute.

Note that we only allow one remediation type per reporter; this
is intentional. devlink health is not built for mixing issues
of different severity into one reporter, since it only maintains
one dump (of the first event) and a single error counter.
Nudging vendors towards categorizing issues beyond coarse
groups is an added bonus.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/devlink.h        |  2 ++
 include/uapi/linux/devlink.h | 25 +++++++++++++++++++++++++
 net/core/devlink.c           |  7 ++++++-
 3 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/include/net/devlink.h b/include/net/devlink.h
index b424328af658..72b37769761f 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -659,6 +659,7 @@ struct devlink_health_reporter;
 /**
  * struct devlink_health_reporter_ops - Reporter operations
  * @name: reporter name
+ * @remedy: severity of the remediation required
  * @recover: callback to recover from reported error
  *           if priv_ctx is NULL, run a full recover
  * @dump: callback to dump an object
@@ -669,6 +670,7 @@ struct devlink_health_reporter;
 
 struct devlink_health_reporter_ops {
 	char *name;
+	enum devlink_health_remedy remedy;
 	int (*recover)(struct devlink_health_reporter *reporter,
 		       void *priv_ctx, struct netlink_ext_ack *extack);
 	int (*dump)(struct devlink_health_reporter *reporter,
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 41a6ea3b2256..8cd1508b525b 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -534,6 +534,9 @@ enum devlink_attr {
 	DEVLINK_ATTR_RELOAD_ACTION_STATS,       /* nested */
 
 	DEVLINK_ATTR_PORT_PCI_SF_NUMBER,	/* u32 */
+
+	DEVLINK_ATTR_HEALTH_REPORTER_REMEDY,	/* u32 */
+
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
@@ -620,4 +623,26 @@ enum devlink_health_state {
 	DL_HEALTH_STATE_ERROR,
 };
 
+/**
+ * enum devlink_health_remedy - severity of remediation procedure
+ * @DL_HEALTH_REMEDY_NONE: transient error, no remediation required
+ * @DL_HEALTH_REMEDY_KICK: device stalled, processing will be re-triggered
+ * @DL_HEALTH_REMEDY_COMP_RESET: associated device component (e.g. device queue)
+ *			will be reset
+ * @DL_HEALTH_REMEDY_RESET: full device reset, will result in temporary
+ *			unavailability of the device, device configuration
+ *			should not be lost
+ * @DL_HEALTH_REMEDY_REINIT: device will be reinitialized and configuration lost
+ *
+ * Used in %DEVLINK_ATTR_HEALTH_REPORTER_REMEDY, categorizes the health reporter
+ * by the severity of the remediation.
+ */
+enum devlink_health_remedy {
+	DL_HEALTH_REMEDY_NONE = 1,
+	DL_HEALTH_REMEDY_KICK,
+	DL_HEALTH_REMEDY_COMP_RESET,
+	DL_HEALTH_REMEDY_RESET,
+	DL_HEALTH_REMEDY_REINIT,
+};
+
 #endif /* _UAPI_LINUX_DEVLINK_H_ */
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 8e4e4bd7bb36..09d77d43ff63 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -6095,7 +6095,8 @@ __devlink_health_reporter_create(struct devlink *devlink,
 {
 	struct devlink_health_reporter *reporter;
 
-	if (WARN_ON(graceful_period && !ops->recover))
+	if (WARN_ON(graceful_period && !ops->recover) ||
+	    WARN_ON(ops->recover && !ops->remedy))
 		return ERR_PTR(-EINVAL);
 
 	reporter = kzalloc(sizeof(*reporter), GFP_KERNEL);
@@ -6265,6 +6266,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg,
 	if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME,
 			   reporter->ops->name))
 		goto reporter_nest_cancel;
+	if (reporter->ops->remedy &&
+	    nla_put_u32(msg, DEVLINK_ATTR_HEALTH_REPORTER_REMEDY,
+			reporter->ops->remedy))
+		goto reporter_nest_cancel;
 	if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE,
 		       reporter->health_state))
 		goto reporter_nest_cancel;
-- 
2.29.2


  reply	other threads:[~2021-03-11  3:27 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-03-11  3:26 [RFC net-next v2 1/3] devlink: move health state to uAPI Jakub Kicinski
2021-03-11  3:26 ` Jakub Kicinski [this message]
2021-03-11  7:48   ` [RFC net-next v2 2/3] devlink: health: add remediation type Jiri Pirko
2021-03-11 14:32   ` Eran Ben Elisha
2021-03-11 16:45     ` Jakub Kicinski
2021-03-11  3:26 ` [RFC net-next v2 3/3] devlink: add more failure modes Jakub Kicinski
2021-03-11 14:23   ` Eran Ben Elisha
2021-03-11 16:49     ` Jakub Kicinski
2021-03-14 12:33       ` Eran Ben Elisha
2021-03-15 17:06         ` Jakub Kicinski
2021-03-11  7:47 ` [RFC net-next v2 1/3] devlink: move health state to uAPI Jiri Pirko
2021-03-11 16:46   ` Jakub Kicinski
2021-03-12 19:56     ` Keller, Jacob E

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210311032613.1533100-2-kuba@kernel.org \
    --to=kuba@kernel.org \
    --cc=andrew.gospodarek@broadcom.com \
    --cc=eranbe@mellanox.com \
    --cc=eugenem@fb.com \
    --cc=f242ed68-d31b-527d-562f-c5a35123861a@intel.com \
    --cc=guglielmo.morandin@broadcom.com \
    --cc=jacob.e.keller@intel.com \
    --cc=jiri@resnulli.us \
    --cc=netdev@vger.kernel.org \
    --cc=saeedm@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).