* [PATCH net-next V2 1/5] devlink: Move graceful period parameter to reporter ops
2025-07-24 20:48 [PATCH net-next V2 0/5] Expose grace period delay for devlink health reporter Tariq Toukan
@ 2025-07-24 20:48 ` Tariq Toukan
2025-07-24 20:48 ` [PATCH net-next V2 2/5] devlink: Move health reporter recovery abort logic to a separate function Tariq Toukan
` (4 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Tariq Toukan @ 2025-07-24 20:48 UTC (permalink / raw)
To: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrew Lunn,
David S. Miller, Jiri Pirko, Jiri Pirko
Cc: Donald Hunter, Jonathan Corbet, Brett Creeley, Michael Chan,
Pavan Chebbi, Cai Huoqing, Tony Nguyen, Przemek Kitszel,
Sunil Goutham, Linu Cherian, Geetha sowjanya, Jerin Jacob,
hariprasad, Subbaraya Sundeep, Saeed Mahameed, Leon Romanovsky,
Tariq Toukan, Mark Bloch, Ido Schimmel, Petr Machata,
Manish Chopra, netdev, linux-kernel, linux-doc, intel-wired-lan,
linux-rdma, Shahar Shitrit, Gal Pressman
From: Shahar Shitrit <shshitrit@nvidia.com>
Move the default graceful period from a parameter of
devlink_health_reporter_create() to a field in the
devlink_health_reporter_ops structure.
This change improves consistency, as the graceful period is inherently
tied to the reporter's behavior and recovery policy. It simplifies the
signature of devlink_health_reporter_create() and its internal helper
functions. It also centralizes the reporter configuration in the ops
structure, preparing the groundwork for a downstream patch that will
introduce a devlink health reporter grace period delay attribute whose
default value will similarly be provided by the driver via the ops
structure.
Signed-off-by: Shahar Shitrit <shshitrit@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
drivers/net/ethernet/amd/pds_core/main.c | 2 +-
.../net/ethernet/broadcom/bnxt/bnxt_devlink.c | 2 +-
.../net/ethernet/huawei/hinic/hinic_devlink.c | 10 +++--
.../net/ethernet/intel/ice/devlink/health.c | 3 +-
.../marvell/octeontx2/af/rvu_devlink.c | 32 +++++++++++----
.../mellanox/mlx5/core/diag/reporter_vnic.c | 2 +-
.../mellanox/mlx5/core/en/reporter_rx.c | 10 +++--
.../mellanox/mlx5/core/en/reporter_tx.c | 10 +++--
.../net/ethernet/mellanox/mlx5/core/en_rep.c | 2 +-
.../net/ethernet/mellanox/mlx5/core/health.c | 41 +++++++++++--------
drivers/net/ethernet/mellanox/mlxsw/core.c | 2 +-
drivers/net/ethernet/qlogic/qed/qed_devlink.c | 10 +++--
drivers/net/netdevsim/health.c | 4 +-
include/net/devlink.h | 11 +++--
net/devlink/health.c | 28 +++++--------
15 files changed, 98 insertions(+), 71 deletions(-)
diff --git a/drivers/net/ethernet/amd/pds_core/main.c b/drivers/net/ethernet/amd/pds_core/main.c
index 9b81e1c260c2..c7a2eff57632 100644
--- a/drivers/net/ethernet/amd/pds_core/main.c
+++ b/drivers/net/ethernet/amd/pds_core/main.c
@@ -280,7 +280,7 @@ static int pdsc_init_pf(struct pdsc *pdsc)
goto err_out_del_dev;
}
- hr = devl_health_reporter_create(dl, &pdsc_fw_reporter_ops, 0, pdsc);
+ hr = devl_health_reporter_create(dl, &pdsc_fw_reporter_ops, pdsc);
if (IS_ERR(hr)) {
devl_unlock(dl);
dev_warn(pdsc->dev, "Failed to create fw reporter: %pe\n", hr);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
index 4c4581b0342e..43fb75806cd6 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
@@ -220,7 +220,7 @@ __bnxt_dl_reporter_create(struct bnxt *bp,
{
struct devlink_health_reporter *reporter;
- reporter = devlink_health_reporter_create(bp->dl, ops, 0, bp);
+ reporter = devlink_health_reporter_create(bp->dl, ops, bp);
if (IS_ERR(reporter)) {
netdev_warn(bp->dev, "Failed to create %s health reporter, rc = %ld\n",
ops->name, PTR_ERR(reporter));
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_devlink.c b/drivers/net/ethernet/huawei/hinic/hinic_devlink.c
index 03e42512a2d5..300bc267a259 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_devlink.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_devlink.c
@@ -443,8 +443,9 @@ int hinic_health_reporters_create(struct hinic_devlink_priv *priv)
struct devlink *devlink = priv_to_devlink(priv);
priv->hw_fault_reporter =
- devlink_health_reporter_create(devlink, &hinic_hw_fault_reporter_ops,
- 0, priv);
+ devlink_health_reporter_create(devlink,
+ &hinic_hw_fault_reporter_ops,
+ priv);
if (IS_ERR(priv->hw_fault_reporter)) {
dev_warn(&priv->hwdev->hwif->pdev->dev, "Failed to create hw fault reporter, err: %ld\n",
PTR_ERR(priv->hw_fault_reporter));
@@ -452,8 +453,9 @@ int hinic_health_reporters_create(struct hinic_devlink_priv *priv)
}
priv->fw_fault_reporter =
- devlink_health_reporter_create(devlink, &hinic_fw_fault_reporter_ops,
- 0, priv);
+ devlink_health_reporter_create(devlink,
+ &hinic_fw_fault_reporter_ops,
+ priv);
if (IS_ERR(priv->fw_fault_reporter)) {
dev_warn(&priv->hwdev->hwif->pdev->dev, "Failed to create fw fault reporter, err: %ld\n",
PTR_ERR(priv->fw_fault_reporter));
diff --git a/drivers/net/ethernet/intel/ice/devlink/health.c b/drivers/net/ethernet/intel/ice/devlink/health.c
index 19c3d37aa768..3177496e7828 100644
--- a/drivers/net/ethernet/intel/ice/devlink/health.c
+++ b/drivers/net/ethernet/intel/ice/devlink/health.c
@@ -448,9 +448,8 @@ ice_init_devlink_rep(struct ice_pf *pf,
{
struct devlink *devlink = priv_to_devlink(pf);
struct devlink_health_reporter *rep;
- const u64 graceful_period = 0;
- rep = devl_health_reporter_create(devlink, ops, graceful_period, pf);
+ rep = devl_health_reporter_create(devlink, ops, pf);
if (IS_ERR(rep)) {
struct device *dev = ice_pf_to_dev(pf);
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c
index 27c3a2daaaa9..3735372539bd 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c
@@ -505,7 +505,9 @@ static int rvu_nix_register_reporters(struct rvu_devlink *rvu_dl)
rvu_reporters->nix_event_ctx = nix_event_context;
rvu_reporters->rvu_hw_nix_intr_reporter =
- devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_nix_intr_reporter_ops, 0, rvu);
+ devlink_health_reporter_create(rvu_dl->dl,
+ &rvu_hw_nix_intr_reporter_ops,
+ rvu);
if (IS_ERR(rvu_reporters->rvu_hw_nix_intr_reporter)) {
dev_warn(rvu->dev, "Failed to create hw_nix_intr reporter, err=%ld\n",
PTR_ERR(rvu_reporters->rvu_hw_nix_intr_reporter));
@@ -513,7 +515,9 @@ static int rvu_nix_register_reporters(struct rvu_devlink *rvu_dl)
}
rvu_reporters->rvu_hw_nix_gen_reporter =
- devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_nix_gen_reporter_ops, 0, rvu);
+ devlink_health_reporter_create(rvu_dl->dl,
+ &rvu_hw_nix_gen_reporter_ops,
+ rvu);
if (IS_ERR(rvu_reporters->rvu_hw_nix_gen_reporter)) {
dev_warn(rvu->dev, "Failed to create hw_nix_gen reporter, err=%ld\n",
PTR_ERR(rvu_reporters->rvu_hw_nix_gen_reporter));
@@ -521,7 +525,9 @@ static int rvu_nix_register_reporters(struct rvu_devlink *rvu_dl)
}
rvu_reporters->rvu_hw_nix_err_reporter =
- devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_nix_err_reporter_ops, 0, rvu);
+ devlink_health_reporter_create(rvu_dl->dl,
+ &rvu_hw_nix_err_reporter_ops,
+ rvu);
if (IS_ERR(rvu_reporters->rvu_hw_nix_err_reporter)) {
dev_warn(rvu->dev, "Failed to create hw_nix_err reporter, err=%ld\n",
PTR_ERR(rvu_reporters->rvu_hw_nix_err_reporter));
@@ -529,7 +535,9 @@ static int rvu_nix_register_reporters(struct rvu_devlink *rvu_dl)
}
rvu_reporters->rvu_hw_nix_ras_reporter =
- devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_nix_ras_reporter_ops, 0, rvu);
+ devlink_health_reporter_create(rvu_dl->dl,
+ &rvu_hw_nix_ras_reporter_ops,
+ rvu);
if (IS_ERR(rvu_reporters->rvu_hw_nix_ras_reporter)) {
dev_warn(rvu->dev, "Failed to create hw_nix_ras reporter, err=%ld\n",
PTR_ERR(rvu_reporters->rvu_hw_nix_ras_reporter));
@@ -1051,7 +1059,9 @@ static int rvu_npa_register_reporters(struct rvu_devlink *rvu_dl)
rvu_reporters->npa_event_ctx = npa_event_context;
rvu_reporters->rvu_hw_npa_intr_reporter =
- devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_npa_intr_reporter_ops, 0, rvu);
+ devlink_health_reporter_create(rvu_dl->dl,
+ &rvu_hw_npa_intr_reporter_ops,
+ rvu);
if (IS_ERR(rvu_reporters->rvu_hw_npa_intr_reporter)) {
dev_warn(rvu->dev, "Failed to create hw_npa_intr reporter, err=%ld\n",
PTR_ERR(rvu_reporters->rvu_hw_npa_intr_reporter));
@@ -1059,7 +1069,9 @@ static int rvu_npa_register_reporters(struct rvu_devlink *rvu_dl)
}
rvu_reporters->rvu_hw_npa_gen_reporter =
- devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_npa_gen_reporter_ops, 0, rvu);
+ devlink_health_reporter_create(rvu_dl->dl,
+ &rvu_hw_npa_gen_reporter_ops,
+ rvu);
if (IS_ERR(rvu_reporters->rvu_hw_npa_gen_reporter)) {
dev_warn(rvu->dev, "Failed to create hw_npa_gen reporter, err=%ld\n",
PTR_ERR(rvu_reporters->rvu_hw_npa_gen_reporter));
@@ -1067,7 +1079,9 @@ static int rvu_npa_register_reporters(struct rvu_devlink *rvu_dl)
}
rvu_reporters->rvu_hw_npa_err_reporter =
- devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_npa_err_reporter_ops, 0, rvu);
+ devlink_health_reporter_create(rvu_dl->dl,
+ &rvu_hw_npa_err_reporter_ops,
+ rvu);
if (IS_ERR(rvu_reporters->rvu_hw_npa_err_reporter)) {
dev_warn(rvu->dev, "Failed to create hw_npa_err reporter, err=%ld\n",
PTR_ERR(rvu_reporters->rvu_hw_npa_err_reporter));
@@ -1075,7 +1089,9 @@ static int rvu_npa_register_reporters(struct rvu_devlink *rvu_dl)
}
rvu_reporters->rvu_hw_npa_ras_reporter =
- devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_npa_ras_reporter_ops, 0, rvu);
+ devlink_health_reporter_create(rvu_dl->dl,
+ &rvu_hw_npa_ras_reporter_ops,
+ rvu);
if (IS_ERR(rvu_reporters->rvu_hw_npa_ras_reporter)) {
dev_warn(rvu->dev, "Failed to create hw_npa_ras reporter, err=%ld\n",
PTR_ERR(rvu_reporters->rvu_hw_npa_ras_reporter));
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c
index 86253a89c24c..878f9b46bf18 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c
@@ -133,7 +133,7 @@ void mlx5_reporter_vnic_create(struct mlx5_core_dev *dev)
health->vnic_reporter =
devlink_health_reporter_create(devlink,
&mlx5_reporter_vnic_ops,
- 0, dev);
+ dev);
if (IS_ERR(health->vnic_reporter))
mlx5_core_warn(dev,
"Failed to create vnic reporter, err = %ld\n",
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
index e75759533ae0..e106f0696486 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
@@ -644,22 +644,24 @@ void mlx5e_reporter_icosq_resume_recovery(struct mlx5e_channel *c)
mutex_unlock(&c->icosq_recovery_lock);
}
+#define MLX5E_REPORTER_RX_GRACEFUL_PERIOD 500
+
static const struct devlink_health_reporter_ops mlx5_rx_reporter_ops = {
.name = "rx",
.recover = mlx5e_rx_reporter_recover,
.diagnose = mlx5e_rx_reporter_diagnose,
.dump = mlx5e_rx_reporter_dump,
+ .default_graceful_period = MLX5E_REPORTER_RX_GRACEFUL_PERIOD,
};
-#define MLX5E_REPORTER_RX_GRACEFUL_PERIOD 500
-
void mlx5e_reporter_rx_create(struct mlx5e_priv *priv)
{
+ struct devlink_port *port = priv->netdev->devlink_port;
struct devlink_health_reporter *reporter;
- reporter = devlink_port_health_reporter_create(priv->netdev->devlink_port,
+ reporter = devlink_port_health_reporter_create(port,
&mlx5_rx_reporter_ops,
- MLX5E_REPORTER_RX_GRACEFUL_PERIOD, priv);
+ priv);
if (IS_ERR(reporter)) {
netdev_warn(priv->netdev, "Failed to create rx reporter, err = %ld\n",
PTR_ERR(reporter));
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
index bd96988e102c..6fb0d143ad1b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
@@ -514,22 +514,24 @@ void mlx5e_reporter_tx_ptpsq_unhealthy(struct mlx5e_ptpsq *ptpsq)
mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
}
+#define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
+
static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
.name = "tx",
.recover = mlx5e_tx_reporter_recover,
.diagnose = mlx5e_tx_reporter_diagnose,
.dump = mlx5e_tx_reporter_dump,
+ .default_graceful_period = MLX5_REPORTER_TX_GRACEFUL_PERIOD,
};
-#define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
-
void mlx5e_reporter_tx_create(struct mlx5e_priv *priv)
{
+ struct devlink_port *port = priv->netdev->devlink_port;
struct devlink_health_reporter *reporter;
- reporter = devlink_port_health_reporter_create(priv->netdev->devlink_port,
+ reporter = devlink_port_health_reporter_create(port,
&mlx5_tx_reporter_ops,
- MLX5_REPORTER_TX_GRACEFUL_PERIOD, priv);
+ priv);
if (IS_ERR(reporter)) {
netdev_warn(priv->netdev,
"Failed to create tx reporter, err = %ld\n",
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 63a7a788fb0d..b231e7855bca 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -1447,7 +1447,7 @@ static void mlx5e_rep_vnic_reporter_create(struct mlx5e_priv *priv,
reporter = devl_port_health_reporter_create(dl_port,
&mlx5_rep_vnic_reporter_ops,
- 0, rpriv);
+ rpriv);
if (IS_ERR(reporter)) {
mlx5_core_err(priv->mdev,
"Failed to create representor vnic reporter, err = %ld\n",
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index cf7a1edd0530..6959fea03443 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -669,54 +669,61 @@ static void mlx5_fw_fatal_reporter_err_work(struct work_struct *work)
}
}
+#define MLX5_FW_REPORTER_ECPF_GRACEFUL_PERIOD 180000
+#define MLX5_FW_REPORTER_PF_GRACEFUL_PERIOD 60000
+#define MLX5_FW_REPORTER_VF_GRACEFUL_PERIOD 30000
+#define MLX5_FW_REPORTER_DEFAULT_GRACEFUL_PERIOD \
+ MLX5_FW_REPORTER_VF_GRACEFUL_PERIOD
+
+static const
+struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ecpf_ops = {
+ .name = "fw_fatal",
+ .recover = mlx5_fw_fatal_reporter_recover,
+ .dump = mlx5_fw_fatal_reporter_dump,
+ .default_graceful_period =
+ MLX5_FW_REPORTER_ECPF_GRACEFUL_PERIOD,
+};
+
static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_pf_ops = {
.name = "fw_fatal",
.recover = mlx5_fw_fatal_reporter_recover,
.dump = mlx5_fw_fatal_reporter_dump,
+ .default_graceful_period = MLX5_FW_REPORTER_PF_GRACEFUL_PERIOD,
};
static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = {
.name = "fw_fatal",
.recover = mlx5_fw_fatal_reporter_recover,
+ .default_graceful_period =
+ MLX5_FW_REPORTER_DEFAULT_GRACEFUL_PERIOD,
};
-#define MLX5_FW_REPORTER_ECPF_GRACEFUL_PERIOD 180000
-#define MLX5_FW_REPORTER_PF_GRACEFUL_PERIOD 60000
-#define MLX5_FW_REPORTER_VF_GRACEFUL_PERIOD 30000
-#define MLX5_FW_REPORTER_DEFAULT_GRACEFUL_PERIOD MLX5_FW_REPORTER_VF_GRACEFUL_PERIOD
-
void mlx5_fw_reporters_create(struct mlx5_core_dev *dev)
{
const struct devlink_health_reporter_ops *fw_fatal_ops;
struct mlx5_core_health *health = &dev->priv.health;
const struct devlink_health_reporter_ops *fw_ops;
struct devlink *devlink = priv_to_devlink(dev);
- u64 grace_period;
- fw_fatal_ops = &mlx5_fw_fatal_reporter_pf_ops;
fw_ops = &mlx5_fw_reporter_pf_ops;
if (mlx5_core_is_ecpf(dev)) {
- grace_period = MLX5_FW_REPORTER_ECPF_GRACEFUL_PERIOD;
+ fw_fatal_ops = &mlx5_fw_fatal_reporter_ecpf_ops;
} else if (mlx5_core_is_pf(dev)) {
- grace_period = MLX5_FW_REPORTER_PF_GRACEFUL_PERIOD;
+ fw_fatal_ops = &mlx5_fw_fatal_reporter_pf_ops;
} else {
/* VF or SF */
- grace_period = MLX5_FW_REPORTER_DEFAULT_GRACEFUL_PERIOD;
fw_fatal_ops = &mlx5_fw_fatal_reporter_ops;
fw_ops = &mlx5_fw_reporter_ops;
}
- health->fw_reporter =
- devl_health_reporter_create(devlink, fw_ops, 0, dev);
+ health->fw_reporter = devl_health_reporter_create(devlink, fw_ops, dev);
if (IS_ERR(health->fw_reporter))
mlx5_core_warn(dev, "Failed to create fw reporter, err = %ld\n",
PTR_ERR(health->fw_reporter));
- health->fw_fatal_reporter =
- devl_health_reporter_create(devlink,
- fw_fatal_ops,
- grace_period,
- dev);
+ health->fw_fatal_reporter = devl_health_reporter_create(devlink,
+ fw_fatal_ops,
+ dev);
if (IS_ERR(health->fw_fatal_reporter))
mlx5_core_warn(dev, "Failed to create fw fatal reporter, err = %ld\n",
PTR_ERR(health->fw_fatal_reporter));
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index 2bb2b77351bd..980f3223f124 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -2043,7 +2043,7 @@ static int mlxsw_core_health_init(struct mlxsw_core *mlxsw_core)
return 0;
fw_fatal = devl_health_reporter_create(devlink, &mlxsw_core_health_fw_fatal_ops,
- 0, mlxsw_core);
+ mlxsw_core);
if (IS_ERR(fw_fatal)) {
dev_err(mlxsw_core->bus_info->dev, "Failed to create fw fatal reporter");
return PTR_ERR(fw_fatal);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_devlink.c b/drivers/net/ethernet/qlogic/qed/qed_devlink.c
index 1adc7fbb3f2f..d000ed734c7c 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_devlink.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_devlink.c
@@ -87,20 +87,22 @@ qed_fw_fatal_reporter_recover(struct devlink_health_reporter *reporter,
return 0;
}
+#define QED_REPORTER_FW_GRACEFUL_PERIOD 0
+
static const struct devlink_health_reporter_ops qed_fw_fatal_reporter_ops = {
.name = "fw_fatal",
.recover = qed_fw_fatal_reporter_recover,
.dump = qed_fw_fatal_reporter_dump,
+ .default_graceful_period = QED_REPORTER_FW_GRACEFUL_PERIOD,
};
-#define QED_REPORTER_FW_GRACEFUL_PERIOD 0
-
void qed_fw_reporters_create(struct devlink *devlink)
{
struct qed_devlink *dl = devlink_priv(devlink);
- dl->fw_reporter = devlink_health_reporter_create(devlink, &qed_fw_fatal_reporter_ops,
- QED_REPORTER_FW_GRACEFUL_PERIOD, dl);
+ dl->fw_reporter =
+ devlink_health_reporter_create(devlink,
+ &qed_fw_fatal_reporter_ops, dl);
if (IS_ERR(dl->fw_reporter)) {
DP_NOTICE(dl->cdev, "Failed to create fw reporter, err = %ld\n",
PTR_ERR(dl->fw_reporter));
diff --git a/drivers/net/netdevsim/health.c b/drivers/net/netdevsim/health.c
index 688f05316b5e..3bd0e7a489c3 100644
--- a/drivers/net/netdevsim/health.c
+++ b/drivers/net/netdevsim/health.c
@@ -183,14 +183,14 @@ int nsim_dev_health_init(struct nsim_dev *nsim_dev, struct devlink *devlink)
health->empty_reporter =
devl_health_reporter_create(devlink,
&nsim_dev_empty_reporter_ops,
- 0, health);
+ health);
if (IS_ERR(health->empty_reporter))
return PTR_ERR(health->empty_reporter);
health->dummy_reporter =
devl_health_reporter_create(devlink,
&nsim_dev_dummy_reporter_ops,
- 0, health);
+ health);
if (IS_ERR(health->dummy_reporter)) {
err = PTR_ERR(health->dummy_reporter);
goto err_empty_reporter_destroy;
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 93640a29427c..a65aa24e8df4 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -742,6 +742,8 @@ enum devlink_health_reporter_state {
* if priv_ctx is NULL, run a full dump
* @diagnose: callback to diagnose the current status
* @test: callback to trigger a test event
+ * @default_graceful_period: default min time (in msec)
+ * between recovery attempts
*/
struct devlink_health_reporter_ops {
@@ -756,6 +758,7 @@ struct devlink_health_reporter_ops {
struct netlink_ext_ack *extack);
int (*test)(struct devlink_health_reporter *reporter,
struct netlink_ext_ack *extack);
+ u64 default_graceful_period;
};
/**
@@ -1924,22 +1927,22 @@ void devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name,
struct devlink_health_reporter *
devl_port_health_reporter_create(struct devlink_port *port,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, void *priv);
+ void *priv);
struct devlink_health_reporter *
devlink_port_health_reporter_create(struct devlink_port *port,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, void *priv);
+ void *priv);
struct devlink_health_reporter *
devl_health_reporter_create(struct devlink *devlink,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, void *priv);
+ void *priv);
struct devlink_health_reporter *
devlink_health_reporter_create(struct devlink *devlink,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, void *priv);
+ void *priv);
void
devl_health_reporter_destroy(struct devlink_health_reporter *reporter);
diff --git a/net/devlink/health.c b/net/devlink/health.c
index b3ce8ecbb7fb..ba144b7426fa 100644
--- a/net/devlink/health.c
+++ b/net/devlink/health.c
@@ -108,11 +108,11 @@ devlink_port_health_reporter_find_by_name(struct devlink_port *devlink_port,
static struct devlink_health_reporter *
__devlink_health_reporter_create(struct devlink *devlink,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, void *priv)
+ void *priv)
{
struct devlink_health_reporter *reporter;
- if (WARN_ON(graceful_period && !ops->recover))
+ if (WARN_ON(ops->default_graceful_period && !ops->recover))
return ERR_PTR(-EINVAL);
reporter = kzalloc(sizeof(*reporter), GFP_KERNEL);
@@ -122,7 +122,7 @@ __devlink_health_reporter_create(struct devlink *devlink,
reporter->priv = priv;
reporter->ops = ops;
reporter->devlink = devlink;
- reporter->graceful_period = graceful_period;
+ reporter->graceful_period = ops->default_graceful_period;
reporter->auto_recover = !!ops->recover;
reporter->auto_dump = !!ops->dump;
return reporter;
@@ -134,13 +134,12 @@ __devlink_health_reporter_create(struct devlink *devlink,
*
* @port: devlink_port to which health reports will relate
* @ops: devlink health reporter ops
- * @graceful_period: min time (in msec) between recovery attempts
* @priv: driver priv pointer
*/
struct devlink_health_reporter *
devl_port_health_reporter_create(struct devlink_port *port,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, void *priv)
+ void *priv)
{
struct devlink_health_reporter *reporter;
@@ -150,8 +149,7 @@ devl_port_health_reporter_create(struct devlink_port *port,
ops->name))
return ERR_PTR(-EEXIST);
- reporter = __devlink_health_reporter_create(port->devlink, ops,
- graceful_period, priv);
+ reporter = __devlink_health_reporter_create(port->devlink, ops, priv);
if (IS_ERR(reporter))
return reporter;
@@ -164,14 +162,13 @@ EXPORT_SYMBOL_GPL(devl_port_health_reporter_create);
struct devlink_health_reporter *
devlink_port_health_reporter_create(struct devlink_port *port,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, void *priv)
+ void *priv)
{
struct devlink_health_reporter *reporter;
struct devlink *devlink = port->devlink;
devl_lock(devlink);
- reporter = devl_port_health_reporter_create(port, ops,
- graceful_period, priv);
+ reporter = devl_port_health_reporter_create(port, ops, priv);
devl_unlock(devlink);
return reporter;
}
@@ -182,13 +179,12 @@ EXPORT_SYMBOL_GPL(devlink_port_health_reporter_create);
*
* @devlink: devlink instance which the health reports will relate
* @ops: devlink health reporter ops
- * @graceful_period: min time (in msec) between recovery attempts
* @priv: driver priv pointer
*/
struct devlink_health_reporter *
devl_health_reporter_create(struct devlink *devlink,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, void *priv)
+ void *priv)
{
struct devlink_health_reporter *reporter;
@@ -197,8 +193,7 @@ devl_health_reporter_create(struct devlink *devlink,
if (devlink_health_reporter_find_by_name(devlink, ops->name))
return ERR_PTR(-EEXIST);
- reporter = __devlink_health_reporter_create(devlink, ops,
- graceful_period, priv);
+ reporter = __devlink_health_reporter_create(devlink, ops, priv);
if (IS_ERR(reporter))
return reporter;
@@ -210,13 +205,12 @@ EXPORT_SYMBOL_GPL(devl_health_reporter_create);
struct devlink_health_reporter *
devlink_health_reporter_create(struct devlink *devlink,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, void *priv)
+ void *priv)
{
struct devlink_health_reporter *reporter;
devl_lock(devlink);
- reporter = devl_health_reporter_create(devlink, ops,
- graceful_period, priv);
+ reporter = devl_health_reporter_create(devlink, ops, priv);
devl_unlock(devlink);
return reporter;
}
--
2.31.1
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH net-next V2 4/5] devlink: Make health reporter grace period delay configurable
2025-07-24 20:48 [PATCH net-next V2 0/5] Expose grace period delay for devlink health reporter Tariq Toukan
` (2 preceding siblings ...)
2025-07-24 20:48 ` [PATCH net-next V2 3/5] devlink: Introduce grace period delay for health reporter Tariq Toukan
@ 2025-07-24 20:48 ` Tariq Toukan
2025-07-24 20:48 ` [PATCH net-next V2 5/5] net/mlx5e: Set default grace period delay for TX and RX reporters Tariq Toukan
2025-07-25 1:01 ` [PATCH net-next V2 0/5] Expose grace period delay for devlink health reporter Jakub Kicinski
5 siblings, 0 replies; 7+ messages in thread
From: Tariq Toukan @ 2025-07-24 20:48 UTC (permalink / raw)
To: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrew Lunn,
David S. Miller, Jiri Pirko, Jiri Pirko
Cc: Donald Hunter, Jonathan Corbet, Brett Creeley, Michael Chan,
Pavan Chebbi, Cai Huoqing, Tony Nguyen, Przemek Kitszel,
Sunil Goutham, Linu Cherian, Geetha sowjanya, Jerin Jacob,
hariprasad, Subbaraya Sundeep, Saeed Mahameed, Leon Romanovsky,
Tariq Toukan, Mark Bloch, Ido Schimmel, Petr Machata,
Manish Chopra, netdev, linux-kernel, linux-doc, intel-wired-lan,
linux-rdma, Shahar Shitrit, Gal Pressman
From: Shahar Shitrit <shshitrit@nvidia.com>
Enable configuration of the grace period delay — a time window
starting from the first error recovery, during which the reporter
allows recovery attempts for each reported error.
This feature is helpful when a single underlying issue causes
multiple errors, as it delays the start of the grace period
to allow sufficient time for recovering all related errors.
For example, if multiple TX queues time out simultaneously,
a sufficient grace period delay could allow all affected TX
queues to be recovered within that window. Without this delay,
only the first TX queue that reports a timeout will undergo
recovery, while the remaining TX queues will be blocked once
the grace period begins.
Configuration example:
$ devlink health set pci/0000:00:09.0 reporter tx grace_period_delay 500
Configuration example with ynl:
./tools/net/ynl/pyynl/cli.py \
--spec Documentation/netlink/specs/devlink.yaml \
--do health-reporter-set --json '{
"bus-name": "auxiliary",
"dev-name": "mlx5_core.eth.0",
"port-index": 65535,
"health-reporter-name": "tx",
"health-reporter-gp-delay": 500
}'
Signed-off-by: Shahar Shitrit <shshitrit@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
Documentation/netlink/specs/devlink.yaml | 6 ++++
.../networking/devlink/devlink-health.rst | 2 +-
include/uapi/linux/devlink.h | 2 ++
net/devlink/health.c | 29 +++++++++++++++++--
net/devlink/netlink_gen.c | 5 ++--
5 files changed, 38 insertions(+), 6 deletions(-)
diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml
index bb87111d5e16..9b996d0abfd3 100644
--- a/Documentation/netlink/specs/devlink.yaml
+++ b/Documentation/netlink/specs/devlink.yaml
@@ -853,6 +853,9 @@ attribute-sets:
type: nest
multi-attr: true
nested-attributes: dl-rate-tc-bws
+ -
+ name: health-reporter-gp-delay
+ type: u64
-
name: dl-dev-stats
subset-of: devlink
@@ -1216,6 +1219,8 @@ attribute-sets:
name: health-reporter-dump-ts-ns
-
name: health-reporter-auto-dump
+ -
+ name: health-reporter-gp-delay
-
name: dl-attr-stats
@@ -1961,6 +1966,7 @@ operations:
- health-reporter-graceful-period
- health-reporter-auto-recover
- health-reporter-auto-dump
+ - health-reporter-gp-delay
-
name: health-reporter-recover
diff --git a/Documentation/networking/devlink/devlink-health.rst b/Documentation/networking/devlink/devlink-health.rst
index e0b8cfed610a..07602f678282 100644
--- a/Documentation/networking/devlink/devlink-health.rst
+++ b/Documentation/networking/devlink/devlink-health.rst
@@ -50,7 +50,7 @@ Once an error is reported, devlink health will perform the following actions:
* Auto recovery attempt is being done. Depends on:
- Auto-recovery configuration
- - Grace period vs. time passed since last recover
+ - Grace period (and grace period delay) vs. time passed since last recover
Devlink formatted message
=========================
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 9fcb25a0f447..a47e7f413511 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -636,6 +636,8 @@ enum devlink_attr {
DEVLINK_ATTR_RATE_TC_BWS, /* nested */
+ DEVLINK_ATTR_HEALTH_REPORTER_GP_DELAY, /* u64 */
+
/* Add new attributes above here, update the spec in
* Documentation/netlink/specs/devlink.yaml and re-generate
* net/devlink/netlink_gen.c.
diff --git a/net/devlink/health.c b/net/devlink/health.c
index a0269975f592..1e9a2d0d0631 100644
--- a/net/devlink/health.c
+++ b/net/devlink/health.c
@@ -113,7 +113,9 @@ __devlink_health_reporter_create(struct devlink *devlink,
{
struct devlink_health_reporter *reporter;
- if (WARN_ON(ops->default_graceful_period && !ops->recover))
+ if (WARN_ON(ops->default_graceful_period_delay &&
+ !ops->default_graceful_period) ||
+ WARN_ON(ops->default_graceful_period && !ops->recover))
return ERR_PTR(-EINVAL);
reporter = kzalloc(sizeof(*reporter), GFP_KERNEL);
@@ -293,6 +295,11 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg,
devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD,
reporter->graceful_period))
goto reporter_nest_cancel;
+ if (reporter->ops->recover &&
+ devlink_nl_put_u64(msg,
+ DEVLINK_ATTR_HEALTH_REPORTER_GP_DELAY,
+ reporter->graceful_period_delay))
+ goto reporter_nest_cancel;
if (reporter->ops->recover &&
nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER,
reporter->auto_recover))
@@ -458,16 +465,32 @@ int devlink_nl_health_reporter_set_doit(struct sk_buff *skb,
if (!reporter->ops->recover &&
(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] ||
- info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]))
+ info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] ||
+ info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GP_DELAY]))
return -EOPNOTSUPP;
if (!reporter->ops->dump &&
info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP])
return -EOPNOTSUPP;
- if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD])
+ if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) {
reporter->graceful_period =
nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]);
+ if (!reporter->graceful_period)
+ reporter->graceful_period_delay = 0;
+ }
+
+ if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GP_DELAY]) {
+ u64 configured_delay =
+ nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GP_DELAY]);
+
+ if (!reporter->graceful_period && configured_delay) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Cannot set grace period delay without a grace period.");
+ return -EINVAL;
+ }
+
+ reporter->graceful_period_delay = configured_delay;
+ }
if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])
reporter->auto_recover =
diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c
index d97c326a9045..de3aabba37b5 100644
--- a/net/devlink/netlink_gen.c
+++ b/net/devlink/netlink_gen.c
@@ -389,7 +389,7 @@ static const struct nla_policy devlink_health_reporter_get_dump_nl_policy[DEVLIN
};
/* DEVLINK_CMD_HEALTH_REPORTER_SET - do */
-static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP + 1] = {
+static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_GP_DELAY + 1] = {
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
[DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
@@ -397,6 +397,7 @@ static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATT
[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64, },
[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8, },
[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8, },
+ [DEVLINK_ATTR_HEALTH_REPORTER_GP_DELAY] = { .type = NLA_U64, },
};
/* DEVLINK_CMD_HEALTH_REPORTER_RECOVER - do */
@@ -1032,7 +1033,7 @@ const struct genl_split_ops devlink_nl_ops[74] = {
.doit = devlink_nl_health_reporter_set_doit,
.post_doit = devlink_nl_post_doit,
.policy = devlink_health_reporter_set_nl_policy,
- .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP,
+ .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_GP_DELAY,
.flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
},
{
--
2.31.1
^ permalink raw reply related [flat|nested] 7+ messages in thread