Linux RDMA and InfiniBand development
 help / color / mirror / Atom feed
* [PATCH for-next 0/4] RDMA/bnxt_re: Device re-initialization after Firmware error
@ 2024-09-10  4:00 Selvin Xavier
  2024-09-10  4:00 ` [PATCH for-next 1/4] RDMA/bnxt_re: Change aux driver data to en_info to hold more information Selvin Xavier
                   ` (3 more replies)
  0 siblings, 4 replies; 6+ messages in thread
From: Selvin Xavier @ 2024-09-10  4:00 UTC (permalink / raw)
  To: leon, jgg
  Cc: linux-rdma, andrew.gospodarek, kalesh-anakkur.purayil,
	Selvin Xavier

Add support for complete re-initialization of the device when the
driver detects a firmware reset. This includes a code reorganization
that updates the device handles stored with the auxiliary bus and the
bnxt_en driver. The bnxt_en driver calls the suspend and resume hooks
upon error recovery. The bnxt_re driver destroys and recreates the RoCE
device instance upon receiving these calls.

Please review and apply these changes for 6.12.

Thanks,
Selvin Xavier

Chandramohan Akula (2):
  RDMA/bnxt_re: Change aux driver data to en_info to hold more
    information
  RDMA/bnxt_re: Use the aux device for L2 ULP callbacks

Selvin Xavier (2):
  RDMA/bnxt_re: Group all operations under add_device and remove_device
  RDMA/bnxt_re: Recover the device when FW error is detected

 drivers/infiniband/hw/bnxt_re/bnxt_re.h   |  21 ++++
 drivers/infiniband/hw/bnxt_re/main.c      | 199 +++++++++++++++++++++---------
 drivers/infiniband/hw/bnxt_re/qplib_res.h |   1 +
 3 files changed, 160 insertions(+), 61 deletions(-)

-- 
2.5.5


^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH for-next 1/4] RDMA/bnxt_re: Change aux driver data to en_info to hold more information
  2024-09-10  4:00 [PATCH for-next 0/4] RDMA/bnxt_re: Device re-initialization after Firmware error Selvin Xavier
@ 2024-09-10  4:00 ` Selvin Xavier
  2024-09-10 18:05   ` kernel test robot
  2024-09-10  4:01 ` [PATCH for-next 2/4] RDMA/bnxt_re: Use the aux device for L2 ULP callbacks Selvin Xavier
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 6+ messages in thread
From: Selvin Xavier @ 2024-09-10  4:00 UTC (permalink / raw)
  To: leon, jgg
  Cc: linux-rdma, andrew.gospodarek, kalesh-anakkur.purayil,
	Chandramohan Akula, Selvin Xavier

From: Chandramohan Akula <chandramohan.akula@broadcom.com>

rdev will be destroyed and recreated during FW error recovery.
To keep any state that must survive this, use an en_info
structure which is created and freed along with auxiliary
device initialization and de-initialization.

Signed-off-by: Chandramohan Akula <chandramohan.akula@broadcom.com>
Reviewed-by: Kashyap Desai <kashyap.desai@broadcom.com>
Reviewed-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
---
 drivers/infiniband/hw/bnxt_re/bnxt_re.h |  6 +++
 drivers/infiniband/hw/bnxt_re/main.c    | 73 ++++++++++++++++++++++++++++-----
 2 files changed, 68 insertions(+), 11 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
index 2be9a62..5df3ce1 100644
--- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h
+++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
@@ -107,6 +107,11 @@ struct bnxt_re_gsi_context {
 	struct	bnxt_re_sqp_entries *sqp_tbl;
 };
 
+struct bnxt_re_en_dev_info {
+	struct bnxt_en_dev *en_dev;
+	struct bnxt_re_dev *rdev;
+};
+
 #define BNXT_RE_AEQ_IDX			0
 #define BNXT_RE_NQ_IDX			1
 #define BNXT_RE_GEN_P5_MAX_VF		64
@@ -155,6 +160,7 @@ struct bnxt_re_dev {
 #define BNXT_RE_FLAG_ERR_DEVICE_DETACHED       17
 #define BNXT_RE_FLAG_ISSUE_ROCE_STATS          29
 	struct net_device		*netdev;
+	struct auxiliary_device         *adev;
 	struct notifier_block		nb;
 	unsigned int			version, major, minor;
 	struct bnxt_qplib_chip_ctx	*chip_ctx;
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 16a84ca..8679459 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -292,10 +292,13 @@ static void bnxt_re_vf_res_config(struct bnxt_re_dev *rdev)
 
 static void bnxt_re_shutdown(struct auxiliary_device *adev)
 {
-	struct bnxt_re_dev *rdev = auxiliary_get_drvdata(adev);
+	struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev);
+	struct bnxt_re_dev *rdev;
 
-	if (!rdev)
+	if (!en_info)
 		return;
+
+	rdev = en_info->rdev;
 	ib_unregister_device(&rdev->ibdev);
 	bnxt_re_dev_uninit(rdev);
 }
@@ -1794,14 +1797,33 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev)
 	return rc;
 }
 
+static void bnxt_re_update_en_info_rdev(struct bnxt_re_dev *rdev,
+					struct bnxt_re_en_dev_info *en_info,
+					struct auxiliary_device *adev)
+{
+	/* Before updating the rdev pointer in bnxt_re_en_dev_info structure,
+	 * take the rtnl lock to avoid accessing invalid rdev pointer from
+	 * L2 ULP callbacks. This is applicable in all the places where rdev
+	 * pointer is updated in bnxt_re_en_dev_info.
+	 */
+	rtnl_lock();
+	en_info->rdev = rdev;
+	rdev->adev = adev;
+	rtnl_unlock();
+}
+
 static int bnxt_re_add_device(struct auxiliary_device *adev)
 {
 	struct bnxt_aux_priv *aux_priv =
 		container_of(adev, struct bnxt_aux_priv, aux_dev);
+	struct bnxt_re_en_dev_info *en_info;
 	struct bnxt_en_dev *en_dev;
 	struct bnxt_re_dev *rdev;
 	int rc;
 
+	en_info = auxiliary_get_drvdata(adev);
+	en_dev = en_info->en_dev;
+
 	/* en_dev should never be NULL as long as adev and aux_dev are valid. */
 	en_dev = aux_priv->edev;
 
@@ -1811,6 +1833,8 @@ static int bnxt_re_add_device(struct auxiliary_device *adev)
 		goto exit;
 	}
 
+	bnxt_re_update_en_info_rdev(rdev, en_info, adev);
+
 	rc = bnxt_re_dev_init(rdev);
 	if (rc)
 		goto re_dev_dealloc;
@@ -1821,11 +1845,11 @@ static int bnxt_re_add_device(struct auxiliary_device *adev)
 			aux_priv->aux_dev.name);
 		goto re_dev_uninit;
 	}
-	auxiliary_set_drvdata(adev, rdev);
 
 	return 0;
 
 re_dev_uninit:
+	bnxt_re_update_en_info_rdev(NULL, en_info, adev);
 	bnxt_re_dev_uninit(rdev);
 re_dev_dealloc:
 	ib_dealloc_device(&rdev->ibdev);
@@ -1911,12 +1935,20 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier,
 
 static void bnxt_re_remove(struct auxiliary_device *adev)
 {
-	struct bnxt_re_dev *rdev = auxiliary_get_drvdata(adev);
+	struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev);
+	struct bnxt_en_dev *en_dev;
+	struct bnxt_re_dev *rdev;
 
-	if (!rdev)
+	mutex_lock(&bnxt_re_mutex);
+	if (!en_info) {
+		mutex_unlock(&bnxt_re_mutex);
 		return;
+	}
+	en_dev = en_info->en_dev;
+	rdev = en_info->rdev;
+	if (!rdev)
+		goto skip_remove;
 
-	mutex_lock(&bnxt_re_mutex);
 	if (rdev->nb.notifier_call) {
 		unregister_netdevice_notifier(&rdev->nb);
 		rdev->nb.notifier_call = NULL;
@@ -1931,16 +1963,31 @@ static void bnxt_re_remove(struct auxiliary_device *adev)
 	bnxt_re_dev_uninit(rdev);
 	ib_dealloc_device(&rdev->ibdev);
 skip_remove:
+	kfree(en_info);
 	mutex_unlock(&bnxt_re_mutex);
 }
 
 static int bnxt_re_probe(struct auxiliary_device *adev,
 			 const struct auxiliary_device_id *id)
 {
+	struct bnxt_aux_priv *aux_priv =
+		container_of(adev, struct bnxt_aux_priv, aux_dev);
+	struct bnxt_re_en_dev_info *en_info;
+	struct bnxt_en_dev *en_dev;
 	struct bnxt_re_dev *rdev;
 	int rc;
 
+	en_dev = aux_priv->edev;
+
 	mutex_lock(&bnxt_re_mutex);
+	en_info = kzalloc(sizeof(*en_info), GFP_KERNEL);
+	if (!en_info) {
+		mutex_unlock(&bnxt_re_mutex);
+		return -ENOMEM;
+	}
+	en_info->en_dev = en_dev;
+
+	auxiliary_set_drvdata(adev, en_info);
 
 	rc = bnxt_re_add_device(adev);
 	if (rc) {
@@ -1948,7 +1995,7 @@ static int bnxt_re_probe(struct auxiliary_device *adev,
 		return rc;
 	}
 
-	rdev = auxiliary_get_drvdata(adev);
+	rdev = en_info->rdev;
 
 	rdev->nb.notifier_call = bnxt_re_netdev_event;
 	rc = register_netdevice_notifier(&rdev->nb);
@@ -1972,11 +2019,13 @@ static int bnxt_re_probe(struct auxiliary_device *adev,
 
 static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state)
 {
-	struct bnxt_re_dev *rdev = auxiliary_get_drvdata(adev);
+	struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev);
+	struct bnxt_re_dev *rdev;
 
-	if (!rdev)
+	if (!en_info)
 		return 0;
 
+	rdev = en_info->rdev;
 	mutex_lock(&bnxt_re_mutex);
 	/* L2 driver may invoke this callback during device error/crash or device
 	 * reset. Current RoCE driver doesn't recover the device in case of
@@ -2009,11 +2058,13 @@ static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state)
 
 static int bnxt_re_resume(struct auxiliary_device *adev)
 {
-	struct bnxt_re_dev *rdev = auxiliary_get_drvdata(adev);
+	struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev);
+	struct bnxt_re_dev *rdev;
 
-	if (!rdev)
+	if (!en_info)
 		return 0;
 
+	rdev = en_info->rdev;
 	mutex_lock(&bnxt_re_mutex);
 	/* L2 driver may invoke this callback during device recovery, resume.
 	 * reset. Current RoCE driver doesn't recover the device in case of
-- 
2.5.5


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH for-next 2/4] RDMA/bnxt_re: Use the aux device for L2 ULP callbacks
  2024-09-10  4:00 [PATCH for-next 0/4] RDMA/bnxt_re: Device re-initialization after Firmware error Selvin Xavier
  2024-09-10  4:00 ` [PATCH for-next 1/4] RDMA/bnxt_re: Change aux driver data to en_info to hold more information Selvin Xavier
@ 2024-09-10  4:01 ` Selvin Xavier
  2024-09-10  4:01 ` [PATCH for-next 3/4] RDMA/bnxt_re: Group all operations under add_device and remove_device Selvin Xavier
  2024-09-10  4:01 ` [PATCH for-next 4/4] RDMA/bnxt_re: Recover the device when FW error is detected Selvin Xavier
  3 siblings, 0 replies; 6+ messages in thread
From: Selvin Xavier @ 2024-09-10  4:01 UTC (permalink / raw)
  To: leon, jgg
  Cc: linux-rdma, andrew.gospodarek, kalesh-anakkur.purayil,
	Chandramohan Akula, Selvin Xavier

From: Chandramohan Akula <chandramohan.akula@broadcom.com>

While registering with the L2 driver for ULP operations, use
the aux device pointer as the handle. The driver data of the
aux device is a bnxt_re_en_dev_info structure, which stores
the information required by the bnxt_re_suspend and
bnxt_re_resume functions.

Signed-off-by: Chandramohan Akula <chandramohan.akula@broadcom.com>
Reviewed-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
Reviewed-by: Kashyap Desai <kashyap.desai@broadcom.com>
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
---
 drivers/infiniband/hw/bnxt_re/main.c | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 8679459..a5e0c21 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -305,11 +305,18 @@ static void bnxt_re_shutdown(struct auxiliary_device *adev)
 
 static void bnxt_re_stop_irq(void *handle)
 {
-	struct bnxt_re_dev *rdev = (struct bnxt_re_dev *)handle;
-	struct bnxt_qplib_rcfw *rcfw = &rdev->rcfw;
+	struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(handle);
+	struct bnxt_qplib_rcfw *rcfw;
+	struct bnxt_re_dev *rdev;
 	struct bnxt_qplib_nq *nq;
 	int indx;
 
+	if (!en_info)
+		return;
+
+	rdev = en_info->rdev;
+	rcfw = &rdev->rcfw;
+
 	for (indx = BNXT_RE_NQ_IDX; indx < rdev->num_msix; indx++) {
 		nq = &rdev->nq[indx - 1];
 		bnxt_qplib_nq_stop_irq(nq, false);
@@ -320,12 +327,19 @@ static void bnxt_re_stop_irq(void *handle)
 
 static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent)
 {
-	struct bnxt_re_dev *rdev = (struct bnxt_re_dev *)handle;
-	struct bnxt_msix_entry *msix_ent = rdev->en_dev->msix_entries;
-	struct bnxt_qplib_rcfw *rcfw = &rdev->rcfw;
+	struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(handle);
+	struct bnxt_msix_entry *msix_ent;
+	struct bnxt_qplib_rcfw *rcfw;
+	struct bnxt_re_dev *rdev;
 	struct bnxt_qplib_nq *nq;
 	int indx, rc;
 
+	if (!en_info)
+		return;
+
+	rdev = en_info->rdev;
+	msix_ent = rdev->en_dev->msix_entries;
+	rcfw = &rdev->rcfw;
 	if (!ent) {
 		/* Not setting the f/w timeout bit in rcfw.
 		 * During the driver unload the first command
@@ -374,7 +388,7 @@ static int bnxt_re_register_netdev(struct bnxt_re_dev *rdev)
 
 	en_dev = rdev->en_dev;
 
-	rc = bnxt_register_dev(en_dev, &bnxt_re_ulp_ops, rdev);
+	rc = bnxt_register_dev(en_dev, &bnxt_re_ulp_ops, rdev->adev);
 	if (!rc)
 		rdev->qplib_res.pdev = rdev->en_dev->pdev;
 	return rc;
-- 
2.5.5


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH for-next 3/4] RDMA/bnxt_re: Group all operations under add_device and remove_device
  2024-09-10  4:00 [PATCH for-next 0/4] RDMA/bnxt_re: Device re-initialization after Firmware error Selvin Xavier
  2024-09-10  4:00 ` [PATCH for-next 1/4] RDMA/bnxt_re: Change aux driver data to en_info to hold more information Selvin Xavier
  2024-09-10  4:01 ` [PATCH for-next 2/4] RDMA/bnxt_re: Use the aux device for L2 ULP callbacks Selvin Xavier
@ 2024-09-10  4:01 ` Selvin Xavier
  2024-09-10  4:01 ` [PATCH for-next 4/4] RDMA/bnxt_re: Recover the device when FW error is detected Selvin Xavier
  3 siblings, 0 replies; 6+ messages in thread
From: Selvin Xavier @ 2024-09-10  4:01 UTC (permalink / raw)
  To: leon, jgg
  Cc: linux-rdma, andrew.gospodarek, kalesh-anakkur.purayil,
	Selvin Xavier, Chandramohan Akula

Adding and removing the device needs to be handled from multiple contexts
when firmware error recovery is supported. So group all the add and remove
operations into the add_device and remove_device functions.

Signed-off-by: Chandramohan Akula <chandramohan.akula@broadcom.com>
Reviewed-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
---
 drivers/infiniband/hw/bnxt_re/main.c | 64 +++++++++++++++++++-----------------
 1 file changed, 34 insertions(+), 30 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index a5e0c21..1793a0c 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -88,6 +88,7 @@ static int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev);
 
 static int bnxt_re_hwrm_qcfg(struct bnxt_re_dev *rdev, u32 *db_len,
 			     u32 *offset);
+static void bnxt_re_setup_cc(struct bnxt_re_dev *rdev, bool enable);
 static void bnxt_re_set_db_offset(struct bnxt_re_dev *rdev)
 {
 	struct bnxt_qplib_chip_ctx *cctx;
@@ -1860,6 +1861,16 @@ static int bnxt_re_add_device(struct auxiliary_device *adev)
 		goto re_dev_uninit;
 	}
 
+	rdev->nb.notifier_call = bnxt_re_netdev_event;
+	rc = register_netdevice_notifier(&rdev->nb);
+	if (rc) {
+		rdev->nb.notifier_call = NULL;
+		pr_err("%s: Cannot register to netdevice_notifier",
+		       ROCE_DRV_MODULE_NAME);
+		return rc;
+	}
+	bnxt_re_setup_cc(rdev, true);
+
 	return 0;
 
 re_dev_uninit:
@@ -1947,21 +1958,13 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier,
 
 #define BNXT_ADEV_NAME "bnxt_en"
 
-static void bnxt_re_remove(struct auxiliary_device *adev)
+static void bnxt_re_remove_device(struct bnxt_re_dev *rdev,
+				  struct auxiliary_device *aux_dev)
 {
-	struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev);
+	struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(aux_dev);
 	struct bnxt_en_dev *en_dev;
-	struct bnxt_re_dev *rdev;
 
-	mutex_lock(&bnxt_re_mutex);
-	if (!en_info) {
-		mutex_unlock(&bnxt_re_mutex);
-		return;
-	}
 	en_dev = en_info->en_dev;
-	rdev = en_info->rdev;
-	if (!rdev)
-		goto skip_remove;
 
 	if (rdev->nb.notifier_call) {
 		unregister_netdevice_notifier(&rdev->nb);
@@ -1970,13 +1973,30 @@ static void bnxt_re_remove(struct auxiliary_device *adev)
 		/* If notifier is null, we should have already done a
 		 * clean up before coming here.
 		 */
-		goto skip_remove;
+		return;
 	}
 	bnxt_re_setup_cc(rdev, false);
 	ib_unregister_device(&rdev->ibdev);
 	bnxt_re_dev_uninit(rdev);
 	ib_dealloc_device(&rdev->ibdev);
-skip_remove:
+}
+
+static void bnxt_re_remove(struct auxiliary_device *adev)
+{
+	struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev);
+	struct bnxt_en_dev *en_dev;
+	struct bnxt_re_dev *rdev;
+
+	mutex_lock(&bnxt_re_mutex);
+	if (!en_info) {
+		mutex_unlock(&bnxt_re_mutex);
+		return;
+	}
+	en_dev = en_info->en_dev;
+	rdev = en_info->rdev;
+
+	if (rdev)
+		bnxt_re_remove_device(rdev, adev);
 	kfree(en_info);
 	mutex_unlock(&bnxt_re_mutex);
 }
@@ -1988,7 +2008,6 @@ static int bnxt_re_probe(struct auxiliary_device *adev,
 		container_of(adev, struct bnxt_aux_priv, aux_dev);
 	struct bnxt_re_en_dev_info *en_info;
 	struct bnxt_en_dev *en_dev;
-	struct bnxt_re_dev *rdev;
 	int rc;
 
 	en_dev = aux_priv->edev;
@@ -2004,23 +2023,8 @@ static int bnxt_re_probe(struct auxiliary_device *adev,
 	auxiliary_set_drvdata(adev, en_info);
 
 	rc = bnxt_re_add_device(adev);
-	if (rc) {
-		mutex_unlock(&bnxt_re_mutex);
-		return rc;
-	}
-
-	rdev = en_info->rdev;
-
-	rdev->nb.notifier_call = bnxt_re_netdev_event;
-	rc = register_netdevice_notifier(&rdev->nb);
-	if (rc) {
-		rdev->nb.notifier_call = NULL;
-		pr_err("%s: Cannot register to netdevice_notifier",
-		       ROCE_DRV_MODULE_NAME);
+	if (rc)
 		goto err;
-	}
-
-	bnxt_re_setup_cc(rdev, true);
 	mutex_unlock(&bnxt_re_mutex);
 	return 0;
 
-- 
2.5.5


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH for-next 4/4] RDMA/bnxt_re: Recover the device when FW error is detected
  2024-09-10  4:00 [PATCH for-next 0/4] RDMA/bnxt_re: Device re-initialization after Firmware error Selvin Xavier
                   ` (2 preceding siblings ...)
  2024-09-10  4:01 ` [PATCH for-next 3/4] RDMA/bnxt_re: Group all operations under add_device and remove_device Selvin Xavier
@ 2024-09-10  4:01 ` Selvin Xavier
  3 siblings, 0 replies; 6+ messages in thread
From: Selvin Xavier @ 2024-09-10  4:01 UTC (permalink / raw)
  To: leon, jgg
  Cc: linux-rdma, andrew.gospodarek, kalesh-anakkur.purayil,
	Selvin Xavier, Chandramohan Akula

If the FW crashes, the L2 driver gets notified and it notifies
the RoCE driver. Currently the driver doesn't re-initialize the
device. Add support for re-initializing the RoCE device.

The RoCE device is removed and re-attached in ulp_stop and
ulp_start respectively. The recovery logic expects the RoCE
driver to stay registered with the L2 driver while it is being
removed. So the driver avoids unregistering with the L2 driver
in the recovery path.

Signed-off-by: Chandramohan Akula <chandramohan.akula@broadcom.com>
Signed-off-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
---
 drivers/infiniband/hw/bnxt_re/bnxt_re.h   | 15 +++++++
 drivers/infiniband/hw/bnxt_re/main.c      | 70 +++++++++++++++++--------------
 drivers/infiniband/hw/bnxt_re/qplib_res.h |  1 +
 3 files changed, 55 insertions(+), 31 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
index 5df3ce1..e94518b 100644
--- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h
+++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
@@ -91,6 +91,15 @@ struct bnxt_re_ring_attr {
 	u8		mode;
 };
 
+/*
+ * Data structure and defines to handle
+ * recovery
+ */
+#define BNXT_RE_PRE_RECOVERY_REMOVE 0x1
+#define BNXT_RE_COMPLETE_REMOVE 0x2
+#define BNXT_RE_POST_RECOVERY_INIT 0x4
+#define BNXT_RE_COMPLETE_INIT 0x8
+
 struct bnxt_re_sqp_entries {
 	struct bnxt_qplib_sge sge;
 	u64 wrid;
@@ -224,4 +233,10 @@ static inline struct device *rdev_to_dev(struct bnxt_re_dev *rdev)
 }
 
 extern const struct uapi_definition bnxt_re_uapi_defs[];
+
+static inline void bnxt_re_set_pacing_dev_state(struct bnxt_re_dev *rdev)
+{
+	rdev->qplib_res.pacing_data->dev_err_state =
+		test_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags);
+}
 #endif
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 1793a0c..1359b52 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -83,7 +83,7 @@ static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev);
 static int bnxt_re_netdev_event(struct notifier_block *notifier,
 				unsigned long event, void *ptr);
 static struct bnxt_re_dev *bnxt_re_from_netdev(struct net_device *netdev);
-static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev);
+static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev, u8 op_type);
 static int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev);
 
 static int bnxt_re_hwrm_qcfg(struct bnxt_re_dev *rdev, u32 *db_len,
@@ -169,6 +169,7 @@ static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev)
 
 	en_dev = rdev->en_dev;
 
+	rdev->qplib_res.pdev = en_dev->pdev;
 	chip_ctx = kzalloc(sizeof(*chip_ctx), GFP_KERNEL);
 	if (!chip_ctx)
 		return -ENOMEM;
@@ -301,7 +302,7 @@ static void bnxt_re_shutdown(struct auxiliary_device *adev)
 
 	rdev = en_info->rdev;
 	ib_unregister_device(&rdev->ibdev);
-	bnxt_re_dev_uninit(rdev);
+	bnxt_re_dev_uninit(rdev, BNXT_RE_COMPLETE_REMOVE);
 }
 
 static void bnxt_re_stop_irq(void *handle)
@@ -385,14 +386,9 @@ static struct bnxt_ulp_ops bnxt_re_ulp_ops = {
 static int bnxt_re_register_netdev(struct bnxt_re_dev *rdev)
 {
 	struct bnxt_en_dev *en_dev;
-	int rc;
 
 	en_dev = rdev->en_dev;
-
-	rc = bnxt_register_dev(en_dev, &bnxt_re_ulp_ops, rdev->adev);
-	if (!rc)
-		rdev->qplib_res.pdev = rdev->en_dev->pdev;
-	return rc;
+	return bnxt_register_dev(en_dev, &bnxt_re_ulp_ops, rdev->adev);
 }
 
 static void bnxt_re_init_hwrm_hdr(struct input *hdr, u16 opcd)
@@ -1593,7 +1589,7 @@ static int bnxt_re_ib_init(struct bnxt_re_dev *rdev)
 	return rc;
 }
 
-static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev)
+static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev, u8 op_type)
 {
 	u8 type;
 	int rc;
@@ -1626,8 +1622,10 @@ static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev)
 		bnxt_re_deinitialize_dbr_pacing(rdev);
 
 	bnxt_re_destroy_chip_ctx(rdev);
-	if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags))
-		bnxt_unregister_dev(rdev->en_dev);
+	if (op_type == BNXT_RE_COMPLETE_REMOVE) {
+		if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags))
+			bnxt_unregister_dev(rdev->en_dev);
+	}
 }
 
 /* worker thread for polling periodic events. Now used for QoS programming*/
@@ -1640,7 +1638,7 @@ static void bnxt_re_worker(struct work_struct *work)
 	schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000));
 }
 
-static int bnxt_re_dev_init(struct bnxt_re_dev *rdev)
+static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type)
 {
 	struct bnxt_re_ring_attr rattr = {};
 	struct bnxt_qplib_creq_ctx *creq;
@@ -1649,12 +1647,14 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev)
 	u8 type;
 	int rc;
 
-	/* Registered a new RoCE device instance to netdev */
-	rc = bnxt_re_register_netdev(rdev);
-	if (rc) {
-		ibdev_err(&rdev->ibdev,
-			  "Failed to register with netedev: %#x\n", rc);
-		return -EINVAL;
+	if (op_type == BNXT_RE_COMPLETE_INIT) {
+		/* Registered a new RoCE device instance to netdev */
+		rc = bnxt_re_register_netdev(rdev);
+		if (rc) {
+			ibdev_err(&rdev->ibdev,
+				  "Failed to register with netedev: %#x\n", rc);
+			return -EINVAL;
+		}
 	}
 	set_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags);
 
@@ -1807,7 +1807,7 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev)
 free_rcfw:
 	bnxt_qplib_free_rcfw_channel(&rdev->rcfw);
 fail:
-	bnxt_re_dev_uninit(rdev);
+	bnxt_re_dev_uninit(rdev, BNXT_RE_COMPLETE_REMOVE);
 
 	return rc;
 }
@@ -1827,7 +1827,7 @@ static void bnxt_re_update_en_info_rdev(struct bnxt_re_dev *rdev,
 	rtnl_unlock();
 }
 
-static int bnxt_re_add_device(struct auxiliary_device *adev)
+static int bnxt_re_add_device(struct auxiliary_device *adev, u8 op_type)
 {
 	struct bnxt_aux_priv *aux_priv =
 		container_of(adev, struct bnxt_aux_priv, aux_dev);
@@ -1839,8 +1839,6 @@ static int bnxt_re_add_device(struct auxiliary_device *adev)
 	en_info = auxiliary_get_drvdata(adev);
 	en_dev = en_info->en_dev;
 
-	/* en_dev should never be NULL as long as adev and aux_dev are valid. */
-	en_dev = aux_priv->edev;
 
 	rdev = bnxt_re_dev_add(aux_priv, en_dev);
 	if (!rdev || !rdev_to_dev(rdev)) {
@@ -1850,7 +1848,7 @@ static int bnxt_re_add_device(struct auxiliary_device *adev)
 
 	bnxt_re_update_en_info_rdev(rdev, en_info, adev);
 
-	rc = bnxt_re_dev_init(rdev);
+	rc = bnxt_re_dev_init(rdev, op_type);
 	if (rc)
 		goto re_dev_dealloc;
 
@@ -1875,7 +1873,7 @@ static int bnxt_re_add_device(struct auxiliary_device *adev)
 
 re_dev_uninit:
 	bnxt_re_update_en_info_rdev(NULL, en_info, adev);
-	bnxt_re_dev_uninit(rdev);
+	bnxt_re_dev_uninit(rdev, BNXT_RE_COMPLETE_REMOVE);
 re_dev_dealloc:
 	ib_dealloc_device(&rdev->ibdev);
 exit:
@@ -1958,7 +1956,7 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier,
 
 #define BNXT_ADEV_NAME "bnxt_en"
 
-static void bnxt_re_remove_device(struct bnxt_re_dev *rdev,
+static void bnxt_re_remove_device(struct bnxt_re_dev *rdev, u8 op_type,
 				  struct auxiliary_device *aux_dev)
 {
 	struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(aux_dev);
@@ -1977,7 +1975,7 @@ static void bnxt_re_remove_device(struct bnxt_re_dev *rdev,
 	}
 	bnxt_re_setup_cc(rdev, false);
 	ib_unregister_device(&rdev->ibdev);
-	bnxt_re_dev_uninit(rdev);
+	bnxt_re_dev_uninit(rdev, op_type);
 	ib_dealloc_device(&rdev->ibdev);
 }
 
@@ -1996,7 +1994,7 @@ static void bnxt_re_remove(struct auxiliary_device *adev)
 	rdev = en_info->rdev;
 
 	if (rdev)
-		bnxt_re_remove_device(rdev, adev);
+		bnxt_re_remove_device(rdev, BNXT_RE_COMPLETE_REMOVE, adev);
 	kfree(en_info);
 	mutex_unlock(&bnxt_re_mutex);
 }
@@ -2022,7 +2020,7 @@ static int bnxt_re_probe(struct auxiliary_device *adev,
 
 	auxiliary_set_drvdata(adev, en_info);
 
-	rc = bnxt_re_add_device(adev);
+	rc = bnxt_re_add_device(adev, BNXT_RE_COMPLETE_INIT);
 	if (rc)
 		goto err;
 	mutex_unlock(&bnxt_re_mutex);
@@ -2038,12 +2036,14 @@ static int bnxt_re_probe(struct auxiliary_device *adev,
 static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state)
 {
 	struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev);
+	struct bnxt_en_dev *en_dev;
 	struct bnxt_re_dev *rdev;
 
 	if (!en_info)
 		return 0;
 
 	rdev = en_info->rdev;
+	en_dev = en_info->en_dev;
 	mutex_lock(&bnxt_re_mutex);
 	/* L2 driver may invoke this callback during device error/crash or device
 	 * reset. Current RoCE driver doesn't recover the device in case of
@@ -2062,13 +2062,20 @@ static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state)
 		set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags);
 
 	bnxt_re_dev_stop(rdev);
-	bnxt_re_stop_irq(rdev);
+	bnxt_re_stop_irq(adev);
 	/* Move the device states to detached and  avoid sending any more
 	 * commands to HW
 	 */
 	set_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags);
 	set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags);
 	wake_up_all(&rdev->rcfw.cmdq.waitq);
+
+	if (rdev->pacing.dbr_pacing)
+		bnxt_re_set_pacing_dev_state(rdev);
+
+	ibdev_info(&rdev->ibdev, "%s: L2 driver notified to stop en_state 0x%lx",
+		   __func__, en_dev->en_state);
+	bnxt_re_remove_device(rdev, BNXT_RE_PRE_RECOVERY_REMOVE, adev);
 	mutex_unlock(&bnxt_re_mutex);
 
 	return 0;
@@ -2082,7 +2089,6 @@ static int bnxt_re_resume(struct auxiliary_device *adev)
 	if (!en_info)
 		return 0;
 
-	rdev = en_info->rdev;
 	mutex_lock(&bnxt_re_mutex);
 	/* L2 driver may invoke this callback during device recovery, resume.
 	 * reset. Current RoCE driver doesn't recover the device in case of
@@ -2091,7 +2097,9 @@ static int bnxt_re_resume(struct auxiliary_device *adev)
 	 * L2 driver want to modify the MSIx table.
 	 */
 
-	ibdev_info(&rdev->ibdev, "Handle device resume call");
+	bnxt_re_add_device(adev, BNXT_RE_POST_RECOVERY_INIT);
+	rdev = en_info->rdev;
+	ibdev_info(&rdev->ibdev, "Device resume completed");
 	mutex_unlock(&bnxt_re_mutex);
 
 	return 0;
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h
index 049805a..c2f7103 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_res.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h
@@ -82,6 +82,7 @@ struct bnxt_qplib_db_pacing_data {
 	u32 fifo_room_mask;
 	u32 fifo_room_shift;
 	u32 grc_reg_offset;
+	u32 dev_err_state;
 };
 
 #define BNXT_QPLIB_DBR_PF_DB_OFFSET     0x10000
-- 
2.5.5


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH for-next 1/4] RDMA/bnxt_re: Change aux driver data to en_info to hold more information
  2024-09-10  4:00 ` [PATCH for-next 1/4] RDMA/bnxt_re: Change aux driver data to en_info to hold more information Selvin Xavier
@ 2024-09-10 18:05   ` kernel test robot
  0 siblings, 0 replies; 6+ messages in thread
From: kernel test robot @ 2024-09-10 18:05 UTC (permalink / raw)
  To: Selvin Xavier, leon, jgg
  Cc: oe-kbuild-all, linux-rdma, andrew.gospodarek,
	kalesh-anakkur.purayil, Chandramohan Akula, Selvin Xavier

Hi Selvin,

kernel test robot noticed the following build warnings:

[auto build test WARNING on rdma/for-next]
[also build test WARNING on next-20240910]
[cannot apply to linus/master v6.11-rc7]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Selvin-Xavier/RDMA-bnxt_re-Change-aux-driver-data-to-en_info-to-hold-more-information/20240910-122414
base:   https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git for-next
patch link:    https://lore.kernel.org/r/1725940862-4821-2-git-send-email-selvin.xavier%40broadcom.com
patch subject: [PATCH for-next 1/4] RDMA/bnxt_re: Change aux driver data to en_info to hold more information
config: s390-allyesconfig (https://download.01.org/0day-ci/archive/20240911/202409110129.V1tgNuph-lkp@intel.com/config)
compiler: s390-linux-gcc (GCC) 14.1.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240911/202409110129.V1tgNuph-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202409110129.V1tgNuph-lkp@intel.com/

All warnings (new ones prefixed by >>):

   drivers/infiniband/hw/bnxt_re/main.c: In function 'bnxt_re_remove':
>> drivers/infiniband/hw/bnxt_re/main.c:1939:29: warning: variable 'en_dev' set but not used [-Wunused-but-set-variable]
    1939 |         struct bnxt_en_dev *en_dev;
         |                             ^~~~~~


vim +/en_dev +1939 drivers/infiniband/hw/bnxt_re/main.c

  1935	
  1936	static void bnxt_re_remove(struct auxiliary_device *adev)
  1937	{
  1938		struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev);
> 1939		struct bnxt_en_dev *en_dev;
  1940		struct bnxt_re_dev *rdev;
  1941	
  1942		mutex_lock(&bnxt_re_mutex);
  1943		if (!en_info) {
  1944			mutex_unlock(&bnxt_re_mutex);
  1945			return;
  1946		}
  1947		en_dev = en_info->en_dev;
  1948		rdev = en_info->rdev;
  1949		if (!rdev)
  1950			goto skip_remove;
  1951	
  1952		if (rdev->nb.notifier_call) {
  1953			unregister_netdevice_notifier(&rdev->nb);
  1954			rdev->nb.notifier_call = NULL;
  1955		} else {
  1956			/* If notifier is null, we should have already done a
  1957			 * clean up before coming here.
  1958			 */
  1959			goto skip_remove;
  1960		}
  1961		bnxt_re_setup_cc(rdev, false);
  1962		ib_unregister_device(&rdev->ibdev);
  1963		bnxt_re_dev_uninit(rdev);
  1964		ib_dealloc_device(&rdev->ibdev);
  1965	skip_remove:
  1966		kfree(en_info);
  1967		mutex_unlock(&bnxt_re_mutex);
  1968	}
  1969	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2024-09-10 18:06 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-09-10  4:00 [PATCH for-next 0/4] RDMA/bnxt_re: Device re-initialization after Firmware error Selvin Xavier
2024-09-10  4:00 ` [PATCH for-next 1/4] RDMA/bnxt_re: Change aux driver data to en_info to hold more information Selvin Xavier
2024-09-10 18:05   ` kernel test robot
2024-09-10  4:01 ` [PATCH for-next 2/4] RDMA/bnxt_re: Use the aux device for L2 ULP callbacks Selvin Xavier
2024-09-10  4:01 ` [PATCH for-next 3/4] RDMA/bnxt_re: Group all operations under add_device and remove_device Selvin Xavier
2024-09-10  4:01 ` [PATCH for-next 4/4] RDMA/bnxt_re: Recover the device when FW error is detected Selvin Xavier

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox