* [PATCH for-rc v2] RDMA/bnxt_re: Fix error recovery sequence
@ 2024-12-20 7:59 Kalesh AP
2024-12-23 15:00 ` Leon Romanovsky
0 siblings, 1 reply; 6+ messages in thread
From: Kalesh AP @ 2024-12-20 7:59 UTC (permalink / raw)
To: leon, jgg
Cc: linux-rdma, andrew.gospodarek, selvin.xavier, Kalesh AP,
Kashyap Desai
Fixed to return ENXIO from __send_message_basic_sanity()
to indicate that device is in error state. In the case of
ERR_DEVICE_DETACHED state, the driver should not post the
commands to the firmware as it will time out eventually.
Removed bnxt_re_modify_qp() call from bnxt_re_dev_stop()
as it is a no-op.
Fixes: cc5b9b48d447 ("RDMA/bnxt_re: Recover the device when FW error is detected")
Signed-off-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
Signed-off-by: Kashyap Desai <kashyap.desai@broadcom.com>
Reviewed-by: Selvin Xavier <selvin.xavier@broadcom.com>
---
V2: No changes since v1 and is just a resend.
V1: https://patchwork.kernel.org/project/linux-rdma/patch/20241204075416.478431-5-kalesh-anakkur.purayil@broadcom.com/
---
drivers/infiniband/hw/bnxt_re/main.c | 8 +-------
drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 7 ++++---
drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 3 +++
3 files changed, 8 insertions(+), 10 deletions(-)
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index b7af0d5ff3b6..c143f273b759 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -1715,11 +1715,8 @@ static bool bnxt_re_is_qp1_or_shadow_qp(struct bnxt_re_dev *rdev,
static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev)
{
- int mask = IB_QP_STATE;
- struct ib_qp_attr qp_attr;
struct bnxt_re_qp *qp;
- qp_attr.qp_state = IB_QPS_ERR;
mutex_lock(&rdev->qp_lock);
list_for_each_entry(qp, &rdev->qp_list, list) {
/* Modify the state of all QPs except QP1/Shadow QP */
@@ -1727,12 +1724,9 @@ static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev)
if (qp->qplib_qp.state !=
CMDQ_MODIFY_QP_NEW_STATE_RESET &&
qp->qplib_qp.state !=
- CMDQ_MODIFY_QP_NEW_STATE_ERR) {
+ CMDQ_MODIFY_QP_NEW_STATE_ERR)
bnxt_re_dispatch_event(&rdev->ibdev, &qp->ib_qp,
1, IB_EVENT_QP_FATAL);
- bnxt_re_modify_qp(&qp->ib_qp, &qp_attr, mask,
- NULL);
- }
}
}
mutex_unlock(&rdev->qp_lock);
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
index 5e90ea232de8..c8e65169f58a 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
@@ -423,8 +423,9 @@ static int __send_message_basic_sanity(struct bnxt_qplib_rcfw *rcfw,
cmdq = &rcfw->cmdq;
/* Prevent posting if f/w is not in a state to process */
- if (test_bit(ERR_DEVICE_DETACHED, &rcfw->cmdq.flags))
- return bnxt_qplib_map_rc(opcode);
+ if (RCFW_NO_FW_ACCESS(rcfw))
+ return -ENXIO;
+
if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags))
return -ETIMEDOUT;
@@ -493,7 +494,7 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw,
rc = __send_message_basic_sanity(rcfw, msg, opcode);
if (rc)
- return rc;
+ return rc == -ENXIO ? bnxt_qplib_map_rc(opcode) : rc;
rc = __send_message(rcfw, msg, opcode);
if (rc)
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
index 88814cb3aa74..4f7d800e35c3 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
@@ -129,6 +129,9 @@ static inline u32 bnxt_qplib_set_cmd_slots(struct cmdq_base *req)
#define RCFW_MAX_COOKIE_VALUE (BNXT_QPLIB_CMDQE_MAX_CNT - 1)
#define RCFW_CMD_IS_BLOCKING 0x8000
+#define RCFW_NO_FW_ACCESS(rcfw) \
+ (test_bit(ERR_DEVICE_DETACHED, &(rcfw)->cmdq.flags) || \
+ pci_channel_offline((rcfw)->pdev))
#define HWRM_VERSION_DEV_ATTR_MAX_DPI 0x1000A0000000DULL
/* HWRM version 1.10.3.18 */
--
2.43.5
^ permalink raw reply related [flat|nested] 6+ messages in thread* Re: [PATCH for-rc v2] RDMA/bnxt_re: Fix error recovery sequence 2024-12-20 7:59 [PATCH for-rc v2] RDMA/bnxt_re: Fix error recovery sequence Kalesh AP @ 2024-12-23 15:00 ` Leon Romanovsky 2024-12-23 15:42 ` Kalesh Anakkur Purayil 0 siblings, 1 reply; 6+ messages in thread From: Leon Romanovsky @ 2024-12-23 15:00 UTC (permalink / raw) To: Kalesh AP Cc: jgg, linux-rdma, andrew.gospodarek, selvin.xavier, Kashyap Desai On Fri, Dec 20, 2024 at 01:29:20PM +0530, Kalesh AP wrote: > Fixed to return ENXIO from __send_message_basic_sanity() > to indicate that device is in error state. In the case of > ERR_DEVICE_DETACHED state, the driver should not post the > commands to the firmware as it will time out eventually. > > Removed bnxt_re_modify_qp() call from bnxt_re_dev_stop() > as it is a no-op. > > Fixes: cc5b9b48d447 ("RDMA/bnxt_re: Recover the device when FW error is detected") > Signed-off-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com> > Signed-off-by: Kashyap Desai <kashyap.desai@broadcom.com> > Reviewed-by: Selvin Xavier <selvin.xavier@broadcom.com> > --- > V2: No changes since v1 and is just a resend. > V1: https://patchwork.kernel.org/project/linux-rdma/patch/20241204075416.478431-5-kalesh-anakkur.purayil@broadcom.com/ > --- > drivers/infiniband/hw/bnxt_re/main.c | 8 +------- > drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 7 ++++--- > drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 3 +++ > 3 files changed, 8 insertions(+), 10 deletions(-) > > diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c > index b7af0d5ff3b6..c143f273b759 100644 > --- a/drivers/infiniband/hw/bnxt_re/main.c > +++ b/drivers/infiniband/hw/bnxt_re/main.c > @@ -1715,11 +1715,8 @@ static bool bnxt_re_is_qp1_or_shadow_qp(struct bnxt_re_dev *rdev, > > static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev) > { > - int mask = IB_QP_STATE; > - struct ib_qp_attr qp_attr; > struct bnxt_re_qp *qp; > > - qp_attr.qp_state = IB_QPS_ERR; > mutex_lock(&rdev->qp_lock); > list_for_each_entry(qp, &rdev->qp_list, list) { > /* Modify the state of all QPs except QP1/Shadow QP */ > @@ -1727,12 +1724,9 @@ static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev) > if (qp->qplib_qp.state != > CMDQ_MODIFY_QP_NEW_STATE_RESET && > qp->qplib_qp.state != > - CMDQ_MODIFY_QP_NEW_STATE_ERR) { > + CMDQ_MODIFY_QP_NEW_STATE_ERR) > bnxt_re_dispatch_event(&rdev->ibdev, &qp->ib_qp, > 1, IB_EVENT_QP_FATAL); > - bnxt_re_modify_qp(&qp->ib_qp, &qp_attr, mask, > - NULL); > - } > } > } > mutex_unlock(&rdev->qp_lock); > diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c > index 5e90ea232de8..c8e65169f58a 100644 > --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c > +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c > @@ -423,8 +423,9 @@ static int __send_message_basic_sanity(struct bnxt_qplib_rcfw *rcfw, > cmdq = &rcfw->cmdq; > > /* Prevent posting if f/w is not in a state to process */ > - if (test_bit(ERR_DEVICE_DETACHED, &rcfw->cmdq.flags)) > - return bnxt_qplib_map_rc(opcode); > + if (RCFW_NO_FW_ACCESS(rcfw)) > + return -ENXIO; > + > if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) > return -ETIMEDOUT; > > @@ -493,7 +494,7 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, > > rc = __send_message_basic_sanity(rcfw, msg, opcode); > if (rc) > - return rc; > + return rc == -ENXIO ? bnxt_qplib_map_rc(opcode) : rc; > > rc = __send_message(rcfw, msg, opcode); > if (rc) > diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h > index 88814cb3aa74..4f7d800e35c3 100644 > --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h > +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h > @@ -129,6 +129,9 @@ static inline u32 bnxt_qplib_set_cmd_slots(struct cmdq_base *req) > > #define RCFW_MAX_COOKIE_VALUE (BNXT_QPLIB_CMDQE_MAX_CNT - 1) > #define RCFW_CMD_IS_BLOCKING 0x8000 > +#define RCFW_NO_FW_ACCESS(rcfw) \ > + (test_bit(ERR_DEVICE_DETACHED, &(rcfw)->cmdq.flags) || \ > + pci_channel_offline((rcfw)->pdev)) You promised me that this patch handles races, so how is this pci_channel_offline() check protected? Thansk > > #define HWRM_VERSION_DEV_ATTR_MAX_DPI 0x1000A0000000DULL > /* HWRM version 1.10.3.18 */ > -- > 2.43.5 > ^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH for-rc v2] RDMA/bnxt_re: Fix error recovery sequence 2024-12-23 15:00 ` Leon Romanovsky @ 2024-12-23 15:42 ` Kalesh Anakkur Purayil 2024-12-23 16:42 ` Leon Romanovsky 0 siblings, 1 reply; 6+ messages in thread From: Kalesh Anakkur Purayil @ 2024-12-23 15:42 UTC (permalink / raw) To: Leon Romanovsky Cc: jgg, linux-rdma, andrew.gospodarek, selvin.xavier, Kashyap Desai [-- Attachment #1.1: Type: text/plain, Size: 5040 bytes --] Regards, Kalesh AP On Mon, 23 Dec 2024 at 8:30 PM, Leon Romanovsky <leon@kernel.org> wrote: > On Fri, Dec 20, 2024 at 01:29:20PM +0530, Kalesh AP wrote: > > Fixed to return ENXIO from __send_message_basic_sanity() > > to indicate that device is in error state. In the case of > > ERR_DEVICE_DETACHED state, the driver should not post the > > commands to the firmware as it will time out eventually. > > > > Removed bnxt_re_modify_qp() call from bnxt_re_dev_stop() > > as it is a no-op. > > > > Fixes: cc5b9b48d447 ("RDMA/bnxt_re: Recover the device when FW error is > detected") > > Signed-off-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com> > > Signed-off-by: Kashyap Desai <kashyap.desai@broadcom.com> > > Reviewed-by: Selvin Xavier <selvin.xavier@broadcom.com> > > --- > > V2: No changes since v1 and is just a resend. > > V1: > https://patchwork.kernel.org/project/linux-rdma/patch/20241204075416.478431-5-kalesh-anakkur.purayil@broadcom.com/ > > --- > > drivers/infiniband/hw/bnxt_re/main.c | 8 +------- > > drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 7 ++++--- > > drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 3 +++ > > 3 files changed, 8 insertions(+), 10 deletions(-) > > > > diff --git a/drivers/infiniband/hw/bnxt_re/main.c > b/drivers/infiniband/hw/bnxt_re/main.c > > index b7af0d5ff3b6..c143f273b759 100644 > > --- a/drivers/infiniband/hw/bnxt_re/main.c > > +++ b/drivers/infiniband/hw/bnxt_re/main.c > > @@ -1715,11 +1715,8 @@ static bool bnxt_re_is_qp1_or_shadow_qp(struct > bnxt_re_dev *rdev, > > > > static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev) > > { > > - int mask = IB_QP_STATE; > > - struct ib_qp_attr qp_attr; > > struct bnxt_re_qp *qp; > > > > - qp_attr.qp_state = IB_QPS_ERR; > > mutex_lock(&rdev->qp_lock); > > list_for_each_entry(qp, &rdev->qp_list, list) { > > /* Modify the state of all QPs except QP1/Shadow QP */ > > @@ -1727,12 +1724,9 @@ static void bnxt_re_dev_stop(struct bnxt_re_dev > *rdev) > > if (qp->qplib_qp.state != > > CMDQ_MODIFY_QP_NEW_STATE_RESET && > > qp->qplib_qp.state != > > - CMDQ_MODIFY_QP_NEW_STATE_ERR) { > > + CMDQ_MODIFY_QP_NEW_STATE_ERR) > > bnxt_re_dispatch_event(&rdev->ibdev, > &qp->ib_qp, > > 1, > IB_EVENT_QP_FATAL); > > - bnxt_re_modify_qp(&qp->ib_qp, &qp_attr, > mask, > > - NULL); > > - } > > } > > } > > mutex_unlock(&rdev->qp_lock); > > diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c > b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c > > index 5e90ea232de8..c8e65169f58a 100644 > > --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c > > +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c > > @@ -423,8 +423,9 @@ static int __send_message_basic_sanity(struct > bnxt_qplib_rcfw *rcfw, > > cmdq = &rcfw->cmdq; > > > > /* Prevent posting if f/w is not in a state to process */ > > - if (test_bit(ERR_DEVICE_DETACHED, &rcfw->cmdq.flags)) > > - return bnxt_qplib_map_rc(opcode); > > + if (RCFW_NO_FW_ACCESS(rcfw)) > > + return -ENXIO; > > + > > if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) > > return -ETIMEDOUT; > > > > @@ -493,7 +494,7 @@ static int __bnxt_qplib_rcfw_send_message(struct > bnxt_qplib_rcfw *rcfw, > > > > rc = __send_message_basic_sanity(rcfw, msg, opcode); > > if (rc) > > - return rc; > > + return rc == -ENXIO ? bnxt_qplib_map_rc(opcode) : rc; > > > > rc = __send_message(rcfw, msg, opcode); > > if (rc) > > diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h > b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h > > index 88814cb3aa74..4f7d800e35c3 100644 > > --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h > > +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h > > @@ -129,6 +129,9 @@ static inline u32 bnxt_qplib_set_cmd_slots(struct > cmdq_base *req) > > > > #define RCFW_MAX_COOKIE_VALUE (BNXT_QPLIB_CMDQE_MAX_CNT > - 1) > > #define RCFW_CMD_IS_BLOCKING 0x8000 > > +#define RCFW_NO_FW_ACCESS(rcfw) > \ > > + (test_bit(ERR_DEVICE_DETACHED, &(rcfw)->cmdq.flags) || \ > > + pci_channel_offline((rcfw)->pdev)) > > You promised me that this patch handles races, so how is this > pci_channel_offline() check protected? > > Thansk Hi Leon, Sorry, I may be missing something here. Could you help me understand what is the race condition here? I can then internally discuss with the team based on your input. Regards, Kalesh AP > > > > > > #define HWRM_VERSION_DEV_ATTR_MAX_DPI 0x1000A0000000DULL > > /* HWRM version 1.10.3.18 */ > > -- > > 2.43.5 > > > [-- Attachment #1.2: Type: text/html, Size: 7067 bytes --] [-- Attachment #2: S/MIME Cryptographic Signature --] [-- Type: application/pkcs7-signature, Size: 4239 bytes --] ^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH for-rc v2] RDMA/bnxt_re: Fix error recovery sequence 2024-12-23 15:42 ` Kalesh Anakkur Purayil @ 2024-12-23 16:42 ` Leon Romanovsky 2024-12-25 13:55 ` Kalesh Anakkur Purayil 0 siblings, 1 reply; 6+ messages in thread From: Leon Romanovsky @ 2024-12-23 16:42 UTC (permalink / raw) To: Kalesh Anakkur Purayil Cc: jgg, linux-rdma, andrew.gospodarek, selvin.xavier, Kashyap Desai On Mon, Dec 23, 2024 at 09:12:53PM +0530, Kalesh Anakkur Purayil wrote: > Regards, > Kalesh AP > > > On Mon, 23 Dec 2024 at 8:30 PM, Leon Romanovsky <leon@kernel.org> wrote: > > > On Fri, Dec 20, 2024 at 01:29:20PM +0530, Kalesh AP wrote: > > > Fixed to return ENXIO from __send_message_basic_sanity() > > > to indicate that device is in error state. In the case of > > > ERR_DEVICE_DETACHED state, the driver should not post the > > > commands to the firmware as it will time out eventually. > > > > > > Removed bnxt_re_modify_qp() call from bnxt_re_dev_stop() > > > as it is a no-op. > > > > > > Fixes: cc5b9b48d447 ("RDMA/bnxt_re: Recover the device when FW error is > > detected") > > > Signed-off-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com> > > > Signed-off-by: Kashyap Desai <kashyap.desai@broadcom.com> > > > Reviewed-by: Selvin Xavier <selvin.xavier@broadcom.com> > > > --- > > > V2: No changes since v1 and is just a resend. > > > V1: > > https://patchwork.kernel.org/project/linux-rdma/patch/20241204075416.478431-5-kalesh-anakkur.purayil@broadcom.com/ > > > --- > > > drivers/infiniband/hw/bnxt_re/main.c | 8 +------- > > > drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 7 ++++--- > > > drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 3 +++ > > > 3 files changed, 8 insertions(+), 10 deletions(-) > > > > > > diff --git a/drivers/infiniband/hw/bnxt_re/main.c > > b/drivers/infiniband/hw/bnxt_re/main.c > > > index b7af0d5ff3b6..c143f273b759 100644 > > > --- a/drivers/infiniband/hw/bnxt_re/main.c > > > +++ b/drivers/infiniband/hw/bnxt_re/main.c > > > @@ -1715,11 +1715,8 @@ static bool bnxt_re_is_qp1_or_shadow_qp(struct > > bnxt_re_dev *rdev, > > > > > > static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev) > > > { > > > - int mask = IB_QP_STATE; > > > - struct ib_qp_attr qp_attr; > > > struct bnxt_re_qp *qp; > > > > > > - qp_attr.qp_state = IB_QPS_ERR; > > > mutex_lock(&rdev->qp_lock); > > > list_for_each_entry(qp, &rdev->qp_list, list) { > > > /* Modify the state of all QPs except QP1/Shadow QP */ > > > @@ -1727,12 +1724,9 @@ static void bnxt_re_dev_stop(struct bnxt_re_dev > > *rdev) > > > if (qp->qplib_qp.state != > > > CMDQ_MODIFY_QP_NEW_STATE_RESET && > > > qp->qplib_qp.state != > > > - CMDQ_MODIFY_QP_NEW_STATE_ERR) { > > > + CMDQ_MODIFY_QP_NEW_STATE_ERR) > > > bnxt_re_dispatch_event(&rdev->ibdev, > > &qp->ib_qp, > > > 1, > > IB_EVENT_QP_FATAL); > > > - bnxt_re_modify_qp(&qp->ib_qp, &qp_attr, > > mask, > > > - NULL); > > > - } > > > } > > > } > > > mutex_unlock(&rdev->qp_lock); > > > diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c > > b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c > > > index 5e90ea232de8..c8e65169f58a 100644 > > > --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c > > > +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c > > > @@ -423,8 +423,9 @@ static int __send_message_basic_sanity(struct > > bnxt_qplib_rcfw *rcfw, > > > cmdq = &rcfw->cmdq; > > > > > > /* Prevent posting if f/w is not in a state to process */ > > > - if (test_bit(ERR_DEVICE_DETACHED, &rcfw->cmdq.flags)) > > > - return bnxt_qplib_map_rc(opcode); > > > + if (RCFW_NO_FW_ACCESS(rcfw)) > > > + return -ENXIO; > > > + > > > if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) > > > return -ETIMEDOUT; > > > > > > @@ -493,7 +494,7 @@ static int __bnxt_qplib_rcfw_send_message(struct > > bnxt_qplib_rcfw *rcfw, > > > > > > rc = __send_message_basic_sanity(rcfw, msg, opcode); > > > if (rc) > > > - return rc; > > > + return rc == -ENXIO ? bnxt_qplib_map_rc(opcode) : rc; > > > > > > rc = __send_message(rcfw, msg, opcode); > > > if (rc) > > > diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h > > b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h > > > index 88814cb3aa74..4f7d800e35c3 100644 > > > --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h > > > +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h > > > @@ -129,6 +129,9 @@ static inline u32 bnxt_qplib_set_cmd_slots(struct > > cmdq_base *req) > > > > > > #define RCFW_MAX_COOKIE_VALUE (BNXT_QPLIB_CMDQE_MAX_CNT > > - 1) > > > #define RCFW_CMD_IS_BLOCKING 0x8000 > > > +#define RCFW_NO_FW_ACCESS(rcfw) > > \ > > > + (test_bit(ERR_DEVICE_DETACHED, &(rcfw)->cmdq.flags) || \ > > > + pci_channel_offline((rcfw)->pdev)) > > > > You promised me that this patch handles races, so how is this > > pci_channel_offline() check protected? > > > > Thansk > > Hi Leon, > > Sorry, I may be missing something here. > Could you help me understand what is the race condition here? I can then > internally discuss with the team based on your input. pci_channel_offline() is placed completely randomly here. There is no guarantee that PCI card won't be offline immediately after this check. Thanks > > Regards, > Kalesh AP > > > > > > > > > > > #define HWRM_VERSION_DEV_ATTR_MAX_DPI 0x1000A0000000DULL > > > /* HWRM version 1.10.3.18 */ > > > -- > > > 2.43.5 > > > > > ^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH for-rc v2] RDMA/bnxt_re: Fix error recovery sequence 2024-12-23 16:42 ` Leon Romanovsky @ 2024-12-25 13:55 ` Kalesh Anakkur Purayil 2024-12-30 18:29 ` Leon Romanovsky 0 siblings, 1 reply; 6+ messages in thread From: Kalesh Anakkur Purayil @ 2024-12-25 13:55 UTC (permalink / raw) To: Leon Romanovsky Cc: jgg, linux-rdma, andrew.gospodarek, selvin.xavier, Kashyap Desai [-- Attachment #1.1: Type: text/plain, Size: 6120 bytes --] On Mon, 23 Dec 2024 at 10:12 PM, Leon Romanovsky <leon@kernel.org> wrote: > On Mon, Dec 23, 2024 at 09:12:53PM +0530, Kalesh Anakkur Purayil wrote: > > Regards, > > Kalesh AP > > > > > > On Mon, 23 Dec 2024 at 8:30 PM, Leon Romanovsky <leon@kernel.org> wrote: > > > > > On Fri, Dec 20, 2024 at 01:29:20PM +0530, Kalesh AP wrote: > > > > Fixed to return ENXIO from __send_message_basic_sanity() > > > > to indicate that device is in error state. In the case of > > > > ERR_DEVICE_DETACHED state, the driver should not post the > > > > commands to the firmware as it will time out eventually. > > > > > > > > Removed bnxt_re_modify_qp() call from bnxt_re_dev_stop() > > > > as it is a no-op. > > > > > > > > Fixes: cc5b9b48d447 ("RDMA/bnxt_re: Recover the device when FW error > is > > > detected") > > > > Signed-off-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com> > > > > Signed-off-by: Kashyap Desai <kashyap.desai@broadcom.com> > > > > Reviewed-by: Selvin Xavier <selvin.xavier@broadcom.com> > > > > --- > > > > V2: No changes since v1 and is just a resend. > > > > V1: > > > > https://patchwork.kernel.org/project/linux-rdma/patch/20241204075416.478431-5-kalesh-anakkur.purayil@broadcom.com/ > > > > --- > > > > drivers/infiniband/hw/bnxt_re/main.c | 8 +------- > > > > drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 7 ++++--- > > > > drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 3 +++ > > > > 3 files changed, 8 insertions(+), 10 deletions(-) > > > > > > > > diff --git a/drivers/infiniband/hw/bnxt_re/main.c > > > b/drivers/infiniband/hw/bnxt_re/main.c > > > > index b7af0d5ff3b6..c143f273b759 100644 > > > > --- a/drivers/infiniband/hw/bnxt_re/main.c > > > > +++ b/drivers/infiniband/hw/bnxt_re/main.c > > > > @@ -1715,11 +1715,8 @@ static bool bnxt_re_is_qp1_or_shadow_qp(struct > > > bnxt_re_dev *rdev, > > > > > > > > static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev) > > > > { > > > > - int mask = IB_QP_STATE; > > > > - struct ib_qp_attr qp_attr; > > > > struct bnxt_re_qp *qp; > > > > > > > > - qp_attr.qp_state = IB_QPS_ERR; > > > > mutex_lock(&rdev->qp_lock); > > > > list_for_each_entry(qp, &rdev->qp_list, list) { > > > > /* Modify the state of all QPs except QP1/Shadow QP */ > > > > @@ -1727,12 +1724,9 @@ static void bnxt_re_dev_stop(struct > bnxt_re_dev > > > *rdev) > > > > if (qp->qplib_qp.state != > > > > CMDQ_MODIFY_QP_NEW_STATE_RESET && > > > > qp->qplib_qp.state != > > > > - CMDQ_MODIFY_QP_NEW_STATE_ERR) { > > > > + CMDQ_MODIFY_QP_NEW_STATE_ERR) > > > > bnxt_re_dispatch_event(&rdev->ibdev, > > > &qp->ib_qp, > > > > 1, > > > IB_EVENT_QP_FATAL); > > > > - bnxt_re_modify_qp(&qp->ib_qp, &qp_attr, > > > mask, > > > > - NULL); > > > > - } > > > > } > > > > } > > > > mutex_unlock(&rdev->qp_lock); > > > > diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c > > > b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c > > > > index 5e90ea232de8..c8e65169f58a 100644 > > > > --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c > > > > +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c > > > > @@ -423,8 +423,9 @@ static int __send_message_basic_sanity(struct > > > bnxt_qplib_rcfw *rcfw, > > > > cmdq = &rcfw->cmdq; > > > > > > > > /* Prevent posting if f/w is not in a state to process */ > > > > - if (test_bit(ERR_DEVICE_DETACHED, &rcfw->cmdq.flags)) > > > > - return bnxt_qplib_map_rc(opcode); > > > > + if (RCFW_NO_FW_ACCESS(rcfw)) > > > > + return -ENXIO; > > > > + > > > > if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) > > > > return -ETIMEDOUT; > > > > > > > > @@ -493,7 +494,7 @@ static int __bnxt_qplib_rcfw_send_message(struct > > > bnxt_qplib_rcfw *rcfw, > > > > > > > > rc = __send_message_basic_sanity(rcfw, msg, opcode); > > > > if (rc) > > > > - return rc; > > > > + return rc == -ENXIO ? bnxt_qplib_map_rc(opcode) : rc; > > > > > > > > rc = __send_message(rcfw, msg, opcode); > > > > if (rc) > > > > diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h > > > b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h > > > > index 88814cb3aa74..4f7d800e35c3 100644 > > > > --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h > > > > +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h > > > > @@ -129,6 +129,9 @@ static inline u32 bnxt_qplib_set_cmd_slots(struct > > > cmdq_base *req) > > > > > > > > #define RCFW_MAX_COOKIE_VALUE > (BNXT_QPLIB_CMDQE_MAX_CNT > > > - 1) > > > > #define RCFW_CMD_IS_BLOCKING 0x8000 > > > > +#define RCFW_NO_FW_ACCESS(rcfw) > > > \ > > > > + (test_bit(ERR_DEVICE_DETACHED, &(rcfw)->cmdq.flags) || > \ > > > > + pci_channel_offline((rcfw)->pdev)) > > > > > > You promised me that this patch handles races, so how is this > > > pci_channel_offline() check protected? > > > > > > Thansk > > > > Hi Leon, > > > > Sorry, I may be missing something here. > > Could you help me understand what is the race condition here? I can then > > internally discuss with the team based on your input. > > pci_channel_offline() is placed completely randomly here. There is no > guarantee that PCI card won't be offline immediately after this check. Thank you Leon. I will discuss with the team about this and take it forward as a separate patch. For the time being, I will push a V3 with pci_channel_offline() check removed, is that okay with you? Regards, Kalesh > > > Thanks > > > > > > Regards, > > Kalesh AP > > > > > > > > > > > > > > > > #define HWRM_VERSION_DEV_ATTR_MAX_DPI 0x1000A0000000DULL > > > > /* HWRM version 1.10.3.18 */ > > > > -- > > > > 2.43.5 > > > > > > > > > > [-- Attachment #1.2: Type: text/html, Size: 8848 bytes --] [-- Attachment #2: S/MIME Cryptographic Signature --] [-- Type: application/pkcs7-signature, Size: 4239 bytes --] ^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH for-rc v2] RDMA/bnxt_re: Fix error recovery sequence 2024-12-25 13:55 ` Kalesh Anakkur Purayil @ 2024-12-30 18:29 ` Leon Romanovsky 0 siblings, 0 replies; 6+ messages in thread From: Leon Romanovsky @ 2024-12-30 18:29 UTC (permalink / raw) To: Kalesh Anakkur Purayil Cc: jgg, linux-rdma, andrew.gospodarek, selvin.xavier, Kashyap Desai On Wed, Dec 25, 2024 at 07:25:08PM +0530, Kalesh Anakkur Purayil wrote: > On Mon, 23 Dec 2024 at 10:12 PM, Leon Romanovsky <leon@kernel.org> wrote: > > > On Mon, Dec 23, 2024 at 09:12:53PM +0530, Kalesh Anakkur Purayil wrote: > > > Regards, > > > Kalesh AP > > > > > > > > > On Mon, 23 Dec 2024 at 8:30 PM, Leon Romanovsky <leon@kernel.org> wrote: > > > > > > > On Fri, Dec 20, 2024 at 01:29:20PM +0530, Kalesh AP wrote: > > > > > Fixed to return ENXIO from __send_message_basic_sanity() > > > > > to indicate that device is in error state. In the case of > > > > > ERR_DEVICE_DETACHED state, the driver should not post the > > > > > commands to the firmware as it will time out eventually. > > > > > > > > > > Removed bnxt_re_modify_qp() call from bnxt_re_dev_stop() > > > > > as it is a no-op. > > > > > > > > > > Fixes: cc5b9b48d447 ("RDMA/bnxt_re: Recover the device when FW error > > is > > > > detected") > > > > > Signed-off-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com> > > > > > Signed-off-by: Kashyap Desai <kashyap.desai@broadcom.com> > > > > > Reviewed-by: Selvin Xavier <selvin.xavier@broadcom.com> > > > > > --- > > > > > V2: No changes since v1 and is just a resend. > > > > > V1: > > > > > > https://patchwork.kernel.org/project/linux-rdma/patch/20241204075416.478431-5-kalesh-anakkur.purayil@broadcom.com/ > > > > > --- > > > > > drivers/infiniband/hw/bnxt_re/main.c | 8 +------- > > > > > drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 7 ++++--- > > > > > drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 3 +++ > > > > > 3 files changed, 8 insertions(+), 10 deletions(-) > > > > > > > > > > diff --git a/drivers/infiniband/hw/bnxt_re/main.c > > > > b/drivers/infiniband/hw/bnxt_re/main.c > > > > > index b7af0d5ff3b6..c143f273b759 100644 > > > > > --- a/drivers/infiniband/hw/bnxt_re/main.c > > > > > +++ b/drivers/infiniband/hw/bnxt_re/main.c > > > > > @@ -1715,11 +1715,8 @@ static bool bnxt_re_is_qp1_or_shadow_qp(struct > > > > bnxt_re_dev *rdev, > > > > > > > > > > static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev) > > > > > { > > > > > - int mask = IB_QP_STATE; > > > > > - struct ib_qp_attr qp_attr; > > > > > struct bnxt_re_qp *qp; > > > > > > > > > > - qp_attr.qp_state = IB_QPS_ERR; > > > > > mutex_lock(&rdev->qp_lock); > > > > > list_for_each_entry(qp, &rdev->qp_list, list) { > > > > > /* Modify the state of all QPs except QP1/Shadow QP */ > > > > > @@ -1727,12 +1724,9 @@ static void bnxt_re_dev_stop(struct > > bnxt_re_dev > > > > *rdev) > > > > > if (qp->qplib_qp.state != > > > > > CMDQ_MODIFY_QP_NEW_STATE_RESET && > > > > > qp->qplib_qp.state != > > > > > - CMDQ_MODIFY_QP_NEW_STATE_ERR) { > > > > > + CMDQ_MODIFY_QP_NEW_STATE_ERR) > > > > > bnxt_re_dispatch_event(&rdev->ibdev, > > > > &qp->ib_qp, > > > > > 1, > > > > IB_EVENT_QP_FATAL); > > > > > - bnxt_re_modify_qp(&qp->ib_qp, &qp_attr, > > > > mask, > > > > > - NULL); > > > > > - } > > > > > } > > > > > } > > > > > mutex_unlock(&rdev->qp_lock); > > > > > diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c > > > > b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c > > > > > index 5e90ea232de8..c8e65169f58a 100644 > > > > > --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c > > > > > +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c > > > > > @@ -423,8 +423,9 @@ static int __send_message_basic_sanity(struct > > > > bnxt_qplib_rcfw *rcfw, > > > > > cmdq = &rcfw->cmdq; > > > > > > > > > > /* Prevent posting if f/w is not in a state to process */ > > > > > - if (test_bit(ERR_DEVICE_DETACHED, &rcfw->cmdq.flags)) > > > > > - return bnxt_qplib_map_rc(opcode); > > > > > + if (RCFW_NO_FW_ACCESS(rcfw)) > > > > > + return -ENXIO; > > > > > + > > > > > if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) > > > > > return -ETIMEDOUT; > > > > > > > > > > @@ -493,7 +494,7 @@ static int __bnxt_qplib_rcfw_send_message(struct > > > > bnxt_qplib_rcfw *rcfw, > > > > > > > > > > rc = __send_message_basic_sanity(rcfw, msg, opcode); > > > > > if (rc) > > > > > - return rc; > > > > > + return rc == -ENXIO ? bnxt_qplib_map_rc(opcode) : rc; > > > > > > > > > > rc = __send_message(rcfw, msg, opcode); > > > > > if (rc) > > > > > diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h > > > > b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h > > > > > index 88814cb3aa74..4f7d800e35c3 100644 > > > > > --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h > > > > > +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h > > > > > @@ -129,6 +129,9 @@ static inline u32 bnxt_qplib_set_cmd_slots(struct > > > > cmdq_base *req) > > > > > > > > > > #define RCFW_MAX_COOKIE_VALUE > > (BNXT_QPLIB_CMDQE_MAX_CNT > > > > - 1) > > > > > #define RCFW_CMD_IS_BLOCKING 0x8000 > > > > > +#define RCFW_NO_FW_ACCESS(rcfw) > > > > \ > > > > > + (test_bit(ERR_DEVICE_DETACHED, &(rcfw)->cmdq.flags) || > > \ > > > > > + pci_channel_offline((rcfw)->pdev)) > > > > > > > > You promised me that this patch handles races, so how is this > > > > pci_channel_offline() check protected? > > > > > > > > Thansk > > > > > > Hi Leon, > > > > > > Sorry, I may be missing something here. > > > Could you help me understand what is the race condition here? I can then > > > internally discuss with the team based on your input. > > > > pci_channel_offline() is placed completely randomly here. There is no > > guarantee that PCI card won't be offline immediately after this check. > > Thank you Leon. I will discuss with the team about this and take it forward > as a separate patch. > For the time being, I will push a V3 with pci_channel_offline() check > removed, is that okay with you? Yes, please. > > Regards, > Kalesh > > > > > > > Thanks > > > > > > > > > > Regards, > > > Kalesh AP > > > > > > > > > > > > > > > > > > > > > #define HWRM_VERSION_DEV_ATTR_MAX_DPI 0x1000A0000000DULL > > > > > /* HWRM version 1.10.3.18 */ > > > > > -- > > > > > 2.43.5 > > > > > > > > > > > > > > > ^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2024-12-30 18:29 UTC | newest] Thread overview: 6+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2024-12-20 7:59 [PATCH for-rc v2] RDMA/bnxt_re: Fix error recovery sequence Kalesh AP 2024-12-23 15:00 ` Leon Romanovsky 2024-12-23 15:42 ` Kalesh Anakkur Purayil 2024-12-23 16:42 ` Leon Romanovsky 2024-12-25 13:55 ` Kalesh Anakkur Purayil 2024-12-30 18:29 ` Leon Romanovsky
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox