* [PATCH v2 for-rc 1/5] RDMA/hns: Fix an AEQE overflow error caused by untimely update of eq_db_ci
2024-10-24 12:39 [PATCH v2 for-rc 0/5] RDMA/hns: Bugfixes Junxian Huang
@ 2024-10-24 12:39 ` Junxian Huang
2024-10-24 12:39 ` [PATCH v2 for-rc 2/5] RDMA/hns: Fix flush cqe error when racing with destroy qp Junxian Huang
` (4 subsequent siblings)
5 siblings, 0 replies; 10+ messages in thread
From: Junxian Huang @ 2024-10-24 12:39 UTC (permalink / raw)
To: jgg, leon
Cc: linux-rdma, linuxarm, linux-kernel, huangjunxian6, tangchengchang
From: wenglianfa <wenglianfa@huawei.com>
eq_db_ci is updated only after all AEQEs are processed in the AEQ
interrupt handler, which is not timely enough and may result in
AEQ overflow. Two optimization methods are proposed:
1. Set an upper limit for AEQE processing.
2. Move time-consuming operations such as printings to the bottom
half of the interrupt.
cmd events and flush_cqe events are still fully processed in the top half
to ensure timely handling.
Fixes: a5073d6054f7 ("RDMA/hns: Add eq support of hip08")
Signed-off-by: wenglianfa <wenglianfa@huawei.com>
Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
---
drivers/infiniband/hw/hns/hns_roce_device.h | 1 +
drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 75 ++++++++++++++-------
drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 5 ++
drivers/infiniband/hw/hns/hns_roce_qp.c | 54 +++++++++------
4 files changed, 91 insertions(+), 44 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 0b1e21cb6d2d..73c78005901e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -1289,6 +1289,7 @@ void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 cqn);
void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type);
void flush_cqe(struct hns_roce_dev *dev, struct hns_roce_qp *qp);
void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type);
+void hns_roce_flush_cqe(struct hns_roce_dev *hr_dev, u32 qpn);
void hns_roce_srq_event(struct hns_roce_dev *hr_dev, u32 srqn, int event_type);
void hns_roce_handle_device_err(struct hns_roce_dev *hr_dev);
int hns_roce_init(struct hns_roce_dev *hr_dev);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 24e906b9d3ae..e85c450e1809 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -5967,11 +5967,10 @@ static int hns_roce_v2_query_mpt(struct hns_roce_dev *hr_dev, u32 key,
return ret;
}
-static void hns_roce_irq_work_handle(struct work_struct *work)
+static void dump_aeqe_log(struct hns_roce_work *irq_work)
{
- struct hns_roce_work *irq_work =
- container_of(work, struct hns_roce_work, work);
- struct ib_device *ibdev = &irq_work->hr_dev->ib_dev;
+ struct hns_roce_dev *hr_dev = irq_work->hr_dev;
+ struct ib_device *ibdev = &hr_dev->ib_dev;
switch (irq_work->event_type) {
case HNS_ROCE_EVENT_TYPE_PATH_MIG:
@@ -6015,6 +6014,8 @@ static void hns_roce_irq_work_handle(struct work_struct *work)
case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW:
ibdev_warn(ibdev, "DB overflow.\n");
break;
+ case HNS_ROCE_EVENT_TYPE_MB:
+ break;
case HNS_ROCE_EVENT_TYPE_FLR:
ibdev_warn(ibdev, "function level reset.\n");
break;
@@ -6025,8 +6026,46 @@ static void hns_roce_irq_work_handle(struct work_struct *work)
ibdev_err(ibdev, "invalid xrceth error.\n");
break;
default:
+ ibdev_info(ibdev, "Undefined event %d.\n",
+ irq_work->event_type);
break;
}
+}
+
+static void hns_roce_irq_work_handle(struct work_struct *work)
+{
+ struct hns_roce_work *irq_work =
+ container_of(work, struct hns_roce_work, work);
+ struct hns_roce_dev *hr_dev = irq_work->hr_dev;
+ int event_type = irq_work->event_type;
+ u32 queue_num = irq_work->queue_num;
+
+ switch (event_type) {
+ case HNS_ROCE_EVENT_TYPE_PATH_MIG:
+ case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED:
+ case HNS_ROCE_EVENT_TYPE_COMM_EST:
+ case HNS_ROCE_EVENT_TYPE_SQ_DRAINED:
+ case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR:
+ case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH:
+ case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR:
+ case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR:
+ case HNS_ROCE_EVENT_TYPE_XRCD_VIOLATION:
+ case HNS_ROCE_EVENT_TYPE_INVALID_XRCETH:
+ hns_roce_qp_event(hr_dev, queue_num, event_type);
+ break;
+ case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH:
+ case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR:
+ hns_roce_srq_event(hr_dev, queue_num, event_type);
+ break;
+ case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR:
+ case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW:
+ hns_roce_cq_event(hr_dev, queue_num, event_type);
+ break;
+ default:
+ break;
+ }
+
+ dump_aeqe_log(irq_work);
kfree(irq_work);
}
@@ -6087,14 +6126,14 @@ static struct hns_roce_aeqe *next_aeqe_sw_v2(struct hns_roce_eq *eq)
static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
struct hns_roce_eq *eq)
{
- struct device *dev = hr_dev->dev;
struct hns_roce_aeqe *aeqe = next_aeqe_sw_v2(eq);
irqreturn_t aeqe_found = IRQ_NONE;
+ int num_aeqes = 0;
int event_type;
u32 queue_num;
int sub_type;
- while (aeqe) {
+ while (aeqe && num_aeqes < HNS_AEQ_POLLING_BUDGET) {
/* Make sure we read AEQ entry after we have checked the
* ownership bit
*/
@@ -6105,25 +6144,12 @@ static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
queue_num = hr_reg_read(aeqe, AEQE_EVENT_QUEUE_NUM);
switch (event_type) {
- case HNS_ROCE_EVENT_TYPE_PATH_MIG:
- case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED:
- case HNS_ROCE_EVENT_TYPE_COMM_EST:
- case HNS_ROCE_EVENT_TYPE_SQ_DRAINED:
case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR:
- case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH:
case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR:
case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR:
case HNS_ROCE_EVENT_TYPE_XRCD_VIOLATION:
case HNS_ROCE_EVENT_TYPE_INVALID_XRCETH:
- hns_roce_qp_event(hr_dev, queue_num, event_type);
- break;
- case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH:
- case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR:
- hns_roce_srq_event(hr_dev, queue_num, event_type);
- break;
- case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR:
- case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW:
- hns_roce_cq_event(hr_dev, queue_num, event_type);
+ hns_roce_flush_cqe(hr_dev, queue_num);
break;
case HNS_ROCE_EVENT_TYPE_MB:
hns_roce_cmd_event(hr_dev,
@@ -6131,12 +6157,7 @@ static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
aeqe->event.cmd.status,
le64_to_cpu(aeqe->event.cmd.out_param));
break;
- case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW:
- case HNS_ROCE_EVENT_TYPE_FLR:
- break;
default:
- dev_err(dev, "unhandled event %d on EQ %d at idx %u.\n",
- event_type, eq->eqn, eq->cons_index);
break;
}
@@ -6150,6 +6171,7 @@ static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
hns_roce_v2_init_irq_work(hr_dev, eq, queue_num);
aeqe = next_aeqe_sw_v2(eq);
+ ++num_aeqes;
}
update_eq_db(eq);
@@ -6699,6 +6721,9 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev)
int ret;
int i;
+ if (hr_dev->caps.aeqe_depth < HNS_AEQ_POLLING_BUDGET)
+ return -EINVAL;
+
other_num = hr_dev->caps.num_other_vectors;
comp_num = hr_dev->caps.num_comp_vectors;
aeq_num = hr_dev->caps.num_aeq_vectors;
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index c65f68a14a26..3b3c6259ace0 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -85,6 +85,11 @@
#define HNS_ROCE_V2_TABLE_CHUNK_SIZE (1 << 18)
+/* budget must be smaller than aeqe_depth to guarantee that we update
+ * the ci before we polled all the entries in the EQ.
+ */
+#define HNS_AEQ_POLLING_BUDGET 64
+
enum {
HNS_ROCE_CMD_FLAG_IN = BIT(0),
HNS_ROCE_CMD_FLAG_OUT = BIT(1),
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index 6b03ba671ff8..dcaa370d4a26 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -39,6 +39,25 @@
#include "hns_roce_device.h"
#include "hns_roce_hem.h"
+static struct hns_roce_qp *hns_roce_qp_lookup(struct hns_roce_dev *hr_dev,
+ u32 qpn)
+{
+ struct device *dev = hr_dev->dev;
+ struct hns_roce_qp *qp;
+ unsigned long flags;
+
+ xa_lock_irqsave(&hr_dev->qp_table_xa, flags);
+ qp = __hns_roce_qp_lookup(hr_dev, qpn);
+ if (qp)
+ refcount_inc(&qp->refcount);
+ xa_unlock_irqrestore(&hr_dev->qp_table_xa, flags);
+
+ if (!qp)
+ dev_warn(dev, "async event for bogus QP %08x\n", qpn);
+
+ return qp;
+}
+
static void flush_work_handle(struct work_struct *work)
{
struct hns_roce_work *flush_work = container_of(work,
@@ -95,31 +114,28 @@ void flush_cqe(struct hns_roce_dev *dev, struct hns_roce_qp *qp)
void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type)
{
- struct device *dev = hr_dev->dev;
struct hns_roce_qp *qp;
- xa_lock(&hr_dev->qp_table_xa);
- qp = __hns_roce_qp_lookup(hr_dev, qpn);
- if (qp)
- refcount_inc(&qp->refcount);
- xa_unlock(&hr_dev->qp_table_xa);
-
- if (!qp) {
- dev_warn(dev, "async event for bogus QP %08x\n", qpn);
+ qp = hns_roce_qp_lookup(hr_dev, qpn);
+ if (!qp)
return;
- }
- if (event_type == HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR ||
- event_type == HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR ||
- event_type == HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR ||
- event_type == HNS_ROCE_EVENT_TYPE_XRCD_VIOLATION ||
- event_type == HNS_ROCE_EVENT_TYPE_INVALID_XRCETH) {
- qp->state = IB_QPS_ERR;
+ qp->event(qp, (enum hns_roce_event)event_type);
- flush_cqe(hr_dev, qp);
- }
+ if (refcount_dec_and_test(&qp->refcount))
+ complete(&qp->free);
+}
- qp->event(qp, (enum hns_roce_event)event_type);
+void hns_roce_flush_cqe(struct hns_roce_dev *hr_dev, u32 qpn)
+{
+ struct hns_roce_qp *qp;
+
+ qp = hns_roce_qp_lookup(hr_dev, qpn);
+ if (!qp)
+ return;
+
+ qp->state = IB_QPS_ERR;
+ flush_cqe(hr_dev, qp);
if (refcount_dec_and_test(&qp->refcount))
complete(&qp->free);
--
2.33.0
^ permalink raw reply related [flat|nested] 10+ messages in thread* [PATCH v2 for-rc 2/5] RDMA/hns: Fix flush cqe error when racing with destroy qp
2024-10-24 12:39 [PATCH v2 for-rc 0/5] RDMA/hns: Bugfixes Junxian Huang
2024-10-24 12:39 ` [PATCH v2 for-rc 1/5] RDMA/hns: Fix an AEQE overflow error caused by untimely update of eq_db_ci Junxian Huang
@ 2024-10-24 12:39 ` Junxian Huang
2024-10-24 17:48 ` Zhu Yanjun
2024-10-24 12:39 ` [PATCH v2 for-rc 3/5] RDMA/hns: Modify debugfs name Junxian Huang
` (3 subsequent siblings)
5 siblings, 1 reply; 10+ messages in thread
From: Junxian Huang @ 2024-10-24 12:39 UTC (permalink / raw)
To: jgg, leon
Cc: linux-rdma, linuxarm, linux-kernel, huangjunxian6, tangchengchang
From: wenglianfa <wenglianfa@huawei.com>
QP needs to be modified to IB_QPS_ERROR to trigger HW flush cqe. But
when this process races with destroy qp, the destroy-qp process may
modify the QP to IB_QPS_RESET first. In this case flush cqe will fail
since it is invalid to modify qp from IB_QPS_RESET to IB_QPS_ERROR.
Add lock and bit flag to make sure pending flush cqe work is completed
first and no more new works will be added.
Fixes: ffd541d45726 ("RDMA/hns: Add the workqueue framework for flush cqe handler")
Signed-off-by: wenglianfa <wenglianfa@huawei.com>
Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
---
drivers/infiniband/hw/hns/hns_roce_device.h | 2 ++
drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 7 +++++++
drivers/infiniband/hw/hns/hns_roce_qp.c | 15 +++++++++++++--
3 files changed, 22 insertions(+), 2 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 73c78005901e..9b51d5a1533f 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -593,6 +593,7 @@ struct hns_roce_dev;
enum {
HNS_ROCE_FLUSH_FLAG = 0,
+ HNS_ROCE_STOP_FLUSH_FLAG = 1,
};
struct hns_roce_work {
@@ -656,6 +657,7 @@ struct hns_roce_qp {
enum hns_roce_cong_type cong_type;
u8 tc_mode;
u8 priority;
+ spinlock_t flush_lock;
};
struct hns_roce_ib_iboe {
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index e85c450e1809..aa42c5a9b254 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -5598,8 +5598,15 @@ int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
{
struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
+ unsigned long flags;
int ret;
+ /* Make sure flush_cqe() is completed */
+ spin_lock_irqsave(&hr_qp->flush_lock, flags);
+ set_bit(HNS_ROCE_STOP_FLUSH_FLAG, &hr_qp->flush_flag);
+ spin_unlock_irqrestore(&hr_qp->flush_lock, flags);
+ flush_work(&hr_qp->flush_work.work);
+
ret = hns_roce_v2_destroy_qp_common(hr_dev, hr_qp, udata);
if (ret)
ibdev_err(&hr_dev->ib_dev,
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index dcaa370d4a26..2ad03ecdbf8e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -90,11 +90,18 @@ static void flush_work_handle(struct work_struct *work)
void init_flush_work(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
{
struct hns_roce_work *flush_work = &hr_qp->flush_work;
+ unsigned long flags;
+
+ spin_lock_irqsave(&hr_qp->flush_lock, flags);
+ /* Exit directly after destroy_qp() */
+ if (test_bit(HNS_ROCE_STOP_FLUSH_FLAG, &hr_qp->flush_flag)) {
+ spin_unlock_irqrestore(&hr_qp->flush_lock, flags);
+ return;
+ }
- flush_work->hr_dev = hr_dev;
- INIT_WORK(&flush_work->work, flush_work_handle);
refcount_inc(&hr_qp->refcount);
queue_work(hr_dev->irq_workq, &flush_work->work);
+ spin_unlock_irqrestore(&hr_qp->flush_lock, flags);
}
void flush_cqe(struct hns_roce_dev *dev, struct hns_roce_qp *qp)
@@ -1140,6 +1147,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
struct ib_udata *udata,
struct hns_roce_qp *hr_qp)
{
+ struct hns_roce_work *flush_work = &hr_qp->flush_work;
struct hns_roce_ib_create_qp_resp resp = {};
struct ib_device *ibdev = &hr_dev->ib_dev;
struct hns_roce_ib_create_qp ucmd = {};
@@ -1148,9 +1156,12 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
mutex_init(&hr_qp->mutex);
spin_lock_init(&hr_qp->sq.lock);
spin_lock_init(&hr_qp->rq.lock);
+ spin_lock_init(&hr_qp->flush_lock);
hr_qp->state = IB_QPS_RESET;
hr_qp->flush_flag = 0;
+ flush_work->hr_dev = hr_dev;
+ INIT_WORK(&flush_work->work, flush_work_handle);
if (init_attr->create_flags)
return -EOPNOTSUPP;
--
2.33.0
^ permalink raw reply related [flat|nested] 10+ messages in thread* Re: [PATCH v2 for-rc 2/5] RDMA/hns: Fix flush cqe error when racing with destroy qp
2024-10-24 12:39 ` [PATCH v2 for-rc 2/5] RDMA/hns: Fix flush cqe error when racing with destroy qp Junxian Huang
@ 2024-10-24 17:48 ` Zhu Yanjun
0 siblings, 0 replies; 10+ messages in thread
From: Zhu Yanjun @ 2024-10-24 17:48 UTC (permalink / raw)
To: Junxian Huang, jgg, leon
Cc: linux-rdma, linuxarm, linux-kernel, tangchengchang
在 2024/10/24 14:39, Junxian Huang 写道:
> From: wenglianfa <wenglianfa@huawei.com>
>
> QP needs to be modified to IB_QPS_ERROR to trigger HW flush cqe. But
> when this process races with destroy qp, the destroy-qp process may
> modify the QP to IB_QPS_RESET first. In this case flush cqe will fail
> since it is invalid to modify qp from IB_QPS_RESET to IB_QPS_ERROR.
>
> Add lock and bit flag to make sure pending flush cqe work is completed
> first and no more new works will be added.
>
> Fixes: ffd541d45726 ("RDMA/hns: Add the workqueue framework for flush cqe handler")
> Signed-off-by: wenglianfa <wenglianfa@huawei.com>
> Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
> ---
> drivers/infiniband/hw/hns/hns_roce_device.h | 2 ++
> drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 7 +++++++
> drivers/infiniband/hw/hns/hns_roce_qp.c | 15 +++++++++++++--
> 3 files changed, 22 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
> index 73c78005901e..9b51d5a1533f 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_device.h
> +++ b/drivers/infiniband/hw/hns/hns_roce_device.h
> @@ -593,6 +593,7 @@ struct hns_roce_dev;
>
> enum {
> HNS_ROCE_FLUSH_FLAG = 0,
> + HNS_ROCE_STOP_FLUSH_FLAG = 1,
> };
>
> struct hns_roce_work {
> @@ -656,6 +657,7 @@ struct hns_roce_qp {
> enum hns_roce_cong_type cong_type;
> u8 tc_mode;
> u8 priority;
> + spinlock_t flush_lock;
> };
>
> struct hns_roce_ib_iboe {
> diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
> index e85c450e1809..aa42c5a9b254 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
> +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
> @@ -5598,8 +5598,15 @@ int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
> {
> struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
> struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
> + unsigned long flags;
> int ret;
>
> + /* Make sure flush_cqe() is completed */
> + spin_lock_irqsave(&hr_qp->flush_lock, flags);
> + set_bit(HNS_ROCE_STOP_FLUSH_FLAG, &hr_qp->flush_flag);
> + spin_unlock_irqrestore(&hr_qp->flush_lock, flags);
> + flush_work(&hr_qp->flush_work.work);
> +
> ret = hns_roce_v2_destroy_qp_common(hr_dev, hr_qp, udata);
> if (ret)
> ibdev_err(&hr_dev->ib_dev,
> diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
> index dcaa370d4a26..2ad03ecdbf8e 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_qp.c
> +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
> @@ -90,11 +90,18 @@ static void flush_work_handle(struct work_struct *work)
> void init_flush_work(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
> {
> struct hns_roce_work *flush_work = &hr_qp->flush_work;
> + unsigned long flags;
> +
> + spin_lock_irqsave(&hr_qp->flush_lock, flags);
> + /* Exit directly after destroy_qp() */
> + if (test_bit(HNS_ROCE_STOP_FLUSH_FLAG, &hr_qp->flush_flag)) {
> + spin_unlock_irqrestore(&hr_qp->flush_lock, flags);
> + return;
> + }
>
> - flush_work->hr_dev = hr_dev;
> - INIT_WORK(&flush_work->work, flush_work_handle);
> refcount_inc(&hr_qp->refcount);
> queue_work(hr_dev->irq_workq, &flush_work->work);
> + spin_unlock_irqrestore(&hr_qp->flush_lock, flags);
> }
>
> void flush_cqe(struct hns_roce_dev *dev, struct hns_roce_qp *qp)
> @@ -1140,6 +1147,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
> struct ib_udata *udata,
> struct hns_roce_qp *hr_qp)
> {
> + struct hns_roce_work *flush_work = &hr_qp->flush_work;
> struct hns_roce_ib_create_qp_resp resp = {};
> struct ib_device *ibdev = &hr_dev->ib_dev;
> struct hns_roce_ib_create_qp ucmd = {};
> @@ -1148,9 +1156,12 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
> mutex_init(&hr_qp->mutex);
> spin_lock_init(&hr_qp->sq.lock);
> spin_lock_init(&hr_qp->rq.lock);
> + spin_lock_init(&hr_qp->flush_lock);
Thanks a lot. I am fine with this spin_lock_init(&hr_qp->flush_lock);
Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>
Zhu Yanjun
>
> hr_qp->state = IB_QPS_RESET;
> hr_qp->flush_flag = 0;
> + flush_work->hr_dev = hr_dev;
> + INIT_WORK(&flush_work->work, flush_work_handle);
>
> if (init_attr->create_flags)
> return -EOPNOTSUPP;
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH v2 for-rc 3/5] RDMA/hns: Modify debugfs name
2024-10-24 12:39 [PATCH v2 for-rc 0/5] RDMA/hns: Bugfixes Junxian Huang
2024-10-24 12:39 ` [PATCH v2 for-rc 1/5] RDMA/hns: Fix an AEQE overflow error caused by untimely update of eq_db_ci Junxian Huang
2024-10-24 12:39 ` [PATCH v2 for-rc 2/5] RDMA/hns: Fix flush cqe error when racing with destroy qp Junxian Huang
@ 2024-10-24 12:39 ` Junxian Huang
2024-10-30 12:12 ` Leon Romanovsky
2024-10-24 12:39 ` [PATCH v2 for-rc 4/5] RDMA/hns: Use dev_* printings in hem code instead of ibdev_* Junxian Huang
` (2 subsequent siblings)
5 siblings, 1 reply; 10+ messages in thread
From: Junxian Huang @ 2024-10-24 12:39 UTC (permalink / raw)
To: jgg, leon
Cc: linux-rdma, linuxarm, linux-kernel, huangjunxian6, tangchengchang
From: Yuyu Li <liyuyu6@huawei.com>
The sub-directory of hns_roce debugfs is named after the device's
kernel name currently, but it will be inconvenient to use when
the device is renamed.
Modify the name to pci name as users can always easily find the
correspondence between an RDMA device and its pci name.
Fixes: eb7854d63db5 ("RDMA/hns: Support SW stats with debugfs")
Signed-off-by: Yuyu Li <liyuyu6@huawei.com>
Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
---
drivers/infiniband/hw/hns/hns_roce_debugfs.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_debugfs.c b/drivers/infiniband/hw/hns/hns_roce_debugfs.c
index e8febb40f645..b869cdc54118 100644
--- a/drivers/infiniband/hw/hns/hns_roce_debugfs.c
+++ b/drivers/infiniband/hw/hns/hns_roce_debugfs.c
@@ -5,6 +5,7 @@
#include <linux/debugfs.h>
#include <linux/device.h>
+#include <linux/pci.h>
#include "hns_roce_device.h"
@@ -86,7 +87,7 @@ void hns_roce_register_debugfs(struct hns_roce_dev *hr_dev)
{
struct hns_roce_dev_debugfs *dbgfs = &hr_dev->dbgfs;
- dbgfs->root = debugfs_create_dir(dev_name(&hr_dev->ib_dev.dev),
+ dbgfs->root = debugfs_create_dir(pci_name(hr_dev->pci_dev),
hns_roce_dbgfs_root);
create_sw_stat_debugfs(hr_dev, dbgfs->root);
--
2.33.0
^ permalink raw reply related [flat|nested] 10+ messages in thread* Re: [PATCH v2 for-rc 3/5] RDMA/hns: Modify debugfs name
2024-10-24 12:39 ` [PATCH v2 for-rc 3/5] RDMA/hns: Modify debugfs name Junxian Huang
@ 2024-10-30 12:12 ` Leon Romanovsky
2024-10-31 9:16 ` Junxian Huang
0 siblings, 1 reply; 10+ messages in thread
From: Leon Romanovsky @ 2024-10-30 12:12 UTC (permalink / raw)
To: Junxian Huang; +Cc: jgg, linux-rdma, linuxarm, linux-kernel, tangchengchang
On Thu, Oct 24, 2024 at 08:39:58PM +0800, Junxian Huang wrote:
> From: Yuyu Li <liyuyu6@huawei.com>
>
> The sub-directory of hns_roce debugfs is named after the device's
> kernel name currently, but it will be inconvenient to use when
> the device is renamed.
>
> Modify the name to pci name as users can always easily find the
> correspondence between an RDMA device and its pci name.
>
> Fixes: eb7854d63db5 ("RDMA/hns: Support SW stats with debugfs")
> Signed-off-by: Yuyu Li <liyuyu6@huawei.com>
> Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
> ---
> drivers/infiniband/hw/hns/hns_roce_debugfs.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/infiniband/hw/hns/hns_roce_debugfs.c b/drivers/infiniband/hw/hns/hns_roce_debugfs.c
> index e8febb40f645..b869cdc54118 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_debugfs.c
> +++ b/drivers/infiniband/hw/hns/hns_roce_debugfs.c
> @@ -5,6 +5,7 @@
>
> #include <linux/debugfs.h>
> #include <linux/device.h>
> +#include <linux/pci.h>
>
> #include "hns_roce_device.h"
>
> @@ -86,7 +87,7 @@ void hns_roce_register_debugfs(struct hns_roce_dev *hr_dev)
> {
> struct hns_roce_dev_debugfs *dbgfs = &hr_dev->dbgfs;
>
> - dbgfs->root = debugfs_create_dir(dev_name(&hr_dev->ib_dev.dev),
> + dbgfs->root = debugfs_create_dir(pci_name(hr_dev->pci_dev),
> hns_roce_dbgfs_root);
Let's take this change, but the more correct way is to add .rename()
callback to ib_device ops in similar way to what we do in ib_client
and call to debugfs_rename() from there.
See ib_device_rename() implementation for "lient->rename(ibdev, client_data);" call.
Thanks
^ permalink raw reply [flat|nested] 10+ messages in thread* Re: [PATCH v2 for-rc 3/5] RDMA/hns: Modify debugfs name
2024-10-30 12:12 ` Leon Romanovsky
@ 2024-10-31 9:16 ` Junxian Huang
0 siblings, 0 replies; 10+ messages in thread
From: Junxian Huang @ 2024-10-31 9:16 UTC (permalink / raw)
To: Leon Romanovsky; +Cc: jgg, linux-rdma, linuxarm, linux-kernel, tangchengchang
On 2024/10/30 20:12, Leon Romanovsky wrote:
> On Thu, Oct 24, 2024 at 08:39:58PM +0800, Junxian Huang wrote:
>> From: Yuyu Li <liyuyu6@huawei.com>
>>
>> The sub-directory of hns_roce debugfs is named after the device's
>> kernel name currently, but it will be inconvenient to use when
>> the device is renamed.
>>
>> Modify the name to pci name as users can always easily find the
>> correspondence between an RDMA device and its pci name.
>>
>> Fixes: eb7854d63db5 ("RDMA/hns: Support SW stats with debugfs")
>> Signed-off-by: Yuyu Li <liyuyu6@huawei.com>
>> Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
>> ---
>> drivers/infiniband/hw/hns/hns_roce_debugfs.c | 3 ++-
>> 1 file changed, 2 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/infiniband/hw/hns/hns_roce_debugfs.c b/drivers/infiniband/hw/hns/hns_roce_debugfs.c
>> index e8febb40f645..b869cdc54118 100644
>> --- a/drivers/infiniband/hw/hns/hns_roce_debugfs.c
>> +++ b/drivers/infiniband/hw/hns/hns_roce_debugfs.c
>> @@ -5,6 +5,7 @@
>>
>> #include <linux/debugfs.h>
>> #include <linux/device.h>
>> +#include <linux/pci.h>
>>
>> #include "hns_roce_device.h"
>>
>> @@ -86,7 +87,7 @@ void hns_roce_register_debugfs(struct hns_roce_dev *hr_dev)
>> {
>> struct hns_roce_dev_debugfs *dbgfs = &hr_dev->dbgfs;
>>
>> - dbgfs->root = debugfs_create_dir(dev_name(&hr_dev->ib_dev.dev),
>> + dbgfs->root = debugfs_create_dir(pci_name(hr_dev->pci_dev),
>> hns_roce_dbgfs_root);
>
> Let's take this change, but the more correct way is to add .rename()
> callback to ib_device ops in similar way to what we do in ib_client
> and call to debugfs_rename() from there.
>
> See ib_device_rename() implementation for "lient->rename(ibdev, client_data);" call.
>
Thanks for applying and the guidance. I'll have a look at it.
Junxian
> Thanks
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH v2 for-rc 4/5] RDMA/hns: Use dev_* printings in hem code instead of ibdev_*
2024-10-24 12:39 [PATCH v2 for-rc 0/5] RDMA/hns: Bugfixes Junxian Huang
` (2 preceding siblings ...)
2024-10-24 12:39 ` [PATCH v2 for-rc 3/5] RDMA/hns: Modify debugfs name Junxian Huang
@ 2024-10-24 12:39 ` Junxian Huang
2024-10-24 12:40 ` [PATCH v2 for-rc 5/5] RDMA/hns: Fix cpu stuck caused by printings during reset Junxian Huang
2024-10-30 12:14 ` [PATCH v2 for-rc 0/5] RDMA/hns: Bugfixes Leon Romanovsky
5 siblings, 0 replies; 10+ messages in thread
From: Junxian Huang @ 2024-10-24 12:39 UTC (permalink / raw)
To: jgg, leon
Cc: linux-rdma, linuxarm, linux-kernel, huangjunxian6, tangchengchang
The hem code is executed before ib_dev is registered, so use dev_*
printing instead of ibdev_* to avoid log like this:
(null): set HEM address to HW failed!
Fixes: 2f49de21f3e9 ("RDMA/hns: Optimize mhop get flow for multi-hop addressing")
Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
---
drivers/infiniband/hw/hns/hns_roce_hem.c | 44 ++++++++++++------------
1 file changed, 22 insertions(+), 22 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c
index c7c167e2a045..ee5d2c1bb5ca 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hem.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hem.c
@@ -300,7 +300,7 @@ static int calc_hem_config(struct hns_roce_dev *hr_dev,
struct hns_roce_hem_mhop *mhop,
struct hns_roce_hem_index *index)
{
- struct ib_device *ibdev = &hr_dev->ib_dev;
+ struct device *dev = hr_dev->dev;
unsigned long mhop_obj = obj;
u32 l0_idx, l1_idx, l2_idx;
u32 chunk_ba_num;
@@ -331,14 +331,14 @@ static int calc_hem_config(struct hns_roce_dev *hr_dev,
index->buf = l0_idx;
break;
default:
- ibdev_err(ibdev, "table %u not support mhop.hop_num = %u!\n",
- table->type, mhop->hop_num);
+ dev_err(dev, "table %u not support mhop.hop_num = %u!\n",
+ table->type, mhop->hop_num);
return -EINVAL;
}
if (unlikely(index->buf >= table->num_hem)) {
- ibdev_err(ibdev, "table %u exceed hem limt idx %llu, max %lu!\n",
- table->type, index->buf, table->num_hem);
+ dev_err(dev, "table %u exceed hem limt idx %llu, max %lu!\n",
+ table->type, index->buf, table->num_hem);
return -EINVAL;
}
@@ -448,14 +448,14 @@ static int set_mhop_hem(struct hns_roce_dev *hr_dev,
struct hns_roce_hem_mhop *mhop,
struct hns_roce_hem_index *index)
{
- struct ib_device *ibdev = &hr_dev->ib_dev;
+ struct device *dev = hr_dev->dev;
u32 step_idx;
int ret = 0;
if (index->inited & HEM_INDEX_L0) {
ret = hr_dev->hw->set_hem(hr_dev, table, obj, 0);
if (ret) {
- ibdev_err(ibdev, "set HEM step 0 failed!\n");
+ dev_err(dev, "set HEM step 0 failed!\n");
goto out;
}
}
@@ -463,7 +463,7 @@ static int set_mhop_hem(struct hns_roce_dev *hr_dev,
if (index->inited & HEM_INDEX_L1) {
ret = hr_dev->hw->set_hem(hr_dev, table, obj, 1);
if (ret) {
- ibdev_err(ibdev, "set HEM step 1 failed!\n");
+ dev_err(dev, "set HEM step 1 failed!\n");
goto out;
}
}
@@ -475,7 +475,7 @@ static int set_mhop_hem(struct hns_roce_dev *hr_dev,
step_idx = mhop->hop_num;
ret = hr_dev->hw->set_hem(hr_dev, table, obj, step_idx);
if (ret)
- ibdev_err(ibdev, "set HEM step last failed!\n");
+ dev_err(dev, "set HEM step last failed!\n");
}
out:
return ret;
@@ -485,14 +485,14 @@ static int hns_roce_table_mhop_get(struct hns_roce_dev *hr_dev,
struct hns_roce_hem_table *table,
unsigned long obj)
{
- struct ib_device *ibdev = &hr_dev->ib_dev;
struct hns_roce_hem_index index = {};
struct hns_roce_hem_mhop mhop = {};
+ struct device *dev = hr_dev->dev;
int ret;
ret = calc_hem_config(hr_dev, table, obj, &mhop, &index);
if (ret) {
- ibdev_err(ibdev, "calc hem config failed!\n");
+ dev_err(dev, "calc hem config failed!\n");
return ret;
}
@@ -504,7 +504,7 @@ static int hns_roce_table_mhop_get(struct hns_roce_dev *hr_dev,
ret = alloc_mhop_hem(hr_dev, table, &mhop, &index);
if (ret) {
- ibdev_err(ibdev, "alloc mhop hem failed!\n");
+ dev_err(dev, "alloc mhop hem failed!\n");
goto out;
}
@@ -512,7 +512,7 @@ static int hns_roce_table_mhop_get(struct hns_roce_dev *hr_dev,
if (table->type < HEM_TYPE_MTT) {
ret = set_mhop_hem(hr_dev, table, obj, &mhop, &index);
if (ret) {
- ibdev_err(ibdev, "set HEM address to HW failed!\n");
+ dev_err(dev, "set HEM address to HW failed!\n");
goto err_alloc;
}
}
@@ -575,7 +575,7 @@ static void clear_mhop_hem(struct hns_roce_dev *hr_dev,
struct hns_roce_hem_mhop *mhop,
struct hns_roce_hem_index *index)
{
- struct ib_device *ibdev = &hr_dev->ib_dev;
+ struct device *dev = hr_dev->dev;
u32 hop_num = mhop->hop_num;
u32 chunk_ba_num;
u32 step_idx;
@@ -605,21 +605,21 @@ static void clear_mhop_hem(struct hns_roce_dev *hr_dev,
ret = hr_dev->hw->clear_hem(hr_dev, table, obj, step_idx);
if (ret)
- ibdev_warn(ibdev, "failed to clear hop%u HEM, ret = %d.\n",
- hop_num, ret);
+ dev_warn(dev, "failed to clear hop%u HEM, ret = %d.\n",
+ hop_num, ret);
if (index->inited & HEM_INDEX_L1) {
ret = hr_dev->hw->clear_hem(hr_dev, table, obj, 1);
if (ret)
- ibdev_warn(ibdev, "failed to clear HEM step 1, ret = %d.\n",
- ret);
+ dev_warn(dev, "failed to clear HEM step 1, ret = %d.\n",
+ ret);
}
if (index->inited & HEM_INDEX_L0) {
ret = hr_dev->hw->clear_hem(hr_dev, table, obj, 0);
if (ret)
- ibdev_warn(ibdev, "failed to clear HEM step 0, ret = %d.\n",
- ret);
+ dev_warn(dev, "failed to clear HEM step 0, ret = %d.\n",
+ ret);
}
}
}
@@ -629,14 +629,14 @@ static void hns_roce_table_mhop_put(struct hns_roce_dev *hr_dev,
unsigned long obj,
int check_refcount)
{
- struct ib_device *ibdev = &hr_dev->ib_dev;
struct hns_roce_hem_index index = {};
struct hns_roce_hem_mhop mhop = {};
+ struct device *dev = hr_dev->dev;
int ret;
ret = calc_hem_config(hr_dev, table, obj, &mhop, &index);
if (ret) {
- ibdev_err(ibdev, "calc hem config failed!\n");
+ dev_err(dev, "calc hem config failed!\n");
return;
}
--
2.33.0
^ permalink raw reply related [flat|nested] 10+ messages in thread* [PATCH v2 for-rc 5/5] RDMA/hns: Fix cpu stuck caused by printings during reset
2024-10-24 12:39 [PATCH v2 for-rc 0/5] RDMA/hns: Bugfixes Junxian Huang
` (3 preceding siblings ...)
2024-10-24 12:39 ` [PATCH v2 for-rc 4/5] RDMA/hns: Use dev_* printings in hem code instead of ibdev_* Junxian Huang
@ 2024-10-24 12:40 ` Junxian Huang
2024-10-30 12:14 ` [PATCH v2 for-rc 0/5] RDMA/hns: Bugfixes Leon Romanovsky
5 siblings, 0 replies; 10+ messages in thread
From: Junxian Huang @ 2024-10-24 12:40 UTC (permalink / raw)
To: jgg, leon
Cc: linux-rdma, linuxarm, linux-kernel, huangjunxian6, tangchengchang
From: wenglianfa <wenglianfa@huawei.com>
During reset, cmd to destroy resources such as qp, cq, and mr may fail,
and error logs will be printed. When a large number of resources are
destroyed, there will be lots of printings, and it may lead to a cpu
stuck.
Delete some unnecessary printings and replace other printing functions
in these paths with the ratelimited version.
Fixes: 9a4435375cd1 ("IB/hns: Add driver files for hns RoCE driver")
Fixes: c7bcb13442e1 ("RDMA/hns: Add SRQ support for hip08 kernel mode")
Fixes: 70f92521584f ("RDMA/hns: Use the reserved loopback QPs to free MR before destroying MPT")
Fixes: 926a01dc000d ("RDMA/hns: Add QP operations support for hip08 SoC")
Signed-off-by: wenglianfa <wenglianfa@huawei.com>
Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
---
drivers/infiniband/hw/hns/hns_roce_cq.c | 4 +-
drivers/infiniband/hw/hns/hns_roce_hem.c | 4 +-
drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 73 ++++++++++------------
drivers/infiniband/hw/hns/hns_roce_mr.c | 4 +-
drivers/infiniband/hw/hns/hns_roce_srq.c | 4 +-
5 files changed, 41 insertions(+), 48 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c
index 4ec66611a143..4106423a1b39 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_cq.c
@@ -179,8 +179,8 @@ static void free_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
ret = hns_roce_destroy_hw_ctx(hr_dev, HNS_ROCE_CMD_DESTROY_CQC,
hr_cq->cqn);
if (ret)
- dev_err(dev, "DESTROY_CQ failed (%d) for CQN %06lx\n", ret,
- hr_cq->cqn);
+ dev_err_ratelimited(dev, "DESTROY_CQ failed (%d) for CQN %06lx\n",
+ ret, hr_cq->cqn);
xa_erase_irq(&cq_table->array, hr_cq->cqn);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c
index ee5d2c1bb5ca..f84521be3bea 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hem.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hem.c
@@ -672,8 +672,8 @@ void hns_roce_table_put(struct hns_roce_dev *hr_dev,
ret = hr_dev->hw->clear_hem(hr_dev, table, obj, HEM_HOP_STEP_DIRECT);
if (ret)
- dev_warn(dev, "failed to clear HEM base address, ret = %d.\n",
- ret);
+ dev_warn_ratelimited(dev, "failed to clear HEM base address, ret = %d.\n",
+ ret);
hns_roce_free_hem(hr_dev, table->hem[i]);
table->hem[i] = NULL;
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index aa42c5a9b254..b6a0498a7b03 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -373,19 +373,12 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr,
static int check_send_valid(struct hns_roce_dev *hr_dev,
struct hns_roce_qp *hr_qp)
{
- struct ib_device *ibdev = &hr_dev->ib_dev;
-
if (unlikely(hr_qp->state == IB_QPS_RESET ||
hr_qp->state == IB_QPS_INIT ||
- hr_qp->state == IB_QPS_RTR)) {
- ibdev_err(ibdev, "failed to post WQE, QP state %u!\n",
- hr_qp->state);
+ hr_qp->state == IB_QPS_RTR))
return -EINVAL;
- } else if (unlikely(hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN)) {
- ibdev_err(ibdev, "failed to post WQE, dev state %d!\n",
- hr_dev->state);
+ else if (unlikely(hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN))
return -EIO;
- }
return 0;
}
@@ -2775,8 +2768,8 @@ static int free_mr_modify_rsv_qp(struct hns_roce_dev *hr_dev,
ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, attr, mask, IB_QPS_INIT,
IB_QPS_INIT, NULL);
if (ret) {
- ibdev_err(ibdev, "failed to modify qp to init, ret = %d.\n",
- ret);
+ ibdev_err_ratelimited(ibdev, "failed to modify qp to init, ret = %d.\n",
+ ret);
return ret;
}
@@ -3421,8 +3414,8 @@ static int free_mr_post_send_lp_wqe(struct hns_roce_qp *hr_qp)
ret = hns_roce_v2_post_send(&hr_qp->ibqp, send_wr, &bad_wr);
if (ret) {
- ibdev_err(ibdev, "failed to post wqe for free mr, ret = %d.\n",
- ret);
+ ibdev_err_ratelimited(ibdev, "failed to post wqe for free mr, ret = %d.\n",
+ ret);
return ret;
}
@@ -3461,9 +3454,9 @@ static void free_mr_send_cmd_to_hw(struct hns_roce_dev *hr_dev)
ret = free_mr_post_send_lp_wqe(hr_qp);
if (ret) {
- ibdev_err(ibdev,
- "failed to send wqe (qp:0x%lx) for free mr, ret = %d.\n",
- hr_qp->qpn, ret);
+ ibdev_err_ratelimited(ibdev,
+ "failed to send wqe (qp:0x%lx) for free mr, ret = %d.\n",
+ hr_qp->qpn, ret);
break;
}
@@ -3474,16 +3467,16 @@ static void free_mr_send_cmd_to_hw(struct hns_roce_dev *hr_dev)
while (cqe_cnt) {
npolled = hns_roce_v2_poll_cq(&free_mr->rsv_cq->ib_cq, cqe_cnt, wc);
if (npolled < 0) {
- ibdev_err(ibdev,
- "failed to poll cqe for free mr, remain %d cqe.\n",
- cqe_cnt);
+ ibdev_err_ratelimited(ibdev,
+ "failed to poll cqe for free mr, remain %d cqe.\n",
+ cqe_cnt);
goto out;
}
if (time_after(jiffies, end)) {
- ibdev_err(ibdev,
- "failed to poll cqe for free mr and timeout, remain %d cqe.\n",
- cqe_cnt);
+ ibdev_err_ratelimited(ibdev,
+ "failed to poll cqe for free mr and timeout, remain %d cqe.\n",
+ cqe_cnt);
goto out;
}
cqe_cnt -= npolled;
@@ -5061,10 +5054,8 @@ static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp,
struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
int ret = 0;
- if (!check_qp_state(cur_state, new_state)) {
- ibdev_err(&hr_dev->ib_dev, "Illegal state for QP!\n");
+ if (!check_qp_state(cur_state, new_state))
return -EINVAL;
- }
if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
memset(qpc_mask, 0, hr_dev->caps.qpc_sz);
@@ -5325,7 +5316,7 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
/* SW pass context to HW */
ret = hns_roce_v2_qp_modify(hr_dev, context, qpc_mask, hr_qp);
if (ret) {
- ibdev_err(ibdev, "failed to modify QP, ret = %d.\n", ret);
+ ibdev_err_ratelimited(ibdev, "failed to modify QP, ret = %d.\n", ret);
goto out;
}
@@ -5463,7 +5454,9 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
ret = hns_roce_v2_query_qpc(hr_dev, hr_qp->qpn, &context);
if (ret) {
- ibdev_err(ibdev, "failed to query QPC, ret = %d.\n", ret);
+ ibdev_err_ratelimited(ibdev,
+ "failed to query QPC, ret = %d.\n",
+ ret);
ret = -EINVAL;
goto out;
}
@@ -5471,7 +5464,7 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
state = hr_reg_read(&context, QPC_QP_ST);
tmp_qp_state = to_ib_qp_st((enum hns_roce_v2_qp_state)state);
if (tmp_qp_state == -1) {
- ibdev_err(ibdev, "Illegal ib_qp_state\n");
+ ibdev_err_ratelimited(ibdev, "Illegal ib_qp_state\n");
ret = -EINVAL;
goto out;
}
@@ -5564,9 +5557,9 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, NULL, 0,
hr_qp->state, IB_QPS_RESET, udata);
if (ret)
- ibdev_err(ibdev,
- "failed to modify QP to RST, ret = %d.\n",
- ret);
+ ibdev_err_ratelimited(ibdev,
+ "failed to modify QP to RST, ret = %d.\n",
+ ret);
}
send_cq = hr_qp->ibqp.send_cq ? to_hr_cq(hr_qp->ibqp.send_cq) : NULL;
@@ -5609,9 +5602,9 @@ int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
ret = hns_roce_v2_destroy_qp_common(hr_dev, hr_qp, udata);
if (ret)
- ibdev_err(&hr_dev->ib_dev,
- "failed to destroy QP, QPN = 0x%06lx, ret = %d.\n",
- hr_qp->qpn, ret);
+ ibdev_err_ratelimited(&hr_dev->ib_dev,
+ "failed to destroy QP, QPN = 0x%06lx, ret = %d.\n",
+ hr_qp->qpn, ret);
hns_roce_qp_destroy(hr_dev, hr_qp, udata);
@@ -5905,9 +5898,9 @@ static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
HNS_ROCE_CMD_MODIFY_CQC, hr_cq->cqn);
hns_roce_free_cmd_mailbox(hr_dev, mailbox);
if (ret)
- ibdev_err(&hr_dev->ib_dev,
- "failed to process cmd when modifying CQ, ret = %d.\n",
- ret);
+ ibdev_err_ratelimited(&hr_dev->ib_dev,
+ "failed to process cmd when modifying CQ, ret = %d.\n",
+ ret);
err_out:
if (ret)
@@ -5931,9 +5924,9 @@ static int hns_roce_v2_query_cqc(struct hns_roce_dev *hr_dev, u32 cqn,
ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma,
HNS_ROCE_CMD_QUERY_CQC, cqn);
if (ret) {
- ibdev_err(&hr_dev->ib_dev,
- "failed to process cmd when querying CQ, ret = %d.\n",
- ret);
+ ibdev_err_ratelimited(&hr_dev->ib_dev,
+ "failed to process cmd when querying CQ, ret = %d.\n",
+ ret);
goto err_mailbox;
}
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index 846da8c78b8b..b3f4327d0e64 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -138,8 +138,8 @@ static void hns_roce_mr_free(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr
key_to_hw_index(mr->key) &
(hr_dev->caps.num_mtpts - 1));
if (ret)
- ibdev_warn(ibdev, "failed to destroy mpt, ret = %d.\n",
- ret);
+ ibdev_warn_ratelimited(ibdev, "failed to destroy mpt, ret = %d.\n",
+ ret);
}
free_mr_pbl(hr_dev, mr);
diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c
index c9b8233f4b05..70c06ef65603 100644
--- a/drivers/infiniband/hw/hns/hns_roce_srq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_srq.c
@@ -151,8 +151,8 @@ static void free_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq)
ret = hns_roce_destroy_hw_ctx(hr_dev, HNS_ROCE_CMD_DESTROY_SRQ,
srq->srqn);
if (ret)
- dev_err(hr_dev->dev, "DESTROY_SRQ failed (%d) for SRQN %06lx\n",
- ret, srq->srqn);
+ dev_err_ratelimited(hr_dev->dev, "DESTROY_SRQ failed (%d) for SRQN %06lx\n",
+ ret, srq->srqn);
xa_erase_irq(&srq_table->xa, srq->srqn);
--
2.33.0
^ permalink raw reply related [flat|nested] 10+ messages in thread* Re: [PATCH v2 for-rc 0/5] RDMA/hns: Bugfixes
2024-10-24 12:39 [PATCH v2 for-rc 0/5] RDMA/hns: Bugfixes Junxian Huang
` (4 preceding siblings ...)
2024-10-24 12:40 ` [PATCH v2 for-rc 5/5] RDMA/hns: Fix cpu stuck caused by printings during reset Junxian Huang
@ 2024-10-30 12:14 ` Leon Romanovsky
5 siblings, 0 replies; 10+ messages in thread
From: Leon Romanovsky @ 2024-10-30 12:14 UTC (permalink / raw)
To: jgg, Junxian Huang; +Cc: linux-rdma, linuxarm, linux-kernel, tangchengchang
On Thu, 24 Oct 2024 20:39:55 +0800, Junxian Huang wrote:
> Some hns bugfixes.
> Patch #5 has been sent once before:
> https://lore.kernel.org/lkml/4c202653-1ad7-d885-55b7-07c77a549b09@hisilicon.com/T/#m05883778af8e39438d864e9c0fb9062aa09f362c
>
> v1 -> v2:
> * Add spin_lock_init() for the newly-added flush_lock in patch #2.
>
> [...]
Applied, thanks!
[1/5] RDMA/hns: Fix an AEQE overflow error caused by untimely update of eq_db_ci
https://git.kernel.org/rdma/rdma/c/571e4ab8a45e53
[2/5] RDMA/hns: Fix flush cqe error when racing with destroy qp
https://git.kernel.org/rdma/rdma/c/377a2097705b91
[3/5] RDMA/hns: Modify debugfs name
https://git.kernel.org/rdma/rdma/c/370a9351bf84af
[4/5] RDMA/hns: Use dev_* printings in hem code instead of ibdev_*
https://git.kernel.org/rdma/rdma/c/d81fb6511abf18
[5/5] RDMA/hns: Fix cpu stuck caused by printings during reset
https://git.kernel.org/rdma/rdma/c/323275ac2ff15b
Best regards,
--
Leon Romanovsky <leon@kernel.org>
^ permalink raw reply [flat|nested] 10+ messages in thread