* [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism
@ 2025-02-17 7:01 Junxian Huang
2025-02-17 7:01 ` [PATCH for-next 1/4] RDMA/hns: Change mtr member to pointer in hns QP/CQ/MR/SRQ/EQ struct Junxian Huang
` (5 more replies)
0 siblings, 6 replies; 23+ messages in thread
From: Junxian Huang @ 2025-02-17 7:01 UTC (permalink / raw)
To: jgg, leon; +Cc: linux-rdma, linuxarm, huangjunxian6, tangchengchang
When mailboxes for resource (QP/CQ/SRQ) destruction fail, the driver is unable
to notify HW about the destruction. In this case, driver will still
free the resources, while HW may still access them, thus leading to
a UAF.
This series introduces delay-destruction mechanism to fix such HW UAF,
including the HW CTX and doorbells.
Junxian Huang (2):
RDMA/hns: Change mtr member to pointer in hns QP/CQ/MR/SRQ/EQ struct
RDMA/hns: Fix HW doorbell UAF by adding delay-destruction mechanism
wenglianfa (2):
RDMA/hns: Fix HW CTX UAF by adding delay-destruction mechanism
Revert "RDMA/hns: Do not destroy QP resources in the hw resetting
phase"
drivers/infiniband/hw/hns/hns_roce_cq.c | 34 +++--
drivers/infiniband/hw/hns/hns_roce_db.c | 91 ++++++++++----
drivers/infiniband/hw/hns/hns_roce_device.h | 73 ++++++++---
drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 97 +++++++--------
drivers/infiniband/hw/hns/hns_roce_main.c | 13 ++
drivers/infiniband/hw/hns/hns_roce_mr.c | 117 ++++++++++++++----
drivers/infiniband/hw/hns/hns_roce_qp.c | 30 +++--
drivers/infiniband/hw/hns/hns_roce_restrack.c | 4 +-
drivers/infiniband/hw/hns/hns_roce_srq.c | 45 ++++---
9 files changed, 348 insertions(+), 156 deletions(-)
--
2.33.0
^ permalink raw reply [flat|nested] 23+ messages in thread
* [PATCH for-next 1/4] RDMA/hns: Change mtr member to pointer in hns QP/CQ/MR/SRQ/EQ struct
2025-02-17 7:01 [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism Junxian Huang
@ 2025-02-17 7:01 ` Junxian Huang
2025-02-17 7:01 ` [PATCH for-next 2/4] RDMA/hns: Fix HW CTX UAF by adding delay-destruction mechanism Junxian Huang
` (4 subsequent siblings)
5 siblings, 0 replies; 23+ messages in thread
From: Junxian Huang @ 2025-02-17 7:01 UTC (permalink / raw)
To: jgg, leon; +Cc: linux-rdma, linuxarm, huangjunxian6, tangchengchang
Change mtr member to pointer in hns QP/CQ/MR/SRQ/EQ struct to decouple
the life cycle of mtr from these structs. This is the preparation for
delay-destruction mechanism. No functional changes.
Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
Signed-off-by: wenglianfa <wenglianfa@huawei.com>
---
drivers/infiniband/hw/hns/hns_roce_cq.c | 18 ++--
drivers/infiniband/hw/hns/hns_roce_device.h | 21 ++---
drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 82 ++++++++++---------
drivers/infiniband/hw/hns/hns_roce_mr.c | 61 ++++++++------
drivers/infiniband/hw/hns/hns_roce_qp.c | 13 +--
drivers/infiniband/hw/hns/hns_roce_restrack.c | 4 +-
drivers/infiniband/hw/hns/hns_roce_srq.c | 25 +++---
7 files changed, 122 insertions(+), 102 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c
index 4106423a1b39..236ee3fefe16 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_cq.c
@@ -135,7 +135,7 @@ static int alloc_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
u64 mtts[MTT_MIN_COUNT] = {};
int ret;
- ret = hns_roce_mtr_find(hr_dev, &hr_cq->mtr, 0, mtts, ARRAY_SIZE(mtts));
+ ret = hns_roce_mtr_find(hr_dev, hr_cq->mtr, 0, mtts, ARRAY_SIZE(mtts));
if (ret) {
ibdev_err(ibdev, "failed to find CQ mtr, ret = %d.\n", ret);
return ret;
@@ -156,7 +156,7 @@ static int alloc_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
}
ret = hns_roce_create_cqc(hr_dev, hr_cq, mtts,
- hns_roce_get_mtr_ba(&hr_cq->mtr));
+ hns_roce_get_mtr_ba(hr_cq->mtr));
if (ret)
goto err_xa;
@@ -200,25 +200,27 @@ static int alloc_cq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq,
{
struct ib_device *ibdev = &hr_dev->ib_dev;
struct hns_roce_buf_attr buf_attr = {};
- int ret;
+ int ret = 0;
buf_attr.page_shift = hr_dev->caps.cqe_buf_pg_sz + PAGE_SHIFT;
buf_attr.region[0].size = hr_cq->cq_depth * hr_cq->cqe_size;
buf_attr.region[0].hopnum = hr_dev->caps.cqe_hop_num;
buf_attr.region_count = 1;
- ret = hns_roce_mtr_create(hr_dev, &hr_cq->mtr, &buf_attr,
- hr_dev->caps.cqe_ba_pg_sz + PAGE_SHIFT,
- udata, addr);
- if (ret)
+ hr_cq->mtr = hns_roce_mtr_create(hr_dev, &buf_attr,
+ hr_dev->caps.cqe_ba_pg_sz + PAGE_SHIFT,
+ udata, addr);
+ if (IS_ERR(hr_cq->mtr)) {
+ ret = PTR_ERR(hr_cq->mtr);
ibdev_err(ibdev, "failed to alloc CQ mtr, ret = %d.\n", ret);
+ }
return ret;
}
static void free_cq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
{
- hns_roce_mtr_destroy(hr_dev, &hr_cq->mtr);
+ hns_roce_mtr_destroy(hr_dev, hr_cq->mtr);
}
static int alloc_cq_db(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq,
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 560a1d9de408..ed0fa29f0cff 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -336,7 +336,7 @@ struct hns_roce_mr {
int enabled; /* MR's active status */
int type; /* MR's register type */
u32 pbl_hop_num; /* multi-hop number */
- struct hns_roce_mtr pbl_mtr;
+ struct hns_roce_mtr *pbl_mtr;
u32 npages;
dma_addr_t *page_list;
};
@@ -424,7 +424,7 @@ struct hns_roce_db {
struct hns_roce_cq {
struct ib_cq ib_cq;
- struct hns_roce_mtr mtr;
+ struct hns_roce_mtr *mtr;
struct hns_roce_db db;
u32 flags;
spinlock_t lock;
@@ -445,7 +445,7 @@ struct hns_roce_cq {
};
struct hns_roce_idx_que {
- struct hns_roce_mtr mtr;
+ struct hns_roce_mtr *mtr;
u32 entry_shift;
unsigned long *bitmap;
u32 head;
@@ -466,7 +466,7 @@ struct hns_roce_srq {
refcount_t refcount;
struct completion free;
- struct hns_roce_mtr buf_mtr;
+ struct hns_roce_mtr *buf_mtr;
u64 *wrid;
struct hns_roce_idx_que idx_que;
@@ -614,7 +614,7 @@ struct hns_roce_qp {
enum ib_sig_type sq_signal_bits;
struct hns_roce_wq sq;
- struct hns_roce_mtr mtr;
+ struct hns_roce_mtr *mtr;
u32 buff_size;
struct mutex mutex;
@@ -712,7 +712,7 @@ struct hns_roce_eq {
int coalesce;
int arm_st;
int hop_num;
- struct hns_roce_mtr mtr;
+ struct hns_roce_mtr *mtr;
u16 eq_max_cnt;
u32 eq_period;
int shift;
@@ -1179,10 +1179,11 @@ static inline dma_addr_t hns_roce_get_mtr_ba(struct hns_roce_mtr *mtr)
int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
u32 offset, u64 *mtt_buf, int mtt_max);
-int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
- struct hns_roce_buf_attr *buf_attr,
- unsigned int page_shift, struct ib_udata *udata,
- unsigned long user_addr);
+struct hns_roce_mtr *hns_roce_mtr_create(struct hns_roce_dev *hr_dev,
+ struct hns_roce_buf_attr *buf_attr,
+ unsigned int ba_page_shift,
+ struct ib_udata *udata,
+ unsigned long user_addr);
void hns_roce_mtr_destroy(struct hns_roce_dev *hr_dev,
struct hns_roce_mtr *mtr);
int hns_roce_mtr_map(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index dded339802b3..d60ca0a306e9 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -150,7 +150,7 @@ static void set_frmr_seg(struct hns_roce_v2_rc_send_wqe *rc_sq_wqe,
hr_reg_write_bool(fseg, FRMR_LW, wr->access & IB_ACCESS_LOCAL_WRITE);
/* Data structure reuse may lead to confusion */
- pbl_ba = mr->pbl_mtr.hem_cfg.root_ba;
+ pbl_ba = mr->pbl_mtr->hem_cfg.root_ba;
rc_sq_wqe->msg_len = cpu_to_le32(lower_32_bits(pbl_ba));
rc_sq_wqe->inv_key = cpu_to_le32(upper_32_bits(pbl_ba));
@@ -161,7 +161,7 @@ static void set_frmr_seg(struct hns_roce_v2_rc_send_wqe *rc_sq_wqe,
hr_reg_write(fseg, FRMR_PBL_SIZE, mr->npages);
hr_reg_write(fseg, FRMR_PBL_BUF_PG_SZ,
- to_hr_hw_page_shift(mr->pbl_mtr.hem_cfg.buf_pg_shift));
+ to_hr_hw_page_shift(mr->pbl_mtr->hem_cfg.buf_pg_shift));
hr_reg_clear(fseg, FRMR_BLK_MODE);
}
@@ -864,12 +864,12 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp,
static void *get_srq_wqe_buf(struct hns_roce_srq *srq, u32 n)
{
- return hns_roce_buf_offset(srq->buf_mtr.kmem, n << srq->wqe_shift);
+ return hns_roce_buf_offset(srq->buf_mtr->kmem, n << srq->wqe_shift);
}
static void *get_idx_buf(struct hns_roce_idx_que *idx_que, u32 n)
{
- return hns_roce_buf_offset(idx_que->mtr.kmem,
+ return hns_roce_buf_offset(idx_que->mtr->kmem,
n << idx_que->entry_shift);
}
@@ -3227,7 +3227,7 @@ static int set_mtpt_pbl(struct hns_roce_dev *hr_dev,
int ret;
int i;
- ret = hns_roce_mtr_find(hr_dev, &mr->pbl_mtr, 0, pages,
+ ret = hns_roce_mtr_find(hr_dev, mr->pbl_mtr, 0, pages,
min_t(int, ARRAY_SIZE(pages), mr->npages));
if (ret) {
ibdev_err(ibdev, "failed to find PBL mtr, ret = %d.\n", ret);
@@ -3238,7 +3238,7 @@ static int set_mtpt_pbl(struct hns_roce_dev *hr_dev,
for (i = 0; i < ARRAY_SIZE(pages); i++)
pages[i] >>= MPT_PBL_BUF_ADDR_S;
- pbl_ba = hns_roce_get_mtr_ba(&mr->pbl_mtr);
+ pbl_ba = hns_roce_get_mtr_ba(mr->pbl_mtr);
mpt_entry->pbl_size = cpu_to_le32(mr->npages);
mpt_entry->pbl_ba_l = cpu_to_le32(pbl_ba >> MPT_PBL_BA_ADDR_S);
@@ -3251,7 +3251,7 @@ static int set_mtpt_pbl(struct hns_roce_dev *hr_dev,
mpt_entry->pa1_l = cpu_to_le32(lower_32_bits(pages[1]));
hr_reg_write(mpt_entry, MPT_PA1_H, upper_32_bits(pages[1]));
hr_reg_write(mpt_entry, MPT_PBL_BUF_PG_SZ,
- to_hr_hw_page_shift(mr->pbl_mtr.hem_cfg.buf_pg_shift));
+ to_hr_hw_page_shift(mr->pbl_mtr->hem_cfg.buf_pg_shift));
return 0;
}
@@ -3294,7 +3294,7 @@ static int hns_roce_v2_write_mtpt(struct hns_roce_dev *hr_dev,
hr_reg_write(mpt_entry, MPT_PBL_HOP_NUM, mr->pbl_hop_num);
hr_reg_write(mpt_entry, MPT_PBL_BA_PG_SZ,
- to_hr_hw_page_shift(mr->pbl_mtr.hem_cfg.ba_pg_shift));
+ to_hr_hw_page_shift(mr->pbl_mtr->hem_cfg.ba_pg_shift));
hr_reg_enable(mpt_entry, MPT_INNER_PA_VLD);
return set_mtpt_pbl(hr_dev, mpt_entry, mr);
@@ -3338,7 +3338,7 @@ static int hns_roce_v2_rereg_write_mtpt(struct hns_roce_dev *hr_dev,
static int hns_roce_v2_frmr_write_mtpt(void *mb_buf, struct hns_roce_mr *mr)
{
- dma_addr_t pbl_ba = hns_roce_get_mtr_ba(&mr->pbl_mtr);
+ dma_addr_t pbl_ba = hns_roce_get_mtr_ba(mr->pbl_mtr);
struct hns_roce_v2_mpt_entry *mpt_entry;
mpt_entry = mb_buf;
@@ -3357,9 +3357,9 @@ static int hns_roce_v2_frmr_write_mtpt(void *mb_buf, struct hns_roce_mr *mr)
hr_reg_write(mpt_entry, MPT_PBL_HOP_NUM, 1);
hr_reg_write(mpt_entry, MPT_PBL_BA_PG_SZ,
- to_hr_hw_page_shift(mr->pbl_mtr.hem_cfg.ba_pg_shift));
+ to_hr_hw_page_shift(mr->pbl_mtr->hem_cfg.ba_pg_shift));
hr_reg_write(mpt_entry, MPT_PBL_BUF_PG_SZ,
- to_hr_hw_page_shift(mr->pbl_mtr.hem_cfg.buf_pg_shift));
+ to_hr_hw_page_shift(mr->pbl_mtr->hem_cfg.buf_pg_shift));
mpt_entry->pbl_size = cpu_to_le32(mr->npages);
@@ -3497,7 +3497,7 @@ static void hns_roce_v2_dereg_mr(struct hns_roce_dev *hr_dev)
static void *get_cqe_v2(struct hns_roce_cq *hr_cq, int n)
{
- return hns_roce_buf_offset(hr_cq->mtr.kmem, n * hr_cq->cqe_size);
+ return hns_roce_buf_offset(hr_cq->mtr->kmem, n * hr_cq->cqe_size);
}
static void *get_sw_cqe_v2(struct hns_roce_cq *hr_cq, unsigned int n)
@@ -3609,9 +3609,9 @@ static void hns_roce_v2_write_cqc(struct hns_roce_dev *hr_dev,
hr_reg_write(cq_context, CQC_CQE_NEX_BLK_ADDR_H,
upper_32_bits(to_hr_hw_page_addr(mtts[1])));
hr_reg_write(cq_context, CQC_CQE_BAR_PG_SZ,
- to_hr_hw_page_shift(hr_cq->mtr.hem_cfg.ba_pg_shift));
+ to_hr_hw_page_shift(hr_cq->mtr->hem_cfg.ba_pg_shift));
hr_reg_write(cq_context, CQC_CQE_BUF_PG_SZ,
- to_hr_hw_page_shift(hr_cq->mtr.hem_cfg.buf_pg_shift));
+ to_hr_hw_page_shift(hr_cq->mtr->hem_cfg.buf_pg_shift));
hr_reg_write(cq_context, CQC_CQE_BA_L, dma_handle >> CQC_CQE_BA_L_S);
hr_reg_write(cq_context, CQC_CQE_BA_H, dma_handle >> CQC_CQE_BA_H_S);
hr_reg_write_bool(cq_context, CQC_DB_RECORD_EN,
@@ -4368,7 +4368,7 @@ static int config_qp_rq_buf(struct hns_roce_dev *hr_dev,
int ret;
/* Search qp buf's mtts */
- ret = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, hr_qp->rq.offset, mtts,
+ ret = hns_roce_mtr_find(hr_dev, hr_qp->mtr, hr_qp->rq.offset, mtts,
MTT_MIN_COUNT);
if (hr_qp->rq.wqe_cnt && ret) {
ibdev_err(&hr_dev->ib_dev,
@@ -4377,7 +4377,7 @@ static int config_qp_rq_buf(struct hns_roce_dev *hr_dev,
return ret;
}
- wqe_sge_ba = hns_roce_get_mtr_ba(&hr_qp->mtr);
+ wqe_sge_ba = hns_roce_get_mtr_ba(hr_qp->mtr);
context->wqe_sge_ba = cpu_to_le32(wqe_sge_ba >> 3);
qpc_mask->wqe_sge_ba = 0;
@@ -4408,11 +4408,11 @@ static int config_qp_rq_buf(struct hns_roce_dev *hr_dev,
hr_reg_clear(qpc_mask, QPC_RQ_HOP_NUM);
hr_reg_write(context, QPC_WQE_SGE_BA_PG_SZ,
- to_hr_hw_page_shift(hr_qp->mtr.hem_cfg.ba_pg_shift));
+ to_hr_hw_page_shift(hr_qp->mtr->hem_cfg.ba_pg_shift));
hr_reg_clear(qpc_mask, QPC_WQE_SGE_BA_PG_SZ);
hr_reg_write(context, QPC_WQE_SGE_BUF_PG_SZ,
- to_hr_hw_page_shift(hr_qp->mtr.hem_cfg.buf_pg_shift));
+ to_hr_hw_page_shift(hr_qp->mtr->hem_cfg.buf_pg_shift));
hr_reg_clear(qpc_mask, QPC_WQE_SGE_BUF_PG_SZ);
context->rq_cur_blk_addr = cpu_to_le32(to_hr_hw_page_addr(mtts[0]));
@@ -4445,7 +4445,7 @@ static int config_qp_sq_buf(struct hns_roce_dev *hr_dev,
int ret;
/* search qp buf's mtts */
- ret = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, hr_qp->sq.offset,
+ ret = hns_roce_mtr_find(hr_dev, hr_qp->mtr, hr_qp->sq.offset,
&sq_cur_blk, 1);
if (ret) {
ibdev_err(ibdev, "failed to find QP(0x%lx) SQ WQE buf, ret = %d.\n",
@@ -4453,7 +4453,7 @@ static int config_qp_sq_buf(struct hns_roce_dev *hr_dev,
return ret;
}
if (hr_qp->sge.sge_cnt > 0) {
- ret = hns_roce_mtr_find(hr_dev, &hr_qp->mtr,
+ ret = hns_roce_mtr_find(hr_dev, hr_qp->mtr,
hr_qp->sge.offset, &sge_cur_blk, 1);
if (ret) {
ibdev_err(ibdev, "failed to find QP(0x%lx) SGE buf, ret = %d.\n",
@@ -5734,7 +5734,7 @@ static int hns_roce_v2_write_srqc_index_queue(struct hns_roce_srq *srq,
int ret;
/* Get physical address of idx que buf */
- ret = hns_roce_mtr_find(hr_dev, &idx_que->mtr, 0, mtts_idx,
+ ret = hns_roce_mtr_find(hr_dev, idx_que->mtr, 0, mtts_idx,
ARRAY_SIZE(mtts_idx));
if (ret) {
ibdev_err(ibdev, "failed to find mtr for SRQ idx, ret = %d.\n",
@@ -5742,7 +5742,7 @@ static int hns_roce_v2_write_srqc_index_queue(struct hns_roce_srq *srq,
return ret;
}
- dma_handle_idx = hns_roce_get_mtr_ba(&idx_que->mtr);
+ dma_handle_idx = hns_roce_get_mtr_ba(idx_que->mtr);
hr_reg_write(ctx, SRQC_IDX_HOP_NUM,
to_hr_hem_hopnum(hr_dev->caps.idx_hop_num, srq->wqe_cnt));
@@ -5752,9 +5752,9 @@ static int hns_roce_v2_write_srqc_index_queue(struct hns_roce_srq *srq,
upper_32_bits(dma_handle_idx >> DMA_IDX_SHIFT));
hr_reg_write(ctx, SRQC_IDX_BA_PG_SZ,
- to_hr_hw_page_shift(idx_que->mtr.hem_cfg.ba_pg_shift));
+ to_hr_hw_page_shift(idx_que->mtr->hem_cfg.ba_pg_shift));
hr_reg_write(ctx, SRQC_IDX_BUF_PG_SZ,
- to_hr_hw_page_shift(idx_que->mtr.hem_cfg.buf_pg_shift));
+ to_hr_hw_page_shift(idx_que->mtr->hem_cfg.buf_pg_shift));
hr_reg_write(ctx, SRQC_IDX_CUR_BLK_ADDR_L,
to_hr_hw_page_addr(mtts_idx[0]));
@@ -5781,7 +5781,7 @@ static int hns_roce_v2_write_srqc(struct hns_roce_srq *srq, void *mb_buf)
memset(ctx, 0, sizeof(*ctx));
/* Get the physical address of srq buf */
- ret = hns_roce_mtr_find(hr_dev, &srq->buf_mtr, 0, mtts_wqe,
+ ret = hns_roce_mtr_find(hr_dev, srq->buf_mtr, 0, mtts_wqe,
ARRAY_SIZE(mtts_wqe));
if (ret) {
ibdev_err(ibdev, "failed to find mtr for SRQ WQE, ret = %d.\n",
@@ -5789,7 +5789,7 @@ static int hns_roce_v2_write_srqc(struct hns_roce_srq *srq, void *mb_buf)
return ret;
}
- dma_handle_wqe = hns_roce_get_mtr_ba(&srq->buf_mtr);
+ dma_handle_wqe = hns_roce_get_mtr_ba(srq->buf_mtr);
hr_reg_write(ctx, SRQC_SRQ_ST, 1);
hr_reg_write_bool(ctx, SRQC_SRQ_TYPE,
@@ -5811,9 +5811,9 @@ static int hns_roce_v2_write_srqc(struct hns_roce_srq *srq, void *mb_buf)
upper_32_bits(dma_handle_wqe >> DMA_WQE_SHIFT));
hr_reg_write(ctx, SRQC_WQE_BA_PG_SZ,
- to_hr_hw_page_shift(srq->buf_mtr.hem_cfg.ba_pg_shift));
+ to_hr_hw_page_shift(srq->buf_mtr->hem_cfg.ba_pg_shift));
hr_reg_write(ctx, SRQC_WQE_BUF_PG_SZ,
- to_hr_hw_page_shift(srq->buf_mtr.hem_cfg.buf_pg_shift));
+ to_hr_hw_page_shift(srq->buf_mtr->hem_cfg.buf_pg_shift));
if (srq->cap_flags & HNS_ROCE_SRQ_CAP_RECORD_DB) {
hr_reg_enable(ctx, SRQC_DB_RECORD_EN);
@@ -6166,7 +6166,7 @@ static struct hns_roce_aeqe *next_aeqe_sw_v2(struct hns_roce_eq *eq)
{
struct hns_roce_aeqe *aeqe;
- aeqe = hns_roce_buf_offset(eq->mtr.kmem,
+ aeqe = hns_roce_buf_offset(eq->mtr->kmem,
(eq->cons_index & (eq->entries - 1)) *
eq->eqe_size);
@@ -6234,7 +6234,7 @@ static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq)
{
struct hns_roce_ceqe *ceqe;
- ceqe = hns_roce_buf_offset(eq->mtr.kmem,
+ ceqe = hns_roce_buf_offset(eq->mtr->kmem,
(eq->cons_index & (eq->entries - 1)) *
eq->eqe_size);
@@ -6474,7 +6474,7 @@ static void hns_roce_v2_int_mask_enable(struct hns_roce_dev *hr_dev,
static void free_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq)
{
- hns_roce_mtr_destroy(hr_dev, &eq->mtr);
+ hns_roce_mtr_destroy(hr_dev, eq->mtr);
}
static void hns_roce_v2_destroy_eqc(struct hns_roce_dev *hr_dev,
@@ -6521,14 +6521,14 @@ static int config_eqc(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq,
init_eq_config(hr_dev, eq);
/* if not multi-hop, eqe buffer only use one trunk */
- ret = hns_roce_mtr_find(hr_dev, &eq->mtr, 0, eqe_ba,
+ ret = hns_roce_mtr_find(hr_dev, eq->mtr, 0, eqe_ba,
ARRAY_SIZE(eqe_ba));
if (ret) {
dev_err(hr_dev->dev, "failed to find EQE mtr, ret = %d\n", ret);
return ret;
}
- bt_ba = hns_roce_get_mtr_ba(&eq->mtr);
+ bt_ba = hns_roce_get_mtr_ba(eq->mtr);
hr_reg_write(eqc, EQC_EQ_ST, HNS_ROCE_V2_EQ_STATE_VALID);
hr_reg_write(eqc, EQC_EQE_HOP_NUM, eq->hop_num);
@@ -6538,9 +6538,9 @@ static int config_eqc(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq,
hr_reg_write(eqc, EQC_EQN, eq->eqn);
hr_reg_write(eqc, EQC_EQE_CNT, HNS_ROCE_EQ_INIT_EQE_CNT);
hr_reg_write(eqc, EQC_EQE_BA_PG_SZ,
- to_hr_hw_page_shift(eq->mtr.hem_cfg.ba_pg_shift));
+ to_hr_hw_page_shift(eq->mtr->hem_cfg.ba_pg_shift));
hr_reg_write(eqc, EQC_EQE_BUF_PG_SZ,
- to_hr_hw_page_shift(eq->mtr.hem_cfg.buf_pg_shift));
+ to_hr_hw_page_shift(eq->mtr->hem_cfg.buf_pg_shift));
hr_reg_write(eqc, EQC_EQ_PROD_INDX, HNS_ROCE_EQ_INIT_PROD_IDX);
hr_reg_write(eqc, EQC_EQ_MAX_CNT, eq->eq_max_cnt);
@@ -6573,7 +6573,7 @@ static int config_eqc(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq,
static int alloc_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq)
{
struct hns_roce_buf_attr buf_attr = {};
- int err;
+ int err = 0;
if (hr_dev->caps.eqe_hop_num == HNS_ROCE_HOP_NUM_0)
eq->hop_num = 0;
@@ -6585,11 +6585,13 @@ static int alloc_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq)
buf_attr.region[0].hopnum = eq->hop_num;
buf_attr.region_count = 1;
- err = hns_roce_mtr_create(hr_dev, &eq->mtr, &buf_attr,
- hr_dev->caps.eqe_ba_pg_sz + PAGE_SHIFT, NULL,
- 0);
- if (err)
+ eq->mtr = hns_roce_mtr_create(hr_dev, &buf_attr,
+ hr_dev->caps.eqe_ba_pg_sz + PAGE_SHIFT,
+ NULL, 0);
+ if (IS_ERR(eq->mtr)) {
+ err = PTR_ERR(eq->mtr);
dev_err(hr_dev->dev, "failed to alloc EQE mtr, err %d\n", err);
+ }
return err;
}
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index 55b9283bfc6f..3902243cac96 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -109,23 +109,23 @@ static int alloc_mr_pbl(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr,
buf_attr.adaptive = !is_fast;
buf_attr.type = MTR_PBL;
- err = hns_roce_mtr_create(hr_dev, &mr->pbl_mtr, &buf_attr,
- hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT,
- udata, start);
- if (err) {
+ mr->pbl_mtr = hns_roce_mtr_create(hr_dev, &buf_attr,
+ hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT, udata, start);
+ if (IS_ERR(mr->pbl_mtr)) {
+ err = PTR_ERR(mr->pbl_mtr);
ibdev_err(ibdev, "failed to alloc pbl mtr, ret = %d.\n", err);
return err;
}
- mr->npages = mr->pbl_mtr.hem_cfg.buf_pg_count;
+ mr->npages = mr->pbl_mtr->hem_cfg.buf_pg_count;
mr->pbl_hop_num = buf_attr.region[0].hopnum;
- return err;
+ return 0;
}
static void free_mr_pbl(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr)
{
- hns_roce_mtr_destroy(hr_dev, &mr->pbl_mtr);
+ hns_roce_mtr_destroy(hr_dev, mr->pbl_mtr);
}
static void hns_roce_mr_free(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr)
@@ -196,18 +196,22 @@ struct ib_mr *hns_roce_get_dma_mr(struct ib_pd *pd, int acc)
{
struct hns_roce_dev *hr_dev = to_hr_dev(pd->device);
struct hns_roce_mr *mr;
- int ret;
+ int ret = -ENOMEM;
mr = kzalloc(sizeof(*mr), GFP_KERNEL);
if (!mr)
return ERR_PTR(-ENOMEM);
+ mr->pbl_mtr = kvzalloc(sizeof(*mr->pbl_mtr), GFP_KERNEL);
+ if (!mr->pbl_mtr)
+ goto err_mtr;
+
mr->type = MR_TYPE_DMA;
mr->pd = to_hr_pd(pd)->pdn;
mr->access = acc;
/* Allocate memory region key */
- hns_roce_hem_list_init(&mr->pbl_mtr.hem_list);
+ hns_roce_hem_list_init(&mr->pbl_mtr->hem_list);
ret = alloc_mr_key(hr_dev, mr);
if (ret)
goto err_free;
@@ -223,6 +227,8 @@ struct ib_mr *hns_roce_get_dma_mr(struct ib_pd *pd, int acc)
free_mr_key(hr_dev, mr);
err_free:
+ kvfree(mr->pbl_mtr);
+err_mtr:
kfree(mr);
return ERR_PTR(ret);
}
@@ -426,7 +432,7 @@ static int hns_roce_set_page(struct ib_mr *ibmr, u64 addr)
{
struct hns_roce_mr *mr = to_hr_mr(ibmr);
- if (likely(mr->npages < mr->pbl_mtr.hem_cfg.buf_pg_count)) {
+ if (likely(mr->npages < mr->pbl_mtr->hem_cfg.buf_pg_count)) {
mr->page_list[mr->npages++] = addr;
return 0;
}
@@ -441,7 +447,7 @@ int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
struct hns_roce_dev *hr_dev = to_hr_dev(ibmr->device);
struct ib_device *ibdev = &hr_dev->ib_dev;
struct hns_roce_mr *mr = to_hr_mr(ibmr);
- struct hns_roce_mtr *mtr = &mr->pbl_mtr;
+ struct hns_roce_mtr *mtr = mr->pbl_mtr;
int ret, sg_num = 0;
if (!IS_ALIGNED(sg_offset, HNS_ROCE_FRMR_ALIGN_SIZE) ||
@@ -450,7 +456,7 @@ int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
return sg_num;
mr->npages = 0;
- mr->page_list = kvcalloc(mr->pbl_mtr.hem_cfg.buf_pg_count,
+ mr->page_list = kvcalloc(mr->pbl_mtr->hem_cfg.buf_pg_count,
sizeof(dma_addr_t), GFP_KERNEL);
if (!mr->page_list)
return sg_num;
@@ -458,7 +464,7 @@ int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
sg_num = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset_p, hns_roce_set_page);
if (sg_num < 1) {
ibdev_err(ibdev, "failed to store sg pages %u %u, cnt = %d.\n",
- mr->npages, mr->pbl_mtr.hem_cfg.buf_pg_count, sg_num);
+ mr->npages, mr->pbl_mtr->hem_cfg.buf_pg_count, sg_num);
goto err_page_list;
}
@@ -471,7 +477,7 @@ int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
ibdev_err(ibdev, "failed to map sg mtr, ret = %d.\n", ret);
sg_num = 0;
} else {
- mr->pbl_mtr.hem_cfg.buf_pg_shift = (u32)ilog2(ibmr->page_size);
+ mr->pbl_mtr->hem_cfg.buf_pg_shift = (u32)ilog2(ibmr->page_size);
}
err_page_list:
@@ -1132,20 +1138,25 @@ static void mtr_free_mtt(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr)
* hns_roce_mtr_create - Create hns memory translate region.
*
* @hr_dev: RoCE device struct pointer
- * @mtr: memory translate region
* @buf_attr: buffer attribute for creating mtr
* @ba_page_shift: page shift for multi-hop base address table
* @udata: user space context, if it's NULL, means kernel space
* @user_addr: userspace virtual address to start at
*/
-int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
- struct hns_roce_buf_attr *buf_attr,
- unsigned int ba_page_shift, struct ib_udata *udata,
- unsigned long user_addr)
+struct hns_roce_mtr *hns_roce_mtr_create(struct hns_roce_dev *hr_dev,
+ struct hns_roce_buf_attr *buf_attr,
+ unsigned int ba_page_shift,
+ struct ib_udata *udata,
+ unsigned long user_addr)
{
struct ib_device *ibdev = &hr_dev->ib_dev;
+ struct hns_roce_mtr *mtr;
int ret;
+ mtr = kvzalloc(sizeof(*mtr), GFP_KERNEL);
+ if (!mtr)
+ return ERR_PTR(-ENOMEM);
+
/* The caller has its own buffer list and invokes the hns_roce_mtr_map()
* to finish the MTT configuration.
*/
@@ -1157,7 +1168,7 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
if (ret) {
ibdev_err(ibdev,
"failed to alloc mtr bufs, ret = %d.\n", ret);
- return ret;
+ goto err_out;
}
ret = get_best_page_shift(hr_dev, mtr, buf_attr);
@@ -1180,7 +1191,7 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
}
if (buf_attr->mtt_only)
- return 0;
+ return mtr;
/* Write buffer's dma address to MTT */
ret = mtr_map_bufs(hr_dev, mtr);
@@ -1189,14 +1200,15 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
goto err_alloc_mtt;
}
- return 0;
+ return mtr;
err_alloc_mtt:
mtr_free_mtt(hr_dev, mtr);
err_init_buf:
mtr_free_bufs(hr_dev, mtr);
-
- return ret;
+err_out:
+ kvfree(mtr);
+ return ERR_PTR(ret);
}
void hns_roce_mtr_destroy(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr)
@@ -1206,4 +1218,5 @@ void hns_roce_mtr_destroy(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr)
/* free buffers */
mtr_free_bufs(hr_dev, mtr);
+ kvfree(mtr);
}
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index 9e2e76c59406..62fc9a3c784e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -780,10 +780,11 @@ static int alloc_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
ibdev_err(ibdev, "failed to split WQE buf, ret = %d.\n", ret);
goto err_inline;
}
- ret = hns_roce_mtr_create(hr_dev, &hr_qp->mtr, &buf_attr,
- PAGE_SHIFT + hr_dev->caps.mtt_ba_pg_sz,
- udata, addr);
- if (ret) {
+ hr_qp->mtr = hns_roce_mtr_create(hr_dev, &buf_attr,
+ PAGE_SHIFT + hr_dev->caps.mtt_ba_pg_sz,
+ udata, addr);
+ if (IS_ERR(hr_qp->mtr)) {
+ ret = PTR_ERR(hr_qp->mtr);
ibdev_err(ibdev, "failed to create WQE mtr, ret = %d.\n", ret);
goto err_inline;
}
@@ -800,7 +801,7 @@ static int alloc_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
static void free_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
{
- hns_roce_mtr_destroy(hr_dev, &hr_qp->mtr);
+ hns_roce_mtr_destroy(hr_dev, hr_qp->mtr);
}
static inline bool user_qp_has_sdb(struct hns_roce_dev *hr_dev,
@@ -1531,7 +1532,7 @@ void hns_roce_unlock_cqs(struct hns_roce_cq *send_cq,
static inline void *get_wqe(struct hns_roce_qp *hr_qp, u32 offset)
{
- return hns_roce_buf_offset(hr_qp->mtr.kmem, offset);
+ return hns_roce_buf_offset(hr_qp->mtr->kmem, offset);
}
void *hns_roce_get_recv_wqe(struct hns_roce_qp *hr_qp, unsigned int n)
diff --git a/drivers/infiniband/hw/hns/hns_roce_restrack.c b/drivers/infiniband/hw/hns/hns_roce_restrack.c
index 3b0c0ffa6a29..db38a2045efe 100644
--- a/drivers/infiniband/hw/hns/hns_roce_restrack.c
+++ b/drivers/infiniband/hw/hns/hns_roce_restrack.c
@@ -291,11 +291,11 @@ int hns_roce_fill_res_mr_entry(struct sk_buff *msg, struct ib_mr *ib_mr)
goto err;
if (rdma_nl_put_driver_u32_hex(msg, "ba_pg_shift",
- hr_mr->pbl_mtr.hem_cfg.ba_pg_shift))
+ hr_mr->pbl_mtr->hem_cfg.ba_pg_shift))
goto err;
if (rdma_nl_put_driver_u32_hex(msg, "buf_pg_shift",
- hr_mr->pbl_mtr.hem_cfg.buf_pg_shift))
+ hr_mr->pbl_mtr->hem_cfg.buf_pg_shift))
goto err;
nla_nest_end(msg, table_attr);
diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c
index 70c06ef65603..6685e5a1afd2 100644
--- a/drivers/infiniband/hw/hns/hns_roce_srq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_srq.c
@@ -179,10 +179,10 @@ static int alloc_srq_idx(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq,
buf_attr.region[0].hopnum = hr_dev->caps.idx_hop_num;
buf_attr.region_count = 1;
- ret = hns_roce_mtr_create(hr_dev, &idx_que->mtr, &buf_attr,
- hr_dev->caps.idx_ba_pg_sz + PAGE_SHIFT,
- udata, addr);
- if (ret) {
+ idx_que->mtr = hns_roce_mtr_create(hr_dev, &buf_attr,
+ hr_dev->caps.idx_ba_pg_sz + PAGE_SHIFT, udata, addr);
+ if (IS_ERR(idx_que->mtr)) {
+ ret = PTR_ERR(idx_que->mtr);
ibdev_err(ibdev,
"failed to alloc SRQ idx mtr, ret = %d.\n", ret);
return ret;
@@ -202,7 +202,7 @@ static int alloc_srq_idx(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq,
return 0;
err_idx_mtr:
- hns_roce_mtr_destroy(hr_dev, &idx_que->mtr);
+ hns_roce_mtr_destroy(hr_dev, idx_que->mtr);
return ret;
}
@@ -213,7 +213,7 @@ static void free_srq_idx(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq)
bitmap_free(idx_que->bitmap);
idx_que->bitmap = NULL;
- hns_roce_mtr_destroy(hr_dev, &idx_que->mtr);
+ hns_roce_mtr_destroy(hr_dev, idx_que->mtr);
}
static int alloc_srq_wqe_buf(struct hns_roce_dev *hr_dev,
@@ -222,7 +222,7 @@ static int alloc_srq_wqe_buf(struct hns_roce_dev *hr_dev,
{
struct ib_device *ibdev = &hr_dev->ib_dev;
struct hns_roce_buf_attr buf_attr = {};
- int ret;
+ int ret = 0;
srq->wqe_shift = ilog2(roundup_pow_of_two(max(HNS_ROCE_SGE_SIZE,
HNS_ROCE_SGE_SIZE *
@@ -234,12 +234,13 @@ static int alloc_srq_wqe_buf(struct hns_roce_dev *hr_dev,
buf_attr.region[0].hopnum = hr_dev->caps.srqwqe_hop_num;
buf_attr.region_count = 1;
- ret = hns_roce_mtr_create(hr_dev, &srq->buf_mtr, &buf_attr,
- hr_dev->caps.srqwqe_ba_pg_sz + PAGE_SHIFT,
- udata, addr);
- if (ret)
+ srq->buf_mtr = hns_roce_mtr_create(hr_dev, &buf_attr,
+ hr_dev->caps.srqwqe_ba_pg_sz + PAGE_SHIFT, udata, addr);
+ if (IS_ERR(srq->buf_mtr)) {
+ ret = PTR_ERR(srq->buf_mtr);
ibdev_err(ibdev,
"failed to alloc SRQ buf mtr, ret = %d.\n", ret);
+ }
return ret;
}
@@ -247,7 +248,7 @@ static int alloc_srq_wqe_buf(struct hns_roce_dev *hr_dev,
static void free_srq_wqe_buf(struct hns_roce_dev *hr_dev,
struct hns_roce_srq *srq)
{
- hns_roce_mtr_destroy(hr_dev, &srq->buf_mtr);
+ hns_roce_mtr_destroy(hr_dev, srq->buf_mtr);
}
static int alloc_srq_wrid(struct hns_roce_srq *srq)
--
2.33.0
^ permalink raw reply related [flat|nested] 23+ messages in thread
* [PATCH for-next 2/4] RDMA/hns: Fix HW CTX UAF by adding delay-destruction mechanism
2025-02-17 7:01 [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism Junxian Huang
2025-02-17 7:01 ` [PATCH for-next 1/4] RDMA/hns: Change mtr member to pointer in hns QP/CQ/MR/SRQ/EQ struct Junxian Huang
@ 2025-02-17 7:01 ` Junxian Huang
2025-02-17 7:01 ` [PATCH for-next 3/4] RDMA/hns: Fix HW doorbell " Junxian Huang
` (3 subsequent siblings)
5 siblings, 0 replies; 23+ messages in thread
From: Junxian Huang @ 2025-02-17 7:01 UTC (permalink / raw)
To: jgg, leon; +Cc: linux-rdma, linuxarm, huangjunxian6, tangchengchang
From: wenglianfa <wenglianfa@huawei.com>
When mailboxes for resource (QP/CQ/MR/SRQ) destruction fail, it's
unable to notify HW about the destruction. In this case, driver
will still free the CTX, while HW may still access them,
thus leading to a UAF.
Introduce a delay-destruction mechanism. When mailboxes for resource
destruction fail, the related buffer will not be freed in the normal
destruction flow. Instead, link the resource node to a list. Free
all buffers in the list when the device is uninited.
Fixes: b0969f83890b ("RDMA/hns: Do not destroy QP resources in the hw resetting phase")
Signed-off-by: wenglianfa <wenglianfa@huawei.com>
Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
---
drivers/infiniband/hw/hns/hns_roce_cq.c | 12 ++++++--
drivers/infiniband/hw/hns/hns_roce_device.h | 26 +++++++++++++++++
drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 4 ++-
drivers/infiniband/hw/hns/hns_roce_main.c | 7 +++++
drivers/infiniband/hw/hns/hns_roce_mr.c | 32 +++++++++++++++++++--
drivers/infiniband/hw/hns/hns_roce_qp.c | 8 +++++-
drivers/infiniband/hw/hns/hns_roce_srq.c | 17 ++++++++---
7 files changed, 94 insertions(+), 12 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c
index 236ee3fefe16..dc49d35ec4ec 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_cq.c
@@ -178,9 +178,11 @@ static void free_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
ret = hns_roce_destroy_hw_ctx(hr_dev, HNS_ROCE_CMD_DESTROY_CQC,
hr_cq->cqn);
- if (ret)
+ if (ret) {
+ hr_cq->delayed_destroy_flag = true;
dev_err_ratelimited(dev, "DESTROY_CQ failed (%d) for CQN %06lx\n",
ret, hr_cq->cqn);
+ }
xa_erase_irq(&cq_table->array, hr_cq->cqn);
@@ -192,7 +194,8 @@ static void free_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
complete(&hr_cq->free);
wait_for_completion(&hr_cq->free);
- hns_roce_table_put(hr_dev, &cq_table->table, hr_cq->cqn);
+ if (!hr_cq->delayed_destroy_flag)
+ hns_roce_table_put(hr_dev, &cq_table->table, hr_cq->cqn);
}
static int alloc_cq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq,
@@ -220,7 +223,10 @@ static int alloc_cq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq,
static void free_cq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
{
- hns_roce_mtr_destroy(hr_dev, hr_cq->mtr);
+ if (hr_cq->delayed_destroy_flag)
+ hns_roce_add_unfree_mtr(hr_dev, hr_cq->mtr);
+ else
+ hns_roce_mtr_destroy(hr_dev, hr_cq->mtr);
}
static int alloc_cq_db(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq,
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index ed0fa29f0cff..e010fb3230a2 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -314,6 +314,7 @@ struct hns_roce_mtr {
struct ib_umem *umem; /* user space buffer */
struct hns_roce_buf *kmem; /* kernel space buffer */
struct hns_roce_hem_cfg hem_cfg; /* config for hardware addressing */
+ struct list_head node; /* list node for delay-destruction */
};
struct hns_roce_mw {
@@ -339,6 +340,11 @@ struct hns_roce_mr {
struct hns_roce_mtr *pbl_mtr;
u32 npages;
dma_addr_t *page_list;
+ /* When this is true, the free and destruction of the related
+ * resources will be delayed until the device is uninited, ensuring
+ * no memory leak.
+ */
+ bool delayed_destroy_flag;
};
struct hns_roce_mr_table {
@@ -442,6 +448,11 @@ struct hns_roce_cq {
struct list_head rq_list; /* all qps on this recv cq */
int is_armed; /* cq is armed */
struct list_head node; /* all armed cqs are on a list */
+ /* When this is true, the free and destruction of the related
+ * resources will be delayed until the device is uninited, ensuring
+ * no memory leak.
+ */
+ bool delayed_destroy_flag;
};
struct hns_roce_idx_que {
@@ -475,6 +486,11 @@ struct hns_roce_srq {
void (*event)(struct hns_roce_srq *srq, enum hns_roce_event event);
struct hns_roce_db rdb;
u32 cap_flags;
+ /* When this is true, the free and destruction of the related
+ * resources will be delayed until the device is uninited, ensuring
+ * no memory leak.
+ */
+ bool delayed_destroy_flag;
};
struct hns_roce_uar_table {
@@ -642,6 +658,11 @@ struct hns_roce_qp {
/* 0: flush needed, 1: unneeded */
unsigned long flush_flag;
+ /* When this is true, the free and destruction of the related
+ * resources will be delayed until the device is uninited, ensuring
+ * no memory leak.
+ */
+ bool delayed_destroy_flag;
struct hns_roce_work flush_work;
struct list_head node; /* all qps are on a list */
struct list_head rq_node; /* all recv qps are on a list */
@@ -1025,6 +1046,8 @@ struct hns_roce_dev {
u64 dwqe_page;
struct hns_roce_dev_debugfs dbgfs;
atomic64_t *dfx_cnt;
+ struct list_head mtr_unfree_list; /* list of unfree mtr on this dev */
+ struct mutex mtr_unfree_list_mutex; /* protect mtr_unfree_list */
};
static inline struct hns_roce_dev *to_hr_dev(struct ib_device *ib_dev)
@@ -1296,6 +1319,9 @@ int hns_roce_fill_res_qp_entry(struct sk_buff *msg, struct ib_qp *ib_qp);
int hns_roce_fill_res_qp_entry_raw(struct sk_buff *msg, struct ib_qp *ib_qp);
int hns_roce_fill_res_mr_entry(struct sk_buff *msg, struct ib_mr *ib_mr);
int hns_roce_fill_res_mr_entry_raw(struct sk_buff *msg, struct ib_mr *ib_mr);
+void hns_roce_add_unfree_mtr(struct hns_roce_dev *hr_dev,
+ struct hns_roce_mtr *mtr);
+void hns_roce_free_unfree_mtr(struct hns_roce_dev *hr_dev);
int hns_roce_fill_res_srq_entry(struct sk_buff *msg, struct ib_srq *ib_srq);
int hns_roce_fill_res_srq_entry_raw(struct sk_buff *msg, struct ib_srq *ib_srq);
struct hns_user_mmap_entry *
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index d60ca0a306e9..86d6a8f2a26d 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -5587,10 +5587,12 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
/* Modify qp to reset before destroying qp */
ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, NULL, 0,
hr_qp->state, IB_QPS_RESET, udata);
- if (ret)
+ if (ret) {
+ hr_qp->delayed_destroy_flag = true;
ibdev_err_ratelimited(ibdev,
"failed to modify QP to RST, ret = %d.\n",
ret);
+ }
}
send_cq = hr_qp->ibqp.send_cq ? to_hr_cq(hr_qp->ibqp.send_cq) : NULL;
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index ae24c81c9812..b5ece1bedc11 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -952,6 +952,8 @@ static void hns_roce_teardown_hca(struct hns_roce_dev *hr_dev)
if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB ||
hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB)
mutex_destroy(&hr_dev->pgdir_mutex);
+
+ mutex_destroy(&hr_dev->mtr_unfree_list_mutex);
}
/**
@@ -966,6 +968,9 @@ static int hns_roce_setup_hca(struct hns_roce_dev *hr_dev)
spin_lock_init(&hr_dev->sm_lock);
+ INIT_LIST_HEAD(&hr_dev->mtr_unfree_list);
+ mutex_init(&hr_dev->mtr_unfree_list_mutex);
+
if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB ||
hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) {
INIT_LIST_HEAD(&hr_dev->pgdir_list);
@@ -1005,6 +1010,7 @@ static int hns_roce_setup_hca(struct hns_roce_dev *hr_dev)
if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB ||
hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB)
mutex_destroy(&hr_dev->pgdir_mutex);
+ mutex_destroy(&hr_dev->mtr_unfree_list_mutex);
return ret;
}
@@ -1179,6 +1185,7 @@ void hns_roce_exit(struct hns_roce_dev *hr_dev)
if (hr_dev->hw->hw_exit)
hr_dev->hw->hw_exit(hr_dev);
+ hns_roce_free_unfree_mtr(hr_dev);
hns_roce_teardown_hca(hr_dev);
hns_roce_cleanup_hem(hr_dev);
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index 3902243cac96..228a3512e1a0 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -83,7 +83,8 @@ static void free_mr_key(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr)
{
unsigned long obj = key_to_hw_index(mr->key);
- hns_roce_table_put(hr_dev, &hr_dev->mr_table.mtpt_table, obj);
+ if (!mr->delayed_destroy_flag)
+ hns_roce_table_put(hr_dev, &hr_dev->mr_table.mtpt_table, obj);
ida_free(&hr_dev->mr_table.mtpt_ida.ida, (int)obj);
}
@@ -125,7 +126,10 @@ static int alloc_mr_pbl(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr,
static void free_mr_pbl(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr)
{
- hns_roce_mtr_destroy(hr_dev, mr->pbl_mtr);
+ if (mr->delayed_destroy_flag && mr->type != MR_TYPE_DMA)
+ hns_roce_add_unfree_mtr(hr_dev, mr->pbl_mtr);
+ else
+ hns_roce_mtr_destroy(hr_dev, mr->pbl_mtr);
}
static void hns_roce_mr_free(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr)
@@ -137,9 +141,11 @@ static void hns_roce_mr_free(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr
ret = hns_roce_destroy_hw_ctx(hr_dev, HNS_ROCE_CMD_DESTROY_MPT,
key_to_hw_index(mr->key) &
(hr_dev->caps.num_mtpts - 1));
- if (ret)
+ if (ret) {
+ mr->delayed_destroy_flag = true;
ibdev_warn_ratelimited(ibdev, "failed to destroy mpt, ret = %d.\n",
ret);
+ }
}
free_mr_pbl(hr_dev, mr);
@@ -1220,3 +1226,23 @@ void hns_roce_mtr_destroy(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr)
mtr_free_bufs(hr_dev, mtr);
kvfree(mtr);
}
+
+void hns_roce_add_unfree_mtr(struct hns_roce_dev *hr_dev,
+ struct hns_roce_mtr *mtr)
+{
+ mutex_lock(&hr_dev->mtr_unfree_list_mutex);
+ list_add_tail(&mtr->node, &hr_dev->mtr_unfree_list);
+ mutex_unlock(&hr_dev->mtr_unfree_list_mutex);
+}
+
+void hns_roce_free_unfree_mtr(struct hns_roce_dev *hr_dev)
+{
+ struct hns_roce_mtr *mtr, *next;
+
+ mutex_lock(&hr_dev->mtr_unfree_list_mutex);
+ list_for_each_entry_safe(mtr, next, &hr_dev->mtr_unfree_list, node) {
+ list_del(&mtr->node);
+ hns_roce_mtr_destroy(hr_dev, mtr);
+ }
+ mutex_unlock(&hr_dev->mtr_unfree_list_mutex);
+}
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index 62fc9a3c784e..91c605f67dca 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -410,6 +410,9 @@ static void free_qpc(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
{
struct hns_roce_qp_table *qp_table = &hr_dev->qp_table;
+ if (hr_qp->delayed_destroy_flag)
+ return;
+
if (hr_dev->caps.trrl_entry_sz)
hns_roce_table_put(hr_dev, &qp_table->trrl_table, hr_qp->qpn);
hns_roce_table_put(hr_dev, &qp_table->irrl_table, hr_qp->qpn);
@@ -801,7 +804,10 @@ static int alloc_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
static void free_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
{
- hns_roce_mtr_destroy(hr_dev, hr_qp->mtr);
+ if (hr_qp->delayed_destroy_flag)
+ hns_roce_add_unfree_mtr(hr_dev, hr_qp->mtr);
+ else
+ hns_roce_mtr_destroy(hr_dev, hr_qp->mtr);
}
static inline bool user_qp_has_sdb(struct hns_roce_dev *hr_dev,
diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c
index 6685e5a1afd2..848a82395185 100644
--- a/drivers/infiniband/hw/hns/hns_roce_srq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_srq.c
@@ -150,9 +150,11 @@ static void free_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq)
ret = hns_roce_destroy_hw_ctx(hr_dev, HNS_ROCE_CMD_DESTROY_SRQ,
srq->srqn);
- if (ret)
+ if (ret) {
+ srq->delayed_destroy_flag = true;
dev_err_ratelimited(hr_dev->dev, "DESTROY_SRQ failed (%d) for SRQN %06lx\n",
ret, srq->srqn);
+ }
xa_erase_irq(&srq_table->xa, srq->srqn);
@@ -160,7 +162,8 @@ static void free_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq)
complete(&srq->free);
wait_for_completion(&srq->free);
- hns_roce_table_put(hr_dev, &srq_table->table, srq->srqn);
+ if (!srq->delayed_destroy_flag)
+ hns_roce_table_put(hr_dev, &srq_table->table, srq->srqn);
}
static int alloc_srq_idx(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq,
@@ -213,7 +216,10 @@ static void free_srq_idx(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq)
bitmap_free(idx_que->bitmap);
idx_que->bitmap = NULL;
- hns_roce_mtr_destroy(hr_dev, idx_que->mtr);
+ if (srq->delayed_destroy_flag)
+ hns_roce_add_unfree_mtr(hr_dev, idx_que->mtr);
+ else
+ hns_roce_mtr_destroy(hr_dev, idx_que->mtr);
}
static int alloc_srq_wqe_buf(struct hns_roce_dev *hr_dev,
@@ -248,7 +254,10 @@ static int alloc_srq_wqe_buf(struct hns_roce_dev *hr_dev,
static void free_srq_wqe_buf(struct hns_roce_dev *hr_dev,
struct hns_roce_srq *srq)
{
- hns_roce_mtr_destroy(hr_dev, srq->buf_mtr);
+ if (srq->delayed_destroy_flag)
+ hns_roce_add_unfree_mtr(hr_dev, srq->buf_mtr);
+ else
+ hns_roce_mtr_destroy(hr_dev, srq->buf_mtr);
}
static int alloc_srq_wrid(struct hns_roce_srq *srq)
--
2.33.0
^ permalink raw reply related [flat|nested] 23+ messages in thread
* [PATCH for-next 3/4] RDMA/hns: Fix HW doorbell UAF by adding delay-destruction mechanism
2025-02-17 7:01 [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism Junxian Huang
2025-02-17 7:01 ` [PATCH for-next 1/4] RDMA/hns: Change mtr member to pointer in hns QP/CQ/MR/SRQ/EQ struct Junxian Huang
2025-02-17 7:01 ` [PATCH for-next 2/4] RDMA/hns: Fix HW CTX UAF by adding delay-destruction mechanism Junxian Huang
@ 2025-02-17 7:01 ` Junxian Huang
2025-02-17 7:01 ` [PATCH for-next 4/4] Revert "RDMA/hns: Do not destroy QP resources in the hw resetting phase" Junxian Huang
` (2 subsequent siblings)
5 siblings, 0 replies; 23+ messages in thread
From: Junxian Huang @ 2025-02-17 7:01 UTC (permalink / raw)
To: jgg, leon; +Cc: linux-rdma, linuxarm, huangjunxian6, tangchengchang
When mailboxes for resource (QP/CQ/SRQ) destruction fail, the driver is
unable to notify HW about the destruction. In this case, the driver will
still free the doorbells, while HW may still access them, thus leading to
a UAF.
Introduce a delay-destruction mechanism. When mailboxes for resource
destruction fail, the related buffer will not be freed in the normal
destruction flow. Instead, link the resource node to a list. Free
all buffers in the list when the device is uninited.
Fixes: b0969f83890b ("RDMA/hns: Do not destroy QP resources in the hw resetting phase")
Signed-off-by: wenglianfa <wenglianfa@huawei.com>
Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
---
drivers/infiniband/hw/hns/hns_roce_cq.c | 6 +-
drivers/infiniband/hw/hns/hns_roce_db.c | 91 +++++++++++++++------
drivers/infiniband/hw/hns/hns_roce_device.h | 26 ++++--
drivers/infiniband/hw/hns/hns_roce_main.c | 6 ++
drivers/infiniband/hw/hns/hns_roce_mr.c | 25 ++++++
drivers/infiniband/hw/hns/hns_roce_qp.c | 11 ++-
drivers/infiniband/hw/hns/hns_roce_srq.c | 5 +-
7 files changed, 134 insertions(+), 36 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c
index dc49d35ec4ec..13bcf82939cd 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_cq.c
@@ -277,9 +277,11 @@ static void free_cq_db(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq,
uctx = rdma_udata_to_drv_context(udata,
struct hns_roce_ucontext,
ibucontext);
- hns_roce_db_unmap_user(uctx, &hr_cq->db);
+ hns_roce_db_unmap_user(uctx, &hr_cq->db,
+ hr_cq->delayed_destroy_flag);
} else {
- hns_roce_free_db(hr_dev, &hr_cq->db);
+ hns_roce_free_db(hr_dev, &hr_cq->db,
+ hr_cq->delayed_destroy_flag);
}
}
diff --git a/drivers/infiniband/hw/hns/hns_roce_db.c b/drivers/infiniband/hw/hns/hns_roce_db.c
index 5c4c0480832b..14dd32cd5594 100644
--- a/drivers/infiniband/hw/hns/hns_roce_db.c
+++ b/drivers/infiniband/hw/hns/hns_roce_db.c
@@ -12,6 +12,7 @@ int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt,
{
unsigned long page_addr = virt & PAGE_MASK;
struct hns_roce_user_db_page *page;
+ struct ib_umem *umem;
unsigned int offset;
int ret = 0;
@@ -24,43 +25,65 @@ int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt,
page = kmalloc(sizeof(*page), GFP_KERNEL);
if (!page) {
ret = -ENOMEM;
- goto out;
+ goto err_out;
}
refcount_set(&page->refcount, 1);
page->user_virt = page_addr;
- page->umem = ib_umem_get(context->ibucontext.device, page_addr,
- PAGE_SIZE, 0);
- if (IS_ERR(page->umem)) {
- ret = PTR_ERR(page->umem);
- kfree(page);
- goto out;
+ page->db_node = kvzalloc(sizeof(*page->db_node), GFP_KERNEL);
+ if (!page->db_node) {
+ ret = -ENOMEM;
+ goto err_page;
+ }
+
+ umem = ib_umem_get(context->ibucontext.device, page_addr, PAGE_SIZE, 0);
+ if (IS_ERR(umem)) {
+ ret = PTR_ERR(umem);
+ goto err_dbnode;
}
+ page->db_node->umem = umem;
list_add(&page->list, &context->page_list);
found:
offset = virt - page_addr;
- db->dma = sg_dma_address(page->umem->sgt_append.sgt.sgl) + offset;
- db->virt_addr = sg_virt(page->umem->sgt_append.sgt.sgl) + offset;
+ db->dma = sg_dma_address(page->db_node->umem->sgt_append.sgt.sgl) + offset;
+ db->virt_addr = sg_virt(page->db_node->umem->sgt_append.sgt.sgl) + offset;
db->u.user_page = page;
refcount_inc(&page->refcount);
+ mutex_unlock(&context->page_mutex);
+ return 0;
-out:
+err_dbnode:
+ kvfree(page->db_node);
+err_page:
+ kfree(page);
+err_out:
mutex_unlock(&context->page_mutex);
return ret;
}
void hns_roce_db_unmap_user(struct hns_roce_ucontext *context,
- struct hns_roce_db *db)
+ struct hns_roce_db *db,
+ bool delayed_unmap_flag)
{
+ struct hns_roce_dev *hr_dev = to_hr_dev(context->ibucontext.device);
+ struct hns_roce_db_pg_node *db_node = db->u.user_page->db_node;
+
mutex_lock(&context->page_mutex);
+ db_node->delayed_unmap_flag |= delayed_unmap_flag;
+
refcount_dec(&db->u.user_page->refcount);
if (refcount_dec_if_one(&db->u.user_page->refcount)) {
list_del(&db->u.user_page->list);
- ib_umem_release(db->u.user_page->umem);
+ if (db_node->delayed_unmap_flag) {
+ hns_roce_add_unfree_db(db_node, hr_dev);
+ } else {
+ ib_umem_release(db_node->umem);
+ kvfree(db_node);
+ }
kfree(db->u.user_page);
}
@@ -71,6 +94,8 @@ static struct hns_roce_db_pgdir *hns_roce_alloc_db_pgdir(
struct device *dma_device)
{
struct hns_roce_db_pgdir *pgdir;
+ dma_addr_t db_dma;
+ u32 *page;
pgdir = kzalloc(sizeof(*pgdir), GFP_KERNEL);
if (!pgdir)
@@ -80,14 +105,24 @@ static struct hns_roce_db_pgdir *hns_roce_alloc_db_pgdir(
HNS_ROCE_DB_PER_PAGE / HNS_ROCE_DB_TYPE_COUNT);
pgdir->bits[0] = pgdir->order0;
pgdir->bits[1] = pgdir->order1;
- pgdir->page = dma_alloc_coherent(dma_device, PAGE_SIZE,
- &pgdir->db_dma, GFP_KERNEL);
- if (!pgdir->page) {
- kfree(pgdir);
- return NULL;
- }
+ pgdir->db_node = kvzalloc(sizeof(*pgdir->db_node), GFP_KERNEL);
+ if (!pgdir->db_node)
+ goto err_node;
+
+ page = dma_alloc_coherent(dma_device, PAGE_SIZE, &db_dma, GFP_KERNEL);
+ if (!page)
+ goto err_dma;
+
+ pgdir->db_node->kdb.page = page;
+ pgdir->db_node->kdb.db_dma = db_dma;
return pgdir;
+
+err_dma:
+ kvfree(pgdir->db_node);
+err_node:
+ kfree(pgdir);
+ return NULL;
}
static int hns_roce_alloc_db_from_pgdir(struct hns_roce_db_pgdir *pgdir,
@@ -114,8 +149,8 @@ static int hns_roce_alloc_db_from_pgdir(struct hns_roce_db_pgdir *pgdir,
db->u.pgdir = pgdir;
db->index = i;
- db->db_record = pgdir->page + db->index;
- db->dma = pgdir->db_dma + db->index * HNS_ROCE_DB_UNIT_SIZE;
+ db->db_record = pgdir->db_node->kdb.page + db->index;
+ db->dma = pgdir->db_node->kdb.db_dma + db->index * HNS_ROCE_DB_UNIT_SIZE;
db->order = order;
return 0;
@@ -150,13 +185,17 @@ int hns_roce_alloc_db(struct hns_roce_dev *hr_dev, struct hns_roce_db *db,
return ret;
}
-void hns_roce_free_db(struct hns_roce_dev *hr_dev, struct hns_roce_db *db)
+void hns_roce_free_db(struct hns_roce_dev *hr_dev, struct hns_roce_db *db,
+ bool delayed_unmap_flag)
{
+ struct hns_roce_db_pg_node *db_node = db->u.pgdir->db_node;
unsigned long o;
unsigned long i;
mutex_lock(&hr_dev->pgdir_mutex);
+ db_node->delayed_unmap_flag |= delayed_unmap_flag;
+
o = db->order;
i = db->index;
@@ -170,9 +209,15 @@ void hns_roce_free_db(struct hns_roce_dev *hr_dev, struct hns_roce_db *db)
if (bitmap_full(db->u.pgdir->order1,
HNS_ROCE_DB_PER_PAGE / HNS_ROCE_DB_TYPE_COUNT)) {
- dma_free_coherent(hr_dev->dev, PAGE_SIZE, db->u.pgdir->page,
- db->u.pgdir->db_dma);
list_del(&db->u.pgdir->list);
+ if (db_node->delayed_unmap_flag) {
+ hns_roce_add_unfree_db(db_node, hr_dev);
+ } else {
+ dma_free_coherent(hr_dev->dev, PAGE_SIZE,
+ db_node->kdb.page,
+ db_node->kdb.db_dma);
+ kvfree(db_node);
+ }
kfree(db->u.pgdir);
}
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index e010fb3230a2..bcf85ec488fa 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -400,20 +400,29 @@ struct hns_roce_buf {
unsigned int page_shift;
};
+struct hns_roce_db_pg_node {
+ struct list_head list;
+ struct ib_umem *umem;
+ struct {
+ u32 *page;
+ dma_addr_t db_dma;
+ } kdb;
+ bool delayed_unmap_flag;
+};
+
struct hns_roce_db_pgdir {
struct list_head list;
DECLARE_BITMAP(order0, HNS_ROCE_DB_PER_PAGE);
DECLARE_BITMAP(order1, HNS_ROCE_DB_PER_PAGE / HNS_ROCE_DB_TYPE_COUNT);
unsigned long *bits[HNS_ROCE_DB_TYPE_COUNT];
- u32 *page;
- dma_addr_t db_dma;
+ struct hns_roce_db_pg_node *db_node;
};
struct hns_roce_user_db_page {
struct list_head list;
- struct ib_umem *umem;
unsigned long user_virt;
refcount_t refcount;
+ struct hns_roce_db_pg_node *db_node;
};
struct hns_roce_db {
@@ -1048,6 +1057,8 @@ struct hns_roce_dev {
atomic64_t *dfx_cnt;
struct list_head mtr_unfree_list; /* list of unfree mtr on this dev */
struct mutex mtr_unfree_list_mutex; /* protect mtr_unfree_list */
+ struct list_head db_unfree_list; /* list of unfree db on this dev */
+ struct mutex db_unfree_list_mutex; /* protect db_unfree_list */
};
static inline struct hns_roce_dev *to_hr_dev(struct ib_device *ib_dev)
@@ -1299,10 +1310,12 @@ int hns_roce_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata);
int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt,
struct hns_roce_db *db);
void hns_roce_db_unmap_user(struct hns_roce_ucontext *context,
- struct hns_roce_db *db);
+ struct hns_roce_db *db,
+ bool delayed_unmap_flag);
int hns_roce_alloc_db(struct hns_roce_dev *hr_dev, struct hns_roce_db *db,
int order);
-void hns_roce_free_db(struct hns_roce_dev *hr_dev, struct hns_roce_db *db);
+void hns_roce_free_db(struct hns_roce_dev *hr_dev, struct hns_roce_db *db,
+ bool delayed_unmap_flag);
void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 cqn);
void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type);
@@ -1319,6 +1332,9 @@ int hns_roce_fill_res_qp_entry(struct sk_buff *msg, struct ib_qp *ib_qp);
int hns_roce_fill_res_qp_entry_raw(struct sk_buff *msg, struct ib_qp *ib_qp);
int hns_roce_fill_res_mr_entry(struct sk_buff *msg, struct ib_mr *ib_mr);
int hns_roce_fill_res_mr_entry_raw(struct sk_buff *msg, struct ib_mr *ib_mr);
+void hns_roce_add_unfree_db(struct hns_roce_db_pg_node *db_node,
+ struct hns_roce_dev *hr_dev);
+void hns_roce_free_unfree_db(struct hns_roce_dev *hr_dev);
void hns_roce_add_unfree_mtr(struct hns_roce_dev *hr_dev,
struct hns_roce_mtr *mtr);
void hns_roce_free_unfree_mtr(struct hns_roce_dev *hr_dev);
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index b5ece1bedc11..5840340e0f14 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -953,6 +953,7 @@ static void hns_roce_teardown_hca(struct hns_roce_dev *hr_dev)
hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB)
mutex_destroy(&hr_dev->pgdir_mutex);
+ mutex_destroy(&hr_dev->db_unfree_list_mutex);
mutex_destroy(&hr_dev->mtr_unfree_list_mutex);
}
@@ -971,6 +972,9 @@ static int hns_roce_setup_hca(struct hns_roce_dev *hr_dev)
INIT_LIST_HEAD(&hr_dev->mtr_unfree_list);
mutex_init(&hr_dev->mtr_unfree_list_mutex);
+ INIT_LIST_HEAD(&hr_dev->db_unfree_list);
+ mutex_init(&hr_dev->db_unfree_list_mutex);
+
if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB ||
hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) {
INIT_LIST_HEAD(&hr_dev->pgdir_list);
@@ -1010,6 +1014,7 @@ static int hns_roce_setup_hca(struct hns_roce_dev *hr_dev)
if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB ||
hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB)
mutex_destroy(&hr_dev->pgdir_mutex);
+ mutex_destroy(&hr_dev->db_unfree_list_mutex);
mutex_destroy(&hr_dev->mtr_unfree_list_mutex);
return ret;
@@ -1185,6 +1190,7 @@ void hns_roce_exit(struct hns_roce_dev *hr_dev)
if (hr_dev->hw->hw_exit)
hr_dev->hw->hw_exit(hr_dev);
+ hns_roce_free_unfree_db(hr_dev);
hns_roce_free_unfree_mtr(hr_dev);
hns_roce_teardown_hca(hr_dev);
hns_roce_cleanup_hem(hr_dev);
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index 228a3512e1a0..226a785e1fc4 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -1246,3 +1246,28 @@ void hns_roce_free_unfree_mtr(struct hns_roce_dev *hr_dev)
}
mutex_unlock(&hr_dev->mtr_unfree_list_mutex);
}
+
+void hns_roce_add_unfree_db(struct hns_roce_db_pg_node *db_node,
+ struct hns_roce_dev *hr_dev)
+{
+ mutex_lock(&hr_dev->db_unfree_list_mutex);
+ list_add_tail(&db_node->list, &hr_dev->db_unfree_list);
+ mutex_unlock(&hr_dev->db_unfree_list_mutex);
+}
+
+void hns_roce_free_unfree_db(struct hns_roce_dev *hr_dev)
+{
+ struct hns_roce_db_pg_node *pos, *next;
+
+ mutex_lock(&hr_dev->db_unfree_list_mutex);
+ list_for_each_entry_safe(pos, next, &hr_dev->db_unfree_list, list) {
+ list_del(&pos->list);
+ if (pos->umem)
+ ib_umem_release(pos->umem);
+ else
+ dma_free_coherent(hr_dev->dev, PAGE_SIZE,
+ pos->kdb.page, pos->kdb.db_dma);
+ kvfree(pos);
+ }
+ mutex_unlock(&hr_dev->db_unfree_list_mutex);
+}
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index 91c605f67dca..66854f9b64ad 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -906,7 +906,7 @@ static int alloc_user_qp_db(struct hns_roce_dev *hr_dev,
err_sdb:
if (hr_qp->en_flags & HNS_ROCE_QP_CAP_SQ_RECORD_DB)
- hns_roce_db_unmap_user(uctx, &hr_qp->sdb);
+ hns_roce_db_unmap_user(uctx, &hr_qp->sdb, false);
err_out:
return ret;
}
@@ -988,14 +988,17 @@ static void free_qp_db(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
if (udata) {
if (hr_qp->en_flags & HNS_ROCE_QP_CAP_RQ_RECORD_DB)
- hns_roce_db_unmap_user(uctx, &hr_qp->rdb);
+ hns_roce_db_unmap_user(uctx, &hr_qp->rdb,
+ hr_qp->delayed_destroy_flag);
if (hr_qp->en_flags & HNS_ROCE_QP_CAP_SQ_RECORD_DB)
- hns_roce_db_unmap_user(uctx, &hr_qp->sdb);
+ hns_roce_db_unmap_user(uctx, &hr_qp->sdb,
+ hr_qp->delayed_destroy_flag);
if (hr_qp->en_flags & HNS_ROCE_QP_CAP_DIRECT_WQE)
qp_user_mmap_entry_remove(hr_qp);
} else {
if (hr_qp->en_flags & HNS_ROCE_QP_CAP_RQ_RECORD_DB)
- hns_roce_free_db(hr_dev, &hr_qp->rdb);
+ hns_roce_free_db(hr_dev, &hr_qp->rdb,
+ hr_qp->delayed_destroy_flag);
}
}
diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c
index 848a82395185..a39f07f44356 100644
--- a/drivers/infiniband/hw/hns/hns_roce_srq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_srq.c
@@ -426,9 +426,10 @@ static void free_srq_db(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq,
uctx = rdma_udata_to_drv_context(udata,
struct hns_roce_ucontext,
ibucontext);
- hns_roce_db_unmap_user(uctx, &srq->rdb);
+ hns_roce_db_unmap_user(uctx, &srq->rdb,
+ srq->delayed_destroy_flag);
} else {
- hns_roce_free_db(hr_dev, &srq->rdb);
+ hns_roce_free_db(hr_dev, &srq->rdb, srq->delayed_destroy_flag);
}
}
--
2.33.0
^ permalink raw reply related [flat|nested] 23+ messages in thread
* [PATCH for-next 4/4] Revert "RDMA/hns: Do not destroy QP resources in the hw resetting phase"
2025-02-17 7:01 [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism Junxian Huang
` (2 preceding siblings ...)
2025-02-17 7:01 ` [PATCH for-next 3/4] RDMA/hns: Fix HW doorbell " Junxian Huang
@ 2025-02-17 7:01 ` Junxian Huang
2025-02-19 12:14 ` [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism Leon Romanovsky
2025-03-18 3:23 ` Junxian Huang
5 siblings, 0 replies; 23+ messages in thread
From: Junxian Huang @ 2025-02-17 7:01 UTC (permalink / raw)
To: jgg, leon; +Cc: linux-rdma, linuxarm, huangjunxian6, tangchengchang
From: wenglianfa <wenglianfa@huawei.com>
This reverts commit b0969f83890bf8b47f5c8bd42539599b2b52fdeb.
The reverted patch was aimed at delaying resource destruction when
HW resets to avoid HW UAF, but it didn't accomplish the task perfectly
as the problem still occurs when read_poll_timeout_atomic() times out.
Besides, read_poll_timeout_atomic() spends too much CPU time and may
lead to a CPU stuck under heavy load.
Now that we have a delay-destruction mechanism to fix the HW UAF
problem, revert this patch.
Signed-off-by: wenglianfa <wenglianfa@huawei.com>
Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
---
drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 12 +-----------
1 file changed, 1 insertion(+), 11 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 86d6a8f2a26d..75bfd2117699 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -33,7 +33,6 @@
#include <linux/acpi.h>
#include <linux/etherdevice.h>
#include <linux/interrupt.h>
-#include <linux/iopoll.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/workqueue.h>
@@ -1026,14 +1025,9 @@ static u32 hns_roce_v2_cmd_hw_resetting(struct hns_roce_dev *hr_dev,
unsigned long instance_stage,
unsigned long reset_stage)
{
-#define HW_RESET_TIMEOUT_US 1000000
-#define HW_RESET_SLEEP_US 1000
-
struct hns_roce_v2_priv *priv = hr_dev->priv;
struct hnae3_handle *handle = priv->handle;
const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
- unsigned long val;
- int ret;
/* When hardware reset is detected, we should stop sending mailbox&cmq&
* doorbell to hardware. If now in .init_instance() function, we should
@@ -1045,11 +1039,7 @@ static u32 hns_roce_v2_cmd_hw_resetting(struct hns_roce_dev *hr_dev,
* again.
*/
hr_dev->dis_db = true;
-
- ret = read_poll_timeout(ops->ae_dev_reset_cnt, val,
- val > hr_dev->reset_cnt, HW_RESET_SLEEP_US,
- HW_RESET_TIMEOUT_US, false, handle);
- if (!ret)
+ if (!ops->get_hw_reset_stat(handle))
hr_dev->is_reset = true;
if (!hr_dev->is_reset || reset_stage == HNS_ROCE_STATE_RST_INIT ||
--
2.33.0
^ permalink raw reply related [flat|nested] 23+ messages in thread
* Re: [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism
2025-02-17 7:01 [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism Junxian Huang
` (3 preceding siblings ...)
2025-02-17 7:01 ` [PATCH for-next 4/4] Revert "RDMA/hns: Do not destroy QP resources in the hw resetting phase" Junxian Huang
@ 2025-02-19 12:14 ` Leon Romanovsky
2025-02-19 13:07 ` Junxian Huang
2025-03-18 3:23 ` Junxian Huang
5 siblings, 1 reply; 23+ messages in thread
From: Leon Romanovsky @ 2025-02-19 12:14 UTC (permalink / raw)
To: Junxian Huang; +Cc: jgg, linux-rdma, linuxarm, tangchengchang
On Mon, Feb 17, 2025 at 03:01:19PM +0800, Junxian Huang wrote:
> When mailboxes for resource(QP/CQ/SRQ) destruction fail, it's unable
> to notify HW about the destruction. In this case, driver will still
> free the resources, while HW may still access them, thus leading to
> a UAF.
> This series introduces delay-destruction mechanism to fix such HW UAF,
> including thw HW CTX and doorbells.
And why can't you fix FW instead?
Thanks
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism
2025-02-19 12:14 ` [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism Leon Romanovsky
@ 2025-02-19 13:07 ` Junxian Huang
2025-02-19 14:35 ` Leon Romanovsky
0 siblings, 1 reply; 23+ messages in thread
From: Junxian Huang @ 2025-02-19 13:07 UTC (permalink / raw)
To: Leon Romanovsky; +Cc: jgg, linux-rdma, linuxarm, tangchengchang
On 2025/2/19 20:14, Leon Romanovsky wrote:
> On Mon, Feb 17, 2025 at 03:01:19PM +0800, Junxian Huang wrote:
>> When mailboxes for resource(QP/CQ/SRQ) destruction fail, it's unable
>> to notify HW about the destruction. In this case, driver will still
>> free the resources, while HW may still access them, thus leading to
>> a UAF.
>
>> This series introduces delay-destruction mechanism to fix such HW UAF,
>> including thw HW CTX and doorbells.
>
> And why can't you fix FW instead?
>
The key issue is the failure of the mailbox, and there are some cases
that can lead to it which we don't really consider FW bugs.
For example, when some random fatal error like RAS error occurs in FW,
our FW will be reset. Driver's mailbox will fail during the FW reset.
Another case is a mailbox timeout when FW is under heavy load, as it is
shared by multiple functions.
Thanks,
Junxian
> Thanks
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism
2025-02-19 13:07 ` Junxian Huang
@ 2025-02-19 14:35 ` Leon Romanovsky
2025-02-20 3:48 ` Junxian Huang
0 siblings, 1 reply; 23+ messages in thread
From: Leon Romanovsky @ 2025-02-19 14:35 UTC (permalink / raw)
To: Junxian Huang; +Cc: jgg, linux-rdma, linuxarm, tangchengchang
On Wed, Feb 19, 2025 at 09:07:36PM +0800, Junxian Huang wrote:
>
>
> On 2025/2/19 20:14, Leon Romanovsky wrote:
> > On Mon, Feb 17, 2025 at 03:01:19PM +0800, Junxian Huang wrote:
> >> When mailboxes for resource(QP/CQ/SRQ) destruction fail, it's unable
> >> to notify HW about the destruction. In this case, driver will still
> >> free the resources, while HW may still access them, thus leading to
> >> a UAF.
> >
> >> This series introduces delay-destruction mechanism to fix such HW UAF,
> >> including thw HW CTX and doorbells.
> >
> > And why can't you fix FW instead?
> >
>
> The key is the failure of mailbox, and there are some cases that would
> lead to it, which we don't really consider as FW bugs.
>
> For example, when some random fatal error like RAS error occurs in FW,
> our FW will be reset. Driver's mailbox will fail during the FW reset.
I don't understand this scenario. You said at the beginning that HW can
access host memory and this triggers UAF. However now, you are presenting
case where driver tries to access mailbox.
>
> Another case is the mailbox timeout when FW is under heavy load, as it is
> shared by multi-functions.
It is not different from any other mailbox errors. FW needs to handle
these cases.
Thanks
>
> Thanks,
> Junxian
>
> > Thanks
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism
2025-02-19 14:35 ` Leon Romanovsky
@ 2025-02-20 3:48 ` Junxian Huang
2025-02-20 7:32 ` Leon Romanovsky
2025-02-20 14:10 ` Jason Gunthorpe
0 siblings, 2 replies; 23+ messages in thread
From: Junxian Huang @ 2025-02-20 3:48 UTC (permalink / raw)
To: Leon Romanovsky; +Cc: jgg, linux-rdma, linuxarm, tangchengchang
On 2025/2/19 22:35, Leon Romanovsky wrote:
> On Wed, Feb 19, 2025 at 09:07:36PM +0800, Junxian Huang wrote:
>>
>>
>> On 2025/2/19 20:14, Leon Romanovsky wrote:
>>> On Mon, Feb 17, 2025 at 03:01:19PM +0800, Junxian Huang wrote:
>>>> When mailboxes for resource(QP/CQ/SRQ) destruction fail, it's unable
>>>> to notify HW about the destruction. In this case, driver will still
>>>> free the resources, while HW may still access them, thus leading to
>>>> a UAF.
>>>
>>>> This series introduces delay-destruction mechanism to fix such HW UAF,
>>>> including thw HW CTX and doorbells.
>>>
>>> And why can't you fix FW instead?
>>>
>>
>> The key is the failure of mailbox, and there are some cases that would
>> lead to it, which we don't really consider as FW bugs.
>>
>> For example, when some random fatal error like RAS error occurs in FW,
>> our FW will be reset. Driver's mailbox will fail during the FW reset.
>
> I don't understand this scenario. You said at the beginning that HW can
> access host memory and this triggers UAF. However now, you are presenting
> case where driver tries to access mailbox.
>
No, I'm saying that mailbox errors are the reason for the HW UAF. Let me
explain this scenario in more detail.
Driver notifies HW about the memory release with mailbox. The procedure
of a mailbox is:
a) driver posts the mailbox to FW
b) FW writes the mailbox data into HW
In this scenario, step a) will fail due to the FW reset, HW won't get
notified and thus may lead to UAF.
Junxian
>>
>> Another case is the mailbox timeout when FW is under heavy load, as it is
>> shared by multi-functions.
>
> It is not different from any other mailbox errors. FW needs to handle
> these cases.
>
> Thanks
>
>>
>> Thanks,
>> Junxian
>>
>>> Thanks
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism
2025-02-20 3:48 ` Junxian Huang
@ 2025-02-20 7:32 ` Leon Romanovsky
2025-02-20 8:45 ` Junxian Huang
2025-02-20 14:10 ` Jason Gunthorpe
1 sibling, 1 reply; 23+ messages in thread
From: Leon Romanovsky @ 2025-02-20 7:32 UTC (permalink / raw)
To: Junxian Huang; +Cc: jgg, linux-rdma, linuxarm, tangchengchang
On Thu, Feb 20, 2025 at 11:48:49AM +0800, Junxian Huang wrote:
>
>
> On 2025/2/19 22:35, Leon Romanovsky wrote:
> > On Wed, Feb 19, 2025 at 09:07:36PM +0800, Junxian Huang wrote:
> >>
> >>
> >> On 2025/2/19 20:14, Leon Romanovsky wrote:
> >>> On Mon, Feb 17, 2025 at 03:01:19PM +0800, Junxian Huang wrote:
> >>>> When mailboxes for resource(QP/CQ/SRQ) destruction fail, it's unable
> >>>> to notify HW about the destruction. In this case, driver will still
> >>>> free the resources, while HW may still access them, thus leading to
> >>>> a UAF.
> >>>
> >>>> This series introduces delay-destruction mechanism to fix such HW UAF,
> >>>> including thw HW CTX and doorbells.
> >>>
> >>> And why can't you fix FW instead?
> >>>
> >>
> >> The key is the failure of mailbox, and there are some cases that would
> >> lead to it, which we don't really consider as FW bugs.
> >>
> >> For example, when some random fatal error like RAS error occurs in FW,
> >> our FW will be reset. Driver's mailbox will fail during the FW reset.
> >
> > I don't understand this scenario. You said at the beginning that HW can
> > access host memory and this triggers UAF. However now, you are presenting
> > case where driver tries to access mailbox.
> >
>
> No, I'm saying that mailbox errors are the reason of HW UAF. Let me
> explain this scenario in more detail.
>
> Driver notifies HW about the memory release with mailbox. The procedure
> of a mailbox is:
> a) driver posts the mailbox to FW
> b) FW writes the mailbox data into HW
>
> In this scenario, step a) will fail due to the FW reset, HW won't get
> notified and thus may lead to UAF.
Exactly, FW performed reset and didn't prevent from HW to access it.
Thanks
>
> Junxian
>
> >>
> >> Another case is the mailbox timeout when FW is under heavy load, as it is
> >> shared by multi-functions.
> >
> > It is not different from any other mailbox errors. FW needs to handle
> > these cases.
> >
> > Thanks
> >
> >>
> >> Thanks,
> >> Junxian
> >>
> >>> Thanks
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism
2025-02-20 7:32 ` Leon Romanovsky
@ 2025-02-20 8:45 ` Junxian Huang
2025-02-20 9:08 ` Leon Romanovsky
0 siblings, 1 reply; 23+ messages in thread
From: Junxian Huang @ 2025-02-20 8:45 UTC (permalink / raw)
To: Leon Romanovsky; +Cc: jgg, linux-rdma, linuxarm, tangchengchang
On 2025/2/20 15:32, Leon Romanovsky wrote:
> On Thu, Feb 20, 2025 at 11:48:49AM +0800, Junxian Huang wrote:
>>
>>
>> On 2025/2/19 22:35, Leon Romanovsky wrote:
>>> On Wed, Feb 19, 2025 at 09:07:36PM +0800, Junxian Huang wrote:
>>>>
>>>>
>>>> On 2025/2/19 20:14, Leon Romanovsky wrote:
>>>>> On Mon, Feb 17, 2025 at 03:01:19PM +0800, Junxian Huang wrote:
>>>>>> When mailboxes for resource(QP/CQ/SRQ) destruction fail, it's unable
>>>>>> to notify HW about the destruction. In this case, driver will still
>>>>>> free the resources, while HW may still access them, thus leading to
>>>>>> a UAF.
>>>>>
>>>>>> This series introduces delay-destruction mechanism to fix such HW UAF,
>>>>>> including thw HW CTX and doorbells.
>>>>>
>>>>> And why can't you fix FW instead?
>>>>>
>>>>
>>>> The key is the failure of mailbox, and there are some cases that would
>>>> lead to it, which we don't really consider as FW bugs.
>>>>
>>>> For example, when some random fatal error like RAS error occurs in FW,
>>>> our FW will be reset. Driver's mailbox will fail during the FW reset.
>>>
>>> I don't understand this scenario. You said at the beginning that HW can
>>> access host memory and this triggers UAF. However now, you are presenting
>>> case where driver tries to access mailbox.
>>>
>>
>> No, I'm saying that mailbox errors are the reason of HW UAF. Let me
>> explain this scenario in more detail.
>>
>> Driver notifies HW about the memory release with mailbox. The procedure
>> of a mailbox is:
>> a) driver posts the mailbox to FW
>> b) FW writes the mailbox data into HW
>>
>> In this scenario, step a) will fail due to the FW reset, HW won't get
>> notified and thus may lead to UAF.
>
> Exactly, FW performed reset and didn't prevent from HW to access it.
>
Yes, but the problem is that our HW doesn't provide a method to prevent
the access. There's nothing FW can do in this scenario, so we can only
prevent UAF by adding these codes in driver.
Thanks,
Junxian
> Thanks
>
>>
>> Junxian
>>
>>>>
>>>> Another case is the mailbox timeout when FW is under heavy load, as it is
>>>> shared by multi-functions.
>>>
>>> It is not different from any other mailbox errors. FW needs to handle
>>> these cases.
>>>
>>> Thanks
>>>
>>>>
>>>> Thanks,
>>>> Junxian
>>>>
>>>>> Thanks
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism
2025-02-20 8:45 ` Junxian Huang
@ 2025-02-20 9:08 ` Leon Romanovsky
2025-02-20 11:05 ` Junxian Huang
0 siblings, 1 reply; 23+ messages in thread
From: Leon Romanovsky @ 2025-02-20 9:08 UTC (permalink / raw)
To: Junxian Huang; +Cc: jgg, linux-rdma, linuxarm, tangchengchang
On Thu, Feb 20, 2025 at 04:45:54PM +0800, Junxian Huang wrote:
>
>
> On 2025/2/20 15:32, Leon Romanovsky wrote:
> > On Thu, Feb 20, 2025 at 11:48:49AM +0800, Junxian Huang wrote:
> >>
> >>
> >> On 2025/2/19 22:35, Leon Romanovsky wrote:
> >>> On Wed, Feb 19, 2025 at 09:07:36PM +0800, Junxian Huang wrote:
> >>>>
> >>>>
> >>>> On 2025/2/19 20:14, Leon Romanovsky wrote:
> >>>>> On Mon, Feb 17, 2025 at 03:01:19PM +0800, Junxian Huang wrote:
> >>>>>> When mailboxes for resource(QP/CQ/SRQ) destruction fail, it's unable
> >>>>>> to notify HW about the destruction. In this case, driver will still
> >>>>>> free the resources, while HW may still access them, thus leading to
> >>>>>> a UAF.
> >>>>>
> >>>>>> This series introduces delay-destruction mechanism to fix such HW UAF,
> >>>>>> including thw HW CTX and doorbells.
> >>>>>
> >>>>> And why can't you fix FW instead?
> >>>>>
> >>>>
> >>>> The key is the failure of mailbox, and there are some cases that would
> >>>> lead to it, which we don't really consider as FW bugs.
> >>>>
> >>>> For example, when some random fatal error like RAS error occurs in FW,
> >>>> our FW will be reset. Driver's mailbox will fail during the FW reset.
> >>>
> >>> I don't understand this scenario. You said at the beginning that HW can
> >>> access host memory and this triggers UAF. However now, you are presenting
> >>> case where driver tries to access mailbox.
> >>>
> >>
> >> No, I'm saying that mailbox errors are the reason of HW UAF. Let me
> >> explain this scenario in more detail.
> >>
> >> Driver notifies HW about the memory release with mailbox. The procedure
> >> of a mailbox is:
> >> a) driver posts the mailbox to FW
> >> b) FW writes the mailbox data into HW
> >>
> >> In this scenario, step a) will fail due to the FW reset, HW won't get
> >> notified and thus may lead to UAF.
> >
> > Exactly, FW performed reset and didn't prevent from HW to access it.
> >
>
> Yes, but the problem is that our HW doesn't provide a method to prevent
> the access. There's nothing FW can do in this scenario, so we can only
> prevent UAF by adding these codes in driver.
Somehow HW doesn't access mailbox if destroy was successful, so why
can't FW use same "method" to inform HW before reset?
Thanks
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism
2025-02-20 9:08 ` Leon Romanovsky
@ 2025-02-20 11:05 ` Junxian Huang
2025-02-20 14:13 ` Jason Gunthorpe
0 siblings, 1 reply; 23+ messages in thread
From: Junxian Huang @ 2025-02-20 11:05 UTC (permalink / raw)
To: Leon Romanovsky; +Cc: jgg, linux-rdma, linuxarm, tangchengchang
On 2025/2/20 17:08, Leon Romanovsky wrote:
> On Thu, Feb 20, 2025 at 04:45:54PM +0800, Junxian Huang wrote:
>>
>>
>> On 2025/2/20 15:32, Leon Romanovsky wrote:
>>> On Thu, Feb 20, 2025 at 11:48:49AM +0800, Junxian Huang wrote:
>>>>
>>>>
>>>> On 2025/2/19 22:35, Leon Romanovsky wrote:
>>>>> On Wed, Feb 19, 2025 at 09:07:36PM +0800, Junxian Huang wrote:
>>>>>>
>>>>>>
>>>>>> On 2025/2/19 20:14, Leon Romanovsky wrote:
>>>>>>> On Mon, Feb 17, 2025 at 03:01:19PM +0800, Junxian Huang wrote:
>>>>>>>> When mailboxes for resource(QP/CQ/SRQ) destruction fail, it's unable
>>>>>>>> to notify HW about the destruction. In this case, driver will still
>>>>>>>> free the resources, while HW may still access them, thus leading to
>>>>>>>> a UAF.
>>>>>>>
>>>>>>>> This series introduces delay-destruction mechanism to fix such HW UAF,
>>>>>>>> including thw HW CTX and doorbells.
>>>>>>>
>>>>>>> And why can't you fix FW instead?
>>>>>>>
>>>>>>
>>>>>> The key is the failure of mailbox, and there are some cases that would
>>>>>> lead to it, which we don't really consider as FW bugs.
>>>>>>
>>>>>> For example, when some random fatal error like RAS error occurs in FW,
>>>>>> our FW will be reset. Driver's mailbox will fail during the FW reset.
>>>>>
>>>>> I don't understand this scenario. You said at the beginning that HW can
>>>>> access host memory and this triggers UAF. However now, you are presenting
>>>>> case where driver tries to access mailbox.
>>>>>
>>>>
>>>> No, I'm saying that mailbox errors are the reason of HW UAF. Let me
>>>> explain this scenario in more detail.
>>>>
>>>> Driver notifies HW about the memory release with mailbox. The procedure
>>>> of a mailbox is:
>>>> a) driver posts the mailbox to FW
>>>> b) FW writes the mailbox data into HW
>>>>
>>>> In this scenario, step a) will fail due to the FW reset, HW won't get
>>>> notified and thus may lead to UAF.
>>>
>>> Exactly, FW performed reset and didn't prevent from HW to access it.
>>>
>>
>> Yes, but the problem is that our HW doesn't provide a method to prevent
>> the access. There's nothing FW can do in this scenario, so we can only
>> prevent UAF by adding these codes in driver.
>
> Somehow HW doesn't access mailbox if destroy was successful, so why
> can't FW use same "method" to inform HW before reset?
>
Mailbox carries information of the specific resource (QP/CQ/SRQ/MR)
that are being destroyed. It's impossible for FW to predict which
QP/CQ/SRQ/MR will be destroyed by driver during reset before the
reset starts.
Thanks,
Junxian
> Thanks
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism
2025-02-20 3:48 ` Junxian Huang
2025-02-20 7:32 ` Leon Romanovsky
@ 2025-02-20 14:10 ` Jason Gunthorpe
2025-02-26 9:46 ` Junxian Huang
1 sibling, 1 reply; 23+ messages in thread
From: Jason Gunthorpe @ 2025-02-20 14:10 UTC (permalink / raw)
To: Junxian Huang; +Cc: Leon Romanovsky, linux-rdma, linuxarm, tangchengchang
On Thu, Feb 20, 2025 at 11:48:49AM +0800, Junxian Huang wrote:
> Driver notifies HW about the memory release with mailbox. The procedure
> of a mailbox is:
> a) driver posts the mailbox to FW
> b) FW writes the mailbox data into HW
>
> In this scenario, step a) will fail due to the FW reset, HW won't get
> notified and thus may lead to UAF.
That's just wrong, a FW reset must fully stop and sanitize the HW as
well. You can't have HW running rogue with no way for FW to control it
anymore.
Jason
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism
2025-02-20 11:05 ` Junxian Huang
@ 2025-02-20 14:13 ` Jason Gunthorpe
2025-02-26 9:38 ` Junxian Huang
0 siblings, 1 reply; 23+ messages in thread
From: Jason Gunthorpe @ 2025-02-20 14:13 UTC (permalink / raw)
To: Junxian Huang; +Cc: Leon Romanovsky, linux-rdma, linuxarm, tangchengchang
On Thu, Feb 20, 2025 at 07:05:06PM +0800, Junxian Huang wrote:
> Mailbox carries information of the specific resource (QP/CQ/SRQ/MR)
> that are being destroyed. It's impossible for FW to predict which
> QP/CQ/SRQ/MR will be destroyed by driver during reset before the
> reset starts.
That doesn't make any sense, the device reset is supposed to clean up
everything. It doesn't matter what the mailbox was doing, after the
reset finishes it is no longer necessary because the reset was the
thing that cleaned it up.
You need a way to track the reset completion and cancel all
outstanding commands with a reset failure so cleanup can
happen. Combined with disassociate and some other locking you need to
create a strong fence across the reset where there is no leakage of
'before' and 'after' reset objects and kernel state.
Jason
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism
2025-02-20 14:13 ` Jason Gunthorpe
@ 2025-02-26 9:38 ` Junxian Huang
0 siblings, 0 replies; 23+ messages in thread
From: Junxian Huang @ 2025-02-26 9:38 UTC (permalink / raw)
To: Jason Gunthorpe; +Cc: Leon Romanovsky, linux-rdma, linuxarm, tangchengchang
On 2025/2/20 22:13, Jason Gunthorpe wrote:
> On Thu, Feb 20, 2025 at 07:05:06PM +0800, Junxian Huang wrote:
>
>> Mailbox carries information of the specific resource (QP/CQ/SRQ/MR)
>> that are being destroyed. It's impossible for FW to predict which
>> QP/CQ/SRQ/MR will be destroyed by driver during reset before the
>> reset starts.
>
> That doesn't make any sense, the device reset is supposed to clean up
> everything. It doesn't matter what the mailbox was doing, after the
> reset finishes it is no longer necessary because the reset was the
> thing that cleaned it up.
Yes, our current implementation is exactly what you said. FW reset
will disable HW, trigger driver reset and clean up everything.
>
> You need a way to track the reset completion and cancel all
> outstanding commands with a reset failure so cleanup can
> happen. Combined with disassociate and some other locking you need to
> create a strong fence across the reset where there is no leakage of
> 'before' and 'after' reset objects and kernel state.
This is also what we're trying to do now. Currently we check the
reset status in driver when posting mailbox and fail them during
the reset.
The problem is that there is a time gap between the start of FW
reset and the stop of HW, where driver's mailbox will fail while HW
may still access memory. This gap won't last long but in some extreme
cases it's still possible to cause some errors. We try to address
this with this series by reserving the memory until HW is disabled.
Thanks,
Junxian
>
> Jason
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism
2025-02-20 14:10 ` Jason Gunthorpe
@ 2025-02-26 9:46 ` Junxian Huang
2025-02-26 12:47 ` Leon Romanovsky
0 siblings, 1 reply; 23+ messages in thread
From: Junxian Huang @ 2025-02-26 9:46 UTC (permalink / raw)
To: Jason Gunthorpe; +Cc: Leon Romanovsky, linux-rdma, linuxarm, tangchengchang
On 2025/2/20 22:10, Jason Gunthorpe wrote:
> On Thu, Feb 20, 2025 at 11:48:49AM +0800, Junxian Huang wrote:
>
>> Driver notifies HW about the memory release with mailbox. The procedure
>> of a mailbox is:
>> a) driver posts the mailbox to FW
>> b) FW writes the mailbox data into HW
>>
>> In this scenario, step a) will fail due to the FW reset, HW won't get
>> notified and thus may lead to UAF.
>
> That's just wrong, a FW reset must fully stop and sanitize the HW as
> well. You can't have HW running rogue with no way for FW to control it
> anymore.
>
I agree, but there is a small time gap between the start of FW reset
and the stop of HW. Please see my earlier reply today.
Thanks,
Junxian
> Jason
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism
2025-02-26 9:46 ` Junxian Huang
@ 2025-02-26 12:47 ` Leon Romanovsky
2025-02-26 14:25 ` Junxian Huang
0 siblings, 1 reply; 23+ messages in thread
From: Leon Romanovsky @ 2025-02-26 12:47 UTC (permalink / raw)
To: Junxian Huang; +Cc: Jason Gunthorpe, linux-rdma, linuxarm, tangchengchang
On Wed, Feb 26, 2025 at 05:46:12PM +0800, Junxian Huang wrote:
>
>
> On 2025/2/20 22:10, Jason Gunthorpe wrote:
> > On Thu, Feb 20, 2025 at 11:48:49AM +0800, Junxian Huang wrote:
> >
> >> Driver notifies HW about the memory release with mailbox. The procedure
> >> of a mailbox is:
> >> a) driver posts the mailbox to FW
> >> b) FW writes the mailbox data into HW
> >>
> >> In this scenario, step a) will fail due to the FW reset, HW won't get
> >> notified and thus may lead to UAF.
> >
> > That's just wrong, a FW reset must fully stop and sanitize the HW as
> > well. You can't have HW running rogue with no way for FW to control it
> > anymore.
> >
>
> I agree, but there is a small time gap between the start of FW reset
> and the stop of HW. Please see my earlier reply today.
So stop HW before continuing FW reset.
Thanks
>
> Thanks,
> Junxian
>
> > Jason
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism
2025-02-26 12:47 ` Leon Romanovsky
@ 2025-02-26 14:25 ` Junxian Huang
0 siblings, 0 replies; 23+ messages in thread
From: Junxian Huang @ 2025-02-26 14:25 UTC (permalink / raw)
To: Leon Romanovsky; +Cc: Jason Gunthorpe, linux-rdma, linuxarm, tangchengchang
On 2025/2/26 20:47, Leon Romanovsky wrote:
> On Wed, Feb 26, 2025 at 05:46:12PM +0800, Junxian Huang wrote:
>>
>>
>> On 2025/2/20 22:10, Jason Gunthorpe wrote:
>>> On Thu, Feb 20, 2025 at 11:48:49AM +0800, Junxian Huang wrote:
>>>
>>>> Driver notifies HW about the memory release with mailbox. The procedure
>>>> of a mailbox is:
>>>> a) driver posts the mailbox to FW
>>>> b) FW writes the mailbox data into HW
>>>>
>>>> In this scenario, step a) will fail due to the FW reset, HW won't get
>>>> notified and thus may lead to UAF.
>>>
>>> That's just wrong, a FW reset must fully stop and sanitize the HW as
>>> well. You can't have HW running rogue with no way for FW to control it
>>> anymore.
>>>
>>
>> I agree, but there is a small time gap between the start of FW reset
>> and the stop of HW. Please see my earlier reply today.
>
> So stop HW before continuing FW reset.
FW reset is a passive behavior, not triggered by FW itself and cannot
be predicted by FW either. If the FW is being reset, usually it is
already crashed and can't function normally due to some fatal errors.
When FW starts to reset, there are some necessary initialization
before it can take control of HW again. So there's always a time gap.
Thanks,
Junxian
>
> Thanks
>
>>
>> Thanks,
>> Junxian
>>
>>> Jason
>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism
2025-02-17 7:01 [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism Junxian Huang
` (4 preceding siblings ...)
2025-02-19 12:14 ` [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism Leon Romanovsky
@ 2025-03-18 3:23 ` Junxian Huang
2025-03-18 9:55 ` Leon Romanovsky
2025-04-01 13:39 ` Jason Gunthorpe
5 siblings, 2 replies; 23+ messages in thread
From: Junxian Huang @ 2025-03-18 3:23 UTC (permalink / raw)
To: jgg, leon; +Cc: linux-rdma, linuxarm, tangchengchang, jonathan.cameron
Hi Leon and Jason. After discussions and analysis with our FW team,
we've agreed that FW can stop HW to prevent HW UAF in most FW reset
cases.
But there's still one case where FW cannot intervene when FW reset
is triggered by watchdog due to FW crash, because it is completely
passive for FW. So we still need these patches to prevent this
unlikely but still possible HW UAF case. Is this series okay to be
applied?
Thanks,
Junxian
On 2025/2/17 15:01, Junxian Huang wrote:
> When mailboxes for resource(QP/CQ/SRQ) destruction fail, it's unable
> to notify HW about the destruction. In this case, driver will still
> free the resources, while HW may still access them, thus leading to
> a UAF.
>
> This series introduces delay-destruction mechanism to fix such HW UAF,
> including thw HW CTX and doorbells.
>
> Junxian Huang (2):
> RDMA/hns: Change mtr member to pointer in hns QP/CQ/MR/SRQ/EQ struct
> RDMA/hns: Fix HW doorbell UAF by adding delay-destruction mechanism
>
> wenglianfa (2):
> RDMA/hns: Fix HW CTX UAF by adding delay-destruction mechanism
> Revert "RDMA/hns: Do not destroy QP resources in the hw resetting
> phase"
>
> drivers/infiniband/hw/hns/hns_roce_cq.c | 34 +++--
> drivers/infiniband/hw/hns/hns_roce_db.c | 91 ++++++++++----
> drivers/infiniband/hw/hns/hns_roce_device.h | 73 ++++++++---
> drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 97 +++++++--------
> drivers/infiniband/hw/hns/hns_roce_main.c | 13 ++
> drivers/infiniband/hw/hns/hns_roce_mr.c | 117 ++++++++++++++----
> drivers/infiniband/hw/hns/hns_roce_qp.c | 30 +++--
> drivers/infiniband/hw/hns/hns_roce_restrack.c | 4 +-
> drivers/infiniband/hw/hns/hns_roce_srq.c | 45 ++++---
> 9 files changed, 348 insertions(+), 156 deletions(-)
>
> --
> 2.33.0
>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism
2025-03-18 3:23 ` Junxian Huang
@ 2025-03-18 9:55 ` Leon Romanovsky
2025-04-01 13:39 ` Jason Gunthorpe
1 sibling, 0 replies; 23+ messages in thread
From: Leon Romanovsky @ 2025-03-18 9:55 UTC (permalink / raw)
To: Junxian Huang; +Cc: jgg, linux-rdma, linuxarm, tangchengchang, jonathan.cameron
On Tue, Mar 18, 2025 at 11:23:57AM +0800, Junxian Huang wrote:
> Hi Leon and Jason. After discussions and analysis with our FW team,
> we've agreed that FW can stop HW to prevent HW UAF in most FW reset
> cases.
>
> But there's still one case where FW cannot intervene when FW reset
> is triggered by watchdog due to FW crash, because it is completely
> passive for FW. So we still need these patches to prevent this
> unlikely but still possible HW UAF case. Is this series okay to be
> applied?
I'm sorry, but no.
Thanks
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism
2025-03-18 3:23 ` Junxian Huang
2025-03-18 9:55 ` Leon Romanovsky
@ 2025-04-01 13:39 ` Jason Gunthorpe
2025-04-02 5:50 ` Junxian Huang
1 sibling, 1 reply; 23+ messages in thread
From: Jason Gunthorpe @ 2025-04-01 13:39 UTC (permalink / raw)
To: Junxian Huang
Cc: leon, linux-rdma, linuxarm, tangchengchang, jonathan.cameron
On Tue, Mar 18, 2025 at 11:23:57AM +0800, Junxian Huang wrote:
> Hi Leon and Jason. After discussions and analysis with our FW team,
> we've agreed that FW can stop HW to prevent HW UAF in most FW reset
> cases.
>
> But there's still one case where FW cannot intervene when FW reset
> is triggered by watchdog due to FW crash, because it is completely
> passive for FW. So we still need these patches to prevent this
> unlikely but still possible HW UAF case. Is this series okay to be
> applied?
You need to have an architecture where there are clear rules about
when HW is allowed to access DMA memory and when it is not, then
implement accordingly. Most devices have a hard reset which is a
strong barrier preventing DMA from crossing it. You need something
similar.
For things like mbox failures/timeouts, a timeout means the SW cannot
assume the devices is no longer doing DMA. It would be appropriate to
immediately trigger the reset sequence, wait for it to reach the
barrier, then conclude and error the mbox.
This way this:
> > When mailboxes for resource(QP/CQ/SRQ) destruction fail, it's unable
> > to notify HW about the destruction. In this case, driver will still
> > free the resources, while HW may still access them, thus leading to
> > a UAF.
Changes "destruction fail" into a barrier that waits for the device to
be fenced and DMA to be halted before returning keeping the driver
life cycle together.
You want to avoid "destruction fail" paths that result in the device
continuing to function.
Jason
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism
2025-04-01 13:39 ` Jason Gunthorpe
@ 2025-04-02 5:50 ` Junxian Huang
0 siblings, 0 replies; 23+ messages in thread
From: Junxian Huang @ 2025-04-02 5:50 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: leon, linux-rdma, linuxarm, tangchengchang, jonathan.cameron
On 2025/4/1 21:39, Jason Gunthorpe wrote:
> On Tue, Mar 18, 2025 at 11:23:57AM +0800, Junxian Huang wrote:
>> Hi Leon and Jason. After discussions and analysis with our FW team,
>> we've agreed that FW can stop HW to prevent HW UAF in most FW reset
>> cases.
>>
>> But there's still one case where FW cannot intervene when FW reset
>> is triggered by watchdog due to FW crash, because it is completely
>> passive for FW. So we still need these patches to prevent this
>> unlikely but still possible HW UAF case. Is this series okay to be
>> applied?
>
> You need to have an architecture where there are clear rules about
> when HW is allowed to access DMA memory and when it is not, then
> implement accordingly. Most devices have a hard reset which is a
> strong barrier preventing DMA from crossing it. You need something
> similar.
>
> For things like mbox failures/timeouts, a timeout means the SW cannot
> assume the devices is no longer doing DMA. It would be appropriate to
> immediately trigger the reset sequence, wait for it to reach the
> barrier, then conclude and error the mbox.
>
> This way this:
>
>>> When mailboxes for resource(QP/CQ/SRQ) destruction fail, it's unable
>>> to notify HW about the destruction. In this case, driver will still
>>> free the resources, while HW may still access them, thus leading to
>>> a UAF.
>
> Changes "destruction fail" into a barrier that waits for the device to
> be fenced and DMA to be halted before returning keeping the driver
> life cycle together.
>
> You want to avoid "destruction fail" paths that result in the device
> continuing to function.
>
Thanks, we'll try this way.
Junxian
> Jason
^ permalink raw reply [flat|nested] 23+ messages in thread
end of thread, other threads:[~2025-04-02 5:50 UTC | newest]
Thread overview: 23+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-02-17 7:01 [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism Junxian Huang
2025-02-17 7:01 ` [PATCH for-next 1/4] RDMA/hns: Change mtr member to pointer in hns QP/CQ/MR/SRQ/EQ struct Junxian Huang
2025-02-17 7:01 ` [PATCH for-next 2/4] RDMA/hns: Fix HW CTX UAF by adding delay-destruction mechanism Junxian Huang
2025-02-17 7:01 ` [PATCH for-next 3/4] RDMA/hns: Fix HW doorbell " Junxian Huang
2025-02-17 7:01 ` [PATCH for-next 4/4] Revert "RDMA/hns: Do not destroy QP resources in the hw resetting phase" Junxian Huang
2025-02-19 12:14 ` [PATCH for-next 0/4] RDMA/hns: Introduce delay-destruction mechanism Leon Romanovsky
2025-02-19 13:07 ` Junxian Huang
2025-02-19 14:35 ` Leon Romanovsky
2025-02-20 3:48 ` Junxian Huang
2025-02-20 7:32 ` Leon Romanovsky
2025-02-20 8:45 ` Junxian Huang
2025-02-20 9:08 ` Leon Romanovsky
2025-02-20 11:05 ` Junxian Huang
2025-02-20 14:13 ` Jason Gunthorpe
2025-02-26 9:38 ` Junxian Huang
2025-02-20 14:10 ` Jason Gunthorpe
2025-02-26 9:46 ` Junxian Huang
2025-02-26 12:47 ` Leon Romanovsky
2025-02-26 14:25 ` Junxian Huang
2025-03-18 3:23 ` Junxian Huang
2025-03-18 9:55 ` Leon Romanovsky
2025-04-01 13:39 ` Jason Gunthorpe
2025-04-02 5:50 ` Junxian Huang
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox