Linux RDMA and InfiniBand development
 help / color / mirror / Atom feed
* [PATCH v5 rdma-next 5/6] RDMA/qedr: Add doorbell overflow recovery support
From: Michal Kalderon @ 2019-07-08  9:15 UTC (permalink / raw)
  To: michal.kalderon, ariel.elior, jgg, dledford, galpress
  Cc: linux-rdma, davem, netdev
In-Reply-To: <20190708091503.14723-1-michal.kalderon@marvell.com>

Use the doorbell recovery mechanism to register rdma related doorbells
that will be restored in case there is a doorbell overflow attention.

Signed-off-by: Ariel Elior <ariel.elior@marvell.com>
Signed-off-by: Michal Kalderon <michal.kalderon@marvell.com>
---
 drivers/infiniband/hw/qedr/main.c  |   1 +
 drivers/infiniband/hw/qedr/qedr.h  |   7 +
 drivers/infiniband/hw/qedr/verbs.c | 273 ++++++++++++++++++++++++++++++++-----
 drivers/infiniband/hw/qedr/verbs.h |   2 +
 include/uapi/rdma/qedr-abi.h       |  25 ++++
 5 files changed, 273 insertions(+), 35 deletions(-)

diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c
index 3db4b6ba5ad6..34225c88f03d 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -206,6 +206,7 @@ static const struct ib_device_ops qedr_dev_ops = {
 	.get_link_layer = qedr_link_layer,
 	.map_mr_sg = qedr_map_mr_sg,
 	.mmap = qedr_mmap,
+	.mmap_free = qedr_mmap_free,
 	.modify_port = qedr_modify_port,
 	.modify_qp = qedr_modify_qp,
 	.modify_srq = qedr_modify_srq,
diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h
index 7e80ce521d8d..8aed24b32de6 100644
--- a/drivers/infiniband/hw/qedr/qedr.h
+++ b/drivers/infiniband/hw/qedr/qedr.h
@@ -231,6 +231,7 @@ struct qedr_ucontext {
 	u64 dpi_phys_addr;
 	u32 dpi_size;
 	u16 dpi;
+	bool db_rec;
 };
 
 union db_prod64 {
@@ -258,6 +259,12 @@ struct qedr_userq {
 	struct qedr_pbl *pbl_tbl;
 	u64 buf_addr;
 	size_t buf_len;
+
+	/* doorbell recovery */
+	void __iomem *db_addr;
+	struct qedr_user_db_rec *db_rec_data;
+	u64 db_rec_phys;
+	u64 db_rec_key;
 };
 
 struct qedr_cq {
diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
index f33f0f1e7d76..15221d9c7773 100644
--- a/drivers/infiniband/hw/qedr/verbs.c
+++ b/drivers/infiniband/hw/qedr/verbs.c
@@ -60,6 +60,7 @@
 
 enum {
 	QEDR_USER_MMAP_IO_WC = 0,
+	QEDR_USER_MMAP_PHYS_PAGE,
 };
 
 static inline int qedr_ib_copy_to_udata(struct ib_udata *udata, void *src,
@@ -266,6 +267,7 @@ int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
 	int rc;
 	struct qedr_ucontext *ctx = get_qedr_ucontext(uctx);
 	struct qedr_alloc_ucontext_resp uresp = {};
+	struct qedr_alloc_ucontext_req ureq = {};
 	struct qedr_dev *dev = get_qedr_dev(ibdev);
 	struct qed_rdma_add_user_out_params oparams;
 	u64 key;
@@ -273,6 +275,17 @@ int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
 	if (!udata)
 		return -EFAULT;
 
+	if (udata->inlen) {
+		rc = ib_copy_from_udata(&ureq, udata,
+					min(sizeof(ureq), udata->inlen));
+		if (rc) {
+			DP_ERR(dev, "Problem copying data from user space\n");
+			return -EFAULT;
+		}
+
+		ctx->db_rec = !!(ureq.context_flags & QEDR_ALLOC_UCTX_DB_REC);
+	}
+
 	rc = dev->ops->rdma_add_user(dev->rdma_ctx, &oparams);
 	if (rc) {
 		DP_ERR(dev,
@@ -325,6 +338,13 @@ void qedr_dealloc_ucontext(struct ib_ucontext *ibctx)
 	uctx->dev->ops->rdma_remove_user(uctx->dev->rdma_ctx, uctx->dpi);
 }
 
+void qedr_mmap_free(u64 address, u64 length, u8 mmap_flag)
+{
+	/* DMA mapping is already gone, now free the pages */
+	if (mmap_flag == QEDR_USER_MMAP_PHYS_PAGE)
+		free_page((unsigned long)phys_to_virt(address));
+}
+
 int qedr_mmap(struct ib_ucontext *ucontext, struct vm_area_struct *vma)
 {
 	struct ib_device *dev = ucontext->device;
@@ -368,6 +388,11 @@ int qedr_mmap(struct ib_ucontext *ucontext, struct vm_area_struct *vma)
 		err = rdma_user_mmap_io(ucontext, vma, pfn, length,
 					pgprot_writecombine(vma->vm_page_prot));
 		break;
+	case QEDR_USER_MMAP_PHYS_PAGE:
+		err = vm_insert_page(vma, vma->vm_start, pfn_to_page(pfn));
+		if (err)
+			break;
+		break;
 	default:
 		err = -EINVAL;
 	}
@@ -606,16 +631,48 @@ static void qedr_populate_pbls(struct qedr_dev *dev, struct ib_umem *umem,
 	}
 }
 
+static int qedr_db_recovery_add(struct qedr_dev *dev,
+				void __iomem *db_addr,
+				void *db_data,
+				enum qed_db_rec_width db_width,
+				enum qed_db_rec_space db_space)
+{
+	if (!db_data) {
+		DP_DEBUG(dev, QEDR_MSG_INIT, "avoiding db rec since old lib\n");
+		return 0;
+	}
+
+	return dev->ops->common->db_recovery_add(dev->cdev, db_addr, db_data,
+						 db_width, db_space);
+}
+
+static void qedr_db_recovery_del(struct qedr_dev *dev,
+				 void __iomem *db_addr,
+				 void *db_data)
+{
+	if (!db_data) {
+		DP_DEBUG(dev, QEDR_MSG_INIT, "avoiding db rec since old lib\n");
+		return;
+	}
+
+	/* Ignore return code as there is not much we can do about it. Error
+	 * log will be printed inside.
+	 */
+	dev->ops->common->db_recovery_del(dev->cdev, db_addr, db_data);
+}
+
 static int qedr_copy_cq_uresp(struct qedr_dev *dev,
-			      struct qedr_cq *cq, struct ib_udata *udata)
+			      struct qedr_cq *cq, struct ib_udata *udata,
+			      u32 db_offset)
 {
 	struct qedr_create_cq_uresp uresp;
 	int rc;
 
 	memset(&uresp, 0, sizeof(uresp));
 
-	uresp.db_offset = DB_ADDR_SHIFT(DQ_PWM_OFFSET_UCM_RDMA_CQ_CONS_32BIT);
+	uresp.db_offset = db_offset;
 	uresp.icid = cq->icid;
+	uresp.db_rec_addr = cq->q.db_rec_key;
 
 	rc = qedr_ib_copy_to_udata(udata, &uresp, sizeof(uresp));
 	if (rc)
@@ -643,10 +700,42 @@ static inline int qedr_align_cq_entries(int entries)
 	return aligned_size / QEDR_CQE_SIZE;
 }
 
+static int qedr_init_user_db_rec(struct ib_udata *udata,
+				 struct qedr_dev *dev, struct qedr_userq *q,
+				 bool requires_db_rec)
+{
+	struct qedr_ucontext *uctx =
+		rdma_udata_to_drv_context(udata, struct qedr_ucontext,
+					  ibucontext);
+
+	/* Aborting for non doorbell userqueue (SRQ) or non-supporting lib */
+	if (requires_db_rec == 0 || !uctx->db_rec)
+		return 0;
+
+	/* Allocate a page for doorbell recovery, add to mmap */
+	q->db_rec_data = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!q->db_rec_data) {
+		DP_ERR(dev,
+		       "get_free_page failed\n");
+		return -ENOMEM;
+	}
+
+	q->db_rec_phys = virt_to_phys(q->db_rec_data);
+	q->db_rec_key = rdma_user_mmap_entry_insert(&uctx->ibucontext, q,
+						    q->db_rec_phys,
+						    PAGE_SIZE,
+						    QEDR_USER_MMAP_PHYS_PAGE);
+	if (q->db_rec_key == RDMA_USER_MMAP_INVALID)
+		return -ENOMEM;
+
+	return 0;
+}
+
 static inline int qedr_init_user_queue(struct ib_udata *udata,
 				       struct qedr_dev *dev,
 				       struct qedr_userq *q, u64 buf_addr,
-				       size_t buf_len, int access, int dmasync,
+				       size_t buf_len, bool requires_db_rec,
+				       int access, int dmasync,
 				       int alloc_and_init)
 {
 	u32 fw_pages;
@@ -684,7 +773,8 @@ static inline int qedr_init_user_queue(struct ib_udata *udata,
 		}
 	}
 
-	return 0;
+	/* mmap the user address used to store doorbell data for recovery */
+	return qedr_init_user_db_rec(udata, dev, q, requires_db_rec);
 
 err0:
 	ib_umem_release(q->umem);
@@ -770,6 +860,7 @@ int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 	int entries = attr->cqe;
 	struct qedr_cq *cq = get_qedr_cq(ibcq);
 	int chain_entries;
+	u32 db_offset;
 	int page_cnt;
 	u64 pbl_ptr;
 	u16 icid;
@@ -789,8 +880,12 @@ int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 	chain_entries = qedr_align_cq_entries(entries);
 	chain_entries = min_t(int, chain_entries, QEDR_MAX_CQES);
 
+	/* calc db offset. user will add DPI base, kernel will add db addr */
+	db_offset = DB_ADDR_SHIFT(DQ_PWM_OFFSET_UCM_RDMA_CQ_CONS_32BIT);
+
 	if (udata) {
-		if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) {
+		if (ib_copy_from_udata(&ureq, udata, min(sizeof(ureq),
+							 udata->inlen))) {
 			DP_ERR(dev,
 			       "create cq: problem copying data from user space\n");
 			goto err0;
@@ -805,8 +900,9 @@ int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		cq->cq_type = QEDR_CQ_TYPE_USER;
 
 		rc = qedr_init_user_queue(udata, dev, &cq->q, ureq.addr,
-					  ureq.len, IB_ACCESS_LOCAL_WRITE, 1,
-					  1);
+					  ureq.len, true,
+					  IB_ACCESS_LOCAL_WRITE,
+					  1, 1);
 		if (rc)
 			goto err0;
 
@@ -814,6 +910,7 @@ int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		page_cnt = cq->q.pbl_info.num_pbes;
 
 		cq->ibcq.cqe = chain_entries;
+		cq->q.db_addr = ctx->dpi_addr + db_offset;
 	} else {
 		cq->cq_type = QEDR_CQ_TYPE_KERNEL;
 
@@ -844,14 +941,21 @@ int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 	spin_lock_init(&cq->cq_lock);
 
 	if (udata) {
-		rc = qedr_copy_cq_uresp(dev, cq, udata);
+		rc = qedr_copy_cq_uresp(dev, cq, udata, db_offset);
+		if (rc)
+			goto err3;
+
+		rc = qedr_db_recovery_add(dev, cq->q.db_addr,
+					  &cq->q.db_rec_data->db_data,
+					  DB_REC_WIDTH_64B,
+					  DB_REC_USER);
 		if (rc)
 			goto err3;
+
 	} else {
 		/* Generate doorbell address. */
-		cq->db_addr = dev->db_addr +
-		    DB_ADDR_SHIFT(DQ_PWM_OFFSET_UCM_RDMA_CQ_CONS_32BIT);
 		cq->db.data.icid = cq->icid;
+		cq->db_addr = dev->db_addr + db_offset;
 		cq->db.data.params = DB_AGG_CMD_SET <<
 		    RDMA_PWM_VAL32_DATA_AGG_CMD_SHIFT;
 
@@ -861,6 +965,11 @@ int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		cq->latest_cqe = NULL;
 		consume_cqe(cq);
 		cq->cq_cons = qed_chain_get_cons_idx_u32(&cq->pbl);
+
+		rc = qedr_db_recovery_add(dev, cq->db_addr, &cq->db.data,
+					  DB_REC_WIDTH_64B, DB_REC_KERNEL);
+		if (rc)
+			goto err3;
 	}
 
 	DP_DEBUG(dev, QEDR_MSG_CQ,
@@ -879,8 +988,18 @@ int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 	else
 		dev->ops->common->chain_free(dev->cdev, &cq->pbl);
 err1:
-	if (udata)
+	if (udata) {
 		ib_umem_release(cq->q.umem);
+		if (cq->q.db_rec_data) {
+			qedr_db_recovery_del(dev, cq->q.db_addr,
+					     &cq->q.db_rec_data->db_data);
+			if (cq->q.db_rec_key == RDMA_USER_MMAP_INVALID)
+				free_page((unsigned long)cq->q.db_rec_data);
+			/* o/w will be freed by ib_uverbs on context free */
+		}
+	} else {
+		qedr_db_recovery_del(dev, cq->db_addr, &cq->db.data);
+	}
 err0:
 	return -EINVAL;
 }
@@ -911,8 +1030,10 @@ void qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 	cq->destroyed = 1;
 
 	/* GSIs CQs are handled by driver, so they don't exist in the FW */
-	if (cq->cq_type == QEDR_CQ_TYPE_GSI)
+	if (cq->cq_type == QEDR_CQ_TYPE_GSI) {
+		qedr_db_recovery_del(dev, cq->db_addr, &cq->db.data);
 		return;
+	}
 
 	iparams.icid = cq->icid;
 	dev->ops->rdma_destroy_cq(dev->rdma_ctx, &iparams, &oparams);
@@ -921,6 +1042,12 @@ void qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 	if (udata) {
 		qedr_free_pbl(dev, &cq->q.pbl_info, cq->q.pbl_tbl);
 		ib_umem_release(cq->q.umem);
+
+		if (cq->q.db_rec_data)
+			qedr_db_recovery_del(dev, cq->q.db_addr,
+					     &cq->q.db_rec_data->db_data);
+	} else {
+		qedr_db_recovery_del(dev, cq->db_addr, &cq->db.data);
 	}
 
 	/* We don't want the IRQ handler to handle a non-existing CQ so we
@@ -1085,8 +1212,8 @@ static int qedr_copy_srq_uresp(struct qedr_dev *dev,
 }
 
 static void qedr_copy_rq_uresp(struct qedr_dev *dev,
-			       struct qedr_create_qp_uresp *uresp,
-			       struct qedr_qp *qp)
+			      struct qedr_create_qp_uresp *uresp,
+			      struct qedr_qp *qp)
 {
 	/* iWARP requires two doorbells per RQ. */
 	if (rdma_protocol_iwarp(&dev->ibdev, 1)) {
@@ -1099,6 +1226,7 @@ static void qedr_copy_rq_uresp(struct qedr_dev *dev,
 	}
 
 	uresp->rq_icid = qp->icid;
+	uresp->rq_db_rec_addr = qp->urq.db_rec_key;
 }
 
 static void qedr_copy_sq_uresp(struct qedr_dev *dev,
@@ -1112,22 +1240,24 @@ static void qedr_copy_sq_uresp(struct qedr_dev *dev,
 		uresp->sq_icid = qp->icid;
 	else
 		uresp->sq_icid = qp->icid + 1;
+
+	uresp->sq_db_rec_addr = qp->usq.db_rec_key;
 }
 
 static int qedr_copy_qp_uresp(struct qedr_dev *dev,
-			      struct qedr_qp *qp, struct ib_udata *udata)
+			      struct qedr_qp *qp, struct ib_udata *udata,
+			      struct qedr_create_qp_uresp *uresp)
 {
-	struct qedr_create_qp_uresp uresp;
 	int rc;
 
-	memset(&uresp, 0, sizeof(uresp));
-	qedr_copy_sq_uresp(dev, &uresp, qp);
-	qedr_copy_rq_uresp(dev, &uresp, qp);
+	memset(uresp, 0, sizeof(*uresp));
+	qedr_copy_sq_uresp(dev, uresp, qp);
+	qedr_copy_rq_uresp(dev, uresp, qp);
 
-	uresp.atomic_supported = dev->atomic_cap != IB_ATOMIC_NONE;
-	uresp.qp_id = qp->qp_id;
+	uresp->atomic_supported = dev->atomic_cap != IB_ATOMIC_NONE;
+	uresp->qp_id = qp->qp_id;
 
-	rc = qedr_ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+	rc = qedr_ib_copy_to_udata(udata, uresp, sizeof(*uresp));
 	if (rc)
 		DP_ERR(dev,
 		       "create qp: failed a copy to user space with qp icid=0x%x.\n",
@@ -1171,16 +1301,35 @@ static void qedr_set_common_qp_params(struct qedr_dev *dev,
 		 qp->sq.max_sges, qp->sq_cq->icid);
 }
 
-static void qedr_set_roce_db_info(struct qedr_dev *dev, struct qedr_qp *qp)
+static int qedr_set_roce_db_info(struct qedr_dev *dev, struct qedr_qp *qp)
 {
+	int rc;
+
 	qp->sq.db = dev->db_addr +
 		    DB_ADDR_SHIFT(DQ_PWM_OFFSET_XCM_RDMA_SQ_PROD);
 	qp->sq.db_data.data.icid = qp->icid + 1;
+	rc = qedr_db_recovery_add(dev, qp->sq.db,
+				  &qp->sq.db_data,
+				  DB_REC_WIDTH_32B,
+				  DB_REC_KERNEL);
+	if (rc)
+		return rc;
+
 	if (!qp->srq) {
 		qp->rq.db = dev->db_addr +
 			    DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_ROCE_RQ_PROD);
 		qp->rq.db_data.data.icid = qp->icid;
+
+		rc = qedr_db_recovery_add(dev, qp->rq.db,
+					  &qp->rq.db_data,
+					  DB_REC_WIDTH_32B,
+					  DB_REC_KERNEL);
+		if (rc)
+			qedr_db_recovery_del(dev, qp->sq.db,
+					     &qp->sq.db_data);
 	}
+
+	return rc;
 }
 
 static int qedr_check_srq_params(struct qedr_dev *dev,
@@ -1234,7 +1383,7 @@ static int qedr_init_srq_user_params(struct ib_udata *udata,
 	int rc;
 
 	rc = qedr_init_user_queue(udata, srq->dev, &srq->usrq, ureq->srq_addr,
-				  ureq->srq_len, access, dmasync, 1);
+				  ureq->srq_len, false, access, dmasync, 1);
 	if (rc)
 		return rc;
 
@@ -1330,7 +1479,8 @@ int qedr_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init_attr,
 	hw_srq->max_sges = init_attr->attr.max_sge;
 
 	if (udata) {
-		if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) {
+		if (ib_copy_from_udata(&ureq, udata, min(sizeof(ureq),
+							 udata->inlen))) {
 			DP_ERR(dev,
 			       "create srq: problem copying data from user space\n");
 			goto err0;
@@ -1526,6 +1676,14 @@ static void qedr_cleanup_user(struct qedr_dev *dev, struct qedr_qp *qp)
 
 	ib_umem_release(qp->urq.umem);
 	qp->urq.umem = NULL;
+
+	if (qp->usq.db_rec_data)
+		qedr_db_recovery_del(dev, qp->usq.db_addr,
+				     &qp->usq.db_rec_data->db_data);
+
+	if (qp->urq.db_rec_data)
+		qedr_db_recovery_del(dev, qp->urq.db_addr,
+				     &qp->urq.db_rec_data->db_data);
 }
 
 static int qedr_create_user_qp(struct qedr_dev *dev,
@@ -1537,12 +1695,14 @@ static int qedr_create_user_qp(struct qedr_dev *dev,
 	struct qed_rdma_create_qp_in_params in_params;
 	struct qed_rdma_create_qp_out_params out_params;
 	struct qedr_pd *pd = get_qedr_pd(ibpd);
+	struct qedr_create_qp_uresp uresp;
+	struct qedr_ucontext *ctx = NULL;
 	struct qedr_create_qp_ureq ureq;
 	int alloc_and_init = rdma_protocol_roce(&dev->ibdev, 1);
 	int rc = -EINVAL;
 
 	memset(&ureq, 0, sizeof(ureq));
-	rc = ib_copy_from_udata(&ureq, udata, sizeof(ureq));
+	rc = ib_copy_from_udata(&ureq, udata, min(sizeof(ureq), udata->inlen));
 	if (rc) {
 		DP_ERR(dev, "Problem copying data from user space\n");
 		return rc;
@@ -1550,14 +1710,16 @@ static int qedr_create_user_qp(struct qedr_dev *dev,
 
 	/* SQ - read access only (0), dma sync not required (0) */
 	rc = qedr_init_user_queue(udata, dev, &qp->usq, ureq.sq_addr,
-				  ureq.sq_len, 0, 0, alloc_and_init);
+				  ureq.sq_len, true, 0, 0,
+				  alloc_and_init);
 	if (rc)
 		return rc;
 
 	if (!qp->srq) {
 		/* RQ - read access only (0), dma sync not required (0) */
 		rc = qedr_init_user_queue(udata, dev, &qp->urq, ureq.rq_addr,
-					  ureq.rq_len, 0, 0, alloc_and_init);
+					  ureq.rq_len, true,
+					  0, 0, alloc_and_init);
 		if (rc)
 			return rc;
 	}
@@ -1587,13 +1749,31 @@ static int qedr_create_user_qp(struct qedr_dev *dev,
 	qp->qp_id = out_params.qp_id;
 	qp->icid = out_params.icid;
 
-	rc = qedr_copy_qp_uresp(dev, qp, udata);
+	rc = qedr_copy_qp_uresp(dev, qp, udata, &uresp);
 	if (rc)
 		goto err;
 
+	/* db offset was calculated in copy_qp_uresp, now set in the user q */
+	ctx = pd->uctx;
+	qp->usq.db_addr = ctx->dpi_addr + uresp.sq_db_offset;
+	qp->urq.db_addr = ctx->dpi_addr + uresp.rq_db_offset;
+
+	rc = qedr_db_recovery_add(dev, qp->usq.db_addr,
+				  &qp->usq.db_rec_data->db_data,
+				  DB_REC_WIDTH_32B,
+				  DB_REC_USER);
+	if (rc)
+		goto err;
+
+	rc = qedr_db_recovery_add(dev, qp->urq.db_addr,
+				  &qp->urq.db_rec_data->db_data,
+				  DB_REC_WIDTH_32B,
+				  DB_REC_USER);
+	if (rc)
+		goto err;
 	qedr_qp_user_print(dev, qp);
 
-	return 0;
+	return rc;
 err:
 	rc = dev->ops->rdma_destroy_qp(dev->rdma_ctx, qp->qed_qp);
 	if (rc)
@@ -1604,12 +1784,21 @@ static int qedr_create_user_qp(struct qedr_dev *dev,
 	return rc;
 }
 
-static void qedr_set_iwarp_db_info(struct qedr_dev *dev, struct qedr_qp *qp)
+static int qedr_set_iwarp_db_info(struct qedr_dev *dev, struct qedr_qp *qp)
 {
+	int rc;
+
 	qp->sq.db = dev->db_addr +
 	    DB_ADDR_SHIFT(DQ_PWM_OFFSET_XCM_RDMA_SQ_PROD);
 	qp->sq.db_data.data.icid = qp->icid;
 
+	rc = qedr_db_recovery_add(dev, qp->sq.db,
+				  &qp->sq.db_data,
+				  DB_REC_WIDTH_32B,
+				  DB_REC_KERNEL);
+	if (rc)
+		return rc;
+
 	qp->rq.db = dev->db_addr +
 		    DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_IWARP_RQ_PROD);
 	qp->rq.db_data.data.icid = qp->icid;
@@ -1617,6 +1806,13 @@ static void qedr_set_iwarp_db_info(struct qedr_dev *dev, struct qedr_qp *qp)
 			   DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_FLAGS);
 	qp->rq.iwarp_db2_data.data.icid = qp->icid;
 	qp->rq.iwarp_db2_data.data.value = DQ_TCM_IWARP_POST_RQ_CF_CMD;
+
+	rc = qedr_db_recovery_add(dev, qp->rq.db,
+				  &qp->rq.db_data,
+				  DB_REC_WIDTH_32B,
+				  DB_REC_KERNEL);
+
+	return rc;
 }
 
 static int
@@ -1664,8 +1860,7 @@ qedr_roce_create_kernel_qp(struct qedr_dev *dev,
 	qp->qp_id = out_params.qp_id;
 	qp->icid = out_params.icid;
 
-	qedr_set_roce_db_info(dev, qp);
-	return rc;
+	return qedr_set_roce_db_info(dev, qp);
 }
 
 static int
@@ -1723,8 +1918,7 @@ qedr_iwarp_create_kernel_qp(struct qedr_dev *dev,
 	qp->qp_id = out_params.qp_id;
 	qp->icid = out_params.icid;
 
-	qedr_set_iwarp_db_info(dev, qp);
-	return rc;
+	return qedr_set_iwarp_db_info(dev, qp);
 
 err:
 	dev->ops->rdma_destroy_qp(dev->rdma_ctx, qp->qed_qp);
@@ -1739,6 +1933,15 @@ static void qedr_cleanup_kernel(struct qedr_dev *dev, struct qedr_qp *qp)
 
 	dev->ops->common->chain_free(dev->cdev, &qp->rq.pbl);
 	kfree(qp->rqe_wr_id);
+
+	/* GSI qp is not registered to db mechanism so no need to delete */
+	if (qp->qp_type == IB_QPT_GSI)
+		return;
+
+	qedr_db_recovery_del(dev, qp->sq.db, &qp->sq.db_data);
+
+	if (!qp->srq)
+		qedr_db_recovery_del(dev, qp->rq.db, &qp->rq.db_data);
 }
 
 static int qedr_create_kernel_qp(struct qedr_dev *dev,
diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h
index 724d0983e972..21f0ee356bef 100644
--- a/drivers/infiniband/hw/qedr/verbs.h
+++ b/drivers/infiniband/hw/qedr/verbs.h
@@ -47,6 +47,8 @@ int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata);
 void qedr_dealloc_ucontext(struct ib_ucontext *uctx);
 
 int qedr_mmap(struct ib_ucontext *ucontext, struct vm_area_struct *vma);
+void qedr_mmap_free(u64 address, u64 length, u8 mmap_flag);
+
 int qedr_alloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 void qedr_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 
diff --git a/include/uapi/rdma/qedr-abi.h b/include/uapi/rdma/qedr-abi.h
index 7a10b3a325fa..c022ee26089b 100644
--- a/include/uapi/rdma/qedr-abi.h
+++ b/include/uapi/rdma/qedr-abi.h
@@ -38,6 +38,15 @@
 #define QEDR_ABI_VERSION		(8)
 
 /* user kernel communication data structures. */
+enum qedr_alloc_ucontext_flags {
+	QEDR_ALLOC_UCTX_RESERVED	= 1 << 0,
+	QEDR_ALLOC_UCTX_DB_REC		= 1 << 1
+};
+
+struct qedr_alloc_ucontext_req {
+	__u32 context_flags;
+	__u32 reserved;
+};
 
 struct qedr_alloc_ucontext_resp {
 	__aligned_u64 db_pa;
@@ -74,6 +83,7 @@ struct qedr_create_cq_uresp {
 	__u32 db_offset;
 	__u16 icid;
 	__u16 reserved;
+	__aligned_u64 db_rec_addr;
 };
 
 struct qedr_create_qp_ureq {
@@ -109,6 +119,13 @@ struct qedr_create_qp_uresp {
 
 	__u32 rq_db2_offset;
 	__u32 reserved;
+
+	/* address of SQ doorbell recovery user entry */
+	__aligned_u64 sq_db_rec_addr;
+
+	/* address of RQ doorbell recovery user entry */
+	__aligned_u64 rq_db_rec_addr;
+
 };
 
 struct qedr_create_srq_ureq {
@@ -128,4 +145,12 @@ struct qedr_create_srq_uresp {
 	__u32 reserved1;
 };
 
+/* doorbell recovery entry allocated by the kernel, mapped to userspace and
+ * populated by userspace doorbelling entities. Kernel uses this to register
+ * doorbell information with doorbell drop recovery mechanism.
+ */
+struct qedr_user_db_rec {
+	__aligned_u64 db_data; /* doorbell data */
+};
+
 #endif /* __QEDR_USER_H__ */
-- 
2.14.5


^ permalink raw reply related

* [PATCH v5 rdma-next 6/6] RDMA/qedr: Add iWARP doorbell recovery support
From: Michal Kalderon @ 2019-07-08  9:15 UTC (permalink / raw)
  To: michal.kalderon, ariel.elior, jgg, dledford, galpress
  Cc: linux-rdma, davem, netdev
In-Reply-To: <20190708091503.14723-1-michal.kalderon@marvell.com>

This patch adds the iWARP-specific doorbells to the doorbell
recovery mechanism.

Signed-off-by: Ariel Elior <ariel.elior@marvell.com>
Signed-off-by: Michal Kalderon <michal.kalderon@marvell.com>
---
 drivers/infiniband/hw/qedr/qedr.h  | 12 +++++++-----
 drivers/infiniband/hw/qedr/verbs.c | 37 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h
index 8aed24b32de6..dc9ebbf625d2 100644
--- a/drivers/infiniband/hw/qedr/qedr.h
+++ b/drivers/infiniband/hw/qedr/qedr.h
@@ -234,6 +234,11 @@ struct qedr_ucontext {
 	bool db_rec;
 };
 
+union db_prod32 {
+	struct rdma_pwm_val16_data data;
+	u32 raw;
+};
+
 union db_prod64 {
 	struct rdma_pwm_val32_data data;
 	u64 raw;
@@ -265,6 +270,8 @@ struct qedr_userq {
 	struct qedr_user_db_rec *db_rec_data;
 	u64 db_rec_phys;
 	u64 db_rec_key;
+	void __iomem *db_rec_db2_addr;
+	union db_prod32 db_rec_db2_data;
 };
 
 struct qedr_cq {
@@ -300,11 +307,6 @@ struct qedr_pd {
 	struct qedr_ucontext *uctx;
 };
 
-union db_prod32 {
-	struct rdma_pwm_val16_data data;
-	u32 raw;
-};
-
 struct qedr_qp_hwq_info {
 	/* WQE Elements */
 	struct qed_chain pbl;
diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
index 15221d9c7773..dfe5f6c42a2f 100644
--- a/drivers/infiniband/hw/qedr/verbs.c
+++ b/drivers/infiniband/hw/qedr/verbs.c
@@ -1684,6 +1684,10 @@ static void qedr_cleanup_user(struct qedr_dev *dev, struct qedr_qp *qp)
 	if (qp->urq.db_rec_data)
 		qedr_db_recovery_del(dev, qp->urq.db_addr,
 				     &qp->urq.db_rec_data->db_data);
+
+	if (rdma_protocol_iwarp(&dev->ibdev, 1))
+		qedr_db_recovery_del(dev, qp->urq.db_rec_db2_addr,
+				     &qp->urq.db_rec_db2_data);
 }
 
 static int qedr_create_user_qp(struct qedr_dev *dev,
@@ -1758,6 +1762,17 @@ static int qedr_create_user_qp(struct qedr_dev *dev,
 	qp->usq.db_addr = ctx->dpi_addr + uresp.sq_db_offset;
 	qp->urq.db_addr = ctx->dpi_addr + uresp.rq_db_offset;
 
+	if (rdma_protocol_iwarp(&dev->ibdev, 1)) {
+		qp->urq.db_rec_db2_addr = ctx->dpi_addr + uresp.rq_db2_offset;
+
+		/* calculate the db_rec_db2 data since it is constant so no
+		 *  need to reflect from user
+		 */
+		qp->urq.db_rec_db2_data.data.icid = cpu_to_le16(qp->icid);
+		qp->urq.db_rec_db2_data.data.value =
+			cpu_to_le16(DQ_TCM_IWARP_POST_RQ_CF_CMD);
+	}
+
 	rc = qedr_db_recovery_add(dev, qp->usq.db_addr,
 				  &qp->usq.db_rec_data->db_data,
 				  DB_REC_WIDTH_32B,
@@ -1771,6 +1786,15 @@ static int qedr_create_user_qp(struct qedr_dev *dev,
 				  DB_REC_USER);
 	if (rc)
 		goto err;
+
+	if (rdma_protocol_iwarp(&dev->ibdev, 1)) {
+		rc = qedr_db_recovery_add(dev, qp->urq.db_rec_db2_addr,
+					  &qp->urq.db_rec_db2_data,
+					  DB_REC_WIDTH_32B,
+					  DB_REC_USER);
+		if (rc)
+			goto err;
+	}
 	qedr_qp_user_print(dev, qp);
 
 	return rc;
@@ -1811,7 +1835,13 @@ static int qedr_set_iwarp_db_info(struct qedr_dev *dev, struct qedr_qp *qp)
 				  &qp->rq.db_data,
 				  DB_REC_WIDTH_32B,
 				  DB_REC_KERNEL);
+	if (rc)
+		return rc;
 
+	rc = qedr_db_recovery_add(dev, qp->rq.iwarp_db2,
+				  &qp->rq.iwarp_db2_data,
+				  DB_REC_WIDTH_32B,
+				  DB_REC_KERNEL);
 	return rc;
 }
 
@@ -1940,8 +1970,13 @@ static void qedr_cleanup_kernel(struct qedr_dev *dev, struct qedr_qp *qp)
 
 	qedr_db_recovery_del(dev, qp->sq.db, &qp->sq.db_data);
 
-	if (!qp->srq)
+	if (!qp->srq) {
 		qedr_db_recovery_del(dev, qp->rq.db, &qp->rq.db_data);
+
+		if (rdma_protocol_iwarp(&dev->ibdev, 1))
+			qedr_db_recovery_del(dev, qp->rq.iwarp_db2,
+					     &qp->rq.iwarp_db2_data);
+	}
 }
 
 static int qedr_create_kernel_qp(struct qedr_dev *dev,
-- 
2.14.5


^ permalink raw reply related

* Re: [PATCH] Make rxe driver to calculate correct byte_len on receiving side when work completion is generated with IB_WC_RECV_RDMA_WITH_IMM opcode.
From: Yanjun Zhu @ 2019-07-08  1:57 UTC (permalink / raw)
  To: Konstantin Taranov; +Cc: monis, linux-rdma
In-Reply-To: <20190708034621.101b25dc@ktaranov-laptop>


On 2019/7/8 9:46, Konstantin Taranov wrote:
> On Mon, 8 Jul 2019 07:35:24 +0800
> Zhu Yanjun <yanjun.zhu@oracle.com> wrote:
>
>> 在 2019/7/8 5:23, Konstantin Taranov 写道:
>>> On Wed, 3 Jul 2019 09:24:54 +0800
>>> Yanjun Zhu <yanjun.zhu@oracle.com> wrote:
>>>   
>>>> On 2019/6/27 22:06, Konstantin Taranov wrote:
>>>>> Make softRoce to calculate correct byte_len on receiving side when work completion
>>>>> is generated with IB_WC_RECV_RDMA_WITH_IMM opcode.
>>>>>
>>>>> According to documentation byte_len must indicate the number of written
>>>>> bytes, whereas it was always equal to zero for IB_WC_RECV_RDMA_WITH_IMM opcode.
>>>> With roce NIC, what is the byte_len? Thanks a lot.
>>> byte_len is a field of a work completion (struct ib_uverbs_wc or struct ibv_wc). It is defined in verbs and stores
>>> the number of written bytes to the destination memory. In case of IB_WC_RECV_RDMA_WITH_IMM
>>> completion event, the field byte_len must store the number of written bytes for incoming
>>> RDMA_WRITE_WITH_IMM request.
>> Cool. Thanks for your explanations.
>>
>> The above is the test result of physical RoCE NIC?
>>
> Yes. When I use physical nics, the byte_len indicates the number of received bytes.
> It also fully complies with what is written in https://www.rdmamojo.com/2013/02/15/ibv_poll_cq/ about the byte_len field.

Nice. I am fine with this patch.

Thanks a lot.

Zhu Yanjun

>   
>
>
>> Thanks.
>>
>> Zhu Yanjun
>>
>>>   
>>>> Zhu Yanjun
>>>>   
>>>>> The patch proposes to remember the length of an RDMA request from the RETH header, and use it
>>>>> as byte_len when the work completion with IB_WC_RECV_RDMA_WITH_IMM opcode is generated.
>>>>>
>>>>> Signed-off-by: Konstantin Taranov <konstantin.taranov@inf.ethz.ch>
>>>>> ---
>>>>>     drivers/infiniband/sw/rxe/rxe_resp.c  | 5 ++++-
>>>>>     drivers/infiniband/sw/rxe/rxe_verbs.h | 1 +
>>>>>     2 files changed, 5 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
>>>>> index aca9f60f9b21..1cbfbd98eb22 100644
>>>>> --- a/drivers/infiniband/sw/rxe/rxe_resp.c
>>>>> +++ b/drivers/infiniband/sw/rxe/rxe_resp.c
>>>>> @@ -431,6 +431,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp,
>>>>>     			qp->resp.va = reth_va(pkt);
>>>>>     			qp->resp.rkey = reth_rkey(pkt);
>>>>>     			qp->resp.resid = reth_len(pkt);
>>>>> +			qp->resp.length = reth_len(pkt);
>>>>>     		}
>>>>>     		access = (pkt->mask & RXE_READ_MASK) ? IB_ACCESS_REMOTE_READ
>>>>>     						     : IB_ACCESS_REMOTE_WRITE;
>>>>> @@ -856,7 +857,9 @@ static enum resp_states do_complete(struct rxe_qp *qp,
>>>>>     				pkt->mask & RXE_WRITE_MASK) ?
>>>>>     					IB_WC_RECV_RDMA_WITH_IMM : IB_WC_RECV;
>>>>>     		wc->vendor_err = 0;
>>>>> -		wc->byte_len = wqe->dma.length - wqe->dma.resid;
>>>>> +		wc->byte_len = (pkt->mask & RXE_IMMDT_MASK &&
>>>>> +				pkt->mask & RXE_WRITE_MASK) ?
>>>>> +					qp->resp.length : wqe->dma.length - wqe->dma.resid;
>>>>>     
>>>>>     		/* fields after byte_len are different between kernel and user
>>>>>     		 * space
>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
>>>>> index e8be7f44e3be..28bfb3ece104 100644
>>>>> --- a/drivers/infiniband/sw/rxe/rxe_verbs.h
>>>>> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
>>>>> @@ -213,6 +213,7 @@ struct rxe_resp_info {
>>>>>     	struct rxe_mem		*mr;
>>>>>     	u32			resid;
>>>>>     	u32			rkey;
>>>>> +	u32			length;
>>>>>     	u64			atomic_orig;
>>>>>     
>>>>>     	/* SRQ only */

^ permalink raw reply

* Re: [PATCH] Make rxe driver to calculate correct byte_len on receiving side when work completion is generated with IB_WC_RECV_RDMA_WITH_IMM opcode.
From: Konstantin Taranov @ 2019-07-08  1:46 UTC (permalink / raw)
  To: Zhu Yanjun; +Cc: monis, linux-rdma
In-Reply-To: <a58291a3-8b04-49bf-6c10-202b8ba426ac@oracle.com>

On Mon, 8 Jul 2019 07:35:24 +0800
Zhu Yanjun <yanjun.zhu@oracle.com> wrote:

> 在 2019/7/8 5:23, Konstantin Taranov 写道:
> > On Wed, 3 Jul 2019 09:24:54 +0800
> > Yanjun Zhu <yanjun.zhu@oracle.com> wrote:
> >  
> >> On 2019/6/27 22:06, Konstantin Taranov wrote:  
> >>> Make softRoce to calculate correct byte_len on receiving side when work completion
> >>> is generated with IB_WC_RECV_RDMA_WITH_IMM opcode.
> >>>
> >>> According to documentation byte_len must indicate the number of written
> >>> bytes, whereas it was always equal to zero for IB_WC_RECV_RDMA_WITH_IMM opcode.  
> >> With roce NIC, what is the byte_len? Thanks a lot.  
> > byte_len is a field of a work completion (struct ib_uverbs_wc or struct ibv_wc). It is defined in verbs and stores
> > the number of written bytes to the destination memory. In case of IB_WC_RECV_RDMA_WITH_IMM
> > completion event, the field byte_len must store the number of written bytes for incoming
> > RDMA_WRITE_WITH_IMM request.  
> 
> Cool. Thanks for your explanations.
> 
> The above is the test result of physical RoCE NIC?
> 
Yes. When I use physical nics, the byte_len indicates the number of received bytes.
It also fully complies with what is written in https://www.rdmamojo.com/2013/02/15/ibv_poll_cq/ about the byte_len field.


> Thanks.
> 
> Zhu Yanjun
> 
> >  
> >> Zhu Yanjun
> >>  
> >>> The patch proposes to remember the length of an RDMA request from the RETH header, and use it
> >>> as byte_len when the work completion with IB_WC_RECV_RDMA_WITH_IMM opcode is generated.
> >>>
> >>> Signed-off-by: Konstantin Taranov <konstantin.taranov@inf.ethz.ch>
> >>> ---
> >>>    drivers/infiniband/sw/rxe/rxe_resp.c  | 5 ++++-
> >>>    drivers/infiniband/sw/rxe/rxe_verbs.h | 1 +
> >>>    2 files changed, 5 insertions(+), 1 deletion(-)
> >>>
> >>> diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
> >>> index aca9f60f9b21..1cbfbd98eb22 100644
> >>> --- a/drivers/infiniband/sw/rxe/rxe_resp.c
> >>> +++ b/drivers/infiniband/sw/rxe/rxe_resp.c
> >>> @@ -431,6 +431,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp,
> >>>    			qp->resp.va = reth_va(pkt);
> >>>    			qp->resp.rkey = reth_rkey(pkt);
> >>>    			qp->resp.resid = reth_len(pkt);
> >>> +			qp->resp.length = reth_len(pkt);
> >>>    		}
> >>>    		access = (pkt->mask & RXE_READ_MASK) ? IB_ACCESS_REMOTE_READ
> >>>    						     : IB_ACCESS_REMOTE_WRITE;
> >>> @@ -856,7 +857,9 @@ static enum resp_states do_complete(struct rxe_qp *qp,
> >>>    				pkt->mask & RXE_WRITE_MASK) ?
> >>>    					IB_WC_RECV_RDMA_WITH_IMM : IB_WC_RECV;
> >>>    		wc->vendor_err = 0;
> >>> -		wc->byte_len = wqe->dma.length - wqe->dma.resid;
> >>> +		wc->byte_len = (pkt->mask & RXE_IMMDT_MASK &&
> >>> +				pkt->mask & RXE_WRITE_MASK) ?
> >>> +					qp->resp.length : wqe->dma.length - wqe->dma.resid;
> >>>    
> >>>    		/* fields after byte_len are different between kernel and user
> >>>    		 * space
> >>> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
> >>> index e8be7f44e3be..28bfb3ece104 100644
> >>> --- a/drivers/infiniband/sw/rxe/rxe_verbs.h
> >>> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
> >>> @@ -213,6 +213,7 @@ struct rxe_resp_info {
> >>>    	struct rxe_mem		*mr;
> >>>    	u32			resid;
> >>>    	u32			rkey;
> >>> +	u32			length;
> >>>    	u64			atomic_orig;
> >>>    
> >>>    	/* SRQ only */  


^ permalink raw reply

* Re: [PATCH] Make rxe driver to calculate correct byte_len on receiving side when work completion is generated with IB_WC_RECV_RDMA_WITH_IMM opcode.
From: Zhu Yanjun @ 2019-07-07 23:35 UTC (permalink / raw)
  To: Konstantin Taranov; +Cc: monis, linux-rdma
In-Reply-To: <20190707231126.774bdd6e@ktaranov-laptop>


在 2019/7/8 5:23, Konstantin Taranov 写道:
> On Wed, 3 Jul 2019 09:24:54 +0800
> Yanjun Zhu <yanjun.zhu@oracle.com> wrote:
>
>> On 2019/6/27 22:06, Konstantin Taranov wrote:
>>> Make softRoce to calculate correct byte_len on receiving side when work completion
>>> is generated with IB_WC_RECV_RDMA_WITH_IMM opcode.
>>>
>>> According to documentation byte_len must indicate the number of written
>>> bytes, whereas it was always equal to zero for IB_WC_RECV_RDMA_WITH_IMM opcode.
>> With roce NIC, what is the byte_len? Thanks a lot.
> byte_len is a field of a work completion (struct ib_uverbs_wc or struct ibv_wc). It is defined in verbs and stores
> the number of written bytes to the destination memory. In case of IB_WC_RECV_RDMA_WITH_IMM
> completion event, the field byte_len must store the number of written bytes for incoming
> RDMA_WRITE_WITH_IMM request.

Cool. Thanks for your explanations.

The above is the test result of physical RoCE NIC?

Thanks.

Zhu Yanjun

>
>> Zhu Yanjun
>>
>>> The patch proposes to remember the length of an RDMA request from the RETH header, and use it
>>> as byte_len when the work completion with IB_WC_RECV_RDMA_WITH_IMM opcode is generated.
>>>
>>> Signed-off-by: Konstantin Taranov <konstantin.taranov@inf.ethz.ch>
>>> ---
>>>    drivers/infiniband/sw/rxe/rxe_resp.c  | 5 ++++-
>>>    drivers/infiniband/sw/rxe/rxe_verbs.h | 1 +
>>>    2 files changed, 5 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
>>> index aca9f60f9b21..1cbfbd98eb22 100644
>>> --- a/drivers/infiniband/sw/rxe/rxe_resp.c
>>> +++ b/drivers/infiniband/sw/rxe/rxe_resp.c
>>> @@ -431,6 +431,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp,
>>>    			qp->resp.va = reth_va(pkt);
>>>    			qp->resp.rkey = reth_rkey(pkt);
>>>    			qp->resp.resid = reth_len(pkt);
>>> +			qp->resp.length = reth_len(pkt);
>>>    		}
>>>    		access = (pkt->mask & RXE_READ_MASK) ? IB_ACCESS_REMOTE_READ
>>>    						     : IB_ACCESS_REMOTE_WRITE;
>>> @@ -856,7 +857,9 @@ static enum resp_states do_complete(struct rxe_qp *qp,
>>>    				pkt->mask & RXE_WRITE_MASK) ?
>>>    					IB_WC_RECV_RDMA_WITH_IMM : IB_WC_RECV;
>>>    		wc->vendor_err = 0;
>>> -		wc->byte_len = wqe->dma.length - wqe->dma.resid;
>>> +		wc->byte_len = (pkt->mask & RXE_IMMDT_MASK &&
>>> +				pkt->mask & RXE_WRITE_MASK) ?
>>> +					qp->resp.length : wqe->dma.length - wqe->dma.resid;
>>>    
>>>    		/* fields after byte_len are different between kernel and user
>>>    		 * space
>>> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
>>> index e8be7f44e3be..28bfb3ece104 100644
>>> --- a/drivers/infiniband/sw/rxe/rxe_verbs.h
>>> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
>>> @@ -213,6 +213,7 @@ struct rxe_resp_info {
>>>    	struct rxe_mem		*mr;
>>>    	u32			resid;
>>>    	u32			rkey;
>>> +	u32			length;
>>>    	u64			atomic_orig;
>>>    
>>>    	/* SRQ only */

^ permalink raw reply

* Re: [PATCH] Make rxe driver to calculate correct byte_len on receiving side when work completion is generated with IB_WC_RECV_RDMA_WITH_IMM opcode.
From: Konstantin Taranov @ 2019-07-07 21:23 UTC (permalink / raw)
  To: Yanjun Zhu; +Cc: monis, linux-rdma
In-Reply-To: <d149da15-523a-438a-1550-095b4b1a840b@oracle.com>

On Wed, 3 Jul 2019 09:24:54 +0800
Yanjun Zhu <yanjun.zhu@oracle.com> wrote:

> On 2019/6/27 22:06, Konstantin Taranov wrote:
> > Make softRoce to calculate correct byte_len on receiving side when work completion
> > is generated with IB_WC_RECV_RDMA_WITH_IMM opcode.
> >
> > According to documentation byte_len must indicate the number of written
> > bytes, whereas it was always equal to zero for IB_WC_RECV_RDMA_WITH_IMM opcode.  
> 
> With roce NIC, what is the byte_len? Thanks a lot.

byte_len is a field of a work completion (struct ib_uverbs_wc or struct ibv_wc). It is defined in verbs and stores
the number of written bytes to the destination memory. In case of IB_WC_RECV_RDMA_WITH_IMM
completion event, the field byte_len must store the number of written bytes for incoming
RDMA_WRITE_WITH_IMM request. 

> 
> Zhu Yanjun
> 
> >
> > The patch proposes to remember the length of an RDMA request from the RETH header, and use it
> > as byte_len when the work completion with IB_WC_RECV_RDMA_WITH_IMM opcode is generated.
> >
> > Signed-off-by: Konstantin Taranov <konstantin.taranov@inf.ethz.ch>
> > ---
> >   drivers/infiniband/sw/rxe/rxe_resp.c  | 5 ++++-
> >   drivers/infiniband/sw/rxe/rxe_verbs.h | 1 +
> >   2 files changed, 5 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
> > index aca9f60f9b21..1cbfbd98eb22 100644
> > --- a/drivers/infiniband/sw/rxe/rxe_resp.c
> > +++ b/drivers/infiniband/sw/rxe/rxe_resp.c
> > @@ -431,6 +431,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp,
> >   			qp->resp.va = reth_va(pkt);
> >   			qp->resp.rkey = reth_rkey(pkt);
> >   			qp->resp.resid = reth_len(pkt);
> > +			qp->resp.length = reth_len(pkt);
> >   		}
> >   		access = (pkt->mask & RXE_READ_MASK) ? IB_ACCESS_REMOTE_READ
> >   						     : IB_ACCESS_REMOTE_WRITE;
> > @@ -856,7 +857,9 @@ static enum resp_states do_complete(struct rxe_qp *qp,
> >   				pkt->mask & RXE_WRITE_MASK) ?
> >   					IB_WC_RECV_RDMA_WITH_IMM : IB_WC_RECV;
> >   		wc->vendor_err = 0;
> > -		wc->byte_len = wqe->dma.length - wqe->dma.resid;
> > +		wc->byte_len = (pkt->mask & RXE_IMMDT_MASK &&
> > +				pkt->mask & RXE_WRITE_MASK) ?
> > +					qp->resp.length : wqe->dma.length - wqe->dma.resid;
> >   
> >   		/* fields after byte_len are different between kernel and user
> >   		 * space
> > diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
> > index e8be7f44e3be..28bfb3ece104 100644
> > --- a/drivers/infiniband/sw/rxe/rxe_verbs.h
> > +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
> > @@ -213,6 +213,7 @@ struct rxe_resp_info {
> >   	struct rxe_mem		*mr;
> >   	u32			resid;
> >   	u32			rkey;
> > +	u32			length;
> >   	u64			atomic_orig;
> >   
> >   	/* SRQ only */  


^ permalink raw reply

* [PATCH for-next] RDMA/efa: Expose device statistics
From: Gal Pressman @ 2019-07-07 14:20 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: linux-rdma, Gal Pressman, Firas JahJah, Yossi Leybovich

Expose hardware statistics through the sysfs api:
/sys/class/infiniband/efa_0/hw_counters/*.

Reviewed-by: Firas JahJah <firasj@amazon.com>
Reviewed-by: Yossi Leybovich <sleybo@amazon.com>
Signed-off-by: Gal Pressman <galpress@amazon.com>
---
 drivers/infiniband/hw/efa/efa.h         |  3 +
 drivers/infiniband/hw/efa/efa_com_cmd.c | 35 +++++++++++
 drivers/infiniband/hw/efa/efa_com_cmd.h | 23 +++++++
 drivers/infiniband/hw/efa/efa_main.c    |  2 +
 drivers/infiniband/hw/efa/efa_verbs.c   | 79 +++++++++++++++++++++++++
 5 files changed, 142 insertions(+)

diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h
index 119f8efec564..2283e432693e 100644
--- a/drivers/infiniband/hw/efa/efa.h
+++ b/drivers/infiniband/hw/efa/efa.h
@@ -156,5 +156,8 @@ int efa_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
 		  int qp_attr_mask, struct ib_udata *udata);
 enum rdma_link_layer efa_port_link_layer(struct ib_device *ibdev,
 					 u8 port_num);
+struct rdma_hw_stats *efa_alloc_hw_stats(struct ib_device *ibdev, u8 port_num);
+int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats,
+		     u8 port_num, int index);
 
 #endif /* _EFA_H_ */
diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.c b/drivers/infiniband/hw/efa/efa_com_cmd.c
index 62345d8abf3c..16b115df63e8 100644
--- a/drivers/infiniband/hw/efa/efa_com_cmd.c
+++ b/drivers/infiniband/hw/efa/efa_com_cmd.c
@@ -702,3 +702,38 @@ int efa_com_dealloc_uar(struct efa_com_dev *edev,
 
 	return 0;
 }
+
+int efa_com_get_stats(struct efa_com_dev *edev,
+		      struct efa_com_get_stats_params *params,
+		      union efa_com_get_stats_result *result)
+{
+	struct efa_com_admin_queue *aq = &edev->aq;
+	struct efa_admin_aq_get_stats_cmd cmd = {};
+	struct efa_admin_acq_get_stats_resp resp;
+	int err;
+
+	cmd.aq_common_descriptor.opcode = EFA_ADMIN_GET_STATS;
+	cmd.type = params->type;
+	cmd.scope = params->scope;
+	cmd.scope_modifier = params->scope_modifier;
+
+	err = efa_com_cmd_exec(aq,
+			       (struct efa_admin_aq_entry *)&cmd,
+			       sizeof(cmd),
+			       (struct efa_admin_acq_entry *)&resp,
+			       sizeof(resp));
+	if (err) {
+		ibdev_err(edev->efa_dev,
+			  "Failed to get stats type-%u scope-%u.%u [%d]\n",
+			  cmd.type, cmd.scope, cmd.scope_modifier, err);
+		return err;
+	}
+
+	result->basic_stats.tx_bytes = resp.basic_stats.tx_bytes;
+	result->basic_stats.tx_pkts = resp.basic_stats.tx_pkts;
+	result->basic_stats.rx_bytes = resp.basic_stats.rx_bytes;
+	result->basic_stats.rx_pkts = resp.basic_stats.rx_pkts;
+	result->basic_stats.rx_drops = resp.basic_stats.rx_drops;
+
+	return 0;
+}
diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.h b/drivers/infiniband/hw/efa/efa_com_cmd.h
index a1174380462c..7f6c13052f49 100644
--- a/drivers/infiniband/hw/efa/efa_com_cmd.h
+++ b/drivers/infiniband/hw/efa/efa_com_cmd.h
@@ -225,6 +225,26 @@ struct efa_com_dealloc_uar_params {
 	u16 uarn;
 };
 
+struct efa_com_get_stats_params {
+	/* see enum efa_admin_get_stats_type */
+	u8 type;
+	/* see enum efa_admin_get_stats_scope */
+	u8 scope;
+	u16 scope_modifier;
+};
+
+struct efa_com_basic_stats {
+	u64 tx_bytes;
+	u64 tx_pkts;
+	u64 rx_bytes;
+	u64 rx_pkts;
+	u64 rx_drops;
+};
+
+union efa_com_get_stats_result {
+	struct efa_com_basic_stats basic_stats;
+};
+
 void efa_com_set_dma_addr(dma_addr_t addr, u32 *addr_high, u32 *addr_low);
 int efa_com_create_qp(struct efa_com_dev *edev,
 		      struct efa_com_create_qp_params *params,
@@ -266,5 +286,8 @@ int efa_com_alloc_uar(struct efa_com_dev *edev,
 		      struct efa_com_alloc_uar_result *result);
 int efa_com_dealloc_uar(struct efa_com_dev *edev,
 			struct efa_com_dealloc_uar_params *params);
+int efa_com_get_stats(struct efa_com_dev *edev,
+		      struct efa_com_get_stats_params *params,
+		      union efa_com_get_stats_result *result);
 
 #endif /* _EFA_COM_CMD_H_ */
diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c
index dd1c6d49466f..83858f7e83d0 100644
--- a/drivers/infiniband/hw/efa/efa_main.c
+++ b/drivers/infiniband/hw/efa/efa_main.c
@@ -201,6 +201,7 @@ static const struct ib_device_ops efa_dev_ops = {
 	.driver_id = RDMA_DRIVER_EFA,
 	.uverbs_abi_ver = EFA_UVERBS_ABI_VERSION,
 
+	.alloc_hw_stats = efa_alloc_hw_stats,
 	.alloc_pd = efa_alloc_pd,
 	.alloc_ucontext = efa_alloc_ucontext,
 	.create_ah = efa_create_ah,
@@ -212,6 +213,7 @@ static const struct ib_device_ops efa_dev_ops = {
 	.destroy_ah = efa_destroy_ah,
 	.destroy_cq = efa_destroy_cq,
 	.destroy_qp = efa_destroy_qp,
+	.get_hw_stats = efa_get_hw_stats,
 	.get_link_layer = efa_port_link_layer,
 	.get_port_immutable = efa_get_port_immutable,
 	.mmap = efa_mmap,
diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
index df77bc312a25..3c10e733758e 100644
--- a/drivers/infiniband/hw/efa/efa_verbs.c
+++ b/drivers/infiniband/hw/efa/efa_verbs.c
@@ -41,6 +41,33 @@ static inline u64 get_mmap_key(const struct efa_mmap_entry *efa)
 	       ((u64)efa->mmap_page << PAGE_SHIFT);
 }
 
+#define EFA_DEFINE_STATS(op) \
+	op(EFA_TX_BYTES, "tx_bytes") \
+	op(EFA_TX_PKTS, "tx_pkts") \
+	op(EFA_RX_BYTES, "rx_bytes") \
+	op(EFA_RX_PKTS, "rx_pkts") \
+	op(EFA_RX_DROPS, "rx_drops") \
+	op(EFA_SUBMITTED_CMDS, "submitted_cmds") \
+	op(EFA_COMPLETED_CMDS, "completed_cmds") \
+	op(EFA_NO_COMPLETION_CMDS, "no_completion_cmds") \
+	op(EFA_KEEP_ALIVE_RCVD, "keep_alive_rcvd") \
+	op(EFA_ALLOC_PD_ERR, "alloc_pd_err") \
+	op(EFA_CREATE_QP_ERR, "create_qp_err") \
+	op(EFA_REG_MR_ERR, "reg_mr_err") \
+	op(EFA_ALLOC_UCONTEXT_ERR, "alloc_ucontext_err") \
+	op(EFA_CREATE_AH_ERR, "create_ah_err")
+
+#define EFA_STATS_ENUM(ename, name) ename,
+#define EFA_STATS_STR(ename, name) [ename] = name,
+
+enum efa_hw_stats {
+	EFA_DEFINE_STATS(EFA_STATS_ENUM)
+};
+
+static const char *const efa_stats_names[] = {
+	EFA_DEFINE_STATS(EFA_STATS_STR)
+};
+
 #define EFA_CHUNK_PAYLOAD_SHIFT       12
 #define EFA_CHUNK_PAYLOAD_SIZE        BIT(EFA_CHUNK_PAYLOAD_SHIFT)
 #define EFA_CHUNK_PAYLOAD_PTR_SIZE    8
@@ -1727,6 +1754,58 @@ void efa_destroy_ah(struct ib_ah *ibah, u32 flags)
 	efa_ah_destroy(dev, ah);
 }
 
+struct rdma_hw_stats *efa_alloc_hw_stats(struct ib_device *ibdev, u8 port_num)
+{
+	/* Stats are per device */
+	if (port_num)
+		return NULL;
+
+	return rdma_alloc_hw_stats_struct(efa_stats_names,
+					  ARRAY_SIZE(efa_stats_names),
+					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
+}
+
+int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats,
+		     u8 port_num, int index)
+{
+	struct efa_com_get_stats_params params = {};
+	union efa_com_get_stats_result result;
+	struct efa_dev *dev = to_edev(ibdev);
+	struct efa_com_basic_stats *bs;
+	struct efa_com_stats_admin *as;
+	struct efa_stats *s;
+	int err;
+
+	params.type = EFA_ADMIN_GET_STATS_TYPE_BASIC;
+	params.scope = EFA_ADMIN_GET_STATS_SCOPE_ALL;
+
+	err = efa_com_get_stats(&dev->edev, &params, &result);
+	if (err)
+		return err;
+
+	bs = &result.basic_stats;
+	stats->value[EFA_TX_BYTES] = bs->tx_bytes;
+	stats->value[EFA_TX_PKTS] = bs->tx_pkts;
+	stats->value[EFA_RX_BYTES] = bs->rx_bytes;
+	stats->value[EFA_RX_PKTS] = bs->rx_pkts;
+	stats->value[EFA_RX_DROPS] = bs->rx_drops;
+
+	as = &dev->edev.aq.stats;
+	stats->value[EFA_SUBMITTED_CMDS] = atomic64_read(&as->submitted_cmd);
+	stats->value[EFA_COMPLETED_CMDS] = atomic64_read(&as->completed_cmd);
+	stats->value[EFA_NO_COMPLETION_CMDS] = atomic64_read(&as->no_completion);
+
+	s = &dev->stats;
+	stats->value[EFA_KEEP_ALIVE_RCVD] = atomic64_read(&s->keep_alive_rcvd);
+	stats->value[EFA_ALLOC_PD_ERR] = atomic64_read(&s->sw_stats.alloc_pd_err);
+	stats->value[EFA_CREATE_QP_ERR] = atomic64_read(&s->sw_stats.create_qp_err);
+	stats->value[EFA_REG_MR_ERR] = atomic64_read(&s->sw_stats.reg_mr_err);
+	stats->value[EFA_ALLOC_UCONTEXT_ERR] = atomic64_read(&s->sw_stats.alloc_ucontext_err);
+	stats->value[EFA_CREATE_AH_ERR] = atomic64_read(&s->sw_stats.create_ah_err);
+
+	return ARRAY_SIZE(efa_stats_names);
+}
+
 enum rdma_link_layer efa_port_link_layer(struct ib_device *ibdev,
 					 u8 port_num)
 {
-- 
2.22.0


^ permalink raw reply related

* Re: [PATCH for-next 5/8] RDMA/hns: Bugfix for calculating qp buffer size
From: Jason Gunthorpe @ 2019-07-07 12:21 UTC (permalink / raw)
  To: oulijun; +Cc: dledford, leon, linux-rdma, linuxarm
In-Reply-To: <997bdd68-8be1-9684-5d4d-d0b5bf202b80@huawei.com>

On Sat, Jul 06, 2019 at 09:47:09AM +0800, oulijun wrote:
> 在 2019/6/24 19:47, Lijun Ou 写道:
> > From: o00290482 <o00290482@huawei.com>
> Hi, Jason
>    Maybe my local configuration error is causing the wrong author.  How should I make changes?
> 
> The correct author is as follows:
> From: Lijun Ou <oulijun@huawei.com>

I fixed it this once, but please check and fix it on your end in
future.

You should be able to set the patch author in git's config file:

[user]
        email = jgg@mellanox.com
        name = Jason Gunthorpe

Jason

^ permalink raw reply

* RE: [EXT] Re: [RFC rdma 1/3] RDMA/core: Create a common mmap function
From: Michal Kalderon @ 2019-07-07 11:30 UTC (permalink / raw)
  To: Gal Pressman, Jason Gunthorpe
  Cc: dledford@redhat.com, leon@kernel.org, sleybo@amazon.com,
	Ariel Elior, linux-rdma@vger.kernel.org
In-Reply-To: <4d8c8c9e-df8a-6555-c11a-b53a5dd274fe@amazon.com>

> From: Gal Pressman <galpress@amazon.com>
> Sent: Sunday, July 7, 2019 9:41 AM
> 
> On 05/07/2019 20:35, Jason Gunthorpe wrote:
> > On Fri, Jul 05, 2019 at 05:24:18PM +0000, Michal Kalderon wrote:
> >>> From: Jason Gunthorpe <jgg@ziepe.ca>
> >>> Sent: Friday, July 5, 2019 6:33 PM
> >>>
> >>> On Fri, Jul 05, 2019 at 03:29:03PM +0000, Michal Kalderon wrote:
> >>>>> From: Jason Gunthorpe <jgg@ziepe.ca>
> >>>>> Sent: Thursday, July 4, 2019 3:35 PM
> >>>>>
> >>>>> External Email
> >>>>>
> >>>>> On Wed, Jul 03, 2019 at 11:19:34AM +0300, Gal Pressman wrote:
> >>>>>> On 03/07/2019 1:31, Jason Gunthorpe wrote:
> >>>>>>>> Seems except Mellanox + hns the mmap flags aren't ABI.
> >>>>>>>> Also, current Mellanox code seems like it won't benefit from
> >>>>>>>> mmap cookie helper functions in any case as the mmap function
> >>>>>>>> is very specific and the flags used indicate the address and
> >>>>>>>> not just how to map
> >>>>> it.
> >>>>>>>
> >>>>>>> IMHO, mlx5 has a goofy implementation here as it codes all of
> >>>>>>> the object type, handle and cachability flags in one thing.
> >>>>>>
> >>>>>> Do we need object type flags as well in the generic mmap code?
> >>>>>
> >>>>> At the end of the day the driver needs to know what page to map
> >>>>> during the mmap syscall.
> >>>>>
> >>>>> mlx5 does this by encoding the page type in the address, and then
> >>>>> many types have separate lookups based on the offset for the actual
> >>> page.
> >>>>>
> >>>>> IMHO the single lookup and opaque offset is generally better..
> >>>>>
> >>>>> Since the mlx5 scheme is ABI it can't be changed unfortunately.
> >>>>>
> >>>>> If you want to do user controlled cachability flags, or not, is a
> >>>>> fair question, but they still become ABI..
> >>>>>
> >>>>> I'm wondering if it really makes sense to do that during the mmap,
> >>>>> or if the cachability should be set as part of creating the cookie?
> >>>>>
> >>>>>> Another issue is that these flags aren't exposed in an ABI file,
> >>>>>> so a userspace library can't really make use of it in current state.
> >>>>>
> >>>>> Woops.
> >>>>>
> >>>>> Ah, this is all ABI so you need to dig out of this hole ASAP :)
> >>>>>
> >>>> Jason, I didn't follow - what is all ABI?
> >>>> currently EFA implementation encodes the cachability inside the
> >>>> key. It's not exposed in ABI file and is opaque to user-space. The
> >>>> kernel decides on the cachability and gets it back in the key when
> >>>> mmap is called. It seems good enough for the current cases.
> >>>
> >>> Then the key 'offset' should not include cachability information at all.
> >>>
> >> Fair enough, so as you stated above the cachability can be set in the
> cookie.
> >> Would we still like to leave some bits for future ABI enhancements,
> requests, from user ?
> >> Similar to a page type that mlx has ?
> >
> > Doesn't make sense to mix and match, the page_type was just some way
> > to avoid tracking cookies in some cases. If we are always having a
> > cookie then the cookie should indicate the type based on how it was
> > created. Totally opaque
> 
> I'm fine with removing the cachability flags from the ABI, but I don't see how
> the page types can be added without exposing them in the key.
> 
> If we want to mmap something that's not a QP/CQ/... how can we do that? I
> guess only by returning some key in alloc_ucontext?

Right. Every call to mmap should be backed up by a cookie in the driver.

^ permalink raw reply

* Re: [PATCH rdma-next 0/2] DEVX VHCA tunnel support
From: Leon Romanovsky @ 2019-07-07  7:51 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Doug Ledford, RDMA mailing list, Max Gurtovoy, Yishai Hadas,
	Saeed Mahameed, linux-netdev
In-Reply-To: <20190705174007.GA7787@ziepe.ca>

On Fri, Jul 05, 2019 at 02:40:07PM -0300, Jason Gunthorpe wrote:
> On Mon, Jul 01, 2019 at 09:14:00PM +0300, Leon Romanovsky wrote:
> > From: Leon Romanovsky <leonro@mellanox.com>
> >
> > Hi,
> >
> > Those two patches introduce VHCA tunnel mechanism to DEVX interface
> > needed for Bluefield SOC. See extensive commit messages for more
> > information.
> >
> > Thanks
> >
> > Max Gurtovoy (2):
> >   net/mlx5: Introduce VHCA tunnel device capability
> >   IB/mlx5: Implement VHCA tunnel mechanism in DEVX
> >
> >  drivers/infiniband/hw/mlx5/devx.c | 24 ++++++++++++++++++++----
> >  include/linux/mlx5/mlx5_ifc.h     | 10 ++++++++--
> >  2 files changed, 28 insertions(+), 6 deletions(-)
>
> This looks Ok can you apply the mlx5-next patch please

1dd7382b1bb6 net/mlx5: Introduce VHCA tunnel device capability

Thanks

>
> Thanks,
> Jason
>

^ permalink raw reply

* Re: [EXT] Re: [RFC rdma 1/3] RDMA/core: Create a common mmap function
From: Gal Pressman @ 2019-07-07  6:41 UTC (permalink / raw)
  To: Jason Gunthorpe, Michal Kalderon
  Cc: dledford@redhat.com, leon@kernel.org, sleybo@amazon.com,
	Ariel Elior, linux-rdma@vger.kernel.org
In-Reply-To: <20190705173551.GC31543@ziepe.ca>

On 05/07/2019 20:35, Jason Gunthorpe wrote:
> On Fri, Jul 05, 2019 at 05:24:18PM +0000, Michal Kalderon wrote:
>>> From: Jason Gunthorpe <jgg@ziepe.ca>
>>> Sent: Friday, July 5, 2019 6:33 PM
>>>
>>> On Fri, Jul 05, 2019 at 03:29:03PM +0000, Michal Kalderon wrote:
>>>>> From: Jason Gunthorpe <jgg@ziepe.ca>
>>>>> Sent: Thursday, July 4, 2019 3:35 PM
>>>>>
>>>>> External Email
>>>>>
>>>>> On Wed, Jul 03, 2019 at 11:19:34AM +0300, Gal Pressman wrote:
>>>>>> On 03/07/2019 1:31, Jason Gunthorpe wrote:
>>>>>>>> Seems except Mellanox + hns the mmap flags aren't ABI.
>>>>>>>> Also, current Mellanox code seems like it won't benefit from
>>>>>>>> mmap cookie helper functions in any case as the mmap function
>>>>>>>> is very specific and the flags used indicate the address and
>>>>>>>> not just how to map
>>>>> it.
>>>>>>>
>>>>>>> IMHO, mlx5 has a goofy implementation here as it codes all of
>>>>>>> the object type, handle and cachability flags in one thing.
>>>>>>
>>>>>> Do we need object type flags as well in the generic mmap code?
>>>>>
>>>>> At the end of the day the driver needs to know what page to map
>>>>> during the mmap syscall.
>>>>>
>>>>> mlx5 does this by encoding the page type in the address, and then
>>>>> many types have separate lookups based on the offset for the actual
>>> page.
>>>>>
>>>>> IMHO the single lookup and opaque offset is generally better..
>>>>>
>>>>> Since the mlx5 scheme is ABI it can't be changed unfortunately.
>>>>>
>>>>> If you want to do user controlled cachability flags, or not, is a
>>>>> fair question, but they still become ABI..
>>>>>
>>>>> I'm wondering if it really makes sense to do that during the mmap,
>>>>> or if the cachability should be set as part of creating the cookie?
>>>>>
>>>>>> Another issue is that these flags aren't exposed in an ABI file,
>>>>>> so a userspace library can't really make use of it in current state.
>>>>>
>>>>> Woops.
>>>>>
>>>>> Ah, this is all ABI so you need to dig out of this hole ASAP :)
>>>>>
>>>> Jason, I didn't follow - what is all ABI?
>>>> currently EFA implementation encodes the cachability inside the key.
>>>> It's not exposed in ABI file and is opaque to user-space. The kernel
>>>> decides on the cachability and gets it back in the key when mmap is
>>>> called. It seems good enough for the current cases.
>>>
>>> Then the key 'offset' should not include cachability information at all.
>>>
>> Fair enough, so as you stated above the cachability can be set in the cookie. 
>> Would we still like to leave some bits for future ABI enhancements, requests, from user ? 
>> Similar to a page type that mlx has ? 
> 
> Doesn't make sense to mix and match, the page_type was just some way
> to avoid tracking cookies in some cases. If we are always having a
> cookie then the cookie should indicate the type based on how it was
> created. Totally opaque

I'm fine with removing the cachability flags from the ABI, but I don't see how
the page types can be added without exposing them in the key.

If we want to mmap something that's not a QP/CQ/... how can we do that? I guess
only by returning some key in alloc_ucontext?

^ permalink raw reply

* Re: [PATCH rdma-next] IB/mlx5: Report correctly tag matching rendezvous capability
From: Leon Romanovsky @ 2019-07-06 16:35 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Doug Ledford, Danit Goldberg, RDMA mailing list, Artemy Kovalyov,
	Yishai Hadas
In-Reply-To: <20190705171555.GH31525@mellanox.com>

On Fri, Jul 05, 2019 at 05:15:59PM +0000, Jason Gunthorpe wrote:
> On Fri, Jul 05, 2019 at 07:21:57PM +0300, Leon Romanovsky wrote:
> > From: Danit Goldberg <danitg@mellanox.com>
> >
> > Tag matching with rendezvous offload for RC transport is controlled
> > by FW and before this change, it was advertised to user as supported
> > without any relation to FW.
> >
> > Separate tag matching for rendezvous and eager protocols, so users
> > will see real capabilities.
> >
> > Cc: <stable@vger.kernel.org> # 4.13
> > Fixes: eb761894351d ("IB/mlx5: Fill XRQ capabilities")
> > Signed-off-by: Danit Goldberg <danitg@mellanox.com>
> > Reviewed-by: Yishai Hadas <yishaih@mellanox.com>
> > Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
> > Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
> >  drivers/infiniband/hw/mlx5/main.c | 8 ++++++--
> >  include/rdma/ib_verbs.h           | 4 ++--
> >  2 files changed, 8 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
> > index 07a05b0b9e42..c2a5780cb394 100644
> > +++ b/drivers/infiniband/hw/mlx5/main.c
> > @@ -1046,15 +1046,19 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
> >  	}
> >
> >  	if (MLX5_CAP_GEN(mdev, tag_matching)) {
> > -		props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE;
> >  		props->tm_caps.max_num_tags =
> >  			(1 << MLX5_CAP_GEN(mdev, log_tag_matching_list_sz)) - 1;
> > -		props->tm_caps.flags = IB_TM_CAP_RC;
> >  		props->tm_caps.max_ops =
> >  			1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
> >  		props->tm_caps.max_sge = MLX5_TM_MAX_SGE;
> >  	}
> >
> > +	if (MLX5_CAP_GEN(mdev, tag_matching) &&
> > +	    MLX5_CAP_GEN(mdev, rndv_offload_rc)) {
> > +		props->tm_caps.flags = IB_TM_CAP_RNDV_RC;
> > +		props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE;
> > +	}
> > +
> >  	if (MLX5_CAP_GEN(dev->mdev, cq_moderation)) {
> >  		props->cq_caps.max_cq_moderation_count =
> >  						MLX5_MAX_CQ_COUNT;
> > diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> > index 30eb68f36109..c5f8a9f17063 100644
> > +++ b/include/rdma/ib_verbs.h
> > @@ -308,8 +308,8 @@ struct ib_rss_caps {
> >  };
> >
> >  enum ib_tm_cap_flags {
> > -	/*  Support tag matching on RC transport */
> > -	IB_TM_CAP_RC		    = 1 << 0,
> > +	/*  Support tag matching with rendezvous offload for RC transport */
> > +	IB_TM_CAP_RNDV_RC = 1 << 0,
> >  };
>
> This is in the wrong header, right?

It predates our all-to-uapi headers approach, and moving this struct to UAPI
is definitely too much for a fix which should go to stable@.

Thanks

>
> Jason

^ permalink raw reply

* RE: [rdma 14/16] RDMA/irdma: Add ABI definitions
From: Saleem, Shiraz @ 2019-07-06 16:15 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Leon Romanovsky, Kirsher, Jeffrey T, dledford@redhat.com,
	davem@davemloft.net, Ismail, Mustafa, linux-rdma@vger.kernel.org,
	netdev@vger.kernel.org, nhorman@redhat.com, sassmann@redhat.com,
	poswald@suse.com, Ertman, David M
In-Reply-To: <20190705171650.GI31525@mellanox.com>

> Subject: Re: [rdma 14/16] RDMA/irdma: Add ABI definitions
> 
> On Fri, Jul 05, 2019 at 04:42:19PM +0000, Saleem, Shiraz wrote:
> > > Subject: Re: [rdma 14/16] RDMA/irdma: Add ABI definitions
> > >
> > > On Thu, Jul 04, 2019 at 10:40:21AM +0300, Leon Romanovsky wrote:
> > > > On Wed, Jul 03, 2019 at 07:12:57PM -0700, Jeff Kirsher wrote:
> > > > > From: Mustafa Ismail <mustafa.ismail@intel.com>
> > > > >
> > > > > Add ABI definitions for irdma.
> > > > >
> > > > > Signed-off-by: Mustafa Ismail <mustafa.ismail@intel.com>
> > > > > Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
> > > > > include/uapi/rdma/irdma-abi.h | 130
> > > > > ++++++++++++++++++++++++++++++++++
> > > > >  1 file changed, 130 insertions(+)  create mode 100644
> > > > > include/uapi/rdma/irdma-abi.h
> > > > >
> > > > > diff --git a/include/uapi/rdma/irdma-abi.h
> > > > > b/include/uapi/rdma/irdma-abi.h new file mode 100644 index
> > > > > 000000000000..bdfbda4c829e
> > > > > +++ b/include/uapi/rdma/irdma-abi.h
> > > > > @@ -0,0 +1,130 @@
> > > > > +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
> > > > > +/* Copyright (c) 2006 - 2019 Intel Corporation.  All rights reserved.
> > > > > + * Copyright (c) 2005 Topspin Communications.  All rights reserved.
> > > > > + * Copyright (c) 2005 Cisco Systems.  All rights reserved.
> > > > > + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
> > > > > + */
> > > > > +
> > > > > +#ifndef IRDMA_ABI_H
> > > > > +#define IRDMA_ABI_H
> > > > > +
> > > > > +#include <linux/types.h>
> > > > > +
> > > > > +/* irdma must support legacy GEN_1 i40iw kernel
> > > > > + * and user-space whose last ABI ver is 5  */ #define
> > > > > +IRDMA_ABI_VER
> > > > > +6
> > > >
> > > > Can you please elaborate about it more?
> > > > There is no irdma code in RDMA yet, so it makes me wonder why new
> > > > define shouldn't start from 1.
> > >
> > > It is because they are ABI compatible with the current user space,
> > > which raises the question why we even have this confusing header file..
> >
> > It is because we need to support current providers/i40iw user-space.
> > Our user-space patch series will introduce a new provider (irdma)
> > whose ABI ver. is also 6 (capable of supporting X722 and which will
> > work with i40iw driver on older kernels) and removes providers/i40iw from rdma-
> core.
> 
> Why on earth would we do that?
> 
To have a unified library, providers/irdma, that goes hand in hand with the irdma driver and uses the ABI header.
It can support the new network device e810 and the existing x722 iWARP device. It obsoletes
providers/i40iw and extends its ABI. So why keep providers/i40iw around in rdma-core?

Shiraz 


^ permalink raw reply

* RE: [net-next 1/3] ice: Initialize and register platform device to provide RDMA
From: Saleem, Shiraz @ 2019-07-06 16:03 UTC (permalink / raw)
  To: Greg KH
  Cc: Jason Gunthorpe, Kirsher, Jeffrey T, davem@davemloft.net,
	dledford@redhat.com, Nguyen, Anthony L, netdev@vger.kernel.org,
	linux-rdma@vger.kernel.org, nhorman@redhat.com,
	sassmann@redhat.com, poswald@suse.com, Ismail, Mustafa,
	Ertman, David M, Bowers, AndrewX
In-Reply-To: <20190706082523.GA8727@kroah.com>

> Subject: Re: [net-next 1/3] ice: Initialize and register platform device to provide
> RDMA
> 
> On Fri, Jul 05, 2019 at 04:33:07PM +0000, Saleem, Shiraz wrote:
> > > Subject: Re: [net-next 1/3] ice: Initialize and register platform
> > > device to provide RDMA
> > >
> > > On Thu, Jul 04, 2019 at 12:48:29PM +0000, Jason Gunthorpe wrote:
> > > > On Thu, Jul 04, 2019 at 02:42:47PM +0200, Greg KH wrote:
> > > > > On Thu, Jul 04, 2019 at 12:37:33PM +0000, Jason Gunthorpe wrote:
> > > > > > On Thu, Jul 04, 2019 at 02:29:50PM +0200, Greg KH wrote:
> > > > > > > On Thu, Jul 04, 2019 at 12:16:41PM +0000, Jason Gunthorpe wrote:
> > > > > > > > On Wed, Jul 03, 2019 at 07:12:50PM -0700, Jeff Kirsher wrote:
> > > > > > > > > From: Tony Nguyen <anthony.l.nguyen@intel.com>
> > > > > > > > >
> > > > > > > > > The RDMA block does not advertise on the PCI bus or any other
> bus.
> > > > > > > > > Thus the ice driver needs to provide access to the RDMA
> > > > > > > > > hardware block via a virtual bus; utilize the platform
> > > > > > > > > bus to provide this
> > > access.
> > > > > > > > >
> > > > > > > > > This patch initializes the driver to support RDMA as
> > > > > > > > > well as creates and registers a platform device for the
> > > > > > > > > RDMA driver to register to. At this point the driver is
> > > > > > > > > fully initialized to register a platform driver,
> > > > > > > > > however, can not yet register as the ops have not been
> implemented.
> > > > > > > >
> > > > > > > > I think you need Greg's ack on all this driver stuff -
> > > > > > > > particularly that a platform_device is OK.
> > > > > > >
> > > > > > > A platform_device is almost NEVER ok.
> > > > > > >
> > > > > > > Don't abuse it, make a real device on a real bus.  If you
> > > > > > > don't have a real bus and just need to create a device to
> > > > > > > hang other things off of, then use the virtual one, that's what it is there
> for.
> > > > > >
> > > > > > Ideally I'd like to see all the RDMA drivers that connect to
> > > > > > ethernet drivers use some similar scheme.
> > > > >
> > > > > Why?  They should be attached to a "real" device, why make any up?
> > > >
> > > > ? A "real" device, like struct pci_device, can only bind to one
> > > > driver. How can we bind it concurrently to net, rdma, scsi, etc?
> > >
> > > MFD was designed for this very problem.
> > >
> > > > > > This is for a PCI device that plugs into multiple subsystems
> > > > > > in the kernel, ie it has net driver functionality, rdma
> > > > > > functionality, some even have SCSI functionality
> > > > >
> > > > > Sounds like a MFD device, why aren't you using that
> > > > > functionality instead?
> > > >
> > > > This was also my advice, but in another email Jeff says:
> > > >
> > > >   MFD architecture was also considered, and we selected the simpler
> > > >   platform model. Supporting a MFD architecture would require an
> > > >   additional MFD core driver, individual platform netdev, RDMA function
> > > >   drivers, and stripping a large portion of the netdev drivers into
> > > >   MFD core. The sub-devices registered by MFD core for function
> > > >   drivers are indeed platform devices.
> > >
> > > So, "mfd is too hard, let's abuse a platform device" is ok?
> > >
> > > People have been wanting to do MFD drivers for PCI devices for a
> > > long time, it's about time someone actually did the work for it, I
> > > bet it will not be all that complex if tiny embedded drivers can do
> > > it :)
> > >
> > Hi Greg - Thanks for your feedback!
> >
> > We currently have 2 PCI function netdev drivers in the kernel (i40e &
> > ice) that support devices (x722 & e810) which are RDMA capable. Our
> > objective is to add a single unified RDMA driver (as this a subsystem
> > specific requirement) which needs to access HW resources from the
> > netdev PF drivers. Attaching platform devices from the netdev drivers
> > to the platform bus and having a single RDMA platform driver bind to
> > them and access these resources seemed like a simple approach to realize our
> objective. But seems like attaching platform devices is wrong. I would like to
> understand why.
> 
> Because that is NOT what a platform device is for.
> 
> It was originally created for those types of devices that live on the "platform" bus,
> i.e. things that are hard-wired and you just "know" are in your system because they
> are located at specific locations.  We used to generate them from board files, and
> then when we got DT, we create them from the resources that DT says where the
> locations of the devices are.
> 
> They are NOT to be abused and used whenever someone wants to put them
> somewhere in the "middle" of the device tree because they feel like they are easier
> to use instead of creating a real bus and drivers.
> 
> Yes, they do get abused, and I need to sweep the tree again and fix up all of the
> places where this has crept back in.  But now that I know you are thinking of doing
> this, I'll keep saying to NOT do it for your use case either :)

Thanks Greg for the explanation.
And yes, we went by some example usages in the tree.
> 
> > Are platform sub devices only to be added from an MFD core driver? I
> > am also wondering if MFD arch.  would allow for realizing a single
> > RDMA driver and whether we need an MFD core driver for each device,
> > x722 & e810 or whether it can be a single driver.
> 
> I do not know the details of how MFD works, please ask those developers for
> specifics.  If MFD doesn't work, then create a tiny virtual bus and make sub-
> devices out of that.  If you need a "generic" way to do this for PCI devices, then
> please create that as you are not the only one that keeps wanting this, as for some
> reason PCI hardware vendors don't like dividing up their devices in ways that
> would have made it much simpler to create individual devices (probably saves
> some gates and firmware complexity on the device).
> 

Thank you for laying out options. We will review internally and get back.

Shiraz

^ permalink raw reply

* Re: [PATCH 35/39] docs: infiniband: add it to the driver-api bookset
From: Mauro Carvalho Chehab @ 2019-07-06 11:19 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Linux Doc Mailing List, Mauro Carvalho Chehab, linux-kernel,
	Jonathan Corbet, Doug Ledford, linux-rdma
In-Reply-To: <20190703180802.GA26557@ziepe.ca>

Em Wed, 3 Jul 2019 15:08:02 -0300
Jason Gunthorpe <jgg@ziepe.ca> escreveu:

> On Fri, Jun 28, 2019 at 09:30:28AM -0300, Mauro Carvalho Chehab wrote:
> > While this contains some uAPI stuff, it was intended to be
> > read by a kernel doc. So, let's not move it to a different
> > dir, but, instead, just add it to the driver-api bookset.
> > 
> > Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
> >  Documentation/index.rst            | 1 +
> >  Documentation/infiniband/index.rst | 2 +-
> >  2 files changed, 2 insertions(+), 1 deletion(-)
> > 
> > diff --git a/Documentation/index.rst b/Documentation/index.rst
> > index ea33cbbccd9d..e69d2fde7735 100644
> > +++ b/Documentation/index.rst
> > @@ -96,6 +96,7 @@ needed).
> >     block/index
> >     hid/index
> >     iio/index
> > +   infiniband/index
> >     leds/index
> >     media/index
> >     networking/index
> > diff --git a/Documentation/infiniband/index.rst b/Documentation/infiniband/index.rst
> > index 22eea64de722..9cd7615438b9 100644
> > +++ b/Documentation/infiniband/index.rst
> > @@ -1,4 +1,4 @@
> > -:orphan:
> > +.. SPDX-License-Identifier: GPL-2.0
> >  
> >  ==========
> >  InfiniBand  
> 
> Should this one go to the rdma.git as well? It looks like yes

I'm OK if you want to add it to rdma.git. However, this will likely raise
conflicts, as this series has lots of other patches touching
Documentation/index.rst.

So, I suspect that it would be easier to merge this together with the
other patches via the docs tree, by the end of the merge window.

If you prefer to apply it against your tree, my plan is to do
a final rebase at the second week of the merge window, in order to
avoid such conflicts.

Thanks,
Mauro

^ permalink raw reply

* Re: [PATCH 01/43] docs: infiniband: convert docs to ReST and rename to *.rst
From: Mauro Carvalho Chehab @ 2019-07-06 11:02 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Linux Doc Mailing List, Mauro Carvalho Chehab, linux-kernel,
	Jonathan Corbet, Doug Ledford, linux-rdma
In-Reply-To: <20190703180641.GA26394@ziepe.ca>

Em Wed, 3 Jul 2019 15:06:41 -0300
Jason Gunthorpe <jgg@ziepe.ca> escreveu:

> On Fri, Jun 28, 2019 at 09:19:57AM -0300, Mauro Carvalho Chehab wrote:
> > The InfiniBand docs are plain text with no markups.
> > So, all we needed to do were to add the title markups and
> > some markup sequences in order to properly parse tables,
> > lists and literal blocks.
> > 
> > At its new index.rst, let's add a :orphan: while this is not linked to
> > the main index.rst file, in order to avoid build warnings.
> > 
> > Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
> > ---
> >  .../{core_locking.txt => core_locking.rst}    |  64 ++++++-----
> >  Documentation/infiniband/index.rst            |  23 ++++
> >  .../infiniband/{ipoib.txt => ipoib.rst}       |  24 ++--
> >  .../infiniband/{opa_vnic.txt => opa_vnic.rst} | 108 +++++++++---------
> >  .../infiniband/{sysfs.txt => sysfs.rst}       |   4 +-
> >  .../{tag_matching.txt => tag_matching.rst}    |   5 +
> >  .../infiniband/{user_mad.txt => user_mad.rst} |  33 ++++--
> >  .../{user_verbs.txt => user_verbs.rst}        |  12 +-
> >  drivers/infiniband/core/user_mad.c            |   2 +-
> >  drivers/infiniband/ulp/ipoib/Kconfig          |   2 +-
> >  10 files changed, 174 insertions(+), 103 deletions(-)
> >  rename Documentation/infiniband/{core_locking.txt => core_locking.rst} (78%)
> >  create mode 100644 Documentation/infiniband/index.rst
> >  rename Documentation/infiniband/{ipoib.txt => ipoib.rst} (90%)
> >  rename Documentation/infiniband/{opa_vnic.txt => opa_vnic.rst} (63%)
> >  rename Documentation/infiniband/{sysfs.txt => sysfs.rst} (69%)
> >  rename Documentation/infiniband/{tag_matching.txt => tag_matching.rst} (98%)
> >  rename Documentation/infiniband/{user_mad.txt => user_mad.rst} (90%)
> >  rename Documentation/infiniband/{user_verbs.txt => user_verbs.rst} (93%)  
> 
> I'm not sure anymore if I sent a note or not, but this patch was
> already applied to the rdma.git:
> 
> commit 97162a1ee8a1735fc7a7159fe08de966d88354ce
> Author: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
> Date:   Sat Jun 8 23:27:03 2019 -0300
> 
>     docs: infiniband: convert docs to ReST and rename to *.rst
>     
>     The InfiniBand docs are plain text with no markups.  So, all we needed to
>     do were to add the title markups and some markup sequences in order to
>     properly parse tables, lists and literal blocks.
>     
>     At its new index.rst, let's add a :orphan: while this is not linked to the
>     main index.rst file, in order to avoid build warnings.
>     
>     Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
>     Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>

Ah, ok, thanks!

Not sure why but this one still applies on the top of -next.
Probably just the usual merge noise that happens close to
a new merge window.

Thanks,
Mauro

^ permalink raw reply

* Re: [net-next 1/3] ice: Initialize and register platform device to provide RDMA
From: Greg KH @ 2019-07-06  8:25 UTC (permalink / raw)
  To: Saleem, Shiraz
  Cc: Jason Gunthorpe, Kirsher, Jeffrey T, davem@davemloft.net,
	dledford@redhat.com, Nguyen, Anthony L, netdev@vger.kernel.org,
	linux-rdma@vger.kernel.org, nhorman@redhat.com,
	sassmann@redhat.com, poswald@suse.com, Ismail, Mustafa,
	Ertman, David M, Bowers, AndrewX
In-Reply-To: <9DD61F30A802C4429A01CA4200E302A7A684DA23@fmsmsx124.amr.corp.intel.com>

On Fri, Jul 05, 2019 at 04:33:07PM +0000, Saleem, Shiraz wrote:
> > Subject: Re: [net-next 1/3] ice: Initialize and register platform device to provide
> > RDMA
> > 
> > On Thu, Jul 04, 2019 at 12:48:29PM +0000, Jason Gunthorpe wrote:
> > > On Thu, Jul 04, 2019 at 02:42:47PM +0200, Greg KH wrote:
> > > > On Thu, Jul 04, 2019 at 12:37:33PM +0000, Jason Gunthorpe wrote:
> > > > > On Thu, Jul 04, 2019 at 02:29:50PM +0200, Greg KH wrote:
> > > > > > On Thu, Jul 04, 2019 at 12:16:41PM +0000, Jason Gunthorpe wrote:
> > > > > > > On Wed, Jul 03, 2019 at 07:12:50PM -0700, Jeff Kirsher wrote:
> > > > > > > > From: Tony Nguyen <anthony.l.nguyen@intel.com>
> > > > > > > >
> > > > > > > > The RDMA block does not advertise on the PCI bus or any other bus.
> > > > > > > > Thus the ice driver needs to provide access to the RDMA
> > > > > > > > hardware block via a virtual bus; utilize the platform bus to provide this
> > access.
> > > > > > > >
> > > > > > > > This patch initializes the driver to support RDMA as well as
> > > > > > > > creates and registers a platform device for the RDMA driver
> > > > > > > > to register to. At this point the driver is fully
> > > > > > > > initialized to register a platform driver, however, can not
> > > > > > > > yet register as the ops have not been implemented.
> > > > > > >
> > > > > > > I think you need Greg's ack on all this driver stuff -
> > > > > > > particularly that a platform_device is OK.
> > > > > >
> > > > > > A platform_device is almost NEVER ok.
> > > > > >
> > > > > > Don't abuse it, make a real device on a real bus.  If you don't
> > > > > > have a real bus and just need to create a device to hang other
> > > > > > things off of, then use the virtual one, that's what it is there for.
> > > > >
> > > > > Ideally I'd like to see all the RDMA drivers that connect to
> > > > > ethernet drivers use some similar scheme.
> > > >
> > > > Why?  They should be attached to a "real" device, why make any up?
> > >
> > > ? A "real" device, like struct pci_device, can only bind to one
> > > driver. How can we bind it concurrently to net, rdma, scsi, etc?
> > 
> > MFD was designed for this very problem.
> > 
> > > > > This is for a PCI device that plugs into multiple subsystems in
> > > > > the kernel, ie it has net driver functionality, rdma
> > > > > functionality, some even have SCSI functionality
> > > >
> > > > Sounds like a MFD device, why aren't you using that functionality
> > > > instead?
> > >
> > > This was also my advice, but in another email Jeff says:
> > >
> > >   MFD architecture was also considered, and we selected the simpler
> > >   platform model. Supporting a MFD architecture would require an
> > >   additional MFD core driver, individual platform netdev, RDMA function
> > >   drivers, and stripping a large portion of the netdev drivers into
> > >   MFD core. The sub-devices registered by MFD core for function
> > >   drivers are indeed platform devices.
> > 
> > So, "mfd is too hard, let's abuse a platform device" is ok?
> > 
> > People have been wanting to do MFD drivers for PCI devices for a long time, it's
> > about time someone actually did the work for it, I bet it will not be all that complex
> > if tiny embedded drivers can do it :)
> > 
> Hi Greg - Thanks for your feedback!
> 
> We currently have 2 PCI function netdev drivers in the kernel (i40e & ice) that support devices (x722 & e810)
> which are RDMA capable. Our objective is to add a single unified RDMA driver
> (as this a subsystem specific requirement) which needs to access HW resources from the
> netdev PF drivers. Attaching platform devices from the netdev drivers to the platform bus
> and having a single RDMA platform driver bind to them and access these resources seemed
> like a simple approach to realize our objective. But seems like attaching platform devices is
> wrong. I would like to understand why. 

Because that is NOT what a platform device is for.

It was originally created for those types of devices that live on the
"platform" bus, i.e. things that are hard-wired and you just "know" are
in your system because they are located at specific locations.  We used
to generate them from board files, and then when we got DT, we create
them from the resources that DT says where the locations of the devices
are.

They are NOT to be abused and used whenever someone wants to put them
somewhere in the "middle" of the device tree because they feel like they
are easier to use instead of creating a real bus and drivers.

Yes, they do get abused, and I need to sweep the tree again and fix up
all of the places where this has crept back in.  But now that I know you
are thinking of doing this, I'll keep saying to NOT do it for your use
case either :)

> Are platform sub devices only to be added from an MFD core driver? I
> am also wondering if MFD arch.  would allow for realizing a single
> RDMA driver and whether we need an MFD core driver for each device,
> x722 & e810 or whether it can be a single driver.

I do not know the details of how MFD works, please ask those developers
for specifics.  If MFD doesn't work, then create a tiny virtual bus and
make sub-devices out of that.  If you need a "generic" way to do this
for PCI devices, then please create that as you are not the only one
that keeps wanting this, as for some reason PCI hardware vendors don't
like dividing up their devices in ways that would have made it much
simpler to create individual devices (probably saves some gates and
firmware complexity on the device).

thanks,

greg k-h

^ permalink raw reply

* Re: [PATCH] IB/hfi1: Close PSM sdma_progress sleep window
From: Greg KH @ 2019-07-06  5:08 UTC (permalink / raw)
  To: Mike Marciniszyn; +Cc: stable, linux-rdma, stable-commits
In-Reply-To: <20190624201537.170286.13849.stgit@awfm-01.aw.intel.com>

On Mon, Jun 24, 2019 at 04:15:37PM -0400, Mike Marciniszyn wrote:
> commit da9de5f8527f4b9efc82f967d29a583318c034c7 upstream.
> 
> The call to sdma_progress() is called outside the wait lock.
> 
> In this case, there is a race condition where sdma_progress() can return
> false and the sdma_engine can idle.  If that happens, there will be no
> more sdma interrupts to cause the wakeup and the user_sdma xmit will hang.
> 
> Fix by moving the lock to enclose the sdma_progress() call.
> 
> Also, delete busycount. The need for this was removed by:
> commit bcad29137a97 ("IB/hfi1: Serve the most starved iowait entry first")
> 
> Ported to linux-4.9.y.

Now applied, thanks.

Note, this already is in 4.14.132 and 4.19.57 so I didn't need the
backports for those kernels.

greg k-h

^ permalink raw reply

* Re: [PATCH for-next 5/8] RDMA/hns: Bugfix for calculating qp buffer size
From: oulijun @ 2019-07-06  1:47 UTC (permalink / raw)
  To: dledford, jgg; +Cc: leon, linux-rdma, linuxarm
In-Reply-To: <1561376872-111496-6-git-send-email-oulijun@huawei.com>

在 2019/6/24 19:47, Lijun Ou 写道:
> From: o00290482 <o00290482@huawei.com>
Hi, Jason
   Maybe an error in my local configuration caused the wrong author.  How should I make changes?

The correct author is as follows:
From: Lijun Ou <oulijun@huawei.com>
> The buffer size of qp which used to allocate qp buffer space for
> storing sqwqe and rqwqe will be the length of buffer space. The
> kernel driver will use the buffer address and the same size to
> get the user memory. The same size named buff_size of qp. According
> the algorithm of calculating, The size of the two is not equal
> when users set the max sge of sq.
>
> Fixes: b28ca7cceff8 ("RDMA/hns: Limit extend sq sge num")
> Signed-off-by: Lijun Ou <oulijun@huawei.com>
> ---
>  drivers/infiniband/hw/hns/hns_roce_qp.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
> index 305be42..d56c03d 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_qp.c
> +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
> @@ -392,8 +392,8 @@ static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev,
>  					     hr_qp->sq.wqe_shift), PAGE_SIZE);
>  	} else {
>  		page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT);
> -		hr_qp->sge.sge_cnt =
> -		       max(page_size / (1 << hr_qp->sge.sge_shift), ex_sge_num);
> +		hr_qp->sge.sge_cnt = ex_sge_num ?
> +		   max(page_size / (1 << hr_qp->sge.sge_shift), ex_sge_num) : 0;
>  		hr_qp->buff_size = HNS_ROCE_ALOGN_UP((hr_qp->rq.wqe_cnt <<
>  					     hr_qp->rq.wqe_shift), page_size) +
>  				   HNS_ROCE_ALOGN_UP((hr_qp->sge.sge_cnt <<




^ permalink raw reply

* RE: [BUGReport for rdma in kernel5.2-rc4]
From: Saleem, Shiraz @ 2019-07-05 19:01 UTC (permalink / raw)
  To: 'wangxi', 'oulijun', 'Jason Gunthorpe'
  Cc: 'linux-rdma', 'Linuxarm'
In-Reply-To: <9DD61F30A802C4429A01CA4200E302A7A683BAF6@fmsmsx123.amr.corp.intel.com>

> Subject: RE: [BUGReport for rdma in kernel5.2-rc4]
> 
> > Subject: Re: [BUGReport for rdma in kernel5.2-rc4]
> >
> >
> >
> > 在 2019/6/29 2:01, Saleem, Shiraz 写道:
> > >> Subject: [BUGReport for rdma in kernel5.2-rc4]
> > >>
> > >>
> > >> Hi Shiraz & Jason,
> > >>
> > >> We have observed a crash when run perftest on a hisilicon arm64
> > >> platform in kernel-5.2-rc4.
> > >>
> > >> We also tested with different kernel version and found it started
> > >> from the the following commit:
> > >>    d10bcf947a3e ("RDMA/umem: Combine contiguous PAGE_SIZE regions
> > >> in
> > >> SGEs")
> > >>
> > >> Could you please share any hint how to resolve this kind issue?
> > >> Thanks!
> > >>
> > >
> > > Hi Lijun - I am presuming you had this fix too?
> > >
> > > "RDMA/umem: Handle page combining avoidance correctly in
> > ib_umem_add_sg_table()"
> > > https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit
> > > /d
> > > rivers/infiniband/core/umem.c?h=v5.2-rc4&id=7872168a839144dbbfb33125
> > > 26
> > > 2dab0673f9ddf5
> > >
> > > As Jason mentioned, provide the stack backtrace.
> > >
> > I have confirmed that the kernel version used in our test already
> > contains this patch, but the phenomenon still exists on our platform.
> > The previous log is recorded under the condition that the interval of
> > the perftest test is long, and the system will hang. Calltrace will be available
> when the test interval is short.
> >
> > the kernel version as following:
> > commit dc75d8f9bf27647013bbfae1e2f1d114546994c4
> > Author: Jason Gunthorpe <jgg@mellanox.com>
> > Date:   Wed Jun 5 14:39:26 2019 -0300
> >
> >     {fromtree} RDMA: Move owner into struct ib_device_ops
> >
> >     This more closely follows how other subsytems work, with owner being a
> >     member of the structure containing the function pointers.
> >
> >     Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
> >     Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
> >     Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
> >
> > the calltrace log as following:
> > root@(none)$ uname -a
> > Linux (none) 5.2.0-rc4-gdc75d8f9 #1 SMP PREEMPT Sat Jun 29 11:23:58
> > HKT
> > 2019 aarch64 GNU/Linux root@(none)$ ib_read_bw -d hns_2 -n 5 >
> > /dev/null 2>&1 & [1] 1143 root@(none)$ ib_read_bw -d hns_2 -n 5
> > 192.168.10.110
> > ----------------------------------------------------------------------
> > ----------------- Device not recognized to implement inline feature.
> > Disabling it cqe = 5, less than minimum CQE number.
> > ---------------------------------------------------------------------------------------
> >                     RDMA_Read BW Test
> >  Dual-port       : OFF		Device         : hns_2
> >  Number of qps   : 1		Transport type : IB
> >  Connection type : RC		Using SRQ      : OFF
> >  TX depth        : 5
> >  CQ Moderation   : 5
> >  Mtu             : 1024[B]
> >  Link type       : Ethernet
> >  GID index       : 2
> >  Outstand reads  : 128
> >  rdma_cm QPs	 : OFF
> >  Data ex. method : Ethernet
> > ----------------------------------------------------------------------
> > -----------------  local address: LID 0000 QPN 0x0009 PSN 0x6340e6 OUT
> > 0x80 RKey 0x000300 VAddr 0x00ffffa271f000
> >  GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:10:110
> >  remote address: LID 0000 QPN 0x0008 PSN 0xbd1845 OUT 0x80 RKey
> > 0x000200 VAddr 0x00ffff98244000
> >  GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:10:110
> > ---------------------------------------------------------------------------------------
> >  #bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]
> > MsgRate[Mpps]
> >  65536      5                6469.94            6468.60		   0.103498
> > ---------------------------------------------------------------------------------------
> > [1]+  Done                    ib_read_bw -d hns_2 -n 5 > /dev/null 2>&1
> > root@(none)$ ib_read_bw -d hns_2 -n 5 > /dev/null 2>&1 & [[1] 1145
> >    87.412596] BUG: Bad rss-counter state mm:(____ptrval____) idx:0 val:-1
> > [   87.412596] BUG: Bad rss-counter state mm:(____ptrval____) idx:0 val:-1
> > [   87.426751] BUG: Bad rss-counter state mm:(____ptrval____) idx:1 val:1
> > [   87.426751] BUG: Bad rss-counter state mm:(____ptrval____) idx:1 val:1
> > root@(none)$ ib_read_bw -d hns_2 -n 5 192.168.10.110
> > ----------------------------------------------------------------------
> > ----------------- Device not recognized to implement inline feature.
> > Disabling it cqe = 5, less than minimum CQE number.
> > ---------------------------------------------------------------------------------------
> >                     RDMA_Read BW Test
> >  Dual-port       : OFF		Device         : hns_2
> >  Number of qps   : 1		Transport type : IB
> >  Connection type : RC		Using SRQ      : OFF
> >  TX depth        : 5
> >  CQ Moderation   : 5
> >  Mtu             : 1024[B]
> >  Link type       : Ethernet
> >  GID index       : 2
> >  Outstand reads  : 128
> >  rdma_cm QPs	 : OFF
> >  Data ex. method : Ethernet
> > ----------------------------------------------------------------------
> > -----------------  local address: LID 0000 QPN 0x000b PSN 0xef5421 OUT
> > 0x80 RKey 0x000300 VAddr 0x00ffff9f7b8000
> >  GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:10:110
> >  remote address: LID 0000 QPN 0x000a PSN 0xd2b849 OUT 0x80 RKey
> > 0x000200 VAddr 0x00ffff822e7000
> >  GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:10:110
> > ---------------------------------------------------------------------------------------
> >  #bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]
> > MsgRate[Mpps]
> >  65536      5                9044.77            9044.77		   0.144716
> > ---------------------------------------------------------------------------------------
> > [1]+  Done                    ib_read_bw -d hns_2 -n 5 > /dev/null 2>&1
> > root@(none)$ ib_read_bw -d hns_2 -n 5 > /dev/null 2>&1 & [1] 1147
> > root@(none)$ ib_read_bw -d hns_2 -n 5 192.168.10.110
> > [   88.772598] BUG: Bad rss-counter state mm:(____ptrval____) idx:0 val:-1
> > [   88.772598] BUG: Bad rss-counter state mm:(____ptrval____) idx:0 val:-1
> > [   88.785887] BUG: Bad rss-counter state mm:(____ptrval____) idx:1 val:1
> > [   88.785887] BUG: Bad rss-counter state mm:(____ptrval____) idx:1 val:1
> > ----------------------------------------------------------------------
> > ----------------- Device not recognized to implement inline feature.
> > Disabling it cqe = 5, less than minimum CQE number.
> > ---------------------------------------------------------------------------------------
> >                     RDMA_Read BW Test
> >  Dual-port       : OFF		Device         : hns_2
> >  Number of qps   : 1		Transport type : IB
> >  Connection type : RC		Using SRQ      : OFF
> >  TX depth        : 5
> >  CQ Moderation   : 5
> >  Mtu             : 1024[B]
> >  Link type       : Ethernet
> >  GID index       : 2
> >  Outstand reads  : 128
> >  rdma_cm QPs	 : OFF
> >  Data ex. method : Ethernet
> > ----------------------------------------------------------------------
> > -----------------  local address: LID 0000 QPN 0x000d PSN 0x553436 OUT
> > 0x80 RKey 0x000200 VAddr 0x00ffffa302c000
> >  GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:10:110
> >  remote address: LID 0000 QPN 0x000c PSN 0xc22528 OUT 0x80 RKey
> > 0x000300 VAddr 0x00ffffa2a0c000
> >  GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:10:110
> > ---------------------------------------------------------------------------------------
> >  #bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]
> > MsgRate[Mpps]
> >  65536      5                8966.88            8966.88		   0.143470
> > ---------------------------------------------------------------------------------------
> > [1]+  Done                    ib_read_bw -d hns_2 -n 5 > /dev/null 2>&1
> > root@(none)$ ib_read_bw -d hns_2 -n 5 > /dev/null 2>&1 & [1] 1149
> > root@(none)$ ib_read_bw -d hns_2 -n 5 192.168.10.110
> > [   90.064588] BUG: Bad rss-counter state mm:(____ptrval____) idx:0 val:-1
> > [   90.064588] BUG: Bad rss-counter state mm:(____ptrval____) idx:0 val:-1
> > [   90.077875] BUG: Bad rss-counter state mm:(____ptrval____) idx:1 val:1
> > [   90.077875] BUG: Bad rss-counter state mm:(____ptrval____) idx:1 val:1
> > ----------------------------------------------------------------------
> > ----------------- Device not recognized to implement inline feature.
> > Disabling it cqe = 5, less than minimum CQE number.
> > ---------------------------------------------------------------------------------------
> >                     RDMA_Read BW Test
> >  Dual-port       : OFF		Device         : hns_2
> >  Number of qps   : 1		Transport type : IB
> >  Connection type : RC		Using SRQ      : OFF
> >  TX depth        : 5
> >  CQ Moderation   : 5
> >  Mtu             : 1024[B]
> >  Link type       : Ethernet
> >  GID index       : 2
> >  Outstand reads  : 128
> >  rdma_cm QPs	 : OFF
> >  Data ex. method : Ethernet
> > ----------------------------------------------------------------------
> > -----------------  local address: LID 0000 QPN 0x000f PSN 0xae6ff8 OUT
> > 0x80 RKey 0x000300 VAddr 0x00ffffb89e7000
> >  GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:10:110
> >  remote address: LID 0000 QPN 0x000e PSN 0x2e7a7d OUT 0x80 RKey
> > 0x000200 VAddr 0x00ffffbe2e5000
> >  GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:10:110
> > ---------------------------------------------------------------------------------------
> >  #bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]
> > MsgRate[Mpps]
> >  65536      5                9057.91            9047.42		   0.144759
> > ---------------------------------------------------------------------------------------
> > [1]+  Done                    ib_read_bw -d hns_2 -n 5 > /dev/null 2>&1
> > root@(none)$ ib_read_bw -d hns_2 -n 5 > /dev/null 2>&1 & [[1] 1151
> >    91.192578] BUG: Bad rss-counter state mm:(____ptrval____) idx:0 val:-1
> > [   91.192578] BUG: Bad rss-counter state mm:(____ptrval____) idx:0 val:-1
> > [   91.206731] BUG: Bad rss-counter state mm:(____ptrval____) idx:1 val:1
> > [   91.206731] BUG: Bad rss-counter state mm:(____ptrval____) idx:1 val:1
> > root@(none)$ ib_read_bw -d hns_2 -n 5 192.168.10.110
> > ----------------------------------------------------------------------
> > ----------------- Device not recognized to implement inline feature.
> > Disabling it cqe = 5, less than minimum CQE number.
> > ---------------------------------------------------------------------------------------
> >                     RDMA_Read BW Test
> >  Dual-port       : OFF		Device         : hns_2
> >  Number of qps   : 1		Transport type : IB
> >  Connection type : RC		Using SRQ      : OFF
> >  TX depth        : 5
> >  CQ Moderation   : 5
> >  Mtu             : 1024[B]
> >  Link type       : Ethernet
> >  GID index       : 2
> >  Outstand reads  : 128
> >  rdma_cm QPs	 : OFF
> >  Data ex. method : Ethernet
> > ----------------------------------------------------------------------
> > -----------------  local address: LID 0000 QPN 0x0011 PSN 0xb4d02e OUT
> > 0x80 RKey 0x000300 VAddr 0x00ffffb75c5000
> >  GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:10:110
> >  remote address: LID 0000 QPN 0x0010 PSN 0xbe677c OUT 0x80 RKey
> > 0x000200 VAddr 0x00ffffb82f6000
> >  GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:10:110
> > ---------------------------------------------------------------------------------------
> >  #bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]
> > MsgRate[Mpps]
> >  65536      5                8815.15            8805.21		   0.140883
> > ---------------------------------------------------------------------------------------
> > [1]+  Done                    ib_read_bw -d hns_2 -n 5 > /dev/null 2>&1
> > root@(none)$ ib_read_bw -d hns_2 -n 5 > /dev/null 2>&1 & [1] 1153
> > root@(none)$ ib_read_bw -d hns_2 -n 5 192.168.10.110
> > [   92.580588] BUG: Bad rss-counter state mm:(____ptrval____) idx:0 val:-2
> > [   92.580588] BUG: Bad rss-counter state mm:(____ptrval____) idx:0 val:-2
> > [   92.593874] BUG: Bad rss-counter state mm:(____ptrval____) idx:1 val:2
> > [   92.593874] BUG: Bad rss-counter state mm:(____ptrval____) idx:1 val:2
> > ----------------------------------------------------------------------
> > ----------------- Device not recognized to implement inline feature.
> > Disabling it cqe = 5, less than minimum CQE number.
> > ---------------------------------------------------------------------------------------
> >                     RDMA_Read BW Test
> >  Dual-port       : OFF		Device         : hns_2
> >  Number of qps   : 1		Transport type : IB
> >  Connection type : RC		Using SRQ      : OFF
> >  TX depth        : 5
> >  CQ Moderation   : 5
> >  Mtu             : 1024[B]
> >  Link type       : Ethernet
> >  GID index       : 2
> >  Outstand reads  : 128
> >  rdma_cm QPs	 : OFF
> >  Data ex. method : Ethernet
> > ----------------------------------------------------------------------
> > -----------------  local address: LID 0000 QPN 0x0013 PSN 0x94fd3e OUT
> > 0x80 RKey 0x000300 VAddr 0x00ffffb0ec9000
> >  GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:10:110
> >  remote address: LID 0000 QPN 0x0012 PSN 0x282156 OUT 0x80 RKey
> > 0x000200 VAddr 0x00ffff8f99d000
> >  GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:10:110
> > ---------------------------------------------------------------------------------------
> >  #bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]
> > MsgRate[Mpps]
> >  65536      5                8890.35            8887.82		   0.142205
> > ---------------------------------------------------------------------------------------
> > [1]+  Done                    ib_read_bw -d hns_2 -n 5 > /dev/null 2>&1
> > root@(none)$ ib_read_bw -d hns_2 -n 5 > /dev/null 2>&1 & [1] 1155
> > root@(none)$ ib_read_bw -d hns_2 -n 5 192.168.10.110
> > ----------------------------------------------------------------------
> > ----------------- Device not recognized to implement inline feature.
> > Disabling it cqe = 5, less than minimum CQE number.
> > ---------------------------------------------------------------------------------------
> >                     RDMA_Read BW Test
> >  Dual-port       : OFF		Device         : hns_2
> >  Number of qps   : 1		Transport type : IB
> >  Connection type : RC		Using SRQ      : OFF
> >  TX depth        : 5
> >  CQ Moderation   : 5
> >  Mtu             : 1024[B]
> >  Link type       : Ethernet
> >  GID index       : 2
> >  Outstand reads  : 128
> >  rdma_cm QPs	 : OFF
> >  Data ex. method : Ethernet
> > ----------------------------------------------------------------------
> > -----------------  local address: LID 0000 QPN 0x0015 PSN 0x4ef728 OUT
> > 0x80 RKey 0x000300 VAddr 0x00ffffb46a4000
> >  GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:10:110
> >  remote address: LID 0000 QPN 0x0014 PSN 0x6ba809 OUT 0x80 RKey
> > 0x000200 VAddr 0x00ffff88787000
> >  GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:10:110
> > ---------------------------------------------------------------------------------------
> >  #bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]
> > MsgRate[Mpps]
> >  65536      5                9204.54            9204.54		   0.147273
> > ---------------------------------------------------------------------------------------
> > [1]+  Done                    ib_read_bw -d hns_2 -n 5 > /dev/null 2>&1
> > root@(none)$ ib_read_bw -d hns_2 -n 5 > /dev/null 2>&1 & [1] 1157
> > root@(none)$ ib_read_bw -d hns_2 -n 5 192.168.10.110
> > [   95.192596] BUG: Bad rss-counter state mm:(____ptrval____) idx:0 val:-1
> > [   95.192596] BUG: Bad rss-counter state mm:(____ptrval____) idx:0 val:-1
> > [   95.205883] BUG: Bad rss-counter state mm:(____ptrval____) idx:1 val:1
> > [   95.205883] BUG: Bad rss-counter state mm:(____ptrval____) idx:1 val:1
> > ----------------------------------------------------------------------
> > ----------------- Device not recognized to implement inline feature.
> > Disabling it cqe = 5, less than minimum CQE number.
> > ---------------------------------------------------------------------------------------
> >                     RDMA_Read BW Test
> >  Dual-port       : OFF		Device         : hns_2
> >  Number of qps   : 1		Transport type : IB
> >  Connection type : RC		Using SRQ      : OFF
> >  TX depth        : 5
> >  CQ Moderation   : 5
> >  Mtu             : 1024[B]
> >  Link type       : Ethernet
> >  GID index       : 2
> >  Outstand reads  : 128
> >  rdma_cm QPs	 : OFF
> >  Data ex. method : Ethernet
> > ----------------------------------------------------------------------
> > -----------------  local address: LID 0000 QPN 0x0016 PSN 0xafe673 OUT
> > 0x80 RKey 0x000200 VAddr 0x00ffffb96fc000
> >  GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:10:110
> >  remote address: LID 0000 QPN 0x0017 PSN 0xdfb0e7 OUT 0x80 RKey
> > 0x000300 VAddr 0x00ffff927a6000
> >  GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:10:110
> > ---------------------------------------------------------------------------------------
> >  #bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]
> > MsgRate[Mpps]
> >  65536      5                8765.63            8758.26		   0.140132
> > ---------------------------------------------------------------------------------------
> > [1]+  Done                    ib_read_bw -d hns_2 -n 5 > /dev/null 2>&1
> > root@(none)$ ib_read_bw -d hns_2 -n 5 > /dev/null 2>&1 & [[1] 1159
> >    96.192577] BUG: Bad rss-counter state mm:(____ptrval____) idx:0 val:-1
> > [   96.192577] BUG: Bad rss-counter state mm:(____ptrval____) idx:0 val:-1
> > [   96.206731] BUG: Bad rss-counter state mm:(____ptrval____) idx:1 val:1
> > [   96.206731] BUG: Bad rss-counter state mm:(____ptrval____) idx:1 val:1
> > root@(none)$ ib_read_bw -d hns_2 -n 5 192.168.10.110
> > ----------------------------------------------------------------------
> > ----------------- Device not recognized to implement inline feature.
> > Disabling it cqe = 5, less than minimum CQE number.
> > ---------------------------------------------------------------------------------------
> >                     RDMA_Read BW Test
> >  Dual-port       : OFF		Device         : hns_2
> >  Number of qps   : 1		Transport type : IB
> >  Connection type : RC		Using SRQ      : OFF
> >  TX depth        : 5
> >  CQ Moderation   : 5
> >  Mtu             : 1024[B]
> >  Link type       : Ethernet
> >  GID index       : 2
> >  Outstand reads  : 128
> >  rdma_cm QPs	 : OFF
> >  Data ex. method : Ethernet
> > ----------------------------------------------------------------------
> > -----------------  local address: LID 0000 QPN 0x0019 PSN 0xa607e OUT
> > 0x80 RKey 0x000300 VAddr 0x00ffffb6020000
> >  GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:10:110
> >  remote address: LID 0000 QPN 0x0018 PSN 0xc3b7bc OUT 0x80 RKey
> > 0x000200 VAddr 0x00ffffbe95e000
> >  GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:10:110
> > ---------------------------------------------------------------------------------------
> >  #bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]
> > MsgRate[Mpps]
> >  65536      5                8620.62            8611.12		   0.137778
> > ---------------------------------------------------------------------------------------
> > [1]+  Done                    ib_read_bw -d hns_2 -n 5 > /dev/null 2>&1
> > root@(none)$ ib_read_bw -d hns_2 -n 5 > /dev/null 2>&1 & [1] 1161
> > root@(none)$ ib_read_bw -d hns_2 -n 5 192.168.10.110
> > [   97.540585] BUG: Bad rss-counter state mm:(____ptrval____) idx:0 val:-1
> > [   97.540585] BUG: Bad rss-counter state mm:(____ptrval____) idx:0 val:-1
> > [   97.553871] BUG: Bad rss-counter state mm:(____ptrval____) idx:1 val:1
> > [   97.553871] BUG: Bad rss-counter state mm:(____ptrval____) idx:1 val:1
> > ----------------------------------------------------------------------
> > ----------------- Device not recognized to implement inline feature.
> > Disabling it cqe = 5, less than minimum CQE number.
> > ---------------------------------------------------------------------------------------
> >                     RDMA_Read BW Test
> >  Dual-port       : OFF		Device         : hns_2
> >  Number of qps   : 1		Transport type : IB
> >  Connection type : RC		Using SRQ      : OFF
> >  TX depth        : 5
> >  CQ Moderation   : 5
> >  Mtu             : 1024[B]
> >  Link type       : Ethernet
> >  GID index       : 2
> >  Outstand reads  : 128
> >  rdma_cm QPs	 : OFF
> >  Data ex. method : Ethernet
> > ----------------------------------------------------------------------
> > -----------------  local address: LID 0000 QPN 0x001b PSN 0x3ea763 OUT
> > 0x80 RKey 0x000300 VAddr 0x00ffffb8bde000
> >  GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:10:110
> >  remote address: LID 0000 QPN 0x001a PSN 0x818b6a OUT 0x80 RKey
> > 0x000200 VAddr 0x00ffffb8df2000
> >  GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:10:110
> > ---------------------------------------------------------------------------------------
> >  #bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]
> > MsgRate[Mpps]
> >  65536      5                9070.99            9068.36		   0.145094
> > ---------------------------------------------------------------------------------------
> > [1]+  Done             [       ib_read_bw -d hns_2 -n 5 > /dev/null 2>&1
> >    97.812496] BUG: Bad page state in process swapper/99  pfn:203f958e4
> > [   97.812496] BUG: Bad page state in process swapper/99  pfn:203f958e4
> > [   97.812498] BUG: Bad page state in process swapper/100  pfn:203f9597a
> > [   97.812498] BUG: Bad page state in process swapper/100  pfn:203f9597a
> > [   97.812502] page:ffff7e80fe565e80 refcount:-1 mapcount:0
> > mapping:0000000000000000 index:0x1
> > [   97.812502] page:ffff7e80fe565e80 refcount:-1 mapcount:0
> > mapping:0000000000000000 index:0x1
> > [   97.831388] page:ffff7e80fe563900 reroot@(none)$ fcount:-1 mapcount:0
> > mapping:0000000000000000 index:0x1
> > [   97.831388] page:ffff7e80fe563900 refcount:-1 mapcount:0
> > mapping:0000000000000000 index:0x1
> > [   97.844317] flags: 0xdfffe0000000000a(referenced|dirty)
> > [   97.844317] flags: 0xdfffe0000000000a(referenced|dirty)
> > [   97.861087] flags: 0xdfffe00000000000()
> > [   97.861087] flags: 0xdfffe00000000000()
> > [   97.861091] raw: dfffe00000000000 dead000000000100 dead000000000200
> > 0000000000000000
> > [   97.861091] raw: dfffe00000000000 dead000000000100 dead000000000200
> > 0000000000000000
> > [   97.879857] raw: dfffe0000000000a dead000000000100 dead000000000200
> > 0000000000000000
> > [   97.879857] raw: dfffe0000000000a dead000000000100 dead000000000200
> > 0000000000000000
> > [   97.890339] raw: 0000000000000001 0000000000000000 ffffffffffffffff
> > 0000000000000000
> > [   97.890339] raw: 0000000000000001 0000000000000000 ffffffffffffffff
> > 0000000000000000
> > [   97.898027] raw: 0000000000000001 0000000000000000 ffffffffffffffff
> > 0000000000000000
> > [   97.898027] raw: 0000000000000001 0000000000000000 ffffffffffffffff
> > 0000000000000000
> > [   97.913576] page dumped because: nonzero _refcount
> > [   97.913576] page dumped because: nonzero _refcount
> > [   97.929124] page dumped because: nonzero _refcount
> > [   97.929124] page dumped because: nonzero _refcount
> > [   97.929125] Modules linked in: hns_roce_hw_v2 hns_roce hclge hns3 hnae3
> > [   97.929125] Modules linked in: hns_roce_hw_v2 hns_roce hclge hns3 hnae3
> > [   97.944673] Modules linked in: hns_roce_hw_v2 hns_roce hclge hns3 hnae3
> > [   97.944673] Modules linked in: hns_roce_hw_v2 hns_roce hclge hns3 hnae3
> > [   97.960225] CPU: 100 PID: 0 Comm: swapper/100 Not tainted 5.2.0-rc4-
> > gdc75d8f9 #1
> > [   97.960225] CPU: 100 PID: 0 Comm: swapper/100 Not tainted 5.2.0-rc4-
> > gdc75d8f9 #1
> > [   98.020847] Hardware name: Huawei TaiShan 2280 V2/BC82AMDA, BIOS TA
> > BIOS 2280-A CS V2.26.01 06/13/2019
> > [   98.020847] Hardware name: Huawei TaiShan 2280 V2/BC82AMDA, BIOS TA
> > BIOS 2280-A CS V2.26.01 06/13/2019
> > [   98.039542] Call trace:
> > [   98.039542] Call trace:
> > [   98.044440]  dump_backtrace+0x0/0x140
> > [   98.044440]  dump_backtrace+0x0/0x140
> > [   98.051779]  show_stack+0x14/0x20
> > [   98.051779]  show_stack+0x14/0x20
> > [   98.058422]  dump_stack+0xa8/0xcc
> > [   98.058422]  dump_stack+0xa8/0xcc
> > [   98.065065]  bad_page+0xe8/0x150
> > [   98.065065]  bad_page+0xe8/0x150
> > [   98.071532]  free_pages_check_bad+0x70/0xa8
> > [   98.071532]  free_pages_check_bad+0x70/0xa8
> > [   98.079920]  free_pcppages_bulk+0x430/0x6d8
> > [   98.079920]  free_pcppages_bulk+0x430/0x6d8
> > [   98.088308]  free_unref_page_commit+0xc0/0xf8
> > [   98.088308]  free_unref_page_commit+0xc0/0xf8
> > [   98.097045]  free_unref_page+0x78/0x98
> > [   98.097045]  free_unref_page+0x78/0x98
> > [   98.104561]  __put_page+0x44/0x50
> > [   98.104561]  __put_page+0x44/0x50
> > [   98.111202]  free_page_and_swap_cache+0xac/0x100
> > [   98.111202]  free_page_and_swap_cache+0xac/0x100
> > [   98.120463]  tlb_remove_table_rcu+0x30/0x58
> > [   98.120463]  tlb_remove_table_rcu+0x30/0x58
> > [   98.128852]  rcu_core+0x2d8/0x5d8
> > [   98.128852]  rcu_core+0x2d8/0x5d8
> > [   98.135493]  __do_softirq+0x11c/0x3a0
> > [   98.135493]  __do_softirq+0x11c/0x3a0
> > [   98.142834]  irq_exit+0xd0/0xd8
> > [   98.142834]  irq_exit+0xd0/0xd8
> > [   98.149127]  __handle_domain_irq+0x60/0xb0
> > [   98.149127]  __handle_domain_irq+0x60/0xb0
> > [   98.157339]  gic_handle_irq+0x5c/0x154
> > [   98.157339]  gic_handle_irq+0x5c/0x154
> > [   98.164853]  el1_irq+0xb8/0x180
> > [   98.164853]  el1_irq+0xb8/0x180
> > [   98.171144]  arch_cpu_idle+0x30/0x230
> > [   98.171144]  arch_cpu_idle+0x30/0x230
> > [   98.178484]  default_idle_call+0x1c/0x38
> > [   98.178484]  default_idle_call+0x1c/0x38
> > [   98.186349]  do_idle+0x1f0/0x2d0
> > [   98.186349]  do_idle+0x1f0/0x2d0
> > [   98.192815]  cpu_startup_entry+0x24/0x28
> > [   98.192815]  cpu_startup_entry+0x24/0x28
> > [   98.200679]  secondary_start_kernel+0x18c/0x1d0
> > [   98.200679]  secondary_start_kernel+0x18c/0x1d0
> > [   98.209765] Disabling lock debugging due to kernel taint
> > [   98.209765] Disabling lock debugging due to kernel taint
> > [   98.209767] CPU: 99 PID: 0 Comm: swapper/99 Not tainted 5.2.0-rc4-
> gdc75d8f9
> > #1
> > [   98.209767] CPU: 99 PID: 0 Comm: swapper/99 Not tainted 5.2.0-rc4-
> gdc75d8f9
> > #1
> > [   98.209768] Hardware name: Huawei TaiShan 2280 V2/BC82AMDA, BIOS TA
> > BIOS 2280-A CS V2.26.01 06/13/2019
> > [   98.209768] Hardware name: Huawei TaiShan 2280 V2/BC82AMDA, BIOS TA
> > BIOS 2280-A CS V2.26.01 06/13/2019
> > [   98.220425] BUG: Bad page state in process swapper/100  pfn:203f95977
> > [   98.220425] BUG: Bad page state in process swapper/100  pfn:203f95977
> > [   98.234925] Call trace:
> > [   98.234925] Call trace:
> > [   98.253619] page:ffff7e80fe565dc0 refcount:-1 mapcount:0
> > mapping:0000000000000000 index:0x1
> > [   98.253619] page:ffff7e80fe565dc0 refcount:-1 mapcount:0
> > mapping:0000000000000000 index:0x1
> > [   98.266550]  dump_backtrace+0x0/0x140
> > [   98.266550]  dump_backtrace+0x0/0x140
> > [   98.271441] flags: 0xdfffe00000000000()
> > [   98.271441] flags: 0xdfffe00000000000()
> > [   98.271442] raw: dfffe00000000000 dead000000000100 dead000000000200
> > 0000000000000000
> > [   98.271442] raw: dfffe00000000000 dead000000000100 dead000000000200
> > 0000000000000000
> > [   98.288214]  show_stack+0x14/0x20
> > [   98.288214]  show_stack+0x14/0x20
> > [   98.295552] raw: 0000000000000001 0000000000000000 ffffffffffffffff
> > 0000000000000000
> > [   98.295552] raw: 0000000000000001 0000000000000000 ffffffffffffffff
> > 0000000000000000
> > [   98.303241]  dump_stack+0xa8/0xcc
> > [   98.303241]  dump_stack+0xa8/0xcc
> > [   98.318788] page dumped because: nonzero _refcount
> > [   98.318788] page dumped because: nonzero _refcount
> > [   98.325429]  bad_page+0xe8/0x150
> > [   98.325429]  bad_page+0xe8/0x150
> > [   98.325431]  free_pages_check_bad+0x70/0xa8
> > [   98.325431]  free_pages_check_bad+0x70/0xa8
> > [   98.340978] Modules linked in: hns_roce_hw_v2 hns_roce hclge hns3 hnae3
> > [   98.340978] Modules linked in: hns_roce_hw_v2 hns_roce hclge hns3 hnae3
> > [   98.347620]  free_pcppages_bulk+0x430/0x6d8
> > [   98.347620]  free_pcppages_bulk+0x430/0x6d8
> > [   98.393743]  free_unref_page_commit+0xc0/0xf8
> > [   98.393743]  free_unref_page_commit+0xc0/0xf8
> > [   98.402480]  free_unref_page+0x78/0x98
> > [   98.402480]  free_unref_page+0x78/0x98
> > [   98.409994]  __free_pages+0x44/0x50
> > [   98.409994]  __free_pages+0x44/0x50
> > [   98.416985]  free_pages.part.5+0x1c/0x28
> > [   98.416985]  free_pages.part.5+0x1c/0x28
> > [   98.424848]  free_pages+0x14/0x20
> > [   98.424848]  free_pages+0x14/0x20
> > [   98.431488]  tlb_remove_table_rcu+0x4c/0x58
> > [   98.431488]  tlb_remove_table_rcu+0x4c/0x58
> > [   98.439876]  rcu_core+0x2d8/0x5d8
> > [   98.439876]  rcu_core+0x2d8/0x5d8
> > [   98.446517]  __do_softirq+0x11c/0x3a0
> > [   98.446517]  __do_softirq+0x11c/0x3a0
> > [   98.453858]  irq_exit+0xd0/0xd8
> > [   98.453858]  irq_exit+0xd0/0xd8
> > [   98.460150]  __handle_domain_irq+0x60/0xb0
> > [   98.460150]  __handle_domain_irq+0x60/0xb0
> > [   98.468363]  gic_handle_irq+0x5c/0x154
> > [   98.468363]  gic_handle_irq+0x5c/0x154
> > [   98.475876]  el1_irq+0xb8/0x180
> > [   98.475876]  el1_irq+0xb8/0x180
> > [   98.482168]  arch_cpu_idle+0x30/0x230
> > [   98.482168]  arch_cpu_idle+0x30/0x230
> > [   98.489507]  default_idle_call+0x1c/0x38
> > [   98.489507]  default_idle_call+0x1c/0x38
> > [   98.497371]  do_idle+0x1f0/0x2d0
> > [   98.497371]  do_idle+0x1f0/0x2d0
> > [   98.503837]  cpu_startup_entry+0x24/0x28
> > [   98.503837]  cpu_startup_entry+0x24/0x28
> > [   98.511701]  secondary_start_kernel+0x18c/0x1d0
> > [   98.511701]  secondary_start_kernel+0x18c/0x1d0
> > [   98.520788] CPU: 100 PID: 0 Comm: swapper/100 Tainted: G    B
> 5.2.0-
> > rc4-gdc75d8f9 #1
> > [   98.520788] CPU: 100 PID: 0 Comm: swapper/100 Tainted: G    B
> 5.2.0-
> > rc4-gdc75d8f9 #1
> > [   98.520789] BUG: Bad page state in process swapper/99  pfn:203f96a5a
> > [   98.520789] BUG: Bad page state in process swapper/99  pfn:203f96a5a
> > [   98.520791] page:ffff7e80fe5a9680 refcount:-1 mapcount:0
> > mapping:0000000000000000 index:0x1
> > [   98.520791] page:ffff7e80fe5a9680 refcount:-1 mapcount:0
> > mapping:0000000000000000 index:0x1
> > [   98.538435] Hardware name: Huawei TaiShan 2280 V2/BC82AMDA, BIOS TA
> > BIOS 2280-A CS V2.26.01 06/13/2019
> > [   98.538435] Hardware name: Huawei TaiShan 2280 V2/BC82AMDA, BIOS TA
> > BIOS 2280-A CS V2.26.01 06/13/2019
> > [   98.538436] Call trace:
> > [   98.538436] Call trace:
> > [   98.551189] flags: 0xdfffe0000000000a(referenced|dirty)
> > [   98.551189] flags: 0xdfffe0000000000a(referenced|dirty)
> > [   98.551191] raw: dfffe0000000000a dead000000000100 dead000000000200
> > 0000000000000000
> > [   98.551191] raw: dfffe0000000000a dead000000000100 dead000000000200
> > 0000000000000000
> > [   98.567962]  dump_backtrace+0x0/0x140
> > [   98.567962]  dump_backtrace+0x0/0x140
> > [   98.567963]  show_stack+0x14/0x20
> > [   98.567963]  show_stack+0x14/0x20
> > [   98.586656] raw: 0000000000000001 0000000000000000 ffffffffffffffff
> > 0000000000000000
> > [   98.586656] raw: 0000000000000001 0000000000000000 ffffffffffffffff
> > 0000000000000000
> > [   98.591549]  dump_stack+0xa8/0xcc
> > [   98.591549]  dump_stack+0xa8/0xcc
> > [   98.602031] page dumped because: nonzero _refcount
> > [   98.602031] page dumped because: nonzero _refcount
> > [   98.617581]  bad_page+0xe8/0x150
> > [   98.617581]  bad_page+0xe8/0x150
> > [   98.624918] Modules linked in: hns_roce_hw_v2 hns_roce hclge hns3 hnae3
> > [   98.624918] Modules linked in: hns_roce_hw_v2 hns_roce hclge hns3 hnae3
> > [   98.631559]  free_pages_check_bad+0x70/0xa8
> > [   98.631559]  free_pages_check_bad+0x70/0xa8
> > [   98.631560]  free_pcppages_bulk+0x430/0x6d8
> > [   98.631560]  free_pcppages_bulk+0x430/0x6d8
> > [   98.699872]  free_unref_page_commit+0xc0/0xf8
> > [   98.699872]  free_unref_page_commit+0xc0/0xf8
> > [   98.708610]  free_unref_page+0x78/0x98
> > [   98.708610]  free_unref_page+0x78/0x98
> > [   98.716123]  __put_page+0x44/0x50
> > [   98.716123]  __put_page+0x44/0x50
> > [   98.722764]  free_page_and_swap_cache+0xac/0x100
> > [   98.722764]  free_page_and_swap_cache+0xac/0x100
> > [   98.732025]  tlb_remove_table_rcu+0x30/0x58
> > [   98.732025]  tlb_remove_table_rcu+0x30/0x58
> > [ib_read_bw -d hns_2 -n 5 > /dev/null 2>&1 &
> >    98.740412]  rcu_core+0x2d8/0x5d8
> > [   98.740412]  rcu_core+0x2d8/0x5d8
> > [   98.750959]  __do_softirq+0x11c/0x3a0
> > [   98.750959]  __do_softirq+0x11c/0x3a0
> > [   98.758298]  irq_exit+0xd0/0xd8
> > [   98.758298]  irq_exit+0xd0/0xd8
> > [   98.764589]  __handle_domain_irq+0x60/0xb0
> > [   98.764589]  __handle_domain_irq+0x60/0xb0
> > [   98.772802]  gic_handle_irq+0x5c/0x154
> > [   98.772802]  gic_handle_irq+0x5c/0x154
> > [   98.780316]  el1_irq+0xb8/0x180
> > [   98.780316]  el1_irq+0xb8/0x180
> > [   98.786608]  arch_cp[1] 1163
> > u_idle+0x30/0x230
> > [   98.786608]  arch_cpu_idle+0x30/0x230
> > [   98.794815]  default_idle_call+0x1c/0x38
> > [   98.794815]  default_idle_call+0x1c/0x38
> > [   98.802678]  do_idle+0x1f0/0x2d0
> > [   98.802678]  do_idle+0x1f0/0x2d0
> > [   98.809144]  cpu_startup_entry+0x24/0x28
> > [   98.809144]  cpu_startup_entry+0x24/0x28
> > [   98.817008]  secondary_start_kernel+0x18c/0x1d0
> > [   98.817008]  secondary_start_kernel+0x18c/0x1d0
> > [   98.826095] CPU: 99 PID: 0 Comm: swapper/99 Tainted: G    B             5.2.0-
> rc4-
> > gdc75d8f9 #1
> > [   98.826095] CPU: 99 PID: 0 Comm: swapper/99 Tainted: G    B             5.2.0-
> rc4-
> > gdc75d8f9 #1
> > [root@(none)$    98.843392] Hardware name: Huawei TaiShan 2280
> V2/BC82AMDA,
> > BIOS TA BIOS 2280-A CS V2.26.01 06/13/2019
> > [   98.843392] Hardware name: Huawei TaiShan 2280 V2/BC82AMDA, BIOS TA
> > BIOS 2280-A CS V2.26.01 06/13/2019
> > [   98.864082] Call trace:
> > [   98.864082] Call trace:
> > [   98.868976]  dump_backtrace+0x0/0x140
> > [   98.868976]  dump_backtrace+0x0/0x140
> > [   98.876315]  show_stack+0x14/0x20
> > [   98.876315]  show_stack+0x14/0x20
> > [   98.882955]  dump_stack+0xa8/0xcc
> > [   98.882955]  dump_stack+0xa8/0xcc
> > [   98.889596]  bad_page+0xe8/0x150
> > [   98.889596]  bad_page+0xe8/0x150
> > [   98.896062]  free_pages_check_bad+0x70/0xa8
> > [   98.896062]  free_pages_check_bad+0x70/0xa8
> > [   98.904450]  free_pcppages_bulk+0x430/0x6d8
> > [   98.904450]  free_pcppages_bulk+0x430/0x6d8
> > [   98.912838]  free_unref_page_commit+0xc0/0xf8
> > [   98.912838]  free_unref_page_commit+0xc0/0xf8
> > [   98.921575]  free_unref_page+0x78/0x98
> > [   98.921575]  free_unref_page+0x78/0x98
> > [   98.929089]  __free_pages+0x44/0x50
> > [   98.929089]  __free_pages+0x44/0x50
> > [   98.936079]  free_pages.part.5+0x1c/0x28
> > [   98.936079]  free_pages.part.5+0x1c/0x28
> > [   98.943943]  free_pages+0x14/0x20
> > [   98.943943]  free_pages+0x14/0x20
> > [   98.950583]  tlb_remove_table_rcu+0x4c/0x58
> > [   98.950583]  tlb_remove_table_rcu+0x4c/0x58
> > [   98.958970]  rcu_core+0x2d8/0x5d8
> > [   98.958970]  rcu_core+0x2d8/0x5d8
> > [   98.965611]  __do_softirq+0x11c/0x3a0
> > [   98.965611]  __do_softirq+0x11c/0x3a0
> > [   98.972951]  irq_exit+0xd0/0xd8
> > [   98.972951]  irq_exit+0xd0/0xd8
> > [   98.979242]  __handle_domain_irq+0x60/0xb0
> > [   98.979242]  __handle_domain_irq+0x60/0xb0
> > [   98.987455]  gic_handle_irq+0x5c/0x154
> > [   98.987455]  gic_handle_irq+0x5c/0x154
> > [   98.994968]  el1_irq+0xb8/0x180
> > [   98.994968]  el1_irq+0xb8/0x180
> > [   99.001260]  arch_cpu_idle+0x30/0x230
> > [   99.001260]  arch_cpu_idle+0x30/0x230
> > [   99.008599]  default_idle_call+0x1c/0x38
> > [   99.008599]  default_idle_call+0x1c/0x38
> > [   99.016462]  do_idle+0x1f0/0x2d0
> > [   99.016462]  do_idle+0x1f0/0x2d0
> > [   99.022927]  cpu_startup_entry+0x24/0x28
> > [   99.022927]  cpu_startup_entry+0x24/0x28
> > [   99.030791]  secondary_start_kernel+0x18c/0x1d0
> > [   99.030791]  secondary_start_kernel+0x18c/0x1d0
> > [   99.039878] BUG: Bad page state in process swapper/99  pfn:203f958ec
> > [   99.039878] BUG: Bad page state in process swapper/99  pfn:203f958ec
> 
> Perhaps we got a bad page descriptor when unfolding SGEs in
> __ib_umem_release() and doing a put page? But I don’t see the dereg MR path in the
> trace so that’s throwing me off a little.
> 
> Anyways, since we suspect the page combining algo as a potential culprit,
> instrumenting some debug like this could tell us how the page merging and
> unfolding took place.
> 
> echo -n 'func ib_umem_get +p' >/sys/kernel/debug/dynamic_debug/control
> echo -n 'func __ib_umem_release +p' >/sys/kernel/debug/dynamic_debug/control
> echo -n 'func ib_umem_add_sg_table +p'
> >/sys/kernel/debug/dynamic_debug/control
> 
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index
> 08da840..8d4aba1 100644
> --- a/drivers/infiniband/core/umem.c
> +++ b/drivers/infiniband/core/umem.c
> @@ -47,6 +47,10 @@ static void __ib_umem_release(struct ib_device *dev, struct
> ib_umem *umem, int d  {
>  	struct sg_page_iter sg_iter;
>  	struct page *page;
> +	int i = 0;
> +
> +	pr_debug("ib_umem_release: START------------------------\n");
> +	pr_debug("umem [%px] sg_nents [%u] \n", umem, umem->sg_nents);
> 
>  	if (umem->nmap > 0)
>  		ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents,
> @@ -54,12 +58,15 @@ static void __ib_umem_release(struct ib_device *dev,
> struct ib_umem *umem, int d
> 
>  	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
>  		page = sg_page_iter_page(&sg_iter);
> +		pr_debug("sg [0x%px] len [%u] page [0x%px] pfn [0x%lx] refcnt
> [%d] i [%d]\n",
> +			 (&sg_iter)->sg, (&sg_iter)->sg->length, page,
> page_to_pfn(page),
> +			 page_ref_count(page), i);
>  		if (umem->writable && dirty)
>  			put_user_pages_dirty_lock(&page, 1);
>  		else
>  			put_user_page(page);
>  	}
> -
> +	pr_debug("ib_umem_release: END------------------------\n");
>  	sg_free_table(&umem->sg_head);
>  }
> 
> @@ -103,8 +110,11 @@ static struct scatterlist *ib_umem_add_sg_table(struct
> scatterlist *sg,
>  		for (len = 0; i != npages &&
>  			      first_pfn + len == page_to_pfn(page_list[i]) &&
>  			      len < (max_seg_sz >> PAGE_SHIFT);
> -		     len++)
> +		     len++) {
> +			pr_debug("page_ptr [0x%px] pfn [0x%lx] i [%lu] len
> [%lu]\n",
> +				 page_list[i], page_to_pfn(page_list[i]), i, len);
>  			i++;
> +		}
> 
>  		/* Squash N contiguous pages from page_list into current sge */
>  		if (update_cur_sg) {
> @@ -112,6 +122,8 @@ static struct scatterlist *ib_umem_add_sg_table(struct
> scatterlist *sg,
>  				sg_set_page(sg, sg_page(sg),
>  					    sg->length + (len << PAGE_SHIFT),
>  					    0);
> +				pr_debug("update_sg: [0x%px] len [%u] page
> [0x%px] pfn [0x%lx]\n",
> +					sg, sg->length, sg_page(sg),
> page_to_pfn(sg_page(sg)));
>  				update_cur_sg = false;
>  				continue;
>  			}
> @@ -125,6 +137,8 @@ static struct scatterlist *ib_umem_add_sg_table(struct
> scatterlist *sg,
>  		(*nents)++;
>  		sg_set_page(sg, first_page, len << PAGE_SHIFT, 0);
>  		first = false;
> +		pr_debug("sg: [0x%px] len [%u] page[0x%px] pfn [0x%lx]\n",
> +			 sg, sg->length, first_page, page_to_pfn(first_page));
>  	}
> 
>  	return sg;
> @@ -293,6 +307,10 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata,
> unsigned long addr,
> 
>  	sg = umem->sg_head.sgl;
> 
> +	pr_debug("ib_umem_get: START------------------------\n");
> +	pr_debug("umem [%px] max_seg_size [%u] npages [%lu] pgshift [%u] \n",
> +		umem, dma_get_max_seg_size(context->device->dma_device),
> +		npages, PAGE_SHIFT);
>  	while (npages) {
>  		down_read(&mm->mmap_sem);
>  		ret = get_user_pages(cur_base,
> @@ -323,6 +341,9 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata,
> unsigned long addr,
>  				  DMA_BIDIRECTIONAL,
>  				  dma_attrs);
> 
> +	pr_debug("umem [%px] sg_nents [%u] nmap [%u] \n", umem, umem-
> >sg_nents, umem->nmap);
> +	pr_debug("ib_umem_get: END------------------------\n");
> +
>  	if (!umem->nmap) {
>  		ret = -ENOMEM;
>  		goto umem_release;
> --
> 1.8.3.1
> 

Hi Jason - Wang re-ran to the failure point with this debug patch on arm64, and I am seeing
something anomalous that could explain their problem.

For this umem "ffffd372c23c7c00" we set up a single SGE pointing to a single page.

[  267.102437] ib_umem_get: START------------------------
[  267.102440] umem [ffffd372c23c7c00] max_seg_size [2147483648] npages [1] pgshift [12]
[  267.102442] page_ptr [0xffff7f4db465a100] pfn [0x27d9684] i [0] len [0]
[  267.102445] sg: [0xffffd372c23c7d00] len [4096] page[0xffff7f4db465a100] pfn [0x27d9684]
[  267.102450] umem [ffffd372c23c7c00] sg_nents [1] nmap [1]
[  267.102451] ib_umem_get: END------------------------

But on the release, the for_each_sg_page iterator unfolds the SGE and iterates over 2 pages.
In another dump, he collected the sg_page_count(sg) value that the iterator used, which was 2,
so it must be that sg->offset is non-zero (Wang to confirm).

[  267.366820] ib_umem_release: START------------------------
[  267.366824] umem [ffffd372c23c7c00] sg_nents [1]
[  267.366830] sg [0xffffd372c23c7d00] len [4096] page [0xffff7f4db465a100] pfn [0x27d9684] refcnt [2] i [0] dirty [0:1] w[0]
[  267.366833] sg [0xffffd372c23c7d00] len [4096] page [0xffff7f4db465a140] pfn [0x27d9685] refcnt [2] i [1] dirty [0:1] w[0]
[  267.366835] ib_umem_release: END------------------------

Even though we did sg_set_page() with offset = 0, is the offset not preserved after
the SG is DMA unmapped?

Shiraz

^ permalink raw reply

* Re: [PATCH] ibverbs/rxe: Remove variable self-initialization
From: Jason Gunthorpe @ 2019-07-05 17:44 UTC (permalink / raw)
  To: Maksym Planeta; +Cc: Moni Shoua, Doug Ledford, linux-rdma
In-Reply-To: <20190702134928.31534-1-mplaneta@os.inf.tu-dresden.de>

On Tue, Jul 02, 2019 at 03:49:28PM +0200, Maksym Planeta wrote:
> In some cases (not in this particular one) variable self-initialization
> can lead to undefined behavior. In this case, it is just obscure code.
> 
> Signed-off-by: Maksym Planeta <mplaneta@os.inf.tu-dresden.de>
> ---
>  drivers/infiniband/sw/rxe/rxe_comp.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

Applied to for-next, thanks

Jason

^ permalink raw reply

* Re: [PATCH rdma-next 0/2] DEVX VHCA tunnel support
From: Jason Gunthorpe @ 2019-07-05 17:40 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Doug Ledford, Leon Romanovsky, RDMA mailing list, Max Gurtovoy,
	Yishai Hadas, Saeed Mahameed, linux-netdev
In-Reply-To: <20190701181402.25286-1-leon@kernel.org>

On Mon, Jul 01, 2019 at 09:14:00PM +0300, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@mellanox.com>
> 
> Hi,
> 
> Those two patches introduce VHCA tunnel mechanism to DEVX interface
> needed for Bluefield SOC. See extensive commit messages for more
> information.
> 
> Thanks
> 
> Max Gurtovoy (2):
>   net/mlx5: Introduce VHCA tunnel device capability
>   IB/mlx5: Implement VHCA tunnel mechanism in DEVX
> 
>  drivers/infiniband/hw/mlx5/devx.c | 24 ++++++++++++++++++++----
>  include/linux/mlx5/mlx5_ifc.h     | 10 ++++++++--
>  2 files changed, 28 insertions(+), 6 deletions(-)

This looks OK, can you apply the mlx5-next patch please?

Thanks,
Jason


^ permalink raw reply

* Re: [EXT] Re: [RFC rdma 1/3] RDMA/core: Create a common mmap function
From: Jason Gunthorpe @ 2019-07-05 17:35 UTC (permalink / raw)
  To: Michal Kalderon
  Cc: Gal Pressman, dledford@redhat.com, leon@kernel.org,
	sleybo@amazon.com, Ariel Elior, linux-rdma@vger.kernel.org
In-Reply-To: <MN2PR18MB3182F4496DA01CA2B113DF04A1F50@MN2PR18MB3182.namprd18.prod.outlook.com>

On Fri, Jul 05, 2019 at 05:24:18PM +0000, Michal Kalderon wrote:
> > From: Jason Gunthorpe <jgg@ziepe.ca>
> > Sent: Friday, July 5, 2019 6:33 PM
> > 
> > On Fri, Jul 05, 2019 at 03:29:03PM +0000, Michal Kalderon wrote:
> > > > From: Jason Gunthorpe <jgg@ziepe.ca>
> > > > Sent: Thursday, July 4, 2019 3:35 PM
> > > >
> > > > External Email
> > > >
> > > > On Wed, Jul 03, 2019 at 11:19:34AM +0300, Gal Pressman wrote:
> > > > > On 03/07/2019 1:31, Jason Gunthorpe wrote:
> > > > > >> Seems except Mellanox + hns the mmap flags aren't ABI.
> > > > > >> Also, current Mellanox code seems like it won't benefit from
> > > > > >> mmap cookie helper functions in any case as the mmap function
> > > > > >> is very specific and the flags used indicate the address and
> > > > > >> not just how to map
> > > > it.
> > > > > >
> > > > > > IMHO, mlx5 has a goofy implementation here as it codes all of
> > > > > > the object type, handle and cachability flags in one thing.
> > > > >
> > > > > Do we need object type flags as well in the generic mmap code?
> > > >
> > > > At the end of the day the driver needs to know what page to map
> > > > during the mmap syscall.
> > > >
> > > > mlx5 does this by encoding the page type in the address, and then
> > > > many types have separate lookups based on the offset for the actual
> > page.
> > > >
> > > > IMHO the single lookup and opaque offset is generally better..
> > > >
> > > > Since the mlx5 scheme is ABI it can't be changed unfortunately.
> > > >
> > > > If you want to do user controlled cachability flags, or not, is a
> > > > fair question, but they still become ABI..
> > > >
> > > > I'm wondering if it really makes sense to do that during the mmap,
> > > > or if the cachability should be set as part of creating the cookie?
> > > >
> > > > > Another issue is that these flags aren't exposed in an ABI file,
> > > > > so a userspace library can't really make use of it in current state.
> > > >
> > > > Woops.
> > > >
> > > > Ah, this is all ABI so you need to dig out of this hole ASAP :)
> > > >
> > > Jason, I didn't follow - what is all ABI?
> > > currently EFA implementation encodes the cachability inside the key,
> > > It's not exposed in ABI file and is opaque to user-space. The kernel
> > > decides on the cachability and gets it back in the key when mmap is
> > > called. It seems good enough for the current cases.
> > 
> > Then the key 'offset' should not include cachability information at all.
> > 
> Fair enough, so as you stated above the cachability can be set in the cookie.
> Would we still like to leave some bits for future ABI enhancements or requests from the user?
> Similar to the page type that mlx has?

Doesn't make sense to mix and match; the page_type was just some way
to avoid tracking cookies in some cases. If we always have a
cookie, then the cookie should indicate the type based on how it was
created. Totally opaque.

Jason

^ permalink raw reply

* RE: [EXT] Re: [RFC rdma 1/3] RDMA/core: Create a common mmap function
From: Michal Kalderon @ 2019-07-05 17:24 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Gal Pressman, dledford@redhat.com, leon@kernel.org,
	sleybo@amazon.com, Ariel Elior, linux-rdma@vger.kernel.org
In-Reply-To: <20190705153248.GB31543@ziepe.ca>

> From: Jason Gunthorpe <jgg@ziepe.ca>
> Sent: Friday, July 5, 2019 6:33 PM
> 
> On Fri, Jul 05, 2019 at 03:29:03PM +0000, Michal Kalderon wrote:
> > > From: Jason Gunthorpe <jgg@ziepe.ca>
> > > Sent: Thursday, July 4, 2019 3:35 PM
> > >
> > > External Email
> > >
> > > On Wed, Jul 03, 2019 at 11:19:34AM +0300, Gal Pressman wrote:
> > > > On 03/07/2019 1:31, Jason Gunthorpe wrote:
> > > > >> Seems except Mellanox + hns the mmap flags aren't ABI.
> > > > >> Also, current Mellanox code seems like it won't benefit from
> > > > >> mmap cookie helper functions in any case as the mmap function
> > > > >> is very specific and the flags used indicate the address and
> > > > >> not just how to map
> > > it.
> > > > >
> > > > > IMHO, mlx5 has a goofy implementation here as it codes all of
> > > > > the object type, handle and cachability flags in one thing.
> > > >
> > > > Do we need object type flags as well in the generic mmap code?
> > >
> > > At the end of the day the driver needs to know what page to map
> > > during the mmap syscall.
> > >
> > > mlx5 does this by encoding the page type in the address, and then
> > > many types have separate lookups based on the offset for the actual
> page.
> > >
> > > IMHO the single lookup and opaque offset is generally better..
> > >
> > > Since the mlx5 scheme is ABI it can't be changed unfortunately.
> > >
> > > If you want to do user controlled cachability flags, or not, is a
> > > fair question, but they still become ABI..
> > >
> > > I'm wondering if it really makes sense to do that during the mmap,
> > > or if the cachability should be set as part of creating the cookie?
> > >
> > > > Another issue is that these flags aren't exposed in an ABI file,
> > > > so a userspace library can't really make use of it in current state.
> > >
> > > Woops.
> > >
> > > Ah, this is all ABI so you need to dig out of this hole ASAP :)
> > >
> > Jason, I didn't follow - what is all ABI?
> > currently EFA implementation encodes the cachability inside the key,
> > It's not exposed in ABI file and is opaque to user-space. The kernel
> > decides on the cachability and gets it back in the key when mmap is
> > called. It seems good enough for the current cases.
> 
> Then the key 'offset' should not include cachability information at all.
> 
Fair enough, so as you stated above the cachability can be set in the cookie.
Would we still like to leave some bits for future ABI enhancements or requests from the user?
Similar to the page type that mlx has?

Thanks,
Michal


> Jason

^ permalink raw reply

* Re: [PATCH for-next 0/8] Some fixes from hns
From: Jason Gunthorpe @ 2019-07-05 17:23 UTC (permalink / raw)
  To: Lijun Ou; +Cc: dledford, leon, linux-rdma, linuxarm
In-Reply-To: <1561376872-111496-1-git-send-email-oulijun@huawei.com>

On Mon, Jun 24, 2019 at 07:47:44PM +0800, Lijun Ou wrote:
> Here are some bug fixes as well as code optimization.
> 
> Lang Cheng (3):
>   RDMA/hns: Set reset flag when hw resetting
>   RDMA/hns: Use %pK format pointer print
>   RDMA/hns: Clean up unnecessary variable initialization
> 
> Lijun Ou (1):
>   RDMA/hns: Bugfix for cleaning mtr
> 
> Xi Wang (1):
>   RDMA/hns: Fixs hw access invalid dma memory error
> 
> Yangyang Li (1):
>   RDMA/hns: Modify ba page size for cqe
> 
> chenglang (1):
>   RDMA/hns: Fixup qp release bug
> 
> o00290482 (1):
>   RDMA/hns: Bugfix for calculating qp buffer size
> 
>  drivers/infiniband/hw/hns/hns_roce_cmd.c   |  2 +-
>  drivers/infiniband/hw/hns/hns_roce_hw_v1.c |  4 +++-
>  drivers/infiniband/hw/hns/hns_roce_hw_v2.c |  9 +++++----
>  drivers/infiniband/hw/hns/hns_roce_main.c  |  2 +-
>  drivers/infiniband/hw/hns/hns_roce_pd.c    |  2 +-
>  drivers/infiniband/hw/hns/hns_roce_qp.c    | 13 ++++---------
>  6 files changed, 15 insertions(+), 17 deletions(-)
>

Applied to for-next, thanks

Jason

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox