Linux RDMA and InfiniBand development
 help / color / mirror / Atom feed
* [PATCH for-next 9/9] RDMA/hns: Refactor eq table init for hip08
From: Lijun Ou @ 2019-07-08 13:41 UTC (permalink / raw)
  To: dledford, jgg; +Cc: leon, linux-rdma, linuxarm
In-Reply-To: <1562593285-8037-1-git-send-email-oulijun@huawei.com>

From: Yixian Liu <liuyixian@huawei.com>

To make the code more readable, here moves the part of
naming irq and request irq out of eq table init into an
seperate function.

Signed-off-by: Yixian Liu <liuyixian@huawei.com>
Signed-off-by: Lijun Ou <oulijun@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 176 ++++++++++++++++++-----------
 1 file changed, 107 insertions(+), 69 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index c2ddfc1..83c58be 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -5735,6 +5735,94 @@ static int hns_roce_v2_create_eq(struct hns_roce_dev *hr_dev,
 	return ret;
 }
 
+static int __hns_roce_request_irq(struct hns_roce_dev *hr_dev, int irq_num,
+				  int comp_num, int aeq_num, int other_num)
+{
+	struct hns_roce_eq_table *eq_table = &hr_dev->eq_table;
+	int i, j;
+	int ret;
+
+	for (i = 0; i < irq_num; i++) {
+		hr_dev->irq_names[i] = kzalloc(HNS_ROCE_INT_NAME_LEN,
+					       GFP_KERNEL);
+		if (!hr_dev->irq_names[i]) {
+			ret = -ENOMEM;
+			goto err_kzalloc_failed;
+		}
+	}
+
+	/* irq contains: abnormal + AEQ + CEQ*/
+	for (j = 0; j < irq_num; j++)
+		if (j < other_num)
+			snprintf((char *)hr_dev->irq_names[j],
+				 HNS_ROCE_INT_NAME_LEN, "hns-abn-%d", j);
+		else if (j < (other_num + aeq_num))
+			snprintf((char *)hr_dev->irq_names[j],
+				 HNS_ROCE_INT_NAME_LEN, "hns-aeq-%d",
+				 j - other_num);
+		else
+			snprintf((char *)hr_dev->irq_names[j],
+				 HNS_ROCE_INT_NAME_LEN, "hns-ceq-%d",
+				 j - other_num - aeq_num);
+
+	for (j = 0; j < irq_num; j++) {
+		if (j < other_num)
+			ret = request_irq(hr_dev->irq[j],
+					  hns_roce_v2_msix_interrupt_abn,
+					  0, hr_dev->irq_names[j], hr_dev);
+
+		else if (j < (other_num + comp_num))
+			ret = request_irq(eq_table->eq[j - other_num].irq,
+					  hns_roce_v2_msix_interrupt_eq,
+					  0, hr_dev->irq_names[j + aeq_num],
+					  &eq_table->eq[j - other_num]);
+		else
+			ret = request_irq(eq_table->eq[j - other_num].irq,
+					  hns_roce_v2_msix_interrupt_eq,
+					  0, hr_dev->irq_names[j - comp_num],
+					  &eq_table->eq[j - other_num]);
+		if (ret) {
+			dev_err(hr_dev->dev, "Request irq error!\n");
+			goto err_request_failed;
+		}
+	}
+
+	return 0;
+
+err_request_failed:
+	for (j -= 1; j >= 0; j--)
+		if (j < other_num)
+			free_irq(hr_dev->irq[j], hr_dev);
+		else
+			free_irq(eq_table->eq[j - other_num].irq,
+				 &eq_table->eq[j - other_num]);
+
+err_kzalloc_failed:
+	for (i -= 1; i >= 0; i--)
+		kfree(hr_dev->irq_names[i]);
+
+	return ret;
+}
+
+static void __hns_roce_free_irq(struct hns_roce_dev *hr_dev)
+{
+	int irq_num;
+	int eq_num;
+	int i;
+
+	eq_num = hr_dev->caps.num_comp_vectors + hr_dev->caps.num_aeq_vectors;
+	irq_num = eq_num + hr_dev->caps.num_other_vectors;
+
+	for (i = 0; i < hr_dev->caps.num_other_vectors; i++)
+		free_irq(hr_dev->irq[i], hr_dev);
+
+	for (i = 0; i < eq_num; i++)
+		free_irq(hr_dev->eq_table.eq[i].irq, &hr_dev->eq_table.eq[i]);
+
+	for (i = 0; i < irq_num; i++)
+		kfree(hr_dev->irq_names[i]);
+}
+
 static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev)
 {
 	struct hns_roce_eq_table *eq_table = &hr_dev->eq_table;
@@ -5746,7 +5834,7 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev)
 	int other_num;
 	int comp_num;
 	int aeq_num;
-	int i, j, k;
+	int i;
 	int ret;
 
 	other_num = hr_dev->caps.num_other_vectors;
@@ -5760,27 +5848,18 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev)
 	if (!eq_table->eq)
 		return -ENOMEM;
 
-	for (i = 0; i < irq_num; i++) {
-		hr_dev->irq_names[i] = kzalloc(HNS_ROCE_INT_NAME_LEN,
-					       GFP_KERNEL);
-		if (!hr_dev->irq_names[i]) {
-			ret = -ENOMEM;
-			goto err_failed_kzalloc;
-		}
-	}
-
 	/* create eq */
-	for (j = 0; j < eq_num; j++) {
-		eq = &eq_table->eq[j];
+	for (i = 0; i < eq_num; i++) {
+		eq = &eq_table->eq[i];
 		eq->hr_dev = hr_dev;
-		eq->eqn = j;
-		if (j < comp_num) {
+		eq->eqn = i;
+		if (i < comp_num) {
 			/* CEQ */
 			eq_cmd = HNS_ROCE_CMD_CREATE_CEQC;
 			eq->type_flag = HNS_ROCE_CEQ;
 			eq->entries = hr_dev->caps.ceqe_depth;
 			eq->eqe_size = HNS_ROCE_CEQ_ENTRY_SIZE;
-			eq->irq = hr_dev->irq[j + other_num + aeq_num];
+			eq->irq = hr_dev->irq[i + other_num + aeq_num];
 			eq->eq_max_cnt = HNS_ROCE_CEQ_DEFAULT_BURST_NUM;
 			eq->eq_period = HNS_ROCE_CEQ_DEFAULT_INTERVAL;
 		} else {
@@ -5789,7 +5868,7 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev)
 			eq->type_flag = HNS_ROCE_AEQ;
 			eq->entries = hr_dev->caps.aeqe_depth;
 			eq->eqe_size = HNS_ROCE_AEQ_ENTRY_SIZE;
-			eq->irq = hr_dev->irq[j - comp_num + other_num];
+			eq->irq = hr_dev->irq[i - comp_num + other_num];
 			eq->eq_max_cnt = HNS_ROCE_AEQ_DEFAULT_BURST_NUM;
 			eq->eq_period = HNS_ROCE_AEQ_DEFAULT_INTERVAL;
 		}
@@ -5804,40 +5883,11 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev)
 	/* enable irq */
 	hns_roce_v2_int_mask_enable(hr_dev, eq_num, EQ_ENABLE);
 
-	/* irq contains: abnormal + AEQ + CEQ*/
-	for (k = 0; k < irq_num; k++)
-		if (k < other_num)
-			snprintf((char *)hr_dev->irq_names[k],
-				 HNS_ROCE_INT_NAME_LEN, "hns-abn-%d", k);
-		else if (k < (other_num + aeq_num))
-			snprintf((char *)hr_dev->irq_names[k],
-				 HNS_ROCE_INT_NAME_LEN, "hns-aeq-%d",
-				 k - other_num);
-		else
-			snprintf((char *)hr_dev->irq_names[k],
-				 HNS_ROCE_INT_NAME_LEN, "hns-ceq-%d",
-				 k - other_num - aeq_num);
-
-	for (k = 0; k < irq_num; k++) {
-		if (k < other_num)
-			ret = request_irq(hr_dev->irq[k],
-					  hns_roce_v2_msix_interrupt_abn,
-					  0, hr_dev->irq_names[k], hr_dev);
-
-		else if (k < (other_num + comp_num))
-			ret = request_irq(eq_table->eq[k - other_num].irq,
-					  hns_roce_v2_msix_interrupt_eq,
-					  0, hr_dev->irq_names[k + aeq_num],
-					  &eq_table->eq[k - other_num]);
-		else
-			ret = request_irq(eq_table->eq[k - other_num].irq,
-					  hns_roce_v2_msix_interrupt_eq,
-					  0, hr_dev->irq_names[k - comp_num],
-					  &eq_table->eq[k - other_num]);
-		if (ret) {
-			dev_err(dev, "Request irq error!\n");
-			goto err_request_irq_fail;
-		}
+	ret = __hns_roce_request_irq(hr_dev, irq_num, comp_num,
+				     aeq_num, other_num);
+	if (ret) {
+		dev_err(dev, "Request irq failed.\n");
+		goto err_request_irq_fail;
 	}
 
 	hr_dev->irq_workq =
@@ -5845,26 +5895,20 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev)
 	if (!hr_dev->irq_workq) {
 		dev_err(dev, "Create irq workqueue failed!\n");
 		ret = -ENOMEM;
-		goto err_request_irq_fail;
+		goto err_create_wq_fail;
 	}
 
 	return 0;
 
+err_create_wq_fail:
+	__hns_roce_free_irq(hr_dev);
+
 err_request_irq_fail:
-	for (k -= 1; k >= 0; k--)
-		if (k < other_num)
-			free_irq(hr_dev->irq[k], hr_dev);
-		else
-			free_irq(eq_table->eq[k - other_num].irq,
-				 &eq_table->eq[k - other_num]);
+	hns_roce_v2_int_mask_enable(hr_dev, eq_num, EQ_DISABLE);
 
 err_create_eq_fail:
-	for (j -= 1; j >= 0; j--)
-		hns_roce_v2_free_eq(hr_dev, &eq_table->eq[j]);
-
-err_failed_kzalloc:
 	for (i -= 1; i >= 0; i--)
-		kfree(hr_dev->irq_names[i]);
+		hns_roce_v2_free_eq(hr_dev, &eq_table->eq[i]);
 	kfree(eq_table->eq);
 
 	return ret;
@@ -5883,20 +5927,14 @@ static void hns_roce_v2_cleanup_eq_table(struct hns_roce_dev *hr_dev)
 	/* Disable irq */
 	hns_roce_v2_int_mask_enable(hr_dev, eq_num, EQ_DISABLE);
 
-	for (i = 0; i < hr_dev->caps.num_other_vectors; i++)
-		free_irq(hr_dev->irq[i], hr_dev);
+	__hns_roce_free_irq(hr_dev);
 
 	for (i = 0; i < eq_num; i++) {
 		hns_roce_v2_destroy_eqc(hr_dev, i);
 
-		free_irq(eq_table->eq[i].irq, &eq_table->eq[i]);
-
 		hns_roce_v2_free_eq(hr_dev, &eq_table->eq[i]);
 	}
 
-	for (i = 0; i < irq_num; i++)
-		kfree(hr_dev->irq_names[i]);
-
 	kfree(eq_table->eq);
 
 	flush_workqueue(hr_dev->irq_workq);
-- 
1.9.1


^ permalink raw reply related

* [PATCH for-next 6/9] RDMA/hns: Optimize hns_roce_mhop_alloc function.
From: Lijun Ou @ 2019-07-08 13:41 UTC (permalink / raw)
  To: dledford, jgg; +Cc: leon, linux-rdma, linuxarm
In-Reply-To: <1562593285-8037-1-git-send-email-oulijun@huawei.com>

From: chenglang <chenglang@huawei.com>

Here packages some lines for allocating multi-hop addressing
into the independent functions in order to add the readability.

Signed-off-by: Lang Cheng <chenglang@huawei.com>
Signed-off-by: Lijun Ou <oulijun@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_mr.c | 275 +++++++++++++++++++-------------
 1 file changed, 160 insertions(+), 115 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index 549e1a3..c4b758c 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -347,155 +347,208 @@ static void hns_roce_loop_free(struct hns_roce_dev *hr_dev,
 	mr->pbl_bt_l0 = NULL;
 	mr->pbl_l0_dma_addr = 0;
 }
+static int pbl_1hop_alloc(struct hns_roce_dev *hr_dev, int npages,
+			       struct hns_roce_mr *mr, u32 pbl_bt_sz)
+{
+	struct device *dev = hr_dev->dev;
 
-/* PBL multi hop addressing */
-static int hns_roce_mhop_alloc(struct hns_roce_dev *hr_dev, int npages,
-			       struct hns_roce_mr *mr)
+	if (npages > pbl_bt_sz / 8) {
+		dev_err(dev, "npages %d is larger than buf_pg_sz!",
+			npages);
+		return -EINVAL;
+	}
+	mr->pbl_buf = dma_alloc_coherent(dev, npages * 8,
+					 &(mr->pbl_dma_addr),
+					 GFP_KERNEL);
+	if (!mr->pbl_buf)
+		return -ENOMEM;
+
+	mr->pbl_size = npages;
+	mr->pbl_ba = mr->pbl_dma_addr;
+	mr->pbl_hop_num = 1;
+	mr->pbl_ba_pg_sz = hr_dev->caps.pbl_ba_pg_sz;
+	mr->pbl_buf_pg_sz = hr_dev->caps.pbl_buf_pg_sz;
+	return 0;
+
+}
+
+
+static int pbl_2hop_alloc(struct hns_roce_dev *hr_dev, int npages,
+			       struct hns_roce_mr *mr, u32 pbl_bt_sz)
 {
 	struct device *dev = hr_dev->dev;
-	int mr_alloc_done = 0;
 	int npages_allocated;
-	int i = 0, j = 0;
-	u32 pbl_bt_sz;
-	u32 mhop_num;
 	u64 pbl_last_bt_num;
 	u64 pbl_bt_cnt = 0;
-	u64 bt_idx;
 	u64 size;
+	int i;
 
-	mhop_num = (mr->type == MR_TYPE_FRMR ? 1 : hr_dev->caps.pbl_hop_num);
-	pbl_bt_sz = 1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT);
 	pbl_last_bt_num = (npages + pbl_bt_sz / 8 - 1) / (pbl_bt_sz / 8);
 
-	if (mhop_num == HNS_ROCE_HOP_NUM_0)
-		return 0;
-
-	/* hop_num = 1 */
-	if (mhop_num == 1) {
-		if (npages > pbl_bt_sz / 8) {
-			dev_err(dev, "npages %d is larger than buf_pg_sz!",
-				npages);
-			return -EINVAL;
+	/* alloc L1 BT */
+	for (i = 0; i < pbl_bt_sz / 8; i++) {
+		if (pbl_bt_cnt + 1 < pbl_last_bt_num) {
+			size = pbl_bt_sz;
+		} else {
+			npages_allocated = i * (pbl_bt_sz / 8);
+			size = (npages - npages_allocated) * 8;
 		}
-		mr->pbl_buf = dma_alloc_coherent(dev, npages * 8,
-						 &(mr->pbl_dma_addr),
-						 GFP_KERNEL);
-		if (!mr->pbl_buf)
+		mr->pbl_bt_l1[i] = dma_alloc_coherent(dev, size,
+					    &(mr->pbl_l1_dma_addr[i]),
+					    GFP_KERNEL);
+		if (!mr->pbl_bt_l1[i]) {
+			hns_roce_loop_free(hr_dev, mr, 1, i, 0);
 			return -ENOMEM;
+		}
 
-		mr->pbl_size = npages;
-		mr->pbl_ba = mr->pbl_dma_addr;
-		mr->pbl_hop_num = mhop_num;
-		mr->pbl_ba_pg_sz = hr_dev->caps.pbl_ba_pg_sz;
-		mr->pbl_buf_pg_sz = hr_dev->caps.pbl_buf_pg_sz;
-		return 0;
+		*(mr->pbl_bt_l0 + i) = mr->pbl_l1_dma_addr[i];
+
+		pbl_bt_cnt++;
+		if (pbl_bt_cnt >= pbl_last_bt_num)
+			break;
 	}
 
-	mr->pbl_l1_dma_addr = kcalloc(pbl_bt_sz / 8,
-				      sizeof(*mr->pbl_l1_dma_addr),
+	mr->l0_chunk_last_num = i + 1;
+
+	return 0;
+}
+
+static int pbl_3hop_alloc(struct hns_roce_dev *hr_dev, int npages,
+			       struct hns_roce_mr *mr, u32 pbl_bt_sz)
+{
+	struct device *dev = hr_dev->dev;
+	int mr_alloc_done = 0;
+	int npages_allocated;
+	u64 pbl_last_bt_num;
+	u64 pbl_bt_cnt = 0;
+	u64 bt_idx;
+	u64 size;
+	int i;
+	int j = 0;
+
+	pbl_last_bt_num = (npages + pbl_bt_sz / 8 - 1) / (pbl_bt_sz / 8);
+
+	mr->pbl_l2_dma_addr = kcalloc(pbl_last_bt_num,
+				      sizeof(*mr->pbl_l2_dma_addr),
 				      GFP_KERNEL);
-	if (!mr->pbl_l1_dma_addr)
+	if (!mr->pbl_l2_dma_addr)
 		return -ENOMEM;
 
-	mr->pbl_bt_l1 = kcalloc(pbl_bt_sz / 8, sizeof(*mr->pbl_bt_l1),
+	mr->pbl_bt_l2 = kcalloc(pbl_last_bt_num,
+				sizeof(*mr->pbl_bt_l2),
 				GFP_KERNEL);
-	if (!mr->pbl_bt_l1)
-		goto err_kcalloc_bt_l1;
-
-	if (mhop_num == 3) {
-		mr->pbl_l2_dma_addr = kcalloc(pbl_last_bt_num,
-					      sizeof(*mr->pbl_l2_dma_addr),
-					      GFP_KERNEL);
-		if (!mr->pbl_l2_dma_addr)
-			goto err_kcalloc_l2_dma;
+	if (!mr->pbl_bt_l2)
+		goto err_kcalloc_bt_l2;
+
+	/* alloc L1, L2 BT */
+	for (i = 0; i < pbl_bt_sz / 8; i++) {
+		mr->pbl_bt_l1[i] = dma_alloc_coherent(dev, pbl_bt_sz,
+					    &(mr->pbl_l1_dma_addr[i]),
+					    GFP_KERNEL);
+		if (!mr->pbl_bt_l1[i]) {
+			hns_roce_loop_free(hr_dev, mr, 1, i, 0);
+			goto err_dma_alloc_l0;
+		}
 
-		mr->pbl_bt_l2 = kcalloc(pbl_last_bt_num,
-					sizeof(*mr->pbl_bt_l2),
-					GFP_KERNEL);
-		if (!mr->pbl_bt_l2)
-			goto err_kcalloc_bt_l2;
-	}
+		*(mr->pbl_bt_l0 + i) = mr->pbl_l1_dma_addr[i];
 
-	/* alloc L0 BT */
-	mr->pbl_bt_l0 = dma_alloc_coherent(dev, pbl_bt_sz,
-					   &(mr->pbl_l0_dma_addr),
-					   GFP_KERNEL);
-	if (!mr->pbl_bt_l0)
-		goto err_dma_alloc_l0;
+		for (j = 0; j < pbl_bt_sz / 8; j++) {
+			bt_idx = i * pbl_bt_sz / 8 + j;
 
-	if (mhop_num == 2) {
-		/* alloc L1 BT */
-		for (i = 0; i < pbl_bt_sz / 8; i++) {
 			if (pbl_bt_cnt + 1 < pbl_last_bt_num) {
 				size = pbl_bt_sz;
 			} else {
-				npages_allocated = i * (pbl_bt_sz / 8);
+				npages_allocated = bt_idx *
+						   (pbl_bt_sz / 8);
 				size = (npages - npages_allocated) * 8;
 			}
-			mr->pbl_bt_l1[i] = dma_alloc_coherent(dev, size,
-						    &(mr->pbl_l1_dma_addr[i]),
-						    GFP_KERNEL);
-			if (!mr->pbl_bt_l1[i]) {
-				hns_roce_loop_free(hr_dev, mr, 1, i, 0);
+			mr->pbl_bt_l2[bt_idx] = dma_alloc_coherent(
+				      dev, size,
+				      &(mr->pbl_l2_dma_addr[bt_idx]),
+				      GFP_KERNEL);
+			if (!mr->pbl_bt_l2[bt_idx]) {
+				hns_roce_loop_free(hr_dev, mr, 2, i, j);
 				goto err_dma_alloc_l0;
 			}
 
-			*(mr->pbl_bt_l0 + i) = mr->pbl_l1_dma_addr[i];
+			*(mr->pbl_bt_l1[i] + j) =
+					mr->pbl_l2_dma_addr[bt_idx];
 
 			pbl_bt_cnt++;
-			if (pbl_bt_cnt >= pbl_last_bt_num)
+			if (pbl_bt_cnt >= pbl_last_bt_num) {
+				mr_alloc_done = 1;
 				break;
-		}
-	} else if (mhop_num == 3) {
-		/* alloc L1, L2 BT */
-		for (i = 0; i < pbl_bt_sz / 8; i++) {
-			mr->pbl_bt_l1[i] = dma_alloc_coherent(dev, pbl_bt_sz,
-						    &(mr->pbl_l1_dma_addr[i]),
-						    GFP_KERNEL);
-			if (!mr->pbl_bt_l1[i]) {
-				hns_roce_loop_free(hr_dev, mr, 1, i, 0);
-				goto err_dma_alloc_l0;
 			}
+		}
 
-			*(mr->pbl_bt_l0 + i) = mr->pbl_l1_dma_addr[i];
+		if (mr_alloc_done)
+			break;
+	}
 
-			for (j = 0; j < pbl_bt_sz / 8; j++) {
-				bt_idx = i * pbl_bt_sz / 8 + j;
+	mr->l0_chunk_last_num = i + 1;
+	mr->l1_chunk_last_num = j + 1;
 
-				if (pbl_bt_cnt + 1 < pbl_last_bt_num) {
-					size = pbl_bt_sz;
-				} else {
-					npages_allocated = bt_idx *
-							   (pbl_bt_sz / 8);
-					size = (npages - npages_allocated) * 8;
-				}
-				mr->pbl_bt_l2[bt_idx] = dma_alloc_coherent(
-					      dev, size,
-					      &(mr->pbl_l2_dma_addr[bt_idx]),
-					      GFP_KERNEL);
-				if (!mr->pbl_bt_l2[bt_idx]) {
-					hns_roce_loop_free(hr_dev, mr, 2, i, j);
-					goto err_dma_alloc_l0;
-				}
 
-				*(mr->pbl_bt_l1[i] + j) =
-						mr->pbl_l2_dma_addr[bt_idx];
+	return 0;
 
-				pbl_bt_cnt++;
-				if (pbl_bt_cnt >= pbl_last_bt_num) {
-					mr_alloc_done = 1;
-					break;
-				}
-			}
+err_dma_alloc_l0:
+	kfree(mr->pbl_bt_l2);
+	mr->pbl_bt_l2 = NULL;
 
-			if (mr_alloc_done)
-				break;
-		}
+err_kcalloc_bt_l2:
+	kfree(mr->pbl_l2_dma_addr);
+	mr->pbl_l2_dma_addr = NULL;
+
+	return -ENOMEM;
+}
+
+
+/* PBL multi hop addressing */
+static int hns_roce_mhop_alloc(struct hns_roce_dev *hr_dev, int npages,
+			       struct hns_roce_mr *mr)
+{
+	struct device *dev = hr_dev->dev;
+	u32 pbl_bt_sz;
+	u32 mhop_num;
+
+	mhop_num = (mr->type == MR_TYPE_FRMR ? 1 : hr_dev->caps.pbl_hop_num);
+	pbl_bt_sz = 1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT);
+
+	if (mhop_num == HNS_ROCE_HOP_NUM_0)
+		return 0;
+
+	/* hop_num = 1 */
+	if (mhop_num == 1)
+		return pbl_1hop_alloc(hr_dev, npages, mr, pbl_bt_sz);
+
+	mr->pbl_l1_dma_addr = kcalloc(pbl_bt_sz / 8,
+				      sizeof(*mr->pbl_l1_dma_addr),
+				      GFP_KERNEL);
+	if (!mr->pbl_l1_dma_addr)
+		return -ENOMEM;
+
+	mr->pbl_bt_l1 = kcalloc(pbl_bt_sz / 8, sizeof(*mr->pbl_bt_l1),
+				GFP_KERNEL);
+	if (!mr->pbl_bt_l1)
+		goto err_kcalloc_bt_l1;
+
+	/* alloc L0 BT */
+	mr->pbl_bt_l0 = dma_alloc_coherent(dev, pbl_bt_sz,
+					   &(mr->pbl_l0_dma_addr),
+					   GFP_KERNEL);
+	if (!mr->pbl_bt_l0)
+		goto err_kcalloc_l2_dma;
+
+	if (mhop_num == 2) {
+		if (pbl_2hop_alloc(hr_dev, npages, mr, pbl_bt_sz))
+			goto err_kcalloc_l2_dma;
+	}
+
+	if (mhop_num == 3) {
+		if (pbl_3hop_alloc(hr_dev, npages, mr, pbl_bt_sz))
+			goto err_kcalloc_l2_dma;
 	}
 
-	mr->l0_chunk_last_num = i + 1;
-	if (mhop_num == 3)
-		mr->l1_chunk_last_num = j + 1;
 
 	mr->pbl_size = npages;
 	mr->pbl_ba = mr->pbl_l0_dma_addr;
@@ -505,14 +558,6 @@ static int hns_roce_mhop_alloc(struct hns_roce_dev *hr_dev, int npages,
 
 	return 0;
 
-err_dma_alloc_l0:
-	kfree(mr->pbl_bt_l2);
-	mr->pbl_bt_l2 = NULL;
-
-err_kcalloc_bt_l2:
-	kfree(mr->pbl_l2_dma_addr);
-	mr->pbl_l2_dma_addr = NULL;
-
 err_kcalloc_l2_dma:
 	kfree(mr->pbl_bt_l1);
 	mr->pbl_bt_l1 = NULL;
-- 
1.9.1


^ permalink raw reply related

* [PATCH for-next 8/9] RDMA/hns: Refactor hem table mhop check and calculation
From: Lijun Ou @ 2019-07-08 13:41 UTC (permalink / raw)
  To: dledford, jgg; +Cc: leon, linux-rdma, linuxarm
In-Reply-To: <1562593285-8037-1-git-send-email-oulijun@huawei.com>

The calculation of mhop for hem is duplicated in
hns_roce_init_hem_table and hns_roce_calc_hem_mhop,
extracting it from them to a separated function. Moreover,
this patch refactor hns_roce_check_whether_mhop to
reduce complexity.

Signed-off-by: Yixian Liu <liuyixian@huawei.com>
Signed-off-by: Lijun Ou <oulijun@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_hem.c | 180 ++++++++++++-------------------
 1 file changed, 70 insertions(+), 110 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c
index f4da5bd..6150311 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hem.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hem.c
@@ -41,20 +41,47 @@
 
 bool hns_roce_check_whether_mhop(struct hns_roce_dev *hr_dev, u32 type)
 {
-	if ((hr_dev->caps.qpc_hop_num && type == HEM_TYPE_QPC) ||
-	    (hr_dev->caps.mpt_hop_num && type == HEM_TYPE_MTPT) ||
-	    (hr_dev->caps.cqc_hop_num && type == HEM_TYPE_CQC) ||
-	    (hr_dev->caps.srqc_hop_num && type == HEM_TYPE_SRQC) ||
-	    (hr_dev->caps.sccc_hop_num && type == HEM_TYPE_SCCC) ||
-	    (hr_dev->caps.qpc_timer_hop_num && type == HEM_TYPE_QPC_TIMER) ||
-	    (hr_dev->caps.cqc_timer_hop_num && type == HEM_TYPE_CQC_TIMER) ||
-	    (hr_dev->caps.cqe_hop_num && type == HEM_TYPE_CQE) ||
-	    (hr_dev->caps.mtt_hop_num && type == HEM_TYPE_MTT) ||
-	    (hr_dev->caps.srqwqe_hop_num && type == HEM_TYPE_SRQWQE) ||
-	    (hr_dev->caps.idx_hop_num && type == HEM_TYPE_IDX))
-		return true;
-
-	return false;
+	int hop_num = 0;
+
+	switch (type) {
+	case HEM_TYPE_QPC:
+		hop_num = hr_dev->caps.qpc_hop_num;
+		break;
+	case HEM_TYPE_MTPT:
+		hop_num = hr_dev->caps.mpt_hop_num;
+		break;
+	case HEM_TYPE_CQC:
+		hop_num = hr_dev->caps.cqc_hop_num;
+		break;
+	case HEM_TYPE_SRQC:
+		hop_num = hr_dev->caps.srqc_hop_num;
+		break;
+	case HEM_TYPE_SCCC:
+		hop_num = hr_dev->caps.sccc_hop_num;
+		break;
+	case HEM_TYPE_QPC_TIMER:
+		hop_num = hr_dev->caps.qpc_timer_hop_num;
+		break;
+	case HEM_TYPE_CQC_TIMER:
+		hop_num = hr_dev->caps.cqc_timer_hop_num;
+		break;
+	case HEM_TYPE_CQE:
+		hop_num = hr_dev->caps.cqe_hop_num;
+		break;
+	case HEM_TYPE_MTT:
+		hop_num = hr_dev->caps.mtt_hop_num;
+		break;
+	case HEM_TYPE_SRQWQE:
+		hop_num = hr_dev->caps.srqwqe_hop_num;
+		break;
+	case HEM_TYPE_IDX:
+		hop_num = hr_dev->caps.idx_hop_num;
+		break;
+	default:
+		return false;
+	}
+
+	return hop_num ? true : false;
 }
 
 static bool hns_roce_check_hem_null(struct hns_roce_hem **hem, u64 start_idx,
@@ -92,17 +119,13 @@ static int hns_roce_get_bt_num(u32 table_type, u32 hop_num)
 		return 0;
 }
 
-int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev,
-			   struct hns_roce_hem_table *table, unsigned long *obj,
-			   struct hns_roce_hem_mhop *mhop)
+static int get_hem_table_config(struct hns_roce_dev *hr_dev,
+				struct hns_roce_hem_mhop *mhop,
+				u32 type)
 {
 	struct device *dev = hr_dev->dev;
-	u32 chunk_ba_num;
-	u32 table_idx;
-	u32 bt_num;
-	u32 chunk_size;
 
-	switch (table->type) {
+	switch (type) {
 	case HEM_TYPE_QPC:
 		mhop->buf_chunk_size = 1 << (hr_dev->caps.qpc_buf_pg_sz
 					     + PAGE_SHIFT);
@@ -193,10 +216,26 @@ int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev,
 		break;
 	default:
 		dev_err(dev, "Table %d not support multi-hop addressing!\n",
-			 table->type);
+			type);
 		return -EINVAL;
 	}
 
+	return 0;
+}
+
+int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev,
+			   struct hns_roce_hem_table *table, unsigned long *obj,
+			   struct hns_roce_hem_mhop *mhop)
+{
+	struct device *dev = hr_dev->dev;
+	u32 chunk_ba_num;
+	u32 table_idx;
+	u32 bt_num;
+	u32 chunk_size;
+
+	if (get_hem_table_config(hr_dev, mhop, table->type))
+		return -EINVAL;
+
 	if (!obj)
 		return 0;
 
@@ -887,7 +926,6 @@ int hns_roce_init_hem_table(struct hns_roce_dev *hr_dev,
 			    unsigned long obj_size, unsigned long nobj,
 			    int use_lowmem)
 {
-	struct device *dev = hr_dev->dev;
 	unsigned long obj_per_chunk;
 	unsigned long num_hem;
 
@@ -900,99 +938,21 @@ int hns_roce_init_hem_table(struct hns_roce_dev *hr_dev,
 		if (!table->hem)
 			return -ENOMEM;
 	} else {
+		struct hns_roce_hem_mhop mhop = {};
 		unsigned long buf_chunk_size;
 		unsigned long bt_chunk_size;
 		unsigned long bt_chunk_num;
 		unsigned long num_bt_l0 = 0;
 		u32 hop_num;
 
-		switch (type) {
-		case HEM_TYPE_QPC:
-			buf_chunk_size = 1 << (hr_dev->caps.qpc_buf_pg_sz
-					+ PAGE_SHIFT);
-			bt_chunk_size = 1 << (hr_dev->caps.qpc_ba_pg_sz
-					+ PAGE_SHIFT);
-			num_bt_l0 = hr_dev->caps.qpc_bt_num;
-			hop_num = hr_dev->caps.qpc_hop_num;
-			break;
-		case HEM_TYPE_MTPT:
-			buf_chunk_size = 1 << (hr_dev->caps.mpt_buf_pg_sz
-					+ PAGE_SHIFT);
-			bt_chunk_size = 1 << (hr_dev->caps.mpt_ba_pg_sz
-					+ PAGE_SHIFT);
-			num_bt_l0 = hr_dev->caps.mpt_bt_num;
-			hop_num = hr_dev->caps.mpt_hop_num;
-			break;
-		case HEM_TYPE_CQC:
-			buf_chunk_size = 1 << (hr_dev->caps.cqc_buf_pg_sz
-					+ PAGE_SHIFT);
-			bt_chunk_size = 1 << (hr_dev->caps.cqc_ba_pg_sz
-					+ PAGE_SHIFT);
-			num_bt_l0 = hr_dev->caps.cqc_bt_num;
-			hop_num = hr_dev->caps.cqc_hop_num;
-			break;
-		case HEM_TYPE_SCCC:
-			buf_chunk_size = 1 << (hr_dev->caps.sccc_buf_pg_sz
-					+ PAGE_SHIFT);
-			bt_chunk_size = 1 << (hr_dev->caps.sccc_ba_pg_sz
-					+ PAGE_SHIFT);
-			num_bt_l0 = hr_dev->caps.sccc_bt_num;
-			hop_num = hr_dev->caps.sccc_hop_num;
-			break;
-		case HEM_TYPE_QPC_TIMER:
-			buf_chunk_size = 1 << (hr_dev->caps.qpc_timer_buf_pg_sz
-					+ PAGE_SHIFT);
-			bt_chunk_size = 1 << (hr_dev->caps.qpc_timer_ba_pg_sz
-					+ PAGE_SHIFT);
-			num_bt_l0 = hr_dev->caps.qpc_timer_bt_num;
-			hop_num = hr_dev->caps.qpc_timer_hop_num;
-			break;
-		case HEM_TYPE_CQC_TIMER:
-			buf_chunk_size = 1 << (hr_dev->caps.cqc_timer_buf_pg_sz
-					+ PAGE_SHIFT);
-			bt_chunk_size = 1 << (hr_dev->caps.cqc_timer_ba_pg_sz
-					+ PAGE_SHIFT);
-			num_bt_l0 = hr_dev->caps.cqc_timer_bt_num;
-			hop_num = hr_dev->caps.cqc_timer_hop_num;
-			break;
-		case HEM_TYPE_SRQC:
-			buf_chunk_size = 1 << (hr_dev->caps.srqc_buf_pg_sz
-					+ PAGE_SHIFT);
-			bt_chunk_size = 1 << (hr_dev->caps.srqc_ba_pg_sz
-					+ PAGE_SHIFT);
-			num_bt_l0 = hr_dev->caps.srqc_bt_num;
-			hop_num = hr_dev->caps.srqc_hop_num;
-			break;
-		case HEM_TYPE_MTT:
-			buf_chunk_size = 1 << (hr_dev->caps.mtt_ba_pg_sz
-					+ PAGE_SHIFT);
-			bt_chunk_size = buf_chunk_size;
-			hop_num = hr_dev->caps.mtt_hop_num;
-			break;
-		case HEM_TYPE_CQE:
-			buf_chunk_size = 1 << (hr_dev->caps.cqe_ba_pg_sz
-					+ PAGE_SHIFT);
-			bt_chunk_size = buf_chunk_size;
-			hop_num = hr_dev->caps.cqe_hop_num;
-			break;
-		case HEM_TYPE_SRQWQE:
-			buf_chunk_size = 1 << (hr_dev->caps.srqwqe_ba_pg_sz
-					+ PAGE_SHIFT);
-			bt_chunk_size = buf_chunk_size;
-			hop_num = hr_dev->caps.srqwqe_hop_num;
-			break;
-		case HEM_TYPE_IDX:
-			buf_chunk_size = 1 << (hr_dev->caps.idx_ba_pg_sz
-					+ PAGE_SHIFT);
-			bt_chunk_size = buf_chunk_size;
-			hop_num = hr_dev->caps.idx_hop_num;
-			break;
-		default:
-			dev_err(dev,
-			  "Table %d not support to init hem table here!\n",
-			  type);
+		if (get_hem_table_config(hr_dev, &mhop, type))
 			return -EINVAL;
-		}
+
+		buf_chunk_size = mhop.buf_chunk_size;
+		bt_chunk_size = mhop.bt_chunk_size;
+		num_bt_l0 = mhop.ba_l0_num;
+		hop_num = mhop.hop_num;
+
 		obj_per_chunk = buf_chunk_size / obj_size;
 		num_hem = (nobj + obj_per_chunk - 1) / obj_per_chunk;
 		bt_chunk_num = bt_chunk_size / BA_BYTE_LEN;
-- 
1.9.1


^ permalink raw reply related

* [PATCH for-next 2/9] RDMA/hns: Refactor the code of creating srq
From: Lijun Ou @ 2019-07-08 13:41 UTC (permalink / raw)
  To: dledford, jgg; +Cc: leon, linux-rdma, linuxarm
In-Reply-To: <1562593285-8037-1-git-send-email-oulijun@huawei.com>

Here moves the relatived codes of creating user srq and
kernel srq into the two independent functions as well as
removes some unused codes and optimizes some codes.

Signed-off-by: Lijun Ou <oulijun@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_srq.c | 310 ++++++++++++++++++-------------
 1 file changed, 183 insertions(+), 127 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c
index 38bb548..c011422 100644
--- a/drivers/infiniband/hw/hns/hns_roce_srq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_srq.c
@@ -175,6 +175,91 @@ static void hns_roce_srq_free(struct hns_roce_dev *hr_dev,
 	hns_roce_bitmap_free(&srq_table->bitmap, srq->srqn, BITMAP_NO_RR);
 }
 
+static int create_user_srq(struct hns_roce_srq *srq, struct ib_udata *udata,
+			   int srq_buf_size)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(srq->ibsrq.device);
+	struct hns_roce_ib_create_srq  ucmd;
+	u32 page_shift;
+	u32 npages;
+	int ret;
+
+	if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)))
+		return -EFAULT;
+
+	srq->umem = ib_umem_get(udata, ucmd.buf_addr, srq_buf_size, 0, 0);
+	if (IS_ERR(srq->umem))
+		return PTR_ERR(srq->umem);
+
+	if (hr_dev->caps.srqwqe_buf_pg_sz) {
+		npages = (ib_umem_page_count(srq->umem) +
+			 (1 << hr_dev->caps.srqwqe_buf_pg_sz) - 1) /
+			 (1 << hr_dev->caps.srqwqe_buf_pg_sz);
+		page_shift = PAGE_SHIFT + hr_dev->caps.srqwqe_buf_pg_sz;
+		ret = hns_roce_mtt_init(hr_dev, npages, page_shift, &srq->mtt);
+	} else
+		ret = hns_roce_mtt_init(hr_dev, ib_umem_page_count(srq->umem),
+					PAGE_SHIFT, &srq->mtt);
+	if (ret)
+		goto err_user_buf;
+
+	ret = hns_roce_ib_umem_write_mtt(hr_dev, &srq->mtt, srq->umem);
+	if (ret)
+		goto err_user_srq_mtt;
+
+	/* config index queue BA */
+	srq->idx_que.umem = ib_umem_get(udata, ucmd.que_addr,
+					srq->idx_que.buf_size, 0, 0);
+	if (IS_ERR(srq->idx_que.umem)) {
+		dev_err(hr_dev->dev, "ib_umem_get error for index queue\n");
+		ret = PTR_ERR(srq->idx_que.umem);
+		goto err_user_srq_mtt;
+	}
+
+	if (hr_dev->caps.idx_buf_pg_sz) {
+		npages = (ib_umem_page_count(srq->idx_que.umem) +
+			 (1 << hr_dev->caps.idx_buf_pg_sz) - 1) /
+			 (1 << hr_dev->caps.idx_buf_pg_sz);
+		page_shift = PAGE_SHIFT + hr_dev->caps.idx_buf_pg_sz;
+		ret = hns_roce_mtt_init(hr_dev, npages, page_shift,
+					&srq->idx_que.mtt);
+	} else {
+		ret = hns_roce_mtt_init(hr_dev,
+					ib_umem_page_count(srq->idx_que.umem),
+					PAGE_SHIFT,
+					&srq->idx_que.mtt);
+	}
+
+	if (ret) {
+		dev_err(hr_dev->dev, "hns_roce_mtt_init error for idx que\n");
+		goto err_user_idx_mtt;
+	}
+
+	ret = hns_roce_ib_umem_write_mtt(hr_dev, &srq->idx_que.mtt,
+					 srq->idx_que.umem);
+	if (ret) {
+		dev_err(hr_dev->dev,
+			"hns_roce_ib_umem_write_mtt error for idx que\n");
+		goto err_user_idx_buf;
+	}
+
+	return 0;
+
+err_user_idx_buf:
+	hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt);
+
+err_user_idx_mtt:
+	ib_umem_release(srq->idx_que.umem);
+
+err_user_srq_mtt:
+	hns_roce_mtt_cleanup(hr_dev, &srq->mtt);
+
+err_user_buf:
+	ib_umem_release(srq->umem);
+
+	return ret;
+}
+
 static int hns_roce_create_idx_que(struct ib_pd *pd, struct hns_roce_srq *srq,
 				   u32 page_shift)
 {
@@ -196,6 +281,93 @@ static int hns_roce_create_idx_que(struct ib_pd *pd, struct hns_roce_srq *srq,
 	return 0;
 }
 
+static int create_kernel_srq(struct hns_roce_srq *srq, int srq_buf_size)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(srq->ibsrq.device);
+	u32 page_shift = PAGE_SHIFT + hr_dev->caps.srqwqe_buf_pg_sz;
+	int ret;
+
+	if (hns_roce_buf_alloc(hr_dev, srq_buf_size, (1 << page_shift) * 2,
+			       &srq->buf, page_shift))
+		return -ENOMEM;
+
+	srq->head = 0;
+	srq->tail = srq->max - 1;
+
+	ret = hns_roce_mtt_init(hr_dev, srq->buf.npages, srq->buf.page_shift,
+				&srq->mtt);
+	if (ret)
+		goto err_kernel_buf;
+
+	ret = hns_roce_buf_write_mtt(hr_dev, &srq->mtt, &srq->buf);
+	if (ret)
+		goto err_kernel_srq_mtt;
+
+	page_shift = PAGE_SHIFT + hr_dev->caps.idx_buf_pg_sz;
+	ret = hns_roce_create_idx_que(srq->ibsrq.pd, srq, page_shift);
+	if (ret) {
+		dev_err(hr_dev->dev, "Create idx queue fail(%d)!\n", ret);
+		goto err_kernel_srq_mtt;
+	}
+
+	/* Init mtt table for idx_que */
+	ret = hns_roce_mtt_init(hr_dev, srq->idx_que.idx_buf.npages,
+				srq->idx_que.idx_buf.page_shift,
+				&srq->idx_que.mtt);
+	if (ret)
+		goto err_kernel_create_idx;
+
+	/* Write buffer address into the mtt table */
+	ret = hns_roce_buf_write_mtt(hr_dev, &srq->idx_que.mtt,
+				     &srq->idx_que.idx_buf);
+	if (ret)
+		goto err_kernel_idx_buf;
+
+	srq->wrid = kvmalloc_array(srq->max, sizeof(u64), GFP_KERNEL);
+	if (!srq->wrid) {
+		ret = -ENOMEM;
+		goto err_kernel_idx_buf;
+	}
+
+	return 0;
+
+err_kernel_idx_buf:
+	hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt);
+
+err_kernel_create_idx:
+	hns_roce_buf_free(hr_dev, srq->idx_que.buf_size,
+			  &srq->idx_que.idx_buf);
+	kfree(srq->idx_que.bitmap);
+
+err_kernel_srq_mtt:
+	hns_roce_mtt_cleanup(hr_dev, &srq->mtt);
+
+err_kernel_buf:
+	hns_roce_buf_free(hr_dev, srq_buf_size, &srq->buf);
+
+	return ret;
+}
+
+static void destroy_user_srq(struct hns_roce_dev *hr_dev,
+			     struct hns_roce_srq *srq)
+{
+	hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt);
+	ib_umem_release(srq->idx_que.umem);
+	hns_roce_mtt_cleanup(hr_dev, &srq->mtt);
+	ib_umem_release(srq->umem);
+}
+
+static void destroy_kernel_srq(struct hns_roce_dev *hr_dev,
+			       struct hns_roce_srq *srq, int srq_buf_size)
+{
+	kvfree(srq->wrid);
+	hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt);
+	hns_roce_buf_free(hr_dev, srq->idx_que.buf_size, &srq->idx_que.idx_buf);
+	kfree(srq->idx_que.bitmap);
+	hns_roce_mtt_cleanup(hr_dev, &srq->mtt);
+	hns_roce_buf_free(hr_dev, srq_buf_size, &srq->buf);
+}
+
 int hns_roce_create_srq(struct ib_srq *ib_srq,
 			struct ib_srq_init_attr *srq_init_attr,
 			struct ib_udata *udata)
@@ -205,9 +377,7 @@ int hns_roce_create_srq(struct ib_srq *ib_srq,
 	struct hns_roce_srq *srq = to_hr_srq(ib_srq);
 	int srq_desc_size;
 	int srq_buf_size;
-	u32 page_shift;
 	int ret = 0;
-	u32 npages;
 	u32 cqn;
 
 	/* Check the actual SRQ wqe and SRQ sge num */
@@ -233,115 +403,16 @@ int hns_roce_create_srq(struct ib_srq *ib_srq,
 	srq->idx_que.mtt.mtt_type = MTT_TYPE_IDX;
 
 	if (udata) {
-		struct hns_roce_ib_create_srq  ucmd;
-
-		if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)))
-			return -EFAULT;
-
-		srq->umem =
-			ib_umem_get(udata, ucmd.buf_addr, srq_buf_size, 0, 0);
-		if (IS_ERR(srq->umem))
-			return PTR_ERR(srq->umem);
-
-		if (hr_dev->caps.srqwqe_buf_pg_sz) {
-			npages = (ib_umem_page_count(srq->umem) +
-				  (1 << hr_dev->caps.srqwqe_buf_pg_sz) - 1) /
-				  (1 << hr_dev->caps.srqwqe_buf_pg_sz);
-			page_shift = PAGE_SHIFT + hr_dev->caps.srqwqe_buf_pg_sz;
-			ret = hns_roce_mtt_init(hr_dev, npages,
-						page_shift,
-						&srq->mtt);
-		} else
-			ret = hns_roce_mtt_init(hr_dev,
-						ib_umem_page_count(srq->umem),
-						PAGE_SHIFT, &srq->mtt);
-		if (ret)
-			goto err_buf;
-
-		ret = hns_roce_ib_umem_write_mtt(hr_dev, &srq->mtt, srq->umem);
-		if (ret)
-			goto err_srq_mtt;
-
-		/* config index queue BA */
-		srq->idx_que.umem = ib_umem_get(udata, ucmd.que_addr,
-						srq->idx_que.buf_size, 0, 0);
-		if (IS_ERR(srq->idx_que.umem)) {
-			dev_err(hr_dev->dev,
-				"ib_umem_get error for index queue\n");
-			ret = PTR_ERR(srq->idx_que.umem);
-			goto err_srq_mtt;
-		}
-
-		if (hr_dev->caps.idx_buf_pg_sz) {
-			npages = (ib_umem_page_count(srq->idx_que.umem) +
-				  (1 << hr_dev->caps.idx_buf_pg_sz) - 1) /
-				  (1 << hr_dev->caps.idx_buf_pg_sz);
-			page_shift = PAGE_SHIFT + hr_dev->caps.idx_buf_pg_sz;
-			ret = hns_roce_mtt_init(hr_dev, npages,
-						page_shift, &srq->idx_que.mtt);
-		} else {
-			ret = hns_roce_mtt_init(
-				hr_dev, ib_umem_page_count(srq->idx_que.umem),
-				PAGE_SHIFT, &srq->idx_que.mtt);
-		}
-
-		if (ret) {
-			dev_err(hr_dev->dev,
-				"hns_roce_mtt_init error for idx que\n");
-			goto err_idx_mtt;
-		}
-
-		ret = hns_roce_ib_umem_write_mtt(hr_dev, &srq->idx_que.mtt,
-						 srq->idx_que.umem);
+		ret = create_user_srq(srq, udata, srq_buf_size);
 		if (ret) {
-			dev_err(hr_dev->dev,
-			      "hns_roce_ib_umem_write_mtt error for idx que\n");
-			goto err_idx_buf;
+			dev_err(hr_dev->dev, "Create user srq failed\n");
+			goto err_srq;
 		}
 	} else {
-		page_shift = PAGE_SHIFT + hr_dev->caps.srqwqe_buf_pg_sz;
-		if (hns_roce_buf_alloc(hr_dev, srq_buf_size,
-				       (1 << page_shift) * 2, &srq->buf,
-				       page_shift))
-			return -ENOMEM;
-
-		srq->head = 0;
-		srq->tail = srq->max - 1;
-
-		ret = hns_roce_mtt_init(hr_dev, srq->buf.npages,
-					srq->buf.page_shift, &srq->mtt);
-		if (ret)
-			goto err_buf;
-
-		ret = hns_roce_buf_write_mtt(hr_dev, &srq->mtt, &srq->buf);
-		if (ret)
-			goto err_srq_mtt;
-
-		page_shift = PAGE_SHIFT + hr_dev->caps.idx_buf_pg_sz;
-		ret = hns_roce_create_idx_que(ib_srq->pd, srq, page_shift);
+		ret = create_kernel_srq(srq, srq_buf_size);
 		if (ret) {
-			dev_err(hr_dev->dev, "Create idx queue fail(%d)!\n",
-				ret);
-			goto err_srq_mtt;
-		}
-
-		/* Init mtt table for idx_que */
-		ret = hns_roce_mtt_init(hr_dev, srq->idx_que.idx_buf.npages,
-					srq->idx_que.idx_buf.page_shift,
-					&srq->idx_que.mtt);
-		if (ret)
-			goto err_create_idx;
-
-		/* Write buffer address into the mtt table */
-		ret = hns_roce_buf_write_mtt(hr_dev, &srq->idx_que.mtt,
-					     &srq->idx_que.idx_buf);
-		if (ret)
-			goto err_idx_buf;
-
-		srq->wrid = kvmalloc_array(srq->max, sizeof(u64), GFP_KERNEL);
-		if (!srq->wrid) {
-			ret = -ENOMEM;
-			goto err_idx_buf;
+			dev_err(hr_dev->dev, "Create kernel srq failed\n");
+			goto err_srq;
 		}
 	}
 
@@ -373,27 +444,12 @@ int hns_roce_create_srq(struct ib_srq *ib_srq,
 	hns_roce_srq_free(hr_dev, srq);
 
 err_wrid:
-	kvfree(srq->wrid);
-
-err_idx_buf:
-	hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt);
-
-err_idx_mtt:
-	ib_umem_release(srq->idx_que.umem);
-
-err_create_idx:
-	hns_roce_buf_free(hr_dev, srq->idx_que.buf_size,
-			  &srq->idx_que.idx_buf);
-	bitmap_free(srq->idx_que.bitmap);
-
-err_srq_mtt:
-	hns_roce_mtt_cleanup(hr_dev, &srq->mtt);
-
-err_buf:
-	ib_umem_release(srq->umem);
-	if (!udata)
-		hns_roce_buf_free(hr_dev, srq_buf_size, &srq->buf);
+	if (udata)
+		destroy_user_srq(hr_dev, srq);
+	else
+		destroy_kernel_srq(hr_dev, srq, srq_buf_size);
 
+err_srq:
 	return ret;
 }
 
-- 
1.9.1


^ permalink raw reply related

* [PATCH for-next 3/9] RDMA/hns: Refactor for hns_roce_v2_modify_qp function
From: Lijun Ou @ 2019-07-08 13:41 UTC (permalink / raw)
  To: dledford, jgg; +Cc: leon, linux-rdma, linuxarm
In-Reply-To: <1562593285-8037-1-git-send-email-oulijun@huawei.com>

This patch packages some lines which exist hns_roce_v2_modify_qp
function into the new function. The codes refactored mainly
include some absolute fields of qp context and some optional
fields of qp context.

Signed-off-by: Lijun Ou <oulijun@huawei.com>
Signed-off-by: Xi Wang <wangxi11@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 356 +++++++++++++++++------------
 1 file changed, 206 insertions(+), 150 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index b76e3be..edc5dd2 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -3974,30 +3974,119 @@ static inline bool hns_roce_v2_check_qp_stat(enum ib_qp_state cur_state,
 
 }
 
-static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
-				 const struct ib_qp_attr *attr,
-				 int attr_mask, enum ib_qp_state cur_state,
-				 enum ib_qp_state new_state)
+static int hns_roce_v2_set_path(struct ib_qp *ibqp,
+				const struct ib_qp_attr *attr,
+				int attr_mask,
+				struct hns_roce_v2_qp_context *context,
+				struct hns_roce_v2_qp_context *qpc_mask)
 {
+	const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr);
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
-	struct hns_roce_v2_qp_context *context;
-	struct hns_roce_v2_qp_context *qpc_mask;
-	struct device *dev = hr_dev->dev;
-	int ret = -EINVAL;
+	const struct ib_gid_attr *gid_attr = NULL;
+	int is_roce_protocol;
+	bool is_udp = false;
+	u16 vlan = 0xffff;
+	u8 ib_port;
+	u8 hr_port;
+	int ret;
 
-	context = kcalloc(2, sizeof(*context), GFP_ATOMIC);
-	if (!context)
-		return -ENOMEM;
+	ib_port = (attr_mask & IB_QP_PORT) ? attr->port_num : hr_qp->port + 1;
+	hr_port = ib_port - 1;
+	is_roce_protocol = rdma_cap_eth_ah(&hr_dev->ib_dev, ib_port) &&
+			   rdma_ah_get_ah_flags(&attr->ah_attr) & IB_AH_GRH;
+
+	if (is_roce_protocol) {
+		gid_attr = attr->ah_attr.grh.sgid_attr;
+		ret = rdma_read_gid_l2_fields(gid_attr, &vlan, NULL);
+		if (ret)
+			return ret;
+
+		if (gid_attr)
+			is_udp = (gid_attr->gid_type ==
+				 IB_GID_TYPE_ROCE_UDP_ENCAP);
+	}
+
+	if (vlan < VLAN_CFI_MASK) {
+		roce_set_bit(context->byte_76_srqn_op_en,
+			     V2_QPC_BYTE_76_RQ_VLAN_EN_S, 1);
+		roce_set_bit(qpc_mask->byte_76_srqn_op_en,
+			     V2_QPC_BYTE_76_RQ_VLAN_EN_S, 0);
+		roce_set_bit(context->byte_168_irrl_idx,
+			     V2_QPC_BYTE_168_SQ_VLAN_EN_S, 1);
+		roce_set_bit(qpc_mask->byte_168_irrl_idx,
+			     V2_QPC_BYTE_168_SQ_VLAN_EN_S, 0);
+	}
+
+	roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_VLAN_ID_M,
+		       V2_QPC_BYTE_24_VLAN_ID_S, vlan);
+	roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_VLAN_ID_M,
+		       V2_QPC_BYTE_24_VLAN_ID_S, 0);
+
+	if (grh->sgid_index >= hr_dev->caps.gid_table_len[hr_port]) {
+		dev_err(hr_dev->dev, "sgid_index(%u) too large. max is %d\n",
+			grh->sgid_index, hr_dev->caps.gid_table_len[hr_port]);
+		return -EINVAL;
+	}
+
+	if (attr->ah_attr.type != RDMA_AH_ATTR_TYPE_ROCE) {
+		dev_err(hr_dev->dev, "ah attr is not RDMA roce type\n");
+		return -EINVAL;
+	}
+
+	roce_set_field(context->byte_52_udpspn_dmac, V2_QPC_BYTE_52_UDPSPN_M,
+		       V2_QPC_BYTE_52_UDPSPN_S,
+		       is_udp ? 0x12b7 : 0);
+
+	roce_set_field(qpc_mask->byte_52_udpspn_dmac, V2_QPC_BYTE_52_UDPSPN_M,
+		       V2_QPC_BYTE_52_UDPSPN_S, 0);
+
+	roce_set_field(context->byte_20_smac_sgid_idx,
+		       V2_QPC_BYTE_20_SGID_IDX_M, V2_QPC_BYTE_20_SGID_IDX_S,
+		       grh->sgid_index);
+
+	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
+		       V2_QPC_BYTE_20_SGID_IDX_M, V2_QPC_BYTE_20_SGID_IDX_S, 0);
+
+	roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_HOP_LIMIT_M,
+		       V2_QPC_BYTE_24_HOP_LIMIT_S, grh->hop_limit);
+	roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_HOP_LIMIT_M,
+		       V2_QPC_BYTE_24_HOP_LIMIT_S, 0);
+
+	if (hr_dev->pci_dev->revision == 0x21 && is_udp)
+		roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M,
+			       V2_QPC_BYTE_24_TC_S, grh->traffic_class >> 2);
+	else
+		roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M,
+			       V2_QPC_BYTE_24_TC_S, grh->traffic_class);
+	roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M,
+		       V2_QPC_BYTE_24_TC_S, 0);
+	roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_FL_M,
+		       V2_QPC_BYTE_28_FL_S, grh->flow_label);
+	roce_set_field(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_FL_M,
+		       V2_QPC_BYTE_28_FL_S, 0);
+	memcpy(context->dgid, grh->dgid.raw, sizeof(grh->dgid.raw));
+	memset(qpc_mask->dgid, 0, sizeof(grh->dgid.raw));
+	roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_SL_M,
+		       V2_QPC_BYTE_28_SL_S, rdma_ah_get_sl(&attr->ah_attr));
+	roce_set_field(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_SL_M,
+		       V2_QPC_BYTE_28_SL_S, 0);
+	hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr);
+
+	return 0;
+}
+
+static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp,
+				      const struct ib_qp_attr *attr,
+				      int attr_mask,
+				      enum ib_qp_state cur_state,
+				      enum ib_qp_state new_state,
+				      struct hns_roce_v2_qp_context *context,
+				      struct hns_roce_v2_qp_context *qpc_mask)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
+	int ret = 0;
 
-	qpc_mask = context + 1;
-	/*
-	 * In v2 engine, software pass context and context mask to hardware
-	 * when modifying qp. If software need modify some fields in context,
-	 * we should set all bits of the relevant fields in context mask to
-	 * 0 at the same time, else set them to 0x1.
-	 */
-	memset(qpc_mask, 0xff, sizeof(*qpc_mask));
 	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
 		memset(qpc_mask, 0, sizeof(*qpc_mask));
 		modify_qp_reset_to_init(ibqp, attr, attr_mask, context,
@@ -4019,134 +4108,30 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 		/* Nothing */
 		;
 	} else {
-		dev_err(dev, "Illegal state for QP!\n");
+		dev_err(hr_dev->dev, "Illegal state for QP!\n");
 		ret = -EINVAL;
 		goto out;
 	}
 
-	/* When QP state is err, SQ and RQ WQE should be flushed */
-	if (new_state == IB_QPS_ERR) {
-		roce_set_field(context->byte_160_sq_ci_pi,
-			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_M,
-			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_S,
-			       hr_qp->sq.head);
-		roce_set_field(qpc_mask->byte_160_sq_ci_pi,
-			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_M,
-			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_S, 0);
+out:
+	return ret;
+}
 
-		if (!ibqp->srq) {
-			roce_set_field(context->byte_84_rq_ci_pi,
-			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M,
-			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S,
-			       hr_qp->rq.head);
-			roce_set_field(qpc_mask->byte_84_rq_ci_pi,
-			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M,
-			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, 0);
-		}
-	}
+static int hns_roce_v2_set_opt_fields(struct ib_qp *ibqp,
+				      const struct ib_qp_attr *attr,
+				      int attr_mask,
+				      struct hns_roce_v2_qp_context *context,
+				      struct hns_roce_v2_qp_context *qpc_mask)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
+	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
+	int ret = 0;
 
 	if (attr_mask & IB_QP_AV) {
-		const struct ib_global_route *grh =
-					    rdma_ah_read_grh(&attr->ah_attr);
-		const struct ib_gid_attr *gid_attr = NULL;
-		int is_roce_protocol;
-		u16 vlan = 0xffff;
-		u8 ib_port;
-		u8 hr_port;
-
-		ib_port = (attr_mask & IB_QP_PORT) ? attr->port_num :
-			   hr_qp->port + 1;
-		hr_port = ib_port - 1;
-		is_roce_protocol = rdma_cap_eth_ah(&hr_dev->ib_dev, ib_port) &&
-			       rdma_ah_get_ah_flags(&attr->ah_attr) & IB_AH_GRH;
-
-		if (is_roce_protocol) {
-			gid_attr = attr->ah_attr.grh.sgid_attr;
-			ret = rdma_read_gid_l2_fields(gid_attr, &vlan, NULL);
-			if (ret)
-				goto out;
-		}
-
-		if (vlan < VLAN_CFI_MASK) {
-			roce_set_bit(context->byte_76_srqn_op_en,
-				     V2_QPC_BYTE_76_RQ_VLAN_EN_S, 1);
-			roce_set_bit(qpc_mask->byte_76_srqn_op_en,
-				     V2_QPC_BYTE_76_RQ_VLAN_EN_S, 0);
-			roce_set_bit(context->byte_168_irrl_idx,
-				     V2_QPC_BYTE_168_SQ_VLAN_EN_S, 1);
-			roce_set_bit(qpc_mask->byte_168_irrl_idx,
-				     V2_QPC_BYTE_168_SQ_VLAN_EN_S, 0);
-		}
-
-		roce_set_field(context->byte_24_mtu_tc,
-			       V2_QPC_BYTE_24_VLAN_ID_M,
-			       V2_QPC_BYTE_24_VLAN_ID_S, vlan);
-		roce_set_field(qpc_mask->byte_24_mtu_tc,
-			       V2_QPC_BYTE_24_VLAN_ID_M,
-			       V2_QPC_BYTE_24_VLAN_ID_S, 0);
-
-		if (grh->sgid_index >= hr_dev->caps.gid_table_len[hr_port]) {
-			dev_err(hr_dev->dev,
-				"sgid_index(%u) too large. max is %d\n",
-				grh->sgid_index,
-				hr_dev->caps.gid_table_len[hr_port]);
-			ret = -EINVAL;
-			goto out;
-		}
-
-		if (attr->ah_attr.type != RDMA_AH_ATTR_TYPE_ROCE) {
-			dev_err(hr_dev->dev, "ah attr is not RDMA roce type\n");
-			ret = -EINVAL;
-			goto out;
-		}
-
-		roce_set_field(context->byte_52_udpspn_dmac,
-			   V2_QPC_BYTE_52_UDPSPN_M, V2_QPC_BYTE_52_UDPSPN_S,
-			   (gid_attr->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) ?
-			   0 : 0x12b7);
-
-		roce_set_field(qpc_mask->byte_52_udpspn_dmac,
-			       V2_QPC_BYTE_52_UDPSPN_M,
-			       V2_QPC_BYTE_52_UDPSPN_S, 0);
-
-		roce_set_field(context->byte_20_smac_sgid_idx,
-			       V2_QPC_BYTE_20_SGID_IDX_M,
-			       V2_QPC_BYTE_20_SGID_IDX_S, grh->sgid_index);
-
-		roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
-			       V2_QPC_BYTE_20_SGID_IDX_M,
-			       V2_QPC_BYTE_20_SGID_IDX_S, 0);
-
-		roce_set_field(context->byte_24_mtu_tc,
-			       V2_QPC_BYTE_24_HOP_LIMIT_M,
-			       V2_QPC_BYTE_24_HOP_LIMIT_S, grh->hop_limit);
-		roce_set_field(qpc_mask->byte_24_mtu_tc,
-			       V2_QPC_BYTE_24_HOP_LIMIT_M,
-			       V2_QPC_BYTE_24_HOP_LIMIT_S, 0);
-
-		if (hr_dev->pci_dev->revision == 0x21 &&
-		    gid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
-			roce_set_field(context->byte_24_mtu_tc,
-				       V2_QPC_BYTE_24_TC_M, V2_QPC_BYTE_24_TC_S,
-				       grh->traffic_class >> 2);
-		else
-			roce_set_field(context->byte_24_mtu_tc,
-				       V2_QPC_BYTE_24_TC_M, V2_QPC_BYTE_24_TC_S,
-				       grh->traffic_class);
-		roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M,
-			       V2_QPC_BYTE_24_TC_S, 0);
-		roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_FL_M,
-			       V2_QPC_BYTE_28_FL_S, grh->flow_label);
-		roce_set_field(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_FL_M,
-			       V2_QPC_BYTE_28_FL_S, 0);
-		memcpy(context->dgid, grh->dgid.raw, sizeof(grh->dgid.raw));
-		memset(qpc_mask->dgid, 0, sizeof(grh->dgid.raw));
-		roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_SL_M,
-			       V2_QPC_BYTE_28_SL_S,
-			       rdma_ah_get_sl(&attr->ah_attr));
-		roce_set_field(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_SL_M,
-			       V2_QPC_BYTE_28_SL_S, 0);
-		hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr);
+		ret = hns_roce_v2_set_path(ibqp, attr, attr_mask, context,
+					   qpc_mask);
+		if (ret)
+			return ret;
 	}
 
 	if (attr_mask & IB_QP_TIMEOUT) {
@@ -4158,7 +4143,8 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 				       V2_QPC_BYTE_28_AT_M, V2_QPC_BYTE_28_AT_S,
 				       0);
 		} else {
-			dev_warn(dev, "Local ACK timeout shall be 0 to 30.\n");
+			dev_warn(hr_dev->dev,
+				 "Local ACK timeout shall be 0 to 30.\n");
 		}
 	}
 
@@ -4196,6 +4182,7 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 			       V2_QPC_BYTE_244_RNR_CNT_S, 0);
 	}
 
+	/* RC&UC&UD required attr */
 	if (attr_mask & IB_QP_SQ_PSN) {
 		roce_set_field(context->byte_172_sq_psn,
 			       V2_QPC_BYTE_172_SQ_CUR_PSN_M,
@@ -4295,6 +4282,83 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 		hr_qp->qkey = attr->qkey;
 	}
 
+	return ret;
+}
+
+static void hns_roce_v2_record_opt_fields(struct ib_qp *ibqp,
+					  const struct ib_qp_attr *attr,
+					  int attr_mask)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
+	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
+
+	if (attr_mask & IB_QP_ACCESS_FLAGS)
+		hr_qp->atomic_rd_en = attr->qp_access_flags;
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+		hr_qp->resp_depth = attr->max_dest_rd_atomic;
+	if (attr_mask & IB_QP_PORT) {
+		hr_qp->port = attr->port_num - 1;
+		hr_qp->phy_port = hr_dev->iboe.phy_port[hr_qp->port];
+	}
+}
+
+static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
+				 const struct ib_qp_attr *attr,
+				 int attr_mask, enum ib_qp_state cur_state,
+				 enum ib_qp_state new_state)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
+	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
+	struct hns_roce_v2_qp_context *context;
+	struct hns_roce_v2_qp_context *qpc_mask;
+	struct device *dev = hr_dev->dev;
+	int ret = -EINVAL;
+
+	context = kcalloc(2, sizeof(*context), GFP_ATOMIC);
+	if (!context)
+		return -ENOMEM;
+
+	qpc_mask = context + 1;
+	/*
+	 * In v2 engine, software pass context and context mask to hardware
+	 * when modifying qp. If software need modify some fields in context,
+	 * we should set all bits of the relevant fields in context mask to
+	 * 0 at the same time, else set them to 0x1.
+	 */
+	memset(qpc_mask, 0xff, sizeof(*qpc_mask));
+	ret = hns_roce_v2_set_abs_fields(ibqp, attr, attr_mask, cur_state,
+					 new_state, context, qpc_mask);
+	if (ret)
+		goto out;
+
+	/* When QP state is err, SQ and RQ WQE should be flushed */
+	if (new_state == IB_QPS_ERR) {
+		roce_set_field(context->byte_160_sq_ci_pi,
+			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_M,
+			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_S,
+			       hr_qp->sq.head);
+		roce_set_field(qpc_mask->byte_160_sq_ci_pi,
+			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_M,
+			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_S, 0);
+
+		if (!ibqp->srq) {
+			roce_set_field(context->byte_84_rq_ci_pi,
+			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M,
+			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S,
+			       hr_qp->rq.head);
+			roce_set_field(qpc_mask->byte_84_rq_ci_pi,
+			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M,
+			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, 0);
+		}
+	}
+
+	/* Configure the optional fields */
+	ret = hns_roce_v2_set_opt_fields(ibqp, attr, attr_mask, context,
+					 qpc_mask);
+	if (ret)
+		goto out;
+
 	roce_set_bit(context->byte_108_rx_reqepsn, V2_QPC_BYTE_108_INV_CREDIT_S,
 		     ibqp->srq ? 1 : 0);
 	roce_set_bit(qpc_mask->byte_108_rx_reqepsn,
@@ -4316,15 +4380,7 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 
 	hr_qp->state = new_state;
 
-	if (attr_mask & IB_QP_ACCESS_FLAGS)
-		hr_qp->atomic_rd_en = attr->qp_access_flags;
-
-	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
-		hr_qp->resp_depth = attr->max_dest_rd_atomic;
-	if (attr_mask & IB_QP_PORT) {
-		hr_qp->port = attr->port_num - 1;
-		hr_qp->phy_port = hr_dev->iboe.phy_port[hr_qp->port];
-	}
+	hns_roce_v2_record_opt_fields(ibqp, attr, attr_mask);
 
 	if (new_state == IB_QPS_RESET && !ibqp->uobject) {
 		hns_roce_v2_cq_clean(to_hr_cq(ibqp->recv_cq), hr_qp->qpn,
-- 
1.9.1


^ permalink raw reply related

* [PATCH for-next 5/9] RDMA/hns: optimize the duplicated code for qpc setting flow
From: Lijun Ou @ 2019-07-08 13:41 UTC (permalink / raw)
  To: dledford, jgg; +Cc: leon, linux-rdma, linuxarm
In-Reply-To: <1562593285-8037-1-git-send-email-oulijun@huawei.com>

From: Xi Wang <wangxi11@huawei.com>

Currently, more than 20 lines of duplicate code exist in function
'modify_qp_init_to_init' and function 'modify_qp_reset_to_init',
which affects the readability of the code.

Signed-off-by: Xi Wang <wangxi11@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 96 ++++++++++++------------------
 1 file changed, 38 insertions(+), 58 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index edc5dd2..c2ddfc1 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -3118,6 +3118,43 @@ static void set_access_flags(struct hns_roce_qp *hr_qp,
 	roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_ATE_S, 0);
 }
 
+static void set_qpc_wqe_cnt(struct hns_roce_qp *hr_qp,
+			    struct hns_roce_v2_qp_context *context,
+			    struct hns_roce_v2_qp_context *qpc_mask)
+{
+	if (hr_qp->ibqp.qp_type == IB_QPT_GSI)
+		roce_set_field(context->byte_4_sqpn_tst,
+			       V2_QPC_BYTE_4_SGE_SHIFT_M,
+			       V2_QPC_BYTE_4_SGE_SHIFT_S,
+			       ilog2((unsigned int)hr_qp->sge.sge_cnt));
+	else
+		roce_set_field(context->byte_4_sqpn_tst,
+			       V2_QPC_BYTE_4_SGE_SHIFT_M,
+			       V2_QPC_BYTE_4_SGE_SHIFT_S,
+			       hr_qp->sq.max_gs >
+			       HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE ?
+			       ilog2((unsigned int)hr_qp->sge.sge_cnt) : 0);
+
+	roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_SGE_SHIFT_M,
+		       V2_QPC_BYTE_4_SGE_SHIFT_S, 0);
+
+	roce_set_field(context->byte_20_smac_sgid_idx,
+		       V2_QPC_BYTE_20_SQ_SHIFT_M, V2_QPC_BYTE_20_SQ_SHIFT_S,
+		       ilog2((unsigned int)hr_qp->sq.wqe_cnt));
+	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
+		       V2_QPC_BYTE_20_SQ_SHIFT_M, V2_QPC_BYTE_20_SQ_SHIFT_S, 0);
+
+	roce_set_field(context->byte_20_smac_sgid_idx,
+		       V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S,
+		       (hr_qp->ibqp.qp_type == IB_QPT_XRC_INI ||
+		       hr_qp->ibqp.qp_type == IB_QPT_XRC_TGT ||
+		       hr_qp->ibqp.srq) ? 0 :
+		       ilog2((unsigned int)hr_qp->rq.wqe_cnt));
+
+	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
+		       V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S, 0);
+}
+
 static void modify_qp_reset_to_init(struct ib_qp *ibqp,
 				    const struct ib_qp_attr *attr,
 				    int attr_mask,
@@ -3138,21 +3175,6 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp,
 	roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_TST_M,
 		       V2_QPC_BYTE_4_TST_S, 0);
 
-	if (ibqp->qp_type == IB_QPT_GSI)
-		roce_set_field(context->byte_4_sqpn_tst,
-			       V2_QPC_BYTE_4_SGE_SHIFT_M,
-			       V2_QPC_BYTE_4_SGE_SHIFT_S,
-			       ilog2((unsigned int)hr_qp->sge.sge_cnt));
-	else
-		roce_set_field(context->byte_4_sqpn_tst,
-			       V2_QPC_BYTE_4_SGE_SHIFT_M,
-			       V2_QPC_BYTE_4_SGE_SHIFT_S,
-			       hr_qp->sq.max_gs > 2 ?
-			       ilog2((unsigned int)hr_qp->sge.sge_cnt) : 0);
-
-	roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_SGE_SHIFT_M,
-		       V2_QPC_BYTE_4_SGE_SHIFT_S, 0);
-
 	roce_set_field(context->byte_4_sqpn_tst, V2_QPC_BYTE_4_SQPN_M,
 		       V2_QPC_BYTE_4_SQPN_S, hr_qp->qpn);
 	roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_SQPN_M,
@@ -3168,19 +3190,7 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp,
 	roce_set_field(qpc_mask->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_RQWS_M,
 		       V2_QPC_BYTE_20_RQWS_S, 0);
 
-	roce_set_field(context->byte_20_smac_sgid_idx,
-		       V2_QPC_BYTE_20_SQ_SHIFT_M, V2_QPC_BYTE_20_SQ_SHIFT_S,
-		       ilog2((unsigned int)hr_qp->sq.wqe_cnt));
-	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
-		       V2_QPC_BYTE_20_SQ_SHIFT_M, V2_QPC_BYTE_20_SQ_SHIFT_S, 0);
-
-	roce_set_field(context->byte_20_smac_sgid_idx,
-		       V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S,
-		       (hr_qp->ibqp.qp_type == IB_QPT_XRC_INI ||
-		       hr_qp->ibqp.qp_type == IB_QPT_XRC_TGT || ibqp->srq) ? 0 :
-		       ilog2((unsigned int)hr_qp->rq.wqe_cnt));
-	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
-		       V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S, 0);
+	set_qpc_wqe_cnt(hr_qp, context, qpc_mask);
 
 	/* No VLAN need to set 0xFFF */
 	roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_VLAN_ID_M,
@@ -3456,22 +3466,6 @@ static void modify_qp_init_to_init(struct ib_qp *ibqp,
 	roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_TST_M,
 		       V2_QPC_BYTE_4_TST_S, 0);
 
-	if (ibqp->qp_type == IB_QPT_GSI)
-		roce_set_field(context->byte_4_sqpn_tst,
-			       V2_QPC_BYTE_4_SGE_SHIFT_M,
-			       V2_QPC_BYTE_4_SGE_SHIFT_S,
-			       ilog2((unsigned int)hr_qp->sge.sge_cnt));
-	else
-		roce_set_field(context->byte_4_sqpn_tst,
-			       V2_QPC_BYTE_4_SGE_SHIFT_M,
-			       V2_QPC_BYTE_4_SGE_SHIFT_S,
-			       hr_qp->sq.max_gs >
-			       HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE ?
-			       ilog2((unsigned int)hr_qp->sge.sge_cnt) : 0);
-
-	roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_SGE_SHIFT_M,
-		       V2_QPC_BYTE_4_SGE_SHIFT_S, 0);
-
 	if (attr_mask & IB_QP_ACCESS_FLAGS) {
 		roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RRE_S,
 			     !!(attr->qp_access_flags & IB_ACCESS_REMOTE_READ));
@@ -3506,20 +3500,6 @@ static void modify_qp_init_to_init(struct ib_qp *ibqp,
 			     0);
 	}
 
-	roce_set_field(context->byte_20_smac_sgid_idx,
-		       V2_QPC_BYTE_20_SQ_SHIFT_M, V2_QPC_BYTE_20_SQ_SHIFT_S,
-		       ilog2((unsigned int)hr_qp->sq.wqe_cnt));
-	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
-		       V2_QPC_BYTE_20_SQ_SHIFT_M, V2_QPC_BYTE_20_SQ_SHIFT_S, 0);
-
-	roce_set_field(context->byte_20_smac_sgid_idx,
-		       V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S,
-		       (hr_qp->ibqp.qp_type == IB_QPT_XRC_INI ||
-		       hr_qp->ibqp.qp_type == IB_QPT_XRC_TGT || ibqp->srq) ? 0 :
-		       ilog2((unsigned int)hr_qp->rq.wqe_cnt));
-	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
-		       V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S, 0);
-
 	roce_set_field(context->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_PD_M,
 		       V2_QPC_BYTE_16_PD_S, to_hr_pd(ibqp->pd)->pdn);
 	roce_set_field(qpc_mask->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_PD_M,
-- 
1.9.1


^ permalink raw reply related

* [PATCH for-next 1/9] RDMA/hns: Package the flow of creating cq
From: Lijun Ou @ 2019-07-08 13:41 UTC (permalink / raw)
  To: dledford, jgg; +Cc: leon, linux-rdma, linuxarm
In-Reply-To: <1562593285-8037-1-git-send-email-oulijun@huawei.com>

Here moves the relatived codes of creating cq into the separated
functions in order to comprehensibility.

Signed-off-by: Lijun Ou <oulijun@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_cq.c | 186 +++++++++++++++++++++-----------
 1 file changed, 124 insertions(+), 62 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c
index 4e50c22..507d3c4 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_cq.c
@@ -298,21 +298,132 @@ static void hns_roce_ib_free_cq_buf(struct hns_roce_dev *hr_dev,
 			  &buf->hr_buf);
 }
 
+static int create_user_cq(struct hns_roce_dev *hr_dev,
+			  struct hns_roce_cq *hr_cq,
+			  struct ib_udata *udata,
+			  struct hns_roce_ib_create_cq_resp *resp,
+			  struct hns_roce_uar *uar,
+			  int cq_entries)
+{
+	struct hns_roce_ib_create_cq ucmd;
+	struct device *dev = hr_dev->dev;
+	int ret;
+	struct hns_roce_ucontext *context = rdma_udata_to_drv_context(
+				   udata, struct hns_roce_ucontext, ibucontext);
+
+	if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
+		dev_err(dev, "Failed to copy_from_udata.\n");
+		return -EFAULT;
+	}
+
+	/* Get user space address, write it into mtt table */
+	ret = hns_roce_ib_get_cq_umem(hr_dev, udata, &hr_cq->hr_buf,
+				      &hr_cq->umem, ucmd.buf_addr,
+				      cq_entries);
+	if (ret) {
+		dev_err(dev, "Failed to get_cq_umem.\n");
+		return ret;
+	}
+
+	if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
+	    (udata->outlen >= sizeof(*resp))) {
+		ret = hns_roce_db_map_user(context, udata, ucmd.db_addr,
+					   &hr_cq->db);
+		if (ret) {
+			dev_err(dev, "cq record doorbell map failed!\n");
+			goto err_mtt;
+		}
+		hr_cq->db_en = 1;
+		resp->cap_flags |= HNS_ROCE_SUPPORT_CQ_RECORD_DB;
+	}
+
+	/* Get user space parameters */
+	uar = &context->uar;
+
+	return 0;
+
+err_mtt:
+	hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt);
+	ib_umem_release(hr_cq->umem);
+
+	return ret;
+}
+
+static int create_kernel_cq(struct hns_roce_dev *hr_dev,
+			    struct hns_roce_cq *hr_cq, struct hns_roce_uar *uar,
+			    int cq_entries)
+{
+	struct device *dev = hr_dev->dev;
+	int ret;
+
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) {
+		ret = hns_roce_alloc_db(hr_dev, &hr_cq->db, 1);
+		if (ret)
+			return ret;
+
+		hr_cq->set_ci_db = hr_cq->db.db_record;
+		*hr_cq->set_ci_db = 0;
+		hr_cq->db_en = 1;
+	}
+
+	/* Init mtt table and write buff address to mtt table */
+	ret = hns_roce_ib_alloc_cq_buf(hr_dev, &hr_cq->hr_buf, cq_entries);
+	if (ret) {
+		dev_err(dev, "Failed to alloc_cq_buf.\n");
+		goto err_db;
+	}
+
+	uar = &hr_dev->priv_uar;
+	hr_cq->cq_db_l = hr_dev->reg_base + hr_dev->odb_offset +
+			 DB_REG_OFFSET * uar->index;
+
+	return 0;
+
+err_db:
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB)
+		hns_roce_free_db(hr_dev, &hr_cq->db);
+
+	return ret;
+}
+
+static void destroy_user_cq(struct hns_roce_dev *hr_dev,
+			    struct hns_roce_cq *hr_cq,
+			    struct ib_udata *udata,
+			    struct hns_roce_ib_create_cq_resp *resp)
+{
+	struct hns_roce_ucontext *context = rdma_udata_to_drv_context(
+				   udata, struct hns_roce_ucontext, ibucontext);
+
+	if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
+	    (udata->outlen >= sizeof(*resp)))
+		hns_roce_db_unmap_user(context, &hr_cq->db);
+
+	hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt);
+	ib_umem_release(hr_cq->umem);
+}
+
+static void destroy_kernel_cq(struct hns_roce_dev *hr_dev,
+			      struct hns_roce_cq *hr_cq)
+{
+	hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt);
+	hns_roce_ib_free_cq_buf(hr_dev, &hr_cq->hr_buf, hr_cq->ib_cq.cqe);
+
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB)
+		hns_roce_free_db(hr_dev, &hr_cq->db);
+}
+
 int hns_roce_ib_create_cq(struct ib_cq *ib_cq,
 			  const struct ib_cq_init_attr *attr,
 			  struct ib_udata *udata)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device);
 	struct device *dev = hr_dev->dev;
-	struct hns_roce_ib_create_cq ucmd;
 	struct hns_roce_ib_create_cq_resp resp = {};
 	struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq);
 	struct hns_roce_uar *uar = NULL;
 	int vector = attr->comp_vector;
 	int cq_entries = attr->cqe;
 	int ret;
-	struct hns_roce_ucontext *context = rdma_udata_to_drv_context(
-		udata, struct hns_roce_ucontext, ibucontext);
 
 	if (cq_entries < 1 || cq_entries > hr_dev->caps.max_cqes) {
 		dev_err(dev, "Creat CQ failed. entries=%d, max=%d\n",
@@ -328,57 +439,18 @@ int hns_roce_ib_create_cq(struct ib_cq *ib_cq,
 	spin_lock_init(&hr_cq->lock);
 
 	if (udata) {
-		if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
-			dev_err(dev, "Failed to copy_from_udata.\n");
-			ret = -EFAULT;
-			goto err_cq;
-		}
-
-		/* Get user space address, write it into mtt table */
-		ret = hns_roce_ib_get_cq_umem(hr_dev, udata, &hr_cq->hr_buf,
-					      &hr_cq->umem, ucmd.buf_addr,
-					      cq_entries);
+		ret = create_user_cq(hr_dev, hr_cq, udata, &resp, uar,
+				     cq_entries);
 		if (ret) {
-			dev_err(dev, "Failed to get_cq_umem.\n");
+			dev_err(dev, "Create cq failed in user mode!\n");
 			goto err_cq;
 		}
-
-		if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
-		    (udata->outlen >= sizeof(resp))) {
-			ret = hns_roce_db_map_user(context, udata, ucmd.db_addr,
-						   &hr_cq->db);
-			if (ret) {
-				dev_err(dev, "cq record doorbell map failed!\n");
-				goto err_mtt;
-			}
-			hr_cq->db_en = 1;
-			resp.cap_flags |= HNS_ROCE_SUPPORT_CQ_RECORD_DB;
-		}
-
-		/* Get user space parameters */
-		uar = &context->uar;
 	} else {
-		if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) {
-			ret = hns_roce_alloc_db(hr_dev, &hr_cq->db, 1);
-			if (ret)
-				goto err_cq;
-
-			hr_cq->set_ci_db = hr_cq->db.db_record;
-			*hr_cq->set_ci_db = 0;
-			hr_cq->db_en = 1;
-		}
-
-		/* Init mmt table and write buff address to mtt table */
-		ret = hns_roce_ib_alloc_cq_buf(hr_dev, &hr_cq->hr_buf,
-					       cq_entries);
+		ret = create_kernel_cq(hr_dev, hr_cq, uar, cq_entries);
 		if (ret) {
-			dev_err(dev, "Failed to alloc_cq_buf.\n");
-			goto err_db;
+			dev_err(dev, "Create cq failed in kernel mode!\n");
+			goto err_cq;
 		}
-
-		uar = &hr_dev->priv_uar;
-		hr_cq->cq_db_l = hr_dev->reg_base + hr_dev->odb_offset +
-				DB_REG_OFFSET * uar->index;
 	}
 
 	/* Allocate cq index, fill cq_context */
@@ -416,20 +488,10 @@ int hns_roce_ib_create_cq(struct ib_cq *ib_cq,
 	hns_roce_free_cq(hr_dev, hr_cq);
 
 err_dbmap:
-	if (udata && (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
-	    (udata->outlen >= sizeof(resp)))
-		hns_roce_db_unmap_user(context, &hr_cq->db);
-
-err_mtt:
-	hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt);
-	ib_umem_release(hr_cq->umem);
-	if (!udata)
-		hns_roce_ib_free_cq_buf(hr_dev, &hr_cq->hr_buf,
-					hr_cq->ib_cq.cqe);
-
-err_db:
-	if (!udata && (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB))
-		hns_roce_free_db(hr_dev, &hr_cq->db);
+	if (udata)
+		destroy_user_cq(hr_dev, hr_cq, udata, &resp);
+	else
+		destroy_kernel_cq(hr_dev, hr_cq);
 
 err_cq:
 	return ret;
-- 
1.9.1


^ permalink raw reply related

* [PATCH for-next 4/9] RDMA/hns: Use a separated function for setting extend sge paramters
From: Lijun Ou @ 2019-07-08 13:41 UTC (permalink / raw)
  To: dledford, jgg; +Cc: leon, linux-rdma, linuxarm
In-Reply-To: <1562593285-8037-1-git-send-email-oulijun@huawei.com>

Here moves the related lines of setting extend sge size into
the separate function as well as removes the unused variables.

Signed-off-by: Lijun Ou <oulijun@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_device.h |  2 -
 drivers/infiniband/hw/hns/hns_roce_qp.c     | 61 ++++++++++++++++++-----------
 2 files changed, 38 insertions(+), 25 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index a548b28..b39497a 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -654,8 +654,6 @@ struct hns_roce_qp {
 	u32			doorbell_qpn;
 	__le32			sq_signal_bits;
 	u32			sq_next_wqe;
-	int			sq_max_wqes_per_wr;
-	int			sq_spare_wqes;
 	struct hns_roce_wq	sq;
 
 	struct ib_umem		*umem;
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index c109602..d35033b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -502,6 +502,35 @@ static int calc_wqe_bt_page_shift(struct hns_roce_dev *hr_dev,
 	return bt_pg_shift - PAGE_SHIFT;
 }
 
+static int set_extend_sge_param(struct hns_roce_dev *hr_dev,
+				struct hns_roce_qp *hr_qp)
+{
+	struct device *dev = hr_dev->dev;
+
+	if (hr_qp->sq.max_gs > 2) {
+		hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt *
+				     (hr_qp->sq.max_gs - 2));
+		hr_qp->sge.sge_shift = 4;
+	}
+
+	/* ud sqwqe's sge use extend sge */
+	if (hr_dev->caps.max_sq_sg > 2 && hr_qp->ibqp.qp_type == IB_QPT_GSI) {
+		hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt *
+				     hr_qp->sq.max_gs);
+		hr_qp->sge.sge_shift = 4;
+	}
+
+	if ((hr_qp->sq.max_gs > 2) && hr_dev->pci_dev->revision == 0x20) {
+		if (hr_qp->sge.sge_cnt > hr_dev->caps.max_extend_sg) {
+			dev_err(dev, "The extended sge cnt error! sge_cnt=%d\n",
+				hr_qp->sge.sge_cnt);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 static int hns_roce_set_kernel_sq_size(struct hns_roce_dev *hr_dev,
 				       struct ib_qp_cap *cap,
 				       struct hns_roce_qp *hr_qp)
@@ -510,6 +539,7 @@ static int hns_roce_set_kernel_sq_size(struct hns_roce_dev *hr_dev,
 	u32 page_size;
 	u32 max_cnt;
 	int size;
+	int ret;
 
 	if (cap->max_send_wr  > hr_dev->caps.max_wqes  ||
 	    cap->max_send_sge > hr_dev->caps.max_sq_sg ||
@@ -519,8 +549,6 @@ static int hns_roce_set_kernel_sq_size(struct hns_roce_dev *hr_dev,
 	}
 
 	hr_qp->sq.wqe_shift = ilog2(hr_dev->caps.max_sq_desc_sz);
-	hr_qp->sq_max_wqes_per_wr = 1;
-	hr_qp->sq_spare_wqes = 0;
 
 	if (hr_dev->caps.min_wqes)
 		max_cnt = max(cap->max_send_wr, hr_dev->caps.min_wqes);
@@ -540,25 +568,10 @@ static int hns_roce_set_kernel_sq_size(struct hns_roce_dev *hr_dev,
 	else
 		hr_qp->sq.max_gs = max_cnt;
 
-	if (hr_qp->sq.max_gs > 2) {
-		hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt *
-				     (hr_qp->sq.max_gs - 2));
-		hr_qp->sge.sge_shift = 4;
-	}
-
-	/* ud sqwqe's sge use extend sge */
-	if (hr_dev->caps.max_sq_sg > 2 && hr_qp->ibqp.qp_type == IB_QPT_GSI) {
-		hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt *
-				     hr_qp->sq.max_gs);
-		hr_qp->sge.sge_shift = 4;
-	}
-
-	if ((hr_qp->sq.max_gs > 2) && hr_dev->pci_dev->revision == 0x20) {
-		if (hr_qp->sge.sge_cnt > hr_dev->caps.max_extend_sg) {
-			dev_err(dev, "The extended sge cnt error! sge_cnt=%d\n",
-				hr_qp->sge.sge_cnt);
-			return -EINVAL;
-		}
+	ret = set_extend_sge_param(hr_dev, hr_qp);
+	if (ret) {
+		dev_err(dev, "set extend sge parameters fail\n");
+		return ret;
 	}
 
 	/* Get buf size, SQ and RQ are aligned to PAGE_SIZE */
@@ -943,11 +956,13 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 		hns_roce_free_db(hr_dev, &hr_qp->rdb);
 
 err_rq_sge_list:
-	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE)
+	if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) &&
+	     hns_roce_qp_has_rq(init_attr))
 		kfree(hr_qp->rq_inl_buf.wqe_list[0].sg_list);
 
 err_wqe_list:
-	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE)
+	if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) &&
+	     hns_roce_qp_has_rq(init_attr))
 		kfree(hr_qp->rq_inl_buf.wqe_list);
 
 err_out:
-- 
1.9.1


^ permalink raw reply related

* [PATCH for-next 7/9] RDMA/hns: Package for hns_roce_rereg_user_mr function
From: Lijun Ou @ 2019-07-08 13:41 UTC (permalink / raw)
  To: dledford, jgg; +Cc: leon, linux-rdma, linuxarm
In-Reply-To: <1562593285-8037-1-git-send-email-oulijun@huawei.com>

Here moves some codes of hns_roce_rereg_user_mr functioon into
an independent function in oder to optimize complexity.

Signed-off-by: Lang Cheng <chenglang@huawei.com>
Signed-off-by: Lijun Ou <oulijun@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_mr.c | 153 +++++++++++++++++++-------------
 1 file changed, 89 insertions(+), 64 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index c4b758c..0cfa946 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -1206,6 +1206,83 @@ struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	return ERR_PTR(ret);
 }
 
+static int rereg_mr_trans(struct ib_mr *ibmr, int flags,
+			  u64 start, u64 length,
+			  u64 virt_addr, int mr_access_flags,
+			  struct hns_roce_cmd_mailbox *mailbox,
+			  u32 pdn, struct ib_udata *udata)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibmr->device);
+	struct hns_roce_mr *mr = to_hr_mr(ibmr);
+	struct device *dev = hr_dev->dev;
+	int npages;
+	int ret;
+
+	if (mr->size != ~0ULL) {
+		npages = ib_umem_page_count(mr->umem);
+
+		if (hr_dev->caps.pbl_hop_num)
+			hns_roce_mhop_free(hr_dev, mr);
+		else
+			dma_free_coherent(dev, npages * 8,
+					  mr->pbl_buf, mr->pbl_dma_addr);
+	}
+	ib_umem_release(mr->umem);
+
+	mr->umem = ib_umem_get(udata, start, length, mr_access_flags, 0);
+	if (IS_ERR(mr->umem)) {
+		ret = PTR_ERR(mr->umem);
+		mr->umem = NULL;
+		return -ENOMEM;
+	}
+	npages = ib_umem_page_count(mr->umem);
+
+	if (hr_dev->caps.pbl_hop_num) {
+		ret = hns_roce_mhop_alloc(hr_dev, npages, mr);
+		if (ret)
+			goto release_umem;
+	} else {
+		mr->pbl_buf = dma_alloc_coherent(dev, npages * 8,
+						 &(mr->pbl_dma_addr),
+						 GFP_KERNEL);
+		if (!mr->pbl_buf) {
+			ret = -ENOMEM;
+			goto release_umem;
+		}
+	}
+
+	ret = hr_dev->hw->rereg_write_mtpt(hr_dev, mr, flags, pdn,
+					   mr_access_flags, virt_addr,
+					   length, mailbox->buf);
+	if (ret)
+		goto release_umem;
+
+
+	ret = hns_roce_ib_umem_write_mr(hr_dev, mr, mr->umem);
+	if (ret) {
+		if (mr->size != ~0ULL) {
+			npages = ib_umem_page_count(mr->umem);
+
+			if (hr_dev->caps.pbl_hop_num)
+				hns_roce_mhop_free(hr_dev, mr);
+			else
+				dma_free_coherent(dev, npages * 8,
+						  mr->pbl_buf,
+						  mr->pbl_dma_addr);
+		}
+
+		goto release_umem;
+	}
+
+	return 0;
+
+release_umem:
+	ib_umem_release(mr->umem);
+	return ret;
+
+}
+
+
 int hns_roce_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start, u64 length,
 			   u64 virt_addr, int mr_access_flags, struct ib_pd *pd,
 			   struct ib_udata *udata)
@@ -1216,7 +1293,6 @@ int hns_roce_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start, u64 length,
 	struct device *dev = hr_dev->dev;
 	unsigned long mtpt_idx;
 	u32 pdn = 0;
-	int npages;
 	int ret;
 
 	if (!mr->enabled)
@@ -1243,73 +1319,25 @@ int hns_roce_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start, u64 length,
 		pdn = to_hr_pd(pd)->pdn;
 
 	if (flags & IB_MR_REREG_TRANS) {
-		if (mr->size != ~0ULL) {
-			npages = ib_umem_page_count(mr->umem);
-
-			if (hr_dev->caps.pbl_hop_num)
-				hns_roce_mhop_free(hr_dev, mr);
-			else
-				dma_free_coherent(dev, npages * 8, mr->pbl_buf,
-						  mr->pbl_dma_addr);
-		}
-		ib_umem_release(mr->umem);
-
-		mr->umem =
-			ib_umem_get(udata, start, length, mr_access_flags, 0);
-		if (IS_ERR(mr->umem)) {
-			ret = PTR_ERR(mr->umem);
-			mr->umem = NULL;
+		ret = rereg_mr_trans(ibmr, flags,
+				     start, length,
+				     virt_addr, mr_access_flags,
+				     mailbox, pdn, udata);
+		if (ret)
 			goto free_cmd_mbox;
-		}
-		npages = ib_umem_page_count(mr->umem);
-
-		if (hr_dev->caps.pbl_hop_num) {
-			ret = hns_roce_mhop_alloc(hr_dev, npages, mr);
-			if (ret)
-				goto release_umem;
-		} else {
-			mr->pbl_buf = dma_alloc_coherent(dev, npages * 8,
-							 &(mr->pbl_dma_addr),
-							 GFP_KERNEL);
-			if (!mr->pbl_buf) {
-				ret = -ENOMEM;
-				goto release_umem;
-			}
-		}
-	}
-
-	ret = hr_dev->hw->rereg_write_mtpt(hr_dev, mr, flags, pdn,
-					   mr_access_flags, virt_addr,
-					   length, mailbox->buf);
-	if (ret) {
-		if (flags & IB_MR_REREG_TRANS)
-			goto release_umem;
-		else
+	} else {
+		ret = hr_dev->hw->rereg_write_mtpt(hr_dev, mr, flags, pdn,
+						   mr_access_flags, virt_addr,
+						   length, mailbox->buf);
+		if (ret)
 			goto free_cmd_mbox;
 	}
 
-	if (flags & IB_MR_REREG_TRANS) {
-		ret = hns_roce_ib_umem_write_mr(hr_dev, mr, mr->umem);
-		if (ret) {
-			if (mr->size != ~0ULL) {
-				npages = ib_umem_page_count(mr->umem);
-
-				if (hr_dev->caps.pbl_hop_num)
-					hns_roce_mhop_free(hr_dev, mr);
-				else
-					dma_free_coherent(dev, npages * 8,
-							  mr->pbl_buf,
-							  mr->pbl_dma_addr);
-			}
-
-			goto release_umem;
-		}
-	}
-
 	ret = hns_roce_sw2hw_mpt(hr_dev, mailbox, mtpt_idx);
 	if (ret) {
 		dev_err(dev, "SW2HW_MPT failed (%d)\n", ret);
-		goto release_umem;
+		ib_umem_release(mr->umem);
+		goto free_cmd_mbox;
 	}
 
 	mr->enabled = 1;
@@ -1320,9 +1348,6 @@ int hns_roce_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start, u64 length,
 
 	return 0;
 
-release_umem:
-	ib_umem_release(mr->umem);
-
 free_cmd_mbox:
 	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
 
-- 
1.9.1


^ permalink raw reply related

* [PATCH for-next 0/9] Codes optimization for hip08
From: Lijun Ou @ 2019-07-08 13:41 UTC (permalink / raw)
  To: dledford, jgg; +Cc: leon, linux-rdma, linuxarm

Here are codes optimization in order to reduce complexity and
add readability.

Lijun Ou (6):
  RDMA/hns: Package the flow of creating cq
  RDMA/hns: Refactor the code of creating srq
  RDMA/hns: Refactor for hns_roce_v2_modify_qp function
  RDMA/hns: Use a separated function for setting extend sge paramters
  RDMA/hns: Package for hns_roce_rereg_user_mr function
  RDMA/hns: Refactor hem table mhop check and calculation

Xi Wang (1):
  RDMA/hns: optimize the duplicated code for qpc setting flow

Yixian Liu (1):
  RDMA/hns: Refactor eq table init for hip08

chenglang (1):
  RDMA/hns: Optimize hns_roce_mhop_alloc function.

 drivers/infiniband/hw/hns/hns_roce_cq.c     | 186 +++++---
 drivers/infiniband/hw/hns/hns_roce_device.h |   2 -
 drivers/infiniband/hw/hns/hns_roce_hem.c    | 180 ++++----
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 628 ++++++++++++++++------------
 drivers/infiniband/hw/hns/hns_roce_mr.c     | 428 +++++++++++--------
 drivers/infiniband/hw/hns/hns_roce_qp.c     |  61 ++-
 drivers/infiniband/hw/hns/hns_roce_srq.c    | 310 ++++++++------
 7 files changed, 1015 insertions(+), 780 deletions(-)

-- 
1.9.1


^ permalink raw reply

* [PATCH] [net-next] net/mlx5e: xsk: dynamically allocate mlx5e_channel_param
From: Arnd Bergmann @ 2019-07-08 12:55 UTC (permalink / raw)
  To: Saeed Mahameed, Leon Romanovsky, David S. Miller,
	Alexei Starovoitov, Daniel Borkmann, Jakub Kicinski,
	Jesper Dangaard Brouer, John Fastabend
  Cc: Arnd Bergmann, Maxim Mikityanskiy, Tariq Toukan, netdev,
	linux-rdma, linux-kernel, xdp-newbies, bpf

The structure is too large to put on the stack, resulting in a
warning on 32-bit ARM:

drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c:59:5: error: stack frame size of 1344 bytes in function
      'mlx5e_open_xsk' [-Werror,-Wframe-larger-than=]

Use kzalloc() instead.

Fixes: a038e9794541 ("net/mlx5e: Add XSK zero-copy support")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 .../mellanox/mlx5/core/en/xsk/setup.c         | 25 ++++++++++++-------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c
index aaffa6f68dc0..db9bbec68dbf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c
@@ -60,24 +60,28 @@ int mlx5e_open_xsk(struct mlx5e_priv *priv, struct mlx5e_params *params,
 		   struct mlx5e_xsk_param *xsk, struct xdp_umem *umem,
 		   struct mlx5e_channel *c)
 {
-	struct mlx5e_channel_param cparam = {};
+	struct mlx5e_channel_param *cparam;
 	struct dim_cq_moder icocq_moder = {};
 	int err;
 
 	if (!mlx5e_validate_xsk_param(params, xsk, priv->mdev))
 		return -EINVAL;
 
-	mlx5e_build_xsk_cparam(priv, params, xsk, &cparam);
+	cparam = kzalloc(sizeof(*cparam), GFP_KERNEL);
+	if (!cparam)
+		return -ENOMEM;
 
-	err = mlx5e_open_cq(c, params->rx_cq_moderation, &cparam.rx_cq, &c->xskrq.cq);
+	mlx5e_build_xsk_cparam(priv, params, xsk, cparam);
+
+	err = mlx5e_open_cq(c, params->rx_cq_moderation, &cparam->rx_cq, &c->xskrq.cq);
 	if (unlikely(err))
-		return err;
+		goto err_kfree_cparam;
 
-	err = mlx5e_open_rq(c, params, &cparam.rq, xsk, umem, &c->xskrq);
+	err = mlx5e_open_rq(c, params, &cparam->rq, xsk, umem, &c->xskrq);
 	if (unlikely(err))
 		goto err_close_rx_cq;
 
-	err = mlx5e_open_cq(c, params->tx_cq_moderation, &cparam.tx_cq, &c->xsksq.cq);
+	err = mlx5e_open_cq(c, params->tx_cq_moderation, &cparam->tx_cq, &c->xsksq.cq);
 	if (unlikely(err))
 		goto err_close_rq;
 
@@ -87,18 +91,18 @@ int mlx5e_open_xsk(struct mlx5e_priv *priv, struct mlx5e_params *params,
 	 * is disabled and then reenabled, but the SQ continues receiving CQEs
 	 * from the old UMEM.
 	 */
-	err = mlx5e_open_xdpsq(c, params, &cparam.xdp_sq, umem, &c->xsksq, true);
+	err = mlx5e_open_xdpsq(c, params, &cparam->xdp_sq, umem, &c->xsksq, true);
 	if (unlikely(err))
 		goto err_close_tx_cq;
 
-	err = mlx5e_open_cq(c, icocq_moder, &cparam.icosq_cq, &c->xskicosq.cq);
+	err = mlx5e_open_cq(c, icocq_moder, &cparam->icosq_cq, &c->xskicosq.cq);
 	if (unlikely(err))
 		goto err_close_sq;
 
 	/* Create a dedicated SQ for posting NOPs whenever we need an IRQ to be
 	 * triggered and NAPI to be called on the correct CPU.
 	 */
-	err = mlx5e_open_icosq(c, params, &cparam.icosq, &c->xskicosq);
+	err = mlx5e_open_icosq(c, params, &cparam->icosq, &c->xskicosq);
 	if (unlikely(err))
 		goto err_close_icocq;
 
@@ -123,6 +127,9 @@ int mlx5e_open_xsk(struct mlx5e_priv *priv, struct mlx5e_params *params,
 err_close_rx_cq:
 	mlx5e_close_cq(&c->xskrq.cq);
 
+err_kfree_cparam:
+	kfree(cparam);
+
 	return err;
 }
 
-- 
2.20.0


^ permalink raw reply related

* [PATCH v3] RDMA/core: Fix race when resolving IP address
From: Dag Moxnes @ 2019-07-08 11:16 UTC (permalink / raw)
  To: dledford, jgg, leon, parav; +Cc: linux-rdma, linux-kernel

Use neighbour lock when copying MAC address from neighbour data struct
in dst_fetch_ha.

When not using the lock, it is possible for the function to race with
neigh_update, causing it to copy an invalid MAC address.

It is possible to provoke this error by calling rdma_resolve_addr in a
tight loop, while deleting the corresponding ARP entry in another tight
loop.

This will cause the race shown it the following sample trace:

rdma_resolve_addr()
  rdma_resolve_ip()
    addr_resolve()
      addr_resolve_neigh()
        fetch_ha()
          dst_fetch_ha()
            n->nud_state == NUD_VALID

and

net_ioctl()
  arp_ioctl()
    arp_rec_delete()
      arp_invalidate()
        neigh_update()
          __neigh_update()
            neigh->nud_state = new;

Signed-off-by: Dag Moxnes <dag.moxnes@oracle.com>
Signed-off-by: Håkon Bugge <haakon.bugge@oracle.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
---
v1 -> v2:
   * Modified implementation to improve readability

v2 -> v3:
   * Added sample trace as suggested by Parav Pandit
   * Added Reviewed-by


---
 drivers/infiniband/core/addr.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index 2f7d141598..51323ffbc5 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -333,11 +333,14 @@ static int dst_fetch_ha(const struct dst_entry *dst,
 	if (!n)
 		return -ENODATA;
 
-	if (!(n->nud_state & NUD_VALID)) {
+	read_lock_bh(&n->lock);
+	if (n->nud_state & NUD_VALID) {
+		memcpy(dev_addr->dst_dev_addr, n->ha, MAX_ADDR_LEN);
+		read_unlock_bh(&n->lock);
+	} else {
+		read_unlock_bh(&n->lock);
 		neigh_event_send(n, NULL);
 		ret = -ENODATA;
-	} else {
-		memcpy(dev_addr->dst_dev_addr, n->ha, MAX_ADDR_LEN);
 	}
 
 	neigh_release(n);
-- 
2.20.1


^ permalink raw reply related

* [PATCH rdma-next v5 3/4] RDMA/nldev: Added configuration of RDMA dynamic interrupt moderation to netlink
From: Leon Romanovsky @ 2019-07-08 10:59 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Max Gurtovoy, Saeed Mahameed,
	Sagi Grimberg, Yamin Friedman
In-Reply-To: <20190708105905.27468-1-leon@kernel.org>

From: Yamin Friedman <yaminf@mellanox.com>

Added parameter in ib_device for enabling dynamic interrupt moderation so
that it can be configured in userspace using rdma tool.

In order to set adaptive-moderation for an ib device the command is:
rdma dev set [DEV] adaptive-moderation [on|off]
Please set on/off.

rdma dev show
0: mlx5_0: node_type ca fw 16.26.0055 node_guid 248a:0703:00a5:29d0
sys_image_guid 248a:0703:00a5:29d0 adaptive-moderation on

rdma resource show cq
dev mlx5_0 cqn 0 cqe 1023 users 4 poll-ctx UNBOUND_WORKQUEUE
adaptive-moderation off comm [ib_core]

Signed-off-by: Yamin Friedman <yaminf@mellanox.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/Kconfig          |  1 +
 drivers/infiniband/core/core_priv.h |  1 +
 drivers/infiniband/core/device.c    |  9 +++++++++
 drivers/infiniband/core/nldev.c     | 14 ++++++++++++++
 include/uapi/rdma/rdma_netlink.h    |  5 +++++
 5 files changed, 30 insertions(+)

diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index f277cb7aea29..85e103b147cc 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -7,6 +7,7 @@ menuconfig INFINIBAND
 	depends on m || IPV6 != m
 	depends on !ALPHA
 	select IRQ_POLL
+	select DIMLIB
 	---help---
 	  Core support for InfiniBand (IB).  Make sure to also select
 	  any protocols you wish to use as well as drivers for your
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index a953c2fa2e78..888d89ce81df 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -60,6 +60,7 @@ extern bool ib_devices_shared_netns;
 int ib_device_register_sysfs(struct ib_device *device);
 void ib_device_unregister_sysfs(struct ib_device *device);
 int ib_device_rename(struct ib_device *ibdev, const char *name);
+int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim);
 
 typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port,
 	      struct net_device *idev, void *cookie);
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index bdf61499e6d5..7f4affe8a10d 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -448,6 +448,15 @@ int ib_device_rename(struct ib_device *ibdev, const char *name)
 	return 0;
 }
 
+int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim)
+{
+	if (use_dim > 1)
+		return -EINVAL;
+	ibdev->use_cq_dim = use_dim;
+
+	return 0;
+}
+
 static int alloc_name(struct ib_device *ibdev, const char *name)
 {
 	struct ib_device *device;
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index a4431ed566b6..d9f2a30e6467 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -52,6 +52,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 					.len = RDMA_NLDEV_ATTR_EMPTY_STRING },
 	[RDMA_NLDEV_ATTR_CHARDEV_TYPE]		= { .type = NLA_NUL_STRING,
 					.len = RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE },
+	[RDMA_NLDEV_ATTR_DEV_DIM]               = { .type = NLA_U8 },
 	[RDMA_NLDEV_ATTR_DEV_INDEX]		= { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_DEV_NAME]		= { .type = NLA_NUL_STRING,
 					.len = IB_DEVICE_NAME_MAX },
@@ -252,6 +253,8 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device)
 		return -EMSGSIZE;
 	if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type))
 		return -EMSGSIZE;
+	if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, device->use_cq_dim))
+		return -EMSGSIZE;
 
 	/*
 	 * Link type is determined on first port and mlx4 device
@@ -552,6 +555,9 @@ static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin,
 	    nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx))
 		goto err;
 
+	if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, (cq->dim != NULL)))
+		goto err;
+
 	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id))
 		goto err;
 	if (!rdma_is_kernel_res(res) &&
@@ -870,6 +876,14 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
 		goto put_done;
 	}
 
+	if (tb[RDMA_NLDEV_ATTR_DEV_DIM]) {
+		u8 use_dim;
+
+		use_dim = nla_get_u8(tb[RDMA_NLDEV_ATTR_DEV_DIM]);
+		err = ib_device_set_dim(device,  use_dim);
+		goto done;
+	}
+
 done:
 	ib_device_put(device);
 put_done:
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index ce6fd66e7aa3..8e277783fa96 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -520,6 +520,11 @@ enum rdma_nldev_attr {
 	RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME,	/* string */
 	RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE,	/* u64 */
 
+	/*
+	 * CQ adaptive moderatio (DIM)
+	 */
+	RDMA_NLDEV_ATTR_DEV_DIM,                /* u8 */
+
 	/*
 	 * Always the end
 	 */
-- 
2.20.1


^ permalink raw reply related

* [PATCH rdma-next v5 4/4] RDMA/mlx5: Set RDMA DIM to be enabled by default
From: Leon Romanovsky @ 2019-07-08 10:59 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Max Gurtovoy, Saeed Mahameed,
	Sagi Grimberg, Yamin Friedman
In-Reply-To: <20190708105905.27468-1-leon@kernel.org>

From: Leon Romanovsky <leonro@mellanox.com>

Enable RDMA DIM by default for better user experience.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/hw/mlx5/main.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 7581571bd9cd..07a05b0b9e42 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -6424,6 +6424,8 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
 	     MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
 		mutex_init(&dev->lb.mutex);
 
+	dev->ib_dev.use_cq_dim = true;
+
 	return 0;
 }
 
-- 
2.20.1


^ permalink raw reply related

* [PATCH rdma-next v5 2/4] RDMA/core: Provide RDMA DIM support for ULPs
From: Leon Romanovsky @ 2019-07-08 10:59 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Max Gurtovoy, Saeed Mahameed,
	Sagi Grimberg, Yamin Friedman
In-Reply-To: <20190708105905.27468-1-leon@kernel.org>

From: Yamin Friedman <yaminf@mellanox.com>

Added the interface in the infiniband driver that applies the rdma_dim
adaptive moderation. There is now a special function for allocating an
ib_cq that uses rdma_dim.

Performance improvement (ConnectX-5 100GbE, x86) running FIO benchmark over
NVMf between two equal end-hosts with 56 cores across a Mellanox switch
using null_blk device:

READS without DIM:
blk size | BW       | IOPS | 99th percentile latency  | 99.99th latency
512B     | 3.8GiB/s | 7.7M | 1401  usec               | 2442  usec
4k       | 7.0GiB/s | 1.8M | 4817  usec               | 6587  usec
64k      | 10.7GiB/s| 175k | 9896  usec               | 10028 usec

IO WRITES without DIM:
blk size | BW       | IOPS | 99th percentile latency  | 99.99th latency
512B     | 3.6GiB/s | 7.5M | 1434  usec               | 2474  usec
4k       | 6.3GiB/s | 1.6M | 938   usec               | 1221  usec
64k      | 10.7GiB/s| 175k | 8979  usec               | 12780 usec

IO READS with DIM:
blk size | BW       | IOPS | 99th percentile latency  | 99.99th latency
512B     | 4GiB/s   | 8.2M | 816    usec              | 889   usec
4k       | 10.1GiB/s| 2.65M| 3359   usec              | 5080  usec
64k      | 10.7GiB/s| 175k | 9896   usec              | 10028 usec

IO WRITES with DIM:
blk size | BW       | IOPS  | 99th percentile latency | 99.99th latency
512B     | 3.9GiB/s | 8.1M  | 799   usec              | 922   usec
4k       | 9.6GiB/s | 2.5M  | 717   usec              | 1004  usec
64k      | 10.7GiB/s| 176k  | 8586  usec              | 12256 usec

The rdma_dim algorithm was designed to measure the effectiveness of
moderation on the flow in a general way and thus should be appropriate
for all RDMA storage protocols.

rdma_dim is configured to be the default option based on performance
improvement seen after extensive tests.

Signed-off-by: Yamin Friedman <yaminf@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/core/cq.c | 45 ++++++++++++++++++++++++++++++++++++
 include/rdma/ib_verbs.h      |  4 ++++
 2 files changed, 49 insertions(+)

diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index 00d70f166209..ffd6e24109d5 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -18,6 +18,40 @@
 #define IB_POLL_FLAGS \
 	(IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
 
+static void ib_cq_rdma_dim_work(struct work_struct *w)
+{
+	struct dim *dim = container_of(w, struct dim, work);
+	struct ib_cq *cq = dim->priv;
+
+	u16 usec = rdma_dim_prof[dim->profile_ix].usec;
+	u16 comps = rdma_dim_prof[dim->profile_ix].comps;
+
+	dim->state = DIM_START_MEASURE;
+
+	cq->device->ops.modify_cq(cq, comps, usec);
+}
+
+static void rdma_dim_init(struct ib_cq *cq)
+{
+	struct dim *dim;
+
+	if (!cq->device->ops.modify_cq || !cq->device->use_cq_dim ||
+	    cq->poll_ctx == IB_POLL_DIRECT)
+		return;
+
+	dim = kzalloc(sizeof(struct dim), GFP_KERNEL);
+	if (!dim)
+		return;
+
+	dim->state = DIM_START_MEASURE;
+	dim->tune_state = DIM_GOING_RIGHT;
+	dim->profile_ix = RDMA_DIM_START_PROFILE;
+	dim->priv = cq;
+	cq->dim = dim;
+
+	INIT_WORK(&dim->work, ib_cq_rdma_dim_work);
+}
+
 static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs,
 			   int batch)
 {
@@ -78,6 +112,7 @@ static void ib_cq_completion_direct(struct ib_cq *cq, void *private)
 static int ib_poll_handler(struct irq_poll *iop, int budget)
 {
 	struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
+	struct dim *dim = cq->dim;
 	int completed;
 
 	completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
@@ -87,6 +122,9 @@ static int ib_poll_handler(struct irq_poll *iop, int budget)
 			irq_poll_sched(&cq->iop);
 	}
 
+	if (dim)
+		rdma_dim(dim, completed);
+
 	return completed;
 }
 
@@ -105,6 +143,8 @@ static void ib_cq_poll_work(struct work_struct *work)
 	if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
 	    ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
 		queue_work(cq->comp_wq, &cq->work);
+	else if (cq->dim)
+		rdma_dim(cq->dim, completed);
 }
 
 static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
@@ -161,6 +201,8 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
 
 	rdma_restrack_kadd(&cq->res);
 
+	rdma_dim_init(cq);
+
 	switch (cq->poll_ctx) {
 	case IB_POLL_DIRECT:
 		cq->comp_handler = ib_cq_completion_direct;
@@ -223,6 +265,9 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata)
 
 	rdma_restrack_del(&cq->res);
 	cq->device->ops.destroy_cq(cq, udata);
+	if (cq->dim)
+		cancel_work_sync(&cq->dim->work);
+	kfree(cq->dim);
 	kfree(cq->wc);
 	kfree(cq);
 }
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 50806bef9f20..30eb68f36109 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -61,6 +61,7 @@
 #include <linux/cgroup_rdma.h>
 #include <linux/irqflags.h>
 #include <linux/preempt.h>
+#include <linux/dim.h>
 #include <uapi/rdma/ib_user_verbs.h>
 #include <rdma/rdma_counter.h>
 #include <rdma/restrack.h>
@@ -1509,6 +1510,7 @@ struct ib_cq {
 		struct work_struct	work;
 	};
 	struct workqueue_struct *comp_wq;
+	struct dim *dim;
 	/*
 	 * Implementation details of the RDMA core, don't use in drivers:
 	 */
@@ -2576,6 +2578,8 @@ struct ib_device {
 	u16                          is_switch:1;
 	/* Indicates kernel verbs support, should not be used in drivers */
 	u16                          kverbs_provider:1;
+	/* CQ adaptive moderation (RDMA DIM) */
+	u16                          use_cq_dim:1;
 	u8                           node_type;
 	u8                           phys_port_cnt;
 	struct ib_device_attr        attrs;
-- 
2.20.1


^ permalink raw reply related

* [PATCH rdma-next v5 1/4] linux/dim: Implement RDMA adaptive moderation (DIM)
From: Leon Romanovsky @ 2019-07-08 10:59 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Max Gurtovoy, Saeed Mahameed,
	Sagi Grimberg, Yamin Friedman
In-Reply-To: <20190708105905.27468-1-leon@kernel.org>

From: Yamin Friedman <yaminf@mellanox.com>

RDMA DIM implements a different algorithm from net DIM and is based on
completions which is how we can implement interrupt moderation in RDMA.

The algorithm optimizes for number of completions and ratio between
completions and events. In order to avoid long latencies, the
implementation performs fast reduction of moderation level when the
traffic changes.

Signed-off-by: Yamin Friedman <yaminf@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 include/linux/dim.h |  36 +++++++++++++++
 lib/dim/Makefile    |   6 +--
 lib/dim/rdma_dim.c  | 108 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 146 insertions(+), 4 deletions(-)
 create mode 100644 lib/dim/rdma_dim.c

diff --git a/include/linux/dim.h b/include/linux/dim.h
index aa9bdd47a648..aa69730c3b8d 100644
--- a/include/linux/dim.h
+++ b/include/linux/dim.h
@@ -82,6 +82,7 @@ struct dim_stats {
  * @prev_stats: Measured rates from previous iteration (for comparison)
  * @start_sample: Sampled data at start of current iteration
  * @work: Work to perform on action required
+ * @priv: A pointer to the struct that points to dim
  * @profile_ix: Current moderation profile
  * @mode: CQ period count mode
  * @tune_state: Algorithm tuning state (see below)
@@ -95,6 +96,7 @@ struct dim {
 	struct dim_sample start_sample;
 	struct dim_sample measuring_sample;
 	struct work_struct work;
+	void *priv;
 	u8 profile_ix;
 	u8 mode;
 	u8 tune_state;
@@ -363,4 +365,38 @@ struct dim_cq_moder net_dim_get_def_tx_moderation(u8 cq_period_mode);
  */
 void net_dim(struct dim *dim, struct dim_sample end_sample);
 
+/* RDMA DIM */
+
+/*
+ * RDMA DIM profile:
+ * profile size must be of RDMA_DIM_PARAMS_NUM_PROFILES.
+ */
+#define RDMA_DIM_PARAMS_NUM_PROFILES 9
+#define RDMA_DIM_START_PROFILE 0
+
+static const struct dim_cq_moder
+rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = {
+	{1,   0, 1,  0},
+	{1,   0, 4,  0},
+	{2,   0, 4,  0},
+	{2,   0, 8,  0},
+	{4,   0, 8,  0},
+	{16,  0, 8,  0},
+	{16,  0, 16, 0},
+	{32,  0, 16, 0},
+	{32,  0, 32, 0},
+};
+
+/**
+ * rdma_dim - Runs the adaptive moderation.
+ * @dim: The moderation struct.
+ * @completions: The number of completions collected in this round.
+ *
+ * Each call to rdma_dim takes the latest amount of completions that
+ * have been collected and counts them as a new event.
+ * Once enough events have been collected the algorithm decides a new
+ * moderation level.
+ */
+void rdma_dim(struct dim *dim, u64 completions);
+
 #endif /* DIM_H */
diff --git a/lib/dim/Makefile b/lib/dim/Makefile
index 160afe288df0..1d6858a108cb 100644
--- a/lib/dim/Makefile
+++ b/lib/dim/Makefile
@@ -2,8 +2,6 @@
 # DIM Dynamic Interrupt Moderation library
 #
 
-obj-$(CONFIG_DIMLIB) = net_dim.o
+obj-$(CONFIG_DIMLIB) += dim.o
 
-net_dim-y = \
-	dim.o		\
-	net_dim.o
+dim-y := dim.o net_dim.o rdma_dim.o
diff --git a/lib/dim/rdma_dim.c b/lib/dim/rdma_dim.c
new file mode 100644
index 000000000000..f7e26c7b4749
--- /dev/null
+++ b/lib/dim/rdma_dim.c
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2019, Mellanox Technologies inc.  All rights reserved.
+ */
+
+#include <linux/dim.h>
+
+static int rdma_dim_step(struct dim *dim)
+{
+	if (dim->tune_state == DIM_GOING_RIGHT) {
+		if (dim->profile_ix == (RDMA_DIM_PARAMS_NUM_PROFILES - 1))
+			return DIM_ON_EDGE;
+		dim->profile_ix++;
+		dim->steps_right++;
+	}
+	if (dim->tune_state == DIM_GOING_LEFT) {
+		if (dim->profile_ix == 0)
+			return DIM_ON_EDGE;
+		dim->profile_ix--;
+		dim->steps_left++;
+	}
+
+	return DIM_STEPPED;
+}
+
+static int rdma_dim_stats_compare(struct dim_stats *curr,
+				  struct dim_stats *prev)
+{
+	/* first stat */
+	if (!prev->cpms)
+		return DIM_STATS_SAME;
+
+	if (IS_SIGNIFICANT_DIFF(curr->cpms, prev->cpms))
+		return (curr->cpms > prev->cpms) ? DIM_STATS_BETTER :
+						DIM_STATS_WORSE;
+
+	if (IS_SIGNIFICANT_DIFF(curr->cpe_ratio, prev->cpe_ratio))
+		return (curr->cpe_ratio > prev->cpe_ratio) ? DIM_STATS_BETTER :
+						DIM_STATS_WORSE;
+
+	return DIM_STATS_SAME;
+}
+
+static bool rdma_dim_decision(struct dim_stats *curr_stats, struct dim *dim)
+{
+	int prev_ix = dim->profile_ix;
+	u8 state = dim->tune_state;
+	int stats_res;
+	int step_res;
+
+	if (state != DIM_PARKING_ON_TOP && state != DIM_PARKING_TIRED) {
+		stats_res = rdma_dim_stats_compare(curr_stats,
+						   &dim->prev_stats);
+
+		switch (stats_res) {
+		case DIM_STATS_SAME:
+			if (curr_stats->cpe_ratio <= 50 * prev_ix)
+				dim->profile_ix = 0;
+			break;
+		case DIM_STATS_WORSE:
+			dim_turn(dim);
+			/* fall through */
+		case DIM_STATS_BETTER:
+			step_res = rdma_dim_step(dim);
+			if (step_res == DIM_ON_EDGE)
+				dim_turn(dim);
+			break;
+		}
+	}
+
+	dim->prev_stats = *curr_stats;
+
+	return dim->profile_ix != prev_ix;
+}
+
+void rdma_dim(struct dim *dim, u64 completions)
+{
+	struct dim_sample *curr_sample = &dim->measuring_sample;
+	struct dim_stats curr_stats;
+	u32 nevents;
+
+	dim_update_sample_with_comps(curr_sample->event_ctr + 1, 0, 0,
+				     curr_sample->comp_ctr + completions,
+				     &dim->measuring_sample);
+
+	switch (dim->state) {
+	case DIM_MEASURE_IN_PROGRESS:
+		nevents = curr_sample->event_ctr - dim->start_sample.event_ctr;
+		if (nevents < DIM_NEVENTS)
+			break;
+		dim_calc_stats(&dim->start_sample, curr_sample, &curr_stats);
+		if (rdma_dim_decision(&curr_stats, dim)) {
+			dim->state = DIM_APPLY_NEW_PROFILE;
+			schedule_work(&dim->work);
+			break;
+		}
+		/* fall through */
+	case DIM_START_MEASURE:
+		dim->state = DIM_MEASURE_IN_PROGRESS;
+		dim_update_sample_with_comps(curr_sample->event_ctr, 0, 0,
+					     curr_sample->comp_ctr,
+					     &dim->start_sample);
+		break;
+	case DIM_APPLY_NEW_PROFILE:
+		break;
+	}
+}
+EXPORT_SYMBOL(rdma_dim);
-- 
2.20.1


^ permalink raw reply related

* [PATCH rdma-next v5 0/4] Use RDMA adaptive moderation library
From: Leon Romanovsky @ 2019-07-08 10:59 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Max Gurtovoy, Saeed Mahameed,
	Sagi Grimberg, Yamin Friedman

From: Leon Romanovsky <leonro@mellanox.com>

Hi,

This is RDMA part of previously sent DIM library improvements series
[1], which was pulled by Dave. It needs to be pulled to RDMA too as
a pre-requirements.

Changes since v4:
 * Separated mlx5 change from IB/core changes.

Changes since v3:
 * Renamed dim_owner to be priv
 * Added Sagi's ROBs
 * Removed casting of void pointer.

Changes since v2:
- renamed user-space knob from dim to adaptive-moderation (Sagi)
- some minor code clean ups (Sagi)
- Reordered patches to ensure that netlink expose is last in the series.
- Slightly cleaned commit messages
- Changed "bool use_cq_dim" flag to be bitwise to save space.

Changes since v1:
- added per ib device configuration knob for rdma-dim (Sagi)
- add NL directives for user-space / rdma tool to configure rdma dim
  (Sagi/Leon)
- use one header file for DIM implementations (Leon)
- various point changes in the rdma dim related code in the IB core
  (Leon)
- removed the RDMA specific patches form this pull request\

Thanks

[1] https://www.spinics.net/lists/netdev/msg581046.html

Leon Romanovsky (1):
  RDMA/mlx5: Set RDMA DIM to be enabled by default

Yamin Friedman (3):
  linux/dim: Implement RDMA adaptive moderation (DIM)
  RDMA/core: Provide RDMA DIM support for ULPs
  RDMA/nldev: Added configuration of RDMA dynamic interrupt moderation
    to netlink

 drivers/infiniband/Kconfig          |   1 +
 drivers/infiniband/core/core_priv.h |   1 +
 drivers/infiniband/core/cq.c        |  45 ++++++++++++
 drivers/infiniband/core/device.c    |   9 +++
 drivers/infiniband/core/nldev.c     |  14 ++++
 drivers/infiniband/hw/mlx5/main.c   |   2 +
 include/linux/dim.h                 |  36 ++++++++++
 include/rdma/ib_verbs.h             |   4 ++
 include/uapi/rdma/rdma_netlink.h    |   5 ++
 lib/dim/Makefile                    |   6 +-
 lib/dim/rdma_dim.c                  | 108 ++++++++++++++++++++++++++++
 11 files changed, 227 insertions(+), 4 deletions(-)
 create mode 100644 lib/dim/rdma_dim.c

--
2.20.1


^ permalink raw reply

* Re: [PATCH rdma-next v4 2/3] RDMA/core: Provide RDMA DIM support for ULPs
From: Leon Romanovsky @ 2019-07-08 10:44 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Doug Ledford, Jason Gunthorpe, RDMA mailing list, Max Gurtovoy,
	Saeed Mahameed, Yamin Friedman
In-Reply-To: <efdb7d5f-1cd7-fd34-0245-b494000a0954@grimberg.me>

On Mon, Jul 08, 2019 at 02:31:51AM -0700, Sagi Grimberg wrote:
>
> >   drivers/infiniband/core/cq.c      | 45 +++++++++++++++++++++++++++++++
> >   drivers/infiniband/hw/mlx5/main.c |  2 ++
>
> Can you please separate the mlx5 patch from the core?

Sure, no problem.

Thanks

^ permalink raw reply

* Re: [PATCH rdma-next v4 2/3] RDMA/core: Provide RDMA DIM support for ULPs
From: Sagi Grimberg @ 2019-07-08  9:31 UTC (permalink / raw)
  To: Leon Romanovsky, Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Max Gurtovoy, Saeed Mahameed,
	Yamin Friedman
In-Reply-To: <20190704125743.7814-3-leon@kernel.org>


>   drivers/infiniband/core/cq.c      | 45 +++++++++++++++++++++++++++++++
>   drivers/infiniband/hw/mlx5/main.c |  2 ++

Can you please separate the mlx5 patch from the core?

^ permalink raw reply

* [PATCH v5 rdma-next 3/6] RDMA/qedr: Use the common mmap API
From: Michal Kalderon @ 2019-07-08  9:15 UTC (permalink / raw)
  To: michal.kalderon, ariel.elior, jgg, dledford, galpress
  Cc: linux-rdma, davem, netdev
In-Reply-To: <20190708091503.14723-1-michal.kalderon@marvell.com>

Remove all function related to mmap from qedr and use the common
API

Signed-off-by: Michal Kalderon <michal.kalderon@marvell.com>
---
 drivers/infiniband/hw/qedr/qedr.h  |  13 ----
 drivers/infiniband/hw/qedr/verbs.c | 153 +++++++++++++------------------------
 drivers/infiniband/hw/qedr/verbs.h |   2 +-
 3 files changed, 52 insertions(+), 116 deletions(-)

diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h
index 6175d1e98717..97c90d1e525d 100644
--- a/drivers/infiniband/hw/qedr/qedr.h
+++ b/drivers/infiniband/hw/qedr/qedr.h
@@ -231,11 +231,6 @@ struct qedr_ucontext {
 	u64 dpi_phys_addr;
 	u32 dpi_size;
 	u16 dpi;
-
-	struct list_head mm_head;
-
-	/* Lock to protect mm list */
-	struct mutex mm_list_lock;
 };
 
 union db_prod64 {
@@ -298,14 +293,6 @@ struct qedr_pd {
 	struct qedr_ucontext *uctx;
 };
 
-struct qedr_mm {
-	struct {
-		u64 phy_addr;
-		unsigned long len;
-	} key;
-	struct list_head entry;
-};
-
 union db_prod32 {
 	struct rdma_pwm_val16_data data;
 	u32 raw;
diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
index 27d90a84ea01..f33f0f1e7d76 100644
--- a/drivers/infiniband/hw/qedr/verbs.c
+++ b/drivers/infiniband/hw/qedr/verbs.c
@@ -58,6 +58,10 @@
 
 #define DB_ADDR_SHIFT(addr)		((addr) << DB_PWM_ADDR_OFFSET_SHIFT)
 
+enum {
+	QEDR_USER_MMAP_IO_WC = 0,
+};
+
 static inline int qedr_ib_copy_to_udata(struct ib_udata *udata, void *src,
 					size_t len)
 {
@@ -256,60 +260,6 @@ int qedr_modify_port(struct ib_device *ibdev, u8 port, int mask,
 	return 0;
 }
 
-static int qedr_add_mmap(struct qedr_ucontext *uctx, u64 phy_addr,
-			 unsigned long len)
-{
-	struct qedr_mm *mm;
-
-	mm = kzalloc(sizeof(*mm), GFP_KERNEL);
-	if (!mm)
-		return -ENOMEM;
-
-	mm->key.phy_addr = phy_addr;
-	/* This function might be called with a length which is not a multiple
-	 * of PAGE_SIZE, while the mapping is PAGE_SIZE grained and the kernel
-	 * forces this granularity by increasing the requested size if needed.
-	 * When qedr_mmap is called, it will search the list with the updated
-	 * length as a key. To prevent search failures, the length is rounded up
-	 * in advance to PAGE_SIZE.
-	 */
-	mm->key.len = roundup(len, PAGE_SIZE);
-	INIT_LIST_HEAD(&mm->entry);
-
-	mutex_lock(&uctx->mm_list_lock);
-	list_add(&mm->entry, &uctx->mm_head);
-	mutex_unlock(&uctx->mm_list_lock);
-
-	DP_DEBUG(uctx->dev, QEDR_MSG_MISC,
-		 "added (addr=0x%llx,len=0x%lx) for ctx=%p\n",
-		 (unsigned long long)mm->key.phy_addr,
-		 (unsigned long)mm->key.len, uctx);
-
-	return 0;
-}
-
-static bool qedr_search_mmap(struct qedr_ucontext *uctx, u64 phy_addr,
-			     unsigned long len)
-{
-	bool found = false;
-	struct qedr_mm *mm;
-
-	mutex_lock(&uctx->mm_list_lock);
-	list_for_each_entry(mm, &uctx->mm_head, entry) {
-		if (len != mm->key.len || phy_addr != mm->key.phy_addr)
-			continue;
-
-		found = true;
-		break;
-	}
-	mutex_unlock(&uctx->mm_list_lock);
-	DP_DEBUG(uctx->dev, QEDR_MSG_MISC,
-		 "searched for (addr=0x%llx,len=0x%lx) for ctx=%p, result=%d\n",
-		 mm->key.phy_addr, mm->key.len, uctx, found);
-
-	return found;
-}
-
 int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
 {
 	struct ib_device *ibdev = uctx->device;
@@ -318,6 +268,7 @@ int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
 	struct qedr_alloc_ucontext_resp uresp = {};
 	struct qedr_dev *dev = get_qedr_dev(ibdev);
 	struct qed_rdma_add_user_out_params oparams;
+	u64 key;
 
 	if (!udata)
 		return -EFAULT;
@@ -334,13 +285,17 @@ int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
 	ctx->dpi_addr = oparams.dpi_addr;
 	ctx->dpi_phys_addr = oparams.dpi_phys_addr;
 	ctx->dpi_size = oparams.dpi_size;
-	INIT_LIST_HEAD(&ctx->mm_head);
-	mutex_init(&ctx->mm_list_lock);
+
+	key = rdma_user_mmap_entry_insert(uctx, ctx,
+					  ctx->dpi_phys_addr, ctx->dpi_size,
+					  QEDR_USER_MMAP_IO_WC);
+	if (key == RDMA_USER_MMAP_INVALID)
+		return -ENOMEM;
 
 	uresp.dpm_enabled = dev->user_dpm_enabled;
 	uresp.wids_enabled = 1;
 	uresp.wid_count = oparams.wid_count;
-	uresp.db_pa = ctx->dpi_phys_addr;
+	uresp.db_pa = key;
 	uresp.db_size = ctx->dpi_size;
 	uresp.max_send_wr = dev->attr.max_sqe;
 	uresp.max_recv_wr = dev->attr.max_rqe;
@@ -356,10 +311,6 @@ int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
 
 	ctx->dev = dev;
 
-	rc = qedr_add_mmap(ctx, ctx->dpi_phys_addr, ctx->dpi_size);
-	if (rc)
-		return rc;
-
 	DP_DEBUG(dev, QEDR_MSG_INIT, "Allocating user context %p\n",
 		 &ctx->ibucontext);
 	return 0;
@@ -368,66 +319,64 @@ int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
 void qedr_dealloc_ucontext(struct ib_ucontext *ibctx)
 {
 	struct qedr_ucontext *uctx = get_qedr_ucontext(ibctx);
-	struct qedr_mm *mm, *tmp;
 
 	DP_DEBUG(uctx->dev, QEDR_MSG_INIT, "Deallocating user context %p\n",
 		 uctx);
 	uctx->dev->ops->rdma_remove_user(uctx->dev->rdma_ctx, uctx->dpi);
-
-	list_for_each_entry_safe(mm, tmp, &uctx->mm_head, entry) {
-		DP_DEBUG(uctx->dev, QEDR_MSG_MISC,
-			 "deleted (addr=0x%llx,len=0x%lx) for ctx=%p\n",
-			 mm->key.phy_addr, mm->key.len, uctx);
-		list_del(&mm->entry);
-		kfree(mm);
-	}
 }
 
-int qedr_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+int qedr_mmap(struct ib_ucontext *ucontext, struct vm_area_struct *vma)
 {
-	struct qedr_ucontext *ucontext = get_qedr_ucontext(context);
-	struct qedr_dev *dev = get_qedr_dev(context->device);
-	unsigned long phys_addr = vma->vm_pgoff << PAGE_SHIFT;
-	unsigned long len = (vma->vm_end - vma->vm_start);
-	unsigned long dpi_start;
+	struct ib_device *dev = ucontext->device;
+	u64 length = vma->vm_end - vma->vm_start;
+	u64 key = vma->vm_pgoff << PAGE_SHIFT;
+	struct rdma_user_mmap_entry *entry;
+	u64 pfn;
+	int err;
 
-	dpi_start = dev->db_phys_addr + (ucontext->dpi * ucontext->dpi_size);
+	ibdev_dbg(dev,
+		  "start %#lx, end %#lx, length = %#llx, key = %#llx\n",
+		  vma->vm_start, vma->vm_end, length, key);
 
-	DP_DEBUG(dev, QEDR_MSG_INIT,
-		 "mmap invoked with vm_start=0x%pK, vm_end=0x%pK,vm_pgoff=0x%pK; dpi_start=0x%pK dpi_size=0x%x\n",
-		 (void *)vma->vm_start, (void *)vma->vm_end,
-		 (void *)vma->vm_pgoff, (void *)dpi_start, ucontext->dpi_size);
-
-	if ((vma->vm_start & (PAGE_SIZE - 1)) || (len & (PAGE_SIZE - 1))) {
-		DP_ERR(dev,
-		       "failed mmap, addresses must be page aligned: start=0x%pK, end=0x%pK\n",
-		       (void *)vma->vm_start, (void *)vma->vm_end);
+	if (length % PAGE_SIZE != 0 || !(vma->vm_flags & VM_SHARED)) {
+		ibdev_dbg(dev,
+			  "length[%#llx] is not page size aligned[%#lx] or VM_SHARED is not set [%#lx]\n",
+			  length, PAGE_SIZE, vma->vm_flags);
 		return -EINVAL;
 	}
 
-	if (!qedr_search_mmap(ucontext, phys_addr, len)) {
-		DP_ERR(dev, "failed mmap, vm_pgoff=0x%lx is not authorized\n",
-		       vma->vm_pgoff);
-		return -EINVAL;
+	if (vma->vm_flags & VM_EXEC) {
+		ibdev_dbg(dev, "Mapping executable pages is not permitted\n");
+		return -EPERM;
 	}
+	vma->vm_flags &= ~VM_MAYEXEC;
 
-	if (phys_addr < dpi_start ||
-	    ((phys_addr + len) > (dpi_start + ucontext->dpi_size))) {
-		DP_ERR(dev,
-		       "failed mmap, pages are outside of dpi; page address=0x%pK, dpi_start=0x%pK, dpi_size=0x%x\n",
-		       (void *)phys_addr, (void *)dpi_start,
-		       ucontext->dpi_size);
+	entry = rdma_user_mmap_entry_get(ucontext, key, length);
+	if (!entry) {
+		ibdev_dbg(dev, "key[%#llx] does not have valid entry\n",
+			  key);
 		return -EINVAL;
 	}
 
-	if (vma->vm_flags & VM_READ) {
-		DP_ERR(dev, "failed mmap, cannot map doorbell bar for read\n");
-		return -EINVAL;
+	ibdev_dbg(dev,
+		  "Mapping address[%#llx], length[%#llx], mmap_flag[%d]\n",
+		  entry->address, length, entry->mmap_flag);
+
+	pfn = entry->address >> PAGE_SHIFT;
+	switch (entry->mmap_flag) {
+	case QEDR_USER_MMAP_IO_WC:
+		err = rdma_user_mmap_io(ucontext, vma, pfn, length,
+					pgprot_writecombine(vma->vm_page_prot));
+		break;
+	default:
+		err = -EINVAL;
 	}
 
-	vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
-	return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, len,
-				  vma->vm_page_prot);
+	ibdev_dbg(dev,
+		  "Couldn't mmap address[%#llx] length[%#llx] mmap_flag[%d] err[%d]\n",
+		  entry->address, length, entry->mmap_flag, err);
+
+	return err;
 }
 
 int qedr_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h
index 9aaa90283d6e..724d0983e972 100644
--- a/drivers/infiniband/hw/qedr/verbs.h
+++ b/drivers/infiniband/hw/qedr/verbs.h
@@ -46,7 +46,7 @@ int qedr_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey);
 int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata);
 void qedr_dealloc_ucontext(struct ib_ucontext *uctx);
 
-int qedr_mmap(struct ib_ucontext *, struct vm_area_struct *vma);
+int qedr_mmap(struct ib_ucontext *ucontext, struct vm_area_struct *vma);
 int qedr_alloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 void qedr_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 
-- 
2.14.5


^ permalink raw reply related

* [PATCH v5 rdma-next 1/6] RDMA/core: Create mmap database and cookie helper functions
From: Michal Kalderon @ 2019-07-08  9:14 UTC (permalink / raw)
  To: michal.kalderon, ariel.elior, jgg, dledford, galpress
  Cc: linux-rdma, davem, netdev
In-Reply-To: <20190708091503.14723-1-michal.kalderon@marvell.com>

Create some common API's for adding entries to a xa_mmap.
Searching for an entry and freeing one.

The code was copied from the efa driver almost as is, just renamed
function to be generic and not efa specific.

Signed-off-by: Michal Kalderon <michal.kalderon@marvell.com>
---
 drivers/infiniband/core/device.c      |   1 +
 drivers/infiniband/core/rdma_core.c   |   1 +
 drivers/infiniband/core/uverbs_cmd.c  |   1 +
 drivers/infiniband/core/uverbs_main.c | 105 ++++++++++++++++++++++++++++++++++
 include/rdma/ib_verbs.h               |  32 +++++++++++
 5 files changed, 140 insertions(+)

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 8a6ccb936dfe..a830c2c5d691 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -2521,6 +2521,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 	SET_DEVICE_OP(dev_ops, map_mr_sg_pi);
 	SET_DEVICE_OP(dev_ops, map_phys_fmr);
 	SET_DEVICE_OP(dev_ops, mmap);
+	SET_DEVICE_OP(dev_ops, mmap_free);
 	SET_DEVICE_OP(dev_ops, modify_ah);
 	SET_DEVICE_OP(dev_ops, modify_cq);
 	SET_DEVICE_OP(dev_ops, modify_device);
diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index ccf4d069c25c..7166741834c8 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -817,6 +817,7 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile,
 	rdma_restrack_del(&ucontext->res);
 
 	ib_dev->ops.dealloc_ucontext(ucontext);
+	rdma_user_mmap_entries_remove_free(ucontext);
 	kfree(ucontext);
 
 	ufile->ucontext = NULL;
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 7ddd0e5bc6b3..44c0600245e4 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -254,6 +254,7 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs)
 
 	mutex_init(&ucontext->per_mm_list_lock);
 	INIT_LIST_HEAD(&ucontext->per_mm_list);
+	xa_init(&ucontext->mmap_xa);
 
 	ret = get_unused_fd_flags(O_CLOEXEC);
 	if (ret < 0)
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 11c13c1381cf..37507cc27e8c 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -965,6 +965,111 @@ int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
 }
 EXPORT_SYMBOL(rdma_user_mmap_io);
 
+static inline u64
+rdma_user_mmap_get_key(const struct rdma_user_mmap_entry *entry)
+{
+	return (u64)entry->mmap_page << PAGE_SHIFT;
+}
+
+struct rdma_user_mmap_entry *
+rdma_user_mmap_entry_get(struct ib_ucontext *ucontext, u64 key, u64 len)
+{
+	struct rdma_user_mmap_entry *entry;
+	u64 mmap_page;
+
+	mmap_page = key >> PAGE_SHIFT;
+	if (mmap_page > U32_MAX)
+		return NULL;
+
+	entry = xa_load(&ucontext->mmap_xa, mmap_page);
+	if (!entry || rdma_user_mmap_get_key(entry) != key ||
+	    entry->length != len)
+		return NULL;
+
+	ibdev_dbg(ucontext->device,
+		  "mmap: obj[0x%p] key[%#llx] addr[%#llx] len[%#llx] removed\n",
+		  entry->obj, key, entry->address, entry->length);
+
+	return entry;
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_get);
+
+/*
+ * Note this locking scheme cannot support removal of entries, except during
+ * ucontext destruction when the core code guarentees no concurrency.
+ */
+u64 rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext, void *obj,
+				u64 address, u64 length, u8 mmap_flag)
+{
+	struct rdma_user_mmap_entry *entry;
+	u32 next_mmap_page;
+	int err;
+
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return RDMA_USER_MMAP_INVALID;
+
+	entry->obj = obj;
+	entry->address = address;
+	entry->length = length;
+	entry->mmap_flag = mmap_flag;
+
+	xa_lock(&ucontext->mmap_xa);
+	if (check_add_overflow(ucontext->mmap_xa_page,
+			       (u32)(length >> PAGE_SHIFT),
+			       &next_mmap_page))
+		goto err_unlock;
+
+	entry->mmap_page = ucontext->mmap_xa_page;
+	ucontext->mmap_xa_page = next_mmap_page;
+	err = __xa_insert(&ucontext->mmap_xa, entry->mmap_page, entry,
+			  GFP_KERNEL);
+	if (err)
+		goto err_unlock;
+
+	xa_unlock(&ucontext->mmap_xa);
+
+	ibdev_dbg(ucontext->device,
+		  "mmap: obj[0x%p] addr[%#llx], len[%#llx], key[%#llx] inserted\n",
+		  entry->obj, entry->address, entry->length,
+		  rdma_user_mmap_get_key(entry));
+
+	return rdma_user_mmap_get_key(entry);
+
+err_unlock:
+	xa_unlock(&ucontext->mmap_xa);
+	kfree(entry);
+	return RDMA_USER_MMAP_INVALID;
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_insert);
+
+/*
+ * This is only called when the ucontext is destroyed and there can be no
+ * concurrent query via mmap or allocate on the xarray, thus we can be sure no
+ * other thread is using the entry pointer. We also know that all the BAR
+ * pages have either been zap'd or munmaped at this point.  Normal pages are
+ * refcounted and will be freed at the proper time.
+ */
+void rdma_user_mmap_entries_remove_free(struct ib_ucontext *ucontext)
+{
+	struct rdma_user_mmap_entry *entry;
+	unsigned long mmap_page;
+
+	xa_for_each(&ucontext->mmap_xa, mmap_page, entry) {
+		xa_erase(&ucontext->mmap_xa, mmap_page);
+
+		ibdev_dbg(ucontext->device,
+			  "mmap: obj[0x%p] key[%#llx] addr[%#llx] len[%#llx] removed\n",
+			  entry->obj, rdma_user_mmap_get_key(entry),
+			  entry->address, entry->length);
+		if (ucontext->device->ops.mmap_free)
+			ucontext->device->ops.mmap_free(entry->address,
+							entry->length,
+							entry->mmap_flag);
+		kfree(entry);
+	}
+}
+
 void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
 {
 	struct rdma_umap_priv *priv, *next_priv;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 26e9c2594913..54ce3fdae180 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1425,6 +1425,8 @@ struct ib_ucontext {
 	 * Implementation details of the RDMA core, don't use in drivers:
 	 */
 	struct rdma_restrack_entry res;
+	struct xarray mmap_xa;
+	u32 mmap_xa_page;
 };
 
 struct ib_uobject {
@@ -2311,6 +2313,7 @@ struct ib_device_ops {
 			      struct ib_udata *udata);
 	void (*dealloc_ucontext)(struct ib_ucontext *context);
 	int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma);
+	void (*mmap_free)(u64 address, u64 length, u8 mmap_flag);
 	void (*disassociate_ucontext)(struct ib_ucontext *ibcontext);
 	int (*alloc_pd)(struct ib_pd *pd, struct ib_udata *udata);
 	void (*dealloc_pd)(struct ib_pd *pd, struct ib_udata *udata);
@@ -2706,9 +2709,23 @@ void  ib_set_client_data(struct ib_device *device, struct ib_client *client,
 void ib_set_device_ops(struct ib_device *device,
 		       const struct ib_device_ops *ops);
 
+#define RDMA_USER_MMAP_INVALID U64_MAX
+struct rdma_user_mmap_entry {
+	void  *obj;
+	u64 address;
+	u64 length;
+	u32 mmap_page;
+	u8 mmap_flag;
+};
+
 #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
 int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
 		      unsigned long pfn, unsigned long size, pgprot_t prot);
+u64 rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext, void *obj,
+				u64 address, u64 length, u8 mmap_flag);
+struct rdma_user_mmap_entry *
+rdma_user_mmap_entry_get(struct ib_ucontext *ucontext, u64 key, u64 len);
+void rdma_user_mmap_entries_remove_free(struct ib_ucontext *ucontext);
 #else
 static inline int rdma_user_mmap_io(struct ib_ucontext *ucontext,
 				    struct vm_area_struct *vma,
@@ -2717,6 +2734,21 @@ static inline int rdma_user_mmap_io(struct ib_ucontext *ucontext,
 {
 	return -EINVAL;
 }
+
+static u64 rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext, void *obj,
+				       u64 address, u64 length, u8 mmap_flag)
+{
+	return RDMA_USER_MMAP_INVALID;
+}
+
+static struct rdma_user_mmap_entry *
+rdma_user_mmap_entry_get(struct ib_ucontext *ucontext, u64 key, u64 len)
+{
+	return NULL;
+}
+
+static void rdma_user_mmap_entries_remove_free(struct ib_ucontext *ucontext) {}
+
 #endif
 
 static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len)
-- 
2.14.5


^ permalink raw reply related

* [PATCH v5 rdma-next 2/6] RDMA/efa: Use the common mmap_xa helpers
From: Michal Kalderon @ 2019-07-08  9:14 UTC (permalink / raw)
  To: michal.kalderon, ariel.elior, jgg, dledford, galpress
  Cc: linux-rdma, davem, netdev
In-Reply-To: <20190708091503.14723-1-michal.kalderon@marvell.com>

Remove the functions related to managing the mmap_xa database.
This code was copied to the ib_core. Use the common API's instead.

Signed-off-by: Michal Kalderon <michal.kalderon@marvell.com>
---
 drivers/infiniband/hw/efa/efa.h       |   3 +-
 drivers/infiniband/hw/efa/efa_main.c  |   1 +
 drivers/infiniband/hw/efa/efa_verbs.c | 183 ++++++++--------------------------
 3 files changed, 42 insertions(+), 145 deletions(-)

diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h
index 119f8efec564..4afcf3679cdb 100644
--- a/drivers/infiniband/hw/efa/efa.h
+++ b/drivers/infiniband/hw/efa/efa.h
@@ -71,8 +71,6 @@ struct efa_dev {
 
 struct efa_ucontext {
 	struct ib_ucontext ibucontext;
-	struct xarray mmap_xa;
-	u32 mmap_xa_page;
 	u16 uarn;
 };
 
@@ -147,6 +145,7 @@ int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata);
 void efa_dealloc_ucontext(struct ib_ucontext *ibucontext);
 int efa_mmap(struct ib_ucontext *ibucontext,
 	     struct vm_area_struct *vma);
+void efa_mmap_free(u64 address, u64 length, u8 mmap_flag);
 int efa_create_ah(struct ib_ah *ibah,
 		  struct rdma_ah_attr *ah_attr,
 		  u32 flags,
diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c
index dd1c6d49466f..65508c73accd 100644
--- a/drivers/infiniband/hw/efa/efa_main.c
+++ b/drivers/infiniband/hw/efa/efa_main.c
@@ -215,6 +215,7 @@ static const struct ib_device_ops efa_dev_ops = {
 	.get_link_layer = efa_port_link_layer,
 	.get_port_immutable = efa_get_port_immutable,
 	.mmap = efa_mmap,
+	.mmap_free = efa_mmap_free,
 	.modify_qp = efa_modify_qp,
 	.query_device = efa_query_device,
 	.query_gid = efa_query_gid,
diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
index df77bc312a25..5dff892da161 100644
--- a/drivers/infiniband/hw/efa/efa_verbs.c
+++ b/drivers/infiniband/hw/efa/efa_verbs.c
@@ -13,34 +13,15 @@
 
 #include "efa.h"
 
-#define EFA_MMAP_FLAG_SHIFT 56
-#define EFA_MMAP_PAGE_MASK GENMASK(EFA_MMAP_FLAG_SHIFT - 1, 0)
-#define EFA_MMAP_INVALID U64_MAX
-
 enum {
 	EFA_MMAP_DMA_PAGE = 0,
 	EFA_MMAP_IO_WC,
 	EFA_MMAP_IO_NC,
 };
-
 #define EFA_AENQ_ENABLED_GROUPS \
 	(BIT(EFA_ADMIN_FATAL_ERROR) | BIT(EFA_ADMIN_WARNING) | \
 	 BIT(EFA_ADMIN_NOTIFICATION) | BIT(EFA_ADMIN_KEEP_ALIVE))
 
-struct efa_mmap_entry {
-	void  *obj;
-	u64 address;
-	u64 length;
-	u32 mmap_page;
-	u8 mmap_flag;
-};
-
-static inline u64 get_mmap_key(const struct efa_mmap_entry *efa)
-{
-	return ((u64)efa->mmap_flag << EFA_MMAP_FLAG_SHIFT) |
-	       ((u64)efa->mmap_page << PAGE_SHIFT);
-}
-
 #define EFA_CHUNK_PAYLOAD_SHIFT       12
 #define EFA_CHUNK_PAYLOAD_SIZE        BIT(EFA_CHUNK_PAYLOAD_SHIFT)
 #define EFA_CHUNK_PAYLOAD_PTR_SIZE    8
@@ -145,105 +126,7 @@ static void *efa_zalloc_mapped(struct efa_dev *dev, dma_addr_t *dma_addr,
 	return addr;
 }
 
-/*
- * This is only called when the ucontext is destroyed and there can be no
- * concurrent query via mmap or allocate on the xarray, thus we can be sure no
- * other thread is using the entry pointer. We also know that all the BAR
- * pages have either been zap'd or munmaped at this point.  Normal pages are
- * refcounted and will be freed at the proper time.
- */
-static void mmap_entries_remove_free(struct efa_dev *dev,
-				     struct efa_ucontext *ucontext)
-{
-	struct efa_mmap_entry *entry;
-	unsigned long mmap_page;
 
-	xa_for_each(&ucontext->mmap_xa, mmap_page, entry) {
-		xa_erase(&ucontext->mmap_xa, mmap_page);
-
-		ibdev_dbg(
-			&dev->ibdev,
-			"mmap: obj[0x%p] key[%#llx] addr[%#llx] len[%#llx] removed\n",
-			entry->obj, get_mmap_key(entry), entry->address,
-			entry->length);
-		if (entry->mmap_flag == EFA_MMAP_DMA_PAGE)
-			/* DMA mapping is already gone, now free the pages */
-			free_pages_exact(phys_to_virt(entry->address),
-					 entry->length);
-		kfree(entry);
-	}
-}
-
-static struct efa_mmap_entry *mmap_entry_get(struct efa_dev *dev,
-					     struct efa_ucontext *ucontext,
-					     u64 key, u64 len)
-{
-	struct efa_mmap_entry *entry;
-	u64 mmap_page;
-
-	mmap_page = (key & EFA_MMAP_PAGE_MASK) >> PAGE_SHIFT;
-	if (mmap_page > U32_MAX)
-		return NULL;
-
-	entry = xa_load(&ucontext->mmap_xa, mmap_page);
-	if (!entry || get_mmap_key(entry) != key || entry->length != len)
-		return NULL;
-
-	ibdev_dbg(&dev->ibdev,
-		  "mmap: obj[0x%p] key[%#llx] addr[%#llx] len[%#llx] removed\n",
-		  entry->obj, key, entry->address, entry->length);
-
-	return entry;
-}
-
-/*
- * Note this locking scheme cannot support removal of entries, except during
- * ucontext destruction when the core code guarentees no concurrency.
- */
-static u64 mmap_entry_insert(struct efa_dev *dev, struct efa_ucontext *ucontext,
-			     void *obj, u64 address, u64 length, u8 mmap_flag)
-{
-	struct efa_mmap_entry *entry;
-	u32 next_mmap_page;
-	int err;
-
-	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
-	if (!entry)
-		return EFA_MMAP_INVALID;
-
-	entry->obj = obj;
-	entry->address = address;
-	entry->length = length;
-	entry->mmap_flag = mmap_flag;
-
-	xa_lock(&ucontext->mmap_xa);
-	if (check_add_overflow(ucontext->mmap_xa_page,
-			       (u32)(length >> PAGE_SHIFT),
-			       &next_mmap_page))
-		goto err_unlock;
-
-	entry->mmap_page = ucontext->mmap_xa_page;
-	ucontext->mmap_xa_page = next_mmap_page;
-	err = __xa_insert(&ucontext->mmap_xa, entry->mmap_page, entry,
-			  GFP_KERNEL);
-	if (err)
-		goto err_unlock;
-
-	xa_unlock(&ucontext->mmap_xa);
-
-	ibdev_dbg(
-		&dev->ibdev,
-		"mmap: obj[0x%p] addr[%#llx], len[%#llx], key[%#llx] inserted\n",
-		entry->obj, entry->address, entry->length, get_mmap_key(entry));
-
-	return get_mmap_key(entry);
-
-err_unlock:
-	xa_unlock(&ucontext->mmap_xa);
-	kfree(entry);
-	return EFA_MMAP_INVALID;
-
-}
 
 int efa_query_device(struct ib_device *ibdev,
 		     struct ib_device_attr *props,
@@ -488,45 +371,52 @@ static int qp_mmap_entries_setup(struct efa_qp *qp,
 				 struct efa_com_create_qp_params *params,
 				 struct efa_ibv_create_qp_resp *resp)
 {
+	u64 address;
+	u64 length;
 	/*
 	 * Once an entry is inserted it might be mmapped, hence cannot be
 	 * cleaned up until dealloc_ucontext.
 	 */
 	resp->sq_db_mmap_key =
-		mmap_entry_insert(dev, ucontext, qp,
-				  dev->db_bar_addr + resp->sq_db_offset,
-				  PAGE_SIZE, EFA_MMAP_IO_NC);
-	if (resp->sq_db_mmap_key == EFA_MMAP_INVALID)
+		rdma_user_mmap_entry_insert(&ucontext->ibucontext, qp,
+					    dev->db_bar_addr +
+					    resp->sq_db_offset,
+					    PAGE_SIZE, EFA_MMAP_IO_NC);
+	if (resp->sq_db_mmap_key == RDMA_USER_MMAP_INVALID)
 		return -ENOMEM;
 
 	resp->sq_db_offset &= ~PAGE_MASK;
 
+	address = dev->mem_bar_addr + resp->llq_desc_offset;
+	length = PAGE_ALIGN(params->sq_ring_size_in_bytes +
+			    (resp->llq_desc_offset & ~PAGE_MASK));
 	resp->llq_desc_mmap_key =
-		mmap_entry_insert(dev, ucontext, qp,
-				  dev->mem_bar_addr + resp->llq_desc_offset,
-				  PAGE_ALIGN(params->sq_ring_size_in_bytes +
-					     (resp->llq_desc_offset & ~PAGE_MASK)),
-				  EFA_MMAP_IO_WC);
-	if (resp->llq_desc_mmap_key == EFA_MMAP_INVALID)
+		rdma_user_mmap_entry_insert(&ucontext->ibucontext, qp,
+					    address,
+					    length,
+					    EFA_MMAP_IO_WC);
+	if (resp->llq_desc_mmap_key == RDMA_USER_MMAP_INVALID)
 		return -ENOMEM;
 
 	resp->llq_desc_offset &= ~PAGE_MASK;
 
 	if (qp->rq_size) {
+		address = dev->db_bar_addr + resp->rq_db_offset;
 		resp->rq_db_mmap_key =
-			mmap_entry_insert(dev, ucontext, qp,
-					  dev->db_bar_addr + resp->rq_db_offset,
-					  PAGE_SIZE, EFA_MMAP_IO_NC);
-		if (resp->rq_db_mmap_key == EFA_MMAP_INVALID)
+			rdma_user_mmap_entry_insert(&ucontext->ibucontext, qp,
+						    address, PAGE_SIZE,
+						    EFA_MMAP_IO_NC);
+		if (resp->rq_db_mmap_key == RDMA_USER_MMAP_INVALID)
 			return -ENOMEM;
 
 		resp->rq_db_offset &= ~PAGE_MASK;
 
+		address = virt_to_phys(qp->rq_cpu_addr);
 		resp->rq_mmap_key =
-			mmap_entry_insert(dev, ucontext, qp,
-					  virt_to_phys(qp->rq_cpu_addr),
-					  qp->rq_size, EFA_MMAP_DMA_PAGE);
-		if (resp->rq_mmap_key == EFA_MMAP_INVALID)
+			rdma_user_mmap_entry_insert(&ucontext->ibucontext, qp,
+						    address, qp->rq_size,
+						    EFA_MMAP_DMA_PAGE);
+		if (resp->rq_mmap_key == RDMA_USER_MMAP_INVALID)
 			return -ENOMEM;
 
 		resp->rq_mmap_size = qp->rq_size;
@@ -875,11 +765,13 @@ void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq,
 				 struct efa_ibv_create_cq_resp *resp)
 {
+	struct efa_ucontext *ucontext = cq->ucontext;
 	resp->q_mmap_size = cq->size;
-	resp->q_mmap_key = mmap_entry_insert(dev, cq->ucontext, cq,
-					     virt_to_phys(cq->cpu_addr),
-					     cq->size, EFA_MMAP_DMA_PAGE);
-	if (resp->q_mmap_key == EFA_MMAP_INVALID)
+	resp->q_mmap_key =
+		rdma_user_mmap_entry_insert(&ucontext->ibucontext, cq,
+					    virt_to_phys(cq->cpu_addr),
+					    cq->size, EFA_MMAP_DMA_PAGE);
+	if (resp->q_mmap_key == RDMA_USER_MMAP_INVALID)
 		return -ENOMEM;
 
 	return 0;
@@ -1531,7 +1423,6 @@ int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata)
 		goto err_out;
 
 	ucontext->uarn = result.uarn;
-	xa_init(&ucontext->mmap_xa);
 
 	resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE;
 	resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_CREATE_AH;
@@ -1560,19 +1451,25 @@ void efa_dealloc_ucontext(struct ib_ucontext *ibucontext)
 	struct efa_ucontext *ucontext = to_eucontext(ibucontext);
 	struct efa_dev *dev = to_edev(ibucontext->device);
 
-	mmap_entries_remove_free(dev, ucontext);
 	efa_dealloc_uar(dev, ucontext->uarn);
 }
 
+void efa_mmap_free(u64 address, u64 length, u8 mmap_flag)
+{
+	/* DMA mapping is already gone, now free the pages */
+	if (mmap_flag == EFA_MMAP_DMA_PAGE)
+		free_pages_exact(phys_to_virt(address), length);
+}
+
 static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext,
 		      struct vm_area_struct *vma, u64 key, u64 length)
 {
-	struct efa_mmap_entry *entry;
+	struct rdma_user_mmap_entry *entry;
 	unsigned long va;
 	u64 pfn;
 	int err;
 
-	entry = mmap_entry_get(dev, ucontext, key, length);
+	entry = rdma_user_mmap_entry_get(&ucontext->ibucontext, key, length);
 	if (!entry) {
 		ibdev_dbg(&dev->ibdev, "key[%#llx] does not have valid entry\n",
 			  key);
-- 
2.14.5


^ permalink raw reply related

* [PATCH v5 rdma-next 4/6] qed*: Change dpi_addr to be denoted with __iomem
From: Michal Kalderon @ 2019-07-08  9:15 UTC (permalink / raw)
  To: michal.kalderon, ariel.elior, jgg, dledford, galpress
  Cc: linux-rdma, davem, netdev
In-Reply-To: <20190708091503.14723-1-michal.kalderon@marvell.com>

Several casts were required around dpi_addr parameter in qed_rdma_if.h
This is an address on the doorbell bar and should therefore be marked
with __iomem.

Reported-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Michal Kalderon <michal.kalderon@marvell.com>
---
 drivers/infiniband/hw/qedr/main.c          | 2 +-
 drivers/infiniband/hw/qedr/qedr.h          | 2 +-
 drivers/net/ethernet/qlogic/qed/qed_rdma.c | 5 ++---
 include/linux/qed/qed_rdma_if.h            | 2 +-
 4 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c
index a0a7ba0a5af4..3db4b6ba5ad6 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -815,7 +815,7 @@ static int qedr_init_hw(struct qedr_dev *dev)
 	if (rc)
 		goto out;
 
-	dev->db_addr = (void __iomem *)(uintptr_t)out_params.dpi_addr;
+	dev->db_addr = out_params.dpi_addr;
 	dev->db_phys_addr = out_params.dpi_phys_addr;
 	dev->db_size = out_params.dpi_size;
 	dev->dpi = out_params.dpi;
diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h
index 97c90d1e525d..7e80ce521d8d 100644
--- a/drivers/infiniband/hw/qedr/qedr.h
+++ b/drivers/infiniband/hw/qedr/qedr.h
@@ -227,7 +227,7 @@ struct qedr_ucontext {
 	struct ib_ucontext ibucontext;
 	struct qedr_dev *dev;
 	struct qedr_pd *pd;
-	u64 dpi_addr;
+	void __iomem *dpi_addr;
 	u64 dpi_phys_addr;
 	u32 dpi_size;
 	u16 dpi;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c b/drivers/net/ethernet/qlogic/qed/qed_rdma.c
index 7873d6dfd91f..fb3fe60a1a68 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_rdma.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c
@@ -799,9 +799,8 @@ static int qed_rdma_add_user(void *rdma_cxt,
 	/* Calculate the corresponding DPI address */
 	dpi_start_offset = p_hwfn->dpi_start_offset;
 
-	out_params->dpi_addr = (u64)((u8 __iomem *)p_hwfn->doorbells +
-				     dpi_start_offset +
-				     ((out_params->dpi) * p_hwfn->dpi_size));
+	out_params->dpi_addr = p_hwfn->doorbells + dpi_start_offset +
+			       out_params->dpi * p_hwfn->dpi_size;
 
 	out_params->dpi_phys_addr = p_hwfn->cdev->db_phys_addr +
 				    dpi_start_offset +
diff --git a/include/linux/qed/qed_rdma_if.h b/include/linux/qed/qed_rdma_if.h
index d15f8e4815e3..834166809a6c 100644
--- a/include/linux/qed/qed_rdma_if.h
+++ b/include/linux/qed/qed_rdma_if.h
@@ -225,7 +225,7 @@ struct qed_rdma_start_in_params {
 
 struct qed_rdma_add_user_out_params {
 	u16 dpi;
-	u64 dpi_addr;
+	void __iomem *dpi_addr;
 	u64 dpi_phys_addr;
 	u32 dpi_size;
 	u16 wid_count;
-- 
2.14.5


^ permalink raw reply related

* [PATCH v5 rdma-next 0/6] RDMA/qedr: Use the doorbell overflow recovery mechanism for RDMA
From: Michal Kalderon @ 2019-07-08  9:14 UTC (permalink / raw)
  To: michal.kalderon, ariel.elior, jgg, dledford, galpress
  Cc: linux-rdma, davem, netdev

This patch series uses the doorbell overflow recovery mechanism
introduced in
commit 36907cd5cd72 ("qed: Add doorbell overflow recovery mechanism")
for rdma ( RoCE and iWARP )

The first three patches modify the core code to contain helper
functions for managing mmap_xa inserting, getting and freeing
entries. The code was taken almost as is from the efa driver.
There is still an open discussion on whether we should take
this even further and make the entire mmap generic. Until a
decision is made, I only created the database API and modified 
the efa and qedr driver to use it. The doorbell recovery code will be based
on the common code.

Efa driver was compile tested only.

rdma-core pull request #493

Changes from V4:
- Add common mmap database and cookie helper functions.

Changes from V3:
- Remove casts from void to u8. Pointer arithmetic can be done on void
- rebase to tip of rdma-next

Changes from V2:
- Don't use long-lived kmap. Instead use user-trigger mmap for the
  doorbell recovery entries.
- Modify dpi_addr to be denoted with __iomem and avoid redundant
  casts

Changes from V1:
- call kmap to map virtual address into kernel space
- modify db_rec_delete to be void
- remove some cpu_to_le16 that were added to previous patch which are
  correct but not related to the overflow recovery mechanism. Will be
  submitted as part of a different patch


Michal Kalderon (6):
  RDMA/core: Create mmap database and cookie helper functions
  RDMA/efa: Use the common mmap_xa helpers
  RDMA/qedr: Use the common mmap API
  qed*: Change dpi_addr to be denoted with __iomem
  RDMA/qedr: Add doorbell overflow recovery support
  RDMA/qedr: Add iWARP doorbell recovery support

 drivers/infiniband/core/device.c           |   1 +
 drivers/infiniband/core/rdma_core.c        |   1 +
 drivers/infiniband/core/uverbs_cmd.c       |   1 +
 drivers/infiniband/core/uverbs_main.c      | 105 +++++++
 drivers/infiniband/hw/efa/efa.h            |   3 +-
 drivers/infiniband/hw/efa/efa_main.c       |   1 +
 drivers/infiniband/hw/efa/efa_verbs.c      | 183 +++---------
 drivers/infiniband/hw/qedr/main.c          |   3 +-
 drivers/infiniband/hw/qedr/qedr.h          |  32 +-
 drivers/infiniband/hw/qedr/verbs.c         | 463 ++++++++++++++++++++---------
 drivers/infiniband/hw/qedr/verbs.h         |   4 +-
 drivers/net/ethernet/qlogic/qed/qed_rdma.c |   5 +-
 include/linux/qed/qed_rdma_if.h            |   2 +-
 include/rdma/ib_verbs.h                    |  32 ++
 include/uapi/rdma/qedr-abi.h               |  25 ++
 15 files changed, 554 insertions(+), 307 deletions(-)

-- 
2.14.5


^ permalink raw reply

* [PATCH v5 rdma-next 5/6] RDMA/qedr: Add doorbell overflow recovery support
From: Michal Kalderon @ 2019-07-08  9:15 UTC (permalink / raw)
  To: michal.kalderon, ariel.elior, jgg, dledford, galpress
  Cc: linux-rdma, davem, netdev
In-Reply-To: <20190708091503.14723-1-michal.kalderon@marvell.com>

Use the doorbell recovery mechanism to register rdma related doorbells
that will be restored in case there is a doorbell overflow attention.

Signed-off-by: Ariel Elior <ariel.elior@marvell.com>
Signed-off-by: Michal Kalderon <michal.kalderon@marvell.com>
---
 drivers/infiniband/hw/qedr/main.c  |   1 +
 drivers/infiniband/hw/qedr/qedr.h  |   7 +
 drivers/infiniband/hw/qedr/verbs.c | 273 ++++++++++++++++++++++++++++++++-----
 drivers/infiniband/hw/qedr/verbs.h |   2 +
 include/uapi/rdma/qedr-abi.h       |  25 ++++
 5 files changed, 273 insertions(+), 35 deletions(-)

diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c
index 3db4b6ba5ad6..34225c88f03d 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -206,6 +206,7 @@ static const struct ib_device_ops qedr_dev_ops = {
 	.get_link_layer = qedr_link_layer,
 	.map_mr_sg = qedr_map_mr_sg,
 	.mmap = qedr_mmap,
+	.mmap_free = qedr_mmap_free,
 	.modify_port = qedr_modify_port,
 	.modify_qp = qedr_modify_qp,
 	.modify_srq = qedr_modify_srq,
diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h
index 7e80ce521d8d..8aed24b32de6 100644
--- a/drivers/infiniband/hw/qedr/qedr.h
+++ b/drivers/infiniband/hw/qedr/qedr.h
@@ -231,6 +231,7 @@ struct qedr_ucontext {
 	u64 dpi_phys_addr;
 	u32 dpi_size;
 	u16 dpi;
+	bool db_rec;
 };
 
 union db_prod64 {
@@ -258,6 +259,12 @@ struct qedr_userq {
 	struct qedr_pbl *pbl_tbl;
 	u64 buf_addr;
 	size_t buf_len;
+
+	/* doorbell recovery */
+	void __iomem *db_addr;
+	struct qedr_user_db_rec *db_rec_data;
+	u64 db_rec_phys;
+	u64 db_rec_key;
 };
 
 struct qedr_cq {
diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
index f33f0f1e7d76..15221d9c7773 100644
--- a/drivers/infiniband/hw/qedr/verbs.c
+++ b/drivers/infiniband/hw/qedr/verbs.c
@@ -60,6 +60,7 @@
 
 enum {
 	QEDR_USER_MMAP_IO_WC = 0,
+	QEDR_USER_MMAP_PHYS_PAGE,
 };
 
 static inline int qedr_ib_copy_to_udata(struct ib_udata *udata, void *src,
@@ -266,6 +267,7 @@ int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
 	int rc;
 	struct qedr_ucontext *ctx = get_qedr_ucontext(uctx);
 	struct qedr_alloc_ucontext_resp uresp = {};
+	struct qedr_alloc_ucontext_req ureq = {};
 	struct qedr_dev *dev = get_qedr_dev(ibdev);
 	struct qed_rdma_add_user_out_params oparams;
 	u64 key;
@@ -273,6 +275,17 @@ int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
 	if (!udata)
 		return -EFAULT;
 
+	if (udata->inlen) {
+		rc = ib_copy_from_udata(&ureq, udata,
+					min(sizeof(ureq), udata->inlen));
+		if (rc) {
+			DP_ERR(dev, "Problem copying data from user space\n");
+			return -EFAULT;
+		}
+
+		ctx->db_rec = !!(ureq.context_flags & QEDR_ALLOC_UCTX_DB_REC);
+	}
+
 	rc = dev->ops->rdma_add_user(dev->rdma_ctx, &oparams);
 	if (rc) {
 		DP_ERR(dev,
@@ -325,6 +338,13 @@ void qedr_dealloc_ucontext(struct ib_ucontext *ibctx)
 	uctx->dev->ops->rdma_remove_user(uctx->dev->rdma_ctx, uctx->dpi);
 }
 
+void qedr_mmap_free(u64 address, u64 length, u8 mmap_flag)
+{
+	/* DMA mapping is already gone, now free the pages */
+	if (mmap_flag == QEDR_USER_MMAP_PHYS_PAGE)
+		free_page((unsigned long)phys_to_virt(address));
+}
+
 int qedr_mmap(struct ib_ucontext *ucontext, struct vm_area_struct *vma)
 {
 	struct ib_device *dev = ucontext->device;
@@ -368,6 +388,11 @@ int qedr_mmap(struct ib_ucontext *ucontext, struct vm_area_struct *vma)
 		err = rdma_user_mmap_io(ucontext, vma, pfn, length,
 					pgprot_writecombine(vma->vm_page_prot));
 		break;
+	case QEDR_USER_MMAP_PHYS_PAGE:
+		err = vm_insert_page(vma, vma->vm_start, pfn_to_page(pfn));
+		if (err)
+			break;
+		break;
 	default:
 		err = -EINVAL;
 	}
@@ -606,16 +631,48 @@ static void qedr_populate_pbls(struct qedr_dev *dev, struct ib_umem *umem,
 	}
 }
 
+static int qedr_db_recovery_add(struct qedr_dev *dev,
+				void __iomem *db_addr,
+				void *db_data,
+				enum qed_db_rec_width db_width,
+				enum qed_db_rec_space db_space)
+{
+	if (!db_data) {
+		DP_DEBUG(dev, QEDR_MSG_INIT, "avoiding db rec since old lib\n");
+		return 0;
+	}
+
+	return dev->ops->common->db_recovery_add(dev->cdev, db_addr, db_data,
+						 db_width, db_space);
+}
+
+static void qedr_db_recovery_del(struct qedr_dev *dev,
+				 void __iomem *db_addr,
+				 void *db_data)
+{
+	if (!db_data) {
+		DP_DEBUG(dev, QEDR_MSG_INIT, "avoiding db rec since old lib\n");
+		return;
+	}
+
+	/* Ignore return code as there is not much we can do about it. Error
+	 * log will be printed inside.
+	 */
+	dev->ops->common->db_recovery_del(dev->cdev, db_addr, db_data);
+}
+
 static int qedr_copy_cq_uresp(struct qedr_dev *dev,
-			      struct qedr_cq *cq, struct ib_udata *udata)
+			      struct qedr_cq *cq, struct ib_udata *udata,
+			      u32 db_offset)
 {
 	struct qedr_create_cq_uresp uresp;
 	int rc;
 
 	memset(&uresp, 0, sizeof(uresp));
 
-	uresp.db_offset = DB_ADDR_SHIFT(DQ_PWM_OFFSET_UCM_RDMA_CQ_CONS_32BIT);
+	uresp.db_offset = db_offset;
 	uresp.icid = cq->icid;
+	uresp.db_rec_addr = cq->q.db_rec_key;
 
 	rc = qedr_ib_copy_to_udata(udata, &uresp, sizeof(uresp));
 	if (rc)
@@ -643,10 +700,42 @@ static inline int qedr_align_cq_entries(int entries)
 	return aligned_size / QEDR_CQE_SIZE;
 }
 
+static int qedr_init_user_db_rec(struct ib_udata *udata,
+				 struct qedr_dev *dev, struct qedr_userq *q,
+				 bool requires_db_rec)
+{
+	struct qedr_ucontext *uctx =
+		rdma_udata_to_drv_context(udata, struct qedr_ucontext,
+					  ibucontext);
+
+	/* Aborting for non doorbell userqueue (SRQ) or non-supporting lib */
+	if (requires_db_rec == 0 || !uctx->db_rec)
+		return 0;
+
+	/* Allocate a page for doorbell recovery, add to mmap ) */
+	q->db_rec_data = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!q->db_rec_data) {
+		DP_ERR(dev,
+		       "get_free_page failed\n");
+		return -ENOMEM;
+	}
+
+	q->db_rec_phys = virt_to_phys(q->db_rec_data);
+	q->db_rec_key = rdma_user_mmap_entry_insert(&uctx->ibucontext, q,
+						    q->db_rec_phys,
+						    PAGE_SIZE,
+						    QEDR_USER_MMAP_PHYS_PAGE);
+	if (q->db_rec_key == RDMA_USER_MMAP_INVALID)
+		return -ENOMEM;
+
+	return 0;
+}
+
 static inline int qedr_init_user_queue(struct ib_udata *udata,
 				       struct qedr_dev *dev,
 				       struct qedr_userq *q, u64 buf_addr,
-				       size_t buf_len, int access, int dmasync,
+				       size_t buf_len, bool requires_db_rec,
+				       int access, int dmasync,
 				       int alloc_and_init)
 {
 	u32 fw_pages;
@@ -684,7 +773,8 @@ static inline int qedr_init_user_queue(struct ib_udata *udata,
 		}
 	}
 
-	return 0;
+	/* mmap the user address used to store doorbell data for recovery */
+	return qedr_init_user_db_rec(udata, dev, q, requires_db_rec);
 
 err0:
 	ib_umem_release(q->umem);
@@ -770,6 +860,7 @@ int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 	int entries = attr->cqe;
 	struct qedr_cq *cq = get_qedr_cq(ibcq);
 	int chain_entries;
+	u32 db_offset;
 	int page_cnt;
 	u64 pbl_ptr;
 	u16 icid;
@@ -789,8 +880,12 @@ int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 	chain_entries = qedr_align_cq_entries(entries);
 	chain_entries = min_t(int, chain_entries, QEDR_MAX_CQES);
 
+	/* calc db offset. user will add DPI base, kernel will add db addr */
+	db_offset = DB_ADDR_SHIFT(DQ_PWM_OFFSET_UCM_RDMA_CQ_CONS_32BIT);
+
 	if (udata) {
-		if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) {
+		if (ib_copy_from_udata(&ureq, udata, min(sizeof(ureq),
+							 udata->inlen))) {
 			DP_ERR(dev,
 			       "create cq: problem copying data from user space\n");
 			goto err0;
@@ -805,8 +900,9 @@ int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		cq->cq_type = QEDR_CQ_TYPE_USER;
 
 		rc = qedr_init_user_queue(udata, dev, &cq->q, ureq.addr,
-					  ureq.len, IB_ACCESS_LOCAL_WRITE, 1,
-					  1);
+					  ureq.len, true,
+					  IB_ACCESS_LOCAL_WRITE,
+					  1, 1);
 		if (rc)
 			goto err0;
 
@@ -814,6 +910,7 @@ int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		page_cnt = cq->q.pbl_info.num_pbes;
 
 		cq->ibcq.cqe = chain_entries;
+		cq->q.db_addr = ctx->dpi_addr + db_offset;
 	} else {
 		cq->cq_type = QEDR_CQ_TYPE_KERNEL;
 
@@ -844,14 +941,21 @@ int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 	spin_lock_init(&cq->cq_lock);
 
 	if (udata) {
-		rc = qedr_copy_cq_uresp(dev, cq, udata);
+		rc = qedr_copy_cq_uresp(dev, cq, udata, db_offset);
+		if (rc)
+			goto err3;
+
+		rc = qedr_db_recovery_add(dev, cq->q.db_addr,
+					  &cq->q.db_rec_data->db_data,
+					  DB_REC_WIDTH_64B,
+					  DB_REC_USER);
 		if (rc)
 			goto err3;
+
 	} else {
 		/* Generate doorbell address. */
-		cq->db_addr = dev->db_addr +
-		    DB_ADDR_SHIFT(DQ_PWM_OFFSET_UCM_RDMA_CQ_CONS_32BIT);
 		cq->db.data.icid = cq->icid;
+		cq->db_addr = dev->db_addr + db_offset;
 		cq->db.data.params = DB_AGG_CMD_SET <<
 		    RDMA_PWM_VAL32_DATA_AGG_CMD_SHIFT;
 
@@ -861,6 +965,11 @@ int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		cq->latest_cqe = NULL;
 		consume_cqe(cq);
 		cq->cq_cons = qed_chain_get_cons_idx_u32(&cq->pbl);
+
+		rc = qedr_db_recovery_add(dev, cq->db_addr, &cq->db.data,
+					  DB_REC_WIDTH_64B, DB_REC_KERNEL);
+		if (rc)
+			goto err3;
 	}
 
 	DP_DEBUG(dev, QEDR_MSG_CQ,
@@ -879,8 +988,18 @@ int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 	else
 		dev->ops->common->chain_free(dev->cdev, &cq->pbl);
 err1:
-	if (udata)
+	if (udata) {
 		ib_umem_release(cq->q.umem);
+		if (cq->q.db_rec_data) {
+			qedr_db_recovery_del(dev, cq->q.db_addr,
+					     &cq->q.db_rec_data->db_data);
+			if (cq->q.db_rec_key == RDMA_USER_MMAP_INVALID)
+				free_page((unsigned long)cq->q.db_rec_data);
+			/* o/w will be freed by ib_uverbs on context free */
+		}
+	} else {
+		qedr_db_recovery_del(dev, cq->db_addr, &cq->db.data);
+	}
 err0:
 	return -EINVAL;
 }
@@ -911,8 +1030,10 @@ void qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 	cq->destroyed = 1;
 
 	/* GSIs CQs are handled by driver, so they don't exist in the FW */
-	if (cq->cq_type == QEDR_CQ_TYPE_GSI)
+	if (cq->cq_type == QEDR_CQ_TYPE_GSI) {
+		qedr_db_recovery_del(dev, cq->db_addr, &cq->db.data);
 		return;
+	}
 
 	iparams.icid = cq->icid;
 	dev->ops->rdma_destroy_cq(dev->rdma_ctx, &iparams, &oparams);
@@ -921,6 +1042,12 @@ void qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 	if (udata) {
 		qedr_free_pbl(dev, &cq->q.pbl_info, cq->q.pbl_tbl);
 		ib_umem_release(cq->q.umem);
+
+		if (cq->q.db_rec_data)
+			qedr_db_recovery_del(dev, cq->q.db_addr,
+					     &cq->q.db_rec_data->db_data);
+	} else {
+		qedr_db_recovery_del(dev, cq->db_addr, &cq->db.data);
 	}
 
 	/* We don't want the IRQ handler to handle a non-existing CQ so we
@@ -1085,8 +1212,8 @@ static int qedr_copy_srq_uresp(struct qedr_dev *dev,
 }
 
 static void qedr_copy_rq_uresp(struct qedr_dev *dev,
-			       struct qedr_create_qp_uresp *uresp,
-			       struct qedr_qp *qp)
+			      struct qedr_create_qp_uresp *uresp,
+			      struct qedr_qp *qp)
 {
 	/* iWARP requires two doorbells per RQ. */
 	if (rdma_protocol_iwarp(&dev->ibdev, 1)) {
@@ -1099,6 +1226,7 @@ static void qedr_copy_rq_uresp(struct qedr_dev *dev,
 	}
 
 	uresp->rq_icid = qp->icid;
+	uresp->rq_db_rec_addr = qp->urq.db_rec_key;
 }
 
 static void qedr_copy_sq_uresp(struct qedr_dev *dev,
@@ -1112,22 +1240,24 @@ static void qedr_copy_sq_uresp(struct qedr_dev *dev,
 		uresp->sq_icid = qp->icid;
 	else
 		uresp->sq_icid = qp->icid + 1;
+
+	uresp->sq_db_rec_addr = qp->usq.db_rec_key;
 }
 
 static int qedr_copy_qp_uresp(struct qedr_dev *dev,
-			      struct qedr_qp *qp, struct ib_udata *udata)
+			      struct qedr_qp *qp, struct ib_udata *udata,
+			      struct qedr_create_qp_uresp *uresp)
 {
-	struct qedr_create_qp_uresp uresp;
 	int rc;
 
-	memset(&uresp, 0, sizeof(uresp));
-	qedr_copy_sq_uresp(dev, &uresp, qp);
-	qedr_copy_rq_uresp(dev, &uresp, qp);
+	memset(uresp, 0, sizeof(*uresp));
+	qedr_copy_sq_uresp(dev, uresp, qp);
+	qedr_copy_rq_uresp(dev, uresp, qp);
 
-	uresp.atomic_supported = dev->atomic_cap != IB_ATOMIC_NONE;
-	uresp.qp_id = qp->qp_id;
+	uresp->atomic_supported = dev->atomic_cap != IB_ATOMIC_NONE;
+	uresp->qp_id = qp->qp_id;
 
-	rc = qedr_ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+	rc = qedr_ib_copy_to_udata(udata, uresp, sizeof(*uresp));
 	if (rc)
 		DP_ERR(dev,
 		       "create qp: failed a copy to user space with qp icid=0x%x.\n",
@@ -1171,16 +1301,35 @@ static void qedr_set_common_qp_params(struct qedr_dev *dev,
 		 qp->sq.max_sges, qp->sq_cq->icid);
 }
 
-static void qedr_set_roce_db_info(struct qedr_dev *dev, struct qedr_qp *qp)
+static int qedr_set_roce_db_info(struct qedr_dev *dev, struct qedr_qp *qp)
 {
+	int rc;
+
 	qp->sq.db = dev->db_addr +
 		    DB_ADDR_SHIFT(DQ_PWM_OFFSET_XCM_RDMA_SQ_PROD);
 	qp->sq.db_data.data.icid = qp->icid + 1;
+	rc = qedr_db_recovery_add(dev, qp->sq.db,
+				  &qp->sq.db_data,
+				  DB_REC_WIDTH_32B,
+				  DB_REC_KERNEL);
+	if (rc)
+		return rc;
+
 	if (!qp->srq) {
 		qp->rq.db = dev->db_addr +
 			    DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_ROCE_RQ_PROD);
 		qp->rq.db_data.data.icid = qp->icid;
+
+		rc = qedr_db_recovery_add(dev, qp->rq.db,
+					  &qp->rq.db_data,
+					  DB_REC_WIDTH_32B,
+					  DB_REC_KERNEL);
+		if (rc)
+			qedr_db_recovery_del(dev, qp->sq.db,
+					     &qp->sq.db_data);
 	}
+
+	return rc;
 }
 
 static int qedr_check_srq_params(struct qedr_dev *dev,
@@ -1234,7 +1383,7 @@ static int qedr_init_srq_user_params(struct ib_udata *udata,
 	int rc;
 
 	rc = qedr_init_user_queue(udata, srq->dev, &srq->usrq, ureq->srq_addr,
-				  ureq->srq_len, access, dmasync, 1);
+				  ureq->srq_len, false, access, dmasync, 1);
 	if (rc)
 		return rc;
 
@@ -1330,7 +1479,8 @@ int qedr_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init_attr,
 	hw_srq->max_sges = init_attr->attr.max_sge;
 
 	if (udata) {
-		if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) {
+		if (ib_copy_from_udata(&ureq, udata, min(sizeof(ureq),
+							 udata->inlen))) {
 			DP_ERR(dev,
 			       "create srq: problem copying data from user space\n");
 			goto err0;
@@ -1526,6 +1676,14 @@ static void qedr_cleanup_user(struct qedr_dev *dev, struct qedr_qp *qp)
 
 	ib_umem_release(qp->urq.umem);
 	qp->urq.umem = NULL;
+
+	if (qp->usq.db_rec_data)
+		qedr_db_recovery_del(dev, qp->usq.db_addr,
+				     &qp->usq.db_rec_data->db_data);
+
+	if (qp->urq.db_rec_data)
+		qedr_db_recovery_del(dev, qp->urq.db_addr,
+				     &qp->urq.db_rec_data->db_data);
 }
 
 static int qedr_create_user_qp(struct qedr_dev *dev,
@@ -1537,12 +1695,14 @@ static int qedr_create_user_qp(struct qedr_dev *dev,
 	struct qed_rdma_create_qp_in_params in_params;
 	struct qed_rdma_create_qp_out_params out_params;
 	struct qedr_pd *pd = get_qedr_pd(ibpd);
+	struct qedr_create_qp_uresp uresp;
+	struct qedr_ucontext *ctx = NULL;
 	struct qedr_create_qp_ureq ureq;
 	int alloc_and_init = rdma_protocol_roce(&dev->ibdev, 1);
 	int rc = -EINVAL;
 
 	memset(&ureq, 0, sizeof(ureq));
-	rc = ib_copy_from_udata(&ureq, udata, sizeof(ureq));
+	rc = ib_copy_from_udata(&ureq, udata, min(sizeof(ureq), udata->inlen));
 	if (rc) {
 		DP_ERR(dev, "Problem copying data from user space\n");
 		return rc;
@@ -1550,14 +1710,16 @@ static int qedr_create_user_qp(struct qedr_dev *dev,
 
 	/* SQ - read access only (0), dma sync not required (0) */
 	rc = qedr_init_user_queue(udata, dev, &qp->usq, ureq.sq_addr,
-				  ureq.sq_len, 0, 0, alloc_and_init);
+				  ureq.sq_len, true, 0, 0,
+				  alloc_and_init);
 	if (rc)
 		return rc;
 
 	if (!qp->srq) {
 		/* RQ - read access only (0), dma sync not required (0) */
 		rc = qedr_init_user_queue(udata, dev, &qp->urq, ureq.rq_addr,
-					  ureq.rq_len, 0, 0, alloc_and_init);
+					  ureq.rq_len, true,
+					  0, 0, alloc_and_init);
 		if (rc)
 			return rc;
 	}
@@ -1587,13 +1749,31 @@ static int qedr_create_user_qp(struct qedr_dev *dev,
 	qp->qp_id = out_params.qp_id;
 	qp->icid = out_params.icid;
 
-	rc = qedr_copy_qp_uresp(dev, qp, udata);
+	rc = qedr_copy_qp_uresp(dev, qp, udata, &uresp);
 	if (rc)
 		goto err;
 
+	/* db offset was calculated in copy_qp_uresp, now set in the user q */
+	ctx = pd->uctx;
+	qp->usq.db_addr = ctx->dpi_addr + uresp.sq_db_offset;
+	qp->urq.db_addr = ctx->dpi_addr + uresp.rq_db_offset;
+
+	rc = qedr_db_recovery_add(dev, qp->usq.db_addr,
+				  &qp->usq.db_rec_data->db_data,
+				  DB_REC_WIDTH_32B,
+				  DB_REC_USER);
+	if (rc)
+		goto err;
+
+	rc = qedr_db_recovery_add(dev, qp->urq.db_addr,
+				  &qp->urq.db_rec_data->db_data,
+				  DB_REC_WIDTH_32B,
+				  DB_REC_USER);
+	if (rc)
+		goto err;
 	qedr_qp_user_print(dev, qp);
 
-	return 0;
+	return rc;
 err:
 	rc = dev->ops->rdma_destroy_qp(dev->rdma_ctx, qp->qed_qp);
 	if (rc)
@@ -1604,12 +1784,21 @@ static int qedr_create_user_qp(struct qedr_dev *dev,
 	return rc;
 }
 
-static void qedr_set_iwarp_db_info(struct qedr_dev *dev, struct qedr_qp *qp)
+static int qedr_set_iwarp_db_info(struct qedr_dev *dev, struct qedr_qp *qp)
 {
+	int rc;
+
 	qp->sq.db = dev->db_addr +
 	    DB_ADDR_SHIFT(DQ_PWM_OFFSET_XCM_RDMA_SQ_PROD);
 	qp->sq.db_data.data.icid = qp->icid;
 
+	rc = qedr_db_recovery_add(dev, qp->sq.db,
+				  &qp->sq.db_data,
+				  DB_REC_WIDTH_32B,
+				  DB_REC_KERNEL);
+	if (rc)
+		return rc;
+
 	qp->rq.db = dev->db_addr +
 		    DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_IWARP_RQ_PROD);
 	qp->rq.db_data.data.icid = qp->icid;
@@ -1617,6 +1806,13 @@ static void qedr_set_iwarp_db_info(struct qedr_dev *dev, struct qedr_qp *qp)
 			   DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_FLAGS);
 	qp->rq.iwarp_db2_data.data.icid = qp->icid;
 	qp->rq.iwarp_db2_data.data.value = DQ_TCM_IWARP_POST_RQ_CF_CMD;
+
+	rc = qedr_db_recovery_add(dev, qp->rq.db,
+				  &qp->rq.db_data,
+				  DB_REC_WIDTH_32B,
+				  DB_REC_KERNEL);
+
+	return rc;
 }
 
 static int
@@ -1664,8 +1860,7 @@ qedr_roce_create_kernel_qp(struct qedr_dev *dev,
 	qp->qp_id = out_params.qp_id;
 	qp->icid = out_params.icid;
 
-	qedr_set_roce_db_info(dev, qp);
-	return rc;
+	return qedr_set_roce_db_info(dev, qp);
 }
 
 static int
@@ -1723,8 +1918,7 @@ qedr_iwarp_create_kernel_qp(struct qedr_dev *dev,
 	qp->qp_id = out_params.qp_id;
 	qp->icid = out_params.icid;
 
-	qedr_set_iwarp_db_info(dev, qp);
-	return rc;
+	return qedr_set_iwarp_db_info(dev, qp);
 
 err:
 	dev->ops->rdma_destroy_qp(dev->rdma_ctx, qp->qed_qp);
@@ -1739,6 +1933,15 @@ static void qedr_cleanup_kernel(struct qedr_dev *dev, struct qedr_qp *qp)
 
 	dev->ops->common->chain_free(dev->cdev, &qp->rq.pbl);
 	kfree(qp->rqe_wr_id);
+
+	/* GSI qp is not registered to db mechanism so no need to delete */
+	if (qp->qp_type == IB_QPT_GSI)
+		return;
+
+	qedr_db_recovery_del(dev, qp->sq.db, &qp->sq.db_data);
+
+	if (!qp->srq)
+		qedr_db_recovery_del(dev, qp->rq.db, &qp->rq.db_data);
 }
 
 static int qedr_create_kernel_qp(struct qedr_dev *dev,
diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h
index 724d0983e972..21f0ee356bef 100644
--- a/drivers/infiniband/hw/qedr/verbs.h
+++ b/drivers/infiniband/hw/qedr/verbs.h
@@ -47,6 +47,8 @@ int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata);
 void qedr_dealloc_ucontext(struct ib_ucontext *uctx);
 
 int qedr_mmap(struct ib_ucontext *ucontext, struct vm_area_struct *vma);
+void qedr_mmap_free(u64 address, u64 length, u8 mmap_flag);
+
 int qedr_alloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 void qedr_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 
diff --git a/include/uapi/rdma/qedr-abi.h b/include/uapi/rdma/qedr-abi.h
index 7a10b3a325fa..c022ee26089b 100644
--- a/include/uapi/rdma/qedr-abi.h
+++ b/include/uapi/rdma/qedr-abi.h
@@ -38,6 +38,15 @@
 #define QEDR_ABI_VERSION		(8)
 
 /* user kernel communication data structures. */
+enum qedr_alloc_ucontext_flags {
+	QEDR_ALLOC_UCTX_RESERVED	= 1 << 0,
+	QEDR_ALLOC_UCTX_DB_REC		= 1 << 1
+};
+
+struct qedr_alloc_ucontext_req {
+	__u32 context_flags;
+	__u32 reserved;
+};
 
 struct qedr_alloc_ucontext_resp {
 	__aligned_u64 db_pa;
@@ -74,6 +83,7 @@ struct qedr_create_cq_uresp {
 	__u32 db_offset;
 	__u16 icid;
 	__u16 reserved;
+	__aligned_u64 db_rec_addr;
 };
 
 struct qedr_create_qp_ureq {
@@ -109,6 +119,13 @@ struct qedr_create_qp_uresp {
 
 	__u32 rq_db2_offset;
 	__u32 reserved;
+
+	/* address of SQ doorbell recovery user entry */
+	__aligned_u64 sq_db_rec_addr;
+
+	/* address of RQ doorbell recovery user entry */
+	__aligned_u64 rq_db_rec_addr;
+
 };
 
 struct qedr_create_srq_ureq {
@@ -128,4 +145,12 @@ struct qedr_create_srq_uresp {
 	__u32 reserved1;
 };
 
+/* doorbell recovery entry allocated and populated by userspace doorbelling
+ * entities and mapped to kernel. Kernel uses this to register doorbell
+ * information with doorbell drop recovery mechanism.
+ */
+struct qedr_user_db_rec {
+	__aligned_u64 db_data; /* doorbell data */
+};
+
 #endif /* __QEDR_USER_H__ */
-- 
2.14.5


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox