Linux RDMA and InfiniBand development

Linux RDMA and InfiniBand development
 help / color / mirror / Atom feed

* [PATCH] i40iw: Convert page_size to encoded value
From: Henry Orosco @ 2016-11-10  3:25 UTC (permalink / raw)
  To: dledford-H+wXaHxf7aLQT0dZR+AlfA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	e1000-rdma-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Henry Orosco

Passed in page_size was used as encoded value for writing
the WQE and passed in value was usually 4096. This was
working out since bit 0 was 0 and implies 4KB pages,
but would not work for other page sizes.

Signed-off-by: Mustafa Ismail <mustafa.ismail-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Henry Orosco <henry.orosco-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
---
 drivers/infiniband/hw/i40iw/i40iw_ctrl.c | 12 +++++++++---
 drivers/infiniband/hw/i40iw/i40iw_type.h |  5 +++++
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c
index cd71444..bdb4421 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c
@@ -2613,7 +2613,9 @@ static enum i40iw_status_code i40iw_sc_alloc_stag(
 	u64 *wqe;
 	struct i40iw_sc_cqp *cqp;
 	u64 header;
+	enum i40iw_page_size page_size;
 
+	page_size = (info->page_size == 0x200000) ? I40IW_PAGE_SIZE_2M : I40IW_PAGE_SIZE_4K;
 	cqp = dev->cqp;
 	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
 	if (!wqe)
@@ -2633,7 +2635,7 @@ static enum i40iw_status_code i40iw_sc_alloc_stag(
 		 LS_64(1, I40IW_CQPSQ_STAG_MR) |
 		 LS_64(info->access_rights, I40IW_CQPSQ_STAG_ARIGHTS) |
 		 LS_64(info->chunk_size, I40IW_CQPSQ_STAG_LPBLSIZE) |
-		 LS_64(info->page_size, I40IW_CQPSQ_STAG_HPAGESIZE) |
+		 LS_64(page_size, I40IW_CQPSQ_STAG_HPAGESIZE) |
 		 LS_64(info->remote_access, I40IW_CQPSQ_STAG_REMACCENABLED) |
 		 LS_64(info->use_hmc_fcn_index, I40IW_CQPSQ_STAG_USEHMCFNIDX) |
 		 LS_64(info->use_pf_rid, I40IW_CQPSQ_STAG_USEPFRID) |
@@ -2669,7 +2671,9 @@ static enum i40iw_status_code i40iw_sc_mr_reg_non_shared(
 	u32 pble_obj_cnt;
 	bool remote_access;
 	u8 addr_type;
+	enum i40iw_page_size page_size;
 
+	page_size = (info->page_size == 0x200000) ? I40IW_PAGE_SIZE_2M : I40IW_PAGE_SIZE_4K;
 	if (info->access_rights & (I40IW_ACCESS_FLAGS_REMOTEREAD_ONLY |
 				   I40IW_ACCESS_FLAGS_REMOTEWRITE_ONLY))
 		remote_access = true;
@@ -2712,7 +2716,7 @@ static enum i40iw_status_code i40iw_sc_mr_reg_non_shared(
 	header = LS_64(I40IW_CQP_OP_REG_MR, I40IW_CQPSQ_OPCODE) |
 		 LS_64(1, I40IW_CQPSQ_STAG_MR) |
 		 LS_64(info->chunk_size, I40IW_CQPSQ_STAG_LPBLSIZE) |
-		 LS_64(info->page_size, I40IW_CQPSQ_STAG_HPAGESIZE) |
+		 LS_64(page_size, I40IW_CQPSQ_STAG_HPAGESIZE) |
 		 LS_64(info->access_rights, I40IW_CQPSQ_STAG_ARIGHTS) |
 		 LS_64(remote_access, I40IW_CQPSQ_STAG_REMACCENABLED) |
 		 LS_64(addr_type, I40IW_CQPSQ_STAG_VABASEDTO) |
@@ -2927,7 +2931,9 @@ enum i40iw_status_code i40iw_sc_mr_fast_register(
 	u64 temp, header;
 	u64 *wqe;
 	u32 wqe_idx;
+	enum i40iw_page_size page_size;
 
+	page_size = (info->page_size == 0x200000) ? I40IW_PAGE_SIZE_2M : I40IW_PAGE_SIZE_4K;
 	wqe = i40iw_qp_get_next_send_wqe(&qp->qp_uk, &wqe_idx, I40IW_QP_WQE_MIN_SIZE,
 					 0, info->wr_id);
 	if (!wqe)
@@ -2954,7 +2960,7 @@ enum i40iw_status_code i40iw_sc_mr_fast_register(
 		 LS_64(info->stag_idx, I40IWQPSQ_STAGINDEX) |
 		 LS_64(I40IWQP_OP_FAST_REGISTER, I40IWQPSQ_OPCODE) |
 		 LS_64(info->chunk_size, I40IWQPSQ_LPBLSIZE) |
-		 LS_64(info->page_size, I40IWQPSQ_HPAGESIZE) |
+		 LS_64(page_size, I40IWQPSQ_HPAGESIZE) |
 		 LS_64(info->access_rights, I40IWQPSQ_STAGRIGHTS) |
 		 LS_64(info->addr_type, I40IWQPSQ_VABASEDTO) |
 		 LS_64(info->read_fence, I40IWQPSQ_READFENCE) |
diff --git a/drivers/infiniband/hw/i40iw/i40iw_type.h b/drivers/infiniband/hw/i40iw/i40iw_type.h
index f60f0e2..1c58ba6 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_type.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_type.h
@@ -74,6 +74,11 @@ struct i40iw_cq_shadow_area {
 struct i40iw_priv_cq_ops;
 struct i40iw_hmc_ops;
 
+enum i40iw_page_size {
+	I40IW_PAGE_SIZE_4K = 0,
+	I40IW_PAGE_SIZE_2M
+};
+
 enum i40iw_resource_indicator_type {
 	I40IW_RSRC_INDICATOR_TYPE_ADAPTER = 0,
 	I40IW_RSRC_INDICATOR_TYPE_CQ,
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH] i40iw: Use vector when creating CQs
From: Henry Orosco @ 2016-11-10  3:24 UTC (permalink / raw)
  To: dledford-H+wXaHxf7aLQT0dZR+AlfA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	e1000-rdma-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Henry Orosco

Assign each CEQ vector to a different CPU when possible, then
when creating a CQ, use the vector for the CEQ id. This
allows completion work to be distributed over multiple cores.

Signed-off-by: Mustafa Ismail <mustafa.ismail-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Henry Orosco <henry.orosco-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
---
 drivers/infiniband/hw/i40iw/i40iw_main.c  | 8 +++++++-
 drivers/infiniband/hw/i40iw/i40iw_verbs.c | 5 +++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c
index 078d784..154a9d4 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_main.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_main.c
@@ -270,6 +270,7 @@ static void i40iw_disable_irq(struct i40iw_sc_dev *dev,
 		i40iw_wr32(dev->hw, I40E_PFINT_DYN_CTLN(msix_vec->idx - 1), 0);
 	else
 		i40iw_wr32(dev->hw, I40E_VFINT_DYN_CTLN1(msix_vec->idx - 1), 0);
+	irq_set_affinity_hint(msix_vec->irq, NULL);
 	free_irq(msix_vec->irq, dev_id);
 }
 
@@ -688,6 +689,7 @@ static enum i40iw_status_code i40iw_configure_ceq_vector(struct i40iw_device *iw
 							 struct i40iw_msix_vector *msix_vec)
 {
 	enum i40iw_status_code status;
+	cpumask_t mask;
 
 	if (iwdev->msix_shared && !ceq_id) {
 		tasklet_init(&iwdev->dpc_tasklet, i40iw_dpc, (unsigned long)iwdev);
@@ -697,12 +699,15 @@ static enum i40iw_status_code i40iw_configure_ceq_vector(struct i40iw_device *iw
 		status = request_irq(msix_vec->irq, i40iw_ceq_handler, 0, "CEQ", iwceq);
 	}
 
+	cpumask_clear(&mask);
+	cpumask_set_cpu(msix_vec->cpu_affinity, &mask);
+	irq_set_affinity_hint(msix_vec->irq, &mask);
+
 	if (status) {
 		i40iw_pr_err("ceq irq config fail\n");
 		return I40IW_ERR_CONFIG;
 	}
 	msix_vec->ceq_id = ceq_id;
-	msix_vec->cpu_affinity = 0;
 
 	return 0;
 }
@@ -1384,6 +1389,7 @@ static enum i40iw_status_code i40iw_save_msix_info(struct i40iw_device *iwdev,
 	for (i = 0, ceq_idx = 0; i < iwdev->msix_count; i++, iw_qvinfo++) {
 		iwdev->iw_msixtbl[i].idx = ldev->msix_entries[i].entry;
 		iwdev->iw_msixtbl[i].irq = ldev->msix_entries[i].vector;
+		iwdev->iw_msixtbl[i].cpu_affinity = ceq_idx;
 		if (i == 0) {
 			iw_qvinfo->aeq_idx = 0;
 			if (iwdev->msix_shared)
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index 6329c97..2dbbf67 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -1137,7 +1137,8 @@ static struct ib_cq *i40iw_create_cq(struct ib_device *ibdev,
 	ukinfo->cq_id = cq_num;
 	iwcq->ibcq.cqe = info.cq_uk_init_info.cq_size;
 	info.ceqe_mask = 0;
-	info.ceq_id = 0;
+	if (attr->comp_vector < iwdev->ceqs_count)
+		info.ceq_id = attr->comp_vector;
 	info.ceq_id_valid = true;
 	info.ceqe_mask = 1;
 	info.type = I40IW_CQ_TYPE_IWARP;
@@ -2621,7 +2622,7 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev
 	    (1ull << IB_USER_VERBS_CMD_POST_RECV) |
 	    (1ull << IB_USER_VERBS_CMD_POST_SEND);
 	iwibdev->ibdev.phys_port_cnt = 1;
-	iwibdev->ibdev.num_comp_vectors = 1;
+	iwibdev->ibdev.num_comp_vectors = iwdev->ceqs_count;
 	iwibdev->ibdev.dma_device = &pcidev->dev;
 	iwibdev->ibdev.dev.parent = &pcidev->dev;
 	iwibdev->ibdev.query_port = i40iw_query_port;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* Re: wireshark's RPC-over-RDMA dissector
From: Chuck Lever @ 2016-11-09 22:42 UTC (permalink / raw)
  To: Parav Pandit; +Cc: Linux NFS Mailing List, List Linux RDMA Mailing
In-Reply-To: <CAG53R5WQRCD32iEefGoaRqggAT1hyv_FSpCAN3c3Z7sVspzx1Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>


> On Nov 9, 2016, at 11:41 AM, Parav Pandit <pandit.parav-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> 
> Hi Chuck,
> 
> Just FYI.
> I have committed few fixes in wireshark trunk for RoCE and IB
> dissectors which will in general benefit other ULPs as well (primary
> for statefulness of ULPs) last month.
> 
> I have few patches pending in my sandbox in area of RoCE and for other
> ULP that I will push in coming days, likely next week.
> I am currently actively testing them and have made steady progress so far.
> 
> I will try to find sometime to review them next week.
> 
> Regards,
> Parav Pandit

Stateful dissection might be helpful for associating RPC-over-RDMA
headers with RDMA Reads and Writes on the same QP. Thanks Parav,
I'll watch for your work.


> On Wed, Nov 9, 2016 at 9:35 PM, Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org> wrote:
>> Hi-
>> 
>> Thanks to Yan Berman, for a couple of years now we've had a basic
>> RPC-over-RDMA dissector in wireshark that can be used with ibdump
>> captures. There have been some bugs noted, but no-one has had the
>> cycles to dig in and address.
>> 
>> Recently Tom Haynes helped me set up my own wireshark build so I
>> could help address some of the known issues.
>> 
>> http://git.linux-nfs.org/?p=cel/wireshark.git;a=shortlog;h=refs/heads/rpc-rdma-fixes
>> 
>> Posting here for review before I take the next steps to push these
>> to the wireshark community. Constructive critique and other
>> suggestions are welcome.
>> 
>> The fixes so far focus on dissecting transport headers correctly.
>> There continue to be significant open issues with the dissector:
>>        • There does not appear to be any support for dissecting
>>          RPC-over-RDMA on iWARP or RoCE
>>        • The NFS dissector does not handle portions of the XDR
>>          stream that were transmitted via RDMA Read/Write
>>        • RPC messages conveyed via RDMA_NOMSG are not recognized
>>          or dissected
>>        • There is no association between RDMA Reads and Writes
>>          and the RPC-over-RDMA message they go with
>>        • A CREQ / CREP pair are needed to identify which QP
>>          numbers are used for RPC-over-RDMA traffic
>>        • With TCP, the dissector fully outdents the RPC and NFS
>>          dissection results; but with RDMA, the dissector places
>>          the results in the tree under the Infiniband header
>>        • Not enough error detection in the dissector
>> 
>> --
>> Chuck Lever


--
Chuck Lever



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH rdma-next 2/4] IB/core: Support rate limit for packet pacing
From: Bodong Wang @ 2016-11-09 21:00 UTC (permalink / raw)
  To: Hefty, Sean, Leon Romanovsky,
	dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <1828884A29C6694DAF28B7E6B8A82373AB0A7F70-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>

On 11/9/2016 11:27 AM, Hefty, Sean wrote:
>>   enum ib_qp_state {
>> @@ -1151,6 +1152,7 @@ struct ib_qp_attr {
>>   	u8			rnr_retry;
>>   	u8			alt_port_num;
>>   	u8			alt_timeout;
>> +	u32			rate_limit;
>>   };
> We already have ib_qp_attr::ib_ah_attr::static_rate, and that accounts for both the primary and alternate paths.  We should not add a conflicting rate_limit field.  Either use static_rate as defined by the spec, or replace/update it, with corresponding changes to how it is used in conjunction with SM data.

They are different features. Static rate has a limitation on how many 
different speeds we could get, while packet pacing(rate limit) allows 
customer to set any number between the min and max range.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH net 2/2] qed: Correct rdma params configuration
From: Yuval Mintz @ 2016-11-09 20:48 UTC (permalink / raw)
  To: davem-fT/PcQaiUtIeIZ0/mPfg9Q, netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	dledford-H+wXaHxf7aLQT0dZR+AlfA,
	Ram.Amrani-YGCgFSpz5w/QT0dZR+AlfA, Yuval Mintz
In-Reply-To: <1478724524-16940-1-git-send-email-Yuval.Mintz-YGCgFSpz5w/QT0dZR+AlfA@public.gmane.org>

From: Ram Amrani <Ram.Amrani-YGCgFSpz5w/QT0dZR+AlfA@public.gmane.org>

Previous fix has broken RoCE support as the rdma_pf_params are now
being set into the parameters only after the params are alrady assigned
into the hw-function.

Fixes: 0189efb8f4f8 ("qed*: Fix Kconfig dependencies with INFINIBAND_QEDR")
Signed-off-by: Ram Amrani <Ram.Amrani-YGCgFSpz5w/QT0dZR+AlfA@public.gmane.org>
Signed-off-by: Yuval Mintz <Yuval.Mintz-YGCgFSpz5w/QT0dZR+AlfA@public.gmane.org>
---
 drivers/net/ethernet/qlogic/qed/qed_main.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index c418360..333c744 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -839,20 +839,19 @@ static void qed_update_pf_params(struct qed_dev *cdev,
 {
 	int i;
 
+	if (IS_ENABLED(CONFIG_QED_RDMA)) {
+		params->rdma_pf_params.num_qps = QED_ROCE_QPS;
+		params->rdma_pf_params.min_dpis = QED_ROCE_DPIS;
+		/* divide by 3 the MRs to avoid MF ILT overflow */
+		params->rdma_pf_params.num_mrs = RDMA_MAX_TIDS;
+		params->rdma_pf_params.gl_pi = QED_ROCE_PROTOCOL_INDEX;
+	}
+
 	for (i = 0; i < cdev->num_hwfns; i++) {
 		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
 
 		p_hwfn->pf_params = *params;
 	}
-
-	if (!IS_ENABLED(CONFIG_QED_RDMA))
-		return;
-
-	params->rdma_pf_params.num_qps = QED_ROCE_QPS;
-	params->rdma_pf_params.min_dpis = QED_ROCE_DPIS;
-	/* divide by 3 the MRs to avoid MF ILT overflow */
-	params->rdma_pf_params.num_mrs = RDMA_MAX_TIDS;
-	params->rdma_pf_params.gl_pi = QED_ROCE_PROTOCOL_INDEX;
 }
 
 static int qed_slowpath_start(struct qed_dev *cdev,
-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH net 1/2] qed: configure ll2 RoCE v1/v2 flavor correctly
From: Yuval Mintz @ 2016-11-09 20:48 UTC (permalink / raw)
  To: davem-fT/PcQaiUtIeIZ0/mPfg9Q, netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	dledford-H+wXaHxf7aLQT0dZR+AlfA,
	Ram.Amrani-YGCgFSpz5w/QT0dZR+AlfA, Yuval Mintz
In-Reply-To: <1478724524-16940-1-git-send-email-Yuval.Mintz-YGCgFSpz5w/QT0dZR+AlfA@public.gmane.org>

From: Ram Amrani <Ram.Amrani-YGCgFSpz5w/QT0dZR+AlfA@public.gmane.org>

Currently RoCE v2 won't operate with RDMA CM due to missing setting of
the roce-flavour in the ll2 configuration.
This patch properly sets the flavour, and deletes incorrect HSI
that doesn't [yet] exist.

Fixes: abd49676c707 ("qed: Add RoCE ll2 & GSI support")
Signed-off-by: Ram Amrani <Ram.Amrani-YGCgFSpz5w/QT0dZR+AlfA@public.gmane.org>
Signed-off-by: Yuval Mintz <Yuval.Mintz-YGCgFSpz5w/QT0dZR+AlfA@public.gmane.org>
---
 drivers/net/ethernet/qlogic/qed/qed_hsi.h | 3 ---
 drivers/net/ethernet/qlogic/qed/qed_ll2.c | 1 +
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index 72eee29..2777d5b 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -727,9 +727,6 @@ struct core_tx_bd_flags {
 #define CORE_TX_BD_FLAGS_L4_PROTOCOL_SHIFT	6
 #define CORE_TX_BD_FLAGS_L4_PSEUDO_CSUM_MODE_MASK	0x1
 #define CORE_TX_BD_FLAGS_L4_PSEUDO_CSUM_MODE_SHIFT 7
-#define CORE_TX_BD_FLAGS_ROCE_FLAV_MASK		0x1
-#define CORE_TX_BD_FLAGS_ROCE_FLAV_SHIFT	12
-
 };
 
 struct core_tx_bd {
diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index 63e1a1b..f95385c 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -1119,6 +1119,7 @@ static void qed_ll2_prepare_tx_packet_set_bd(struct qed_hwfn *p_hwfn,
 	start_bd->bd_flags.as_bitfield |= CORE_TX_BD_FLAGS_START_BD_MASK <<
 	    CORE_TX_BD_FLAGS_START_BD_SHIFT;
 	SET_FIELD(start_bd->bitfield0, CORE_TX_BD_NBDS, num_of_bds);
+	SET_FIELD(start_bd->bitfield0, CORE_TX_BD_ROCE_FLAV, type);
 	DMA_REGPAIR_LE(start_bd->addr, first_frag);
 	start_bd->nbytes = cpu_to_le16(first_frag_len);
 
-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH net 0/2] qed: Fix RoCE infrastructure
From: Yuval Mintz @ 2016-11-09 20:48 UTC (permalink / raw)
  To: davem-fT/PcQaiUtIeIZ0/mPfg9Q, netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	dledford-H+wXaHxf7aLQT0dZR+AlfA,
	Ram.Amrani-YGCgFSpz5w/QT0dZR+AlfA, Yuval Mintz

This series fixes 2 basic issues with RoCE support,
one handles a missing configuration in the initial infrastructure
support while the other is a regression introduced by one of the
initial fix submissions.

Dave,

I've built this series against 'net'  [specifically on top of
cdb26d3387f0 ("net: bgmac: fix reversed checks for clock control flag")],
but I'm not entirely convinced whether that's the right place for them -
On one-hand, they're RoCE-specific fixes, but they're also infrastructure
fixes that are wholly contained inside qed.
Would like to hear your [and Doug's] preference on whether we should
target such patches to net or linux-rdma.

Thanks,
Yuval

Ram Amrani (2):
  qed: configure ll2 RoCE v1/v2 flavor correctly
  qed: correct rdma params configuration

 drivers/net/ethernet/qlogic/qed/qed_hsi.h  |  3 ---
 drivers/net/ethernet/qlogic/qed/qed_ll2.c  |  1 +
 drivers/net/ethernet/qlogic/qed/qed_main.c | 17 ++++++++---------
 3 files changed, 9 insertions(+), 12 deletions(-)

-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH v1 14/14] xprtrdma: Update documenting comment
From: Chuck Lever @ 2016-11-09 19:06 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161109184735.15007.96507.stgit-FYjufvaPoItvLzlybtyyYzGyq/o6K9yX@public.gmane.org>

Clean up: If reset fails, FRMRs are no longer abandoned, rather
they are released immediately. Update the comment to reflect this.

Fixes: 2ffc871a574d ('xprtrdma: Release orphaned MRs immediately')
Signed-off-by: Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
---
 net/sunrpc/xprtrdma/frwr_ops.c |    4 ----
 1 file changed, 4 deletions(-)

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 900dc40..47bed53 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -171,10 +171,6 @@
 }
 
 /* Reset of a single FRMR. Generate a fresh rkey by replacing the MR.
- *
- * There's no recovery if this fails. The FRMR is abandoned, but
- * remains in rb_all. It will be cleaned up when the transport is
- * destroyed.
  */
 static void
 frwr_op_recover_mr(struct rpcrdma_mw *mw)

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH v1 13/14] xprtrdma: Refactor FRMR invalidation
From: Chuck Lever @ 2016-11-09 19:06 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161109184735.15007.96507.stgit-FYjufvaPoItvLzlybtyyYzGyq/o6K9yX@public.gmane.org>

Clean up: After some recent updates, clarifications can be made to
the FRMR invalidation logic.

- Both the remote and local invalidation case mark the frmr INVALID,
  so make that a common path.

- Manage the WR list more "tastefully" by replacing the conditional
  that discriminates between the list head and ->next pointers.

- Use mw->mw_handle in all cases, since that has the same value as
  f->fr_mr->rkey, and is already in cache.

Signed-off-by: Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
---
 net/sunrpc/xprtrdma/frwr_ops.c |   57 +++++++++++++++-------------------------
 1 file changed, 21 insertions(+), 36 deletions(-)

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index e99bf61..900dc40 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -457,26 +457,6 @@
 	return -ENOTCONN;
 }
 
-static struct ib_send_wr *
-__frwr_prepare_linv_wr(struct rpcrdma_mw *mw)
-{
-	struct rpcrdma_frmr *f = &mw->frmr;
-	struct ib_send_wr *invalidate_wr;
-
-	dprintk("RPC:       %s: invalidating frmr %p\n", __func__, f);
-
-	f->fr_state = FRMR_IS_INVALID;
-	invalidate_wr = &f->fr_invwr;
-
-	memset(invalidate_wr, 0, sizeof(*invalidate_wr));
-	f->fr_cqe.done = frwr_wc_localinv;
-	invalidate_wr->wr_cqe = &f->fr_cqe;
-	invalidate_wr->opcode = IB_WR_LOCAL_INV;
-	invalidate_wr->ex.invalidate_rkey = f->fr_mr->rkey;
-
-	return invalidate_wr;
-}
-
 /* Invalidate all memory regions that were registered for "req".
  *
  * Sleeps until it is safe for the host CPU to access the
@@ -487,7 +467,7 @@
 static void
 frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
-	struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr;
+	struct ib_send_wr *first, **prev, *last, *bad_wr;
 	struct rpcrdma_rep *rep = req->rl_reply;
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	struct rpcrdma_mw *mw, *tmp;
@@ -503,23 +483,28 @@
 	 */
 	f = NULL;
 	count = 0;
-	invalidate_wrs = pos = prev = NULL;
+	prev = &first;
 	list_for_each_entry(mw, &req->rl_registered, mw_list) {
+		mw->frmr.fr_state = FRMR_IS_INVALID;
+
 		if ((rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) &&
-		    (mw->mw_handle == rep->rr_inv_rkey)) {
-			mw->frmr.fr_state = FRMR_IS_INVALID;
+		    (mw->mw_handle == rep->rr_inv_rkey))
 			continue;
-		}
 
-		pos = __frwr_prepare_linv_wr(mw);
+		f = &mw->frmr;
+		dprintk("RPC:       %s: invalidating frmr %p\n",
+			__func__, f);
+
+		f->fr_cqe.done = frwr_wc_localinv;
+		last = &f->fr_invwr;
+		memset(last, 0, sizeof(*last));
+		last->wr_cqe = &f->fr_cqe;
+		last->opcode = IB_WR_LOCAL_INV;
+		last->ex.invalidate_rkey = mw->mw_handle;
 		count++;
 
-		if (!invalidate_wrs)
-			invalidate_wrs = pos;
-		else
-			prev->next = pos;
-		prev = pos;
-		f = &mw->frmr;
+		*prev = last;
+		prev = &last->next;
 	}
 	if (!f)
 		goto unmap;
@@ -528,7 +513,7 @@
 	 * last WR in the chain completes, all WRs in the chain
 	 * are complete.
 	 */
-	f->fr_invwr.send_flags = IB_SEND_SIGNALED;
+	last->send_flags = IB_SEND_SIGNALED;
 	f->fr_cqe.done = frwr_wc_localinv_wake;
 	reinit_completion(&f->fr_linv_done);
 
@@ -543,7 +528,7 @@
 	 * unless ri_id->qp is a valid pointer.
 	 */
 	r_xprt->rx_stats.local_inv_needed++;
-	rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr);
+	rc = ib_post_send(ia->ri_id->qp, first, &bad_wr);
 	if (rc)
 		goto reset_mrs;
 
@@ -554,7 +539,7 @@
 	 */
 unmap:
 	list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
-		dprintk("RPC:       %s: unmapping frmr %p\n",
+		dprintk("RPC:       %s: DMA unmapping frmr %p\n",
 			__func__, &mw->frmr);
 		list_del_init(&mw->mw_list);
 		ib_dma_unmap_sg(ia->ri_device,
@@ -572,7 +557,7 @@
 	 */
 	list_for_each_entry(mw, &req->rl_registered, mw_list) {
 		f = &mw->frmr;
-		if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) {
+		if (mw->mw_handle == bad_wr->ex.invalidate_rkey) {
 			__frwr_reset_mr(ia, mw);
 			bad_wr = bad_wr->next;
 		}

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH v1 12/14] xprtrdma: Simplify synopsis of rpcrdma_ep_connect()
From: Chuck Lever @ 2016-11-09 19:06 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161109184735.15007.96507.stgit-FYjufvaPoItvLzlybtyyYzGyq/o6K9yX@public.gmane.org>

Clean up: Make the rpcrdma_xprt visible in the whole function.
The caller has it already, so eliminate the container_of noise, and
just pass it in explicitly.

Signed-off-by: Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
---
 net/sunrpc/xprtrdma/transport.c |    2 +-
 net/sunrpc/xprtrdma/verbs.c     |   17 +++++++----------
 net/sunrpc/xprtrdma/xprt_rdma.h |    2 +-
 3 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 534c178..349903b 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -259,7 +259,7 @@
 
 	dprintk("RPC:       %s: %sconnect\n", __func__,
 			r_xprt->rx_ep.rep_connected != 0 ? "re" : "");
-	rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia);
+	rc = rpcrdma_ep_connect(r_xprt);
 	if (rc)
 		xprt_wake_pending_tasks(xprt, rc);
 
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 11d0774..901c3ab 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -634,26 +634,25 @@ static void rpcrdma_destroy_id(struct rdma_cm_id *id)
 	ib_free_cq(ep->rep_attr.send_cq);
 }
 
-/*
- * Connect unconnected endpoint.
- */
 int
-rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+rpcrdma_ep_connect(struct rpcrdma_xprt *r_xprt)
 {
+	struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	struct rdma_cm_id *id, *old;
 	int rc = 0;
 	int retry_count = 0;
 
 	if (ep->rep_connected != 0) {
-		struct rpcrdma_xprt *xprt;
+		struct sockaddr *sap;
+
 retry:
 		dprintk("RPC:       %s: reconnecting...\n", __func__);
 
 		rpcrdma_ep_disconnect(ep, ia);
 
-		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
-		id = rpcrdma_create_id(xprt, ia,
-				(struct sockaddr *)&xprt->rx_data.addr);
+		sap = (struct sockaddr *)&r_xprt->rx_data.addr;
+		id = rpcrdma_create_id(r_xprt, ia, sap);
 		if (IS_ERR(id)) {
 			rc = -EHOSTUNREACH;
 			goto out;
@@ -735,12 +734,10 @@ static void rpcrdma_destroy_id(struct rdma_cm_id *id)
 		}
 		rc = ep->rep_connected;
 	} else {
-		struct rpcrdma_xprt *r_xprt;
 		unsigned int extras;
 
 		dprintk("RPC:       %s: connected\n", __func__);
 
-		r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
 		extras = r_xprt->rx_buf.rb_bc_srv_max_requests;
 
 		if (extras) {
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index e35efd4..74760d9 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -489,7 +489,7 @@ struct rpcrdma_xprt {
 int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
 				struct rpcrdma_create_data_internal *);
 void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
-int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+int rpcrdma_ep_connect(struct rpcrdma_xprt *r_xprt);
 void rpcrdma_conn_func(struct rpcrdma_ep *ep);
 void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
 

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH v1 11/14] xprtrdma: Relocate connection helper functions
From: Chuck Lever @ 2016-11-09 19:06 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161109184735.15007.96507.stgit-FYjufvaPoItvLzlybtyyYzGyq/o6K9yX@public.gmane.org>

Clean up: Disentangle connection helpers from RPC-over-RDMA reply
decoding functions.

Signed-off-by: Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
---
 net/sunrpc/xprtrdma/rpc_rdma.c  |   34 ----------------------------------
 net/sunrpc/xprtrdma/transport.c |   28 ++++++++++++++++++++++++++++
 net/sunrpc/xprtrdma/xprt_rdma.h |   10 +++-------
 3 files changed, 31 insertions(+), 41 deletions(-)

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 01f5cbc..c52e0f2 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -906,28 +906,6 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
 	return fixup_copy_count;
 }
 
-void
-rpcrdma_connect_worker(struct work_struct *work)
-{
-	struct rpcrdma_ep *ep =
-		container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
-	struct rpcrdma_xprt *r_xprt =
-		container_of(ep, struct rpcrdma_xprt, rx_ep);
-	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
-
-	spin_lock_bh(&xprt->transport_lock);
-	if (++xprt->connect_cookie == 0)	/* maintain a reserved value */
-		++xprt->connect_cookie;
-	if (ep->rep_connected > 0) {
-		if (!xprt_test_and_set_connected(xprt))
-			xprt_wake_pending_tasks(xprt, 0);
-	} else {
-		if (xprt_test_and_clear_connected(xprt))
-			xprt_wake_pending_tasks(xprt, -ENOTCONN);
-	}
-	spin_unlock_bh(&xprt->transport_lock);
-}
-
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
 /* By convention, backchannel calls arrive via rdma_msg type
  * messages, and never populate the chunk lists. This makes
@@ -959,18 +937,6 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
 }
 #endif	/* CONFIG_SUNRPC_BACKCHANNEL */
 
-/*
- * This function is called when an async event is posted to
- * the connection which changes the connection state. All it
- * does at this point is mark the connection up/down, the rpc
- * timers do the rest.
- */
-void
-rpcrdma_conn_func(struct rpcrdma_ep *ep)
-{
-	schedule_delayed_work(&ep->rep_connect_worker, 0);
-}
-
 /* Process received RPC/RDMA messages.
  *
  * Errors must result in the RPC task either being awakened, or
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 545d3fc..534c178 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -219,6 +219,34 @@
 		}
 }
 
+void
+rpcrdma_conn_func(struct rpcrdma_ep *ep)
+{
+	schedule_delayed_work(&ep->rep_connect_worker, 0);
+}
+
+void
+rpcrdma_connect_worker(struct work_struct *work)
+{
+	struct rpcrdma_ep *ep =
+		container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
+	struct rpcrdma_xprt *r_xprt =
+		container_of(ep, struct rpcrdma_xprt, rx_ep);
+	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+
+	spin_lock_bh(&xprt->transport_lock);
+	if (++xprt->connect_cookie == 0)	/* maintain a reserved value */
+		++xprt->connect_cookie;
+	if (ep->rep_connected > 0) {
+		if (!xprt_test_and_set_connected(xprt))
+			xprt_wake_pending_tasks(xprt, 0);
+	} else {
+		if (xprt_test_and_clear_connected(xprt))
+			xprt_wake_pending_tasks(xprt, -ENOTCONN);
+	}
+	spin_unlock_bh(&xprt->transport_lock);
+}
+
 static void
 xprt_rdma_connect_worker(struct work_struct *work)
 {
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index b8424fa..e35efd4 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -490,6 +490,7 @@ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
 				struct rpcrdma_create_data_internal *);
 void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
 int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+void rpcrdma_conn_func(struct rpcrdma_ep *ep);
 void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
 
 int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
@@ -549,13 +550,6 @@ struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(size_t, enum dma_data_direction,
 }
 
 /*
- * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
- */
-void rpcrdma_connect_worker(struct work_struct *);
-void rpcrdma_conn_func(struct rpcrdma_ep *);
-void rpcrdma_reply_handler(struct work_struct *);
-
-/*
  * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
  */
 
@@ -572,12 +566,14 @@ bool rpcrdma_prepare_send_sges(struct rpcrdma_ia *, struct rpcrdma_req *,
 void rpcrdma_unmap_sges(struct rpcrdma_ia *, struct rpcrdma_req *);
 int rpcrdma_marshal_req(struct rpc_rqst *);
 void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *);
+void rpcrdma_reply_handler(struct work_struct *work);
 
 /* RPC/RDMA module init - xprtrdma/transport.c
  */
 extern unsigned int xprt_rdma_max_inline_read;
 void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap);
 void xprt_rdma_free_addresses(struct rpc_xprt *xprt);
+void rpcrdma_connect_worker(struct work_struct *work);
 void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq);
 int xprt_rdma_init(void);
 void xprt_rdma_cleanup(void);

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH v1 10/14] xprtrdma: Update dprintk in rpcrdma_count_chunks
From: Chuck Lever @ 2016-11-09 19:06 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161109184735.15007.96507.stgit-FYjufvaPoItvLzlybtyyYzGyq/o6K9yX@public.gmane.org>

Clean up: offset and handle should be zero-filled, just like in the
chunk encoders.

Signed-off-by: Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
---
 net/sunrpc/xprtrdma/rpc_rdma.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index d987c2d..01f5cbc 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -786,7 +786,7 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
 		ifdebug(FACILITY) {
 			u64 off;
 			xdr_decode_hyper((__be32 *)&seg->rs_offset, &off);
-			dprintk("RPC:       %s: chunk %d@0x%llx:0x%x\n",
+			dprintk("RPC:       %s: chunk %d@0x%016llx:0x%08x\n",
 				__func__,
 				be32_to_cpu(seg->rs_length),
 				(unsigned long long)off,

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH v1 09/14] xprtrdma: Shorten QP access error message
From: Chuck Lever @ 2016-11-09 19:06 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161109184735.15007.96507.stgit-FYjufvaPoItvLzlybtyyYzGyq/o6K9yX@public.gmane.org>

Clean up: The convention for this type of warning message is not to
show the function name or "RPC:       ".

Signed-off-by: Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
---
 net/sunrpc/xprtrdma/verbs.c |    6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 9f66fd8..11d0774 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -103,9 +103,9 @@
 {
 	struct rpcrdma_ep *ep = context;
 
-	pr_err("RPC:       %s: %s on device %s ep %p\n",
-	       __func__, ib_event_msg(event->event),
-		event->device->name, context);
+	pr_err("rpcrdma: %s on device %s ep %p\n",
+	       ib_event_msg(event->event), event->device->name, context);
+
 	if (ep->rep_connected == 1) {
 		ep->rep_connected = -EIO;
 		rpcrdma_conn_func(ep);

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH v1 08/14] xprtrdma: Squelch "max send, max recv" messages at connect time
From: Chuck Lever @ 2016-11-09 19:05 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161109184735.15007.96507.stgit-FYjufvaPoItvLzlybtyyYzGyq/o6K9yX@public.gmane.org>

Clean up: This message was intended to be a dprintk, as it is on the
server-side.

Fixes: 87cfb9a0c85c ('xprtrdma: Client-side support for ...')
Signed-off-by: Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
---
 net/sunrpc/xprtrdma/verbs.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index cbb1885..9f66fd8 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -223,8 +223,8 @@
 		cdata->inline_rsize = rsize;
 	if (wsize < cdata->inline_wsize)
 		cdata->inline_wsize = wsize;
-	pr_info("rpcrdma: max send %u, max recv %u\n",
-		cdata->inline_wsize, cdata->inline_rsize);
+	dprintk("RPC:       %s: max send %u, max recv %u\n",
+		__func__, cdata->inline_wsize, cdata->inline_rsize);
 	rpcrdma_set_max_header_sizes(r_xprt);
 }
 

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH v1 07/14] xprtrdma: Avoid calls to ro_unmap_safe()
From: Chuck Lever @ 2016-11-09 19:05 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161109184735.15007.96507.stgit-FYjufvaPoItvLzlybtyyYzGyq/o6K9yX@public.gmane.org>

Micro-optimization: Most of the time, calls to ro_unmap_safe are
expensive no-ops. Call only when there is work to do.

Signed-off-by: Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
---
 net/sunrpc/xprtrdma/transport.c |    6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index ed5e285..545d3fc 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -621,7 +621,8 @@
 
 	dprintk("RPC:       %s: called on 0x%p\n", __func__, req->rl_reply);
 
-	ia->ri_ops->ro_unmap_safe(r_xprt, req, !RPC_IS_ASYNC(task));
+	if (unlikely(!list_empty(&req->rl_registered)))
+		ia->ri_ops->ro_unmap_safe(r_xprt, req, !RPC_IS_ASYNC(task));
 	rpcrdma_unmap_sges(ia, req);
 	rpcrdma_buffer_put(req);
 }
@@ -657,7 +658,8 @@
 	int rc = 0;
 
 	/* On retransmit, remove any previously registered chunks */
-	r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
+	if (unlikely(!list_empty(&req->rl_registered)))
+		r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
 
 	rc = rpcrdma_marshal_req(rqst);
 	if (rc < 0)

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* Re: [PATCH] net/mlx4_en: Fix bpf_prog_add ref_cnt in mlx4
From: Daniel Borkmann @ 2016-11-09 19:05 UTC (permalink / raw)
  To: Brenden Blanco
  Cc: Zhiyi Sun, Tariq Toukan, Yishai Hadas,
	netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	alexei.starovoitov-Re5JQEeQqe8AvxtiuMwx3w
In-Reply-To: <20161109170615.GB10428-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

On 11/09/2016 06:06 PM, Brenden Blanco wrote:
> On Wed, Nov 09, 2016 at 10:57:32AM +0100, Daniel Borkmann wrote:
>> On 11/09/2016 10:45 AM, Zhiyi Sun wrote:
>>> On Wed, Nov 09, 2016 at 10:05:31AM +0100, Daniel Borkmann wrote:
>>>> On 11/09/2016 08:35 AM, Zhiyi Sun wrote:
>>>>> There are rx_ring_num queues. Each queue will load xdp prog. So
>>>>> bpf_prog_add() should add rx_ring_num to ref_cnt.
>>>>>
>>>>> Signed-off-by: Zhiyi Sun <zhiyisun-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>>>>
>>>> Your analysis looks incorrect to me. Please elaborate in more detail why
>>>> you think current code is buggy ...
>>>
>>> Yes, you are correct. My patch is incorrect. It is not a bug.
>>>
>>>> Call path is dev_change_xdp_fd(), which does bpf_prog_get_type() on the
>>>> fd. This already takes a ref and only drops it in case of error. Thus
>>>> in mlx4_xdp_set(), you only need priv->rx_ring_num - 1 refs for the rest
>>>> of the rings, so that dropping refs from old_prog makes sure we release
>>>> it again. Looks correct to me (maybe a comment would have helped there).
>>>
>>> I thought mlx4's code is incorrect because in mlx5's driver, function
>>> mlx5e_xdp_set() calls a pair of bpf_prog_add/put, the number of add and
>>> put to the refs are same. I didn't notice that one "add" has been called in its
>>> calller. So, it seems that mlx5's code is incorrect, right?
>>
>> Yep, I think the two attached patches are needed.
>>
>> The other thing I noticed in mlx5e_create_rq() is that it calls
>> bpf_prog_add(rq->xdp_prog, 1) without actually checking for errors.
>
>>  From d2bd6b3cd8636716a06b0ea3b1e041e16f87cce0 Mon Sep 17 00:00:00 2001
>> Message-Id: <d2bd6b3cd8636716a06b0ea3b1e041e16f87cce0.1478685278.git.daniel-FeC+5ew28dpmcu3hnIyYJQ@public.gmane.org>
>> From: Daniel Borkmann <daniel-FeC+5ew28dpmcu3hnIyYJQ@public.gmane.org>
>> Date: Wed, 9 Nov 2016 10:31:19 +0100
>> Subject: [PATCH net-next 1/2] bpf, mlx4: fix prog refcount in mlx4_en_try_alloc_resources error path
>>
>> Commit 67f8b1dcb9ee ("net/mlx4_en: Refactor the XDP forwarding rings
>> scheme") added a bug in that the prog's reference count is not dropped
>> in the error path when mlx4_en_try_alloc_resources() is failing.
>>
>> We previously took bpf_prog_add(prog, priv->rx_ring_num - 1), that we
>> need to release again. Earlier in the call-path, dev_change_xdp_fd()
>> itself holds a ref to the prog as well, which is then released though
>> bpf_prog_put() due to the propagated error.
>>
>> Fixes: 67f8b1dcb9ee ("net/mlx4_en: Refactor the XDP forwarding rings scheme")
>> Signed-off-by: Daniel Borkmann <daniel-FeC+5ew28dpmcu3hnIyYJQ@public.gmane.org>
>> ---
>>   drivers/net/ethernet/mellanox/mlx4/en_netdev.c |  5 ++++-
>>   include/linux/bpf.h                            |  1 +
>>   kernel/bpf/syscall.c                           | 11 +++++++++++
>>   3 files changed, 16 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
>> index 0f6225c..4104aec 100644
>> --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
>> +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
>> @@ -2747,8 +2747,11 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog)
>>   	}
>>
>>   	err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof);
>> -	if (err)
>> +	if (err) {
>> +		if (prog)
>> +			bpf_prog_add_undo(prog, priv->rx_ring_num - 1);
> Why not just move the above bpf_prog_add to be below the try_alloc?
> Nobody needs those references until all of the resources have been
> allocated, and then we can remove the need for bpf_prog_add_undo.

Right, looked into this and the convention is to call mlx4_en_try_alloc_resources()
plus mlx4_en_safe_replace_resources(), which must always succeed (currently).
Seems rather complex to go this route instead; bpf_prog_add_undo() or *_sub()
[however we name it] is safe and straight forward, since we're guaranteed to
have one reference already.

>>   		goto unlock_out;
>> +	}
>>
>>   	if (priv->port_up) {
>>   		port_up = 1;
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index edcd96d..4f6a4f1 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -234,6 +234,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
>>   struct bpf_prog *bpf_prog_get(u32 ufd);
>>   struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type);
>>   struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i);
>> +void bpf_prog_add_undo(struct bpf_prog *prog, int i);
>>   struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog);
>>   void bpf_prog_put(struct bpf_prog *prog);
>>
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index 228f962..a6e4dd8 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -680,6 +680,17 @@ struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
>>   }
>>   EXPORT_SYMBOL_GPL(bpf_prog_add);
>>
>> +void bpf_prog_add_undo(struct bpf_prog *prog, int i)
>> +{
>> +	/* Only to be used for undoing previous bpf_prog_add() in some
>> +	 * error path. We still know that another entity in our call
>> +	 * path holds a reference to the program, thus atomic_sub() can
>> +	 * be safely used here!
>> +	 */
>> +	atomic_sub(i, &prog->aux->refcnt);
>> +}
>> +EXPORT_SYMBOL_GPL(bpf_prog_add_undo);
>> +
>>   struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
>>   {
>>   	return bpf_prog_add(prog, 1);
>> --
>> 1.9.3
>
>>  From f0789544432bbb89c53c3b8ac6575d48fed97786 Mon Sep 17 00:00:00 2001
>> Message-Id: <f0789544432bbb89c53c3b8ac6575d48fed97786.1478685278.git.daniel-FeC+5ew28dpmcu3hnIyYJQ@public.gmane.org>
>> In-Reply-To: <d2bd6b3cd8636716a06b0ea3b1e041e16f87cce0.1478685278.git.daniel-FeC+5ew28dpmcu3hnIyYJQ@public.gmane.org>
>> References: <d2bd6b3cd8636716a06b0ea3b1e041e16f87cce0.1478685278.git.daniel-FeC+5ew28dpmcu3hnIyYJQ@public.gmane.org>
>> From: Daniel Borkmann <daniel-FeC+5ew28dpmcu3hnIyYJQ@public.gmane.org>
>> Date: Wed, 9 Nov 2016 10:51:26 +0100
>> Subject: [PATCH net-next 2/2] bpf, mlx5: fix prog refcount in mlx5e_xdp_set
>>
>> dev_change_xdp_fd() already holds a reference, so bpf_prog_add(prog, 1)
>> is not correct as it takes one reference too much and will thus leak
>> the prog eventually. Also, bpf_prog_add() can fail and is not checked
>> for errors here.
>>
>> Fixes: 86994156c736 ("net/mlx5e: XDP fast RX drop bpf programs support")
>> Signed-off-by: Daniel Borkmann <daniel-FeC+5ew28dpmcu3hnIyYJQ@public.gmane.org>
>> ---
>>   drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 --
>>   1 file changed, 2 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
>> index ba0c774..63309dd 100644
>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
>> @@ -3121,8 +3121,6 @@ static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog)
>>
>>   	/* exchange programs */
>>   	old_prog = xchg(&priv->xdp_prog, prog);
>> -	if (prog)
>> -		bpf_prog_add(prog, 1);
> There is also another use of bpf_prog_add down below, which does not
> check the error return. Same in mlx5e_create_rq.

Yeah, saw that, too. These two unchecked bpf_prog_add() would be another
issue to fix on top of this, ohh well.

For net-next, I'll just add a __must_check to these functions, so we can
avoid such issues in future and let the compiler complain early enough
instead.

Thanks,
Daniel
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH v1 06/14] xprtrdma: Address coverity complaint about wait_for_completion()
From: Chuck Lever @ 2016-11-09 19:05 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161109184735.15007.96507.stgit-FYjufvaPoItvLzlybtyyYzGyq/o6K9yX@public.gmane.org>

> ** CID 114101:  Error handling issues  (CHECKED_RETURN)
> /net/sunrpc/xprtrdma/verbs.c: 355 in rpcrdma_create_id()

Commit 5675add36e76 ("RPC/RDMA: harden connection logic against
missing/late rdma_cm upcalls.") replaced wait_for_completion() calls
with these two call sites.

The original wait_for_completion() calls were added in the initial
commit of verbs.c, which was commit c56c65fb67d6 ("RPCRDMA: rpc rdma
verbs interface implementation"), but these returned void.

rpcrdma_create_id() is called by the RDMA connect worker, which
probably won't ever be interrupted. It is also called by
rpcrdma_ia_open which is in the synchronous mount path, and ^C is
possible there.

Add a bit of logic at those two call sites to return if the waits
return ERESTARTSYS.

Signed-off-by: Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
---
 net/sunrpc/xprtrdma/verbs.c |   17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 451f5f2..cbb1885 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -331,6 +331,7 @@ static void rpcrdma_destroy_id(struct rdma_cm_id *id)
 rpcrdma_create_id(struct rpcrdma_xprt *xprt,
 			struct rpcrdma_ia *ia, struct sockaddr *addr)
 {
+	unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1;
 	struct rdma_cm_id *id;
 	int rc;
 
@@ -352,8 +353,12 @@ static void rpcrdma_destroy_id(struct rdma_cm_id *id)
 			__func__, rc);
 		goto out;
 	}
-	wait_for_completion_interruptible_timeout(&ia->ri_done,
-				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
+	rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
+	if (rc < 0) {
+		dprintk("RPC:       %s: wait() exited: %i\n",
+			__func__, rc);
+		goto out;
+	}
 
 	/* FIXME:
 	 * Until xprtrdma supports DEVICE_REMOVAL, the provider must
@@ -376,8 +381,12 @@ static void rpcrdma_destroy_id(struct rdma_cm_id *id)
 			__func__, rc);
 		goto put;
 	}
-	wait_for_completion_interruptible_timeout(&ia->ri_done,
-				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
+	rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
+	if (rc < 0) {
+		dprintk("RPC:       %s: wait() exited: %i\n",
+			__func__, rc);
+		goto put;
+	}
 	rc = ia->ri_async_rc;
 	if (rc)
 		goto put;

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH v1 05/14] SUNRPC: Proper metric accounting when RPC is not transmitted
From: Chuck Lever @ 2016-11-09 19:05 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161109184735.15007.96507.stgit-FYjufvaPoItvLzlybtyyYzGyq/o6K9yX@public.gmane.org>

I noticed recently that during an xfstests on a krb5i mount, the
retransmit count for certain operations had gone negative, and the
backlog value became unreasonably large. I recall that Andy has
pointed this out to me in the past.

When call_refresh fails to find a valid credential for an RPC, the
RPC exits immediately without sending anything on the wire. This
leaves rq_ntrans, rq_xtime, and rq_rtt set to zero.

The solution for om_queue is to not add the to RPC's running backlog
queue total whenever rq_xtime is zero.

For om_ntrans, it's a bit more difficult. A zero rq_ntrans causes
om_ops to become larger than om_ntrans. The design of the RPC
metrics API assumes that ntrans will always be equal to or larger
than the ops count. The result is that when an RPC fails to find
credentials, the RPC operation's reported retransmit count, which is
computed in user space as the difference between ops and ntrans,
goes negative.

Ideally the kernel API should report a separate retransmit and
"exited before initial transmission" metric, so that user space can
sort out the difference properly.

To avoid kernel API changes and changes to the way rq_ntrans is used
when performing transport locking, account for untransmitted RPCs
so that om_ntrans keeps up with om_ops: always add one or more to
om_ntrans.

Signed-off-by: Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
---
 net/sunrpc/stats.c |   10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 2ecb994..caeb01a 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -157,15 +157,17 @@ void rpc_count_iostats_metrics(const struct rpc_task *task,
 	spin_lock(&op_metrics->om_lock);

 	op_metrics->om_ops++;
-	op_metrics->om_ntrans += req->rq_ntrans;
+	/* kernel API: om_ops must never become larger than om_ntrans */
+	op_metrics->om_ntrans += max(req->rq_ntrans, 1);
 	op_metrics->om_timeouts += task->tk_timeouts;

 	op_metrics->om_bytes_sent += req->rq_xmit_bytes_sent;
 	op_metrics->om_bytes_recv += req->rq_reply_bytes_recvd;

-	delta = ktime_sub(req->rq_xtime, task->tk_start);
-	op_metrics->om_queue = ktime_add(op_metrics->om_queue, delta);
-
+	if (ktime_to_ns(req->rq_xtime)) {
+		delta = ktime_sub(req->rq_xtime, task->tk_start);
+		op_metrics->om_queue = ktime_add(op_metrics->om_queue, delta);
+	}
 	op_metrics->om_rtt = ktime_add(op_metrics->om_rtt, req->rq_rtt);

 	delta = ktime_sub(now, task->tk_start);

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH v1 04/14] xprtrdma: Support for SG_GAP devices
From: Chuck Lever @ 2016-11-09 19:05 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161109184735.15007.96507.stgit-FYjufvaPoItvLzlybtyyYzGyq/o6K9yX@public.gmane.org>

Some devices (such as the Mellanox CX-4) can register, under a
single R_key, a set of memory regions that are not contiguous. When
this is done, all the segments in a Reply list, say, can then be
invalidated in a single LocalInv Work Request (or via Remote
Invalidation, which can invalidate exactly one R_key when completing
a Receive).

This means a single FastReg WR is used to register, and one or zero
LocalInv WRs can invalidate, the memory involved with RDMA transfers
on behalf of an RPC.

Signed-off-by: Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
---
 net/sunrpc/xprtrdma/frwr_ops.c  |   20 +++++++++++++-------
 net/sunrpc/xprtrdma/xprt_rdma.h |    1 +
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index adbf52c..e99bf61 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -101,7 +101,7 @@
 	struct rpcrdma_frmr *f = &r->frmr;
 	int rc;
 
-	f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG, depth);
+	f->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth);
 	if (IS_ERR(f->fr_mr))
 		goto out_mr_err;
 
@@ -157,7 +157,7 @@
 		return rc;
 	}
 
-	f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG,
+	f->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype,
 			       ia->ri_max_frmr_depth);
 	if (IS_ERR(f->fr_mr)) {
 		pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n",
@@ -210,11 +210,16 @@
 frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
 	     struct rpcrdma_create_data_internal *cdata)
 {
+	struct ib_device_attr *attrs = &ia->ri_device->attrs;
 	int depth, delta;
 
+	ia->ri_mrtype = IB_MR_TYPE_MEM_REG;
+	if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG)
+		ia->ri_mrtype = IB_MR_TYPE_SG_GAPS;
+
 	ia->ri_max_frmr_depth =
 			min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
-			      ia->ri_device->attrs.max_fast_reg_page_list_len);
+			      attrs->max_fast_reg_page_list_len);
 	dprintk("RPC:       %s: device's max FR page list len = %u\n",
 		__func__, ia->ri_max_frmr_depth);
 
@@ -241,8 +246,8 @@
 	}
 
 	ep->rep_attr.cap.max_send_wr *= depth;
-	if (ep->rep_attr.cap.max_send_wr > ia->ri_device->attrs.max_qp_wr) {
-		cdata->max_requests = ia->ri_device->attrs.max_qp_wr / depth;
+	if (ep->rep_attr.cap.max_send_wr > attrs->max_qp_wr) {
+		cdata->max_requests = attrs->max_qp_wr / depth;
 		if (!cdata->max_requests)
 			return -EINVAL;
 		ep->rep_attr.cap.max_send_wr = cdata->max_requests *
@@ -348,6 +353,7 @@
 	    int nsegs, bool writing, struct rpcrdma_mw **out)
 {
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS;
 	struct rpcrdma_mw *mw;
 	struct rpcrdma_frmr *frmr;
 	struct ib_mr *mr;
@@ -383,8 +389,8 @@
 
 		++seg;
 		++i;
-
-		/* Check for holes */
+		if (holes_ok)
+			continue;
 		if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
 		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
 			break;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index f6ae1b2..b8424fa 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -75,6 +75,7 @@ struct rpcrdma_ia {
 	unsigned int		ri_max_inline_write;
 	unsigned int		ri_max_inline_read;
 	bool			ri_reminv_expected;
+	enum ib_mr_type		ri_mrtype;
 	struct ib_qp_attr	ri_qp_attr;
 	struct ib_qp_init_attr	ri_qp_init_attr;
 };

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH v1 03/14] xprtrdma: Make FRWR send queue entry accounting more accurate
From: Chuck Lever @ 2016-11-09 19:05 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161109184735.15007.96507.stgit-FYjufvaPoItvLzlybtyyYzGyq/o6K9yX@public.gmane.org>

Verbs providers may perform house-keeping on a send queue during
each signaled send completion. It is necessary therefore for a verbs
consumer (like xprtrdma) to occasionally force a signaled send
completion if it runs unsignaled some of the time.

xprtrdma does not need signaled completions for Send or FastReg Work
Requests. So, it forces a signal about half way through the send
queue by counting the number of Send Queue Entries it consumes. It
currently does this by counting each ib_post_send as one SQE.

Commit c9918ff56dfb ("xprtrdma: Add ro_unmap_sync method for FRWR")
introduced the ability for frwr_op_unmap_sync to post more than one
WR with a single post_send. Thus the underlying assumption of one WR
per ib_post_send is no longer true.

Also, FastReg is currently never signaled. It should be signaled
once in a while to keep the accounting of consumed SQEs accurate.

While we're here, convert the CQCOUNT macros to the currently
preferred kernel coding style, which is inline functions.

Fixes: c9918ff56dfb ("xprtrdma: Add ro_unmap_sync method for FRWR")
Signed-off-by: Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
---
 net/sunrpc/xprtrdma/frwr_ops.c  |   13 ++++++++++---
 net/sunrpc/xprtrdma/verbs.c     |   10 ++--------
 net/sunrpc/xprtrdma/xprt_rdma.h |   20 ++++++++++++++++++--
 3 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 26b26be..adbf52c 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -421,7 +421,7 @@
 			 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
 			 IB_ACCESS_REMOTE_READ;
 
-	DECR_CQCOUNT(&r_xprt->rx_ep);
+	rpcrdma_set_signaled(&r_xprt->rx_ep, &reg_wr->wr);
 	rc = ib_post_send(ia->ri_id->qp, &reg_wr->wr, &bad_wr);
 	if (rc)
 		goto out_senderr;
@@ -486,7 +486,7 @@
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	struct rpcrdma_mw *mw, *tmp;
 	struct rpcrdma_frmr *f;
-	int rc;
+	int count, rc;
 
 	dprintk("RPC:       %s: req %p\n", __func__, req);
 
@@ -496,6 +496,7 @@
 	 * a single ib_post_send() call.
 	 */
 	f = NULL;
+	count = 0;
 	invalidate_wrs = pos = prev = NULL;
 	list_for_each_entry(mw, &req->rl_registered, mw_list) {
 		if ((rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) &&
@@ -505,6 +506,7 @@
 		}
 
 		pos = __frwr_prepare_linv_wr(mw);
+		count++;
 
 		if (!invalidate_wrs)
 			invalidate_wrs = pos;
@@ -523,7 +525,12 @@
 	f->fr_invwr.send_flags = IB_SEND_SIGNALED;
 	f->fr_cqe.done = frwr_wc_localinv_wake;
 	reinit_completion(&f->fr_linv_done);
-	INIT_CQCOUNT(&r_xprt->rx_ep);
+
+	/* Initialize CQ count, since there is always a signaled
+	 * WR being posted here.  The new cqcount depends on how
+	 * many SQEs are about to be consumed.
+	 */
+	rpcrdma_init_cqcount(&r_xprt->rx_ep, count);
 
 	/* Transport disconnect drains the receive CQ before it
 	 * replaces the QP. The RPC reply handler won't call us
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index ec74289..451f5f2 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -532,7 +532,7 @@ static void rpcrdma_destroy_id(struct rdma_cm_id *id)
 	ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
 	if (ep->rep_cqinit <= 2)
 		ep->rep_cqinit = 0;	/* always signal? */
-	INIT_CQCOUNT(ep);
+	rpcrdma_init_cqcount(ep, 0);
 	init_waitqueue_head(&ep->rep_connect_wait);
 	INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
 
@@ -1311,13 +1311,7 @@ struct rpcrdma_regbuf *
 	dprintk("RPC:       %s: posting %d s/g entries\n",
 		__func__, send_wr->num_sge);
 
-	if (DECR_CQCOUNT(ep) > 0)
-		send_wr->send_flags = 0;
-	else { /* Provider must take a send completion every now and then */
-		INIT_CQCOUNT(ep);
-		send_wr->send_flags = IB_SEND_SIGNALED;
-	}
-
+	rpcrdma_set_signaled(ep, send_wr);
 	rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail);
 	if (rc)
 		goto out_postsend_err;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 6e1bba3..f6ae1b2 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -95,8 +95,24 @@ struct rpcrdma_ep {
 	struct delayed_work	rep_connect_worker;
 };
 
-#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
-#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
+static inline void
+rpcrdma_init_cqcount(struct rpcrdma_ep *ep, int count)
+{
+	atomic_set(&ep->rep_cqcount, ep->rep_cqinit - count);
+}
+
+/* To update send queue accounting, provider must take a
+ * send completion every now and then.
+ */
+static inline void
+rpcrdma_set_signaled(struct rpcrdma_ep *ep, struct ib_send_wr *send_wr)
+{
+	send_wr->send_flags = 0;
+	if (unlikely(atomic_sub_return(1, &ep->rep_cqcount) <= 0)) {
+		rpcrdma_init_cqcount(ep, 0);
+		send_wr->send_flags = IB_SEND_SIGNALED;
+	}
+}
 
 /* Pre-allocate extra Work Requests for handling backward receives
  * and sends. This is a fixed value because the Work Queues are

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH v1 02/14] xprtrdma: Cap size of callback buffer resources
From: Chuck Lever @ 2016-11-09 19:05 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161109184735.15007.96507.stgit-FYjufvaPoItvLzlybtyyYzGyq/o6K9yX@public.gmane.org>

When the inline threshold size is set to large values (say, 32KB)
any NFSv4.1 CB request from the server gets a reply with status
NFS4ERR_RESOURCE.

Looks like there are some upper layer assumptions about the maximum
size of a reply (for example, in process_op). Cap the size of the
NFSv4 client's reply resources at a page.

Signed-off-by: Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
---
 net/sunrpc/xprtrdma/backchannel.c |    4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 2c472e1..24fedd4 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -55,7 +55,8 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
 	if (IS_ERR(rb))
 		goto out_fail;
 	req->rl_sendbuf = rb;
-	xdr_buf_init(&rqst->rq_snd_buf, rb->rg_base, size);
+	xdr_buf_init(&rqst->rq_snd_buf, rb->rg_base,
+		     min_t(size_t, size, PAGE_SIZE));
 	rpcrdma_set_xprtdata(rqst, req);
 	return 0;
 
@@ -191,6 +192,7 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
 	size_t maxmsg;
 
 	maxmsg = min_t(unsigned int, cdata->inline_rsize, cdata->inline_wsize);
+	maxmsg = min_t(unsigned int, maxmsg, PAGE_SIZE);
 	return maxmsg - RPCRDMA_HDRLEN_MIN;
 }
 

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH v1 01/14] xprtrdma: Fix DMAR failure in frwr_op_map() after reconnect
From: Chuck Lever @ 2016-11-09 19:04 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161109184735.15007.96507.stgit-FYjufvaPoItvLzlybtyyYzGyq/o6K9yX@public.gmane.org>

When a LOCALINV WR is flushed, the frmr is marked STALE, then
frwr_op_unmap_sync DMA-unmaps the frmr's SGL. These STALE frmrs
are then recovered when frwr_op_map hunts for an INVALID frmr to
use.

All other cases that need frmr recovery leave that SGL DMA-mapped.
The FRMR recovery path unconditionally DMA-unmaps the frmr's SGL.

To avoid DMA unmapping the SGL twice for flushed LOCAL_INV WRs,
alter the recovery logic (rather than the hot frwr_op_unmap_sync
path) to distinguish among these cases. This solution also takes
care of the case where multiple LOCAL_INV WRs are issued for the
same rpcrdma_req, some complete successfully, but some are flushed.

Signed-off-by: Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
Tested-by: Vasco Steinmetz <linux-W+c8CscfqXQmp8TqCH86vg@public.gmane.org>
---
 net/sunrpc/xprtrdma/frwr_ops.c  |   37 ++++++++++++++++++++++---------------
 net/sunrpc/xprtrdma/xprt_rdma.h |    3 ++-
 2 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 2109495..26b26be 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -44,18 +44,20 @@
  * being done.
  *
  * When the underlying transport disconnects, MRs are left in one of
- * three states:
+ * four states:
  *
  * INVALID:	The MR was not in use before the QP entered ERROR state.
- *		(Or, the LOCAL_INV WR has not completed or flushed yet).
- *
- * STALE:	The MR was being registered or unregistered when the QP
- *		entered ERROR state, and the pending WR was flushed.
  *
  * VALID:	The MR was registered before the QP entered ERROR state.
  *
- * When frwr_op_map encounters STALE and VALID MRs, they are recovered
- * with ib_dereg_mr and then are re-initialized. Beause MR recovery
+ * FLUSHED_FR:	The MR was being registered when the QP entered ERROR
+ *		state, and the pending WR was flushed.
+ *
+ * FLUSHED_LI:	The MR was being invalidated when the QP entered ERROR
+ *		state, and the pending WR was flushed.
+ *
+ * When frwr_op_map encounters FLUSHED and VALID MRs, they are recovered
+ * with ib_dereg_mr and then are re-initialized. Because MR recovery
  * allocates fresh resources, it is deferred to a workqueue, and the
  * recovered MRs are placed back on the rb_mws list when recovery is
  * complete. frwr_op_map allocates another MR for the current RPC while
@@ -177,12 +179,15 @@
 static void
 frwr_op_recover_mr(struct rpcrdma_mw *mw)
 {
+	enum rpcrdma_frmr_state state = mw->frmr.fr_state;
 	struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	int rc;
 
 	rc = __frwr_reset_mr(ia, mw);
-	ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir);
+	if (state != FRMR_FLUSHED_LI)
+		ib_dma_unmap_sg(ia->ri_device,
+				mw->mw_sg, mw->mw_nents, mw->mw_dir);
 	if (rc)
 		goto out_release;
 
@@ -262,10 +267,8 @@
 }
 
 static void
-__frwr_sendcompletion_flush(struct ib_wc *wc, struct rpcrdma_frmr *frmr,
-			    const char *wr)
+__frwr_sendcompletion_flush(struct ib_wc *wc, const char *wr)
 {
-	frmr->fr_state = FRMR_IS_STALE;
 	if (wc->status != IB_WC_WR_FLUSH_ERR)
 		pr_err("rpcrdma: %s: %s (%u/0x%x)\n",
 		       wr, ib_wc_status_msg(wc->status),
@@ -288,7 +291,8 @@
 	if (wc->status != IB_WC_SUCCESS) {
 		cqe = wc->wr_cqe;
 		frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe);
-		__frwr_sendcompletion_flush(wc, frmr, "fastreg");
+		frmr->fr_state = FRMR_FLUSHED_FR;
+		__frwr_sendcompletion_flush(wc, "fastreg");
 	}
 }
 
@@ -308,7 +312,8 @@
 	if (wc->status != IB_WC_SUCCESS) {
 		cqe = wc->wr_cqe;
 		frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe);
-		__frwr_sendcompletion_flush(wc, frmr, "localinv");
+		frmr->fr_state = FRMR_FLUSHED_LI;
+		__frwr_sendcompletion_flush(wc, "localinv");
 	}
 }
 
@@ -328,8 +333,10 @@
 	/* WARNING: Only wr_cqe and status are reliable at this point */
 	cqe = wc->wr_cqe;
 	frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe);
-	if (wc->status != IB_WC_SUCCESS)
-		__frwr_sendcompletion_flush(wc, frmr, "localinv");
+	if (wc->status != IB_WC_SUCCESS) {
+		frmr->fr_state = FRMR_FLUSHED_LI;
+		__frwr_sendcompletion_flush(wc, "localinv");
+	}
 	complete(&frmr->fr_linv_done);
 }
 
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 0d35b76..6e1bba3 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -216,7 +216,8 @@ struct rpcrdma_rep {
 enum rpcrdma_frmr_state {
 	FRMR_IS_INVALID,	/* ready to be used */
 	FRMR_IS_VALID,		/* in use */
-	FRMR_IS_STALE,		/* failed completion */
+	FRMR_FLUSHED_FR,	/* flushed FASTREG WR */
+	FRMR_FLUSHED_LI,	/* flushed LOCALINV WR */
 };
 
 struct rpcrdma_frmr {

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH v1 00/14] client-side NFS/RDMA patches proposed for v4.10
From: Chuck Lever @ 2016-11-09 19:04 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA

The following patch series makes these changes:

- Support for devices that support SG_GAP
- Several bug fixes
- A number of clean-ups and optimizations

SG_GAP devices allow the client transport implementation to register
non-contiguous memory regions using one offset and handle. Scatter
and gather from these regions are then managed entirely in hardware.

Normally a Reply chunk for a READDIR or GETACL is built of multiple
RDMA segments. Now, when our client wants to build a Reply chunk out
of the xdr_buf's head, page list, and tail, it appears to the server
as a single contiguous RDMA segment if the device supports SG_GAP.

Instead of a separate Write WR for each of several RDMA segments,
the server can then post a single Write WR to transmit the whole RPC
Reply (assuming the Reply is smaller than the server RNIC's max_sge)
and can invalidate the whole Reply chunk remotely.

SG_GAP is currently supported by mlx5 devices when using the FRWR
registration mechanism.

Available in the "nfs-rdma-for-4.10" topic branch of this git repo:

git://git.linux-nfs.org/projects/cel/cel-2.6.git

Or for browsing:

http://git.linux-nfs.org/?p=cel/cel-2.6.git;a=log;h=refs/heads/nfs-rdma-for-4.10

---

Chuck Lever (14):
      xprtrdma: Fix DMAR failure in frwr_op_map() after reconnect
      xprtrdma: Cap size of callback buffer resources
      xprtrdma: Make FRWR send queue entry accounting more accurate
      xprtrdma: Support for SG_GAP devices
      SUNRPC: Proper metric accounting when RPC is not transmitted
      xprtrdma: Address coverity complaint about wait_for_completion()
      xprtrdma: Avoid calls to ro_unmap_safe()
      xprtrdma: Squelch "max send, max recv" messages at connect time
      xprtrdma: Shorten QP access error message
      xprtrdma: Update dprintk in rpcrdma_count_chunks
      xprtrdma: Relocate connection helper functions
      xprtrdma: Simplify synopsis of rpcrdma_ep_connect()
      xprtrdma: Refactor FRMR invalidation
      xprtrdma: Update documenting comment

 net/sunrpc/stats.c                |   10 ++-
 net/sunrpc/xprtrdma/backchannel.c |    4 +
 net/sunrpc/xprtrdma/frwr_ops.c    |  131 +++++++++++++++++++------------------
 net/sunrpc/xprtrdma/rpc_rdma.c    |   36 ----------
 net/sunrpc/xprtrdma/transport.c   |   36 +++++++++-
 net/sunrpc/xprtrdma/verbs.c       |   54 ++++++++-------
 net/sunrpc/xprtrdma/xprt_rdma.h   |   36 +++++++---
 7 files changed, 161 insertions(+), 146 deletions(-)

--
Chuck Lever
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [RFC ABI V5 02/10] RDMA/core: Add support for custom types
From: Jason Gunthorpe @ 2016-11-09 18:50 UTC (permalink / raw)
  To: Hefty, Sean
  Cc: Matan Barak, Matan Barak,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Doug Ledford,
	Christoph Lameter, Liran Liss, Haggai Eran, Majd Dibbiny,
	Tal Alon, Leon Romanovsky
In-Reply-To: <1828884A29C6694DAF28B7E6B8A82373AB0A8000-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>

On Wed, Nov 09, 2016 at 06:00:48PM +0000, Hefty, Sean wrote:

> In any case, the two approaches are not exclusive.  By forcing the
> rule language into the framework, everything is forced to deal with
> it.  By leaving it out, each ioctl provider can decide if they need
> this or not.  If you want verbs to process all ioctl's using a
> single pre-validation function that operates based on these rules
> you can.  Nothing prevents that.  But ioctl providers that want
> better performance can elect for a more straightforward validation
> model.

The pre-validation is tied into the hash expansion and will hopefully
be the raw data to support a new discoverability scheme. So, making it
optional really wrecks the whole design, I think.

Also, this is really the best way to ensure that we have consistent
checking and error reporting around attributes (eg what happens if the
kernel does not support a requested attribute, or uses the wrong size,
etc) It is very important these things use the correct errnos not the
random mismatch we see today.

I'm not seeing that it is a clear peformance loss (relative to open
coding at least), the major work is the expansion to the hash table
and doing a couple size tests along the way is not hard. Matan's
revised series should be event better on this regard as I gave alot
of feedback to speed it up at plumbers.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* RE: [RFC ABI V5 02/10] RDMA/core: Add support for custom types
From: Hefty, Sean @ 2016-11-09 18:00 UTC (permalink / raw)
  To: Matan Barak
  Cc: Matan Barak, linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	Doug Ledford, Jason Gunthorpe, Christoph Lameter, Liran Liss,
	Haggai Eran, Majd Dibbiny, Tal Alon, Leon Romanovsky
In-Reply-To: <CAAKD3BDWyb10baLrDu=m_mYPB64i9OOPEPVYKtDo9zVbvMM-UA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 47059 bytes --]

> I had thought about that, but the user could initialize its part of
> the object in the function handler. It can't allocate the object as we
> need it in order to allocate an IDR entry and co. The assumption here
> is that the "unlock" stage can't fail.

This is creating a generic OO type of framework, so just add constructor/destructor functions and have all objects inherit from a base ioctl object class.

> > In fact, it would be great if we could just cleanup the list in the
> reverse order that items were created.  Maybe this requires supporting
> a pre-cleanup handler, so that the driver can pluck items out of the
> list that may need to be destroyed out of order.
> >
> 
> So that's essentially one layer of ordering. Why do you consider a
> driver iterating over all objects simpler than this model?

This problem is a verbs specific issue, and one that only involves MW .  We have reference counts that can provide the same functionality.  I want to avoid the amount of meta-data that needs to be used to describe objects.

> >> Adding an object is done in two parts.
> >> First, an object is allocated and added to IDR/fd table. Then, the
> >> command's handlers (in downstream patches) could work on this object
> >> and fill in its required details.
> >> After a successful command, ib_uverbs_uobject_enable is called and
> >> this user objects becomes ucontext visible.
> >
> > If you have a way to mark that an object is used for exclusive
> access, you may be able to use that instead of introducing a new
> variable.  (I.e. acquire the object's write lock).  I think we want to
> make an effort to minimize the size of the kernel structure needed to
> track every user space object (within reason).
> >
> 
> I didn't really follow. A command attribute states the nature of the
> locking (for example, in MODIFY_QP the QP could be exclusively locked,
> but in QUERY_QP it's only locked for reading). I don't want to really
> grab a lock, as if I were I could face a dead-lock (user-space could
> pass parameters in a colliding order), It could be solved by sorting
> the handles, but that would degrade performance without a good reasob.

I'm suggesting that the locking attribute and command be separate.  This allows the framework to acquire the proper type of lock independent of what function it will invoke.

The framework doesn't need to hold locks.  It should be able to mark access to an object.  If that access is not available, it can abort.  This pushes more complex synchronization and thread handling to user space.

> >> Removing an uboject is done by calling ib_uverbs_uobject_remove.
> >>
> >> We should make sure IDR (per-device) and list (per-ucontext) could
> >> be accessed concurrently without corrupting them.
> >>
> >> Signed-off-by: Matan Barak <matanb@mellanox.com>
> >> Signed-off-by: Haggai Eran <haggaie@mellanox.com>
> >> Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
> >> ---
> >
> > As a general comment, I do have concerns that the resulting
> generalized parsing of everything will negatively impact performance
> for operations that do have to transition into the kernel.  Not all
> devices offload all operations to user space.  Plus the resulting code
> is extremely difficult to read and non-trivial to use.  It's equivalent
> to reading C++ code that has 4 layers of inheritance with overrides to
> basic operators...
> 
> There are two parts here. I think the handlers themselves are simpler,
> easier to read and less error-prone. They contain less code
> duplications. The macro based define language explicitly declare all
> attributes, their types, size, etc.
> The model here is a bit more complex as we want to achieve both code
> resue and add/override of new types/actions/attributes.
> 
> 
> >
> > Pre and post operators per command that can do straightforward
> validation seem like a better option.
> >
> >
> 
> I think that would duplicate a lot of code and will be more
> error-prone than one infrastrucutre that automates all that work for
> you.

I think that's a toss-up.  Either you have to write the code correctly or write the rules correctly.  Reading code is straightforward, manually converting rules into code is not.

In any case, the two approaches are not exclusive.  By forcing the rule language into the framework, everything is forced to deal with it.  By leaving it out, each ioctl provider can decide if they need this or not.  If you want verbs to process all ioctl's using a single pre-validation function that operates based on these rules you can.  Nothing prevents that.  But ioctl providers that want better performance can elect for a more straightforward validation model.

> >>  drivers/infiniband/core/Makefile      |   3 +-
> >>  drivers/infiniband/core/device.c      |   1 +
> >>  drivers/infiniband/core/rdma_core.c   | 489
> >> ++++++++++++++++++++++++++++++++++
> >>  drivers/infiniband/core/rdma_core.h   |  75 ++++++
> >>  drivers/infiniband/core/uverbs.h      |   1 +
> >>  drivers/infiniband/core/uverbs_main.c |   2 +-
> >>  include/rdma/ib_verbs.h               |  28 +-
> >>  include/rdma/uverbs_ioctl.h           | 195 ++++++++++++++
> >>  8 files changed, 789 insertions(+), 5 deletions(-)
> >>  create mode 100644 drivers/infiniband/core/rdma_core.c
> >>  create mode 100644 drivers/infiniband/core/rdma_core.h
> >>  create mode 100644 include/rdma/uverbs_ioctl.h
> >>
> >> diff --git a/drivers/infiniband/core/Makefile
> >> b/drivers/infiniband/core/Makefile
> >> index edaae9f..1819623 100644
> >> --- a/drivers/infiniband/core/Makefile
> >> +++ b/drivers/infiniband/core/Makefile
> >> @@ -28,4 +28,5 @@ ib_umad-y :=                        user_mad.o
> >>
> >>  ib_ucm-y :=                  ucm.o
> >>
> >> -ib_uverbs-y :=                       uverbs_main.o uverbs_cmd.o
> >> uverbs_marshall.o
> >> +ib_uverbs-y :=                       uverbs_main.o uverbs_cmd.o
> >> uverbs_marshall.o \
> >> +                             rdma_core.o
> >> diff --git a/drivers/infiniband/core/device.c
> >> b/drivers/infiniband/core/device.c
> >> index c3b68f5..43994b1 100644
> >> --- a/drivers/infiniband/core/device.c
> >> +++ b/drivers/infiniband/core/device.c
> >> @@ -243,6 +243,7 @@ struct ib_device *ib_alloc_device(size_t size)
> >>       spin_lock_init(&device->client_data_lock);
> >>       INIT_LIST_HEAD(&device->client_data_list);
> >>       INIT_LIST_HEAD(&device->port_list);
> >> +     INIT_LIST_HEAD(&device->type_list);
> >>
> >>       return device;
> >>  }
> >> diff --git a/drivers/infiniband/core/rdma_core.c
> >> b/drivers/infiniband/core/rdma_core.c
> >> new file mode 100644
> >> index 0000000..337abc2
> >> --- /dev/null
> >> +++ b/drivers/infiniband/core/rdma_core.c
> >> @@ -0,0 +1,489 @@
> >> +/*
> >> + * Copyright (c) 2016, Mellanox Technologies inc.  All rights
> >> reserved.
> >> + *
> >> + * This software is available to you under a choice of one of two
> >> + * licenses.  You may choose to be licensed under the terms of the
> GNU
> >> + * General Public License (GPL) Version 2, available from the file
> >> + * COPYING in the main directory of this source tree, or the
> >> + * OpenIB.org BSD license below:
> >> + *
> >> + *     Redistribution and use in source and binary forms, with or
> >> + *     without modification, are permitted provided that the
> following
> >> + *     conditions are met:
> >> + *
> >> + *      - Redistributions of source code must retain the above
> >> + *        copyright notice, this list of conditions and the
> following
> >> + *        disclaimer.
> >> + *
> >> + *      - Redistributions in binary form must reproduce the above
> >> + *        copyright notice, this list of conditions and the
> following
> >> + *        disclaimer in the documentation and/or other materials
> >> + *        provided with the distribution.
> >> + *
> >> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> >> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
> OF
> >> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> >> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
> HOLDERS
> >> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
> AN
> >> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
> IN
> >> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> >> + * SOFTWARE.
> >> + */
> >> +
> >> +#include <linux/file.h>
> >> +#include <linux/anon_inodes.h>
> >> +#include <rdma/ib_verbs.h>
> >> +#include "uverbs.h"
> >> +#include "rdma_core.h"
> >> +#include <rdma/uverbs_ioctl.h>
> >> +
> >> +const struct uverbs_type *uverbs_get_type(const struct ib_device
> >> *ibdev,
> >> +                                       uint16_t type)
> >> +{
> >> +     const struct uverbs_types_group *groups = ibdev->types_group;
> >> +     const struct uverbs_types *types;
> >> +     int ret = groups->dist(&type, groups->priv);
> >> +
> >> +     if (ret >= groups->num_groups)
> >> +             return NULL;
> >> +
> >> +     types = groups->type_groups[ret];
> >> +
> >> +     if (type >= types->num_types)
> >> +             return NULL;
> >> +
> >> +     return types->types[type];
> >> +}
> >> +
> >> +static int uverbs_lock_object(struct ib_uobject *uobj,
> >> +                           enum uverbs_idr_access access)
> >> +{
> >> +     if (access == UVERBS_IDR_ACCESS_READ)
> >> +             return down_read_trylock(&uobj->usecnt) == 1 ? 0 : -
> EBUSY;
> >> +
> >> +     /* lock is either WRITE or DESTROY - should be exclusive */
> >> +     return down_write_trylock(&uobj->usecnt) == 1 ? 0 : -EBUSY;
> >
> > This function could take the lock type directly (read or write),
> versus inferring it based on some other access type.
> >
> 
> We can, but since we use these enums in the attribute specifications,
> I thought it could be more convinient.
> 
> >> +}
> >> +
> >> +static struct ib_uobject *get_uobj(int id, struct ib_ucontext
> >> *context)
> >> +{
> >> +     struct ib_uobject *uobj;
> >> +
> >> +     rcu_read_lock();
> >> +     uobj = idr_find(&context->device->idr, id);
> >> +     if (uobj && uobj->live) {
> >> +             if (uobj->context != context)
> >> +                     uobj = NULL;
> >> +     }
> >> +     rcu_read_unlock();
> >> +
> >> +     return uobj;
> >> +}
> >> +
> >> +struct ib_ucontext_lock {
> >> +     struct kref  ref;
> >> +     /* locking the uobjects_list */
> >> +     struct mutex lock;
> >> +};
> >> +
> >> +static void init_uobjects_list_lock(struct ib_ucontext_lock *lock)
> >> +{
> >> +     mutex_init(&lock->lock);
> >> +     kref_init(&lock->ref);
> >> +}
> >> +
> >> +static void release_uobjects_list_lock(struct kref *ref)
> >> +{
> >> +     struct ib_ucontext_lock *lock = container_of(ref,
> >> +                                                  struct
> ib_ucontext_lock,
> >> +                                                  ref);
> >> +
> >> +     kfree(lock);
> >> +}
> >> +
> >> +static void init_uobj(struct ib_uobject *uobj, u64 user_handle,
> >> +                   struct ib_ucontext *context)
> >> +{
> >> +     init_rwsem(&uobj->usecnt);
> >> +     uobj->user_handle = user_handle;
> >> +     uobj->context     = context;
> >> +     uobj->live        = 0;
> >> +}
> >> +
> >> +static int add_uobj(struct ib_uobject *uobj)
> >> +{
> >> +     int ret;
> >> +
> >> +     idr_preload(GFP_KERNEL);
> >> +     spin_lock(&uobj->context->device->idr_lock);
> >> +
> >> +     ret = idr_alloc(&uobj->context->device->idr, uobj, 0, 0,
> >> GFP_NOWAIT);
> >> +     if (ret >= 0)
> >> +             uobj->id = ret;
> >> +
> >> +     spin_unlock(&uobj->context->device->idr_lock);
> >> +     idr_preload_end();
> >> +
> >> +     return ret < 0 ? ret : 0;
> >> +}
> >> +
> >> +static void remove_uobj(struct ib_uobject *uobj)
> >> +{
> >> +     spin_lock(&uobj->context->device->idr_lock);
> >> +     idr_remove(&uobj->context->device->idr, uobj->id);
> >> +     spin_unlock(&uobj->context->device->idr_lock);
> >> +}
> >> +
> >> +static void put_uobj(struct ib_uobject *uobj)
> >> +{
> >> +     kfree_rcu(uobj, rcu);
> >> +}
> >> +
> >> +static struct ib_uobject *get_uobject_from_context(struct
> ib_ucontext
> >> *ucontext,
> >> +                                                const struct
> >> uverbs_type_alloc_action *type,
> >> +                                                u32 idr,
> >> +                                                enum
> uverbs_idr_access access)
> >> +{
> >> +     struct ib_uobject *uobj;
> >> +     int ret;
> >> +
> >> +     rcu_read_lock();
> >> +     uobj = get_uobj(idr, ucontext);
> >> +     if (!uobj)
> >> +             goto free;
> >> +
> >> +     if (uobj->type != type) {
> >> +             uobj = NULL;
> >> +             goto free;
> >> +     }
> >> +
> >> +     ret = uverbs_lock_object(uobj, access);
> >> +     if (ret)
> >> +             uobj = ERR_PTR(ret);
> >> +free:
> >> +     rcu_read_unlock();
> >> +     return uobj;
> >> +
> >> +     return NULL;
> >> +}
> >> +
> >> +static int ib_uverbs_uobject_add(struct ib_uobject *uobject,
> >> +                              const struct uverbs_type_alloc_action
> >> *uobject_type)
> >> +{
> >> +     uobject->type = uobject_type;
> >> +     return add_uobj(uobject);
> >> +}
> >> +
> >> +struct ib_uobject *uverbs_get_type_from_idr(const struct
> >> uverbs_type_alloc_action *type,
> >> +                                         struct ib_ucontext
> *ucontext,
> >> +                                         enum uverbs_idr_access
> access,
> >> +                                         uint32_t idr)
> >> +{
> >> +     struct ib_uobject *uobj;
> >> +     int ret;
> >> +
> >> +     if (access == UVERBS_IDR_ACCESS_NEW) {
> >> +             uobj = kmalloc(type->obj_size, GFP_KERNEL);
> >> +             if (!uobj)
> >> +                     return ERR_PTR(-ENOMEM);
> >> +
> >> +             init_uobj(uobj, 0, ucontext);
> >> +
> >> +             /* lock idr */
> >
> > Command to lock idr, but no lock is obtained.
> >
> 
> ib_uverbs_uobject_add calls add_uobj which locks the IDR.
> 
> >> +             ret = ib_uverbs_uobject_add(uobj, type);
> >> +             if (ret) {
> >> +                     kfree(uobj);
> >> +                     return ERR_PTR(ret);
> >> +             }
> >> +
> >> +     } else {
> >> +             uobj = get_uobject_from_context(ucontext, type, idr,
> >> +                                             access);
> >> +
> >> +             if (!uobj)
> >> +                     return ERR_PTR(-ENOENT);
> >> +     }
> >> +
> >> +     return uobj;
> >> +}
> >> +
> >> +struct ib_uobject *uverbs_get_type_from_fd(const struct
> >> uverbs_type_alloc_action *type,
> >> +                                        struct ib_ucontext
> *ucontext,
> >> +                                        enum uverbs_idr_access
> access,
> >> +                                        int fd)
> >> +{
> >> +     if (access == UVERBS_IDR_ACCESS_NEW) {
> >> +             int _fd;
> >> +             struct ib_uobject *uobj = NULL;
> >> +             struct file *filp;
> >> +
> >> +             _fd = get_unused_fd_flags(O_CLOEXEC);
> >> +             if (_fd < 0 || WARN_ON(type->obj_size < sizeof(struct
> >> ib_uobject)))
> >> +                     return ERR_PTR(_fd);
> >> +
> >> +             uobj = kmalloc(type->obj_size, GFP_KERNEL);
> >> +             init_uobj(uobj, 0, ucontext);
> >> +
> >> +             if (!uobj)
> >> +                     return ERR_PTR(-ENOMEM);
> >> +
> >> +             filp = anon_inode_getfile(type->fd.name, type-
> >fd.fops,
> >> +                                       uobj + 1, type->fd.flags);
> >> +             if (IS_ERR(filp)) {
> >> +                     put_unused_fd(_fd);
> >> +                     kfree(uobj);
> >> +                     return (void *)filp;
> >> +             }
> >> +
> >> +             uobj->type = type;
> >> +             uobj->id = _fd;
> >> +             uobj->object = filp;
> >> +
> >> +             return uobj;
> >> +     } else if (access == UVERBS_IDR_ACCESS_READ) {
> >> +             struct file *f = fget(fd);
> >> +             struct ib_uobject *uobject;
> >> +
> >> +             if (!f)
> >> +                     return ERR_PTR(-EBADF);
> >> +
> >> +             uobject = f->private_data - sizeof(struct ib_uobject);
> >> +             if (f->f_op != type->fd.fops ||
> >> +                 !uobject->live) {
> >> +                     fput(f);
> >> +                     return ERR_PTR(-EBADF);
> >> +             }
> >> +
> >> +             /*
> >> +              * No need to protect it with a ref count, as fget
> >> increases
> >> +              * f_count.
> >> +              */
> >> +             return uobject;
> >> +     } else {
> >> +             return ERR_PTR(-EOPNOTSUPP);
> >> +     }
> >> +}
> >> +
> >> +static void ib_uverbs_uobject_enable(struct ib_uobject *uobject)
> >> +{
> >> +     mutex_lock(&uobject->context->uobjects_lock->lock);
> >> +     list_add(&uobject->list, &uobject->context->uobjects);
> >> +     mutex_unlock(&uobject->context->uobjects_lock->lock);
> >
> > Why not just insert the object into the list on creation?
> >
> >> +     uobject->live = 1;
> >
> > See my comments above on removing the live field.
> >
> 
> Seems that the list could suffice, but I'll look into that.
> 
> >> +}
> >> +
> >> +static void ib_uverbs_uobject_remove(struct ib_uobject *uobject,
> bool
> >> lock)
> >> +{
> >> +     /*
> >> +      * Calling remove requires exclusive access, so it's not
> possible
> >> +      * another thread will use our object.
> >> +      */
> >> +     uobject->live = 0;
> >> +     uobject->type->free_fn(uobject->type, uobject);
> >> +     if (lock)
> >> +             mutex_lock(&uobject->context->uobjects_lock->lock);
> >> +     list_del(&uobject->list);
> >> +     if (lock)
> >> +             mutex_unlock(&uobject->context->uobjects_lock->lock);
> >> +     remove_uobj(uobject);
> >> +     put_uobj(uobject);
> >> +}
> >> +
> >> +static void uverbs_unlock_idr(struct ib_uobject *uobj,
> >> +                           enum uverbs_idr_access access,
> >> +                           bool success)
> >> +{
> >> +     switch (access) {
> >> +     case UVERBS_IDR_ACCESS_READ:
> >> +             up_read(&uobj->usecnt);
> >> +             break;
> >> +     case UVERBS_IDR_ACCESS_NEW:
> >> +             if (success) {
> >> +                     ib_uverbs_uobject_enable(uobj);
> >> +             } else {
> >> +                     remove_uobj(uobj);
> >> +                     put_uobj(uobj);
> >> +             }
> >> +             break;
> >> +     case UVERBS_IDR_ACCESS_WRITE:
> >> +             up_write(&uobj->usecnt);
> >> +             break;
> >> +     case UVERBS_IDR_ACCESS_DESTROY:
> >> +             if (success)
> >> +                     ib_uverbs_uobject_remove(uobj, true);
> >> +             else
> >> +                     up_write(&uobj->usecnt);
> >> +             break;
> >> +     }
> >> +}
> >> +
> >> +static void uverbs_unlock_fd(struct ib_uobject *uobj,
> >> +                          enum uverbs_idr_access access,
> >> +                          bool success)
> >> +{
> >> +     struct file *filp = uobj->object;
> >> +
> >> +     if (access == UVERBS_IDR_ACCESS_NEW) {
> >> +             if (success) {
> >> +                     kref_get(&uobj->context->ufile->ref);
> >> +                     uobj->uobjects_lock = uobj->context-
> >uobjects_lock;
> >> +                     kref_get(&uobj->uobjects_lock->ref);
> >> +                     ib_uverbs_uobject_enable(uobj);
> >> +                     fd_install(uobj->id, uobj->object);
> >
> > I don't get this.  The function is unlocking something, but there are
> calls to get krefs?
> >
> 
> Before invoking the user's callback, we're first locking all objects
> and afterwards we're unlocking them. When we need to create a new
> object, the lock becomes object creation and the unlock could become
> (assuming the user's callback succeeded) enabling this new object.
> When you add a new object (or fd in this case), we take a reference
> count to both the uverbs_file and the locking context.
> 
> >> +             } else {
> >> +                     fput(uobj->object);
> >> +                     put_unused_fd(uobj->id);
> >> +                     kfree(uobj);
> >> +             }
> >> +     } else {
> >> +             fput(filp);
> >> +     }
> >> +}
> >> +
> >> +void uverbs_unlock_object(struct ib_uobject *uobj,
> >> +                       enum uverbs_idr_access access,
> >> +                       bool success)
> >> +{
> >> +     if (uobj->type->type == UVERBS_ATTR_TYPE_IDR)
> >> +             uverbs_unlock_idr(uobj, access, success);
> >> +     else if (uobj->type->type == UVERBS_ATTR_TYPE_FD)
> >> +             uverbs_unlock_fd(uobj, access, success);
> >> +     else
> >> +             WARN_ON(true);
> >> +}
> >> +
> >> +static void ib_uverbs_remove_fd(struct ib_uobject *uobject)
> >> +{
> >> +     /*
> >> +      * user should release the uobject in the release
> >> +      * callback.
> >> +      */
> >> +     if (uobject->live) {
> >> +             uobject->live = 0;
> >> +             list_del(&uobject->list);
> >> +             uobject->type->free_fn(uobject->type, uobject);
> >> +             kref_put(&uobject->context->ufile->ref,
> >> ib_uverbs_release_file);
> >> +             uobject->context = NULL;
> >> +     }
> >> +}
> >> +
> >> +void ib_uverbs_close_fd(struct file *f)
> >> +{
> >> +     struct ib_uobject *uobject = f->private_data - sizeof(struct
> >> ib_uobject);
> >> +
> >> +     mutex_lock(&uobject->uobjects_lock->lock);
> >> +     if (uobject->live) {
> >> +             uobject->live = 0;
> >> +             list_del(&uobject->list);
> >> +             kref_put(&uobject->context->ufile->ref,
> >> ib_uverbs_release_file);
> >> +             uobject->context = NULL;
> >> +     }
> >> +     mutex_unlock(&uobject->uobjects_lock->lock);
> >> +     kref_put(&uobject->uobjects_lock->ref,
> >> release_uobjects_list_lock);
> >> +}
> >> +
> >> +void ib_uverbs_cleanup_fd(void *private_data)
> >> +{
> >> +     struct ib_uboject *uobject = private_data - sizeof(struct
> >> ib_uobject);
> >> +
> >> +     kfree(uobject);
> >> +}
> >> +
> >> +void uverbs_unlock_objects(struct uverbs_attr_array *attr_array,
> >> +                        size_t num,
> >> +                        const struct uverbs_action_spec *spec,
> >> +                        bool success)
> >> +{
> >> +     unsigned int i;
> >> +
> >> +     for (i = 0; i < num; i++) {
> >> +             struct uverbs_attr_array *attr_spec_array =
> &attr_array[i];
> >> +             const struct uverbs_attr_group_spec *group_spec =
> >> +                     spec->attr_groups[i];
> >> +             unsigned int j;
> >> +
> >> +             for (j = 0; j < attr_spec_array->num_attrs; j++) {
> >> +                     struct uverbs_attr *attr = &attr_spec_array-
> >> >attrs[j];
> >> +                     struct uverbs_attr_spec *spec = &group_spec-
> >> >attrs[j];
> >> +
> >> +                     if (!attr->valid)
> >> +                             continue;
> >> +
> >> +                     if (spec->type == UVERBS_ATTR_TYPE_IDR ||
> >> +                         spec->type == UVERBS_ATTR_TYPE_FD)
> >> +                             /*
> >> +                              * refcounts should be handled at the
> object
> >> +                              * level and not at the uobject level.
> >> +                              */
> >> +                             uverbs_unlock_object(attr-
> >obj_attr.uobject,
> >> +                                                  spec->obj.access,
> success);
> >> +             }
> >> +     }
> >> +}
> >> +
> >> +static unsigned int get_type_orders(const struct uverbs_types_group
> >> *types_group)
> >> +{
> >> +     unsigned int i;
> >> +     unsigned int max = 0;
> >> +
> >> +     for (i = 0; i < types_group->num_groups; i++) {
> >> +             unsigned int j;
> >> +             const struct uverbs_types *types = types_group-
> >> >type_groups[i];
> >> +
> >> +             for (j = 0; j < types->num_types; j++) {
> >> +                     if (!types->types[j] || !types->types[j]-
> >alloc)
> >> +                             continue;
> >> +                     if (types->types[j]->alloc->order > max)
> >> +                             max = types->types[j]->alloc->order;
> >> +             }
> >> +     }
> >> +
> >> +     return max;
> >> +}
> >> +
> >> +void ib_uverbs_uobject_type_cleanup_ucontext(struct ib_ucontext
> >> *ucontext,
> >> +                                          const struct
> uverbs_types_group
> >> *types_group)
> >> +{
> >> +     unsigned int num_orders = get_type_orders(types_group);
> >> +     unsigned int i;
> >> +
> >> +     for (i = 0; i <= num_orders; i++) {
> >> +             struct ib_uobject *obj, *next_obj;
> >> +
> >> +             /*
> >> +              * No need to take lock here, as cleanup should be
> called
> >> +              * after all commands finished executing. Newly
> executed
> >> +              * commands should fail.
> >> +              */
> >> +             mutex_lock(&ucontext->uobjects_lock->lock);
> >
> > It's really confusing to see a comment about 'no need to take lock'
> immediately followed by a call to lock.
> >
> 
> Yeah :) That was before adding the fd. I'll delete the comment.
> 
> >> +             list_for_each_entry_safe(obj, next_obj, &ucontext-
> >> >uobjects,
> >> +                                      list)
> >> +                     if (obj->type->order == i) {
> >> +                             if (obj->type->type ==
> UVERBS_ATTR_TYPE_IDR)
> >> +                                     ib_uverbs_uobject_remove(obj,
> false);
> >> +                             else
> >> +                                     ib_uverbs_remove_fd(obj);
> >> +                     }
> >> +             mutex_unlock(&ucontext->uobjects_lock->lock);
> >> +     }
> >> +     kref_put(&ucontext->uobjects_lock->ref,
> >> release_uobjects_list_lock);
> >> +}
> >> +
> >> +int ib_uverbs_uobject_type_initialize_ucontext(struct ib_ucontext
> >> *ucontext)
> >
> > Please work on the function names.  This is horrendously long and
> still doesn't help describe what it does.
> >
> 
> This just initialized the types part of the ucontext. Any suggestions?
> 
> >> +{
> >> +     ucontext->uobjects_lock = kmalloc(sizeof(*ucontext-
> >> >uobjects_lock),
> >> +                                       GFP_KERNEL);
> >> +     if (!ucontext->uobjects_lock)
> >> +             return -ENOMEM;
> >> +
> >> +     init_uobjects_list_lock(ucontext->uobjects_lock);
> >> +     INIT_LIST_HEAD(&ucontext->uobjects);
> >> +
> >> +     return 0;
> >> +}
> >> +
> >> +void ib_uverbs_uobject_type_release_ucontext(struct ib_ucontext
> >> *ucontext)
> >> +{
> >> +     kfree(ucontext->uobjects_lock);
> >> +}
> >
> > No need to wrap a call to 'free'.
> >
> 
> In order to abstract away the ucontext type data structure.
> 
> >> +
> >> diff --git a/drivers/infiniband/core/rdma_core.h
> >> b/drivers/infiniband/core/rdma_core.h
> >> new file mode 100644
> >> index 0000000..8990115
> >> --- /dev/null
> >> +++ b/drivers/infiniband/core/rdma_core.h
> >> @@ -0,0 +1,75 @@
> >> +/*
> >> + * Copyright (c) 2005 Topspin Communications.  All rights reserved.
> >> + * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
> >> + * Copyright (c) 2005-2016 Mellanox Technologies. All rights
> reserved.
> >> + * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
> >> + * Copyright (c) 2005 PathScale, Inc. All rights reserved.
> >> + *
> >> + * This software is available to you under a choice of one of two
> >> + * licenses.  You may choose to be licensed under the terms of the
> GNU
> >> + * General Public License (GPL) Version 2, available from the file
> >> + * COPYING in the main directory of this source tree, or the
> >> + * OpenIB.org BSD license below:
> >> + *
> >> + *     Redistribution and use in source and binary forms, with or
> >> + *     without modification, are permitted provided that the
> following
> >> + *     conditions are met:
> >> + *
> >> + *      - Redistributions of source code must retain the above
> >> + *        copyright notice, this list of conditions and the
> following
> >> + *        disclaimer.
> >> + *
> >> + *      - Redistributions in binary form must reproduce the above
> >> + *        copyright notice, this list of conditions and the
> following
> >> + *        disclaimer in the documentation and/or other materials
> >> + *        provided with the distribution.
> >> + *
> >> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> >> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
> OF
> >> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> >> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
> HOLDERS
> >> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
> AN
> >> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
> IN
> >> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> >> + * SOFTWARE.
> >> + */
> >> +
> >> +#ifndef UOBJECT_H
> >> +#define UOBJECT_H
> >> +
> >> +#include <linux/idr.h>
> >> +#include <rdma/uverbs_ioctl.h>
> >> +#include <rdma/ib_verbs.h>
> >> +#include <linux/mutex.h>
> >> +
> >> +const struct uverbs_type *uverbs_get_type(const struct ib_device
> >> *ibdev,
> >> +                                       uint16_t type);
> >> +struct ib_uobject *uverbs_get_type_from_idr(const struct
> >> uverbs_type_alloc_action *type,
> >> +                                         struct ib_ucontext
> *ucontext,
> >> +                                         enum uverbs_idr_access
> access,
> >> +                                         uint32_t idr);
> >> +struct ib_uobject *uverbs_get_type_from_fd(const struct
> >> uverbs_type_alloc_action *type,
> >> +                                        struct ib_ucontext
> *ucontext,
> >> +                                        enum uverbs_idr_access
> access,
> >> +                                        int fd);
> >> +void uverbs_unlock_object(struct ib_uobject *uobj,
> >> +                       enum uverbs_idr_access access,
> >> +                       bool success);
> >> +void uverbs_unlock_objects(struct uverbs_attr_array *attr_array,
> >> +                        size_t num,
> >> +                        const struct uverbs_action_spec *spec,
> >> +                        bool success);
> >> +
> >> +void ib_uverbs_uobject_type_cleanup_ucontext(struct ib_ucontext
> >> *ucontext,
> >> +                                          const struct
> uverbs_types_group
> >> *types_group);
> >> +int ib_uverbs_uobject_type_initialize_ucontext(struct ib_ucontext
> >> *ucontext);
> >> +void ib_uverbs_uobject_type_release_ucontext(struct ib_ucontext
> >> *ucontext);
> >> +void ib_uverbs_close_fd(struct file *f);
> >> +void ib_uverbs_cleanup_fd(void *private_data);
> >> +
> >> +static inline void *uverbs_fd_to_priv(struct ib_uobject *uobj)
> >> +{
> >> +     return uobj + 1;
> >> +}
> >
> > This seems like a rather useless function.
> >
> 
> Why? The user sholdn't know or care how we put our structs together.
> 
> >> +
> >> +#endif /* UIDR_H */
> >> diff --git a/drivers/infiniband/core/uverbs.h
> >> b/drivers/infiniband/core/uverbs.h
> >> index 8074705..ae7d4b8 100644
> >> --- a/drivers/infiniband/core/uverbs.h
> >> +++ b/drivers/infiniband/core/uverbs.h
> >> @@ -180,6 +180,7 @@ void idr_remove_uobj(struct ib_uobject *uobj);
> >>  struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file
> >> *uverbs_file,
> >>                                       struct ib_device *ib_dev,
> >>                                       int is_async);
> >> +void ib_uverbs_release_file(struct kref *ref);
> >>  void ib_uverbs_free_async_event_file(struct ib_uverbs_file
> >> *uverbs_file);
> >>  struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd);
> >>
> >> diff --git a/drivers/infiniband/core/uverbs_main.c
> >> b/drivers/infiniband/core/uverbs_main.c
> >> index f783723..e63357a 100644
> >> --- a/drivers/infiniband/core/uverbs_main.c
> >> +++ b/drivers/infiniband/core/uverbs_main.c
> >> @@ -341,7 +341,7 @@ static void ib_uverbs_comp_dev(struct
> >> ib_uverbs_device *dev)
> >>       complete(&dev->comp);
> >>  }
> >>
> >> -static void ib_uverbs_release_file(struct kref *ref)
> >> +void ib_uverbs_release_file(struct kref *ref)
> >>  {
> >>       struct ib_uverbs_file *file =
> >>               container_of(ref, struct ib_uverbs_file, ref);
> >> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> >> index b5d2075..7240615 100644
> >> --- a/include/rdma/ib_verbs.h
> >> +++ b/include/rdma/ib_verbs.h
> >> @@ -1329,8 +1329,11 @@ struct ib_fmr_attr {
> >>
> >>  struct ib_umem;
> >>
> >> +struct ib_ucontext_lock;
> >> +
> >>  struct ib_ucontext {
> >>       struct ib_device       *device;
> >> +     struct ib_uverbs_file  *ufile;
> >>       struct list_head        pd_list;
> >>       struct list_head        mr_list;
> >>       struct list_head        mw_list;
> >> @@ -1344,6 +1347,10 @@ struct ib_ucontext {
> >>       struct list_head        rwq_ind_tbl_list;
> >>       int                     closing;
> >>
> >> +     /* lock for uobjects list */
> >> +     struct ib_ucontext_lock *uobjects_lock;
> >> +     struct list_head        uobjects;
> >> +
> >>       struct pid             *tgid;
> >>  #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
> >>       struct rb_root      umem_tree;
> >> @@ -1363,16 +1370,28 @@ struct ib_ucontext {
> >>  #endif
> >>  };
> >>
> >> +struct uverbs_object_list;
> >> +
> >> +#define OLD_ABI_COMPAT
> >> +
> >>  struct ib_uobject {
> >>       u64                     user_handle;    /* handle given to us
> by userspace
> >> */
> >>       struct ib_ucontext     *context;        /* associated user
> context
> >> */
> >>       void                   *object;         /* containing object
> */
> >>       struct list_head        list;           /* link to context's
> list */
> >> -     int                     id;             /* index into kernel
> idr */
> >> -     struct kref             ref;
> >> -     struct rw_semaphore     mutex;          /* protects .live */
> >> +     int                     id;             /* index into kernel
> idr/fd */
> >> +#ifdef OLD_ABI_COMPAT
> >> +     struct kref             ref;
> >> +#endif
> >> +     struct rw_semaphore     usecnt;         /* protects exclusive
> >> access */
> >> +#ifdef OLD_ABI_COMPAT
> >> +     struct rw_semaphore     mutex;          /* protects .live */
> >> +#endif
> >>       struct rcu_head         rcu;            /* kfree_rcu()
> overhead */
> >>       int                     live;
> >> +
> >> +     const struct uverbs_type_alloc_action *type;
> >> +     struct ib_ucontext_lock *uobjects_lock;
> >>  };
> >>
> >>  struct ib_udata {
> >> @@ -2101,6 +2120,9 @@ struct ib_device {
> >>        */
> >>       int (*get_port_immutable)(struct ib_device *, u8, struct
> >> ib_port_immutable *);
> >>       void (*get_dev_fw_str)(struct ib_device *, char *str, size_t
> >> str_len);
> >> +     struct list_head type_list;
> >> +
> >> +     const struct uverbs_types_group *types_group;
> >>  };
> >>
> >>  struct ib_client {
> >> diff --git a/include/rdma/uverbs_ioctl.h
> b/include/rdma/uverbs_ioctl.h
> >> new file mode 100644
> >> index 0000000..2f50045
> >> --- /dev/null
> >> +++ b/include/rdma/uverbs_ioctl.h
> >> @@ -0,0 +1,195 @@
> >> +/*
> >> + * Copyright (c) 2016, Mellanox Technologies inc.  All rights
> >> reserved.
> >> + *
> >> + * This software is available to you under a choice of one of two
> >> + * licenses.  You may choose to be licensed under the terms of the
> GNU
> >> + * General Public License (GPL) Version 2, available from the file
> >> + * COPYING in the main directory of this source tree, or the
> >> + * OpenIB.org BSD license below:
> >> + *
> >> + *     Redistribution and use in source and binary forms, with or
> >> + *     without modification, are permitted provided that the
> following
> >> + *     conditions are met:
> >> + *
> >> + *      - Redistributions of source code must retain the above
> >> + *        copyright notice, this list of conditions and the
> following
> >> + *        disclaimer.
> >> + *
> >> + *      - Redistributions in binary form must reproduce the above
> >> + *        copyright notice, this list of conditions and the
> following
> >> + *        disclaimer in the documentation and/or other materials
> >> + *        provided with the distribution.
> >> + *
> >> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> >> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
> OF
> >> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> >> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
> HOLDERS
> >> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
> AN
> >> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
> IN
> >> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> >> + * SOFTWARE.
> >> + */
> >> +
> >> +#ifndef _UVERBS_IOCTL_
> >> +#define _UVERBS_IOCTL_
> >> +
> >> +#include <linux/kernel.h>
> >> +
> >> +struct uverbs_object_type;
> >> +struct ib_ucontext;
> >> +struct ib_uobject;
> >> +struct ib_device;
> >> +struct uverbs_uobject_type;
> >> +
> >> +/*
> >> + * =======================================
> >> + *   Verbs action specifications
> >> + * =======================================
> >> + */
> >
> > I intentionally used urdma (though condensed to 3 letters that I
> don't recall atm), rather than uverbs.  This will need to work with
> non-verbs devices and interfaces -- again, consider how this fits with
> the rdma cm.  Verbs has a very specific meaning, which gets lost if we
> start referring to everything as 'verbs'.  It's bad enough that we're
> stuck with 'drivers/infiniband' and 'rdma', such that 'infiniband' also
> means ethernet and rdma means nothing.
> >
> 
> IMHO - let's agree on the concept of this infrastructure. One we
> decide its scope, we could generalize it (i.e - ioctl_provider and
> ioctl_context) and implement it to rdma-cm as well.
> 
> >> +
> >> +enum uverbs_attr_type {
> >> +     UVERBS_ATTR_TYPE_PTR_IN,
> >> +     UVERBS_ATTR_TYPE_PTR_OUT,
> >> +     UVERBS_ATTR_TYPE_IDR,
> >> +     UVERBS_ATTR_TYPE_FD,
> >> +};
> >> +
> >> +enum uverbs_idr_access {
> >> +     UVERBS_IDR_ACCESS_READ,
> >> +     UVERBS_IDR_ACCESS_WRITE,
> >> +     UVERBS_IDR_ACCESS_NEW,
> >> +     UVERBS_IDR_ACCESS_DESTROY
> >> +};
> >> +
> >> +struct uverbs_attr_spec {
> >> +     u16                             len;
> >> +     enum uverbs_attr_type           type;
> >> +     struct {
> >> +             u16                     obj_type;
> >> +             u8                      access;
> >
> > Is access intended to be an enum uverbs_idr_access value?
> >
> 
> Yeah, worth using this enum. Thanks.
> 
> >> +     } obj;
> >
> > I would remove (flatten) the substructure and re-order the fields for
> better alignment.
> >
> 
> I noticed there are several places which aren't aliged. It's in my todo
> list.
> 
> >> +};
> >> +
> >> +struct uverbs_attr_group_spec {
> >> +     struct uverbs_attr_spec         *attrs;
> >> +     size_t                          num_attrs;
> >> +};
> >> +
> >> +struct uverbs_action_spec {
> >> +     const struct uverbs_attr_group_spec             **attr_groups;
> >> +     /* if > 0 -> validator, otherwise, error */
> >
> > ? not sure what this comment means
> >
> >> +     int (*dist)(__u16 *attr_id, void *priv);
> >
> > What does 'dist' stand for?
> >
> 
> dist = distribution function.
> It maps the attributes you got from the user-space to your groups. You
> could think of each group as a namespace - where its attributes (or
> types/actions) starts from zero in the sake of compactness.
> So, for example, it gets an attribute 0x8010 and maps to to "group 1"
> (provider) and attribute 0x10.
> 
> >> +     void                                            *priv;
> >> +     size_t                                          num_groups;
> >> +};
> >> +
> >> +struct uverbs_attr_array;
> >> +struct ib_uverbs_file;
> >> +
> >> +struct uverbs_action {
> >> +     struct uverbs_action_spec spec;
> >> +     void *priv;
> >> +     int (*handler)(struct ib_device *ib_dev, struct ib_uverbs_file
> >> *ufile,
> >> +                    struct uverbs_attr_array *ctx, size_t num, void
> >> *priv);
> >> +};
> >> +
> >> +struct uverbs_type_alloc_action;
> >> +typedef void (*free_type)(const struct uverbs_type_alloc_action
> >> *uobject_type,
> >> +                       struct ib_uobject *uobject);
> >> +
> >> +struct uverbs_type_alloc_action {
> >> +     enum uverbs_attr_type           type;
> >> +     int                             order;
> >
> > I think this is being used as destroy order, in which case I would
> rename it to clarify the intent.  Though I'd prefer we come up with a
> more efficient destruction mechanism than the repeated nested looping.
> >
> 
> In one of the earlier revisions I used a sorted list, which was
> efficient. I recall that Jason didn't like its complexity and
> re-thinking about that - he's right. Most of your types are "order
> number" 0 anyway. So you'll probably iterate very few objects in the
> next round (in verbs, everything but MRs and PDs).
> 
> >> +     size_t                          obj_size;
> >
> > This can be alloc_fn
> >
> >> +     free_type                       free_fn;
> >> +     struct {
> >> +             const struct file_operations    *fops;
> >> +             const char                      *name;
> >> +             int                             flags;
> >> +     } fd;
> >> +};
> >> +
> >> +struct uverbs_type_actions_group {
> >> +     size_t                                  num_actions;
> >> +     const struct uverbs_action              **actions;
> >> +};
> >> +
> >> +struct uverbs_type {
> >> +     size_t                                  num_groups;
> >> +     const struct uverbs_type_actions_group  **action_groups;
> >> +     const struct uverbs_type_alloc_action   *alloc;
> >> +     int (*dist)(__u16 *action_id, void *priv);
> >> +     void                                    *priv;
> >> +};
> >> +
> >> +struct uverbs_types {
> >> +     size_t                                  num_types;
> >> +     const struct uverbs_type                **types;
> >> +};
> >> +
> >> +struct uverbs_types_group {
> >> +     const struct uverbs_types               **type_groups;
> >> +     size_t                                  num_groups;
> >> +     int (*dist)(__u16 *type_id, void *priv);
> >> +     void                                    *priv;
> >> +};
> >> +
> >> +/* =================================================
> >> + *              Parsing infrastructure
> >> + * =================================================
> >> + */
> >> +
> >> +struct uverbs_ptr_attr {
> >> +     void    * __user ptr;
> >> +     __u16           len;
> >> +};
> >> +
> >> +struct uverbs_fd_attr {
> >> +     int             fd;
> >> +};
> >> +
> >> +struct uverbs_uobj_attr {
> >> +     /*  idr handle */
> >> +     __u32   idr;
> >> +};
> >> +
> >> +struct uverbs_obj_attr {
> >> +     /* pointer to the kernel descriptor -> type, access, etc */
> >> +     const struct uverbs_attr_spec *val;
> >> +     struct ib_uverbs_attr __user    *uattr;
> >> +     const struct uverbs_type_alloc_action   *type;
> >> +     struct ib_uobject               *uobject;
> >> +     union {
> >> +             struct uverbs_fd_attr           fd;
> >> +             struct uverbs_uobj_attr         uobj;
> >> +     };
> >> +};
> >> +
> >> +struct uverbs_attr {
> >> +     bool valid; > >> +     union {
> >> +             struct uverbs_ptr_attr  cmd_attr;
> >> +             struct uverbs_obj_attr  obj_attr;
> >> +     };
> >> +};
> >
> > It's odd to have a union that's part of a structure without some
> field to indicate which union field is accessible.
> >
> 
> You index this array but the attribute id from the user's callback
> funciton. The user should know what's the type of the attribute, as
> [s]he declared the specification.
> 
> >> +
> >> +/* output of one validator */
> >> +struct uverbs_attr_array {
> >> +     size_t num_attrs;
> >> +     /* arrays of attrubytes, index is the id i.e SEND_CQ */
> >> +     struct uverbs_attr *attrs;
> >> +};
> >> +
> >> +/* =================================================
> >> + *              Types infrastructure
> >> + * =================================================
> >> + */
> >> +
> >> +int ib_uverbs_uobject_type_add(struct list_head      *head,
> >> +                            void (*free)(struct uverbs_uobject_type
> *type,
> >> +                                         struct ib_uobject
> *uobject,
> >> +                                         struct ib_ucontext
> *ucontext),
> >> +                            uint16_t obj_type);
> >> +void ib_uverbs_uobject_types_remove(struct ib_device *ib_dev);
> >> +
> >> +#endif
> >> --
> >> 2.7.4

Matan, please re-look at the architecture that I proposed:

https://patchwork.kernel.org/patch/9178991/

including the terminology (and consider using common OOP terms).  The *core* of the ioctl framework is to simply invoke a function dispatch table.  IMO, that's where we should start.  Anything beyond that is extra that we should have a strong reason for including.  (Yes, I think we need more.)  Starting simple and adding necessary functionality should let us get something upstream quicker and re-use more of the existing code.

If we're going to re-create netlink as part of the rdma ioctl interface, then why don't we just use netlink directly?
N‹§²æìr¸›yúèšØb²X¬¶Ç§vØ^–)Þº{.nÇ+‰·¥Š{±ÙšŠ{ayº\x1dÊ‡Ú™ë,j\a¢f£¢·hš‹»öì\x17/oSc¾™Ú³9˜uÀ¦æå‰È&jw¨®\x03(éšŽŠÝ¢j"ú\x1a¶^[m§ÿïêäz¹Þ–Šàþf£¢·hšˆ§~ˆmš

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox