public inbox for linux-rdma@vger.kernel.org
 help / color / mirror / Atom feed
* [RFC] RDMA CM + XRC, take two
@ 2011-05-09 22:33 frank zago
       [not found] ` <4DC86BB2.1020002-klaOcWyJdxkshyMvu7JE4pqQE7yCjDx5@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: frank zago @ 2011-05-09 22:33 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

[-- Attachment #1: Type: text/plain, Size: 954 bytes --]

Hello,

This is an improved version of my previous patch to add XRC support to RDMA CM.

This version now supports the newer rdma CM interface. It needs some polishing but it works.

It modifies the rdma_client/server examples to use xrc instead of rc.

The big hack is that the SRQ number needs to be transmitted to the remote side. This patch hijacks the private data, so it's not acceptable. Ideally the SRQ number should be transmitted either in the REQ or REP packet (depending on whether the side is the sender or the receiver) alongside the QP number. But that would need a change in the specs. Any suggestions ?

Also a good chunk of the patch is to deal with the XRC verbs API. I wonder whether XRC could/should be more integrated into the existing verbs:
- sender should not need a domain,
- there should be 2 types of xrc QPs (send and receive) instead of one,
- *_xrc_rcv_qp verbs should be abstracted under the cover in libibverbs,

Regards,
  frank.

[-- Attachment #2: rdmacm-xrc-v4.diff --]
[-- Type: text/x-diff, Size: 15666 bytes --]

diff -rubw librdmacm-1.0.14.1.org/examples/rdma_client.c librdmacm-1.0.14.1/examples/rdma_client.c
--- librdmacm-1.0.14.1.org/examples/rdma_client.c	2010-10-04 19:00:18.000000000 -0500
+++ librdmacm-1.0.14.1/examples/rdma_client.c	2011-05-09 16:40:05.000000000 -0500
@@ -52,6 +52,7 @@
 
 	memset(&hints, 0, sizeof hints);
 	hints.ai_port_space = RDMA_PS_TCP;
+	hints.ai_qp_type = IBV_QPT_XRC;
 	ret = rdma_getaddrinfo(server, port, &hints, &res);
 	if (ret) {
 		printf("rdma_getaddrinfo %d\n", errno);
@@ -77,12 +78,6 @@
 		return ret;
 	}
 
-	ret = rdma_post_recv(id, NULL, recv_msg, 16, mr);
-	if (ret) {
-		printf("rdma_post_recv %d\n", errno);
-		return ret;
-	}
-
 	ret = rdma_connect(id, NULL);
 	if (ret) {
 		printf("rdma_connect %d\n", errno);
@@ -95,7 +90,7 @@
 		return ret;
 	}
 
-	ret = rdma_get_recv_comp(id, &wc);
+	ret = rdma_get_send_comp(id, &wc);
 	if (ret <= 0) {
 		printf("rdma_get_recv_comp %d\n", ret);
 		return ret;
diff -rubw librdmacm-1.0.14.1.org/examples/rdma_server.c librdmacm-1.0.14.1/examples/rdma_server.c
--- librdmacm-1.0.14.1.org/examples/rdma_server.c	2010-10-04 19:00:18.000000000 -0500
+++ librdmacm-1.0.14.1/examples/rdma_server.c	2011-05-06 16:01:51.000000000 -0500
@@ -53,6 +53,7 @@
 	memset(&hints, 0, sizeof hints);
 	hints.ai_flags = RAI_PASSIVE;
 	hints.ai_port_space = RDMA_PS_TCP;
+	hints.ai_qp_type = IBV_QPT_XRC;
 	ret = rdma_getaddrinfo(NULL, port, &hints, &res);
 	if (ret) {
 		printf("rdma_getaddrinfo %d\n", errno);
@@ -60,7 +61,8 @@
 	}
 
 	memset(&attr, 0, sizeof attr);
-	attr.cap.max_send_wr = attr.cap.max_recv_wr = 1;
+	attr.cap.max_send_wr = 0;	/* means XRC receive */
+	attr.cap.max_recv_wr = 1;
 	attr.cap.max_send_sge = attr.cap.max_recv_sge = 1;
 	attr.cap.max_inline_data = 16;
 	attr.sq_sig_all = 1;
@@ -97,7 +99,7 @@
 
 	ret = rdma_accept(id, NULL);
 	if (ret) {
-		printf("rdma_connect %d\n", errno);
+		printf("rdma_accept %d\n", errno);
 		return ret;
 	}
 
@@ -107,18 +109,6 @@
 		return ret;
 	}
 
-	ret = rdma_post_send(id, NULL, send_msg, 16, NULL, IBV_SEND_INLINE);
-	if (ret) {
-		printf("rdma_post_send %d\n", errno);
-		return ret;
-	}
-
-	ret = rdma_get_send_comp(id, &wc);
-	if (ret <= 0) {
-		printf("rdma_get_send_comp %d\n", ret);
-		return ret;
-	}
-
 	rdma_disconnect(id);
 	rdma_dereg_mr(mr);
 	rdma_destroy_ep(id);
diff -rubw librdmacm-1.0.14.1.org/include/rdma/rdma_cma.h librdmacm-1.0.14.1/include/rdma/rdma_cma.h
--- librdmacm-1.0.14.1.org/include/rdma/rdma_cma.h	2010-12-10 14:05:34.000000000 -0600
+++ librdmacm-1.0.14.1/include/rdma/rdma_cma.h	2011-05-06 10:22:36.000000000 -0500
@@ -125,6 +125,14 @@
 	struct ibv_cq		*send_cq;
 	struct ibv_comp_channel *recv_cq_channel;
 	struct ibv_cq		*recv_cq;
+
+	/* XRC support */
+ 	struct ibv_xrc_domain *xrc_domain;
+	struct ibv_srq *xrc_srq;	/* if receive side */
+ 	union {
+		uint32_t xrc_rcv_qpn;	/* if receive side */
+		uint32_t xrc_srq_num;	/* if send side */
+	};
 };
 
 enum {
diff -rubw librdmacm-1.0.14.1.org/include/rdma/rdma_verbs.h librdmacm-1.0.14.1/include/rdma/rdma_verbs.h
--- librdmacm-1.0.14.1.org/include/rdma/rdma_verbs.h	2010-12-10 14:05:34.000000000 -0600
+++ librdmacm-1.0.14.1/include/rdma/rdma_verbs.h	2011-05-06 15:26:47.000000000 -0500
@@ -57,7 +57,10 @@
 static inline struct ibv_mr *
 rdma_reg_msgs(struct rdma_cm_id *id, void *addr, size_t length)
 {
+	if (id->qp)
 	return ibv_reg_mr(id->qp->pd, addr, length, IBV_ACCESS_LOCAL_WRITE);
+	else
+		return ibv_reg_mr(id->xrc_srq->pd, addr, length, IBV_ACCESS_LOCAL_WRITE);
 }
 
 static inline struct ibv_mr *
@@ -96,7 +99,10 @@
 	wr.sg_list = sgl;
 	wr.num_sge = nsge;
 
+	if (id->qp)
 	return rdma_seterrno(ibv_post_recv(id->qp, &wr, &bad));
+	else
+		return rdma_seterrno(ibv_post_srq_recv(id->xrc_srq, &wr, &bad));
 }
 
 static inline int
@@ -111,6 +117,7 @@
 	wr.num_sge = nsge;
 	wr.opcode = IBV_WR_SEND;
 	wr.send_flags = flags;
+	wr.xrc_remote_srq_num = id->xrc_srq_num;
 
 	return rdma_seterrno(ibv_post_send(id->qp, &wr, &bad));
 }
diff -rubw librdmacm-1.0.14.1.org/man/rdma_create_qp.3 librdmacm-1.0.14.1/man/rdma_create_qp.3
--- librdmacm-1.0.14.1.org/man/rdma_create_qp.3	2010-12-10 14:05:34.000000000 -0600
+++ librdmacm-1.0.14.1/man/rdma_create_qp.3	2011-05-05 11:53:53.000000000 -0500
@@ -40,6 +40,10 @@
 channels.  Completion channels and CQ data created by the rdma_cm are
 exposed to the user through the rdma_cm_id structure.
 .P
+To create an XRC receive QP, and in addition to the XRC QP type,
+ibv_qp_init_attr.cap.max_send_wr must be set to 0. Conversely, to
+create the XRC send QP, that attribute must be non-zero.
+.P
 The actual capabilities and properties of the created QP will be
 returned to the user through the qp_init_attr parameter.
 .SH "SEE ALSO"
diff -rubw librdmacm-1.0.14.1.org/src/addrinfo.c librdmacm-1.0.14.1/src/addrinfo.c
--- librdmacm-1.0.14.1.org/src/addrinfo.c	2010-12-10 14:05:34.000000000 -0600
+++ librdmacm-1.0.14.1/src/addrinfo.c	2011-05-06 16:20:52.000000000 -0500
@@ -82,7 +82,8 @@
 	ai->ai_next = NULL;
 }
 
-static int ucma_convert_to_rai(struct rdma_addrinfo *rai, struct addrinfo *ai)
+static int ucma_convert_to_rai(struct rdma_addrinfo *rai, struct addrinfo *ai,
+							   struct rdma_addrinfo *hints)
 {
 	struct sockaddr *addr;
 	char *canonname;
@@ -91,6 +92,9 @@
 
 	switch (ai->ai_socktype) {
 	case SOCK_STREAM:
+		if (hints && hints->ai_qp_type == IBV_QPT_XRC)
+			rai->ai_qp_type = IBV_QPT_XRC;
+		else
 		rai->ai_qp_type = IBV_QPT_RC;
 		break;
 	case SOCK_DGRAM:
@@ -149,7 +153,7 @@
 	if (ret)
 		return ret;
 
-	ret = ucma_convert_to_rai(rai, ai);
+	ret = ucma_convert_to_rai(rai, ai, hints);
 	freeaddrinfo(ai);
 	return ret;
 }
diff -rubw librdmacm-1.0.14.1.org/src/cma.c librdmacm-1.0.14.1/src/cma.c
--- librdmacm-1.0.14.1.org/src/cma.c	2010-12-10 14:05:34.000000000 -0600
+++ librdmacm-1.0.14.1/src/cma.c	2011-05-09 17:15:53.000000000 -0500
@@ -944,12 +944,29 @@
 	return 0;
 }
 
+static int rdma_modify_qp(struct rdma_cm_id *id, 
+						  struct ibv_qp_attr *qp_attr,
+						  int qp_attr_mask)
+{
+	int ret;
+
+	if (id->qp)
+		ret = ibv_modify_qp(id->qp, qp_attr, qp_attr_mask);
+	else if (id->xrc_domain)
+		ret = ibv_modify_xrc_rcv_qp(id->xrc_domain, id->xrc_rcv_qpn,
+									qp_attr, qp_attr_mask);
+	else 
+		ret = EINVAL;
+
+	return ret;
+}
+
 static int ucma_modify_qp_rtr(struct rdma_cm_id *id, uint8_t resp_res)
 {
 	struct ibv_qp_attr qp_attr;
 	int qp_attr_mask, ret;
 
-	if (!id->qp)
+	if (!id->qp && !id->xrc_domain)
 		return ERR(EINVAL);
 
 	/* Need to update QP attributes from default values. */
@@ -958,7 +975,7 @@
 	if (ret)
 		return ret;
 
-	ret = ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask);
+	ret = rdma_modify_qp(id, &qp_attr, qp_attr_mask);
 	if (ret)
 		return ERR(ret);
 
@@ -969,7 +986,7 @@
 
 	if (resp_res != RDMA_MAX_RESP_RES)
 		qp_attr.max_dest_rd_atomic = resp_res;
-	return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask));
+	return rdma_seterrno(rdma_modify_qp(id, &qp_attr, qp_attr_mask));
 }
 
 static int ucma_modify_qp_rts(struct rdma_cm_id *id, uint8_t init_depth)
@@ -984,29 +1001,29 @@
 
 	if (init_depth != RDMA_MAX_INIT_DEPTH)
 		qp_attr.max_rd_atomic = init_depth;
-	return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask));
+	return rdma_seterrno(rdma_modify_qp(id, &qp_attr, qp_attr_mask));
 }
 
 static int ucma_modify_qp_sqd(struct rdma_cm_id *id)
 {
 	struct ibv_qp_attr qp_attr;
 
-	if (!id->qp)
+	if (!id->qp && !id->xrc_domain)
 		return 0;
 
 	qp_attr.qp_state = IBV_QPS_SQD;
-	return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE));
+	return rdma_seterrno(rdma_modify_qp(id, &qp_attr, IBV_QP_STATE));
 }
 
 static int ucma_modify_qp_err(struct rdma_cm_id *id)
 {
 	struct ibv_qp_attr qp_attr;
 
-	if (!id->qp)
+	if (!id->qp && !id->xrc_domain)
 		return 0;
 
 	qp_attr.qp_state = IBV_QPS_ERR;
-	return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE));
+	return rdma_seterrno(rdma_modify_qp(id, &qp_attr, IBV_QP_STATE));
 }
 
 static int ucma_find_pkey(struct cma_device *cma_dev, uint8_t port_num,
@@ -1025,7 +1042,7 @@
 	return ERR(EINVAL);
 }
 
-static int ucma_init_conn_qp3(struct cma_id_private *id_priv, struct ibv_qp *qp)
+static int ucma_init_conn_qp3(struct cma_id_private *id_priv)
 {
 	struct ibv_qp_attr qp_attr;
 	int ret;
@@ -1040,25 +1057,25 @@
 	qp_attr.qp_state = IBV_QPS_INIT;
 	qp_attr.qp_access_flags = 0;
 
-	ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_ACCESS_FLAGS |
+	ret = rdma_modify_qp(&id_priv->id, &qp_attr, IBV_QP_STATE | IBV_QP_ACCESS_FLAGS |
 					  IBV_QP_PKEY_INDEX | IBV_QP_PORT);
 	return rdma_seterrno(ret);
 }
 
-static int ucma_init_conn_qp(struct cma_id_private *id_priv, struct ibv_qp *qp)
+static int ucma_init_conn_qp(struct cma_id_private *id_priv)
 {
 	struct ibv_qp_attr qp_attr;
 	int qp_attr_mask, ret;
 
 	if (abi_ver == 3)
-		return ucma_init_conn_qp3(id_priv, qp);
+		return ucma_init_conn_qp3(id_priv);
 
 	qp_attr.qp_state = IBV_QPS_INIT;
 	ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
 	if (ret)
 		return ret;
 
-	return rdma_seterrno(ibv_modify_qp(qp, &qp_attr, qp_attr_mask));
+	return rdma_seterrno(rdma_modify_qp(&id_priv->id, &qp_attr, qp_attr_mask));
 }
 
 static int ucma_init_ud_qp3(struct cma_id_private *id_priv, struct ibv_qp *qp)
@@ -1137,7 +1154,7 @@
 
 static int ucma_create_cqs(struct rdma_cm_id *id, struct ibv_qp_init_attr *attr)
 {
-	if (!attr->recv_cq) {
+	if (!attr->recv_cq /*&& !(attr->qp_type == IBV_QPT_XRC && attr->cap.max_send_wr != 0)*/) {
 		id->recv_cq_channel = ibv_create_comp_channel(id->verbs);
 		if (!id->recv_cq_channel)
 			goto err;
@@ -1150,7 +1167,7 @@
 		attr->recv_cq = id->recv_cq;
 	}
 
-	if (!attr->send_cq) {
+	if (!attr->send_cq && !(attr->qp_type == IBV_QPT_XRC && attr->cap.max_send_wr == 0)) {
 		id->send_cq_channel = ibv_create_comp_channel(id->verbs);
 		if (!id->send_cq_channel)
 			goto err;
@@ -1175,45 +1192,141 @@
 	struct cma_id_private *id_priv;
 	struct ibv_qp *qp;
 	int ret;
+	struct ibv_xrc_domain *xrc_domain = NULL;
+	struct ibv_srq *xrc_srq = NULL;
 
 	id_priv = container_of(id, struct cma_id_private, id);
+
 	if (!pd)
 		pd = id_priv->cma_dev->pd;
 	else if (id->verbs != pd->context)
 		return ERR(EINVAL);
 
+	/* TODO: if xrc domain, create either the send or recv CQ. Should
+	 * split ucma_create_cqs(). */
 	ret = ucma_create_cqs(id, qp_init_attr);
 	if (ret)
 		return ret;
 
+	/* If no XRC domain/SRQ was passed, create one. */
+	if (qp_init_attr->qp_type == IBV_QPT_XRC 
+		/* Note: for some reason, a send QP must also have a domain.
+		   && qp_init_attr->cap.max_send_wr == 0*/) {
+		/* a receive side must only have a domain with an srq, or just
+		 * a domain, or nothing at all. */
+		if (qp_init_attr->cap.max_send_wr == 0 && 
+			!qp_init_attr->xrc_domain &&
+			qp_init_attr->srq) {
+			ret = ERR(EINVAL);
+			goto err1;
+		}
+
+		if (!qp_init_attr->xrc_domain) {
+			xrc_domain = ibv_open_xrc_domain(pd->context, -1, O_CREAT);
+			if (!xrc_domain) {
+				ret = ERR(EINVAL);
+				goto err1;
+			}
+			qp_init_attr->xrc_domain = xrc_domain;
+		}
+	}
+
+	if (qp_init_attr->qp_type == IBV_QPT_XRC &&
+		qp_init_attr->cap.max_send_wr == 0) {
+		/* Special case: this is a receive XRC QP. */
+
+		if (!qp_init_attr->srq) {
+			struct ibv_srq_init_attr srq_init_attr;
+
+			memset(&srq_init_attr, 0, sizeof(struct ibv_srq_init_attr));
+
+			srq_init_attr.srq_context = pd->context;
+			srq_init_attr.attr.max_wr = qp_init_attr->cap.max_recv_wr;
+			srq_init_attr.attr.max_sge = qp_init_attr->cap.max_recv_sge;
+			srq_init_attr.attr.srq_limit = 0; /* should be ignored */
+			
+			xrc_srq = ibv_create_xrc_srq(pd,
+										 qp_init_attr->xrc_domain,
+										 id->recv_cq,
+										 &srq_init_attr);
+			if (!xrc_srq) {
+				ret = ERR(EINVAL);
+				goto err1;
+			}
+
+			qp_init_attr->srq = xrc_srq;
+		}
+
+		id->xrc_srq = qp_init_attr->srq;
+
+		ret = ibv_create_xrc_rcv_qp(qp_init_attr, &id->xrc_rcv_qpn);
+		if (ret) {
+			ret = ERR(ret);
+			goto err1;
+		}
+		id->xrc_domain = qp_init_attr->xrc_domain;
+		qp = NULL;
+
+	} else {
 	qp = ibv_create_qp(pd, qp_init_attr);
 	if (!qp) {
 		ret = ERR(ENOMEM);
 		goto err1;
 	}
+	}
+
+	id->qp = qp;
 
 	if (ucma_is_ud_ps(id->ps))
 		ret = ucma_init_ud_qp(id_priv, qp);
 	else
-		ret = ucma_init_conn_qp(id_priv, qp);
+		ret = ucma_init_conn_qp(id_priv);
 	if (ret)
 		goto err2;
 
-	id->qp = qp;
 	return 0;
+
 err2:
+	if (qp)
 	ibv_destroy_qp(qp);
+	else if (xrc_domain && id->xrc_rcv_qpn)
+		ibv_unreg_xrc_rcv_qp(xrc_domain, id->xrc_rcv_qpn);
+
 err1:
+	if (xrc_srq) {
+		qp_init_attr->srq = NULL;
+		ibv_destroy_srq(xrc_srq);
+	}
+
+	if (xrc_domain) {
+		qp_init_attr->xrc_domain = NULL;
+		ibv_close_xrc_domain(xrc_domain);
+	}
+
+	id->qp = NULL;
+	id->xrc_domain = NULL;
 	ucma_destroy_cqs(id);
 	return ret;
 }
 
 void rdma_destroy_qp(struct rdma_cm_id *id)
 {
+	if (id->qp) {
 	ibv_destroy_qp(id->qp);
-	ucma_destroy_cqs(id);
 	id->qp = NULL;
 }
+	else if (id->xrc_domain) {
+		ibv_unreg_xrc_rcv_qp(id->xrc_domain, id->xrc_rcv_qpn);
+		/* TODO: we must only destroy domain/SRQ if we created them. */
+		if (id->xrc_srq) {
+			ibv_destroy_srq(id->xrc_srq);
+			id->xrc_srq = NULL;
+		}
+		ibv_close_xrc_domain(id->xrc_domain);
+		id->xrc_domain = NULL;
+	}
+	ucma_destroy_cqs(id);
+}
 
 static int ucma_valid_param(struct cma_id_private *id_priv,
 			    struct rdma_conn_param *param)
@@ -1221,7 +1334,7 @@
 	if (id_priv->id.ps != RDMA_PS_TCP)
 		return 0;
 
-	if (!id_priv->id.qp && !param)
+	if (!id_priv->id.qp && !id_priv->id.xrc_srq && !param)
 		goto err;
 
 	if (!param)
@@ -1313,7 +1426,16 @@
 		id_priv->connect_len = 0;
 	}
 
-	return ucma_complete(id_priv);
+	ret = ucma_complete(id_priv);
+	if (ret)
+		return ret;
+
+	{
+		// HACK: retrieve the SRQ number
+		id->xrc_srq_num = *(uint32_t *)id->event->param.conn.private_data;
+	}
+
+	return ret;
 }
 
 int rdma_listen(struct rdma_cm_id *id, int backlog)
@@ -1391,12 +1513,33 @@
 	struct cma_id_private *id_priv;
 	void *msg;
 	int ret, size;
+	struct rdma_conn_param my_conn_param;
 
 	id_priv = container_of(id, struct cma_id_private, id);
 	ret = ucma_valid_param(id_priv, conn_param);
 	if (ret)
 		return ret;
 
+	{
+		// HACK: add srq number; hijack the private data
+		if (conn_param) {
+			printf("too bad\n");
+			return ERR(EINVAL);
+		}
+
+		memset(&my_conn_param, 0, sizeof(struct rdma_conn_param));
+		my_conn_param.private_data = &id->xrc_srq->xrc_srq_num;
+		my_conn_param.private_data_len = sizeof(id->xrc_rcv_qpn);
+
+		my_conn_param.initiator_depth = min(id_priv->initiator_depth,
+											id_priv->cma_dev->max_initiator_depth);
+		my_conn_param.responder_resources = min(id_priv->responder_resources,
+												id_priv->cma_dev->max_responder_resources);
+
+		conn_param = &my_conn_param;
+	}
+
+
 	if (!conn_param || conn_param->initiator_depth == RDMA_MAX_INIT_DEPTH) {
 		id_priv->initiator_depth = min(id_priv->initiator_depth,
 					       id_priv->cma_dev->max_initiator_depth);
@@ -1427,10 +1570,19 @@
 		ucma_copy_conn_param_to_kern(id_priv, &cmd->conn_param,
 					     conn_param, id->qp->qp_num,
 					     (id->qp->srq != NULL));
-	else
+	else {
+		uint32_t qp_num;
+
+		if (id->xrc_domain) {
+			qp_num = id->xrc_rcv_qpn;
+		} else {
+			qp_num = conn_param->qp_num;
+		}
+
 		ucma_copy_conn_param_to_kern(id_priv, &cmd->conn_param,
-					     conn_param, conn_param->qp_num,
-					     conn_param->srq);
+									 conn_param, qp_num,
+									 !!id->xrc_domain);
+	}
 
 	ret = write(id->channel->fd, msg, size);
 	if (ret != size) {
@@ -2180,7 +2332,7 @@
 {
 	struct cma_id_private *id_priv;
 
-	if (id->qp)
+	if (id->qp || id->xrc_domain)
 		rdma_destroy_qp(id);
 
 	id_priv = container_of(id, struct cma_id_private, id);

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [RFC] RDMA CM + XRC, take two
       [not found] ` <4DC86BB2.1020002-klaOcWyJdxkshyMvu7JE4pqQE7yCjDx5@public.gmane.org>
@ 2011-05-10 23:17   ` Hefty, Sean
       [not found]     ` <1828884A29C6694DAF28B7E6B8A82373BBE2-P5GAC/sN6hmkrb+BlOpmy7fspsVTdybXVpNB7YpNyf8@public.gmane.org>
  2011-05-12 22:33   ` Hefty, Sean
  1 sibling, 1 reply; 12+ messages in thread
From: Hefty, Sean @ 2011-05-10 23:17 UTC (permalink / raw)
  To: frank zago, linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

> This is an improved version of my previous patch to add XRC support to
> RDMA CM.

I would still like to see support merged upstream and into mainline libibverbs before applying this to my tree.
 
> This version now support the newer rdma CM interface. It needs some
> polishing but it works.

It's hard to comment on the patch as an attachment.  In several places, if/else statements were added, but indentation was not added to the opposite part of the statement.  There was also a rather large amount of code added to rdma_create_qp() that needs to be split into a separate function.  (I dislike functions longer than a screen.)
 
> It modifies the rdma_client/server examples to use xrc instead of rc.
> 
> The big hack is that the SRQ number needs to be transmitted to the remote
> side. This patch hijacks the private data, so it's not acceptable. Ideally the SRQ
> number should be transmitted either in the REQ or REP packet (depending
> on which side the sender or the receiver) alongside the QP number. But that
> would need a change in the specs. Any suggestions ?

What are the QPNs carried in the REQ and REP?

> Also a good chunk of the patch is to deal with the XRC verbs API. I wonder
> whether XRC could/should be more integrated into the existing verbs:

That would be my preference.   I'm not familiar with the OFED XRC interfaces, so I'm only going off of the annex.

> - sender should not need a domain,

Hiding the xrc domain with the pd might help prevent breaking the user space ABI.

> - there should be 2 types of xrc QPs (send and receive) instead of one,

I've thought of this as well, and think it may also help prevent breaking the ABI.

- Sean

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC] RDMA CM + XRC, take two
       [not found]     ` <1828884A29C6694DAF28B7E6B8A82373BBE2-P5GAC/sN6hmkrb+BlOpmy7fspsVTdybXVpNB7YpNyf8@public.gmane.org>
@ 2011-05-11 14:42       ` frank zago
       [not found]         ` <4DCAA047.2040904-klaOcWyJdxkshyMvu7JE4pqQE7yCjDx5@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: frank zago @ 2011-05-11 14:42 UTC (permalink / raw)
  To: Hefty, Sean; +Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

Hello Sean,

On 05/10/2011 06:17 PM, Hefty, Sean wrote:
>> This is an improved version of my previous patch to add XRC support to
>> RDMA CM.
> 
> I would still like to see support merged upstream and into mainline libibverbs before applying this to my tree.

Agreed. 

>  
>> This version now support the newer rdma CM interface. It needs some
>> polishing but it works.
> 
> It's hard to comment on the patch as an attachment.  In several places, if/else statements were added, but indentation was not added to the opposite part of the statement.  There was also a rather large amount of code added to rdma_create_qp() that needs to be split into a separate function.  (I dislike functions longer than a screen.)

Sorry for that attachment. I tried several times to post inline patches but was always getting a mangled patch.
I'll address your comments in a new patch.

>  
>> It modifies the rdma_client/server examples to use xrc instead of rc.
>>
>> The big hack is that the SRQ number needs to be transmitted to the remote
>> side. This patch hijacks the private data, so it's not acceptable. Ideally the SRQ
>> number should be transmitted either in the REQ or REP packet (depending
>> on which side the sender or the receiver) alongside the QP number. But that
>> would need a change in the specs. Any suggestions ?
> 
> What are the QPNs carried in the REQ and REP?

Those for the XRC QPs. Same as for a regular RC connection.

Frank.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [RFC] RDMA CM + XRC, take two
       [not found]         ` <4DCAA047.2040904-klaOcWyJdxkshyMvu7JE4pqQE7yCjDx5@public.gmane.org>
@ 2011-05-11 15:03           ` Hefty, Sean
       [not found]             ` <1828884A29C6694DAF28B7E6B8A82373BC59-P5GAC/sN6hmkrb+BlOpmy7fspsVTdybXVpNB7YpNyf8@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Hefty, Sean @ 2011-05-11 15:03 UTC (permalink / raw)
  To: frank zago; +Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

> > What are the QPNs carried in the REQ and REP?
> 
> Those for the XRC QPs. Same as a for regular RC connection.

I guess I need to go back and study the specs more to understand why the QPN is in the REQ.

The REQ/REP also carry EECN fields which might be usable here, and maybe that was the intent. 

- Sean
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC] RDMA CM + XRC, take two
       [not found]             ` <1828884A29C6694DAF28B7E6B8A82373BC59-P5GAC/sN6hmkrb+BlOpmy7fspsVTdybXVpNB7YpNyf8@public.gmane.org>
@ 2011-05-11 18:36               ` frank zago
       [not found]                 ` <4DCAD746.7050606-klaOcWyJdxkshyMvu7JE4pqQE7yCjDx5@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: frank zago @ 2011-05-11 18:36 UTC (permalink / raw)
  To: Hefty, Sean; +Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

On 05/11/2011 10:03 AM, Hefty, Sean wrote:
>>> What are the QPNs carried in the REQ and REP?
>>
>> Those for the XRC QPs. Same as a for regular RC connection.
> 
> I guess I need to go back and study the specs more to understand why the QPN is in the REQ.

I looked at the kernel headers; struct cm_req_msg and cm_rep_msg, and they both contain the QP numbers.

> 
> The REQ/REP also carry EECN fields which might be usable here, and maybe that was the intent.

The XRC spec does not make room for the srq number. Which implies it's left to the application to transmit it.

Frank.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [RFC] RDMA CM + XRC, take two
       [not found]                 ` <4DCAD746.7050606-klaOcWyJdxkshyMvu7JE4pqQE7yCjDx5@public.gmane.org>
@ 2011-05-11 19:08                   ` Hefty, Sean
       [not found]                     ` <1828884A29C6694DAF28B7E6B8A82373BE79-P5GAC/sN6hmkrb+BlOpmy7fspsVTdybXVpNB7YpNyf8@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Hefty, Sean @ 2011-05-11 19:08 UTC (permalink / raw)
  To: frank zago; +Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

> > I guess I need to go back and study the specs more to understand why the
> QPN is in the REQ.
> 
> I looked at the kernel headers; struct cm_req_msg and cm_rep_msg, and
> they both contain the QP numbers.

I was referring only to *why* the XRC TGT QP needed to know the XRC INI QPN (at a conceptual level).  I think it's for control messages.
 
> > The REQ/REP also carry EECN fields which might be usable here, and maybe
> that was the intent.
> 
> The XRC spec does not make room for the srq number. Which implies it's left
> to the application to transmit it.

Agreed - I was wondering if using the EECN would work, but I don't think that it does.  There would typically be multiple SRQs targeted by a single XRC INI->TGT connection.  Maybe SIDR could be adopted for this purpose as the standard discovery technique.

- Sean
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC] RDMA CM + XRC, take two
       [not found]                     ` <1828884A29C6694DAF28B7E6B8A82373BE79-P5GAC/sN6hmkrb+BlOpmy7fspsVTdybXVpNB7YpNyf8@public.gmane.org>
@ 2011-05-11 19:36                       ` Bob Pearson
       [not found]                         ` <9ECD201D-0CD8-4339-81E0-163FC130C705-klaOcWyJdxkshyMvu7JE4pqQE7yCjDx5@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Bob Pearson @ 2011-05-11 19:36 UTC (permalink / raw)
  To: Hefty, Sean
  Cc: frank zago, linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

There is a connection between the ini and TFT qps they have to exchange qpns so that they can check them later
The sewn is also sent and used to route messages to the srq at the target

Sent from my iPhone

On May 11, 2011, at 2:08 PM, "Hefty, Sean" <sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:

>>> I guess I need to go back and study the specs more to understand why the
>> QPN is in the REQ.
>> 
>> I looked at the kernel headers; struct cm_req_msg and cm_rep_msg, and
>> they both contain the QP numbers.
> 
> I was referring only to *why* the XRC TGT QP needed to know the XRC INI QPN (at a conceptual level).  I think it's for control messages.
> 
>>> The REQ/REP also carry EECN fields which might be usable here, and maybe
>> that was the intent.
>> 
>> The XRC spec does not make room for the srq number. Which implies it's left
>> to the application to transmit it.
> 
> Agreed - I was wondering if using the EECN would work, but I don't think that it does.  There would typically be multiple SRQs targeted by a single XRC INI->TGT connection.  Maybe SIDR could be adopted for this purpose as the standard discovery technique.
> 
> - Sean
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC] RDMA CM + XRC, take two
       [not found]                         ` <9ECD201D-0CD8-4339-81E0-163FC130C705-klaOcWyJdxkshyMvu7JE4pqQE7yCjDx5@public.gmane.org>
@ 2011-05-11 23:23                           ` Bob Pearson
  2011-05-11 23:47                           ` Hefty, Sean
  1 sibling, 0 replies; 12+ messages in thread
From: Bob Pearson @ 2011-05-11 23:23 UTC (permalink / raw)
  To: Bob Pearson
  Cc: Hefty, Sean, frank zago,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

Sewn was supposed to be srqn but got spell checked

Sent from my iPhone

On May 11, 2011, at 3:36 PM, Bob Pearson <rpearson-klaOcWyJdxkshyMvu7JE4pqQE7yCjDx5@public.gmane.org> wrote:

> There is a connection between the ini and TFT qps they have to exchange qpns so that they can check them later
> The sewn is also sent and used to route messages to the srq at the target
> 
> Sent from my iPhone
> 
> On May 11, 2011, at 2:08 PM, "Hefty, Sean" <sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
> 
>>>> I guess I need to go back and study the specs more to understand why the
>>> QPN is in the REQ.
>>> 
>>> I looked at the kernel headers; struct cm_req_msg and cm_rep_msg, and
>>> they both contain the QP numbers.
>> 
>> I was referring only to *why* the XRC TGT QP needed to know the XRC INI QPN (at a conceptual level).  I think it's for control messages.
>> 
>>>> The REQ/REP also carry EECN fields which might be usable here, and maybe
>>> that was the intent.
>>> 
>>> The XRC spec does not make room for the srq number. Which implies it's left
>>> to the application to transmit it.
>> 
>> Agreed - I was wondering if using the EECN would work, but I don't think that it does.  There would typically be multiple SRQs targeted by a single XRC INI->TGT connection.  Maybe SIDR could be adopted for this purpose as the standard discovery technique.
>> 
>> - Sean
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
>> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [RFC] RDMA CM + XRC, take two
       [not found]                         ` <9ECD201D-0CD8-4339-81E0-163FC130C705-klaOcWyJdxkshyMvu7JE4pqQE7yCjDx5@public.gmane.org>
  2011-05-11 23:23                           ` Bob Pearson
@ 2011-05-11 23:47                           ` Hefty, Sean
  1 sibling, 0 replies; 12+ messages in thread
From: Hefty, Sean @ 2011-05-11 23:47 UTC (permalink / raw)
  To: Bob Pearson
  Cc: frank zago, linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

> There is a connection between the ini and TFT qps they have to exchange
> qpns so that they can check them later

Speaking of connecting the XRC INI/TGT QPs, how is this being done?  I don't see any patches to the IB CM to support XRC in Roland's git tree.  Is the setup of XRC expected to be done through proprietary means?

Btw, the libibverbs APIs to support XRC look entirely confusing to me.  For as much as the MPI developers have complained about how complicated it is to connect over RDMA, I'm surprised that they're adopting them.  Is there documentation somewhere that describes ALL of the steps needed to connect using XRC?

- Sean
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [RFC] RDMA CM + XRC, take two
       [not found] ` <4DC86BB2.1020002-klaOcWyJdxkshyMvu7JE4pqQE7yCjDx5@public.gmane.org>
  2011-05-10 23:17   ` Hefty, Sean
@ 2011-05-12 22:33   ` Hefty, Sean
       [not found]     ` <1828884A29C6694DAF28B7E6B8A82373F23B-P5GAC/sN6hmkrb+BlOpmy7fspsVTdybXVpNB7YpNyf8@public.gmane.org>
  1 sibling, 1 reply; 12+ messages in thread
From: Hefty, Sean @ 2011-05-12 22:33 UTC (permalink / raw)
  To: frank zago, linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

> The big hack is that the SRQ number needs to be transmitted to the remote
> side. This patch hijacks the private data, so it's not acceptable. Ideally the SRQ
> number should be transmitted either in the REQ or REP packet (depending
> on which side the sender or the receiver) alongside the QP number. But that
> would need a change in the specs. Any suggestions ?
> 
> Also a good chunk of the patch is to deal with the XRC verbs API. I wonder
> whether XRC could/should be more integrated into the existing verbs:
> - sender should not need a domain,
> - there should be 2 types of xrc QPs (send and receive) instead of one,
> - *_xrc_rcv_qp verbs should be abstracted under the cover in libibverbs,

I've spent some time reading over the XRC patches in Roland's git tree and the XRC patches to OFED's version of libibverbs.  These are some of the ideas that I've jotted down to support XRC through the librdmacm and mainline libibverbs, in no specific order.   (There may very well be implementation issues with these.)

1. The IB CM needs to be updated to connect XRC INI QPs to XRC TGT QPs.  This should be fairly simple.

2. As mentioned, there's no standard way of obtaining the SRQN.  I will submit a comment to the IBTA on this.  My recommendation will be to use the IB CM SIDR protocol.
	2a. As an optimization, the IB CM REP could optionally return an SRQN in the EECN.
	2b. It may be useful if the SIDR REQ carried the XRC INI/TGT QPNs

3. I don't see an easy way to hide the 'XRC domain'.  However, if I look at the existing libibverbs and librdmacm APIs, it may be simpler for the user and API compatibility if it were abstracted behind a PD (struct ibv_pd).  For example, a kernel XRC domain could be created the first time an XRC object is allocated on a PD.  In order to share an XRC domain among multiple processes, we would need a new call (ibv_share_pd?  ibv_modify_pd?).

4. There doesn't seem to be a strong reason to expose the XRC TGT QP to user space.  A kernel XRC component could accept and manage XRC target connections.

5. Assuming that XRC TGT QP is not exposed, libibverbs would use IBV_QPT_XRC only as the send side QP.  In order to support this QPT through the librdmacm, we would need to know what port space XRC QPs use, to know what SID range to map to, if any.  I will submit a comment to the IBTA on how XRC makes use of the RDMA IP CM Service.

6.  The XRC SRQ is more troubling to fit under the existing APIs.  The one idea I had was to treat the XRC SRQ as a QP (struct ibv_qp) rather than like an SRQ (struct ibv_srq).  This would require defining an IBV_QPT_SRQ type.  The SRQN ends up being a QPN for all purposes, though I don't know if this would cause other issues if the SRQNs and real QPNs overlap.


With these changes, the librdmacm usage model would be:

Passive side:
	rdma_create_ep()  /* qp_type = IBV_QPT_SRQ */
	rdma_listen()  /* listen for SIDR REQ */
	rdma_get_cm_event()
	rdma_accept()

Active side:
	rdma_getaddrinfo() /* qp_type = IBV_QPT_XRC */
	rdma_create_ep()
	rdma_connect() /* connects to XRC TGT QP */

	rdma_getaddrinfo() /* qp_type = IBV_QPT_SRQ */
	rdma_create_ep()
	rdma_connect() /* resolves XRC SRQN */

This doesn't include code necessary to share the 'XRC domain' among multiple processes on the passive side.  On the passive side, the kernel XRC component would respond to IB CM REQs based on whether there was an associated SRQ listen.

I realize this model would deviate from the OFED libibverbs APIs, but I don't want to break the librdmacm ABI, or necessarily add an entire new set of APIs just to support XRC.

- Sean
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC] RDMA CM + XRC, take two
       [not found]     ` <1828884A29C6694DAF28B7E6B8A82373F23B-P5GAC/sN6hmkrb+BlOpmy7fspsVTdybXVpNB7YpNyf8@public.gmane.org>
@ 2011-05-13  6:16       ` Bob Pearson
       [not found]         ` <984CAD75-D012-4725-B294-99B1AF32E49B-klaOcWyJdxkshyMvu7JE4pqQE7yCjDx5@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Bob Pearson @ 2011-05-13  6:16 UTC (permalink / raw)
  To: Hefty, Sean
  Cc: frank zago, linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

Some comments, but first: I'm glad this is getting attention.

Frank managed to get xrc to work with user space rdmacm without any changes to ibcm. This probably depends on faking ibcm into acting like it is connecting an rc qp, but it just worked.

After trying to combine srqn exchange with req/rep I think it should be separate. As you also seem to do.

The typical use case for xrc is for lots of related processes on each node. In this case it would be helpful to cache these to avoid P^2 effects. The CM seems like a nice place to do this. The usual problem is knowing when to stop.

I'm not thrilled with 3. Seems to me like these concepts don't merge well.
The idea behind the implementation is to let apps use the file system permissions to control who can join. Whatever we do has to preserve this.

My thumbs are wearing out — more later.

Bob

Sent from my iPhone

On May 13, 2011, at 12:33 AM, "Hefty, Sean" <sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:

>> The big hack is that the SRQ number needs to be transmitted to the remote
>> side. This patch hijacks the private data, so it's not acceptable. Ideally the SRQ
>> number should be transmitted either in the REQ or REP packet (depending
>> on which side the sender or the receiver) alongside the QP number. But that
>> would need a change in the specs. Any suggestions ?
>> 
>> Also a good chunk of the patch is to deal with the XRC verbs API. I wonder
>> whether XRC could/should be more integrated into the existing verbs:
>> - sender should not need a domain,
>> - there should be 2 types of xrc QPs (send and receive) instead of one,
>> - *_xrc_rcv_qp verbs should be abstracted under the cover in libibverbs,
> 
> I've spent some time reading over the XRC patches in Roland's git tree and the XRC patches to OFED's version of libibverbs.  These are some of the ideas that I've jotted down to support XRC through the librdmacm and mainline libibverbs, in no specific order.   (There may very well be implementation issues with these.)
> 
> 1. The IB CM needs to be updated to connect XRC INI QPs to XRC TGT QPs.  This should be fairly simple.
> 
> 2. As mentioned, there's no standard way of obtaining the SRQN.  I will submit a comment to the IBTA on this.  My recommendation will be to use the IB CM SIDR protocol.
>    2a. As an optimization, the IB CM REP could optionally return an SRQN in the EECN.
>    2b. It may be useful if the SIDR REQ carried the XRC INI/TGT QPNs
> 
> 3. I don't see an easy way to hide the 'XRC domain'.  However, if I look at the existing libibverbs and librdmacm APIs, it may be simpler for the user and API compatibility if it were abstracted behind a PD (struct ibv_pd).  For example, a kernel XRC domain could be created the first time an XRC object is allocated on a PD.  In order to share an XRC domain among multiple processes, we would need a new call (ibv_share_pd?  ibv_modify_pd?).
> 
> 4. There doesn't seem to be a strong reason to expose the XRC TGT QP to user space.  A kernel XRC component could accept and manage XRC target connections.
> 
> 5. Assuming that XRC TGT QP is not exposed, libibverbs would use IBV_QPT_XRC only as the send side QP.  In order to support this QPT through the librdmacm, we would need to know what port space XRC QPs use, to know what SID range to map to, if any.  I will submit a comment to the IBTA on how XRC makes use of the RDMA IP CM Service.
> 
> 6.  The XRC SRQ is more troubling to fit under the existing APIs.  The one idea I had was to treat the XRC SRQ as a QP (struct ibv_qp) rather than like an SRQ (struct ibv_srq).  This would require defining an IBV_QPT_SRQ type.  The SRQN ends up being a QPN for all purposes, though I don't know if this would cause other issues if the SRQNs and real QPNs overlap.
> 
> 
> With these changes, the librdmacm usage model would be:
> 
> Passive side:
>    rdma_create_ep()  /* qp_type = IBV_QPT_SRQ */
>    rdma_listen()  /* listen for SIDR REQ */
>    rdma_get_cm_event()
>    rdma_accept()
> 
> Active side:
>    rdma_getaddrinfo() /* qp_type = IBV_QPT_XRC */
>    rdma_create_ep()
>    rdma_connect() /* connects to XRC TGT QP */
> 
>    rdma_getaddrinfo() /* qp_type = IBV_QPT_SRQ */
>    rdma_create_ep()
>    rdma_connect() /* resolves XRC SRQN */
> 
> This doesn't include code necessary to share the 'XRC domain' among multiple processes on the passive side.  On the passive side, the kernel XRC component would respond to IB CM REQs based on whether there was an associated SRQ listen.
> 
> I realize this model would deviate from the OFED libibverbs APIs, but I don't want to break the librdmacm ABI, or necessarily add an entire new set of APIs just to support XRC.
> 
> - Sean
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [RFC] RDMA CM + XRC, take two
       [not found]         ` <984CAD75-D012-4725-B294-99B1AF32E49B-klaOcWyJdxkshyMvu7JE4pqQE7yCjDx5@public.gmane.org>
@ 2011-05-13 14:39           ` Hefty, Sean
  0 siblings, 0 replies; 12+ messages in thread
From: Hefty, Sean @ 2011-05-13 14:39 UTC (permalink / raw)
  To: Bob Pearson
  Cc: frank zago, linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

> The typical use case foe xrc is for lots of related processes on each node. In
> this case it would be helpful to cache these to avoid P^2 effects. The CM
> seems like a nice place to do this. The usual problem is knowing when to
> stop.

After your thumb recovers, I could use more details here.  I didn't follow what you were trying to say.

> I'm not thrilled with 3. Seems to me like these concepts don't merge well.
> The idea behind the implementation is to let apps use the file system
> permissions to control who can join. Whatever we do has to preserve this.

I agree with using the file system permissions.  This was the reason behind adding some new call like ibv_share_pd().

In hindsight, the APIs could have been simpler if we had just hid the PD entirely.  I'd hate to make it worse by having two 'protection domain' objects.  We end up adding new function parameters and structure fields.  How many apps allocate more than 1 PD or XRC domain per device?

- Sean
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2011-05-13 14:39 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-05-09 22:33 [RFC] RDMA CM + XRC, take two frank zago
     [not found] ` <4DC86BB2.1020002-klaOcWyJdxkshyMvu7JE4pqQE7yCjDx5@public.gmane.org>
2011-05-10 23:17   ` Hefty, Sean
     [not found]     ` <1828884A29C6694DAF28B7E6B8A82373BBE2-P5GAC/sN6hmkrb+BlOpmy7fspsVTdybXVpNB7YpNyf8@public.gmane.org>
2011-05-11 14:42       ` frank zago
     [not found]         ` <4DCAA047.2040904-klaOcWyJdxkshyMvu7JE4pqQE7yCjDx5@public.gmane.org>
2011-05-11 15:03           ` Hefty, Sean
     [not found]             ` <1828884A29C6694DAF28B7E6B8A82373BC59-P5GAC/sN6hmkrb+BlOpmy7fspsVTdybXVpNB7YpNyf8@public.gmane.org>
2011-05-11 18:36               ` frank zago
     [not found]                 ` <4DCAD746.7050606-klaOcWyJdxkshyMvu7JE4pqQE7yCjDx5@public.gmane.org>
2011-05-11 19:08                   ` Hefty, Sean
     [not found]                     ` <1828884A29C6694DAF28B7E6B8A82373BE79-P5GAC/sN6hmkrb+BlOpmy7fspsVTdybXVpNB7YpNyf8@public.gmane.org>
2011-05-11 19:36                       ` Bob Pearson
     [not found]                         ` <9ECD201D-0CD8-4339-81E0-163FC130C705-klaOcWyJdxkshyMvu7JE4pqQE7yCjDx5@public.gmane.org>
2011-05-11 23:23                           ` Bob Pearson
2011-05-11 23:47                           ` Hefty, Sean
2011-05-12 22:33   ` Hefty, Sean
     [not found]     ` <1828884A29C6694DAF28B7E6B8A82373F23B-P5GAC/sN6hmkrb+BlOpmy7fspsVTdybXVpNB7YpNyf8@public.gmane.org>
2011-05-13  6:16       ` Bob Pearson
     [not found]         ` <984CAD75-D012-4725-B294-99B1AF32E49B-klaOcWyJdxkshyMvu7JE4pqQE7yCjDx5@public.gmane.org>
2011-05-13 14:39           ` Hefty, Sean

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox