XRC QP support for librdmacm 1.0.14.1 (experimental).

What the patch does:
  examples/            rdma_client and rdma_server request IBV_QPT_XRC.  The
                       server creates the receive side (max_send_wr = 0) and
                       no longer sends; the client no longer posts a receive
                       and waits for its send completion instead.
  include/rdma/        struct rdma_cm_id gains xrc_domain, xrc_srq and a union
                       of xrc_rcv_qpn (receive side) / xrc_srq_num (send side).
                       rdma_reg_msgs() and rdma_post_recvv() fall back to
                       id->xrc_srq when the id has no QP; rdma_post_sendv()
                       fills in wr.xrc_remote_srq_num.
  man/                 documents the max_send_wr convention used to select
                       the XRC receive or send side.
  src/addrinfo.c       rdma_getaddrinfo() keeps an IBV_QPT_XRC hint for
                       SOCK_STREAM results instead of forcing IBV_QPT_RC.
  src/cma.c            QP state changes go through rdma_modify_qp(), which
                       also handles XRC receive QPs.  rdma_create_qp() opens
                       an XRC domain and creates the XRC SRQ / receive QP when
                       needed.  rdma_accept() sends the SRQ number to the peer
                       in the connection private data and rdma_connect()
                       reads it back.

Known limitations (see the TODO/HACK comments in the hunks):
  - rdma_destroy_qp() destroys the XRC SRQ and closes the XRC domain even
    when they were supplied by the caller.
  - Because the SRQ number travels in the private data, rdma_accept() rejects
    a caller-supplied conn_param on an XRC receive id.

The diff was generated with "diff -rubw", so whitespace-only differences are
not represented; apply it with "patch -p1 -l".

diff -rubw librdmacm-1.0.14.1.org/examples/rdma_client.c librdmacm-1.0.14.1/examples/rdma_client.c
--- librdmacm-1.0.14.1.org/examples/rdma_client.c	2010-10-04 19:00:18.000000000 -0500
+++ librdmacm-1.0.14.1/examples/rdma_client.c	2011-05-09 16:40:05.000000000 -0500
@@ -52,6 +52,7 @@
 
 	memset(&hints, 0, sizeof hints);
 	hints.ai_port_space = RDMA_PS_TCP;
+	hints.ai_qp_type = IBV_QPT_XRC;
 	ret = rdma_getaddrinfo(server, port, &hints, &res);
 	if (ret) {
 		printf("rdma_getaddrinfo %d\n", errno);
@@ -77,12 +78,6 @@
 		return ret;
 	}
 
-	ret = rdma_post_recv(id, NULL, recv_msg, 16, mr);
-	if (ret) {
-		printf("rdma_post_recv %d\n", errno);
-		return ret;
-	}
-
 	ret = rdma_connect(id, NULL);
 	if (ret) {
 		printf("rdma_connect %d\n", errno);
@@ -95,7 +90,7 @@
 		return ret;
 	}
 
-	ret = rdma_get_recv_comp(id, &wc);
+	ret = rdma_get_send_comp(id, &wc);
 	if (ret <= 0) {
-		printf("rdma_get_recv_comp %d\n", ret);
+		printf("rdma_get_send_comp %d\n", ret);
 		return ret;
diff -rubw librdmacm-1.0.14.1.org/examples/rdma_server.c librdmacm-1.0.14.1/examples/rdma_server.c
--- librdmacm-1.0.14.1.org/examples/rdma_server.c	2010-10-04 19:00:18.000000000 -0500
+++ librdmacm-1.0.14.1/examples/rdma_server.c	2011-05-06 16:01:51.000000000 -0500
@@ -53,6 +53,7 @@
 	memset(&hints, 0, sizeof hints);
 	hints.ai_flags = RAI_PASSIVE;
 	hints.ai_port_space = RDMA_PS_TCP;
+	hints.ai_qp_type = IBV_QPT_XRC;
 	ret = rdma_getaddrinfo(NULL, port, &hints, &res);
 	if (ret) {
 		printf("rdma_getaddrinfo %d\n", errno);
@@ -60,7 +61,8 @@
 	}
 
 	memset(&attr, 0, sizeof attr);
-	attr.cap.max_send_wr = attr.cap.max_recv_wr = 1;
+	attr.cap.max_send_wr = 0; /* means XRC receive */
+	attr.cap.max_recv_wr = 1;
 	attr.cap.max_send_sge = attr.cap.max_recv_sge = 1;
 	attr.cap.max_inline_data = 16;
 	attr.sq_sig_all = 1;
@@ -97,7 +99,7 @@
 
 	ret = rdma_accept(id, NULL);
 	if (ret) {
-		printf("rdma_connect %d\n", errno);
+		printf("rdma_accept %d\n", errno);
 		return ret;
 	}
 
@@ -107,18 +109,6 @@
 		return ret;
 	}
 
-	ret = rdma_post_send(id, NULL, send_msg, 16, NULL, IBV_SEND_INLINE);
-	if (ret) {
-		printf("rdma_post_send %d\n", errno);
-		return ret;
-	}
-
-	ret = rdma_get_send_comp(id, &wc);
-	if (ret <= 0) {
-		printf("rdma_get_send_comp %d\n", ret);
-		return ret;
-	}
-
 	rdma_disconnect(id);
 	rdma_dereg_mr(mr);
 	rdma_destroy_ep(id);
diff -rubw librdmacm-1.0.14.1.org/include/rdma/rdma_cma.h librdmacm-1.0.14.1/include/rdma/rdma_cma.h
--- librdmacm-1.0.14.1.org/include/rdma/rdma_cma.h	2010-12-10 14:05:34.000000000 -0600
+++ librdmacm-1.0.14.1/include/rdma/rdma_cma.h	2011-05-06 10:22:36.000000000 -0500
@@ -125,6 +125,14 @@
 	struct ibv_cq		*send_cq;
 	struct ibv_comp_channel *recv_cq_channel;
 	struct ibv_cq		*recv_cq;
+
+	/* XRC support */
+	struct ibv_xrc_domain	*xrc_domain;
+	struct ibv_srq		*xrc_srq;	/* if receive side */
+	union {
+		uint32_t	xrc_rcv_qpn;	/* if receive side */
+		uint32_t	xrc_srq_num;	/* if send side */
+	};
 };
 
 enum {
diff -rubw librdmacm-1.0.14.1.org/include/rdma/rdma_verbs.h librdmacm-1.0.14.1/include/rdma/rdma_verbs.h
--- librdmacm-1.0.14.1.org/include/rdma/rdma_verbs.h	2010-12-10 14:05:34.000000000 -0600
+++ librdmacm-1.0.14.1/include/rdma/rdma_verbs.h	2011-05-06 15:26:47.000000000 -0500
@@ -57,7 +57,10 @@
 static inline struct ibv_mr *
 rdma_reg_msgs(struct rdma_cm_id *id, void *addr, size_t length)
 {
+	if (id->qp)
 	return ibv_reg_mr(id->qp->pd, addr, length, IBV_ACCESS_LOCAL_WRITE);
+	else
+		return ibv_reg_mr(id->xrc_srq->pd, addr, length, IBV_ACCESS_LOCAL_WRITE);
 }
 
 static inline struct ibv_mr *
@@ -96,7 +99,10 @@
 	wr.sg_list = sgl;
 	wr.num_sge = nsge;
 
+	if (id->qp)
 	return rdma_seterrno(ibv_post_recv(id->qp, &wr, &bad));
+	else
+		return rdma_seterrno(ibv_post_srq_recv(id->xrc_srq, &wr, &bad));
 }
 
 static inline int
@@ -111,6 +117,7 @@
 	wr.num_sge = nsge;
 	wr.opcode = IBV_WR_SEND;
 	wr.send_flags = flags;
+	wr.xrc_remote_srq_num = id->xrc_srq_num;
 
 	return rdma_seterrno(ibv_post_send(id->qp, &wr, &bad));
 }
diff -rubw librdmacm-1.0.14.1.org/man/rdma_create_qp.3 librdmacm-1.0.14.1/man/rdma_create_qp.3
--- librdmacm-1.0.14.1.org/man/rdma_create_qp.3	2010-12-10 14:05:34.000000000 -0600
+++ librdmacm-1.0.14.1/man/rdma_create_qp.3	2011-05-05 11:53:53.000000000 -0500
@@ -40,6 +40,10 @@
 channels. Completion channels and CQ data created by the rdma_cm are
 exposed to the user through the rdma_cm_id structure.
 .P
+To create an XRC receive QP, in addition to selecting the XRC QP type,
+ibv_qp_init_attr.cap.max_send_wr must be set to 0. Conversely, to
+create the XRC send QP, that attribute must be non-zero.
+.P
 The actual capabilities and properties of the created QP will be
 returned to the user through the qp_init_attr parameter.
 .SH "SEE ALSO"
diff -rubw librdmacm-1.0.14.1.org/src/addrinfo.c librdmacm-1.0.14.1/src/addrinfo.c
--- librdmacm-1.0.14.1.org/src/addrinfo.c	2010-12-10 14:05:34.000000000 -0600
+++ librdmacm-1.0.14.1/src/addrinfo.c	2011-05-06 16:20:52.000000000 -0500
@@ -82,7 +82,8 @@
 	ai->ai_next = NULL;
 }
 
-static int ucma_convert_to_rai(struct rdma_addrinfo *rai, struct addrinfo *ai)
+static int ucma_convert_to_rai(struct rdma_addrinfo *rai, struct addrinfo *ai,
+			       struct rdma_addrinfo *hints)
 {
 	struct sockaddr *addr;
 	char *canonname;
@@ -91,6 +92,9 @@
 
 	switch (ai->ai_socktype) {
 	case SOCK_STREAM:
+		if (hints && hints->ai_qp_type == IBV_QPT_XRC)
+			rai->ai_qp_type = IBV_QPT_XRC;
+		else
 		rai->ai_qp_type = IBV_QPT_RC;
 		break;
 	case SOCK_DGRAM:
@@ -149,7 +153,7 @@
 	if (ret)
 		return ret;
 
-	ret = ucma_convert_to_rai(rai, ai);
+	ret = ucma_convert_to_rai(rai, ai, hints);
 	freeaddrinfo(ai);
 	return ret;
 }
diff -rubw librdmacm-1.0.14.1.org/src/cma.c librdmacm-1.0.14.1/src/cma.c
--- librdmacm-1.0.14.1.org/src/cma.c	2010-12-10 14:05:34.000000000 -0600
+++ librdmacm-1.0.14.1/src/cma.c	2011-05-09 17:15:53.000000000 -0500
@@ -944,12 +944,34 @@
 	return 0;
 }
 
+/*
+ * Modify the QP behind @id: the regular QP if one exists, otherwise the XRC
+ * receive QP identified by xrc_domain/xrc_rcv_qpn.  Returns 0 on success;
+ * callers treat a non-zero result as an errno value (EINVAL if neither).
+ */
+static int rdma_modify_qp(struct rdma_cm_id *id,
+			  struct ibv_qp_attr *qp_attr,
+			  int qp_attr_mask)
+{
+	int ret;
+
+	if (id->qp)
+		ret = ibv_modify_qp(id->qp, qp_attr, qp_attr_mask);
+	else if (id->xrc_domain)
+		ret = ibv_modify_xrc_rcv_qp(id->xrc_domain, id->xrc_rcv_qpn,
+					    qp_attr, qp_attr_mask);
+	else
+		ret = EINVAL;
+
+	return ret;
+}
+
 static int ucma_modify_qp_rtr(struct rdma_cm_id *id, uint8_t resp_res)
 {
 	struct ibv_qp_attr qp_attr;
 	int qp_attr_mask, ret;
 
-	if (!id->qp)
+	if (!id->qp && !id->xrc_domain)
 		return ERR(EINVAL);
 
 	/* Need to update QP attributes from default values. */
@@ -958,7 +980,7 @@
 	if (ret)
 		return ret;
 
-	ret = ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask);
+	ret = rdma_modify_qp(id, &qp_attr, qp_attr_mask);
 	if (ret)
 		return ERR(ret);
 
@@ -969,7 +991,7 @@
 
 	if (resp_res != RDMA_MAX_RESP_RES)
 		qp_attr.max_dest_rd_atomic = resp_res;
-	return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask));
+	return rdma_seterrno(rdma_modify_qp(id, &qp_attr, qp_attr_mask));
 }
 
 static int ucma_modify_qp_rts(struct rdma_cm_id *id, uint8_t init_depth)
@@ -984,29 +1006,29 @@
 
 	if (init_depth != RDMA_MAX_INIT_DEPTH)
 		qp_attr.max_rd_atomic = init_depth;
-	return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask));
+	return rdma_seterrno(rdma_modify_qp(id, &qp_attr, qp_attr_mask));
 }
 
 static int ucma_modify_qp_sqd(struct rdma_cm_id *id)
 {
 	struct ibv_qp_attr qp_attr;
 
-	if (!id->qp)
+	if (!id->qp && !id->xrc_domain)
 		return 0;
 
 	qp_attr.qp_state = IBV_QPS_SQD;
-	return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE));
+	return rdma_seterrno(rdma_modify_qp(id, &qp_attr, IBV_QP_STATE));
 }
 
 static int ucma_modify_qp_err(struct rdma_cm_id *id)
 {
 	struct ibv_qp_attr qp_attr;
 
-	if (!id->qp)
+	if (!id->qp && !id->xrc_domain)
 		return 0;
 
 	qp_attr.qp_state = IBV_QPS_ERR;
-	return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE));
+	return rdma_seterrno(rdma_modify_qp(id, &qp_attr, IBV_QP_STATE));
 }
 
 static int ucma_find_pkey(struct cma_device *cma_dev, uint8_t port_num,
@@ -1025,7 +1047,7 @@
 	return ERR(EINVAL);
 }
 
-static int ucma_init_conn_qp3(struct cma_id_private *id_priv, struct ibv_qp *qp)
+static int ucma_init_conn_qp3(struct cma_id_private *id_priv)
 {
 	struct ibv_qp_attr qp_attr;
 	int ret;
@@ -1040,25 +1062,25 @@
 	qp_attr.qp_state = IBV_QPS_INIT;
 	qp_attr.qp_access_flags = 0;
 
-	ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_ACCESS_FLAGS |
+	ret = rdma_modify_qp(&id_priv->id, &qp_attr, IBV_QP_STATE | IBV_QP_ACCESS_FLAGS |
 					  IBV_QP_PKEY_INDEX | IBV_QP_PORT);
 	return rdma_seterrno(ret);
 }
 
-static int ucma_init_conn_qp(struct cma_id_private *id_priv, struct ibv_qp *qp)
+static int ucma_init_conn_qp(struct cma_id_private *id_priv)
 {
 	struct ibv_qp_attr qp_attr;
 	int qp_attr_mask, ret;
 
 	if (abi_ver == 3)
-		return ucma_init_conn_qp3(id_priv, qp);
+		return ucma_init_conn_qp3(id_priv);
 
 	qp_attr.qp_state = IBV_QPS_INIT;
 	ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
 	if (ret)
 		return ret;
 
-	return rdma_seterrno(ibv_modify_qp(qp, &qp_attr, qp_attr_mask));
+	return rdma_seterrno(rdma_modify_qp(&id_priv->id, &qp_attr, qp_attr_mask));
 }
 
 static int ucma_init_ud_qp3(struct cma_id_private *id_priv, struct ibv_qp *qp)
@@ -1137,7 +1159,7 @@
 
 static int ucma_create_cqs(struct rdma_cm_id *id, struct ibv_qp_init_attr *attr)
 {
-	if (!attr->recv_cq) {
+	if (!attr->recv_cq /*&& !(attr->qp_type == IBV_QPT_XRC && attr->cap.max_send_wr != 0)*/) {
 		id->recv_cq_channel = ibv_create_comp_channel(id->verbs);
 		if (!id->recv_cq_channel)
 			goto err;
@@ -1150,7 +1172,7 @@
 		attr->recv_cq = id->recv_cq;
 	}
 
-	if (!attr->send_cq) {
+	if (!attr->send_cq && !(attr->qp_type == IBV_QPT_XRC && attr->cap.max_send_wr == 0)) {
 		id->send_cq_channel = ibv_create_comp_channel(id->verbs);
 		if (!id->send_cq_channel)
 			goto err;
@@ -1175,45 +1197,142 @@
 	struct cma_id_private *id_priv;
 	struct ibv_qp *qp;
 	int ret;
+	struct ibv_xrc_domain *xrc_domain = NULL;
+	struct ibv_srq *xrc_srq = NULL;
 
 	id_priv = container_of(id, struct cma_id_private, id);
+
 	if (!pd)
 		pd = id_priv->cma_dev->pd;
 	else if (id->verbs != pd->context)
 		return ERR(EINVAL);
 
+	/* TODO: if xrc domain, create either the send or recv CQ. Should
+	 * split ucma_create_cqs(). */
 	ret = ucma_create_cqs(id, qp_init_attr);
 	if (ret)
 		return ret;
 
+	/* If no XRC domain/SRQ was passed, create one. */
+	if (qp_init_attr->qp_type == IBV_QPT_XRC
+		/* Note: for some reason, a send QP must also have a domain.
+		   && qp_init_attr->cap.max_send_wr == 0*/) {
+		/* a receive side must only have a domain with an srq, or just
+		 * a domain, or nothing at all. */
+		if (qp_init_attr->cap.max_send_wr == 0 &&
+		    !qp_init_attr->xrc_domain &&
+		    qp_init_attr->srq) {
+			ret = ERR(EINVAL);
+			goto err1;
+		}
+
+		if (!qp_init_attr->xrc_domain) {
+			xrc_domain = ibv_open_xrc_domain(pd->context, -1, O_CREAT);
+			if (!xrc_domain) {
+				ret = ERR(EINVAL);
+				goto err1;
+			}
+			qp_init_attr->xrc_domain = xrc_domain;
+		}
+	}
+
+	if (qp_init_attr->qp_type == IBV_QPT_XRC &&
+	    qp_init_attr->cap.max_send_wr == 0) {
+		/* Special case: this is a receive XRC QP. */
+
+		if (!qp_init_attr->srq) {
+			struct ibv_srq_init_attr srq_init_attr;
+
+			memset(&srq_init_attr, 0, sizeof(struct ibv_srq_init_attr));
+
+			srq_init_attr.srq_context = pd->context;
+			srq_init_attr.attr.max_wr = qp_init_attr->cap.max_recv_wr;
+			srq_init_attr.attr.max_sge = qp_init_attr->cap.max_recv_sge;
+			srq_init_attr.attr.srq_limit = 0; /* should be ignored */
+
+			xrc_srq = ibv_create_xrc_srq(pd,
+						     qp_init_attr->xrc_domain,
+						     id->recv_cq,
+						     &srq_init_attr);
+			if (!xrc_srq) {
+				ret = ERR(EINVAL);
+				goto err1;
+			}
+
+			qp_init_attr->srq = xrc_srq;
+		}
+
+		id->xrc_srq = qp_init_attr->srq;
+
+		ret = ibv_create_xrc_rcv_qp(qp_init_attr, &id->xrc_rcv_qpn);
+		if (ret) {
+			ret = ERR(ret);
+			goto err1;
+		}
+		id->xrc_domain = qp_init_attr->xrc_domain;
+		qp = NULL;
+
+	} else {
 	qp = ibv_create_qp(pd, qp_init_attr);
 	if (!qp) {
 		ret = ERR(ENOMEM);
 		goto err1;
 	}
+	}
+
+	id->qp = qp;
 
 	if (ucma_is_ud_ps(id->ps))
 		ret = ucma_init_ud_qp(id_priv, qp);
 	else
-		ret = ucma_init_conn_qp(id_priv, qp);
+		ret = ucma_init_conn_qp(id_priv);
 	if (ret)
 		goto err2;
 
-	id->qp = qp;
 	return 0;
+
 err2:
+	if (qp)
 	ibv_destroy_qp(qp);
+	else if (id->xrc_domain)
+		ibv_unreg_xrc_rcv_qp(id->xrc_domain, id->xrc_rcv_qpn);
+
 err1:
+	if (xrc_srq) {
+		qp_init_attr->srq = NULL;
+		ibv_destroy_srq(xrc_srq);
+	}
+
+	if (xrc_domain) {
+		qp_init_attr->xrc_domain = NULL;
+		ibv_close_xrc_domain(xrc_domain);
+	}
+
+	id->qp = NULL;
+	id->xrc_domain = NULL;
+	id->xrc_srq = NULL;
 	ucma_destroy_cqs(id);
 	return ret;
 }
 
 void rdma_destroy_qp(struct rdma_cm_id *id)
 {
+	if (id->qp) {
 	ibv_destroy_qp(id->qp);
-	ucma_destroy_cqs(id);
 	id->qp = NULL;
 }
+	else if (id->xrc_domain) {
+		ibv_unreg_xrc_rcv_qp(id->xrc_domain, id->xrc_rcv_qpn);
+		/* TODO: we must only destroy domain/SRQ if we created them. */
+		if (id->xrc_srq) {
+			ibv_destroy_srq(id->xrc_srq);
+			id->xrc_srq = NULL;
+		}
+		ibv_close_xrc_domain(id->xrc_domain);
+		id->xrc_domain = NULL;
+	}
+	ucma_destroy_cqs(id);
+}
 
 static int ucma_valid_param(struct cma_id_private *id_priv,
 			    struct rdma_conn_param *param)
@@ -1221,7 +1340,7 @@
 	if (id_priv->id.ps != RDMA_PS_TCP)
 		return 0;
 
-	if (!id_priv->id.qp && !param)
+	if (!id_priv->id.qp && !id_priv->id.xrc_srq && !param)
 		goto err;
 
 	if (!param)
@@ -1313,7 +1432,19 @@
 		id_priv->connect_len = 0;
 	}
 
-	return ucma_complete(id_priv);
+	ret = ucma_complete(id_priv);
+	if (ret)
+		return ret;
+
+	/* HACK: rdma_accept() on the XRC receive side puts its SRQ number at
+	 * the start of the private data.  Only read it when an event with at
+	 * least that much private data is actually attached to the id. */
+	if (id->event && id->event->param.conn.private_data &&
+	    id->event->param.conn.private_data_len >= sizeof(id->xrc_srq_num))
+		memcpy(&id->xrc_srq_num, id->event->param.conn.private_data,
+		       sizeof(id->xrc_srq_num));
+
+	return ret;
 }
 
 int rdma_listen(struct rdma_cm_id *id, int backlog)
@@ -1391,12 +1522,34 @@
 	struct cma_id_private *id_priv;
 	void *msg;
 	int ret, size;
+	struct rdma_conn_param my_conn_param;
 
 	id_priv = container_of(id, struct cma_id_private, id);
 	ret = ucma_valid_param(id_priv, conn_param);
 	if (ret)
 		return ret;
 
+	if (id->xrc_srq) {
+		/* HACK: send the XRC SRQ number to the peer by hijacking the
+		 * private data.  Only done for XRC receive ids; caller-supplied
+		 * connection parameters cannot be combined with it. */
+		if (conn_param) {
+			printf("too bad\n");
+			return ERR(EINVAL);
+		}
+
+		memset(&my_conn_param, 0, sizeof(struct rdma_conn_param));
+		my_conn_param.private_data = &id->xrc_srq->xrc_srq_num;
+		my_conn_param.private_data_len = sizeof(id->xrc_srq->xrc_srq_num);
+
+		my_conn_param.initiator_depth = min(id_priv->initiator_depth,
+						    id_priv->cma_dev->max_initiator_depth);
+		my_conn_param.responder_resources = min(id_priv->responder_resources,
+							id_priv->cma_dev->max_responder_resources);
+
+		conn_param = &my_conn_param;
+	}
+
 	if (!conn_param || conn_param->initiator_depth == RDMA_MAX_INIT_DEPTH) {
 		id_priv->initiator_depth = min(id_priv->initiator_depth,
 					       id_priv->cma_dev->max_initiator_depth);
@@ -1427,10 +1580,19 @@
 		ucma_copy_conn_param_to_kern(id_priv, &cmd->conn_param,
 					     conn_param, id->qp->qp_num,
 					     (id->qp->srq != NULL));
-	else
+	else {
+		uint32_t qp_num;
+
+		if (id->xrc_domain) {
+			qp_num = id->xrc_rcv_qpn;
+		} else {
+			qp_num = conn_param->qp_num;
+		}
+
 		ucma_copy_conn_param_to_kern(id_priv, &cmd->conn_param,
-					     conn_param, conn_param->qp_num,
-					     conn_param->srq);
+					     conn_param, qp_num,
+					     id->xrc_domain ? 1 : conn_param->srq);
+	}
 
 	ret = write(id->channel->fd, msg, size);
 	if (ret != size) {
@@ -2180,7 +2342,7 @@
 {
 	struct cma_id_private *id_priv;
 
-	if (id->qp)
+	if (id->qp || id->xrc_domain)
 		rdma_destroy_qp(id);
 
 	id_priv = container_of(id, struct cma_id_private, id);