linux-rdma.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* Work completion error: "transport retry counter exceeded"
@ 2012-07-26  2:07 Ira Weiny
       [not found] ` <20120725190719.475605dc169353b775cd3463-i2BcT+NCU+M@public.gmane.org>
  0 siblings, 1 reply; 10+ messages in thread
From: Ira Weiny @ 2012-07-26  2:07 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

I am at a loss.  I am hacking some RDMA code to do an RDMA write from a server
to a client.

I have it working perfectly on a small 2 node test system.  When I move the
code to another system I am getting a "transport retry counter exceeded"
error.  I just can't figure out why an RDMA Write is timing out like this.

Which ibv_qp_init_attr fields (passed to ibv_create_qp) or ibv_qp_attr fields
(passed to ibv_modify_qp) might I have to change to account for different hardware?

NOTE: In both of these tests I am trying to transfer data between 2 nodes on the
same switch.  The hardware is different and the payload is small (<512 bytes).

Here is my init code on the server side:

rdma_create_qp...

	struct ibv_qp *qp;

	struct ibv_qp_init_attr attr = {
		.send_cq = p_sa->rdma_ctx.cq,
		.recv_cq = p_sa->rdma_ctx.cq,
		.cap     = {
			.max_send_wr  = 10,
			.max_recv_wr  = 500,
			.max_send_sge = 1,
			.max_recv_sge = 1
		},
		.qp_type = IBV_QPT_RC
	};
	
	qp = ibv_create_qp(p_sa->rdma_ctx.pd, &attr);
	if (!qp) {
		return (IB_INSUFFICIENT_RESOURCES);
	}

	{
		struct ibv_qp_attr attr = {
			.qp_state        = IBV_QPS_INIT,
			.pkey_index      = 0,
			.port_num        = rdma_ctx.device_port,
			.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE
		};

		if (ibv_modify_qp(qp, &attr,
				  IBV_QP_STATE              |
				  IBV_QP_PKEY_INDEX         |
				  IBV_QP_PORT               |
				  IBV_QP_ACCESS_FLAGS)) {
			goto DestroyQP;
		}
	}
...

rdma_modify_qp...

	/* transition it to RTR/RTS with eth_info and path record */
	struct ibv_qp_attr attr = {
		.qp_state		= IBV_QPS_RTR,
		.path_mtu		= path->mtu,
		.dest_qp_num		= eth_info->qpn,
		.rq_psn			= 1,
		.max_dest_rd_atomic	= 1,
		.min_rnr_timer		= 12,
		.ah_attr		= {
			.is_global	= 0,
			.dlid		= path->dlid,
			.sl		= path->sl,
			.src_path_bits	= 0,
			.port_num	= rdma_ctx.device_port
		}
	};

	if (ibv_modify_qp(qp, &attr,
			  IBV_QP_STATE              |
			  IBV_QP_PATH_MTU           |
			  IBV_QP_DEST_QPN           |
			  IBV_QP_RQ_PSN             |
			  IBV_QP_MAX_DEST_RD_ATOMIC |
			  IBV_QP_MIN_RNR_TIMER      |
			  IBV_QP_AV)) {
		return 1;
	}

	attr.qp_state	    = IBV_QPS_RTS;
	attr.timeout	    = 14;
	attr.retry_cnt	    = 7;
	attr.rnr_retry	    = 7;
	attr.sq_psn	    = 1;
	attr.max_rd_atomic  = 1;
	if (ibv_modify_qp(qp, &attr,
			  IBV_QP_STATE              |
			  IBV_QP_TIMEOUT            |
			  IBV_QP_RETRY_CNT          |
			  IBV_QP_RNR_RETRY          |
			  IBV_QP_SQ_PSN             |
			  IBV_QP_MAX_QP_RD_ATOMIC)) {
		return 1;
	}
...


Here is the code on the client:

rdma_create_qp...

	struct ibv_qp_init_attr attr = {
		.send_cq = rdma_ctx.cq,
		.recv_cq = rdma_ctx.cq,
		.cap     = {
			.max_send_wr  = 10,
			.max_recv_wr  = 10,
			.max_send_sge = 1,
			.max_recv_sge = 1
		},
		.qp_type = IBV_QPT_RC
	};
	
	qp = ibv_create_qp(rdma_ctx.pd, &attr);
	if (!qp) {
		return (-ENOMEM);
	}

	{
		struct ibv_qp_attr attr = {
			.qp_state        = IBV_QPS_INIT,
			.pkey_index      = 0,
			.port_num	 = rdma_ctx.device_port,
			.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE
		};

		if (ibv_modify_qp(qp, &attr,
				  IBV_QP_STATE              |
				  IBV_QP_PKEY_INDEX         |
				  IBV_QP_PORT               |
				  IBV_QP_ACCESS_FLAGS)) {
			return (-ENOMEM);
		}
	}

...

rdma_connect_qp...

	struct ibv_qp_attr attr = {
		.qp_state		= IBV_QPS_RTR,
		.path_mtu		= conn->path.mtu,
		.dest_qp_num		= conn->rqpn,
		.rq_psn			= 1,
		.max_dest_rd_atomic	= 1,
		.min_rnr_timer		= 12,
		.ah_attr		= {
			.is_global	= 0,
			.dlid		= conn->path.dlid,
			.sl		= conn->path.sl,
			.src_path_bits	= 0,
			.port_num	= rdma_ctx.device_port
		}
	};

	if (ibv_modify_qp(conn->qp, &attr,
			  IBV_QP_STATE              |
			  IBV_QP_PATH_MTU           |
			  IBV_QP_DEST_QPN           |
			  IBV_QP_RQ_PSN             |
			  IBV_QP_MAX_DEST_RD_ATOMIC |
			  IBV_QP_MIN_RNR_TIMER      |
			  IBV_QP_AV)) {
		return 1;
	}

	attr.qp_state	     = IBV_QPS_RTS;
	attr.timeout	     = 14;
	attr.retry_cnt	     = 7;
	attr.rnr_retry	     = 7;
	attr.sq_psn	     = 1;
	attr.max_rd_atomic   = 1;
	if (ibv_modify_qp(conn->qp, &attr,
			  IBV_QP_STATE              |
			  IBV_QP_TIMEOUT            |
			  IBV_QP_RETRY_CNT          |
			  IBV_QP_RNR_RETRY          |
			  IBV_QP_SQ_PSN             |
			  IBV_QP_MAX_QP_RD_ATOMIC)) {
		return 1;
	}
...


Thanks,
Ira

-- 
Ira Weiny
Member of Technical Staff
Lawrence Livermore National Lab
925-423-8008
weiny2-i2BcT+NCU+M@public.gmane.org
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2012-08-01 20:08 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
2012-07-26  2:07 Work completion error: "transport retry counter exceeded" Ira Weiny
     [not found] ` <20120725190719.475605dc169353b775cd3463-i2BcT+NCU+M@public.gmane.org>
2012-07-26  7:15   ` Roland Dreier
     [not found]     ` <CAL1RGDWmpHy43b5TarBWpUk1RXOEdNitXWE4+xRCspgfpwUisQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2012-07-26 12:21       ` Albert Strasheim
     [not found]         ` <CALfB72Ad4+R48Nc-kawsrk1JQo964OkJ6DE46mcR5b9pS2_hEA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2012-07-26 17:45           ` Roland Dreier
     [not found]             ` <CAL1RGDVwy56YL7OLxVuvap5WZRzzZsosQmKBkhWZB73uy3ysDg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2012-07-27 16:50               ` Paul Grun
2012-07-27 17:08                 ` Roland Dreier
2012-07-27 17:33                 ` Albert Strasheim
     [not found]                   ` <CALfB72A+ghTETXqVt63YW-cWF_ygiEDkFq9SvQos=Vuv4ZcfwQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2012-07-27 17:42                     ` Ira Weiny
     [not found]                       ` <20120727104212.293cdc3e4a14ad267988d6ee-i2BcT+NCU+M@public.gmane.org>
2012-08-01 20:08                         ` Ira Weiny
2012-07-27 18:51                     ` Paul Grun

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).