All of lore.kernel.org
 help / color / mirror / Atom feed
From: Tom Talpey <talpey@netapp.com>
To: linux-nfs@vger.kernel.org
Subject: [PATCH 04/15] RPC/RDMA: support FRMR client memory registration.
Date: Wed, 08 Oct 2008 11:47:34 -0400	[thread overview]
Message-ID: <20081008154734.1336.75775.stgit@tmt3.nane.netapp.com> (raw)
In-Reply-To: <20081008154506.1336.59892.stgit-pfX4bTJKMULWwzOYslWYilaTQe2KTcn/@public.gmane.org>

Configure, detect and use "fastreg" support from IB/iWARP verbs
layer to perform RPC/RDMA memory registration.

Make FRMR the default memreg mode (will fall back if not supported
by the selected RDMA adapter).

This allows full and optimal operation over the cxgb3 adapter, and others.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
Signed-off-by: Tom Talpey <talpey@netapp.com>
---

 net/sunrpc/xprtrdma/transport.c |    6 -
 net/sunrpc/xprtrdma/verbs.c     |  167 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 167 insertions(+), 6 deletions(-)

diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index a564c1a..89970b0 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -70,11 +70,7 @@ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
 static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
 static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
 static unsigned int xprt_rdma_inline_write_padding;
-#if !RPCRDMA_PERSISTENT_REGISTRATION
-static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_REGISTER; /* FMR? */
-#else
-static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_ALLPHYSICAL;
-#endif
+static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
 
 #ifdef RPC_DEBUG
 
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 0f3b431..39a1652 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -488,6 +488,26 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 #endif
 		}
 		break;
+	case RPCRDMA_FRMR:
+		/* Requires both frmr reg and local dma lkey */
+		if ((devattr.device_cap_flags &
+		     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
+		    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
+#if RPCRDMA_PERSISTENT_REGISTRATION
+			dprintk("RPC:       %s: FRMR registration "
+				"specified but not supported by adapter, "
+				"using riskier RPCRDMA_ALLPHYSICAL\n",
+				__func__);
+			memreg = RPCRDMA_ALLPHYSICAL;
+#else
+			dprintk("RPC:       %s: FRMR registration "
+				"specified but not supported by adapter, "
+				"using slower RPCRDMA_REGISTER\n",
+				__func__);
+			memreg = RPCRDMA_REGISTER;
+#endif
+		}
+		break;
 	}
 
 	/*
@@ -501,6 +521,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 	switch (memreg) {
 	case RPCRDMA_BOUNCEBUFFERS:
 	case RPCRDMA_REGISTER:
+	case RPCRDMA_FRMR:
 		break;
 #if RPCRDMA_PERSISTENT_REGISTRATION
 	case RPCRDMA_ALLPHYSICAL:
@@ -602,6 +623,12 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 	ep->rep_attr.srq = NULL;
 	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
 	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_FRMR:
+		/* Add room for frmr register and invalidate WRs */
+		ep->rep_attr.cap.max_send_wr *= 3;
+		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
+			return -EINVAL;
+		break;
 	case RPCRDMA_MEMWINDOWS_ASYNC:
 	case RPCRDMA_MEMWINDOWS:
 		/* Add room for mw_binds+unbinds - overkill! */
@@ -684,6 +711,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 		break;
 	case RPCRDMA_MTHCAFMR:
 	case RPCRDMA_REGISTER:
+	case RPCRDMA_FRMR:
 		ep->rep_remote_cma.responder_resources = cdata->max_requests *
 				(RPCRDMA_MAX_DATA_SEGS / 8);
 		break;
@@ -935,7 +963,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
 	 *   2.  arrays of struct rpcrdma_req to fill in pointers
 	 *   3.  array of struct rpcrdma_rep for replies
 	 *   4.  padding, if any
-	 *   5.  mw's or fmr's, if any
+	 *   5.  mw's, fmr's or frmr's, if any
 	 * Send/recv buffers in req/rep need to be registered
 	 */
 
@@ -943,6 +971,10 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
 		(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
 	len += cdata->padding;
 	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_FRMR:
+		len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
+				sizeof(struct rpcrdma_mw);
+		break;
 	case RPCRDMA_MTHCAFMR:
 		/* TBD we are perhaps overallocating here */
 		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
@@ -991,6 +1023,30 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
 	INIT_LIST_HEAD(&buf->rb_mws);
 	r = (struct rpcrdma_mw *)p;
 	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_FRMR:
+		for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
+			r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
+							 RPCRDMA_MAX_SEGS);
+			if (IS_ERR(r->r.frmr.fr_mr)) {
+				rc = PTR_ERR(r->r.frmr.fr_mr);
+				dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
+					" failed %i\n", __func__, rc);
+				goto out;
+			}
+			r->r.frmr.fr_pgl =
+				ib_alloc_fast_reg_page_list(ia->ri_id->device,
+							    RPCRDMA_MAX_SEGS);
+			if (IS_ERR(r->r.frmr.fr_pgl)) {
+				rc = PTR_ERR(r->r.frmr.fr_pgl);
+				dprintk("RPC:       %s: "
+					"ib_alloc_fast_reg_page_list "
+					"failed %i\n", __func__, rc);
+				goto out;
+			}
+			list_add(&r->mw_list, &buf->rb_mws);
+			++r;
+		}
+		break;
 	case RPCRDMA_MTHCAFMR:
 		/* TBD we are perhaps overallocating here */
 		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
@@ -1126,6 +1182,15 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 					struct rpcrdma_mw, mw_list);
 				list_del(&r->mw_list);
 				switch (ia->ri_memreg_strategy) {
+				case RPCRDMA_FRMR:
+					rc = ib_dereg_mr(r->r.frmr.fr_mr);
+					if (rc)
+						dprintk("RPC:       %s:"
+							" ib_dereg_mr"
+							" failed %i\n",
+							__func__, rc);
+					ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+					break;
 				case RPCRDMA_MTHCAFMR:
 					rc = ib_dealloc_fmr(r->r.fmr);
 					if (rc)
@@ -1228,6 +1293,7 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
 		req->rl_reply = NULL;
 	}
 	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_FRMR:
 	case RPCRDMA_MTHCAFMR:
 	case RPCRDMA_MEMWINDOWS_ASYNC:
 	case RPCRDMA_MEMWINDOWS:
@@ -1391,6 +1457,96 @@ rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
 }
 
 static int
+rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
+			int *nsegs, int writing, struct rpcrdma_ia *ia,
+			struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_mr_seg *seg1 = seg;
+	struct ib_send_wr frmr_wr, *bad_wr;
+	u8 key;
+	int len, pageoff;
+	int i, rc;
+
+	pageoff = offset_in_page(seg1->mr_offset);
+	seg1->mr_offset -= pageoff;	/* start of page */
+	seg1->mr_len += pageoff;
+	len = -pageoff;
+	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
+		*nsegs = RPCRDMA_MAX_DATA_SEGS;
+	for (i = 0; i < *nsegs;) {
+		rpcrdma_map_one(ia, seg, writing);
+		seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
+		len += seg->mr_len;
+		++seg;
+		++i;
+		/* Check for holes */
+		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
+		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
+			break;
+	}
+	dprintk("RPC:       %s: Using frmr %p to map %d segments\n",
+		__func__, seg1->mr_chunk.rl_mw, i);
+
+	/* Bump the key */
+	key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
+	ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
+
+	/* Prepare FRMR WR */
+	memset(&frmr_wr, 0, sizeof frmr_wr);
+	frmr_wr.opcode = IB_WR_FAST_REG_MR;
+	frmr_wr.send_flags = 0;			/* unsignaled */
+	frmr_wr.wr.fast_reg.iova_start = (unsigned long)seg1->mr_dma;
+	frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
+	frmr_wr.wr.fast_reg.page_list_len = i;
+	frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+	frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
+	frmr_wr.wr.fast_reg.access_flags = (writing ?
+				IB_ACCESS_REMOTE_WRITE : IB_ACCESS_REMOTE_READ);
+	frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+	DECR_CQCOUNT(&r_xprt->rx_ep);
+
+	rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr);
+
+	if (rc) {
+		dprintk("RPC:       %s: failed ib_post_send for register,"
+			" status %i\n", __func__, rc);
+		while (i--)
+			rpcrdma_unmap_one(ia, --seg);
+	} else {
+		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+		seg1->mr_base = seg1->mr_dma + pageoff;
+		seg1->mr_nsegs = i;
+		seg1->mr_len = len;
+	}
+	*nsegs = i;
+	return rc;
+}
+
+static int
+rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
+			struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_mr_seg *seg1 = seg;
+	struct ib_send_wr invalidate_wr, *bad_wr;
+	int rc;
+
+	while (seg1->mr_nsegs--)
+		rpcrdma_unmap_one(ia, seg++);
+
+	memset(&invalidate_wr, 0, sizeof invalidate_wr);
+	invalidate_wr.opcode = IB_WR_LOCAL_INV;
+	invalidate_wr.send_flags = 0;			/* unsignaled */
+	invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+	DECR_CQCOUNT(&r_xprt->rx_ep);
+
+	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
+	if (rc)
+		dprintk("RPC:       %s: failed ib_post_send for invalidate,"
+			" status %i\n", __func__, rc);
+	return rc;
+}
+
+static int
 rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
 			int *nsegs, int writing, struct rpcrdma_ia *ia)
 {
@@ -1600,6 +1756,11 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
 		break;
 #endif
 
+	/* Registration using frmr registration */
+	case RPCRDMA_FRMR:
+		rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
+		break;
+
 	/* Registration using fmr memory registration */
 	case RPCRDMA_MTHCAFMR:
 		rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
@@ -1639,6 +1800,10 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
 		break;
 #endif
 
+	case RPCRDMA_FRMR:
+		rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
+		break;
+
 	case RPCRDMA_MTHCAFMR:
 		rc = rpcrdma_deregister_fmr_external(seg, ia);
 		break;


  parent reply	other threads:[~2008-10-08 16:22 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-10-08 15:46 [PATCH 00/15] RPC/RDMA patchset for next merge window Tom Talpey
     [not found] ` <20081008154506.1336.59892.stgit-pfX4bTJKMULWwzOYslWYilaTQe2KTcn/@public.gmane.org>
2008-10-08 15:47   ` [PATCH 01/15] RPC/RDMA: refactor the inline memory registration code Tom Talpey
2008-10-08 15:47   ` [PATCH 02/15] RPC/RDMA: add data types and new FRMR memory registration enum Tom Talpey
     [not found]     ` <20081008154713.1336.41538.stgit-pfX4bTJKMULWwzOYslWYilaTQe2KTcn/@public.gmane.org>
2008-10-08 17:23       ` Trond Myklebust
2008-10-08 17:30         ` Talpey, Thomas
     [not found]           ` <RTPCLUEXC1-PRDmcarc00000072-rtwIt2gI0FxT+ZUat5FNkAK/GNPrWCqfQQ4Iyu8u01E@public.gmane.org>
2008-10-08 17:40             ` Trond Myklebust
2008-10-08 17:55             ` J. Bruce Fields
2008-10-08 17:58               ` Talpey, Thomas
2008-10-08 15:47   ` [PATCH 03/15] RPC/RDMA: check selected memory registration mode at runtime Tom Talpey
     [not found]     ` <20081008154723.1336.57976.stgit-pfX4bTJKMULWwzOYslWYilaTQe2KTcn/@public.gmane.org>
2008-10-08 17:22       ` Trond Myklebust
2008-10-08 17:29         ` Talpey, Thomas
     [not found]           ` <RTPCLUEXC1-PRD8yfog00000071-rtwIt2gI0FxT+ZUat5FNkAK/GNPrWCqfQQ4Iyu8u01E@public.gmane.org>
2008-10-08 17:40             ` Trond Myklebust
2008-10-08 15:47   ` Tom Talpey [this message]
2008-10-08 15:47   ` [PATCH 05/15] RPC/RDMA: fix connection IRD/ORD setting Tom Talpey
     [not found]     ` <20081008154744.1336.20909.stgit-pfX4bTJKMULWwzOYslWYilaTQe2KTcn/@public.gmane.org>
2008-10-08 17:26       ` Trond Myklebust
2008-10-08 17:32         ` Talpey, Thomas
2008-10-08 15:47   ` [PATCH 06/15] RPC/RDMA: suppress retransmit on RPC/RDMA clients Tom Talpey
2008-10-08 15:48   ` [PATCH 07/15] RPC/RDMA: maintain the RPC task bytes-sent statistic Tom Talpey
2008-10-08 15:48   ` [PATCH 08/15] RPC/RDMA: avoid an oops due to disconnect racing with async upcalls Tom Talpey
2008-10-08 15:48   ` [PATCH 09/15] RPC/RDMA: adhere to protocol for unpadded client trailing write chunks Tom Talpey
     [not found]     ` <20081008154825.1336.79549.stgit-pfX4bTJKMULWwzOYslWYilaTQe2KTcn/@public.gmane.org>
2008-10-08 17:29       ` Trond Myklebust
2008-10-08 17:33         ` Talpey, Thomas
2008-10-08 15:48   ` [PATCH 10/15] RPC/RDMA: return a consistent error to mount, when connect fails Tom Talpey
     [not found]     ` <20081008154835.1336.85484.stgit-pfX4bTJKMULWwzOYslWYilaTQe2KTcn/@public.gmane.org>
2008-10-08 17:31       ` Trond Myklebust
2008-10-08 17:40         ` Talpey, Thomas
     [not found]           ` <RTPCLUEXC1-PRDbpH7100000075-rtwIt2gI0FxT+ZUat5FNkAK/GNPrWCqfQQ4Iyu8u01E@public.gmane.org>
2008-10-08 17:43             ` Trond Myklebust
2008-10-08 19:56               ` Talpey, Thomas
2008-10-08 15:48   ` [PATCH 11/15] RPC/RDMA: fix connect/reconnect resource leak Tom Talpey
2008-10-08 15:48   ` [PATCH 12/15] RPC/RDMA: correct a 5 second pause on reconnecting to an idle server Tom Talpey
     [not found]     ` <20081008154856.1336.18339.stgit-pfX4bTJKMULWwzOYslWYilaTQe2KTcn/@public.gmane.org>
2008-10-08 17:35       ` Trond Myklebust
2008-10-08 17:51         ` Talpey, Thomas
     [not found]           ` <RTPCLUEXC1-PRDjbDt300000076-rtwIt2gI0FxT+ZUat5FNkAK/GNPrWCqfQQ4Iyu8u01E@public.gmane.org>
2008-10-08 18:04             ` Trond Myklebust
2008-10-08 19:05               ` Talpey, Thomas
2008-10-08 15:49   ` [PATCH 13/15] RPC/RDMA: harden connection logic against missing/late rdma_cm upcalls Tom Talpey
2008-10-08 15:49   ` [PATCH 14/15] RPC/RDMA: reformat a debug printk to keep lines together Tom Talpey
2008-10-08 15:49   ` [PATCH 15/15] RPC/RDMA: optionally emit useful transport info upon connect/disconnect Tom Talpey

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20081008154734.1336.75775.stgit@tmt3.nane.netapp.com \
    --to=talpey@netapp.com \
    --cc=linux-nfs@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.