Linux NFS development
 help / color / mirror / Atom feed
* [PATCH,RFC 02/02] xprtrdma: Update the RPC memory registration to use FRMR
@ 2008-08-13 16:18 Tom Tucker
  2008-08-13 17:40 ` Trond Myklebust
  0 siblings, 1 reply; 5+ messages in thread
From: Tom Tucker @ 2008-08-13 16:18 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: Tom Talpey, linux-nfs


Use FRMR when registering client memory if the memory registration
strategy is FRMR.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>

---
  net/sunrpc/xprtrdma/verbs.c |  296 ++++++++++++++++++++++++++++++++++++++-----
  1 files changed, 263 insertions(+), 33 deletions(-)

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 8ea283e..ed401f9 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -423,6 +423,7 @@ rpcrdma_clean_cq(struct ib_cq *cq)
  int
  rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
  {
+	struct ib_device_attr devattr;
  	int rc;
  	struct rpcrdma_ia *ia = &xprt->rx_ia;

@@ -443,6 +444,49 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
  	}

  	/*
+	 * Query the device to determine if the requested memory
+	 * registration strategy is supported. If it isnt't set the
+	 * strategy to a globally supported model.
+	 */
+	rc = ib_query_device(ia->ri_id->device, &devattr);
+	if (rc) {
+		dprintk("RPC:       %s: ib_query_device failed %d\n",
+			__func__, rc);
+		goto out2;
+	}
+	switch (memreg) {
+	case RPCRDMA_MEMWINDOWS:
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+		if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
+			dprintk("RPC:       %s: MEMWINDOWS specified but not "
+				"supported, using RPCRDMA_ALLPHYSICAL",
+				__func__);
+			memreg = RPCRDMA_ALLPHYSICAL;
+		}
+		break;
+	case RPCRDMA_MTHCAFMR:
+		if (!ia->ri_id->device->alloc_fmr) {
+			dprintk("RPC:       %s: MTHCAFMR specified but not "
+				"supported, using RPCRDMA_ALLPHYSICAL",
+				__func__);
+			memreg = RPCRDMA_ALLPHYSICAL;
+		}
+		break;
+	case RPCRDMA_FASTREG:
+		/* Requires both fast reg and global dma lkey */
+		if ((0 ==
+		     (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) ||
+		    (0 == (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY))) {
+			dprintk("RPC:       %s: FASTREG specified but not "
+				"supported, using RPCRDMA_ALLPHYSICAL",
+				__func__);
+			memreg = RPCRDMA_ALLPHYSICAL;
+		}
+		break;
+	}
+	dprintk("RPC:       memory registration strategy is %d\n", memreg);
+
+	/*
  	 * Optionally obtain an underlying physical identity mapping in
  	 * order to do a memory window-based bind. This base registration
  	 * is protected from remote access - that is enabled only by binding
@@ -450,7 +494,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
  	 * revoked after the corresponding completion similar to a storage
  	 * adapter.
  	 */
-	if (memreg > RPCRDMA_REGISTER) {
+	if ((memreg > RPCRDMA_REGISTER) && (memreg != RPCRDMA_FASTREG)) {
  		int mem_priv = IB_ACCESS_LOCAL_WRITE;
  		switch (memreg) {
  #if RPCRDMA_PERSISTENT_REGISTRATION
@@ -475,7 +519,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
  			memreg = RPCRDMA_REGISTER;
  			ia->ri_bind_mem = NULL;
  		}
+		ia->ri_dma_lkey = ia->ri_bind_mem->lkey;
  	}
+	if (memreg == RPCRDMA_FASTREG)
+		ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;

  	/* Else will do memory reg/dereg for each chunk */
  	ia->ri_memreg_strategy = memreg;
@@ -541,6 +588,12 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
  	ep->rep_attr.srq = NULL;
  	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
  	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_FASTREG:
+		/* Add room for fast reg and invalidate */
+		ep->rep_attr.cap.max_send_wr *= 3;
+		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
+			return -EINVAL;
+		break;
  	case RPCRDMA_MEMWINDOWS_ASYNC:
  	case RPCRDMA_MEMWINDOWS:
  		/* Add room for mw_binds+unbinds - overkill! */
@@ -623,6 +676,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
  		break;
  	case RPCRDMA_MTHCAFMR:
  	case RPCRDMA_REGISTER:
+	case RPCRDMA_FASTREG:
  		ep->rep_remote_cma.responder_resources = cdata->max_requests *
  				(RPCRDMA_MAX_DATA_SEGS / 8);
  		break;
@@ -866,6 +920,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,

  	buf->rb_max_requests = cdata->max_requests;
  	spin_lock_init(&buf->rb_lock);
+	spin_lock_init(&buf->rb_frs_lock);
  	atomic_set(&buf->rb_credits, 1);

  	/* Need to allocate:
@@ -874,6 +929,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
  	 *   3.  array of struct rpcrdma_rep for replies
  	 *   4.  padding, if any
  	 *   5.  mw's, if any
+	 *   6.  frmr's, if any
  	 * Send/recv buffers in req/rep need to be registered
  	 */

@@ -881,6 +937,10 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
  		(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
  	len += cdata->padding;
  	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_FASTREG:
+		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
+				sizeof(struct rpcrdma_frmr);
+		break;
  	case RPCRDMA_MTHCAFMR:
  		/* TBD we are perhaps overallocating here */
  		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
@@ -895,7 +955,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
  		break;
  	}

-	/* allocate 1, 4 and 5 in one shot */
+	/* allocate 1, 4, 5 and 6 in one shot */
  	p = kzalloc(len, GFP_KERNEL);
  	if (p == NULL) {
  		dprintk("RPC:       %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
@@ -927,7 +987,38 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
  	 * and also reduce unbind-to-bind collision.
  	 */
  	INIT_LIST_HEAD(&buf->rb_mws);
+	INIT_LIST_HEAD(&buf->rb_frs);
  	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_FASTREG:
+		{
+		struct rpcrdma_frmr *fr = (struct rpcrdma_frmr *)p;
+		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
+			fr->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
+							 RPCRDMA_MAX_SEGS);
+			if (IS_ERR(fr->fr_mr)) {
+				rc = PTR_ERR(fr->fr_mr);
+				printk("RPC:       %s: ib_alloc_fast_reg_mr"
+					" failed %i\n", __func__, rc);
+				goto out;
+			}
+			fr->fr_pgl =
+				ib_alloc_fast_reg_page_list(ia->ri_id->device,
+							    RPCRDMA_MAX_SEGS);
+			if (IS_ERR(fr->fr_pgl)) {
+				rc = PTR_ERR(fr->fr_pgl);
+				printk("RPC:       %s: "
+					"ib_alloc_fast_reg_page_list "
+					"failed %i\n", __func__, rc);
+				goto out;
+			}
+			INIT_LIST_HEAD(&fr->fr_list);
+			list_add(&fr->fr_list, &buf->rb_frs);
+			dprintk("RPC:       %s alloc fmr %p pgl %p\n", __func__,
+				fr->fr_mr, fr->fr_pgl);
+			++fr;
+		}
+		}
+		break;
  	case RPCRDMA_MTHCAFMR:
  		{
  		struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
@@ -1056,6 +1147,49 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
  	 */
  	dprintk("RPC:       %s: entering\n", __func__);

+	while (!list_empty(&buf->rb_frs)) {
+		struct rpcrdma_frmr *fr =
+			list_entry(buf->rb_frs.next,
+				   struct rpcrdma_frmr, fr_list);
+		list_del(&fr->fr_list);
+		rc = ib_dereg_mr(fr->fr_mr);
+		if (rc)
+			dprintk("RPC:       %s:"
+				" ib_dereg_mr"
+				" failed %i\n",
+				__func__, rc);
+		ib_free_fast_reg_page_list(fr->fr_pgl);
+	}
+
+	while (!list_empty(&buf->rb_mws)) {
+		struct rpcrdma_mw *r;
+		switch (ia->ri_memreg_strategy) {
+		case RPCRDMA_MTHCAFMR:
+			r = list_entry(buf->rb_mws.next,
+				       struct rpcrdma_mw, mw_list);
+			list_del(&r->mw_list);
+			rc = ib_dealloc_fmr(r->r.fmr);
+			if (rc)
+				dprintk("RPC:       %s:"
+					" ib_dealloc_fmr"
+					" failed %i\n",
+					__func__, rc);
+			break;
+		case RPCRDMA_MEMWINDOWS_ASYNC:
+		case RPCRDMA_MEMWINDOWS:
+			r = list_entry(buf->rb_mws.next,
+				       struct rpcrdma_mw, mw_list);
+			list_del(&r->mw_list);
+			rc = ib_dealloc_mw(r->r.mw);
+			if (rc)
+				dprintk("RPC:       %s: ib_dealloc_mw "
+					"failed %i\n", __func__, rc);
+			break;
+		default:
+			break;
+		}
+	}
+
  	for (i = 0; i < buf->rb_max_requests; i++) {
  		if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
  			rpcrdma_deregister_internal(ia,
@@ -1064,33 +1198,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
  			kfree(buf->rb_recv_bufs[i]);
  		}
  		if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
-			while (!list_empty(&buf->rb_mws)) {
-				struct rpcrdma_mw *r;
-				r = list_entry(buf->rb_mws.next,
-					struct rpcrdma_mw, mw_list);
-				list_del(&r->mw_list);
-				switch (ia->ri_memreg_strategy) {
-				case RPCRDMA_MTHCAFMR:
-					rc = ib_dealloc_fmr(r->r.fmr);
-					if (rc)
-						dprintk("RPC:       %s:"
-							" ib_dealloc_fmr"
-							" failed %i\n",
-							__func__, rc);
-					break;
-				case RPCRDMA_MEMWINDOWS_ASYNC:
-				case RPCRDMA_MEMWINDOWS:
-					rc = ib_dealloc_mw(r->r.mw);
-					if (rc)
-						dprintk("RPC:       %s:"
-							" ib_dealloc_mw"
-							" failed %i\n",
-							__func__, rc);
-					break;
-				default:
-					break;
-				}
-			}
  			rpcrdma_deregister_internal(ia,
  					buf->rb_send_bufs[i]->rl_handle,
  					&buf->rb_send_bufs[i]->rl_iov);
@@ -1115,6 +1222,7 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
  {
  	struct rpcrdma_req *req;
  	unsigned long flags;
+	int i;

  	spin_lock_irqsave(&buffers->rb_lock, flags);
  	if (buffers->rb_send_index == buffers->rb_max_requests) {
@@ -1134,8 +1242,11 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
  		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
  	}
  	buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
+	for (i = 0; i < RPCRDMA_MAX_SEGS; i++)
+		req->rl_segments[i].mr_chunk.rl_fr = NULL;
+
  	if (!list_empty(&buffers->rb_mws)) {
-		int i = RPCRDMA_MAX_SEGS - 1;
+		i = RPCRDMA_MAX_SEGS - 1;
  		do {
  			struct rpcrdma_mw *r;
  			r = list_entry(buffers->rb_mws.next,
@@ -1148,6 +1259,31 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
  	return req;
  }

+static void
+rpcrdma_free_frmr(struct rpcrdma_buffer *buf, struct rpcrdma_frmr *fr_mr)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&buf->rb_frs_lock, flags);
+	list_add(&fr_mr->fr_list, &buf->rb_frs);
+	spin_unlock_irqrestore(&buf->rb_frs_lock, flags);
+}
+
+static struct rpcrdma_frmr *
+rpcrdma_alloc_frmr(struct rpcrdma_buffer *buf)
+{
+	unsigned long flags;
+	struct rpcrdma_frmr *fr_mr = NULL;
+
+	spin_lock_irqsave(&buf->rb_frs_lock, flags);
+	if (!list_empty(&buf->rb_frs)) {
+		fr_mr = list_entry(buf->rb_frs.next,
+				   struct rpcrdma_frmr, fr_list);
+		list_del_init(&fr_mr->fr_list);
+	}
+	spin_unlock_irqrestore(&buf->rb_frs_lock, flags);
+	return fr_mr;
+}
+
  /*
   * Put request/reply buffers back into pool.
   * Pre-decrement counter/array index.
@@ -1252,9 +1388,10 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
  			va, len, DMA_BIDIRECTIONAL);
  	iov->length = len;

-	if (ia->ri_bind_mem != NULL) {
+	if (RPCRDMA_FASTREG == ia->ri_memreg_strategy ||
+	    ia->ri_bind_mem) {
  		*mrp = NULL;
-		iov->lkey = ia->ri_bind_mem->lkey;
+		iov->lkey = ia->ri_dma_lkey;
  		return 0;
  	}

@@ -1302,6 +1439,43 @@ rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
  /*
   * Wrappers for chunk registration, shared by read/write chunk code.
   */
+static int
+rpcrdma_fastreg_seg(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *mr,
+		    int nsegs, u32 access)
+{
+	struct ib_send_wr invalidate_wr, fastreg_wr, *bad_wr;
+	u8 key;
+	u32 rkey = mr->mr_chunk.rl_fr->fr_mr->rkey;
+	int ret;
+
+	/* Prepare INVALIDATE WR */
+	memset(&invalidate_wr, 0, sizeof invalidate_wr);
+	invalidate_wr.opcode = IB_WR_LOCAL_INV;
+	invalidate_wr.send_flags = IB_SEND_SIGNALED;
+	invalidate_wr.ex.invalidate_rkey = rkey;
+	invalidate_wr.next = &fastreg_wr;
+
+	/* Bump the key */
+	key = (u8)(mr->mr_chunk.rl_fr->fr_mr->rkey & 0x000000FF);
+	ib_update_fast_reg_key(mr->mr_chunk.rl_fr->fr_mr, ++key);
+
+	/* Prepare FASTREG WR */
+	memset(&fastreg_wr, 0, sizeof fastreg_wr);
+	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
+	fastreg_wr.send_flags = IB_SEND_SIGNALED;
+	fastreg_wr.wr.fast_reg.iova_start = (unsigned long)mr->mr_dma;
+	fastreg_wr.wr.fast_reg.page_list = mr->mr_chunk.rl_fr->fr_pgl;
+	fastreg_wr.wr.fast_reg.page_list_len = nsegs;
+	fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+	fastreg_wr.wr.fast_reg.length = nsegs << PAGE_SHIFT;
+	fastreg_wr.wr.fast_reg.access_flags = access;
+	fastreg_wr.wr.fast_reg.rkey = mr->mr_chunk.rl_fr->fr_mr->rkey;
+	ret = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
+	dprintk("RPC:       %s fast reg rkey %08x kva %llx map_len "
+		"%d page_list_len %d ret %d\n", __func__,
+		rkey, mr->mr_dma, nsegs << PAGE_SHIFT, nsegs, ret);
+	return ret;
+}

  static void
  rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
@@ -1353,6 +1527,53 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
  #endif

  	/* Registration using fast memory registration */
+	case RPCRDMA_FASTREG:
+		{
+		int len, pageoff = offset_in_page(seg->mr_offset);
+
+		seg1->mr_chunk.rl_fr = rpcrdma_alloc_frmr(&r_xprt->rx_buf);
+		if (!seg1->mr_chunk.rl_fr) {
+			printk("RPC:       Failed to allocate frmr\n");
+			rc = -ENOMEM;
+			break;
+		}
+		seg1->mr_offset -= pageoff;	/* start of page */
+		seg1->mr_len += pageoff;
+		len = -pageoff;
+		if (nsegs > RPCRDMA_MAX_DATA_SEGS)
+			nsegs = RPCRDMA_MAX_DATA_SEGS;
+		for (i = 0; i < nsegs;) {
+			rpcrdma_map_one(ia, seg, writing);
+			seg1->mr_chunk.rl_fr->fr_pgl->page_list[i] = seg->mr_dma;
+			len += seg->mr_len;
+			++seg;
+			++i;
+			/* Check for holes */
+			if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
+			    offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
+				break;
+		}
+		nsegs = i;
+		dprintk("RPC:       %s: Using fmr %p to map %d segments\n",
+			__func__, seg1->mr_chunk.rl_fr, nsegs);
+		rc = rpcrdma_fastreg_seg(ia, seg1, nsegs, mem_priv);
+		if (rc) {
+			printk("RPC:       %s: failed ib_map_phys_fmr "
+				"%u@0x%llx+%i (%d)... status %i\n", __func__,
+				len, (unsigned long long)seg1->mr_dma,
+				pageoff, nsegs, rc);
+			while (nsegs--)
+				rpcrdma_unmap_one(ia, --seg);
+		} else {
+			seg1->mr_rkey = seg1->mr_chunk.rl_fr->fr_mr->rkey;
+			seg1->mr_base = seg1->mr_dma + pageoff;
+			seg1->mr_nsegs = nsegs;
+			seg1->mr_len = len;
+		}
+		}
+		break;
+
+	/* Registration using MTHCA FMR */
  	case RPCRDMA_MTHCAFMR:
  		{
  		u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
@@ -1486,6 +1707,16 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
  		break;
  #endif

+	case RPCRDMA_FASTREG:
+		while (seg1->mr_nsegs--) {
+			if (seg1->mr_chunk.rl_fr) {
+				rpcrdma_free_frmr(&r_xprt->rx_buf, seg1->mr_chunk.rl_fr);
+				seg1->mr_chunk.rl_fr = NULL;
+			}
+			rpcrdma_unmap_one(ia, seg++);
+		}
+		break;
+
  	case RPCRDMA_MTHCAFMR:
  		{
  		LIST_HEAD(l);
@@ -1590,7 +1821,6 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
  		INIT_CQCOUNT(ep);
  		send_wr.send_flags = IB_SEND_SIGNALED;
  	}
-
  	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
  	if (rc)
  		dprintk("RPC:       %s: ib_post_send returned %i\n", __func__,

^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2008-08-13 22:40 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-08-13 16:18 [PATCH,RFC 02/02] xprtrdma: Update the RPC memory registration to use FRMR Tom Tucker
2008-08-13 17:40 ` Trond Myklebust
2008-08-13 22:30   ` Chuck Lever
2008-08-13 22:35     ` Tom Tucker
2008-08-13 22:37       ` Trond Myklebust

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox