From: Trond Myklebust <trond.myklebust@fys.uio.no>
To: Tom Tucker <tom@opengridcomputing.com>
Cc: Tom Talpey <Thomas.Talpey@netapp.com>, linux-nfs@vger.kernel.org
Subject: Re: [PATCH,RFC 02/02] xprtrdma: Update the RPC memory registration to use FRMR
Date: Wed, 13 Aug 2008 13:40:53 -0400 [thread overview]
Message-ID: <1218649253.9042.11.camel@localhost> (raw)
In-Reply-To: <48A30959.7060404@opengridcomputing.com>
On Wed, 2008-08-13 at 11:18 -0500, Tom Tucker wrote:
> Use FRMR when registering client memory if the memory registration
> strategy is FRMR.
>
> Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
>
> ---
> net/sunrpc/xprtrdma/verbs.c | 296 ++++++++++++++++++++++++++++++++++++++-----
> 1 files changed, 263 insertions(+), 33 deletions(-)
>
> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
> index 8ea283e..ed401f9 100644
> --- a/net/sunrpc/xprtrdma/verbs.c
> +++ b/net/sunrpc/xprtrdma/verbs.c
> @@ -423,6 +423,7 @@ rpcrdma_clean_cq(struct ib_cq *cq)
> int
> rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
> {
> + struct ib_device_attr devattr;
> int rc;
> struct rpcrdma_ia *ia = &xprt->rx_ia;
>
> @@ -443,6 +444,49 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
> }
>
> /*
> + * Query the device to determine if the requested memory
> + * registration strategy is supported. If it isnt't set the
> + * strategy to a globally supported model.
> + */
> + rc = ib_query_device(ia->ri_id->device, &devattr);
> + if (rc) {
> + dprintk("RPC: %s: ib_query_device failed %d\n",
> + __func__, rc);
> + goto out2;
> + }
> + switch (memreg) {
> + case RPCRDMA_MEMWINDOWS:
> + case RPCRDMA_MEMWINDOWS_ASYNC:
> + if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
> + dprintk("RPC: %s: MEMWINDOWS specified but not "
> + "supported, using RPCRDMA_ALLPHYSICAL",
> + __func__);
> + memreg = RPCRDMA_ALLPHYSICAL;
> + }
> + break;
> + case RPCRDMA_MTHCAFMR:
> + if (!ia->ri_id->device->alloc_fmr) {
> + dprintk("RPC: %s: MTHCAFMR specified but not "
> + "supported, using RPCRDMA_ALLPHYSICAL",
> + __func__);
> + memreg = RPCRDMA_ALLPHYSICAL;
> + }
> + break;
> + case RPCRDMA_FASTREG:
> + /* Requires both fast reg and global dma lkey */
> + if ((0 ==
> + (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) ||
> + (0 == (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY))) {
> + dprintk("RPC: %s: FASTREG specified but not "
> + "supported, using RPCRDMA_ALLPHYSICAL",
> + __func__);
> + memreg = RPCRDMA_ALLPHYSICAL;
> + }
> + break;
> + }
> + dprintk("RPC: memory registration strategy is %d\n", memreg);
> +
> + /*
> * Optionally obtain an underlying physical identity mapping in
> * order to do a memory window-based bind. This base registration
> * is protected from remote access - that is enabled only by binding
> @@ -450,7 +494,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
> * revoked after the corresponding completion similar to a storage
> * adapter.
> */
> - if (memreg > RPCRDMA_REGISTER) {
> + if ((memreg > RPCRDMA_REGISTER) && (memreg != RPCRDMA_FASTREG)) {
> int mem_priv = IB_ACCESS_LOCAL_WRITE;
> switch (memreg) {
> #if RPCRDMA_PERSISTENT_REGISTRATION
> @@ -475,7 +519,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
> memreg = RPCRDMA_REGISTER;
> ia->ri_bind_mem = NULL;
> }
> + ia->ri_dma_lkey = ia->ri_bind_mem->lkey;
> }
> + if (memreg == RPCRDMA_FASTREG)
> + ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
>
> /* Else will do memory reg/dereg for each chunk */
> ia->ri_memreg_strategy = memreg;
> @@ -541,6 +588,12 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
> ep->rep_attr.srq = NULL;
> ep->rep_attr.cap.max_send_wr = cdata->max_requests;
> switch (ia->ri_memreg_strategy) {
> + case RPCRDMA_FASTREG:
> + /* Add room for fast reg and invalidate */
> + ep->rep_attr.cap.max_send_wr *= 3;
> + if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
> + return -EINVAL;
> + break;
> case RPCRDMA_MEMWINDOWS_ASYNC:
> case RPCRDMA_MEMWINDOWS:
> /* Add room for mw_binds+unbinds - overkill! */
> @@ -623,6 +676,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
> break;
> case RPCRDMA_MTHCAFMR:
> case RPCRDMA_REGISTER:
> + case RPCRDMA_FASTREG:
> ep->rep_remote_cma.responder_resources = cdata->max_requests *
> (RPCRDMA_MAX_DATA_SEGS / 8);
> break;
> @@ -866,6 +920,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
>
> buf->rb_max_requests = cdata->max_requests;
> spin_lock_init(&buf->rb_lock);
> + spin_lock_init(&buf->rb_frs_lock);
> atomic_set(&buf->rb_credits, 1);
>
> /* Need to allocate:
> @@ -874,6 +929,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
> * 3. array of struct rpcrdma_rep for replies
> * 4. padding, if any
> * 5. mw's, if any
> + * 6. frmr's, if any
> * Send/recv buffers in req/rep need to be registered
> */
>
> @@ -881,6 +937,10 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
> (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
> len += cdata->padding;
> switch (ia->ri_memreg_strategy) {
> + case RPCRDMA_FASTREG:
> + len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
> + sizeof(struct rpcrdma_frmr);
> + break;
> case RPCRDMA_MTHCAFMR:
> /* TBD we are perhaps overallocating here */
> len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
> @@ -895,7 +955,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
> break;
> }
>
> - /* allocate 1, 4 and 5 in one shot */
> + /* allocate 1, 4, 5 and 6 in one shot */
> p = kzalloc(len, GFP_KERNEL);
> if (p == NULL) {
> dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
> @@ -927,7 +987,38 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
> * and also reduce unbind-to-bind collision.
> */
> INIT_LIST_HEAD(&buf->rb_mws);
> + INIT_LIST_HEAD(&buf->rb_frs);
> switch (ia->ri_memreg_strategy) {
> + case RPCRDMA_FASTREG:
> + {
Can we please get rid of this kind of ugliness whereby we insert blocks
just in order to set up a temporary variable. I know Chuck is fond of
it, but as far as I'm concerned it just screws up readability of the
code.
If you feel you need to isolate a variable to a particular block of
code, then make a function, and that's particularly true of these huge
switch statements that are trying to do 100 entirely unrelated things in
one function.
> + struct rpcrdma_frmr *fr = (struct rpcrdma_frmr *)p;
> + for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
> + fr->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
> + RPCRDMA_MAX_SEGS);
> + if (IS_ERR(fr->fr_mr)) {
> + rc = PTR_ERR(fr->fr_mr);
> + printk("RPC: %s: ib_alloc_fast_reg_mr"
> + " failed %i\n", __func__, rc);
> + goto out;
> + }
> + fr->fr_pgl =
> + ib_alloc_fast_reg_page_list(ia->ri_id->device,
> + RPCRDMA_MAX_SEGS);
> + if (IS_ERR(fr->fr_pgl)) {
> + rc = PTR_ERR(fr->fr_pgl);
> + printk("RPC: %s: "
> + "ib_alloc_fast_reg_page_list "
> + "failed %i\n", __func__, rc);
> + goto out;
> + }
> + INIT_LIST_HEAD(&fr->fr_list);
> + list_add(&fr->fr_list, &buf->rb_frs);
> + dprintk("RPC: %s alloc fmr %p pgl %p\n", __func__,
> + fr->fr_mr, fr->fr_pgl);
> + ++fr;
> + }
> + }
^urgh
> + break;
> case RPCRDMA_MTHCAFMR:
> {
> struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
> @@ -1056,6 +1147,49 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
> */
> dprintk("RPC: %s: entering\n", __func__);
>
> + while (!list_empty(&buf->rb_frs)) {
> + struct rpcrdma_frmr *fr =
> + list_entry(buf->rb_frs.next,
> + struct rpcrdma_frmr, fr_list);
> + list_del(&fr->fr_list);
> + rc = ib_dereg_mr(fr->fr_mr);
> + if (rc)
> + dprintk("RPC: %s:"
> + " ib_dereg_mr"
> + " failed %i\n",
> + __func__, rc);
> + ib_free_fast_reg_page_list(fr->fr_pgl);
> + }
> +
> + while (!list_empty(&buf->rb_mws)) {
> + struct rpcrdma_mw *r;
> + switch (ia->ri_memreg_strategy) {
> + case RPCRDMA_MTHCAFMR:
> + r = list_entry(buf->rb_mws.next,
> + struct rpcrdma_mw, mw_list);
> + list_del(&r->mw_list);
> + rc = ib_dealloc_fmr(r->r.fmr);
> + if (rc)
> + dprintk("RPC: %s:"
> + " ib_dealloc_fmr"
> + " failed %i\n",
> + __func__, rc);
> + break;
> + case RPCRDMA_MEMWINDOWS_ASYNC:
> + case RPCRDMA_MEMWINDOWS:
> + r = list_entry(buf->rb_mws.next,
> + struct rpcrdma_mw, mw_list);
> + list_del(&r->mw_list);
> + rc = ib_dealloc_mw(r->r.mw);
> + if (rc)
> + dprintk("RPC: %s: ib_dealloc_mw "
> + "failed %i\n", __func__, rc);
> + break;
> + default:
> + break;
> + }
> + }
> +
> for (i = 0; i < buf->rb_max_requests; i++) {
> if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
> rpcrdma_deregister_internal(ia,
> @@ -1064,33 +1198,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
> kfree(buf->rb_recv_bufs[i]);
> }
> if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
> - while (!list_empty(&buf->rb_mws)) {
> - struct rpcrdma_mw *r;
> - r = list_entry(buf->rb_mws.next,
> - struct rpcrdma_mw, mw_list);
> - list_del(&r->mw_list);
> - switch (ia->ri_memreg_strategy) {
> - case RPCRDMA_MTHCAFMR:
> - rc = ib_dealloc_fmr(r->r.fmr);
> - if (rc)
> - dprintk("RPC: %s:"
> - " ib_dealloc_fmr"
> - " failed %i\n",
> - __func__, rc);
> - break;
> - case RPCRDMA_MEMWINDOWS_ASYNC:
> - case RPCRDMA_MEMWINDOWS:
> - rc = ib_dealloc_mw(r->r.mw);
> - if (rc)
> - dprintk("RPC: %s:"
> - " ib_dealloc_mw"
> - " failed %i\n",
> - __func__, rc);
> - break;
> - default:
> - break;
> - }
> - }
> rpcrdma_deregister_internal(ia,
> buf->rb_send_bufs[i]->rl_handle,
> &buf->rb_send_bufs[i]->rl_iov);
> @@ -1115,6 +1222,7 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
> {
> struct rpcrdma_req *req;
> unsigned long flags;
> + int i;
>
> spin_lock_irqsave(&buffers->rb_lock, flags);
> if (buffers->rb_send_index == buffers->rb_max_requests) {
> @@ -1134,8 +1242,11 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
> buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
> }
> buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
> + for (i = 0; i < RPCRDMA_MAX_SEGS; i++)
> + req->rl_segments[i].mr_chunk.rl_fr = NULL;
> +
> if (!list_empty(&buffers->rb_mws)) {
> - int i = RPCRDMA_MAX_SEGS - 1;
> + i = RPCRDMA_MAX_SEGS - 1;
> do {
> struct rpcrdma_mw *r;
> r = list_entry(buffers->rb_mws.next,
> @@ -1148,6 +1259,31 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
> return req;
> }
>
> +static void
> +rpcrdma_free_frmr(struct rpcrdma_buffer *buf, struct rpcrdma_frmr *fr_mr)
> +{
> + unsigned long flags;
> + spin_lock_irqsave(&buf->rb_frs_lock, flags);
> + list_add(&fr_mr->fr_list, &buf->rb_frs);
> + spin_unlock_irqrestore(&buf->rb_frs_lock, flags);
> +}
> +
> +static struct rpcrdma_frmr *
> +rpcrdma_alloc_frmr(struct rpcrdma_buffer *buf)
> +{
> + unsigned long flags;
> + struct rpcrdma_frmr *fr_mr = NULL;
> +
> + spin_lock_irqsave(&buf->rb_frs_lock, flags);
> + if (!list_empty(&buf->rb_frs)) {
> + fr_mr = list_entry(buf->rb_frs.next,
> + struct rpcrdma_frmr, fr_list);
> + list_del_init(&fr_mr->fr_list);
> + }
> + spin_unlock_irqrestore(&buf->rb_frs_lock, flags);
> + return fr_mr;
> +}
> +
> /*
> * Put request/reply buffers back into pool.
> * Pre-decrement counter/array index.
> @@ -1252,9 +1388,10 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
> va, len, DMA_BIDIRECTIONAL);
> iov->length = len;
>
> - if (ia->ri_bind_mem != NULL) {
> + if (RPCRDMA_FASTREG == ia->ri_memreg_strategy ||
> + ia->ri_bind_mem) {
> *mrp = NULL;
> - iov->lkey = ia->ri_bind_mem->lkey;
> + iov->lkey = ia->ri_dma_lkey;
> return 0;
> }
>
> @@ -1302,6 +1439,43 @@ rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
> /*
> * Wrappers for chunk registration, shared by read/write chunk code.
> */
> +static int
> +rpcrdma_fastreg_seg(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *mr,
> + int nsegs, u32 access)
> +{
> + struct ib_send_wr invalidate_wr, fastreg_wr, *bad_wr;
> + u8 key;
> + u32 rkey = mr->mr_chunk.rl_fr->fr_mr->rkey;
> + int ret;
> +
> + /* Prepare INVALIDATE WR */
> + memset(&invalidate_wr, 0, sizeof invalidate_wr);
> + invalidate_wr.opcode = IB_WR_LOCAL_INV;
> + invalidate_wr.send_flags = IB_SEND_SIGNALED;
> + invalidate_wr.ex.invalidate_rkey = rkey;
> + invalidate_wr.next = &fastreg_wr;
> +
> + /* Bump the key */
> + key = (u8)(mr->mr_chunk.rl_fr->fr_mr->rkey & 0x000000FF);
> + ib_update_fast_reg_key(mr->mr_chunk.rl_fr->fr_mr, ++key);
> +
> + /* Prepare FASTREG WR */
> + memset(&fastreg_wr, 0, sizeof fastreg_wr);
> + fastreg_wr.opcode = IB_WR_FAST_REG_MR;
> + fastreg_wr.send_flags = IB_SEND_SIGNALED;
> + fastreg_wr.wr.fast_reg.iova_start = (unsigned long)mr->mr_dma;
> + fastreg_wr.wr.fast_reg.page_list = mr->mr_chunk.rl_fr->fr_pgl;
> + fastreg_wr.wr.fast_reg.page_list_len = nsegs;
> + fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
> + fastreg_wr.wr.fast_reg.length = nsegs << PAGE_SHIFT;
> + fastreg_wr.wr.fast_reg.access_flags = access;
> + fastreg_wr.wr.fast_reg.rkey = mr->mr_chunk.rl_fr->fr_mr->rkey;
> + ret = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
> + dprintk("RPC: %s fast reg rkey %08x kva %llx map_len "
> + "%d page_list_len %d ret %d\n", __func__,
> + rkey, mr->mr_dma, nsegs << PAGE_SHIFT, nsegs, ret);
> + return ret;
> +}
>
> static void
> rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
> @@ -1353,6 +1527,53 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
> #endif
>
> /* Registration using fast memory registration */
> + case RPCRDMA_FASTREG:
> + {
> + int len, pageoff = offset_in_page(seg->mr_offset);
Ditto...
> +
> + seg1->mr_chunk.rl_fr = rpcrdma_alloc_frmr(&r_xprt->rx_buf);
> + if (!seg1->mr_chunk.rl_fr) {
> + printk("RPC: Failed to allocate frmr\n");
> + rc = -ENOMEM;
> + break;
> + }
> + seg1->mr_offset -= pageoff; /* start of page */
> + seg1->mr_len += pageoff;
> + len = -pageoff;
> + if (nsegs > RPCRDMA_MAX_DATA_SEGS)
> + nsegs = RPCRDMA_MAX_DATA_SEGS;
> + for (i = 0; i < nsegs;) {
> + rpcrdma_map_one(ia, seg, writing);
> + seg1->mr_chunk.rl_fr->fr_pgl->page_list[i] = seg->mr_dma;
> + len += seg->mr_len;
> + ++seg;
> + ++i;
> + /* Check for holes */
> + if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
> + offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
> + break;
> + }
> + nsegs = i;
> + dprintk("RPC: %s: Using fmr %p to map %d segments\n",
> + __func__, seg1->mr_chunk.rl_fr, nsegs);
> + rc = rpcrdma_fastreg_seg(ia, seg1, nsegs, mem_priv);
> + if (rc) {
> + printk("RPC: %s: failed ib_map_phys_fmr "
> + "%u@0x%llx+%i (%d)... status %i\n", __func__,
> + len, (unsigned long long)seg1->mr_dma,
> + pageoff, nsegs, rc);
> + while (nsegs--)
> + rpcrdma_unmap_one(ia, --seg);
> + } else {
> + seg1->mr_rkey = seg1->mr_chunk.rl_fr->fr_mr->rkey;
> + seg1->mr_base = seg1->mr_dma + pageoff;
> + seg1->mr_nsegs = nsegs;
> + seg1->mr_len = len;
> + }
> + }
> + break;
> +
> + /* Registration using MTHCA FMR */
> case RPCRDMA_MTHCAFMR:
> {
> u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
> @@ -1486,6 +1707,16 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
> break;
> #endif
>
> + case RPCRDMA_FASTREG:
> + while (seg1->mr_nsegs--) {
> + if (seg1->mr_chunk.rl_fr) {
> + rpcrdma_free_frmr(&r_xprt->rx_buf, seg1->mr_chunk.rl_fr);
> + seg1->mr_chunk.rl_fr = NULL;
> + }
> + rpcrdma_unmap_one(ia, seg++);
> + }
> + break;
> +
> case RPCRDMA_MTHCAFMR:
> {
> LIST_HEAD(l);
> @@ -1590,7 +1821,6 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
> INIT_CQCOUNT(ep);
> send_wr.send_flags = IB_SEND_SIGNALED;
> }
> -
> rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
> if (rc)
> dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
next prev parent reply other threads:[~2008-08-13 17:41 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-08-13 16:18 [PATCH,RFC 02/02] xprtrdma: Update the RPC memory registration to use FRMR Tom Tucker
2008-08-13 17:40 ` Trond Myklebust [this message]
2008-08-13 22:30 ` Chuck Lever
2008-08-13 22:35 ` Tom Tucker
2008-08-13 22:37 ` Trond Myklebust
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1218649253.9042.11.camel@localhost \
--to=trond.myklebust@fys.uio.no \
--cc=Thomas.Talpey@netapp.com \
--cc=linux-nfs@vger.kernel.org \
--cc=tom@opengridcomputing.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox