All of lore.kernel.org
 help / color / mirror / Atom feed
From: Tom Tucker <tom@opengridcomputing.com>
To: bfields@fieldses.org
Cc: thomas.talpey@netapp.com, linux-nfs@vger.kernel.org,
	Tom Tucker <tom@opengridcomputing.com>
Subject: [PATCH 07/12] svcrdma: Modify the RPC recv path to use FRMR when available
Date: Fri,  3 Oct 2008 16:33:44 -0500	[thread overview]
Message-ID: <1223069629-5267-8-git-send-email-tom@opengridcomputing.com> (raw)
In-Reply-To: <1223069629-5267-7-git-send-email-tom@opengridcomputing.com>

RPCRDMA requests that specify a read-list are fetched with RDMA_READ. Using
an FRMR to map the data sink improves NFSRDMA security on transports that
place the RDMA_READ data sink LKEY on the wire because the valid lifetime
of the MR is only the duration of the RDMA_READ. The LKEY is invalidated
when the last RDMA_READ WR completes.

Mapping the data sink also allows for very large amounts to data to be
fetched with a single WR, so if the client is also using FRMR, the entire
RPC read-list can be fetched with a single WR.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>

---
 include/linux/sunrpc/svc_rdma.h          |    1 +
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c  |  187 ++++++++++++++++++++++++++----
 net/sunrpc/xprtrdma/svc_rdma_transport.c |    5 +-
 3 files changed, 171 insertions(+), 22 deletions(-)

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 1402d19..c14fe86 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -212,6 +212,7 @@ extern int svc_rdma_post_recv(struct svcxprt_rdma *);
 extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *);
 extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *);
 extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int);
+extern void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt);
 extern struct svc_rdma_req_map *svc_rdma_get_req_map(void);
 extern void svc_rdma_put_req_map(struct svc_rdma_req_map *);
 extern int svc_rdma_fastreg(struct svcxprt_rdma *, struct svc_rdma_fastreg_mr *);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 74de31a..a475657 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -116,7 +116,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
  *
  * Assumptions:
  * - chunk[0]->position points to pages[0] at an offset of 0
- * - pages[] is not physically or virtually contigous and consists of
+ * - pages[] is not physically or virtually contiguous and consists of
  *   PAGE_SIZE elements.
  *
  * Output:
@@ -125,7 +125,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
  *   chunk in the read list
  *
  */
-static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
+static int map_read_chunks(struct svcxprt_rdma *xprt,
 			   struct svc_rqst *rqstp,
 			   struct svc_rdma_op_ctxt *head,
 			   struct rpcrdma_msg *rmsgp,
@@ -211,26 +211,128 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
 	return sge_no;
 }
 
-static void rdma_set_ctxt_sge(struct svcxprt_rdma *xprt,
-			      struct svc_rdma_op_ctxt *ctxt,
-			      struct kvec *vec,
-			      u64 *sgl_offset,
-			      int count)
+/* Map a read-chunk-list to an XDR and fast register the page-list.
+ *
+ * Assumptions:
+ * - chunk[0]	position points to pages[0] at an offset of 0
+ * - pages[]	will be made physically contiguous by creating a one-off memory
+ *		region using the fastreg verb.
+ * - byte_count is # of bytes in read-chunk-list
+ * - ch_count	is # of chunks in read-chunk-list
+ *
+ * Output:
+ * - sge array pointing into pages[] array.
+ * - chunk_sge array specifying sge index and count for each
+ *   chunk in the read list
+ */
+static int fast_reg_read_chunks(struct svcxprt_rdma *xprt,
+				struct svc_rqst *rqstp,
+				struct svc_rdma_op_ctxt *head,
+				struct rpcrdma_msg *rmsgp,
+				struct svc_rdma_req_map *rpl_map,
+				struct svc_rdma_req_map *chl_map,
+				int ch_count,
+				int byte_count)
+{
+	int page_no;
+	int ch_no;
+	u32 offset;
+	struct rpcrdma_read_chunk *ch;
+	struct svc_rdma_fastreg_mr *frmr;
+	int ret = 0;
+
+	frmr = svc_rdma_get_frmr(xprt);
+	if (IS_ERR(frmr))
+		return -ENOMEM;
+
+	head->frmr = frmr;
+	head->arg.head[0] = rqstp->rq_arg.head[0];
+	head->arg.tail[0] = rqstp->rq_arg.tail[0];
+	head->arg.pages = &head->pages[head->count];
+	head->hdr_count = head->count; /* save count of hdr pages */
+	head->arg.page_base = 0;
+	head->arg.page_len = byte_count;
+	head->arg.len = rqstp->rq_arg.len + byte_count;
+	head->arg.buflen = rqstp->rq_arg.buflen + byte_count;
+
+	/* Fast register the page list */
+	frmr->kva = page_address(rqstp->rq_arg.pages[0]);
+	frmr->direction = DMA_FROM_DEVICE;
+	frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE);
+	frmr->map_len = byte_count;
+	frmr->page_list_len = PAGE_ALIGN(byte_count) >> PAGE_SHIFT;
+	for (page_no = 0; page_no < frmr->page_list_len; page_no++) {
+		frmr->page_list->page_list[page_no] =
+			ib_dma_map_single(xprt->sc_cm_id->device,
+					  page_address(rqstp->rq_arg.pages[page_no]),
+					  PAGE_SIZE, DMA_TO_DEVICE);
+		if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+					 frmr->page_list->page_list[page_no]))
+			goto fatal_err;
+		atomic_inc(&xprt->sc_dma_used);
+		head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
+	}
+	head->count += page_no;
+
+	/* rq_respages points one past arg pages */
+	rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
+
+	/* Create the reply and chunk maps */
+	offset = 0;
+	ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+	for (ch_no = 0; ch_no < ch_count; ch_no++) {
+		rpl_map->sge[ch_no].iov_base = frmr->kva + offset;
+		rpl_map->sge[ch_no].iov_len = ch->rc_target.rs_length;
+		chl_map->ch[ch_no].count = 1;
+		chl_map->ch[ch_no].start = ch_no;
+		offset += ch->rc_target.rs_length;
+		ch++;
+	}
+
+	ret = svc_rdma_fastreg(xprt, frmr);
+	if (ret)
+		goto fatal_err;
+
+	return ch_no;
+
+ fatal_err:
+	printk("svcrdma: error fast registering xdr for xprt %p", xprt);
+	svc_rdma_put_frmr(xprt, frmr);
+	return -EIO;
+}
+
+static int rdma_set_ctxt_sge(struct svcxprt_rdma *xprt,
+			     struct svc_rdma_op_ctxt *ctxt,
+			     struct svc_rdma_fastreg_mr *frmr,
+			     struct kvec *vec,
+			     u64 *sgl_offset,
+			     int count)
 {
 	int i;
 
 	ctxt->count = count;
 	ctxt->direction = DMA_FROM_DEVICE;
 	for (i = 0; i < count; i++) {
-		atomic_inc(&xprt->sc_dma_used);
-		ctxt->sge[i].addr =
-			ib_dma_map_single(xprt->sc_cm_id->device,
-					  vec[i].iov_base, vec[i].iov_len,
-					  DMA_FROM_DEVICE);
+		ctxt->sge[i].length = 0; /* in case map fails */
+		if (!frmr) {
+			ctxt->sge[i].addr =
+				ib_dma_map_single(xprt->sc_cm_id->device,
+						  vec[i].iov_base,
+						  vec[i].iov_len,
+						  DMA_FROM_DEVICE);
+			if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+						 ctxt->sge[i].addr))
+				return -EINVAL;
+			ctxt->sge[i].lkey = xprt->sc_dma_lkey;
+			atomic_inc(&xprt->sc_dma_used);
+		} else {
+			ctxt->sge[i].addr = (unsigned long)vec[i].iov_base;
+			ctxt->sge[i].lkey = frmr->mr->lkey;
+		}
 		ctxt->sge[i].length = vec[i].iov_len;
-		ctxt->sge[i].lkey = xprt->sc_phys_mr->lkey;
 		*sgl_offset = *sgl_offset + vec[i].iov_len;
 	}
+	return 0;
 }
 
 static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
@@ -278,6 +380,7 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
 			 struct svc_rdma_op_ctxt *hdr_ctxt)
 {
 	struct ib_send_wr read_wr;
+	struct ib_send_wr inv_wr;
 	int err = 0;
 	int ch_no;
 	int ch_count;
@@ -301,9 +404,20 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
 	svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count);
 	if (ch_count > RPCSVC_MAXPAGES)
 		return -EINVAL;
-	sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp,
-				    rpl_map, chl_map,
-				    ch_count, byte_count);
+
+	if (!xprt->sc_frmr_pg_list_len)
+		sge_count = map_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp,
+					    rpl_map, chl_map, ch_count,
+					    byte_count);
+	else
+		sge_count = fast_reg_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp,
+						 rpl_map, chl_map, ch_count,
+						 byte_count);
+	if (sge_count < 0) {
+		err = -EIO;
+		goto out;
+	}
+
 	sgl_offset = 0;
 	ch_no = 0;
 
@@ -312,13 +426,16 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
 next_sge:
 		ctxt = svc_rdma_get_context(xprt);
 		ctxt->direction = DMA_FROM_DEVICE;
+		ctxt->frmr = hdr_ctxt->frmr;
+		ctxt->read_hdr = NULL;
 		clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+		clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
 
 		/* Prepare READ WR */
 		memset(&read_wr, 0, sizeof read_wr);
-		ctxt->wr_op = IB_WR_RDMA_READ;
 		read_wr.wr_id = (unsigned long)ctxt;
 		read_wr.opcode = IB_WR_RDMA_READ;
+		ctxt->wr_op = read_wr.opcode;
 		read_wr.send_flags = IB_SEND_SIGNALED;
 		read_wr.wr.rdma.rkey = ch->rc_target.rs_handle;
 		read_wr.wr.rdma.remote_addr =
@@ -327,10 +444,15 @@ next_sge:
 		read_wr.sg_list = ctxt->sge;
 		read_wr.num_sge =
 			rdma_read_max_sge(xprt, chl_map->ch[ch_no].count);
-		rdma_set_ctxt_sge(xprt, ctxt,
-				  &rpl_map->sge[chl_map->ch[ch_no].start],
-				  &sgl_offset,
-				  read_wr.num_sge);
+		err = rdma_set_ctxt_sge(xprt, ctxt, hdr_ctxt->frmr,
+					&rpl_map->sge[chl_map->ch[ch_no].start],
+					&sgl_offset,
+					read_wr.num_sge);
+		if (err) {
+			svc_rdma_unmap_dma(ctxt);
+			svc_rdma_put_context(ctxt, 0);
+			goto out;
+		}
 		if (((ch+1)->rc_discrim == 0) &&
 		    (read_wr.num_sge == chl_map->ch[ch_no].count)) {
 			/*
@@ -339,6 +461,29 @@ next_sge:
 			 * the client and the RPC needs to be enqueued.
 			 */
 			set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+			if (hdr_ctxt->frmr) {
+				set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
+				/*
+				 * Invalidate the local MR used to map the data
+				 * sink.
+				 */
+				if (xprt->sc_dev_caps &
+				    SVCRDMA_DEVCAP_READ_W_INV) {
+					read_wr.opcode =
+						IB_WR_RDMA_READ_WITH_INV;
+					ctxt->wr_op = read_wr.opcode;
+					read_wr.ex.invalidate_rkey =
+						ctxt->frmr->mr->lkey;
+				} else {
+					/* Prepare INVALIDATE WR */
+					memset(&inv_wr, 0, sizeof inv_wr);
+					inv_wr.opcode = IB_WR_LOCAL_INV;
+					inv_wr.send_flags = IB_SEND_SIGNALED;
+					inv_wr.ex.invalidate_rkey =
+						hdr_ctxt->frmr->mr->lkey;
+					read_wr.next = &inv_wr;
+				}
+			}
 			ctxt->read_hdr = hdr_ctxt;
 		}
 		/* Post the read */
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 5ea7901..30778a5 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -105,7 +105,7 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
 	return ctxt;
 }
 
-static void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
+void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
 {
 	struct svcxprt_rdma *xprt = ctxt->xprt;
 	int i;
@@ -343,9 +343,12 @@ static void process_context(struct svcxprt_rdma *xprt,
 		break;
 
 	case IB_WR_RDMA_READ:
+	case IB_WR_RDMA_READ_WITH_INV:
 		if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
 			struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
 			BUG_ON(!read_hdr);
+			if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags))
+				svc_rdma_put_frmr(xprt, ctxt->frmr);
 			spin_lock_bh(&xprt->sc_rq_dto_lock);
 			set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
 			list_add_tail(&read_hdr->dto_q,

  reply	other threads:[~2008-10-03 21:33 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-10-03 21:33 [PATCH 00/12] svcrdma: Fast memory registration support Tom Tucker
2008-10-03 21:33 ` [PATCH 01/12] svcrdma: Add Fast Reg MR Data Types Tom Tucker
2008-10-03 21:33   ` [PATCH 02/12] svcrdma: Add FRMR get/put services Tom Tucker
2008-10-03 21:33     ` [PATCH 03/12] svcrdma: Query device for Fast Reg support during connection setup Tom Tucker
2008-10-03 21:33       ` [PATCH 04/12] svcrdma: Add a service to register a Fast Reg MR with the device Tom Tucker
2008-10-03 21:33         ` [PATCH 05/12] svcrdma: Modify post recv path to use local dma key Tom Tucker
2008-10-03 21:33           ` [PATCH 06/12] svcrdma: Add support to svc_rdma_send to handle chained WR Tom Tucker
2008-10-03 21:33             ` Tom Tucker [this message]
2008-10-03 21:33               ` [PATCH 08/12] svcrdma: Modify the RPC reply path to use FRMR when available Tom Tucker
2008-10-03 21:33                 ` [PATCH 09/12] svcrdma: Update svc_rdma_send_error to use DMA LKEY Tom Tucker
2008-10-03 21:33                   ` [PATCH 10/12] svcrdma: Fix IRD/ORD polarity Tom Tucker
2008-10-03 21:33                     ` [PATCH 11/12] svcrdma: Add a message log string to indicate if FastReg is being used Tom Tucker
2008-10-03 21:33                       ` [PATCH 12/12] svcrdma: Documentation update for the FastReg memory model Tom Tucker
2008-10-08 22:48                       ` [PATCH 11/12] svcrdma: Add a message log string to indicate if FastReg is being used J. Bruce Fields
2008-10-09  6:37                         ` Tom Tucker
2008-10-09 16:26                           ` J. Bruce Fields
2008-10-09 18:46                             ` Tom Tucker
2008-10-10 21:02                               ` J. Bruce Fields
2008-10-13  2:18                                 ` Tom Tucker
2008-10-13  2:20                                   ` Tom Tucker
2008-10-22 20:23                                     ` J. Bruce Fields
2008-10-22 21:37                                       ` Tom Tucker
2008-10-04  1:05     ` [PATCH 02/12] svcrdma: Add FRMR get/put services Tom Tucker
2008-10-06 20:02     ` Tom Tucker
2008-10-08 22:26 ` [PATCH 00/12] svcrdma: Fast memory registration support J. Bruce Fields

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1223069629-5267-8-git-send-email-tom@opengridcomputing.com \
    --to=tom@opengridcomputing.com \
    --cc=bfields@fieldses.org \
    --cc=linux-nfs@vger.kernel.org \
    --cc=thomas.talpey@netapp.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.