From: Chuck Lever <chuck.lever@oracle.com>
To: linux-rdma@vger.kernel.org, linux-nfs@vger.kernel.org
Subject: [PATCH v1 07/22] xprtrdma: Support Write+Reply Replies
Date: Mon, 10 Sep 2018 11:09:32 -0400 [thread overview]
Message-ID: <20180910150932.10564.1879.stgit@manet.1015granger.net> (raw)
In-Reply-To: <20180910150040.10564.97487.stgit@manet.1015granger.net>
Currently the client handles a large NFS READ request by providing
the server with a Write chunk, and expecting that the non-payload
part of the RPC Reply will always fit inline.
When the inline threshold is small (for instance, when talking to a
server that uses a 1024-byte threshold), the non-payload part of the
Reply might not fit inline in certain rare cases. The server then has
to drop the Reply or return an ERR_CHUNK, and the RPC transaction fails.
Let's add a little logic to recognize when the non-payload part of
an NFS READ Reply might be large, and marshal both a Write chunk and
a Reply chunk to enable the server to send the payload in the Write
chunk and the large non-payload part in the Reply chunk.
I've never seen this failure in the wild.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
include/trace/events/rpcrdma.h | 4 ++
net/sunrpc/xprtrdma/rpc_rdma.c | 63 +++++++++++++++++++++++++--------------
net/sunrpc/xprtrdma/xprt_rdma.h | 3 +-
3 files changed, 46 insertions(+), 24 deletions(-)
diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index b9e6802..cd3e5e7 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -446,6 +446,7 @@
TRACE_DEFINE_ENUM(rpcrdma_areadch);
TRACE_DEFINE_ENUM(rpcrdma_writech);
TRACE_DEFINE_ENUM(rpcrdma_replych);
+TRACE_DEFINE_ENUM(rpcrdma_writereply);
#define xprtrdma_show_chunktype(x) \
__print_symbolic(x, \
@@ -453,7 +454,8 @@
{ rpcrdma_readch, "read list" }, \
{ rpcrdma_areadch, "*read list" }, \
{ rpcrdma_writech, "write list" }, \
- { rpcrdma_replych, "reply chunk" })
+ { rpcrdma_replych, "reply chunk" }, \
+ { rpcrdma_writereply, "write+reply" })
TRACE_EVENT(xprtrdma_marshal,
TP_PROTO(
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 26640e6..3594562 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -202,21 +202,20 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
*/
static int
rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
- unsigned int pos, struct rpcrdma_mr_seg *seg,
- bool omit_xdr_pad)
+ unsigned int pos, unsigned int page_len,
+ struct rpcrdma_mr_seg *seg, bool omit_xdr_pad)
{
unsigned long page_base;
- unsigned int len, n;
struct page **ppages;
+ unsigned int n;
n = 0;
if (pos == 0)
seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n);
- len = xdrbuf->page_len;
ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
page_base = offset_in_page(xdrbuf->page_base);
- while (len) {
+ while (page_len) {
if (unlikely(!*ppages)) {
/* XXX: Certain upper layer operations do
* not provide receive buffer pages.
@@ -227,8 +226,8 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
}
seg->mr_page = *ppages;
seg->mr_offset = (char *)page_base;
- seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len);
- len -= seg->mr_len;
+ seg->mr_len = min_t(u32, PAGE_SIZE - page_base, page_len);
+ page_len -= seg->mr_len;
++ppages;
++seg;
++n;
@@ -352,8 +351,9 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
}
seg = req->rl_segments;
- nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos, seg,
- omit_xdr_pad);
+ nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos,
+ rqst->rq_snd_buf.page_len,
+ seg, omit_xdr_pad);
if (nsegs < 0)
return nsegs;
@@ -401,8 +401,13 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
int nsegs, nchunks;
__be32 *segcount;
- if (restype != rpcrdma_writech)
+ switch (restype) {
+ case rpcrdma_writech:
+ case rpcrdma_writereply:
+ break;
+ default:
goto done;
+ }
/* When encoding a Write chunk, some servers need to see an
* extra segment for non-XDR-aligned Write chunks. The upper
@@ -411,8 +416,9 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
*/
seg = req->rl_segments;
nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf,
- rqst->rq_rcv_buf.head[0].iov_len, seg,
- r_xprt->rx_ia.ri_implicit_roundup);
+ rqst->rq_rcv_buf.head[0].iov_len,
+ rqst->rq_rcv_buf.page_len,
+ seg, r_xprt->rx_ia.ri_implicit_roundup);
if (nsegs < 0)
return nsegs;
@@ -468,14 +474,24 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
struct xdr_stream *xdr = &req->rl_stream;
struct rpcrdma_mr_seg *seg;
struct rpcrdma_mr *mr;
+ unsigned int page_len;
int nsegs, nchunks;
__be32 *segcount;
- if (restype != rpcrdma_replych)
+ switch (restype) {
+ case rpcrdma_replych:
+ page_len = rqst->rq_rcv_buf.page_len;
+ break;
+ case rpcrdma_writereply:
+ page_len = 0;
+ break;
+ default:
return encode_item_not_present(xdr);
+ }
seg = req->rl_segments;
- nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, seg, false);
+ nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0,
+ page_len, seg, false);
if (nsegs < 0)
return nsegs;
@@ -775,16 +791,21 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
*
* o If the expected result is under the inline threshold, all ops
* return as inline.
- * o Large read ops return data as write chunk(s), header as
- * inline.
+ * o Large read ops return data as a write chunk and
+ * small header as inline, large header as a reply chunk.
* o Large non-read ops return as a single reply chunk.
*/
if (rpcrdma_results_inline(r_xprt, rqst))
restype = rpcrdma_noch;
- else if (ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ)
+ else if (ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ) {
restype = rpcrdma_writech;
- else
+ if ((rqst->rq_rcv_buf.head[0].iov_len +
+ rqst->rq_rcv_buf.tail[0].iov_len) >
+ r_xprt->rx_ia.ri_max_inline_read)
+ restype = rpcrdma_writereply;
+ } else {
restype = rpcrdma_replych;
+ }
/*
* Chunks needed for arguments?
@@ -1163,14 +1184,12 @@ static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length)
return -EIO;
/* RDMA_NOMSG sanity checks */
- if (unlikely(writelist))
- return -EIO;
if (unlikely(!replychunk))
return -EIO;
/* Reply chunk buffer already is the reply vector */
- r_xprt->rx_stats.total_rdma_reply += replychunk;
- return replychunk;
+ r_xprt->rx_stats.total_rdma_reply += writelist + replychunk;
+ return writelist + replychunk;
}
static noinline int
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index d29bf38..5e19bb59 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -627,7 +627,8 @@ enum rpcrdma_chunktype {
rpcrdma_readch,
rpcrdma_areadch,
rpcrdma_writech,
- rpcrdma_replych
+ rpcrdma_replych,
+ rpcrdma_writereply,
};
int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
next prev parent reply other threads:[~2018-09-10 20:04 UTC|newest]
Thread overview: 24+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-09-10 15:08 [PATCH v1 00/22] NFS/RDMA client patches for v4.20 Chuck Lever
2018-09-10 15:09 ` [PATCH v1 01/22] xprtrdma: Reset credit grant properly after a disconnect Chuck Lever
2018-09-10 15:09 ` [PATCH v1 02/22] xprtrdma: Create more MRs at a time Chuck Lever
2018-09-10 15:09 ` [PATCH v1 03/22] xprtrdma: Explicitly resetting MRs is no longer necessary Chuck Lever
2018-09-10 15:09 ` [PATCH v1 04/22] xprtrdma: Name MR trace events consistently Chuck Lever
2018-09-10 15:09 ` [PATCH v1 05/22] xprtrdma: Refactor chunk encoding Chuck Lever
2018-09-10 15:09 ` [PATCH v1 06/22] xprtrdma: Refactor chunktype handling Chuck Lever
2018-09-10 15:09 ` Chuck Lever [this message]
2018-09-10 15:09 ` [PATCH v1 08/22] sunrpc: Fix connect metrics Chuck Lever
2018-09-12 18:41 ` Anna Schumaker
2018-09-10 15:09 ` [PATCH v1 09/22] sunrpc: Report connect_time in seconds Chuck Lever
2018-09-10 15:09 ` [PATCH v1 10/22] xprtrdma: Rename rpcrdma_conn_upcall Chuck Lever
2018-09-10 15:09 ` [PATCH v1 11/22] xprtrdma: Conventional variable names in rpcrdma_conn_upcall Chuck Lever
2018-09-10 15:09 ` [PATCH v1 12/22] xprtrdma: Eliminate "connstate" variable from rpcrdma_conn_upcall() Chuck Lever
2018-09-10 15:10 ` [PATCH v1 13/22] xprtrdma: Re-organize the switch() in rpcrdma_conn_upcall Chuck Lever
2018-09-10 15:10 ` [PATCH v1 14/22] xprtrdma: Simplify RPC wake-ups on connect Chuck Lever
2018-09-10 15:10 ` [PATCH v1 15/22] xprtrdma: Rename rpcrdma_qp_async_error_upcall Chuck Lever
2018-09-10 15:10 ` [PATCH v1 16/22] xprtrdma: Remove memory address of "ep" from an error message Chuck Lever
2018-09-10 15:10 ` [PATCH v1 17/22] svcrdma: Don't disable BH's in backchannel Chuck Lever
2018-09-10 15:10 ` [PATCH v1 18/22] xprtrdma: Move rb_flags initialization Chuck Lever
2018-09-10 15:10 ` [PATCH v1 19/22] xprtrdma: Report when there were zero posted Receives Chuck Lever
2018-09-10 15:10 ` [PATCH v1 20/22] xprtrdma: Add documenting comments Chuck Lever
2018-09-10 15:10 ` [PATCH v1 21/22] xprtrdma: Clean up xprt_rdma_disconnect_inject Chuck Lever
2018-09-10 15:10 ` [PATCH v1 22/22] xprtrdma: Squelch a sparse warning Chuck Lever
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20180910150932.10564.1879.stgit@manet.1015granger.net \
--to=chuck.lever@oracle.com \
--cc=linux-nfs@vger.kernel.org \
--cc=linux-rdma@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).