From: Chuck Lever <chuck.lever@oracle.com>
To: linux-rdma@vger.kernel.org, linux-nfs@vger.kernel.org
Subject: [PATCH RFC 03/12] xprtrdma: Fix occasional transport deadlock
Date: Tue, 28 May 2019 14:21:06 -0400 [thread overview]
Message-ID: <20190528182106.19012.43122.stgit@manet.1015granger.net> (raw)
In-Reply-To: <20190528181018.19012.61210.stgit@manet.1015granger.net>
Under high I/O workloads, I've noticed that an RPC/RDMA transport
occasionally deadlocks (IOPS goes to zero, and doesn't recover).
Diagnosis shows that the sendctx queue is empty, but when sendctxs
are returned to the queue, the xprt_write_space wake-up never
occurs. The wake-up logic in rpcrdma_sendctx_put_locked is racy.
I noticed that both EMPTY_SCQ and XPRT_WRITE_SPACE are implemented
via an atomic bit. Just one of those is sufficient. Removing
EMPTY_SCQ in favor of the generic bit mechanism makes the deadlock
un-reproducible.
Without EMPTY_SCQ, rpcrdma_buffer::rb_flags is no longer used and
is therefore removed.
Unfortunately this patch does not apply cleanly to stable. If
needed, someone will have to port it and test it.
Fixes: 2fad659209d5 ("xprtrdma: Wait on empty sendctx queue")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
include/trace/events/rpcrdma.h | 27 +++++++++++++++++++++++++++
net/sunrpc/xprtrdma/frwr_ops.c | 6 +++++-
net/sunrpc/xprtrdma/rpc_rdma.c | 26 ++++++++++++--------------
net/sunrpc/xprtrdma/verbs.c | 11 +++--------
net/sunrpc/xprtrdma/xprt_rdma.h | 6 ------
5 files changed, 47 insertions(+), 29 deletions(-)
diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index df9851cb..a4ab3a2 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -538,6 +538,33 @@
)
);
+TRACE_EVENT(xprtrdma_prepsend_failed,
+ TP_PROTO(const struct rpc_rqst *rqst,
+ int ret
+ ),
+
+ TP_ARGS(rqst, ret),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, task_id)
+ __field(unsigned int, client_id)
+ __field(u32, xid)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->task_id = rqst->rq_task->tk_pid;
+ __entry->client_id = rqst->rq_task->tk_client->cl_clid;
+ __entry->xid = be32_to_cpu(rqst->rq_xid);
+ __entry->ret = ret;
+ ),
+
+ TP_printk("task:%u@%u xid=0x%08x: ret=%d",
+ __entry->task_id, __entry->client_id, __entry->xid,
+ __entry->ret
+ )
+);
+
TRACE_EVENT(xprtrdma_post_send,
TP_PROTO(
const struct rpcrdma_req *req,
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 794ba4c..ac47314 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -391,7 +391,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
rpcrdma_mr_recycle(mr);
mr = rpcrdma_mr_get(r_xprt);
if (!mr)
- return ERR_PTR(-EAGAIN);
+ goto out_getmr_err;
} while (mr->frwr.fr_state != FRWR_IS_INVALID);
frwr = &mr->frwr;
frwr->fr_state = FRWR_IS_VALID;
@@ -448,6 +448,10 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
*out = mr;
return seg;
+out_getmr_err:
+ xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
+ return ERR_PTR(-EAGAIN);
+
out_dmamap_err:
mr->mr_dir = DMA_NONE;
trace_xprtrdma_frwr_sgerr(mr, i);
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 608b9ac..d1f44f4 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -699,22 +699,28 @@ static bool rpcrdma_prepare_msg_sges(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_req *req, u32 hdrlen,
struct xdr_buf *xdr, enum rpcrdma_chunktype rtype)
{
+ int ret;
+
+ ret = -EAGAIN;
req->rl_sendctx = rpcrdma_sendctx_get_locked(r_xprt);
if (!req->rl_sendctx)
- return -EAGAIN;
+ goto err;
req->rl_sendctx->sc_wr.num_sge = 0;
req->rl_sendctx->sc_unmap_count = 0;
req->rl_sendctx->sc_req = req;
__clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags);
+ ret = -EIO;
if (!rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen))
- return -EIO;
-
+ goto err;
if (rtype != rpcrdma_areadch)
if (!rpcrdma_prepare_msg_sges(r_xprt, req, xdr, rtype))
- return -EIO;
-
+ goto err;
return 0;
+
+err:
+ trace_xprtrdma_prepsend_failed(&req->rl_slot, ret);
+ return ret;
}
/**
@@ -877,15 +883,7 @@ static bool rpcrdma_prepare_msg_sges(struct rpcrdma_xprt *r_xprt,
out_err:
trace_xprtrdma_marshal_failed(rqst, ret);
- switch (ret) {
- case -EAGAIN:
- xprt_wait_for_buffer_space(rqst->rq_xprt);
- break;
- case -ENOBUFS:
- break;
- default:
- r_xprt->rx_stats.failed_marshal_count++;
- }
+ r_xprt->rx_stats.failed_marshal_count++;
return ret;
}
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index a803e94..78be985 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -902,7 +902,7 @@ struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt)
* completions recently. This is a sign the Send Queue is
* backing up. Cause the caller to pause and try again.
*/
- set_bit(RPCRDMA_BUF_F_EMPTY_SCQ, &buf->rb_flags);
+ xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
r_xprt->rx_stats.empty_sendctx_q++;
return NULL;
}
@@ -937,10 +937,7 @@ struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt)
/* Paired with READ_ONCE */
smp_store_release(&buf->rb_sc_tail, next_tail);
- if (test_and_clear_bit(RPCRDMA_BUF_F_EMPTY_SCQ, &buf->rb_flags)) {
- smp_mb__after_atomic();
- xprt_write_space(&sc->sc_xprt->rx_xprt);
- }
+ xprt_write_space(&sc->sc_xprt->rx_xprt);
}
static void
@@ -978,8 +975,6 @@ struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt)
r_xprt->rx_stats.mrs_allocated += count;
spin_unlock(&buf->rb_mrlock);
trace_xprtrdma_createmrs(r_xprt, count);
-
- xprt_write_space(&r_xprt->rx_xprt);
}
static void
@@ -991,6 +986,7 @@ struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt)
rx_buf);
rpcrdma_mrs_create(r_xprt);
+ xprt_write_space(&r_xprt->rx_xprt);
}
/**
@@ -1090,7 +1086,6 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
int i, rc;
- buf->rb_flags = 0;
buf->rb_max_requests = r_xprt->rx_ep.rep_max_requests;
buf->rb_bc_srv_max_requests = 0;
spin_lock_init(&buf->rb_mrlock);
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index d1e0749..2c6c5d8 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -391,7 +391,6 @@ struct rpcrdma_buffer {
struct list_head rb_recv_bufs;
struct list_head rb_allreqs;
- unsigned long rb_flags;
u32 rb_max_requests;
u32 rb_credits; /* most recent credit grant */
@@ -402,11 +401,6 @@ struct rpcrdma_buffer {
struct delayed_work rb_refresh_worker;
};
-/* rb_flags */
-enum {
- RPCRDMA_BUF_F_EMPTY_SCQ = 0,
-};
-
/*
* Statistics for RPCRDMA
*/
next prev parent reply other threads:[~2019-05-28 18:21 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-05-28 18:20 [PATCH RFC 00/12] for-5.3 NFS/RDMA patches for review Chuck Lever
2019-05-28 18:20 ` [PATCH RFC 01/12] xprtrdma: Fix use-after-free in rpcrdma_post_recvs Chuck Lever
2019-05-28 18:21 ` [PATCH RFC 02/12] xprtrdma: Replace use of xdr_stream_pos in rpcrdma_marshal_req Chuck Lever
2019-05-28 18:21 ` Chuck Lever [this message]
2019-05-28 18:21 ` [PATCH RFC 04/12] xprtrdma: Remove the RPCRDMA_REQ_F_PENDING flag Chuck Lever
2019-05-28 18:21 ` [PATCH RFC 05/12] xprtrdma: Remove fr_state Chuck Lever
2019-05-30 14:05 ` Anna Schumaker
2019-05-31 13:36 ` Chuck Lever
2019-05-28 18:21 ` [PATCH RFC 06/12] xprtrdma: Add mechanism to place MRs back on the free list Chuck Lever
2019-05-28 18:21 ` [PATCH RFC 07/12] xprtrdma: Reduce context switching due to Local Invalidation Chuck Lever
2019-05-28 18:21 ` [PATCH RFC 08/12] xprtrdma: Wake RPCs directly in rpcrdma_wc_send path Chuck Lever
2019-05-28 18:21 ` [PATCH RFC 09/12] xprtrdma: Simplify rpcrdma_rep_create Chuck Lever
2019-05-28 18:21 ` [PATCH RFC 10/12] xprtrdma: Streamline rpcrdma_post_recvs Chuck Lever
2019-05-28 18:21 ` [PATCH RFC 11/12] xprtrdma: Refactor chunk encoding Chuck Lever
2019-05-28 18:21 ` [PATCH RFC 12/12] xprtrdma: Remove rpcrdma_req::rl_buffer Chuck Lever
2019-05-29 6:40 ` [PATCH RFC 00/12] for-5.3 NFS/RDMA patches for review Christoph Hellwig
2019-05-29 14:35 ` Chuck Lever
2019-05-31 14:32 ` Dennis Dalessandro
2019-05-31 14:34 ` Chuck Lever
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20190528182106.19012.43122.stgit@manet.1015granger.net \
--to=chuck.lever@oracle.com \
--cc=linux-nfs@vger.kernel.org \
--cc=linux-rdma@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).