linux-nvme.lists.infradead.org archive mirror
* [PATCH stable 4.14.y] nvme-rdma: don't suppress send completions
@ 2018-03-05 19:32 Sagi Grimberg
  2018-03-07 17:27 ` Greg KH
  0 siblings, 1 reply; 4+ messages in thread
From: Sagi Grimberg @ 2018-03-05 19:32 UTC


The entire completion suppression mechanism is currently broken because the
HCA might retry a send operation (due to a dropped ack) after the nvme
transaction has completed.

In order to handle this, we signal all send completions and introduce a
separate done handler for async events, since they are handled differently
(by definition they do not carry in-capsule data).
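
For context, the suppression heuristic this patch removes signalled only
every Nth send, where N is the largest power of two not above half of
(queue size + 1). A minimal user-space sketch of that arithmetic
(illustrative only; ilog2() and the queue size below are stand-ins, not
kernel code):

#include <stdio.h>

/* user-space stand-in for the kernel's ilog2(): floor(log2(v)), v > 0 */
static int ilog2(unsigned int v)
{
	int r = -1;

	while (v) {
		v >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	unsigned int queue_size = 128;	/* example queue depth */
	unsigned int limit = 1u << ilog2((queue_size + 1) / 2);
	unsigned int sig_count = 0, signaled = 0, i;

	/* only every "limit"-th send carried IB_SEND_SIGNALED */
	for (i = 0; i < 1024; i++)
		if ((++sig_count & (limit - 1)) == 0)
			signaled++;

	printf("limit=%u, signaled %u of 1024 sends\n", limit, signaled);
	return 0;
}

With queue_size = 128 this prints "limit=64, signaled 16 of 1024 sends".
The unsignalled sends are the problem: once the nvme-level response
arrives, the driver may reuse the command buffer while the HCA is still
retrying the send that carried it.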

Cc: <stable@vger.kernel.org> # v4.14+
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/rdma.c | 54 +++++++++++++-----------------------------------
 1 file changed, 14 insertions(+), 40 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 33d4431c2b4b..93a082e0bdd4 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -88,7 +88,6 @@ enum nvme_rdma_queue_flags {
 
 struct nvme_rdma_queue {
 	struct nvme_rdma_qe	*rsp_ring;
-	atomic_t		sig_count;
 	int			queue_size;
 	size_t			cmnd_capsule_len;
 	struct nvme_rdma_ctrl	*ctrl;
@@ -521,7 +520,6 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
 		queue->cmnd_capsule_len = sizeof(struct nvme_command);
 
 	queue->queue_size = queue_size;
-	atomic_set(&queue->sig_count, 0);
 
 	queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
 			RDMA_PS_TCP, IB_QPT_RC);
@@ -1232,21 +1230,9 @@ static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
 		nvme_end_request(rq, req->status, req->result);
 }
 
-/*
- * We want to signal completion at least every queue depth/2.  This returns the
- * largest power of two that is not above half of (queue size + 1) to optimize
- * (avoid divisions).
- */
-static inline bool nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue)
-{
-	int limit = 1 << ilog2((queue->queue_size + 1) / 2);
-
-	return (atomic_inc_return(&queue->sig_count) & (limit - 1)) == 0;
-}
-
 static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
 		struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
-		struct ib_send_wr *first, bool flush)
+		struct ib_send_wr *first)
 {
 	struct ib_send_wr wr, *bad_wr;
 	int ret;
@@ -1255,31 +1241,12 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
 	sge->length = sizeof(struct nvme_command),
 	sge->lkey   = queue->device->pd->local_dma_lkey;
 
-	qe->cqe.done = nvme_rdma_send_done;
-
 	wr.next       = NULL;
 	wr.wr_cqe     = &qe->cqe;
 	wr.sg_list    = sge;
 	wr.num_sge    = num_sge;
 	wr.opcode     = IB_WR_SEND;
-	wr.send_flags = 0;
-
-	/*
-	 * Unsignalled send completions are another giant desaster in the
-	 * IB Verbs spec:  If we don't regularly post signalled sends
-	 * the send queue will fill up and only a QP reset will rescue us.
-	 * Would have been way to obvious to handle this in hardware or
-	 * at least the RDMA stack..
-	 *
-	 * Always signal the flushes. The magic request used for the flush
-	 * sequencer is not allocated in our driver's tagset and it's
-	 * triggered to be freed by blk_cleanup_queue(). So we need to
-	 * always mark it as signaled to ensure that the "wr_cqe", which is
-	 * embedded in request's payload, is not freed when __ib_process_cq()
-	 * calls wr_cqe->done().
-	 */
-	if (nvme_rdma_queue_sig_limit(queue) || flush)
-		wr.send_flags |= IB_SEND_SIGNALED;
+	wr.send_flags = IB_SEND_SIGNALED;
 
 	if (first)
 		first->next = &wr;
@@ -1329,6 +1296,12 @@ static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue)
 	return queue->ctrl->tag_set.tags[queue_idx - 1];
 }
 
+static void nvme_rdma_async_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+	if (unlikely(wc->status != IB_WC_SUCCESS))
+		nvme_rdma_wr_error(cq, wc, "ASYNC");
+}
+
 static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
 {
 	struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg);
@@ -1350,10 +1323,12 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
 	cmd->common.flags |= NVME_CMD_SGL_METABUF;
 	nvme_rdma_set_sg_null(cmd);
 
+	sqe->cqe.done = nvme_rdma_async_done;
+
 	ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd),
 			DMA_TO_DEVICE);
 
-	ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL, false);
+	ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL);
 	WARN_ON_ONCE(ret);
 }
 
@@ -1639,7 +1614,6 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
 	struct nvme_rdma_qe *sqe = &req->sqe;
 	struct nvme_command *c = sqe->data;
-	bool flush = false;
 	struct ib_device *dev;
 	blk_status_t ret;
 	int err;
@@ -1668,13 +1642,13 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
 		goto err;
 	}
 
+	sqe->cqe.done = nvme_rdma_send_done;
+
 	ib_dma_sync_single_for_device(dev, sqe->dma,
 			sizeof(struct nvme_command), DMA_TO_DEVICE);
 
-	if (req_op(rq) == REQ_OP_FLUSH)
-		flush = true;
 	err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
-			req->mr->need_inval ? &req->reg_wr.wr : NULL, flush);
+			req->mr->need_inval ? &req->reg_wr.wr : NULL);
 	if (unlikely(err)) {
 		nvme_rdma_unmap_data(queue, rq);
 		goto err;
-- 
2.14.1
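
For readers less familiar with the verbs API, here is a minimal user-space
sketch of the approach the patch switches to, using libibverbs rather than
the kernel ib_post_send() (the qp, mr and buffer setup are assumed to exist
elsewhere and are omitted): every send work request is posted signaled, so
a completion is always generated and the buffer is only reclaimed after its
CQE has been reaped.

#include <infiniband/verbs.h>
#include <stdint.h>
#include <string.h>

/* post a SEND that always generates a completion, as the patch now does */
static int post_signaled_send(struct ibv_qp *qp, struct ibv_mr *mr,
			      void *buf, uint32_t len, uint64_t wr_id)
{
	struct ibv_sge sge = {
		.addr   = (uintptr_t)buf,
		.length = len,
		.lkey   = mr->lkey,
	};
	struct ibv_send_wr wr, *bad_wr;

	memset(&wr, 0, sizeof(wr));
	wr.wr_id      = wr_id;
	wr.sg_list    = &sge;
	wr.num_sge    = 1;
	wr.opcode     = IBV_WR_SEND;
	wr.send_flags = IBV_SEND_SIGNALED;	/* never suppress the completion */

	return ibv_post_send(qp, &wr, &bad_wr);
}

In the kernel driver the same effect comes from setting
wr.send_flags = IB_SEND_SIGNALED unconditionally in nvme_rdma_post_send().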


* [PATCH stable 4.14.y] nvme-rdma: don't suppress send completions
  2018-03-05 19:32 [PATCH stable 4.14.y] nvme-rdma: don't suppress send completions Sagi Grimberg
@ 2018-03-07 17:27 ` Greg KH
  2018-03-07 17:41   ` Keith Busch
  0 siblings, 1 reply; 4+ messages in thread
From: Greg KH @ 2018-03-07 17:27 UTC


On Mon, Mar 05, 2018 at 09:32:15PM +0200, Sagi Grimberg wrote:
> The entire completion suppression mechanism is currently broken because the
> HCA might retry a send operation (due to a dropped ack) after the nvme
> transaction has completed.
> 
> In order to handle this, we signal all send completions and introduce a
> separate done handler for async events, since they are handled differently
> (by definition they do not carry in-capsule data).
> 
> Cc: <stable@vger.kernel.org> # v4.14+
> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  drivers/nvme/host/rdma.c | 54 +++++++++++++-----------------------------------
>  1 file changed, 14 insertions(+), 40 deletions(-)

What is the git commit id for this patch in Linus's tree?

thanks,

greg k-h


* [PATCH stable 4.14.y] nvme-rdma: don't suppress send completions
  2018-03-07 17:27 ` Greg KH
@ 2018-03-07 17:41   ` Keith Busch
  2018-03-07 19:03     ` Greg KH
  0 siblings, 1 reply; 4+ messages in thread
From: Keith Busch @ 2018-03-07 17:41 UTC


On Wed, Mar 07, 2018 at 09:27:21AM -0800, Greg KH wrote:
> On Mon, Mar 05, 2018 at 09:32:15PM +0200, Sagi Grimberg wrote:
> > The entire completion suppression mechanism is currently broken because the
> > HCA might retry a send operation (due to a dropped ack) after the nvme
> > transaction has completed.
> > 
> > In order to handle this, we signal all send completions and introduce a
> > separate done handler for async events, since they are handled differently
> > (by definition they do not carry in-capsule data).
> > 
> > Cc: <stable@vger.kernel.org> # v4.14+
> > Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> > Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > ---
> >  drivers/nvme/host/rdma.c | 54 +++++++++++++-----------------------------------
> >  1 file changed, 14 insertions(+), 40 deletions(-)
> 
> What is the git commit id for this patch in Linus's tree?

Commit b4b591c87f2b0f4ebaf3a68d4f13873b241aa584 upstream


* [PATCH stable 4.14.y] nvme-rdma: don't suppress send completions
  2018-03-07 17:41   ` Keith Busch
@ 2018-03-07 19:03     ` Greg KH
  0 siblings, 0 replies; 4+ messages in thread
From: Greg KH @ 2018-03-07 19:03 UTC


On Wed, Mar 07, 2018 at 10:41:22AM -0700, Keith Busch wrote:
> On Wed, Mar 07, 2018 at 09:27:21AM -0800, Greg KH wrote:
> > On Mon, Mar 05, 2018 at 09:32:15PM +0200, Sagi Grimberg wrote:
> > > The entire completion suppression mechanism is currently broken because the
> > > HCA might retry a send operation (due to a dropped ack) after the nvme
> > > transaction has completed.
> > > 
> > > In order to handle this, we signal all send completions and introduce a
> > > separate done handler for async events, since they are handled differently
> > > (by definition they do not carry in-capsule data).
> > > 
> > > Cc: <stable@vger.kernel.org> # v4.14+
> > > Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> > > Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
> > > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > > ---
> > >  drivers/nvme/host/rdma.c | 54 +++++++++++++-----------------------------------
> > >  1 file changed, 14 insertions(+), 40 deletions(-)
> > 
> > What is the git commit id for this patch in Linus's tree?
> 
> Commit b4b591c87f2b0f4ebaf3a68d4f13873b241aa584 upstream

thanks, now queued up.

greg k-h

