* nvme completion path optimizations and fixes
@ 2015-09-21 18:40 Christoph Hellwig
  2015-09-21 18:40 ` [PATCH 1/7] blk-mq: fix racy updates of rq->errors Christoph Hellwig
                   ` (6 more replies)
  0 siblings, 7 replies; 13+ messages in thread
From: Christoph Hellwig @ 2015-09-21 18:40 UTC


This series optimizes the NVMe completion path by taking full advantage
of the block layer's protection against multiple completions.  For that
we'll need to switch the remaining internal NVMe admin commands over to
use block layer queuing.

The first patch fixes a potential race in updating the error value,
which is required to make this conversion bulletproof.
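
The "multiple completion protection" mentioned above is the atomic
completed bit that blk-mq keeps per request: whichever path marks a
request complete first wins, and any later completion attempt becomes a
no-op.  As a rough sketch (simplified from the helper in block/blk.h;
the in-tree version may differ in detail):

    /* returns nonzero if someone else already completed the request */
    static inline int blk_mark_rq_complete(struct request *rq)
    {
            return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
    }

Patch 1 builds on this so that rq->errors is only ever written by the
winner of that race.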


* [PATCH 1/7] blk-mq: fix racy updates of rq->errors
  2015-09-21 18:40 nvme completion path optimizations and fixes Christoph Hellwig
@ 2015-09-21 18:40 ` Christoph Hellwig
  2015-09-27  7:35   ` Sagi Grimberg
  2015-09-21 18:40 ` [PATCH 2/7] nvme: switch AEN processing to use blk_execute_rq_nowait Christoph Hellwig
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 13+ messages in thread
From: Christoph Hellwig @ 2015-09-21 18:40 UTC


blk_mq_complete_request may be a no-op if the request has already
been completed by other means (e.g. a timeout or cancellation), but
currently drivers have to set rq->errors before calling
blk_mq_complete_request, which might leave us with the wrong error value.

Add an error parameter to blk_mq_complete_request so that we can
defer setting rq->errors until we know we have won the race to complete
the request.
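
To illustrate the convention change in a driver's completion path (a
sketch only; the hunks below are authoritative):

    /* before: the error is stored even if we lose the completion race */
    rq->errors = -EIO;
    blk_mq_complete_request(rq);

    /* after: rq->errors is only written once we know we won the race */
    blk_mq_complete_request(rq, -EIO);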

Signed-off-by: Christoph Hellwig <hch at lst.de>
---
 block/blk-mq.c               | 12 ++++++------
 drivers/block/loop.c         | 11 +++++------
 drivers/block/null_blk.c     |  2 +-
 drivers/block/nvme-core.c    | 16 +++++++---------
 drivers/block/virtio_blk.c   |  2 +-
 drivers/block/xen-blkfront.c | 19 ++++++++++---------
 drivers/scsi/scsi_lib.c      |  2 +-
 include/linux/blk-mq.h       |  2 +-
 8 files changed, 32 insertions(+), 34 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index f2d67b4..ca827a3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -393,14 +393,16 @@ void __blk_mq_complete_request(struct request *rq)
  *	Ends all I/O on a request. It does not handle partial completions.
  *	The actual completion happens out-of-order, through a IPI handler.
  **/
-void blk_mq_complete_request(struct request *rq)
+void blk_mq_complete_request(struct request *rq, int error)
 {
 	struct request_queue *q = rq->q;
 
 	if (unlikely(blk_should_fake_timeout(q)))
 		return;
-	if (!blk_mark_rq_complete(rq))
+	if (!blk_mark_rq_complete(rq)) {
+		rq->errors = error;
 		__blk_mq_complete_request(rq);
+	}
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
 
@@ -616,10 +618,8 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
 		 * If a request wasn't started before the queue was
 		 * marked dying, kill it here or it'll go unnoticed.
 		 */
-		if (unlikely(blk_queue_dying(rq->q))) {
-			rq->errors = -EIO;
-			blk_mq_complete_request(rq);
-		}
+		if (unlikely(blk_queue_dying(rq->q)))
+			blk_mq_complete_request(rq, -EIO);
 		return;
 	}
 	if (rq->cmd_flags & REQ_NO_TIMEOUT)
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f9889b6..674f800 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1486,17 +1486,16 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
 {
 	const bool write = cmd->rq->cmd_flags & REQ_WRITE;
 	struct loop_device *lo = cmd->rq->q->queuedata;
-	int ret = -EIO;
+	int ret = 0;
 
-	if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY))
+	if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) {
+		ret = -EIO;
 		goto failed;
+	}
 
 	ret = do_req_filebacked(lo, cmd->rq);
-
  failed:
-	if (ret)
-		cmd->rq->errors = -EIO;
-	blk_mq_complete_request(cmd->rq);
+	blk_mq_complete_request(cmd->rq, ret ? -EIO : 0);
 }
 
 static void loop_queue_write_work(struct work_struct *work)
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index a295b98..1c9e4fe 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -289,7 +289,7 @@ static inline void null_handle_cmd(struct nullb_cmd *cmd)
 	case NULL_IRQ_SOFTIRQ:
 		switch (queue_mode)  {
 		case NULL_Q_MQ:
-			blk_mq_complete_request(cmd->rq);
+			blk_mq_complete_request(cmd->rq, cmd->rq->errors);
 			break;
 		case NULL_Q_RQ:
 			blk_complete_request(cmd->rq);
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index b97fc3f..ebc3138 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -618,16 +618,15 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 			spin_unlock_irqrestore(req->q->queue_lock, flags);
 			return;
 		}
+
 		if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
 			if (cmd_rq->ctx == CMD_CTX_CANCELLED)
-				req->errors = -EINTR;
-			else
-				req->errors = status;
+				status = -EINTR;
 		} else {
-			req->errors = nvme_error_status(status);
+			status = nvme_error_status(status);
 		}
-	} else
-		req->errors = 0;
+	}
+
 	if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
 		u32 result = le32_to_cpup(&cqe->result);
 		req->special = (void *)(uintptr_t)result;
@@ -650,7 +649,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 	}
 	nvme_free_iod(nvmeq->dev, iod);
 
-	blk_mq_complete_request(req);
+	blk_mq_complete_request(req, status);
 }
 
 /* length is in bytes.  gfp flags indicates whether we may sleep. */
@@ -863,8 +862,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (ns && ns->ms && !blk_integrity_rq(req)) {
 		if (!(ns->pi_type && ns->ms == 8) &&
 					req->cmd_type != REQ_TYPE_DRV_PRIV) {
-			req->errors = -EFAULT;
-			blk_mq_complete_request(req);
+			blk_mq_complete_request(req, -EFAULT);
 			return BLK_MQ_RQ_QUEUE_OK;
 		}
 	}
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index e93899c..6ca3549 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -144,7 +144,7 @@ static void virtblk_done(struct virtqueue *vq)
 	do {
 		virtqueue_disable_cb(vq);
 		while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) {
-			blk_mq_complete_request(vbr->req);
+			blk_mq_complete_request(vbr->req, vbr->req->errors);
 			req_done = true;
 		}
 		if (unlikely(virtqueue_is_broken(vq)))
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 0823a96..6111708 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1142,6 +1142,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 	RING_IDX i, rp;
 	unsigned long flags;
 	struct blkfront_info *info = (struct blkfront_info *)dev_id;
+	int error;
 
 	spin_lock_irqsave(&info->io_lock, flags);
 
@@ -1182,37 +1183,37 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 			continue;
 		}
 
-		req->errors = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
+		error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
 		switch (bret->operation) {
 		case BLKIF_OP_DISCARD:
 			if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
 				struct request_queue *rq = info->rq;
 				printk(KERN_WARNING "blkfront: %s: %s op failed\n",
 					   info->gd->disk_name, op_name(bret->operation));
-				req->errors = -EOPNOTSUPP;
+				error = -EOPNOTSUPP;
 				info->feature_discard = 0;
 				info->feature_secdiscard = 0;
 				queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
 				queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
 			}
-			blk_mq_complete_request(req);
+			blk_mq_complete_request(req, error);
 			break;
 		case BLKIF_OP_FLUSH_DISKCACHE:
 		case BLKIF_OP_WRITE_BARRIER:
 			if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
 				printk(KERN_WARNING "blkfront: %s: %s op failed\n",
 				       info->gd->disk_name, op_name(bret->operation));
-				req->errors = -EOPNOTSUPP;
+				error = -EOPNOTSUPP;
 			}
 			if (unlikely(bret->status == BLKIF_RSP_ERROR &&
 				     info->shadow[id].req.u.rw.nr_segments == 0)) {
 				printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
 				       info->gd->disk_name, op_name(bret->operation));
-				req->errors = -EOPNOTSUPP;
+				error = -EOPNOTSUPP;
 			}
-			if (unlikely(req->errors)) {
-				if (req->errors == -EOPNOTSUPP)
-					req->errors = 0;
+			if (unlikely(error)) {
+				if (error == -EOPNOTSUPP)
+					error = 0;
 				info->feature_flush = 0;
 				xlvbd_flush(info);
 			}
@@ -1223,7 +1224,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 				dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
 					"request: %x\n", bret->status);
 
-			blk_mq_complete_request(req);
+			blk_mq_complete_request(req, error);
 			break;
 		default:
 			BUG();
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index cbfc599..126a48c 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1957,7 +1957,7 @@ static int scsi_mq_prep_fn(struct request *req)
 static void scsi_mq_done(struct scsi_cmnd *cmd)
 {
 	trace_scsi_dispatch_cmd_done(cmd);
-	blk_mq_complete_request(cmd->request);
+	blk_mq_complete_request(cmd->request, cmd->request->errors);
 }
 
 static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 37d1602..4b4577a 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -215,7 +215,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head);
 void blk_mq_cancel_requeue_work(struct request_queue *q);
 void blk_mq_kick_requeue_list(struct request_queue *q);
 void blk_mq_abort_requeue_list(struct request_queue *q);
-void blk_mq_complete_request(struct request *rq);
+void blk_mq_complete_request(struct request *rq, int error);
 
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
-- 
1.9.1


* [PATCH 2/7] nvme: switch AEN processing to use blk_execute_rq_nowait
  2015-09-21 18:40 nvme completion path optimizations and fixes Christoph Hellwig
  2015-09-21 18:40 ` [PATCH 1/7] blk-mq: fix racy updates of rq->errors Christoph Hellwig
@ 2015-09-21 18:40 ` Christoph Hellwig
  2015-09-23 23:16   ` Keith Busch
  2015-09-21 18:40 ` [PATCH 3/7] nvme: switch delete SQ/CQ to blk_execute_rq_nowait Christoph Hellwig
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 13+ messages in thread
From: Christoph Hellwig @ 2015-09-21 18:40 UTC


To enable this, add a new nvme_submit_async_cmd helper that is similar
to nvme_submit_sync_cmd, but allows finer control of the request flags
and executes the command asynchronously.
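
The asynchronous execution builds on the rq_end_io_fn callback that
blk_execute_rq_nowait invokes once the request completes; the callback
owns the request and is responsible for freeing it.  A minimal sketch
of such a callback (my_endio is a placeholder name; the real aen_endio
below additionally decodes the AEN result from req->special):

    static void my_endio(struct request *req, int error)
    {
            u16 status = req->errors;  /* NVMe status stored at completion */

            /* ... act on status ... */
            blk_mq_free_request(req);
    }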

Signed-off-by: Christoph Hellwig <hch at lst.de>
---
 drivers/block/nvme-core.c | 96 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 62 insertions(+), 34 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index ebc3138..70bfb0b 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -305,26 +305,6 @@ static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn)
 	return ctx;
 }
 
-static void async_req_completion(struct nvme_queue *nvmeq, void *ctx,
-						struct nvme_completion *cqe)
-{
-	u32 result = le32_to_cpup(&cqe->result);
-	u16 status = le16_to_cpup(&cqe->status) >> 1;
-
-	if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ)
-		++nvmeq->dev->event_limit;
-	if (status != NVME_SC_SUCCESS)
-		return;
-
-	switch (result & 0xff07) {
-	case NVME_AER_NOTICE_NS_CHANGED:
-		dev_info(nvmeq->q_dmadev, "rescanning\n");
-		schedule_work(&nvmeq->dev->scan_work);
-	default:
-		dev_warn(nvmeq->q_dmadev, "async event result %08x\n", result);
-	}
-}
-
 static void abort_completion(struct nvme_queue *nvmeq, void *ctx,
 						struct nvme_completion *cqe)
 {
@@ -1057,28 +1037,70 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 	return __nvme_submit_sync_cmd(q, cmd, buffer, NULL, bufflen, NULL, 0);
 }
 
-static int nvme_submit_async_admin_req(struct nvme_dev *dev)
+static int nvme_submit_async_cmd(struct request_queue *q,
+		struct nvme_command *cmd, unsigned long timeout, gfp_t gfp_mask,
+		bool reserved, void *end_io_data, rq_end_io_fn *done)
 {
-	struct nvme_queue *nvmeq = dev->queues[0];
-	struct nvme_command c;
-	struct nvme_cmd_info *cmd_info;
+	bool write = cmd->common.opcode & 1;
 	struct request *req;
 
-	req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, true);
+	req = blk_mq_alloc_request(q, write, gfp_mask, reserved);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
-	req->cmd_flags |= REQ_NO_TIMEOUT;
-	cmd_info = blk_mq_rq_to_pdu(req);
-	nvme_set_info(cmd_info, NULL, async_req_completion);
+	req->cmd_type = REQ_TYPE_DRV_PRIV;
+	req->cmd_flags |= REQ_FAILFAST_DRIVER;
+	req->__data_len = 0;
+	req->__sector = (sector_t) -1;
+	req->bio = req->biotail = NULL;
+
+	if (timeout == ULONG_MAX)
+		req->cmd_flags |= REQ_NO_TIMEOUT;
+	else
+		req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
+
+	req->cmd = (unsigned char *)cmd;
+	req->cmd_len = sizeof(struct nvme_command);
+	req->special = (void *)0;
+	req->end_io_data = end_io_data;
+
+	blk_execute_rq_nowait(req->q, NULL, req, 0, done);
+	return 0;
+}
+
+static void aen_endio(struct request *req, int error)
+{
+	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
+	struct nvme_queue *nvmeq = cmd->nvmeq;
+	u32 result = (u32)(uintptr_t)req->special;
+	u16 status = req->errors;
+
+	if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ)
+		++nvmeq->dev->event_limit;
+	if (status != NVME_SC_SUCCESS)
+		return;
+
+	switch (result & 0xff07) {
+	case NVME_AER_NOTICE_NS_CHANGED:
+		dev_info(nvmeq->q_dmadev, "rescanning\n");
+		schedule_work(&nvmeq->dev->scan_work);
+	default:
+		dev_warn(nvmeq->q_dmadev, "async event result %08x\n", result);
+	}
+
+	blk_mq_free_request(req);
+}
+
+static int nvme_submit_aen(struct nvme_dev *dev)
+{
+	struct nvme_command c;
 
 	memset(&c, 0, sizeof(c));
 	c.common.opcode = nvme_admin_async_event;
-	c.common.command_id = req->tag;
 
-	blk_mq_free_request(req);
-	__nvme_submit_cmd(nvmeq, &c);
-	return 0;
+	return nvme_submit_async_cmd(dev->admin_q, &c, ULONG_MAX, GFP_ATOMIC,
+			true, NULL, aen_endio);
 }
 
 static int nvme_submit_admin_async_cmd(struct nvme_dev *dev,
@@ -1699,6 +1721,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
 			dev->admin_q = NULL;
 			return -ENODEV;
 		}
+		dev->admin_q->queuedata = dev;
 	} else
 		blk_mq_unfreeze_queue(dev->admin_q);
 
@@ -2091,9 +2114,14 @@ static int nvme_kthread(void *data)
 				nvme_process_cq(nvmeq);
 
 				while ((i == 0) && (dev->event_limit > 0)) {
-					if (nvme_submit_async_admin_req(dev))
-						break;
 					dev->event_limit--;
+					spin_unlock_irq(&nvmeq->q_lock);
+					if (nvme_submit_aen(dev)) {
+						dev->event_limit++;
+						spin_lock_irq(&nvmeq->q_lock);
+						break;
+					}
+					spin_lock_irq(&nvmeq->q_lock);
 				}
 				spin_unlock_irq(&nvmeq->q_lock);
 			}
-- 
1.9.1


* [PATCH 3/7] nvme: switch delete SQ/CQ to blk_execute_rq_nowait
  2015-09-21 18:40 nvme completion path optimizations and fixes Christoph Hellwig
  2015-09-21 18:40 ` [PATCH 1/7] blk-mq: fix racy updates of rq->errors Christoph Hellwig
  2015-09-21 18:40 ` [PATCH 2/7] nvme: switch AEN processing to use blk_execute_rq_nowait Christoph Hellwig
@ 2015-09-21 18:40 ` Christoph Hellwig
  2015-09-21 18:40 ` [PATCH 4/7] nvme: switch abort " Christoph Hellwig
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 13+ messages in thread
From: Christoph Hellwig @ 2015-09-21 18:40 UTC


Signed-off-by: Christoph Hellwig <hch at lst.de>
---
 drivers/block/nvme-core.c | 42 ++++++++----------------------------------
 1 file changed, 8 insertions(+), 34 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 70bfb0b..d15c886 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -91,8 +91,6 @@ static int nvme_process_cq(struct nvme_queue *nvmeq);
 struct async_cmd_info {
 	struct kthread_work work;
 	struct kthread_worker *worker;
-	struct request *req;
-	u32 result;
 	int status;
 	void *ctx;
 };
@@ -319,16 +317,6 @@ static void abort_completion(struct nvme_queue *nvmeq, void *ctx,
 	++nvmeq->dev->abort_limit;
 }
 
-static void async_completion(struct nvme_queue *nvmeq, void *ctx,
-						struct nvme_completion *cqe)
-{
-	struct async_cmd_info *cmdinfo = ctx;
-	cmdinfo->result = le32_to_cpup(&cqe->result);
-	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
-	queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
-	blk_mq_free_request(cmdinfo->req);
-}
-
 static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq,
 				  unsigned int tag)
 {
@@ -1103,28 +1091,13 @@ static int nvme_submit_aen(struct nvme_dev *dev)
 			true, aen_endio, NULL);
 }
 
-static int nvme_submit_admin_async_cmd(struct nvme_dev *dev,
-			struct nvme_command *cmd,
-			struct async_cmd_info *cmdinfo, unsigned timeout)
+static void async_cmd_info_endio(struct request *req, int error)
 {
-	struct nvme_queue *nvmeq = dev->queues[0];
-	struct request *req;
-	struct nvme_cmd_info *cmd_rq;
+	struct async_cmd_info *cmdinfo = req->end_io_data;
 
-	req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
-
-	req->timeout = timeout;
-	cmd_rq = blk_mq_rq_to_pdu(req);
-	cmdinfo->req = req;
-	nvme_set_info(cmd_rq, cmdinfo, async_completion);
-	cmdinfo->status = -EINTR;
-
-	cmd->common.command_id = req->tag;
-
-	nvme_submit_cmd(nvmeq, cmd);
-	return 0;
+	cmdinfo->status = req->errors;
+	queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
+	blk_mq_free_request(req);
 }
 
 static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
@@ -2680,8 +2653,9 @@ static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode,
 	c.delete_queue.qid = cpu_to_le16(nvmeq->qid);
 
 	init_kthread_work(&nvmeq->cmdinfo.work, fn);
-	return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo,
-								ADMIN_TIMEOUT);
+	return nvme_submit_async_cmd(nvmeq->dev->admin_q, &c, ADMIN_TIMEOUT,
+				     GFP_KERNEL, false,
+				     &nvmeq->cmdinfo, async_cmd_info_endio);
 }
 
 static void nvme_del_cq_work_handler(struct kthread_work *work)
-- 
1.9.1


* [PATCH 4/7] nvme: switch abort to blk_execute_rq_nowait
  2015-09-21 18:40 nvme completion path optimizations and fixes Christoph Hellwig
                   ` (2 preceding siblings ...)
  2015-09-21 18:40 ` [PATCH 3/7] nvme: switch delete SQ/CQ to blk_execute_rq_nowait Christoph Hellwig
@ 2015-09-21 18:40 ` Christoph Hellwig
  2015-09-21 18:40 ` [PATCH 5/7] nvme: simplify completion handling Christoph Hellwig
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 13+ messages in thread
From: Christoph Hellwig @ 2015-09-21 18:40 UTC


And remove the now unused nvme_submit_cmd helper.

Signed-off-by: Christoph Hellwig <hch at lst.de>
---
 drivers/block/nvme-core.c | 52 ++++++++++++++++-------------------------------
 1 file changed, 17 insertions(+), 35 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index d15c886..c49f07f 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -303,20 +303,6 @@ static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn)
 	return ctx;
 }
 
-static void abort_completion(struct nvme_queue *nvmeq, void *ctx,
-						struct nvme_completion *cqe)
-{
-	struct request *req = ctx;
-
-	u16 status = le16_to_cpup(&cqe->status) >> 1;
-	u32 result = le32_to_cpup(&cqe->result);
-
-	blk_mq_free_request(req);
-
-	dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result);
-	++nvmeq->dev->abort_limit;
-}
-
 static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq,
 				  unsigned int tag)
 {
@@ -346,7 +332,7 @@ static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag,
 }
 
 /**
- * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
+ * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
  * @nvmeq: The queue to use
  * @cmd: The command to send
  *
@@ -368,14 +354,6 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
 	nvmeq->sq_tail = tail;
 }
 
-static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
-{
-	unsigned long flags;
-	spin_lock_irqsave(&nvmeq->q_lock, flags);
-	__nvme_submit_cmd(nvmeq, cmd);
-	spin_unlock_irqrestore(&nvmeq->q_lock, flags);
-}
-
 static __le64 **iod_list(struct nvme_iod *iod)
 {
 	return ((void *)iod) + iod->offset;
@@ -1256,6 +1234,19 @@ int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log)
 	return error;
 }
 
+static void abort_endio(struct request *req, int error)
+{
+	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
+	struct nvme_queue *nvmeq = cmd->nvmeq;
+	u32 result = (u32)(uintptr_t)req->special;
+	u16 status = req->errors;
+
+	dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result);
+	++nvmeq->dev->abort_limit;
+
+	blk_mq_free_request(req);
+}
+
 /**
  * nvme_abort_req - Attempt aborting a request
  *
@@ -1267,8 +1258,6 @@ static void nvme_abort_req(struct request *req)
 	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
 	struct nvme_queue *nvmeq = cmd_rq->nvmeq;
 	struct nvme_dev *dev = nvmeq->dev;
-	struct request *abort_req;
-	struct nvme_cmd_info *abort_cmd;
 	struct nvme_command cmd;
 
 	if (!nvmeq->qid || cmd_rq->aborted) {
@@ -1290,26 +1279,19 @@ static void nvme_abort_req(struct request *req)
 	if (!dev->abort_limit)
 		return;
 
-	abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC,
-									false);
-	if (IS_ERR(abort_req))
-		return;
-
-	abort_cmd = blk_mq_rq_to_pdu(abort_req);
-	nvme_set_info(abort_cmd, abort_req, abort_completion);
-
 	memset(&cmd, 0, sizeof(cmd));
 	cmd.abort.opcode = nvme_admin_abort_cmd;
 	cmd.abort.cid = req->tag;
 	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
-	cmd.abort.command_id = abort_req->tag;
 
 	--dev->abort_limit;
 	cmd_rq->aborted = 1;
 
 	dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag,
 							nvmeq->qid);
-	nvme_submit_cmd(dev->queues[0], &cmd);
+
+	nvme_submit_async_cmd(dev->admin_q, &cmd, 0, GFP_ATOMIC, true,
+			NULL, abort_endio);
 }
 
 static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved)
-- 
1.9.1


* [PATCH 5/7] nvme: simplify completion handling
  2015-09-21 18:40 nvme completion path optimizations and fixes Christoph Hellwig
                   ` (3 preceding siblings ...)
  2015-09-21 18:40 ` [PATCH 4/7] nvme: switch abort " Christoph Hellwig
@ 2015-09-21 18:40 ` Christoph Hellwig
  2015-09-21 18:40 ` [PATCH 6/7] nvme: properly free resources for cancelled command Christoph Hellwig
  2015-09-21 18:40 ` [PATCH 7/7] nvme: micro optimize nvme_submit_priv Christoph Hellwig
  6 siblings, 0 replies; 13+ messages in thread
From: Christoph Hellwig @ 2015-09-21 18:40 UTC


Now that all commands are executed as block layer requests we can
remove the internal completion handling in the NVMe driver.  Note that
we can simply call blk_mq_complete_request to abort commands, as the
block layer will protect against double completions internally.
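
Concretely, after patch 1 blk_mq_complete_request only acts for the
first caller:

    if (!blk_mark_rq_complete(rq)) {
            rq->errors = error;
            __blk_mq_complete_request(rq);
    }

so a cancellation racing with a normal completion (or vice versa) simply
turns into a no-op for the loser.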

Signed-off-by: Christoph Hellwig <hch at lst.de>
---
 drivers/block/nvme-core.c | 143 ++++++++++------------------------------------
 1 file changed, 29 insertions(+), 114 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index c49f07f..5649f8f 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -141,15 +141,11 @@ static inline void _nvme_check_size(void)
 	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
 }
 
-typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
-						struct nvme_completion *);
-
 struct nvme_cmd_info {
-	nvme_completion_fn fn;
-	void *ctx;
 	int aborted;
 	struct nvme_queue *nvmeq;
-	struct nvme_iod iod[0];
+	struct nvme_iod *iod;
+	struct nvme_iod __iod[0];
 };
 
 /*
@@ -243,15 +239,6 @@ static int nvme_init_request(void *data, struct request *req,
 	return 0;
 }
 
-static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
-				nvme_completion_fn handler)
-{
-	cmd->fn = handler;
-	cmd->ctx = ctx;
-	cmd->aborted = 0;
-	blk_mq_start_request(blk_mq_rq_from_pdu(cmd));
-}
-
 static void *iod_get_private(struct nvme_iod *iod)
 {
 	return (void *) (iod->private & ~0x1UL);
@@ -265,72 +252,6 @@ static bool iod_should_kfree(struct nvme_iod *iod)
 	return (iod->private & NVME_INT_MASK) == 0;
 }
 
-/* Special values must be less than 0x1000 */
-#define CMD_CTX_BASE		((void *)POISON_POINTER_DELTA)
-#define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
-#define CMD_CTX_COMPLETED	(0x310 + CMD_CTX_BASE)
-#define CMD_CTX_INVALID		(0x314 + CMD_CTX_BASE)
-
-static void special_completion(struct nvme_queue *nvmeq, void *ctx,
-						struct nvme_completion *cqe)
-{
-	if (ctx == CMD_CTX_CANCELLED)
-		return;
-	if (ctx == CMD_CTX_COMPLETED) {
-		dev_warn(nvmeq->q_dmadev,
-				"completed id %d twice on queue %d\n",
-				cqe->command_id, le16_to_cpup(&cqe->sq_id));
-		return;
-	}
-	if (ctx == CMD_CTX_INVALID) {
-		dev_warn(nvmeq->q_dmadev,
-				"invalid id %d completed on queue %d\n",
-				cqe->command_id, le16_to_cpup(&cqe->sq_id));
-		return;
-	}
-	dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx);
-}
-
-static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn)
-{
-	void *ctx;
-
-	if (fn)
-		*fn = cmd->fn;
-	ctx = cmd->ctx;
-	cmd->fn = special_completion;
-	cmd->ctx = CMD_CTX_CANCELLED;
-	return ctx;
-}
-
-static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq,
-				  unsigned int tag)
-{
-	struct request *req = blk_mq_tag_to_rq(*nvmeq->tags, tag);
-
-	return blk_mq_rq_to_pdu(req);
-}
-
-/*
- * Called with local interrupts disabled and the q_lock held.  May not sleep.
- */
-static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag,
-						nvme_completion_fn *fn)
-{
-	struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag);
-	void *ctx;
-	if (tag >= nvmeq->q_depth) {
-		*fn = special_completion;
-		return CMD_CTX_INVALID;
-	}
-	if (fn)
-		*fn = cmd->fn;
-	ctx = cmd->ctx;
-	cmd->fn = special_completion;
-	cmd->ctx = CMD_CTX_COMPLETED;
-	return ctx;
-}
-
 /**
  * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
  * @nvmeq: The queue to use
@@ -394,7 +315,7 @@ static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
 	    size <= NVME_INT_BYTES(dev)) {
 		struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq);
 
-		iod = cmd->iod;
+		iod = cmd->__iod;
 		iod_init(iod, size, rq->nr_phys_segments,
 				(unsigned long) rq | NVME_INT_MASK);
 		return iod;
@@ -543,13 +464,15 @@ static void nvme_init_integrity(struct nvme_ns *ns)
 }
 #endif
 
-static void req_completion(struct nvme_queue *nvmeq, void *ctx,
-						struct nvme_completion *cqe)
+/*
+ * Called with local interrupts disabled and the q_lock held.  May not sleep.
+ */
+static void nvme_finish_cmd(struct nvme_queue *nvmeq,
+		struct nvme_completion *cqe)
 {
-	struct nvme_iod *iod = ctx;
-	struct request *req = iod_get_private(iod);
+	struct request *req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id);
 	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
-
+	struct nvme_iod *iod = cmd_rq->iod;
 	u16 status = le16_to_cpup(&cqe->status) >> 1;
 
 	if (unlikely(status)) {
@@ -565,12 +488,8 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 			return;
 		}
 
-		if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
-			if (cmd_rq->ctx == CMD_CTX_CANCELLED)
-				status = -EINTR;
-		} else {
+		if (req->cmd_type != REQ_TYPE_DRV_PRIV)
 			status = nvme_error_status(status);
-		}
 	}
 
 	if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
@@ -863,7 +782,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 		}
 	}
 
-	nvme_set_info(cmd, iod, req_completion);
+	cmd->iod = iod;
+	cmd->aborted = 0;
+	blk_mq_start_request(req);
+
 	spin_lock_irq(&nvmeq->q_lock);
 	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
 		nvme_submit_priv(nvmeq, req, iod);
@@ -894,8 +816,6 @@ static int nvme_process_cq(struct nvme_queue *nvmeq)
 	phase = nvmeq->cq_phase;
 
 	for (;;) {
-		void *ctx;
-		nvme_completion_fn fn;
 		struct nvme_completion cqe = nvmeq->cqes[head];
 		if ((le16_to_cpu(cqe.status) & 1) != phase)
 			break;
@@ -904,8 +824,14 @@ static int nvme_process_cq(struct nvme_queue *nvmeq)
 			head = 0;
 			phase = !phase;
 		}
-		ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn);
-		fn(nvmeq, ctx, &cqe);
+
+		if (unlikely(cqe.command_id >= nvmeq->q_depth)) {
+			dev_warn(nvmeq->q_dmadev,
+				"invalid id %d completed on queue %d\n",
+				cqe.command_id, le16_to_cpu(cqe.sq_id));
+			continue;
+		}
+		nvme_finish_cmd(nvmeq, &cqe);
 	}
 
 	/* If the controller ignores the cq head doorbell and continuously
@@ -1297,29 +1223,18 @@ static void nvme_abort_req(struct request *req)
 static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved)
 {
 	struct nvme_queue *nvmeq = data;
-	void *ctx;
-	nvme_completion_fn fn;
-	struct nvme_cmd_info *cmd;
-	struct nvme_completion cqe;
 
 	if (!blk_mq_request_started(req))
 		return;
 
-	cmd = blk_mq_rq_to_pdu(req);
-
-	if (cmd->ctx == CMD_CTX_CANCELLED)
-		return;
-
-	if (blk_queue_dying(req->q))
-		cqe.status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1);
-	else
-		cqe.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1);
-
-
 	dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n",
 						req->tag, nvmeq->qid);
-	ctx = cancel_cmd_info(cmd, &fn);
-	fn(nvmeq, ctx, &cqe);
+
+	/*
+	 * Use a negative Linux errno here so that the submitter can
+	 * distinguish between hardware and driver errors.
+	 */
+	blk_mq_complete_request(req, -EINTR);
 }
 
 static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
-- 
1.9.1


* [PATCH 6/7] nvme: properly free resources for cancelled command
  2015-09-21 18:40 nvme completion path optimizations and fixes Christoph Hellwig
                   ` (4 preceding siblings ...)
  2015-09-21 18:40 ` [PATCH 5/7] nvme: simplify completion handling Christoph Hellwig
@ 2015-09-21 18:40 ` Christoph Hellwig
  2015-09-21 18:40 ` [PATCH 7/7] nvme: micro optimize nvme_submit_priv Christoph Hellwig
  6 siblings, 0 replies; 13+ messages in thread
From: Christoph Hellwig @ 2015-09-21 18:40 UTC


We need to move the freeing of resources to the ->complete handler to
ensure they are also freed when we cancel the command.

Clear the QUEUE_FLAG_SAME_COMP flag to ensure we don't bounce
completions to another CPU now that we have a ->complete handler.
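
With this change the completion work is split in two: the interrupt path
only posts the completion, while the DMA unmapping, iod freeing and the
final blk_mq_end_request call move into the ->complete callback wired up
through blk_mq_ops, as in the hunks below (reduced here to the relevant
fields):

    static struct blk_mq_ops nvme_mq_ops = {
            .queue_rq  = nvme_queue_rq,
            .complete  = nvme_complete_rq,  /* also runs for cancelled commands */
            /* ... */
    };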

Signed-off-by: Christoph Hellwig <hch at lst.de>
---
 drivers/block/nvme-core.c | 57 +++++++++++++++++++++++++++++++----------------
 1 file changed, 38 insertions(+), 19 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 5649f8f..6cc8f58 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -471,8 +471,6 @@ static void nvme_finish_cmd(struct nvme_queue *nvmeq,
 		struct nvme_completion *cqe)
 {
 	struct request *req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id);
-	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
-	struct nvme_iod *iod = cmd_rq->iod;
 	u16 status = le16_to_cpup(&cqe->status) >> 1;
 
 	if (unlikely(status)) {
@@ -497,23 +495,6 @@ static void nvme_finish_cmd(struct nvme_queue *nvmeq,
 		req->special = (void *)(uintptr_t)result;
 	}
 
-	if (cmd_rq->aborted)
-		dev_warn(nvmeq->dev->dev,
-			"completing aborted command with status:%04x\n",
-			status);
-
-	if (iod->nents) {
-		dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents,
-			rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-		if (blk_integrity_rq(req)) {
-			if (!rq_data_dir(req))
-				nvme_dif_remap(req, nvme_dif_complete);
-			dma_unmap_sg(nvmeq->dev->dev, iod->meta_sg, 1,
-				rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-		}
-	}
-	nvme_free_iod(nvmeq->dev, iod);
-
 	blk_mq_complete_request(req, status);
 }
 
@@ -808,6 +789,34 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return BLK_MQ_RQ_QUEUE_BUSY;
 }
 
+static void nvme_complete_rq(struct request *req)
+{
+	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
+	struct nvme_queue *nvmeq = cmd->nvmeq;
+	struct nvme_iod *iod = cmd->iod;
+
+	if (cmd->aborted) {
+		dev_warn(nvmeq->dev->dev,
+			"completing aborted command with status:%04x\n",
+			req->errors);
+	}
+
+	if (iod->nents) {
+		enum dma_data_direction dir = rq_data_dir(req) ?
+					DMA_TO_DEVICE : DMA_FROM_DEVICE;
+
+		dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents, dir);
+		if (blk_integrity_rq(req)) {
+			if (!rq_data_dir(req))
+				nvme_dif_remap(req, nvme_dif_complete);
+			dma_unmap_sg(nvmeq->dev->dev, iod->meta_sg, 1, dir);
+		}
+	}
+
+	nvme_free_iod(nvmeq->dev, iod);
+	blk_mq_end_request(req, req->errors);
+}
+
 static int nvme_process_cq(struct nvme_queue *nvmeq)
 {
 	u16 head, phase;
@@ -1543,6 +1552,7 @@ static int nvme_shutdown_ctrl(struct nvme_dev *dev)
 
 static struct blk_mq_ops nvme_mq_admin_ops = {
 	.queue_rq	= nvme_queue_rq,
+	.complete	= nvme_complete_rq,
 	.map_queue	= blk_mq_map_queue,
 	.init_hctx	= nvme_admin_init_hctx,
 	.exit_hctx      = nvme_admin_exit_hctx,
@@ -1552,6 +1562,7 @@ static struct blk_mq_ops nvme_mq_admin_ops = {
 
 static struct blk_mq_ops nvme_mq_ops = {
 	.queue_rq	= nvme_queue_rq,
+	.complete	= nvme_complete_rq,
 	.map_queue	= blk_mq_map_queue,
 	.init_hctx	= nvme_init_hctx,
 	.init_request	= nvme_init_request,
@@ -1591,6 +1602,10 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
 			dev->admin_q = NULL;
 			return -ENODEV;
 		}
+
+		/* we assume that we always have a local completion queue */
+		queue_flag_clear_unlocked(QUEUE_FLAG_SAME_COMP, dev->admin_q);
+
 		dev->admin_q->queuedata = dev;
 	} else
 		blk_mq_unfreeze_queue(dev->admin_q);
@@ -2017,6 +2032,10 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid)
 		goto out_free_ns;
 	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
+
+	/* we assume that we always have a local completion queue */
+	queue_flag_clear_unlocked(QUEUE_FLAG_SAME_COMP, ns->queue);
+
 	ns->dev = dev;
 	ns->queue->queuedata = ns;
 
-- 
1.9.1


* [PATCH 7/7] nvme: micro optimize nvme_submit_priv
  2015-09-21 18:40 nvme completion path optimizations and fixes Christoph Hellwig
                   ` (5 preceding siblings ...)
  2015-09-21 18:40 ` [PATCH 6/7] nvme: properly free resources for cancelled command Christoph Hellwig
@ 2015-09-21 18:40 ` Christoph Hellwig
  6 siblings, 0 replies; 13+ messages in thread
From: Christoph Hellwig @ 2015-09-21 18:40 UTC


Avoid a copy of the command from the request to the stack.

Signed-off-by: Christoph Hellwig <hch at lst.de>
---
 drivers/block/nvme-core.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 6cc8f58..35236df 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -581,16 +581,15 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod,
 static void nvme_submit_priv(struct nvme_queue *nvmeq, struct request *req,
 		struct nvme_iod *iod)
 {
-	struct nvme_command cmnd;
+	struct nvme_command *cmnd = (struct nvme_command *)req->cmd;
 
-	memcpy(&cmnd, req->cmd, sizeof(cmnd));
-	cmnd.rw.command_id = req->tag;
+	cmnd->common.command_id = req->tag;
 	if (req->nr_phys_segments) {
-		cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
-		cmnd.rw.prp2 = cpu_to_le64(iod->first_dma);
+		cmnd->common.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+		cmnd->common.prp2 = cpu_to_le64(iod->first_dma);
 	}
 
-	__nvme_submit_cmd(nvmeq, &cmnd);
+	__nvme_submit_cmd(nvmeq, cmnd);
 }
 
 /*
-- 
1.9.1


* [PATCH 2/7] nvme: switch AEN processing to use blk_execute_rq_nowait
  2015-09-21 18:40 ` [PATCH 2/7] nvme: switch AEN processing to use blk_execute_rq_nowait Christoph Hellwig
@ 2015-09-23 23:16   ` Keith Busch
  2015-09-24 13:38     ` Christoph Hellwig
  0 siblings, 1 reply; 13+ messages in thread
From: Keith Busch @ 2015-09-23 23:16 UTC


On Mon, 21 Sep 2015, Christoph Hellwig wrote:
> +static int nvme_submit_aen(struct nvme_dev *dev)
> +{
> +	struct nvme_command c;
>
> 	memset(&c, 0, sizeof(c));
> 	c.common.opcode = nvme_admin_async_event;
> -	c.common.command_id = req->tag;
>
> -	blk_mq_free_request(req);
> -	__nvme_submit_cmd(nvmeq, &c);
> -	return 0;
> +	return nvme_submit_async_cmd(dev->admin_q, &c, ULONG_MAX, GFP_ATOMIC,
> +			true, NULL, aen_endio);
> }

AEN needs to release the request immediately. That's why it uses
a "reserved" tag and is tracked by the driver instead of the block
layer: there is no guarantee the controller will ever return it,
which deadlocks blk-mq's hot cpu notification. See
1efccc9ddb98fd533169669160201b027562af7e for details.
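
For reference, the "reserved" part is the last argument in the pre-patch
allocation (this assumes the admin tag set reserves at least one tag for
the purpose):

    req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, true);

The old code then freed the request again as soon as the command id had
been copied out, so blk-mq never had to wait for the controller to
return the tag.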

Other than that, love the rest of the series.


* [PATCH 2/7] nvme: switch AEN processing to use blk_execute_rq_nowait
  2015-09-23 23:16   ` Keith Busch
@ 2015-09-24 13:38     ` Christoph Hellwig
  2015-09-24 13:51       ` Keith Busch
  0 siblings, 1 reply; 13+ messages in thread
From: Christoph Hellwig @ 2015-09-24 13:38 UTC


On Wed, Sep 23, 2015 at 11:16:06PM +0000, Keith Busch wrote:
> AEN needs to release the request immediately. That's why it uses
> a "reserved" tag and is tracked by the driver instead of the block
> layer: there is no guarantee the controller will ever return it,
> which deadlocks blk-mq's hot cpu notification. See
> 1efccc9ddb98fd533169669160201b027562af7e for details.
>
> Other than that, love the rest of the series.

Unfortunately we really need to keep the struct request around to
avoid special casing AENs in the completion path.

Yigal, is there a nice easy way to trigger the CPU hotplug migration
through sysfs or another userspace interface?  I think I can solve
this without freeing the AEN early with a little more thought.


* [PATCH 2/7] nvme: switch AEN processing to use blk_execute_rq_nowait
  2015-09-24 13:38     ` Christoph Hellwig
@ 2015-09-24 13:51       ` Keith Busch
  2015-09-25 22:11         ` Christoph Hellwig
  0 siblings, 1 reply; 13+ messages in thread
From: Keith Busch @ 2015-09-24 13:51 UTC


On Thu, 24 Sep 2015, Christoph Hellwig wrote:
> Unfortunately we really need to keep the struct request around to
> avoid special casing AENs in the completion path.
>
> Yigal, is there a nice easy way to trigger the CPU hotplug migration
> through sysfs or another userspace interface?  I think I can solve
> this without freeing the AEN early with a little more thought.

To take a cpu offline and trigger a removal:

   echo 0 > /sys/devices/system/cpu/cpu1/online


* [PATCH 2/7] nvme: switch AEN processing to use blk_execute_rq_nowait
  2015-09-24 13:51       ` Keith Busch
@ 2015-09-25 22:11         ` Christoph Hellwig
  0 siblings, 0 replies; 13+ messages in thread
From: Christoph Hellwig @ 2015-09-25 22:11 UTC


On Thu, Sep 24, 2015 at 01:51:33PM +0000, Keith Busch wrote:
> >Yigal, is there a nice easy way to trigger the CPU hotplug migration
> >through sysfs or another userspace interface?  I think I can solve
> >this without freeing the AEN early with a little more thought.
> 
> To take a cpu offline and trigger a removal:
> 
>   echo 0 > /sys/devices/system/cpu/cpu1/online

That is already broken in 4.3-rc2 unfortunately:

[  149.094678] Failed to recover vector for irq 104
[  149.144128] BUG: unable to handle kernel NULL pointer dereference at 00000000
[  149.152022] IP: [<ffffffff8131e0b1>] blk_mq_map_swqueue+0xf1/0x260
[  149.158239] PGD c9d59067 PUD c78ac067 PMD 0 
[  149.162556] Oops: 0002 [#1] SMP DEBUG_PAGEALLOC 
[  149.167222] Modules linked in: nfsd auth_rpcgss oid_registry nfs_acl nfs loc2
[  149.249367] CPU: 7 PID: 1192 Comm: bash Tainted: G          I 4.3.0-rc2+9
[  149.256788] Hardware name: Dell Inc. PowerEdge R710/00NH4P, BIOS 3.0.0 01/311
[  149.264295] task: ffff8806115f8040 ti: ffff880613124000 task.ti: ffff88061310
[  149.271805] RIP: 0010:[<ffffffff8131e0b1>]  [<ffffffff8131e0b1>] blk_mq_map_0
[  149.280461] RSP: 0018:ffff880613127c28  EFLAGS: 00010282
[  149.285789] RAX: ffff880604b9ac00 RBX: ffffe8fcf6f06a80 RCX: ffff8803234edf00
[  149.292946] RDX: 0000000000000000 RSI: 0000000000000009 RDI: ffff8806049c12e0
[  149.300103] RBP: ffff880613127c60 R08: 0000000000000007 R09: 0000000000000000
[  149.307261] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000009
[  149.314419] R13: ffff8806049c12e0 R14: ffff880607fcdc20 R15: ffffffff81d2ad00
[  149.321577] FS:  00007fa17ee79700(0000) GS:ffff880323ac0000(0000) knlGS:00000
[  149.329694] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  149.335457] CR2: 0000000000000080 CR3: 00000000caf51000 CR4: 00000000000006e0
[  149.342616] Stack:
[  149.344633]  0000000000000000 0000000000000000 ffff8806049c12e0 ffffffff81cc0
[  149.352101]  ffffffff81cfaa00 0000000000000000 0000000000000001 ffff880613120
[  149.359567]  ffffffff81320227 ffffffff81cc4740 00000000ffffffe1 ffff880613120
[  149.367034] Call Trace:
[  149.369491]  [<ffffffff81320227>] blk_mq_queue_reinit_notify+0x137/0x1d0
[  149.376216]  [<ffffffff810a16fd>] notifier_call_chain+0x5d/0x80
[  149.382156]  [<ffffffff810a172e>] __raw_notifier_call_chain+0xe/0x10
[  149.388533]  [<ffffffff8107ba13>] cpu_notify+0x23/0x40
[  149.393687]  [<ffffffff8107bb3e>] cpu_notify_nofail+0xe/0x20
[  149.399365]  [<ffffffff8107be05>] _cpu_down+0x1b5/0x2b0
[  149.404607]  [<ffffffff8107bf36>] cpu_down+0x36/0x50
[  149.409589]  [<ffffffff8146b9a4>] cpu_subsys_offline+0x14/0x20
[  149.415443]  [<ffffffff81466478>] device_offline+0x88/0xb0
[  149.420947]  [<ffffffff8146656d>] online_store+0x3d/0x80
[  149.426277]  [<ffffffff81463798>] dev_attr_store+0x18/0x30
[  149.431782]  [<ffffffff81296765>] sysfs_kf_write+0x45/0x60
[  149.437286]  [<ffffffff81296111>] kernfs_fop_write+0x141/0x190
[  149.443140]  [<ffffffff8120e8b8>] __vfs_write+0x28/0xe0
[  149.448385]  [<ffffffff810cb92f>] ? percpu_down_read+0x5f/0xa0
[  149.455461]  [<ffffffff81211cc1>] ? __sb_start_write+0xf1/0x110
[  149.462610]  [<ffffffff81211cc1>] ? __sb_start_write+0xf1/0x110
[  149.469747]  [<ffffffff8120efa9>] vfs_write+0xa9/0x190
[  149.476074]  [<ffffffff8122ff96>] ? __fget_light+0x66/0x90
[  149.482726]  [<ffffffff8120fcb9>] SyS_write+0x49/0xa0
[  149.488935]  [<ffffffff81618236>] entry_SYSCALL_64_fastpath+0x16/0x7a


* [PATCH 1/7] blk-mq: fix racy updates of rq->errors
  2015-09-21 18:40 ` [PATCH 1/7] blk-mq: fix racy updates of rq->errors Christoph Hellwig
@ 2015-09-27  7:35   ` Sagi Grimberg
  0 siblings, 0 replies; 13+ messages in thread
From: Sagi Grimberg @ 2015-09-27  7:35 UTC


On 9/21/2015 9:40 PM, Christoph Hellwig wrote:
> blk_mq_complete_request may be a no-op if the request has already
> been completed by others means (e.g. a timeout or cancellation), but
> currently drivers have to set rq->errors before calling
> blk_mq_complete_request, which might leave us with the wrong error value.
>
> Add an error parameter to blk_mq_complete_request so that we can
> defer setting rq->errors until we known we won the race to complete the
> request.
>
> Signed-off-by: Christoph Hellwig <hch at lst.de>

Looks good.

Reviewed-by: Sagi Grimberg <sagig at mellanox.com>

