[PATCH 0/4] nvme-blkmq fixes

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 0/4] nvme-blkmq fixes
@ 2014-12-20  0:54 Keith Busch
  2014-12-20  0:54 ` [PATCH 1/4] blk-mq: Exit queue on alloc failure Keith Busch
                   ` (4 more replies)
  0 siblings, 5 replies; 27+ messages in thread
From: Keith Busch @ 2014-12-20  0:54 UTC (permalink / raw)


This fixes some surprise hot plug and error handling cases.

There is still one deadlock when we run out of tags and get stuck in
bt_get, but Jens sent me a test patch that I verfied fixes that when
used with this set.

Keith Busch (4):
  blk-mq: Exit queue on alloc failure
  blk-mq: Export freeze/unfreeze functions
  NVMe: Fix double free irq
  NVMe: Freeze queues on shutdown

 block/blk-mq.c            |   10 +++++++---
 drivers/block/nvme-core.c |   46 ++++++++++++++++++++++++++++++++++++---------
 include/linux/blk-mq.h    |    2 ++
 3 files changed, 46 insertions(+), 12 deletions(-)

-- 
1.7.10.4

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 1/4] blk-mq: Exit queue on alloc failure
  2014-12-20  0:54 [PATCH 0/4] nvme-blkmq fixes Keith Busch
@ 2014-12-20  0:54 ` Keith Busch
  2014-12-20  0:54 ` [PATCH 2/4] blk-mq: Export freeze/unfreeze functions Keith Busch
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 27+ messages in thread
From: Keith Busch @ 2014-12-20  0:54 UTC (permalink / raw)


Fixes usage counter when a request could not be allocated.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 block/blk-mq.c |    4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6cd94ba..68050ae 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -258,8 +258,10 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
 		ctx = alloc_data.ctx;
 	}
 	blk_mq_put_ctx(ctx);
-	if (!rq)
+	if (!rq) {
+		blk_mq_queue_exit(q);
 		return ERR_PTR(-EWOULDBLOCK);
+	}
 	return rq;
 }
 EXPORT_SYMBOL(blk_mq_alloc_request);
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 2/4] blk-mq: Export freeze/unfreeze functions
  2014-12-20  0:54 [PATCH 0/4] nvme-blkmq fixes Keith Busch
  2014-12-20  0:54 ` [PATCH 1/4] blk-mq: Exit queue on alloc failure Keith Busch
@ 2014-12-20  0:54 ` Keith Busch
  2014-12-20  0:54 ` [PATCH 3/4] NVMe: Fix double free irq Keith Busch
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 27+ messages in thread
From: Keith Busch @ 2014-12-20  0:54 UTC (permalink / raw)


Let drivers prevent entering a queue that isn't available.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 block/blk-mq.c         |    6 ++++--
 include/linux/blk-mq.h |    2 ++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 68050ae..0b227fa 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -107,7 +107,7 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref)
 	wake_up_all(&q->mq_freeze_wq);
 }
 
-static void blk_mq_freeze_queue_start(struct request_queue *q)
+void blk_mq_freeze_queue_start(struct request_queue *q)
 {
 	bool freeze;
 
@@ -120,6 +120,7 @@ static void blk_mq_freeze_queue_start(struct request_queue *q)
 		blk_mq_run_queues(q, false);
 	}
 }
+EXPORT_SYMBOL(blk_mq_freeze_queue_start);
 
 static void blk_mq_freeze_queue_wait(struct request_queue *q)
 {
@@ -136,7 +137,7 @@ void blk_mq_freeze_queue(struct request_queue *q)
 	blk_mq_freeze_queue_wait(q);
 }
 
-static void blk_mq_unfreeze_queue(struct request_queue *q)
+void blk_mq_unfreeze_queue(struct request_queue *q)
 {
 	bool wake;
 
@@ -149,6 +150,7 @@ static void blk_mq_unfreeze_queue(struct request_queue *q)
 		wake_up_all(&q->mq_freeze_wq);
 	}
 }
+EXPORT_SYMBOL(blk_mq_unfreeze_queue);
 
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
 {
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index fb0a4fb..87b22c2 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -195,6 +195,8 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
 void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
 		void *priv);
+void blk_mq_unfreeze_queue(struct request_queue *q);
+void blk_mq_freeze_queue_start(struct request_queue *q);
 
 /*
  * Driver command data is immediately after the request. So subtract request
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 3/4] NVMe: Fix double free irq
  2014-12-20  0:54 [PATCH 0/4] nvme-blkmq fixes Keith Busch
  2014-12-20  0:54 ` [PATCH 1/4] blk-mq: Exit queue on alloc failure Keith Busch
  2014-12-20  0:54 ` [PATCH 2/4] blk-mq: Export freeze/unfreeze functions Keith Busch
@ 2014-12-20  0:54 ` Keith Busch
  2014-12-22 15:15   ` Matthew Wilcox
  2014-12-20  0:54 ` [PATCH 4/4] NVMe: Freeze queues on shutdown Keith Busch
  2014-12-20 17:32 ` [PATCH 0/4] nvme-blkmq fixes Jens Axboe
  4 siblings, 1 reply; 27+ messages in thread
From: Keith Busch @ 2014-12-20  0:54 UTC (permalink / raw)


Sets the vector to an invalid value after it's freed so we don't free
it twice.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/block/nvme-core.c |   19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index b1d5d87..9642c83 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -106,7 +106,7 @@ struct nvme_queue {
 	dma_addr_t cq_dma_addr;
 	u32 __iomem *q_db;
 	u16 q_depth;
-	u16 cq_vector;
+	int cq_vector;
 	u16 sq_head;
 	u16 sq_tail;
 	u16 cq_head;
@@ -1131,10 +1131,16 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
  */
 static int nvme_suspend_queue(struct nvme_queue *nvmeq)
 {
-	int vector = nvmeq->dev->entry[nvmeq->cq_vector].vector;
+	int vector;
 
 	spin_lock_irq(&nvmeq->q_lock);
+	if (nvmeq->cq_vector == -1) {
+		spin_unlock_irq(&nvmeq->q_lock);
+		return 1;
+	}
+	vector = nvmeq->dev->entry[nvmeq->cq_vector].vector;
 	nvmeq->dev->online_queues--;
+	nvmeq->cq_vector = -1;
 	spin_unlock_irq(&nvmeq->q_lock);
 
 	irq_set_affinity_hint(vector, NULL);
@@ -1173,7 +1179,7 @@ static void nvme_disable_queue(struct nvme_dev *dev, int qid)
 }
 
 static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
-							int depth, int vector)
+							int depth)
 {
 	struct device *dmadev = &dev->pci_dev->dev;
 	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL);
@@ -1199,7 +1205,6 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
 	nvmeq->cq_phase = 1;
 	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
 	nvmeq->q_depth = depth;
-	nvmeq->cq_vector = vector;
 	nvmeq->qid = qid;
 	dev->queue_count++;
 	dev->queues[qid] = nvmeq;
@@ -1252,6 +1257,7 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
 	if (result < 0)
 		goto release_cq;
 
+	nvmeq->cq_vector = qid - 1;
 	result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
 	if (result < 0)
 		goto release_sq;
@@ -1416,7 +1422,7 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
 
 	nvmeq = dev->queues[0];
 	if (!nvmeq) {
-		nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, 0);
+		nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
 		if (!nvmeq)
 			return -ENOMEM;
 	}
@@ -1443,6 +1449,7 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
 	if (result)
 		goto free_nvmeq;
 
+	nvmeq->cq_vector = 0;
 	result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
 	if (result)
 		goto free_tags;
@@ -1944,7 +1951,7 @@ static void nvme_create_io_queues(struct nvme_dev *dev)
 	unsigned i;
 
 	for (i = dev->queue_count; i <= dev->max_qid; i++)
-		if (!nvme_alloc_queue(dev, i, dev->q_depth, i - 1))
+		if (!nvme_alloc_queue(dev, i, dev->q_depth))
 			break;
 
 	for (i = dev->online_queues; i <= dev->queue_count - 1; i++)
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 4/4] NVMe: Freeze queues on shutdown
  2014-12-20  0:54 [PATCH 0/4] nvme-blkmq fixes Keith Busch
                   ` (2 preceding siblings ...)
  2014-12-20  0:54 ` [PATCH 3/4] NVMe: Fix double free irq Keith Busch
@ 2014-12-20  0:54 ` Keith Busch
  2014-12-20 17:32 ` [PATCH 0/4] nvme-blkmq fixes Jens Axboe
  4 siblings, 0 replies; 27+ messages in thread
From: Keith Busch @ 2014-12-20  0:54 UTC (permalink / raw)


We need to prevent entering queues and allocating requests when we are
about to disable a h/w queue.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/block/nvme-core.c |   27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 9642c83..3063958 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -2244,10 +2244,13 @@ static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev)
 					fatal_signal_pending(current)) {
 			set_current_state(TASK_RUNNING);
 
+			blk_mq_freeze_queue_start(dev->admin_q);
 			nvme_disable_ctrl(dev, readq(&dev->bar->cap));
 			nvme_disable_queue(dev, 0);
 
-			send_sig(SIGKILL, dq->worker->task, 1);
+			blk_cleanup_queue(dev->admin_q);
+			blk_mq_unfreeze_queue(dev->admin_q);
+
 			flush_kthread_worker(dq->worker);
 			return;
 		}
@@ -2383,6 +2386,22 @@ static void nvme_dev_list_remove(struct nvme_dev *dev)
 		kthread_stop(tmp);
 }
 
+static void nvme_freeze_queues(struct nvme_dev *dev)
+{
+	struct nvme_ns *ns;
+
+	list_for_each_entry(ns, &dev->namespaces, list)
+		blk_mq_freeze_queue_start(ns->queue);
+}
+
+static void nvme_unfreeze_queues(struct nvme_dev *dev)
+{
+	struct nvme_ns *ns;
+
+	list_for_each_entry(ns, &dev->namespaces, list)
+		blk_mq_unfreeze_queue(ns->queue);
+}
+
 static void nvme_dev_shutdown(struct nvme_dev *dev)
 {
 	int i;
@@ -2391,6 +2410,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
 	dev->initialized = 0;
 	nvme_dev_list_remove(dev);
 
+	nvme_freeze_queues(dev);
 	if (dev->bar)
 		csts = readl(&dev->bar->csts);
 	if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) {
@@ -2646,7 +2666,8 @@ static int nvme_dev_resume(struct nvme_dev *dev)
 		dev->reset_workfn = nvme_remove_disks;
 		queue_work(nvme_workq, &dev->reset_work);
 		spin_unlock(&dev_list_lock);
-	}
+	} else
+		nvme_unfreeze_queues(dev);
 	dev->initialized = 1;
 	return 0;
 }
@@ -2783,8 +2804,8 @@ static void nvme_remove(struct pci_dev *pdev)
 	pci_set_drvdata(pdev, NULL);
 	flush_work(&dev->reset_work);
 	misc_deregister(&dev->miscdev);
-	nvme_dev_remove(dev);
 	nvme_dev_shutdown(dev);
+	nvme_dev_remove(dev);
 	nvme_dev_remove_admin(dev);
 	nvme_free_queues(dev, 0);
 	nvme_free_admin_tags(dev);
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 0/4] nvme-blkmq fixes
  2014-12-20  0:54 [PATCH 0/4] nvme-blkmq fixes Keith Busch
                   ` (3 preceding siblings ...)
  2014-12-20  0:54 ` [PATCH 4/4] NVMe: Freeze queues on shutdown Keith Busch
@ 2014-12-20 17:32 ` Jens Axboe
  2014-12-20 19:29   ` Jens Axboe
  4 siblings, 1 reply; 27+ messages in thread
From: Jens Axboe @ 2014-12-20 17:32 UTC (permalink / raw)


On 12/19/2014 05:54 PM, Keith Busch wrote:
> This fixes some surprise hot plug and error handling cases.
> 
> There is still one deadlock when we run out of tags and get stuck in
> bt_get, but Jens sent me a test patch that I verfied fixes that when
> used with this set.

Thanks Keith, I'll apply these for the current series. Only change I'll
make is turn the freeze/unfreeze exports into _GPL exports.


-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 0/4] nvme-blkmq fixes
  2014-12-20 17:32 ` [PATCH 0/4] nvme-blkmq fixes Jens Axboe
@ 2014-12-20 19:29   ` Jens Axboe
  2014-12-22 16:38     ` Keith Busch
  0 siblings, 1 reply; 27+ messages in thread
From: Jens Axboe @ 2014-12-20 19:29 UTC (permalink / raw)


On 12/20/2014 10:32 AM, Jens Axboe wrote:
> On 12/19/2014 05:54 PM, Keith Busch wrote:
>> This fixes some surprise hot plug and error handling cases.
>>
>> There is still one deadlock when we run out of tags and get stuck in
>> bt_get, but Jens sent me a test patch that I verfied fixes that when
>> used with this set.
>
> Thanks Keith, I'll apply these for the current series. Only change I'll
> make is turn the freeze/unfreeze exports into _GPL exports.

Here's the patch referenced. Keith, if you tested it, can I add your 
tested/reviewed-by to it?

-- 
Jens Axboe

-------------- next part --------------
A non-text attachment was scrubbed...
Name: blk-mq-wake-on-dying.patch
Type: text/x-patch
Size: 3004 bytes
Desc: not available
URL: <http://lists.infradead.org/pipermail/linux-nvme/attachments/20141220/a39f758d/attachment-0001.bin>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 3/4] NVMe: Fix double free irq
  2014-12-20  0:54 ` [PATCH 3/4] NVMe: Fix double free irq Keith Busch
@ 2014-12-22 15:15   ` Matthew Wilcox
  0 siblings, 0 replies; 27+ messages in thread
From: Matthew Wilcox @ 2014-12-22 15:15 UTC (permalink / raw)


On Fri, Dec 19, 2014@05:54:15PM -0700, Keith Busch wrote:
> Sets the vector to an invalid value after it's freed so we don't free
> it twice.
> 
> Signed-off-by: Keith Busch <keith.busch at intel.com>
> ---
>  drivers/block/nvme-core.c |   19 +++++++++++++------
>  1 file changed, 13 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
> index b1d5d87..9642c83 100644
> --- a/drivers/block/nvme-core.c
> +++ b/drivers/block/nvme-core.c
> @@ -106,7 +106,7 @@ struct nvme_queue {
>  	dma_addr_t cq_dma_addr;
>  	u32 __iomem *q_db;
>  	u16 q_depth;
> -	u16 cq_vector;
> +	int cq_vector;

If you're going to change its size, can you move it up one slot so as
to not leave a hole?  The 'pahole' tool can tell you when there are holes
in structures.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 0/4] nvme-blkmq fixes
  2014-12-20 19:29   ` Jens Axboe
@ 2014-12-22 16:38     ` Keith Busch
  2014-12-22 16:47       ` Jens Axboe
  0 siblings, 1 reply; 27+ messages in thread
From: Keith Busch @ 2014-12-22 16:38 UTC (permalink / raw)


On Sat, 20 Dec 2014, Jens Axboe wrote:
> Here's the patch referenced. Keith, if you tested it, can I add your 
> tested/reviewed-by to it?

Oh, the perils of sending patches at the end of a Friday before holiday...

I re-tested on my dual-ported machine but without the linux-dm 3.20
bits, so we're not multipath capable here. DM rejects the device, clears
its request_queue and hits a bug, like the wait queue's task_list has
something invalid.

---
device-mapper: table: table load rejected: including non-request-stackable devices
device-mapper: table: unable to set table type
BUG: unable to handle kernel NULL pointer dereference at           (null)
IP: [<ffffffff81065459>] __wake_up_common+0x1e/0x78
PGD 7bb0d067 PUD 36b24067 PMD 0
SMP
Modules linked in: nvme bnep rfcomm bluetooth rfkill nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc dm_round_robin loop dm_multipath parport_pc evdev parport pcspkr psmouse serio_raw processor thermal_sys button ext4 crc16 jbd2 mbcache sg sr_mod cdrom sd_mod ata_generic ata_piix e1000 floppy libata scsi_mod [last unloaded: nvme]
CPU: 0 PID: 4597 Comm: multipath Tainted: G      D        3.18.0+ #8
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014
task: ffff880036ac15d0 ti: ffff880076880000 task.ti: ffff880076880000
RIP: 0010:[<ffffffff81065459>]  [<ffffffff81065459>] __wake_up_common+0x1e/0x78
RSP: 0018:ffff880076883bd8  EFLAGS: 00010096
RAX: 0000000000000296 RBX: 0000000000000001 RCX: 0000000000000000
RDX: 0000000000000000 RSI: 0000000000000003 RDI: ffff88007ab00878
RBP: ffff88007ab00880 R08: 0000000000000000 R09: 000000000000c201
R10: 000000000000c210 R11: 000000000000c1c1 R12: 0000000000000003
R13: 0000000000000000 R14: ffff880077ca5110 R15: ffff880076883d50
FS:  00007f53829d67a0(0000) GS:ffff88007f200000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000000 CR3: 00000000776be000 CR4: 00000000000006f0
Stack:
  ffff88007c3df800 ffffffff810e75ae ffff88007f219430 ffff88007ab00878
  0000000000000296 ffff88007ab00e90 ffff880077ca5110 ffff880077ca5110
  ffff880076883d50 ffffffff8106579f ffff880077ca5110 0000000000000000
Call Trace:
  [<ffffffff810e75ae>] ? pcpu_free_area+0x79/0xf8
  [<ffffffff8106579f>] ? __wake_up+0x35/0x46
  [<ffffffff811bb67d>] ? blk_set_queue_dying+0x33/0x69
  [<ffffffff811bce39>] ? blk_cleanup_queue+0x25/0xfd
  [<ffffffff812b2ca5>] ? __dm_destroy+0x22c/0x254
  [<ffffffff812b6ff8>] ? dev_suspend+0x1cd/0x1cd
  [<ffffffff812b70e4>] ? dev_remove+0xec/0xf8
  [<ffffffff812b6223>] ? ctl_ioctl+0x384/0x3ac
  [<ffffffff81177d07>] ? SYSC_semtimedop+0x669/0x6ce
  [<ffffffff812b6257>] ? dm_ctl_ioctl+0xc/0x11
  [<ffffffff81126622>] ? do_vfs_ioctl+0x413/0x45a
  [<ffffffff81175492>] ? ipcget+0x129/0x14e
  [<ffffffff811266b2>] ? SyS_ioctl+0x49/0x77
  [<ffffffff813a3ed2>] ? system_call_fastpath+0x12/0x17
Code: 00 00 00 00 48 89 47 08 48 89 47 10 c3 41 57 41 56 41 55 41 89 cd 41 54 41 89 f4 55 48 8d 6f 08 53 89 d3 48 83 ec 18 48 8b 57 08 <4c> 8b 3a 48 8d 42 e8 49 83 ef 18 eb 35 44 8b 30 4c 89 c1 4c 89
RIP  [<ffffffff81065459>] __wake_up_common+0x1e/0x78
  RSP <ffff880076883bd8>
CR2: 0000000000000000
---[ end trace d9242e782d917b09 ]---

I also couldn't remember if I wrote this next part. It looks like I did,
and it's needed when we run out of requests. I think this still might lose
a "wake" in the case we call blk_cleanup_queue() just before bt_get()
calls prepare_to_wait(), so maybe need to check for dying before and
after io_schedule().

---
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 32e8dbb..69628ef 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -275,6 +275,9 @@ static int bt_get(struct blk_mq_alloc_data *data,

                 io_schedule();

+               if (blk_queue_dying(data->q))
+                       break;
+
                 data->ctx = blk_mq_get_ctx(data->q);
                 data->hctx = data->q->mq_ops->map_queue(data->q,
                                 data->ctx->cpu);
--

Finally as Willy pointed out, I messed the nvme_queue struct's natural
alignment, so it doesn't pack.

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 0/4] nvme-blkmq fixes
  2014-12-22 16:38     ` Keith Busch
@ 2014-12-22 16:47       ` Jens Axboe
  2014-12-22 18:17         ` Keith Busch
  0 siblings, 1 reply; 27+ messages in thread
From: Jens Axboe @ 2014-12-22 16:47 UTC (permalink / raw)


On 12/22/2014 09:38 AM, Keith Busch wrote:
> On Sat, 20 Dec 2014, Jens Axboe wrote:
>> Here's the patch referenced. Keith, if you tested it, can I add your
>> tested/reviewed-by to it?
>
> Oh, the perils of sending patches at the end of a Friday before holiday...
>
> I re-tested on my dual-ported machine but without the linux-dm 3.20
> bits, so we're not multipath capable here. DM rejects the device, clears
> its request_queue and hits a bug, like the wait queue's task_list has
> something invalid.
>
> ---
> device-mapper: table: table load rejected: including
> non-request-stackable devices
> device-mapper: table: unable to set table type
> BUG: unable to handle kernel NULL pointer dereference at           (null)
> IP: [<ffffffff81065459>] __wake_up_common+0x1e/0x78
> PGD 7bb0d067 PUD 36b24067 PMD 0
> SMP
> Modules linked in: nvme bnep rfcomm bluetooth rfkill nfsd auth_rpcgss
> oid_registry nfs_acl nfs lockd grace fscache sunrpc dm_round_robin loop
> dm_multipath parport_pc evdev parport pcspkr psmouse serio_raw processor
> thermal_sys button ext4 crc16 jbd2 mbcache sg sr_mod cdrom sd_mod
> ata_generic ata_piix e1000 floppy libata scsi_mod [last unloaded: nvme]
> CPU: 0 PID: 4597 Comm: multipath Tainted: G      D        3.18.0+ #8
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
> rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014
> task: ffff880036ac15d0 ti: ffff880076880000 task.ti: ffff880076880000
> RIP: 0010:[<ffffffff81065459>]  [<ffffffff81065459>]
> __wake_up_common+0x1e/0x78
> RSP: 0018:ffff880076883bd8  EFLAGS: 00010096
> RAX: 0000000000000296 RBX: 0000000000000001 RCX: 0000000000000000
> RDX: 0000000000000000 RSI: 0000000000000003 RDI: ffff88007ab00878
> RBP: ffff88007ab00880 R08: 0000000000000000 R09: 000000000000c201
> R10: 000000000000c210 R11: 000000000000c1c1 R12: 0000000000000003
> R13: 0000000000000000 R14: ffff880077ca5110 R15: ffff880076883d50
> FS:  00007f53829d67a0(0000) GS:ffff88007f200000(0000)
> knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> CR2: 0000000000000000 CR3: 00000000776be000 CR4: 00000000000006f0
> Stack:
>   ffff88007c3df800 ffffffff810e75ae ffff88007f219430 ffff88007ab00878
>   0000000000000296 ffff88007ab00e90 ffff880077ca5110 ffff880077ca5110
>   ffff880076883d50 ffffffff8106579f ffff880077ca5110 0000000000000000
> Call Trace:
>   [<ffffffff810e75ae>] ? pcpu_free_area+0x79/0xf8
>   [<ffffffff8106579f>] ? __wake_up+0x35/0x46
>   [<ffffffff811bb67d>] ? blk_set_queue_dying+0x33/0x69
>   [<ffffffff811bce39>] ? blk_cleanup_queue+0x25/0xfd
>   [<ffffffff812b2ca5>] ? __dm_destroy+0x22c/0x254

I wonder if it cleaned up the requests lists upfront, otherwise I don't 
see where that would crash. I'll look into that. This particular patch 
isn't pushed out yet.

> I also couldn't remember if I wrote this next part. It looks like I did,
> and it's needed when we run out of requests. I think this still might lose
> a "wake" in the case we call blk_cleanup_queue() just before bt_get()
> calls prepare_to_wait(), so maybe need to check for dying before and
> after io_schedule().
>
> ---
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 32e8dbb..69628ef 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -275,6 +275,9 @@ static int bt_get(struct blk_mq_alloc_data *data,
>
>                  io_schedule();
>
> +               if (blk_queue_dying(data->q))
> +                       break;
> +
>                  data->ctx = blk_mq_get_ctx(data->q);
>                  data->hctx = data->q->mq_ops->map_queue(data->q,
>                                  data->ctx->cpu);

I think a cleaner way to handle this would be to add a queue mapped/dead 
check to hctx_may_queue(). That should catch the case of the queue being 
dead on entry, and after the io_schedule().

> Finally as Willy pointed out, I messed the nvme_queue struct's natural
> alignment, so it doesn't pack.

No biggie, just send a one-liner cq_vector and q_depth.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 0/4] nvme-blkmq fixes
  2014-12-22 16:47       ` Jens Axboe
@ 2014-12-22 18:17         ` Keith Busch
  2014-12-22 18:19           ` Jens Axboe
  0 siblings, 1 reply; 27+ messages in thread
From: Keith Busch @ 2014-12-22 18:17 UTC (permalink / raw)

On Mon, 22 Dec 2014, Jens Axboe wrote:
> On 12/22/2014 09:38 AM, Keith Busch wrote:
>> Call Trace:
>>   [<ffffffff810e75ae>] ? pcpu_free_area+0x79/0xf8
>>   [<ffffffff8106579f>] ? __wake_up+0x35/0x46
>>   [<ffffffff811bb67d>] ? blk_set_queue_dying+0x33/0x69
>>   [<ffffffff811bce39>] ? blk_cleanup_queue+0x25/0xfd
>>   [<ffffffff812b2ca5>] ? __dm_destroy+0x22c/0x254
>
> I wonder if it cleaned up the requests lists upfront, otherwise I don't see 
> where that would crash. I'll look into that. This particular patch isn't 
> pushed out yet.

The above failure happens because dm called blk_alloc_queue(), but
never made it far enough to call blk_init_allocated_queue(). One way
to fix is call blk_init_rl() from blk_alloc_queue_node() instead of
blk_init_allocated_queue(). I'm not sure if that's the right way to fix
it or if blk_cleanup_queue() should somehow be aware if the request_list
was initialized in the first place.

Also I found out I set nvmeq->cq_vector in the wrong place, messing up
h/w completion queue interrupt setup (I was wondering why things got so
slow!), so I'll fix that along with the struct alignment.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 0/4] nvme-blkmq fixes
  2014-12-22 18:17         ` Keith Busch
@ 2014-12-22 18:19           ` Jens Axboe
  2014-12-22 20:08             ` Jens Axboe
  0 siblings, 1 reply; 27+ messages in thread
From: Jens Axboe @ 2014-12-22 18:19 UTC (permalink / raw)


On 12/22/2014 11:17 AM, Keith Busch wrote:
> On Mon, 22 Dec 2014, Jens Axboe wrote:
>> On 12/22/2014 09:38 AM, Keith Busch wrote:
>>> Call Trace:
>>>   [<ffffffff810e75ae>] ? pcpu_free_area+0x79/0xf8
>>>   [<ffffffff8106579f>] ? __wake_up+0x35/0x46
>>>   [<ffffffff811bb67d>] ? blk_set_queue_dying+0x33/0x69
>>>   [<ffffffff811bce39>] ? blk_cleanup_queue+0x25/0xfd
>>>   [<ffffffff812b2ca5>] ? __dm_destroy+0x22c/0x254
>>
>> I wonder if it cleaned up the requests lists upfront, otherwise I
>> don't see where that would crash. I'll look into that. This particular
>> patch isn't pushed out yet.
>
> The above failure happens because dm called blk_alloc_queue(), but
> never made it far enough to call blk_init_allocated_queue(). One way
> to fix is call blk_init_rl() from blk_alloc_queue_node() instead of
> blk_init_allocated_queue(). I'm not sure if that's the right way to fix
> it or if blk_cleanup_queue() should somehow be aware if the request_list
> was initialized in the first place.

OK, I'll take care of this one.

> Also I found out I set nvmeq->cq_vector in the wrong place, messing up
> h/w completion queue interrupt setup (I was wondering why things got so
> slow!), so I'll fix that along with the struct alignment.

Oops... I'll just fold the patches, these are queued up for this round, 
so not a stable development base anyawy.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 0/4] nvme-blkmq fixes
  2014-12-22 18:19           ` Jens Axboe
@ 2014-12-22 20:08             ` Jens Axboe
  2014-12-22 21:01               ` Keith Busch
  0 siblings, 1 reply; 27+ messages in thread
From: Jens Axboe @ 2014-12-22 20:08 UTC (permalink / raw)


On 12/22/2014 11:19 AM, Jens Axboe wrote:
> On 12/22/2014 11:17 AM, Keith Busch wrote:
>> On Mon, 22 Dec 2014, Jens Axboe wrote:
>>> On 12/22/2014 09:38 AM, Keith Busch wrote:
>>>> Call Trace:
>>>>   [<ffffffff810e75ae>] ? pcpu_free_area+0x79/0xf8
>>>>   [<ffffffff8106579f>] ? __wake_up+0x35/0x46
>>>>   [<ffffffff811bb67d>] ? blk_set_queue_dying+0x33/0x69
>>>>   [<ffffffff811bce39>] ? blk_cleanup_queue+0x25/0xfd
>>>>   [<ffffffff812b2ca5>] ? __dm_destroy+0x22c/0x254
>>>
>>> I wonder if it cleaned up the requests lists upfront, otherwise I
>>> don't see where that would crash. I'll look into that. This particular
>>> patch isn't pushed out yet.
>>
>> The above failure happens because dm called blk_alloc_queue(), but
>> never made it far enough to call blk_init_allocated_queue(). One way
>> to fix is call blk_init_rl() from blk_alloc_queue_node() instead of
>> blk_init_allocated_queue(). I'm not sure if that's the right way to fix
>> it or if blk_cleanup_queue() should somehow be aware if the request_list
>> was initialized in the first place.
>
> OK, I'll take care of this one.

Should be enough to just check for ->rq_pool being initialized or not - 
if it is, we could have waiters and we know the waitqueues have been 
setup, etc.

V2 attached.

-- 
Jens Axboe

-------------- next part --------------
A non-text attachment was scrubbed...
Name: blk-mq-wake-on-dying-v2.patch
Type: text/x-patch
Size: 3856 bytes
Desc: not available
URL: <http://lists.infradead.org/pipermail/linux-nvme/attachments/20141222/1d3ba333/attachment.bin>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 0/4] nvme-blkmq fixes
  2014-12-22 20:08             ` Jens Axboe
@ 2014-12-22 21:01               ` Keith Busch
  2014-12-23  1:34                 ` Keith Busch
  0 siblings, 1 reply; 27+ messages in thread
From: Keith Busch @ 2014-12-22 21:01 UTC (permalink / raw)

On Mon, 22 Dec 2014, Jens Axboe wrote:
> Should be enough to just check for ->rq_pool being initialized or not - if it 
> is, we could have waiters and we know the waitqueues have been setup, etc.
>
> V2 attached.

Yep, that fixes the bug.

I'm not sure I follow your suggestion for forcing bt_get() to abandon
allocating a request tag when the queue is dying. If hctx_may_queue()
fails, it returns a generic error and bt_get() reschedules itself. Should
a different error than -1 be returned if the queue is dying?

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 0/4] nvme-blkmq fixes
  2014-12-22 21:01               ` Keith Busch
@ 2014-12-23  1:34                 ` Keith Busch
  2014-12-23 17:49                   ` Jens Axboe
  0 siblings, 1 reply; 27+ messages in thread
From: Keith Busch @ 2014-12-23  1:34 UTC (permalink / raw)

On Mon, 22 Dec 2014, Keith Busch wrote:
> On Mon, 22 Dec 2014, Jens Axboe wrote:
>> Should be enough to just check for ->rq_pool being initialized or not - if 
>> it is, we could have waiters and we know the waitqueues have been setup, 
>> etc.
>> 
>> V2 attached.
>
> Yep, that fixes the bug.
>
> I'm not sure I follow your suggestion for forcing bt_get() to abandon
> allocating a request tag when the queue is dying. If hctx_may_queue()
> fails, it returns a generic error and bt_get() reschedules itself. Should
> a different error than -1 be returned if the queue is dying?

We're making good incremental improvements, but finding oddities the
more I test this. This one's a doozy.

Requeued IO's are automatically dispatched, and I don't see an immediately
available way stop them. It causes a bug because the queue doorbells are
unmapped during reset, so you can't touch them when the queue should be
quiesced. I could fix that by having the driver not kick the requeue_list
when it knows a reset is in progress, but there's no immediate way
to drain the list if the reset fails and the device requires removal,
and blk_cleanup_queue() will be stuck.

Is there something available to call that I'm missing or do I need to
add more removal handling?

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 0/4] nvme-blkmq fixes
  2014-12-23  1:34                 ` Keith Busch
@ 2014-12-23 17:49                   ` Jens Axboe
  2014-12-23 17:54                     ` Jens Axboe
  0 siblings, 1 reply; 27+ messages in thread
From: Jens Axboe @ 2014-12-23 17:49 UTC (permalink / raw)


On 12/22/2014 06:34 PM, Keith Busch wrote:
> On Mon, 22 Dec 2014, Keith Busch wrote:
>> On Mon, 22 Dec 2014, Jens Axboe wrote:
>>> Should be enough to just check for ->rq_pool being initialized or not
>>> - if it is, we could have waiters and we know the waitqueues have
>>> been setup, etc.
>>>
>>> V2 attached.
>>
>> Yep, that fixes the bug.
>>
>> I'm not sure I follow your suggestion for forcing bt_get() to abandon
>> allocating a request tag when the queue is dying. If hctx_may_queue()
>> fails, it returns a generic error and bt_get() reschedules itself. Should
>> a different error than -1 be returned if the queue is dying?
>
> We're making good incremental improvements, but finding oddities the
> more I test this. This one's a doozy.
>
> Requeued IO's are automatically dispatched, and I don't see an immediately
> available way stop them. It causes a bug because the queue doorbells are
> unmapped during reset, so you can't touch them when the queue should be
> quiesced. I could fix that by having the driver not kick the requeue_list
> when it knows a reset is in progress, but there's no immediate way
> to drain the list if the reset fails and the device requires removal,
> and blk_cleanup_queue() will be stuck.
>
> Is there something available to call that I'm missing or do I need to
> add more removal handling?

So that's actually a case where having the queues auto-started on 
requeue run is harmful, since we should be able to handle this situation 
by stopping queues, requeueing, and then having a helper to eventually 
abort pending requeued work, if we have to. But if you simply requeue 
them and defer kicking the requeue list it might work. At that point 
you'd either kick the requeues (and hence start processing them) if 
things went well on the reset, or we could have some 
blk_mq_abort_requeues() helper that'd kill them with -EIO instead. Would 
that work for you?

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 0/4] nvme-blkmq fixes
  2014-12-23 17:49                   ` Jens Axboe
@ 2014-12-23 17:54                     ` Jens Axboe
  2014-12-23 18:09                       ` Keith Busch
  2014-12-23 21:10                       ` Keith Busch
  0 siblings, 2 replies; 27+ messages in thread
From: Jens Axboe @ 2014-12-23 17:54 UTC (permalink / raw)


On 12/23/2014 10:49 AM, Jens Axboe wrote:
> On 12/22/2014 06:34 PM, Keith Busch wrote:
>> On Mon, 22 Dec 2014, Keith Busch wrote:
>>> On Mon, 22 Dec 2014, Jens Axboe wrote:
>>>> Should be enough to just check for ->rq_pool being initialized or not
>>>> - if it is, we could have waiters and we know the waitqueues have
>>>> been setup, etc.
>>>>
>>>> V2 attached.
>>>
>>> Yep, that fixes the bug.
>>>
>>> I'm not sure I follow your suggestion for forcing bt_get() to abandon
>>> allocating a request tag when the queue is dying. If hctx_may_queue()
>>> fails, it returns a generic error and bt_get() reschedules itself.
>>> Should
>>> a different error than -1 be returned if the queue is dying?
>>
>> We're making good incremental improvements, but finding oddities the
>> more I test this. This one's a doozy.
>>
>> Requeued IO's are automatically dispatched, and I don't see an
>> immediately
>> available way stop them. It causes a bug because the queue doorbells are
>> unmapped during reset, so you can't touch them when the queue should be
>> quiesced. I could fix that by having the driver not kick the requeue_list
>> when it knows a reset is in progress, but there's no immediate way
>> to drain the list if the reset fails and the device requires removal,
>> and blk_cleanup_queue() will be stuck.
>>
>> Is there something available to call that I'm missing or do I need to
>> add more removal handling?
>
> So that's actually a case where having the queues auto-started on
> requeue run is harmful, since we should be able to handle this situation
> by stopping queues, requeueing, and then having a helper to eventually
> abort pending requeued work, if we have to. But if you simply requeue
> them and defer kicking the requeue list it might work. At that point
> you'd either kick the requeues (and hence start processing them) if
> things went well on the reset, or we could have some
> blk_mq_abort_requeues() helper that'd kill them with -EIO instead. Would
> that work for you?

Something like this.

-- 
Jens Axboe

-------------- next part --------------
A non-text attachment was scrubbed...
Name: blk-mq-abort-requeue-list.patch
Type: text/x-patch
Size: 1461 bytes
Desc: not available
URL: <http://lists.infradead.org/pipermail/linux-nvme/attachments/20141223/b92098c3/attachment.bin>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 0/4] nvme-blkmq fixes
  2014-12-23 17:54                     ` Jens Axboe
@ 2014-12-23 18:09                       ` Keith Busch
  2014-12-23 21:10                       ` Keith Busch
  1 sibling, 0 replies; 27+ messages in thread
From: Keith Busch @ 2014-12-23 18:09 UTC (permalink / raw)

On Tue, 23 Dec 2014, Jens Axboe wrote:
> On 12/23/2014 10:49 AM, Jens Axboe wrote:
>> On 12/22/2014 06:34 PM, Keith Busch wrote:
>>> Is there something available to call that I'm missing or do I need to
>>> add more removal handling?
>> 
>> So that's actually a case where having the queues auto-started on
>> requeue run is harmful, since we should be able to handle this situation
>> by stopping queues, requeueing, and then having a helper to eventually
>> abort pending requeued work, if we have to. But if you simply requeue
>> them and defer kicking the requeue list it might work. At that point
>> you'd either kick the requeues (and hence start processing them) if
>> things went well on the reset, or we could have some
>> blk_mq_abort_requeues() helper that'd kill them with -EIO instead. Would
>> that work for you?
>
> Something like this.

That's very close to where I was trying to taking this, but you beat me
to it! At least I was going the right direction. :)

I'll test this with my driver bits and resend when it works.

If anyone's interested, I'm testing by queueing lots of IO then taking
the link down. I don't have a hotplug capable machine, so using pcie
trickery to force it offline.

Here's my 'fio' commandline:

   # fio --name=global --filename=/dev/nvme0n1 --direct=1 --bs=4k \
     --rw=randrw --ioengine=libaio --iodepth=128 --name=foobar

And then taking the link down with pci link control. My nvme controller
is connected to bridge downstream port 00:03.2.

   # setpci -s 00:03.2 CAP_EXP+10.b=10:10

That should almost immediately trigger reset handling since reading CSTS
returns -1 in the polling thread, which will of course fail to recover
and request removal.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 0/4] nvme-blkmq fixes
  2014-12-23 17:54                     ` Jens Axboe
  2014-12-23 18:09                       ` Keith Busch
@ 2014-12-23 21:10                       ` Keith Busch
  2014-12-23 21:23                         ` Keith Busch
  1 sibling, 1 reply; 27+ messages in thread
From: Keith Busch @ 2014-12-23 21:10 UTC (permalink / raw)


On Tue, 23 Dec 2014, Jens Axboe wrote:
> On 12/23/2014 10:49 AM, Jens Axboe wrote:
>> So that's actually a case where having the queues auto-started on
>> requeue run is harmful, since we should be able to handle this situation
>> by stopping queues, requeueing, and then having a helper to eventually
>> abort pending requeued work, if we have to. But if you simply requeue
>> them and defer kicking the requeue list it might work. At that point
>> you'd either kick the requeues (and hence start processing them) if
>> things went well on the reset, or we could have some
>> blk_mq_abort_requeues() helper that'd kill them with -EIO instead. Would
>> that work for you?
>
> Something like this.

Ok, this works when used with the driver update below. I tested disabling
the link and issuing a PCI-e FLR, so both recovery failure and success
cases covered.

There are still a couple problems that look possible, but I haven't been
able to make either happen yet. It looks like we could lose requests in
either ctx->rq_list or hctx->dispatch when recovery fails. It also looks
possible the driver's .queue_rq can still be called during controller
reset.

---
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 94f5578..030fdc2 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -106,7 +106,7 @@ struct nvme_queue {
  	dma_addr_t cq_dma_addr;
  	u32 __iomem *q_db;
  	u16 q_depth;
-	u16 cq_vector;
+	s16 cq_vector;
  	u16 sq_head;
  	u16 sq_tail;
  	u16 cq_head;
@@ -432,7 +432,8 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
  		if (!(status & NVME_SC_DNR || blk_noretry_request(req))
  		    && (jiffies - req->start_time) < req->timeout) {
  			blk_mq_requeue_request(req);
-			blk_mq_kick_requeue_list(req->q);
+			if (!blk_queue_stopped(req->q))
+				blk_mq_kick_requeue_list(req->q);
  			return;
  		}
  		req->errors = nvme_error_status(status);
@@ -2398,8 +2399,10 @@ static void nvme_unfreeze_queues(struct nvme_dev *dev)
  {
  	struct nvme_ns *ns;

-	list_for_each_entry(ns, &dev->namespaces, list)
+	list_for_each_entry(ns, &dev->namespaces, list) {
  		blk_mq_unfreeze_queue(ns->queue);
+		blk_mq_kick_requeue_list(ns->queue);
+	}
  }

  static void nvme_dev_shutdown(struct nvme_dev *dev)
@@ -2438,6 +2441,7 @@ static void nvme_dev_remove(struct nvme_dev *dev)
  	struct nvme_ns *ns;

  	list_for_each_entry(ns, &dev->namespaces, list) {
+		blk_mq_abort_requeue_list(ns->queue);
  		if (ns->disk->flags & GENHD_FL_UP)
  			del_gendisk(ns->disk);
  		if (!blk_queue_dying(ns->queue))
--

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 0/4] nvme-blkmq fixes
  2014-12-23 21:10                       ` Keith Busch
@ 2014-12-23 21:23                         ` Keith Busch
  2014-12-23 21:24                           ` Jens Axboe
  0 siblings, 1 reply; 27+ messages in thread
From: Keith Busch @ 2014-12-23 21:23 UTC (permalink / raw)


On Tue, 23 Dec 2014, Keith Busch wrote:
> @@ -432,7 +432,8 @@ static void req_completion(struct nvme_queue *nvmeq, void 
> *ctx,
> 		if (!(status & NVME_SC_DNR || blk_noretry_request(req))
> 		    && (jiffies - req->start_time) < req->timeout) {
> 			blk_mq_requeue_request(req);
> -			blk_mq_kick_requeue_list(req->q);
> +			if (!blk_queue_stopped(req->q))
> +				blk_mq_kick_requeue_list(req->q);
> 			return;
> 		}

Oops, experimenting with different things, took the wrong snapshot of
the patch. Should be:

+			if (nvmeq->cq_vector != -1)

rather than:

+			if (!blk_queue_stopped(req->q))

Anyway, I'm going to keep messing with this until I can either hit the
other failures I mentioned or convince myself it's safe before sending
something for official consideration.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 0/4] nvme-blkmq fixes
  2014-12-23 21:23                         ` Keith Busch
@ 2014-12-23 21:24                           ` Jens Axboe
  2014-12-31  2:31                             ` Keith Busch
  0 siblings, 1 reply; 27+ messages in thread
From: Jens Axboe @ 2014-12-23 21:24 UTC (permalink / raw)


On 12/23/2014 02:23 PM, Keith Busch wrote:
> On Tue, 23 Dec 2014, Keith Busch wrote:
>> @@ -432,7 +432,8 @@ static void req_completion(struct nvme_queue
>> *nvmeq, void *ctx,
>>         if (!(status & NVME_SC_DNR || blk_noretry_request(req))
>>             && (jiffies - req->start_time) < req->timeout) {
>>             blk_mq_requeue_request(req);
>> -            blk_mq_kick_requeue_list(req->q);
>> +            if (!blk_queue_stopped(req->q))
>> +                blk_mq_kick_requeue_list(req->q);
>>             return;
>>         }
> 
> Oops, experimenting with different things, took the wrong snapshot of
> the patch. Should be:
> 
> +            if (nvmeq->cq_vector != -1)
> 
> rather than:
> 
> +            if (!blk_queue_stopped(req->q))
> 
> Anyway, I'm going to keep messing with this until I can either hit the
> other failures I mentioned or convince myself it's safe before sending
> something for official consideration.

I was puzzled by the signed change for cq_vector by itself. I'll wait on
more results from you before doing anything.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 0/4] nvme-blkmq fixes
  2014-12-23 21:24                           ` Jens Axboe
@ 2014-12-31  2:31                             ` Keith Busch
  2014-12-31 16:38                               ` Jens Axboe
  0 siblings, 1 reply; 27+ messages in thread
From: Keith Busch @ 2014-12-31  2:31 UTC (permalink / raw)


Abandon the whole series...  Too many corner cases where this falls
to pieces. I'm running high queue-depth IO with random error injection
that causes requests to get on lists from ctx->rq_list, hctx->dispatch,
and q->requeue_list. No matter what I do from the driver, there is
always a case in either reset or removal where a requests get lost and
blk_cleanup_queue never completes.

On Tue, 23 Dec 2014, Jens Axboe wrote:
> On 12/23/2014 02:23 PM, Keith Busch wrote:
>> On Tue, 23 Dec 2014, Keith Busch wrote:
>>> @@ -432,7 +432,8 @@ static void req_completion(struct nvme_queue
>>> *nvmeq, void *ctx,
>>>         if (!(status & NVME_SC_DNR || blk_noretry_request(req))
>>>             && (jiffies - req->start_time) < req->timeout) {
>>>             blk_mq_requeue_request(req);
>>> -            blk_mq_kick_requeue_list(req->q);
>>> +            if (!blk_queue_stopped(req->q))
>>> +                blk_mq_kick_requeue_list(req->q);
>>>             return;
>>>         }
>>
>> Oops, experimenting with different things, took the wrong snapshot of
>> the patch. Should be:
>>
>> +            if (nvmeq->cq_vector != -1)
>>
>> rather than:
>>
>> +            if (!blk_queue_stopped(req->q))
>>
>> Anyway, I'm going to keep messing with this until I can either hit the
>> other failures I mentioned or convince myself it's safe before sending
>> something for official consideration.
>
> I was puzzled by the signed change for cq_vector by itself. I'll wait on
> more results from you before doing anything.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 0/4] nvme-blkmq fixes
  2014-12-31  2:31                             ` Keith Busch
@ 2014-12-31 16:38                               ` Jens Axboe
  2015-01-05 15:17                                 ` Keith Busch
  0 siblings, 1 reply; 27+ messages in thread
From: Jens Axboe @ 2014-12-31 16:38 UTC (permalink / raw)


On 12/30/2014 07:31 PM, Keith Busch wrote:
> Abandon the whole series...  Too many corner cases where this falls
> to pieces. I'm running high queue-depth IO with random error injection
> that causes requests to get on lists from ctx->rq_list, hctx->dispatch,
> and q->requeue_list. No matter what I do from the driver, there is
> always a case in either reset or removal where a requests get lost and
> blk_cleanup_queue never completes.

Back to the drawing board, I'll drop the series. Still off from work 
here, I'll take a look when I get back soon. I'm surprised it's this 
difficult, we already went through most of this design/test hash out 
with scsi-mq.

I'll drop this one:

NVMe: Freeze queues on shutdown

but keep this one:

NVMe: Fix double free irq

Agree?

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 0/4] nvme-blkmq fixes
  2014-12-31 16:38                               ` Jens Axboe
@ 2015-01-05 15:17                                 ` Keith Busch
  2015-01-05 19:30                                   ` Jens Axboe
  0 siblings, 1 reply; 27+ messages in thread
From: Keith Busch @ 2015-01-05 15:17 UTC (permalink / raw)

On Wed, 31 Dec 2014, Jens Axboe wrote:
> On 12/30/2014 07:31 PM, Keith Busch wrote:
>> Abandon the whole series...  Too many corner cases where this falls
>> to pieces. I'm running high queue-depth IO with random error injection
>> that causes requests to get on lists from ctx->rq_list, hctx->dispatch,
>> and q->requeue_list. No matter what I do from the driver, there is
>> always a case in either reset or removal where a requests get lost and
>> blk_cleanup_queue never completes.
>
> Back to the drawing board, I'll drop the series. Still off from work here, 
> I'll take a look when I get back soon. I'm surprised it's this difficult, we 
> already went through most of this design/test hash out with scsi-mq.
>
> I'll drop this one:
>
> NVMe: Freeze queues on shutdown
>
> but keep this one:
>
> NVMe: Fix double free irq
>
> Agree?

Yep, that sounds good. Sorry I rushed out that last email without much
explanation.

We need the driver to temporarily block tasks allocating new requests but
let existing requests requeue. Freeze looked good, but unfreeze expects
the usage count to have been 0, which it's not guaranteed with when we
let failed requests requeue.

The only reason I need the freeze/unfreeze exports is for the IOCTL path
that submits commands outside the block layer. If I can change all those
usages to be "blk_execute_rq" or something like that, we don't need the
new exports, and can block requests at a different level, but that brings
me to the next issue.

We also need the driver to temporarily prevent the block layer from
submitting requests to the driver's hw queues. 'blk_mq_stop_hw_queues'
looked right, but anyone can restart them at the wrong time by kicking
the requeue list.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 0/4] nvme-blkmq fixes
  2015-01-05 15:17                                 ` Keith Busch
@ 2015-01-05 19:30                                   ` Jens Axboe
  2015-01-05 20:19                                     ` Keith Busch
  0 siblings, 1 reply; 27+ messages in thread
From: Jens Axboe @ 2015-01-05 19:30 UTC (permalink / raw)


On 01/05/2015 08:17 AM, Keith Busch wrote:
> On Wed, 31 Dec 2014, Jens Axboe wrote:
>> On 12/30/2014 07:31 PM, Keith Busch wrote:
>>> Abandon the whole series...  Too many corner cases where this falls
>>> to pieces. I'm running high queue-depth IO with random error injection
>>> that causes requests to get on lists from ctx->rq_list, hctx->dispatch,
>>> and q->requeue_list. No matter what I do from the driver, there is
>>> always a case in either reset or removal where a requests get lost and
>>> blk_cleanup_queue never completes.
>>
>> Back to the drawing board, I'll drop the series. Still off from work
>> here, I'll take a look when I get back soon. I'm surprised it's this
>> difficult, we already went through most of this design/test hash out
>> with scsi-mq.
>>
>> I'll drop this one:
>>
>> NVMe: Freeze queues on shutdown
>>
>> but keep this one:
>>
>> NVMe: Fix double free irq
>>
>> Agree?
> 
> Yep, that sounds good. Sorry I rushed out that last email without much
> explanation.
> 
> We need the driver to temporarily block tasks allocating new requests but
> let existing requests requeue. Freeze looked good, but unfreeze expects
> the usage count to have been 0, which it's not guaranteed with when we
> let failed requests requeue.

OK, I think that is a concern we can fix. And yes, that was the intended
use case for it originally.

> The only reason I need the freeze/unfreeze exports is for the IOCTL path
> that submits commands outside the block layer. If I can change all those
> usages to be "blk_execute_rq" or something like that, we don't need the
> new exports, and can block requests at a different level, but that brings
> me to the next issue.

That would be one way of doing it, as it effectively gets rid of the
ioctl being an OOB mechanism. But we should be able to make it work
otherwise.

> We also need the driver to temporarily prevent the block layer from
> submitting requests to the driver's hw queues. 'blk_mq_stop_hw_queues'
> looked right, but anyone can restart them at the wrong time by kicking
> the requeue list.

The driver is the only one that should kick the requeue action into
gear, which would start those queues up again. So that should be under
your control already.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 0/4] nvme-blkmq fixes
  2015-01-05 19:30                                   ` Jens Axboe
@ 2015-01-05 20:19                                     ` Keith Busch
  2015-01-05 20:20                                       ` Jens Axboe
  0 siblings, 1 reply; 27+ messages in thread
From: Keith Busch @ 2015-01-05 20:19 UTC (permalink / raw)


On Mon, 5 Jan 2015, Jens Axboe wrote:
> On 01/05/2015 08:17 AM, Keith Busch wrote:
>> On Wed, 31 Dec 2014, Jens Axboe wrote:
>> We need the driver to temporarily block tasks allocating new requests but
>> let existing requests requeue. Freeze looked good, but unfreeze expects
>> the usage count to have been 0, which it's not guaranteed with when we
>> let failed requests requeue.
>
> OK, I think that is a concern we can fix. And yes, that was the intended
> use case for it originally.
>

Okay cool, that would help a lot.

>> We also need the driver to temporarily prevent the block layer from
>> submitting requests to the driver's hw queues. 'blk_mq_stop_hw_queues'
>> looked right, but anyone can restart them at the wrong time by kicking
>> the requeue list.
>
> The driver is the only one that should kick the requeue action into
> gear, which would start those queues up again. So that should be under
> your control already.

Right, we can stop the driver from kicking if it knows a device reset
is occuring, but there can't be any requeue work prior to stopping all
h/w queues to prevent a race condition. We could have the driver's reset
handler call 'cancel_work_sync(q->requeue_work)' to address that. There's
no existing driver using q->reset_work, but it looks safe to treat
as public.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 0/4] nvme-blkmq fixes
  2015-01-05 20:19                                     ` Keith Busch
@ 2015-01-05 20:20                                       ` Jens Axboe
  0 siblings, 0 replies; 27+ messages in thread
From: Jens Axboe @ 2015-01-05 20:20 UTC (permalink / raw)


On 01/05/2015 01:19 PM, Keith Busch wrote:
> On Mon, 5 Jan 2015, Jens Axboe wrote:
>> On 01/05/2015 08:17 AM, Keith Busch wrote:
>>> On Wed, 31 Dec 2014, Jens Axboe wrote:
>>> We need the driver to temporarily block tasks allocating new requests
>>> but
>>> let existing requests requeue. Freeze looked good, but unfreeze expects
>>> the usage count to have been 0, which it's not guaranteed with when we
>>> let failed requests requeue.
>>
>> OK, I think that is a concern we can fix. And yes, that was the intended
>> use case for it originally.
>>
> 
> Okay cool, that would help a lot.
> 
>>> We also need the driver to temporarily prevent the block layer from
>>> submitting requests to the driver's hw queues. 'blk_mq_stop_hw_queues'
>>> looked right, but anyone can restart them at the wrong time by kicking
>>> the requeue list.
>>
>> The driver is the only one that should kick the requeue action into
>> gear, which would start those queues up again. So that should be under
>> your control already.
> 
> Right, we can stop the driver from kicking if it knows a device reset
> is occuring, but there can't be any requeue work prior to stopping all
> h/w queues to prevent a race condition. We could have the driver's reset
> handler call 'cancel_work_sync(q->requeue_work)' to address that. There's
> no existing driver using q->reset_work, but it looks safe to treat
> as public.

Assuming you meant q->requeue_work here. I'd just add that to an
exported function for that functionality, don't muck with it directly in
case we want to change it later for some reason.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2015-01-05 20:20 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-12-20  0:54 [PATCH 0/4] nvme-blkmq fixes Keith Busch
2014-12-20  0:54 ` [PATCH 1/4] blk-mq: Exit queue on alloc failure Keith Busch
2014-12-20  0:54 ` [PATCH 2/4] blk-mq: Export freeze/unfreeze functions Keith Busch
2014-12-20  0:54 ` [PATCH 3/4] NVMe: Fix double free irq Keith Busch
2014-12-22 15:15   ` Matthew Wilcox
2014-12-20  0:54 ` [PATCH 4/4] NVMe: Freeze queues on shutdown Keith Busch
2014-12-20 17:32 ` [PATCH 0/4] nvme-blkmq fixes Jens Axboe
2014-12-20 19:29   ` Jens Axboe
2014-12-22 16:38     ` Keith Busch
2014-12-22 16:47       ` Jens Axboe
2014-12-22 18:17         ` Keith Busch
2014-12-22 18:19           ` Jens Axboe
2014-12-22 20:08             ` Jens Axboe
2014-12-22 21:01               ` Keith Busch
2014-12-23  1:34                 ` Keith Busch
2014-12-23 17:49                   ` Jens Axboe
2014-12-23 17:54                     ` Jens Axboe
2014-12-23 18:09                       ` Keith Busch
2014-12-23 21:10                       ` Keith Busch
2014-12-23 21:23                         ` Keith Busch
2014-12-23 21:24                           ` Jens Axboe
2014-12-31  2:31                             ` Keith Busch
2014-12-31 16:38                               ` Jens Axboe
2015-01-05 15:17                                 ` Keith Busch
2015-01-05 19:30                                   ` Jens Axboe
2015-01-05 20:19                                     ` Keith Busch
2015-01-05 20:20                                       ` Jens Axboe

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.