[PATCH v5] conversion to blk-mq

linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH v5] conversion to blk-mq
@ 2014-06-02 20:55 Matias Bjørling
  2014-06-02 20:55 ` [PATCH v5] NVMe: " Matias Bjørling
  2014-06-02 22:49 ` [PATCH v5] " Keith Busch
  0 siblings, 2 replies; 14+ messages in thread
From: Matias Bjørling @ 2014-06-02 20:55 UTC (permalink / raw)
  To: willy, keith.busch, sbradshaw, axboe, linux-kernel, linux-nvme,
	hch
  Cc: Matias Bjørling

Hi Matthew and Keith,

Here is an updated patch with the feedback from the previous days. It's against
Jens' for-3.16/core tree. You may use the nvmemq_wip_review branch at:

https://github.com/MatiasBjorling/linux-collab nvmemq_wip_review

and see the changes from v4 to v5 here:

https://github.com/MatiasBjorling/linux-collab/commit/e650bc5

Changes since v4:
  * Fix timeout retries
  * Fix naming in nvme_init_hctx
  * Fix racy behavior of admin queue in nvme_dev_remove
  * Fix wrong return values in nvme_queue_request
  * Put cqe_seen back
  * Introduce abort_completion for killing timed out I/Os
  * Move locks outside of nvme_submit_iod
  * Various renaming and style fixes

Changes since v3:
  * Added abortion logic
  * Fixed possible race on abortion
  * Removed req data with flush. Handled by by blk-mq
  * Added safety check for submitting user rq to admin queue.
  * Use dev->online_queues for nr_hw_queues
  * Fix loop with initialization in nvme_create_io_queues
  * Style fixups

Changes since v2:
  * rebased on top of current 3.16/core.
  * use blk-mq queue management for spreading io queues
  * removed rcu handling and allocated all io queues up front for mgmt by blk-mq
  * removed the need for hotplugging notification
  * fixed flush data handling
  * fixed double free of spinlock
  * various cleanup

Matias Bjørling (1):
  NVMe: conversion to blk-mq

 drivers/block/nvme-core.c | 1149 ++++++++++++++++++---------------------------
 drivers/block/nvme-scsi.c |    8 +-
 include/linux/nvme.h      |   14 +-
 3 files changed, 478 insertions(+), 693 deletions(-)

-- 
1.9.1

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v5] NVMe: conversion to blk-mq
  2014-06-02 20:55 [PATCH v5] conversion to blk-mq Matias Bjørling
@ 2014-06-02 20:55 ` Matias Bjørling
  2014-06-02 22:49 ` [PATCH v5] " Keith Busch
  1 sibling, 0 replies; 14+ messages in thread
From: Matias Bjørling @ 2014-06-02 20:55 UTC (permalink / raw)
  To: willy, keith.busch, sbradshaw, axboe, linux-kernel, linux-nvme,
	hch
  Cc: Matias Bjørling

This converts the current NVMe driver to utilize the blk-mq layer.

Contributions in this patch from:

  Sam Bradshaw <sbradshaw@micron.com>
  Jens Axboe <axboe@kernel.dk>
  Christoph Hellwig <hch@infradead.org>
  Robert Nelson <rlnelson@google.com>

Signed-off-by: Matias Bjørling <m@bjorling.me>
---
 drivers/block/nvme-core.c | 1149 ++++++++++++++++++---------------------------
 drivers/block/nvme-scsi.c |    8 +-
 include/linux/nvme.h      |   14 +-
 3 files changed, 478 insertions(+), 693 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 9212dcc..dd556d6 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -13,9 +13,9 @@
  */
 
 #include <linux/nvme.h>
-#include <linux/bio.h>
 #include <linux/bitops.h>
 #include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/cpu.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
@@ -33,7 +33,6 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/pci.h>
-#include <linux/percpu.h>
 #include <linux/poison.h>
 #include <linux/ptrace.h>
 #include <linux/sched.h>
@@ -42,11 +41,11 @@
 #include <scsi/sg.h>
 #include <asm-generic/io-64-nonatomic-lo-hi.h>
 
-#define NVME_Q_DEPTH 1024
+#define NVME_Q_DEPTH		1024
+#define NVME_AQ_DEPTH		64
 #define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
 #define CQ_SIZE(depth)		(depth * sizeof(struct nvme_completion))
-#define ADMIN_TIMEOUT	(60 * HZ)
-#define IOD_TIMEOUT	(4 * NVME_IO_TIMEOUT)
+#define ADMIN_TIMEOUT		(60 * HZ)
 
 unsigned char io_timeout = 30;
 module_param(io_timeout, byte, 0644);
@@ -65,10 +64,12 @@ static struct workqueue_struct *nvme_workq;
 static wait_queue_head_t nvme_kthread_wait;
 
 static void nvme_reset_failed_dev(struct work_struct *ws);
+static int nvme_process_cq(struct nvme_queue *nvmeq);
 
 struct async_cmd_info {
 	struct kthread_work work;
 	struct kthread_worker *worker;
+	struct request *req;
 	u32 result;
 	int status;
 	void *ctx;
@@ -79,7 +80,6 @@ struct async_cmd_info {
  * commands and one for I/O commands).
  */
 struct nvme_queue {
-	struct rcu_head r_head;
 	struct device *q_dmadev;
 	struct nvme_dev *dev;
 	char irqname[24];	/* nvme4294967295-65535\0 */
@@ -88,10 +88,6 @@ struct nvme_queue {
 	volatile struct nvme_completion *cqes;
 	dma_addr_t sq_dma_addr;
 	dma_addr_t cq_dma_addr;
-	wait_queue_head_t sq_full;
-	wait_queue_t sq_cong_wait;
-	struct bio_list sq_cong;
-	struct list_head iod_bio;
 	u32 __iomem *q_db;
 	u16 q_depth;
 	u16 cq_vector;
@@ -104,7 +100,7 @@ struct nvme_queue {
 	u8 q_suspended;
 	cpumask_var_t cpu_mask;
 	struct async_cmd_info cmdinfo;
-	unsigned long cmdid_data[];
+	struct blk_mq_hw_ctx *hctx;
 };
 
 /*
@@ -132,62 +128,70 @@ typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
 struct nvme_cmd_info {
 	nvme_completion_fn fn;
 	void *ctx;
-	unsigned long timeout;
 	int aborted;
+	struct nvme_queue *nvmeq;
 };
 
-static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq)
+static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+				unsigned int hctx_idx)
 {
-	return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)];
+	struct nvme_dev *dev = data;
+	struct nvme_queue *nvmeq = dev->queues[0];
+	BUG_ON(!nvmeq);
+	WARN_ON(nvmeq->hctx);
+	nvmeq->hctx = hctx;
+	hctx->driver_data = nvmeq;
+	return 0;
 }
 
-static unsigned nvme_queue_extra(int depth)
+static int nvme_admin_init_request(void *data, struct request *req,
+				unsigned int hctx_idx, unsigned int rq_idx,
+				unsigned int numa_node)
 {
-	return DIV_ROUND_UP(depth, 8) + (depth * sizeof(struct nvme_cmd_info));
+	struct nvme_dev *dev = data;
+	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
+	struct nvme_queue *nvmeq = dev->queues[0];
+	WARN_ON(!nvmeq);
+	WARN_ON(!cmd);
+	cmd->nvmeq = nvmeq;
+	return 0;
 }
 
-/**
- * alloc_cmdid() - Allocate a Command ID
- * @nvmeq: The queue that will be used for this command
- * @ctx: A pointer that will be passed to the handler
- * @handler: The function to call on completion
- *
- * Allocate a Command ID for a queue.  The data passed in will
- * be passed to the completion handler.  This is implemented by using
- * the bottom two bits of the ctx pointer to store the handler ID.
- * Passing in a pointer that's not 4-byte aligned will cause a BUG.
- * We can change this if it becomes a problem.
- *
- * May be called with local interrupts disabled and the q_lock held,
- * or with interrupts enabled and no locks held.
- */
-static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx,
-				nvme_completion_fn handler, unsigned timeout)
+static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+			  unsigned int hctx_idx)
 {
-	int depth = nvmeq->q_depth - 1;
-	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
-	int cmdid;
-
-	do {
-		cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth);
-		if (cmdid >= depth)
-			return -EBUSY;
-	} while (test_and_set_bit(cmdid, nvmeq->cmdid_data));
+	struct nvme_dev *dev = data;
+	struct nvme_queue *nvmeq = dev->queues[(hctx_idx % dev->queue_count)
+									+ 1];
+	/* nvmeq queues are shared between namespaces. We assume here that
+	 * blk-mq map the tags so they match up with the nvme queue tags */
+	if (!nvmeq->hctx)
+		nvmeq->hctx = hctx;
+	else
+		WARN_ON(nvmeq->hctx->tags != hctx->tags);
+	hctx->driver_data = nvmeq;
+	return 0;
+}
 
-	info[cmdid].fn = handler;
-	info[cmdid].ctx = ctx;
-	info[cmdid].timeout = jiffies + timeout;
-	info[cmdid].aborted = 0;
-	return cmdid;
+static int nvme_init_request(void *data, struct request *req,
+				unsigned int hctx_idx, unsigned int rq_idx,
+				unsigned int numa_node)
+{
+	struct nvme_dev *dev = data;
+	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
+	struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
+	WARN_ON(!nvmeq);
+	WARN_ON(!cmd);
+	cmd->nvmeq = nvmeq;
+	return 0;
 }
 
-static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
-				nvme_completion_fn handler, unsigned timeout)
+static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
+				nvme_completion_fn handler)
 {
-	int cmdid;
-	wait_event_killable(nvmeq->sq_full,
-		(cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0);
-	return (cmdid < 0) ? -EINTR : cmdid;
+	cmd->fn = handler;
+	cmd->ctx = ctx;
+	cmd->aborted = 0;
 }
 
 /* Special values must be less than 0x1000 */
@@ -195,20 +199,12 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
 #define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
 #define CMD_CTX_COMPLETED	(0x310 + CMD_CTX_BASE)
 #define CMD_CTX_INVALID		(0x314 + CMD_CTX_BASE)
-#define CMD_CTX_FLUSH		(0x318 + CMD_CTX_BASE)
-#define CMD_CTX_ABORT		(0x31C + CMD_CTX_BASE)
 
 static void special_completion(struct nvme_queue *nvmeq, void *ctx,
 						struct nvme_completion *cqe)
 {
 	if (ctx == CMD_CTX_CANCELLED)
 		return;
-	if (ctx == CMD_CTX_FLUSH)
-		return;
-	if (ctx == CMD_CTX_ABORT) {
-		++nvmeq->dev->abort_limit;
-		return;
-	}
 	if (ctx == CMD_CTX_COMPLETED) {
 		dev_warn(nvmeq->q_dmadev,
 				"completed id %d twice on queue %d\n",
@@ -225,6 +221,42 @@ static void special_completion(struct nvme_queue *nvmeq, void *ctx,
 	dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx);
 }
 
+static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn)
+{
+	void *ctx;
+	if (fn)
+		*fn = cmd->fn;
+	ctx = cmd->ctx;
+	cmd->fn = special_completion;
+	cmd->ctx = CMD_CTX_CANCELLED;
+	return ctx;
+}
+
+static void abort_completion(struct nvme_queue *nvmeq, void *ctx,
+						struct nvme_completion *cqe)
+{
+	struct request *req;
+	struct nvme_cmd_info *aborted = ctx;
+	struct nvme_queue *a_nvmeq = aborted->nvmeq;
+	void *a_ctx;
+	nvme_completion_fn a_fn;
+	static struct nvme_completion a_cqe = {
+		.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),
+	};
+
+	req = blk_mq_tag_to_rq(nvmeq->hctx, cqe->command_id);
+	blk_put_request(req);
+
+	if (!cqe->status)
+		dev_warn(nvmeq->q_dmadev, "Could not abort I/O %d QID %d",
+							req->tag, nvmeq->qid);
+
+	a_ctx = cancel_cmd_info(aborted, &a_fn);
+	a_fn(a_nvmeq, a_ctx, &a_cqe);
+
+	++nvmeq->dev->abort_limit;
+}
+
 static void async_completion(struct nvme_queue *nvmeq, void *ctx,
 						struct nvme_completion *cqe)
 {
@@ -232,75 +264,37 @@ static void async_completion(struct nvme_queue *nvmeq, void *ctx,
 	cmdinfo->result = le32_to_cpup(&cqe->result);
 	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
 	queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
+	blk_put_request(cmdinfo->req);
+}
+
+static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq,
+				  unsigned int tag)
+{
+	struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
+	struct request *req = blk_mq_tag_to_rq(hctx, tag);
+	return blk_mq_rq_to_pdu(req);
 }
 
 /*
  * Called with local interrupts disabled and the q_lock held.  May not sleep.
  */
-static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid,
+static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag,
 						nvme_completion_fn *fn)
 {
+	struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag);
 	void *ctx;
-	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
-
-	if (cmdid >= nvmeq->q_depth || !info[cmdid].fn) {
-		if (fn)
-			*fn = special_completion;
+	if (tag >= nvmeq->q_depth) {
+		*fn = special_completion;
 		return CMD_CTX_INVALID;
 	}
 	if (fn)
-		*fn = info[cmdid].fn;
-	ctx = info[cmdid].ctx;
-	info[cmdid].fn = special_completion;
-	info[cmdid].ctx = CMD_CTX_COMPLETED;
-	clear_bit(cmdid, nvmeq->cmdid_data);
-	wake_up(&nvmeq->sq_full);
+		*fn = cmd->fn;
+	ctx = cmd->ctx;
+	cmd->fn = special_completion;
+	cmd->ctx = CMD_CTX_COMPLETED;
 	return ctx;
 }
 
-static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
-						nvme_completion_fn *fn)
-{
-	void *ctx;
-	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
-	if (fn)
-		*fn = info[cmdid].fn;
-	ctx = info[cmdid].ctx;
-	info[cmdid].fn = special_completion;
-	info[cmdid].ctx = CMD_CTX_CANCELLED;
-	return ctx;
-}
-
-static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid)
-{
-	return rcu_dereference_raw(dev->queues[qid]);
-}
-
-static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU)
-{
-	unsigned queue_id = get_cpu_var(*dev->io_queue);
-	rcu_read_lock();
-	return rcu_dereference(dev->queues[queue_id]);
-}
-
-static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
-{
-	rcu_read_unlock();
-	put_cpu_var(nvmeq->dev->io_queue);
-}
-
-static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx)
-							__acquires(RCU)
-{
-	rcu_read_lock();
-	return rcu_dereference(dev->queues[q_idx]);
-}
-
-static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
-{
-	rcu_read_unlock();
-}
-
 /**
  * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
  * @nvmeq: The queue to use
@@ -357,7 +351,6 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp)
 		iod->length = nbytes;
 		iod->nents = 0;
 		iod->first_dma = 0ULL;
-		iod->start_time = jiffies;
 	}
 
 	return iod;
@@ -381,59 +374,31 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
 	kfree(iod);
 }
 
-static void nvme_start_io_acct(struct bio *bio)
-{
-	struct gendisk *disk = bio->bi_bdev->bd_disk;
-	const int rw = bio_data_dir(bio);
-	int cpu = part_stat_lock();
-	part_round_stats(cpu, &disk->part0);
-	part_stat_inc(cpu, &disk->part0, ios[rw]);
-	part_stat_add(cpu, &disk->part0, sectors[rw], bio_sectors(bio));
-	part_inc_in_flight(&disk->part0, rw);
-	part_stat_unlock();
-}
-
-static void nvme_end_io_acct(struct bio *bio, unsigned long start_time)
-{
-	struct gendisk *disk = bio->bi_bdev->bd_disk;
-	const int rw = bio_data_dir(bio);
-	unsigned long duration = jiffies - start_time;
-	int cpu = part_stat_lock();
-	part_stat_add(cpu, &disk->part0, ticks[rw], duration);
-	part_round_stats(cpu, &disk->part0);
-	part_dec_in_flight(&disk->part0, rw);
-	part_stat_unlock();
-}
-
-static void bio_completion(struct nvme_queue *nvmeq, void *ctx,
+static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 						struct nvme_completion *cqe)
 {
 	struct nvme_iod *iod = ctx;
-	struct bio *bio = iod->private;
+	struct request *req = iod->private;
+
 	u16 status = le16_to_cpup(&cqe->status) >> 1;
 
 	if (unlikely(status)) {
-		if (!(status & NVME_SC_DNR ||
-				bio->bi_rw & REQ_FAILFAST_MASK) &&
-				(jiffies - iod->start_time) < IOD_TIMEOUT) {
-			if (!waitqueue_active(&nvmeq->sq_full))
-				add_wait_queue(&nvmeq->sq_full,
-							&nvmeq->sq_cong_wait);
-			list_add_tail(&iod->node, &nvmeq->iod_bio);
-			wake_up(&nvmeq->sq_full);
+		if (!(status & NVME_SC_DNR || blk_noretry_request(req))
+		    && (jiffies - req->start_time) < req->timeout) {
+			blk_mq_requeue_request(req);
 			return;
 		}
-	}
+		req->errors = -EIO;
+	} else
+		req->errors = 0;
+
 	if (iod->nents) {
-		dma_unmap_sg(nvmeq->q_dmadev, iod->sg, iod->nents,
-			bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-		nvme_end_io_acct(bio, iod->start_time);
+		dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents,
+			rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
 	}
 	nvme_free_iod(nvmeq->dev, iod);
-	if (status)
-		bio_endio(bio, -EIO);
-	else
-		bio_endio(bio, 0);
+
+	blk_mq_complete_request(req);
 }
 
 /* length is in bytes.  gfp flags indicates whether we may sleep. */
@@ -515,86 +480,25 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len,
 	return total_len;
 }
 
-static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq,
-				 int len)
-{
-	struct bio *split = bio_split(bio, len >> 9, GFP_ATOMIC, NULL);
-	if (!split)
-		return -ENOMEM;
-
-	bio_chain(split, bio);
-
-	if (!waitqueue_active(&nvmeq->sq_full))
-		add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
-	bio_list_add(&nvmeq->sq_cong, split);
-	bio_list_add(&nvmeq->sq_cong, bio);
-	wake_up(&nvmeq->sq_full);
-
-	return 0;
-}
-
-/* NVMe scatterlists require no holes in the virtual address */
-#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2)	((vec2)->bv_offset || \
-			(((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE))
-
-static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod,
-		struct bio *bio, enum dma_data_direction dma_dir, int psegs)
-{
-	struct bio_vec bvec, bvprv;
-	struct bvec_iter iter;
-	struct scatterlist *sg = NULL;
-	int length = 0, nsegs = 0, split_len = bio->bi_iter.bi_size;
-	int first = 1;
-
-	if (nvmeq->dev->stripe_size)
-		split_len = nvmeq->dev->stripe_size -
-			((bio->bi_iter.bi_sector << 9) &
-			 (nvmeq->dev->stripe_size - 1));
-
-	sg_init_table(iod->sg, psegs);
-	bio_for_each_segment(bvec, bio, iter) {
-		if (!first && BIOVEC_PHYS_MERGEABLE(&bvprv, &bvec)) {
-			sg->length += bvec.bv_len;
-		} else {
-			if (!first && BIOVEC_NOT_VIRT_MERGEABLE(&bvprv, &bvec))
-				return nvme_split_and_submit(bio, nvmeq,
-							     length);
-
-			sg = sg ? sg + 1 : iod->sg;
-			sg_set_page(sg, bvec.bv_page,
-				    bvec.bv_len, bvec.bv_offset);
-			nsegs++;
-		}
-
-		if (split_len - length < bvec.bv_len)
-			return nvme_split_and_submit(bio, nvmeq, split_len);
-		length += bvec.bv_len;
-		bvprv = bvec;
-		first = 0;
-	}
-	iod->nents = nsegs;
-	sg_mark_end(sg);
-	if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0)
-		return -ENOMEM;
-
-	BUG_ON(length != bio->bi_iter.bi_size);
-	return length;
-}
-
-static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
-		struct bio *bio, struct nvme_iod *iod, int cmdid)
+/*
+ * We reuse the small pool to allocate the 16-byte range here as it is not
+ * worth having a special pool for these or additional cases to handle freeing
+ * the iod.
+ */
+static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
+		struct request *req, struct nvme_iod *iod)
 {
 	struct nvme_dsm_range *range =
 				(struct nvme_dsm_range *)iod_list(iod)[0];
 	struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
 
 	range->cattr = cpu_to_le32(0);
-	range->nlb = cpu_to_le32(bio->bi_iter.bi_size >> ns->lba_shift);
-	range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector));
+	range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
+	range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
 
 	memset(cmnd, 0, sizeof(*cmnd));
 	cmnd->dsm.opcode = nvme_cmd_dsm;
-	cmnd->dsm.command_id = cmdid;
+	cmnd->dsm.command_id = req->tag;
 	cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
 	cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma);
 	cmnd->dsm.nr = 0;
@@ -603,11 +507,9 @@ static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
 	if (++nvmeq->sq_tail == nvmeq->q_depth)
 		nvmeq->sq_tail = 0;
 	writel(nvmeq->sq_tail, nvmeq->q_db);
-
-	return 0;
 }
 
-static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
+static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
 								int cmdid)
 {
 	struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
@@ -620,59 +522,34 @@ static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
 	if (++nvmeq->sq_tail == nvmeq->q_depth)
 		nvmeq->sq_tail = 0;
 	writel(nvmeq->sq_tail, nvmeq->q_db);
-
-	return 0;
 }
 
-int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns)
+static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
+							struct nvme_ns *ns)
 {
-	int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH,
-					special_completion, NVME_IO_TIMEOUT);
-	if (unlikely(cmdid < 0))
-		return cmdid;
-
-	return nvme_submit_flush(nvmeq, ns, cmdid);
-}
-
-static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod)
-{
-	struct bio *bio = iod->private;
-	struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
+	struct request *req = iod->private;
 	struct nvme_command *cmnd;
-	int cmdid;
-	u16 control;
-	u32 dsmgmt;
-
-	cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT);
-	if (unlikely(cmdid < 0))
-		return cmdid;
+	u16 control = 0;
+	u32 dsmgmt = 0;
 
-	if (bio->bi_rw & REQ_DISCARD)
-		return nvme_submit_discard(nvmeq, ns, bio, iod, cmdid);
-	if ((bio->bi_rw & REQ_FLUSH) && !iod->nents)
-		return nvme_submit_flush(nvmeq, ns, cmdid);
-
-	control = 0;
-	if (bio->bi_rw & REQ_FUA)
+	if (req->cmd_flags & REQ_FUA)
 		control |= NVME_RW_FUA;
-	if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD))
+	if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
 		control |= NVME_RW_LR;
 
-	dsmgmt = 0;
-	if (bio->bi_rw & REQ_RAHEAD)
+	if (req->cmd_flags & REQ_RAHEAD)
 		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
 
 	cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
 	memset(cmnd, 0, sizeof(*cmnd));
 
-	cmnd->rw.opcode = bio_data_dir(bio) ? nvme_cmd_write : nvme_cmd_read;
-	cmnd->rw.command_id = cmdid;
+	cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
+	cmnd->rw.command_id = req->tag;
 	cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
 	cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
 	cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
-	cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector));
-	cmnd->rw.length =
-		cpu_to_le16((bio->bi_iter.bi_size >> ns->lba_shift) - 1);
+	cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
+	cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
 	cmnd->rw.control = cpu_to_le16(control);
 	cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
 
@@ -683,28 +560,32 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod)
 	return 0;
 }
 
-/*
- * Called with local interrupts disabled and the q_lock held.  May not sleep.
- */
-static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
-								struct bio *bio)
+static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
 {
+	struct nvme_ns *ns = hctx->queue->queuedata;
+	struct nvme_queue *nvmeq = hctx->driver_data;
+	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
 	struct nvme_iod *iod;
-	int psegs = bio_phys_segments(ns->queue, bio);
-	int result;
-
-	if ((bio->bi_rw & REQ_FLUSH) && psegs) {
-		result = nvme_submit_flush_data(nvmeq, ns);
-		if (result)
-			return result;
-	}
+	enum dma_data_direction dma_dir;
+	int psegs = req->nr_phys_segments;
+	int result = BLK_MQ_RQ_QUEUE_BUSY;
+	/*
+	 * Requeued IO has already been prepped
+	 */
+	iod = req->special;
+	if (iod)
+		goto submit_iod;
 
-	iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC);
+	iod = nvme_alloc_iod(psegs, blk_rq_bytes(req), GFP_ATOMIC);
 	if (!iod)
-		return -ENOMEM;
+		return result;
+
+	iod->private = req;
+	req->special = iod;
 
-	iod->private = bio;
-	if (bio->bi_rw & REQ_DISCARD) {
+	nvme_set_info(cmd, iod, req_completion);
+
+	if (req->cmd_flags & REQ_DISCARD) {
 		void *range;
 		/*
 		 * We reuse the small pool to allocate the 16-byte range here
@@ -714,33 +595,53 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
 		range = dma_pool_alloc(nvmeq->dev->prp_small_pool,
 						GFP_ATOMIC,
 						&iod->first_dma);
-		if (!range) {
-			result = -ENOMEM;
-			goto free_iod;
-		}
+		if (!range)
+			goto finish_cmd;
 		iod_list(iod)[0] = (__le64 *)range;
 		iod->npages = 0;
 	} else if (psegs) {
-		result = nvme_map_bio(nvmeq, iod, bio,
-			bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
-			psegs);
-		if (result <= 0)
-			goto free_iod;
-		if (nvme_setup_prps(nvmeq->dev, iod, result, GFP_ATOMIC) !=
-								result) {
-			result = -ENOMEM;
-			goto free_iod;
+		dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
+
+		sg_init_table(iod->sg, psegs);
+		iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
+		if (!iod->nents) {
+			result = BLK_MQ_RQ_QUEUE_ERROR;
+			goto finish_cmd;
 		}
-		nvme_start_io_acct(bio);
+
+		if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir))
+			goto finish_cmd;
+
+		if (blk_rq_bytes(req) != nvme_setup_prps(nvmeq->dev, iod,
+						blk_rq_bytes(req), GFP_ATOMIC))
+			goto finish_cmd;
+	}
+
+ submit_iod:
+	spin_lock_irq(&nvmeq->q_lock);
+	if (nvmeq->q_suspended) {
+		spin_unlock_irq(&nvmeq->q_lock);
+		goto finish_cmd;
 	}
-	if (unlikely(nvme_submit_iod(nvmeq, iod))) {
-		if (!waitqueue_active(&nvmeq->sq_full))
-			add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
-		list_add_tail(&iod->node, &nvmeq->iod_bio);
+
+	if (req->cmd_flags & REQ_DISCARD) {
+		nvme_submit_discard(nvmeq, ns, req, iod);
+		goto queued;
 	}
-	return 0;
 
- free_iod:
+	if (req->cmd_flags & REQ_FLUSH) {
+		nvme_submit_flush(nvmeq, ns, req->tag);
+		goto queued;
+	}
+
+	nvme_submit_iod(nvmeq, iod, ns);
+ queued:
+	nvme_process_cq(nvmeq);
+	spin_unlock_irq(&nvmeq->q_lock);
+	return BLK_MQ_RQ_QUEUE_OK;
+
+ finish_cmd:
+	nvme_finish_cmd(nvmeq, req->tag, NULL);
 	nvme_free_iod(nvmeq->dev, iod);
 	return result;
 }
@@ -763,8 +664,7 @@ static int nvme_process_cq(struct nvme_queue *nvmeq)
 			head = 0;
 			phase = !phase;
 		}
-
-		ctx = free_cmdid(nvmeq, cqe.command_id, &fn);
+		ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn);
 		fn(nvmeq, ctx, &cqe);
 	}
 
@@ -785,30 +685,12 @@ static int nvme_process_cq(struct nvme_queue *nvmeq)
 	return 1;
 }
 
-static void nvme_make_request(struct request_queue *q, struct bio *bio)
+/* Admin queue isn't initialized as a request queue. If at some point this
+ * happens anyway, make sure to notify the user */
+static int nvme_admin_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
 {
-	struct nvme_ns *ns = q->queuedata;
-	struct nvme_queue *nvmeq = get_nvmeq(ns->dev);
-	int result = -EBUSY;
-
-	if (!nvmeq) {
-		put_nvmeq(NULL);
-		bio_endio(bio, -EIO);
-		return;
-	}
-
-	spin_lock_irq(&nvmeq->q_lock);
-	if (!nvmeq->q_suspended && bio_list_empty(&nvmeq->sq_cong))
-		result = nvme_submit_bio_queue(nvmeq, ns, bio);
-	if (unlikely(result)) {
-		if (!waitqueue_active(&nvmeq->sq_full))
-			add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
-		bio_list_add(&nvmeq->sq_cong, bio);
-	}
-
-	nvme_process_cq(nvmeq);
-	spin_unlock_irq(&nvmeq->q_lock);
-	put_nvmeq(nvmeq);
+	WARN_ON(1);
+	return BLK_MQ_RQ_QUEUE_ERROR;
 }
 
 static irqreturn_t nvme_irq(int irq, void *data)
@@ -832,10 +714,11 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
 	return IRQ_WAKE_THREAD;
 }
 
-static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid)
+static void nvme_abort_cmd_info(struct nvme_queue *nvmeq, struct nvme_cmd_info *
+								cmd_info)
 {
 	spin_lock_irq(&nvmeq->q_lock);
-	cancel_cmdid(nvmeq, cmdid, NULL);
+	cancel_cmd_info(cmd_info, NULL);
 	spin_unlock_irq(&nvmeq->q_lock);
 }
 
@@ -858,46 +741,31 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx,
  * Returns 0 on success.  If the result is negative, it's a Linux error code;
  * if the result is positive, it's an NVM Express status code
  */
-static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx,
-						struct nvme_command *cmd,
+static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd,
 						u32 *result, unsigned timeout)
 {
-	int cmdid, ret;
+	int ret;
 	struct sync_cmd_info cmdinfo;
-	struct nvme_queue *nvmeq;
-
-	nvmeq = lock_nvmeq(dev, q_idx);
-	if (!nvmeq) {
-		unlock_nvmeq(nvmeq);
-		return -ENODEV;
-	}
+	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
+	struct nvme_queue *nvmeq = cmd_rq->nvmeq;
 
 	cmdinfo.task = current;
 	cmdinfo.status = -EINTR;
 
-	cmdid = alloc_cmdid(nvmeq, &cmdinfo, sync_completion, timeout);
-	if (cmdid < 0) {
-		unlock_nvmeq(nvmeq);
-		return cmdid;
-	}
-	cmd->common.command_id = cmdid;
+	cmd->common.command_id = req->tag;
+
+	nvme_set_info(cmd_rq, &cmdinfo, sync_completion);
 
 	set_current_state(TASK_KILLABLE);
 	ret = nvme_submit_cmd(nvmeq, cmd);
 	if (ret) {
-		free_cmdid(nvmeq, cmdid, NULL);
-		unlock_nvmeq(nvmeq);
+		nvme_finish_cmd(nvmeq, req->tag, NULL);
 		set_current_state(TASK_RUNNING);
-		return ret;
 	}
-	unlock_nvmeq(nvmeq);
 	schedule_timeout(timeout);
 
 	if (cmdinfo.status == -EINTR) {
-		nvmeq = lock_nvmeq(dev, q_idx);
-		if (nvmeq)
-			nvme_abort_command(nvmeq, cmdid);
-		unlock_nvmeq(nvmeq);
+		nvme_abort_cmd_info(nvmeq, blk_mq_rq_to_pdu(req));
 		return -EINTR;
 	}
 
@@ -907,59 +775,78 @@ static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx,
 	return cmdinfo.status;
 }
 
-static int nvme_submit_async_cmd(struct nvme_queue *nvmeq,
+static int nvme_submit_admin_async_cmd(struct nvme_dev *dev,
 			struct nvme_command *cmd,
 			struct async_cmd_info *cmdinfo, unsigned timeout)
 {
-	int cmdid;
+	struct nvme_queue *nvmeq = dev->queues[0];
+	struct request *req;
+	struct nvme_cmd_info *cmd_rq;
 
-	cmdid = alloc_cmdid_killable(nvmeq, cmdinfo, async_completion, timeout);
-	if (cmdid < 0)
-		return cmdid;
+	req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false);
+	if (!req)
+		return -ENOMEM;
+
+	req->timeout = timeout;
+	cmd_rq = blk_mq_rq_to_pdu(req);
+	cmdinfo->req = req;
+	nvme_set_info(cmd_rq, cmdinfo, async_completion);
 	cmdinfo->status = -EINTR;
-	cmd->common.command_id = cmdid;
+
+	cmd->common.command_id = req->tag;
+
 	return nvme_submit_cmd(nvmeq, cmd);
 }
 
-int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
-								u32 *result)
+int __nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
+						u32 *result, unsigned timeout)
 {
-	return nvme_submit_sync_cmd(dev, 0, cmd, result, ADMIN_TIMEOUT);
+	int res;
+	struct request *req;
+
+	req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false);
+	if (!req)
+		return -ENOMEM;
+	res = nvme_submit_sync_cmd(req, cmd, result, timeout);
+	blk_put_request(req);
+	return res;
 }
 
-int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
+int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
 								u32 *result)
 {
-	return nvme_submit_sync_cmd(dev, smp_processor_id() + 1, cmd, result,
-							NVME_IO_TIMEOUT);
+	return __nvme_submit_admin_cmd(dev, cmd, result, ADMIN_TIMEOUT);
 }
 
-static int nvme_submit_admin_cmd_async(struct nvme_dev *dev,
-		struct nvme_command *cmd, struct async_cmd_info *cmdinfo)
+int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
+					struct nvme_command *cmd, u32 *result)
 {
-	return nvme_submit_async_cmd(raw_nvmeq(dev, 0), cmd, cmdinfo,
-								ADMIN_TIMEOUT);
+	int res;
+	struct request *req;
+
+	req = blk_mq_alloc_request(ns->queue, WRITE, (GFP_KERNEL|__GFP_WAIT),
+									false);
+	if (!req)
+		return -ENOMEM;
+	res = nvme_submit_sync_cmd(req, cmd, result, NVME_IO_TIMEOUT);
+	blk_put_request(req);
+	return res;
 }
 
 static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
 {
-	int status;
 	struct nvme_command c;
 
 	memset(&c, 0, sizeof(c));
 	c.delete_queue.opcode = opcode;
 	c.delete_queue.qid = cpu_to_le16(id);
 
-	status = nvme_submit_admin_cmd(dev, &c, NULL);
-	if (status)
-		return -EIO;
-	return 0;
+	return nvme_submit_admin_cmd(dev, &c, NULL);
 }
 
 static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
 						struct nvme_queue *nvmeq)
 {
-	int status;
 	struct nvme_command c;
 	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
 
@@ -971,16 +858,12 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
 	c.create_cq.cq_flags = cpu_to_le16(flags);
 	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
 
-	status = nvme_submit_admin_cmd(dev, &c, NULL);
-	if (status)
-		return -EIO;
-	return 0;
+	return nvme_submit_admin_cmd(dev, &c, NULL);
 }
 
 static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
 						struct nvme_queue *nvmeq)
 {
-	int status;
 	struct nvme_command c;
 	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
 
@@ -992,10 +875,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
 	c.create_sq.sq_flags = cpu_to_le16(flags);
 	c.create_sq.cqid = cpu_to_le16(qid);
 
-	status = nvme_submit_admin_cmd(dev, &c, NULL);
-	if (status)
-		return -EIO;
-	return 0;
+	return nvme_submit_admin_cmd(dev, &c, NULL);
 }
 
 static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
@@ -1051,28 +931,27 @@ int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
 }
 
 /**
- * nvme_abort_cmd - Attempt aborting a command
- * @cmdid: Command id of a timed out IO
- * @queue: The queue with timed out IO
+ * nvme_abort_req - Attempt aborting a request
  *
  * Schedule controller reset if the command was already aborted once before and
  * still hasn't been returned to the driver, or if this is the admin queue.
  */
-static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq)
+static void nvme_abort_req(struct request *req)
 {
-	int a_cmdid;
-	struct nvme_command cmd;
+	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
+	struct nvme_queue *nvmeq = cmd_rq->nvmeq;
 	struct nvme_dev *dev = nvmeq->dev;
-	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
-	struct nvme_queue *adminq;
+	struct request *abort_req;
+	struct nvme_cmd_info *abort_cmd;
+	struct nvme_command cmd;
 
-	if (!nvmeq->qid || info[cmdid].aborted) {
+	if (!nvmeq->qid || cmd_rq->aborted) {
 		if (work_busy(&dev->reset_work))
 			return;
 		list_del_init(&dev->node);
 		dev_warn(&dev->pci_dev->dev,
-			"I/O %d QID %d timeout, reset controller\n", cmdid,
-								nvmeq->qid);
+			"I/O %d QID %d timeout, reset controller\n",
+							req->tag, nvmeq->qid);
 		dev->reset_workfn = nvme_reset_failed_dev;
 		queue_work(nvme_workq, &dev->reset_work);
 		return;
@@ -1081,83 +960,98 @@ static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq)
 	if (!dev->abort_limit)
 		return;
 
-	adminq = rcu_dereference(dev->queues[0]);
-	a_cmdid = alloc_cmdid(adminq, CMD_CTX_ABORT, special_completion,
-								ADMIN_TIMEOUT);
-	if (a_cmdid < 0)
+	abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC,
+									true);
+	if (!abort_req)
 		return;
 
+	abort_cmd = blk_mq_rq_to_pdu(abort_req);
+	nvme_set_info(abort_cmd, cmd_rq, abort_completion);
+
 	memset(&cmd, 0, sizeof(cmd));
 	cmd.abort.opcode = nvme_admin_abort_cmd;
-	cmd.abort.cid = cmdid;
+	cmd.abort.cid = req->tag;
 	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
-	cmd.abort.command_id = a_cmdid;
+	cmd.abort.command_id = abort_req->tag;
 
 	--dev->abort_limit;
-	info[cmdid].aborted = 1;
-	info[cmdid].timeout = jiffies + ADMIN_TIMEOUT;
+	cmd_rq->aborted = 1;
 
-	dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", cmdid,
+	dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag,
 							nvmeq->qid);
-	nvme_submit_cmd(adminq, &cmd);
+	if (nvme_submit_cmd(dev->queues[0], &cmd) < 0) {
+		dev_warn(nvmeq->q_dmadev, "Could not abort I/O %d QID %d",
+							req->tag, nvmeq->qid);
+	}
 }
 
-/**
- * nvme_cancel_ios - Cancel outstanding I/Os
- * @queue: The queue to cancel I/Os on
- * @timeout: True to only cancel I/Os which have timed out
- */
-static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
+static void nvme_cancel_queue_ios(void *data, unsigned long *tag_map)
 {
-	int depth = nvmeq->q_depth - 1;
-	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
-	unsigned long now = jiffies;
-	int cmdid;
+	struct nvme_queue *nvmeq = data;
+	struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
+	unsigned int tag = 0;
 
-	for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
+	tag = 0;
+	do {
+		struct request *req;
 		void *ctx;
 		nvme_completion_fn fn;
+		struct nvme_cmd_info *cmd;
 		static struct nvme_completion cqe = {
 			.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),
 		};
+		int qdepth = nvmeq == nvmeq->dev->queues[0] ?
+					nvmeq->dev->admin_tagset.queue_depth :
+					nvmeq->dev->tagset.queue_depth;
 
-		if (timeout && !time_after(now, info[cmdid].timeout))
-			continue;
-		if (info[cmdid].ctx == CMD_CTX_CANCELLED)
-			continue;
-		if (timeout && nvmeq->dev->initialized) {
-			nvme_abort_cmd(cmdid, nvmeq);
+		/* zero'd bits are free tags */
+		tag = find_next_zero_bit(tag_map, qdepth, tag);
+		if (tag >= qdepth)
+			break;
+
+		req = blk_mq_tag_to_rq(hctx, tag++);
+		cmd = blk_mq_rq_to_pdu(req);
+
+		if (cmd->ctx == CMD_CTX_CANCELLED)
 			continue;
-		}
-		dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", cmdid,
-								nvmeq->qid);
-		ctx = cancel_cmdid(nvmeq, cmdid, &fn);
+
+		dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n",
+							req->tag, nvmeq->qid);
+		ctx = cancel_cmd_info(cmd, &fn);
 		fn(nvmeq, ctx, &cqe);
-	}
+
+	} while (1);
 }
 
-static void nvme_free_queue(struct rcu_head *r)
+/**
+ * nvme_cancel_ios - Cancel outstanding I/Os
+ * @nvmeq: The queue to cancel I/Os on
+ * @tagset: The tag set associated with the queue
+ */
+static void nvme_cancel_ios(struct nvme_queue *nvmeq)
 {
-	struct nvme_queue *nvmeq = container_of(r, struct nvme_queue, r_head);
+	struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
 
-	spin_lock_irq(&nvmeq->q_lock);
-	while (bio_list_peek(&nvmeq->sq_cong)) {
-		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
-		bio_endio(bio, -EIO);
-	}
-	while (!list_empty(&nvmeq->iod_bio)) {
-		static struct nvme_completion cqe = {
-			.status = cpu_to_le16(
-				(NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1),
-		};
-		struct nvme_iod *iod = list_first_entry(&nvmeq->iod_bio,
-							struct nvme_iod,
-							node);
-		list_del(&iod->node);
-		bio_completion(nvmeq, iod, &cqe);
-	}
-	spin_unlock_irq(&nvmeq->q_lock);
+	if (nvmeq->dev->initialized)
+		blk_mq_tag_busy_iter(hctx->tags, nvme_cancel_queue_ios, nvmeq);
+}
+
+static enum blk_eh_timer_return nvme_timeout(struct request *req)
+{
+	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
+	struct nvme_queue *nvmeq = cmd->nvmeq;
+
+	dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", req->tag,
+							nvmeq->qid);
+
+	if (nvmeq->dev->initialized)
+		nvme_abort_req(req);
+
+	return BLK_EH_HANDLED;
+}
 
+static void nvme_free_queue(struct nvme_queue *nvmeq)
+{
 	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
 				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
 	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
@@ -1172,10 +1066,10 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
 	int i;
 
 	for (i = dev->queue_count - 1; i >= lowest; i--) {
-		struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
-		rcu_assign_pointer(dev->queues[i], NULL);
-		call_rcu(&nvmeq->r_head, nvme_free_queue);
+		struct nvme_queue *nvmeq = dev->queues[i];
+		nvme_free_queue(nvmeq);
 		dev->queue_count--;
+		dev->queues[i] = NULL;
 	}
 }
 
@@ -1208,13 +1102,13 @@ static void nvme_clear_queue(struct nvme_queue *nvmeq)
 {
 	spin_lock_irq(&nvmeq->q_lock);
 	nvme_process_cq(nvmeq);
-	nvme_cancel_ios(nvmeq, false);
+	nvme_cancel_ios(nvmeq);
 	spin_unlock_irq(&nvmeq->q_lock);
 }
 
 static void nvme_disable_queue(struct nvme_dev *dev, int qid)
 {
-	struct nvme_queue *nvmeq = raw_nvmeq(dev, qid);
+	struct nvme_queue *nvmeq = dev->queues[qid];
 
 	if (!nvmeq)
 		return;
@@ -1234,8 +1128,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
 							int depth, int vector)
 {
 	struct device *dmadev = &dev->pci_dev->dev;
-	unsigned extra = nvme_queue_extra(depth);
-	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
+	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL);
 	if (!nvmeq)
 		return NULL;
 
@@ -1260,17 +1153,13 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
 	spin_lock_init(&nvmeq->q_lock);
 	nvmeq->cq_head = 0;
 	nvmeq->cq_phase = 1;
-	init_waitqueue_head(&nvmeq->sq_full);
-	init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
-	bio_list_init(&nvmeq->sq_cong);
-	INIT_LIST_HEAD(&nvmeq->iod_bio);
 	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
 	nvmeq->q_depth = depth;
 	nvmeq->cq_vector = vector;
 	nvmeq->qid = qid;
 	nvmeq->q_suspended = 1;
 	dev->queue_count++;
-	rcu_assign_pointer(dev->queues[qid], nvmeq);
+	dev->queues[qid] = nvmeq;
 
 	return nvmeq;
 
@@ -1299,15 +1188,12 @@ static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
 static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
 {
 	struct nvme_dev *dev = nvmeq->dev;
-	unsigned extra = nvme_queue_extra(nvmeq->q_depth);
 
 	nvmeq->sq_tail = 0;
 	nvmeq->cq_head = 0;
 	nvmeq->cq_phase = 1;
 	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
-	memset(nvmeq->cmdid_data, 0, extra);
 	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
-	nvme_cancel_ios(nvmeq, false);
 	nvmeq->q_suspended = 0;
 	dev->online_queues++;
 }
@@ -1408,6 +1294,53 @@ static int nvme_shutdown_ctrl(struct nvme_dev *dev)
 	return 0;
 }
 
+static struct blk_mq_ops nvme_mq_admin_ops = {
+	.queue_rq	= nvme_admin_queue_rq,
+	.map_queue	= blk_mq_map_queue,
+	.init_hctx	= nvme_admin_init_hctx,
+	.init_request	= nvme_admin_init_request,
+	.timeout	= nvme_timeout,
+};
+
+static struct blk_mq_ops nvme_mq_ops = {
+	.queue_rq	= nvme_queue_rq,
+	.map_queue	= blk_mq_map_queue,
+	.init_hctx	= nvme_init_hctx,
+	.init_request	= nvme_init_request,
+	.timeout	= nvme_timeout,
+};
+
+static int nvme_alloc_admin_tags(struct nvme_dev *dev)
+{
+	if (!dev->admin_q) {
+		dev->admin_tagset.ops = &nvme_mq_admin_ops;
+		dev->admin_tagset.nr_hw_queues = 1;
+		dev->admin_tagset.queue_depth = NVME_AQ_DEPTH;
+		dev->admin_tagset.timeout = ADMIN_TIMEOUT;
+		dev->admin_tagset.reserved_tags = 1,
+		dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
+		dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info);
+		dev->admin_tagset.driver_data = dev;
+
+		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
+			return -ENOMEM;
+
+		dev->admin_q = blk_mq_init_queue(&dev->admin_tagset);
+		if (!dev->admin_q) {
+			blk_mq_free_tag_set(&dev->admin_tagset);
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
+static void nvme_free_admin_tags(struct nvme_dev *dev)
+{
+	if (dev->admin_q)
+		blk_mq_free_tag_set(&dev->admin_tagset);
+}
+
 static int nvme_configure_admin_queue(struct nvme_dev *dev)
 {
 	int result;
@@ -1419,9 +1352,9 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
 	if (result < 0)
 		return result;
 
-	nvmeq = raw_nvmeq(dev, 0);
+	nvmeq = dev->queues[0];
 	if (!nvmeq) {
-		nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
+		nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, 0);
 		if (!nvmeq)
 			return -ENOMEM;
 	}
@@ -1441,16 +1374,26 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
 
 	result = nvme_enable_ctrl(dev, cap);
 	if (result)
-		return result;
+		goto free_nvmeq;
+
+	result = nvme_alloc_admin_tags(dev);
+	if (result)
+		goto free_nvmeq;
 
 	result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
 	if (result)
-		return result;
+		goto free_tags;
 
 	spin_lock_irq(&nvmeq->q_lock);
 	nvme_init_queue(nvmeq, 0);
 	spin_unlock_irq(&nvmeq->q_lock);
 	return result;
+
+ free_tags:
+	nvme_free_admin_tags(dev);
+ free_nvmeq:
+	nvme_free_queues(dev, 0);
+	return result;
 }
 
 struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
@@ -1605,7 +1548,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 	if (length != (io.nblocks + 1) << ns->lba_shift)
 		status = -ENOMEM;
 	else
-		status = nvme_submit_io_cmd(dev, &c, NULL);
+		status = nvme_submit_io_cmd(dev, ns, &c, NULL);
 
 	if (meta_len) {
 		if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) {
@@ -1677,10 +1620,11 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev,
 
 	timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) :
 								ADMIN_TIMEOUT;
+
 	if (length != cmd.data_len)
 		status = -ENOMEM;
 	else
-		status = nvme_submit_sync_cmd(dev, 0, &c, &cmd.result, timeout);
+		status = __nvme_submit_admin_cmd(dev, &c, &cmd.result, timeout);
 
 	if (cmd.data_len) {
 		nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
@@ -1769,41 +1713,6 @@ static const struct block_device_operations nvme_fops = {
 	.getgeo		= nvme_getgeo,
 };
 
-static void nvme_resubmit_iods(struct nvme_queue *nvmeq)
-{
-	struct nvme_iod *iod, *next;
-
-	list_for_each_entry_safe(iod, next, &nvmeq->iod_bio, node) {
-		if (unlikely(nvme_submit_iod(nvmeq, iod)))
-			break;
-		list_del(&iod->node);
-		if (bio_list_empty(&nvmeq->sq_cong) &&
-						list_empty(&nvmeq->iod_bio))
-			remove_wait_queue(&nvmeq->sq_full,
-						&nvmeq->sq_cong_wait);
-	}
-}
-
-static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
-{
-	while (bio_list_peek(&nvmeq->sq_cong)) {
-		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
-		struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
-
-		if (bio_list_empty(&nvmeq->sq_cong) &&
-						list_empty(&nvmeq->iod_bio))
-			remove_wait_queue(&nvmeq->sq_full,
-							&nvmeq->sq_cong_wait);
-		if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
-			if (!waitqueue_active(&nvmeq->sq_full))
-				add_wait_queue(&nvmeq->sq_full,
-							&nvmeq->sq_cong_wait);
-			bio_list_add_head(&nvmeq->sq_cong, bio);
-			break;
-		}
-	}
-}
-
 static int nvme_kthread(void *data)
 {
 	struct nvme_dev *dev, *next;
@@ -1824,23 +1733,17 @@ static int nvme_kthread(void *data)
 				queue_work(nvme_workq, &dev->reset_work);
 				continue;
 			}
-			rcu_read_lock();
 			for (i = 0; i < dev->queue_count; i++) {
-				struct nvme_queue *nvmeq =
-						rcu_dereference(dev->queues[i]);
+				struct nvme_queue *nvmeq = dev->queues[i];
 				if (!nvmeq)
 					continue;
 				spin_lock_irq(&nvmeq->q_lock);
 				if (nvmeq->q_suspended)
 					goto unlock;
 				nvme_process_cq(nvmeq);
-				nvme_cancel_ios(nvmeq, true);
-				nvme_resubmit_bios(nvmeq);
-				nvme_resubmit_iods(nvmeq);
  unlock:
 				spin_unlock_irq(&nvmeq->q_lock);
 			}
-			rcu_read_unlock();
 		}
 		spin_unlock(&dev_list_lock);
 		schedule_timeout(round_jiffies_relative(HZ));
@@ -1863,27 +1766,29 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
 {
 	struct nvme_ns *ns;
 	struct gendisk *disk;
+	int node = dev_to_node(&dev->pci_dev->dev);
 	int lbaf;
 
 	if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
 		return NULL;
 
-	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
+	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
 	if (!ns)
 		return NULL;
-	ns->queue = blk_alloc_queue(GFP_KERNEL);
+	ns->queue = blk_mq_init_queue(&dev->tagset);
 	if (!ns->queue)
 		goto out_free_ns;
-	ns->queue->queue_flags = QUEUE_FLAG_DEFAULT;
+	queue_flag_set_unlocked(QUEUE_FLAG_DEFAULT, ns->queue);
 	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
-	blk_queue_make_request(ns->queue, nvme_make_request);
+	queue_flag_clear_unlocked(QUEUE_FLAG_IO_STAT, ns->queue);
 	ns->dev = dev;
 	ns->queue->queuedata = ns;
 
-	disk = alloc_disk(0);
+	disk = alloc_disk_node(0, node);
 	if (!disk)
 		goto out_free_queue;
+
 	ns->ns_id = nsid;
 	ns->disk = disk;
 	lbaf = id->flbas & 0xf;
@@ -1917,143 +1822,19 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
 	return NULL;
 }
 
-static int nvme_find_closest_node(int node)
-{
-	int n, val, min_val = INT_MAX, best_node = node;
-
-	for_each_online_node(n) {
-		if (n == node)
-			continue;
-		val = node_distance(node, n);
-		if (val < min_val) {
-			min_val = val;
-			best_node = n;
-		}
-	}
-	return best_node;
-}
-
-static void nvme_set_queue_cpus(cpumask_t *qmask, struct nvme_queue *nvmeq,
-								int count)
-{
-	int cpu;
-	for_each_cpu(cpu, qmask) {
-		if (cpumask_weight(nvmeq->cpu_mask) >= count)
-			break;
-		if (!cpumask_test_and_set_cpu(cpu, nvmeq->cpu_mask))
-			*per_cpu_ptr(nvmeq->dev->io_queue, cpu) = nvmeq->qid;
-	}
-}
-
-static void nvme_add_cpus(cpumask_t *mask, const cpumask_t *unassigned_cpus,
-	const cpumask_t *new_mask, struct nvme_queue *nvmeq, int cpus_per_queue)
-{
-	int next_cpu;
-	for_each_cpu(next_cpu, new_mask) {
-		cpumask_or(mask, mask, get_cpu_mask(next_cpu));
-		cpumask_or(mask, mask, topology_thread_cpumask(next_cpu));
-		cpumask_and(mask, mask, unassigned_cpus);
-		nvme_set_queue_cpus(mask, nvmeq, cpus_per_queue);
-	}
-}
-
 static void nvme_create_io_queues(struct nvme_dev *dev)
 {
-	unsigned i, max;
+	unsigned i;
 
-	max = min(dev->max_qid, num_online_cpus());
-	for (i = dev->queue_count; i <= max; i++)
+	for (i = dev->queue_count; i <= dev->max_qid; i++)
 		if (!nvme_alloc_queue(dev, i, dev->q_depth, i - 1))
 			break;
 
-	max = min(dev->queue_count - 1, num_online_cpus());
-	for (i = dev->online_queues; i <= max; i++)
-		if (nvme_create_queue(raw_nvmeq(dev, i), i))
+	for (i = dev->online_queues; i <= dev->queue_count - 1; i++)
+		if (nvme_create_queue(dev->queues[i], i))
 			break;
 }
 
-/*
- * If there are fewer queues than online cpus, this will try to optimally
- * assign a queue to multiple cpus by grouping cpus that are "close" together:
- * thread siblings, core, socket, closest node, then whatever else is
- * available.
- */
-static void nvme_assign_io_queues(struct nvme_dev *dev)
-{
-	unsigned cpu, cpus_per_queue, queues, remainder, i;
-	cpumask_var_t unassigned_cpus;
-
-	nvme_create_io_queues(dev);
-
-	queues = min(dev->online_queues - 1, num_online_cpus());
-	if (!queues)
-		return;
-
-	cpus_per_queue = num_online_cpus() / queues;
-	remainder = queues - (num_online_cpus() - queues * cpus_per_queue);
-
-	if (!alloc_cpumask_var(&unassigned_cpus, GFP_KERNEL))
-		return;
-
-	cpumask_copy(unassigned_cpus, cpu_online_mask);
-	cpu = cpumask_first(unassigned_cpus);
-	for (i = 1; i <= queues; i++) {
-		struct nvme_queue *nvmeq = lock_nvmeq(dev, i);
-		cpumask_t mask;
-
-		cpumask_clear(nvmeq->cpu_mask);
-		if (!cpumask_weight(unassigned_cpus)) {
-			unlock_nvmeq(nvmeq);
-			break;
-		}
-
-		mask = *get_cpu_mask(cpu);
-		nvme_set_queue_cpus(&mask, nvmeq, cpus_per_queue);
-		if (cpus_weight(mask) < cpus_per_queue)
-			nvme_add_cpus(&mask, unassigned_cpus,
-				topology_thread_cpumask(cpu),
-				nvmeq, cpus_per_queue);
-		if (cpus_weight(mask) < cpus_per_queue)
-			nvme_add_cpus(&mask, unassigned_cpus,
-				topology_core_cpumask(cpu),
-				nvmeq, cpus_per_queue);
-		if (cpus_weight(mask) < cpus_per_queue)
-			nvme_add_cpus(&mask, unassigned_cpus,
-				cpumask_of_node(cpu_to_node(cpu)),
-				nvmeq, cpus_per_queue);
-		if (cpus_weight(mask) < cpus_per_queue)
-			nvme_add_cpus(&mask, unassigned_cpus,
-				cpumask_of_node(
-					nvme_find_closest_node(
-						cpu_to_node(cpu))),
-				nvmeq, cpus_per_queue);
-		if (cpus_weight(mask) < cpus_per_queue)
-			nvme_add_cpus(&mask, unassigned_cpus,
-				unassigned_cpus,
-				nvmeq, cpus_per_queue);
-
-		WARN(cpumask_weight(nvmeq->cpu_mask) != cpus_per_queue,
-			"nvme%d qid:%d mis-matched queue-to-cpu assignment\n",
-			dev->instance, i);
-
-		irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
-							nvmeq->cpu_mask);
-		cpumask_andnot(unassigned_cpus, unassigned_cpus,
-						nvmeq->cpu_mask);
-		cpu = cpumask_next(cpu, unassigned_cpus);
-		if (remainder && !--remainder)
-			cpus_per_queue++;
-		unlock_nvmeq(nvmeq);
-	}
-	WARN(cpumask_weight(unassigned_cpus), "nvme%d unassigned online cpus\n",
-								dev->instance);
-	i = 0;
-	cpumask_andnot(unassigned_cpus, cpu_possible_mask, cpu_online_mask);
-	for_each_cpu(cpu, unassigned_cpus)
-		*per_cpu_ptr(dev->io_queue, cpu) = (i++ % queues) + 1;
-	free_cpumask_var(unassigned_cpus);
-}
-
 static int set_queue_count(struct nvme_dev *dev, int count)
 {
 	int status;
@@ -2077,22 +1858,9 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
 	return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
 }
 
-static int nvme_cpu_notify(struct notifier_block *self,
-				unsigned long action, void *hcpu)
-{
-	struct nvme_dev *dev = container_of(self, struct nvme_dev, nb);
-	switch (action) {
-	case CPU_ONLINE:
-	case CPU_DEAD:
-		nvme_assign_io_queues(dev);
-		break;
-	}
-	return NOTIFY_OK;
-}
-
 static int nvme_setup_io_queues(struct nvme_dev *dev)
 {
-	struct nvme_queue *adminq = raw_nvmeq(dev, 0);
+	struct nvme_queue *adminq = dev->queues[0];
 	struct pci_dev *pdev = dev->pci_dev;
 	int result, i, vecs, nr_io_queues, size;
 
@@ -2151,12 +1919,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 
 	/* Free previously allocated queues that are no longer usable */
 	nvme_free_queues(dev, nr_io_queues + 1);
-	nvme_assign_io_queues(dev);
-
-	dev->nb.notifier_call = &nvme_cpu_notify;
-	result = register_hotcpu_notifier(&dev->nb);
-	if (result)
-		goto free_queues;
+	nvme_create_io_queues(dev);
 
 	return 0;
 
@@ -2208,6 +1971,18 @@ static int nvme_dev_add(struct nvme_dev *dev)
 			(pdev->device == 0x0953) && ctrl->vs[3])
 		dev->stripe_size = 1 << (ctrl->vs[3] + shift);
 
+	dev->tagset.ops = &nvme_mq_ops;
+	dev->tagset.nr_hw_queues = dev->online_queues - 1;
+	dev->tagset.timeout = NVME_IO_TIMEOUT;
+	dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
+	dev->tagset.queue_depth = min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH);
+	dev->tagset.cmd_size = sizeof(struct nvme_cmd_info);
+	dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
+	dev->tagset.driver_data = dev;
+
+	if (blk_mq_alloc_tag_set(&dev->tagset))
+		goto out;
+
 	id_ns = mem;
 	for (i = 1; i <= nn; i++) {
 		res = nvme_identify(dev, i, 0, dma_addr);
@@ -2356,7 +2131,8 @@ static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode,
 	c.delete_queue.qid = cpu_to_le16(nvmeq->qid);
 
 	init_kthread_work(&nvmeq->cmdinfo.work, fn);
-	return nvme_submit_admin_cmd_async(nvmeq->dev, &c, &nvmeq->cmdinfo);
+	return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo,
+								ADMIN_TIMEOUT);
 }
 
 static void nvme_del_cq_work_handler(struct kthread_work *work)
@@ -2419,7 +2195,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev)
 	atomic_set(&dq.refcount, 0);
 	dq.worker = &worker;
 	for (i = dev->queue_count - 1; i > 0; i--) {
-		struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
+		struct nvme_queue *nvmeq = dev->queues[i];
 
 		if (nvme_suspend_queue(nvmeq))
 			continue;
@@ -2457,13 +2233,12 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
 	int i;
 
 	dev->initialized = 0;
-	unregister_hotcpu_notifier(&dev->nb);
 
 	nvme_dev_list_remove(dev);
 
 	if (!dev->bar || (dev->bar && readl(&dev->bar->csts) == -1)) {
 		for (i = dev->queue_count - 1; i >= 0; i--) {
-			struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
+			struct nvme_queue *nvmeq = dev->queues[i];
 			nvme_suspend_queue(nvmeq);
 			nvme_clear_queue(nvmeq);
 		}
@@ -2475,6 +2250,12 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
 	nvme_dev_unmap(dev);
 }
 
+static void nvme_dev_remove_admin(struct nvme_dev *dev)
+{
+	if (dev->admin_q && !blk_queue_dying(dev->admin_q))
+		blk_cleanup_queue(dev->admin_q);
+}
+
 static void nvme_dev_remove(struct nvme_dev *dev)
 {
 	struct nvme_ns *ns;
@@ -2556,7 +2337,7 @@ static void nvme_free_dev(struct kref *kref)
 	struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
 
 	nvme_free_namespaces(dev);
-	free_percpu(dev->io_queue);
+	blk_mq_free_tag_set(&dev->tagset);
 	kfree(dev->queues);
 	kfree(dev->entry);
 	kfree(dev);
@@ -2708,23 +2489,24 @@ static void nvme_reset_workfn(struct work_struct *work)
 
 static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
-	int result = -ENOMEM;
+	int node, result = -ENOMEM;
 	struct nvme_dev *dev;
 
-	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	node = dev_to_node(&pdev->dev);
+	if (node == NUMA_NO_NODE)
+		set_dev_node(&pdev->dev, 0);
+
+	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
 	if (!dev)
 		return -ENOMEM;
-	dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry),
-								GFP_KERNEL);
+	dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry),
+							GFP_KERNEL, node);
 	if (!dev->entry)
 		goto free;
-	dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *),
-								GFP_KERNEL);
+	dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
+							GFP_KERNEL, node);
 	if (!dev->queues)
 		goto free;
-	dev->io_queue = alloc_percpu(unsigned short);
-	if (!dev->io_queue)
-		goto free;
 
 	INIT_LIST_HEAD(&dev->namespaces);
 	dev->reset_workfn = nvme_reset_failed_dev;
@@ -2766,6 +2548,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
  remove:
 	nvme_dev_remove(dev);
+	nvme_dev_remove_admin(dev);
 	nvme_free_namespaces(dev);
  shutdown:
 	nvme_dev_shutdown(dev);
@@ -2775,7 +2558,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
  release:
 	nvme_release_instance(dev);
  free:
-	free_percpu(dev->io_queue);
 	kfree(dev->queues);
 	kfree(dev->entry);
 	kfree(dev);
@@ -2801,8 +2583,9 @@ static void nvme_remove(struct pci_dev *pdev)
 	misc_deregister(&dev->miscdev);
 	nvme_dev_remove(dev);
 	nvme_dev_shutdown(dev);
+	nvme_dev_remove_admin(dev);
 	nvme_free_queues(dev, 0);
-	rcu_barrier();
+	nvme_free_admin_tags(dev);
 	nvme_release_instance(dev);
 	nvme_release_prp_pools(dev);
 	kref_put(&dev->kref, nvme_free_dev);
diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c
index da3b252..d386cae 100644
--- a/drivers/block/nvme-scsi.c
+++ b/drivers/block/nvme-scsi.c
@@ -2103,7 +2103,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 
 		nvme_offset += unit_num_blocks;
 
-		nvme_sc = nvme_submit_io_cmd(dev, &c, NULL);
+		nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL);
 		if (nvme_sc != NVME_SC_SUCCESS) {
 			nvme_unmap_user_pages(dev,
 				(is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
@@ -2656,7 +2656,7 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 			c.common.opcode = nvme_cmd_flush;
 			c.common.nsid = cpu_to_le32(ns->ns_id);
 
-			nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL);
+			nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL);
 			res = nvme_trans_status_code(hdr, nvme_sc);
 			if (res)
 				goto out;
@@ -2684,7 +2684,7 @@ static int nvme_trans_synchronize_cache(struct nvme_ns *ns,
 	c.common.opcode = nvme_cmd_flush;
 	c.common.nsid = cpu_to_le32(ns->ns_id);
 
-	nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL);
+	nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL);
 
 	res = nvme_trans_status_code(hdr, nvme_sc);
 	if (res)
@@ -2892,7 +2892,7 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	c.dsm.nr = cpu_to_le32(ndesc - 1);
 	c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
 
-	nvme_sc = nvme_submit_io_cmd(dev, &c, NULL);
+	nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL);
 	res = nvme_trans_status_code(hdr, nvme_sc);
 
 	dma_free_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range),
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 6266373..38aade9 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -19,6 +19,7 @@
 #include <linux/pci.h>
 #include <linux/miscdevice.h>
 #include <linux/kref.h>
+#include <linux/blk-mq.h>
 
 struct nvme_bar {
 	__u64			cap;	/* Controller Capabilities */
@@ -70,8 +71,10 @@ extern unsigned char io_timeout;
  */
 struct nvme_dev {
 	struct list_head node;
-	struct nvme_queue __rcu **queues;
-	unsigned short __percpu *io_queue;
+	struct nvme_queue **queues;
+	struct request_queue *admin_q;
+	struct blk_mq_tag_set tagset;
+	struct blk_mq_tag_set admin_tagset;
 	u32 __iomem *dbs;
 	struct pci_dev *pci_dev;
 	struct dma_pool *prp_page_pool;
@@ -90,7 +93,6 @@ struct nvme_dev {
 	struct miscdevice miscdev;
 	work_func_t reset_workfn;
 	struct work_struct reset_work;
-	struct notifier_block nb;
 	char name[12];
 	char serial[20];
 	char model[40];
@@ -132,7 +134,6 @@ struct nvme_iod {
 	int offset;		/* Of PRP list */
 	int nents;		/* Used in scatterlist */
 	int length;		/* Of data, in bytes */
-	unsigned long start_time;
 	dma_addr_t first_dma;
 	struct list_head node;
 	struct scatterlist sg[0];
@@ -150,12 +151,13 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector)
  */
 void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod);
 
-int nvme_setup_prps(struct nvme_dev *, struct nvme_iod *, int , gfp_t);
+int nvme_setup_prps(struct nvme_dev *, struct nvme_iod *, int, gfp_t);
 struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
 				unsigned long addr, unsigned length);
 void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
 			struct nvme_iod *iod);
-int nvme_submit_io_cmd(struct nvme_dev *, struct nvme_command *, u32 *);
+int nvme_submit_io_cmd(struct nvme_dev *, struct nvme_ns *,
+						struct nvme_command *, u32 *);
 int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns);
 int nvme_submit_admin_cmd(struct nvme_dev *, struct nvme_command *,
 							u32 *result);
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v5] conversion to blk-mq
  2014-06-02 20:55 [PATCH v5] conversion to blk-mq Matias Bjørling
  2014-06-02 20:55 ` [PATCH v5] NVMe: " Matias Bjørling
@ 2014-06-02 22:49 ` Keith Busch
  2014-06-02 23:06   ` Keith Busch
  1 sibling, 1 reply; 14+ messages in thread
From: Keith Busch @ 2014-06-02 22:49 UTC (permalink / raw)
  To: Matias Bjørling
  Cc: willy, keith.busch, sbradshaw, axboe, linux-kernel, linux-nvme,
	hch

[-- Attachment #1: Type: TEXT/PLAIN, Size: 3821 bytes --]

On Mon, 2 Jun 2014, Matias Bjørling wrote:
> Hi Matthew and Keith,
>
> Here is an updated patch with the feedback from the previous days. It's against
> Jens' for-3.16/core tree. You may use the nvmemq_wip_review branch at:

I'm testing this on my normal hardware now. As I feared, hot removal
doesn't work while the device is actively being managed. Here's the oops:

[ 1267.018283] BUG: unable to handle kernel NULL pointer dereference at 0000000000000004
[ 1267.018288] IP: [<ffffffff811c51b8>] blk_mq_map_queue+0xf/0x1e
[ 1267.018292] PGD b5ed5067 PUD b57e2067 PMD 0
[ 1267.018294] Oops: 0000 [#1] SMP
[ 1267.018343] Modules linked in: nvme parport_pc ppdev lp parport dlm sctp libcrc32c configfs nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc md4 hmac cifs bridge stp llc joydev jfs hid_generic usbhid hid loop md_mod x86_pkg_temp_thermal coretemp kvm_intel kvm iTCO_wdt iTCO_vendor_support crc32c_intel ghash_clmulni_intel aesni_intel aes_x86_64 glue_helper lrw gf128mul ablk_helper cryptd microcode ehci_pci ehci_hcd usbcore pcspkr lpc_ich acpi_cpufreq ioatdma mfd_core usb_common i2c_i801 evdev wmi ipmi_si tpm_tis ipmi_msghandler tpm processor thermal_sys button ext4 crc16 jbd2 mbcache dm_mod nbd sg sr_mod cdrom sd_mod crc_t10dif crct10dif_common isci libsas ahci scsi_transport_sas libahci igb libata ptp pps_core i2c_algo_bit scsi_mod i2c_core dca [last unloaded: nvme]
[ 1267.018346] CPU: 1 PID: 6618 Comm: nvme_id_ctrl Tainted: G        W     3.15.0-rc1+ #2
[ 1267.018347] Hardware name: Intel Corporation S2600GZ/S2600GZ, BIOS SE5C600.86B.02.02.0002.122320131210 12/23/2013
[ 1267.018349] task: ffff88042879eef0 ti: ffff8800b5f14000 task.ti: ffff8800b5f14000
[ 1267.018354] RIP: 0010:[<ffffffff811c51b8>]  [<ffffffff811c51b8>] blk_mq_map_queue+0xf/0x1e
[ 1267.018356] RSP: 0018:ffff8800b5f15db0  EFLAGS: 00010206
[ 1267.018357] RAX: 0000000000000000 RBX: ffffe8fbffa21e80 RCX: ffff88042f3952e0
[ 1267.018359] RDX: 0000000000001c10 RSI: 0000000000000001 RDI: ffff8800b5e9f008
[ 1267.018360] RBP: ffff88042d993480 R08: ffff8800b5e18cc0 R09: 0000000000000000
[ 1267.018362] R10: 0000000000000ca0 R11: 0000000000000ca0 R12: ffff8800b5f15e94
[ 1267.018363] R13: 0000000000003a98 R14: 00007fffc7b44c40 R15: 0000000000000000
[ 1267.018365] FS:  00007feb4cb05700(0000) GS:ffff88043f620000(0000) knlGS:0000000000000000
[ 1267.018367] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1267.018369] CR2: 0000000000000004 CR3: 00000000b50f2000 CR4: 00000000000407e0
[ 1267.018369] Stack:
[ 1267.018372]  ffffffff811c6334 00000000fffffffc ffff88042d993480 00000000fffffffc
[ 1267.018375]  ffffffffa0532d73 0000000000000ca0 00000000fffffff4 ffff88042562b440
[ 1267.018378]  ffff88042d853000 0000000000001000 ffffffffa05348f3 ffffc9001518b2a8
[ 1267.018378] Call Trace:
[ 1267.018383]  [<ffffffff811c6334>] ? blk_mq_free_request+0x37/0x48
[ 1267.018388]  [<ffffffffa0532d73>] ? __nvme_submit_admin_cmd+0x7f/0x8a [nvme]
[ 1267.018392]  [<ffffffffa05348f3>] ? nvme_user_admin_cmd+0x144/0x1b1 [nvme]
[ 1267.018397]  [<ffffffffa053497d>] ? nvme_dev_ioctl+0x1d/0x2b [nvme]
[ 1267.018399]  [<ffffffff81125916>] ? do_vfs_ioctl+0x3f2/0x43b
[ 1267.018402]  [<ffffffff8105ca6b>] ? finish_task_switch+0x84/0xc4
[ 1267.018406]  [<ffffffff81394a8c>] ? __schedule+0x45c/0x4f0
[ 1267.018408]  [<ffffffff811259ad>] ? SyS_ioctl+0x4e/0x7d
[ 1267.018411]  [<ffffffff8139c6d2>] ? system_call_fastpath+0x16/0x1b
[ 1267.018435] Code: 8b 4a 38 48 39 4e 38 72 12 74 06 b8 01 00 00 00 c3 48 8b 4a 60 48 39 4e 60 73 f0 c3 66 66 66 66 90 48 8b 87 e0 00 00 00 48 63 f6 <8b> 14 b0 48 8b 87 f8 00 00 00 48 8b 04 d0 c3 89 ff f0 48 0f ab
[ 1267.018438] RIP  [<ffffffff811c51b8>] blk_mq_map_queue+0xf/0x1e
[ 1267.018439]  RSP <ffff8800b5f15db0>
[ 1267.018440] CR2: 0000000000000004
[ 1267.018443] ---[ end trace 1e244ae5f3ceb23b ]---

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v5] conversion to blk-mq
  2014-06-02 22:49 ` [PATCH v5] " Keith Busch
@ 2014-06-02 23:06   ` Keith Busch
  2014-06-03 13:56     ` Matias Bjørling
  2014-06-03 20:12     ` Matias Bjorling
  0 siblings, 2 replies; 14+ messages in thread
From: Keith Busch @ 2014-06-02 23:06 UTC (permalink / raw)
  To: Keith Busch
  Cc: Matias Bjørling, willy, sbradshaw, axboe, linux-kernel,
	linux-nvme, hch

[-- Attachment #1: Type: TEXT/PLAIN, Size: 8442 bytes --]

Depending on the timing, it might fail in alloc instead of free:

Jun  2 16:45:40 kbgrz1 kernel: [  265.421243] NULL pointer dereference at           (null)
Jun  2 16:45:40 kbgrz1 kernel: [  265.434284] PGD 429acf067 PUD 42ce28067 PMD 0
Jun  2 16:45:40 kbgrz1 kernel: [  265.439565] Oops: 0000 [#1] SMP
Jun  2 16:45:40 kbgrz1 kernel: [  265.443413] Modules linked in: nvme parport_pc ppdev lp parport dlm sctp libcrc32c configfs nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc md4 hmac cifs bridge stp llc jfs joydev hid_generic usbhid hid loop md_mod x86_pkg_temp_thermal coretemp kvm_intel kvm iTCO_wdt iTCO_vendor_support crc32c_intel ghash_clmulni_intel aesni_intel aes_x86_64 glue_helper lrw gf128mul ablk_helper cryptd ehci_pci ehci_hcd microcode usbcore pcspkr ioatdma lpc_ich usb_common mfd_core i2c_i801 evdev wmi acpi_cpufreq tpm_tis ipmi_si tpm ipmi_msghandler processor thermal_sys button ext4 crc16 jbd2 mbcache dm_mod nbd sg sd_mod sr_mod crc_t10dif cdrom crct10dif_common isci libsas igb ahci scsi_transport_sas libahci ptp libata pps_core i2c_algo_bit i2c_core scsi_mod dca
Jun  2 16:45:40 kbgrz1 kernel: [  265.526398] CPU: 4 PID: 5454 Comm: nvme_id_ctrl Not tainted 3.15.0-rc1+ #2
Jun  2 16:45:40 kbgrz1 kernel: [  265.534181] Hardware name: Intel Corporation S2600GZ/S2600GZ, BIOS SE5C600.86B.02.02.0002.122320131210 12/23/2013
Jun  2 16:45:40 kbgrz1 kernel: [  265.545789] task: ffff88042e418390 ti: ffff8804283e6000 task.ti: ffff8804283e6000
Jun  2 16:45:40 kbgrz1 kernel: [  265.554270] RIP: 0010:[<ffffffff811c51b8>]  [<ffffffff811c51b8>] blk_mq_map_queue+0xf/0x1e
Jun  2 16:45:40 kbgrz1 kernel: [  265.563720] RSP: 0018:ffff8804283e7d80  EFLAGS: 00010286
Jun  2 16:45:40 kbgrz1 kernel: [  265.569755] RAX: 0000000000000000 RBX: ffff880035a06008 RCX: 00000000000f4240
Jun  2 16:45:40 kbgrz1 kernel: [  265.577830] RDX: 000000000028a360 RSI: 0000000000000000 RDI: ffff880035a06008
Jun  2 16:45:40 kbgrz1 kernel: [  265.585904] RBP: ffff88043f680000 R08: 0000000000000000 R09: 0000000000001000
Jun  2 16:45:40 kbgrz1 kernel: [  265.593979] R10: 0000000000001000 R11: 0000000000000410 R12: 00000000000000d0
Jun  2 16:45:40 kbgrz1 kernel: [  265.602053] R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000000000
Jun  2 16:45:40 kbgrz1 kernel: [  265.610134] FS:  00007f74b8bc9700(0000) GS:ffff88043f680000(0000) knlGS:0000000000000000
Jun  2 16:45:40 kbgrz1 kernel: [  265.619303] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Jun  2 16:45:40 kbgrz1 kernel: [  265.625824] CR2: 0000000000000000 CR3: 00000004292c4000 CR4: 00000000000407e0
Jun  2 16:45:40 kbgrz1 kernel: [  265.633889] Stack:
Jun  2 16:45:40 kbgrz1 kernel: [  265.636236]  ffffffff811c689f 0000000000000000 00000000fffffff4 ffff8804283e7e10
Jun  2 16:45:40 kbgrz1 kernel: [  265.644949]  ffff8804283e7e94 000000000000007d 00007fff4f8a73b0 0000000000000000
Jun  2 16:45:40 kbgrz1 kernel: [  265.653653]  ffffffffa055acc7 0000000000000246 000000002d0aaec0 ffff88042d0aaec0
Jun  2 16:45:40 kbgrz1 kernel: [  265.662358] Call Trace:
Jun  2 16:45:40 kbgrz1 kernel: [  265.665194]  [<ffffffff811c689f>] ? blk_mq_alloc_request+0x54/0xd5
Jun  2 16:45:40 kbgrz1 kernel: [  265.672217]  [<ffffffffa055acc7>] ? __nvme_submit_admin_cmd+0x2d/0x68 [nvme]
Jun  2 16:45:40 kbgrz1 kernel: [  265.680196]  [<ffffffffa055c879>] ? nvme_user_admin_cmd+0x144/0x1b1 [nvme]
Jun  2 16:45:40 kbgrz1 kernel: [  265.687987]  [<ffffffffa055c903>] ? nvme_dev_ioctl+0x1d/0x2b [nvme]
Jun  2 16:45:40 kbgrz1 kernel: [  265.695107]  [<ffffffff81125916>] ? do_vfs_ioctl+0x3f2/0x43b
Jun  2 16:45:40 kbgrz1 kernel: [  265.701547]  [<ffffffff8105ca6b>] ? finish_task_switch+0x84/0xc4
Jun  2 16:45:40 kbgrz1 kernel: [  265.708382]  [<ffffffff81394a8c>] ? __schedule+0x45c/0x4f0
Jun  2 16:45:40 kbgrz1 kernel: [  265.714603]  [<ffffffff811259ad>] ? SyS_ioctl+0x4e/0x7d
Jun  2 16:45:40 kbgrz1 kernel: [  265.720555]  [<ffffffff8139c6d2>] ? system_call_fastpath+0x16/0x1b
Jun  2 16:45:40 kbgrz1 kernel: [  265.727560] Code: 8b 4a 38 48 39 4e 38 72 12 74 06 b8 01 00 00 00 c3 48 8b 4a 60 48 39 4e 60 73 f0 c3 66 66 66 66 90 48 8b 87 e0 00 00 00 48 63 f6 <8b> 14 b0 48 8b 87 f8 00 00 00 48 8b 04 d0 c3 89 ff f0 48 0f ab
Jun  2 16:45:40 kbgrz1 kernel: [  265.760706]  RSP <ffff8804283e7d80>
Jun  2 16:45:40 kbgrz1 kernel: [  265.764705] CR2: 0000000000000000
Jun  2 16:45:40 kbgrz1 kernel: [  265.768531] ---[ end trace 785048a51785f51e ]---

On Mon, 2 Jun 2014, Keith Busch wrote:
> On Mon, 2 Jun 2014, Matias Bjørling wrote:
>> Hi Matthew and Keith,
>> 
>> Here is an updated patch with the feedback from the previous days. It's 
>> against
>> Jens' for-3.16/core tree. You may use the nvmemq_wip_review branch at:
>
> I'm testing this on my normal hardware now. As I feared, hot removal
> doesn't work while the device is actively being managed. Here's the oops:
>
> [ 1267.018283] BUG: unable to handle kernel NULL pointer dereference at 
> 0000000000000004
> [ 1267.018288] IP: [<ffffffff811c51b8>] blk_mq_map_queue+0xf/0x1e
> [ 1267.018292] PGD b5ed5067 PUD b57e2067 PMD 0
> [ 1267.018294] Oops: 0000 [#1] SMP
> [ 1267.018343] Modules linked in: nvme parport_pc ppdev lp parport dlm sctp 
> libcrc32c configfs nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache 
> sunrpc md4 hmac cifs bridge stp llc joydev jfs hid_generic usbhid hid loop 
> md_mod x86_pkg_temp_thermal coretemp kvm_intel kvm iTCO_wdt 
> iTCO_vendor_support crc32c_intel ghash_clmulni_intel aesni_intel aes_x86_64 
> glue_helper lrw gf128mul ablk_helper cryptd microcode ehci_pci ehci_hcd 
> usbcore pcspkr lpc_ich acpi_cpufreq ioatdma mfd_core usb_common i2c_i801 
> evdev wmi ipmi_si tpm_tis ipmi_msghandler tpm processor thermal_sys button 
> ext4 crc16 jbd2 mbcache dm_mod nbd sg sr_mod cdrom sd_mod crc_t10dif 
> crct10dif_common isci libsas ahci scsi_transport_sas libahci igb libata ptp 
> pps_core i2c_algo_bit scsi_mod i2c_core dca [last unloaded: nvme]
> [ 1267.018346] CPU: 1 PID: 6618 Comm: nvme_id_ctrl Tainted: G        W 
> 3.15.0-rc1+ #2
> [ 1267.018347] Hardware name: Intel Corporation S2600GZ/S2600GZ, BIOS 
> SE5C600.86B.02.02.0002.122320131210 12/23/2013
> [ 1267.018349] task: ffff88042879eef0 ti: ffff8800b5f14000 task.ti: 
> ffff8800b5f14000
> [ 1267.018354] RIP: 0010:[<ffffffff811c51b8>]  [<ffffffff811c51b8>] 
> blk_mq_map_queue+0xf/0x1e
> [ 1267.018356] RSP: 0018:ffff8800b5f15db0  EFLAGS: 00010206
> [ 1267.018357] RAX: 0000000000000000 RBX: ffffe8fbffa21e80 RCX: 
> ffff88042f3952e0
> [ 1267.018359] RDX: 0000000000001c10 RSI: 0000000000000001 RDI: 
> ffff8800b5e9f008
> [ 1267.018360] RBP: ffff88042d993480 R08: ffff8800b5e18cc0 R09: 
> 0000000000000000
> [ 1267.018362] R10: 0000000000000ca0 R11: 0000000000000ca0 R12: 
> ffff8800b5f15e94
> [ 1267.018363] R13: 0000000000003a98 R14: 00007fffc7b44c40 R15: 
> 0000000000000000
> [ 1267.018365] FS:  00007feb4cb05700(0000) GS:ffff88043f620000(0000) 
> knlGS:0000000000000000
> [ 1267.018367] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 1267.018369] CR2: 0000000000000004 CR3: 00000000b50f2000 CR4: 
> 00000000000407e0
> [ 1267.018369] Stack:
> [ 1267.018372]  ffffffff811c6334 00000000fffffffc ffff88042d993480 
> 00000000fffffffc
> [ 1267.018375]  ffffffffa0532d73 0000000000000ca0 00000000fffffff4 
> ffff88042562b440
> [ 1267.018378]  ffff88042d853000 0000000000001000 ffffffffa05348f3 
> ffffc9001518b2a8
> [ 1267.018378] Call Trace:
> [ 1267.018383]  [<ffffffff811c6334>] ? blk_mq_free_request+0x37/0x48
> [ 1267.018388]  [<ffffffffa0532d73>] ? __nvme_submit_admin_cmd+0x7f/0x8a 
> [nvme]
> [ 1267.018392]  [<ffffffffa05348f3>] ? nvme_user_admin_cmd+0x144/0x1b1 [nvme]
> [ 1267.018397]  [<ffffffffa053497d>] ? nvme_dev_ioctl+0x1d/0x2b [nvme]
> [ 1267.018399]  [<ffffffff81125916>] ? do_vfs_ioctl+0x3f2/0x43b
> [ 1267.018402]  [<ffffffff8105ca6b>] ? finish_task_switch+0x84/0xc4
> [ 1267.018406]  [<ffffffff81394a8c>] ? __schedule+0x45c/0x4f0
> [ 1267.018408]  [<ffffffff811259ad>] ? SyS_ioctl+0x4e/0x7d
> [ 1267.018411]  [<ffffffff8139c6d2>] ? system_call_fastpath+0x16/0x1b
> [ 1267.018435] Code: 8b 4a 38 48 39 4e 38 72 12 74 06 b8 01 00 00 00 c3 48 8b 
> 4a 60 48 39 4e 60 73 f0 c3 66 66 66 66 90 48 8b 87 e0 00 00 00 48 63 f6 <8b> 
> 14 b0 48 8b 87 f8 00 00 00 48 8b 04 d0 c3 89 ff f0 48 0f ab
> [ 1267.018438] RIP  [<ffffffff811c51b8>] blk_mq_map_queue+0xf/0x1e
> [ 1267.018439]  RSP <ffff8800b5f15db0>
> [ 1267.018440] CR2: 0000000000000004
> [ 1267.018443] ---[ end trace 1e244ae5f3ceb23b ]---
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v5] conversion to blk-mq
  2014-06-02 23:06   ` Keith Busch
@ 2014-06-03 13:56     ` Matias Bjørling
  2014-06-03 20:12     ` Matias Bjorling
  1 sibling, 0 replies; 14+ messages in thread
From: Matias Bjørling @ 2014-06-03 13:56 UTC (permalink / raw)
  To: Keith Busch; +Cc: willy, sbradshaw, axboe, linux-kernel, linux-nvme, hch

On 06/03/2014 01:06 AM, Keith Busch wrote:
> Depending on the timing, it might fail in alloc instead of free:
>
> Jun  2 16:45:40 kbgrz1 kernel: [  265.421243] NULL pointer dereference
> at           (null)
> Jun  2 16:45:40 kbgrz1 kernel: [  265.434284] PGD 429acf067 PUD
> 42ce28067 PMD 0
> Jun  2 16:45:40 kbgrz1 kernel: [  265.439565] Oops: 0000 [#1] SMP
> Jun  2 16:45:40 kbgrz1 kernel: [  265.443413] Modules linked in: nvme
> parport_pc ppdev lp parport dlm sctp libcrc32c configfs nfsd auth_rpcgss
> oid_registry nfs_acl nfs lockd fscache sunrpc md4 hmac cifs bridge stp
> llc jfs joydev hid_generic usbhid hid loop md_mod x86_pkg_temp_thermal
> coretemp kvm_intel kvm iTCO_wdt iTCO_vendor_support crc32c_intel
> ghash_clmulni_intel aesni_intel aes_x86_64 glue_helper lrw gf128mul
> ablk_helper cryptd ehci_pci ehci_hcd microcode usbcore pcspkr ioatdma
> lpc_ich usb_common mfd_core i2c_i801 evdev wmi acpi_cpufreq tpm_tis
> ipmi_si tpm ipmi_msghandler processor thermal_sys button ext4 crc16 jbd2
> mbcache dm_mod nbd sg sd_mod sr_mod crc_t10dif cdrom crct10dif_common
> isci libsas igb ahci scsi_transport_sas libahci ptp libata pps_core
> i2c_algo_bit i2c_core scsi_mod dca
> Jun  2 16:45:40 kbgrz1 kernel: [  265.526398] CPU: 4 PID: 5454 Comm:
> nvme_id_ctrl Not tainted 3.15.0-rc1+ #2
> Jun  2 16:45:40 kbgrz1 kernel: [  265.534181] Hardware name: Intel
> Corporation S2600GZ/S2600GZ, BIOS SE5C600.86B.02.02.0002.122320131210
> 12/23/2013
> Jun  2 16:45:40 kbgrz1 kernel: [  265.545789] task: ffff88042e418390 ti:
> ffff8804283e6000 task.ti: ffff8804283e6000
> Jun  2 16:45:40 kbgrz1 kernel: [  265.554270] RIP:
> 0010:[<ffffffff811c51b8>]  [<ffffffff811c51b8>] blk_mq_map_queue+0xf/0x1e
> Jun  2 16:45:40 kbgrz1 kernel: [  265.563720] RSP:
> 0018:ffff8804283e7d80  EFLAGS: 00010286
> Jun  2 16:45:40 kbgrz1 kernel: [  265.569755] RAX: 0000000000000000 RBX:
> ffff880035a06008 RCX: 00000000000f4240
> Jun  2 16:45:40 kbgrz1 kernel: [  265.577830] RDX: 000000000028a360 RSI:
> 0000000000000000 RDI: ffff880035a06008
> Jun  2 16:45:40 kbgrz1 kernel: [  265.585904] RBP: ffff88043f680000 R08:
> 0000000000000000 R09: 0000000000001000
> Jun  2 16:45:40 kbgrz1 kernel: [  265.593979] R10: 0000000000001000 R11:
> 0000000000000410 R12: 00000000000000d0
> Jun  2 16:45:40 kbgrz1 kernel: [  265.602053] R13: 0000000000000001 R14:
> 0000000000000000 R15: 0000000000000000
> Jun  2 16:45:40 kbgrz1 kernel: [  265.610134] FS:
> 00007f74b8bc9700(0000) GS:ffff88043f680000(0000) knlGS:0000000000000000
> Jun  2 16:45:40 kbgrz1 kernel: [  265.619303] CS:  0010 DS: 0000 ES:
> 0000 CR0: 0000000080050033
> Jun  2 16:45:40 kbgrz1 kernel: [  265.625824] CR2: 0000000000000000 CR3:
> 00000004292c4000 CR4: 00000000000407e0
> Jun  2 16:45:40 kbgrz1 kernel: [  265.633889] Stack:
> Jun  2 16:45:40 kbgrz1 kernel: [  265.636236]  ffffffff811c689f
> 0000000000000000 00000000fffffff4 ffff8804283e7e10
> Jun  2 16:45:40 kbgrz1 kernel: [  265.644949]  ffff8804283e7e94
> 000000000000007d 00007fff4f8a73b0 0000000000000000
> Jun  2 16:45:40 kbgrz1 kernel: [  265.653653]  ffffffffa055acc7
> 0000000000000246 000000002d0aaec0 ffff88042d0aaec0
> Jun  2 16:45:40 kbgrz1 kernel: [  265.662358] Call Trace:
> Jun  2 16:45:40 kbgrz1 kernel: [  265.665194]  [<ffffffff811c689f>] ?
> blk_mq_alloc_request+0x54/0xd5
> Jun  2 16:45:40 kbgrz1 kernel: [  265.672217]  [<ffffffffa055acc7>] ?
> __nvme_submit_admin_cmd+0x2d/0x68 [nvme]
> Jun  2 16:45:40 kbgrz1 kernel: [  265.680196]  [<ffffffffa055c879>] ?
> nvme_user_admin_cmd+0x144/0x1b1 [nvme]
> Jun  2 16:45:40 kbgrz1 kernel: [  265.687987]  [<ffffffffa055c903>] ?
> nvme_dev_ioctl+0x1d/0x2b [nvme]
> Jun  2 16:45:40 kbgrz1 kernel: [  265.695107]  [<ffffffff81125916>] ?
> do_vfs_ioctl+0x3f2/0x43b
> Jun  2 16:45:40 kbgrz1 kernel: [  265.701547]  [<ffffffff8105ca6b>] ?
> finish_task_switch+0x84/0xc4
> Jun  2 16:45:40 kbgrz1 kernel: [  265.708382]  [<ffffffff81394a8c>] ?
> __schedule+0x45c/0x4f0
> Jun  2 16:45:40 kbgrz1 kernel: [  265.714603]  [<ffffffff811259ad>] ?
> SyS_ioctl+0x4e/0x7d
> Jun  2 16:45:40 kbgrz1 kernel: [  265.720555]  [<ffffffff8139c6d2>] ?
> system_call_fastpath+0x16/0x1b
> Jun  2 16:45:40 kbgrz1 kernel: [  265.727560] Code: 8b 4a 38 48 39 4e 38
> 72 12 74 06 b8 01 00 00 00 c3 48 8b 4a 60 48 39 4e 60 73 f0 c3 66 66 66
> 66 90 48 8b 87 e0 00 00 00 48 63 f6 <8b> 14 b0 48 8b 87 f8 00 00 00 48
> 8b 04 d0 c3 89 ff f0 48 0f ab
> Jun  2 16:45:40 kbgrz1 kernel: [  265.760706]  RSP <ffff8804283e7d80>
> Jun  2 16:45:40 kbgrz1 kernel: [  265.764705] CR2: 0000000000000000
> Jun  2 16:45:40 kbgrz1 kernel: [  265.768531] ---[ end trace
> 785048a51785f51e ]---
>
> On Mon, 2 Jun 2014, Keith Busch wrote:
>> On Mon, 2 Jun 2014, Matias Bjørling wrote:
>>> Hi Matthew and Keith,
>>>
>>> Here is an updated patch with the feedback from the previous days.
>>> It's against
>>> Jens' for-3.16/core tree. You may use the nvmemq_wip_review branch at:
>>
>> I'm testing this on my normal hardware now. As I feared, hot removal
>> doesn't work while the device is actively being managed. Here's the oops:
>>
>> [ 1267.018283] BUG: unable to handle kernel NULL pointer dereference
>> at 0000000000000004
>> [ 1267.018288] IP: [<ffffffff811c51b8>] blk_mq_map_queue+0xf/0x1e
>> [ 1267.018292] PGD b5ed5067 PUD b57e2067 PMD 0
>> [ 1267.018294] Oops: 0000 [#1] SMP
>> [ 1267.018343] Modules linked in: nvme parport_pc ppdev lp parport dlm
>> sctp libcrc32c configfs nfsd auth_rpcgss oid_registry nfs_acl nfs
>> lockd fscache sunrpc md4 hmac cifs bridge stp llc joydev jfs
>> hid_generic usbhid hid loop md_mod x86_pkg_temp_thermal coretemp
>> kvm_intel kvm iTCO_wdt iTCO_vendor_support crc32c_intel
>> ghash_clmulni_intel aesni_intel aes_x86_64 glue_helper lrw gf128mul
>> ablk_helper cryptd microcode ehci_pci ehci_hcd usbcore pcspkr lpc_ich
>> acpi_cpufreq ioatdma mfd_core usb_common i2c_i801 evdev wmi ipmi_si
>> tpm_tis ipmi_msghandler tpm processor thermal_sys button ext4 crc16
>> jbd2 mbcache dm_mod nbd sg sr_mod cdrom sd_mod crc_t10dif
>> crct10dif_common isci libsas ahci scsi_transport_sas libahci igb
>> libata ptp pps_core i2c_algo_bit scsi_mod i2c_core dca [last unloaded:
>> nvme]
>> [ 1267.018346] CPU: 1 PID: 6618 Comm: nvme_id_ctrl Tainted: G        W
>> 3.15.0-rc1+ #2
>> [ 1267.018347] Hardware name: Intel Corporation S2600GZ/S2600GZ, BIOS
>> SE5C600.86B.02.02.0002.122320131210 12/23/2013
>> [ 1267.018349] task: ffff88042879eef0 ti: ffff8800b5f14000 task.ti:
>> ffff8800b5f14000
>> [ 1267.018354] RIP: 0010:[<ffffffff811c51b8>]  [<ffffffff811c51b8>]
>> blk_mq_map_queue+0xf/0x1e
>> [ 1267.018356] RSP: 0018:ffff8800b5f15db0  EFLAGS: 00010206
>> [ 1267.018357] RAX: 0000000000000000 RBX: ffffe8fbffa21e80 RCX:
>> ffff88042f3952e0
>> [ 1267.018359] RDX: 0000000000001c10 RSI: 0000000000000001 RDI:
>> ffff8800b5e9f008
>> [ 1267.018360] RBP: ffff88042d993480 R08: ffff8800b5e18cc0 R09:
>> 0000000000000000
>> [ 1267.018362] R10: 0000000000000ca0 R11: 0000000000000ca0 R12:
>> ffff8800b5f15e94
>> [ 1267.018363] R13: 0000000000003a98 R14: 00007fffc7b44c40 R15:
>> 0000000000000000
>> [ 1267.018365] FS:  00007feb4cb05700(0000) GS:ffff88043f620000(0000)
>> knlGS:0000000000000000
>> [ 1267.018367] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [ 1267.018369] CR2: 0000000000000004 CR3: 00000000b50f2000 CR4:
>> 00000000000407e0
>> [ 1267.018369] Stack:
>> [ 1267.018372]  ffffffff811c6334 00000000fffffffc ffff88042d993480
>> 00000000fffffffc
>> [ 1267.018375]  ffffffffa0532d73 0000000000000ca0 00000000fffffff4
>> ffff88042562b440
>> [ 1267.018378]  ffff88042d853000 0000000000001000 ffffffffa05348f3
>> ffffc9001518b2a8
>> [ 1267.018378] Call Trace:
>> [ 1267.018383]  [<ffffffff811c6334>] ? blk_mq_free_request+0x37/0x48
>> [ 1267.018388]  [<ffffffffa0532d73>] ?
>> __nvme_submit_admin_cmd+0x7f/0x8a [nvme]
>> [ 1267.018392]  [<ffffffffa05348f3>] ? nvme_user_admin_cmd+0x144/0x1b1
>> [nvme]
>> [ 1267.018397]  [<ffffffffa053497d>] ? nvme_dev_ioctl+0x1d/0x2b [nvme]
>> [ 1267.018399]  [<ffffffff81125916>] ? do_vfs_ioctl+0x3f2/0x43b
>> [ 1267.018402]  [<ffffffff8105ca6b>] ? finish_task_switch+0x84/0xc4
>> [ 1267.018406]  [<ffffffff81394a8c>] ? __schedule+0x45c/0x4f0
>> [ 1267.018408]  [<ffffffff811259ad>] ? SyS_ioctl+0x4e/0x7d
>> [ 1267.018411]  [<ffffffff8139c6d2>] ? system_call_fastpath+0x16/0x1b
>> [ 1267.018435] Code: 8b 4a 38 48 39 4e 38 72 12 74 06 b8 01 00 00 00
>> c3 48 8b 4a 60 48 39 4e 60 73 f0 c3 66 66 66 66 90 48 8b 87 e0 00 00
>> 00 48 63 f6 <8b> 14 b0 48 8b 87 f8 00 00 00 48 8b 04 d0 c3 89 ff f0 48
>> 0f ab
>> [ 1267.018438] RIP  [<ffffffff811c51b8>] blk_mq_map_queue+0xf/0x1e
>> [ 1267.018439]  RSP <ffff8800b5f15db0>
>> [ 1267.018440] CR2: 0000000000000004
>> [ 1267.018443] ---[ end trace 1e244ae5f3ceb23b ]---
>>

Thanks. I'll look into it.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v5] conversion to blk-mq
  2014-06-02 23:06   ` Keith Busch
  2014-06-03 13:56     ` Matias Bjørling
@ 2014-06-03 20:12     ` Matias Bjorling
  2014-06-03 22:23       ` Keith Busch
  1 sibling, 1 reply; 14+ messages in thread
From: Matias Bjorling @ 2014-06-03 20:12 UTC (permalink / raw)
  To: Keith Busch; +Cc: willy, sbradshaw, axboe, linux-kernel, linux-nvme, hch

On 06/03/2014 01:06 AM, Keith Busch wrote:
> Depending on the timing, it might fail in alloc instead of free:
> 
> Jun  2 16:45:40 kbgrz1 kernel: [  265.421243] NULL pointer dereference
> at           (null)
> Jun  2 16:45:40 kbgrz1 kernel: [  265.434284] PGD 429acf067 PUD
> 42ce28067 PMD 0
> Jun  2 16:45:40 kbgrz1 kernel: [  265.439565] Oops: 0000 [#1] SMP
> Jun  2 16:45:40 kbgrz1 kernel: [  265.443413] Modules linked in: nvme
> parport_pc ppdev lp parport dlm sctp libcrc32c configfs nfsd auth_rpcgss
> oid_registry nfs_acl nfs lockd fscache sunrpc md4 hmac cifs bridge stp
> llc jfs joydev hid_generic usbhid hid loop md_mod x86_pkg_temp_thermal
> coretemp kvm_intel kvm iTCO_wdt iTCO_vendor_support crc32c_intel
> ghash_clmulni_intel aesni_intel aes_x86_64 glue_helper lrw gf128mul
> ablk_helper cryptd ehci_pci ehci_hcd microcode usbcore pcspkr ioatdma
> lpc_ich usb_common mfd_core i2c_i801 evdev wmi acpi_cpufreq tpm_tis
> ipmi_si tpm ipmi_msghandler processor thermal_sys button ext4 crc16 jbd2
> mbcache dm_mod nbd sg sd_mod sr_mod crc_t10dif cdrom crct10dif_common
> isci libsas igb ahci scsi_transport_sas libahci ptp libata pps_core
> i2c_algo_bit i2c_core scsi_mod dca
> Jun  2 16:45:40 kbgrz1 kernel: [  265.526398] CPU: 4 PID: 5454 Comm:
> nvme_id_ctrl Not tainted 3.15.0-rc1+ #2
> Jun  2 16:45:40 kbgrz1 kernel: [  265.534181] Hardware name: Intel
> Corporation S2600GZ/S2600GZ, BIOS SE5C600.86B.02.02.0002.122320131210
> 12/23/2013
> Jun  2 16:45:40 kbgrz1 kernel: [  265.545789] task: ffff88042e418390 ti:
> ffff8804283e6000 task.ti: ffff8804283e6000
> Jun  2 16:45:40 kbgrz1 kernel: [  265.554270] RIP:
> 0010:[<ffffffff811c51b8>]  [<ffffffff811c51b8>] blk_mq_map_queue+0xf/0x1e
> Jun  2 16:45:40 kbgrz1 kernel: [  265.563720] RSP:
> 0018:ffff8804283e7d80  EFLAGS: 00010286
> Jun  2 16:45:40 kbgrz1 kernel: [  265.569755] RAX: 0000000000000000 RBX:
> ffff880035a06008 RCX: 00000000000f4240
> Jun  2 16:45:40 kbgrz1 kernel: [  265.577830] RDX: 000000000028a360 RSI:
> 0000000000000000 RDI: ffff880035a06008
> Jun  2 16:45:40 kbgrz1 kernel: [  265.585904] RBP: ffff88043f680000 R08:
> 0000000000000000 R09: 0000000000001000
> Jun  2 16:45:40 kbgrz1 kernel: [  265.593979] R10: 0000000000001000 R11:
> 0000000000000410 R12: 00000000000000d0
> Jun  2 16:45:40 kbgrz1 kernel: [  265.602053] R13: 0000000000000001 R14:
> 0000000000000000 R15: 0000000000000000
> Jun  2 16:45:40 kbgrz1 kernel: [  265.610134] FS: 
> 00007f74b8bc9700(0000) GS:ffff88043f680000(0000) knlGS:0000000000000000
> Jun  2 16:45:40 kbgrz1 kernel: [  265.619303] CS:  0010 DS: 0000 ES:
> 0000 CR0: 0000000080050033
> Jun  2 16:45:40 kbgrz1 kernel: [  265.625824] CR2: 0000000000000000 CR3:
> 00000004292c4000 CR4: 00000000000407e0
> Jun  2 16:45:40 kbgrz1 kernel: [  265.633889] Stack:
> Jun  2 16:45:40 kbgrz1 kernel: [  265.636236]  ffffffff811c689f
> 0000000000000000 00000000fffffff4 ffff8804283e7e10
> Jun  2 16:45:40 kbgrz1 kernel: [  265.644949]  ffff8804283e7e94
> 000000000000007d 00007fff4f8a73b0 0000000000000000
> Jun  2 16:45:40 kbgrz1 kernel: [  265.653653]  ffffffffa055acc7
> 0000000000000246 000000002d0aaec0 ffff88042d0aaec0
> Jun  2 16:45:40 kbgrz1 kernel: [  265.662358] Call Trace:
> Jun  2 16:45:40 kbgrz1 kernel: [  265.665194]  [<ffffffff811c689f>] ?
> blk_mq_alloc_request+0x54/0xd5
> Jun  2 16:45:40 kbgrz1 kernel: [  265.672217]  [<ffffffffa055acc7>] ?
> __nvme_submit_admin_cmd+0x2d/0x68 [nvme]
> Jun  2 16:45:40 kbgrz1 kernel: [  265.680196]  [<ffffffffa055c879>] ?
> nvme_user_admin_cmd+0x144/0x1b1 [nvme]
> Jun  2 16:45:40 kbgrz1 kernel: [  265.687987]  [<ffffffffa055c903>] ?
> nvme_dev_ioctl+0x1d/0x2b [nvme]
> Jun  2 16:45:40 kbgrz1 kernel: [  265.695107]  [<ffffffff81125916>] ?
> do_vfs_ioctl+0x3f2/0x43b
> Jun  2 16:45:40 kbgrz1 kernel: [  265.701547]  [<ffffffff8105ca6b>] ?
> finish_task_switch+0x84/0xc4
> Jun  2 16:45:40 kbgrz1 kernel: [  265.708382]  [<ffffffff81394a8c>] ?
> __schedule+0x45c/0x4f0
> Jun  2 16:45:40 kbgrz1 kernel: [  265.714603]  [<ffffffff811259ad>] ?
> SyS_ioctl+0x4e/0x7d
> Jun  2 16:45:40 kbgrz1 kernel: [  265.720555]  [<ffffffff8139c6d2>] ?
> system_call_fastpath+0x16/0x1b
> Jun  2 16:45:40 kbgrz1 kernel: [  265.727560] Code: 8b 4a 38 48 39 4e 38
> 72 12 74 06 b8 01 00 00 00 c3 48 8b 4a 60 48 39 4e 60 73 f0 c3 66 66 66
> 66 90 48 8b 87 e0 00 00 00 48 63 f6 <8b> 14 b0 48 8b 87 f8 00 00 00 48
> 8b 04 d0 c3 89 ff f0 48 0f ab
> Jun  2 16:45:40 kbgrz1 kernel: [  265.760706]  RSP <ffff8804283e7d80>
> Jun  2 16:45:40 kbgrz1 kernel: [  265.764705] CR2: 0000000000000000
> Jun  2 16:45:40 kbgrz1 kernel: [  265.768531] ---[ end trace
> 785048a51785f51e ]---
> 
> On Mon, 2 Jun 2014, Keith Busch wrote:
>> On Mon, 2 Jun 2014, Matias Bjørling wrote:
>>> Hi Matthew and Keith,
>>>
>>> Here is an updated patch with the feedback from the previous days.
>>> It's against
>>> Jens' for-3.16/core tree. You may use the nvmemq_wip_review branch at:
>>
>> I'm testing this on my normal hardware now. As I feared, hot removal
>> doesn't work while the device is actively being managed. Here's the oops:
>>
>> [ 1267.018283] BUG: unable to handle kernel NULL pointer dereference
>> at 0000000000000004
>> [ 1267.018288] IP: [<ffffffff811c51b8>] blk_mq_map_queue+0xf/0x1e
>> [ 1267.018292] PGD b5ed5067 PUD b57e2067 PMD 0
>> [ 1267.018294] Oops: 0000 [#1] SMP
>> [ 1267.018343] Modules linked in: nvme parport_pc ppdev lp parport dlm
>> sctp libcrc32c configfs nfsd auth_rpcgss oid_registry nfs_acl nfs
>> lockd fscache sunrpc md4 hmac cifs bridge stp llc joydev jfs
>> hid_generic usbhid hid loop md_mod x86_pkg_temp_thermal coretemp
>> kvm_intel kvm iTCO_wdt iTCO_vendor_support crc32c_intel
>> ghash_clmulni_intel aesni_intel aes_x86_64 glue_helper lrw gf128mul
>> ablk_helper cryptd microcode ehci_pci ehci_hcd usbcore pcspkr lpc_ich
>> acpi_cpufreq ioatdma mfd_core usb_common i2c_i801 evdev wmi ipmi_si
>> tpm_tis ipmi_msghandler tpm processor thermal_sys button ext4 crc16
>> jbd2 mbcache dm_mod nbd sg sr_mod cdrom sd_mod crc_t10dif
>> crct10dif_common isci libsas ahci scsi_transport_sas libahci igb
>> libata ptp pps_core i2c_algo_bit scsi_mod i2c_core dca [last unloaded:
>> nvme]
>> [ 1267.018346] CPU: 1 PID: 6618 Comm: nvme_id_ctrl Tainted: G        W
>> 3.15.0-rc1+ #2
>> [ 1267.018347] Hardware name: Intel Corporation S2600GZ/S2600GZ, BIOS
>> SE5C600.86B.02.02.0002.122320131210 12/23/2013
>> [ 1267.018349] task: ffff88042879eef0 ti: ffff8800b5f14000 task.ti:
>> ffff8800b5f14000
>> [ 1267.018354] RIP: 0010:[<ffffffff811c51b8>]  [<ffffffff811c51b8>]
>> blk_mq_map_queue+0xf/0x1e
>> [ 1267.018356] RSP: 0018:ffff8800b5f15db0  EFLAGS: 00010206
>> [ 1267.018357] RAX: 0000000000000000 RBX: ffffe8fbffa21e80 RCX:
>> ffff88042f3952e0
>> [ 1267.018359] RDX: 0000000000001c10 RSI: 0000000000000001 RDI:
>> ffff8800b5e9f008
>> [ 1267.018360] RBP: ffff88042d993480 R08: ffff8800b5e18cc0 R09:
>> 0000000000000000
>> [ 1267.018362] R10: 0000000000000ca0 R11: 0000000000000ca0 R12:
>> ffff8800b5f15e94
>> [ 1267.018363] R13: 0000000000003a98 R14: 00007fffc7b44c40 R15:
>> 0000000000000000
>> [ 1267.018365] FS:  00007feb4cb05700(0000) GS:ffff88043f620000(0000)
>> knlGS:0000000000000000
>> [ 1267.018367] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [ 1267.018369] CR2: 0000000000000004 CR3: 00000000b50f2000 CR4:
>> 00000000000407e0
>> [ 1267.018369] Stack:
>> [ 1267.018372]  ffffffff811c6334 00000000fffffffc ffff88042d993480
>> 00000000fffffffc
>> [ 1267.018375]  ffffffffa0532d73 0000000000000ca0 00000000fffffff4
>> ffff88042562b440
>> [ 1267.018378]  ffff88042d853000 0000000000001000 ffffffffa05348f3
>> ffffc9001518b2a8
>> [ 1267.018378] Call Trace:
>> [ 1267.018383]  [<ffffffff811c6334>] ? blk_mq_free_request+0x37/0x48
>> [ 1267.018388]  [<ffffffffa0532d73>] ?
>> __nvme_submit_admin_cmd+0x7f/0x8a [nvme]
>> [ 1267.018392]  [<ffffffffa05348f3>] ? nvme_user_admin_cmd+0x144/0x1b1
>> [nvme]
>> [ 1267.018397]  [<ffffffffa053497d>] ? nvme_dev_ioctl+0x1d/0x2b [nvme]
>> [ 1267.018399]  [<ffffffff81125916>] ? do_vfs_ioctl+0x3f2/0x43b
>> [ 1267.018402]  [<ffffffff8105ca6b>] ? finish_task_switch+0x84/0xc4
>> [ 1267.018406]  [<ffffffff81394a8c>] ? __schedule+0x45c/0x4f0
>> [ 1267.018408]  [<ffffffff811259ad>] ? SyS_ioctl+0x4e/0x7d
>> [ 1267.018411]  [<ffffffff8139c6d2>] ? system_call_fastpath+0x16/0x1b
>> [ 1267.018435] Code: 8b 4a 38 48 39 4e 38 72 12 74 06 b8 01 00 00 00
>> c3 48 8b 4a 60 48 39 4e 60 73 f0 c3 66 66 66 66 90 48 8b 87 e0 00 00
>> 00 48 63 f6 <8b> 14 b0 48 8b 87 f8 00 00 00 48 8b 04 d0 c3 89 ff f0 48
>> 0f ab
>> [ 1267.018438] RIP  [<ffffffff811c51b8>] blk_mq_map_queue+0xf/0x1e
>> [ 1267.018439]  RSP <ffff8800b5f15db0>
>> [ 1267.018440] CR2: 0000000000000004
>> [ 1267.018443] ---[ end trace 1e244ae5f3ceb23b ]---
>>

Keith, will you take the nvmemq_wip_v6 branch for a spin? Thanks!

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v5] conversion to blk-mq
  2014-06-03 20:12     ` Matias Bjorling
@ 2014-06-03 22:23       ` Keith Busch
  2014-06-03 22:27         ` Keith Busch
  0 siblings, 1 reply; 14+ messages in thread
From: Keith Busch @ 2014-06-03 22:23 UTC (permalink / raw)
  To: Matias Bjorling
  Cc: Keith Busch, willy, sbradshaw, axboe, linux-kernel, linux-nvme,
	hch

On Tue, 3 Jun 2014, Matias Bjorling wrote:
>
> Keith, will you take the nvmemq_wip_v6 branch for a spin? Thanks!

Still fails as before:

[   88.933881] BUG: unable to handle kernel NULL pointer dereference at 0000000000000014
[   88.942900] IP: [<ffffffff811c51b8>] blk_mq_map_queue+0xf/0x1e
[   88.949605] PGD 427be0067 PUD 425495067 PMD 0
[   88.954915] Oops: 0000 [#1] SMP
[   88.958787] Modules linked in: nvme parport_pc ppdev lp parport dlm sctp libcrc32c configfs nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc md4 hmac cifs bridge stp llc joydev jfs hid_generic usbhid hid loop md_mod x86_pkg_temp_thermal coretemp kvm_intel kvm iTCO_wdt iTCO_vendor_support crc32c_intel ghash_clmulni_intel aesni_intel aes_x86_64 glue_helper lrw gf128mul ablk_helper cryptd microcode ehci_pci ehci_hcd pcspkr usbcore lpc_ich ioatdma usb_common mfd_core evdev i2c_i801 wmi acpi_cpufreq tpm_tis tpm ipmi_si ipmi_msghandler processor thermal_sys button ext4 crc16 jbd2 mbcache dm_mod nbd sg sd_mod sr_mod crc_t10dif cdrom crct10dif_common isci ahci libsas igb libahci scsi_transport_sas ptp pps_core i2c_algo_bit libata i2c_core scsi_mod dca
[   89.042529] CPU: 5 PID: 4544 Comm: nvme_id_ctrl Not tainted 3.15.0-rc1+ #3
[   89.050295] Hardware name: Intel Corporation S2600GZ/S2600GZ, BIOS SE5C600.86B.02.02.0002.122320131210 12/23/2013
[   89.061856] task: ffff88042bbdb0d0 ti: ffff88042c24c000 task.ti: ffff88042c24c000
[   89.070305] RIP: 0010:[<ffffffff811c51b8>]  [<ffffffff811c51b8>] blk_mq_map_queue+0xf/0x1e
[   89.079747] RSP: 0018:ffff88042c24dda0  EFLAGS: 00010202
[   89.085795] RAX: 0000000000000000 RBX: ffffe8fbffaa1b00 RCX: ffff88042e8ec4b0
[   89.093868] RDX: 0000000000008be6 RSI: 0000000000000005 RDI: ffff88042abdf048
[   89.101950] RBP: ffff88042c2b81c0 R08: ffff88042c24c000 R09: ffff880035c58410
[   89.110033] R10: ffff88043f6b2dc0 R11: ffff88043f6b2dc0 R12: ffff88042c24de94
[   89.118119] R13: 000000000000007d R14: 00007fff0cd892b0 R15: 0000000000000000
[   89.126210] FS:  00007f39866c5700(0000) GS:ffff88043f6a0000(0000) knlGS:0000000000000000
[   89.135387] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   89.141916] CR2: 0000000000000014 CR3: 000000042b387000 CR4: 00000000000407e0
[   89.149997] Stack:
[   89.152353]  ffffffff811c6334 00000000fffffffc ffff88042c2b81c0 ffff88042c24de10
[   89.161096]  ffffffffa054dcbb 0000000000000246 00000000fffffffc ffff8800b5e05cc0
[   89.169839]  00000000fffffff4 ffff8800b5e05cc0 ffff88042bbc3000 0000000000001000
[   89.178583] Call Trace:
[   89.181429]  [<ffffffff811c6334>] ? blk_mq_free_request+0x37/0x48
[   89.188360]  [<ffffffffa054dcbb>] ? __nvme_submit_admin_cmd+0x52/0x68 [nvme]
[   89.196349]  [<ffffffffa054f761>] ? nvme_user_admin_cmd+0x144/0x1b1 [nvme]
[   89.204150]  [<ffffffffa054f7eb>] ? nvme_dev_ioctl+0x1d/0x2b [nvme]
[   89.211278]  [<ffffffff81125916>] ? do_vfs_ioctl+0x3f2/0x43b
[   89.217710]  [<ffffffff81117e35>] ? vfs_write+0xde/0xfc
[   89.223657]  [<ffffffff811259ad>] ? SyS_ioctl+0x4e/0x7d
[   89.229622]  [<ffffffff8139c6d2>] ? system_call_fastpath+0x16/0x1b
[   89.236636] Code: 8b 4a 38 48 39 4e 38 72 12 74 06 b8 01 00 00 00 c3 48 8b 4a 60 48 39 4e 60 73 f0 c3 66 66 66 66 90 48 8b 87 e0 00 00 00 48 63 f6 <8b> 14 b0 48 8b 87 f8 00 00 00 48 8b 04 d0 c3 89 ff f0 48 0f ab
[   89.263435] RIP  [<ffffffff811c51b8>] blk_mq_map_queue+0xf/0x1e
[   89.270237]  RSP <ffff88042c24dda0>
[   89.274237] CR2: 0000000000000014
[   89.278095] ---[ end trace 54c0e8cbb1fe2ec3 ]---

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v5] conversion to blk-mq
  2014-06-03 22:23       ` Keith Busch
@ 2014-06-03 22:27         ` Keith Busch
  2014-06-04  9:16           ` Matias Bjørling
  0 siblings, 1 reply; 14+ messages in thread
From: Keith Busch @ 2014-06-03 22:27 UTC (permalink / raw)
  To: Keith Busch
  Cc: Matias Bjorling, willy, sbradshaw, axboe, linux-kernel,
	linux-nvme, hch

> On Tue, 3 Jun 2014, Matias Bjorling wrote:
>> 
>> Keith, will you take the nvmemq_wip_v6 branch for a spin? Thanks!

BTW, if you want to test this out yourself, it's pretty simple to
recreate. I just run a simple user admin program sending nvme passthrough
commands in a tight loop, then run:

  # echo 1 > /sys/bus/pci/devices/<bdf>/remove

> Still fails as before:
>
> [   88.933881] BUG: unable to handle kernel NULL pointer dereference at 
> 0000000000000014
> [   88.942900] IP: [<ffffffff811c51b8>] blk_mq_map_queue+0xf/0x1e
> [   88.949605] PGD 427be0067 PUD 425495067 PMD 0
> [   88.954915] Oops: 0000 [#1] SMP
> [   88.958787] Modules linked in: nvme parport_pc ppdev lp parport dlm sctp 
> libcrc32c configfs nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache 
> sunrpc md4 hmac cifs bridge stp llc joydev jfs hid_generic usbhid hid loop 
> md_mod x86_pkg_temp_thermal coretemp kvm_intel kvm iTCO_wdt 
> iTCO_vendor_support crc32c_intel ghash_clmulni_intel aesni_intel aes_x86_64 
> glue_helper lrw gf128mul ablk_helper cryptd microcode ehci_pci ehci_hcd 
> pcspkr usbcore lpc_ich ioatdma usb_common mfd_core evdev i2c_i801 wmi 
> acpi_cpufreq tpm_tis tpm ipmi_si ipmi_msghandler processor thermal_sys button 
> ext4 crc16 jbd2 mbcache dm_mod nbd sg sd_mod sr_mod crc_t10dif cdrom 
> crct10dif_common isci ahci libsas igb libahci scsi_transport_sas ptp pps_core 
> i2c_algo_bit libata i2c_core scsi_mod dca
> [   89.042529] CPU: 5 PID: 4544 Comm: nvme_id_ctrl Not tainted 3.15.0-rc1+ #3
> [   89.050295] Hardware name: Intel Corporation S2600GZ/S2600GZ, BIOS 
> SE5C600.86B.02.02.0002.122320131210 12/23/2013
> [   89.061856] task: ffff88042bbdb0d0 ti: ffff88042c24c000 task.ti: 
> ffff88042c24c000
> [   89.070305] RIP: 0010:[<ffffffff811c51b8>]  [<ffffffff811c51b8>] 
> blk_mq_map_queue+0xf/0x1e
> [   89.079747] RSP: 0018:ffff88042c24dda0  EFLAGS: 00010202
> [   89.085795] RAX: 0000000000000000 RBX: ffffe8fbffaa1b00 RCX: 
> ffff88042e8ec4b0
> [   89.093868] RDX: 0000000000008be6 RSI: 0000000000000005 RDI: 
> ffff88042abdf048
> [   89.101950] RBP: ffff88042c2b81c0 R08: ffff88042c24c000 R09: 
> ffff880035c58410
> [   89.110033] R10: ffff88043f6b2dc0 R11: ffff88043f6b2dc0 R12: 
> ffff88042c24de94
> [   89.118119] R13: 000000000000007d R14: 00007fff0cd892b0 R15: 
> 0000000000000000
> [   89.126210] FS:  00007f39866c5700(0000) GS:ffff88043f6a0000(0000) 
> knlGS:0000000000000000
> [   89.135387] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   89.141916] CR2: 0000000000000014 CR3: 000000042b387000 CR4: 
> 00000000000407e0
> [   89.149997] Stack:
> [   89.152353]  ffffffff811c6334 00000000fffffffc ffff88042c2b81c0 
> ffff88042c24de10
> [   89.161096]  ffffffffa054dcbb 0000000000000246 00000000fffffffc 
> ffff8800b5e05cc0
> [   89.169839]  00000000fffffff4 ffff8800b5e05cc0 ffff88042bbc3000 
> 0000000000001000
> [   89.178583] Call Trace:
> [   89.181429]  [<ffffffff811c6334>] ? blk_mq_free_request+0x37/0x48
> [   89.188360]  [<ffffffffa054dcbb>] ? __nvme_submit_admin_cmd+0x52/0x68 
> [nvme]
> [   89.196349]  [<ffffffffa054f761>] ? nvme_user_admin_cmd+0x144/0x1b1 [nvme]
> [   89.204150]  [<ffffffffa054f7eb>] ? nvme_dev_ioctl+0x1d/0x2b [nvme]
> [   89.211278]  [<ffffffff81125916>] ? do_vfs_ioctl+0x3f2/0x43b
> [   89.217710]  [<ffffffff81117e35>] ? vfs_write+0xde/0xfc
> [   89.223657]  [<ffffffff811259ad>] ? SyS_ioctl+0x4e/0x7d
> [   89.229622]  [<ffffffff8139c6d2>] ? system_call_fastpath+0x16/0x1b
> [   89.236636] Code: 8b 4a 38 48 39 4e 38 72 12 74 06 b8 01 00 00 00 c3 48 8b 
> 4a 60 48 39 4e 60 73 f0 c3 66 66 66 66 90 48 8b 87 e0 00 00 00 48 63 f6 <8b> 
> 14 b0 48 8b 87 f8 00 00 00 48 8b 04 d0 c3 89 ff f0 48 0f ab
> [   89.263435] RIP  [<ffffffff811c51b8>] blk_mq_map_queue+0xf/0x1e
> [   89.270237]  RSP <ffff88042c24dda0>
> [   89.274237] CR2: 0000000000000014
> [   89.278095] ---[ end trace 54c0e8cbb1fe2ec3 ]---
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v5] conversion to blk-mq
  2014-06-03 22:27         ` Keith Busch
@ 2014-06-04  9:16           ` Matias Bjørling
  2014-06-04 18:28             ` Keith Busch
  0 siblings, 1 reply; 14+ messages in thread
From: Matias Bjørling @ 2014-06-04  9:16 UTC (permalink / raw)
  To: Keith Busch; +Cc: willy, sbradshaw, axboe, linux-kernel, linux-nvme, hch

On 06/04/2014 12:27 AM, Keith Busch wrote:
>> On Tue, 3 Jun 2014, Matias Bjorling wrote:
>>>
>>> Keith, will you take the nvmemq_wip_v6 branch for a spin? Thanks!
>
> BTW, if you want to test this out yourself, it's pretty simple to
> recreate. I just run a simple user admin program sending nvme passthrough
> commands in a tight loop, then run:
>
>   # echo 1 > /sys/bus/pci/devices/<bdf>/remove

I can't recreate- I use the nvme_get_feature program to continuously hit 
the ioctl path, testing using your nvme qemu branch.

>
>> Still fails as before:
>>
>> [   88.933881] BUG: unable to handle kernel NULL pointer dereference
>> at 0000000000000014
>> [   88.942900] IP: [<ffffffff811c51b8>] blk_mq_map_queue+0xf/0x1e
>> [   88.949605] PGD 427be0067 PUD 425495067 PMD 0
>> [   88.954915] Oops: 0000 [#1] SMP
>> [   88.958787] Modules linked in: nvme parport_pc ppdev lp parport dlm
>> sctp libcrc32c configfs nfsd auth_rpcgss oid_registry nfs_acl nfs
>> lockd fscache sunrpc md4 hmac cifs bridge stp llc joydev jfs
>> hid_generic usbhid hid loop md_mod x86_pkg_temp_thermal coretemp
>> kvm_intel kvm iTCO_wdt iTCO_vendor_support crc32c_intel
>> ghash_clmulni_intel aesni_intel aes_x86_64 glue_helper lrw gf128mul
>> ablk_helper cryptd microcode ehci_pci ehci_hcd pcspkr usbcore lpc_ich
>> ioatdma usb_common mfd_core evdev i2c_i801 wmi acpi_cpufreq tpm_tis
>> tpm ipmi_si ipmi_msghandler processor thermal_sys button ext4 crc16
>> jbd2 mbcache dm_mod nbd sg sd_mod sr_mod crc_t10dif cdrom
>> crct10dif_common isci ahci libsas igb libahci scsi_transport_sas ptp
>> pps_core i2c_algo_bit libata i2c_core scsi_mod dca
>> [   89.042529] CPU: 5 PID: 4544 Comm: nvme_id_ctrl Not tainted
>> 3.15.0-rc1+ #3
>> [   89.050295] Hardware name: Intel Corporation S2600GZ/S2600GZ, BIOS
>> SE5C600.86B.02.02.0002.122320131210 12/23/2013
>> [   89.061856] task: ffff88042bbdb0d0 ti: ffff88042c24c000 task.ti:
>> ffff88042c24c000
>> [   89.070305] RIP: 0010:[<ffffffff811c51b8>]  [<ffffffff811c51b8>]
>> blk_mq_map_queue+0xf/0x1e
>> [   89.079747] RSP: 0018:ffff88042c24dda0  EFLAGS: 00010202
>> [   89.085795] RAX: 0000000000000000 RBX: ffffe8fbffaa1b00 RCX:
>> ffff88042e8ec4b0
>> [   89.093868] RDX: 0000000000008be6 RSI: 0000000000000005 RDI:
>> ffff88042abdf048
>> [   89.101950] RBP: ffff88042c2b81c0 R08: ffff88042c24c000 R09:
>> ffff880035c58410
>> [   89.110033] R10: ffff88043f6b2dc0 R11: ffff88043f6b2dc0 R12:
>> ffff88042c24de94
>> [   89.118119] R13: 000000000000007d R14: 00007fff0cd892b0 R15:
>> 0000000000000000
>> [   89.126210] FS:  00007f39866c5700(0000) GS:ffff88043f6a0000(0000)
>> knlGS:0000000000000000
>> [   89.135387] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [   89.141916] CR2: 0000000000000014 CR3: 000000042b387000 CR4:
>> 00000000000407e0
>> [   89.149997] Stack:
>> [   89.152353]  ffffffff811c6334 00000000fffffffc ffff88042c2b81c0
>> ffff88042c24de10
>> [   89.161096]  ffffffffa054dcbb 0000000000000246 00000000fffffffc
>> ffff8800b5e05cc0
>> [   89.169839]  00000000fffffff4 ffff8800b5e05cc0 ffff88042bbc3000
>> 0000000000001000
>> [   89.178583] Call Trace:
>> [   89.181429]  [<ffffffff811c6334>] ? blk_mq_free_request+0x37/0x48
>> [   89.188360]  [<ffffffffa054dcbb>] ?
>> __nvme_submit_admin_cmd+0x52/0x68 [nvme]
>> [   89.196349]  [<ffffffffa054f761>] ? nvme_user_admin_cmd+0x144/0x1b1
>> [nvme]
>> [   89.204150]  [<ffffffffa054f7eb>] ? nvme_dev_ioctl+0x1d/0x2b [nvme]
>> [   89.211278]  [<ffffffff81125916>] ? do_vfs_ioctl+0x3f2/0x43b
>> [   89.217710]  [<ffffffff81117e35>] ? vfs_write+0xde/0xfc
>> [   89.223657]  [<ffffffff811259ad>] ? SyS_ioctl+0x4e/0x7d
>> [   89.229622]  [<ffffffff8139c6d2>] ? system_call_fastpath+0x16/0x1b
>> [   89.236636] Code: 8b 4a 38 48 39 4e 38 72 12 74 06 b8 01 00 00 00
>> c3 48 8b 4a 60 48 39 4e 60 73 f0 c3 66 66 66 66 90 48 8b 87 e0 00 00
>> 00 48 63 f6 <8b> 14 b0 48 8b 87 f8 00 00 00 48 8b 04 d0 c3 89 ff f0 48
>> 0f ab
>> [   89.263435] RIP  [<ffffffff811c51b8>] blk_mq_map_queue+0xf/0x1e
>> [   89.270237]  RSP <ffff88042c24dda0>
>> [   89.274237] CR2: 0000000000000014
>> [   89.278095] ---[ end trace 54c0e8cbb1fe2ec3 ]---
>>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v5] conversion to blk-mq
  2014-06-04  9:16           ` Matias Bjørling
@ 2014-06-04 18:28             ` Keith Busch
  2014-06-04 18:42               ` Jens Axboe
  0 siblings, 1 reply; 14+ messages in thread
From: Keith Busch @ 2014-06-04 18:28 UTC (permalink / raw)
  To: Matias Bjørling
  Cc: Keith Busch, willy, sbradshaw, axboe, linux-kernel, linux-nvme,
	hch

[-- Attachment #1: Type: TEXT/PLAIN, Size: 3384 bytes --]

On Wed, 4 Jun 2014, Matias Bjørling wrote:
> On 06/04/2014 12:27 AM, Keith Busch wrote:
>>> On Tue, 3 Jun 2014, Matias Bjorling wrote:
>>>> 
>>>> Keith, will you take the nvmemq_wip_v6 branch for a spin? Thanks!
>> 
>> BTW, if you want to test this out yourself, it's pretty simple to
>> recreate. I just run a simple user admin program sending nvme passthrough
>> commands in a tight loop, then run:
>>
>>   # echo 1 > /sys/bus/pci/devices/<bdf>/remove
>
> I can't recreate- I use the nvme_get_feature program to continuously hit the 
> ioctl path, testing using your nvme qemu branch.

Okay, I'll try to fix it.

I think there are multiple problems, but the first is that since there
is no gendisk associated with the admin_q, the QUEUE_FLAG_INIT_DONE flag
is never set, and blk_mq_queue_enter returns successful whenever this
flag is not set even though this queue is dying, so we enter with all
its invalid pointers.

Here's a couple diff's. The first fixes the kernel oops by not entering a
dying queue. The second is just a few unrelated clean-ups in nvme-core.c.

I still can't complete my current hot-removal test, though; something
appears hung, but haven't nailed that down yet.

Please let me know what you think! Thanks.

diff --git a/block/blk-mq.c b/block/blk-mq.c
index d10013b..5a9ae8a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -105,6 +105,10 @@ static int blk_mq_queue_enter(struct request_queue *q)
  	__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
  	smp_wmb();
  	/* we have problems to freeze the queue if it's initializing */
+	if (blk_queue_dying(q)) {
+		__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
+		ret = -ENODEV;
+	}
  	if (!blk_queue_bypass(q) || !blk_queue_init_done(q))
  		return 0;

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 243a5e6..22e9c82 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -98,7 +98,6 @@ struct nvme_queue {
  	u8 cq_phase;
  	u8 cqe_seen;
  	u8 q_suspended;
-	cpumask_var_t cpu_mask;
  	struct async_cmd_info cmdinfo;
  	struct blk_mq_hw_ctx *hctx;
  };
@@ -1055,8 +1054,6 @@ static void nvme_free_queue(struct nvme_queue *nvmeq)
  				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
  	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
  					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
-	if (nvmeq->qid)
-		free_cpumask_var(nvmeq->cpu_mask);
  	kfree(nvmeq);
  }

@@ -1066,9 +1063,9 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)

  	for (i = dev->queue_count - 1; i >= lowest; i--) {
  		struct nvme_queue *nvmeq = dev->queues[i];
-		nvme_free_queue(nvmeq);
  		dev->queue_count--;
  		dev->queues[i] = NULL;
+		nvme_free_queue(nvmeq);
  	}
  }

@@ -1142,9 +1139,6 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
  	if (!nvmeq->sq_cmds)
  		goto free_cqdma;

-	if (qid && !zalloc_cpumask_var(&nvmeq->cpu_mask, GFP_KERNEL))
-		goto free_sqdma;
-
  	nvmeq->q_dmadev = dmadev;
  	nvmeq->dev = dev;
  	snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
@@ -1162,9 +1156,6 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,

  	return nvmeq;

- free_sqdma:
-	dma_free_coherent(dmadev, SQ_SIZE(depth), (void *)nvmeq->sq_cmds,
-							nvmeq->sq_dma_addr);
   free_cqdma:
  	dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes,
  							nvmeq->cq_dma_addr);

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v5] conversion to blk-mq
  2014-06-04 18:28             ` Keith Busch
@ 2014-06-04 18:42               ` Jens Axboe
  2014-06-04 18:52                 ` Keith Busch
  0 siblings, 1 reply; 14+ messages in thread
From: Jens Axboe @ 2014-06-04 18:42 UTC (permalink / raw)
  To: Keith Busch, Matias Bjørling
  Cc: willy, sbradshaw, linux-kernel, linux-nvme, hch

On 06/04/2014 12:28 PM, Keith Busch wrote:
> On Wed, 4 Jun 2014, Matias Bjørling wrote:
>> On 06/04/2014 12:27 AM, Keith Busch wrote:
>>>> On Tue, 3 Jun 2014, Matias Bjorling wrote:
>>>>>
>>>>> Keith, will you take the nvmemq_wip_v6 branch for a spin? Thanks!
>>>
>>> BTW, if you want to test this out yourself, it's pretty simple to
>>> recreate. I just run a simple user admin program sending nvme
>>> passthrough
>>> commands in a tight loop, then run:
>>>
>>>   # echo 1 > /sys/bus/pci/devices/<bdf>/remove
>>
>> I can't recreate- I use the nvme_get_feature program to continuously
>> hit the ioctl path, testing using your nvme qemu branch.
> 
> Okay, I'll try to fix it.
> 
> I think there are multiple problems, but the first is that since there
> is no gendisk associated with the admin_q, the QUEUE_FLAG_INIT_DONE flag
> is never set, and blk_mq_queue_enter returns successful whenever this
> flag is not set even though this queue is dying, so we enter with all
> its invalid pointers.
> 
> Here's a couple diff's. The first fixes the kernel oops by not entering a
> dying queue. The second is just a few unrelated clean-ups in nvme-core.c.
> 
> I still can't complete my current hot-removal test, though; something
> appears hung, but haven't nailed that down yet.
> 
> Please let me know what you think! Thanks.
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index d10013b..5a9ae8a 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -105,6 +105,10 @@ static int blk_mq_queue_enter(struct request_queue *q)
>      __percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
>      smp_wmb();
>      /* we have problems to freeze the queue if it's initializing */
> +    if (blk_queue_dying(q)) {
> +        __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
> +        ret = -ENODEV;
> +    }
>      if (!blk_queue_bypass(q) || !blk_queue_init_done(q))
>          return 0;

Are you testing against 3.13? You really need the current tree for this,
otherwise I'm sure you'll run into issues (as you appear to be :-)

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v5] conversion to blk-mq
  2014-06-04 18:42               ` Jens Axboe
@ 2014-06-04 18:52                 ` Keith Busch
  2014-06-04 18:55                   ` Jens Axboe
  2014-06-04 20:01                   ` Matias Bjorling
  0 siblings, 2 replies; 14+ messages in thread
From: Keith Busch @ 2014-06-04 18:52 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Keith Busch, Matias Bjørling, willy, sbradshaw, linux-kernel,
	linux-nvme, hch

On Wed, 4 Jun 2014, Jens Axboe wrote:
> On 06/04/2014 12:28 PM, Keith Busch wrote:
> Are you testing against 3.13? You really need the current tree for this,
> otherwise I'm sure you'll run into issues (as you appear to be :-)

I'm using Matias' current tree:

git://github.com/MatiasBjorling/linux-collab nvmeq_wip_6

Is this the right one to use? Looks like a 3.15rc1+.

Also ... obviously that's the wrong diff from before. I didn't save my
file before creating the diff. Should have looked like this:

diff --git a/block/blk-mq.c b/block/blk-mq.c
index d10013b..d1b986c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -105,7 +105,8 @@ static int blk_mq_queue_enter(struct request_queue *q)
  	__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
  	smp_wmb();
  	/* we have problems to freeze the queue if it's initializing */
-	if (!blk_queue_bypass(q) || !blk_queue_init_done(q))
+	if (!blk_queue_dying(q) && (!blk_queue_bypass(q) ||
+					!blk_queue_init_done(q)))
  		return 0;

  	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v5] conversion to blk-mq
  2014-06-04 18:52                 ` Keith Busch
@ 2014-06-04 18:55                   ` Jens Axboe
  2014-06-04 20:01                   ` Matias Bjorling
  1 sibling, 0 replies; 14+ messages in thread
From: Jens Axboe @ 2014-06-04 18:55 UTC (permalink / raw)
  To: Keith Busch
  Cc: Matias Bjørling, willy, sbradshaw, linux-kernel, linux-nvme,
	hch

On 06/04/2014 12:52 PM, Keith Busch wrote:
> On Wed, 4 Jun 2014, Jens Axboe wrote:
>> On 06/04/2014 12:28 PM, Keith Busch wrote:
>> Are you testing against 3.13? You really need the current tree for this,
>> otherwise I'm sure you'll run into issues (as you appear to be :-)
> 
> I'm using Matias' current tree:
> 
> git://github.com/MatiasBjorling/linux-collab nvmeq_wip_6
> 
> Is this the right one to use? Looks like a 3.15rc1+.

It is, I misread that diff wrong.

> Also ... obviously that's the wrong diff from before. I didn't save my
> file before creating the diff. Should have looked like this:
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index d10013b..d1b986c 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -105,7 +105,8 @@ static int blk_mq_queue_enter(struct request_queue *q)
>      __percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
>      smp_wmb();
>      /* we have problems to freeze the queue if it's initializing */
> -    if (!blk_queue_bypass(q) || !blk_queue_init_done(q))
> +    if (!blk_queue_dying(q) && (!blk_queue_bypass(q) ||
> +                    !blk_queue_init_done(q)))
>          return 0;
> 
>      __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);

This looks better.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v5] conversion to blk-mq
  2014-06-04 18:52                 ` Keith Busch
  2014-06-04 18:55                   ` Jens Axboe
@ 2014-06-04 20:01                   ` Matias Bjorling
  1 sibling, 0 replies; 14+ messages in thread
From: Matias Bjorling @ 2014-06-04 20:01 UTC (permalink / raw)
  To: Keith Busch, Jens Axboe; +Cc: willy, sbradshaw, linux-kernel, linux-nvme, hch

On 06/04/2014 08:52 PM, Keith Busch wrote:
> On Wed, 4 Jun 2014, Jens Axboe wrote:
>> On 06/04/2014 12:28 PM, Keith Busch wrote:
>> Are you testing against 3.13? You really need the current tree for this,
>> otherwise I'm sure you'll run into issues (as you appear to be :-)
> 
> I'm using Matias' current tree:
> 
> git://github.com/MatiasBjorling/linux-collab nvmeq_wip_6
> 
> Is this the right one to use? Looks like a 3.15rc1+.
> 
> Also ... obviously that's the wrong diff from before. I didn't save my
> file before creating the diff. Should have looked like this:
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index d10013b..d1b986c 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -105,7 +105,8 @@ static int blk_mq_queue_enter(struct request_queue *q)
>      __percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
>      smp_wmb();
>      /* we have problems to freeze the queue if it's initializing */
> -    if (!blk_queue_bypass(q) || !blk_queue_init_done(q))
> +    if (!blk_queue_dying(q) && (!blk_queue_bypass(q) ||
> +                    !blk_queue_init_done(q)))
>          return 0;
> 
>      __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);

Thanks!

I've put your patch into the nvmemq_wip_v6 branch and rebased it on top
of Jens' for-linus.

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2014-06-04 20:01 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-06-02 20:55 [PATCH v5] conversion to blk-mq Matias Bjørling
2014-06-02 20:55 ` [PATCH v5] NVMe: " Matias Bjørling
2014-06-02 22:49 ` [PATCH v5] " Keith Busch
2014-06-02 23:06   ` Keith Busch
2014-06-03 13:56     ` Matias Bjørling
2014-06-03 20:12     ` Matias Bjorling
2014-06-03 22:23       ` Keith Busch
2014-06-03 22:27         ` Keith Busch
2014-06-04  9:16           ` Matias Bjørling
2014-06-04 18:28             ` Keith Busch
2014-06-04 18:42               ` Jens Axboe
2014-06-04 18:52                 ` Keith Busch
2014-06-04 18:55                   ` Jens Axboe
2014-06-04 20:01                   ` Matias Bjorling

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).