* [PATCH V3] basic conversion to blk-mq
@ 2014-05-28 22:59 Matias Bjørling
2014-05-28 22:59 ` [PATCH V3] NVMe: " Matias Bjørling
From: Matias Bjørling @ 2014-05-28 22:59 UTC
To: willy, keith.busch, sbradshaw, axboe, linux-kernel; +Cc: Matias Bjørling
Hi Matthew and Keith,
Here is an updated patch with most of the feedback addressed.
The patch is against Jens' 3.16/core tree. You may use the nvmemq_wip_review
branch at:
https://github.com/MatiasBjorling/linux-collab nvmemq_wip_review
Changes from v2:
* Rebased on top of the current 3.16/core tree.
* Use blk-mq queue management for spreading I/O queues.
* Removed the RCU handling and allocate all I/O queues up front for management by blk-mq.
* Removed the need for CPU hotplug notification.
* Fixed flush data handling.
* Fixed a double free of the queue spinlock.
* Various cleanups.
Work on command aborts and timeouts is still outstanding.
Matias Bjørling (1):
NVMe: basic conversion to blk-mq
drivers/block/nvme-core.c | 1143 +++++++++++++++++++--------------------------
include/linux/nvme.h | 11 +-
2 files changed, 486 insertions(+), 668 deletions(-)
--
1.9.1
* [PATCH V3] NVMe: basic conversion to blk-mq
2014-05-28 22:59 [PATCH V3] basic conversion to blk-mq Matias Bjørling
@ 2014-05-28 22:59 ` Matias Bjørling
2014-05-29 3:07 ` Keith Busch
From: Matias Bjørling @ 2014-05-28 22:59 UTC
To: willy, keith.busch, sbradshaw, axboe, linux-kernel; +Cc: Matias Bjørling
This converts the current NVMe driver to utilize the blk-mq layer.
Outstanding work:
- Abort handling for I/Os and timeouts
Contributions in this patch from:
Sam Bradshaw <sbradshaw@micron.com>
Jens Axboe <axboe@kernel.dk>
Signed-off-by: Matias Bjørling <m@bjorling.me>
---
drivers/block/nvme-core.c | 1143 +++++++++++++++++++--------------------------
include/linux/nvme.h | 11 +-
2 files changed, 486 insertions(+), 668 deletions(-)
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index e6b0ee4..ac695b3 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -13,9 +13,9 @@
*/
#include <linux/nvme.h>
-#include <linux/bio.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/errno.h>
@@ -33,7 +33,6 @@
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
-#include <linux/percpu.h>
#include <linux/poison.h>
#include <linux/ptrace.h>
#include <linux/sched.h>
@@ -42,11 +41,11 @@
#include <scsi/sg.h>
#include <asm-generic/io-64-nonatomic-lo-hi.h>
-#define NVME_Q_DEPTH 1024
+#define NVME_Q_DEPTH 1024
+#define NVME_AQ_DEPTH 64
#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
-#define ADMIN_TIMEOUT (60 * HZ)
-#define IOD_TIMEOUT (4 * NVME_IO_TIMEOUT)
+#define ADMIN_TIMEOUT (60 * HZ)
unsigned char io_timeout = 30;
module_param(io_timeout, byte, 0644);
@@ -65,10 +64,14 @@ static struct workqueue_struct *nvme_workq;
static wait_queue_head_t nvme_kthread_wait;
static void nvme_reset_failed_dev(struct work_struct *ws);
+static int nvme_process_cq(struct nvme_queue *nvmeq);
+static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd,
+ u32 *result, unsigned timeout);
struct async_cmd_info {
struct kthread_work work;
struct kthread_worker *worker;
+ struct request *req;
u32 result;
int status;
void *ctx;
@@ -79,7 +82,6 @@ struct async_cmd_info {
* commands and one for I/O commands).
*/
struct nvme_queue {
- struct rcu_head r_head;
struct device *q_dmadev;
struct nvme_dev *dev;
char irqname[24]; /* nvme4294967295-65535\0 */
@@ -88,10 +90,6 @@ struct nvme_queue {
volatile struct nvme_completion *cqes;
dma_addr_t sq_dma_addr;
dma_addr_t cq_dma_addr;
- wait_queue_head_t sq_full;
- wait_queue_t sq_cong_wait;
- struct bio_list sq_cong;
- struct list_head iod_bio;
u32 __iomem *q_db;
u16 q_depth;
u16 cq_vector;
@@ -100,11 +98,10 @@ struct nvme_queue {
u16 cq_head;
u16 qid;
u8 cq_phase;
- u8 cqe_seen;
u8 q_suspended;
cpumask_var_t cpu_mask;
struct async_cmd_info cmdinfo;
- unsigned long cmdid_data[];
+ struct blk_mq_hw_ctx *hctx;
};
/*
@@ -132,62 +129,66 @@ typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
struct nvme_cmd_info {
nvme_completion_fn fn;
void *ctx;
- unsigned long timeout;
int aborted;
+ struct nvme_queue *nvmeq;
};
-static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq)
+static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+ unsigned int i)
{
- return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)];
+ struct nvme_dev *dev = data;
+ struct nvme_queue *nvmeq = dev->queues[(i % dev->queue_count) + 1];
+ BUG_ON(!nvmeq);
+ WARN_ON(nvmeq->hctx);
+ nvmeq->hctx = hctx;
+ hctx->driver_data = nvmeq;
+ return 0;
}
-static unsigned nvme_queue_extra(int depth)
+static int nvme_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+ unsigned int i)
{
- return DIV_ROUND_UP(depth, 8) + (depth * sizeof(struct nvme_cmd_info));
+ struct nvme_dev *dev = data;
+ struct nvme_queue *nvmeq = dev->queues[0];
+ BUG_ON(!nvmeq);
+ WARN_ON(nvmeq->hctx);
+ nvmeq->hctx = hctx;
+ hctx->driver_data = nvmeq;
+ return 0;
}
-/**
- * alloc_cmdid() - Allocate a Command ID
- * @nvmeq: The queue that will be used for this command
- * @ctx: A pointer that will be passed to the handler
- * @handler: The function to call on completion
- *
- * Allocate a Command ID for a queue. The data passed in will
- * be passed to the completion handler. This is implemented by using
- * the bottom two bits of the ctx pointer to store the handler ID.
- * Passing in a pointer that's not 4-byte aligned will cause a BUG.
- * We can change this if it becomes a problem.
- *
- * May be called with local interrupts disabled and the q_lock held,
- * or with interrupts enabled and no locks held.
- */
-static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx,
- nvme_completion_fn handler, unsigned timeout)
+static int nvme_init_admin_request(void *data, struct request *req,
+ unsigned int hctx_idx, unsigned int rq_idx,
+ unsigned int numa_node)
{
- int depth = nvmeq->q_depth - 1;
- struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
- int cmdid;
-
- do {
- cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth);
- if (cmdid >= depth)
- return -EBUSY;
- } while (test_and_set_bit(cmdid, nvmeq->cmdid_data));
+ struct nvme_dev *dev = data;
+ struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = dev->queues[0];
+ WARN_ON(!nvmeq);
+ WARN_ON(!cmd);
+ cmd->nvmeq = nvmeq;
+ return 0;
+}
- info[cmdid].fn = handler;
- info[cmdid].ctx = ctx;
- info[cmdid].timeout = jiffies + timeout;
- info[cmdid].aborted = 0;
- return cmdid;
+static int nvme_init_request(void *data, struct request *req,
+ unsigned int hctx_idx, unsigned int rq_idx,
+ unsigned int numa_node)
+{
+ struct nvme_dev *dev = data;
+ struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
+ WARN_ON(!nvmeq);
+ WARN_ON(!cmd);
+ cmd->nvmeq = nvmeq;
+ return 0;
}
-static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
- nvme_completion_fn handler, unsigned timeout)
+static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
+ nvme_completion_fn handler)
{
- int cmdid;
- wait_event_killable(nvmeq->sq_full,
- (cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0);
- return (cmdid < 0) ? -EINTR : cmdid;
+ cmd->fn = handler;
+ cmd->ctx = ctx;
+ cmd->aborted = 0;
}
/* Special values must be less than 0x1000 */
@@ -229,75 +230,48 @@ static void async_completion(struct nvme_queue *nvmeq, void *ctx,
cmdinfo->result = le32_to_cpup(&cqe->result);
cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
+ blk_put_request(cmdinfo->req);
+}
+
+static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq,
+ unsigned int tag)
+{
+ struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
+ struct request *req = blk_mq_tag_to_rq(hctx->tags, tag);
+ return blk_mq_rq_to_pdu(req);
}
/*
* Called with local interrupts disabled and the q_lock held. May not sleep.
*/
-static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid,
+static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag,
nvme_completion_fn *fn)
{
+ struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag);
void *ctx;
- struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
-
- if (cmdid >= nvmeq->q_depth || !info[cmdid].fn) {
- if (fn)
- *fn = special_completion;
+ if (tag >= nvmeq->q_depth) {
+ *fn = special_completion;
return CMD_CTX_INVALID;
}
if (fn)
- *fn = info[cmdid].fn;
- ctx = info[cmdid].ctx;
- info[cmdid].fn = special_completion;
- info[cmdid].ctx = CMD_CTX_COMPLETED;
- clear_bit(cmdid, nvmeq->cmdid_data);
- wake_up(&nvmeq->sq_full);
+ *fn = cmd->fn;
+ ctx = cmd->ctx;
+ cmd->fn = special_completion;
+ cmd->ctx = CMD_CTX_COMPLETED;
return ctx;
}
-static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
- nvme_completion_fn *fn)
+static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn)
{
void *ctx;
- struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
if (fn)
- *fn = info[cmdid].fn;
- ctx = info[cmdid].ctx;
- info[cmdid].fn = special_completion;
- info[cmdid].ctx = CMD_CTX_CANCELLED;
+ *fn = cmd->fn;
+ ctx = cmd->ctx;
+ cmd->fn = special_completion;
+ cmd->ctx = CMD_CTX_CANCELLED;
return ctx;
}
-static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid)
-{
- return rcu_dereference_raw(dev->queues[qid]);
-}
-
-static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU)
-{
- unsigned queue_id = get_cpu_var(*dev->io_queue);
- rcu_read_lock();
- return rcu_dereference(dev->queues[queue_id]);
-}
-
-static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
-{
- rcu_read_unlock();
- put_cpu_var(nvmeq->dev->io_queue);
-}
-
-static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx)
- __acquires(RCU)
-{
- rcu_read_lock();
- return rcu_dereference(dev->queues[q_idx]);
-}
-
-static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
-{
- rcu_read_unlock();
-}
-
/**
* nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
* @nvmeq: The queue to use
@@ -354,7 +328,6 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp)
iod->length = nbytes;
iod->nents = 0;
iod->first_dma = 0ULL;
- iod->start_time = jiffies;
}
return iod;
@@ -378,59 +351,26 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
kfree(iod);
}
-static void nvme_start_io_acct(struct bio *bio)
-{
- struct gendisk *disk = bio->bi_bdev->bd_disk;
- const int rw = bio_data_dir(bio);
- int cpu = part_stat_lock();
- part_round_stats(cpu, &disk->part0);
- part_stat_inc(cpu, &disk->part0, ios[rw]);
- part_stat_add(cpu, &disk->part0, sectors[rw], bio_sectors(bio));
- part_inc_in_flight(&disk->part0, rw);
- part_stat_unlock();
-}
-
-static void nvme_end_io_acct(struct bio *bio, unsigned long start_time)
-{
- struct gendisk *disk = bio->bi_bdev->bd_disk;
- const int rw = bio_data_dir(bio);
- unsigned long duration = jiffies - start_time;
- int cpu = part_stat_lock();
- part_stat_add(cpu, &disk->part0, ticks[rw], duration);
- part_round_stats(cpu, &disk->part0);
- part_dec_in_flight(&disk->part0, rw);
- part_stat_unlock();
-}
-
-static void bio_completion(struct nvme_queue *nvmeq, void *ctx,
+static void req_completion(struct nvme_queue *nvmeq, void *ctx,
struct nvme_completion *cqe)
{
struct nvme_iod *iod = ctx;
- struct bio *bio = iod->private;
+ struct request *req = iod->private;
+
u16 status = le16_to_cpup(&cqe->status) >> 1;
- if (unlikely(status)) {
- if (!(status & NVME_SC_DNR ||
- bio->bi_rw & REQ_FAILFAST_MASK) &&
- (jiffies - iod->start_time) < IOD_TIMEOUT) {
- if (!waitqueue_active(&nvmeq->sq_full))
- add_wait_queue(&nvmeq->sq_full,
- &nvmeq->sq_cong_wait);
- list_add_tail(&iod->node, &nvmeq->iod_bio);
- wake_up(&nvmeq->sq_full);
- return;
- }
- }
if (iod->nents) {
- dma_unmap_sg(nvmeq->q_dmadev, iod->sg, iod->nents,
- bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
- nvme_end_io_acct(bio, iod->start_time);
+ dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents,
+ rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
}
nvme_free_iod(nvmeq->dev, iod);
- if (status)
- bio_endio(bio, -EIO);
+
+ if (unlikely(status))
+ req->errors = -EIO;
else
- bio_endio(bio, 0);
+ req->errors = 0;
+
+ blk_mq_complete_request(req);
}
/* length is in bytes. gfp flags indicates whether we may sleep. */
@@ -512,86 +452,38 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len,
return total_len;
}
-static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq,
- int len)
-{
- struct bio *split = bio_split(bio, len >> 9, GFP_ATOMIC, NULL);
- if (!split)
- return -ENOMEM;
-
- bio_chain(split, bio);
-
- if (!waitqueue_active(&nvmeq->sq_full))
- add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
- bio_list_add(&nvmeq->sq_cong, split);
- bio_list_add(&nvmeq->sq_cong, bio);
- wake_up(&nvmeq->sq_full);
-
- return 0;
-}
-
-/* NVMe scatterlists require no holes in the virtual address */
-#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \
- (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE))
-
-static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod,
- struct bio *bio, enum dma_data_direction dma_dir, int psegs)
+static int nvme_map_rq(struct nvme_queue *nvmeq, struct nvme_iod *iod,
+ struct request *req, enum dma_data_direction dma_dir,
+ int psegs)
{
- struct bio_vec bvec, bvprv;
- struct bvec_iter iter;
- struct scatterlist *sg = NULL;
- int length = 0, nsegs = 0, split_len = bio->bi_iter.bi_size;
- int first = 1;
-
- if (nvmeq->dev->stripe_size)
- split_len = nvmeq->dev->stripe_size -
- ((bio->bi_iter.bi_sector << 9) &
- (nvmeq->dev->stripe_size - 1));
-
sg_init_table(iod->sg, psegs);
- bio_for_each_segment(bvec, bio, iter) {
- if (!first && BIOVEC_PHYS_MERGEABLE(&bvprv, &bvec)) {
- sg->length += bvec.bv_len;
- } else {
- if (!first && BIOVEC_NOT_VIRT_MERGEABLE(&bvprv, &bvec))
- return nvme_split_and_submit(bio, nvmeq,
- length);
-
- sg = sg ? sg + 1 : iod->sg;
- sg_set_page(sg, bvec.bv_page,
- bvec.bv_len, bvec.bv_offset);
- nsegs++;
- }
+ iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
- if (split_len - length < bvec.bv_len)
- return nvme_split_and_submit(bio, nvmeq, split_len);
- length += bvec.bv_len;
- bvprv = bvec;
- first = 0;
- }
- iod->nents = nsegs;
- sg_mark_end(sg);
- if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0)
+ if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir))
return -ENOMEM;
- BUG_ON(length != bio->bi_iter.bi_size);
- return length;
+ return iod->nents;
}
-static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
- struct bio *bio, struct nvme_iod *iod, int cmdid)
+/*
+ * We reuse the small pool to allocate the 16-byte range here as it is not
+ * worth having a special pool for these or additional cases to handle freeing
+ * the iod.
+ */
+static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
+ struct request *req, struct nvme_iod *iod)
{
struct nvme_dsm_range *range =
(struct nvme_dsm_range *)iod_list(iod)[0];
struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
range->cattr = cpu_to_le32(0);
- range->nlb = cpu_to_le32(bio->bi_iter.bi_size >> ns->lba_shift);
- range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector));
+ range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
+ range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
memset(cmnd, 0, sizeof(*cmnd));
cmnd->dsm.opcode = nvme_cmd_dsm;
- cmnd->dsm.command_id = cmdid;
+ cmnd->dsm.command_id = req->tag;
cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma);
cmnd->dsm.nr = 0;
@@ -600,66 +492,97 @@ static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
if (++nvmeq->sq_tail == nvmeq->q_depth)
nvmeq->sq_tail = 0;
writel(nvmeq->sq_tail, nvmeq->q_db);
-
- return 0;
}
-static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
- int cmdid)
+static void nvme_setup_flush(struct nvme_command *cmnd,
+ struct nvme_ns *ns, int cmdid)
{
- struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
-
memset(cmnd, 0, sizeof(*cmnd));
cmnd->common.opcode = nvme_cmd_flush;
cmnd->common.command_id = cmdid;
cmnd->common.nsid = cpu_to_le32(ns->ns_id);
+}
+
+static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
+ int cmdid)
+{
+ nvme_setup_flush(&nvmeq->sq_cmds[nvmeq->sq_tail], ns, cmdid);
if (++nvmeq->sq_tail == nvmeq->q_depth)
nvmeq->sq_tail = 0;
writel(nvmeq->sq_tail, nvmeq->q_db);
+}
+
+static int nvme_submit_flush_sync(struct nvme_queue *nvmeq, struct nvme_ns *ns)
+{
+ struct request *req;
+ struct nvme_command cmnd;
+
+ req = blk_mq_alloc_request(ns->queue, WRITE, GFP_KERNEL, false);
+ if (!req)
+ return -ENOMEM;
+
+ nvme_setup_flush(&cmnd, ns, req->tag);
+ nvme_submit_sync_cmd(req, &cmnd, NULL, NVME_IO_TIMEOUT);
return 0;
}
-static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod)
+struct flush_cmd_info {
+ struct nvme_ns *ns;
+ struct nvme_iod *iod;
+};
+
+static void req_flush_completion(struct nvme_queue *nvmeq, void *ctx,
+ struct nvme_completion *cqe)
+{
+ struct flush_cmd_info *flush_cmd = ctx;
+ nvme_submit_flush_sync(nvmeq, flush_cmd->ns);
+ req_completion(nvmeq, flush_cmd->iod, cqe);
+ kfree(flush_cmd);
+}
+
+static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
+ struct nvme_ns *ns)
{
- struct bio *bio = iod->private;
- struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
+ struct request *req = iod->private;
struct nvme_command *cmnd;
- int cmdid;
- u16 control;
- u32 dsmgmt;
+ u16 control = 0;
+ u32 dsmgmt = 0;
- cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT);
- if (unlikely(cmdid < 0))
- return cmdid;
+ spin_lock_irq(&nvmeq->q_lock);
+ if (nvmeq->q_suspended) {
+ spin_unlock_irq(&nvmeq->q_lock);
+ return -EBUSY;
+ }
- if (bio->bi_rw & REQ_DISCARD)
- return nvme_submit_discard(nvmeq, ns, bio, iod, cmdid);
- if (bio->bi_rw & REQ_FLUSH)
- return nvme_submit_flush(nvmeq, ns, cmdid);
+ if (req->cmd_flags & REQ_DISCARD) {
+ nvme_submit_discard(nvmeq, ns, req, iod);
+ goto end_submit;
+ }
+ if (req->cmd_flags & REQ_FLUSH && !iod->nents) {
+ nvme_submit_flush(nvmeq, ns, req->tag);
+ goto end_submit;
+ }
- control = 0;
- if (bio->bi_rw & REQ_FUA)
+ if (req->cmd_flags & REQ_FUA)
control |= NVME_RW_FUA;
- if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD))
+ if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
control |= NVME_RW_LR;
- dsmgmt = 0;
- if (bio->bi_rw & REQ_RAHEAD)
+ if (req->cmd_flags & REQ_RAHEAD)
dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
memset(cmnd, 0, sizeof(*cmnd));
- cmnd->rw.opcode = bio_data_dir(bio) ? nvme_cmd_write : nvme_cmd_read;
- cmnd->rw.command_id = cmdid;
+ cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
+ cmnd->rw.command_id = req->tag;
cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
- cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector));
- cmnd->rw.length =
- cpu_to_le16((bio->bi_iter.bi_size >> ns->lba_shift) - 1);
+ cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
+ cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
cmnd->rw.control = cpu_to_le16(control);
cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
@@ -667,48 +590,42 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod)
nvmeq->sq_tail = 0;
writel(nvmeq->sq_tail, nvmeq->q_db);
- return 0;
-}
-
-static int nvme_split_flush_data(struct nvme_queue *nvmeq, struct bio *bio)
-{
- struct bio *split = bio_clone(bio, GFP_ATOMIC);
- if (!split)
- return -ENOMEM;
-
- split->bi_iter.bi_size = 0;
- split->bi_phys_segments = 0;
- bio->bi_rw &= ~REQ_FLUSH;
- bio_chain(split, bio);
-
- if (!waitqueue_active(&nvmeq->sq_full))
- add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
- bio_list_add(&nvmeq->sq_cong, split);
- bio_list_add(&nvmeq->sq_cong, bio);
- wake_up_process(nvme_thread);
-
+ end_submit:
+ nvme_process_cq(nvmeq);
+ spin_unlock_irq(&nvmeq->q_lock);
return 0;
}
/*
- * Called with local interrupts disabled and the q_lock held. May not sleep.
+ * Called with preemption disabled, may not sleep.
*/
-static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
- struct bio *bio)
+static int nvme_submit_req_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
+ struct request *req)
{
+ struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
struct nvme_iod *iod;
- int psegs = bio_phys_segments(ns->queue, bio);
- int result;
+ enum dma_data_direction dma_dir;
+ int psegs = req->nr_phys_segments;
- if ((bio->bi_rw & REQ_FLUSH) && psegs)
- return nvme_split_flush_data(nvmeq, bio);
-
- iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC);
+ iod = nvme_alloc_iod(psegs, blk_rq_bytes(req), GFP_ATOMIC);
if (!iod)
- return -ENOMEM;
+ return BLK_MQ_RQ_QUEUE_BUSY;
+
+ iod->private = req;
+
+ nvme_set_info(cmd, iod, req_completion);
+
+ if ((req->cmd_flags & REQ_FLUSH) && psegs) {
+ struct flush_cmd_info *flush_cmd = kmalloc(
+ sizeof(struct flush_cmd_info), GFP_KERNEL);
+ if (!flush_cmd)
+ goto free_iod;
+ flush_cmd->ns = ns;
+ flush_cmd->iod = iod;
+ nvme_set_info(cmd, flush_cmd, req_flush_completion);
+ }
- iod->private = bio;
- if (bio->bi_rw & REQ_DISCARD) {
+ if (req->cmd_flags & REQ_DISCARD) {
void *range;
/*
* We reuse the small pool to allocate the 16-byte range here
@@ -718,35 +635,29 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
range = dma_pool_alloc(nvmeq->dev->prp_small_pool,
GFP_ATOMIC,
&iod->first_dma);
- if (!range) {
- result = -ENOMEM;
- goto free_iod;
- }
+ if (!range)
+ goto finish_cmd;
iod_list(iod)[0] = (__le64 *)range;
iod->npages = 0;
} else if (psegs) {
- result = nvme_map_bio(nvmeq, iod, bio,
- bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
- psegs);
- if (result <= 0)
- goto free_iod;
- if (nvme_setup_prps(nvmeq->dev, iod, result, GFP_ATOMIC) !=
- result) {
- result = -ENOMEM;
- goto free_iod;
- }
- nvme_start_io_acct(bio);
- }
- if (unlikely(nvme_submit_iod(nvmeq, iod))) {
- if (!waitqueue_active(&nvmeq->sq_full))
- add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
- list_add_tail(&iod->node, &nvmeq->iod_bio);
+ dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
+
+ if (nvme_map_rq(nvmeq, iod, req, dma_dir, psegs) <= 0)
+ goto finish_cmd;
+
+ if (blk_rq_bytes(req) != nvme_setup_prps(nvmeq->dev, iod,
+ blk_rq_bytes(req), GFP_ATOMIC))
+ goto finish_cmd;
}
- return 0;
+ if (!nvme_submit_iod(nvmeq, iod, ns))
+ return 0;
+
+ finish_cmd:
+ nvme_finish_cmd(nvmeq, req->tag, NULL);
free_iod:
nvme_free_iod(nvmeq->dev, iod);
- return result;
+ return BLK_MQ_RQ_QUEUE_ERROR;
}
static int nvme_process_cq(struct nvme_queue *nvmeq)
@@ -767,8 +678,7 @@ static int nvme_process_cq(struct nvme_queue *nvmeq)
head = 0;
phase = !phase;
}
-
- ctx = free_cmdid(nvmeq, cqe.command_id, &fn);
+ ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn);
fn(nvmeq, ctx, &cqe);
}
@@ -785,34 +695,21 @@ static int nvme_process_cq(struct nvme_queue *nvmeq)
nvmeq->cq_head = head;
nvmeq->cq_phase = phase;
- nvmeq->cqe_seen = 1;
return 1;
}
-static void nvme_make_request(struct request_queue *q, struct bio *bio)
+static int nvme_queue_request(struct blk_mq_hw_ctx *hctx, struct request *req)
{
- struct nvme_ns *ns = q->queuedata;
- struct nvme_queue *nvmeq = get_nvmeq(ns->dev);
- int result = -EBUSY;
+ struct nvme_ns *ns = hctx->queue->queuedata;
+ struct nvme_queue *nvmeq = hctx->driver_data;
+ int result;
if (!nvmeq) {
- put_nvmeq(NULL);
- bio_endio(bio, -EIO);
- return;
- }
-
- spin_lock_irq(&nvmeq->q_lock);
- if (!nvmeq->q_suspended && bio_list_empty(&nvmeq->sq_cong))
- result = nvme_submit_bio_queue(nvmeq, ns, bio);
- if (unlikely(result)) {
- if (!waitqueue_active(&nvmeq->sq_full))
- add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
- bio_list_add(&nvmeq->sq_cong, bio);
+ return BLK_MQ_RQ_QUEUE_ERROR;
}
- nvme_process_cq(nvmeq);
- spin_unlock_irq(&nvmeq->q_lock);
- put_nvmeq(nvmeq);
+ result = nvme_submit_req_queue(nvmeq, ns, req);
+ return result;
}
static irqreturn_t nvme_irq(int irq, void *data)
@@ -820,9 +717,7 @@ static irqreturn_t nvme_irq(int irq, void *data)
irqreturn_t result;
struct nvme_queue *nvmeq = data;
spin_lock(&nvmeq->q_lock);
- nvme_process_cq(nvmeq);
- result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE;
- nvmeq->cqe_seen = 0;
+ result = nvme_process_cq(nvmeq) ? IRQ_HANDLED : IRQ_NONE;
spin_unlock(&nvmeq->q_lock);
return result;
}
@@ -838,9 +733,11 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid)
{
- spin_lock_irq(&nvmeq->q_lock);
- cancel_cmdid(nvmeq, cmdid, NULL);
- spin_unlock_irq(&nvmeq->q_lock);
+ /*
+ * spin_lock_irq(&nvmeq->q_lock);
+ * cancel_cmd_info(nvmeq, cmdid, NULL);
+ * spin_unlock_irq(&nvmeq->q_lock);
+ */
}
struct sync_cmd_info {
@@ -862,46 +759,31 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx,
* Returns 0 on success. If the result is negative, it's a Linux error code;
* if the result is positive, it's an NVM Express status code
*/
-static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx,
- struct nvme_command *cmd,
+static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd,
u32 *result, unsigned timeout)
{
- int cmdid, ret;
+ int ret;
struct sync_cmd_info cmdinfo;
- struct nvme_queue *nvmeq;
-
- nvmeq = lock_nvmeq(dev, q_idx);
- if (!nvmeq) {
- unlock_nvmeq(nvmeq);
- return -ENODEV;
- }
+ struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = cmd_rq->nvmeq;
cmdinfo.task = current;
cmdinfo.status = -EINTR;
- cmdid = alloc_cmdid(nvmeq, &cmdinfo, sync_completion, timeout);
- if (cmdid < 0) {
- unlock_nvmeq(nvmeq);
- return cmdid;
- }
- cmd->common.command_id = cmdid;
+ cmd->common.command_id = req->tag;
+
+ nvme_set_info(cmd_rq, &cmdinfo, sync_completion);
set_current_state(TASK_KILLABLE);
ret = nvme_submit_cmd(nvmeq, cmd);
if (ret) {
- free_cmdid(nvmeq, cmdid, NULL);
- unlock_nvmeq(nvmeq);
+ nvme_finish_cmd(nvmeq, req->tag, NULL);
set_current_state(TASK_RUNNING);
- return ret;
}
- unlock_nvmeq(nvmeq);
schedule_timeout(timeout);
if (cmdinfo.status == -EINTR) {
- nvmeq = lock_nvmeq(dev, q_idx);
- if (nvmeq)
- nvme_abort_command(nvmeq, cmdid);
- unlock_nvmeq(nvmeq);
+ nvme_abort_command(nvmeq, req->tag);
return -EINTR;
}
@@ -915,55 +797,93 @@ static int nvme_submit_async_cmd(struct nvme_queue *nvmeq,
struct nvme_command *cmd,
struct async_cmd_info *cmdinfo, unsigned timeout)
{
- int cmdid;
+ struct request *req;
+ struct nvme_cmd_info *cmd_rq;
- cmdid = alloc_cmdid_killable(nvmeq, cmdinfo, async_completion, timeout);
- if (cmdid < 0)
- return cmdid;
+ req = blk_mq_alloc_request(nvmeq->hctx->queue, WRITE, GFP_KERNEL,
+ false);
+ if (!req)
+ return -ENOMEM;
+
+ req->timeout = timeout;
+ cmd_rq = blk_mq_rq_to_pdu(req);
+ cmdinfo->req = req;
+ nvme_set_info(cmd_rq, cmdinfo, async_completion);
cmdinfo->status = -EINTR;
- cmd->common.command_id = cmdid;
+
+ cmd->common.command_id = req->tag;
+
return nvme_submit_cmd(nvmeq, cmd);
}
+int __nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
+ u32 *result, unsigned timeout)
+{
+ int res;
+ struct request *req;
+
+ req = blk_mq_alloc_request(dev->admin_rq, WRITE, GFP_KERNEL, false);
+ if (!req)
+ return -ENOMEM;
+ res = nvme_submit_sync_cmd(req, cmd, result, timeout);
+ blk_put_request(req);
+ return res;
+}
+
int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
u32 *result)
{
- return nvme_submit_sync_cmd(dev, 0, cmd, result, ADMIN_TIMEOUT);
+ return __nvme_submit_admin_cmd(dev, cmd, result, ADMIN_TIMEOUT);
}
int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
u32 *result)
{
- return nvme_submit_sync_cmd(dev, smp_processor_id() + 1, cmd, result,
- NVME_IO_TIMEOUT);
+ int res;
+ struct request *req;
+ struct nvme_queue *nvmeq;
+
+ nvmeq = dev->queues[(smp_processor_id() + 1) % dev->online_queues];
+ if (!nvmeq)
+ return -ENODEV;
+
+ req = blk_mq_alloc_request(nvmeq->hctx->queue, WRITE, GFP_KERNEL,
+ false);
+ if (!req)
+ return -ENOMEM;
+
+ if (nvmeq->q_suspended) {
+ res = -EBUSY;
+ goto end_sync_io;
+ }
+
+ res = nvme_submit_sync_cmd(req, cmd, result, NVME_IO_TIMEOUT);
+ end_sync_io:
+ blk_put_request(req);
+ return res;
}
static int nvme_submit_admin_cmd_async(struct nvme_dev *dev,
struct nvme_command *cmd, struct async_cmd_info *cmdinfo)
{
- return nvme_submit_async_cmd(raw_nvmeq(dev, 0), cmd, cmdinfo,
+ return nvme_submit_async_cmd(dev->queues[0], cmd, cmdinfo,
ADMIN_TIMEOUT);
}
static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
- int status;
struct nvme_command c;
memset(&c, 0, sizeof(c));
c.delete_queue.opcode = opcode;
c.delete_queue.qid = cpu_to_le16(id);
- status = nvme_submit_admin_cmd(dev, &c, NULL);
- if (status)
- return -EIO;
- return 0;
+ return nvme_submit_admin_cmd(dev, &c, NULL);
}
static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
struct nvme_queue *nvmeq)
{
- int status;
struct nvme_command c;
int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
@@ -975,16 +895,12 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
c.create_cq.cq_flags = cpu_to_le16(flags);
c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
- status = nvme_submit_admin_cmd(dev, &c, NULL);
- if (status)
- return -EIO;
- return 0;
+ return nvme_submit_admin_cmd(dev, &c, NULL);
}
static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
struct nvme_queue *nvmeq)
{
- int status;
struct nvme_command c;
int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
@@ -996,10 +912,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
c.create_sq.sq_flags = cpu_to_le16(flags);
c.create_sq.cqid = cpu_to_le16(qid);
- status = nvme_submit_admin_cmd(dev, &c, NULL);
- if (status)
- return -EIO;
- return 0;
+ return nvme_submit_admin_cmd(dev, &c, NULL);
}
static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
@@ -1055,28 +968,27 @@ int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
}
/**
- * nvme_abort_cmd - Attempt aborting a command
- * @cmdid: Command id of a timed out IO
- * @queue: The queue with timed out IO
+ * nvme_abort_req - Attempt aborting a rq
*
* Schedule controller reset if the command was already aborted once before and
* still hasn't been returned to the driver, or if this is the admin queue.
*/
-static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq)
+static void nvme_abort_req(struct request *req)
{
- int a_cmdid;
- struct nvme_command cmd;
+ struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = cmd_rq->nvmeq;
struct nvme_dev *dev = nvmeq->dev;
- struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
- struct nvme_queue *adminq;
+ struct nvme_command cmd;
+ struct request *abort_req;
+ struct nvme_cmd_info *abort_cmd;
- if (!nvmeq->qid || info[cmdid].aborted) {
+ if (!nvmeq->qid || cmd_rq->aborted) {
if (work_busy(&dev->reset_work))
return;
list_del_init(&dev->node);
dev_warn(&dev->pci_dev->dev,
- "I/O %d QID %d timeout, reset controller\n", cmdid,
- nvmeq->qid);
+ "I/O %d QID %d timeout, reset controller\n",
+ req->tag, nvmeq->qid);
dev->reset_workfn = nvme_reset_failed_dev;
queue_work(nvme_workq, &dev->reset_work);
return;
@@ -1085,83 +997,106 @@ static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq)
if (!dev->abort_limit)
return;
- adminq = rcu_dereference(dev->queues[0]);
- a_cmdid = alloc_cmdid(adminq, CMD_CTX_ABORT, special_completion,
- ADMIN_TIMEOUT);
- if (a_cmdid < 0)
- return;
+ abort_req = blk_mq_alloc_request(dev->admin_rq, WRITE, GFP_KERNEL,
+ true);
+ abort_cmd = blk_mq_rq_to_pdu(abort_req);
+ nvme_set_info(abort_cmd, CMD_CTX_ABORT, special_completion);
memset(&cmd, 0, sizeof(cmd));
cmd.abort.opcode = nvme_admin_abort_cmd;
- cmd.abort.cid = cmdid;
+ cmd.abort.cid = req->tag;
cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
- cmd.abort.command_id = a_cmdid;
+ cmd.abort.command_id = abort_req->tag;
--dev->abort_limit;
- info[cmdid].aborted = 1;
- info[cmdid].timeout = jiffies + ADMIN_TIMEOUT;
+ cmd_rq->aborted = 1;
- dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", cmdid,
+ dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag,
nvmeq->qid);
- nvme_submit_cmd(adminq, &cmd);
+ nvme_submit_cmd(dev->queues[0], &cmd);
}
-/**
- * nvme_cancel_ios - Cancel outstanding I/Os
- * @queue: The queue to cancel I/Os on
- * @timeout: True to only cancel I/Os which have timed out
- */
-static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
+static void nvme_cancel_queue_ios(void *data, unsigned long *tag_map)
{
- int depth = nvmeq->q_depth - 1;
- struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
- unsigned long now = jiffies;
- int cmdid;
+ struct nvme_queue *nvmeq = (struct nvme_queue *) data;
+ unsigned int tag = 0;
+ struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
- for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
+ tag = 0;
+ do {
+ struct request *req;
void *ctx;
nvme_completion_fn fn;
+ struct nvme_cmd_info *cmd;
static struct nvme_completion cqe = {
.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),
};
+ int qdepth = nvmeq == nvmeq->dev->queues[0] ?
+ nvmeq->dev->admin_tagset.queue_depth :
+ nvmeq->dev->tagset.queue_depth;
- if (timeout && !time_after(now, info[cmdid].timeout))
- continue;
- if (info[cmdid].ctx == CMD_CTX_CANCELLED)
+ /* zero'd bits are free tags */
+ tag = find_next_zero_bit(tag_map, qdepth, tag);
+ if (tag >= qdepth)
+ break;
+
+ req = blk_mq_tag_to_rq(hctx->tags, tag++);
+ cmd = blk_mq_rq_to_pdu(req);
+
+ /* TODO: is this test necessary? */
+ if (!test_bit(1 /* REQ_ATOM_STARTED */, &req->atomic_flags))
continue;
- if (timeout && nvmeq->dev->initialized) {
- nvme_abort_cmd(cmdid, nvmeq);
+
+ if (cmd->ctx == CMD_CTX_CANCELLED)
continue;
- }
- dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", cmdid,
- nvmeq->qid);
- ctx = cancel_cmdid(nvmeq, cmdid, &fn);
+
+ /* TODO: should we send abort request to hw? */
+
+ dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n",
+ req->tag, nvmeq->qid);
+ ctx = cancel_cmd_info(cmd, &fn);
fn(nvmeq, ctx, &cqe);
- }
+
+ } while (1);
}
-static void nvme_free_queue(struct rcu_head *r)
+/**
+ * nvme_cancel_ios - Cancel outstanding I/Os
+ * @queue: The queue to cancel I/Os on
+ * @timeout: True to only cancel I/Os which have timed out
+ */
+static void nvme_cancel_ios(struct nvme_queue *nvmeq)
{
- struct nvme_queue *nvmeq = container_of(r, struct nvme_queue, r_head);
+ struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
- spin_lock_irq(&nvmeq->q_lock);
- while (bio_list_peek(&nvmeq->sq_cong)) {
- struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
- bio_endio(bio, -EIO);
- }
- while (!list_empty(&nvmeq->iod_bio)) {
- static struct nvme_completion cqe = {
- .status = cpu_to_le16(
- (NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1),
- };
- struct nvme_iod *iod = list_first_entry(&nvmeq->iod_bio,
- struct nvme_iod,
- node);
- list_del(&iod->node);
- bio_completion(nvmeq, iod, &cqe);
- }
- spin_unlock_irq(&nvmeq->q_lock);
+ if (nvmeq->dev->initialized)
+ blk_mq_tag_busy_iter(hctx->tags, nvme_cancel_queue_ios, nvmeq);
+}
+
+static enum blk_eh_timer_return nvme_timeout(struct request *req)
+{
+ void *ctx;
+ struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = cmd->nvmeq;
+ static struct nvme_completion cqe = {
+ .status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),
+ };
+ nvme_completion_fn fn;
+ dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", req->tag,
+ nvmeq->qid);
+
+ if (nvmeq->dev->initialized)
+ nvme_abort_req(req);
+
+ ctx = cancel_cmd_info(cmd, &fn);
+ fn(nvmeq, ctx, &cqe);
+
+ return BLK_EH_HANDLED;
+}
+
+static void nvme_free_queue(struct nvme_queue *nvmeq)
+{
dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
@@ -1176,10 +1111,10 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
int i;
for (i = dev->queue_count - 1; i >= lowest; i--) {
- struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
- rcu_assign_pointer(dev->queues[i], NULL);
- call_rcu(&nvmeq->r_head, nvme_free_queue);
+ struct nvme_queue *nvmeq = dev->queues[i];
+ nvme_free_queue(nvmeq);
dev->queue_count--;
+ dev->queues[i] = NULL;
}
}
@@ -1212,13 +1147,13 @@ static void nvme_clear_queue(struct nvme_queue *nvmeq)
{
spin_lock_irq(&nvmeq->q_lock);
nvme_process_cq(nvmeq);
- nvme_cancel_ios(nvmeq, false);
+ nvme_cancel_ios(nvmeq);
spin_unlock_irq(&nvmeq->q_lock);
}
static void nvme_disable_queue(struct nvme_dev *dev, int qid)
{
- struct nvme_queue *nvmeq = raw_nvmeq(dev, qid);
+ struct nvme_queue *nvmeq = dev->queues[qid];
if (!nvmeq)
return;
@@ -1238,8 +1173,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
int depth, int vector)
{
struct device *dmadev = &dev->pci_dev->dev;
- unsigned extra = nvme_queue_extra(depth);
- struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
+ struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL);
if (!nvmeq)
return NULL;
@@ -1264,17 +1198,13 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
spin_lock_init(&nvmeq->q_lock);
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
- init_waitqueue_head(&nvmeq->sq_full);
- init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
- bio_list_init(&nvmeq->sq_cong);
- INIT_LIST_HEAD(&nvmeq->iod_bio);
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
nvmeq->q_depth = depth;
nvmeq->cq_vector = vector;
nvmeq->qid = qid;
nvmeq->q_suspended = 1;
dev->queue_count++;
- rcu_assign_pointer(dev->queues[qid], nvmeq);
+ dev->queues[qid] = nvmeq;
return nvmeq;
@@ -1303,15 +1233,12 @@ static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
struct nvme_dev *dev = nvmeq->dev;
- unsigned extra = nvme_queue_extra(nvmeq->q_depth);
nvmeq->sq_tail = 0;
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
- memset(nvmeq->cmdid_data, 0, extra);
memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
- nvme_cancel_ios(nvmeq, false);
nvmeq->q_suspended = 0;
dev->online_queues++;
}
@@ -1412,6 +1339,55 @@ static int nvme_shutdown_ctrl(struct nvme_dev *dev)
return 0;
}
+static struct blk_mq_ops nvme_mq_admin_ops = {
+ .queue_rq = nvme_queue_request,
+ .map_queue = blk_mq_map_queue,
+ .init_hctx = nvme_init_admin_hctx,
+ .init_request = nvme_init_admin_request,
+ .timeout = nvme_timeout,
+};
+
+static struct blk_mq_ops nvme_mq_ops = {
+ .queue_rq = nvme_queue_request,
+ .map_queue = blk_mq_map_queue,
+ .init_hctx = nvme_init_hctx,
+ .init_request = nvme_init_request,
+ .timeout = nvme_timeout,
+};
+
+static int nvme_alloc_admin_tags(struct nvme_dev *dev)
+{
+ if (!dev->admin_rq) {
+ dev->admin_tagset.ops = &nvme_mq_admin_ops;
+ dev->admin_tagset.nr_hw_queues = 1;
+ dev->admin_tagset.queue_depth = NVME_AQ_DEPTH;
+ dev->admin_tagset.reserved_tags = 1;
+ dev->admin_tagset.timeout = ADMIN_TIMEOUT;
+ dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
+ dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info);
+ dev->admin_tagset.driver_data = dev;
+
+ if (blk_mq_alloc_tag_set(&dev->admin_tagset))
+ return -ENOMEM;
+
+ dev->admin_rq = blk_mq_init_queue(&dev->admin_tagset);
+ if (!dev->admin_rq) {
+ memset(&dev->admin_tagset, 0,
+ sizeof(dev->admin_tagset));
+ blk_mq_free_tag_set(&dev->admin_tagset);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+static void nvme_free_admin_tags(struct nvme_dev *dev)
+{
+ if (dev->admin_rq)
+ blk_mq_free_tag_set(&dev->admin_tagset);
+}
+
static int nvme_configure_admin_queue(struct nvme_dev *dev)
{
int result;
@@ -1423,9 +1399,9 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
if (result < 0)
return result;
- nvmeq = raw_nvmeq(dev, 0);
+ nvmeq = dev->queues[0];
if (!nvmeq) {
- nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
+ nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, 0);
if (!nvmeq)
return -ENOMEM;
}
@@ -1445,16 +1421,26 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
result = nvme_enable_ctrl(dev, cap);
if (result)
- return result;
+ goto free_nvmeq;
+
+ result = nvme_alloc_admin_tags(dev);
+ if (result)
+ goto free_nvmeq;
result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
if (result)
- return result;
+ goto free_tags;
spin_lock_irq(&nvmeq->q_lock);
nvme_init_queue(nvmeq, 0);
spin_unlock_irq(&nvmeq->q_lock);
return result;
+
+ free_tags:
+ nvme_free_admin_tags(dev);
+ free_nvmeq:
+ nvme_free_queues(dev, 0);
+ return result;
}
struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
@@ -1681,10 +1667,11 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev,
timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) :
ADMIN_TIMEOUT;
+
if (length != cmd.data_len)
status = -ENOMEM;
else
- status = nvme_submit_sync_cmd(dev, 0, &c, &cmd.result, timeout);
+ status = __nvme_submit_admin_cmd(dev, &c, &cmd.result, timeout);
if (cmd.data_len) {
nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
@@ -1773,41 +1760,6 @@ static const struct block_device_operations nvme_fops = {
.getgeo = nvme_getgeo,
};
-static void nvme_resubmit_iods(struct nvme_queue *nvmeq)
-{
- struct nvme_iod *iod, *next;
-
- list_for_each_entry_safe(iod, next, &nvmeq->iod_bio, node) {
- if (unlikely(nvme_submit_iod(nvmeq, iod)))
- break;
- list_del(&iod->node);
- if (bio_list_empty(&nvmeq->sq_cong) &&
- list_empty(&nvmeq->iod_bio))
- remove_wait_queue(&nvmeq->sq_full,
- &nvmeq->sq_cong_wait);
- }
-}
-
-static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
-{
- while (bio_list_peek(&nvmeq->sq_cong)) {
- struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
- struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
-
- if (bio_list_empty(&nvmeq->sq_cong) &&
- list_empty(&nvmeq->iod_bio))
- remove_wait_queue(&nvmeq->sq_full,
- &nvmeq->sq_cong_wait);
- if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
- if (!waitqueue_active(&nvmeq->sq_full))
- add_wait_queue(&nvmeq->sq_full,
- &nvmeq->sq_cong_wait);
- bio_list_add_head(&nvmeq->sq_cong, bio);
- break;
- }
- }
-}
-
static int nvme_kthread(void *data)
{
struct nvme_dev *dev, *next;
@@ -1828,23 +1780,17 @@ static int nvme_kthread(void *data)
queue_work(nvme_workq, &dev->reset_work);
continue;
}
- rcu_read_lock();
for (i = 0; i < dev->queue_count; i++) {
- struct nvme_queue *nvmeq =
- rcu_dereference(dev->queues[i]);
+ struct nvme_queue *nvmeq = dev->queues[i];
if (!nvmeq)
continue;
spin_lock_irq(&nvmeq->q_lock);
if (nvmeq->q_suspended)
goto unlock;
nvme_process_cq(nvmeq);
- nvme_cancel_ios(nvmeq, true);
- nvme_resubmit_bios(nvmeq);
- nvme_resubmit_iods(nvmeq);
unlock:
spin_unlock_irq(&nvmeq->q_lock);
}
- rcu_read_unlock();
}
spin_unlock(&dev_list_lock);
schedule_timeout(round_jiffies_relative(HZ));
@@ -1867,27 +1813,29 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
{
struct nvme_ns *ns;
struct gendisk *disk;
+ int node = dev_to_node(&dev->pci_dev->dev);
int lbaf;
if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
return NULL;
- ns = kzalloc(sizeof(*ns), GFP_KERNEL);
+ ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
if (!ns)
return NULL;
- ns->queue = blk_alloc_queue(GFP_KERNEL);
+ ns->queue = blk_mq_init_queue(&dev->tagset);
if (!ns->queue)
goto out_free_ns;
- ns->queue->queue_flags = QUEUE_FLAG_DEFAULT;
+ queue_flag_set_unlocked(QUEUE_FLAG_DEFAULT, ns->queue);
queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
- blk_queue_make_request(ns->queue, nvme_make_request);
+ queue_flag_clear_unlocked(QUEUE_FLAG_IO_STAT, ns->queue);
ns->dev = dev;
ns->queue->queuedata = ns;
- disk = alloc_disk(0);
+ disk = alloc_disk_node(0, node);
if (!disk)
goto out_free_queue;
+
ns->ns_id = nsid;
ns->disk = disk;
lbaf = id->flbas & 0xf;
@@ -1921,143 +1869,19 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
return NULL;
}
-static int nvme_find_closest_node(int node)
-{
- int n, val, min_val = INT_MAX, best_node = node;
-
- for_each_online_node(n) {
- if (n == node)
- continue;
- val = node_distance(node, n);
- if (val < min_val) {
- min_val = val;
- best_node = n;
- }
- }
- return best_node;
-}
-
-static void nvme_set_queue_cpus(cpumask_t *qmask, struct nvme_queue *nvmeq,
- int count)
-{
- int cpu;
- for_each_cpu(cpu, qmask) {
- if (cpumask_weight(nvmeq->cpu_mask) >= count)
- break;
- if (!cpumask_test_and_set_cpu(cpu, nvmeq->cpu_mask))
- *per_cpu_ptr(nvmeq->dev->io_queue, cpu) = nvmeq->qid;
- }
-}
-
-static void nvme_add_cpus(cpumask_t *mask, const cpumask_t *unassigned_cpus,
- const cpumask_t *new_mask, struct nvme_queue *nvmeq, int cpus_per_queue)
-{
- int next_cpu;
- for_each_cpu(next_cpu, new_mask) {
- cpumask_or(mask, mask, get_cpu_mask(next_cpu));
- cpumask_or(mask, mask, topology_thread_cpumask(next_cpu));
- cpumask_and(mask, mask, unassigned_cpus);
- nvme_set_queue_cpus(mask, nvmeq, cpus_per_queue);
- }
-}
-
static void nvme_create_io_queues(struct nvme_dev *dev)
{
- unsigned i, max;
+ unsigned i;
- max = min(dev->max_qid, num_online_cpus());
- for (i = dev->queue_count; i <= max; i++)
+ for (i = dev->queue_count; i <= dev->max_qid; i++)
if (!nvme_alloc_queue(dev, i, dev->q_depth, i - 1))
break;
- max = min(dev->queue_count - 1, num_online_cpus());
- for (i = dev->online_queues; i <= max; i++)
- if (nvme_create_queue(raw_nvmeq(dev, i), i))
+ for (i = dev->online_queues; i <= dev->max_qid; i++)
+ if (nvme_create_queue(dev->queues[i], i))
break;
}
-/*
- * If there are fewer queues than online cpus, this will try to optimally
- * assign a queue to multiple cpus by grouping cpus that are "close" together:
- * thread siblings, core, socket, closest node, then whatever else is
- * available.
- */
-static void nvme_assign_io_queues(struct nvme_dev *dev)
-{
- unsigned cpu, cpus_per_queue, queues, remainder, i;
- cpumask_var_t unassigned_cpus;
-
- nvme_create_io_queues(dev);
-
- queues = min(dev->online_queues - 1, num_online_cpus());
- if (!queues)
- return;
-
- cpus_per_queue = num_online_cpus() / queues;
- remainder = queues - (num_online_cpus() - queues * cpus_per_queue);
-
- if (!alloc_cpumask_var(&unassigned_cpus, GFP_KERNEL))
- return;
-
- cpumask_copy(unassigned_cpus, cpu_online_mask);
- cpu = cpumask_first(unassigned_cpus);
- for (i = 1; i <= queues; i++) {
- struct nvme_queue *nvmeq = lock_nvmeq(dev, i);
- cpumask_t mask;
-
- cpumask_clear(nvmeq->cpu_mask);
- if (!cpumask_weight(unassigned_cpus)) {
- unlock_nvmeq(nvmeq);
- break;
- }
-
- mask = *get_cpu_mask(cpu);
- nvme_set_queue_cpus(&mask, nvmeq, cpus_per_queue);
- if (cpus_weight(mask) < cpus_per_queue)
- nvme_add_cpus(&mask, unassigned_cpus,
- topology_thread_cpumask(cpu),
- nvmeq, cpus_per_queue);
- if (cpus_weight(mask) < cpus_per_queue)
- nvme_add_cpus(&mask, unassigned_cpus,
- topology_core_cpumask(cpu),
- nvmeq, cpus_per_queue);
- if (cpus_weight(mask) < cpus_per_queue)
- nvme_add_cpus(&mask, unassigned_cpus,
- cpumask_of_node(cpu_to_node(cpu)),
- nvmeq, cpus_per_queue);
- if (cpus_weight(mask) < cpus_per_queue)
- nvme_add_cpus(&mask, unassigned_cpus,
- cpumask_of_node(
- nvme_find_closest_node(
- cpu_to_node(cpu))),
- nvmeq, cpus_per_queue);
- if (cpus_weight(mask) < cpus_per_queue)
- nvme_add_cpus(&mask, unassigned_cpus,
- unassigned_cpus,
- nvmeq, cpus_per_queue);
-
- WARN(cpumask_weight(nvmeq->cpu_mask) != cpus_per_queue,
- "nvme%d qid:%d mis-matched queue-to-cpu assignment\n",
- dev->instance, i);
-
- irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
- nvmeq->cpu_mask);
- cpumask_andnot(unassigned_cpus, unassigned_cpus,
- nvmeq->cpu_mask);
- cpu = cpumask_next(cpu, unassigned_cpus);
- if (remainder && !--remainder)
- cpus_per_queue++;
- unlock_nvmeq(nvmeq);
- }
- WARN(cpumask_weight(unassigned_cpus), "nvme%d unassigned online cpus\n",
- dev->instance);
- i = 0;
- cpumask_andnot(unassigned_cpus, cpu_possible_mask, cpu_online_mask);
- for_each_cpu(cpu, unassigned_cpus)
- *per_cpu_ptr(dev->io_queue, cpu) = (i++ % queues) + 1;
- free_cpumask_var(unassigned_cpus);
-}
-
static int set_queue_count(struct nvme_dev *dev, int count)
{
int status;
@@ -2081,22 +1905,9 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
}
-static int nvme_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- struct nvme_dev *dev = container_of(self, struct nvme_dev, nb);
- switch (action) {
- case CPU_ONLINE:
- case CPU_DEAD:
- nvme_assign_io_queues(dev);
- break;
- }
- return NOTIFY_OK;
-}
-
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
- struct nvme_queue *adminq = raw_nvmeq(dev, 0);
+ struct nvme_queue *adminq = dev->queues[0];
struct pci_dev *pdev = dev->pci_dev;
int result, i, vecs, nr_io_queues, size;
@@ -2155,12 +1966,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
/* Free previously allocated queues that are no longer usable */
nvme_free_queues(dev, nr_io_queues + 1);
- nvme_assign_io_queues(dev);
-
- dev->nb.notifier_call = &nvme_cpu_notify;
- result = register_hotcpu_notifier(&dev->nb);
- if (result)
- goto free_queues;
+ nvme_create_io_queues(dev);
return 0;
@@ -2212,6 +2018,18 @@ static int nvme_dev_add(struct nvme_dev *dev)
(pdev->device == 0x0953) && ctrl->vs[3])
dev->stripe_size = 1 << (ctrl->vs[3] + shift);
+ dev->tagset.ops = &nvme_mq_ops;
+ dev->tagset.nr_hw_queues = dev->queue_count - 1;
+ dev->tagset.timeout = NVME_IO_TIMEOUT;
+ dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
+ dev->tagset.queue_depth = min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH);
+ dev->tagset.cmd_size = sizeof(struct nvme_cmd_info);
+ dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
+ dev->tagset.driver_data = dev;
+
+ if (blk_mq_alloc_tag_set(&dev->tagset))
+ goto out;
+
id_ns = mem;
for (i = 1; i <= nn; i++) {
res = nvme_identify(dev, i, 0, dma_addr);
@@ -2423,7 +2241,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev)
atomic_set(&dq.refcount, 0);
dq.worker = &worker;
for (i = dev->queue_count - 1; i > 0; i--) {
- struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
+ struct nvme_queue *nvmeq = dev->queues[i];
if (nvme_suspend_queue(nvmeq))
continue;
@@ -2461,13 +2279,12 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
int i;
dev->initialized = 0;
- unregister_hotcpu_notifier(&dev->nb);
nvme_dev_list_remove(dev);
if (!dev->bar || (dev->bar && readl(&dev->bar->csts) == -1)) {
for (i = dev->queue_count - 1; i >= 0; i--) {
- struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
+ struct nvme_queue *nvmeq = dev->queues[i];
nvme_suspend_queue(nvmeq);
nvme_clear_queue(nvmeq);
}
@@ -2560,7 +2377,7 @@ static void nvme_free_dev(struct kref *kref)
struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
nvme_free_namespaces(dev);
- free_percpu(dev->io_queue);
+ blk_mq_free_tag_set(&dev->tagset);
kfree(dev->queues);
kfree(dev->entry);
kfree(dev);
@@ -2712,23 +2529,24 @@ static void nvme_reset_workfn(struct work_struct *work)
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
- int result = -ENOMEM;
+ int node, result = -ENOMEM;
struct nvme_dev *dev;
- dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ node = dev_to_node(&pdev->dev);
+ if (node == NUMA_NO_NODE)
+ set_dev_node(&pdev->dev, 0);
+
+ dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
if (!dev)
return -ENOMEM;
- dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry),
- GFP_KERNEL);
+ dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry),
+ GFP_KERNEL, node);
if (!dev->entry)
goto free;
- dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *),
- GFP_KERNEL);
+ dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
+ GFP_KERNEL, node);
if (!dev->queues)
goto free;
- dev->io_queue = alloc_percpu(unsigned short);
- if (!dev->io_queue)
- goto free;
INIT_LIST_HEAD(&dev->namespaces);
dev->reset_workfn = nvme_reset_failed_dev;
@@ -2779,7 +2597,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
release:
nvme_release_instance(dev);
free:
- free_percpu(dev->io_queue);
kfree(dev->queues);
kfree(dev->entry);
kfree(dev);
@@ -2806,7 +2623,7 @@ static void nvme_remove(struct pci_dev *pdev)
nvme_dev_remove(dev);
nvme_dev_shutdown(dev);
nvme_free_queues(dev, 0);
- rcu_barrier();
+ nvme_free_admin_tags(dev);
nvme_release_instance(dev);
nvme_release_prp_pools(dev);
kref_put(&dev->kref, nvme_free_dev);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 1813cfd..3fa3de4 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -19,6 +19,7 @@
#include <linux/pci.h>
#include <linux/miscdevice.h>
#include <linux/kref.h>
+#include <linux/blk-mq.h>
struct nvme_bar {
__u64 cap; /* Controller Capabilities */
@@ -70,8 +71,10 @@ extern unsigned char io_timeout;
*/
struct nvme_dev {
struct list_head node;
- struct nvme_queue __rcu **queues;
- unsigned short __percpu *io_queue;
+ struct nvme_queue **queues;
+ struct request_queue *admin_rq;
+ struct blk_mq_tag_set admin_tagset;
+ struct blk_mq_tag_set tagset;
u32 __iomem *dbs;
struct pci_dev *pci_dev;
struct dma_pool *prp_page_pool;
@@ -90,7 +93,6 @@ struct nvme_dev {
struct miscdevice miscdev;
work_func_t reset_workfn;
struct work_struct reset_work;
- struct notifier_block nb;
char name[12];
char serial[20];
char model[40];
@@ -132,7 +134,6 @@ struct nvme_iod {
int offset; /* Of PRP list */
int nents; /* Used in scatterlist */
int length; /* Of data, in bytes */
- unsigned long start_time;
dma_addr_t first_dma;
struct list_head node;
struct scatterlist sg[0];
@@ -150,7 +151,7 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector)
*/
void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod);
-int nvme_setup_prps(struct nvme_dev *, struct nvme_iod *, int , gfp_t);
+int nvme_setup_prps(struct nvme_dev *, struct nvme_iod *, int, gfp_t);
struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
unsigned long addr, unsigned length);
void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
--
1.9.1
* Re: [PATCH V3] NVMe: basic conversion to blk-mq
2014-05-28 22:59 ` [PATCH V3] NVMe: " Matias Bjørling
@ 2014-05-29 3:07 ` Keith Busch
2014-05-29 14:25 ` Jens Axboe
From: Keith Busch @ 2014-05-29 3:07 UTC
To: Matias Bjørling; +Cc: willy, keith.busch, sbradshaw, axboe, linux-kernel
On Wed, 28 May 2014, Matias Bjørling wrote:
> This converts the current NVMe driver to utilize the blk-mq layer.
I am concerned about device hot removal, since the h/w queues can be
freed at any time. I *think* blk-mq helps with this in that the driver
will not see a new request after calling blk_cleanup_queue. If you can
confirm that's true, and that blk-mq waits for all requests active in the
driver to return to the block layer, then we're probably okay in this
path. That wasn't true for the bio-based driver, which is why we are
cautious with all the locking and barriers. But what about the IOCTL paths?
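For reference, the teardown ordering I would expect on removal, assuming
blk_cleanup_queue() really does drain everything blk-mq has handed to the
driver (rough sketch only, names taken from the patch):

	/* stop new requests before tearing down the hw queues */
	list_for_each_entry(ns, &dev->namespaces, list)
		blk_cleanup_queue(ns->queue);	/* no new ->queue_rq() after this */
	nvme_dev_shutdown(dev);			/* disable/delete the hw queues */
	nvme_free_queues(dev, 0);		/* now safe to free struct nvme_queue */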
It also doesn't look like we're handling the case where the SGL can't
map to a PRP.
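Just to spell out the constraint: every element after the first must start
page aligned and every element before the last must end on a page boundary,
so something along these lines would be needed after blk_rq_map_sg()
(illustrative only, the sg/iod names are from the patch):

	struct scatterlist *sg;
	int i;

	for_each_sg(iod->sg, sg, iod->nents, i) {
		if (i && sg->offset)
			return -EINVAL;		/* not expressible as PRPs */
		if (i != iod->nents - 1 &&
		    (sg->offset + sg->length) & (PAGE_SIZE - 1))
			return -EINVAL;		/* not expressible as PRPs */
	}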
> +static void req_completion(struct nvme_queue *nvmeq, void *ctx,
> struct nvme_completion *cqe)
> {
> struct nvme_iod *iod = ctx;
> - struct bio *bio = iod->private;
> + struct request *req = iod->private;
> +
> u16 status = le16_to_cpup(&cqe->status) >> 1;
>
> - if (unlikely(status)) {
> - if (!(status & NVME_SC_DNR ||
> - bio->bi_rw & REQ_FAILFAST_MASK) &&
> - (jiffies - iod->start_time) < IOD_TIMEOUT) {
> - if (!waitqueue_active(&nvmeq->sq_full))
> - add_wait_queue(&nvmeq->sq_full,
> - &nvmeq->sq_cong_wait);
> - list_add_tail(&iod->node, &nvmeq->iod_bio);
> - wake_up(&nvmeq->sq_full);
> - return;
> - }
> - }
> if (iod->nents) {
> - dma_unmap_sg(nvmeq->q_dmadev, iod->sg, iod->nents,
> - bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
> - nvme_end_io_acct(bio, iod->start_time);
> + dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents,
> + rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
> }
> nvme_free_iod(nvmeq->dev, iod);
> - if (status)
> - bio_endio(bio, -EIO);
> +
> + if (unlikely(status))
> + req->errors = -EIO;
> else
> - bio_endio(bio, 0);
> + req->errors = 0;
> +
> + blk_mq_complete_request(req);
> }
Is blk-mq going to retry intermittently failed commands for me? It
doesn't look like it will.
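If not, the driver would still need something like this in req_completion()
(rough idea only, assuming blk_mq_requeue_request() from the 3.16/core tree
may be called from the completion path, and glossing over the fact that the
iod would have to be unmapped and freed before requeueing):

	if (unlikely(status)) {
		if (!(status & NVME_SC_DNR ||
				req->cmd_flags & REQ_FAILFAST_MASK)) {
			blk_mq_requeue_request(req);
			return;
		}
	}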
> +static int nvme_submit_flush_sync(struct nvme_queue *nvmeq, struct nvme_ns *ns)
> +{
> + struct request *req;
> + struct nvme_command cmnd;
> +
> + req = blk_mq_alloc_request(ns->queue, WRITE, GFP_KERNEL, false);
> + if (!req)
> + return -ENOMEM;
> +
> + nvme_setup_flush(&cmnd, ns, req->tag);
> + nvme_submit_sync_cmd(req, &cmnd, NULL, NVME_IO_TIMEOUT);
>
> return 0;
> }
It looks like this function above is being called from an interrupt
context where we are already holding a spinlock. The sync command will
try to take that same lock.
> /*
> - * Called with local interrupts disabled and the q_lock held. May not sleep.
> + * Called with preemption disabled, may not sleep.
> */
> -static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
> - struct bio *bio)
> +static int nvme_submit_req_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
> + struct request *req)
> {
> + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
> struct nvme_iod *iod;
> - int psegs = bio_phys_segments(ns->queue, bio);
> - int result;
> + enum dma_data_direction dma_dir;
> + int psegs = req->nr_phys_segments;
>
> - if ((bio->bi_rw & REQ_FLUSH) && psegs)
> - return nvme_split_flush_data(nvmeq, bio);
> -
> - iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC);
> + iod = nvme_alloc_iod(psegs, blk_rq_bytes(req), GFP_ATOMIC);
> if (!iod)
> - return -ENOMEM;
> + return BLK_MQ_RQ_QUEUE_BUSY;
> +
> + iod->private = req;
> +
> + nvme_set_info(cmd, iod, req_completion);
> +
> + if ((req->cmd_flags & REQ_FLUSH) && psegs) {
> + struct flush_cmd_info *flush_cmd = kmalloc(
> + sizeof(struct flush_cmd_info), GFP_KERNEL);
The comment above says "may not sleep", but GFP_KERNEL is used here. I
actually think it is safe to sleep, though, since you're not taking a
lock until later, so maybe you can change all the allocs to GFP_KERNEL?
> +static struct blk_mq_ops nvme_mq_admin_ops = {
> + .queue_rq = nvme_queue_request,
I think you would get some unpredictable behavior if a request really
did go through nvme_queue_request on the admin queue. I don't see how
that would happen; just sayin.
> static void nvme_create_io_queues(struct nvme_dev *dev)
> {
> - unsigned i, max;
> + unsigned i;
>
> - max = min(dev->max_qid, num_online_cpus());
> - for (i = dev->queue_count; i <= max; i++)
> + for (i = dev->queue_count; i <= dev->max_qid; i++)
> if (!nvme_alloc_queue(dev, i, dev->q_depth, i - 1))
> break;
>
> - max = min(dev->queue_count - 1, num_online_cpus());
> - for (i = dev->online_queues; i <= max; i++)
> - if (nvme_create_queue(raw_nvmeq(dev, i), i))
> + for (i = dev->online_queues; i <= dev->max_qid; i++)
> + if (nvme_create_queue(dev->queues[i], i))
> break;
> }
I think your loop criteria are off. We may not have been successful in
allocating a queue to request that the controller create it on its side,
so I don't think you want to use max_qid.
> + dev->tagset.ops = &nvme_mq_ops;
> + dev->tagset.nr_hw_queues = dev->queue_count - 1;
The queue_count is how many queues the driver allocated, but the controller
may not have successfully created all of them. The online_queues count is
how many the controller has available for use.
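(In other words, something along these lines when setting up the tagset; a
sketch of the suggested sizing, not code from the patch:)

	/* size the tagset by the queues the controller actually created,
	 * not by how many the driver merely allocated */
	dev->tagset.ops = &nvme_mq_ops;
	dev->tagset.nr_hw_queues = dev->online_queues - 1;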
* Re: [PATCH V3] NVMe: basic conversion to blk-mq
2014-05-29 3:07 ` Keith Busch
@ 2014-05-29 14:25 ` Jens Axboe
2014-05-29 19:32 ` Jens Axboe
2014-05-29 22:34 ` Keith Busch
0 siblings, 2 replies; 10+ messages in thread
From: Jens Axboe @ 2014-05-29 14:25 UTC (permalink / raw)
To: Keith Busch, Matias Bjørling; +Cc: willy, sbradshaw, linux-kernel
On 2014-05-28 21:07, Keith Busch wrote:
> On Wed, 28 May 2014, Matias Bjørling wrote:
>> This converts the current NVMe driver to utilize the blk-mq layer.
>
> I am concerned about device hot removal since the h/w queues can be
> freed at any time. I *think* blk-mq helps with this in that the driver
> will not see a new request after calling blk_cleanup_queue. If you can
> confirm that's true and that blk-mq waits for all requests active in the
> driver to return to the block layer, then we're probably okay in this
> path. That wasn't true as a bio based driver which is why we are cautious
> with all the locking and barriers. But what about the IOCTL paths?
Barring any bugs in the code, then yes, this should work. On the scsi-mq
side, extensive error injection and pulling has been done, and it seems
to hold up fine now. The ioctl path would need to be audited.
>
> It also doesn't look like we're handling the case where the SGL can't
> map to a PRP.
>
>> +static void req_completion(struct nvme_queue *nvmeq, void *ctx,
>> struct nvme_completion *cqe)
>> {
>> struct nvme_iod *iod = ctx;
>> - struct bio *bio = iod->private;
>> + struct request *req = iod->private;
>> +
>> u16 status = le16_to_cpup(&cqe->status) >> 1;
>>
>> - if (unlikely(status)) {
>> - if (!(status & NVME_SC_DNR ||
>> - bio->bi_rw & REQ_FAILFAST_MASK) &&
>> - (jiffies - iod->start_time) < IOD_TIMEOUT) {
>> - if (!waitqueue_active(&nvmeq->sq_full))
>> - add_wait_queue(&nvmeq->sq_full,
>> - &nvmeq->sq_cong_wait);
>> - list_add_tail(&iod->node, &nvmeq->iod_bio);
>> - wake_up(&nvmeq->sq_full);
>> - return;
>> - }
>> - }
>> if (iod->nents) {
>> - dma_unmap_sg(nvmeq->q_dmadev, iod->sg, iod->nents,
>> - bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
>> - nvme_end_io_acct(bio, iod->start_time);
>> + dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents,
>> + rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
>> }
>> nvme_free_iod(nvmeq->dev, iod);
>> - if (status)
>> - bio_endio(bio, -EIO);
>> +
>> + if (unlikely(status))
>> + req->errors = -EIO;
>> else
>> - bio_endio(bio, 0);
>> + req->errors = 0;
>> +
>> + blk_mq_complete_request(req);
>> }
>
> Is blk-mq going to retry intermittently failed commands for me? It
> doesn't look like it will.
Not sure what kind of behavior you are looking for here. If you can
expand on the above a bit, I'll gladly help sort it out. Only the driver
really knows if a particular request should be failed hard or retried.
So you'd probably have to track retry counts in the request and
reinsert/end as appropriate.
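(A minimal sketch of that bookkeeping, assuming a hypothetical retries
counter in the driver's per-command pdu that gets initialized at submit
time; names are illustrative and this is not part of the patch:)

static void nvme_end_or_retry(struct request *req, u16 status)
{
	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);

	if (unlikely(status) && !(status & NVME_SC_DNR) &&
	    !blk_noretry_request(req) && cmd->retries-- > 0) {
		/* hand the request back to blk-mq for another attempt */
		blk_mq_requeue_request(req);
		return;
	}

	req->errors = status ? -EIO : 0;
	blk_mq_complete_request(req);
}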
>
>> +static int nvme_submit_flush_sync(struct nvme_queue *nvmeq, struct
>> nvme_ns *ns)
>> +{
>> + struct request *req;
>> + struct nvme_command cmnd;
>> +
>> + req = blk_mq_alloc_request(ns->queue, WRITE, GFP_KERNEL, false);
>> + if (!req)
>> + return -ENOMEM;
>> +
>> + nvme_setup_flush(&cmnd, ns, req->tag);
>> + nvme_submit_sync_cmd(req, &cmnd, NULL, NVME_IO_TIMEOUT);
>>
>> return 0;
>> }
>
> It looks like this function above is being called from an interrupt
> context where we are already holding a spinlock. The sync command will
> try to take that same lock.
Yes, that code still looks very buggy. The initial alloc for
flush_cmd_info should also retry, not fail hard, if that alloc fails.
For the reinsert part, Matias, you want to look at the flush code in
blk-mq and how that handles it.
>> + if ((req->cmd_flags & REQ_FLUSH) && psegs) {
>> + struct flush_cmd_info *flush_cmd = kmalloc(
>> + sizeof(struct flush_cmd_info), GFP_KERNEL);
>
> The comment above says "may not sleep", but GFP_KERNEL is used here. I
> actually think it is safe to sleep, though, since you're not taking a
> lock until later, so maybe you can change all the allocs to GFP_KERNEL?
It might be safe, and it might not be. It depends on the CPU mapping to
the queue. Preemption _could_ be off here (if needed), so GFP_KERNEL
cannot be used. Additionally, see above comment on what to do on alloc
failure.
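(Concretely, if the flush_cmd_info allocation stays, it could follow the
same pattern the iod allocation already uses: GFP_ATOMIC, and push back on
blk-mq instead of failing hard. A sketch only, not from the patch, and a
later follow-up in this thread drops this flush path altogether:)

	struct flush_cmd_info *flush_cmd = kmalloc(sizeof(*flush_cmd),
						   GFP_ATOMIC);
	if (!flush_cmd) {
		nvme_free_iod(nvmeq->dev, iod);
		/* back off and let blk-mq retry the request later */
		return BLK_MQ_RQ_QUEUE_BUSY;
	}
	flush_cmd->ns = ns;
	flush_cmd->iod = iod;
	nvme_set_info(cmd, flush_cmd, req_flush_completion);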
--
Jens Axboe
* Re: [PATCH V3] NVMe: basic conversion to blk-mq
2014-05-29 14:25 ` Jens Axboe
@ 2014-05-29 19:32 ` Jens Axboe
2014-05-29 19:33 ` Jens Axboe
2014-05-29 22:34 ` Keith Busch
1 sibling, 1 reply; 10+ messages in thread
From: Jens Axboe @ 2014-05-29 19:32 UTC (permalink / raw)
To: Keith Busch, Matias Bjørling; +Cc: willy, sbradshaw, linux-kernel
On 05/29/2014 08:25 AM, Jens Axboe wrote:
>>> +static int nvme_submit_flush_sync(struct nvme_queue *nvmeq, struct
>>> nvme_ns *ns)
>>> +{
>>> + struct request *req;
>>> + struct nvme_command cmnd;
>>> +
>>> + req = blk_mq_alloc_request(ns->queue, WRITE, GFP_KERNEL, false);
>>> + if (!req)
>>> + return -ENOMEM;
>>> +
>>> + nvme_setup_flush(&cmnd, ns, req->tag);
>>> + nvme_submit_sync_cmd(req, &cmnd, NULL, NVME_IO_TIMEOUT);
>>>
>>> return 0;
>>> }
>>
>> It looks like this function above is being called from an interrupt
>> context where we are already holding a spinlock. The sync command will
>> try to take that same lock.
>
> Yes, that code still looks very buggy. The initial alloc for
> flush_cmd_info should also retry, not fail hard, if that alloc fails.
> For the reinsert part, Matias, you want to look at the flush code in
> blk-mq and how that handles it.
There's an easy fix for this. Once it's managed by blk-mq, blk-mq will
decompose requests for you. This means a flush with data will be turned
into two commands for you, so we can kill this code attempting to handle
flush requests with data.
Patch attached. Depending on how the series needs to look, the prep patch
adding support for bio flush with data should probably just be dropped,
however. No point in adding that and then removing it again.
--
Jens Axboe
[-- Attachment #2: nvme-flush.patch --]
[-- Type: text/x-patch, Size: 2178 bytes --]
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index ac695b336a98..23bd58dfa360 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -513,35 +513,11 @@ static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
writel(nvmeq->sq_tail, nvmeq->q_db);
}
-static int nvme_submit_flush_sync(struct nvme_queue *nvmeq, struct nvme_ns *ns)
-{
- struct request *req;
- struct nvme_command cmnd;
-
- req = blk_mq_alloc_request(ns->queue, WRITE, GFP_KERNEL, false);
- if (!req)
- return -ENOMEM;
-
- nvme_setup_flush(&cmnd, ns, req->tag);
- nvme_submit_sync_cmd(req, &cmnd, NULL, NVME_IO_TIMEOUT);
-
- return 0;
-}
-
struct flush_cmd_info {
struct nvme_ns *ns;
struct nvme_iod *iod;
};
-static void req_flush_completion(struct nvme_queue *nvmeq, void *ctx,
- struct nvme_completion *cqe)
-{
- struct flush_cmd_info *flush_cmd = ctx;
- nvme_submit_flush_sync(nvmeq, flush_cmd->ns);
- req_completion(nvmeq, flush_cmd->iod, cqe);
- kfree(flush_cmd);
-}
-
static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
struct nvme_ns *ns)
{
@@ -560,7 +536,7 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
nvme_submit_discard(nvmeq, ns, req, iod);
goto end_submit;
}
- if (req->cmd_flags & REQ_FLUSH && !iod->nents) {
+ if (req->cmd_flags & REQ_FLUSH) {
nvme_submit_flush(nvmeq, ns, req->tag);
goto end_submit;
}
@@ -615,16 +591,6 @@ static int nvme_submit_req_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
nvme_set_info(cmd, iod, req_completion);
- if ((req->cmd_flags & REQ_FLUSH) && psegs) {
- struct flush_cmd_info *flush_cmd = kmalloc(
- sizeof(struct flush_cmd_info), GFP_KERNEL);
- if (!flush_cmd)
- goto free_iod;
- flush_cmd->ns = ns;
- flush_cmd->iod = iod;
- nvme_set_info(cmd, flush_cmd, req_flush_completion);
- }
-
if (req->cmd_flags & REQ_DISCARD) {
void *range;
/*
@@ -655,7 +621,6 @@ static int nvme_submit_req_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
finish_cmd:
nvme_finish_cmd(nvmeq, req->tag, NULL);
- free_iod:
nvme_free_iod(nvmeq->dev, iod);
return BLK_MQ_RQ_QUEUE_ERROR;
}
* Re: [PATCH V3] NVMe: basic conversion to blk-mq
2014-05-29 19:32 ` Jens Axboe
@ 2014-05-29 19:33 ` Jens Axboe
0 siblings, 0 replies; 10+ messages in thread
From: Jens Axboe @ 2014-05-29 19:33 UTC (permalink / raw)
To: Keith Busch, Matias Bjørling; +Cc: willy, sbradshaw, linux-kernel
On 05/29/2014 01:32 PM, Jens Axboe wrote:
> On 05/29/2014 08:25 AM, Jens Axboe wrote:
>>>> +static int nvme_submit_flush_sync(struct nvme_queue *nvmeq, struct
>>>> nvme_ns *ns)
>>>> +{
>>>> + struct request *req;
>>>> + struct nvme_command cmnd;
>>>> +
>>>> + req = blk_mq_alloc_request(ns->queue, WRITE, GFP_KERNEL, false);
>>>> + if (!req)
>>>> + return -ENOMEM;
>>>> +
>>>> + nvme_setup_flush(&cmnd, ns, req->tag);
>>>> + nvme_submit_sync_cmd(req, &cmnd, NULL, NVME_IO_TIMEOUT);
>>>>
>>>> return 0;
>>>> }
>>>
>>> It looks like this function above is being called from an interrupt
>>> context where we are already holding a spinlock. The sync command will
>>> try to take that same lock.
>>
>> Yes, that code still looks very buggy. The initial alloc for
>> flush_cmd_info should also retry, not fail hard, if that alloc fails.
>> For the reinsert part, Matias, you want to look at the flush code in
>> blk-mq and how that handles it.
>
> There's an easy fix for this. Once it's managed by blk-mq, blk-mq will
> decompose requests for you. This means a flush with data will be turned
> into two commands for you, so we can kill this code attempting to handle
> flush requests with data.
>
> Patch attached. Depending on how the series needs to look, the prep patch
> adding support for bio flush with data should probably just be dropped,
> however. No point in adding that and then removing it again.
Updated, we can kill the flush_cmd_info struct as well.
--
Jens Axboe
[-- Attachment #2: nvme-flush-v2.patch --]
[-- Type: text/x-patch, Size: 2177 bytes --]
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index ac695b336a98..4cd525ee5f4a 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -513,35 +513,6 @@ static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
writel(nvmeq->sq_tail, nvmeq->q_db);
}
-static int nvme_submit_flush_sync(struct nvme_queue *nvmeq, struct nvme_ns *ns)
-{
- struct request *req;
- struct nvme_command cmnd;
-
- req = blk_mq_alloc_request(ns->queue, WRITE, GFP_KERNEL, false);
- if (!req)
- return -ENOMEM;
-
- nvme_setup_flush(&cmnd, ns, req->tag);
- nvme_submit_sync_cmd(req, &cmnd, NULL, NVME_IO_TIMEOUT);
-
- return 0;
-}
-
-struct flush_cmd_info {
- struct nvme_ns *ns;
- struct nvme_iod *iod;
-};
-
-static void req_flush_completion(struct nvme_queue *nvmeq, void *ctx,
- struct nvme_completion *cqe)
-{
- struct flush_cmd_info *flush_cmd = ctx;
- nvme_submit_flush_sync(nvmeq, flush_cmd->ns);
- req_completion(nvmeq, flush_cmd->iod, cqe);
- kfree(flush_cmd);
-}
-
static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
struct nvme_ns *ns)
{
@@ -560,7 +531,7 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
nvme_submit_discard(nvmeq, ns, req, iod);
goto end_submit;
}
- if (req->cmd_flags & REQ_FLUSH && !iod->nents) {
+ if (req->cmd_flags & REQ_FLUSH) {
nvme_submit_flush(nvmeq, ns, req->tag);
goto end_submit;
}
@@ -615,16 +586,6 @@ static int nvme_submit_req_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
nvme_set_info(cmd, iod, req_completion);
- if ((req->cmd_flags & REQ_FLUSH) && psegs) {
- struct flush_cmd_info *flush_cmd = kmalloc(
- sizeof(struct flush_cmd_info), GFP_KERNEL);
- if (!flush_cmd)
- goto free_iod;
- flush_cmd->ns = ns;
- flush_cmd->iod = iod;
- nvme_set_info(cmd, flush_cmd, req_flush_completion);
- }
-
if (req->cmd_flags & REQ_DISCARD) {
void *range;
/*
@@ -655,7 +616,6 @@ static int nvme_submit_req_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
finish_cmd:
nvme_finish_cmd(nvmeq, req->tag, NULL);
- free_iod:
nvme_free_iod(nvmeq->dev, iod);
return BLK_MQ_RQ_QUEUE_ERROR;
}
* Re: [PATCH V3] NVMe: basic conversion to blk-mq
2014-05-29 14:25 ` Jens Axboe
2014-05-29 19:32 ` Jens Axboe
@ 2014-05-29 22:34 ` Keith Busch
2014-05-29 23:06 ` Jens Axboe
1 sibling, 1 reply; 10+ messages in thread
From: Keith Busch @ 2014-05-29 22:34 UTC (permalink / raw)
To: Jens Axboe
Cc: Keith Busch, Matias Bjørling, willy, sbradshaw, linux-kernel
On Thu, 29 May 2014, Jens Axboe wrote:
> On 2014-05-28 21:07, Keith Busch wrote:
> Barring any bugs in the code, then yes, this should work. On the scsi-mq
> side, extensive error injection and pulling has been done, and it seems to
> hold up fine now. The ioctl path would need to be audited.
It's a little different than scsi. This would be like pulling the drive and
the HBA. In any case, it still looks like it works as expected.
>>> +static void req_completion(struct nvme_queue *nvmeq, void *ctx,
>>> struct nvme_completion *cqe)
>>> {
>>> struct nvme_iod *iod = ctx;
>>> - struct bio *bio = iod->private;
>>> + struct request *req = iod->private;
>>> +
>>> u16 status = le16_to_cpup(&cqe->status) >> 1;
>>>
>>> - if (unlikely(status)) {
>>> - if (!(status & NVME_SC_DNR ||
>>> - bio->bi_rw & REQ_FAILFAST_MASK) &&
>>> - (jiffies - iod->start_time) < IOD_TIMEOUT) {
>>> - if (!waitqueue_active(&nvmeq->sq_full))
>>> - add_wait_queue(&nvmeq->sq_full,
>>> - &nvmeq->sq_cong_wait);
>>> - list_add_tail(&iod->node, &nvmeq->iod_bio);
>>> - wake_up(&nvmeq->sq_full);
>>> - return;
>>> - }
>>> - }
>>
>> Is blk-mq going to retry intermittently failed commands for me? It
>> doesn't look like it will.
>
> Not sure what kind of behavior you are looking for here. If you can expand on
> the above a bit, I'll gladly help sort it out. Only the driver really knows
> if a particular request should be failed hard or retried. So you'd probably
> have to track retry counts in the request and reinsert/end as appropriate.
Some vendors' drives return a failure status for a command but fully
expect a retry to be successful. It'd be addressing this bug:
bugzilla.kernel.org/show_bug.cgi?id=61061
The code being removed at the top of this function in the latest patch was
taking care of the requeuing. I wasn't sure how many retries would be
necessary, so I capped it at a total time instead of a total number of
tries. I'm told by 3rd parties that what we're doing is successful in
their tests.
* Re: [PATCH V3] NVMe: basic conversion to blk-mq
2014-05-29 22:34 ` Keith Busch
@ 2014-05-29 23:06 ` Jens Axboe
2014-05-29 23:12 ` Jens Axboe
0 siblings, 1 reply; 10+ messages in thread
From: Jens Axboe @ 2014-05-29 23:06 UTC (permalink / raw)
To: Keith Busch; +Cc: Matias Bjørling, willy, sbradshaw, linux-kernel
On 05/29/2014 04:34 PM, Keith Busch wrote:
> On Thu, 29 May 2014, Jens Axboe wrote:
>> On 2014-05-28 21:07, Keith Busch wrote:
>> Barring any bugs in the code, then yes, this should work. On the
>> scsi-mq side, extensive error injection and pulling has been done, and
>> it seems to hold up fine now. The ioctl path would need to be audited.
>
> It's a little different than scsi. This would be like pulling the drive and
> the HBA. In any case, it still looks like it works as expected.
That is true, but block/blk-mq generally only cares about whether the
device goes away or not. So it should be pretty much the same for this case.
>>>> +static void req_completion(struct nvme_queue *nvmeq, void *ctx,
>>>> struct nvme_completion *cqe)
>>>> {
>>>> struct nvme_iod *iod = ctx;
>>>> - struct bio *bio = iod->private;
>>>> + struct request *req = iod->private;
>>>> +
>>>> u16 status = le16_to_cpup(&cqe->status) >> 1;
>>>>
>>>> - if (unlikely(status)) {
>>>> - if (!(status & NVME_SC_DNR ||
>>>> - bio->bi_rw & REQ_FAILFAST_MASK) &&
>>>> - (jiffies - iod->start_time) < IOD_TIMEOUT) {
>>>> - if (!waitqueue_active(&nvmeq->sq_full))
>>>> - add_wait_queue(&nvmeq->sq_full,
>>>> - &nvmeq->sq_cong_wait);
>>>> - list_add_tail(&iod->node, &nvmeq->iod_bio);
>>>> - wake_up(&nvmeq->sq_full);
>>>> - return;
>>>> - }
>>>> - }
>>>
>>> Is blk-mq going to retry intermittently failed commands for me? It
>>> doesn't look like it will.
>>
>> Not sure what kind of behavior you are looking for here. If you can
>> expand on the above a bit, I'll gladly help sort it out. Only the
>> driver really knows if a particular request should be failed hard or
>> retried. So you'd probably have to track retry counts in the request
>> and reinsert/end as appropriate.
>
> Some vendors' drives return a failure status for a command but fully
> expect a retry to be successful. It'd be addressing this bug:
>
> bugzilla.kernel.org/show_bug.cgi?id=61061
>
> The code being removed at the top of this function in the latest patch was
> taking care of the requeuing. I wasn't sure how many retries would be
> necessary, so I capped it at a total time instead of a total number of
> tries. I'm told by 3rd parties that what we're doing is successful in
> their tests.
Ah I see, yes that code apparently got axed. The attached patch brings
it back. Totally untested, I'll try and synthetically hit it to ensure
that it does work. Note that it currently does unmap and iod free, so
the request comes back pristine. We could preserve that if we really
wanted to, I'm guessing it's not a big deal.
--
Jens Axboe
[-- Attachment #2: nvme-retry.patch --]
[-- Type: text/x-patch, Size: 604 bytes --]
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 22d8013cd4ff..ccdd02fa6882 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -363,9 +363,14 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
}
nvme_free_iod(nvmeq->dev, iod);
- if (unlikely(status))
+ if (unlikely(status)) {
+ if (!(status & NVME_SC_DNR || blk_noretry_request(req))
+ && (jiffies - req->start_time) < req->timeout) {
+ blk_mq_requeue_request(req);
+ return;
+ }
req->errors = -EIO;
- else
+ } else
req->errors = 0;
blk_mq_complete_request(req);
* Re: [PATCH V3] NVMe: basic conversion to blk-mq
2014-05-29 23:06 ` Jens Axboe
@ 2014-05-29 23:12 ` Jens Axboe
2014-05-30 17:20 ` Matias Bjorling
0 siblings, 1 reply; 10+ messages in thread
From: Jens Axboe @ 2014-05-29 23:12 UTC (permalink / raw)
To: Keith Busch; +Cc: Matias Bjørling, willy, sbradshaw, linux-kernel
On 05/29/2014 05:06 PM, Jens Axboe wrote:
> Ah I see, yes that code apparently got axed. The attached patch brings
> it back. Totally untested, I'll try and synthetically hit it to ensure
> that it does work. Note that it currently does unmap and iod free, so
> the request comes back pristine. We could preserve that if we really
> wanted to, I'm guessing it's not a big deal.
And another totally untested patch that retains the iod and dma mapping
on the requeue event. Again, probably not a big deal, I'm assuming these
happen rarely, but it's simple enough to do.
--
Jens Axboe
[-- Attachment #2: nvme-retry-v2.patch --]
[-- Type: text/x-patch, Size: 1525 bytes --]
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 22d8013cd4ff..fd3f11d837bb 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -357,17 +357,22 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
u16 status = le16_to_cpup(&cqe->status) >> 1;
+ if (unlikely(status)) {
+ if (!(status & NVME_SC_DNR || blk_noretry_request(req))
+ && (jiffies - req->start_time) < req->timeout) {
+ blk_mq_requeue_request(req);
+ return;
+ }
+ req->errors = -EIO;
+ } else
+ req->errors = 0;
+
if (iod->nents) {
dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents,
rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
}
nvme_free_iod(nvmeq->dev, iod);
- if (unlikely(status))
- req->errors = -EIO;
- else
- req->errors = 0;
-
blk_mq_complete_request(req);
}
@@ -569,11 +574,19 @@ static int nvme_submit_req_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
enum dma_data_direction dma_dir;
int psegs = req->nr_phys_segments;
+ /*
+ * Requeued IO has already been prepped
+ */
+ iod = req->special;
+ if (iod)
+ goto submit_iod;
+
iod = nvme_alloc_iod(psegs, blk_rq_bytes(req), GFP_ATOMIC);
if (!iod)
return BLK_MQ_RQ_QUEUE_BUSY;
iod->private = req;
+ req->special = iod;
nvme_set_info(cmd, iod, req_completion);
@@ -602,6 +615,7 @@ static int nvme_submit_req_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
goto finish_cmd;
}
+ submit_iod:
if (!nvme_submit_iod(nvmeq, iod, ns))
return 0;
* Re: [PATCH V3] NVMe: basic conversion to blk-mq
2014-05-29 23:12 ` Jens Axboe
@ 2014-05-30 17:20 ` Matias Bjorling
0 siblings, 0 replies; 10+ messages in thread
From: Matias Bjorling @ 2014-05-30 17:20 UTC (permalink / raw)
To: Jens Axboe, Keith Busch; +Cc: willy, sbradshaw, linux-kernel
On 05/30/2014 01:12 AM, Jens Axboe wrote:
> On 05/29/2014 05:06 PM, Jens Axboe wrote:
>> Ah I see, yes that code apparently got axed. The attached patch brings
>> it back. Totally untested, I'll try and synthetically hit it to ensure
>> that it does work. Note that it currently does unmap and iod free, so
>> the request comes back pristine. We could preserve that if we really
>> wanted to, I'm guessing it's not a big deal.
>
> And another totally untested patch that retains the iod and dma mapping
> on the requeue event. Again, probably not a big deal, I'm assuming these
> happen rarely, but it's simple enough to do.
>
Thanks. I've added it and rebased nvmemq_wip_review on the latest 3.16/core.
Thread overview: 10+ messages
2014-05-28 22:59 [PATCH V3] basic conversion to blk-mq Matias Bjørling
2014-05-28 22:59 ` [PATCH V3] NVMe: " Matias Bjørling
2014-05-29 3:07 ` Keith Busch
2014-05-29 14:25 ` Jens Axboe
2014-05-29 19:32 ` Jens Axboe
2014-05-29 19:33 ` Jens Axboe
2014-05-29 22:34 ` Keith Busch
2014-05-29 23:06 ` Jens Axboe
2014-05-29 23:12 ` Jens Axboe
2014-05-30 17:20 ` Matias Bjorling